Home | History | Annotate | Download | only in softpart
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Soft partitioning metadevice driver (md_sp).
     29  *
     30  * This file contains the primary operations of the soft partitioning
     31  * metadevice driver.  This includes all routines for normal operation
     32  * (open/close/read/write).  Please see mdvar.h for a definition of
     33  * metadevice operations vector (md_ops_t).  This driver is loosely
     34  * based on the stripe driver (md_stripe).
     35  *
     36  * All metadevice administration is done through the use of ioctl's.
     37  * As such, all administrative routines appear in sp_ioctl.c.
     38  *
     39  * Soft partitions are represented both in-core and in the metadb with a
     40  * unit structure.  The soft partition-specific information in the unit
     41  * structure includes the following information:
     42  *	- Device information (md_dev64_t & md key) about the device on which
     43  *	  the soft partition is built.
     44  *	- Soft partition status information.
     45  *	- The size of the soft partition and number of extents used to
     46  *	  make up that size.
     47  *	- An array of extents which define virtual/physical offset
     48  *	  mappings and lengths for each extent.
     49  *
     50  * Typical soft partition operation proceeds as follows:
     51  *	- The unit structure is fetched from the metadb and placed into
     52  *	  an in-core array (as with other metadevices).  This operation
     53  *	  is performed via sp_build_incore( ) and takes place during
     54  *	  "snarfing" (when all metadevices are brought in-core at
     55  *	  once) and when a new soft partition is created.
     56  *	- A soft partition is opened via sp_open( ).  At open time the
     57  *	  soft partition unit structure is verified with the soft
     58  *	  partition on-disk structures.  Additionally, the soft partition
     59  *	  status is checked (only soft partitions in the OK state may be
     60  *	  opened).
     61  *	- Soft partition I/O is performed via sp_strategy( ) which relies on
     62  *	  a support routine, sp_mapbuf( ), to do most of the work.
     63  *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
     64  *	  search of the extent array in the soft partition unit structure.
     65  *	  Once a translation has been performed, the I/O is passed down
     66  *	  to the next layer, which may be another metadevice or a physical
     67  *	  disk.  Since a soft partition may contain multiple, non-contiguous
     68  *	  extents, a single I/O may have to be fragmented.
     69  *	- Soft partitions are closed using sp_close.
     70  *
     71  */
     72 
     73 #include <sys/param.h>
     74 #include <sys/systm.h>
     75 #include <sys/conf.h>
     76 #include <sys/file.h>
     77 #include <sys/user.h>
     78 #include <sys/uio.h>
     79 #include <sys/t_lock.h>
     80 #include <sys/buf.h>
     81 #include <sys/dkio.h>
     82 #include <sys/vtoc.h>
     83 #include <sys/kmem.h>
     84 #include <vm/page.h>
     85 #include <sys/cmn_err.h>
     86 #include <sys/sysmacros.h>
     87 #include <sys/types.h>
     88 #include <sys/mkdev.h>
     89 #include <sys/stat.h>
     90 #include <sys/open.h>
     91 #include <sys/lvm/mdvar.h>
     92 #include <sys/lvm/md_sp.h>
     93 #include <sys/lvm/md_convert.h>
     94 #include <sys/lvm/md_notify.h>
     95 #include <sys/lvm/md_crc.h>
     96 #include <sys/modctl.h>
     97 #include <sys/ddi.h>
     98 #include <sys/sunddi.h>
     99 #include <sys/debug.h>
    100 
    101 #include <sys/sysevent/eventdefs.h>
    102 #include <sys/sysevent/svm.h>
    103 
/*
 * Operations vector for the soft partition driver (see mdvar.h for the
 * definition of md_ops_t).  Outside of lint, md_interface_ops is pointed
 * at it and the module declares its dependency on "drv/md".
 */
    104 md_ops_t		sp_md_ops;
    105 #ifndef	lint
    106 char			_depends_on[] = "drv/md";
    107 md_ops_t		*md_interface_ops = &sp_md_ops;
    108 #endif
    109 
/* Globals defined elsewhere in the md framework and referenced here. */
    110 extern unit_t		md_nunits;
    111 extern set_t		md_nsets;
    112 extern md_set_t		md_set[];
    113 
    114 extern int		md_status;
    115 extern major_t		md_major;
    116 extern mdq_anchor_t	md_done_daemon;
    117 extern mdq_anchor_t	md_sp_daemon;
    118 extern kmutex_t		md_mx;
    119 extern kcondvar_t	md_cv;
    120 extern md_krwlock_t	md_unit_array_rw;
    121 extern clock_t		md_hz;
    122 
/*
 * kmem caches from which parent (md_spps_t) and child (md_spcs_t) save
 * structures are allocated, plus forward declarations for the status
 * notification helpers that are defined later in this file.
 */
    123 static kmem_cache_t	*sp_parent_cache = NULL;
    124 static kmem_cache_t	*sp_child_cache = NULL;
    125 static void		sp_send_stat_ok(mp_unit_t *);
    126 static void		sp_send_stat_err(mp_unit_t *);
    127 
    128 /*
    129  * FUNCTION:	sp_parent_constructor()
    130  * INPUT:	none.
    131  * OUTPUT:	ps	- parent save structure initialized.
    132  * RETURNS:	void *	- ptr to initialized parent save structure.
    133  * PURPOSE:	initialize parent save structure.
    134  */
    135 /*ARGSUSED1*/
    136 static int
    137 sp_parent_constructor(void *p, void *d1, int d2)
    138 {
    139 	mutex_init(&((md_spps_t *)p)->ps_mx,
    140 	    NULL, MUTEX_DEFAULT, NULL);
    141 	return (0);
    142 }
    143 
    144 static void
    145 sp_parent_init(md_spps_t *ps)
    146 {
    147 	bzero(ps, offsetof(md_spps_t, ps_mx));
    148 }
    149 
    150 /*ARGSUSED1*/
    151 static void
    152 sp_parent_destructor(void *p, void *d)
    153 {
    154 	mutex_destroy(&((md_spps_t *)p)->ps_mx);
    155 }
    156 
    157 /*
    158  * FUNCTION:	sp_child_constructor()
    159  * INPUT:	none.
    160  * OUTPUT:	cs	- child save structure initialized.
    161  * RETURNS:	void *	- ptr to initialized child save structure.
    162  * PURPOSE:	initialize child save structure.
    163  */
    164 /*ARGSUSED1*/
    165 static int
    166 sp_child_constructor(void *p, void *d1, int d2)
    167 {
    168 	bioinit(&((md_spcs_t *)p)->cs_buf);
    169 	return (0);
    170 }
    171 
    172 static void
    173 sp_child_init(md_spcs_t *cs)
    174 {
    175 	cs->cs_mdunit = 0;
    176 	cs->cs_ps = NULL;
    177 	md_bioreset(&cs->cs_buf);
    178 }
    179 
    180 /*ARGSUSED1*/
    181 static void
    182 sp_child_destructor(void *p, void *d)
    183 {
    184 	biofini(&((md_spcs_t *)p)->cs_buf);
    185 }
    186 
    187 /*
    188  * FUNCTION:	sp_run_queue()
    189  * INPUT:	none.
    190  * OUTPUT:	none.
    191  * RETURNS:	void.
    192  * PURPOSE:	run the md_daemon to clean up memory pool.
    193  */
    194 /*ARGSUSED*/
    195 static void
    196 sp_run_queue(void *d)
    197 {
    198 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
    199 		md_daemon(1, &md_done_daemon);
    200 }
    201 
    202 
    203 /*
    204  * FUNCTION:	sp_build_incore()
    205  * INPUT:	p		- ptr to unit structure.
    206  *		snarfing	- flag to tell us we are snarfing.
    207  * OUTPUT:	non.
    208  * RETURNS:	int	- 0 (always).
    209  * PURPOSE:	place unit structure into in-core unit array (keyed from
    210  *		minor number).
    211  */
    212 int
    213 sp_build_incore(void *p, int snarfing)
    214 {
    215 	mp_unit_t	*un = (mp_unit_t *)p;
    216 	minor_t		mnum;
    217 	set_t		setno;
    218 	md_dev64_t	tmpdev;
    219 
    220 	mnum = MD_SID(un);
    221 
    222 	if (MD_UNIT(mnum) != NULL)
    223 		return (0);
    224 
    225 	MD_STATUS(un) = 0;
    226 
    227 	if (snarfing) {
    228 		/*
    229 		 * if we are snarfing, we get the device information
    230 		 * from the metadb record (using the metadb key for
    231 		 * that device).
    232 		 */
    233 		setno = MD_MIN2SET(mnum);
    234 
    235 		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
    236 		    un->un_key, MD_NOTRUST_DEVT);
    237 		un->un_dev = tmpdev;
    238 	}
    239 
    240 	/* place various information in the in-core data structures */
    241 	md_nblocks_set(mnum, un->c.un_total_blocks);
    242 	MD_UNIT(mnum) = un;
    243 
    244 	return (0);
    245 }
    246 
    247 /*
    248  * FUNCTION:	reset_sp()
    249  * INPUT:	un		- unit structure to be reset/removed.
    250  *		mnum		- minor number to be reset/removed.
    251  *		removing	- flag to tell us if we are removing
    252  *				  permanently or just reseting in-core
    253  *				  structures.
    254  * OUTPUT:	none.
    255  * RETURNS:	void.
    256  * PURPOSE:	used to either simply reset in-core structures or to
    257  *		permanently remove metadevices from the metadb.
    258  */
    259 void
    260 reset_sp(mp_unit_t *un, minor_t mnum, int removing)
    261 {
    262 	sv_dev_t	*sv;
    263 	mddb_recid_t	vtoc_id;
    264 
    265 	/* clean up in-core structures */
    266 	md_destroy_unit_incore(mnum, &sp_md_ops);
    267 
    268 	md_nblocks_set(mnum, -1ULL);
    269 	MD_UNIT(mnum) = NULL;
    270 
    271 	/*
    272 	 * Attempt release of minor node
    273 	 */
    274 	md_remove_minor_node(mnum);
    275 
    276 	if (!removing)
    277 		return;
    278 
    279 	/* we are removing the soft partition from the metadb */
    280 
    281 	/*
    282 	 * Save off device information so we can get to
    283 	 * it after we do the mddb_deleterec().
    284 	 */
    285 	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
    286 	sv->setno = MD_MIN2SET(mnum);
    287 	sv->key = un->un_key;
    288 	vtoc_id = un->c.un_vtoc_id;
    289 
    290 	/*
    291 	 * Remove self from the namespace
    292 	 */
    293 	if (un->c.un_revision & MD_FN_META_DEV) {
    294 		(void) md_rem_selfname(un->c.un_self_id);
    295 	}
    296 
    297 	/* Remove the unit structure */
    298 	mddb_deleterec_wrapper(un->c.un_record_id);
    299 
    300 	if (vtoc_id)
    301 		mddb_deleterec_wrapper(vtoc_id);
    302 
    303 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
    304 	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
    305 
    306 	/*
    307 	 * remove the underlying device name from the metadb.  if other
    308 	 * soft partitions are built on this device, this will simply
    309 	 * decrease the reference count for this device.  otherwise the
    310 	 * name record for this device will be removed from the metadb.
    311 	 */
    312 	md_rem_names(sv, 1);
    313 	kmem_free(sv, sizeof (sv_dev_t));
    314 }
    315 
    316 /*
    317  * FUNCTION:	sp_send_stat_msg
    318  * INPUT:	un	- unit reference
    319  *		status	- status to be sent to master node
    320  *			MD_SP_OK - soft-partition is now OK
    321  *			MD_SP_ERR	"	"	 errored
    322  * OUTPUT:	none.
    323  * RETURNS:	void.
    324  * PURPOSE:	send a soft-partition status change to the master node. If the
    325  *		message succeeds we simply return. If it fails we panic as the
    326  *		cluster-wide view of the metadevices is now inconsistent.
    327  * CALLING CONTEXT:
    328  *	Blockable. No locks can be held.
    329  */
    330 static void
    331 sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
    332 {
    333 	md_mn_msg_sp_setstat_t	sp_msg;
    334 	md_mn_kresult_t	*kres;
    335 	set_t		setno = MD_UN2SET(un);
    336 	int		rval;
    337 	const char	*str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";
    338 	int		nretries = 0;
    339 
    340 	sp_msg.sp_setstat_mnum = MD_SID(un);
    341 	sp_msg.sp_setstat_status = status;
    342 
    343 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
    344 
    345 spss_msg:
    346 	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
    347 	    0, (char *)&sp_msg, sizeof (sp_msg), kres);
    348 
    349 	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
    350 		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
    351 		/* If we're shutting down already, pause things here. */
    352 		if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
    353 			while (!md_mn_is_commd_present()) {
    354 				delay(md_hz);
    355 			}
    356 			/*
    357 			 * commd is available again. Retry the message once.
    358 			 * If it fails we panic as the system is in an
    359 			 * unexpected state.
    360 			 */
    361 			if (nretries++ == 0)
    362 				goto spss_msg;
    363 		}
    364 		/*
    365 		 * Panic as we are now in an inconsistent state.
    366 		 */
    367 		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
    368 		    md_shortname(MD_SID(un)), str);
    369 	}
    370 
    371 	kmem_free(kres, sizeof (md_mn_kresult_t));
    372 }
    373 
    374 /*
    375  * FUNCTION:	sp_finish_error
    376  * INPUT:	ps	- parent save structure for error-ed I/O.
    377  *		lock_held	- set if the unit readerlock is held
    378  * OUTPUT:	none.
    379  * RETURNS:	void.
    380  * PURPOSE:	report a driver error
    381  */
    382 static void
    383 sp_finish_error(md_spps_t *ps, int lock_held)
    384 {
    385 	struct buf	*pb = ps->ps_bp;
    386 	mdi_unit_t	*ui = ps->ps_ui;
    387 	md_dev64_t	un_dev;			/* underlying device */
    388 	md_dev64_t	md_dev = md_expldev(pb->b_edev); /* metadev in error */
    389 	char		*str;
    390 
    391 	un_dev = md_expldev(ps->ps_un->un_dev);
    392 	/* set error type */
    393 	if (pb->b_flags & B_READ) {
    394 		str = "read";
    395 	} else {
    396 		str = "write";
    397 	}
    398 
    399 
    400 	SPPS_FREE(sp_parent_cache, ps);
    401 	pb->b_flags |= B_ERROR;
    402 
    403 	md_kstat_done(ui, pb, 0);
    404 
    405 	if (lock_held) {
    406 		md_unit_readerexit(ui);
    407 	}
    408 	md_biodone(pb);
    409 
    410 	cmn_err(CE_WARN, "md: %s: %s error on %s",
    411 	    md_shortname(md_getminor(md_dev)), str,
    412 	    md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
    413 }
    414 
    415 
    416 /*
    417  * FUNCTION:	sp_xmit_ok
    418  * INPUT:	dq	- daemon queue referencing failing ps structure
    419  * OUTPUT:	none.
    420  * RETURNS:	void.
    421  * PURPOSE:	send a message to the master node in a multi-owner diskset to
    422  *		update all attached nodes view of the soft-part to be MD_SP_OK.
    423  * CALLING CONTEXT:
    424  *	Blockable. No unit lock held.
    425  */
    426 static void
    427 sp_xmit_ok(daemon_queue_t *dq)
    428 {
    429 	md_spps_t	*ps = (md_spps_t *)dq;
    430 
    431 	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
    432 	sp_send_stat_msg(ps->ps_un, MD_SP_OK);
    433 
    434 	/*
    435 	 * Successfully transmitted error state to all nodes, now release this
    436 	 * parent structure.
    437 	 */
    438 	SPPS_FREE(sp_parent_cache, ps);
    439 }
    440 
    441 /*
    442  * FUNCTION:	sp_xmit_error
    443  * INPUT:	dq	- daemon queue referencing failing ps structure
    444  * OUTPUT:	none.
    445  * RETURNS:	void.
    446  * PURPOSE:	send a message to the master node in a multi-owner diskset to
    447  *		update all attached nodes view of the soft-part to be MD_SP_ERR.
    448  * CALLING CONTEXT:
    449  *	Blockable. No unit lock held.
    450  */
    451 static void
    452 sp_xmit_error(daemon_queue_t *dq)
    453 {
    454 	md_spps_t	*ps = (md_spps_t *)dq;
    455 
    456 	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
    457 	sp_send_stat_msg(ps->ps_un, MD_SP_ERR);
    458 
    459 	/*
    460 	 * Successfully transmitted error state to all nodes, now release this
    461 	 * parent structure.
    462 	 */
    463 	SPPS_FREE(sp_parent_cache, ps);
    464 }
    465 static void
    466 sp_send_stat_ok(mp_unit_t *un)
    467 {
    468 	minor_t		mnum = MD_SID(un);
    469 	md_spps_t	*ps;
    470 
    471 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
    472 	sp_parent_init(ps);
    473 	ps->ps_un = un;
    474 	ps->ps_ui = MDI_UNIT(mnum);
    475 
    476 	daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
    477 	    REQ_OLD);
    478 }
    479 
    480 static void
    481 sp_send_stat_err(mp_unit_t *un)
    482 {
    483 	minor_t		mnum = MD_SID(un);
    484 	md_spps_t	*ps;
    485 
    486 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
    487 	sp_parent_init(ps);
    488 	ps->ps_un = un;
    489 	ps->ps_ui = MDI_UNIT(mnum);
    490 
    491 	daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
    492 	    REQ_OLD);
    493 }
    494 
    495 
    496 /*
    497  * FUNCTION:	sp_error()
    498  * INPUT:	ps	- parent save structure for error-ed I/O.
    499  * OUTPUT:	none.
    500  * RETURNS:	void.
    501  * PURPOSE:	report a driver error.
    502  * CALLING CONTEXT:
    503  *	Interrupt - non-blockable
    504  */
    505 static void
    506 sp_error(md_spps_t *ps)
    507 {
    508 	set_t		setno = MD_UN2SET(ps->ps_un);
    509 
    510 	/*
    511 	 * Drop the mutex associated with this request before (potentially)
    512 	 * enqueuing the free onto a separate thread. We have to release the
    513 	 * mutex before destroying the parent structure.
    514 	 */
    515 	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
    516 		if (MUTEX_HELD(&ps->ps_mx)) {
    517 			mutex_exit(&ps->ps_mx);
    518 		}
    519 	} else {
    520 		/*
    521 		 * this should only ever happen if we are panicking,
    522 		 * since DONTFREE is only set on the parent if panicstr
    523 		 * is non-NULL.
    524 		 */
    525 		ASSERT(panicstr);
    526 	}
    527 
    528 	/*
    529 	 * For a multi-owner set we need to send a message to the master so that
    530 	 * all nodes get the errored status when we first encounter it. To avoid
    531 	 * deadlocking when multiple soft-partitions encounter an error on one
    532 	 * physical unit we drop the unit readerlock before enqueueing the
    533 	 * request. That way we can service any messages that require a
    534 	 * writerlock to be held. Additionally, to avoid deadlocking when at
    535 	 * the bottom of a metadevice stack and a higher level mirror has
    536 	 * multiple requests outstanding on this soft-part, we clone the ps
    537 	 * that failed and pass the error back up the stack to release the
    538 	 * reference that this i/o may have in the higher-level metadevice.
    539 	 * The other nodes in the cluster just have to modify the soft-part
    540 	 * status and we do not need to block the i/o completion for this.
    541 	 */
    542 	if (MD_MNSET_SETNO(setno)) {
    543 		md_spps_t	*err_ps;
    544 		err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
    545 		sp_parent_init(err_ps);
    546 
    547 		err_ps->ps_un = ps->ps_un;
    548 		err_ps->ps_ui = ps->ps_ui;
    549 
    550 		md_unit_readerexit(ps->ps_ui);
    551 
    552 		daemon_request(&md_sp_daemon, sp_xmit_error,
    553 		    (daemon_queue_t *)err_ps, REQ_OLD);
    554 
    555 		sp_finish_error(ps, 0);
    556 
    557 		return;
    558 	} else {
    559 		ps->ps_un->un_status = MD_SP_ERR;
    560 	}
    561 
    562 	/* Flag the error */
    563 	sp_finish_error(ps, 1);
    564 
    565 }
    566 
    567 /*
    568  * FUNCTION:	sp_mapbuf()
    569  * INPUT:	un	- unit structure for soft partition we are doing
    570  *			  I/O on.
    571  *		voff	- virtual offset in soft partition to map.
    572  *		bcount	- # of blocks in the I/O.
    573  * OUTPUT:	bp	- translated buffer to be passed down to next layer.
    574  * RETURNS:	1	- request must be fragmented, more work to do,
    575  *		0	- request satisified, no more work to do
    576  *		-1	- error
    577  * PURPOSE:	Map the the virtual offset in the soft partition (passed
    578  *		in via voff) to the "physical" offset on whatever the soft
    579  *		partition is built on top of.  We do this by doing a binary
    580  *		search of the extent array in the soft partition unit
    581  *		structure.  Once the current extent is found, we do the
    582  *		translation, determine if the I/O will cross extent
    583  *		boundaries (if so, we have to fragment the I/O), then
    584  *		fill in the buf structure to be passed down to the next layer.
    585  */
    586 static int
    587 sp_mapbuf(
    588 	mp_unit_t	*un,
    589 	sp_ext_offset_t	voff,
    590 	sp_ext_length_t	bcount,
    591 	buf_t		*bp
    592 )
    593 {
    594 	int		lo, mid, hi, found, more;
    595 	size_t		new_bcount;
    596 	sp_ext_offset_t new_blkno;
    597 	sp_ext_offset_t	new_offset;
    598 	sp_ext_offset_t	ext_endblk;
    599 	md_dev64_t	new_edev;
    600 	extern unsigned	md_maxphys;
    601 
    602 	found = 0;
    603 	lo = 0;
    604 	hi = un->un_numexts - 1;
    605 
    606 	/*
    607 	 * do a binary search to find the extent that contains the
    608 	 * starting offset.  after this loop, mid contains the index
    609 	 * of the correct extent.
    610 	 */
    611 	while (lo <= hi && !found) {
    612 		mid = (lo + hi) / 2;
    613 		/* is the starting offset contained within the mid-ext? */
    614 		if (voff >= un->un_ext[mid].un_voff &&
    615 		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
    616 			found = 1;
    617 		else if (voff < un->un_ext[mid].un_voff)
    618 			hi = mid - 1;
    619 		else /* voff > un->un_ext[mid].un_voff + un->un_ext[mid].len */
    620 			lo = mid + 1;
    621 	}
    622 
    623 	if (!found) {
    624 		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
    625 		return (-1);
    626 	}
    627 
    628 	/* translate to underlying physical offset/device */
    629 	new_offset = voff - un->un_ext[mid].un_voff;
    630 	new_blkno = un->un_ext[mid].un_poff + new_offset;
    631 	new_edev = un->un_dev;
    632 
    633 	/* determine if we need to break the I/O into fragments */
    634 	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
    635 	if (voff + btodb(bcount) > ext_endblk) {
    636 		new_bcount = dbtob(ext_endblk - voff);
    637 		more = 1;
    638 	} else {
    639 		new_bcount = bcount;
    640 		more = 0;
    641 	}
    642 
    643 	/* only break up the I/O if we're not built on another metadevice */
    644 	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
    645 		new_bcount = md_maxphys;
    646 		more = 1;
    647 	}
    648 	if (bp != (buf_t *)NULL) {
    649 		/* do bp updates */
    650 		bp->b_bcount = new_bcount;
    651 		bp->b_lblkno = new_blkno;
    652 		bp->b_edev = md_dev64_to_dev(new_edev);
    653 	}
    654 	return (more);
    655 }
    656 
    657 /*
    658  * FUNCTION:	sp_validate()
    659  * INPUT:	un	- unit structure to be validated.
    660  * OUTPUT:	none.
    661  * RETURNS:	0	- soft partition ok.
    662  *		-1	- error.
    663  * PURPOSE:	called on open to sanity check the soft partition.  In
    664  *		order to open a soft partition:
    665  *		- it must have at least one extent
    666  *		- the extent info in core and on disk must match
    667  *		- it may not be in an intermediate state (which would
    668  *		  imply that a two-phase commit was interrupted)
    669  *
    670  *		If the extent checking fails (B_ERROR returned from the read
    671  *		strategy call) _and_ we're a multi-owner diskset, we send a
    672  *		message to the master so that all nodes inherit the same view
    673  *		of the soft partition.
    674  *		If we are checking a soft-part that is marked as in error, and
    675  *		we can actually read and validate the watermarks we send a
    676  *		message to clear the error to the master node.
    677  */
    678 static int
    679 sp_validate(mp_unit_t *un)
    680 {
    681 	uint_t		ext;
    682 	struct buf	*buf;
    683 	sp_ext_length_t	len;
    684 	mp_watermark_t	*wm;
    685 	set_t		setno;
    686 	int		reset_error = 0;
    687 
    688 	setno = MD_UN2SET(un);
    689 
    690 	/* sanity check unit structure components ?? */
    691 	if (un->un_status != MD_SP_OK) {
    692 		if (un->un_status != MD_SP_ERR) {
    693 			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
    694 			    "status is %u.",
    695 			    md_shortname(MD_SID(un)),
    696 			    un->un_status);
    697 			return (-1);
    698 		} else {
    699 			cmn_err(CE_WARN, "md: %s: open of soft partition "
    700 			    "in Errored state.",
    701 			    md_shortname(MD_SID(un)));
    702 			reset_error = 1;
    703 		}
    704 	}
    705 
    706 	if (un->un_numexts == 0) {
    707 		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
    708 		    "not have any extents.", md_shortname(MD_SID(un)));
    709 		return (-1);
    710 	}
    711 
    712 	len = 0LL;
    713 	for (ext = 0; ext < un->un_numexts; ext++) {
    714 
    715 		/* tally extent lengths to check total size */
    716 		len += un->un_ext[ext].un_len;
    717 
    718 		/* allocate buffer for watermark */
    719 		buf = getrbuf(KM_SLEEP);
    720 
    721 		/* read watermark */
    722 		buf->b_flags = B_READ;
    723 		buf->b_edev = md_dev64_to_dev(un->un_dev);
    724 		buf->b_iodone = NULL;
    725 		buf->b_proc = NULL;
    726 		buf->b_bcount = sizeof (mp_watermark_t);
    727 		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
    728 		buf->b_bufsize = sizeof (mp_watermark_t);
    729 		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
    730 		    KM_SLEEP);
    731 
    732 		/*
    733 		 * make the call non-blocking so that it is not affected
    734 		 * by a set take.
    735 		 */
    736 		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
    737 		(void) biowait(buf);
    738 
    739 		if (buf->b_flags & B_ERROR) {
    740 			cmn_err(CE_WARN, "md: %s: open failed, could not "
    741 			    "read watermark at block %llu for extent %u, "
    742 			    "error %d.", md_shortname(MD_SID(un)),
    743 			    buf->b_lblkno, ext, buf->b_error);
    744 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
    745 			freerbuf(buf);
    746 
    747 			/*
    748 			 * If we're a multi-owner diskset we send a message
    749 			 * indicating that this soft-part has an invalid
    750 			 * extent to the master node. This ensures a consistent
    751 			 * view of the soft-part across the cluster.
    752 			 */
    753 			if (MD_MNSET_SETNO(setno)) {
    754 				sp_send_stat_err(un);
    755 			}
    756 			return (-1);
    757 		}
    758 
    759 		wm = (mp_watermark_t *)buf->b_un.b_addr;
    760 
    761 		/* make sure the checksum is correct first */
    762 		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
    763 		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
    764 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
    765 			    "at block %llu for extent %u does not have a "
    766 			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
    767 			    buf->b_lblkno, ext, wm->wm_checksum);
    768 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
    769 			freerbuf(buf);
    770 			return (-1);
    771 		}
    772 
    773 		if (wm->wm_magic != MD_SP_MAGIC) {
    774 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
    775 			    "at block %llu for extent %u does not have a "
    776 			    "valid watermark magic number, expected 0x%x, "
    777 			    "found 0x%x.", md_shortname(MD_SID(un)),
    778 			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
    779 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
    780 			freerbuf(buf);
    781 			return (-1);
    782 		}
    783 
    784 		/* make sure sequence number matches the current extent */
    785 		if (wm->wm_seq != ext) {
    786 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
    787 			    "at block %llu for extent %u has invalid "
    788 			    "sequence number %u.", md_shortname(MD_SID(un)),
    789 			    buf->b_lblkno, ext, wm->wm_seq);
    790 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
    791 			freerbuf(buf);
    792 			return (-1);
    793 		}
    794 
    795 		/* make sure watermark length matches unit structure */
    796 		if (wm->wm_length != un->un_ext[ext].un_len) {
    797 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
    798 			    "at block %llu for extent %u has inconsistent "
    799 			    "length, expected %llu, found %llu.",
    800 			    md_shortname(MD_SID(un)), buf->b_lblkno,
    801 			    ext, un->un_ext[ext].un_len,
    802 			    (u_longlong_t)wm->wm_length);
    803 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
    804 			freerbuf(buf);
    805 			return (-1);
    806 		}
    807 
    808 		/*
    809 		 * make sure the type is a valid soft partition and not
    810 		 * a free extent or the end.
    811 		 */
    812 		if (wm->wm_type != EXTTYP_ALLOC) {
    813 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
    814 			    "at block %llu for extent %u is not marked "
    815 			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
    816 			    buf->b_lblkno, ext, wm->wm_type);
    817 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
    818 			freerbuf(buf);
    819 			return (-1);
    820 		}
    821 		/* free up buffer */
    822 		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
    823 		freerbuf(buf);
    824 	}
    825 
    826 	if (len != un->un_length) {
    827 		cmn_err(CE_WARN, "md: %s: open failed, computed length "
    828 		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
    829 		    len, un->un_length);
    830 		return (-1);
    831 	}
    832 
    833 	/*
    834 	 * If we're a multi-owner set _and_ reset_error is set, we should clear
    835 	 * the error condition on all nodes in the set. Use SP_SETSTAT2 with
    836 	 * MD_SP_OK.
    837 	 */
    838 	if (MD_MNSET_SETNO(setno) && reset_error) {
    839 		sp_send_stat_ok(un);
    840 	}
    841 	return (0);
    842 }
    843 
    844 /*
    845  * FUNCTION:	sp_done()
    846  * INPUT:	child_buf	- buffer attached to child save structure.
    847  *				  this is the buffer on which I/O has just
    848  *				  completed.
    849  * OUTPUT:	none.
    850  * RETURNS:	0	- success.
    851  *		1	- error.
    852  * PURPOSE:	called on I/O completion.
    853  */
    854 static int
    855 sp_done(struct buf *child_buf)
    856 {
    857 	struct buf	*parent_buf;
    858 	mdi_unit_t	*ui;
    859 	md_spps_t	*ps;
    860 	md_spcs_t	*cs;
    861 
    862 	/* find the child save structure to which this buffer belongs */
    863 	cs = (md_spcs_t *)((caddr_t)child_buf -
    864 	    (sizeof (md_spcs_t) - sizeof (buf_t)));
    865 	/* now get the parent save structure */
    866 	ps = cs->cs_ps;
    867 	parent_buf = ps->ps_bp;
    868 
    869 	mutex_enter(&ps->ps_mx);
    870 	/* pass any errors back up to the parent */
    871 	if (child_buf->b_flags & B_ERROR) {
    872 		ps->ps_flags |= MD_SPPS_ERROR;
    873 		parent_buf->b_error = child_buf->b_error;
    874 	}
    875 	/* mapout, if needed */
    876 	if (child_buf->b_flags & B_REMAPPED)
    877 		bp_mapout(child_buf);
    878 
    879 	ps->ps_frags--;
    880 	if (ps->ps_frags != 0) {
    881 		/*
    882 		 * if this parent has more children, we just free the
    883 		 * child and return.
    884 		 */
    885 		kmem_cache_free(sp_child_cache, cs);
    886 		mutex_exit(&ps->ps_mx);
    887 		return (1);
    888 	}
    889 	/* there are no more children */
    890 	kmem_cache_free(sp_child_cache, cs);
    891 	if (ps->ps_flags & MD_SPPS_ERROR) {
    892 		sp_error(ps);
    893 		return (1);
    894 	}
    895 	ui = ps->ps_ui;
    896 	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
    897 		mutex_exit(&ps->ps_mx);
    898 	} else {
    899 		/*
    900 		 * this should only ever happen if we are panicking,
    901 		 * since DONTFREE is only set on the parent if panicstr
    902 		 * is non-NULL.
    903 		 */
    904 		ASSERT(panicstr);
    905 	}
    906 	SPPS_FREE(sp_parent_cache, ps);
    907 	md_kstat_done(ui, parent_buf, 0);
    908 	md_unit_readerexit(ui);
    909 	md_biodone(parent_buf);
    910 	return (0);
    911 }
    912 
    913 /*
    914  * FUNCTION:	md_sp_strategy()
    915  * INPUT:	parent_buf	- parent buffer
    916  *		flag		- flags
    917  *		private		- private data
    918  * OUTPUT:	none.
    919  * RETURNS:	void.
    920  * PURPOSE:	Soft partitioning I/O strategy.  Performs the main work
    921  *		needed to do I/O to a soft partition.  The basic
    922  *		algorithm is as follows:
    923  *			- Allocate a child save structure to keep track
    924  *			  of the I/O we are going to pass down.
    925  *			- Map the I/O to the correct extent in the soft
    926  *			  partition (see sp_mapbuf()).
    927  *			- bioclone() the buffer and pass it down the
    928  *			  stack using md_call_strategy.
    929  *			- If the I/O needs to split across extents,
    930  *			  repeat the above steps until all fragments
    931  *			  are finished.
    932  */
    933 static void
    934 md_sp_strategy(buf_t *parent_buf, int flag, void *private)
    935 {
    936 	md_spps_t	*ps;
    937 	md_spcs_t	*cs;
    938 	int		more;
    939 	mp_unit_t	*un;
    940 	mdi_unit_t	*ui;
    941 	size_t		current_count;
    942 	off_t		current_offset;
    943 	sp_ext_offset_t	current_blkno;
    944 	buf_t		*child_buf;
    945 	set_t		setno = MD_MIN2SET(getminor(parent_buf->b_edev));
    946 	int		strat_flag = flag;
    947 
    948 	/*
    949 	 * When doing IO to a multi owner meta device, check if set is halted.
    950 	 * We do this check without the needed lock held, for performance
    951 	 * reasons.
    952 	 * If an IO just slips through while the set is locked via an
    953 	 * MD_MN_SUSPEND_SET, we don't care about it.
    954 	 * Only check for suspension if we are a top-level i/o request
    955 	 * (MD_STR_NOTTOP is cleared in 'flag');
    956 	 */
    957 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
    958 	    (MD_SET_HALTED | MD_SET_MNSET)) {
    959 		if ((flag & MD_STR_NOTTOP) == 0) {
    960 			mutex_enter(&md_mx);
    961 			/* Here we loop until the set is no longer halted */
    962 			while (md_set[setno].s_status & MD_SET_HALTED) {
    963 				cv_wait(&md_cv, &md_mx);
    964 			}
    965 			mutex_exit(&md_mx);
    966 		}
    967 	}
    968 
    969 	ui = MDI_UNIT(getminor(parent_buf->b_edev));
    970 
    971 	md_kstat_waitq_enter(ui);
    972 
    973 	un = (mp_unit_t *)md_unit_readerlock(ui);
    974 
    975 	if ((flag & MD_NOBLOCK) == 0) {
    976 		if (md_inc_iocount(setno) != 0) {
    977 			parent_buf->b_flags |= B_ERROR;
    978 			parent_buf->b_error = ENXIO;
    979 			parent_buf->b_resid = parent_buf->b_bcount;
    980 			md_kstat_waitq_exit(ui);
    981 			md_unit_readerexit(ui);
    982 			biodone(parent_buf);
    983 			return;
    984 		}
    985 	} else {
    986 		md_inc_iocount_noblock(setno);
    987 	}
    988 
    989 	if (!(flag & MD_STR_NOTTOP)) {
    990 		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
    991 			md_kstat_waitq_exit(ui);
    992 			return;
    993 		}
    994 	}
    995 
    996 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
    997 	sp_parent_init(ps);
    998 
    999 	/*
   1000 	 * Save essential information from the original buffhdr
   1001 	 * in the parent.
   1002 	 */
   1003 	ps->ps_un = un;
   1004 	ps->ps_ui = ui;
   1005 	ps->ps_bp = parent_buf;
   1006 	ps->ps_addr = parent_buf->b_un.b_addr;
   1007 
   1008 	current_count = parent_buf->b_bcount;
   1009 	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
   1010 	current_offset  = 0;
   1011 
   1012 	/*
   1013 	 * if we are at the top and we are panicking,
   1014 	 * we don't free in order to save state.
   1015 	 */
   1016 	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
   1017 		ps->ps_flags |= MD_SPPS_DONTFREE;
   1018 
   1019 	md_kstat_waitq_to_runq(ui);
   1020 
   1021 	ps->ps_frags++;
   1022 
   1023 	/*
   1024 	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
   1025 	 * metadevice.
   1026 	 */
   1027 	if (ui->ui_tstate & MD_ABR_CAP)
   1028 		strat_flag |= MD_STR_ABR;
   1029 
   1030 	/*
   1031 	 * this loop does the main work of an I/O.  we allocate a
   1032 	 * a child save for each buf, do the logical to physical
   1033 	 * mapping, decide if we need to frag the I/O, clone the
   1034 	 * new I/O to pass down the stack.  repeat until we've
   1035 	 * taken care of the entire buf that was passed to us.
   1036 	 */
   1037 	do {
   1038 		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
   1039 		sp_child_init(cs);
   1040 		child_buf = &cs->cs_buf;
   1041 		cs->cs_ps = ps;
   1042 
   1043 		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
   1044 		if (more == -1) {
   1045 			parent_buf->b_flags |= B_ERROR;
   1046 			parent_buf->b_error = EIO;
   1047 			md_kstat_done(ui, parent_buf, 0);
   1048 			md_unit_readerexit(ui);
   1049 			md_biodone(parent_buf);
   1050 			kmem_cache_free(sp_parent_cache, ps);
   1051 			return;
   1052 		}
   1053 
   1054 		child_buf = md_bioclone(parent_buf, current_offset,
   1055 		    child_buf->b_bcount, child_buf->b_edev,
   1056 		    child_buf->b_blkno, sp_done, child_buf,
   1057 		    KM_NOSLEEP);
   1058 		/* calculate new offset, counts, etc... */
   1059 		current_offset += child_buf->b_bcount;
   1060 		current_count -=  child_buf->b_bcount;
   1061 		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));
   1062 
   1063 		if (more) {
   1064 			mutex_enter(&ps->ps_mx);
   1065 			ps->ps_frags++;
   1066 			mutex_exit(&ps->ps_mx);
   1067 		}
   1068 
   1069 		md_call_strategy(child_buf, strat_flag, private);
   1070 	} while (more);
   1071 
   1072 	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
   1073 		while (!(ps->ps_flags & MD_SPPS_DONE)) {
   1074 			md_daemon(1, &md_done_daemon);
   1075 		}
   1076 		kmem_cache_free(sp_parent_cache, ps);
   1077 	}
   1078 }
   1079 
   1080 /*
   1081  * FUNCTION:	sp_directed_read()
   1082  * INPUT:	mnum	- minor number
   1083  *		vdr	- vol_directed_rd_t from user
   1084  *		mode	- access mode for copying data out.
   1085  * OUTPUT:	none.
   1086  * RETURNS:	0	- success
   1087  *		Exxxxx	- failure error-code
   1088  * PURPOSE:	Construct the necessary sub-device i/o requests to perform the
   1089  *		directed read as requested by the user. This is essentially the
   1090  *		same as md_sp_strategy() with the exception being that the
   1091  *		underlying 'md_call_strategy' is replaced with an ioctl call.
   1092  */
   1093 int
   1094 sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
   1095 {
   1096 	md_spps_t	*ps;
   1097 	md_spcs_t	*cs;
   1098 	int		more;
   1099 	mp_unit_t	*un;
   1100 	mdi_unit_t	*ui;
   1101 	size_t		current_count;
   1102 	off_t		current_offset;
   1103 	sp_ext_offset_t	current_blkno;
   1104 	buf_t		*child_buf, *parent_buf;
   1105 	void		*kbuffer;
   1106 	vol_directed_rd_t	cvdr;
   1107 	caddr_t		userbuf;
   1108 	offset_t	useroff;
   1109 	int		ret = 0;
   1110 
   1111 	ui = MDI_UNIT(mnum);
   1112 
   1113 	md_kstat_waitq_enter(ui);
   1114 
   1115 	bzero(&cvdr, sizeof (cvdr));
   1116 
   1117 	un = (mp_unit_t *)md_unit_readerlock(ui);
   1118 
   1119 	/*
   1120 	 * Construct a parent_buf header which reflects the user-supplied
   1121 	 * request.
   1122 	 */
   1123 
   1124 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
   1125 	if (kbuffer == NULL) {
   1126 		vdr->vdr_flags |= DKV_DMR_ERROR;
   1127 		md_kstat_waitq_exit(ui);
   1128 		md_unit_readerexit(ui);
   1129 		return (ENOMEM);
   1130 	}
   1131 
   1132 	parent_buf = getrbuf(KM_NOSLEEP);
   1133 	if (parent_buf == NULL) {
   1134 		vdr->vdr_flags |= DKV_DMR_ERROR;
   1135 		md_kstat_waitq_exit(ui);
   1136 		md_unit_readerexit(ui);
   1137 		kmem_free(kbuffer, vdr->vdr_nbytes);
   1138 		return (ENOMEM);
   1139 	}
   1140 	parent_buf->b_un.b_addr = kbuffer;
   1141 	parent_buf->b_flags = B_READ;
   1142 	parent_buf->b_bcount = vdr->vdr_nbytes;
   1143 	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
   1144 	parent_buf->b_edev = un->un_dev;
   1145 
   1146 
   1147 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
   1148 	sp_parent_init(ps);
   1149 
   1150 	/*
   1151 	 * Save essential information from the original buffhdr
   1152 	 * in the parent.
   1153 	 */
   1154 	ps->ps_un = un;
   1155 	ps->ps_ui = ui;
   1156 	ps->ps_bp = parent_buf;
   1157 	ps->ps_addr = parent_buf->b_un.b_addr;
   1158 
   1159 	current_count = parent_buf->b_bcount;
   1160 	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
   1161 	current_offset  = 0;
   1162 
   1163 	md_kstat_waitq_to_runq(ui);
   1164 
   1165 	ps->ps_frags++;
   1166 	vdr->vdr_bytesread = 0;
   1167 
   1168 	/*
   1169 	 * this loop does the main work of an I/O.  we allocate a
   1170 	 * a child save for each buf, do the logical to physical
   1171 	 * mapping, decide if we need to frag the I/O, clone the
   1172 	 * new I/O to pass down the stack.  repeat until we've
   1173 	 * taken care of the entire buf that was passed to us.
   1174 	 */
   1175 	do {
   1176 		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
   1177 		sp_child_init(cs);
   1178 		child_buf = &cs->cs_buf;
   1179 		cs->cs_ps = ps;
   1180 
   1181 		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
   1182 		if (more == -1) {
   1183 			ret = EIO;
   1184 			vdr->vdr_flags |= DKV_DMR_SHORT;
   1185 			kmem_cache_free(sp_child_cache, cs);
   1186 			goto err_out;
   1187 		}
   1188 
   1189 		cvdr.vdr_flags = vdr->vdr_flags;
   1190 		cvdr.vdr_side = vdr->vdr_side;
   1191 		cvdr.vdr_nbytes = child_buf->b_bcount;
   1192 		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
   1193 		/* Work out where we are in the allocated buffer */
   1194 		useroff = (offset_t)(uintptr_t)kbuffer;
   1195 		useroff = useroff + (offset_t)current_offset;
   1196 		cvdr.vdr_data = (void *)(uintptr_t)useroff;
   1197 		child_buf = md_bioclone(parent_buf, current_offset,
   1198 		    child_buf->b_bcount, child_buf->b_edev,
   1199 		    child_buf->b_blkno, NULL,
   1200 		    child_buf, KM_NOSLEEP);
   1201 		/* calculate new offset, counts, etc... */
   1202 		current_offset += child_buf->b_bcount;
   1203 		current_count -=  child_buf->b_bcount;
   1204 		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));
   1205 
   1206 		if (more) {
   1207 			mutex_enter(&ps->ps_mx);
   1208 			ps->ps_frags++;
   1209 			mutex_exit(&ps->ps_mx);
   1210 		}
   1211 
   1212 		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
   1213 		    (mode | FKIOCTL), NULL);
   1214 
   1215 		/*
   1216 		 * Free the child structure as we've finished with it.
   1217 		 * Normally this would be done by sp_done() but we're just
   1218 		 * using md_bioclone() to segment the transfer and we never
   1219 		 * issue a strategy request so the iodone will not be called.
   1220 		 */
   1221 		kmem_cache_free(sp_child_cache, cs);
   1222 		if (ret == 0) {
   1223 			/* copyout the returned data to vdr_data + offset */
   1224 			userbuf = (caddr_t)kbuffer;
   1225 			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
   1226 			if (ddi_copyout(userbuf, vdr->vdr_data,
   1227 			    cvdr.vdr_bytesread, mode)) {
   1228 				ret = EFAULT;
   1229 				goto err_out;
   1230 			}
   1231 			vdr->vdr_bytesread += cvdr.vdr_bytesread;
   1232 		} else {
   1233 			goto err_out;
   1234 		}
   1235 	} while (more);
   1236 
   1237 	/*
   1238 	 * Update the user-supplied vol_directed_rd_t structure with the
   1239 	 * contents of the last issued child request.
   1240 	 */
   1241 	vdr->vdr_flags = cvdr.vdr_flags;
   1242 	vdr->vdr_side = cvdr.vdr_side;
   1243 	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);
   1244 
   1245 err_out:
   1246 	if (ret != 0) {
   1247 		vdr->vdr_flags |= DKV_DMR_ERROR;
   1248 	}
   1249 	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
   1250 		vdr->vdr_flags |= DKV_DMR_SHORT;
   1251 	}
   1252 	kmem_cache_free(sp_parent_cache, ps);
   1253 	kmem_free(kbuffer, vdr->vdr_nbytes);
   1254 	freerbuf(parent_buf);
   1255 	md_unit_readerexit(ui);
   1256 	return (ret);
   1257 }
   1258 
   1259 /*
   1260  * FUNCTION:	sp_snarf()
   1261  * INPUT:	cmd	- snarf cmd.
   1262  *		setno	- set number.
   1263  * OUTPUT:	none.
   1264  * RETURNS:	1	- soft partitions were snarfed.
   1265  *		0	- no soft partitions were snarfed.
   1266  * PURPOSE:	Snarf soft partition metadb records into their in-core
   1267  *		structures.  This routine is called at "snarf time" when
   1268  *		md loads and gets all metadevices records into memory.
   1269  *		The basic algorithm is simply to walk the soft partition
   1270  *		records in the metadb and call the soft partitioning
   1271  *		build_incore routine to set up the in-core structures.
   1272  */
   1273 static int
   1274 sp_snarf(md_snarfcmd_t cmd, set_t setno)
   1275 {
   1276 	mp_unit_t	*un;
   1277 	mddb_recid_t	recid;
   1278 	int		gotsomething;
   1279 	int		all_sp_gotten;
   1280 	mddb_type_t	rec_type;
   1281 	mddb_de_ic_t	*dep;
   1282 	mddb_rb32_t	*rbp;
   1283 	mp_unit_t	*big_un;
   1284 	mp_unit32_od_t	*small_un;
   1285 	size_t		newreqsize;
   1286 
   1287 
   1288 	if (cmd == MD_SNARF_CLEANUP)
   1289 		return (0);
   1290 
   1291 	all_sp_gotten = 1;
   1292 	gotsomething = 0;
   1293 
   1294 	/* get the record type */
   1295 	rec_type = (mddb_type_t)md_getshared_key(setno,
   1296 	    sp_md_ops.md_driver.md_drivername);
   1297 	recid = mddb_makerecid(setno, 0);
   1298 
   1299 	/*
   1300 	 * walk soft partition records in the metadb and call
   1301 	 * sp_build_incore to build in-core structures.
   1302 	 */
   1303 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
   1304 		/* if we've already gotten this record, go to the next one */
   1305 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
   1306 			continue;
   1307 
   1308 
   1309 		dep = mddb_getrecdep(recid);
   1310 		dep->de_flags = MDDB_F_SOFTPART;
   1311 		rbp = dep->de_rb;
   1312 
   1313 		switch (rbp->rb_revision) {
   1314 		case MDDB_REV_RB:
   1315 		case MDDB_REV_RBFN:
   1316 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
   1317 				/*
   1318 				 * This means, we have an old and small record.
   1319 				 * And this record hasn't already been converted
   1320 				 * :-o before we create an incore metadevice
   1321 				 * from this we have to convert it to a big
   1322 				 * record.
   1323 				 */
   1324 				small_un =
   1325 				    (mp_unit32_od_t *)mddb_getrecaddr(recid);
   1326 				newreqsize = sizeof (mp_unit_t) +
   1327 				    ((small_un->un_numexts - 1) *
   1328 				    sizeof (struct mp_ext));
   1329 				big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
   1330 				    KM_SLEEP);
   1331 				softpart_convert((caddr_t)small_un,
   1332 				    (caddr_t)big_un, SMALL_2_BIG);
   1333 				kmem_free(small_un, dep->de_reqsize);
   1334 				dep->de_rb_userdata = big_un;
   1335 				dep->de_reqsize = newreqsize;
   1336 				rbp->rb_private |= MD_PRV_CONVD;
   1337 				un = big_un;
   1338 			} else {
   1339 				/* Record has already been converted */
   1340 				un = (mp_unit_t *)mddb_getrecaddr(recid);
   1341 			}
   1342 			un->c.un_revision &= ~MD_64BIT_META_DEV;
   1343 			break;
   1344 		case MDDB_REV_RB64:
   1345 		case MDDB_REV_RB64FN:
   1346 			/* Large device */
   1347 			un = (mp_unit_t *)mddb_getrecaddr(recid);
   1348 			un->c.un_revision |= MD_64BIT_META_DEV;
   1349 			un->c.un_flag |= MD_EFILABEL;
   1350 			break;
   1351 		}
   1352 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
   1353 
   1354 		/*
   1355 		 * Create minor node for snarfed entry.
   1356 		 */
   1357 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
   1358 
   1359 		if (MD_UNIT(MD_SID(un)) != NULL) {
   1360 			/* unit is already in-core */
   1361 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
   1362 			continue;
   1363 		}
   1364 		all_sp_gotten = 0;
   1365 		if (sp_build_incore((void *)un, 1) == 0) {
   1366 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
   1367 			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
   1368 			gotsomething = 1;
   1369 		}
   1370 	}
   1371 
   1372 	if (!all_sp_gotten)
   1373 		return (gotsomething);
   1374 	/* double-check records */
   1375 	recid = mddb_makerecid(setno, 0);
   1376 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
   1377 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
   1378 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
   1379 
   1380 	return (0);
   1381 }
   1382 
   1383 /*
   1384  * FUNCTION:	sp_halt()
   1385  * INPUT:	cmd	- halt cmd.
   1386  *		setno	- set number.
   1387  * RETURNS:	0	- success.
   1388  *		1	- err.
   1389  * PURPOSE:	Perform driver halt operations.  As with stripe, we
   1390  *		support MD_HALT_CHECK and MD_HALT_DOIT.  The first
   1391  *		does a check to see if halting can be done safely
   1392  *		(no open soft partitions), the second cleans up and
   1393  *		shuts down the driver.
   1394  */
   1395 static int
   1396 sp_halt(md_haltcmd_t cmd, set_t setno)
   1397 {
   1398 	int		i;
   1399 	mdi_unit_t	*ui;
   1400 	minor_t		mnum;
   1401 
   1402 	if (cmd == MD_HALT_CLOSE)
   1403 		return (0);
   1404 
   1405 	if (cmd == MD_HALT_OPEN)
   1406 		return (0);
   1407 
   1408 	if (cmd == MD_HALT_UNLOAD)
   1409 		return (0);
   1410 
   1411 	if (cmd == MD_HALT_CHECK) {
   1412 		for (i = 0; i < md_nunits; i++) {
   1413 			mnum = MD_MKMIN(setno, i);
   1414 			if ((ui = MDI_UNIT(mnum)) == NULL)
   1415 				continue;
   1416 			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
   1417 				continue;
   1418 			if (md_unit_isopen(ui))
   1419 				return (1);
   1420 		}
   1421 		return (0);
   1422 	}
   1423 
   1424 	if (cmd != MD_HALT_DOIT)
   1425 		return (1);
   1426 
   1427 	for (i = 0; i < md_nunits; i++) {
   1428 		mnum = MD_MKMIN(setno, i);
   1429 		if ((ui = MDI_UNIT(mnum)) == NULL)
   1430 			continue;
   1431 		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
   1432 			continue;
   1433 		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
   1434 	}
   1435 
   1436 	return (0);
   1437 }
   1438 
   1439 /*
   1440  * FUNCTION:	sp_open_dev()
   1441  * INPUT:	un	- unit structure.
   1442  *		oflags	- open flags.
   1443  * OUTPUT:	none.
   1444  * RETURNS:	0		- success.
   1445  *		non-zero	- err.
   1446  * PURPOSE:	open underlying device via md_layered_open.
   1447  */
   1448 static int
   1449 sp_open_dev(mp_unit_t *un, int oflags)
   1450 {
   1451 	minor_t		mnum = MD_SID(un);
   1452 	int		err;
   1453 	md_dev64_t	tmpdev;
   1454 	set_t		setno = MD_MIN2SET(MD_SID(un));
   1455 	side_t		side = mddb_getsidenum(setno);
   1456 
   1457 	tmpdev = un->un_dev;
   1458 	/*
   1459 	 * Do the open by device id if underlying is regular
   1460 	 */
   1461 	if ((md_getmajor(tmpdev) != md_major) &&
   1462 	    md_devid_found(setno, side, un->un_key) == 1) {
   1463 		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
   1464 	}
   1465 	err = md_layered_open(mnum, &tmpdev, oflags);
   1466 	un->un_dev = tmpdev;
   1467 
   1468 	if (err)
   1469 		return (ENXIO);
   1470 
   1471 	return (0);
   1472 }
   1473 
/*
 * FUNCTION:	sp_open()
 * INPUT:	dev		- device to open.
 *		flag		- pass-through flag.
 *		otyp		- pass-through open type.
 *		cred_p		- credentials (unused).
 *		md_oflags	- open flags; MD_OFLG_PROBEDEV selects a
 *				  probe open, which skips sp_validate().
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		ENXIO		- multi-owner set is in a starting
 *				  reconfig cycle, or the underlying device
 *				  could not be opened.
 *		EIO		- sp_validate() failed.
 *		other non-zero	- error from md_unit_incopen().
 * PURPOSE:	open a soft partition.  On the first open (or on a probe)
 *		the underlying device is opened and, unless probing, the
 *		soft partition is validated before the open is counted.
 */
/* ARGSUSED */
static int
sp_open(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p,
	int		md_oflags
)
{
	minor_t		mnum = getminor(*dev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mp_unit_t	*un;
	int		err = 0;
	set_t		setno;

	/*
	 * When doing an open of a multi owner metadevice, check to see if this
	 * node is a starting node and if a reconfig cycle is underway.
	 * If so, the system isn't sufficiently set up enough to handle the
	 * open (which involves I/O during sp_validate), so fail with ENXIO.
	 */
	setno = MD_MIN2SET(mnum);
	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
			return (ENXIO);
	}

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);
	/* from here on the set number is taken from the unit itself */
	setno = MD_UN2SET(un);

	/* open underlying device, if necessary */
	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
		if ((err = sp_open_dev(un, md_oflags)) != 0)
			goto out;

		if (MD_MNSET_SETNO(setno)) {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
				/*
				 * Don't call sp_validate while
				 * unit_openclose lock is held.  So, actually
				 * open the device, drop openclose lock,
				 * call sp_validate, reacquire openclose lock,
				 * and close the device.  If sp_validate
				 * succeeds, then device will be re-opened.
				 */
				if ((err = md_unit_incopen(mnum, flag,
				    otyp)) != 0)
					goto out;

				/*
				 * Flag the open as in progress before the
				 * openclose lock is dropped; the flag is
				 * cleared, and waiters on ui_cv are woken,
				 * once the lock has been re-acquired below.
				 */
				mutex_enter(&ui->ui_mx);
				ui->ui_lock |= MD_UL_OPENINPROGRESS;
				mutex_exit(&ui->ui_mx);
				md_unit_openclose_exit(ui);
				/*
				 * For non-layered opens the unit array lock
				 * is also released across the validate and
				 * re-taken as reader afterwards.
				 */
				if (otyp != OTYP_LYR)
					rw_exit(&md_unit_array_rw.lock);

				err = sp_validate(un);

				if (otyp != OTYP_LYR)
					rw_enter(&md_unit_array_rw.lock,
					    RW_READER);
				(void) md_unit_openclose_enter(ui);
				/* drop the temporary open count taken above */
				(void) md_unit_decopen(mnum, otyp);
				mutex_enter(&ui->ui_mx);
				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
				cv_broadcast(&ui->ui_cv);
				mutex_exit(&ui->ui_mx);
				/*
				 * Should be in the same state as before
				 * the sp_validate.
				 */
				if (err != 0) {
					/* close the device opened above */
					md_layered_close(un->un_dev, md_oflags);
					err = EIO;
					goto out;
				}
			}
			/*
			 * As we're a multi-owner metadevice we need to ensure
			 * that all nodes have the same idea of the status.
			 * sp_validate() will mark the device as errored (if
			 * it cannot read the watermark) or ok (if it was
			 * previously errored but the watermark is now valid).
			 * This code-path is only entered on the non-probe open
			 * so we will maintain the errored state during a probe
			 * call. This means the sys-admin must metarecover -m
			 * to reset the soft-partition error.
			 */
		} else {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
			    (err = sp_validate(un)) != 0) {
				/* close the device opened above */
				md_layered_close(un->un_dev, md_oflags);
				err = EIO;
				goto out;
			} else {
				/*
				 * we succeeded in validating the on disk
				 * format versus the in core, so reset the
				 * status if it's in error
				 *
				 * NOTE(review): this branch is also taken on
				 * a probe open, where sp_validate() was not
				 * called - confirm that clearing the error
				 * state in that case is intended.
				 */
				if (un->un_status == MD_SP_ERR) {
					un->un_status = MD_SP_OK;
				}
			}
		}
	}

	/* count open */
	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
		goto out;

out:
	/* common exit: release the openclose lock taken above */
	md_unit_openclose_exit(ui);
	return (err);
}
   1607 
   1608 /*
   1609  * FUNCTION:	sp_close()
   1610  * INPUT:	dev		- device to close.
   1611  *		flag		- pass-through flag.
   1612  *		otyp		- pass-through type.
   1613  *		cred_p		- credentials.
   1614  *		md_cflags	- close flags.
   1615  * OUTPUT:	none.
   1616  * RETURNS:	0		- success.
   1617  *		non-zero	- err.
   1618  * PURPOSE:	close a soft paritition.
   1619  */
   1620 /* ARGSUSED */
   1621 static int
   1622 sp_close(
   1623 	dev_t		dev,
   1624 	int		flag,
   1625 	int		otyp,
   1626 	cred_t		*cred_p,
   1627 	int		md_cflags
   1628 )
   1629 {
   1630 	minor_t		mnum = getminor(dev);
   1631 	mdi_unit_t	*ui = MDI_UNIT(mnum);
   1632 	mp_unit_t	*un;
   1633 	int		err = 0;
   1634 
   1635 	/* grab necessary locks */
   1636 	un = (mp_unit_t *)md_unit_openclose_enter(ui);
   1637 
   1638 	/* count closed */
   1639 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
   1640 		goto out;
   1641 
   1642 	/* close devices, if necessary */
   1643 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
   1644 		md_layered_close(un->un_dev, md_cflags);
   1645 	}
   1646 
   1647 	/*
   1648 	 * If a MN set and transient capabilities (eg ABR/DMR) are set,
   1649 	 * clear these capabilities if this is the last close in
   1650 	 * the cluster
   1651 	 */
   1652 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
   1653 	    (ui->ui_tstate & MD_ABR_CAP)) {
   1654 		md_unit_openclose_exit(ui);
   1655 		mdmn_clear_all_capabilities(mnum);
   1656 		return (0);
   1657 	}
   1658 	/* unlock, return success */
   1659 out:
   1660 	md_unit_openclose_exit(ui);
   1661 	return (err);
   1662 }
   1663 
   1664 
   1665 /* used in sp_dump routine */
   1666 static struct buf dumpbuf;
   1667 
   1668 /*
   1669  * FUNCTION:	sp_dump()
   1670  * INPUT:	dev	- device to dump to.
   1671  *		addr	- address to dump.
   1672  *		blkno	- blkno on device.
   1673  *		nblk	- number of blocks to dump.
   1674  * OUTPUT:	none.
   1675  * RETURNS:	result from bdev_dump.
   1676  * PURPOSE:  This routine dumps memory to the disk.  It assumes that
   1677  *           the memory has already been mapped into mainbus space.
   1678  *           It is called at disk interrupt priority when the system
   1679  *           is in trouble.
   1680  *           NOTE: this function is defined using 32-bit arguments,
   1681  *           but soft partitioning is internally 64-bit.  Arguments
   1682  *           are casted where appropriate.
   1683  */
   1684 static int
   1685 sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
   1686 {
   1687 	mp_unit_t	*un;
   1688 	buf_t		*bp;
   1689 	sp_ext_length_t	nb;
   1690 	daddr_t		mapblk;
   1691 	int		result;
   1692 	int		more;
   1693 	int		saveresult = 0;
   1694 
   1695 	/*
   1696 	 * Don't need to grab the unit lock.
   1697 	 * Cause nothing else is supposed to be happenning.
   1698 	 * Also dump is not supposed to sleep.
   1699 	 */
   1700 	un = (mp_unit_t *)MD_UNIT(getminor(dev));
   1701 
   1702 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
   1703 		return (EINVAL);
   1704 
   1705 	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
   1706 		return (EINVAL);
   1707 
   1708 	bp = &dumpbuf;
   1709 	nb = (sp_ext_length_t)dbtob(nblk);
   1710 	do {
   1711 		bzero((caddr_t)bp, sizeof (*bp));
   1712 		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
   1713 		nblk = (int)(btodb(bp->b_bcount));
   1714 		mapblk = bp->b_blkno;
   1715 		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
   1716 		if (result)
   1717 			saveresult = result;
   1718 
   1719 		nb -= bp->b_bcount;
   1720 		addr += bp->b_bcount;
   1721 		blkno += nblk;
   1722 	} while (more);
   1723 
   1724 	return (saveresult);
   1725 }
   1726 
   1727 static int
   1728 sp_imp_set(
   1729 	set_t	setno
   1730 )
   1731 {
   1732 	mddb_recid_t	recid;
   1733 	int		gotsomething;
   1734 	mddb_type_t	rec_type;
   1735 	mddb_de_ic_t	*dep;
   1736 	mddb_rb32_t	*rbp;
   1737 	mp_unit_t	*un64;
   1738 	mp_unit32_od_t	*un32;
   1739 	md_dev64_t	self_devt;
   1740 	minor_t		*self_id;	/* minor needs to be updated */
   1741 	md_parent_t	*parent_id;	/* parent needs to be updated */
   1742 	mddb_recid_t	*record_id;	/* record id needs to be updated */
   1743 
   1744 	gotsomething = 0;
   1745 
   1746 	rec_type = (mddb_type_t)md_getshared_key(setno,
   1747 	    sp_md_ops.md_driver.md_drivername);
   1748 	recid = mddb_makerecid(setno, 0);
   1749 
   1750 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
   1751 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
   1752 			continue;
   1753 
   1754 		dep = mddb_getrecdep(recid);
   1755 		rbp = dep->de_rb;
   1756 
   1757 		switch (rbp->rb_revision) {
   1758 		case MDDB_REV_RB:
   1759 		case MDDB_REV_RBFN:
   1760 			/*
   1761 			 * Small device
   1762 			 */
   1763 			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
   1764 			self_id = &(un32->c.un_self_id);
   1765 			parent_id = &(un32->c.un_parent);
   1766 			record_id = &(un32->c.un_record_id);
   1767 
   1768 			if (!md_update_minor(setno, mddb_getsidenum
   1769 			    (setno), un32->un_key))
   1770 				goto out;
   1771 			break;
   1772 
   1773 		case MDDB_REV_RB64:
   1774 		case MDDB_REV_RB64FN:
   1775 			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
   1776 			self_id = &(un64->c.un_self_id);
   1777 			parent_id = &(un64->c.un_parent);
   1778 			record_id = &(un64->c.un_record_id);
   1779 
   1780 			if (!md_update_minor(setno, mddb_getsidenum
   1781 			    (setno), un64->un_key))
   1782 				goto out;
   1783 			break;
   1784 		}
   1785 
   1786 		/*
   1787 		 * If this is a top level and a friendly name metadevice,
   1788 		 * update its minor in the namespace.
   1789 		 */
   1790 		if ((*parent_id == MD_NO_PARENT) &&
   1791 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
   1792 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
   1793 
   1794 			self_devt = md_makedevice(md_major, *self_id);
   1795 			if (!md_update_top_device_minor(setno,
   1796 			    mddb_getsidenum(setno), self_devt))
   1797 				goto out;
   1798 		}
   1799 
   1800 		/*
   1801 		 * Update unit with the imported setno
   1802 		 *
   1803 		 */
   1804 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
   1805 
   1806 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
   1807 		if (*parent_id != MD_NO_PARENT)
   1808 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
   1809 		*record_id = MAKERECID(setno, DBID(*record_id));
   1810 
   1811 		gotsomething = 1;
   1812 	}
   1813 
   1814 out:
   1815 	return (gotsomething);
   1816 }
   1817 
/*
 * Named services table referenced from sp_md_ops.  It holds only the
 * {NULL, 0} entry, i.e. this driver lists no named services.
 */
static md_named_services_t sp_named_services[] = {
	{NULL,					0}
};
   1821 
/*
 * Metadevice operations vector (md_ops_t, see mdvar.h) for the soft
 * partition driver.  Entries left NULL are operations for which this
 * driver supplies no routine.
 */
md_ops_t sp_md_ops = {
	sp_open,		/* open */
	sp_close,		/* close */
	md_sp_strategy,		/* strategy */
	NULL,			/* print */
	sp_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_sp_ioctl,		/* ioctl */
	sp_snarf,		/* snarf */
	sp_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	sp_imp_set,		/* import set */
	sp_named_services	/* named services */
};
   1838 
   1839 static void
   1840 init_init()
   1841 {
   1842 	sp_parent_cache = kmem_cache_create("md_softpart_parent",
   1843 	    sizeof (md_spps_t), 0, sp_parent_constructor,
   1844 	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
   1845 	sp_child_cache = kmem_cache_create("md_softpart_child",
   1846 	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
   1847 	    sp_child_constructor, sp_child_destructor, sp_run_queue,
   1848 	    NULL, NULL, 0);
   1849 }
   1850 
   1851 static void
   1852 fini_uninit()
   1853 {
   1854 	kmem_cache_destroy(sp_parent_cache);
   1855 	kmem_cache_destroy(sp_child_cache);
   1856 	sp_parent_cache = sp_child_cache = NULL;
   1857 }
   1858 
/*
 * Define the module linkage.  init_init() and fini_uninit() are passed to
 * the plugin macro; presumably they are run when the module is loaded and
 * unloaded respectively - see the macro definition to confirm.
 */
MD_PLUGIN_MISC_MODULE("soft partition module", init_init(), fini_uninit())
   1861