Home | History | Annotate | Download | only in stripe
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/param.h>
     28 #include <sys/systm.h>
     29 #include <sys/conf.h>
     30 #include <sys/file.h>
     31 #include <sys/user.h>
     32 #include <sys/uio.h>
     33 #include <sys/t_lock.h>
     34 #include <sys/buf.h>
     35 #include <sys/dkio.h>
     36 #include <sys/vtoc.h>
     37 #include <sys/kmem.h>
     38 #include <vm/page.h>
     39 #include <sys/cmn_err.h>
     40 #include <sys/sysmacros.h>
     41 #include <sys/types.h>
     42 #include <sys/mkdev.h>
     43 #include <sys/stat.h>
     44 #include <sys/open.h>
     45 #include <sys/lvm/mdio.h>
     46 #include <sys/lvm/mdvar.h>
     47 #include <sys/lvm/md_stripe.h>
     48 #include <sys/lvm/md_convert.h>
     49 #include <sys/lvm/md_notify.h>
     50 #include <sys/modctl.h>
     51 #include <sys/ddi.h>
     52 #include <sys/sunddi.h>
     53 #include <sys/debug.h>
     54 #include <sys/sysevent/eventdefs.h>
     55 #include <sys/sysevent/svm.h>
     56 
/* Operations vector for the stripe driver; referenced throughout this file. */
md_ops_t		stripe_md_ops;
#ifndef	lint
/* NOTE(review): presumably names the module this driver depends on - confirm */
char			_depends_on[] = "drv/md";
/* Exported pointer to this driver's operations vector. */
md_ops_t		*md_interface_ops = &stripe_md_ops;
#endif

/* Globals defined outside this file. */
extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

/* md_cv is waited on under md_mx while a multi-owner set is halted. */
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;

extern int		md_status;
extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;

/*
 * md_stripe_mcs_buf_off is subtracted from a child buf pointer in
 * stripe_done() to recover the enclosing md_scs_t.  The two caches supply
 * the parent (md_sps_t) and child (md_scs_t) structures used per request.
 */
static int		md_stripe_mcs_buf_off;
static kmem_cache_t	*stripe_parent_cache = NULL;
static kmem_cache_t	*stripe_child_cache = NULL;
     77 
     78 /*ARGSUSED1*/
     79 static int
     80 stripe_parent_constructor(void *p, void *d1, int d2)
     81 {
     82 	mutex_init(&((md_sps_t *)p)->ps_mx,
     83 	    NULL, MUTEX_DEFAULT, NULL);
     84 	return (0);
     85 }
     86 
     87 static void
     88 stripe_parent_init(void *ps)
     89 {
     90 	bzero(ps, offsetof(md_sps_t, ps_mx));
     91 }
     92 
     93 /*ARGSUSED1*/
     94 static void
     95 stripe_parent_destructor(void *p, void *d)
     96 {
     97 	mutex_destroy(&((md_sps_t *)p)->ps_mx);
     98 }
     99 
    100 /*ARGSUSED1*/
    101 static int
    102 stripe_child_constructor(void *p, void *d1, int d2)
    103 {
    104 	bioinit(&((md_scs_t *)p)->cs_buf);
    105 	return (0);
    106 }
    107 
    108 static void
    109 stripe_child_init(md_scs_t *cs)
    110 {
    111 	cs->cs_mdunit = 0;
    112 	cs->cs_ps = NULL;
    113 	cs->cs_comp = NULL;
    114 	md_bioreset(&cs->cs_buf);
    115 }
    116 
    117 /*ARGSUSED1*/
    118 static void
    119 stripe_child_destructor(void *p, void *d)
    120 {
    121 	biofini(&((md_scs_t *)p)->cs_buf);
    122 }
    123 
    124 /*ARGSUSED*/
    125 static void
    126 stripe_run_queue(void *d)
    127 {
    128 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
    129 		md_daemon(1, &md_done_daemon);
    130 }
    131 
    132 static void
    133 stripe_close_all_devs(ms_unit_t *un, int md_cflags)
    134 {
    135 	int		row;
    136 	int		i;
    137 	int		c;
    138 	struct ms_comp	*mdcomp;
    139 
    140 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
    141 	for (row = 0; row < un->un_nrows; row++) {
    142 		struct ms_row *mdr = &un->un_row[row];
    143 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
    144 			struct ms_comp	*mdc;
    145 			mdc = &mdcomp[c++];
    146 			if (md_cflags & MD_OFLG_PROBEDEV) {
    147 
    148 			/*
    149 			 * It is possible that the md_layered_open
    150 			 * failed because the stripe unit structure
    151 			 * contained a NODEV.  In such a case since
    152 			 * there is nothing to open, there is nothing
    153 			 * to close.
    154 			 */
    155 				if (mdc->un_dev == NODEV64)
    156 					continue;
    157 			}
    158 			if ((md_cflags & MD_OFLG_PROBEDEV) &&
    159 			    (mdc->un_mirror.ms_flags & MDM_S_PROBEOPEN)) {
    160 				md_layered_close(mdc->un_dev,
    161 				    md_cflags);
    162 				mdc->un_mirror.ms_flags &= ~MDM_S_PROBEOPEN;
    163 			} else if (mdc->un_mirror.ms_flags & MDM_S_ISOPEN) {
    164 				md_layered_close(mdc->un_dev, md_cflags);
    165 				mdc->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
    166 			}
    167 		}
    168 	}
    169 }
    170 
    171 static int
    172 stripe_open_all_devs(ms_unit_t *un, int md_oflags)
    173 {
    174 	minor_t		mnum = MD_SID(un);
    175 	int		row;
    176 	int		i;
    177 	int		c;
    178 	struct ms_comp	*mdcomp;
    179 	int		err;
    180 	int		cont_on_errors = (md_oflags & MD_OFLG_CONT_ERRS);
    181 	int		probe_err_cnt = 0;
    182 	int		total_comp_cnt = 0;
    183 	set_t		setno = MD_MIN2SET(MD_SID(un));
    184 	side_t		side = mddb_getsidenum(setno);
    185 	mdkey_t		key;
    186 
    187 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
    188 
    189 	/*
    190 	 * For a probe call, if any component of a stripe or a concat
    191 	 * can be opened, it is considered to be a success. The total number
    192 	 * of components in a stripe are computed prior to starting a probe.
    193 	 * This number is then compared against the number of components
    194 	 * that could be be successfully opened. If none of the components
    195 	 * in a stripe can be opened, only then an ENXIO is returned for a
    196 	 * probe type open.
    197 	 */
    198 
    199 	for (row = 0; row < un->un_nrows; row++) {
    200 		struct ms_row *mdr = &un->un_row[row];
    201 
    202 		if (md_oflags & MD_OFLG_PROBEDEV)
    203 			total_comp_cnt += mdr->un_ncomp;
    204 
    205 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
    206 			struct ms_comp	*mdc;
    207 			md_dev64_t tmpdev;
    208 
    209 			mdc = &mdcomp[c++];
    210 			tmpdev = mdc->un_dev;
    211 			/*
    212 			 * Do the open by device id
    213 			 * Check if this comp is hotspared and
    214 			 * if it is then use the key for hotspare.
    215 			 * MN disksets don't use devids, so we better don't use
    216 			 * md_devid_found/md_resolve_bydevid there. Rather do,
    217 			 * what's done in stripe_build_incore()
    218 			 */
    219 			if (MD_MNSET_SETNO(setno)) {
    220 				if (mdc->un_mirror.ms_hs_id != 0) {
    221 					(void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
    222 					    0, &mdc->un_mirror.ms_hs_id, NULL,
    223 					    &tmpdev, NULL);
    224 				}
    225 			} else {
    226 				key = mdc->un_mirror.ms_hs_id ?
    227 				    mdc->un_mirror.ms_hs_key : mdc->un_key;
    228 				if ((md_getmajor(tmpdev) != md_major) &&
    229 				    md_devid_found(setno, side, key) == 1) {
    230 					tmpdev = md_resolve_bydevid(mnum,
    231 					    tmpdev, key);
    232 				}
    233 			}
    234 
    235 			/*
    236 			 * For a submirror, we only want to open those devices
    237 			 * that are not errored. If the device is errored then
    238 			 * then there is no reason to open it and leaving it
    239 			 * closed allows the RCM/DR code to work so that the
    240 			 * errored device can be replaced.
    241 			 */
    242 			if ((md_oflags & MD_OFLG_PROBEDEV) ||
    243 			    ! (mdc->un_mirror.ms_state & CS_ERRED)) {
    244 
    245 				err = md_layered_open(mnum, &tmpdev, md_oflags);
    246 			} else {
    247 				err = ENXIO;
    248 			}
    249 
    250 			/*
    251 			 * Only set the un_dev if the tmpdev != NODEV64. If
    252 			 * it is NODEV64 then the md_layered_open() will have
    253 			 * failed in some manner.
    254 			 */
    255 			if (tmpdev != NODEV64)
    256 				mdc->un_dev = tmpdev;
    257 
    258 			if (err) {
    259 				if (!cont_on_errors) {
    260 					stripe_close_all_devs(un, md_oflags);
    261 					return (ENXIO);
    262 				}
    263 
    264 				if (md_oflags & MD_OFLG_PROBEDEV)
    265 					probe_err_cnt++;
    266 			} else {
    267 				if (md_oflags & MD_OFLG_PROBEDEV) {
    268 					mdc->un_mirror.ms_flags |=
    269 					    MDM_S_PROBEOPEN;
    270 				} else
    271 					mdc->un_mirror.ms_flags |= MDM_S_ISOPEN;
    272 			}
    273 		}
    274 	}
    275 
    276 	/* If every component in a stripe could not be opened fail */
    277 	if ((md_oflags & MD_OFLG_PROBEDEV) &&
    278 	    (probe_err_cnt == total_comp_cnt))
    279 		return (ENXIO);
    280 	else
    281 		return (0);
    282 }
    283 
    284 int
    285 stripe_build_incore(void *p, int snarfing)
    286 {
    287 	ms_unit_t *un = (ms_unit_t *)p;
    288 	struct ms_comp	*mdcomp;
    289 	minor_t		mnum;
    290 	int		row;
    291 	int		i;
    292 	int		c;
    293 	int		ncomps;
    294 
    295 	mnum = MD_SID(un);
    296 
    297 	if (MD_UNIT(mnum) != NULL)
    298 		return (0);
    299 
    300 	MD_STATUS(un) = 0;
    301 
    302 	/*
    303 	 * Reset all the is_open flags, these are probably set
    304 	 * cause they just came out of the database.
    305 	 */
    306 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
    307 
    308 	ncomps = 0;
    309 	for (row = 0; row < un->un_nrows; row++) {
    310 		struct ms_row *mdr = &un->un_row[row];
    311 		ncomps += mdr->un_ncomp;
    312 	}
    313 
    314 	for (row = 0; row < un->un_nrows; row++) {
    315 		struct ms_row *mdr = &un->un_row[row];
    316 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
    317 			struct ms_comp		*mdc;
    318 			set_t			setno;
    319 			md_dev64_t		tmpdev;
    320 
    321 			mdc = &mdcomp[c++];
    322 			mdc->un_mirror.ms_flags &=
    323 			    ~(MDM_S_ISOPEN | MDM_S_IOERR | MDM_S_RS_TRIED);
    324 
    325 			if (!snarfing)
    326 				continue;
    327 
    328 			setno = MD_MIN2SET(mnum);
    329 
    330 			tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
    331 			    mdc->un_key, MD_NOTRUST_DEVT);
    332 			mdc->un_dev = tmpdev;
    333 			/*
    334 			 * Check for hotspares. If the hotspares haven't been
    335 			 * snarfed yet, stripe_open_all_devs() will do the
    336 			 * remapping of the dev's later.
    337 			 */
    338 			if (mdc->un_mirror.ms_hs_id != 0) {
    339 				mdc->un_mirror.ms_orig_dev = mdc->un_dev;
    340 				(void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
    341 				    0, &mdc->un_mirror.ms_hs_id, NULL,
    342 				    &tmpdev, NULL);
    343 				mdc->un_dev = tmpdev;
    344 			}
    345 		}
    346 	}
    347 
    348 	/* place various information in the in-core data structures */
    349 	md_nblocks_set(mnum, un->c.un_total_blocks);
    350 	MD_UNIT(mnum) = un;
    351 
    352 	return (0);
    353 }
    354 
    355 void
    356 reset_stripe(ms_unit_t *un, minor_t mnum, int removing)
    357 {
    358 	ms_comp_t	*mdcomp;
    359 	struct ms_row	*mdr;
    360 	int		i, c;
    361 	int		row;
    362 	int		nsv;
    363 	int		isv;
    364 	sv_dev_t	*sv;
    365 	mddb_recid_t	*recids;
    366 	mddb_recid_t	vtoc_id;
    367 	int		rid = 0;
    368 
    369 	md_destroy_unit_incore(mnum, &stripe_md_ops);
    370 
    371 	md_nblocks_set(mnum, -1ULL);
    372 	MD_UNIT(mnum) = NULL;
    373 
    374 	/*
    375 	 * Attempt release of its minor node
    376 	 */
    377 	md_remove_minor_node(mnum);
    378 
    379 	if (!removing)
    380 		return;
    381 
    382 	nsv = 0;
    383 	/* Count the number of devices */
    384 	for (row = 0; row < un->un_nrows; row++) {
    385 		mdr = &un->un_row[row];
    386 		nsv += mdr->un_ncomp;
    387 	}
    388 	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t) * nsv, KM_SLEEP);
    389 
    390 	/*
    391 	 * allocate recids array.  since we may have to commit
    392 	 * underlying soft partition records, we need an array
    393 	 * of size: total number of components in stripe + 3
    394 	 * (one for the stripe itself, one for the hotspare, one
    395 	 * for the end marker).
    396 	 */
    397 	recids = kmem_alloc(sizeof (mddb_recid_t) * (nsv + 3), KM_SLEEP);
    398 
    399 	/*
    400 	 * Save the md_dev64_t's and driver nm indexes.
    401 	 * Because after the mddb_deleterec() we will
    402 	 * not be able to access the unit structure.
    403 	 *
    404 	 * NOTE: Deleting the names before deleting the
    405 	 *	 unit structure would cause problems if
    406 	 *	 the machine crashed in between the two.
    407 	 */
    408 	isv = 0;
    409 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
    410 
    411 	for (row = 0; row < un->un_nrows; row++) {
    412 		mdr = &un->un_row[row];
    413 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
    414 			struct ms_comp	*mdc;
    415 			md_dev64_t	child_dev;
    416 			md_unit_t	*child_un;
    417 
    418 			mdc = &mdcomp[c++];
    419 			if (mdc->un_mirror.ms_hs_id != 0) {
    420 				mdkey_t		hs_key;
    421 
    422 				hs_key = mdc->un_mirror.ms_hs_key;
    423 
    424 				mdc->un_dev = mdc->un_mirror.ms_orig_dev;
    425 				mdc->un_start_block =
    426 				    mdc->un_mirror.ms_orig_blk;
    427 				mdc->un_mirror.ms_hs_id = 0;
    428 				mdc->un_mirror.ms_hs_key = 0;
    429 				mdc->un_mirror.ms_orig_dev = 0;
    430 				recids[0] = 0;
    431 				recids[1] = 0;	/* recids[1] filled in below */
    432 				recids[2] = 0;
    433 				(void) md_hot_spare_ifc(HS_FREE, un->un_hsp_id,
    434 				    0, 0, &recids[0], &hs_key, NULL, NULL);
    435 				mddb_commitrecs_wrapper(recids);
    436 			}
    437 
    438 			/*
    439 			 * check if we've got metadevice below us and
    440 			 * deparent it if we do.
    441 			 * NOTE: currently soft partitions are the
    442 			 * the only metadevices stripes can be
    443 			 * built on top of.
    444 			 */
    445 			child_dev = mdc->un_dev;
    446 			if (md_getmajor(child_dev) == md_major) {
    447 				child_un = MD_UNIT(md_getminor(child_dev));
    448 				md_reset_parent(child_dev);
    449 				recids[rid++] = MD_RECID(child_un);
    450 			}
    451 
    452 			sv[isv].setno = MD_MIN2SET(mnum);
    453 			sv[isv++].key = mdc->un_key;
    454 		}
    455 	}
    456 
    457 	recids[rid++] = un->c.un_record_id;
    458 	recids[rid] = 0;	/* filled in below */
    459 
    460 	/*
    461 	 * Decrement the HSP reference count and
    462 	 * remove the knowledge of the HSP from the unit struct.
    463 	 * This is done atomically to remove a window.
    464 	 */
    465 	if (un->un_hsp_id != -1) {
    466 		(void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
    467 		    &recids[rid++], NULL, NULL, NULL);
    468 		un->un_hsp_id = -1;
    469 	}
    470 
    471 	/* set end marker and commit records */
    472 	recids[rid] = 0;
    473 	mddb_commitrecs_wrapper(recids);
    474 
    475 	vtoc_id = un->c.un_vtoc_id;
    476 
    477 	/*
    478 	 * Remove self from the namespace
    479 	 */
    480 	if (un->c.un_revision & MD_FN_META_DEV) {
    481 		(void) md_rem_selfname(un->c.un_self_id);
    482 	}
    483 
    484 	/* Remove the unit structure */
    485 	mddb_deleterec_wrapper(un->c.un_record_id);
    486 
    487 	/* Remove the vtoc, if present */
    488 	if (vtoc_id)
    489 		mddb_deleterec_wrapper(vtoc_id);
    490 
    491 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
    492 	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
    493 	md_rem_names(sv, nsv);
    494 	kmem_free(sv, sizeof (sv_dev_t) * nsv);
    495 	kmem_free(recids, sizeof (mddb_recid_t) * (nsv + 3));
    496 }
    497 
/*
 * Finish a parent request for which at least one fragment failed.
 *
 * Called from stripe_done() for the last outstanding fragment, with
 * ps->ps_mx still held by that caller.  Marks the failing component with a
 * read or write error flag, releases the parent structure, completes the
 * original buf with B_ERROR set, and logs a warning naming the metadevice
 * and the failing component device.
 */
static void
stripe_error(md_sps_t *ps)
{
	/* capture everything needed from ps now; ps is released below */
	struct buf	*pb = ps->ps_bp;
	mdi_unit_t	*ui = ps->ps_ui;
	md_dev64_t	dev = ps->ps_errcomp->un_dev;
	md_dev64_t	md_dev = md_expldev(pb->b_edev);
	char		*str;

	if (pb->b_flags & B_READ) {
		ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_READERR;
		str = "read";
	} else {
		ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_WRTERR;
		str = "write";
	}
	if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
		if (MUTEX_HELD(&ps->ps_mx)) {
			mutex_exit(&ps->ps_mx);
		}
	} else {
		/* MD_SPS_DONTFREE is only set by the strategy routine on panic */
		ASSERT(panicstr);
	}
	/* NOTE(review): SPS_FREE is defined elsewhere - confirm its semantics */
	SPS_FREE(stripe_parent_cache, ps);
	pb->b_flags |= B_ERROR;

	md_kstat_done(ui, pb, 0);
	md_unit_readerexit(ui);
	md_biodone(pb);

	cmn_err(CE_WARN, "md: %s: %s error on %s",
	    md_shortname(md_getminor(md_dev)), str,
	    md_devname(MD_DEV2SET(md_dev), dev, NULL, 0));
}
    532 
/*
 * Completion routine for a child (fragment) buf; md_stripe_strategy() passes
 * it to md_bioclone() for every fragment it issues.
 *
 * Records any error from the child in the parent, drops the parent's
 * outstanding fragment count, and frees the child structure.  When the last
 * fragment completes, the parent request is finished: through
 * stripe_error() if any fragment failed, otherwise by completing the
 * original buf here.
 *
 * Returns 1 if fragments are still outstanding or the error path was taken,
 * and 0 when the parent completed without error.
 */
static int
stripe_done(struct buf *cb)
{
	struct buf	*pb;
	mdi_unit_t	*ui;
	md_sps_t	*ps;
	md_scs_t	*cs;

	/* step back from the buf address to the md_scs_t that contains it */
	/*LINTED*/
	cs = (md_scs_t *)((caddr_t)cb - md_stripe_mcs_buf_off);
	ps = cs->cs_ps;
	pb = ps->ps_bp;

	mutex_enter(&ps->ps_mx);
	if (cb->b_flags & B_ERROR) {
		/* remember the error and which component produced it */
		ps->ps_flags |= MD_SPS_ERROR;
		pb->b_error = cb->b_error;
		ps->ps_errcomp = cs->cs_comp;
	}

	if (cb->b_flags & B_REMAPPED)
		bp_mapout(cb);

	ps->ps_frags--;
	if (ps->ps_frags != 0) {
		/* other fragments still outstanding; only this child is done */
		mutex_exit(&ps->ps_mx);
		kmem_cache_free(stripe_child_cache, cs);
		return (1);
	}
	kmem_cache_free(stripe_child_cache, cs);
	if (ps->ps_flags & MD_SPS_ERROR) {
		/* stripe_error() is entered with ps_mx still held */
		stripe_error(ps);
		return (1);
	}
	/* save ui before ps is released */
	ui = ps->ps_ui;
	if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
		mutex_exit(&ps->ps_mx);
	} else {
		ASSERT(panicstr);
	}
	SPS_FREE(stripe_parent_cache, ps);
	md_kstat_done(ui, pb, 0);
	md_unit_readerexit(ui);
	md_biodone(pb);
	return (0);
}
    579 
    580 
/*
 * This routine does the mapping from virtual (dev, blkno) of a metapartition
 * to the real (dev, blkno) of a real disk partition.
 * It walks the unit's row table (un_row[]) and component array to find the
 * correct real partition dev and block number for this buffer.
 *
 * A single buf request can not go across real disk partition boundary.
 * When the virtual request specified by (blkno, bcount) has to be cut short
 * and more of it remains to be mapped, md_mapbuf returns 1.  Then the caller
 * should prepare another real buf and continue calling md_mapbuf to do the
 * mapping until it returns 0.
 *
 * un		- stripe unit to map within
 * blkno	- starting block within the metadevice
 * bcount	- length of the remaining request, in bytes
 * bp		- child buf to fill in (b_bcount, b_lblkno, b_edev), or NULL
 * mdc		- receives the component the fragment maps to (when bp != NULL)
 */

static int
md_mapbuf(
	ms_unit_t	*un,
	diskaddr_t	blkno,
	u_longlong_t	bcount,
	buf_t		*bp,	/* if bp==NULL, skip bp updates */
	ms_comp_t	**mdc)	/* if bp==NULL, skip mdc update */
{
	struct ms_row	*mdr;
	struct ms_comp	*mdcomp;
	diskaddr_t	stripe_blk;
	diskaddr_t	fragment, blk_in_row, endblk;
	offset_t	interlace;
	size_t		dev_index;
	int		row_index, more;
	extern unsigned md_maxphys;
	/* Work var's when bp==NULL */
	u_longlong_t	wb_bcount;
	diskaddr_t	wb_blkno;
	md_dev64_t	wb_edev;
	ms_comp_t	*wmdc;

	/*
	 * Do a real calculation to derive the minor device of the
	 * Virtual Disk, which in turn will let us derive the
	 * device/minor of the underlying real device.
	 */


	/* find the first row whose cumulative block count exceeds blkno */
	for (row_index = 0; row_index < un->un_nrows; row_index++) {
		mdr = &un->un_row[row_index];
		if (blkno < mdr->un_cum_blocks)
			break;
	}
	ASSERT(row_index != un->un_nrows);

	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);

	/* offset of blkno from the start of the selected row */
	blk_in_row = blkno - mdr->un_cum_blocks + mdr->un_blocks;
	endblk = (diskaddr_t)(blkno + howmany(bcount, DEV_BSIZE));
	if (mdr->un_ncomp == 1) { /* No striping */
		if (endblk > mdr->un_cum_blocks) {
			/* request runs past this row; clip it at the row end */
			wb_bcount = ldbtob(mdr->un_cum_blocks - blkno);
			if ((row_index + 1) == un->un_nrows)
				more = 0;
			else
				more = 1;
		} else {
			wb_bcount = bcount;
			more = 0;
		}
		wmdc = &mdcomp[mdr->un_icomp];
		wb_blkno = blk_in_row;
	} else { /* Have striping */
		interlace = mdr->un_interlace;
		/* block offset within the current interlace-sized chunk */
		fragment = blk_in_row % interlace;
		if (bcount > ldbtob(interlace - fragment)) {
			/* clip the fragment at the end of this chunk */
			more = 1;
			wb_bcount = ldbtob(interlace - fragment);
		} else {
			more = 0;
			wb_bcount = bcount;
		}

		/* chunk number in the row selects the component round-robin */
		stripe_blk = blk_in_row / interlace;
		dev_index = (size_t)(stripe_blk % mdr->un_ncomp);
		wmdc = &mdcomp[mdr->un_icomp + dev_index];
		wb_blkno = (diskaddr_t)(((stripe_blk / mdr->un_ncomp) *
		    interlace) + fragment);
	}

	/* convert to an absolute block on the component's device */
	wb_blkno += wmdc->un_start_block;
	wb_edev = wmdc->un_dev;

	/* only break up the I/O if we're not built on another metadevice */
	if ((md_getmajor(wb_edev) != md_major) && (wb_bcount > md_maxphys)) {
		wb_bcount = md_maxphys;
		more = 1;
	}
	if (bp != (buf_t *)NULL) {
		/*
		 * wb_bcount is limited by md_maxphys which is 'int'
		 */
		bp->b_bcount = (size_t)wb_bcount;
		bp->b_lblkno = wb_blkno;
		bp->b_edev = md_dev64_to_dev(wb_edev);
		*mdc = wmdc;
	}
	return (more);
}
    685 
/*
 * Strategy routine for the stripe driver: split the parent buf into
 * per-component fragments and issue each one.
 *
 * pb		- the original (parent) buf
 * flag		- MD_STR_NOTTOP marks a request that is not top-level;
 *		  MD_NOBLOCK selects the non-blocking I/O count increment
 * private	- passed through unchanged to md_call_strategy()
 *
 * One md_sps_t tracks the parent request and one md_scs_t is allocated per
 * fragment.  md_mapbuf() decides where each fragment goes and whether more
 * fragments follow; stripe_done() runs as each fragment completes.
 */
static void
md_stripe_strategy(buf_t *pb, int flag, void *private)
{
	md_sps_t	*ps;
	md_scs_t	*cs;
	int		doing_writes;
	int		more;
	ms_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	diskaddr_t	current_blkno;
	off_t		current_offset;
	buf_t		*cb;		/* child buf pointer */
	set_t		setno;

	setno = MD_MIN2SET(getminor(pb->b_edev));

	/*
	 * When doing IO to a multi owner meta device, check if set is halted.
	 * We do this check without the needed lock held, for performance
	 * reasons.
	 * If an IO just slips through while the set is locked via an
	 * MD_MN_SUSPEND_SET, we don't care about it.
	 * Only check for a suspended set if we are a top-level i/o request
	 * (MD_STR_NOTTOP is cleared in 'flag').
	 */
	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
	    (MD_SET_HALTED | MD_SET_MNSET)) {
		if ((flag & MD_STR_NOTTOP) == 0) {
			mutex_enter(&md_mx);
			/* Here we loop until the set is no longer halted */
			while (md_set[setno].s_status & MD_SET_HALTED) {
				cv_wait(&md_cv, &md_mx);
			}
			mutex_exit(&md_mx);
		}
	}

	ui = MDI_UNIT(getminor(pb->b_edev));

	md_kstat_waitq_enter(ui);

	un = (ms_unit_t *)md_unit_readerlock(ui);

	if ((flag & MD_NOBLOCK) == 0) {
		if (md_inc_iocount(setno) != 0) {
			/* could not count the I/O: fail the whole request */
			pb->b_flags |= B_ERROR;
			pb->b_error = ENXIO;
			pb->b_resid = pb->b_bcount;
			md_kstat_waitq_exit(ui);
			md_unit_readerexit(ui);
			biodone(pb);
			return;
		}
	} else {
		md_inc_iocount_noblock(setno);
	}

	if (!(flag & MD_STR_NOTTOP)) {
		/*
		 * NOTE(review): on failure only the kstat wait queue is left
		 * here; presumably md_checkbuf() itself drops the reader lock
		 * and completes pb - confirm against its definition.
		 */
		if (md_checkbuf(ui, (md_unit_t *)un, pb) != 0) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	ps = kmem_cache_alloc(stripe_parent_cache, MD_ALLOCFLAGS);
	stripe_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the md_save structure.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = pb;
	ps->ps_addr = pb->b_un.b_addr;

	if ((pb->b_flags & B_READ) == 0)
		doing_writes = 1;
	else
		doing_writes = 0;


	/* running position within the parent request */
	current_count = pb->b_bcount;
	current_blkno = pb->b_lblkno;
	current_offset  = 0;

	/* on panic, this routine frees ps itself after polling for completion */
	if (!(flag & MD_STR_NOTTOP) && panicstr)
		ps->ps_flags |= MD_SPS_DONTFREE;

	md_kstat_waitq_to_runq(ui);

	/* account for the first fragment before any I/O is issued */
	ps->ps_frags++;
	do {
		cs = kmem_cache_alloc(stripe_child_cache, MD_ALLOCFLAGS);
		stripe_child_init(cs);
		cb = &cs->cs_buf;
		cs->cs_ps = ps;
		/* md_mapbuf fills in cb's count, block and device */
		more = md_mapbuf(un, current_blkno, current_count, cb,
		    &cs->cs_comp);

		cb = md_bioclone(pb, current_offset, cb->b_bcount, cb->b_edev,
		    cb->b_lblkno, stripe_done, cb, KM_NOSLEEP);
		/*
		 * Do these calculations now,
		 *  so that we pickup a valid b_bcount from the chld_bp.
		 */
		current_offset += cb->b_bcount;
		current_count -=  cb->b_bcount;
		current_blkno +=  (diskaddr_t)(lbtodb(cb->b_bcount));

		if (more) {
			/* count the next fragment before this one is issued */
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		if (doing_writes &&
		    cs->cs_comp->un_mirror.ms_flags & MDM_S_NOWRITE) {
			/* writes to this component are suppressed: complete now */
			(void) stripe_done(cb);
			continue;
		}
		md_call_strategy(cb, flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && panicstr) {
		/* panic path: drive the done queue by hand until ps is done */
		while (!(ps->ps_flags & MD_SPS_DONE)) {
			md_daemon(1, &md_done_daemon);
			drv_usecwait(10);
		}
		kmem_cache_free(stripe_parent_cache, ps);
	}
}
    819 
/*
 * Walk the stripe records of a set in the metadevice database and build
 * in-core units from them.
 *
 * cmd		- MD_SNARF_CLEANUP is a no-op here (returns 0)
 * setno	- set whose records are examined
 *
 * Returns nonzero if at least one unit was built on this pass, and 0
 * otherwise.  When a pass finds no stripe record left to build, every stripe
 * record not marked MD_PRV_GOTIT is marked MD_PRV_PENDDEL and 0 is returned.
 */
static int
stripe_snarf(md_snarfcmd_t cmd, set_t setno)
{
	ms_unit_t	*un;
	mddb_recid_t	recid;
	int		gotsomething;
	int		all_stripes_gotten;
	mddb_type_t	typ1;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	size_t		newreqsize;
	ms_unit_t	*big_un;
	ms_unit32_od_t	*small_un;


	if (cmd == MD_SNARF_CLEANUP)
		return (0);

	all_stripes_gotten = 1;
	gotsomething = 0;

	/* record type is keyed by this driver's name */
	typ1 = (mddb_type_t)md_getshared_key(setno,
	    stripe_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
		/* skip records already turned into units */
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_STRIPE;
		rbp = dep->de_rb;

		/*
		 * NOTE(review): there is no default case, so 'un' is left
		 * unset for a revision not listed below - confirm that no
		 * other revision can reach this point.
		 */
		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
				/*
				 * This means, we have an old and small record
				 * and this record hasn't already been
				 * converted.  Before we create an incore
				 * metadevice from this we have to convert it to
				 * a big record.
				 */
				small_un =
				    (ms_unit32_od_t *)mddb_getrecaddr(recid);
				newreqsize = get_big_stripe_req_size(small_un,
				    COMPLETE_STRUCTURE);
				big_un = (ms_unit_t *)kmem_zalloc(newreqsize,
				    KM_SLEEP);
				stripe_convert((caddr_t)small_un,
				    (caddr_t)big_un, SMALL_2_BIG);
				/* the small copy is replaced by the big one */
				kmem_free(small_un, dep->de_reqsize);
				dep->de_rb_userdata = big_un;
				dep->de_reqsize = newreqsize;
				un = big_un;
				rbp->rb_private |= MD_PRV_CONVD;
			} else {
				/* Small device had already been converted */
				un = (ms_unit_t *)mddb_getrecaddr(recid);
			}
			un->c.un_revision &= ~MD_64BIT_META_DEV;
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			/* Big device */
			un = (ms_unit_t *)mddb_getrecaddr(recid);
			un->c.un_revision |= MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
			break;
		}
		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);

		/* Create minor node for snarfed unit. */
		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));

		/* a unit already exists for this minor: mark the record */
		if (MD_UNIT(MD_SID(un)) != NULL) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_stripes_gotten = 0;
		if (stripe_build_incore((void *)un, 1) == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &stripe_md_ops, 0);
			gotsomething = 1;
		}
	}

	if (!all_stripes_gotten)
		return (gotsomething);

	/* nothing left to build: mark every record that was not taken */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}
    918 
    919 static int
    920 stripe_halt(md_haltcmd_t cmd, set_t setno)
    921 {
    922 	int		i;
    923 	mdi_unit_t	*ui;
    924 	minor_t		mnum;
    925 
    926 	if (cmd == MD_HALT_CLOSE)
    927 		return (0);
    928 
    929 	if (cmd == MD_HALT_OPEN)
    930 		return (0);
    931 
    932 	if (cmd == MD_HALT_UNLOAD)
    933 		return (0);
    934 
    935 	if (cmd == MD_HALT_CHECK) {
    936 		for (i = 0; i < md_nunits; i++) {
    937 			mnum = MD_MKMIN(setno, i);
    938 			if ((ui = MDI_UNIT(mnum)) == NULL)
    939 				continue;
    940 			if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
    941 				continue;
    942 			if (md_unit_isopen(ui))
    943 				return (1);
    944 		}
    945 		return (0);
    946 	}
    947 
    948 	if (cmd != MD_HALT_DOIT)
    949 		return (1);
    950 
    951 	for (i = 0; i < md_nunits; i++) {
    952 		mnum = MD_MKMIN(setno, i);
    953 		if ((ui = MDI_UNIT(mnum)) == NULL)
    954 			continue;
    955 		if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
    956 			continue;
    957 		reset_stripe((ms_unit_t *)MD_UNIT(mnum), mnum, 0);
    958 	}
    959 
    960 	return (0);
    961 }
    962 
    963 /*ARGSUSED3*/
    964 static int
    965 stripe_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
    966 {
    967 	minor_t		mnum = getminor(*dev);
    968 	mdi_unit_t	*ui = MDI_UNIT(mnum);
    969 	ms_unit_t	*un;
    970 	int		err = 0;
    971 	set_t		setno;
    972 
    973 	/*
    974 	 * When doing an open of a multi owner metadevice, check to see if this
    975 	 * node is a starting node and if a reconfig cycle is underway.
    976 	 * If so, the system isn't sufficiently set up enough to handle the
    977 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
    978 	 */
    979 	setno = MD_MIN2SET(mnum);
    980 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
    981 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
    982 			return (ENXIO);
    983 	}
    984 
    985 	/* single thread */
    986 	un = (ms_unit_t *)md_unit_openclose_enter(ui);
    987 
    988 	/* open devices, if necessary */
    989 	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
    990 		if ((err = stripe_open_all_devs(un, md_oflags)) != 0) {
    991 			goto out;
    992 		}
    993 	}
    994 
    995 	/* count open */
    996 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
    997 		goto out;
    998 
    999 	/* unlock, return success */
   1000 out:
   1001 	md_unit_openclose_exit(ui);
   1002 	return (err);
   1003 }
   1004 
   1005 /*ARGSUSED1*/
   1006 static int
   1007 stripe_close(
   1008 	dev_t		dev,
   1009 	int		flag,
   1010 	int		otyp,
   1011 	cred_t		*cred_p,
   1012 	int		md_cflags
   1013 )
   1014 {
   1015 	minor_t		mnum = getminor(dev);
   1016 	mdi_unit_t	*ui = MDI_UNIT(mnum);
   1017 	ms_unit_t	*un;
   1018 	int		err = 0;
   1019 
   1020 	/* single thread */
   1021 	un = (ms_unit_t *)md_unit_openclose_enter(ui);
   1022 
   1023 	/* count closed */
   1024 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
   1025 		goto out;
   1026 
   1027 	/* close devices, if necessary */
   1028 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
   1029 		stripe_close_all_devs(un, md_cflags);
   1030 	}
   1031 
   1032 	/* unlock, return success */
   1033 out:
   1034 	md_unit_openclose_exit(ui);
   1035 	return (err);
   1036 }
   1037 
   1038 
   1039 static struct buf dumpbuf;
   1040 
   1041 /*
   1042  * This routine dumps memory to the disk.  It assumes that the memory has
   1043  * already been mapped into mainbus space.  It is called at disk interrupt
   1044  * priority when the system is in trouble.
   1045  *
   1046  */
   1047 static int
   1048 stripe_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
   1049 {
   1050 	ms_unit_t	*un;
   1051 	buf_t		*bp;
   1052 	ms_comp_t	*mdc;
   1053 	u_longlong_t	nb;
   1054 	diskaddr_t	mapblk;
   1055 	int		result;
   1056 	int		more;
   1057 	int		saveresult = 0;
   1058 
   1059 	/*
   1060 	 * Don't need to grab the unit lock.
   1061 	 * Cause nothing else is suppose to be happenning.
   1062 	 * Also dump is not suppose to sleep.
   1063 	 */
   1064 	un = (ms_unit_t *)MD_UNIT(getminor(dev));
   1065 
   1066 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
   1067 		return (EINVAL);
   1068 
   1069 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
   1070 		return (EINVAL);
   1071 
   1072 	bp = &dumpbuf;
   1073 	nb = ldbtob(nblk);
   1074 	do {
   1075 		bzero((caddr_t)bp, sizeof (*bp));
   1076 		more = md_mapbuf(un, (diskaddr_t)blkno, nb, bp, &mdc);
   1077 		nblk = btodb(bp->b_bcount);
   1078 		mapblk = bp->b_lblkno;
   1079 		if (!(mdc->un_mirror.ms_flags & MDM_S_NOWRITE)) {
   1080 			/*
   1081 			 * bdev_dump() is currently only able to take
   1082 			 * 32 bit wide blkno's.
   1083 			 */
   1084 			result = bdev_dump(bp->b_edev, addr, (daddr_t)mapblk,
   1085 			    nblk);
   1086 			if (result)
   1087 				saveresult = result;
   1088 		}
   1089 
   1090 		nb -= bp->b_bcount;
   1091 		addr += bp->b_bcount;
   1092 		blkno += nblk;
   1093 	} while (more);
   1094 
   1095 	return (saveresult);
   1096 }
   1097 
   1098 /*ARGSUSED*/
   1099 static intptr_t
   1100 stripe_shared_by_blk(
   1101 	md_dev64_t dev,
   1102 	void *junk,
   1103 	diskaddr_t blkno,
   1104 	u_longlong_t *cnt)
   1105 {
   1106 	ms_unit_t	*un;
   1107 	buf_t		bp;
   1108 	ms_comp_t	*comp;
   1109 
   1110 	un = MD_UNIT(md_getminor(dev));
   1111 	(void) md_mapbuf(un, blkno, ldbtob(*cnt), &bp, &comp);
   1112 	*cnt = (u_longlong_t)lbtodb(bp.b_bcount);
   1113 	return ((intptr_t)&comp->un_mirror);
   1114 }
   1115 
   1116 /*
   1117  * stripe_block_count_skip_size() returns the following values
   1118  *	so that the logical to physical block mappings can
   1119  *	be calculated without intimate knowledge of the underpinnings.
   1120  *
   1121  *	block - first logical block number of the device.
   1122  *		block = [ # of blocks before THE row ] +
   1123  *			[ # of blocks in THE row before the component ]
   1124  *	count - # of segments (interlaced size).
   1125  *	skip  - # of logical blocks between segments, or delta to
   1126  *		  get to next segment
   1127  *	size  - interlace size used for the block, count, skip.
   1128  */
   1129 /*ARGSUSED*/
   1130 static intptr_t
   1131 stripe_block_count_skip_size(
   1132 	md_dev64_t	 dev,
   1133 	void		*junk,
   1134 	int		ci,
   1135 	diskaddr_t	*block,
   1136 	size_t		*count,
   1137 	u_longlong_t	*skip,
   1138 	u_longlong_t	*size)
   1139 {
   1140 	ms_unit_t	*un;
   1141 	int		row;
   1142 	struct ms_row	*mdr;
   1143 	int		cmpcount = 0;
   1144 
   1145 	un = MD_UNIT(md_getminor(dev));
   1146 
   1147 	for (row = 0; row < un->un_nrows; row++) {
   1148 		mdr = &un->un_row[row];
   1149 		if ((mdr->un_ncomp + cmpcount) > ci)
   1150 			break;
   1151 		cmpcount += mdr->un_ncomp;
   1152 	}
   1153 	ASSERT(row != un->un_nrows);
   1154 
   1155 	/*
   1156 	 * Concatenations are always contiguous blocks,
   1157 	 * you cannot depend on the interlace being a usable
   1158 	 * value (except for stripes).
   1159 	 */
   1160 	if (mdr->un_ncomp == 1) {	/* Concats */
   1161 		*block = mdr->un_cum_blocks - mdr->un_blocks;
   1162 		*count = 1;
   1163 		*skip = 0;
   1164 		*size = mdr->un_blocks;
   1165 	} else {			/* Stripes */
   1166 		*block = (mdr->un_cum_blocks - mdr->un_blocks) +
   1167 		    ((ci - cmpcount) * mdr->un_interlace);
   1168 		*count	= (size_t)(mdr->un_blocks / (mdr->un_interlace *
   1169 		    mdr->un_ncomp));
   1170 		*skip = (mdr->un_interlace * mdr->un_ncomp) - mdr->un_interlace;
   1171 		*size = mdr->un_interlace;
   1172 	}
   1173 
   1174 	return (0);
   1175 }
   1176 
   1177 /*ARGSUSED*/
   1178 static intptr_t
   1179 stripe_shared_by_indx(md_dev64_t dev, void *junk, int indx)
   1180 {
   1181 	ms_unit_t	*un;
   1182 	ms_comp_t	*comp;
   1183 
   1184 	un = MD_UNIT(md_getminor(dev));
   1185 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
   1186 	comp += indx;
   1187 	return ((intptr_t)&comp->un_mirror);
   1188 }
   1189 
   1190 /*ARGSUSED*/
   1191 intptr_t
   1192 stripe_component_count(md_dev64_t dev, void *junk)
   1193 {
   1194 	/*
   1195 	 * See comments for stripe_get_dev
   1196 	 */
   1197 
   1198 	ms_unit_t	*un;
   1199 	int		count = 0;
   1200 	int		row;
   1201 
   1202 	un = MD_UNIT(md_getminor(dev));
   1203 	for (row = 0; row < un->un_nrows; row++)
   1204 		count += un->un_row[row].un_ncomp;
   1205 	return (count);
   1206 }
   1207 
   1208 /*ARGSUSED*/
   1209 intptr_t
   1210 stripe_get_dev(md_dev64_t dev, void *junk, int indx, ms_cd_info_t *cd)
   1211 {
   1212 	/*
   1213 	 * It should be noted that stripe_replace in stripe_ioctl.c calls this
   1214 	 * routine using makedevice(0, minor) for the first argument.
   1215 	 *
   1216 	 * If this routine at some point in the future needs to use the major
   1217 	 * number stripe_replace must be changed.
   1218 	 */
   1219 
   1220 	ms_unit_t	*un;
   1221 	ms_comp_t	*comp;
   1222 	md_dev64_t	tmpdev;
   1223 
   1224 	un = MD_UNIT(md_getminor(dev));
   1225 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
   1226 	comp += indx;
   1227 	tmpdev = comp->un_dev;
   1228 	/*
   1229 	 * Try to resolve devt again if NODEV64
   1230 	 * Check if this comp is hotspared and if it is
   1231 	 * then use key for hotspare
   1232 	 */
   1233 	if (tmpdev == NODEV64) {
   1234 		tmpdev = md_resolve_bydevid(md_getminor(dev), tmpdev,
   1235 		    comp->un_mirror.ms_hs_id ?
   1236 		    comp->un_mirror.ms_hs_key :
   1237 		    comp->un_key);
   1238 		comp->un_dev = tmpdev;
   1239 	}
   1240 
   1241 	cd->cd_dev = comp->un_dev;
   1242 	cd->cd_orig_dev = comp->un_mirror.ms_orig_dev;
   1243 	return (0);
   1244 }
   1245 
   1246 /*ARGSUSED*/
   1247 void
   1248 stripe_replace_done(md_dev64_t dev, sv_dev_t *sv)
   1249 {
   1250 	/*
   1251 	 * See comments for stripe_get_dev
   1252 	 */
   1253 
   1254 	minor_t		mnum = md_getminor(dev);
   1255 
   1256 	if (sv != NULL) {
   1257 		md_rem_names(sv, 1);
   1258 		kmem_free(sv, sizeof (sv_dev_t));
   1259 	}
   1260 
   1261 	md_unit_writerexit(MDI_UNIT(mnum));
   1262 }
   1263 
   1264 /*ARGSUSED*/
   1265 intptr_t
   1266 stripe_replace_dev(md_dev64_t dev, void *junk, int ci, ms_new_dev_t *nd,
   1267     mddb_recid_t *recids, int nrecids, void (**replace_done)(),
   1268     void **replace_data)
   1269 {
   1270 	minor_t		mnum;
   1271 	ms_unit_t	*un;
   1272 	mdi_unit_t	*ui;
   1273 	ms_comp_t	*comp;
   1274 	diskaddr_t	dev_size;
   1275 	int		row;
   1276 	int		ncomps = 0;
   1277 	int		cmpcount = 0;
   1278 	int		rid = 0;
   1279 	struct ms_row	*mdr;
   1280 	sv_dev_t	*sv = NULL;
   1281 	mddb_recid_t	hs_id = 0;
   1282 	set_t		setno;
   1283 	side_t		side;
   1284 	md_dev64_t	this_dev;
   1285 
   1286 	mnum = md_getminor(dev);
   1287 	ui = MDI_UNIT(mnum);
   1288 	setno = MD_MIN2SET(mnum);
   1289 	side = mddb_getsidenum(setno);
   1290 
   1291 	un = md_unit_writerlock(ui);
   1292 
   1293 	*replace_data = NULL;
   1294 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
   1295 
   1296 	comp += ci;
   1297 
   1298 	/*
   1299 	 * Count the number of components
   1300 	 */
   1301 	for (row = 0; row < un->un_nrows; row++) {
   1302 		struct ms_row *mdr = &un->un_row[row];
   1303 		ncomps += mdr->un_ncomp;
   1304 	}
   1305 
   1306 	recids[0] = 0;
   1307 	/*
   1308 	 * No need of checking size of new device,
   1309 	 * when hotsparing (it has already been done), or
   1310 	 * when enabling the device.
   1311 	 */
   1312 	if ((nd != NULL) && (nd->nd_hs_id == 0)) {
   1313 		for (row = 0; row < un->un_nrows; row++) {
   1314 			mdr = &un->un_row[row];
   1315 			if ((mdr->un_ncomp + cmpcount) > ci)
   1316 				break;
   1317 			cmpcount += mdr->un_ncomp;
   1318 		}
   1319 		ASSERT(row != un->un_nrows);
   1320 
   1321 		/* Concatenations have a ncomp = 1 */
   1322 		dev_size = mdr->un_blocks / mdr->un_ncomp;
   1323 
   1324 		/*
   1325 		 * now check to see if new comp can be used in
   1326 		 * place of old comp
   1327 		 */
   1328 		if ((un->c.un_flag & MD_LABELED) && (ci == 0) &&
   1329 		    nd->nd_labeled)
   1330 			nd->nd_start_blk = 0;
   1331 		else
   1332 			nd->nd_nblks -= nd->nd_start_blk;
   1333 
   1334 		if (dev_size > nd->nd_nblks) {
   1335 			md_unit_writerexit(ui);
   1336 			return (MDE_COMP_TOO_SMALL);
   1337 		}
   1338 
   1339 		sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
   1340 		sv->setno = MD_MIN2SET(mnum);
   1341 		sv->key = comp->un_key;
   1342 	}
   1343 
   1344 	/*
   1345 	 * Close this component.
   1346 	 */
   1347 	if (comp->un_mirror.ms_flags & MDM_S_ISOPEN) {
   1348 		md_layered_close(comp->un_dev, MD_OFLG_NULL);
   1349 		comp->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
   1350 	}
   1351 
   1352 	/*
   1353 	 * If the component is hotspared, return to the pool.
   1354 	 */
   1355 	if (comp->un_mirror.ms_hs_id != 0) {
   1356 		hs_cmds_t	cmd;
   1357 		mdkey_t		hs_key;
   1358 
   1359 		hs_key = comp->un_mirror.ms_hs_key;
   1360 		comp->un_dev = comp->un_mirror.ms_orig_dev;
   1361 		comp->un_start_block = comp->un_mirror.ms_orig_blk;
   1362 		comp->un_mirror.ms_hs_key = 0;
   1363 		comp->un_mirror.ms_hs_id = 0;
   1364 		comp->un_mirror.ms_orig_dev = 0;
   1365 
   1366 		cmd = HS_FREE;
   1367 		if ((comp->un_mirror.ms_state != CS_OKAY) &&
   1368 		    (comp->un_mirror.ms_state != CS_RESYNC))
   1369 			cmd = HS_BAD;
   1370 		(void) md_hot_spare_ifc(cmd, un->un_hsp_id, 0, 0, &hs_id,
   1371 		    &hs_key, NULL, NULL);
   1372 	}
   1373 
   1374 	/*
   1375 	 * Open by device id; for enable (indicated by a NULL
   1376 	 * nd pointer), use the existing component info.  For
   1377 	 * replace, use the new device.
   1378 	 */
   1379 	if (nd == NULL) {
   1380 		this_dev = md_resolve_bydevid(mnum, comp->un_dev, comp->un_key);
   1381 		/*
   1382 		 * If someone replaced a new disk in the same slot
   1383 		 * we get NODEV64 since old device id cannot be
   1384 		 * resolved. The new devt is obtained from the
   1385 		 * mddb since devt is going to be unchanged for the
   1386 		 * enable case. No need to check for multiple
   1387 		 * keys here because the caller (comp_replace)
   1388 		 * has already sanity checked it for us.
   1389 		 */
   1390 		if (this_dev == NODEV64) {
   1391 			this_dev = md_getdevnum(setno, side, comp->un_key,
   1392 			    MD_TRUST_DEVT);
   1393 		}
   1394 	} else {
   1395 		/*
   1396 		 * If this is a hotspare, save the original dev_t for later
   1397 		 * use. If this has occured during boot then the value of
   1398 		 * comp->un_dev will be NODEV64 because of the failure to look
   1399 		 * up the devid of the device.
   1400 		 */
   1401 		if (nd->nd_hs_id != 0)
   1402 			comp->un_mirror.ms_orig_dev = comp->un_dev;
   1403 		this_dev = md_resolve_bydevid(mnum, nd->nd_dev, nd->nd_key);
   1404 	}
   1405 
   1406 	comp->un_dev = this_dev;
   1407 
   1408 	/*
   1409 	 * Now open the new device if required. Note for a single component
   1410 	 * stripe it will not be open - leave this for the mirror driver to
   1411 	 * deal with.
   1412 	 */
   1413 	if (md_unit_isopen(ui)) {
   1414 		if (md_layered_open(mnum, &this_dev, MD_OFLG_NULL)) {
   1415 			mddb_recid_t	ids[3];
   1416 
   1417 			ids[0] = un->c.un_record_id;
   1418 			ids[1] = hs_id;
   1419 			ids[2] = 0;
   1420 			mddb_commitrecs_wrapper(ids);
   1421 			if ((nd != NULL) && (nd->nd_hs_id != 0)) {
   1422 				/*
   1423 				 * Revert back to the original device.
   1424 				 */
   1425 				comp->un_dev = comp->un_mirror.ms_orig_dev;
   1426 
   1427 				cmn_err(CE_WARN,
   1428 				    "md: %s: open error of hotspare %s",
   1429 				    md_shortname(mnum),
   1430 				    md_devname(MD_MIN2SET(mnum), nd->nd_dev,
   1431 				    NULL, 0));
   1432 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
   1433 				    SVM_TAG_HS, MD_MIN2SET(mnum), nd->nd_dev);
   1434 			}
   1435 			md_unit_writerexit(ui);
   1436 			return (MDE_COMP_OPEN_ERR);
   1437 		}
   1438 		if (nd != NULL)
   1439 			nd->nd_dev = this_dev;
   1440 
   1441 		comp->un_mirror.ms_flags |= MDM_S_ISOPEN;
   1442 	}
   1443 
   1444 	if (nd == NULL) {
   1445 		recids[0] = un->c.un_record_id;
   1446 		recids[1] = hs_id;
   1447 		recids[2] = 0;
   1448 		*replace_done = stripe_replace_done;
   1449 		return (0);
   1450 	}
   1451 
   1452 	/* if hot sparing this device */
   1453 	if (nd->nd_hs_id != 0) {
   1454 		char	devname[MD_MAX_CTDLEN];
   1455 		char	hs_devname[MD_MAX_CTDLEN];
   1456 		set_t	setno;
   1457 
   1458 		comp->un_mirror.ms_hs_id = nd->nd_hs_id;
   1459 		comp->un_mirror.ms_hs_key = nd->nd_key;
   1460 
   1461 		comp->un_mirror.ms_orig_blk = comp->un_start_block;
   1462 
   1463 		setno = MD_MIN2SET(mnum);
   1464 
   1465 		(void) md_devname(setno, comp->un_mirror.ms_orig_dev, devname,
   1466 		    sizeof (devname));
   1467 		(void) md_devname(setno, nd->nd_dev, hs_devname,
   1468 		    sizeof (hs_devname));
   1469 
   1470 		cmn_err(CE_NOTE, "md: %s: hotspared device %s with %s",
   1471 		    md_shortname(mnum), devname, hs_devname);
   1472 
   1473 	} else {	/* replacing the device */
   1474 		comp->un_key = nd->nd_key;
   1475 		*replace_data = (void *)sv;
   1476 
   1477 		/*
   1478 		 * For the old device, make sure to reset the parent
   1479 		 * if it's a  metadevice.
   1480 		 */
   1481 		if (md_getmajor(comp->un_dev) == md_major) {
   1482 			minor_t	  comp_mnum = md_getminor(comp->un_dev);
   1483 			md_unit_t *comp_un = MD_UNIT(comp_mnum);
   1484 
   1485 			md_reset_parent(comp->un_dev);
   1486 			recids[rid++] = MD_RECID(comp_un);
   1487 		}
   1488 	}
   1489 
   1490 	comp->un_dev = nd->nd_dev;
   1491 	comp->un_start_block = nd->nd_start_blk;
   1492 
   1493 	/*
   1494 	 * For the new device, make sure to set the parent if it's a
   1495 	 * metadevice.
   1496 	 *
   1497 	 * If we ever support using metadevices as hot spares, this
   1498 	 * will need to be tested, and possibly moved into the
   1499 	 * preceding "else" clause, immediately following the parent
   1500 	 * reset block.  For now, it's convenient to leave it here and
   1501 	 * only compress nd->nd_dev once.
   1502 	 */
   1503 	if (md_getmajor(comp->un_dev) == md_major) {
   1504 		minor_t		comp_mnum = md_getminor(comp->un_dev);
   1505 		md_unit_t	*comp_un = MD_UNIT(comp_mnum);
   1506 
   1507 		md_set_parent(comp->un_dev, MD_SID(un));
   1508 		recids[rid++] = MD_RECID(comp_un);
   1509 	}
   1510 
   1511 	recids[rid++] = un->c.un_record_id;
   1512 	recids[rid++] = hs_id;
   1513 	recids[rid] = 0;
   1514 	*replace_done = stripe_replace_done;
   1515 	return (0);
   1516 }
   1517 
   1518 /*ARGSUSED*/
   1519 static intptr_t
   1520 stripe_hotspare_dev(
   1521 	md_dev64_t	dev,
   1522 	void		*junk,
   1523 	int		ci,
   1524 	mddb_recid_t	*recids,
   1525 	int		nrecids,
   1526 	void		(**replace_done)(),
   1527 	void		**replace_data)
   1528 {
   1529 	ms_unit_t	*un;
   1530 	mdi_unit_t	*ui;
   1531 	ms_comp_t	*comp;
   1532 	int		row;
   1533 	struct ms_row	*mdr;
   1534 	ms_new_dev_t	nd;
   1535 	int		err;
   1536 	int		i;
   1537 	minor_t		mnum;
   1538 	set_t		setno;
   1539 	int		cmpcount = 0;
   1540 
   1541 	mnum = md_getminor(dev);
   1542 	ui = MDI_UNIT(mnum);
   1543 	un = MD_UNIT(mnum);
   1544 	setno = MD_MIN2SET(mnum);
   1545 
   1546 	if (md_get_setstatus(setno) & MD_SET_STALE)
   1547 		return (1);
   1548 
   1549 	if (un->un_hsp_id == -1)
   1550 		return (1);
   1551 
   1552 	for (row = 0; row < un->un_nrows; row++) {
   1553 		mdr = &un->un_row[row];
   1554 		if ((mdr->un_ncomp + cmpcount) > ci)
   1555 			break;
   1556 		cmpcount += mdr->un_ncomp;
   1557 	}
   1558 	ASSERT(row != un->un_nrows);
   1559 
   1560 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
   1561 	comp += ci;
   1562 	/* Concatenations have a ncomp = 1 */
   1563 	nd.nd_nblks = mdr->un_blocks / mdr->un_ncomp;
   1564 
   1565 	if ((un->c.un_flag & MD_LABELED) && (ci == 0))
   1566 		nd.nd_labeled = 1;
   1567 	else
   1568 		nd.nd_labeled = 0;
   1569 
   1570 again:
   1571 	err = md_hot_spare_ifc(HS_GET, un->un_hsp_id, nd.nd_nblks,
   1572 	    nd.nd_labeled, &nd.nd_hs_id, &nd.nd_key, &nd.nd_dev,
   1573 	    &nd.nd_start_blk);
   1574 
   1575 	if (err) {
   1576 		if (!stripe_replace_dev(dev, junk, ci, NULL, recids, nrecids,
   1577 		    replace_done, replace_data)) {
   1578 			mddb_commitrecs_wrapper(recids);
   1579 			md_unit_writerexit(ui);
   1580 		}
   1581 		recids[0] = 0;
   1582 		return (1);
   1583 	}
   1584 
   1585 	if (stripe_replace_dev(dev, junk, ci, &nd, recids, nrecids,
   1586 	    replace_done, replace_data)) {
   1587 
   1588 		(void) md_hot_spare_ifc(HS_BAD, un->un_hsp_id, 0, 0,
   1589 		    &nd.nd_hs_id, &nd.nd_key, NULL, NULL);
   1590 		mddb_commitrec_wrapper(nd.nd_hs_id);
   1591 		goto again;
   1592 	}
   1593 
   1594 	/* Leave a slot for the null recid */
   1595 	for (i = 0; i < (nrecids - 1); i++) {
   1596 		if (recids[i] == 0) {
   1597 			recids[i++] = nd.nd_hs_id;
   1598 			recids[i] = 0;
   1599 		}
   1600 	}
   1601 	return (0);
   1602 }
   1603 
   1604 static int
   1605 stripe_imp_set(
   1606 	set_t	setno
   1607 )
   1608 {
   1609 
   1610 	mddb_recid_t	recid;
   1611 	int		i, row, c, gotsomething;
   1612 	mddb_type_t	typ1;
   1613 	mddb_de_ic_t	*dep;
   1614 	mddb_rb32_t	*rbp;
   1615 	ms_unit32_od_t	*un32;
   1616 	ms_unit_t	*un64;
   1617 	md_dev64_t	self_devt;
   1618 	minor_t		*self_id;	/* minor needs to be updated */
   1619 	md_parent_t	*parent_id;	/* parent needs to be updated */
   1620 	mddb_recid_t	*record_id;	/* record id needs to be updated */
   1621 	mddb_recid_t	*hsp_id;
   1622 	ms_comp32_od_t	*comp32;
   1623 	ms_comp_t	*comp64;
   1624 
   1625 
   1626 	gotsomething = 0;
   1627 
   1628 	typ1 = (mddb_type_t)md_getshared_key(setno,
   1629 	    stripe_md_ops.md_driver.md_drivername);
   1630 	recid = mddb_makerecid(setno, 0);
   1631 
   1632 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
   1633 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
   1634 			continue;
   1635 
   1636 		dep = mddb_getrecdep(recid);
   1637 		rbp = dep->de_rb;
   1638 
   1639 		switch (rbp->rb_revision) {
   1640 		case MDDB_REV_RB:
   1641 		case MDDB_REV_RBFN:
   1642 			/*
   1643 			 * Small device
   1644 			 */
   1645 			un32 = (ms_unit32_od_t *)mddb_getrecaddr(recid);
   1646 			self_id = &(un32->c.un_self_id);
   1647 			parent_id = &(un32->c.un_parent);
   1648 			record_id = &(un32->c.un_record_id);
   1649 			hsp_id = &(un32->un_hsp_id);
   1650 
   1651 			comp32 = (ms_comp32_od_t *)
   1652 			    ((void *)&((char *)un32)[un32->un_ocomp]);
   1653 			for (row = 0; row < un32->un_nrows; row++) {
   1654 				struct ms_row32_od *mdr = &un32->un_row[row];
   1655 				for (i = 0, c = mdr->un_icomp;
   1656 				    i < mdr->un_ncomp; i++) {
   1657 					ms_comp32_od_t *mdc;
   1658 
   1659 					mdc = &comp32[c++];
   1660 
   1661 					if (!md_update_minor(setno,
   1662 					    mddb_getsidenum(setno),
   1663 					    mdc->un_key))
   1664 						goto out;
   1665 
   1666 					if (mdc->un_mirror.ms_hs_id != 0)
   1667 						mdc->un_mirror.ms_hs_id =
   1668 						    MAKERECID(setno,
   1669 						    mdc->un_mirror.ms_hs_id);
   1670 				}
   1671 			}
   1672 			break;
   1673 		case MDDB_REV_RB64:
   1674 		case MDDB_REV_RB64FN:
   1675 			un64 = (ms_unit_t *)mddb_getrecaddr(recid);
   1676 			self_id = &(un64->c.un_self_id);
   1677 			parent_id = &(un64->c.un_parent);
   1678 			record_id = &(un64->c.un_record_id);
   1679 			hsp_id = &(un64->un_hsp_id);
   1680 
   1681 			comp64 = (ms_comp_t *)
   1682 			    ((void *)&((char *)un64)[un64->un_ocomp]);
   1683 			for (row = 0; row < un64->un_nrows; row++) {
   1684 				struct ms_row *mdr = &un64->un_row[row];
   1685 
   1686 				for (i = 0, c = mdr->un_icomp;
   1687 				    i < mdr->un_ncomp; i++) {
   1688 					ms_comp_t *mdc;
   1689 
   1690 					mdc = &comp64[c++];
   1691 
   1692 					if (!md_update_minor(setno,
   1693 					    mddb_getsidenum(setno),
   1694 					    mdc->un_key))
   1695 						goto out;
   1696 
   1697 					if (mdc->un_mirror.ms_hs_id != 0)
   1698 						mdc->un_mirror.ms_hs_id =
   1699 						    MAKERECID(setno,
   1700 						    mdc->un_mirror.ms_hs_id);
   1701 				}
   1702 			}
   1703 			break;
   1704 		}
   1705 
   1706 		/*
   1707 		 * If this is a top level and a friendly name metadevice,
   1708 		 * update its minor in the namespace.
   1709 		 */
   1710 		if ((*parent_id == MD_NO_PARENT) &&
   1711 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
   1712 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
   1713 
   1714 			self_devt = md_makedevice(md_major, *self_id);
   1715 			if (!md_update_top_device_minor(setno,
   1716 			    mddb_getsidenum(setno), self_devt))
   1717 				goto out;
   1718 		}
   1719 
   1720 		/*
   1721 		 * Update unit with the imported setno
   1722 		 *
   1723 		 */
   1724 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
   1725 
   1726 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
   1727 
   1728 		if (*hsp_id != -1)
   1729 			*hsp_id = MAKERECID(setno, DBID(*hsp_id));
   1730 
   1731 		if (*parent_id != MD_NO_PARENT)
   1732 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
   1733 		*record_id = MAKERECID(setno, DBID(*record_id));
   1734 
   1735 		gotsomething = 1;
   1736 	}
   1737 
   1738 out:
   1739 	return (gotsomething);
   1740 }
   1741 
   1742 static md_named_services_t stripe_named_services[] = {
   1743 	{stripe_shared_by_blk,			"shared by blk"		    },
   1744 	{stripe_shared_by_indx,			"shared by indx"	    },
   1745 	{stripe_component_count,		"get component count"	    },
   1746 	{stripe_block_count_skip_size,		"get block count skip size" },
   1747 	{stripe_get_dev,			"get device"		    },
   1748 	{stripe_replace_dev,			"replace device"	    },
   1749 	{stripe_hotspare_dev,			"hotspare device"	    },
   1750 	{stripe_rename_check,			MDRNM_CHECK		    },
   1751 	{NULL,					0}
   1752 };
   1753 
   1754 md_ops_t stripe_md_ops = {
   1755 	stripe_open,		/* open */
   1756 	stripe_close,		/* close */
   1757 	md_stripe_strategy,	/* strategy */
   1758 	NULL,			/* print */
   1759 	stripe_dump,		/* dump */
   1760 	NULL,			/* read */
   1761 	NULL,			/* write */
   1762 	md_stripe_ioctl,	/* stripe_ioctl, */
   1763 	stripe_snarf,		/* stripe_snarf */
   1764 	stripe_halt,		/* stripe_halt */
   1765 	NULL,			/* aread */
   1766 	NULL,			/* awrite */
   1767 	stripe_imp_set,		/* import set */
   1768 	stripe_named_services
   1769 };
   1770 
   1771 static void
   1772 init_init()
   1773 {
   1774 	md_stripe_mcs_buf_off = sizeof (md_scs_t) - sizeof (buf_t);
   1775 
   1776 	stripe_parent_cache = kmem_cache_create("md_stripe_parent",
   1777 	    sizeof (md_sps_t), 0, stripe_parent_constructor,
   1778 	    stripe_parent_destructor, stripe_run_queue, NULL, NULL,
   1779 	    0);
   1780 	stripe_child_cache = kmem_cache_create("md_stripe_child",
   1781 	    sizeof (md_scs_t) - sizeof (buf_t) + biosize(), 0,
   1782 	    stripe_child_constructor, stripe_child_destructor,
   1783 	    stripe_run_queue, NULL, NULL, 0);
   1784 }
   1785 
   1786 static void
   1787 fini_uninit()
   1788 {
   1789 	kmem_cache_destroy(stripe_parent_cache);
   1790 	kmem_cache_destroy(stripe_child_cache);
   1791 	stripe_parent_cache = stripe_child_cache = NULL;
   1792 }
   1793 
/*
 * define the module linkage
 *
 * init_init() and fini_uninit() above are passed to the macro as the
 * module's set-up and tear-down calls.
 */
MD_PLUGIN_MISC_MODULE("stripes module", init_init(), fini_uninit())
   1796