Home | History | Annotate | Download | only in mirror
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/param.h>
     28 #include <sys/systm.h>
     29 #include <sys/conf.h>
     30 #include <sys/file.h>
     31 #include <sys/user.h>
     32 #include <sys/uio.h>
     33 #include <sys/t_lock.h>
     34 #include <sys/buf.h>
     35 #include <sys/dkio.h>
     36 #include <sys/vtoc.h>
     37 #include <sys/kmem.h>
     38 #include <vm/page.h>
     39 #include <sys/sysmacros.h>
     40 #include <sys/types.h>
     41 #include <sys/mkdev.h>
     42 #include <sys/stat.h>
     43 #include <sys/open.h>
     44 #include <sys/modctl.h>
     45 #include <sys/ddi.h>
     46 #include <sys/sunddi.h>
     47 
     48 #include <sys/lvm/mdvar.h>
     49 #include <sys/lvm/md_names.h>
     50 #include <sys/lvm/md_mddb.h>
     51 #include <sys/lvm/md_stripe.h>
     52 #include <sys/lvm/md_mirror.h>
     53 
     54 #include <sys/model.h>
     55 
     56 #include <sys/sysevent/eventdefs.h>
     57 #include <sys/sysevent/svm.h>
     58 #include <sys/lvm/mdmn_commd.h>
     59 
     60 extern int		md_status;
     61 extern kmutex_t		md_mx;
     62 extern kcondvar_t	md_cv;
     63 
     64 extern unit_t		md_nunits;
     65 extern set_t		md_nsets;
     66 extern md_set_t		md_set[];
     67 
     68 extern md_ops_t		mirror_md_ops;
     69 extern int		md_ioctl_cnt;
     70 extern md_krwlock_t	md_unit_array_rw;
     71 extern major_t		md_major;
     72 extern mdq_anchor_t	md_ff_daemonq;
     73 extern void		md_probe_one();
     74 extern void		mirror_openfail_console_info();
     75 
     76 #ifdef DEBUG
     77 extern int		mirror_debug_flag;
     78 #endif
     79 
     80 static void
     81 mirror_resume_writes(mm_unit_t *un)
     82 {
     83 	/*
     84 	 * Release the block on writes to the mirror and resume any blocked
     85 	 * resync thread.
     86 	 * This is only required for MN sets
     87 	 */
     88 	if (MD_MNSET_SETNO(MD_UN2SET(un))) {
     89 #ifdef DEBUG
     90 		if (mirror_debug_flag)
     91 			printf("mirror_resume_writes: mnum %x\n", MD_SID(un));
     92 #endif
     93 		mutex_enter(&un->un_suspend_wr_mx);
     94 		un->un_suspend_wr_flag = 0;
     95 		cv_broadcast(&un->un_suspend_wr_cv);
     96 		mutex_exit(&un->un_suspend_wr_mx);
     97 		mutex_enter(&un->un_rs_thread_mx);
     98 		un->un_rs_thread_flags &= ~MD_RI_BLOCK;
     99 		cv_signal(&un->un_rs_thread_cv);
    100 		mutex_exit(&un->un_rs_thread_mx);
    101 	}
    102 }
    103 
    104 mm_unit_t *
    105 mirror_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock)
    106 {
    107 	mm_unit_t	*un;
    108 	mdi_unit_t	*ui;
    109 	set_t		setno = MD_MIN2SET(mnum);
    110 
    111 	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
    112 		(void) mdmderror(mde, MDE_INVAL_UNIT, mnum);
    113 		return (NULL);
    114 	}
    115 
    116 	if (!(flags & STALE_OK)) {
    117 		if (md_get_setstatus(setno) & MD_SET_STALE) {
    118 			(void) mdmddberror(mde, MDE_DB_STALE, mnum, setno);
    119 			return (NULL);
    120 		}
    121 	}
    122 
    123 	ui = MDI_UNIT(mnum);
    124 	if (flags & NO_OLD) {
    125 		if (ui != NULL) {
    126 			(void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum);
    127 			return (NULL);
    128 		}
    129 		return ((mm_unit_t *)1);
    130 	}
    131 
    132 	if (ui == NULL) {
    133 		(void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum);
    134 		return (NULL);
    135 	}
    136 
    137 	if (flags & ARRAY_WRITER)
    138 		md_array_writer(lock);
    139 	else if (flags & ARRAY_READER)
    140 		md_array_reader(lock);
    141 
    142 	if (!(flags & NO_LOCK)) {
    143 		if (flags & WR_LOCK)
    144 			(void) md_ioctl_writerlock(lock, ui);
    145 		else /* RD_LOCK */
    146 			(void) md_ioctl_readerlock(lock, ui);
    147 	}
    148 	un = (mm_unit_t *)MD_UNIT(mnum);
    149 
    150 	if (un->c.un_type != MD_METAMIRROR) {
    151 		(void) mdmderror(mde, MDE_NOT_MM, mnum);
    152 		return (NULL);
    153 	}
    154 
    155 	return (un);
    156 }
    157 
    158 static int
    159 mirror_set(
    160 	void		*d,
    161 	int		mode
    162 )
    163 {
    164 	minor_t		mnum;
    165 	mm_unit_t	*un;
    166 	mddb_recid_t	recid;
    167 	mddb_type_t	typ1;
    168 	int		err;
    169 	int		i;
    170 	set_t		setno;
    171 	md_set_params_t	*msp = d;
    172 
    173 
    174 	mnum = msp->mnum;
    175 
    176 	mdclrerror(&msp->mde);
    177 
    178 	if (mirror_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL)
    179 		return (0);
    180 
    181 	setno = MD_MIN2SET(mnum);
    182 
    183 	typ1 = (mddb_type_t)md_getshared_key(setno,
    184 	    mirror_md_ops.md_driver.md_drivername);
    185 
    186 	/*
    187 	 * Create the db record for this mdstruct
    188 	 * We don't store incore elements ondisk
    189 	 */
    190 
    191 	if (msp->options & MD_CRO_64BIT) {
    192 #if defined(_ILP32)
    193 		return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum));
    194 #else
    195 		recid = mddb_createrec((size_t)msp->size, typ1, MIRROR_REC,
    196 		    MD_CRO_64BIT | MD_CRO_MIRROR | MD_CRO_FN, setno);
    197 #endif
    198 	} else {
    199 		/*
    200 		 * It's important to use the correct size here
    201 		 */
    202 		msp->size = sizeof (mm_unit32_od_t);
    203 		recid = mddb_createrec((size_t)msp->size, typ1, MIRROR_REC,
    204 		    MD_CRO_32BIT | MD_CRO_MIRROR | MD_CRO_FN, setno);
    205 	}
    206 	if (recid < 0)
    207 		return (mddbstatus2error(&msp->mde, (int)recid,
    208 		    mnum, setno));
    209 
    210 	/* Resize to include incore fields */
    211 	un = (mm_unit_t *)mddb_getrecaddr_resize(recid, sizeof (*un), 0);
    212 	/*
    213 	 * It is okay that we muck with the mdstruct here,
    214 	 * since no one else will know about the mdstruct
    215 	 * until we commit it. If we crash, the record will
    216 	 * be automatically purged, since we haven't
    217 	 * committed it yet.
    218 	 */
    219 
    220 	/* copy in the user's mdstruct */
    221 	if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un,
    222 	    (uint_t)msp->size, mode)) {
    223 		mddb_deleterec_wrapper(recid);
    224 		return (EFAULT);
    225 	}
    226 	/* All 64 bit metadevices only support EFI labels. */
    227 	if (msp->options & MD_CRO_64BIT) {
    228 		un->c.un_flag |= MD_EFILABEL;
    229 	}
    230 
    231 	un->c.un_revision |= MD_FN_META_DEV;
    232 	MD_RECID(un)	= recid;
    233 	MD_CAPAB(un)	= MD_CAN_PARENT | MD_CAN_META_CHILD | MD_CAN_SP;
    234 	MD_PARENT(un)	= MD_NO_PARENT;
    235 
    236 	for (i = 0; i < NMIRROR; i++) {
    237 		struct mm_submirror	*sm;
    238 
    239 		sm = &un->un_sm[i];
    240 		if (!SMS_IS(sm, SMS_INUSE))
    241 			continue;
    242 
    243 		/* ensure that the submirror is a metadevice */
    244 		if (md_getmajor(sm->sm_dev) != md_major)
    245 			return (mdmderror(&msp->mde, MDE_INVAL_UNIT,
    246 			    md_getminor(sm->sm_dev)));
    247 
    248 		if (md_get_parent(sm->sm_dev) == MD_NO_PARENT)
    249 			continue;
    250 
    251 		/* mirror creation should fail here */
    252 		md_nblocks_set(mnum, -1ULL);
    253 		MD_UNIT(mnum) = NULL;
    254 
    255 		mddb_deleterec_wrapper(recid);
    256 		return (mdmderror(&msp->mde, MDE_IN_USE,
    257 		    md_getminor(sm->sm_dev)));
    258 	}
    259 
    260 	if (err = mirror_build_incore(un, 0)) {
    261 		md_nblocks_set(mnum, -1ULL);
    262 		MD_UNIT(mnum) = NULL;
    263 
    264 		mddb_deleterec_wrapper(recid);
    265 		return (err);
    266 	}
    267 
    268 	/*
    269 	 * Update unit availability
    270 	 */
    271 	md_set[setno].s_un_avail--;
    272 
    273 	mirror_commit(un, ALL_SUBMIRRORS, 0);
    274 	md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
    275 	mirror_check_failfast(mnum);
    276 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno,
    277 	    MD_SID(un));
    278 
    279 	resync_start_timeout(setno);
    280 	return (0);
    281 }
    282 
    283 static int
    284 mirror_get(
    285 	void		*migp,
    286 	int		mode,
    287 	IOLOCK		*lock
    288 )
    289 {
    290 	mm_unit_t	*un;
    291 	md_i_get_t	*migph = migp;
    292 
    293 	mdclrerror(&migph->mde);
    294 
    295 	if ((un = mirror_getun(migph->id, &migph->mde, RD_LOCK, lock)) == NULL)
    296 		return (0);
    297 
    298 	if (migph->size == 0) {
    299 		migph->size = un->c.un_size;
    300 		return (0);
    301 	}
    302 
    303 	if (migph->size < un->c.un_size) {
    304 		return (EFAULT);
    305 	}
    306 	if (ddi_copyout(un, (caddr_t)(uintptr_t)migph->mdp,
    307 	    un->c.un_size, mode))
    308 		return (EFAULT);
    309 	return (0);
    310 }
    311 
    312 static int
    313 mirror_getdevs(
    314 	void			*mgdp,
    315 	int			mode,
    316 	IOLOCK			*lock
    317 )
    318 {
    319 	mm_unit_t		*un;
    320 	md_dev64_t		*udevs;
    321 	int			cnt;
    322 	int			i;
    323 	md_dev64_t		unit_dev;
    324 	md_getdevs_params_t	*mgdph = mgdp;
    325 
    326 
    327 	mdclrerror(&mgdph->mde);
    328 
    329 	if ((un = mirror_getun(mgdph->mnum,
    330 	    &mgdph->mde, RD_LOCK, lock)) == NULL)
    331 		return (0);
    332 
    333 	udevs = (md_dev64_t *)(uintptr_t)mgdph->devs;
    334 
    335 	for (cnt = 0, i = 0; i < NMIRROR; i++) {
    336 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
    337 			continue;
    338 		if (cnt < mgdph->cnt) {
    339 			unit_dev = un->un_sm[i].sm_dev;
    340 			if (md_getmajor(unit_dev) != md_major) {
    341 				unit_dev = md_xlate_mini_2_targ(unit_dev);
    342 				if (unit_dev == NODEV64)
    343 					return (ENODEV);
    344 			}
    345 
    346 			if (ddi_copyout((caddr_t)&unit_dev, (caddr_t)udevs,
    347 			    sizeof (*udevs), mode) != 0)
    348 				return (EFAULT);
    349 			++udevs;
    350 		}
    351 		++cnt;
    352 	}
    353 
    354 	mgdph->cnt = cnt;
    355 	return (0);
    356 }
    357 
    358 static int
    359 mirror_reset(
    360 	md_i_reset_t	*mirp
    361 )
    362 {
    363 	minor_t		mnum = mirp->mnum;
    364 	mm_unit_t	*un;
    365 	mdi_unit_t	*ui;
    366 	set_t		setno = MD_MIN2SET(mnum);
    367 
    368 	mdclrerror(&mirp->mde);
    369 
    370 	if ((un = mirror_getun(mnum, &mirp->mde, NO_LOCK, NULL)) == NULL)
    371 		return (0);
    372 
    373 	if (MD_HAS_PARENT(un->c.un_parent)) {
    374 		return (mdmderror(&mirp->mde, MDE_IN_USE, mnum));
    375 	}
    376 
    377 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
    378 
    379 	/* single thread */
    380 	ui = MDI_UNIT(mnum);
    381 	(void) md_unit_openclose_enter(ui);
    382 
    383 	if (md_unit_isopen(ui)) {
    384 		md_unit_openclose_exit(ui);
    385 		rw_exit(&md_unit_array_rw.lock);
    386 		return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum));
    387 	}
    388 
    389 	md_unit_openclose_exit(ui);
    390 
    391 	if (!mirp->force) {
    392 		int	smi;
    393 		for (smi = 0; smi < NMIRROR; smi++) {
    394 			if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
    395 				continue;
    396 
    397 			if (!SMS_BY_INDEX_IS(un, smi, SMS_RUNNING)) {
    398 				rw_exit(&md_unit_array_rw.lock);
    399 				return (mdmderror(&mirp->mde,
    400 				    MDE_C_WITH_INVAL_SM, mnum));
    401 			}
    402 		}
    403 	}
    404 
    405 	reset_mirror(un, mnum, 1);
    406 
    407 	/*
    408 	 * Update unit availability
    409 	 */
    410 	md_set[setno].s_un_avail++;
    411 
    412 	/*
    413 	 * If MN set, reset s_un_next so all nodes can have
    414 	 * the same view of the next available slot when
    415 	 * nodes are -w and -j
    416 	 */
    417 	if (MD_MNSET_SETNO(setno)) {
    418 		(void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));
    419 	}
    420 
    421 	rw_exit(&md_unit_array_rw.lock);
    422 	return (0);
    423 }
    424 
    425 static int
    426 mirror_get_geom(
    427 	mm_unit_t	*un,
    428 	struct dk_geom	*geomp
    429 )
    430 {
    431 	md_get_geom((md_unit_t *)un, geomp);
    432 
    433 	return (0);
    434 }
    435 
    436 static int
    437 mirror_get_vtoc(
    438 	mm_unit_t	*un,
    439 	struct vtoc	*vtocp
    440 )
    441 {
    442 	md_get_vtoc((md_unit_t *)un, vtocp);
    443 
    444 	return (0);
    445 }
    446 
    447 static int
    448 mirror_set_vtoc(
    449 	mm_unit_t	*un,
    450 	struct vtoc	*vtocp
    451 )
    452 {
    453 	return (md_set_vtoc((md_unit_t *)un, vtocp));
    454 }
    455 
    456 static int
    457 mirror_get_extvtoc(
    458 	mm_unit_t	*un,
    459 	struct extvtoc	*vtocp
    460 )
    461 {
    462 	md_get_extvtoc((md_unit_t *)un, vtocp);
    463 
    464 	return (0);
    465 }
    466 
    467 static int
    468 mirror_set_extvtoc(
    469 	mm_unit_t	*un,
    470 	struct extvtoc	*vtocp
    471 )
    472 {
    473 	return (md_set_extvtoc((md_unit_t *)un, vtocp));
    474 }
    475 
    476 static int
    477 mirror_get_cgapart(
    478 	mm_unit_t	*un,
    479 	struct dk_map	*dkmapp
    480 )
    481 {
    482 	md_get_cgapart((md_unit_t *)un, dkmapp);
    483 	return (0);
    484 }
    485 
    486 static int
    487 mirror_getcomp_by_dev(mm_unit_t *un, replace_params_t *params,
    488     int *smi, int *cip)
    489 {
    490 	mm_submirror_t		*sm;
    491 	mm_submirror_ic_t	*smic;
    492 	ms_comp_t		*comp;
    493 	ms_unit_t		*mous;
    494 	int			ci;
    495 	int			i;
    496 	int			compcnt;
    497 	ms_cd_info_t		cd;
    498 	void			(*get_dev)();
    499 	md_dev64_t		dev = md_expldev(params->old_dev);
    500 	md_error_t		*ep = &params->mde;
    501 	minor_t			mnum = params->mnum;
    502 	mdkey_t			devkey;
    503 	int			nkeys;
    504 	set_t			setno;
    505 	side_t			side;
    506 
    507 	setno = MD_MIN2SET(MD_SID(un));
    508 	side = mddb_getsidenum(setno);
    509 
    510 	if (md_getkeyfromdev(setno, side, dev, &devkey, &nkeys) != 0)
    511 		return (mddeverror(ep, MDE_NAME_SPACE, dev));
    512 
    513 	for (i = 0; i < NMIRROR; i++) {
    514 		sm = &un->un_sm[i];
    515 		smic = &un->un_smic[i];
    516 
    517 		if (!SMS_IS(sm, SMS_INUSE))
    518 			continue;
    519 
    520 		get_dev =
    521 		    (void (*)())md_get_named_service(sm->sm_dev, 0,
    522 		    "get device", 0);
    523 		compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
    524 
    525 		/*
    526 		 * For each of the underlying stripe components get
    527 		 * the info.
    528 		 */
    529 		for (ci = 0; ci < compcnt; ci++) {
    530 			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
    531 			if ((cd.cd_dev == dev) || (cd.cd_orig_dev == dev)) {
    532 				*cip = ci;
    533 				*smi = i;
    534 				return (1);
    535 			}
    536 		}
    537 
    538 		/*
    539 		 * now we rescan looking only for NODEV. If we find
    540 		 * NODEV then we will check the keys to see if its a match.
    541 		 *
    542 		 * If no key was found to match dev, then there is
    543 		 * no way to compare keys - so continue.
    544 		 */
    545 		if (nkeys == 0) {
    546 			continue;
    547 		}
    548 		mous = MD_UNIT(md_getminor(sm->sm_dev));
    549 
    550 		for (ci = 0; ci < compcnt; ci++) {
    551 
    552 			comp = (struct ms_comp *)
    553 			    ((void *)&((char *)mous)[mous->un_ocomp]);
    554 
    555 			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
    556 
    557 			if (cd.cd_dev == NODEV64 || cd.cd_orig_dev == NODEV64) {
    558 				comp += ci;
    559 				if (comp->un_key == devkey) {
    560 					if (nkeys > 1) {
    561 						return (mddeverror(
    562 						    ep, MDE_MULTNM, dev));
    563 					}
    564 					*cip = ci;
    565 					*smi = i;
    566 					return (1);
    567 				}
    568 			}
    569 		}
    570 	}
    571 	return (mdcomperror(ep, MDE_CANT_FIND_COMP, mnum, dev));
    572 }
    573 
    574 /*
    575  * comp_replace:
    576  * ----------------
    577  * Called to implement the component replace function
    578  *
    579  * Owner is returned in the parameter block passed in by the caller.
    580  *
    581  * Returns:
    582  *	0	success
    583  *	error code if the functions fails
    584  *
    585  * For a MN set, on entry all writes to the mirror are suspended, on exit
    586  * from this function, writes must be resumed when not a dryrun.
    587  */
    588 static int
    589 comp_replace(
    590 	replace_params_t	*params,
    591 	IOLOCK			*lock
    592 )
    593 {
    594 	minor_t			mnum = params->mnum;
    595 	set_t			setno;
    596 	side_t			side;
    597 	mm_unit_t		*un;
    598 	mdi_unit_t		*ui;
    599 	ms_unit_t		*ms_un;
    600 	mdi_unit_t		*ms_ui;
    601 	ms_comp_t		*comp;
    602 	mm_submirror_t		*sm;
    603 	md_dev64_t		smdev;
    604 	mddb_recid_t		recids[6]; /* recids for stripe on SP */
    605 	int			smi, ci;
    606 	ms_new_dev_t		nd;
    607 	int			(*repl_dev)();
    608 	void			(*repl_done)();
    609 	void			*repl_data;
    610 	int			err = 0;
    611 	ms_cd_info_t		cd;
    612 	void			(*get_dev)();
    613 
    614 	mdclrerror(&params->mde);
    615 
    616 	if ((un = mirror_getun(mnum, &params->mde, WRITERS, lock)) == NULL) {
    617 		return (0);
    618 	}
    619 
    620 	ui = MDI_UNIT(mnum);
    621 	if (ui->ui_tstate & MD_INACCESSIBLE) {
    622 		(void) mdmderror(&params->mde, MDE_IN_UNAVAIL_STATE, mnum);
    623 		goto errexit;
    624 	}
    625 
    626 	/*
    627 	 * replace cannot be done while a resync is active or we are
    628 	 * still waiting for an optimized resync to be started
    629 	 */
    630 	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
    631 		(void) mdmderror(&params->mde, MDE_RESYNC_ACTIVE, mnum);
    632 		goto errexit;
    633 	}
    634 
    635 	if (mirror_getcomp_by_dev(un, params, &smi, &ci) == 0) {
    636 		goto errexit;
    637 	}
    638 
    639 	if (un->un_nsm == 1) {
    640 		(void) mdmderror(&params->mde, MDE_LAST_SM_RE, mnum);
    641 		goto errexit;
    642 	}
    643 
    644 	if (mirror_other_sources(un, smi, ci, 0) != 0) {
    645 		(void) mdcomperror(&params->mde, MDE_REPL_INVAL_STATE,
    646 		    mnum, md_expldev(params->old_dev));
    647 		goto errexit;
    648 	}
    649 
    650 	sm = &un->un_sm[smi];
    651 	if (sm->sm_state & (SMS_OFFLINE | SMS_OFFLINE_RESYNC)) {
    652 		(void) mdmderror(&params->mde, MDE_ILLEGAL_SM_STATE, mnum);
    653 		goto errexit;
    654 	}
    655 
    656 	get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
    657 	    "get device", 0);
    658 	(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
    659 
    660 	repl_dev = (int (*)())md_get_named_service(sm->sm_dev, 0,
    661 	    "replace device", 0);
    662 
    663 	smdev = sm->sm_dev;
    664 	ms_un = MD_UNIT(md_getminor(smdev));
    665 
    666 	if (params->cmd == ENABLE_COMP) {
    667 		md_dev64_t	this_dev;
    668 		int		numkeys;
    669 		mdkey_t		this_key;
    670 
    671 		this_dev = ((cd.cd_orig_dev == 0) ? cd.cd_dev :
    672 		    cd.cd_orig_dev);
    673 		setno = MD_MIN2SET(md_getminor(smdev));
    674 		side = mddb_getsidenum(setno);
    675 		comp = (struct ms_comp *)
    676 		    ((void *)&((char *)ms_un)[ms_un->un_ocomp]);
    677 		comp += ci;
    678 		/*
    679 		 * We trust the dev_t because we cannot determine the
    680 		 * dev_t from the device id since a new disk is in the
    681 		 * same location. Since this is a call from metareplace -e dx
    682 		 * AND it is SCSI a new dev_t is not generated.  So the
    683 		 * dev_t from the mddb is used. Before enabling the device
    684 		 * we check to make sure that multiple entries for the same
    685 		 * device does not exist in the namespace. If they do we
    686 		 * fail the ioctl.
    687 		 * One of the many ways multiple entries in the name space
    688 		 * can occur is if one removed the failed component in the
    689 		 * stripe of a mirror and put another disk that was part of
    690 		 * another metadevice. After reboot metadevadm would correctly
    691 		 * update the device name for the metadevice whose component
    692 		 * has moved. However now in the metadb there are two entries
    693 		 * for the same name (ctds) that belong to different
    694 		 * metadevices. One is valid, the other is a ghost or "last
    695 		 * know as" ctds.
    696 		 */
    697 		this_dev =  md_getdevnum(setno, side,
    698 		    comp->un_key, MD_TRUST_DEVT);
    699 
    700 		/*
    701 		 * Verify that multiple keys for the same
    702 		 * dev_t don't exist
    703 		 */
    704 
    705 		if (md_getkeyfromdev(setno, side, this_dev,
    706 		    &this_key, &numkeys) != 0) {
    707 			(void) mddeverror(&params->mde, MDE_NAME_SPACE,
    708 			    md_expldev(params->old_dev));
    709 			goto errexit;
    710 		}
    711 		/*
    712 		 * Namespace has multiple entries
    713 		 * for the same devt
    714 		 */
    715 		if (numkeys > 1) {
    716 			(void) mddeverror(&params->mde, MDE_MULTNM,
    717 			    md_expldev(params->old_dev));
    718 			goto errexit;
    719 		}
    720 		if ((numkeys == 0) || (comp->un_key != this_key)) {
    721 			(void) mdcomperror(&params->mde, MDE_CANT_FIND_COMP,
    722 			    mnum, this_dev);
    723 			goto errexit;
    724 		}
    725 
    726 		if ((md_getmajor(this_dev) != md_major) &&
    727 		    (md_devid_found(setno, side, this_key) == 1)) {
    728 			if (md_update_namespace_did(setno, side,
    729 			    this_key, &params->mde) != 0) {
    730 				(void) mddeverror(&params->mde, MDE_NAME_SPACE,
    731 				    this_dev);
    732 				goto errexit;
    733 			}
    734 		}
    735 
    736 		if (md_expldev(params->new_dev) != this_dev) {
    737 			(void) mddeverror(&params->mde, MDE_FIX_INVAL_STATE,
    738 			    md_expldev(params->new_dev));
    739 			goto errexit;
    740 		}
    741 
    742 		/* in case of dryrun, don't actually do anything */
    743 		if ((params->options & MDIOCTL_DRYRUN) == 0) {
    744 			err = (*repl_dev)(sm->sm_dev, 0, ci, NULL, recids, 6,
    745 			    &repl_done, &repl_data);
    746 		}
    747 	} else if ((params->options & MDIOCTL_DRYRUN) == 0) {
    748 		nd.nd_dev = md_expldev(params->new_dev);
    749 		nd.nd_key = params->new_key;
    750 		nd.nd_start_blk = params->start_blk;
    751 		nd.nd_nblks = params->number_blks;
    752 		nd.nd_labeled = params->has_label;
    753 		nd.nd_hs_id = 0;
    754 
    755 		err = (*repl_dev)(sm->sm_dev, 0, ci, &nd, recids, 6,
    756 		    &repl_done, &repl_data);
    757 
    758 	}
    759 
    760 	if (err != 0) {
    761 		(void) mdcomperror(&params->mde, err, mnum,
    762 		    md_expldev(params->new_dev));
    763 		goto errexit;
    764 	}
    765 	/* In case of a dryun we're done. */
    766 	if (params->options & MDIOCTL_DRYRUN) {
    767 		mdclrerror(&params->mde);
    768 		return (0);
    769 	}
    770 
    771 	/* set_sm_comp_state() commits the modified records */
    772 	set_sm_comp_state(un, smi, ci, CS_RESYNC, recids, MD_STATE_NO_XMIT,
    773 	    lock);
    774 
    775 	(*repl_done)(sm->sm_dev, repl_data);
    776 
    777 	/*
    778 	 * If the mirror is open then need to make sure that the submirror,
    779 	 * on which the replace ran, is also open and if not then open it.
    780 	 * This is only a concern for a single component sub-mirror stripe
    781 	 * as it may not be open due to the failure of the single component.
    782 	 *
    783 	 * This check has to be done after the call to (*repl_done)
    784 	 * as that function releases the writer lock on the submirror.
    785 	 */
    786 	if (md_unit_isopen(ui)) {
    787 		minor_t ms_mnum = md_getminor(sm->sm_dev);
    788 
    789 		ms_ui = MDI_UNIT(ms_mnum);
    790 
    791 		if (!md_unit_isopen(ms_ui)) {
    792 			/*
    793 			 * Underlying submirror is not open so open it.
    794 			 */
    795 			if (md_layered_open(ms_mnum, &smdev, MD_OFLG_NULL)) {
    796 				mirror_openfail_console_info(un, smi, ci);
    797 				goto errexit;
    798 			}
    799 		}
    800 	}
    801 
    802 	mirror_check_failfast(mnum);
    803 
    804 	if (params->cmd == ENABLE_COMP) {
    805 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE,
    806 		    MD_UN2SET(un), MD_SID(un));
    807 	} else {
    808 		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE,
    809 		    MD_UN2SET(un), MD_SID(un));
    810 	}
    811 
    812 	md_ioctl_writerexit(lock);
    813 	/*
    814 	 * Reset any saved resync location flags as we've now replaced the
    815 	 * component. This means we have to resync the _whole_ component.
    816 	 */
    817 	un->un_rs_resync_done = un->un_rs_resync_2_do = 0;
    818 	un->un_rs_type = MD_RS_NONE;
    819 	mirror_resume_writes(un);
    820 	if (!MD_MNSET_SETNO(MD_UN2SET(un)))
    821 		(void) mirror_resync_unit(mnum, NULL, &params->mde, lock);
    822 	mdclrerror(&params->mde);
    823 	return (0);
    824 errexit:
    825 	/* We need to resume writes unless this is a dryrun */
    826 	if (!(params->options & MDIOCTL_DRYRUN))
    827 		mirror_resume_writes(un);
    828 	return (0);
    829 }
    830 
    831 /*
    832  * mirror_attach:
    833  * ----------------
    834  * Called to implement the submirror attach function
    835  *
    836  * Owner is returned in the parameter block passed in by the caller.
    837  *
    838  * Returns:
    839  *	0	success
    840  *	error code if the functions fails
    841  *
    842  * For a MN set, on entry all writes to the mirror are suspended, on exit
    843  * from this function, writes must be resumed when not a dryrun.
    844  */
    845 static int
    846 mirror_attach(
    847 	md_att_struct_t	*att,
    848 	IOLOCK		*lock
    849 )
    850 {
    851 	minor_t			mnum = att->mnum;
    852 	mm_unit_t		*un;
    853 	md_unit_t		*su;
    854 	mm_submirror_t		*sm;
    855 	mm_submirror_ic_t	*smic;
    856 	int			smi;
    857 	md_dev64_t		sm_dev;
    858 	minor_t			sm_mnum;
    859 	mdkey_t			indx;
    860 	set_t			setno;
    861 	uint_t			options;
    862 
    863 	/*
    864 	 * This routine should not be called during upgrade.
    865 	 */
    866 	if (MD_UPGRADE)  {
    867 		return (0);
    868 	}
    869 
    870 	mdclrerror(&att->mde);
    871 	options = att->options;
    872 
    873 	if ((un = mirror_getun(mnum, &att->mde, WRITERS, lock)) == NULL) {
    874 		return (0);
    875 	}
    876 
    877 	setno = MD_UN2SET(un);
    878 
    879 	for (smi = 0; smi < NMIRROR; smi++)
    880 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
    881 			break;
    882 
    883 	if (smi == NMIRROR) {
    884 		(void) mdmderror(&att->mde, MDE_MIRROR_FULL, mnum);
    885 		goto errexit;
    886 	}
    887 
    888 	sm = &un->un_sm[smi];
    889 	smic = &un->un_smic[smi];
    890 	sm_dev = att->submirror;
    891 	sm_mnum = md_getminor(sm_dev);
    892 
    893 	if (md_get_parent(sm_dev) != MD_NO_PARENT) {
    894 		(void) mdmderror(&att->mde, MDE_IN_USE, sm_mnum);
    895 		goto errexit;
    896 	}
    897 
    898 	if (md_unit_isopen(MDI_UNIT(sm_mnum))) {
    899 		(void) mdmderror(&att->mde, MDE_IS_OPEN, sm_mnum);
    900 		goto errexit;
    901 	}
    902 
    903 	/* Check the size */
    904 	su = (md_unit_t *)MD_UNIT(sm_mnum);
    905 	if (un->c.un_total_blocks > su->c.un_total_blocks) {
    906 		(void) mdmderror(&att->mde, MDE_SM_TOO_SMALL, sm_mnum);
    907 		goto errexit;
    908 	}
    909 
    910 	/* Don't attach labeled sm to unlabeled mirrors */
    911 	if ((su->c.un_flag & MD_LABELED) && !(un->c.un_flag & MD_LABELED)) {
    912 		(void) mdmderror(&att->mde, MDE_NO_LABELED_SM, sm_mnum);
    913 		goto errexit;
    914 	}
    915 
    916 	indx = md_setshared_name(setno,
    917 	    ddi_major_to_name(md_getmajor(sm_dev)), 0L);
    918 
    919 	/* Open the sm, only if the mirror is open */
    920 	if (md_unit_isopen(MDI_UNIT(mnum))) {
    921 		if (md_layered_open(mnum, &sm_dev, MD_OFLG_NULL)) {
    922 			(void) md_remshared_name(setno, indx);
    923 			(void) mdmderror(&att->mde, MDE_SM_OPEN_ERR,
    924 			    md_getminor(att->submirror));
    925 			goto errexit;
    926 		}
    927 		/* in dryrun mode, don't leave the device open */
    928 		if (options & MDIOCTL_DRYRUN) {
    929 			md_layered_close(sm_dev, MD_OFLG_NULL);
    930 		}
    931 	}
    932 
    933 	/*
    934 	 * After this point the checks are done and action is taken.
    935 	 * So, clean up and return in case of dryrun.
    936 	 */
    937 
    938 	if (options & MDIOCTL_DRYRUN) {
    939 		md_ioctl_writerexit(lock);
    940 		mdclrerror(&att->mde);
    941 		return (0);
    942 	}
    943 
    944 	sm->sm_key = att->key;
    945 	sm->sm_dev = sm_dev;
    946 	md_set_parent(sm_dev, MD_SID(un));
    947 	mirror_set_sm_state(sm, smic, SMS_ATTACHED_RESYNC, 1);
    948 	build_submirror(un, smi, 0);
    949 	un->un_nsm++;
    950 	mirror_commit(un, SMI2BIT(smi), 0);
    951 	mirror_check_failfast(mnum);
    952 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_ATTACH, SVM_TAG_METADEVICE,
    953 	    MD_UN2SET(un), MD_SID(un));
    954 
    955 	mirror_resume_writes(un);
    956 	md_ioctl_writerexit(lock);
    957 	if (!MD_MNSET_SETNO(setno))
    958 		(void) mirror_resync_unit(mnum, NULL, &att->mde, lock);
    959 	mdclrerror(&att->mde);
    960 	return (0);
    961 errexit:
    962 	/* We need to resume writes unless this is a dryrun */
    963 	if (!(options & MDIOCTL_DRYRUN))
    964 		mirror_resume_writes(un);
    965 	return (0);
    966 }
    967 
    968 
    969 void
    970 reset_comp_states(mm_submirror_t *sm, mm_submirror_ic_t *smic)
    971 {
    972 	int		compcnt;
    973 	int		i;
    974 	md_m_shared_t	*shared;
    975 
    976 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
    977 	for (i = 0; i < compcnt; i++) {
    978 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
    979 		    (sm->sm_dev, sm, i);
    980 
    981 		shared->ms_state = CS_OKAY;
    982 		shared->ms_flags &= ~MDM_S_NOWRITE;
    983 		shared->ms_lasterrcnt = 0;
    984 	}
    985 }
    986 
    987 
    988 /*
    989  * mirror_detach:
    990  * ----------------
    991  * Called to implement the submirror detach function
    992  *
    993  * Owner is returned in the parameter block passed in by the caller.
    994  *
    995  * Returns:
    996  *	0	success
    997  *	error code if the functions fails
    998  *
    999  * For a MN set, on entry all writes to the mirror are suspended, on exit
   1000  * from this function, writes must be resumed.
   1001  */
   1002 static int
   1003 mirror_detach(
   1004 	md_detach_params_t	*det,
   1005 	IOLOCK			*lock
   1006 )
   1007 {
   1008 	minor_t			mnum = det->mnum;
   1009 	mm_unit_t		*un;
   1010 	mdi_unit_t		*ui;
   1011 	mm_submirror_t		*sm;
   1012 	mm_submirror_t		*old_sm;
   1013 	mm_submirror_t		*new_sm;
   1014 	mm_submirror_ic_t	*smic;
   1015 	int			smi;
   1016 	md_dev64_t		sm_dev;
   1017 	md_unit_t		*su;
   1018 	sv_dev_t		sv;
   1019 	mddb_recid_t		recids[2];
   1020 	int			nsv = 0;
   1021 	int			smi_remove;
   1022 	mm_submirror_ic_t	*old_smic;
   1023 	mm_submirror_ic_t	*new_smic;
   1024 
   1025 	mdclrerror(&det->mde);
   1026 
   1027 	if ((un = mirror_getun(mnum, &det->mde, WRITERS, lock)) == NULL) {
   1028 		return (0);
   1029 	}
   1030 
   1031 	ui = MDI_UNIT(mnum);
   1032 	if (ui->ui_tstate & MD_INACCESSIBLE) {
   1033 		mirror_resume_writes(un);
   1034 		return (mdmderror(&det->mde, MDE_IN_UNAVAIL_STATE, mnum));
   1035 	}
   1036 	/*
   1037 	 * detach cannot be done while a resync is active or we are
   1038 	 * still waiting for an optimized resync to be started
   1039 	 */
   1040 	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
   1041 		mirror_resume_writes(un);
   1042 		return (mdmderror(&det->mde, MDE_RESYNC_ACTIVE, mnum));
   1043 	}
   1044 
   1045 	for (smi = 0; smi < NMIRROR; smi++) {
   1046 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
   1047 			continue;
   1048 		}
   1049 		if (un->un_sm[smi].sm_dev == det->submirror) {
   1050 			smi_remove = smi;
   1051 			break;
   1052 		}
   1053 	}
   1054 
   1055 	if (smi == NMIRROR) {
   1056 		mirror_resume_writes(un);
   1057 		return (mdmderror(&det->mde, MDE_CANT_FIND_SM, mnum));
   1058 	}
   1059 
   1060 	if (un->un_nsm == 1) {
   1061 		mirror_resume_writes(un);
   1062 		return (mdmderror(&det->mde, MDE_LAST_SM, mnum));
   1063 	}
   1064 
   1065 	if (mirror_other_sources(un, smi, WHOLE_SM, 0) != 0) {
   1066 		mirror_resume_writes(un);
   1067 		return (mdmderror(&det->mde, MDE_NO_READABLE_SM, mnum));
   1068 	}
   1069 
   1070 	sm = &un->un_sm[smi];
   1071 	smic = &un->un_smic[smi];
   1072 	sm_dev = sm->sm_dev;
   1073 	su = (md_unit_t *)MD_UNIT(md_getminor(sm_dev));
   1074 
   1075 	/*
   1076 	 * Need to pass in the extra record id,
   1077 	 * cause mirror_commit() will not commit
   1078 	 * a sm (from the smmask) if the slot is unused.
   1079 	 * Which it is, since we are detaching.
   1080 	 */
   1081 	recids[0] = ((md_unit_t *)MD_UNIT(md_getminor(sm_dev)))->c.un_record_id;
   1082 	recids[1] = 0;
   1083 
   1084 	mirror_set_sm_state(sm, smic, SMS_UNUSED, det->force_detach);
   1085 	/*
   1086 	 * If there are any erred components
   1087 	 * then make the detach fail and do not unparent the
   1088 	 * submirror.
   1089 	 */
   1090 	if (sm->sm_state == SMS_UNUSED) {
   1091 		/* reallow soft partitioning of submirror */
   1092 		MD_CAPAB(su) |= MD_CAN_SP;
   1093 		md_reset_parent(sm_dev);
   1094 		reset_comp_states(sm, smic);
   1095 		un->un_nsm--;
   1096 		/* Close the sm, only if the mirror is open */
   1097 		if (md_unit_isopen(MDI_UNIT(mnum)))
   1098 			md_layered_close(sm_dev, MD_OFLG_NULL);
   1099 		sv.setno = MD_UN2SET(un);
   1100 		sv.key = sm->sm_key;
   1101 		nsv = 1;
   1102 	} else
   1103 		(void) mdmderror(&det->mde, MDE_SM_FAILED_COMPS, mnum);
   1104 
   1105 	/*
   1106 	 * Perhaps the mirror changed it's size due to this detach.
   1107 	 * (void) mirror_grow_unit(un, &mde);
   1108 	 */
   1109 
   1110 	/*
   1111 	 * NOTE: We are passing the detached sm recid
   1112 	 * and not the smmask field. This is correct.
   1113 	 */
   1114 	mirror_commit(un, 0, recids);
   1115 	md_rem_names(&sv, nsv);
   1116 	if (sm->sm_state == SMS_UNUSED) {
   1117 		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DETACH, SVM_TAG_METADEVICE,
   1118 		    MD_UN2SET(un), MD_SID(un));
   1119 	}
   1120 
   1121 	/*
   1122 	 * Reshuffle the submirror devices in the array as we potentially
   1123 	 * have a dead record in the middle of it.
   1124 	 */
   1125 	for (smi = 0; nsv && (smi < NMIRROR); smi++) {
   1126 		if (smi < smi_remove) {
   1127 			continue;
   1128 		}
   1129 		if (smi > smi_remove) {
   1130 			old_sm = &un->un_sm[smi];
   1131 			new_sm = &un->un_sm[smi - 1];
   1132 			new_sm->sm_key = old_sm->sm_key;
   1133 			new_sm->sm_dev = old_sm->sm_dev;
   1134 			new_sm->sm_state = old_sm->sm_state;
   1135 			new_sm->sm_flags = old_sm->sm_flags;
   1136 			new_sm->sm_shared = old_sm->sm_shared;
   1137 			new_sm->sm_hsp_id = old_sm->sm_hsp_id;
   1138 			new_sm->sm_timestamp = old_sm->sm_timestamp;
   1139 			bzero(old_sm, sizeof (mm_submirror_t));
   1140 			old_smic = &un->un_smic[smi];
   1141 			new_smic = &un->un_smic[smi - 1];
   1142 			bcopy(old_smic, new_smic, sizeof (mm_submirror_ic_t));
   1143 			bzero(old_smic, sizeof (mm_submirror_ic_t));
   1144 		}
   1145 	}
   1146 	mirror_commit(un, 0, NULL);
   1147 	mirror_resume_writes(un);
   1148 	return (0);
   1149 }
   1150 
   1151 /*
   1152  * mirror_offline:
   1153  * ----------------
   1154  * Called to implement the submirror offline function
   1155  *
   1156  * Owner is returned in the parameter block passed in by the caller.
   1157  *
   1158  * Returns:
   1159  *	0	success
   1160  *	error code if the functions fails
   1161  *
   1162  * For a MN set, on entry all writes to the mirror are suspended, on exit
   1163  * from this function, writes must be resumed.
   1164  */
   1165 static int
   1166 mirror_offline(
   1167 	md_i_off_on_t	*miop,
   1168 	IOLOCK		*lock
   1169 )
   1170 {
   1171 	minor_t			mnum = miop->mnum;
   1172 	mm_unit_t		*un;
   1173 	mm_submirror_t		*sm;
   1174 	mm_submirror_ic_t	*smic;
   1175 	int			smi;
   1176 	mdi_unit_t		*ui = MDI_UNIT(mnum);
   1177 
   1178 	mdclrerror(&miop->mde);
   1179 
   1180 	if ((un = mirror_getun(mnum, &miop->mde, WR_LOCK, lock)) == NULL) {
   1181 		return (0);
   1182 	}
   1183 
   1184 	/*
   1185 	 * offline cannot be done while a resync is active or we are
   1186 	 * still waiting for an optimized resync to be started
   1187 	 */
   1188 	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
   1189 		mirror_resume_writes(un);
   1190 		return (mdmderror(&miop->mde, MDE_RESYNC_ACTIVE, mnum));
   1191 	}
   1192 
   1193 	/*
   1194 	 * Reject mirror_offline if ABR is set
   1195 	 */
   1196 	if ((ui->ui_tstate & MD_ABR_CAP) || un->un_abr_count) {
   1197 		mirror_resume_writes(un);
   1198 		return (mderror(&miop->mde, MDE_ABR_SET));
   1199 	}
   1200 
   1201 	for (smi = 0; smi < NMIRROR; smi++) {
   1202 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
   1203 			continue;
   1204 		if (un->un_sm[smi].sm_dev == miop->submirror)
   1205 			break;
   1206 	}
   1207 
   1208 	if (smi == NMIRROR) {
   1209 		mirror_resume_writes(un);
   1210 		return (mdmderror(&miop->mde, MDE_CANT_FIND_SM, mnum));
   1211 	}
   1212 
   1213 	sm = &un->un_sm[smi];
   1214 	smic = &un->un_smic[smi];
   1215 	if (!SMS_IS(sm, SMS_RUNNING) && !miop->force_offline) {
   1216 		mirror_resume_writes(un);
   1217 		return (mdmderror(&miop->mde, MDE_ILLEGAL_SM_STATE, mnum));
   1218 	}
   1219 
   1220 	if (mirror_other_sources(un, smi, WHOLE_SM, 0) != 0) {
   1221 		mirror_resume_writes(un);
   1222 		return (mdmderror(&miop->mde, MDE_NO_READABLE_SM, mnum));
   1223 	}
   1224 	mirror_set_sm_state(sm, smic, SMS_OFFLINE, 1);
   1225 	mirror_resume_writes(un);
   1226 
   1227 	MD_STATUS(un) |= MD_UN_OFFLINE_SM;
   1228 	mirror_commit(un, NO_SUBMIRRORS, 0);
   1229 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OFFLINE, SVM_TAG_METADEVICE,
   1230 	    MD_UN2SET(un), MD_SID(un));
   1231 	return (0);
   1232 }
   1233 
   1234 /*
   1235  * mirror_online:
   1236  * ----------------
   1237  * Called to implement the submirror online function
   1238  *
   1239  * Owner is returned in the parameter block passed in by the caller.
   1240  *
   1241  * Returns:
   1242  *	0	success
   1243  *	error code if the functions fails
   1244  *
   1245  * For a MN set, on entry all writes to the mirror are suspended, on exit
   1246  * from this function, writes must be resumed.
   1247  */
   1248 static int
   1249 mirror_online(
   1250 	md_i_off_on_t	*miop,
   1251 	IOLOCK		*lock
   1252 )
   1253 {
   1254 	minor_t			mnum = miop->mnum;
   1255 	mm_unit_t		*un;
   1256 	mm_submirror_t		*sm;
   1257 	mm_submirror_ic_t	*smic;
   1258 	int			smi;
   1259 	set_t			setno = MD_MIN2SET(mnum);
   1260 
   1261 	mdclrerror(&miop->mde);
   1262 
   1263 	if ((un = mirror_getun(mnum, &miop->mde, WR_LOCK, lock)) == NULL) {
   1264 		return (0);
   1265 	}
   1266 
   1267 	for (smi = 0; smi < NMIRROR; smi++) {
   1268 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
   1269 			continue;
   1270 		if (un->un_sm[smi].sm_dev == miop->submirror)
   1271 			break;
   1272 	}
   1273 	if (smi == NMIRROR) {
   1274 		mirror_resume_writes(un);
   1275 		return (mdmderror(&miop->mde, MDE_CANT_FIND_SM, mnum));
   1276 	}
   1277 
   1278 	sm = &un->un_sm[smi];
   1279 	smic = &un->un_smic[smi];
   1280 	if (!SMS_IS(sm, SMS_OFFLINE)) {
   1281 		mirror_resume_writes(un);
   1282 		return (mdmderror(&miop->mde, MDE_ILLEGAL_SM_STATE, mnum));
   1283 	}
   1284 
   1285 	/*
   1286 	 * online cannot be done while a resync is active or we are
   1287 	 * still waiting for an optimized resync to be started
   1288 	 */
   1289 	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
   1290 		mirror_resume_writes(un);
   1291 		return (mdmderror(&miop->mde, MDE_RESYNC_ACTIVE, mnum));
   1292 	}
   1293 
   1294 	mirror_set_sm_state(sm, smic, SMS_OFFLINE_RESYNC, 1);
   1295 	mirror_commit(un, NO_SUBMIRRORS, 0);
   1296 	mirror_check_failfast(mnum);
   1297 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ONLINE, SVM_TAG_METADEVICE,
   1298 	    MD_UN2SET(un), MD_SID(un));
   1299 
   1300 
   1301 	/* for MN sets, re-read the resync record from disk */
   1302 	if (MD_MNSET_SETNO(MD_UN2SET(un)))
   1303 		(void) mddb_reread_rr(setno, un->un_rr_dirty_recid);
   1304 
   1305 	bcopy((caddr_t)un->un_dirty_bm, (caddr_t)un->un_resync_bm,
   1306 	    howmany(un->un_rrd_num, NBBY));
   1307 	MD_STATUS(un) |= MD_UN_OPT_NOT_DONE;
   1308 	sm->sm_flags |= MD_SM_RESYNC_TARGET;
   1309 	mirror_resume_writes(un);
   1310 	md_ioctl_writerexit(lock);
   1311 	if (!MD_MNSET_SETNO(setno))
   1312 		return (mirror_resync_unit(mnum, NULL, &miop->mde, lock));
   1313 	else return (0);
   1314 }
   1315 
/*
 * mirror_grow_unit:
 * ----------------
 * Grow the mirror 'un' to the size of its smallest in-use submirror, rounded
 * down to a multiple of un_nhead * un_nsect.
 *
 * If a resync is active or an optimized resync is pending, the grow is only
 * recorded (MD_UN_GROW_PENDING) and MDE_GROW_DELAYED is reported through 'ep'.
 * Nothing is done when a submirror is not a metadevice or when the smallest
 * submirror is not larger than the mirror.
 *
 * When the new size crosses MD_MAX_BLKS_FOR_SMALL_DEVS on a unit that is not
 * yet marked MD_64BIT_META_DEV, the unit's database record is replaced by a
 * newly created 64 bit record (non-_ILP32 kernels only). Finally the resync
 * regions are resized or extended to cover the new size.
 *
 * Returns:
 *	0	grown, or nothing to do
 *	otherwise the value returned by mdmderror() (MDE_GROW_DELAYED,
 *	MDE_UNIT_TOO_LARGE, MDE_RR_ALLOC_ERROR)
 */
int
mirror_grow_unit(
	mm_unit_t		*un,
	md_error_t		*ep
)
{
	md_unit_t		*su;
	mm_submirror_t		*sm;
	int			smi;
	diskaddr_t		total_blocks;
	diskaddr_t		current_tb;
	int			spc;		/* sectors per cylinder */
	minor_t			mnum = MD_SID(un);

	/*
	 * grow_unit cannot be done while a resync is active or we are
	 * still waiting for an optimized resync to be started. Set
	 * flag to indicate GROW_PENDING and once the resync is complete
	 * the grow_unit function will be executed.
	 */
	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
		MD_STATUS(un) |= MD_UN_GROW_PENDING;
		mirror_commit(un, NO_SUBMIRRORS, 0);
		return (mdmderror(ep, MDE_GROW_DELAYED, MD_SID(un)));
	}

	/*
	 * Find the smallest submirror
	 */
	total_blocks = 0;
	for (smi = 0; smi < NMIRROR; smi++) {
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
			continue;
		sm = &un->un_sm[smi];
		/*
		 * Growth is not possible if there is one or more
		 * submirrors made up of non-Metadevices.
		 */
		if (md_getmajor(sm->sm_dev) != md_major)
			return (0);

		su = MD_UNIT(md_getminor(sm->sm_dev));
		/* total_blocks == 0 means no submirror has been seen yet */
		if ((total_blocks == 0) ||
		    (su->c.un_total_blocks < total_blocks))
			total_blocks = su->c.un_total_blocks;
	}

	/*
	 * If the smallest submirror is not larger
	 * than the mirror, we are all done.
	 */
	if (total_blocks <= un->c.un_total_blocks)
		return (0);

	/*
	 * Growing the mirror now.
	 * First: Round down the smallest submirror size to be a multiple
	 *	of nheads * nsects. The unrounded size is kept in un_actual_tb.
	 *
	 * NOTE(review): spc is used as a divisor without a zero check;
	 * presumably un_nhead and un_nsect are always non-zero here - confirm.
	 */
	spc = un->c.un_nhead * un->c.un_nsect;
	current_tb = (total_blocks/spc) * spc;

	un->c.un_total_blocks = current_tb;
	md_nblocks_set(mnum, un->c.un_total_blocks);
	un->c.un_actual_tb = total_blocks;

	/* Is the mirror growing from 32 bit device to 64 bit device? */
	if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
	    (un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS)) {
#if defined(_ILP32)
		/*
		 * NOTE(review): un_total_blocks and un_actual_tb have already
		 * been updated above when this error is returned - confirm
		 * that this is intended.
		 */
		return (mdmderror(ep, MDE_UNIT_TOO_LARGE, mnum));
#else
		mddb_type_t	typ1;
		mddb_recid_t	recid;
		set_t		setno;
		mddb_recid_t	old_recid = un->c.un_record_id;
		mddb_recid_t	old_vtoc;
		mddb_de_ic_t    *dep, *old_dep;
		md_create_rec_option_t	options;

		/* yup, new device size. So we need to replace the record */
		typ1 = (mddb_type_t)md_getshared_key(MD_UN2SET(un),
		    mirror_md_ops.md_driver.md_drivername);
		setno = MD_MIN2SET(mnum);

		/* Preserve the friendly name properties of growing unit */
		options = MD_CRO_64BIT | MD_CRO_MIRROR;
		if (un->c.un_revision & MD_FN_META_DEV)
			options |= MD_CRO_FN;
		/*
		 * NOTE(review): the result of mddb_createrec() is used below
		 * without being checked - confirm it cannot fail here.
		 */
		recid = mddb_createrec(offsetof(mm_unit_t, un_smic), typ1,
		    MIRROR_REC, options, setno);
		/* Resize to include incore fields */
		un->c.un_revision |= MD_64BIT_META_DEV;
		/* All 64 bit metadevices only support EFI labels. */
		un->c.un_flag |= MD_EFILABEL;
		/*
		 * If the device had a vtoc record attached to it, we remove
		 * the vtoc record, because the layout has changed completely.
		 */
		old_vtoc = un->c.un_vtoc_id;
		if (old_vtoc != 0) {
			un->c.un_vtoc_id =
			    md_vtoc_to_efi_record(old_vtoc, setno);
		}
		/*
		 * Point the unit at the new record, then move the existing
		 * user data buffers from the old record entry to the new one:
		 * the buffer that came with the new record is freed, and the
		 * old entry's pointers are cleared after the commit and before
		 * the old record is deleted.
		 */
		MD_RECID(un) = recid;
		dep = mddb_getrecdep(recid);
		old_dep = mddb_getrecdep(old_recid);
		kmem_free(dep->de_rb_userdata, dep->de_reqsize);
		dep->de_rb_userdata = old_dep->de_rb_userdata;
		dep->de_reqsize = old_dep->de_reqsize;
		dep->de_rb_userdata_ic = old_dep->de_rb_userdata_ic;
		dep->de_icreqsize = old_dep->de_icreqsize;
		mirror_commit(un, NO_SUBMIRRORS, 0);
		old_dep->de_rb_userdata = NULL;
		old_dep->de_rb_userdata_ic = NULL;
		mddb_deleterec_wrapper(old_recid);
		/*
		 * If there was a vtoc record, it is no longer needed, because
		 * a new efi record has been created for this un.
		 */
		if (old_vtoc != 0) {
			mddb_deleterec_wrapper(old_vtoc);
		}
#endif
	}

	/*
	 * Too many resync regions at the current region size: resize the
	 * regions. Otherwise (below) just add regions for the new space.
	 */
	if ((current_tb/un->un_rrd_blksize) > MD_MAX_NUM_RR) {
		if (mirror_resize_resync_regions(un, current_tb)) {
			return (mdmderror(ep, MDE_RR_ALLOC_ERROR, MD_SID(un)));
		}
		mirror_check_failfast(mnum);
		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
		return (0);
	}

	if (mirror_add_resync_regions(un, current_tb)) {
		return (mdmderror(ep, MDE_RR_ALLOC_ERROR, MD_SID(un)));
	}

	mirror_check_failfast(mnum);
	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
	    MD_UN2SET(un), MD_SID(un));

	return (0);
}
   1462 
   1463 static int
   1464 mirror_grow(
   1465 	void			*mgp,
   1466 	IOLOCK			*lock
   1467 )
   1468 {
   1469 	mm_unit_t		*un;
   1470 	md_grow_params_t	*mgph = mgp;
   1471 
   1472 	mdclrerror(&mgph->mde);
   1473 
   1474 	if ((un = mirror_getun(mgph->mnum,
   1475 	    &mgph->mde, WR_LOCK, lock)) == NULL)
   1476 		return (0);
   1477 
   1478 	if (MD_STATUS(un) & MD_UN_GROW_PENDING)
   1479 		return (0);
   1480 
   1481 	return (mirror_grow_unit(un, &mgph->mde));
   1482 }
   1483 
   1484 static int
   1485 mirror_change(
   1486 	md_mirror_params_t	*mmp,
   1487 	IOLOCK			*lock
   1488 )
   1489 {
   1490 	mm_params_t		*pp = &mmp->params;
   1491 	mm_unit_t		*un;
   1492 
   1493 	mdclrerror(&mmp->mde);
   1494 
   1495 	if ((un = mirror_getun(mmp->mnum, &mmp->mde, WR_LOCK, lock)) == NULL)
   1496 		return (0);
   1497 
   1498 	if (pp->change_read_option)
   1499 		un->un_read_option = pp->read_option;
   1500 
   1501 	if (pp->change_write_option)
   1502 		un->un_write_option = pp->write_option;
   1503 
   1504 	if (pp->change_pass_num)
   1505 		un->un_pass_num = pp->pass_num;
   1506 
   1507 	mirror_commit(un, NO_SUBMIRRORS, 0);
   1508 
   1509 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
   1510 	    MD_UN2SET(un), MD_SID(un));
   1511 	return (0);
   1512 }
   1513 
   1514 static int
   1515 mirror_get_resync(
   1516 	md_resync_ioctl_t	*ri
   1517 )
   1518 {
   1519 	minor_t			mnum = ri->ri_mnum;
   1520 	mm_unit_t		*un;
   1521 	u_longlong_t		percent;
   1522 	uint_t			cnt;
   1523 	uint_t			rr;
   1524 	diskaddr_t		d;
   1525 
   1526 	mdclrerror(&ri->mde);
   1527 
   1528 	if ((un = mirror_getun(mnum, &ri->mde, STALE_OK|NO_LOCK, NULL)) == NULL)
   1529 		return (0);
   1530 
   1531 	ri->ri_flags = 0;
   1532 	if (md_get_setstatus(MD_MIN2SET(mnum)) & MD_SET_STALE) {
   1533 		ri->ri_percent_done = 0;
   1534 		ri->ri_percent_dirty = 0;
   1535 		return (0);
   1536 	}
   1537 
   1538 	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE|MD_UN_RESYNC_CANCEL)) {
   1539 		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
   1540 			ri->ri_flags |= MD_RI_INPROGRESS;
   1541 		/* Return state of resync thread */
   1542 		ri->ri_flags |= (un->un_rs_thread_flags & MD_RI_BLOCK);
   1543 		d = un->un_rs_resync_2_do;
   1544 		if (d) {
   1545 			percent = un->un_rs_resync_done;
   1546 			if (un->c.un_total_blocks >
   1547 			    MD_MAX_BLKS_FOR_SMALL_DEVS) {
   1548 				percent *= 1000;
   1549 				percent /= d;
   1550 				if (percent > 1000)
   1551 					percent = 1000;
   1552 			} else {
   1553 				percent *= 100;
   1554 				percent /= d;
   1555 			}
   1556 			ri->ri_percent_done = (int)percent;
   1557 		} else {
   1558 			ri->ri_percent_done = 0;
   1559 		}
   1560 	}
   1561 	if (un->un_nsm < 2) {
   1562 		ri->ri_percent_dirty = 0;
   1563 		return (0);
   1564 	}
   1565 	cnt = 0;
   1566 	for (rr = 0; rr < un->un_rrd_num; rr++)
   1567 		if (IS_REGION_DIRTY(rr, un))
   1568 			cnt++;
   1569 	d = un->un_rrd_num;
   1570 	if (d) {
   1571 		percent = cnt;
   1572 		percent *= 100;
   1573 		percent += d - 1;		/* round up */
   1574 		percent /= d;
   1575 	} else
   1576 		percent = 0;
   1577 	ri->ri_percent_dirty = (int)percent;
   1578 	return (0);
   1579 }
   1580 
   1581 /*
   1582  * mirror_get_owner:
   1583  * ----------------
   1584  * Called to obtain the current owner of a mirror.
   1585  *
   1586  * Owner is returned in the parameter block passed in by the caller.
   1587  *
   1588  * Returns:
   1589  *	0	success
   1590  *	EINVAL	metadevice does not exist or is not a member of a multi-owned
   1591  *		set.
   1592  */
   1593 static int
   1594 mirror_get_owner(md_set_mmown_params_t *p, IOLOCK *lock)
   1595 {
   1596 	mm_unit_t	*un;
   1597 	set_t		setno;
   1598 
   1599 	if ((un = mirror_getun(p->d.mnum, &p->mde, RD_LOCK, lock)) == NULL)
   1600 		return (EINVAL);
   1601 
   1602 	setno = MD_UN2SET(un);
   1603 	if (!MD_MNSET_SETNO(setno)) {
   1604 		return (EINVAL);
   1605 	}
   1606 	p->d.owner = un->un_mirror_owner;
   1607 	return (0);
   1608 }
   1609 
   1610 /*
   1611  * mirror_choose_owner_thread:
   1612  * --------------------------
   1613  * Called to send a CHOOSE_OWNER message to the commd running on the master
   1614  * node. This needs to run in a separate context so that mutex livelock is
   1615  * avoided. This can occur because the original request is issued from a call
   1616  * to metaioctl() which acquires the global ioctl lock, calls down into the
   1617  * mirror_ioctl code and then attempts to mdmn_ksend_message() to the master
   1618  * node. As the handler for the choose_owner message needs to send another
   1619  * ioctl through the metaioctl() entry point, any other use (by rpc.metad or
   1620  * mdcommd checking on set ownership) will deadlock the system leading to
   1621  * cluster reconfiguration timeouts and eventually a node or (at worst) a
   1622  * cluster-wide panic
   1623  */
   1624 static void
   1625 mirror_choose_owner_thread(md_mn_msg_chooseid_t	*msg)
   1626 {
   1627 	int		rval;
   1628 	md_mn_kresult_t	*kres;
   1629 	set_t		setno = MD_MIN2SET(msg->msg_chooseid_mnum);
   1630 
   1631 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
   1632 	rval = mdmn_ksend_message(setno, MD_MN_MSG_CHOOSE_OWNER,
   1633 	    MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)msg,
   1634 	    sizeof (md_mn_msg_chooseid_t), kres);
   1635 	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
   1636 		mdmn_ksend_show_error(rval, kres, "CHOOSE OWNER");
   1637 		cmn_err(CE_WARN, "ksend_message failure: CHOOSE_OWNER");
   1638 	}
   1639 
   1640 	kmem_free(kres, sizeof (md_mn_kresult_t));
   1641 	kmem_free(msg, sizeof (md_mn_msg_chooseid_t));
   1642 	thread_exit();
   1643 }
   1644 
   1645 /*
   1646  * mirror_owner_thread:
   1647  * -------------------
   1648  * Called to request an ownership change from a thread context. This issues
   1649  * a mdmn_ksend_message() and then completes the appropriate ownership change
   1650  * on successful completion of the message transport.
   1651  * The originating application must poll for completion on the 'flags' member
   1652  * of the MD_MN_MM_OWNER_STATUS ioctl() parameter block.
   1653  * Success is marked by a return value of MD_MN_MM_RES_OK, Failure by
   1654  * MD_MN_MM_RES_FAIL
   1655  */
   1656 static void
   1657 mirror_owner_thread(md_mn_req_owner_t *ownp)
   1658 {
   1659 	int		rval;
   1660 	set_t		setno = MD_MIN2SET(ownp->mnum);
   1661 	mm_unit_t	*un = MD_UNIT(ownp->mnum);
   1662 	md_mn_kresult_t	*kresult;
   1663 	md_mps_t	*ps1;
   1664 
   1665 	un->un_mirror_owner_status = 0;
   1666 
   1667 	mutex_enter(&un->un_owner_mx);
   1668 	un->un_owner_state |= MM_MN_OWNER_SENT;
   1669 	mutex_exit(&un->un_owner_mx);
   1670 
   1671 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
   1672 	rval = mdmn_ksend_message(setno, MD_MN_MSG_REQUIRE_OWNER,
   1673 	    MD_MSGF_NO_LOG, 0, (char *)ownp, sizeof (md_mn_req_owner_t),
   1674 	    kresult);
   1675 
   1676 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
   1677 		/*
   1678 		 * Message transport layer failed. Return the failure code to
   1679 		 * the application.
   1680 		 */
   1681 		mdmn_ksend_show_error(rval, kresult, "CHANGE OWNER");
   1682 		mutex_enter(&un->un_owner_mx);
   1683 		un->un_owner_state &= ~(MM_MN_BECOME_OWNER|MM_MN_OWNER_SENT);
   1684 		mutex_exit(&un->un_owner_mx);
   1685 		un->un_mirror_owner_status =
   1686 		    MD_MN_MM_RESULT | MD_MN_MM_RES_FAIL;
   1687 	} else {
   1688 		/*
   1689 		 * Ownership change succeeded. Update in-core version of
   1690 		 * mirror owner.
   1691 		 */
   1692 		mutex_enter(&un->un_owner_mx);
   1693 		if (un->un_owner_state & MM_MN_BECOME_OWNER) {
   1694 			un->un_mirror_owner = md_mn_mynode_id;
   1695 			/* Sets node owner of un_rr_dirty record */
   1696 			if (un->un_rr_dirty_recid)
   1697 				(void) mddb_setowner(un->un_rr_dirty_recid,
   1698 				    md_mn_mynode_id);
   1699 			/*
   1700 			 * Release the block on the current resync region if it
   1701 			 * is blocked
   1702 			 */
   1703 			ps1 = un->un_rs_prev_overlap;
   1704 			if ((ps1 != NULL) &&
   1705 			    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
   1706 				mirror_overlap_tree_remove(ps1);
   1707 		}
   1708 
   1709 		un->un_owner_state &= ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
   1710 		mutex_exit(&un->un_owner_mx);
   1711 		un->un_mirror_owner_status =
   1712 		    MD_MN_MM_RESULT | MD_MN_MM_RES_OK;
   1713 
   1714 		/* Restart the resync thread if it was previously blocked */
   1715 		if (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) {
   1716 			mutex_enter(&un->un_rs_thread_mx);
   1717 			un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
   1718 			cv_signal(&un->un_rs_thread_cv);
   1719 			mutex_exit(&un->un_rs_thread_mx);
   1720 		}
   1721 	}
   1722 	kmem_free(kresult, sizeof (md_mn_kresult_t));
   1723 	kmem_free(ownp, sizeof (md_mn_req_owner_t));
   1724 	thread_exit();
   1725 }
   1726 
   1727 /*
   1728  * mirror_set_owner:
   1729  * ----------------
   1730  * Called to change the owner of a mirror to the specified node. If we
   1731  * are not the owner of the mirror, we do nothing apart from update the in-core
   1732  * ownership. It can also be used to choose a new owner for the resync of a
   1733  * mirror, this case is specified by the flag MD_MN_MM_CHOOSE_OWNER, see below.
   1734  *
   1735  * The p->d.flags bitfield controls how subsequent ownership changes will be
   1736  * handled:
   1737  *	MD_MN_MM_SPAWN_THREAD
   1738  *		a separate thread is created which emulates the behaviour of
   1739  *		become_owner() [mirror.c]. This is needed when changing the
   1740  *		ownership from user context as there needs to be a controlling
   1741  *		kernel thread which updates the owner info on the originating
   1742  *		node. Successful completion of the mdmn_ksend_message() means
   1743  *		that the owner field can be changed.
   1744  *
   1745  *	MD_MN_MM_PREVENT_CHANGE
   1746  *		Disallow any change of ownership once this ownership change has
   1747  *		been processed. The only way of changing the owner away from
   1748  *		the p->d.owner node specified in the call is to issue a request
   1749  *		with MD_MN_MM_ALLOW_CHANGE set in the flags. Any request to
   1750  *		become owner from a different node while the PREVENT_CHANGE
   1751  *		is in operation will result in an EAGAIN return value.
   1752  *		un->un_owner_state has MM_MN_PREVENT_CHANGE set.
   1753  *
   1754  *	MD_MN_MM_ALLOW_CHANGE
   1755  *		Allow the owner to be changed by a subsequent request.
   1756  *		un->un_owner_state has MM_MN_PREVENT_CHANGE cleared.
   1757  *
   1758  *	MD_MN_MM_CHOOSE_OWNER
   1759  *		Choose a new owner for a mirror resync. In this case, the new
   1760  *		owner argument is not used. The selection of a new owner
   1761  *		is a round robin allocation using a resync owner count. This
   1762  *		ioctl passes this value in a message to the master node
   1763  *		which uses it to select a node from the node list and then
   1764  *		sends it a message to become the owner.
   1765  *
   1766  * If we are the current owner, we must stop further i/o from being scheduled
   1767  * and wait for any pending i/o to drain. We wait for any in-progress resync
   1768  * bitmap updates to complete and we can then set the owner. If an update to
   1769  * the resync bitmap is attempted after this we simply don't write this out to
   1770  * disk until the ownership is restored.
   1771  *
   1772  * If we are the node that wants to become the owner we update the in-core
   1773  * owner and return. The i/o that initiated the ownership change will complete
   1774  * on successful return from this ioctl.
   1775  *
   1776  * Return Value:
   1777  *	0		Success
   1778  * 	EINVAL		Invalid unit referenced
   1779  *	EAGAIN		Ownership couldn't be transferred away or change of
   1780  *			ownership is prevented. Caller should retry later on.
   1781  */
   1782 static int
   1783 mirror_set_owner(md_set_mmown_params_t *p, IOLOCK *lock)
   1784 {
   1785 	mdi_unit_t	*ui;
   1786 	mm_unit_t	*un;
   1787 	set_t		setno;
   1788 
   1789 	if ((un = mirror_getun(p->d.mnum, &p->mde, RD_LOCK, lock)) == NULL)
   1790 		return (EINVAL);
   1791 	ui = MDI_UNIT(p->d.mnum);
   1792 	setno = MD_MIN2SET(p->d.mnum);
   1793 	if (!MD_MNSET_SETNO(setno)) {
   1794 		return (EINVAL);
   1795 	}
   1796 
   1797 	/*
   1798 	 * If we are choosing a new resync owner, send a message to the master
   1799 	 * to make the choice.
   1800 	 */
   1801 	if (p->d.flags & MD_MN_MM_CHOOSE_OWNER) {
   1802 		/* Release ioctl lock before we call ksend_message() */
   1803 		md_ioctl_readerexit(lock);
   1804 		/* If we're resetting the owner pass the node id in */
   1805 		if (p->d.owner != MD_MN_MIRROR_UNOWNED) {
   1806 			return (mirror_choose_owner(un, &p->d));
   1807 		} else {
   1808 			return (mirror_choose_owner(un, NULL));
   1809 		}
   1810 	}
   1811 
   1812 	/*
   1813 	 * Check for whether we have to spawn a thread to issue this request.
   1814 	 * If set we issue a mdmn_ksend_message() to cause the appropriate
   1815 	 * ownership change. On completion of this request the calling
   1816 	 * application _must_ poll the structure 'flags' field to determine the
   1817 	 * result of the request. All this is necessary until we have true
   1818 	 * multi-entrant ioctl support.
   1819 	 * If we are just clearing the owner, then MD_MN_MM_SPAWN_THREAD can
   1820 	 * be ignored.
   1821 	 */
   1822 	if ((p->d.flags & MD_MN_MM_SPAWN_THREAD) && (p->d.owner != 0)) {
   1823 		md_mn_req_owner_t	*ownp;
   1824 		ownp = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
   1825 		p->d.flags &= ~MD_MN_MM_SPAWN_THREAD;
   1826 		bcopy(&p->d, ownp, sizeof (md_mn_req_owner_t));
   1827 		if (thread_create(NULL, 0, mirror_owner_thread, (caddr_t)ownp,
   1828 		    0, &p0, TS_RUN, 60) == NULL) {
   1829 			kmem_free(ownp, sizeof (md_mn_req_owner_t));
   1830 			return (EFAULT);
   1831 		} else {
   1832 			return (0);
   1833 		}
   1834 	}
   1835 
   1836 	/*
   1837 	 * If setting owner to NULL, this is being done because the owner has
   1838 	 * died and therefore we set OPT_NOT_DONE to ensure that the
   1839 	 * mirror is marked as "Needs Maintenance" and that an optimized
   1840 	 * resync will be done when we resync the mirror, Also clear the
   1841 	 * PREVENT_CHANGE flag and remove the last resync region from the
   1842 	 * overlap tree.
   1843 	 */
   1844 	if (p->d.owner == 0) {
   1845 		md_mps_t	*ps;
   1846 		int		i;
   1847 
   1848 		md_ioctl_readerexit(lock);
   1849 		un = md_ioctl_writerlock(lock, ui);
   1850 		/*
   1851 		 * If the ABR capability is not set and the pass_num is non-zero
   1852 		 * there is need to perform an optimized resync
   1853 		 * Therefore set OPT_NOT_DONE, setup the resync_bm and set
   1854 		 * the submirrors as resync targets.
   1855 		 */
   1856 		if (!(ui->ui_tstate & MD_ABR_CAP) && un->un_pass_num) {
   1857 			MD_STATUS(un) |= MD_UN_OPT_NOT_DONE;
   1858 
   1859 			(void) mddb_reread_rr(setno, un->un_rr_dirty_recid);
   1860 			bcopy((caddr_t)un->un_dirty_bm,
   1861 			    (caddr_t)un->un_resync_bm,
   1862 			    howmany(un->un_rrd_num, NBBY));
   1863 			for (i = 0; i < NMIRROR; i++) {
   1864 				if ((SUBMIRROR_IS_READABLE(un, i)) ||
   1865 				    SMS_BY_INDEX_IS(un, i,
   1866 				    SMS_OFFLINE_RESYNC))
   1867 					un->un_sm[i].sm_flags |=
   1868 					    MD_SM_RESYNC_TARGET;
   1869 			}
   1870 		}
   1871 		mutex_enter(&un->un_owner_mx);
   1872 		un->un_owner_state &= ~MD_MN_MM_PREVENT_CHANGE;
   1873 		mutex_exit(&un->un_owner_mx);
   1874 		ps = un->un_rs_prev_overlap;
   1875 		if ((ps != NULL) && (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
   1876 			mirror_overlap_tree_remove(ps);
   1877 			ps->ps_firstblk = 0;
   1878 			ps->ps_lastblk = 0;
   1879 		}
   1880 		md_ioctl_writerexit(lock);
   1881 		un = md_ioctl_readerlock(lock, ui);
   1882 	}
   1883 
   1884 	mutex_enter(&un->un_owner_mx);
   1885 	if (!(un->un_owner_state & MM_MN_BECOME_OWNER)) {
   1886 		/*
   1887 		 * If we are not trying to become owner ourselves check
   1888 		 * to see if we have to change the owner
   1889 		 */
   1890 		if (un->un_mirror_owner == p->d.owner) {
   1891 			/*
   1892 			 * No need to change owner,
   1893 			 * Clear/set PREVENT_CHANGE bit
   1894 			 */
   1895 			if (p->d.flags & MD_MN_MM_PREVENT_CHANGE) {
   1896 				un->un_owner_state |= MM_MN_PREVENT_CHANGE;
   1897 			} else if (p->d.flags & MD_MN_MM_ALLOW_CHANGE) {
   1898 				un->un_owner_state &= ~MM_MN_PREVENT_CHANGE;
   1899 			}
   1900 			mutex_exit(&un->un_owner_mx);
   1901 			return (0);
   1902 		}
   1903 	}
   1904 
   1905 	/*
   1906 	 * Disallow ownership change if previously requested to. This can only
   1907 	 * be reset by issuing a request with MD_MN_MM_ALLOW_CHANGE set in the
   1908 	 * flags field.
   1909 	 */
   1910 	if ((un->un_owner_state & MM_MN_PREVENT_CHANGE) &&
   1911 	    !(p->d.flags & MD_MN_MM_ALLOW_CHANGE)) {
   1912 		mutex_exit(&un->un_owner_mx);
   1913 #ifdef DEBUG
   1914 		cmn_err(CE_WARN, "mirror_ioctl: Node %x attempted to become "
   1915 		    "owner while node %x has exclusive access to %s",
   1916 		    p->d.owner, un->un_mirror_owner, md_shortname(MD_SID(un)));
   1917 #endif
   1918 		return (EAGAIN);
   1919 	}
   1920 	if (p->d.owner == md_mn_mynode_id) {
   1921 		/*
   1922 		 * I'm becoming the mirror owner. Flag this so that the
   1923 		 * message sender can change the in-core owner when all
   1924 		 * nodes have processed this message
   1925 		 */
   1926 		un->un_owner_state &= ~MM_MN_OWNER_SENT;
   1927 		un->un_owner_state |= MM_MN_BECOME_OWNER;
   1928 		un->un_owner_state |= (p->d.flags & MD_MN_MM_PREVENT_CHANGE) ?
   1929 		    MM_MN_PREVENT_CHANGE : 0;
   1930 		un->un_owner_state &= (p->d.flags & MD_MN_MM_ALLOW_CHANGE) ?
   1931 		    ~MM_MN_PREVENT_CHANGE : ~0;
   1932 
   1933 		mutex_exit(&un->un_owner_mx);
   1934 	} else if ((un->un_mirror_owner == md_mn_mynode_id) ||
   1935 	    un->un_owner_state & MM_MN_BECOME_OWNER) {
   1936 		mutex_exit(&un->un_owner_mx);
   1937 
   1938 		/*
   1939 		 * I'm releasing ownership. Block and drain i/o. This also
   1940 		 * blocks until any in-progress resync record update completes.
   1941 		 */
   1942 		md_ioctl_readerexit(lock);
   1943 		un = md_ioctl_writerlock(lock, ui);
   1944 		/* Block the resync thread */
   1945 		mutex_enter(&un->un_rs_thread_mx);
   1946 		un->un_rs_thread_flags |= MD_RI_BLOCK_OWNER;
   1947 		mutex_exit(&un->un_rs_thread_mx);
   1948 		mutex_enter(&un->un_owner_mx);
   1949 		un->un_mirror_owner = p->d.owner;
   1950 
   1951 		/* Sets node owner of un_rr_dirty record */
   1952 		if (un->un_rr_dirty_recid)
   1953 			(void) mddb_setowner(un->un_rr_dirty_recid, p->d.owner);
   1954 		un->un_owner_state &= ~MM_MN_BECOME_OWNER;
   1955 		un->un_owner_state |= (p->d.flags & MD_MN_MM_PREVENT_CHANGE) ?
   1956 		    MM_MN_PREVENT_CHANGE : 0;
   1957 		un->un_owner_state &= (p->d.flags & MD_MN_MM_ALLOW_CHANGE) ?
   1958 		    ~MM_MN_PREVENT_CHANGE : ~0;
   1959 		mutex_exit(&un->un_owner_mx);
   1960 		/*
   1961 		 * Allow further i/o to occur. Any write() from another node
   1962 		 * will now cause another ownership change to occur.
   1963 		 */
   1964 		md_ioctl_writerexit(lock);
   1965 	} else {
   1966 		/* Update the in-core mirror owner */
   1967 		un->un_mirror_owner = p->d.owner;
   1968 		/* Sets node owner of un_rr_dirty record */
   1969 		if (un->un_rr_dirty_recid)
   1970 			(void) mddb_setowner(un->un_rr_dirty_recid, p->d.owner);
   1971 		un->un_owner_state |= (p->d.flags & MD_MN_MM_PREVENT_CHANGE) ?
   1972 		    MM_MN_PREVENT_CHANGE : 0;
   1973 		un->un_owner_state &= (p->d.flags & MD_MN_MM_ALLOW_CHANGE) ?
   1974 		    ~MM_MN_PREVENT_CHANGE : ~0;
   1975 		mutex_exit(&un->un_owner_mx);
   1976 	}
   1977 	return (0);
   1978 }
   1979 /*
   1980  * mirror_allocate_hotspare:
   1981  * ------------------------
   1982  * Called to allocate a hotspare for a failed component. This function is
   1983  * called by the MD_MN_ALLOCATE_HOTSPARE ioctl.
   1984  */
   1985 static int
   1986 mirror_allocate_hotspare(md_alloc_hotsp_params_t *p, IOLOCK *lockp)
   1987 {
   1988 	set_t		setno;
   1989 	mm_unit_t	*un;
   1990 
   1991 #ifdef DEBUG
   1992 	if (mirror_debug_flag)
   1993 		printf("mirror_allocate_hotspare: mnum,sm,comp = %x, %x, %x\n",
   1994 		    p->mnum, p->sm, p->comp);
   1995 #endif
   1996 
   1997 	if ((un = mirror_getun(p->mnum, &p->mde, WR_LOCK, lockp)) == NULL)
   1998 		return (EINVAL);
   1999 
   2000 	/* This function is only valid for a multi-node set */
   2001 	setno = MD_MIN2SET(p->mnum);
   2002 	if (!MD_MNSET_SETNO(setno)) {
   2003 		return (EINVAL);
   2004 	}
   2005 	(void) check_comp_4_hotspares(un, p->sm, p->comp, MD_HOTSPARE_NO_XMIT,
   2006 	    p->hs_id, lockp);
   2007 	md_ioctl_writerexit(lockp);
   2008 	return (0);
   2009 }
   2010 
   2011 /*
   2012  * mirror_get_owner_status:
   2013  * -----------------------
   2014  * Return the status of a previously issued ioctl to change ownership. This is
   2015  * required for soft-partition support as the request to change mirror owner
   2016  * needs to be run from a separate daemon thread.
   2017  *
   2018  * Returns:
   2019  *	0	Success (contents of un_mirror_owner_status placed in 'flags')
   2020  *	EINVAL	Invalid unit
   2021  */
   2022 static int
   2023 mirror_get_owner_status(md_mn_own_status_t *p, IOLOCK *lock)
   2024 {
   2025 	mm_unit_t	*un;
   2026 	set_t		setno;
   2027 
   2028 	if ((un = mirror_getun(p->mnum, &p->mde, RD_LOCK, lock)) == NULL)
   2029 		return (EINVAL);
   2030 
   2031 	setno = MD_MIN2SET(p->mnum);
   2032 	if (!MD_MNSET_SETNO(setno)) {
   2033 		return (EINVAL);
   2034 	}
   2035 
   2036 	p->flags = un->un_mirror_owner_status;
   2037 	return (0);
   2038 }
   2039 
   2040 /*
   2041  * mirror_set_state:
   2042  * ---------------
   2043  * Called to set the state of the component of a submirror to the specified
   2044  * value. This function is called by the MD_MN_SET_STATE ioctl.
   2045  */
   2046 static int
   2047 mirror_set_state(md_set_state_params_t *p, IOLOCK *lockp)
   2048 {
   2049 	mm_unit_t		*un;
   2050 	mm_submirror_t		*sm;
   2051 	mm_submirror_ic_t	*smic;
   2052 	md_m_shared_t		*shared;
   2053 	set_t			setno;
   2054 
   2055 #ifdef DEBUG
   2056 	if (mirror_debug_flag)
   2057 		printf("mirror_set_state: mnum,sm,comp,state, hs_id = %x, "
   2058 		    "%x, %x, %x %x\n", p->mnum, p->sm, p->comp,
   2059 		    p->state, p->hs_id);
   2060 #endif
   2061 	if ((un = mirror_getun(p->mnum, &p->mde, WR_LOCK, lockp)) == NULL)
   2062 		return (EINVAL);
   2063 
   2064 	/* This function is only valid for a multi-node set */
   2065 	setno = MD_MIN2SET(p->mnum);
   2066 	if (!MD_MNSET_SETNO(setno)) {
   2067 		return (EINVAL);
   2068 	}
   2069 	sm = &un->un_sm[p->sm];
   2070 	smic = &un->un_smic[p->sm];
   2071 
   2072 	/* Set state in component and update ms_flags */
   2073 	shared = (md_m_shared_t *)
   2074 	    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, p->comp);
   2075 	/*
   2076 	 * If a CS_ERRED state is being sent, verify that the sender
   2077 	 * has the same view of the component that this node currently has.
   2078 	 *
   2079 	 * There is a case where the sender was sending a CS_ERRED when a
   2080 	 * component was in error, but before the sender returns from
   2081 	 * ksend_message the component has been hotspared and resync'd.
   2082 	 *
   2083 	 * In this case, the hs_id will be different from the shared ms_hs_id,
   2084 	 * so the component has already been hotspared.  Just return in this
   2085 	 * case.
   2086 	 */
   2087 	if (p->state == CS_ERRED) {
   2088 		if (shared->ms_hs_id != p->hs_id) {
   2089 #ifdef DEBUG
   2090 			if (mirror_debug_flag) {
   2091 				printf("mirror_set_state: short circuit "
   2092 				    "hs_id=0x%x, ms_hs_id=0x%x\n",
   2093 				    p->hs_id, shared->ms_hs_id);
   2094 			}
   2095 #endif
   2096 			/* release the block on writes to the mirror */
   2097 			mirror_resume_writes(un);
   2098 			md_ioctl_writerexit(lockp);
   2099 			return (0);
   2100 		}
   2101 	}
   2102 
   2103 	/*
   2104 	 * If the device is newly errored then make sure that it is
   2105 	 * closed. Closing the device allows for the RCM framework
   2106 	 * to unconfigure the device if required.
   2107 	 */
   2108 	if (!(shared->ms_state & CS_ERRED) && (p->state & CS_ERRED) &&
   2109 	    (shared->ms_flags & MDM_S_ISOPEN)) {
   2110 		void		(*get_dev)();
   2111 		ms_cd_info_t	cd;
   2112 
   2113 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
   2114 		    "get device", 0);
   2115 		(void) (*get_dev)(sm->sm_dev, sm, p->comp, &cd);
   2116 
   2117 		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
   2118 		shared->ms_flags &= ~MDM_S_ISOPEN;
   2119 	}
   2120 
   2121 	shared->ms_state = p->state;
   2122 	uniqtime32(&shared->ms_timestamp);
   2123 
   2124 	if (p->state == CS_ERRED) {
   2125 		shared->ms_flags |= MDM_S_NOWRITE;
   2126 	} else
   2127 		shared->ms_flags &= ~MDM_S_NOWRITE;
   2128 
   2129 	shared->ms_flags &= ~MDM_S_IOERR;
   2130 	un->un_changecnt++;
   2131 	shared->ms_lasterrcnt = un->un_changecnt;
   2132 
   2133 	/* Update state in submirror */
   2134 	mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
   2135 	/*
   2136 	 * Commit the state change to the metadb, only the master will write
   2137 	 * to disk
   2138 	 */
   2139 	mirror_commit(un, SMI2BIT(p->sm), 0);
   2140 
   2141 	/* release the block on writes to the mirror */
   2142 	mirror_resume_writes(un);
   2143 
   2144 	/* generate NOTIFY events for error state changes */
   2145 	if (p->state == CS_ERRED) {
   2146 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
   2147 		    MD_UN2SET(un), MD_SID(un));
   2148 	} else if (p->state == CS_LAST_ERRED) {
   2149 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
   2150 		    MD_UN2SET(un), MD_SID(un));
   2151 	}
   2152 	md_ioctl_writerexit(lockp);
   2153 	return (0);
   2154 }
   2155 
   2156 /*
   2157  * mirror_suspend_writes:
   2158  * ---------------------
   2159  * Called to suspend writes to a mirror region. The flag un_suspend_wr_flag is
   2160  * tested in mirror_write_strategy, and if set all writes are blocked.
   2161  * This function is called by the MD_MN_SUSPEND_WRITES ioctl.
   2162  */
   2163 static int
   2164 mirror_suspend_writes(md_suspend_wr_params_t *p)
   2165 {
   2166 	set_t		setno;
   2167 	mm_unit_t	*un;
   2168 
   2169 #ifdef DEBUG
   2170 	if (mirror_debug_flag)
   2171 		printf("mirror_suspend_writes: mnum = %x\n", p->mnum);
   2172 #endif
   2173 	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
   2174 		return (EINVAL); /* No unit */
   2175 
   2176 	/* This function is only valid for a multi-node set */
   2177 	setno = MD_MIN2SET(p->mnum);
   2178 	if (!MD_MNSET_SETNO(setno)) {
   2179 		return (EINVAL);
   2180 	}
   2181 
   2182 	/*
   2183 	 * Mark the resync as blocked. This will stop any currently running
   2184 	 * thread and will prevent a new resync from attempting to perform
   2185 	 * i/o
   2186 	 */
   2187 	mutex_enter(&un->un_rs_thread_mx);
   2188 	un->un_rs_thread_flags |= MD_RI_BLOCK;
   2189 	mutex_exit(&un->un_rs_thread_mx);
   2190 
   2191 	mutex_enter(&un->un_suspend_wr_mx);
   2192 	un->un_suspend_wr_flag = 1;
   2193 	mutex_exit(&un->un_suspend_wr_mx);
   2194 
   2195 	return (0);
   2196 }
   2197 
   2198 /*
   2199  * mirror_set_capability:
   2200  * ------------------------
   2201  * Called to set or clear a capability for a mirror
   2202  * called by the MD_MN_SET_CAP ioctl.
   2203  */
   2204 static int
   2205 mirror_set_capability(md_mn_setcap_params_t *p, IOLOCK *lockp)
   2206 {
   2207 	set_t		setno;
   2208 	mm_unit_t	*un;
   2209 	mdi_unit_t	*ui;
   2210 
   2211 #ifdef DEBUG
   2212 	if (mirror_debug_flag)
   2213 		printf("mirror_set_capability: mnum = %x\n", p->mnum);
   2214 #endif
   2215 	if ((un = mirror_getun(p->mnum, &p->mde, RD_LOCK, lockp)) == NULL)
   2216 		return (EINVAL);
   2217 
   2218 	/* This function is only valid for a multi-node set */
   2219 	setno = MD_MIN2SET(p->mnum);
   2220 	if (!MD_MNSET_SETNO(setno)) {
   2221 		return (EINVAL);
   2222 	}
   2223 	ui = MDI_UNIT(p->mnum);
   2224 
   2225 	if (p->sc_set & DKV_ABR_CAP) {
   2226 		ui->ui_tstate |= MD_ABR_CAP; /* Set ABR capability */
   2227 		/* Clear DRL and set owner to 0 if no resync active */
   2228 		mirror_process_unit_resync(un);
   2229 		if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
   2230 			mutex_enter(&un->un_owner_mx);
   2231 			un->un_mirror_owner = 0;
   2232 			mutex_exit(&un->un_owner_mx);
   2233 		}
   2234 	} else {
   2235 		ui->ui_tstate &= ~MD_ABR_CAP; /* Clear ABR capability */
   2236 	}
   2237 	if (p->sc_set & DKV_DMR_CAP) {
   2238 		ui->ui_tstate |= MD_DMR_CAP; /* Set DMR capability */
   2239 	} else {
   2240 		ui->ui_tstate &= ~MD_DMR_CAP; /* Clear DMR capability */
   2241 	}
   2242 	return (0);
   2243 }
   2244 
   2245 /*
   2246  * mirror_choose_owner:
   2247  * ------------------------
   2248  * Called to choose an owner for a mirror resync. Can be called when starting
   2249  * resync or by the MD_MN_SET_MM_OWNER ioctl with the MD_MN_MM_CHOOSE_OWNER flag
   2250  * set. The ioctl is called with this flag set when we are in the cluster
   2251  * reconfig and we wish to set a new owner for a resync whose owner has left
   2252  * the cluster. We use a resync owner count to implement a round robin
   2253  * allocation of resync owners. We send a message to the master including
   2254  * this count and the message handler uses it to select an owner from the
   2255  * nodelist and then sends a SET_MM_OWNER message to the chosen node to
   2256  * become the owner.
   2257  *
   2258  * Input:
   2259  *	un	- unit reference
   2260  *	ownp	- owner information (if non-NULL)
   2261  */
   2262 int
   2263 mirror_choose_owner(mm_unit_t *un, md_mn_req_owner_t *ownp)
   2264 {
   2265 	set_t		setno;
   2266 	md_mn_msg_chooseid_t	*msg;
   2267 
   2268 	/* This function is only valid for a multi-node set */
   2269 	setno = MD_UN2SET(un);
   2270 	if (!MD_MNSET_SETNO(setno)) {
   2271 		return (EINVAL);
   2272 	}
   2273 
   2274 
   2275 #ifdef DEBUG
   2276 	if (mirror_debug_flag)
   2277 		printf("send choose owner message, mnum = %x,"
   2278 		    "rcnt = %d\n", MD_SID(un), md_set[setno].s_rcnt);
   2279 #endif
   2280 
   2281 	/*
   2282 	 * setup message with current resync count
   2283 	 * and then increment the count. If we're called with a non-NULL
   2284 	 * owner then we are reestablishing the owner of the mirror. In this
   2285 	 * case we have to flag this to the message handler and set rcnt to
   2286 	 * the new owner node.
   2287 	 */
   2288 	msg = kmem_zalloc(sizeof (md_mn_msg_chooseid_t), KM_SLEEP);
   2289 	msg->msg_chooseid_mnum = MD_SID(un);
   2290 	if (ownp == NULL) {
   2291 		mutex_enter(&md_mx);
   2292 		msg->msg_chooseid_rcnt = md_set[setno].s_rcnt;
   2293 		md_set[setno].s_rcnt++;
   2294 		mutex_exit(&md_mx);
   2295 		msg->msg_chooseid_set_node = B_FALSE;
   2296 	} else {
   2297 		msg->msg_chooseid_rcnt = ownp->owner;
   2298 		msg->msg_chooseid_set_node = B_TRUE;
   2299 	}
   2300 
   2301 	/*
   2302 	 * Spawn a thread to issue the ksend_message() call so that we can
   2303 	 * drop the ioctl lock hierarchy that is blocking further rpc.metad and
   2304 	 * commd set ownership checking.
   2305 	 */
   2306 	if (thread_create(NULL, 0, mirror_choose_owner_thread, (caddr_t)msg,
   2307 	    0, &p0, TS_RUN, 60) == NULL) {
   2308 		kmem_free(msg, sizeof (md_mn_msg_chooseid_t));
   2309 		return (EFAULT);
   2310 	} else {
   2311 		return (0);
   2312 	}
   2313 }
   2314 
   2315 /*
   2316  * mirror_get_status:
   2317  * ----------------------------------
   2318  * Called by nodes which are not the master node of the cluster. Obtains the
   2319  * master abr state and the submirror status for each valid submirror of the
   2320  * unit so that the status returned by metastat is consistent across the
   2321  * cluster.
   2322  * We update tstate for the mirror and both the sm_flag and the sm_state for
   2323  * each submirror.
   2324  *
   2325  * Input:
   2326  *	un	mirror to obtain status from
   2327  *
   2328  * Calling Convention:
   2329  *	writerlock (either ioctl or unit) must be held
   2330  */
   2331 void
   2332 mirror_get_status(mm_unit_t *un, IOLOCK *lockp)
   2333 {
   2334 	mm_submirror_t		*sm;
   2335 	int			smi;
   2336 	int			rval;
   2337 	md_mn_kresult_t		*kres;
   2338 	md_mn_msg_mir_state_t	msg;
   2339 	md_mn_msg_mir_state_res_t	*res;
   2340 	set_t			setno = MD_UN2SET(un);
   2341 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
   2342 
   2343 
   2344 	ASSERT(ui->ui_lock & MD_UL_WRITER);
   2345 
   2346 	/*
   2347 	 * Get all of the information for the mirror.
   2348 	 */
   2349 	bzero(&msg, sizeof (msg));
   2350 	msg.mir_state_mnum = MD_SID(un);
   2351 
   2352 	/*
   2353 	 * Must drop the writerlock over ksend_message since another
   2354 	 * thread on this node could be running a higher class message
   2355 	 * and be trying grab the readerlock.
   2356 	 *
   2357 	 * If we are in the context of an ioctl, drop the ioctl lock.
   2358 	 * lockp holds the list of locks held.
   2359 	 */
   2360 	if (lockp) {
   2361 		IOLOCK_RETURN_RELEASE(0, lockp);
   2362 	} else {
   2363 		md_unit_writerexit(ui);
   2364 	}
   2365 
   2366 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
   2367 	rval = mdmn_ksend_message(setno, MD_MN_MSG_GET_MIRROR_STATE,
   2368 	    MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)&msg,
   2369 	    sizeof (msg), kres);
   2370 
   2371 	/* if the node hasn't yet joined, it's Ok. */
   2372 	if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
   2373 	    (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
   2374 		mdmn_ksend_show_error(rval, kres, "GET_MIRROR_STATE");
   2375 		cmn_err(CE_WARN, "ksend_message failure: GET_MIRROR_STATE");
   2376 	}
   2377 
   2378 	/* if dropped the lock previously, regain it */
   2379 	if (lockp) {
   2380 		IOLOCK_RETURN_REACQUIRE(lockp);
   2381 	} else {
   2382 		/*
   2383 		 * Reacquire dropped locks and update acquirecnts
   2384 		 * appropriately.
   2385 		 */
   2386 		(void) md_unit_writerlock(ui);
   2387 	}
   2388 
   2389 	/*
   2390 	 * Check to see if we've got a believable amount of returned data.
   2391 	 * If not, we simply return as there is no usable information.
   2392 	 */
   2393 	if (kres->kmmr_res_size < sizeof (*res)) {
   2394 		cmn_err(CE_WARN, "GET_MIRROR_STATE: returned %d bytes, expected"
   2395 		    " %d\n", kres->kmmr_res_size, (int)sizeof (*res));
   2396 		kmem_free(kres, sizeof (md_mn_kresult_t));
   2397 		return;
   2398 	}
   2399 
   2400 	/*
   2401 	 * Copy the results from the call back into our sm_state/sm_flags
   2402 	 */
   2403 	res = (md_mn_msg_mir_state_res_t *)kres->kmmr_res_data;
   2404 #ifdef DEBUG
   2405 	if (mirror_debug_flag)
   2406 		printf("mirror_get_status: %s\n", md_shortname(MD_SID(un)));
   2407 #endif
   2408 	for (smi = 0; smi < NMIRROR; smi++) {
   2409 		sm = &un->un_sm[smi];
   2410 #ifdef DEBUG
   2411 		if (mirror_debug_flag) {
   2412 			printf("curr state %4x, new state %4x\n", sm->sm_state,
   2413 			    res->sm_state[smi]);
   2414 			printf("curr_flags %4x, new flags %4x\n", sm->sm_flags,
   2415 			    res->sm_flags[smi]);
   2416 		}
   2417 #endif
   2418 		sm->sm_state = res->sm_state[smi];
   2419 		sm->sm_flags = res->sm_flags[smi];
   2420 	}
   2421 
   2422 	/* Set ABR if set on the Master node */
   2423 	ui->ui_tstate |= (res->mir_tstate & MD_ABR_CAP);
   2424 
   2425 	kmem_free(kres, sizeof (md_mn_kresult_t));
   2426 }
   2427 
   2428 /*
   2429  * mirror_get_mir_state:
   2430  * -------------------
   2431  * Obtain the ABR state of a mirror and the state of all submirrors from the
   2432  * master node for the unit specified in sm_state->mnum.
   2433  * Called by MD_MN_GET_MIRROR_STATE ioctl.
   2434  */
   2435 static int
   2436 mirror_get_mir_state(md_mn_get_mir_state_t *p, IOLOCK *lockp)
   2437 {
   2438 	mm_unit_t	*un;
   2439 	set_t		setno;
   2440 	md_error_t	mde;
   2441 
   2442 	mdclrerror(&mde);
   2443 
   2444 	if ((un = mirror_getun(p->mnum, &mde, WR_LOCK, lockp)) == NULL) {
   2445 		return (EINVAL);
   2446 	}
   2447 	setno = MD_MIN2SET(p->mnum);
   2448 	if (!MD_MNSET_SETNO(setno)) {
   2449 		return (EINVAL);
   2450 	}
   2451 
   2452 	/*
   2453 	 * We've now got a writerlock on the unit structure (so no-one can
   2454 	 * modify the incore values) and we'll now send the message to the
   2455 	 * master node. Since we're only called as part of a reconfig cycle
   2456 	 * we don't need to release the unit locks across the ksend_message as
   2457 	 * only the master node will process it, and we never send this to
   2458 	 * ourselves if we're the master.
   2459 	 */
   2460 
   2461 	mirror_get_status(un, lockp);
   2462 
   2463 	return (0);
   2464 }
   2465 
   2466 static int
   2467 mirror_admin_ioctl(int cmd, void *data, int mode, IOLOCK *lockp)
   2468 {
   2469 	size_t	sz = 0;
   2470 	void	*d = NULL;
   2471 	int	err = 0;
   2472 
   2473 	/* We can only handle 32-bit clients for internal commands */
   2474 	if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) {
   2475 		return (EINVAL);
   2476 	}
   2477 	/* dispatch ioctl */
   2478 	switch (cmd) {
   2479 
   2480 	case MD_IOCSET:
   2481 	{
   2482 		if (! (mode & FWRITE))
   2483 			return (EACCES);
   2484 
   2485 		sz = sizeof (md_set_params_t);
   2486 
   2487 		d = kmem_alloc(sz, KM_SLEEP);
   2488 
   2489 		if (ddi_copyin(data, d, sz, mode)) {
   2490 			err = EFAULT;
   2491 			break;
   2492 		}
   2493 
   2494 		err = mirror_set(d, mode);
   2495 		break;
   2496 	}
   2497 
   2498 	case MD_IOCGET:
   2499 	{
   2500 		if (! (mode & FREAD))
   2501 			return (EACCES);
   2502 
   2503 		sz = sizeof (md_i_get_t);
   2504 
   2505 		d = kmem_alloc(sz, KM_SLEEP);
   2506 
   2507 		if (ddi_copyin(data, d, sz, mode)) {
   2508 			err = EFAULT;
   2509 			break;
   2510 		}
   2511 
   2512 		err = mirror_get(d, mode, lockp);
   2513 		break;
   2514 	}
   2515 
   2516 	case MD_IOCRESET:
   2517 	{
   2518 		if (! (mode & FWRITE))
   2519 			return (EACCES);
   2520 
   2521 		sz = sizeof (md_i_reset_t);
   2522 		d = kmem_alloc(sz, KM_SLEEP);
   2523 
   2524 		if (ddi_copyin(data, d, sz, mode)) {
   2525 			err = EFAULT;
   2526 			break;
   2527 		}
   2528 
   2529 		err = mirror_reset((md_i_reset_t *)d);
   2530 		break;
   2531 	}
   2532 
   2533 	case MD_IOCSETSYNC:
   2534 	case MD_MN_SETSYNC:
   2535 	{
   2536 		if (! (mode & FWRITE))
   2537 			return (EACCES);
   2538 
   2539 		sz = sizeof (md_resync_ioctl_t);
   2540 		d = kmem_alloc(sz, KM_SLEEP);
   2541 
   2542 		if (ddi_copyin(data, d, sz, mode)) {
   2543 			err = EFAULT;
   2544 			break;
   2545 		}
   2546 
   2547 		err = mirror_ioctl_resync((md_resync_ioctl_t *)d, lockp);
   2548 		break;
   2549 	}
   2550 
   2551 	case MD_IOCGETSYNC:
   2552 	{
   2553 		if (! (mode & FREAD))
   2554 			return (EACCES);
   2555 
   2556 		sz = sizeof (md_resync_ioctl_t);
   2557 		d = kmem_alloc(sz, KM_SLEEP);
   2558 
   2559 		if (ddi_copyin(data, d, sz, mode)) {
   2560 			err = EFAULT;
   2561 			break;
   2562 		}
   2563 
   2564 		err = mirror_get_resync((md_resync_ioctl_t *)d);
   2565 		break;
   2566 	}
   2567 
   2568 	case MD_IOCREPLACE:
   2569 	{
   2570 		if (! (mode & FWRITE))
   2571 			return (EACCES);
   2572 
   2573 		sz = sizeof (replace_params_t);
   2574 		d = kmem_alloc(sz, KM_SLEEP);
   2575 
   2576 		if (ddi_copyin(data, d, sz, mode)) {
   2577 			err = EFAULT;
   2578 			break;
   2579 		}
   2580 
   2581 		err = comp_replace((replace_params_t *)d, lockp);
   2582 		break;
   2583 	}
   2584 
   2585 	case MD_IOCOFFLINE:
   2586 	{
   2587 		if (! (mode & FWRITE))
   2588 			return (EACCES);
   2589 
   2590 		sz = sizeof (md_i_off_on_t);
   2591 		d = kmem_alloc(sz, KM_SLEEP);
   2592 
   2593 		if (ddi_copyin(data, d, sz, mode)) {
   2594 			err = EFAULT;
   2595 			break;
   2596 		}
   2597 
   2598 		err = mirror_offline((md_i_off_on_t *)d, lockp);
   2599 		break;
   2600 	}
   2601 
   2602 	case MD_IOCONLINE:
   2603 	{
   2604 		if (! (mode & FWRITE))
   2605 			return (EACCES);
   2606 
   2607 		sz = sizeof (md_i_off_on_t);
   2608 		d = kmem_alloc(sz, KM_SLEEP);
   2609 
   2610 		if (ddi_copyin(data, d, sz, mode)) {
   2611 			err = EFAULT;
   2612 			break;
   2613 		}
   2614 
   2615 		err = mirror_online((md_i_off_on_t *)d, lockp);
   2616 		break;
   2617 	}
   2618 
   2619 	case MD_IOCDETACH:
   2620 	{
   2621 		if (! (mode & FWRITE))
   2622 			return (EACCES);
   2623 
   2624 		sz = sizeof (md_detach_params_t);
   2625 		d = kmem_alloc(sz, KM_SLEEP);
   2626 
   2627 		if (ddi_copyin(data, d, sz, mode)) {
   2628 			err = EFAULT;
   2629 			break;
   2630 		}
   2631 
   2632 		err = mirror_detach((md_detach_params_t *)d, lockp);
   2633 		break;
   2634 	}
   2635 
   2636 	case MD_IOCATTACH:
   2637 	{
   2638 
   2639 		if (! (mode & FWRITE))
   2640 			return (EACCES);
   2641 
   2642 		sz = sizeof (md_att_struct_t);
   2643 		d = kmem_alloc(sz, KM_SLEEP);
   2644 
   2645 		if (ddi_copyin(data, d, sz, mode)) {
   2646 			err = EFAULT;
   2647 			break;
   2648 		}
   2649 
   2650 		err = mirror_attach((md_att_struct_t *)d, lockp);
   2651 		break;
   2652 	}
   2653 
   2654 	case MD_IOCGET_DEVS:
   2655 	{
   2656 		if (! (mode & FREAD))
   2657 			return (EACCES);
   2658 
   2659 		sz = sizeof (md_getdevs_params_t);
   2660 
   2661 		d = kmem_alloc(sz, KM_SLEEP);
   2662 
   2663 		if (ddi_copyin(data, d, sz, mode)) {
   2664 			err = EFAULT;
   2665 			break;
   2666 		}
   2667 
   2668 		err = mirror_getdevs(d, mode, lockp);
   2669 		break;
   2670 	}
   2671 
   2672 	case MD_IOCGROW:
   2673 	{
   2674 		if (! (mode & FWRITE))
   2675 			return (EACCES);
   2676 
   2677 		sz = sizeof (md_grow_params_t);
   2678 
   2679 		d = kmem_alloc(sz, KM_SLEEP);
   2680 
   2681 		if (ddi_copyin(data, d, sz, mode)) {
   2682 			err = EFAULT;
   2683 			break;
   2684 		}
   2685 
   2686 		err = mirror_grow(d, lockp);
   2687 		break;
   2688 	}
   2689 
   2690 	case MD_IOCCHANGE:
   2691 	{
   2692 		if (! (mode & FWRITE))
   2693 			return (EACCES);
   2694 
   2695 		sz = sizeof (md_mirror_params_t);
   2696 		d = kmem_alloc(sz, KM_SLEEP);
   2697 
   2698 		if (ddi_copyin(data, d, sz, mode)) {
   2699 			err = EFAULT;
   2700 			break;
   2701 		}
   2702 
   2703 		err = mirror_change((md_mirror_params_t *)d, lockp);
   2704 		break;
   2705 	}
   2706 
   2707 	case MD_IOCPROBE_DEV:
   2708 	{
   2709 		md_probedev_impl_t	*p = NULL;
   2710 		md_probedev_t		*ph = NULL;
   2711 		daemon_queue_t		*hdr = NULL;
   2712 		int			i;
   2713 		size_t			sz2 = 0;
   2714 
   2715 		if (! (mode & FREAD))
   2716 			return (EACCES);
   2717 
   2718 
   2719 		sz = sizeof (md_probedev_t);
   2720 		d = kmem_alloc(sz, KM_SLEEP);
   2721 
   2722 		/* now copy in the data */
   2723 		if (ddi_copyin(data, d, sz, mode)) {
   2724 			err = EFAULT;
   2725 			goto free_mem;
   2726 		}
   2727 
   2728 		/*
   2729 		 * Sanity test the args. Test name should have the keyword
   2730 		 * probe.
   2731 		 */
   2732 
   2733 		p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP);
   2734 
   2735 		p->probe_sema = NULL;
   2736 		p->probe_mx = NULL;
   2737 		p->probe.mnum_list = (uint64_t)NULL;
   2738 
   2739 		ph = (struct md_probedev *)d;
   2740 
   2741 		p->probe.nmdevs = ph->nmdevs;
   2742 		(void) strcpy(p->probe.test_name, ph->test_name);
   2743 		bcopy(&ph->md_driver, &(p->probe.md_driver),
   2744 		    sizeof (md_driver_t));
   2745 
   2746 		if ((p->probe.nmdevs < 1) ||
   2747 		    (strstr(p->probe.test_name, "probe") == NULL)) {
   2748 			err = EINVAL;
   2749 			goto free_mem;
   2750 		}
   2751 
   2752 
   2753 		sz2 = sizeof (minor_t) * p->probe.nmdevs;
   2754 		p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz2,
   2755 		    KM_SLEEP);
   2756 
   2757 		if (ddi_copyin((void *)(uintptr_t)ph->mnum_list,
   2758 		    (void *)(uintptr_t)p->probe.mnum_list, sz2, mode)) {
   2759 			err = EFAULT;
   2760 			goto free_mem;
   2761 		}
   2762 
   2763 		if (err = md_init_probereq(p, &hdr))
   2764 			goto free_mem;
   2765 
   2766 		/*
   2767 		 * put the request on the queue and wait.
   2768 		 */
   2769 
   2770 		daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW);
   2771 
   2772 		(void) IOLOCK_RETURN(0, lockp);
   2773 		/* wait for the events to occur */
   2774 		for (i = 0; i < p->probe.nmdevs; i++) {
   2775 			sema_p(PROBE_SEMA(p));
   2776 		}
   2777 		while (md_ioctl_lock_enter() == EINTR)
   2778 		;
   2779 
   2780 		/*
   2781 		 * clean up. The hdr list is freed in the probe routines
   2782 		 * since the list is NULL by the time we get here.
   2783 		 */
   2784 free_mem:
   2785 		if (p) {
   2786 			if (p->probe_sema != NULL) {
   2787 				sema_destroy(PROBE_SEMA(p));
   2788 				kmem_free(p->probe_sema, sizeof (ksema_t));
   2789 			}
   2790 			if (p->probe_mx != NULL) {
   2791 				mutex_destroy(PROBE_MX(p));
   2792 				kmem_free(p->probe_mx, sizeof (kmutex_t));
   2793 			}
   2794 			if ((uintptr_t)p->probe.mnum_list)
   2795 				kmem_free((void *)(uintptr_t)
   2796 				    p->probe.mnum_list, sz2);
   2797 
   2798 			kmem_free(p, sizeof (md_probedev_impl_t));
   2799 		}
   2800 		break;
   2801 	}
   2802 
   2803 	case MD_MN_SET_MM_OWNER:
   2804 	{
   2805 		if (! (mode & FWRITE))
   2806 			return (EACCES);
   2807 
   2808 		sz = sizeof (md_set_mmown_params_t);
   2809 		d = kmem_alloc(sz, KM_SLEEP);
   2810 
   2811 		if (ddi_copyin(data, d, sz, mode) != 0) {
   2812 			err = EFAULT;
   2813 			break;
   2814 		}
   2815 
   2816 		err = mirror_set_owner((md_set_mmown_params_t *)d, lockp);
   2817 		break;
   2818 	}
   2819 
   2820 	case MD_MN_GET_MM_OWNER:
   2821 	{
   2822 		if (! (mode & FREAD))
   2823 			return (EACCES);
   2824 
   2825 		sz = sizeof (md_set_mmown_params_t);
   2826 		d = kmem_alloc(sz, KM_SLEEP);
   2827 
   2828 		if (ddi_copyin(data, d, sz, mode) != 0) {
   2829 			err = EFAULT;
   2830 			break;
   2831 		}
   2832 
   2833 		err = mirror_get_owner((md_set_mmown_params_t *)d, lockp);
   2834 		break;
   2835 	}
   2836 
   2837 	case MD_MN_MM_OWNER_STATUS:
   2838 	{
   2839 		if (! (mode & FREAD))
   2840 			return (EACCES);
   2841 
   2842 		sz = sizeof (md_mn_own_status_t);
   2843 		d = kmem_alloc(sz, KM_SLEEP);
   2844 
   2845 		if (ddi_copyin(data, d, sz, mode) != 0) {
   2846 			err = EFAULT;
   2847 			break;
   2848 		}
   2849 
   2850 		err = mirror_get_owner_status((md_mn_own_status_t *)d, lockp);
   2851 		break;
   2852 	}
   2853 
   2854 	case MD_MN_SET_STATE:
   2855 	{
   2856 		if (! (mode & FWRITE))
   2857 			return (EACCES);
   2858 
   2859 		sz = sizeof (md_set_state_params_t);
   2860 		d = kmem_alloc(sz, KM_SLEEP);
   2861 
   2862 		if (ddi_copyin(data, d, sz, mode)) {
   2863 			err = EFAULT;
   2864 			break;
   2865 		}
   2866 
   2867 		err  = mirror_set_state((md_set_state_params_t *)d, lockp);
   2868 		break;
   2869 	}
   2870 
   2871 	case MD_MN_SUSPEND_WRITES:
   2872 	{
   2873 		if (! (mode & FREAD))
   2874 			return (EACCES);
   2875 
   2876 		sz = sizeof (md_suspend_wr_params_t);
   2877 		d = kmem_alloc(sz, KM_SLEEP);
   2878 
   2879 		if (ddi_copyin(data, d, sz, mode) != 0) {
   2880 			err = EFAULT;
   2881 			break;
   2882 		}
   2883 
   2884 		err = mirror_suspend_writes((md_suspend_wr_params_t *)d);
   2885 		break;
   2886 	}
   2887 
   2888 	case MD_MN_RESYNC:
   2889 	{
   2890 		sz = sizeof (md_mn_rs_params_t);
   2891 		d = kmem_alloc(sz, KM_SLEEP);
   2892 
   2893 		if (ddi_copyin(data, d, sz, mode) != 0) {
   2894 			err = EFAULT;
   2895 			break;
   2896 		}
   2897 
   2898 		err = mirror_resync_message((md_mn_rs_params_t *)d, lockp);
   2899 		break;
   2900 	}
   2901 
   2902 	case MD_MN_ALLOCATE_HOTSPARE:
   2903 	{
   2904 		if (! (mode & FWRITE))
   2905 			return (EACCES);
   2906 
   2907 		sz = sizeof (md_alloc_hotsp_params_t);
   2908 		d = kmem_alloc(sz, KM_SLEEP);
   2909 
   2910 		if (ddi_copyin(data, d, sz, mode)) {
   2911 			err = EFAULT;
   2912 			break;
   2913 		}
   2914 
   2915 		err  = mirror_allocate_hotspare((md_alloc_hotsp_params_t *)d,
   2916 		    lockp);
   2917 		break;
   2918 	}
   2919 
   2920 	case MD_MN_POKE_HOTSPARES:
   2921 	{
   2922 		(void) poke_hotspares();
   2923 		break;
   2924 	}
   2925 
   2926 	case MD_MN_SET_CAP:
   2927 	{
   2928 		if (! (mode & FWRITE))
   2929 			return (EACCES);
   2930 
   2931 		sz = sizeof (md_mn_setcap_params_t);
   2932 		d = kmem_alloc(sz, KM_SLEEP);
   2933 
   2934 		if (ddi_copyin(data, d, sz, mode)) {
   2935 			err = EFAULT;
   2936 			break;
   2937 		}
   2938 
   2939 		err  = mirror_set_capability((md_mn_setcap_params_t *)d,
   2940 		    lockp);
   2941 		break;
   2942 	}
   2943 
   2944 	case MD_MN_GET_MIRROR_STATE:
   2945 	{
   2946 		sz = sizeof (md_mn_get_mir_state_t);
   2947 		d = kmem_zalloc(sz, KM_SLEEP);
   2948 
   2949 		if (ddi_copyin(data, d, sz, mode)) {
   2950 			err = EFAULT;
   2951 			break;
   2952 		}
   2953 
   2954 		err = mirror_get_mir_state((md_mn_get_mir_state_t *)d,
   2955 		    lockp);
   2956 		break;
   2957 	}
   2958 
   2959 	case MD_MN_RR_DIRTY:
   2960 	{
   2961 		sz = sizeof (md_mn_rr_dirty_params_t);
   2962 		d = kmem_zalloc(sz, KM_SLEEP);
   2963 
   2964 		if (ddi_copyin(data, d, sz, mode)) {
   2965 			err = EFAULT;
   2966 			break;
   2967 		}
   2968 
   2969 		err = mirror_set_dirty_rr((md_mn_rr_dirty_params_t *)d);
   2970 		break;
   2971 	}
   2972 
   2973 	case MD_MN_RR_CLEAN:
   2974 	{
   2975 		md_mn_rr_clean_params_t tmp;
   2976 
   2977 		/* get the first part of the structure to find the size */
   2978 		if (ddi_copyin(data, &tmp, sizeof (tmp), mode)) {
   2979 			err = EFAULT;
   2980 			break;
   2981 		}
   2982 
   2983 		sz = MDMN_RR_CLEAN_PARAMS_SIZE(&tmp);
   2984 		d = kmem_zalloc(sz, KM_SLEEP);
   2985 
   2986 		if (ddi_copyin(data, d, sz, mode)) {
   2987 			err = EFAULT;
   2988 			break;
   2989 		}
   2990 
   2991 		err = mirror_set_clean_rr((md_mn_rr_clean_params_t *)d);
   2992 		break;
   2993 	}
   2994 
   2995 	default:
   2996 		return (ENOTTY);
   2997 	}
   2998 
   2999 	/*
   3000 	 * copyout and free any args
   3001 	 */
   3002 	if (sz != 0) {
   3003 		if (err == 0) {
   3004 			if (ddi_copyout(d, data, sz, mode) != 0) {
   3005 				err = EFAULT;
   3006 			}
   3007 		}
   3008 		kmem_free(d, sz);
   3009 	}
   3010 	return (err);
   3011 }
   3012 
   3013 int
   3014 md_mirror_ioctl(
   3015 	dev_t		ddi_dev,
   3016 	int		cmd,
   3017 	void		*data,
   3018 	int		mode,
   3019 	IOLOCK		*lockp
   3020 )
   3021 {
   3022 	minor_t		mnum = getminor(ddi_dev);
   3023 	mm_unit_t	*un;
   3024 	int		err = 0;
   3025 
   3026 	/* handle admin ioctls */
   3027 	if (mnum == MD_ADM_MINOR)
   3028 		return (mirror_admin_ioctl(cmd, data, mode, lockp));
   3029 
   3030 	/* check unit */
   3031 	if ((MD_MIN2SET(mnum) >= md_nsets) ||
   3032 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
   3033 	    ((un = MD_UNIT(mnum)) == NULL))
   3034 		return (ENXIO);
   3035 	/* is this a supported ioctl? */
   3036 	err = md_check_ioctl_against_unit(cmd, un->c);
   3037 	if (err != 0) {
   3038 		return (err);
   3039 	}
   3040 
   3041 	/* dispatch ioctl */
   3042 	switch (cmd) {
   3043 
   3044 	case DKIOCINFO:
   3045 	{
   3046 		struct dk_cinfo	*p;
   3047 
   3048 		if (! (mode & FREAD))
   3049 			return (EACCES);
   3050 
   3051 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
   3052 
   3053 		get_info(p, mnum);
   3054 		if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0)
   3055 			err = EFAULT;
   3056 
   3057 		kmem_free(p, sizeof (*p));
   3058 		return (err);
   3059 	}
   3060 
   3061 	case DKIOCGMEDIAINFO:
   3062 	{
   3063 		struct dk_minfo	p;
   3064 
   3065 		if (! (mode & FREAD))
   3066 			return (EACCES);
   3067 
   3068 		get_minfo(&p, mnum);
   3069 		if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0)
   3070 			err = EFAULT;
   3071 
   3072 		return (err);
   3073 	}
   3074 
   3075 	case DKIOCGGEOM:
   3076 	{
   3077 		struct dk_geom	*p;
   3078 
   3079 		if (! (mode & FREAD))
   3080 			return (EACCES);
   3081 
   3082 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
   3083 
   3084 		if ((err = mirror_get_geom(un, p)) == 0) {
   3085 			if (ddi_copyout((caddr_t)p, data, sizeof (*p),
   3086 			    mode) != 0)
   3087 				err = EFAULT;
   3088 		}
   3089 
   3090 		kmem_free(p, sizeof (*p));
   3091 		return (err);
   3092 	}
   3093 
   3094 	case DKIOCGVTOC:
   3095 	{
   3096 		struct vtoc	*vtoc;
   3097 
   3098 		if (! (mode & FREAD))
   3099 			return (EACCES);
   3100 
   3101 		vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
   3102 
   3103 		if ((err = mirror_get_vtoc(un, vtoc)) != 0) {
   3104 			kmem_free(vtoc, sizeof (*vtoc));
   3105 			return (err);
   3106 		}
   3107 
   3108 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
   3109 			if (ddi_copyout(vtoc, data, sizeof (*vtoc), mode))
   3110 				err = EFAULT;
   3111 		}
   3112 #ifdef _SYSCALL32
   3113 		else {
   3114 			struct vtoc32	*vtoc32;
   3115 
   3116 			vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
   3117 
   3118 			vtoctovtoc32((*vtoc), (*vtoc32));
   3119 			if (ddi_copyout(vtoc32, data, sizeof (*vtoc32), mode))
   3120 				err = EFAULT;
   3121 			kmem_free(vtoc32, sizeof (*vtoc32));
   3122 		}
   3123 #endif /* _SYSCALL32 */
   3124 
   3125 		kmem_free(vtoc, sizeof (*vtoc));
   3126 		return (err);
   3127 	}
   3128 
   3129 	case DKIOCSVTOC:
   3130 	{
   3131 		struct vtoc	*vtoc;
   3132 
   3133 		if (! (mode & FWRITE))
   3134 			return (EACCES);
   3135 
   3136 		vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
   3137 
   3138 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
   3139 			if (ddi_copyin(data, vtoc, sizeof (*vtoc), mode)) {
   3140 				err = EFAULT;
   3141 			}
   3142 		}
   3143 #ifdef _SYSCALL32
   3144 		else {
   3145 			struct vtoc32	*vtoc32;
   3146 
   3147 			vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
   3148 
   3149 			if (ddi_copyin(data, vtoc32, sizeof (*vtoc32), mode)) {
   3150 				err = EFAULT;
   3151 			} else {
   3152 				vtoc32tovtoc((*vtoc32), (*vtoc));
   3153 			}
   3154 			kmem_free(vtoc32, sizeof (*vtoc32));
   3155 		}
   3156 #endif /* _SYSCALL32 */
   3157 
   3158 		if (err == 0)
   3159 			err = mirror_set_vtoc(un, vtoc);
   3160 
   3161 		kmem_free(vtoc, sizeof (*vtoc));
   3162 		return (err);
   3163 	}
   3164 
   3165 	case DKIOCGEXTVTOC:
   3166 	{
   3167 		struct extvtoc	*extvtoc;
   3168 
   3169 		if (! (mode & FREAD))
   3170 			return (EACCES);
   3171 
   3172 		extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
   3173 
   3174 		if ((err = mirror_get_extvtoc(un, extvtoc)) != 0) {
   3175 			kmem_free(extvtoc, sizeof (*extvtoc));
   3176 			return (err);
   3177 		}
   3178 
   3179 		if (ddi_copyout(extvtoc, data, sizeof (*extvtoc), mode))
   3180 			err = EFAULT;
   3181 
   3182 		kmem_free(extvtoc, sizeof (*extvtoc));
   3183 		return (err);
   3184 	}
   3185 
   3186 	case DKIOCSEXTVTOC:
   3187 	{
   3188 		struct extvtoc	*extvtoc;
   3189 
   3190 		if (! (mode & FWRITE))
   3191 			return (EACCES);
   3192 
   3193 		extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
   3194 
   3195 		if (ddi_copyin(data, extvtoc, sizeof (*extvtoc), mode)) {
   3196 			err = EFAULT;
   3197 		}
   3198 
   3199 		if (err == 0)
   3200 			err = mirror_set_extvtoc(un, extvtoc);
   3201 
   3202 		kmem_free(extvtoc, sizeof (*extvtoc));
   3203 		return (err);
   3204 	}
   3205 
   3206 	case DKIOCGAPART:
   3207 	{
   3208 		struct dk_map	dmp;
   3209 
   3210 		if ((err = mirror_get_cgapart(un, &dmp)) != 0) {
   3211 			return (err);
   3212 		}
   3213 
   3214 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
   3215 			if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp),
   3216 			    mode) != 0)
   3217 				err = EFAULT;
   3218 		}
   3219 #ifdef _SYSCALL32
   3220 		else {
   3221 			struct dk_map32 dmp32;
   3222 
   3223 			dmp32.dkl_cylno = dmp.dkl_cylno;
   3224 			dmp32.dkl_nblk = dmp.dkl_nblk;
   3225 
   3226 			if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32),
   3227 			    mode) != 0)
   3228 				err = EFAULT;
   3229 		}
   3230 #endif /* _SYSCALL32 */
   3231 
   3232 		return (err);
   3233 	}
   3234 	case DKIOCGETEFI:
   3235 	{
   3236 		/*
   3237 		 * This one can be done centralized,
   3238 		 * no need to put in the same code for all types of metadevices
   3239 		 */
   3240 		return (md_dkiocgetefi(mnum, data, mode));
   3241 	}
   3242 	case DKIOCSETEFI:
   3243 	{
   3244 		/*
   3245 		 * This one can be done centralized,
   3246 		 * no need to put in the same code for all types of metadevices
   3247 		 */
   3248 		return (md_dkiocsetefi(mnum, data, mode));
   3249 	}
   3250 	case DKIOCPARTITION:
   3251 	{
   3252 		return (md_dkiocpartition(mnum, data, mode));
   3253 	}
   3254 
   3255 	case DKIOCGETVOLCAP:
   3256 	{
   3257 		volcap_t	vc;
   3258 		mdi_unit_t	*ui;
   3259 
   3260 		/* Only valid for MN sets */
   3261 		if (!MD_MNSET_SETNO(MD_MIN2SET(mnum)))
   3262 			return (EINVAL);
   3263 
   3264 		ui = MDI_UNIT(mnum);
   3265 		if (! (mode & FREAD))
   3266 			return (EACCES);
   3267 
   3268 		vc.vc_info = DKV_ABR_CAP | DKV_DMR_CAP;
   3269 		vc.vc_set = 0;
   3270 		if (ui->ui_tstate & MD_ABR_CAP) {
   3271 			vc.vc_set |= DKV_ABR_CAP;
   3272 		}
   3273 		if (ddi_copyout(&vc, data, sizeof (volcap_t), mode))
   3274 			err = EFAULT;
   3275 		return (err);
   3276 	}
   3277 
   3278 	case DKIOCSETVOLCAP:
   3279 	{
   3280 		volcap_t	vc;
   3281 		volcapset_t	volcap = 0;
   3282 		mdi_unit_t	*ui;
   3283 
   3284 		/* Only valid for MN sets */
   3285 		if (!MD_MNSET_SETNO(MD_MIN2SET(mnum)))
   3286 			return (EINVAL);
   3287 
   3288 		ui = MDI_UNIT(mnum);
   3289 		if (! (mode & FWRITE))
   3290 			return (EACCES);
   3291 
   3292 		if (ddi_copyin(data, &vc, sizeof (volcap_t), mode))
   3293 			return (EFAULT);
   3294 
   3295 		/* Not valid if a submirror is offline */
   3296 		if (un->c.un_status & MD_UN_OFFLINE_SM) {
   3297 			return (EINVAL);
   3298 		}
   3299 		if (ui->ui_tstate & MD_ABR_CAP)
   3300 			volcap |= DKV_ABR_CAP;
   3301 		/* Only send capability message if there is a change */
   3302 		if ((vc.vc_set & (DKV_ABR_CAP)) != volcap)
   3303 			err = mdmn_send_capability_message(mnum, vc, lockp);
   3304 		return (err);
   3305 	}
   3306 
   3307 	case DKIOCDMR:
   3308 	{
   3309 		vol_directed_rd_t	*vdr;
   3310 
   3311 #ifdef _MULTI_DATAMODEL
   3312 		vol_directed_rd32_t	*vdr32;
   3313 #endif	/* _MULTI_DATAMODEL */
   3314 
   3315 		/* Only valid for MN sets */
   3316 		if (!MD_MNSET_SETNO(MD_MIN2SET(mnum)))
   3317 			return (EINVAL);
   3318 
   3319 		vdr = kmem_zalloc(sizeof (vol_directed_rd_t), KM_NOSLEEP);
   3320 		if (vdr == NULL)
   3321 			return (ENOMEM);
   3322 
   3323 #ifdef _MULTI_DATAMODEL
   3324 		vdr32 = kmem_zalloc(sizeof (vol_directed_rd32_t), KM_NOSLEEP);
   3325 		if (vdr32 == NULL) {
   3326 			kmem_free(vdr, sizeof (vol_directed_rd_t));
   3327 			return (ENOMEM);
   3328 		}
   3329 
   3330 		switch (ddi_model_convert_from(mode & FMODELS)) {
   3331 		case DDI_MODEL_ILP32:
   3332 			/*
   3333 			 * If we're called from a higher-level driver we don't
   3334 			 * need to manipulate the data. Its already been done by
   3335 			 * the caller.
   3336 			 */
   3337 			if (!(mode & FKIOCTL)) {
   3338 				if (ddi_copyin(data, vdr32, sizeof (*vdr32),
   3339 				    mode)) {
   3340 					kmem_free(vdr, sizeof (*vdr));
   3341 					return (EFAULT);
   3342 				}
   3343 				vdr->vdr_flags = vdr32->vdr_flags;
   3344 				vdr->vdr_offset = vdr32->vdr_offset;
   3345 				vdr->vdr_nbytes = vdr32->vdr_nbytes;
   3346 				vdr->vdr_data =
   3347 				    (void *)(uintptr_t)vdr32->vdr_data;
   3348 				vdr->vdr_side = vdr32->vdr_side;
   3349 				break;
   3350 			}
   3351 			/* FALLTHROUGH */
   3352 
   3353 		case DDI_MODEL_NONE:
   3354 			if (ddi_copyin(data, vdr, sizeof (*vdr), mode)) {
   3355 				kmem_free(vdr32, sizeof (*vdr32));
   3356 				kmem_free(vdr, sizeof (*vdr));
   3357 				return (EFAULT);
   3358 			}
   3359 			break;
   3360 
   3361 		default:
   3362 			kmem_free(vdr32, sizeof (*vdr32));
   3363 			kmem_free(vdr, sizeof (*vdr));
   3364 			return (EFAULT);
   3365 		}
   3366 #else	/* ! _MULTI_DATAMODEL */
   3367 		if (ddi_copyin(data, vdr, sizeof (*vdr), mode)) {
   3368 			kmem_free(vdr, sizeof (*vdr));
   3369 			return (EFAULT);
   3370 		}
   3371 #endif	/* _MULTI_DATAMODEL */
   3372 
   3373 		err = mirror_directed_read(ddi_dev, vdr, mode);
   3374 
   3375 		if (err == 0) {
   3376 #ifdef _MULTI_DATAMODEL
   3377 			switch (ddi_model_convert_from(mode & FMODELS)) {
   3378 			case DDI_MODEL_ILP32:
   3379 				if (!(mode & FKIOCTL)) {
   3380 					vdr32->vdr_flags = vdr->vdr_flags;
   3381 					vdr32->vdr_offset = vdr->vdr_offset;
   3382 					vdr32->vdr_side = vdr->vdr_side;
   3383 					vdr32->vdr_bytesread =
   3384 					    vdr->vdr_bytesread;
   3385 					bcopy(vdr->vdr_side_name,
   3386 					    vdr32->vdr_side_name,
   3387 					    sizeof (vdr32->vdr_side_name));
   3388 
   3389 					if (ddi_copyout(vdr32, data,
   3390 					    sizeof (*vdr32), mode)) {
   3391 						err = EFAULT;
   3392 					}
   3393 					break;
   3394 				}
   3395 				/* FALLTHROUGH */
   3396 
   3397 			case DDI_MODEL_NONE:
   3398 				if (ddi_copyout(vdr, data, sizeof (*vdr), mode))
   3399 					err = EFAULT;
   3400 				break;
   3401 			}
   3402 #else	/* ! _MULTI_DATAMODEL */
   3403 			if (ddi_copyout(vdr, data, sizeof (*vdr), mode))
   3404 				err = EFAULT;
   3405 #endif	/* _MULTI_DATAMODEL */
   3406 			if (vdr->vdr_flags &  DKV_DMR_ERROR)
   3407 				err = EIO;
   3408 		}
   3409 
   3410 #ifdef _MULTI_DATAMODEL
   3411 		kmem_free(vdr32, sizeof (*vdr32));
   3412 #endif	/* _MULTI_DATAMODEL */
   3413 
   3414 		kmem_free(vdr, sizeof (*vdr));
   3415 
   3416 		return (err);
   3417 	}
   3418 
   3419 	default:
   3420 		return (ENOTTY);
   3421 	}
   3422 }
   3423 
   3424 /*
   3425  * rename named service entry points and support functions
   3426  */
   3427 
   3428 /*
   3429  * rename/exchange role swap functions
   3430  *
   3431  * most of these are handled by generic role swap functions
   3432  */
   3433 
   3434 /*
   3435  * MDRNM_UPDATE_KIDS
   3436  * rename/exchange of our child or grandchild
   3437  */
   3438 void
   3439 mirror_renexch_update_kids(md_rendelta_t *delta, md_rentxn_t *rtxnp)
   3440 {
   3441 	mm_submirror_t		*sm;
   3442 	int			smi;
   3443 
   3444 	ASSERT(rtxnp);
   3445 	ASSERT((MDRNOP_RENAME == rtxnp->op) || (rtxnp->op == MDRNOP_EXCHANGE));
   3446 	ASSERT(rtxnp->recids);
   3447 	ASSERT(delta);
   3448 	ASSERT(delta->unp);
   3449 	ASSERT(delta->old_role == MDRR_PARENT);
   3450 	ASSERT(delta->new_role == MDRR_PARENT);
   3451 
   3452 	/*
   3453 	 * since our role isn't changing (parent->parent)
   3454 	 * one of our children must be changing
   3455 	 * find the child being modified, and update
   3456 	 * our notion of it
   3457 	 */
   3458 	for (smi = 0; smi < NMIRROR; smi++) {
   3459 		mm_unit_t *un = (mm_unit_t *)delta->unp;
   3460 
   3461 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
   3462 			continue;
   3463 		}
   3464 		sm = &un->un_sm[smi];
   3465 
   3466 		if (md_getminor(sm->sm_dev) == rtxnp->from.mnum) {
   3467 			sm->sm_dev = md_makedevice(md_major, rtxnp->to.mnum);
   3468 			sm->sm_key = rtxnp->to.key;
   3469 			break;
   3470 		}
   3471 	}
   3472 
   3473 	md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
   3474 }
   3475 
   3476 /*
   3477  * exchange down (self->child)
   3478  */
   3479 void
   3480 mirror_exchange_self_update_from_down(
   3481 	md_rendelta_t	*delta,
   3482 	md_rentxn_t	*rtxnp
   3483 )
   3484 {
   3485 	int			smi;
   3486 	mm_submirror_t		*found;
   3487 	minor_t			from_min, to_min;
   3488 	sv_dev_t		sv;
   3489 
   3490 	ASSERT(rtxnp);
   3491 	ASSERT(MDRNOP_EXCHANGE == rtxnp->op);
   3492 	ASSERT(rtxnp->recids);
   3493 	ASSERT(rtxnp->rec_idx >= 0);
   3494 	ASSERT(delta);
   3495 	ASSERT(delta->unp);
   3496 	ASSERT(delta->uip);
   3497 	ASSERT(delta->old_role == MDRR_SELF);
   3498 	ASSERT(delta->new_role == MDRR_CHILD);
   3499 	ASSERT(md_getminor(delta->dev) == rtxnp->from.mnum);
   3500 
   3501 	from_min = rtxnp->from.mnum;
   3502 	to_min = rtxnp->to.mnum;
   3503 
   3504 	/*
   3505 	 * self id changes in our own unit struct
   3506 	 */
   3507 
   3508 	MD_SID(delta->unp) = to_min;
   3509 
   3510 	/*
   3511 	 * parent identifier need not change
   3512 	 */
   3513 
   3514 	/*
   3515 	 * point the set array pointers at the "new" unit and unit in-cores
   3516 	 * Note: the other half of this transfer is done in the "update_to"
   3517 	 * exchange named service.
   3518 	 */
   3519 
   3520 	MDI_VOIDUNIT(to_min) = delta->uip;
   3521 	MD_VOIDUNIT(to_min) = delta->unp;
   3522 
   3523 	/*
   3524 	 * transfer kstats
   3525 	 */
   3526 
   3527 	delta->uip->ui_kstat = rtxnp->to.kstatp;
   3528 
   3529 	/*
   3530 	 * the unit in-core reference to the get next link's id changes
   3531 	 */
   3532 
   3533 	delta->uip->ui_link.ln_id = to_min;
   3534 
   3535 	/*
   3536 	 * find the child whose identity we're assuming
   3537 	 */
   3538 
   3539 	for (found = NULL, smi = 0; !found && smi < NMIRROR; smi++) {
   3540 		mm_submirror_t		*sm;
   3541 		mm_unit_t		*un = (mm_unit_t *)delta->unp;
   3542 
   3543 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
   3544 			continue;
   3545 		}
   3546 		sm = &un->un_sm[smi];
   3547 
   3548 		if (md_getminor(sm->sm_dev) == to_min) {
   3549 			found = sm;
   3550 		}
   3551 	}
   3552 	ASSERT(found);
   3553 
   3554 	/*
   3555 	 * Update the sub-mirror's identity
   3556 	 */
   3557 	found->sm_dev = md_makedevice(md_major, rtxnp->from.mnum);
   3558 	sv.key = found->sm_key;
   3559 
   3560 	ASSERT(rtxnp->from.key != MD_KEYWILD);
   3561 	ASSERT(rtxnp->from.key != MD_KEYBAD);
   3562 
   3563 	found->sm_key = rtxnp->from.key;
   3564 
   3565 	/*
   3566 	 * delete the key for the old sub-mirror from the name space
   3567 	 */
   3568 
   3569 	sv.setno = MD_MIN2SET(from_min);
   3570 	md_rem_names(&sv, 1);
   3571 
   3572 	/*
   3573 	 * and store the record id (from the unit struct) into recids
   3574 	 */
   3575 
   3576 	md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
   3577 }
   3578 
   3579 /*
   3580  * exchange down (parent->self)
   3581  */
   3582 void
   3583 mirror_exchange_parent_update_to(
   3584 		md_rendelta_t	*delta,
   3585 		md_rentxn_t	*rtxnp
   3586 )
   3587 {
   3588 	int			smi;
   3589 	mm_submirror_t		*found;
   3590 	minor_t			from_min, to_min;
   3591 	sv_dev_t		sv;
   3592 
   3593 	ASSERT(rtxnp);
   3594 	ASSERT(MDRNOP_EXCHANGE == rtxnp->op);
   3595 	ASSERT(rtxnp->recids);
   3596 	ASSERT(rtxnp->rec_idx >= 0);
   3597 	ASSERT(delta);
   3598 	ASSERT(delta->unp);
   3599 	ASSERT(delta->uip);
   3600 	ASSERT(delta->old_role == MDRR_PARENT);
   3601 	ASSERT(delta->new_role == MDRR_SELF);
   3602 	ASSERT(md_getminor(delta->dev) == rtxnp->to.mnum);
   3603 
   3604 	from_min = rtxnp->from.mnum;
   3605 	to_min = rtxnp->to.mnum;
   3606 
   3607 	/*
   3608 	 * self id changes in our own unit struct
   3609 	 */
   3610 
   3611 	MD_SID(delta->unp) = from_min;
   3612 
   3613 	/*
   3614 	 * parent identifier need not change
   3615 	 */
   3616 
   3617 	/*
   3618 	 * point the set array pointers at the "new" unit and unit in-cores
   3619 	 * Note: the other half of this transfer is done in the "update_to"
   3620 	 * exchange named service.
   3621 	 */
   3622 
   3623 	MDI_VOIDUNIT(from_min) = delta->uip;
   3624 	MD_VOIDUNIT(from_min) = delta->unp;
   3625 
   3626 	/*
   3627 	 * transfer kstats
   3628 	 */
   3629 
   3630 	delta->uip->ui_kstat = rtxnp->from.kstatp;
   3631 
   3632 	/*
   3633 	 * the unit in-core reference to the get next link's id changes
   3634 	 */
   3635 
   3636 	delta->uip->ui_link.ln_id = from_min;
   3637 
   3638 	/*
   3639 	 * find the child whose identity we're assuming
   3640 	 */
   3641 
   3642 	for (found = NULL, smi = 0; !found && smi < NMIRROR; smi++) {
   3643 		mm_submirror_t		*sm;
   3644 		mm_unit_t		*un = (mm_unit_t *)delta->unp;
   3645 
   3646 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
   3647 			continue;
   3648 		}
   3649 		sm = &un->un_sm[smi];
   3650 
   3651 		if (md_getminor(sm->sm_dev) == from_min) {
   3652 			found = sm;
   3653 		}
   3654 	}
   3655 	ASSERT(found);
   3656 
   3657 	/*
   3658 	 * Update the sub-mirror's identity
   3659 	 */
   3660 	found->sm_dev = md_makedevice(md_major, rtxnp->to.mnum);
   3661 	sv.key = found->sm_key;
   3662 
   3663 	ASSERT(rtxnp->to.key != MD_KEYWILD);
   3664 	ASSERT(rtxnp->to.key != MD_KEYBAD);
   3665 
   3666 	found->sm_key = rtxnp->to.key;
   3667 
   3668 	/*
   3669 	 * delete the key for the old sub-mirror from the name space
   3670 	 */
   3671 
   3672 	sv.setno = MD_MIN2SET(to_min);
   3673 	md_rem_names(&sv, 1);
   3674 
   3675 	/*
   3676 	 * and store the record id (from the unit struct) into recids
   3677 	 */
   3678 
   3679 	md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
   3680 }
   3681 
   3682 /*
   3683  * MDRNM_LIST_URKIDS: named svc entry point
   3684  * all all delta entries appropriate for our children onto the
   3685  * deltalist pointd to by dlpp
   3686  */
   3687 int
   3688 mirror_rename_listkids(md_rendelta_t **dlpp, md_rentxn_t *rtxnp)
   3689 {
   3690 	minor_t			from_min, to_min;
   3691 	mm_unit_t		*from_un;
   3692 	md_rendelta_t		*new, *p;
   3693 	int			smi;
   3694 	int			n_children;
   3695 	mm_submirror_t		*sm;
   3696 
   3697 	ASSERT(rtxnp);
   3698 	ASSERT(dlpp);
   3699 	ASSERT((rtxnp->op == MDRNOP_EXCHANGE) || (rtxnp->op == MDRNOP_RENAME));
   3700 
   3701 	from_min = rtxnp->from.mnum;
   3702 	to_min = rtxnp->to.mnum;
   3703 	n_children = 0;
   3704 
   3705 	if (!MDI_UNIT(from_min) || !(from_un = MD_UNIT(from_min))) {
   3706 		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, from_min);
   3707 		return (-1);
   3708 	}
   3709 
   3710 	for (p = *dlpp; p && p->next != NULL; p = p->next) {
   3711 		/* NULL */
   3712 	}
   3713 
   3714 	for (smi = 0; smi < NMIRROR; smi++) {
   3715 		minor_t	child_min;
   3716 
   3717 		if (!SMS_BY_INDEX_IS(from_un, smi, SMS_INUSE)) {
   3718 			continue;
   3719 		}
   3720 
   3721 		sm = &from_un->un_sm[smi];
   3722 		child_min = md_getminor(sm->sm_dev);
   3723 
   3724 		p = new = md_build_rendelta(MDRR_CHILD,
   3725 		    to_min == child_min? MDRR_SELF: MDRR_CHILD,
   3726 		    sm->sm_dev, p,
   3727 		    MD_UNIT(child_min), MDI_UNIT(child_min),
   3728 		    &rtxnp->mde);
   3729 
   3730 		if (!new) {
   3731 			if (mdisok(&rtxnp->mde)) {
   3732 				(void) mdsyserror(&rtxnp->mde, ENOMEM);
   3733 			}
   3734 			return (-1);
   3735 		}
   3736 		++n_children;
   3737 	}
   3738 
   3739 	return (n_children);
   3740 }
   3741 
   3742 /*
   3743  * support routine for MDRNM_CHECK
   3744  */
   3745 static int
   3746 mirror_may_renexch_self(
   3747 	mm_unit_t	*un,
   3748 	mdi_unit_t	*ui,
   3749 	md_rentxn_t	*rtxnp)
   3750 {
   3751 	minor_t			 from_min;
   3752 	minor_t			 to_min;
   3753 	bool_t			 toplevel;
   3754 	bool_t			 related;
   3755 	int			 smi;
   3756 	mm_submirror_t		*sm;
   3757 
   3758 	from_min = rtxnp->from.mnum;
   3759 	to_min = rtxnp->to.mnum;
   3760 
   3761 	if (!un || !ui) {
   3762 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
   3763 		    from_min);
   3764 		return (EINVAL);
   3765 	}
   3766 
   3767 	ASSERT(MD_CAPAB(un) & MD_CAN_META_CHILD);
   3768 	if (!(MD_CAPAB(un) & MD_CAN_META_CHILD)) {
   3769 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
   3770 		return (EINVAL);
   3771 	}
   3772 
   3773 	if (MD_PARENT(un) == MD_MULTI_PARENT) {
   3774 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
   3775 		return (EINVAL);
   3776 	}
   3777 
   3778 	toplevel = !MD_HAS_PARENT(MD_PARENT(un));
   3779 
   3780 	/* we're related if trying to swap with our parent */
   3781 	related = (!toplevel) && (MD_PARENT(un) == to_min);
   3782 
   3783 	switch (rtxnp->op) {
   3784 	case MDRNOP_EXCHANGE:
   3785 		/*
   3786 		 * check for a swap with our child
   3787 		 */
   3788 		for (smi = 0; smi < NMIRROR; smi++) {
   3789 
   3790 			if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
   3791 				continue;
   3792 			}
   3793 
   3794 			sm = &un->un_sm[smi];
   3795 			if (md_getminor(sm->sm_dev) == to_min) {
   3796 				related |= TRUE;
   3797 			}
   3798 		}
   3799 		if (!related) {
   3800 			(void) mdmderror(&rtxnp->mde,
   3801 			    MDE_RENAME_TARGET_UNRELATED, to_min);
   3802 			return (EINVAL);
   3803 		}
   3804 
   3805 		break;
   3806 
   3807 	case MDRNOP_RENAME:
   3808 		/*
   3809 		 * if from is top-level and is open, then the kernel is using
   3810 		 * the md_dev64_t.
   3811 		 */
   3812 
   3813 		if (toplevel && md_unit_isopen(ui)) {
   3814 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
   3815 			    from_min);
   3816 			return (EBUSY);
   3817 		}
   3818 		break;
   3819 
   3820 	default:
   3821 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
   3822 		    from_min);
   3823 		return (EINVAL);
   3824 	}
   3825 
   3826 	return (0);	/* ok */
   3827 }
   3828 
   3829 /*
   3830  * Named service entry point: MDRNM_CHECK
   3831  */
   3832 intptr_t
   3833 mirror_rename_check(
   3834 	md_rendelta_t	*delta,
   3835 	md_rentxn_t	*rtxnp)
   3836 {
   3837 	mm_submirror_t		*sm;
   3838 	mm_submirror_ic_t	*smic;
   3839 	md_m_shared_t		*shared;
   3840 	int			ci;
   3841 	int			i;
   3842 	int			compcnt;
   3843 	mm_unit_t		*un;
   3844 	int			err = 0;
   3845 
   3846 	ASSERT(delta);
   3847 	ASSERT(rtxnp);
   3848 	ASSERT(delta->unp);
   3849 	ASSERT(delta->uip);
   3850 	ASSERT((rtxnp->op == MDRNOP_RENAME) || (rtxnp->op == MDRNOP_EXCHANGE));
   3851 
   3852 	if (!delta || !rtxnp || !delta->unp || !delta->uip) {
   3853 		(void) mdsyserror(&rtxnp->mde, EINVAL);
   3854 		return (EINVAL);
   3855 	}
   3856 
   3857 	un = (mm_unit_t *)delta->unp;
   3858 
   3859 	for (i = 0; i < NMIRROR; i++) {
   3860 		sm = &un->un_sm[i];
   3861 		smic = &un->un_smic[i];
   3862 
   3863 		if (!SMS_IS(sm, SMS_INUSE))
   3864 			continue;
   3865 
   3866 		ASSERT(smic->sm_get_component_count);
   3867 		if (!smic->sm_get_component_count) {
   3868 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
   3869 			    md_getminor(delta->dev));
   3870 			return (ENXIO);
   3871 		}
   3872 
   3873 		compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
   3874 
   3875 		for (ci = 0; ci < compcnt; ci++) {
   3876 
   3877 			ASSERT(smic->sm_shared_by_indx);
   3878 			if (!smic->sm_shared_by_indx) {
   3879 				(void) mdmderror(&rtxnp->mde,
   3880 				    MDE_RENAME_CONFIG_ERROR,
   3881 				    md_getminor(delta->dev));
   3882 				return (ENXIO);
   3883 			}
   3884 
   3885 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
   3886 			    (sm->sm_dev, sm, ci);
   3887 
   3888 			ASSERT(shared);
   3889 			if (!shared) {
   3890 				(void) mdmderror(&rtxnp->mde,
   3891 				    MDE_RENAME_CONFIG_ERROR,
   3892 				    md_getminor(delta->dev));
   3893 				return (ENXIO);
   3894 			}
   3895 
   3896 			if (shared->ms_hs_id != 0) {
   3897 				(void) mdmderror(&rtxnp->mde,
   3898 				    MDE_SM_FAILED_COMPS,
   3899 				    md_getminor(delta->dev));
   3900 				return (EIO);
   3901 			}
   3902 
   3903 			switch (shared->ms_state) {
   3904 			case CS_OKAY:
   3905 				break;
   3906 
   3907 			case CS_RESYNC:
   3908 				(void) mdmderror(&rtxnp->mde,
   3909 				    MDE_RESYNC_ACTIVE,
   3910 				    md_getminor(delta->dev));
   3911 				return (EBUSY);
   3912 
   3913 			default:
   3914 				(void) mdmderror(&rtxnp->mde,
   3915 				    MDE_SM_FAILED_COMPS,
   3916 				    md_getminor(delta->dev));
   3917 				return (EINVAL);
   3918 			}
   3919 
   3920 		}
   3921 	}
   3922 
   3923 	/* self does additional checks */
   3924 	if (delta->old_role == MDRR_SELF) {
   3925 		err = mirror_may_renexch_self(un, delta->uip, rtxnp);
   3926 	}
   3927 
   3928 	return (err);
   3929 }
   3930 
   3931 /* end of rename/exchange */
   3932