Home | History | Annotate | Download | only in raid
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * NAME:	raid_ioctl.c
     29  *
     30  * DESCRIPTION: RAID driver source file containing IOCTL operations.
     31  *
     32  * ROUTINES PROVIDED FOR EXTERNAL USE:
     33  *	  raid_commit() - commits MD database updates for a RAID metadevice
     34  *	md_raid_ioctl() - RAID metadevice IOCTL operations entry point.
     35  *
     36  * ROUTINES PROVIDED FOR INTERNAL USE:
     37  *	 raid_getun() - Performs unit checking on a RAID metadevice
     38  *    init_col_nextio() - normal backend when zeroing column of RAID metadevice.
     39  *	 init_col_int() - I/O interrupt while zeroing column of RAID metadevice.
     40  *  raid_init_columns() - Zero one or more columns of a RAID metadevice.
     41  *	     raid_set() - used to create a RAID metadevice
     42  *	     raid_get() - used to get the unit structure of a RAID metadevice
     43  *	 raid_replace() - used to replace a component of a RAID metadevice
     44  *	    raid_grow() - Concatenate to a RAID metadevice
     45  *	  raid_change() - change dynamic values of a RAID metadevice
     46  *	   raid_reset() - used to reset (clear / remove) a RAID metadevice
     47  *	raid_get_geom() - used to get the geometry of a RAID metadevice
     48  *	raid_get_vtoc() - used to get the VTOC on a RAID metadevice
     49  *	raid_set_vtoc() - used to set the VTOC on a RAID metadevice
     50  *	raid_get_extvtoc() - used to get the extended VTOC on a RAID metadevice
     51  *	raid_set_extvtoc() - used to set the extended VTOC on a RAID metadevice
     52  *	 raid_getdevs() - return all devices within a RAID metadevice
     53  *   raid_admin_ioctl() - IOCTL operations unique to metadevices and RAID
     54  */
     55 
     56 
     57 #include <sys/param.h>
     58 #include <sys/systm.h>
     59 #include <sys/conf.h>
     60 #include <sys/file.h>
     61 #include <sys/user.h>
     62 #include <sys/uio.h>
     63 #include <sys/t_lock.h>
     64 #include <sys/buf.h>
     65 #include <sys/dkio.h>
     66 #include <sys/vtoc.h>
     67 #include <sys/kmem.h>
     68 #include <vm/page.h>
     69 #include <sys/sysmacros.h>
     70 #include <sys/types.h>
     71 #include <sys/mkdev.h>
     72 #include <sys/stat.h>
     73 #include <sys/open.h>
     74 #include <sys/disp.h>
     75 #include <sys/modctl.h>
     76 #include <sys/ddi.h>
     77 #include <sys/sunddi.h>
     78 #include <sys/cred.h>
     79 #include <sys/lvm/mdvar.h>
     80 #include <sys/lvm/md_names.h>
     81 #include <sys/lvm/md_mddb.h>
     82 #include <sys/lvm/md_raid.h>
     83 #include <sys/lvm/md_convert.h>
     84 
     85 #include <sys/sysevent/eventdefs.h>
     86 #include <sys/sysevent/svm.h>
     87 
/*
 * Global state, queues and entry points that are declared here as extern
 * and therefore defined outside this file.
 */
extern int		md_status;
extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];
extern md_ops_t		raid_md_ops;
extern major_t		md_major;
extern md_krwlock_t	md_unit_array_rw;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_ff_daemonq;
/* old-style (unprototyped) declarations: argument lists are not checked */
extern	int		mdopen();
extern	int		mdclose();
extern	void		md_probe_one();
extern int		md_init_probereq(md_probedev_impl_t *,
				daemon_queue_t **);
/* md_raid_resync counter is bumped around init/regen threads in this file */
extern md_resync_t	md_cpr_resync;


extern void dump_mr_unit(mr_unit_t *);
    106 
/*
 * Per-column bookkeeping used while a column is being zeroed.  One of these
 * is allocated for every column found in RCS_INIT by raid_init_columns().
 *
 * The structure is cast to daemon_queue_t * when it is handed to
 * daemon_request(), so DAEMON_QUEUE presumably has to stay the first
 * member -- confirm against the DAEMON_QUEUE definition before reordering.
 */
typedef struct raid_ci {
	DAEMON_QUEUE
	struct raid_ci	*ci_next;	/* next entry in the init chain */
	mr_unit_t	*ci_un;		/* unit that owns the column */
	int		ci_col;		/* index into un_column[] */
	int		ci_err;		/* non-zero once open or I/O failed */
	int		ci_flag;	/* one of the COL_* values below */
	size_t		ci_zerosize;	/* blocks written per I/O */
	diskaddr_t	ci_blkno;	/* first block of the current I/O */
	diskaddr_t	ci_lastblk;	/* block at which zeroing stops */
	buf_t		ci_buf;		/* buf reused for every write */
} raid_ci_t;
/* values for the ci_flag */
#define	COL_INITING	(0x0001)	/* zeroing still in progress */
#define	COL_INIT_DONE	(0x0002)	/* zeroing finished, pw area pending */
#define	COL_READY	(0x0004)	/* init_pw_area() has been called */
    123 
    124 /*
    125  * NAME:	raid_getun
    126  * DESCRIPTION: performs a lot of unit checking on a RAID metadevice
    127  * PARAMETERS:	minor_t	      mnum - minor device number for RAID unit
    128  *		md_error_t    *mde - pointer to error reporting structure
    129  *		int	     flags - pointer to error reporting structure
    130  *					STALE_OK - allow stale MD memory
    131  *					  NO_OLD - unit must not exist
    132  *					 NO_LOCK - no IOCTL lock needed
    133  *					 WR_LOCK - write IOCTL lock needed
    134  *					 RD_LOCK - read IOCTL lock needed
    135  *		IOLOCK	     *lock - pointer to IOCTL lock
    136  *
    137  * LOCKS:	obtains unit reader or writer lock via IOLOCK
    138  *
    139  */
    140 static mr_unit_t *
    141 raid_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock)
    142 {
    143 	mr_unit_t	*un;
    144 	mdi_unit_t	*ui;
    145 	set_t		setno = MD_MIN2SET(mnum);
    146 
    147 	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
    148 		(void) mdmderror(mde, MDE_INVAL_UNIT, mnum);
    149 		return (NULL);
    150 	}
    151 
    152 	if (!(flags & STALE_OK)) {
    153 		if (md_get_setstatus(setno) & MD_SET_STALE) {
    154 			(void) mdmddberror(mde, MDE_DB_STALE, mnum, setno);
    155 			return (NULL);
    156 		}
    157 	}
    158 
    159 	ui = MDI_UNIT(mnum);
    160 	if (flags & NO_OLD) {
    161 		if (ui != NULL) {
    162 			(void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum);
    163 			return (NULL);
    164 		}
    165 		return ((mr_unit_t *)1);
    166 	}
    167 
    168 	if (ui == NULL) {
    169 		(void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum);
    170 		return (NULL);
    171 	}
    172 	if (flags & ARRAY_WRITER)
    173 		md_array_writer(lock);
    174 	else if (flags & ARRAY_READER)
    175 		md_array_reader(lock);
    176 
    177 	if (!(flags & NO_LOCK)) {
    178 		if (flags & WR_LOCK) {
    179 			(void) md_ioctl_io_lock(lock, ui);
    180 			(void) md_ioctl_writerlock(lock, ui);
    181 		} else /* RD_LOCK */
    182 			(void) md_ioctl_readerlock(lock, ui);
    183 	}
    184 	un = (mr_unit_t *)MD_UNIT(mnum);
    185 
    186 	if (un->c.un_type != MD_METARAID) {
    187 		(void) mdmderror(mde, MDE_NOT_RAID, mnum);
    188 		return (NULL);
    189 	}
    190 
    191 	return (un);
    192 }
    193 
    194 
    195 /*
    196  * NAME:	raid_commit
    197  * DESCRIPTION: commits MD database updates for a RAID metadevice
    198  * PARAMETERS:	mr_unit_t	 *un - RAID unit to update in the MD database
    199  *		mddb_recid_t *extras - array of other record IDs to update
    200  *
    201  * LOCKS:	assumes caller holds unit writer lock
    202  *
    203  */
    204 void
    205 raid_commit(mr_unit_t *un, mddb_recid_t	*extras)
    206 {
    207 	mddb_recid_t	*recids;
    208 	int 		ri = 0;
    209 	int		nrecids = 0;
    210 
    211 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
    212 		return;
    213 
    214 	/* Count the extra recids */
    215 	if (extras != NULL) {
    216 		while (extras[nrecids] != 0) {
    217 			nrecids++;
    218 		}
    219 	}
    220 
    221 	/*
    222 	 * Allocate space for two recids in addition to the extras:
    223 	 * one for the unit structure, one for the null terminator.
    224 	 */
    225 	nrecids += 2;
    226 	recids = (mddb_recid_t *)
    227 	    kmem_zalloc(nrecids * sizeof (mddb_recid_t), KM_SLEEP);
    228 
    229 	if (un != NULL) {
    230 		ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
    231 		recids[ri++] = un->c.un_record_id;
    232 	}
    233 
    234 	if (extras != NULL) {
    235 		while (*extras != 0) {
    236 			recids[ri++] = *extras;
    237 			extras++;
    238 		}
    239 	}
    240 
    241 	if (ri > 0) {
    242 		mddb_commitrecs_wrapper(recids);
    243 	}
    244 
    245 	kmem_free(recids, nrecids * sizeof (mddb_recid_t));
    246 }
    247 
    248 static int
    249 raid_check_pw(mr_unit_t *un)
    250 {
    251 	buf_t		bp;
    252 	char		*buf;
    253 	mr_column_t	*colptr;
    254 	minor_t		mnum = MD_SID(un);
    255 	int		i;
    256 	int		err = 0;
    257 	minor_t		unit;
    258 
    259 	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);
    260 
    261 	for (i = 0; i < un->un_totalcolumncnt; i++) {
    262 		md_dev64_t tmpdev;
    263 
    264 		colptr = &un->un_column[i];
    265 
    266 		tmpdev = colptr->un_dev;
    267 		/*
    268 		 * Open by device id
    269 		 * If this device is hotspared
    270 		 * use the hotspare key
    271 		 */
    272 		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
    273 		    colptr->un_hs_key : colptr->un_orig_key);
    274 		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
    275 			colptr->un_dev = tmpdev;
    276 			return (1);
    277 		}
    278 		colptr->un_dev = tmpdev;
    279 
    280 		bzero((caddr_t)&bp, sizeof (buf_t));
    281 		bp.b_back = &bp;
    282 		bp.b_forw = &bp;
    283 		bp.b_flags = B_READ | B_BUSY;
    284 		sema_init(&bp.b_io, 0, NULL,
    285 		    SEMA_DEFAULT, NULL);
    286 		sema_init(&bp.b_sem, 0, NULL,
    287 		    SEMA_DEFAULT, NULL);
    288 		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
    289 		bp.b_lblkno = colptr->un_pwstart;
    290 		bp.b_bcount = DEV_BSIZE;
    291 		bp.b_bufsize = DEV_BSIZE;
    292 		bp.b_un.b_addr = (caddr_t)buf;
    293 		bp.b_offset = -1;
    294 		(void) md_call_strategy(&bp, 0, NULL);
    295 		if (biowait(&bp))
    296 			err = 1;
    297 		if (i == 0) {
    298 			if (un->c.un_revision & MD_64BIT_META_DEV) {
    299 				unit = ((raid_pwhdr_t *)buf)->rpw_unit;
    300 			} else {
    301 				unit = ((raid_pwhdr32_od_t *)buf)->rpw_unit;
    302 			}
    303 		}
    304 		/*
    305 		 * depending upon being an 64bit or 32 bit raid, the
    306 		 * pre write headers have different layout
    307 		 */
    308 		if (un->c.un_revision & MD_64BIT_META_DEV) {
    309 			if ((((raid_pwhdr_t *)buf)->rpw_column != i) ||
    310 			    (((raid_pwhdr_t *)buf)->rpw_unit != unit))
    311 				err = 1;
    312 		} else {
    313 			if ((((raid_pwhdr32_od_t *)buf)->rpw_column != i) ||
    314 			    (((raid_pwhdr32_od_t *)buf)->rpw_unit != unit))
    315 				err = 1;
    316 		}
    317 		md_layered_close(colptr->un_dev, MD_OFLG_NULL);
    318 		if (err)
    319 			break;
    320 	}
    321 	kmem_free(buf, DEV_BSIZE);
    322 	return (err);
    323 }
    324 
    325 /*
    326  * NAME:	init_col_nextio
    327  * DESCRIPTION: normal backend process when zeroing column of a RAID metadevice.
    328  * PARAMETERS:	raid_ci_t *cur - struct for column being zeroed
    329  *
    330  * LOCKS:	assumes caller holds unit reader lock,
    331  *		preiodically releases and reacquires unit reader lock,
    332  *		broadcasts on unit conditional variable (un_cv)
    333  *
    334  */
    335 #define	INIT_RLS_CNT	10
    336 static void
    337 init_col_nextio(raid_ci_t *cur)
    338 {
    339 	mr_unit_t	*un;
    340 
    341 	un = cur->ci_un;
    342 
    343 	cur->ci_blkno += cur->ci_zerosize;
    344 
    345 	mutex_enter(&un->un_mx);
    346 	/* ===> update un_percent_done */
    347 	un->un_init_iocnt += btodb(cur->ci_buf.b_bcount);
    348 	mutex_exit(&un->un_mx);
    349 
    350 	/*
    351 	 * When gorwing a device, normal I/O is still going on.
    352 	 * The init thread still holds the unit reader lock which
    353 	 * prevents I/O from doing state changes.
    354 	 * So every INIT_RLS_CNT init I/Os, we will release the
    355 	 * unit reader lock.
    356 	 *
    357 	 * CAVEAT:
    358 	 * We know we are in the middle of a grow operation and the
    359 	 * unit cannot be grown or removed (through reset or halt)
    360 	 * so the mr_unit_t structure will not move or disappear.
    361 	 * In addition, we know that only one of the init I/Os
    362 	 * can be in col_init_nextio at a time because they are
    363 	 * placed on the md_done_daemon queue and md only processes
    364 	 * one element of this queue at a time. In addition, any
    365 	 * code that needs to acquire the unit writer lock to change
    366 	 * state is supposed to be on the md_mstr_daemon queue so
    367 	 * it can be processing while we sit here waiting to get the
    368 	 * unit reader lock back.
    369 	 */
    370 
    371 	if (cur->ci_blkno < cur->ci_lastblk) {
    372 		/* truncate last chunk to end_addr if needed */
    373 		if (cur->ci_blkno + cur->ci_zerosize > cur->ci_lastblk) {
    374 			cur->ci_zerosize = (size_t)
    375 			    (cur->ci_lastblk - cur->ci_blkno);
    376 		}
    377 
    378 		/* set address and length for I/O bufs */
    379 		cur->ci_buf.b_bufsize = dbtob(cur->ci_zerosize);
    380 		cur->ci_buf.b_bcount = dbtob(cur->ci_zerosize);
    381 		cur->ci_buf.b_lblkno = cur->ci_blkno;
    382 
    383 		(void) md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
    384 		return;
    385 	}
    386 	/* finished initializing this column */
    387 	mutex_enter(&un->un_mx);
    388 	cur->ci_flag = COL_INIT_DONE;
    389 	uniqtime32(&un->un_column[cur->ci_col].un_devtimestamp);
    390 	mutex_exit(&un->un_mx);
    391 	cv_broadcast(&un->un_cv);
    392 }
    393 
    394 /*
    395  * NAME:	init_col_int
    396  * DESCRIPTION: I/O interrupt while zeroing column of a RAID metadevice.
    397  * PARAMETERS:	buf_t	  *cb - I/O buffer for which interrupt occurred
    398  *
    399  * LOCKS:	assumes caller holds unit reader or writer lock
    400  *
    401  */
    402 static int
    403 init_col_int(buf_t *cb)
    404 {
    405 	raid_ci_t	*cur;
    406 
    407 	cur = (raid_ci_t *)cb->b_chain;
    408 	if (cb->b_flags & B_ERROR) {
    409 		mutex_enter(&cur->ci_un->un_mx);
    410 		cur->ci_err = EIO;
    411 		mutex_exit(&cur->ci_un->un_mx);
    412 		cv_broadcast(&cur->ci_un->un_cv);
    413 		return (1);
    414 	}
    415 	daemon_request(&md_done_daemon, init_col_nextio,
    416 	    (daemon_queue_t *)cur, REQ_OLD);
    417 	return (1);
    418 }
    419 
    420 /*
    421  * NAME:	raid_init_columns
    422  * DESCRIPTION: Zero one or more columns of a RAID metadevice.
    423  * PARAMETERS:	minor_t	 mnum - RAID unit minor identifier
    424  *
    425  * LOCKS:	obtains and releases unit reader lock,
    426  *		obtains and releases unit writer lock,
    427  *		obtains and releases md_unit_array_rw write lock,
    428  *		obtains and releases unit mutex (un_mx) lock,
    429  *		waits on unit conditional variable (un_cv)
    430  *
    431  */
    432 static void
    433 raid_init_columns(minor_t mnum)
    434 {
    435 	mr_unit_t	*un;
    436 	mdi_unit_t	*ui;
    437 	raid_ci_t	*ci_chain = NULL, *cur;
    438 	rus_state_t	state;
    439 	caddr_t		zero_addr;
    440 	diskaddr_t	end_off;
    441 	size_t		zerosize;
    442 	int		err = 0;
    443 	int		ix;
    444 	int		colcnt = 0;
    445 	int		col;
    446 	set_t		setno = MD_MIN2SET(mnum);
    447 
    448 	/*
    449 	 * Increment the raid resync count for cpr
    450 	 */
    451 	mutex_enter(&md_cpr_resync.md_resync_mutex);
    452 	md_cpr_resync.md_raid_resync++;
    453 	mutex_exit(&md_cpr_resync.md_resync_mutex);
    454 
    455 	/*
    456 	 * initialization is a multiple step process.  The first step
    457 	 * is to go through the unit structure and start each device
    458 	 * in the init state writing zeros over the component.
    459 	 * Next initialize the prewrite areas, so the device can be
    460 	 * used if a metainit -k is done.  Now close the componenets.
    461 	 *
    462 	 * Once this complete set the state of each component being
    463 	 * zeroed and set the correct state for the unit.
    464 	 *
    465 	 * last commit the records.
    466 	 */
    467 
    468 	ui = MDI_UNIT(mnum);
    469 	un = md_unit_readerlock(ui);
    470 
    471 	/* check for active init on this column */
    472 	/* exiting is cpr safe */
    473 	if ((un->un_init_colcnt > 0) && (un->un_resync_index != -1)) {
    474 		md_unit_readerexit(ui);
    475 		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
    476 		/*
    477 		 * Decrement the raid resync count for cpr
    478 		 */
    479 		mutex_enter(&md_cpr_resync.md_resync_mutex);
    480 		md_cpr_resync.md_raid_resync--;
    481 		mutex_exit(&md_cpr_resync.md_resync_mutex);
    482 		thread_exit();
    483 	}
    484 
    485 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_START, SVM_TAG_METADEVICE, setno,
    486 	    MD_SID(un));
    487 	un->un_init_colcnt = 0;
    488 	un->un_init_iocnt = 0;
    489 	end_off = un->un_pwsize + (un->un_segsize * un->un_segsincolumn);
    490 	zerosize = (size_t)MIN((diskaddr_t)un->un_maxio, end_off);
    491 
    492 	/* allocate zero-filled buffer */
    493 	zero_addr = kmem_zalloc(dbtob(zerosize), KM_SLEEP);
    494 
    495 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
    496 		if (un->un_column[ix].un_devstate != RCS_INIT)
    497 			continue;
    498 		/* allocate new column init structure */
    499 		cur = (raid_ci_t *)kmem_zalloc((sizeof (raid_ci_t)), KM_SLEEP);
    500 		ASSERT(cur != NULL);
    501 		un->un_init_colcnt++;
    502 		cur->ci_next = ci_chain;
    503 		ci_chain = cur;
    504 		cur->ci_un = un;
    505 		cur->ci_col = ix;
    506 		cur->ci_err = 0;
    507 		cur->ci_flag = COL_INITING;
    508 		cur->ci_zerosize = zerosize;
    509 		cur->ci_blkno = un->un_column[ix].un_pwstart;
    510 		cur->ci_lastblk = cur->ci_blkno + un->un_pwsize
    511 		    + (un->un_segsize * un->un_segsincolumn);
    512 		/* initialize static buf fields */
    513 		cur->ci_buf.b_un.b_addr = zero_addr;
    514 		cur->ci_buf.b_chain = (buf_t *)cur;
    515 		cur->ci_buf.b_back = &cur->ci_buf;
    516 		cur->ci_buf.b_forw = &cur->ci_buf;
    517 		cur->ci_buf.b_iodone = init_col_int;
    518 		cur->ci_buf.b_flags = B_BUSY | B_WRITE;
    519 		cur->ci_buf.b_edev = md_dev64_to_dev(un->un_column[ix].un_dev);
    520 		sema_init(&cur->ci_buf.b_io, 0, NULL, SEMA_DEFAULT, NULL);
    521 		sema_init(&cur->ci_buf.b_sem, 0, NULL, SEMA_DEFAULT, NULL);
    522 		/* set address and length for I/O bufs */
    523 		cur->ci_buf.b_bufsize = dbtob(zerosize);
    524 		cur->ci_buf.b_bcount = dbtob(zerosize);
    525 		cur->ci_buf.b_lblkno = un->un_column[ix].un_pwstart;
    526 		cur->ci_buf.b_offset = -1;
    527 
    528 		if (! (un->un_column[ix].un_devflags & MD_RAID_DEV_ISOPEN)) {
    529 			md_dev64_t tmpdev = un->un_column[ix].un_dev;
    530 			/*
    531 			 * Open by device id
    532 			 * If this column is hotspared then
    533 			 * use the hotspare key
    534 			 */
    535 			tmpdev = md_resolve_bydevid(mnum, tmpdev,
    536 			    HOTSPARED(un, ix) ?
    537 			    un->un_column[ix].un_hs_key :
    538 			    un->un_column[ix].un_orig_key);
    539 			if ((cur->ci_err = md_layered_open(mnum, &tmpdev,
    540 			    MD_OFLG_NULL)) == 0)
    541 				un->un_column[ix].un_devflags |=
    542 				    MD_RAID_DEV_ISOPEN;
    543 			un->un_column[ix].un_dev = tmpdev;
    544 		}
    545 		if (cur->ci_err == 0)
    546 			md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
    547 	}
    548 
    549 	md_unit_readerexit(ui);
    550 	state = un->un_state;
    551 	colcnt = un->un_init_colcnt;
    552 	mutex_enter(&un->un_mx);
    553 	while (colcnt) {
    554 		cv_wait(&un->un_cv, &un->un_mx);
    555 
    556 		colcnt = 0;
    557 		for (cur = ci_chain; cur != NULL; cur = cur->ci_next) {
    558 			col = cur->ci_col;
    559 			if ((cur->ci_flag != COL_INITING) || (cur->ci_err)) {
    560 				if (cur->ci_err)
    561 					err = cur->ci_err;
    562 				else if (cur->ci_flag == COL_INIT_DONE) {
    563 					(void) init_pw_area(un,
    564 					    un->un_column[col].un_dev,
    565 					    un->un_column[col].un_pwstart,
    566 					    col);
    567 					cur->ci_flag = COL_READY;
    568 				}
    569 			} else {
    570 				colcnt++;
    571 			}
    572 		}
    573 	}
    574 	mutex_exit(&un->un_mx);
    575 
    576 	/* This prevents new opens */
    577 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
    578 	(void) md_io_writerlock(ui);
    579 	un = (mr_unit_t *)md_unit_writerlock(ui);
    580 	while (ci_chain) {
    581 		cur = ci_chain;
    582 
    583 		/* take this element out of the chain */
    584 		ci_chain = cur->ci_next;
    585 		/* free this element */
    586 		sema_destroy(&cur->ci_buf.b_io);
    587 		sema_destroy(&cur->ci_buf.b_sem);
    588 		if (cur->ci_err)
    589 			raid_set_state(cur->ci_un, cur->ci_col,
    590 			    RCS_INIT_ERRED, 0);
    591 		else
    592 			raid_set_state(cur->ci_un, cur->ci_col,
    593 			    RCS_OKAY, 0);
    594 		kmem_free(cur, sizeof (raid_ci_t));
    595 	}
    596 
    597 	/* free the zeroed buffer */
    598 	kmem_free(zero_addr, dbtob(zerosize));
    599 
    600 	/* determine new unit state */
    601 	if (err == 0) {
    602 		if (state == RUS_INIT)
    603 			un->un_state = RUS_OKAY;
    604 		else {
    605 			un->c.un_total_blocks = un->un_grow_tb;
    606 			md_nblocks_set(mnum, un->c.un_total_blocks);
    607 			un->un_grow_tb = 0;
    608 			if (raid_state_cnt(un, RCS_OKAY) ==
    609 			    un->un_totalcolumncnt)
    610 				un->un_state = RUS_OKAY;
    611 		}
    612 	} else {  /* error orcurred */
    613 		if (state & RUS_INIT)
    614 			un->un_state = RUS_DOI;
    615 	}
    616 	uniqtime32(&un->un_timestamp);
    617 	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
    618 	un->un_init_colcnt = 0;
    619 	un->un_init_iocnt = 0;
    620 	raid_commit(un, NULL);
    621 	md_unit_writerexit(ui);
    622 	(void) md_io_writerexit(ui);
    623 	rw_exit(&md_unit_array_rw.lock);
    624 	if (err) {
    625 		if (un->un_state & RUS_DOI) {
    626 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
    627 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
    628 		} else {
    629 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
    630 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
    631 		}
    632 	} else {
    633 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_SUCCESS,
    634 		    SVM_TAG_METADEVICE, setno, MD_SID(un));
    635 	}
    636 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
    637 	/*
    638 	 * Decrement the raid resync count for cpr
    639 	 */
    640 	mutex_enter(&md_cpr_resync.md_resync_mutex);
    641 	md_cpr_resync.md_raid_resync--;
    642 	mutex_exit(&md_cpr_resync.md_resync_mutex);
    643 	thread_exit();
    644 	/*NOTREACHED*/
    645 }
    646 
    647 static int
    648 raid_init_unit(minor_t mnum, md_error_t *ep)
    649 {
    650 	mdi_unit_t	*ui;
    651 	mr_unit_t	*un;
    652 	int		rval, i;
    653 	set_t		setno = MD_MIN2SET(mnum);
    654 
    655 	ui = MDI_UNIT(mnum);
    656 	if (md_get_setstatus(setno) & MD_SET_STALE)
    657 		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
    658 
    659 	/* Don't start an init if the device is not available */
    660 	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
    661 		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
    662 	}
    663 
    664 	if (raid_internal_open(mnum, (FREAD | FWRITE),
    665 	    OTYP_LYR, MD_OFLG_ISINIT)) {
    666 		rval = mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum);
    667 		goto out;
    668 	}
    669 
    670 	un = md_unit_readerlock(ui);
    671 	un->un_percent_done = 0;
    672 	md_unit_readerexit(ui);
    673 	/* start resync_unit thread */
    674 	(void) thread_create(NULL, 0, raid_init_columns,
    675 	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
    676 
    677 	return (0);
    678 
    679 out:
    680 	un = md_unit_writerlock(ui);
    681 	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
    682 	/* recover state */
    683 	for (i = 0; i < un->un_totalcolumncnt; i++)
    684 		if (COLUMN_STATE(un, i) == RCS_INIT)
    685 			raid_set_state(un, i, RCS_ERRED, 0);
    686 	if (un->un_state & RUS_INIT)
    687 		un->un_state = RUS_DOI;
    688 	raid_commit(un, NULL);
    689 	md_unit_writerexit(ui);
    690 	if (un->un_state & RUS_DOI) {
    691 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
    692 		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
    693 	} else {
    694 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
    695 		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
    696 	}
    697 	return (rval);
    698 }
    699 
    700 /*
    701  * NAME:	raid_regen
    702  *
    703  * DESCRIPTION:	regenerate all the parity on the raid device.  This
    704  *		routine starts a thread that will regenerate the
    705  *		parity on a raid device.  If an I/O error occurs during
    706  *		this process the entire device is placed in error.
    707  *
    708  * PARAMETERS:	md_set_params_t *msp - ioctl packet
    709  */
    710 static void
    711 regen_unit(minor_t mnum)
    712 {
    713 	mdi_unit_t	*ui = MDI_UNIT(mnum);
    714 	mr_unit_t	*un = MD_UNIT(mnum);
    715 	buf_t		buf, *bp;
    716 	caddr_t		buffer;
    717 	int		err = 0;
    718 	diskaddr_t	total_segments;
    719 	diskaddr_t	line;
    720 	size_t		iosize;
    721 
    722 	/*
    723 	 * Increment raid resync count for cpr
    724 	 */
    725 	mutex_enter(&md_cpr_resync.md_resync_mutex);
    726 	md_cpr_resync.md_raid_resync++;
    727 	mutex_exit(&md_cpr_resync.md_resync_mutex);
    728 
    729 	iosize = dbtob(un->un_segsize);
    730 	buffer = kmem_alloc(iosize, KM_SLEEP);
    731 	bp = &buf;
    732 	total_segments = un->un_segsincolumn;
    733 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_START, SVM_TAG_METADEVICE,
    734 	    MD_UN2SET(un), MD_SID(un));
    735 	un->un_percent_done = 0;
    736 	init_buf(bp, B_READ | B_BUSY, iosize);
    737 
    738 	for (line = 0; line < total_segments; line++) {
    739 		bp->b_lblkno = line *
    740 		    ((un->un_origcolumncnt - 1) * un->un_segsize);
    741 		bp->b_un.b_addr = buffer;
    742 		bp->b_bcount = iosize;
    743 		bp->b_iodone = NULL;
    744 		/*
    745 		 * The following assignment is only correct because
    746 		 * md_raid_strategy is fine when it's only a minor number
    747 		 * and not a real dev_t. Yuck.
    748 		 */
    749 		bp->b_edev = mnum;
    750 		md_raid_strategy(bp, MD_STR_NOTTOP, NULL);
    751 		if (biowait(bp)) {
    752 			err = 1;
    753 			break;
    754 		}
    755 		un->un_percent_done = (uint_t)((line * 1000) /
    756 		    un->un_segsincolumn);
    757 		/* just to avoid rounding errors */
    758 		if (un->un_percent_done > 1000)
    759 			un->un_percent_done = 1000;
    760 		reset_buf(bp, B_READ | B_BUSY, iosize);
    761 	}
    762 	destroy_buf(bp);
    763 	kmem_free(buffer, iosize);
    764 
    765 	(void) md_io_writerlock(ui);
    766 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
    767 	(void) md_io_writerexit(ui);
    768 	un = md_unit_writerlock(ui);
    769 	if (!err &&
    770 	    (raid_state_cnt(un, RCS_OKAY) == un->un_totalcolumncnt))
    771 			un->un_state = RUS_OKAY;
    772 	raid_commit(un, NULL);
    773 	md_unit_writerexit(ui);
    774 	if (err ||
    775 	    raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) {
    776 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_FAILED,
    777 		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
    778 	} else {
    779 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_DONE, SVM_TAG_METADEVICE,
    780 		    MD_UN2SET(un), MD_SID(un));
    781 	}
    782 
    783 	/*
    784 	 * Decrement the raid resync count for cpr
    785 	 */
    786 	mutex_enter(&md_cpr_resync.md_resync_mutex);
    787 	md_cpr_resync.md_raid_resync--;
    788 	mutex_exit(&md_cpr_resync.md_resync_mutex);
    789 	thread_exit();
    790 }
    791 
    792 static int
    793 raid_regen_unit(minor_t mnum, md_error_t *ep)
    794 {
    795 	mdi_unit_t	*ui;
    796 	mr_unit_t	*un;
    797 	int		i;
    798 	set_t		setno = MD_MIN2SET(mnum);
    799 
    800 	ui = MDI_UNIT(mnum);
    801 	un = (mr_unit_t *)MD_UNIT(mnum);
    802 
    803 	if (md_get_setstatus(setno) & MD_SET_STALE)
    804 		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
    805 
    806 	/* Don't start a regen if the device is not available */
    807 	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
    808 		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
    809 	}
    810 
    811 	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
    812 		(void) md_unit_writerlock(ui);
    813 		for (i = 0; i < un->un_totalcolumncnt; i++)
    814 			raid_set_state(un, i, RCS_ERRED, 0);
    815 		md_unit_writerexit(ui);
    816 		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
    817 	}
    818 
    819 	/* start resync_unit thread */
    820 	(void) thread_create(NULL, 0, regen_unit,
    821 	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
    822 
    823 	return (0);
    824 }
    825 
    826 static int
    827 raid_regen(md_regen_param_t *mrp, IOLOCK *lock)
    828 {
    829 	minor_t		mnum = mrp->mnum;
    830 	mr_unit_t	*un;
    831 
    832 	mdclrerror(&mrp->mde);
    833 
    834 	un = md_unit_readerlock(MDI_UNIT(mnum));
    835 
    836 	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
    837 		md_unit_readerexit(MDI_UNIT(mnum));
    838 		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
    839 	}
    840 
    841 	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
    842 	    (raid_state_cnt(un, RCS_RESYNC))) {
    843 		md_unit_readerexit(MDI_UNIT(mnum));
    844 		return (mdmderror(&mrp->mde, MDE_RESYNC_ACTIVE, mnum));
    845 	}
    846 
    847 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
    848 		md_unit_readerexit(MDI_UNIT(mnum));
    849 		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
    850 	}
    851 
    852 	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
    853 	    (! (un->un_state & RUS_OKAY))) {
    854 		md_unit_readerexit(MDI_UNIT(mnum));
    855 		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
    856 	}
    857 
    858 	md_unit_readerexit(MDI_UNIT(mnum));
    859 
    860 	/* get locks and recheck to be sure something did not change */
    861 	if ((un = raid_getun(mnum, &mrp->mde, WRITERS, lock)) == NULL)
    862 		return (0);
    863 
    864 	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
    865 	    (! (un->un_state & RUS_OKAY))) {
    866 		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
    867 	}
    868 
    869 	raid_set_state(un, 0, RCS_REGEN, 0);
    870 	raid_commit(un, NULL);
    871 	md_ioctl_droplocks(lock);
    872 	return (raid_regen_unit(mnum, &mrp->mde));
    873 }
    874 
    875 /*
    876  * NAME:	raid_set
    877  * DESCRIPTION: used to create a RAID metadevice
    878  * PARAMETERS:	md_set_params_t *d   - pointer to set data structure
    879  *		int		mode - must be FWRITE
    880  *
    881  * LOCKS:	none
    882  *
    883  */
    884 static int
    885 raid_set(void	*d, int mode)
    886 {
    887 	minor_t		mnum;
    888 	mr_unit_t	*un;
    889 	mddb_recid_t	mr_recid;
    890 	mddb_recid_t	*recids;
    891 	mddb_type_t	typ1;
    892 	int		err;
    893 	set_t		setno;
    894 	int		num_recs;
    895 	int		rid;
    896 	int		col;
    897 	md_set_params_t	*msp = d;
    898 
    899 
    900 	mnum = msp->mnum;
    901 	setno = MD_MIN2SET(mnum);
    902 
    903 	mdclrerror(&msp->mde);
    904 
    905 	if (raid_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL)
    906 		return (0);
    907 
    908 	typ1 = (mddb_type_t)md_getshared_key(setno,
    909 	    raid_md_ops.md_driver.md_drivername);
    910 
    911 	/* create the db record for this mdstruct */
    912 
    913 	if (msp->options & MD_CRO_64BIT) {
    914 #if defined(_ILP32)
    915 		return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum));
    916 #else
    917 		mr_recid = mddb_createrec(msp->size, typ1, 0,
    918 		    MD_CRO_64BIT | MD_CRO_RAID | MD_CRO_FN, setno);
    919 #endif
    920 	} else {
    921 		mr_recid = mddb_createrec(msp->size, typ1, 0,
    922 		    MD_CRO_32BIT | MD_CRO_RAID | MD_CRO_FN, setno);
    923 	}
    924 
    925 	if (mr_recid < 0)
    926 		return (mddbstatus2error(&msp->mde,
    927 		    (int)mr_recid, mnum, setno));
    928 
    929 	/* get the address of the mdstruct */
    930 	un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
    931 	/*
    932 	 * It is okay that we muck with the mdstruct here,
    933 	 * since no one else will know about the mdstruct
    934 	 * until we commit it. If we crash, the record will
    935 	 * be automatically purged, since we haven't
    936 	 * committed it yet.
    937 	 */
    938 
    939 	/* copy in the user's mdstruct */
    940 	if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un,
    941 	    msp->size, mode)) {
    942 		mddb_deleterec_wrapper(mr_recid);
    943 		return (EFAULT);
    944 	}
    945 	/* All 64 bit metadevices only support EFI labels. */
    946 	if (msp->options & MD_CRO_64BIT) {
    947 		un->c.un_flag |= MD_EFILABEL;
    948 	}
    949 
    950 	/*
    951 	 * allocate the real recids array.  since we may have to commit
    952 	 * underlying metadevice records, we need an array of size:
    953 	 * total number of components in raid + 3 (1 for the raid itself,
    954 	 * one for the hotspare, one for the end marker).
    955 	 */
    956 	num_recs = un->un_totalcolumncnt + 3;
    957 	rid = 0;
    958 	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
    959 	recids[rid++] = mr_recid;
    960 
    961 	MD_SID(un) = mnum;
    962 	MD_RECID(un) = recids[0];
    963 	MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_SP;
    964 	MD_PARENT(un) = MD_NO_PARENT;
    965 	un->un_resync_copysize = 0;
    966 	un->c.un_revision |= MD_FN_META_DEV;
    967 
    968 	if (UNIT_STATE(un) == RUS_INIT)
    969 		MD_STATUS(un) |= MD_UN_GROW_PENDING;
    970 
    971 	if ((UNIT_STATE(un) != RUS_INIT) && raid_check_pw(un)) {
    972 		mddb_deleterec_wrapper(mr_recid);
    973 		err = mderror(&msp->mde, MDE_RAID_INVALID);
    974 		goto out;
    975 	}
    976 
    977 	if (err = raid_build_incore(un, 0)) {
    978 		if (un->mr_ic) {
    979 			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
    980 			    un->un_totalcolumncnt);
    981 			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
    982 		}
    983 
    984 		md_nblocks_set(mnum, -1ULL);
    985 		MD_UNIT(mnum) = NULL;
    986 
    987 		mddb_deleterec_wrapper(mr_recid);
    988 		goto out;
    989 	}
    990 
    991 	/*
    992 	 * Update unit availability
    993 	 */
    994 	md_set[setno].s_un_avail--;
    995 
    996 	recids[rid] = 0;
    997 	if (un->un_hsp_id != -1) {
    998 		/* increment the reference count of the hot spare pool */
    999 		err = md_hot_spare_ifc(HSP_INCREF, un->un_hsp_id, 0, 0,
   1000 		    &recids[rid], NULL, NULL, NULL);
   1001 		if (err) {
   1002 			md_nblocks_set(mnum, -1ULL);
   1003 			MD_UNIT(mnum) = NULL;
   1004 
   1005 			mddb_deleterec_wrapper(mr_recid);
   1006 			goto out;
   1007 		}
   1008 		rid++;
   1009 	}
   1010 
   1011 	/*
   1012 	 * set the parent on any metadevice components.
   1013 	 * NOTE: currently soft partitions are the only metadevices
   1014 	 * which can appear within a RAID metadevice.
   1015 	 */
   1016 	for (col = 0; col < un->un_totalcolumncnt; col++) {
   1017 		mr_column_t	*mr_col = &un->un_column[col];
   1018 		md_unit_t	*comp_un;
   1019 
   1020 		if (md_getmajor(mr_col->un_dev) == md_major) {
   1021 			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
   1022 			recids[rid++] = MD_RECID(comp_un);
   1023 			md_set_parent(mr_col->un_dev, MD_SID(un));
   1024 		}
   1025 	}
   1026 
   1027 	/* set the end marker */
   1028 	recids[rid] = 0;
   1029 
   1030 	mddb_commitrecs_wrapper(recids);
   1031 	md_create_unit_incore(mnum, &raid_md_ops, 1);
   1032 
   1033 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno,
   1034 	    MD_SID(un));
   1035 
   1036 out:
   1037 	kmem_free(recids, (num_recs * sizeof (mddb_recid_t)));
   1038 	if (err)
   1039 		return (err);
   1040 
   1041 	/* only attempt to init a device that is in the init state */
   1042 	if (UNIT_STATE(un) != RUS_INIT)
   1043 		return (0);
   1044 
   1045 	return (raid_init_unit(mnum, &msp->mde));
   1046 }
   1047 
   1048 /*
   1049  * NAME:	raid_get
   1050  * DESCRIPTION: used to get the unit structure of a RAID metadevice
   1051  * PARAMETERS:	md_i_get_t   *migp - pointer to get data structure
   1052  *		int	      mode - must be FREAD
   1053  *		IOLOCK	     *lock - pointer to IOCTL lock
   1054  *
   1055  * LOCKS:	obtains unit reader lock via IOLOCK
   1056  *
   1057  */
   1058 static int
   1059 raid_get(
   1060 	void		*migp,
   1061 	int		mode,
   1062 	IOLOCK		*lock
   1063 )
   1064 {
   1065 	minor_t		mnum;
   1066 	mr_unit_t	*un;
   1067 	md_i_get_t	*migph = migp;
   1068 
   1069 
   1070 	mnum = migph->id;
   1071 
   1072 	mdclrerror(&migph->mde);
   1073 
   1074 	if ((un = raid_getun(mnum, &migph->mde,
   1075 	    RD_LOCK, lock)) == NULL)
   1076 		return (0);
   1077 
   1078 	if (migph->size == 0) {
   1079 		migph->size = un->c.un_size;
   1080 		return (0);
   1081 	}
   1082 
   1083 	if (migph->size < un->c.un_size) {
   1084 		return (EFAULT);
   1085 	}
   1086 	if (ddi_copyout(un, (void *)(uintptr_t)migph->mdp,
   1087 	    un->c.un_size, mode))
   1088 		return (EFAULT);
   1089 
   1090 	return (0);
   1091 }
   1092 
   1093 
   1094 /*
   1095  * NAME:	raid_replace
   1096  * DESCRIPTION: used to replace a component of a RAID metadevice
   1097  * PARAMETERS:	replace_params_t *mrp - pointer to replace data structure
   1098  *		IOLOCK	     *lock - pointer to IOCTL lock
   1099  *
   1100  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
   1101  *		obtains and releases md_unit_array_rw write lock
   1102  *
   1103  */
   1104 static int
   1105 raid_replace(
   1106 	replace_params_t	*mrp,
   1107 	IOLOCK			*lock
   1108 )
   1109 {
   1110 	minor_t		mnum = mrp->mnum;
   1111 	md_dev64_t	odev = mrp->old_dev;
   1112 	md_error_t	*ep = &mrp->mde;
   1113 	mr_unit_t	*un;
   1114 	rcs_state_t	state;
   1115 	int		ix, col = -1;
   1116 	int		force = 0;
   1117 	int		err = 0;
   1118 	replace_cmd_t	cmd;
   1119 	set_t		setno;
   1120 	side_t		side;
   1121 	mdkey_t		devkey;
   1122 	int		nkeys;
   1123 	mddb_recid_t	extra_recids[3] = { 0, 0, 0 };
   1124 	int		extra_rids = 0;
   1125 	md_error_t	mde = mdnullerror;
   1126 	sv_dev_t	sv = {MD_SET_BAD, MD_SIDEWILD, MD_KEYWILD};
   1127 
   1128 	mdclrerror(ep);
   1129 	setno = MD_MIN2SET(mnum);
   1130 	side = mddb_getsidenum(setno);
   1131 
   1132 	un = md_unit_readerlock(MDI_UNIT(mnum));
   1133 
   1134 	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
   1135 	    (raid_state_cnt(un, RCS_RESYNC) != 0)) {
   1136 		md_unit_readerexit(MDI_UNIT(mnum));
   1137 		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
   1138 	}
   1139 
   1140 	if (un->un_state & RUS_DOI) {
   1141 		md_unit_readerexit(MDI_UNIT(mnum));
   1142 		return (mdmderror(ep, MDE_RAID_DOI, mnum));
   1143 	}
   1144 
   1145 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
   1146 	    (MD_STATUS(un) & MD_UN_GROW_PENDING)) {
   1147 		md_unit_readerexit(MDI_UNIT(mnum));
   1148 		return (mdmderror(ep, MDE_IN_USE, mnum));
   1149 	}
   1150 
   1151 	md_unit_readerexit(MDI_UNIT(mnum));
   1152 
   1153 	/* get locks and recheck to be sure something did not change */
   1154 	if ((un = raid_getun(mnum, ep, WRITERS, lock)) == NULL)
   1155 		return (0);
   1156 
   1157 	if (md_getkeyfromdev(setno, side, odev, &devkey, &nkeys) != 0) {
   1158 		return (mddeverror(ep, MDE_NAME_SPACE, odev));
   1159 	}
   1160 
   1161 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
   1162 		md_dev64_t tmpdevt = un->un_column[ix].un_orig_dev;
   1163 		/*
   1164 		 * Try to resolve devt again if NODEV64
   1165 		 */
   1166 		if (tmpdevt == NODEV64) {
   1167 			tmpdevt = md_resolve_bydevid(mnum, tmpdevt,
   1168 			    un->un_column[ix].un_orig_key);
   1169 			un->un_column[ix].un_orig_dev = tmpdevt;
   1170 		}
   1171 
   1172 		if (un->un_column[ix].un_orig_dev == odev) {
   1173 			col = ix;
   1174 			break;
   1175 		} else {
   1176 			if (un->un_column[ix].un_orig_dev == NODEV64) {
   1177 				/*
   1178 				 * Now we use the keys to match.
   1179 				 * If no key found, continue.
   1180 				 */
   1181 				if (nkeys == 0) {
   1182 					continue;
   1183 				}
   1184 				if (un->un_column[ix].un_orig_key == devkey) {
   1185 					if (nkeys > 1)
   1186 						return (mddeverror(ep,
   1187 						    MDE_MULTNM, odev));
   1188 					col = ix;
   1189 					break;
   1190 				}
   1191 			}
   1192 		}
   1193 	}
   1194 
   1195 	if (col == -1)
   1196 		return (mdcomperror(ep, MDE_CANT_FIND_COMP,
   1197 		    mnum, odev));
   1198 
   1199 	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
   1200 	    (raid_state_cnt(un, RCS_RESYNC) != 0))
   1201 		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
   1202 
   1203 	if (un->un_state & RUS_DOI)
   1204 		return (mdcomperror(ep, MDE_REPL_INVAL_STATE, mnum,
   1205 		    un->un_column[col].un_dev));
   1206 
   1207 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
   1208 	    (MD_STATUS(un) & MD_UN_GROW_PENDING))
   1209 		return (mdmderror(ep, MDE_IN_USE, mnum));
   1210 
   1211 	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == FORCE_REPLACE_COMP))
   1212 		force = 1;
   1213 	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == ENABLE_COMP))
   1214 		cmd = ENABLE_COMP;
   1215 	if ((mrp->cmd == FORCE_REPLACE_COMP) || (mrp->cmd == REPLACE_COMP))
   1216 		cmd = REPLACE_COMP;
   1217 
   1218 	if (un->un_state == RUS_LAST_ERRED) {
   1219 		/* Must use -f force flag for unit in LAST_ERRED state */
   1220 		if (!force)
   1221 			return (mdmderror(ep, MDE_RAID_NEED_FORCE, mnum));
   1222 
   1223 		/* Must use -f force flag on ERRED column first */
   1224 		if (un->un_column[col].un_devstate != RCS_ERRED) {
   1225 			for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
   1226 				if (un->un_column[ix].un_devstate & RCS_ERRED)
   1227 					return (mdcomperror(ep,
   1228 					    MDE_RAID_COMP_ERRED, mnum,
   1229 					    un->un_column[ix].un_dev));
   1230 			}
   1231 		}
   1232 
   1233 		/* must use -f force flag on LAST_ERRED columns next */
   1234 		if ((un->un_column[col].un_devstate != RCS_LAST_ERRED) &&
   1235 		    (un->un_column[col].un_devstate != RCS_ERRED))
   1236 			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
   1237 			    mnum, un->un_column[col].un_dev));
   1238 	}
   1239 
   1240 	if (un->un_state == RUS_ERRED) {
   1241 		if (! (un->un_column[col].un_devstate &
   1242 		    (RCS_ERRED | RCS_INIT_ERRED)))
   1243 			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
   1244 			    mnum, un->un_column[ix].un_dev));
   1245 	}
   1246 
   1247 	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_ALT_ISOPEN));
   1248 	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_WRITE_ALT));
   1249 
   1250 	state = un->un_column[col].un_devstate;
   1251 	if (state & RCS_INIT_ERRED) {
   1252 		MD_STATUS(un) |= MD_UN_GROW_PENDING;
   1253 		un->un_percent_done = 0;
   1254 		raid_set_state(un, col, RCS_INIT, 0);
   1255 	} else if (((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) &&
   1256 	    resync_request(mnum, col, 0, ep))
   1257 		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
   1258 
   1259 
   1260 	if (cmd == REPLACE_COMP) {
   1261 		md_dev64_t tmpdev = mrp->new_dev;
   1262 
   1263 		/*
   1264 		 * open the device by device id
   1265 		 */
   1266 		tmpdev = md_resolve_bydevid(mnum, tmpdev, mrp->new_key);
   1267 		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
   1268 			return (mdcomperror(ep, MDE_COMP_OPEN_ERR, mnum,
   1269 			    tmpdev));
   1270 		}
   1271 
   1272 		/*
   1273 		 * If it's a metadevice, make sure it gets reparented
   1274 		 */
   1275 		if (md_getmajor(tmpdev) == md_major) {
   1276 			minor_t		new_mnum = md_getminor(tmpdev);
   1277 			md_unit_t	*new_un = MD_UNIT(new_mnum);
   1278 
   1279 			md_set_parent(tmpdev, MD_SID(un));
   1280 			extra_recids[extra_rids++] = MD_RECID(new_un);
   1281 		}
   1282 
   1283 		mrp->new_dev = tmpdev;
   1284 		un->un_column[col].un_orig_dev = tmpdev;
   1285 		un->un_column[col].un_orig_key = mrp->new_key;
   1286 		un->un_column[col].un_orig_pwstart = mrp->start_blk;
   1287 		un->un_column[col].un_orig_devstart =
   1288 		    mrp->start_blk + un->un_pwsize;
   1289 
   1290 		/*
   1291 		 * If the old device was a metadevice, make sure to
   1292 		 * reset its parent.
   1293 		 */
   1294 		if (md_getmajor(odev) == md_major) {
   1295 			minor_t		old_mnum = md_getminor(odev);
   1296 			md_unit_t	*old_un = MD_UNIT(old_mnum);
   1297 
   1298 			md_reset_parent(odev);
   1299 			extra_recids[extra_rids++] =
   1300 			    MD_RECID(old_un);
   1301 		}
   1302 
   1303 		if (HOTSPARED(un, col)) {
   1304 			md_layered_close(mrp->new_dev, MD_OFLG_NULL);
   1305 			un->un_column[col].un_alt_dev = mrp->new_dev;
   1306 			un->un_column[col].un_alt_pwstart = mrp->start_blk;
   1307 			un->un_column[col].un_alt_devstart =
   1308 			    mrp->start_blk + un->un_pwsize;
   1309 			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
   1310 		} else {
   1311 			/*
   1312 			 * not hot spared.  Close the old device and
   1313 			 * move the new device in.
   1314 			 */
   1315 			if (un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN)
   1316 				md_layered_close(odev, MD_OFLG_NULL);
   1317 			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
   1318 			un->un_column[col].un_dev = mrp->new_dev;
   1319 			un->un_column[col].un_pwstart = mrp->start_blk;
   1320 			un->un_column[col].un_devstart =
   1321 			    mrp->start_blk + un->un_pwsize;
   1322 			if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) {
   1323 				un->un_column[col].un_devflags |=
   1324 				    MD_RAID_REGEN_RESYNC;
   1325 			}
   1326 		}
   1327 		/*
   1328 		 * If the old device is not a metadevice then
   1329 		 * save off the set number and key so that it
   1330 		 * can be removed from the namespace later.
   1331 		 */
   1332 		if (md_getmajor(odev) != md_major) {
   1333 			sv.setno = setno;
   1334 			sv.key = devkey;
   1335 		}
   1336 	}
   1337 
   1338 	if (cmd == ENABLE_COMP) {
   1339 		md_dev64_t tmpdev = un->un_column[col].un_orig_dev;
   1340 		mdkey_t raidkey =  un->un_column[col].un_orig_key;
   1341 
   1342 		/*
   1343 		 * We trust the dev_t because we cannot determine the
   1344 		 * dev_t from the device id since a new disk is in the
   1345 		 * same location. Since this is a call from metareplace -e dx
   1346 		 * AND it is SCSI a new dev_t is not generated.  So the
   1347 		 * dev_t from the mddb is used. Before enabling the device
   1348 		 * we check to make sure that multiple entries for the same
   1349 		 * device does not exist in the namespace. If they do we
   1350 		 * fail the ioctl.
   1351 		 * One of the many ways multiple entries in the name space
   1352 		 * can occur is if one removed the failed component in a
   1353 		 * RAID metadevice and put another disk that was part of
   1354 		 * another metadevice. After reboot metadevadm would correctly
   1355 		 * update the device name for the metadevice whose component
   1356 		 * has moved. However now in the metadb there are two entries
   1357 		 * for the same name (ctds) that belong to different
   1358 		 * metadevices. One is valid, the other is a ghost or "last
   1359 		 * know as" ctds.
   1360 		 */
   1361 		tmpdev = md_resolve_bydevid(mnum, tmpdev, raidkey);
   1362 		if (tmpdev == NODEV64)
   1363 			tmpdev = md_getdevnum(setno, side, raidkey,
   1364 			    MD_TRUST_DEVT);
   1365 		/*
   1366 		 * check for multiple entries in namespace for the
   1367 		 * same dev
   1368 		 */
   1369 
   1370 		if (md_getkeyfromdev(setno, side, tmpdev, &devkey,
   1371 		    &nkeys) != 0)
   1372 			return (mddeverror(ep, MDE_NAME_SPACE, tmpdev));
   1373 		/*
   1374 		 * If number of keys are greater that
   1375 		 * 1, then we have an invalid
   1376 		 * namespace. STOP and return.
   1377 		 */
   1378 		if (nkeys > 1)
   1379 			return (mddeverror(ep, MDE_MULTNM, tmpdev));
   1380 		if (devkey != raidkey)
   1381 			return (mdcomperror(ep, MDE_CANT_FIND_COMP,
   1382 			    mnum, tmpdev));
   1383 
   1384 		if (un->un_column[col].un_orig_dev == NODEV64)
   1385 			un->un_column[col].un_orig_dev = tmpdev;
   1386 
   1387 		if (HOTSPARED(un, col)) {
   1388 			un->un_column[col].un_alt_dev =
   1389 			    un->un_column[col].un_orig_dev;
   1390 			un->un_column[col].un_alt_pwstart =
   1391 			    un->un_column[col].un_orig_pwstart;
   1392 			un->un_column[col].un_alt_devstart =
   1393 			    un->un_column[col].un_orig_devstart;
   1394 			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
   1395 		} else {
   1396 			if (!(un->un_column[col].un_devflags &
   1397 			    MD_RAID_DEV_ISOPEN)) {
   1398 				if (md_layered_open(mnum, &tmpdev,
   1399 				    MD_OFLG_NULL)) {
   1400 					un->un_column[col].un_dev = tmpdev;
   1401 					return (mdcomperror(ep,
   1402 					    MDE_COMP_OPEN_ERR, mnum, tmpdev));
   1403 				}
   1404 				ASSERT(tmpdev != NODEV64 &&
   1405 				    tmpdev != 0);
   1406 
   1407 				if ((md_getmajor(tmpdev) != md_major) &&
   1408 				    (md_devid_found(setno, side, raidkey)
   1409 				    == 1)) {
   1410 					if (md_update_namespace_did(setno, side,
   1411 					    raidkey, &mde) != 0) {
   1412 						cmn_err(CE_WARN,
   1413 						    "md: could not"
   1414 						    " update namespace\n");
   1415 					}
   1416 				}
   1417 				un->un_column[col].un_dev =
   1418 				    un->un_column[col].un_orig_dev;
   1419 			}
   1420 			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
   1421 			un->un_column[col].un_devflags |= MD_RAID_REGEN_RESYNC;
   1422 		}
   1423 	}
   1424 	if (mrp->has_label) {
   1425 		un->un_column[col].un_devflags |= MD_RAID_HAS_LABEL;
   1426 	} else {
   1427 		un->un_column[col].un_devflags &= ~MD_RAID_HAS_LABEL;
   1428 	}
   1429 
   1430 	raid_commit(un, extra_recids);
   1431 
   1432 	/* If the component has been replaced - clean up the name space */
   1433 	if (sv.setno != MD_SET_BAD) {
   1434 		md_rem_names(&sv, 1);
   1435 	}
   1436 
   1437 	md_ioctl_droplocks(lock);
   1438 
   1439 	if ((cmd == ENABLE_COMP) || (cmd == FORCE_ENABLE_COMP)) {
   1440 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE,
   1441 		    setno, MD_SID(un));
   1442 	} else {
   1443 		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE,
   1444 		    setno, MD_SID(un));
   1445 	}
   1446 
   1447 	if (un->un_column[col].un_devstate & RCS_INIT)
   1448 		err = raid_init_unit(mnum, ep);
   1449 	else if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0)
   1450 		err = raid_resync_unit(mnum, ep);
   1451 
   1452 	mdclrerror(ep);
   1453 	if (!err)
   1454 		return (0);
   1455 
   1456 	/* be sure state */
   1457 	/* is already set by this time */
   1458 	/* fix state  and commit record */
   1459 	un = md_unit_writerlock(MDI_UNIT(mnum));
   1460 	if (state & RCS_INIT_ERRED)
   1461 		raid_set_state(un, col, state, 1);
   1462 	else if (state & RCS_OKAY)
   1463 		raid_set_state(un, col, RCS_ERRED, 0);
   1464 	else
   1465 		raid_set_state(un, col, state, 1);
   1466 	raid_commit(un, NULL);
   1467 	md_unit_writerexit(MDI_UNIT(mnum));
   1468 	mdclrerror(ep);
   1469 	return (0);
   1470 }
   1471 
   1472 
   1473 /*
   1474  * NAME:	raid_set_sync
   1475  * DESCRIPTION: used to sync a component of a RAID metadevice
   1476  * PARAMETERS:	md_resync_ioctl_t *mrp - pointer to resync data structure
   1477  *		int	      mode - must be FWRITE
   1478  *		IOLOCK	     *lock - pointer to IOCTL lock
   1479  *
   1480  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
   1481  *		obtains and releases md_unit_array_rw write lock
   1482  *
   1483  */
   1484 static int
   1485 raid_set_sync(
   1486 	md_resync_ioctl_t	*rip,
   1487 	IOLOCK			*lock
   1488 )
   1489 {
   1490 	minor_t			mnum = rip->ri_mnum;
   1491 	mr_unit_t		*un;
   1492 	int			init = 0;
   1493 	int			resync = 0;
   1494 	int			regen = 0;
   1495 	int			ix;
   1496 	int			err;
   1497 
   1498 	mdclrerror(&rip->mde);
   1499 
   1500 	if ((un = raid_getun(mnum, &rip->mde, WRITERS, lock)) == NULL)
   1501 		return (0);
   1502 
   1503 	if (un->un_state & RUS_DOI)
   1504 		return (mdmderror(&rip->mde, MDE_RAID_DOI, mnum));
   1505 
   1506 	if (un->c.un_status & MD_UN_RESYNC_ACTIVE)
   1507 		return (mdmderror(&rip->mde, MDE_RESYNC_ACTIVE, mnum));
   1508 
   1509 	/* This prevents new opens */
   1510 
   1511 	rip->ri_flags = 0;
   1512 	if (un->un_state & RUS_REGEN)
   1513 		regen++;
   1514 
   1515 	if (raid_state_cnt(un, RCS_RESYNC))
   1516 		resync++;
   1517 
   1518 	if (raid_state_cnt(un, RCS_INIT) || (un->un_state & RUS_INIT))
   1519 		init++;
   1520 
   1521 	ASSERT(!(resync && init && regen));
   1522 	md_ioctl_droplocks(lock);
   1523 	rip->ri_percent_done = 0;
   1524 
   1525 	if (init) {
   1526 		MD_STATUS(un) |= MD_UN_GROW_PENDING;
   1527 		return (raid_init_unit(mnum, &rip->mde));
   1528 	}
   1529 
   1530 	/*
   1531 	 * If resync is needed, it will call raid_internal_open forcing
   1532 	 * replay before the open completes.
   1533 	 * Otherwise, call raid_internal_open directly to force
   1534 	 * replay to complete during boot (metasync -r).
   1535 	 * NOTE: the unit writer lock must remain held while setting
   1536 	 *	 MD_UN_RESYNC_ACTIVE but must be released before
   1537 	 *	 calling raid_resync_unit or raid_internal_open.
   1538 	 */
   1539 	if (resync) {
   1540 		ASSERT(resync < 2);
   1541 		un = md_unit_writerlock(MDI_UNIT(mnum));
   1542 		MD_STATUS(un) |= MD_UN_RESYNC_ACTIVE;
   1543 		/* Must release unit writer lock for resync */
   1544 		/*
   1545 		 * correctly setup the devices before trying to start the
   1546 		 * resync operation.
   1547 		 */
   1548 		for (ix = 0; un->un_totalcolumncnt; ix++) {
   1549 			if (un->un_column[ix].un_devstate & RCS_RESYNC) {
   1550 				if ((un->un_column[ix].un_devflags &
   1551 				    MD_RAID_COPY_RESYNC) &&
   1552 				    HOTSPARED(un, ix)) {
   1553 					un->un_column[ix].un_alt_dev =
   1554 					    un->un_column[ix].un_orig_dev;
   1555 					un->un_column[ix].un_alt_devstart =
   1556 					    un->un_column[ix].un_orig_devstart;
   1557 					un->un_column[ix].un_alt_pwstart =
   1558 					    un->un_column[ix].un_orig_pwstart;
   1559 				}
   1560 				break;
   1561 			}
   1562 		}
   1563 		ASSERT(un->un_column[ix].un_devflags &
   1564 		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));
   1565 		rip->ri_percent_done = 0;
   1566 		un->un_column[ix].un_devflags |= MD_RAID_RESYNC;
   1567 		(void) resync_request(mnum, ix, 0, NULL);
   1568 		md_unit_writerexit(MDI_UNIT(mnum));
   1569 		err = raid_resync_unit(mnum, &rip->mde);
   1570 		return (err);
   1571 	}
   1572 
   1573 	if (regen) {
   1574 		err = raid_regen_unit(mnum, &rip->mde);
   1575 		return (err);
   1576 	}
   1577 
   1578 	/* The unit requires not work so just force replay of the device */
   1579 	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0))
   1580 		return (mdmderror(&rip->mde,
   1581 		    MDE_RAID_OPEN_FAILURE, mnum));
   1582 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
   1583 
   1584 	return (0);
   1585 }
   1586 
   1587 /*
   1588  * NAME:	raid_get_resync
   1589  * DESCRIPTION: used to check resync status on a component of a RAID metadevice
   1590  * PARAMETERS:	md_resync_ioctl_t *mrp - pointer to resync data structure
   1591  *		int	      mode - must be FWRITE
   1592  *		IOLOCK	     *lock - pointer to IOCTL lock
   1593  *
   1594  * LOCKS:	none
   1595  *
   1596  */
   1597 static int
   1598 raid_get_resync(
   1599 	md_resync_ioctl_t	*rip,
   1600 	IOLOCK			*lock
   1601 )
   1602 {
   1603 	minor_t			mnum = rip->ri_mnum;
   1604 	mr_unit_t		*un;
   1605 	u_longlong_t		percent;
   1606 	int			cnt;
   1607 	int			ix;
   1608 	uint64_t		d;
   1609 
   1610 	mdclrerror(&rip->mde);
   1611 
   1612 	if ((un = raid_getun(mnum, &rip->mde, RD_LOCK, lock)) == NULL)
   1613 		return (0);
   1614 
   1615 	rip->ri_flags = 0;
   1616 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
   1617 		d = un->un_segsincolumn;
   1618 		percent = d ? ((1000 * un->un_resync_line_index) / d) : 0;
   1619 		if (percent > 1000)
   1620 			percent = 1000;	/* can't go over 100% */
   1621 		rip->ri_percent_done = (int)percent;
   1622 		rip->ri_flags |= MD_RI_INPROGRESS;
   1623 	}
   1624 
   1625 	if (UNIT_STATE(un) & RUS_INIT) {
   1626 		d = un->un_segsize * un->un_segsincolumn *
   1627 		    un->un_totalcolumncnt;
   1628 		percent =
   1629 		    d ? ((1000 * (u_longlong_t)un->un_init_iocnt) / d) : 0;
   1630 		if (percent > 1000)
   1631 			percent = 1000;	/* can't go over 100% */
   1632 		rip->ri_percent_done = (int)percent;
   1633 		rip->ri_flags |= MD_GROW_INPROGRESS;
   1634 	} else if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
   1635 		d = un->un_segsize * un->un_segsincolumn * un->un_init_colcnt;
   1636 		percent =
   1637 		    d ? (((u_longlong_t)un->un_init_iocnt * 1000) / d) : 0;
   1638 		if (percent > 1000)
   1639 			percent = 1000;
   1640 		rip->ri_percent_done = (int)percent;
   1641 		rip->ri_flags |= MD_GROW_INPROGRESS;
   1642 	}
   1643 
   1644 	if (un->un_state & RUS_REGEN)
   1645 		rip->ri_percent_done = un->un_percent_done;
   1646 
   1647 	cnt = 0;
   1648 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
   1649 		switch (un->un_column[ix].un_devstate) {
   1650 		case RCS_INIT:
   1651 		case RCS_ERRED:
   1652 		case RCS_LAST_ERRED:
   1653 			cnt++;
   1654 			break;
   1655 		default:
   1656 			break;
   1657 		}
   1658 	}
   1659 	d = un->un_totalcolumncnt;
   1660 	rip->ri_percent_dirty = d ? (((u_longlong_t)cnt * 100) / d) : 0;
   1661 	return (0);
   1662 }
   1663 
   1664 /*
   1665  * NAME:	raid_grow
   1666  * DESCRIPTION: Concatenate to a RAID metadevice
   1667  * PARAMETERS:	md_grow_params_t *mgp
   1668  *			      - pointer to IOCGROW data structure
   1669  *		int	 mode - must be FWRITE
   1670  *		IOLOCK *lockp - IOCTL read/write and unit_array_rw lock
   1671  *
   1672  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
   1673  *		obtains and releases md_unit_array_rw write lock
   1674  *
   1675  */
   1676 static int
   1677 raid_grow(void *mgp, int mode, IOLOCK *lock)
   1678 {
   1679 	minor_t		mnum;
   1680 	mr_unit_t	*un, *new_un;
   1681 	mdi_unit_t	*ui;
   1682 	mddb_type_t	typ1;
   1683 	mddb_recid_t	mr_recid;
   1684 	mddb_recid_t	old_vtoc = 0;
   1685 	mddb_recid_t	*recids;
   1686 	md_create_rec_option_t options;
   1687 	int		err;
   1688 	int		col, i;
   1689 	int64_t		tb, atb;
   1690 	u_longlong_t	unrev;
   1691 	int		tc;
   1692 	int		rval = 0;
   1693 	set_t		setno;
   1694 	mr_column_ic_t	*mrc;
   1695 	int		num_recs, rid;
   1696 	md_grow_params_t	*mgph = mgp;
   1697 
   1698 
   1699 	mnum = mgph->mnum;
   1700 
   1701 	mdclrerror(&mgph->mde);
   1702 
   1703 	ui = MDI_UNIT(mnum);
   1704 	un = md_unit_readerlock(ui);
   1705 
   1706 	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
   1707 		md_unit_readerexit(ui);
   1708 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
   1709 	}
   1710 
   1711 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
   1712 		md_unit_readerexit(ui);
   1713 		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
   1714 	}
   1715 
   1716 	if (UNIT_STATE(un) & RUS_LAST_ERRED) {
   1717 		md_unit_readerexit(ui);
   1718 		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
   1719 	}
   1720 
   1721 	if (UNIT_STATE(un) & RUS_DOI) {
   1722 		md_unit_readerexit(ui);
   1723 		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
   1724 	}
   1725 
   1726 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
   1727 		md_unit_readerexit(ui);
   1728 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
   1729 	}
   1730 
   1731 	md_unit_readerexit(ui);
   1732 
   1733 	if ((un = raid_getun(mnum, &mgph->mde, WRITERS, lock)) ==
   1734 	    NULL)
   1735 		return (0);
   1736 
   1737 	if (MD_STATUS(un) & MD_UN_GROW_PENDING)
   1738 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
   1739 
   1740 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
   1741 		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
   1742 
   1743 	if (un->c.un_size >= mgph->size)
   1744 		return (EINVAL);
   1745 
   1746 	if (UNIT_STATE(un) & RUS_LAST_ERRED)
   1747 		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
   1748 
   1749 	if (UNIT_STATE(un) & RUS_DOI)
   1750 		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
   1751 
   1752 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT))
   1753 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
   1754 
   1755 	setno = MD_MIN2SET(mnum);
   1756 
   1757 	typ1 = (mddb_type_t)md_getshared_key(setno,
   1758 	    raid_md_ops.md_driver.md_drivername);
   1759 
   1760 	/*
   1761 	 * Preserve the friendly name nature of the device that is
   1762 	 * growing.
   1763 	 */
   1764 	options = MD_CRO_RAID;
   1765 	if (un->c.un_revision & MD_FN_META_DEV)
   1766 		options |= MD_CRO_FN;
   1767 	if (mgph->options & MD_CRO_64BIT) {
   1768 #if defined(_ILP32)
   1769 		return (mdmderror(&mgph->mde, MDE_UNIT_TOO_LARGE, mnum));
   1770 #else
   1771 		mr_recid = mddb_createrec(mgph->size, typ1, 0,
   1772 		    MD_CRO_64BIT | options, setno);
   1773 #endif
   1774 	} else {
   1775 		mr_recid = mddb_createrec(mgph->size, typ1, 0,
   1776 		    MD_CRO_32BIT | options, setno);
   1777 	}
   1778 	if (mr_recid < 0) {
   1779 		rval = mddbstatus2error(&mgph->mde, (int)mr_recid,
   1780 		    mnum, setno);
   1781 		return (rval);
   1782 	}
   1783 
   1784 	/* get the address of the new unit */
   1785 	new_un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
   1786 
   1787 	/*
   1788 	 * It is okay that we muck with the new unit here,
   1789 	 * since no one else will know about the unit struct
   1790 	 * until we commit it. If we crash, the record will
   1791 	 * be automatically purged, since we haven't
   1792 	 * committed it yet and the old unit struct will be found.
   1793 	 */
   1794 
   1795 	/* copy in the user's unit struct */
   1796 	err = ddi_copyin((void *)(uintptr_t)mgph->mdp, new_un,
   1797 	    mgph->size, mode);
   1798 	if (err) {
   1799 		mddb_deleterec_wrapper(mr_recid);
   1800 		return (EFAULT);
   1801 	}
   1802 
   1803 	/* make sure columns are being added */
   1804 	if (un->un_totalcolumncnt >= new_un->un_totalcolumncnt) {
   1805 		mddb_deleterec_wrapper(mr_recid);
   1806 		return (EINVAL);
   1807 	}
   1808 
   1809 	/*
   1810 	 * Save a few of the new unit structs fields.
   1811 	 * Before they get clobbered.
   1812 	 */
   1813 	tc = new_un->un_totalcolumncnt;
   1814 	tb = new_un->c.un_total_blocks;
   1815 	atb = new_un->c.un_actual_tb;
   1816 	unrev = new_un->c.un_revision;
   1817 
   1818 	/*
   1819 	 * Copy the old unit struct (static stuff)
   1820 	 * into new unit struct
   1821 	 */
   1822 	bcopy((caddr_t)un, (caddr_t)new_un, un->c.un_size);
   1823 
   1824 	/*
   1825 	 * Restore a few of the new unit struct values.
   1826 	 */
   1827 	new_un->un_totalcolumncnt = tc;
   1828 	new_un->c.un_actual_tb = atb;
   1829 	new_un->un_grow_tb = tb;
   1830 	new_un->c.un_revision = unrev;
   1831 	new_un->c.un_record_id = mr_recid;
   1832 	new_un->c.un_size = mgph->size;
   1833 
   1834 	ASSERT(new_un->mr_ic == un->mr_ic);
   1835 
   1836 	/*
   1837 	 * Save old column slots
   1838 	 */
   1839 	mrc = un->un_column_ic;
   1840 
   1841 	/*
   1842 	 * Allocate new column slot
   1843 	 */
   1844 	new_un->un_column_ic = (mr_column_ic_t *)
   1845 	    kmem_zalloc(sizeof (mr_column_ic_t) * new_un->un_totalcolumncnt,
   1846 	    KM_SLEEP);
   1847 
   1848 	/*
   1849 	 * Restore old column slots
   1850 	 * Free the old column slots
   1851 	 */
   1852 	bcopy(mrc, new_un->un_column_ic,
   1853 	    sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
   1854 	kmem_free(mrc, sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
   1855 
   1856 	/* All 64 bit metadevices only support EFI labels. */
   1857 	if (mgph->options & MD_CRO_64BIT) {
   1858 		new_un->c.un_flag |= MD_EFILABEL;
   1859 		/*
   1860 		 * If the device was previously smaller than a terabyte,
   1861 		 * and had a vtoc record attached to it, we remove the
   1862 		 * vtoc record, because the layout has changed completely.
   1863 		 */
   1864 		if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
   1865 		    (un->c.un_vtoc_id != 0)) {
   1866 			old_vtoc = un->c.un_vtoc_id;
   1867 			new_un->c.un_vtoc_id =
   1868 			    md_vtoc_to_efi_record(old_vtoc, setno);
   1869 		}
   1870 	}
   1871 
   1872 
   1873 	/*
   1874 	 * allocate the real recids array.  since we may have to commit
   1875 	 * underlying metadevice records, we need an array of size:
   1876 	 * total number of new components being attach + 2 (one for the
   1877 	 * raid itself, one for the end marker).
   1878 	 */
   1879 	num_recs = new_un->un_totalcolumncnt + 2;
   1880 	rid = 0;
   1881 	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
   1882 	recids[rid++] = mr_recid;
   1883 
   1884 	for (col = un->un_totalcolumncnt;
   1885 	    (col < new_un->un_totalcolumncnt); col++) {
   1886 		mr_column_t	*mr_col = &new_un->un_column[col];
   1887 		md_unit_t	*comp_un;
   1888 
   1889 		if (raid_build_pw_reservation(new_un, col) != 0) {
   1890 			/* release pwslots already allocated by grow */
   1891 			for (i = un->un_totalcolumncnt; i < col; i++) {
   1892 				raid_free_pw_reservation(new_un, i);
   1893 			}
   1894 			kmem_free(new_un->un_column_ic,
   1895 			    sizeof (mr_column_ic_t) *
   1896 			    new_un->un_totalcolumncnt);
   1897 			kmem_free(new_un->mr_ic, sizeof (*un->mr_ic));
   1898 			kmem_free(recids, num_recs * sizeof (mddb_recid_t));
   1899 			mddb_deleterec_wrapper(mr_recid);
   1900 			return (EINVAL);
   1901 		}
   1902 		/*
   1903 		 * set parent on metadevices being added.
   1904 		 * NOTE: currently soft partitions are the only metadevices
   1905 		 * which can appear within a RAID metadevice.
   1906 		 */
   1907 		if (md_getmajor(mr_col->un_dev) == md_major) {
   1908 			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
   1909 			recids[rid++] = MD_RECID(comp_un);
   1910 			md_set_parent(mr_col->un_dev, MD_SID(new_un));
   1911 		}
   1912 		new_un->un_column[col].un_devflags = 0;
   1913 	}
   1914 
   1915 	/* set end marker */
   1916 	recids[rid] = 0;
   1917 
   1918 	/* commit new unit struct */
   1919 	mddb_commitrecs_wrapper(recids);
   1920 
   1921 	/* delete old unit struct */
   1922 	mddb_deleterec_wrapper(un->c.un_record_id);
   1923 
   1924 	/* place new unit in in-core array */
   1925 	md_nblocks_set(mnum, new_un->c.un_total_blocks);
   1926 	MD_UNIT(mnum) = new_un;
   1927 
   1928 	/*
   1929 	 * If old_vtoc has a non zero value, we know:
   1930 	 * - This unit crossed the border from smaller to larger one TB
   1931 	 * - There was a vtoc record for the unit,
   1932 	 * - This vtoc record is no longer needed, because
   1933 	 *   a new efi record has been created for this un.
   1934 	 */
   1935 	if (old_vtoc != 0) {
   1936 		mddb_deleterec_wrapper(old_vtoc);
   1937 	}
   1938 
   1939 	/* free recids */
   1940 	kmem_free(recids, num_recs * sizeof (mddb_recid_t));
   1941 
   1942 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
   1943 	    MD_UN2SET(new_un), MD_SID(new_un));
   1944 	MD_STATUS(new_un) |= MD_UN_GROW_PENDING;
   1945 
   1946 	/*
   1947 	 * Since the md_ioctl_writelock aquires the unit write lock
   1948 	 * and open/close aquires the unit reader lock it is necessary
   1949 	 * to drop the unit write lock and then reaquire it as needed
   1950 	 * later.
   1951 	 */
   1952 	md_unit_writerexit(ui);
   1953 
   1954 	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
   1955 		rval = mdmderror(&mgph->mde, MDE_RAID_OPEN_FAILURE, mnum);
   1956 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
   1957 		    MD_UN2SET(new_un), MD_SID(new_un));
   1958 		return (rval);
   1959 	}
   1960 	(void) md_unit_writerlock(ui);
   1961 	for (i = 0; i < new_un->un_totalcolumncnt; i++) {
   1962 		if (new_un->un_column[i].un_devstate & RCS_OKAY)
   1963 			(void) init_pw_area(new_un, new_un->un_column[i].un_dev,
   1964 			    new_un->un_column[i].un_pwstart, i);
   1965 	}
   1966 	md_unit_writerexit(ui);
   1967 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
   1968 	(void) md_unit_writerlock(ui);
   1969 	/* create a background thread to initialize the columns */
   1970 	md_ioctl_droplocks(lock);
   1971 
   1972 	return (raid_init_unit(mnum, &mgph->mde));
   1973 }
   1974 
   1975 /*
   1976  * NAME:	raid_reset
   1977  * DESCRIPTION: used to reset (clear / remove) a RAID metadevice
   1978  * PARAMETERS:	md_i_reset_t *mirp - pointer to reset data structure
   1979  *
   1980  * LOCKS:	obtains and releases md_unit_array_rw write lock
   1981  *
   1982  */
   1983 static int
   1984 raid_reset(md_i_reset_t	*mirp)
   1985 {
   1986 	minor_t		mnum = mirp->mnum;
   1987 	mr_unit_t	*un;
   1988 	mdi_unit_t	*ui;
   1989 	set_t		setno = MD_MIN2SET(mnum);
   1990 
   1991 	mdclrerror(&mirp->mde);
   1992 
   1993 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
   1994 	/*
   1995 	 * NOTE: need to get md_unit_writerlock to avoid conflict
   1996 	 * with raid_init thread.
   1997 	 */
   1998 	if ((un = raid_getun(mnum, &mirp->mde, NO_LOCK, NULL)) ==
   1999 	    NULL) {
   2000 		rw_exit(&md_unit_array_rw.lock);
   2001 		return (0);
   2002 	}
   2003 	ui = MDI_UNIT(mnum);
   2004 
   2005 	if (MD_HAS_PARENT(MD_PARENT(un))) {
   2006 		rw_exit(&md_unit_array_rw.lock);
   2007 		return (mdmderror(&mirp->mde, MDE_IN_USE, mnum));
   2008 	}
   2009 
   2010 	un = (mr_unit_t *)md_unit_openclose_enter(ui);
   2011 	if (md_unit_isopen(MDI_UNIT(mnum))) {
   2012 		md_unit_openclose_exit(ui);
   2013 		rw_exit(&md_unit_array_rw.lock);
   2014 		return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum));
   2015 	}
   2016 	md_unit_openclose_exit(ui);
   2017 	if (UNIT_STATE(un) != RUS_OKAY && !mirp->force) {
   2018 		rw_exit(&md_unit_array_rw.lock);
   2019 		return (mdmderror(&mirp->mde, MDE_RAID_NEED_FORCE, mnum));
   2020 	}
   2021 
   2022 	reset_raid(un, mnum, 1);
   2023 
   2024 	/*
   2025 	 * Update unit availability
   2026 	 */
   2027 	md_set[setno].s_un_avail++;
   2028 
   2029 	/*
   2030 	 * If MN set, reset s_un_next so all nodes can have
   2031 	 * the same view of the next available slot when
   2032 	 * nodes are -w and -j
   2033 	 */
   2034 	if (MD_MNSET_SETNO(setno)) {
   2035 		(void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));
   2036 	}
   2037 
   2038 	rw_exit(&md_unit_array_rw.lock);
   2039 
   2040 	return (0);
   2041 }
   2042 
   2043 /*
   2044  * NAME:	raid_get_geom
   2045  * DESCRIPTION: used to get the geometry of a RAID metadevice
   2046  * PARAMETERS:	mr_unit_t    *un - RAID unit to get the geometry for
   2047  *		struct dk_geom *gp - pointer to geometry data structure
   2048  *
   2049  * LOCKS:	none
   2050  *
   2051  */
   2052 static int
   2053 raid_get_geom(
   2054 	mr_unit_t	*un,
   2055 	struct dk_geom	*geomp
   2056 )
   2057 {
   2058 	md_get_geom((md_unit_t *)un, geomp);
   2059 
   2060 	return (0);
   2061 }
   2062 
   2063 /*
   2064  * NAME:	raid_get_vtoc
   2065  * DESCRIPTION: used to get the VTOC on a RAID metadevice
   2066  * PARAMETERS:	mr_unit_t    *un - RAID unit to get the VTOC from
   2067  *		struct vtoc *vtocp - pointer to VTOC data structure
   2068  *
   2069  * LOCKS:	none
   2070  *
   2071  */
   2072 static int
   2073 raid_get_vtoc(
   2074 	mr_unit_t	*un,
   2075 	struct vtoc	*vtocp
   2076 )
   2077 {
   2078 	md_get_vtoc((md_unit_t *)un, vtocp);
   2079 
   2080 	return (0);
   2081 }
   2082 
   2083 /*
   2084  * NAME:	raid_set_vtoc
   2085  * DESCRIPTION: used to set the VTOC on a RAID metadevice
   2086  * PARAMETERS:	mr_unit_t    *un - RAID unit to set the VTOC on
   2087  *		struct vtoc *vtocp - pointer to VTOC data structure
   2088  *
   2089  * LOCKS:	none
   2090  *
   2091  */
   2092 static int
   2093 raid_set_vtoc(
   2094 	mr_unit_t	*un,
   2095 	struct vtoc	*vtocp
   2096 )
   2097 {
   2098 	return (md_set_vtoc((md_unit_t *)un, vtocp));
   2099 }
   2100 
   2101 
   2102 /*
   2103  * NAME:	raid_get_extvtoc
   2104  * DESCRIPTION: used to get the extended VTOC on a RAID metadevice
   2105  * PARAMETERS:	mr_unit_t    *un - RAID unit to get the VTOC from
   2106  *		struct extvtoc *vtocp - pointer to extended VTOC data structure
   2107  *
   2108  * LOCKS:	none
   2109  *
   2110  */
   2111 static int
   2112 raid_get_extvtoc(
   2113 	mr_unit_t	*un,
   2114 	struct extvtoc	*vtocp
   2115 )
   2116 {
   2117 	md_get_extvtoc((md_unit_t *)un, vtocp);
   2118 
   2119 	return (0);
   2120 }
   2121 
   2122 /*
   2123  * NAME:	raid_set_extvtoc
   2124  * DESCRIPTION: used to set the extended VTOC on a RAID metadevice
   2125  * PARAMETERS:	mr_unit_t    *un - RAID unit to set the VTOC on
   2126  *		struct extvtoc *vtocp - pointer to extended VTOC data structure
   2127  *
   2128  * LOCKS:	none
   2129  *
   2130  */
   2131 static int
   2132 raid_set_extvtoc(
   2133 	mr_unit_t	*un,
   2134 	struct extvtoc	*vtocp
   2135 )
   2136 {
   2137 	return (md_set_extvtoc((md_unit_t *)un, vtocp));
   2138 }
   2139 
   2140 
   2141 
   2142 /*
   2143  * NAME:	raid_get_cgapart
   2144  * DESCRIPTION: used to get the dk_map on a RAID metadevice
   2145  * PARAMETERS:	mr_unit_t    *un - RAID unit to set the VTOC on
   2146  *		struct vtoc *dkmapp - pointer to dk_map data structure
   2147  *
   2148  * LOCKS:	none
   2149  *
   2150  */
   2151 
   2152 static int
   2153 raid_get_cgapart(
   2154 	mr_unit_t	*un,
   2155 	struct dk_map	*dkmapp
   2156 )
   2157 {
   2158 	md_get_cgapart((md_unit_t *)un, dkmapp);
   2159 	return (0);
   2160 }
   2161 
   2162 /*
   2163  * NAME:	raid_getdevs
   2164  * DESCRIPTION: return all devices within a RAID metadevice
   2165  * PARAMETERS:	md_getdevs_params_t *mgdp
   2166  *			      - pointer to getdevs IOCTL data structure
   2167  *		int	 mode - should be FREAD
   2168  *		IOLOCK *lockp - IOCTL read/write lock
   2169  *
   2170  * LOCKS:	obtains unit reader lock via IOLOCK
   2171  *
   2172  */
   2173 static int
   2174 raid_getdevs(
   2175 	void			*mgdp,
   2176 	int			mode,
   2177 	IOLOCK			*lock
   2178 )
   2179 {
   2180 	minor_t			mnum;
   2181 	mr_unit_t		*un;
   2182 	md_dev64_t		*udevs;
   2183 	int			i, cnt;
   2184 	md_dev64_t		unit_dev;
   2185 	md_getdevs_params_t	*mgdph = mgdp;
   2186 
   2187 
   2188 	mnum = mgdph->mnum;
   2189 
   2190 	/* check out unit */
   2191 	mdclrerror(&mgdph->mde);
   2192 
   2193 	if ((un = raid_getun(mnum, &mgdph->mde, RD_LOCK, lock)) == NULL)
   2194 		return (0);
   2195 
   2196 	udevs = (md_dev64_t *)(uintptr_t)mgdph->devs;
   2197 
   2198 	for (cnt = 0, i = 0; i < un->un_totalcolumncnt; i++, cnt++) {
   2199 		if (cnt < mgdph->cnt) {
   2200 			unit_dev = un->un_column[i].un_orig_dev;
   2201 			if (md_getmajor(unit_dev) != md_major) {
   2202 				if ((unit_dev = md_xlate_mini_2_targ
   2203 				    (unit_dev)) == NODEV64)
   2204 					return (ENODEV);
   2205 			}
   2206 
   2207 			if (ddi_copyout((caddr_t)&unit_dev,
   2208 			    (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
   2209 				return (EFAULT);
   2210 		}
   2211 		if (HOTSPARED(un, i)) {
   2212 			cnt++;
   2213 			if (cnt >= mgdph->cnt)
   2214 				continue;
   2215 
   2216 			unit_dev = un->un_column[i].un_dev;
   2217 			if (md_getmajor(unit_dev) != md_major) {
   2218 				if ((unit_dev = md_xlate_mini_2_targ
   2219 				    (unit_dev)) == NODEV64)
   2220 					return (ENODEV);
   2221 			}
   2222 
   2223 			if (ddi_copyout((caddr_t)&unit_dev,
   2224 			    (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
   2225 				return (EFAULT);
   2226 		}
   2227 	}
   2228 	mgdph->cnt = cnt;
   2229 	return (0);
   2230 }
   2231 
   2232 /*
   2233  * NAME:	raid_change
   2234  * DESCRIPTION: used to change the following dynamic values:
   2235  *			the hot spare pool
   2236  *		in the unit structure of a RAID metadevice
   2237  * PARAMETERS:	md_change_params_t   *mcp - pointer to change data structure
   2238  *		IOLOCK	     *lock - pointer to IOCTL lock
   2239  *
   2240  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun)
   2241  *
   2242  */
   2243 static int
   2244 raid_change(
   2245 	md_raid_params_t	*mrp,
   2246 	IOLOCK			*lock
   2247 )
   2248 {
   2249 	minor_t		mnum = mrp->mnum;
   2250 	mr_unit_t	*un;
   2251 	int		ix;
   2252 	mddb_recid_t	recids[3] = {0, 0, 0};
   2253 	int		err;
   2254 	int		irecid;
   2255 	int		inc_new_hsp = 0;
   2256 
   2257 	mdclrerror(&mrp->mde);
   2258 
   2259 	if ((un = raid_getun(mnum, &mrp->mde, WR_LOCK, lock)) == NULL)
   2260 		return (0);
   2261 
   2262 	if (!mrp->params.change_hsp_id)
   2263 		return (0);
   2264 
   2265 	/* verify that no hotspare is in use */
   2266 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
   2267 		if (HOTSPARED(un, ix)) {
   2268 			return (mdmderror(&mrp->mde, MDE_HS_IN_USE, mnum));
   2269 		}
   2270 	}
   2271 
   2272 	/* replace the hot spare pool */
   2273 
   2274 	irecid = 0;
   2275 	if (mrp->params.hsp_id != -1) {
   2276 		/* increment the reference count of the new hsp */
   2277 		err = md_hot_spare_ifc(HSP_INCREF, mrp->params.hsp_id, 0, 0,
   2278 		    &recids[0], NULL, NULL, NULL);
   2279 		if (err) {
   2280 			return (mdhsperror(&mrp->mde, MDE_INVAL_HSP,
   2281 			    mrp->params.hsp_id));
   2282 		}
   2283 		inc_new_hsp = 1;
   2284 		irecid++;
   2285 	}
   2286 
   2287 	if (un->un_hsp_id != -1) {
   2288 		/* decrement the reference count of the old hsp */
   2289 		err = md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
   2290 		    &recids[irecid], NULL, NULL, NULL);
   2291 		if (err) {
   2292 			err = mdhsperror(&mrp->mde, MDE_INVAL_HSP,
   2293 			    mrp->params.hsp_id);
   2294 			if (inc_new_hsp) {
   2295 				(void) md_hot_spare_ifc(HSP_DECREF,
   2296 				    mrp->params.hsp_id, 0, 0,
   2297 				    &recids[0], NULL, NULL, NULL);
   2298 				/*
   2299 				 * Don't need to commit the record,
   2300 				 * because it wasn't committed before
   2301 				 */
   2302 			}
   2303 			return (err);
   2304 		}
   2305 	}
   2306 
   2307 	un->un_hsp_id = mrp->params.hsp_id;
   2308 
   2309 	raid_commit(un, recids);
   2310 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
   2311 	    MD_UN2SET(un), MD_SID(un));
   2312 
   2313 	/* Now trigger hot spare processing in case one is needed. */
   2314 	if ((un->un_hsp_id != -1) && (un->un_state == RUS_ERRED))
   2315 		(void) raid_hotspares();
   2316 
   2317 	return (0);
   2318 }
   2319 
   2320 /*
   2321  * NAME:	raid_admin_ioctl
   2322  * DESCRIPTION: IOCTL operations unique to metadevices and RAID
   2323  * PARAMETERS:	int	  cmd - IOCTL command to be executed
   2324  *		void	*data - pointer to IOCTL data structure
   2325  *		int	 mode - either FREAD or FWRITE
   2326  *		IOLOCK *lockp - IOCTL read/write lock
   2327  *
   2328  * LOCKS:	none
   2329  *
   2330  */
   2331 static int
   2332 raid_admin_ioctl(
   2333 	int		cmd,
   2334 	void		*data,
   2335 	int		mode,
   2336 	IOLOCK		*lockp
   2337 )
   2338 {
   2339 	size_t		sz = 0;
   2340 	void		*d = NULL;
   2341 	int		err = 0;
   2342 
   2343 	/* We can only handle 32-bit clients for internal commands */
   2344 	if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) {
   2345 		return (EINVAL);
   2346 	}
   2347 
   2348 
   2349 	/* dispatch ioctl */
   2350 	switch (cmd) {
   2351 
   2352 	case MD_IOCSET:
   2353 	{
   2354 		if (! (mode & FWRITE))
   2355 			return (EACCES);
   2356 
   2357 		sz = sizeof (md_set_params_t);
   2358 		d = kmem_alloc(sz, KM_SLEEP);
   2359 
   2360 		if (ddi_copyin(data, d, sz, mode)) {
   2361 			err = EFAULT;
   2362 			break;
   2363 		}
   2364 
   2365 		err = raid_set(d, mode);
   2366 		break;
   2367 	}
   2368 
   2369 	case MD_IOCGET:
   2370 	{
   2371 		if (! (mode & FREAD))
   2372 			return (EACCES);
   2373 
   2374 		sz = sizeof (md_i_get_t);
   2375 		d = kmem_alloc(sz, KM_SLEEP);
   2376 
   2377 		if (ddi_copyin(data, d, sz, mode)) {
   2378 			err = EFAULT;
   2379 			break;
   2380 		}
   2381 
   2382 		err = raid_get(d, mode, lockp);
   2383 		break;
   2384 	}
   2385 
   2386 	case MD_IOCREPLACE:
   2387 	{
   2388 		if (! (mode & FWRITE))
   2389 			return (EACCES);
   2390 
   2391 		sz = sizeof (replace_params_t);
   2392 		d = kmem_alloc(sz, KM_SLEEP);
   2393 
   2394 		if (ddi_copyin(data, d, sz, mode)) {
   2395 			err = EFAULT;
   2396 			break;
   2397 		}
   2398 
   2399 		err = raid_replace((replace_params_t *)d, lockp);
   2400 		break;
   2401 	}
   2402 
   2403 	case MD_IOCSETSYNC:
   2404 	{
   2405 		if (! (mode & FWRITE))
   2406 			return (EACCES);
   2407 
   2408 		sz = sizeof (md_resync_ioctl_t);
   2409 		d = kmem_alloc(sz, KM_SLEEP);
   2410 
   2411 		if (ddi_copyin(data, d, sz, mode)) {
   2412 			err = EFAULT;
   2413 			break;
   2414 		}
   2415 
   2416 		err = raid_set_sync((md_resync_ioctl_t *)d, lockp);
   2417 		break;
   2418 	}
   2419 
   2420 	case MD_IOCGETSYNC:
   2421 	{
   2422 		if (! (mode & FREAD))
   2423 			return (EACCES);
   2424 
   2425 		sz = sizeof (md_resync_ioctl_t);
   2426 		d = kmem_alloc(sz, KM_SLEEP);
   2427 
   2428 		if (ddi_copyin(data, d, sz, mode)) {
   2429 			err = EFAULT;
   2430 			break;
   2431 		}
   2432 		err = raid_get_resync((md_resync_ioctl_t *)d, lockp);
   2433 
   2434 		break;
   2435 	}
   2436 
   2437 	case MD_IOCGROW:
   2438 	{
   2439 		if (! (mode & FWRITE))
   2440 			return (EACCES);
   2441 
   2442 		sz = sizeof (md_grow_params_t);
   2443 		d = kmem_alloc(sz, KM_SLEEP);
   2444 
   2445 		if (ddi_copyin(data, d, sz, mode)) {
   2446 			err = EFAULT;
   2447 			break;
   2448 		}
   2449 
   2450 		err = raid_grow(d, mode, lockp);
   2451 		break;
   2452 	}
   2453 
   2454 	case MD_IOCCHANGE:
   2455 	{
   2456 		if (! (mode & FWRITE))
   2457 			return (EACCES);
   2458 
   2459 		sz = sizeof (md_raid_params_t);
   2460 		d = kmem_alloc(sz, KM_SLEEP);
   2461 
   2462 		if (ddi_copyin(data, d, sz, mode)) {
   2463 			err = EFAULT;
   2464 			break;
   2465 		}
   2466 
   2467 		err = raid_change((md_raid_params_t *)d, lockp);
   2468 		break;
   2469 	}
   2470 
   2471 	case MD_IOCRESET:
   2472 	{
   2473 		if (! (mode & FWRITE))
   2474 			return (EACCES);
   2475 
   2476 		sz = sizeof (md_i_reset_t);
   2477 		d = kmem_alloc(sz, KM_SLEEP);
   2478 
   2479 		if (ddi_copyin(data, d, sz, mode)) {
   2480 			err = EFAULT;
   2481 			break;
   2482 		}
   2483 
   2484 		err = raid_reset((md_i_reset_t *)d);
   2485 		break;
   2486 	}
   2487 
   2488 	case MD_IOCGET_DEVS:
   2489 	{
   2490 		if (! (mode & FREAD))
   2491 			return (EACCES);
   2492 
   2493 		sz = sizeof (md_getdevs_params_t);
   2494 		d = kmem_alloc(sz, KM_SLEEP);
   2495 
   2496 		if (ddi_copyin(data, d, sz, mode)) {
   2497 			err = EFAULT;
   2498 			break;
   2499 		}
   2500 
   2501 		err = raid_getdevs(d, mode, lockp);
   2502 		break;
   2503 	}
   2504 
   2505 	case MD_IOCSETREGEN:
   2506 	{
   2507 		if (! (mode & FWRITE))
   2508 			return (EACCES);
   2509 
   2510 		sz = sizeof (md_regen_param_t);
   2511 		d = kmem_alloc(sz, KM_SLEEP);
   2512 
   2513 		if (ddi_copyin(data, d, sz, mode)) {
   2514 			err = EFAULT;
   2515 			break;
   2516 		}
   2517 
   2518 		err = raid_regen((md_regen_param_t *)d, lockp);
   2519 		break;
   2520 	}
   2521 
   2522 	case MD_IOCPROBE_DEV:
   2523 	{
   2524 		md_probedev_impl_t	*p = NULL;
   2525 		md_probedev_t		*ph = NULL;
   2526 		daemon_queue_t		*hdr = NULL;
   2527 		int			i;
   2528 		size_t			sz1 = 0;
   2529 
   2530 
   2531 		if (! (mode & FREAD))
   2532 			return (EACCES);
   2533 
   2534 		sz = sizeof (md_probedev_t);
   2535 
   2536 		d = kmem_alloc(sz, KM_SLEEP);
   2537 
   2538 		/* now copy in the data */
   2539 		if (ddi_copyin(data, d, sz, mode)) {
   2540 			err = EFAULT;
   2541 			goto free_mem;
   2542 		}
   2543 
   2544 		/*
   2545 		 * Sanity test the args. Test name should have the keyword
   2546 		 * probe.
   2547 		 */
   2548 		p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP);
   2549 		p->probe_sema = NULL;
   2550 		p->probe_mx = NULL;
   2551 		p->probe.mnum_list = (uint64_t)NULL;
   2552 
   2553 		ph = (md_probedev_t *)d;
   2554 		p->probe.nmdevs = ph->nmdevs;
   2555 		(void) strcpy(p->probe.test_name, ph->test_name);
   2556 		bcopy(&ph->md_driver, &(p->probe.md_driver),
   2557 		    sizeof (md_driver_t));
   2558 
   2559 		if ((p->probe.nmdevs < 1) ||
   2560 		    (strstr(p->probe.test_name, "probe") == NULL)) {
   2561 			err = EINVAL;
   2562 			goto free_mem;
   2563 		}
   2564 
   2565 		sz1 = sizeof (minor_t) * p->probe.nmdevs;
   2566 
   2567 		p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz1,
   2568 		    KM_SLEEP);
   2569 
   2570 		if (ddi_copyin((caddr_t)(uintptr_t)ph->mnum_list,
   2571 		    (caddr_t)(uintptr_t)p->probe.mnum_list, sz1, mode)) {
   2572 			err = EFAULT;
   2573 			goto free_mem;
   2574 		}
   2575 
   2576 		if (err = md_init_probereq(p, &hdr))
   2577 			goto free_mem;
   2578 
   2579 		/*
   2580 		 * put the request on the queue and wait.
   2581 		 */
   2582 
   2583 		daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW);
   2584 
   2585 		(void) IOLOCK_RETURN(0, lockp);
   2586 		/* wait for the events to occur */
   2587 		for (i = 0; i < p->probe.nmdevs; i++) {
   2588 			sema_p(PROBE_SEMA(p));
   2589 		}
   2590 		while (md_ioctl_lock_enter() == EINTR)
   2591 			;
   2592 
   2593 		/*
   2594 		 * clean up. The hdr list is freed in the probe routines
   2595 		 * since the list is NULL by the time we get here.
   2596 		 */
   2597 free_mem:
   2598 		if (p) {
   2599 			if (p->probe_sema != NULL) {
   2600 				sema_destroy(PROBE_SEMA(p));
   2601 				kmem_free(p->probe_sema, sizeof (ksema_t));
   2602 			}
   2603 			if (p->probe_mx != NULL) {
   2604 				mutex_destroy(PROBE_MX(p));
   2605 				kmem_free(p->probe_mx, sizeof (kmutex_t));
   2606 			}
   2607 			if (p->probe.mnum_list)
   2608 				kmem_free((caddr_t)(uintptr_t)
   2609 				    p->probe.mnum_list, sz1);
   2610 
   2611 			kmem_free(p, sizeof (md_probedev_impl_t));
   2612 		}
   2613 		break;
   2614 	}
   2615 
   2616 	default:
   2617 		return (ENOTTY);
   2618 	}
   2619 
   2620 	/*
   2621 	 * copyout and free any args
   2622 	 */
   2623 	if (sz != 0) {
   2624 		if (err == 0) {
   2625 			if (ddi_copyout(d, data, sz, mode) != 0) {
   2626 				err = EFAULT;
   2627 			}
   2628 		}
   2629 		kmem_free(d, sz);
   2630 	}
   2631 	return (err);
   2632 }
   2633 
   2634 /*
   2635  * NAME:	md_raid_ioctl
   2636  * DESCRIPTION: RAID metadevice IOCTL operations entry point.
   2637  * PARAMETERS:	md_dev64_t dev - RAID device identifier
   2638  *		int	  cmd  - IOCTL command to be executed
   2639  *		void	*data  - pointer to IOCTL data structure
   2640  *		int	 mode  - either FREAD or FWRITE
   2641  *		IOLOCK *lockp  - IOCTL read/write lock
   2642  *
   2643  * LOCKS:	none
   2644  *
   2645  */
   2646 int
   2647 md_raid_ioctl(
   2648 	dev_t		dev,
   2649 	int		cmd,
   2650 	void		*data,
   2651 	int		mode,
   2652 	IOLOCK		*lockp
   2653 )
   2654 {
   2655 	minor_t		mnum = getminor(dev);
   2656 	mr_unit_t	*un;
   2657 	int		err = 0;
   2658 
   2659 	/* handle admin ioctls */
   2660 	if (mnum == MD_ADM_MINOR)
   2661 		return (raid_admin_ioctl(cmd, data, mode, lockp));
   2662 
   2663 	/* check unit */
   2664 	if ((MD_MIN2SET(mnum) >= md_nsets) ||
   2665 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
   2666 	    ((un = MD_UNIT(mnum)) == NULL))
   2667 		return (ENXIO);
   2668 
   2669 	/* is this a supported ioctl? */
   2670 	err = md_check_ioctl_against_unit(cmd, un->c);
   2671 	if (err != 0) {
   2672 		return (err);
   2673 	}
   2674 
   2675 	/* dispatch ioctl */
   2676 	switch (cmd) {
   2677 
   2678 	case DKIOCINFO:
   2679 	{
   2680 		struct dk_cinfo *p;
   2681 
   2682 		if (! (mode & FREAD))
   2683 			return (EACCES);
   2684 
   2685 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
   2686 
   2687 		get_info(p, mnum);
   2688 		if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0)
   2689 			err = EFAULT;
   2690 
   2691 		kmem_free(p, sizeof (*p));
   2692 		return (err);
   2693 	}
   2694 
   2695 	case DKIOCGMEDIAINFO:
   2696 	{
   2697 		struct dk_minfo	p;
   2698 
   2699 		if (! (mode & FREAD))
   2700 			return (EACCES);
   2701 
   2702 		get_minfo(&p, mnum);
   2703 		if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0)
   2704 			err = EFAULT;
   2705 
   2706 		return (err);
   2707 	}
   2708 
   2709 	case DKIOCGGEOM:
   2710 	{
   2711 		struct dk_geom	*p;
   2712 
   2713 		if (! (mode & FREAD))
   2714 			return (EACCES);
   2715 
   2716 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
   2717 
   2718 		if ((err = raid_get_geom(un, p)) == 0) {
   2719 			if (ddi_copyout((caddr_t)p, data, sizeof (*p),
   2720 			    mode) != 0)
   2721 				err = EFAULT;
   2722 		}
   2723 
   2724 		kmem_free(p, sizeof (*p));
   2725 		return (err);
   2726 	}
   2727 
   2728 	case DKIOCGVTOC:
   2729 	{
   2730 		struct vtoc	*vtoc;
   2731 
   2732 		if (! (mode & FREAD))
   2733 			return (EACCES);
   2734 
   2735 		vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
   2736 		if ((err = raid_get_vtoc(un, vtoc)) != 0) {
   2737 			kmem_free(vtoc, sizeof (*vtoc));
   2738 			return (err);
   2739 		}
   2740 
   2741 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
   2742 			if (ddi_copyout(vtoc, data, sizeof (*vtoc), mode))
   2743 				err = EFAULT;
   2744 		}
   2745 #ifdef _SYSCALL32
   2746 		else {
   2747 			struct vtoc32	*vtoc32;
   2748 
   2749 			vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
   2750 
   2751 			vtoctovtoc32((*vtoc), (*vtoc32));
   2752 			if (ddi_copyout(vtoc32, data, sizeof (*vtoc32), mode))
   2753 				err = EFAULT;
   2754 			kmem_free(vtoc32, sizeof (*vtoc32));
   2755 		}
   2756 #endif /* _SYSCALL32 */
   2757 
   2758 		kmem_free(vtoc, sizeof (*vtoc));
   2759 		return (err);
   2760 	}
   2761 
   2762 	case DKIOCSVTOC:
   2763 	{
   2764 		struct vtoc	*vtoc;
   2765 
   2766 		if (! (mode & FWRITE))
   2767 			return (EACCES);
   2768 
   2769 		vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
   2770 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
   2771 			if (ddi_copyin(data, vtoc, sizeof (*vtoc), mode)) {
   2772 				err = EFAULT;
   2773 			}
   2774 		}
   2775 #ifdef _SYSCALL32
   2776 		else {
   2777 			struct vtoc32	*vtoc32;
   2778 
   2779 			vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
   2780 
   2781 			if (ddi_copyin(data, vtoc32, sizeof (*vtoc32), mode)) {
   2782 				err = EFAULT;
   2783 			} else {
   2784 				vtoc32tovtoc((*vtoc32), (*vtoc));
   2785 			}
   2786 			kmem_free(vtoc32, sizeof (*vtoc32));
   2787 		}
   2788 #endif /* _SYSCALL32 */
   2789 
   2790 		if (err == 0)
   2791 			err = raid_set_vtoc(un, vtoc);
   2792 
   2793 		kmem_free(vtoc, sizeof (*vtoc));
   2794 		return (err);
   2795 	}
   2796 
   2797 	case DKIOCGEXTVTOC:
   2798 	{
   2799 		struct extvtoc	*extvtoc;
   2800 
   2801 		if (! (mode & FREAD))
   2802 			return (EACCES);
   2803 
   2804 		extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
   2805 		if ((err = raid_get_extvtoc(un, extvtoc)) != 0) {
   2806 			kmem_free(extvtoc, sizeof (*extvtoc));
   2807 			return (err);
   2808 		}
   2809 
   2810 		if (ddi_copyout(extvtoc, data, sizeof (*extvtoc), mode))
   2811 			err = EFAULT;
   2812 
   2813 		kmem_free(extvtoc, sizeof (*extvtoc));
   2814 		return (err);
   2815 	}
   2816 
   2817 	case DKIOCSEXTVTOC:
   2818 	{
   2819 		struct extvtoc	*extvtoc;
   2820 
   2821 		if (! (mode & FWRITE))
   2822 			return (EACCES);
   2823 
   2824 		extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
   2825 		if (ddi_copyin(data, extvtoc, sizeof (*extvtoc), mode)) {
   2826 			err = EFAULT;
   2827 		}
   2828 
   2829 		if (err == 0)
   2830 			err = raid_set_extvtoc(un, extvtoc);
   2831 
   2832 		kmem_free(extvtoc, sizeof (*extvtoc));
   2833 		return (err);
   2834 	}
   2835 
   2836 	case DKIOCGAPART:
   2837 	{
   2838 		struct dk_map	dmp;
   2839 
   2840 		if ((err = raid_get_cgapart(un, &dmp)) != 0) {
   2841 			return (err);
   2842 		}
   2843 
   2844 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
   2845 			if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp),
   2846 			    mode) != 0)
   2847 				err = EFAULT;
   2848 		}
   2849 #ifdef _SYSCALL32
   2850 		else {
   2851 			struct dk_map32 dmp32;
   2852 
   2853 			dmp32.dkl_cylno = dmp.dkl_cylno;
   2854 			dmp32.dkl_nblk = dmp.dkl_nblk;
   2855 
   2856 			if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32),
   2857 			    mode) != 0)
   2858 				err = EFAULT;
   2859 		}
   2860 #endif /* _SYSCALL32 */
   2861 
   2862 		return (err);
   2863 	}
   2864 	case DKIOCGETEFI:
   2865 	{
   2866 		/*
   2867 		 * This one can be done centralized,
   2868 		 * no need to put in the same code for all types of metadevices
   2869 		 */
   2870 		return (md_dkiocgetefi(mnum, data, mode));
   2871 	}
   2872 
   2873 	case DKIOCSETEFI:
   2874 	{
   2875 		/*
   2876 		 * This one can be done centralized,
   2877 		 * no need to put in the same code for all types of metadevices
   2878 		 */
   2879 		return (md_dkiocsetefi(mnum, data, mode));
   2880 	}
   2881 
   2882 	case DKIOCPARTITION:
   2883 	{
   2884 		return (md_dkiocpartition(mnum, data, mode));
   2885 	}
   2886 
   2887 	default:
   2888 		return (ENOTTY);
   2889 	}
   2890 }
   2891 
   2892 /*
   2893  * rename/exchange named service entry points and support functions follow.
   2894  * Most functions are handled generically, except for raid-specific locking
   2895  * and checking
   2896  */
   2897 
   2898 /*
   2899  * NAME:	raid_may_renexch_self
   2900  * DESCRIPTION: support routine for rename check ("MDRNM_CHECK") named service
   2901  * PARAMETERS:	mr_unit_t	*un - unit struct of raid unit to be renamed
   2902  *		mdi_unit_t	*ui - in-core unit struct of same raid unit
   2903  *		md_rentxn_t	*rtxnp - rename transaction state
   2904  *
   2905  * LOCKS:	none
   2906  *
   2907  */
   2908 static int
   2909 raid_may_renexch_self(
   2910 	mr_unit_t	*un,
   2911 	mdi_unit_t	*ui,
   2912 	md_rentxn_t	*rtxnp)
   2913 {
   2914 	minor_t	from_min;
   2915 	minor_t	to_min;
   2916 	bool_t	toplevel;
   2917 	bool_t	related;
   2918 
   2919 	from_min = rtxnp->from.mnum;
   2920 	to_min = rtxnp->to.mnum;
   2921 
   2922 	if (!un || !ui) {
   2923 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
   2924 		    from_min);
   2925 		return (EINVAL);
   2926 	}
   2927 
   2928 	ASSERT(!(MD_CAPAB(un) & MD_CAN_META_CHILD));
   2929 	if (MD_CAPAB(un) & MD_CAN_META_CHILD) {
   2930 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
   2931 		return (EINVAL);
   2932 	}
   2933 
   2934 	if (MD_PARENT(un) == MD_MULTI_PARENT) {
   2935 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
   2936 		return (EINVAL);
   2937 	}
   2938 
   2939 	toplevel = !MD_HAS_PARENT(MD_PARENT(un));
   2940 
   2941 	/* we're related if trying to swap with our parent */
   2942 	related = (!toplevel) && (MD_PARENT(un) == to_min);
   2943 
   2944 	switch (rtxnp->op) {
   2945 	case MDRNOP_EXCHANGE:
   2946 
   2947 		if (!related) {
   2948 			(void) mdmderror(&rtxnp->mde,
   2949 			    MDE_RENAME_TARGET_UNRELATED, to_min);
   2950 			return (EINVAL);
   2951 		}
   2952 
   2953 		break;
   2954 
   2955 	case MDRNOP_RENAME:
   2956 		/*
   2957 		 * if from is top-level and is open, then the kernel is using
   2958 		 * the md_dev64_t.
   2959 		 */
   2960 
   2961 		if (toplevel && md_unit_isopen(ui)) {
   2962 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
   2963 			    from_min);
   2964 			return (EBUSY);
   2965 		}
   2966 		break;
   2967 
   2968 	default:
   2969 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
   2970 		    from_min);
   2971 		return (EINVAL);
   2972 	}
   2973 
   2974 	return (0);	/* ok */
   2975 }
   2976 
   2977 /*
   2978  * NAME:	raid_rename_check
   2979  * DESCRIPTION: ("MDRNM_CHECK") rename/exchange named service entry point
   2980  * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
   2981  *					 raid device for rename transaction
   2982  *		md_rentxn_t	*rtxnp - rename transaction state
   2983  *
   2984  * LOCKS:	none
   2985  *
   2986  */
   2987 intptr_t
   2988 raid_rename_check(
   2989 	md_rendelta_t	*delta,
   2990 	md_rentxn_t	*rtxnp)
   2991 {
   2992 	int		 err	= 0;
   2993 	int		 column;
   2994 	mr_unit_t	*un;
   2995 
   2996 	ASSERT(delta);
   2997 	ASSERT(rtxnp);
   2998 	ASSERT(delta->unp);
   2999 	ASSERT(delta->uip);
   3000 
   3001 	if (!delta || !rtxnp || !delta->unp || !delta->uip) {
   3002 		(void) mdsyserror(&rtxnp->mde, EINVAL);
   3003 		return (EINVAL);
   3004 	}
   3005 
   3006 	un = (mr_unit_t *)delta->unp;
   3007 
   3008 	for (column = 0; column < un->un_totalcolumncnt; column++) {
   3009 		rcs_state_t	colstate;
   3010 
   3011 		colstate = un->un_column[column].un_devstate;
   3012 
   3013 		if (colstate & RCS_LAST_ERRED) {
   3014 			(void) mdmderror(&rtxnp->mde, MDE_RAID_LAST_ERRED,
   3015 			    md_getminor(delta->dev));
   3016 			return (EINVAL);
   3017 		}
   3018 
   3019 		if (colstate & RCS_INIT_ERRED) {
   3020 			(void) mdmderror(&rtxnp->mde, MDE_RAID_DOI,
   3021 			    md_getminor(delta->dev));
   3022 			return (EINVAL);
   3023 		}
   3024 
   3025 		/* How did we get this far before detecting this? */
   3026 		if (colstate & RCS_RESYNC) {
   3027 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
   3028 			    md_getminor(delta->dev));
   3029 			return (EBUSY);
   3030 		}
   3031 
   3032 		if (colstate & RCS_ERRED) {
   3033 			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
   3034 			    md_getminor(delta->dev));
   3035 			return (EINVAL);
   3036 		}
   3037 
   3038 		if (!(colstate & RCS_OKAY)) {
   3039 			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
   3040 			    md_getminor(delta->dev));
   3041 			return (EINVAL);
   3042 		}
   3043 
   3044 		if (HOTSPARED(un, column)) {
   3045 			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
   3046 			    md_getminor(delta->dev));
   3047 			return (EINVAL);
   3048 		}
   3049 	}
   3050 
   3051 	/* self does additional checks */
   3052 	if (delta->old_role == MDRR_SELF) {
   3053 		err = raid_may_renexch_self((mr_unit_t *)delta->unp,
   3054 		    delta->uip, rtxnp);
   3055 	}
   3056 	return (err);
   3057 }
   3058 
   3059 /*
   3060  * NAME:	raid_rename_lock
   3061  * DESCRIPTION: ("MDRNM_LOCK") rename/exchange named service entry point
   3062  * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
   3063  *					 raid device for rename transaction
   3064  *		md_rentxn_t	*rtxnp - rename transaction state
   3065  *
   3066  * LOCKS:	io and unit locks (taken explicitly *not* via ioctl wrappers)
   3067  *
   3068  */
   3069 intptr_t
   3070 raid_rename_lock(
   3071 	md_rendelta_t	*delta,
   3072 	md_rentxn_t	*rtxnp)
   3073 {
   3074 	minor_t		mnum;
   3075 
   3076 	ASSERT(delta);
   3077 	ASSERT(rtxnp);
   3078 
   3079 	mnum = md_getminor(delta->dev);
   3080 	if (mnum == rtxnp->to.mnum && rtxnp->op == MDRNOP_RENAME) {
   3081 		return (0);
   3082 	}
   3083 
   3084 	ASSERT(delta->uip);
   3085 	if (!delta->uip) {
   3086 		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
   3087 		return (ENODEV);
   3088 	}
   3089 
   3090 	ASSERT(delta->unp);
   3091 	if (!delta->unp) {
   3092 
   3093 		return (ENODEV);
   3094 	}
   3095 
   3096 	ASSERT(!IO_WRITER_HELD(delta->unp));
   3097 	(void) md_io_writerlock(delta->uip);
   3098 	ASSERT(IO_WRITER_HELD(delta->unp));
   3099 
   3100 
   3101 	ASSERT(!UNIT_WRITER_HELD(delta->unp));
   3102 	(void) md_unit_writerlock(delta->uip);
   3103 	ASSERT(UNIT_WRITER_HELD(delta->unp));
   3104 
   3105 	return (0);
   3106 }
   3107 
   3108 /*
   3109  * NAME:	raid_rename_unlock
   3110  * DESCRIPTION: ("MDRNM_UNLOCK") rename/exchange named service entry point
   3111  * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
   3112  *					 raid device for rename transaction
   3113  *		md_rentxn_t	*rtxnp - rename transaction state
   3114  *
   3115  * LOCKS:	drops io and unit locks
   3116  *
   3117  */
   3118 /* ARGSUSED */
   3119 void
   3120 raid_rename_unlock(
   3121 	md_rendelta_t	*delta,
   3122 	md_rentxn_t	*rtxnp)
   3123 {
   3124 	mr_unit_t	*un = (mr_unit_t *)delta->unp;
   3125 	minor_t		mnum = MD_SID(un);
   3126 	int		col;
   3127 
   3128 	ASSERT(delta);
   3129 	ASSERT(delta->unp);
   3130 	ASSERT(delta->uip);
   3131 
   3132 	ASSERT(UNIT_WRITER_HELD(delta->unp));
   3133 	md_unit_writerexit(delta->uip);
   3134 	ASSERT(!UNIT_WRITER_HELD(delta->unp));
   3135 
   3136 	if (! (delta->txn_stat.role_swapped) || ! (delta->txn_stat.is_open)) {
   3137 		goto out;
   3138 	}
   3139 	if (raid_internal_open(mnum, (FREAD | FWRITE),
   3140 	    OTYP_LYR, MD_OFLG_ISINIT) == 0) {
   3141 		for (col = 0; col < un->un_totalcolumncnt; col++) {
   3142 			if (un->un_column[col].un_devstate & RCS_OKAY)
   3143 				(void) init_pw_area(un,
   3144 				    un->un_column[col].un_dev,
   3145 				    un->un_column[col].un_pwstart, col);
   3146 		}
   3147 		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
   3148 	}
   3149 
   3150 out:
   3151 	ASSERT(IO_WRITER_HELD(delta->unp));
   3152 	md_io_writerexit(delta->uip);
   3153 	ASSERT(!IO_WRITER_HELD(delta->unp));
   3154 }
   3155 /* end of rename/exchange named service and support functions */
   3156