Home | History | Annotate | Download | only in raid
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * NAME:	raid.c
     29  *
     30  * DESCRIPTION: Main RAID driver source file containing open, close and I/O
     31  *		operations.
     32  *
     33  * ROUTINES PROVIDED FOR EXTERNAL USE:
     34  *  raid_open()			- open the RAID metadevice for access.
     35  *  raid_internal_open()	- internal open routine of RAID metadevice.
     36  *  md_raid_strategy()		- perform normal I/O operations,
     37  *				    such as read and write.
     38  *  raid_close()		- close the RAID metadevice.
     39  *  raid_internal_close()	- internal close routine of RAID metadevice.
     40  *  raid_snarf()		- initialize and clean up MDD records.
     41  *  raid_halt()			- reset the RAID metadevice
     42  *  raid_line()			- return the line # of this segment
     43  *  raid_dcolumn()		- return the data column # of this segment
     44  *  raid_pcolumn()		- return the parity column # of this segment
     45  */
     46 
     47 #include <sys/param.h>
     48 #include <sys/systm.h>
     49 #include <sys/conf.h>
     50 #include <sys/file.h>
     51 #include <sys/user.h>
     52 #include <sys/uio.h>
     53 #include <sys/t_lock.h>
     54 #include <sys/buf.h>
     55 #include <sys/dkio.h>
     56 #include <sys/vtoc.h>
     57 #include <sys/kmem.h>
     58 #include <vm/page.h>
     59 #include <sys/cmn_err.h>
     60 #include <sys/sysmacros.h>
     61 #include <sys/types.h>
     62 #include <sys/mkdev.h>
     63 #include <sys/stat.h>
     64 #include <sys/open.h>
     65 #include <sys/modctl.h>
     66 #include <sys/ddi.h>
     67 #include <sys/sunddi.h>
     68 #include <sys/debug.h>
     69 #include <sys/lvm/md_raid.h>
     70 #include <sys/lvm/mdvar.h>
     71 #include <sys/lvm/md_convert.h>
     72 
     73 #include <sys/sysevent/eventdefs.h>
     74 #include <sys/sysevent/svm.h>
     75 
     76 md_ops_t		raid_md_ops;
     77 #ifndef lint
     78 char			_depends_on[] = "drv/md";
     79 md_ops_t		*md_interface_ops = &raid_md_ops;
     80 #endif	/* lint */
     81 
     82 extern unit_t		md_nunits;
     83 extern unit_t		md_nsets;
     84 extern md_set_t		md_set[];
     85 extern int		md_status;
     86 extern major_t		md_major;
     87 extern mdq_anchor_t	md_done_daemon;
     88 extern mdq_anchor_t	md_mstr_daemon;
     89 extern int		md_sleep_for_test;
     90 extern clock_t		md_hz;
     91 
     92 extern md_event_queue_t	*md_event_queue;
     93 
     94 
     95 int pchunks		= 16;
     96 int phigh		= 1024;
     97 int plow		= 128;
     98 int cchunks		= 64;
     99 int chigh		= 1024;
    100 int clow		= 512;
    101 int bchunks		= 32;
    102 int bhigh		= 256;
    103 int blow		= 128;
    104 
    105 int raid_total_io		= 0;
    106 int raid_reads			= 0;
    107 int raid_writes			= 0;
    108 int raid_no_bpmaps		= 0;
    109 int raid_512			= 0;
    110 int raid_1024			= 0;
    111 int raid_1024_8192		= 0;
    112 int raid_8192			= 0;
    113 int raid_8192_bigger		= 0;
    114 int raid_line_lock_wait	= 0;
    115 
    116 int data_buffer_waits		= 0;
    117 int parity_buffer_waits	= 0;
    118 
    119 /* writer line locks */
    120 int raid_writer_locks		= 0; /* total writer locks */
    121 int raid_write_waits		= 0; /* total writer locks that waited */
    122 int raid_full_line_writes	= 0; /* total full line writes */
    123 int raid_write_queue_length	= 0; /* wait queue length */
    124 int raid_max_write_q_length	= 0; /* maximum queue length */
    125 int raid_write_locks_active	= 0; /* writer locks at any time */
    126 int raid_max_write_locks	= 0; /* maximum writer locks active */
    127 
    128 /* read line locks */
    129 int raid_reader_locks		= 0; /* total reader locks held */
    130 int raid_reader_locks_active	= 0; /* reader locks held */
    131 int raid_max_reader_locks	= 0; /* maximum reader locks held in run */
    132 int raid_read_overlaps		= 0; /* number of times 2 reads hit same line */
    133 int raid_read_waits		= 0; /* times a reader waited on writer */
    134 
    135 /* prewrite stats */
    136 int raid_prewrite_waits		= 0; /* number of waits for a pw slot */
    137 int raid_pw			= 0; /* number of pw slots in use */
    138 int raid_prewrite_max		= 0; /* maximum number of pw slots in use */
    139 int raid_pw_invalidates		= 0;
    140 
    141 static clock_t md_wr_wait	= 0;
    142 
    143 int nv_available	= 0; /* presence of nv-ram support in device */
    144 int nv_prewrite		= 1; /* mark prewrites with nv_available */
    145 int nv_parity		= 1; /* mark parity with nv_available */
    146 
    147 kmem_cache_t	*raid_parent_cache = NULL;
    148 kmem_cache_t	*raid_child_cache = NULL;
    149 kmem_cache_t	*raid_cbuf_cache = NULL;
    150 
    151 int			raid_internal_open(minor_t mnum, int flag, int otyp,
    152 			    int md_oflags);
    153 
    154 static void		freebuffers(md_raidcs_t *cs);
    155 static int		raid_read(mr_unit_t *un, md_raidcs_t *cs);
    156 static void		raid_read_io(mr_unit_t *un, md_raidcs_t *cs);
    157 static int		raid_write(mr_unit_t *un, md_raidcs_t *cs);
    158 static void		raid_write_io(mr_unit_t *un, md_raidcs_t *cs);
    159 static void		raid_stage(md_raidcs_t *cs);
    160 static void		raid_enqueue(md_raidcs_t *cs);
    161 static diskaddr_t	raid_line(diskaddr_t segment, mr_unit_t *un);
    162 uint_t			raid_dcolumn(diskaddr_t segment, mr_unit_t *un);
    163 static void		getpbuffer(md_raidcs_t *cs);
    164 static void		getdbuffer(md_raidcs_t *cs);
    165 static void		raid_done(buf_t *bp);
    166 static void		raid_io_startup(mr_unit_t *un);
    167 
    168 static rus_state_t
    169 raid_col2unit(rcs_state_t state, rus_state_t unitstate)
    170 {
    171 	switch (state) {
    172 	case RCS_INIT:
    173 		return (RUS_INIT);
    174 	case RCS_OKAY:
    175 		return (RUS_OKAY);
    176 	case RCS_RESYNC:
    177 		if (unitstate & RUS_LAST_ERRED)
    178 			return (RUS_LAST_ERRED);
    179 		else
    180 			return (RUS_ERRED);
    181 	case RCS_ERRED:
    182 		return (RUS_ERRED);
    183 	case RCS_LAST_ERRED:
    184 		return (RUS_ERRED);
    185 	default:
    186 		break;
    187 	}
    188 	panic("raid_col2unit");
    189 	/*NOTREACHED*/
    190 }
    191 
    192 void
    193 raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force)
    194 {
    195 
    196 	rus_state_t	unitstate, origstate;
    197 	rcs_state_t	colstate;
    198 	rcs_state_t	orig_colstate;
    199 	int		errcnt = 0, okaycnt = 0, resynccnt = 0;
    200 	int		i;
    201 	char		*devname;
    202 
    203 	ASSERT(un);
    204 	ASSERT(col < un->un_totalcolumncnt);
    205 	ASSERT(newstate &
    206 	    (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
    207 	    RCS_LAST_ERRED | RCS_REGEN));
    208 	ASSERT((newstate &
    209 	    ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
    210 	    RCS_LAST_ERRED | RCS_REGEN))
    211 	    == 0);
    212 
    213 	ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
    214 
    215 	unitstate = un->un_state;
    216 	origstate = unitstate;
    217 
    218 	if (force) {
    219 		un->un_column[col].un_devstate = newstate;
    220 		un->un_state = raid_col2unit(newstate, unitstate);
    221 		uniqtime32(&un->un_column[col].un_devtimestamp);
    222 		uniqtime32(&un->un_timestamp);
    223 		return;
    224 	}
    225 
    226 	ASSERT(un->un_state &
    227 	    (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED |
    228 	    RUS_REGEN));
    229 	ASSERT((un->un_state & ~(RUS_INIT |
    230 	    RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0);
    231 
    232 	if (un->un_column[col].un_devstate == newstate)
    233 		return;
    234 
    235 	if (newstate == RCS_REGEN) {
    236 		if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt)
    237 			return;
    238 		un->un_state = RUS_REGEN;
    239 		return;
    240 	}
    241 
    242 	orig_colstate = un->un_column[col].un_devstate;
    243 
    244 	/*
    245 	 * if there is another column in the error state then this
    246 	 * column should go to the last errored state
    247 	 */
    248 	for (i = 0; i < un->un_totalcolumncnt; i++) {
    249 		if (i == col)
    250 			colstate = newstate;
    251 		else
    252 			colstate = un->un_column[i].un_devstate;
    253 		if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED))
    254 			errcnt++;
    255 		if (colstate & RCS_OKAY)
    256 			okaycnt++;
    257 		if (colstate & RCS_RESYNC)
    258 			resynccnt++;
    259 	}
    260 	ASSERT(resynccnt < 2);
    261 
    262 	if (okaycnt == un->un_totalcolumncnt)
    263 		unitstate = RUS_OKAY;
    264 	else if (errcnt > 1) {
    265 		unitstate = RUS_LAST_ERRED;
    266 		if (newstate & RCS_ERRED)
    267 			newstate = RCS_LAST_ERRED;
    268 	} else if (errcnt == 1)
    269 		if (!(unitstate & RUS_LAST_ERRED))
    270 			unitstate = RUS_ERRED;
    271 
    272 	if (un->un_state == RUS_DOI)
    273 		unitstate = RUS_DOI;
    274 
    275 	un->un_column[col].un_devstate = newstate;
    276 	uniqtime32(&un->un_column[col].un_devtimestamp);
    277 	/*
    278 	 * if there are last errored column being brought back online
    279 	 * by open or snarf, then be sure to clear the RUS_LAST_ERRED
    280 	 * bit to allow writes.  If there is a real error then the
    281 	 * column will go back into last erred.
    282 	 */
    283 	if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) &&
    284 	    (raid_state_cnt(un, RCS_ERRED) == 1))
    285 		unitstate = RUS_ERRED;
    286 
    287 	un->un_state = unitstate;
    288 	uniqtime32(&un->un_timestamp);
    289 
    290 	if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) &&
    291 	    (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) {
    292 		devname = md_devname(MD_UN2SET(un),
    293 		    un->un_column[col].un_dev, NULL, 0);
    294 
    295 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
    296 		    md_shortname(MD_SID(un)), devname);
    297 
    298 		if (unitstate & RUS_LAST_ERRED) {
    299 			cmn_err(CE_WARN, "md: %s: %s last erred",
    300 			    md_shortname(MD_SID(un)), devname);
    301 
    302 		} else if (un->un_column[col].un_devflags &
    303 		    MD_RAID_DEV_ISOPEN) {
    304 			/*
    305 			 * Close the broken device and clear the open flag on
    306 			 * it.  We have to check that the device is open,
    307 			 * otherwise the first open on it has resulted in the
    308 			 * error that is being processed and the actual un_dev
    309 			 * will be NODEV64.
    310 			 */
    311 			md_layered_close(un->un_column[col].un_dev,
    312 			    MD_OFLG_NULL);
    313 			un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
    314 		}
    315 	} else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED &&
    316 	    un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) {
    317 		/*
    318 		 * Similar to logic above except no log messages since we
    319 		 * are just transitioning from Last Erred to Erred.
    320 		 */
    321 		md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL);
    322 		un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
    323 	}
    324 
    325 	/*
    326 	 * If a resync has completed, see if there is a Last Erred
    327 	 * component that we can change to the Erred state.
    328 	 */
    329 	if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) {
    330 		for (i = 0; i < un->un_totalcolumncnt; i++) {
    331 			if (i != col &&
    332 			    (un->un_column[i].un_devstate & RCS_LAST_ERRED)) {
    333 				raid_set_state(un, i, RCS_ERRED, 0);
    334 				break;
    335 			}
    336 		}
    337 	}
    338 }
    339 
    340 /*
    341  * NAME:	erred_check_line
    342  *
    343  * DESCRIPTION: Return the type of write to perform on an erred column based
    344  *		upon any resync activity.
    345  *
    346  *		if a column is being resynced and the write is above the
    347  *		resync point may have to write to the target being resynced.
    348  *
    349  *		Column state may make it impossible to do the write
    350  *		in which case RCL_EIO or RCL_ENXIO is returned.
    351  *
    352  *		If a column cannot be written directly, RCL_ERRED is
    353  *		returned and processing should proceed accordingly.
    354  *
    355  * PARAMETERS:	minor_t		 mnum - minor number identity of metadevice
    356  *		md_raidcs_t	 *cs - child save structure
    357  *		mr_column_t	 *dcolumn - pointer to data column structure
    358  *		mr_column_t	 *pcolumn - pointer to parity column structure
    359  *
    360  * RETURNS:	RCL_OKAY, RCL_ERRED
    361  *
    362  * LOCKS:	Expects Line Writer Lock and Unit Resource Lock to be held
    363  *		across call.
    364  */
    365 
    366 static int
    367 erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column)
    368 {
    369 
    370 	ASSERT(un != NULL);
    371 	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
    372 
    373 	if (column->un_devstate & RCS_OKAY)
    374 		return (RCL_OKAY);
    375 
    376 	if (column->un_devstate & RCS_ERRED)
    377 		return (RCL_ERRED);  /* do not read from errored disk */
    378 
    379 	/*
    380 	 * for the last errored case their are two considerations.
    381 	 * When the last errored column is the only errored column then
    382 	 * do treat it like a maintenance column, not doing I/O from
    383 	 * it.   When it there are other failures then just attempt
    384 	 * to use it.
    385 	 */
    386 	if (column->un_devstate & RCS_LAST_ERRED)
    387 		return (RCL_ERRED);
    388 
    389 	ASSERT(column->un_devstate & RCS_RESYNC);
    390 
    391 	/*
    392 	 * When a resync from a hotspare is being done (copy resync)
    393 	 * then always treat it as an OKAY column, since no regen
    394 	 * is required.
    395 	 */
    396 	if (column->un_devflags & MD_RAID_COPY_RESYNC) {
    397 		return (RCL_OKAY);
    398 	}
    399 
    400 	mutex_enter(&un->un_mx);
    401 	if (cs->cs_line < un->un_resync_line_index) {
    402 		mutex_exit(&un->un_mx);
    403 		return (RCL_OKAY);
    404 	}
    405 	mutex_exit(&un->un_mx);
    406 	return (RCL_ERRED);
    407 
    408 }
    409 
    410 /*
    411  * NAMES:	raid_state_cnt
    412  *
    413  * DESCRIPTION: counts number of column in a specific state
    414  *
    415  * PARAMETERS:	md_raid_t *un
    416  *		rcs_state state
    417  */
    418 int
    419 raid_state_cnt(mr_unit_t *un, rcs_state_t state)
    420 {
    421 	int	i, retval = 0;
    422 
    423 	for (i = 0; i < un->un_totalcolumncnt; i++)
    424 		if (un->un_column[i].un_devstate & state)
    425 			retval++;
    426 	return (retval);
    427 }
    428 
    429 /*
    430  * NAMES:	raid_io_overlaps
    431  *
    432  * DESCRIPTION: checkst for overlap of 2 child save structures
    433  *
    434  * PARAMETERS:	md_raidcs_t cs1
    435  *		md_raidcs_t cs2
    436  *
    437  * RETURNS:	0 - no overlap
    438  *		1 - overlap
    439  */
    440 int
    441 raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2)
    442 {
    443 	if (cs1->cs_blkno > cs2->cs_lastblk)
    444 		return (0);
    445 	if (cs1->cs_lastblk < cs2->cs_blkno)
    446 		return (0);
    447 	return (1);
    448 }
    449 
    450 /*
    451  * NAMES:	raid_parent_constructor
    452  * DESCRIPTION: parent structure constructor routine
    453  * PARAMETERS:
    454  */
    455 /*ARGSUSED1*/
    456 static int
    457 raid_parent_constructor(void *p, void *d1, int d2)
    458 {
    459 	mutex_init(&((md_raidps_t *)p)->ps_mx,
    460 	    NULL, MUTEX_DEFAULT, NULL);
    461 	mutex_init(&((md_raidps_t *)p)->ps_mapin_mx,
    462 	    NULL, MUTEX_DEFAULT, NULL);
    463 	return (0);
    464 }
    465 
    466 void
    467 raid_parent_init(md_raidps_t *ps)
    468 {
    469 	bzero(ps, offsetof(md_raidps_t, ps_mx));
    470 	((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE;
    471 	((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC;
    472 }
    473 
    474 /*ARGSUSED1*/
    475 static void
    476 raid_parent_destructor(void *p, void *d)
    477 {
    478 	mutex_destroy(&((md_raidps_t *)p)->ps_mx);
    479 	mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx);
    480 }
    481 
    482 /*
    483  * NAMES:	raid_child_constructor
    484  * DESCRIPTION: child structure constructor routine
    485  * PARAMETERS:
    486  */
    487 /*ARGSUSED1*/
    488 static int
    489 raid_child_constructor(void *p, void *d1, int d2)
    490 {
    491 	md_raidcs_t	*cs = (md_raidcs_t *)p;
    492 	mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL);
    493 	bioinit(&cs->cs_dbuf);
    494 	bioinit(&cs->cs_pbuf);
    495 	bioinit(&cs->cs_hbuf);
    496 	return (0);
    497 }
    498 
    499 void
    500 raid_child_init(md_raidcs_t *cs)
    501 {
    502 	bzero(cs, offsetof(md_raidcs_t, cs_mx));
    503 
    504 	md_bioreset(&cs->cs_dbuf);
    505 	md_bioreset(&cs->cs_pbuf);
    506 	md_bioreset(&cs->cs_hbuf);
    507 
    508 	((md_raidcs_t *)cs)->cs_dbuf.b_chain =
    509 	    ((md_raidcs_t *)cs)->cs_pbuf.b_chain =
    510 	    ((md_raidcs_t *)cs)->cs_hbuf.b_chain =
    511 	    (struct buf *)(cs);
    512 
    513 	cs->cs_magic = RAID_CSMAGIC;
    514 	cs->cs_line = MD_DISKADDR_ERROR;
    515 	cs->cs_dpwslot = -1;
    516 	cs->cs_ppwslot = -1;
    517 }
    518 
    519 /*ARGSUSED1*/
    520 static void
    521 raid_child_destructor(void *p, void *d)
    522 {
    523 	biofini(&((md_raidcs_t *)p)->cs_dbuf);
    524 	biofini(&((md_raidcs_t *)p)->cs_hbuf);
    525 	biofini(&((md_raidcs_t *)p)->cs_pbuf);
    526 	mutex_destroy(&((md_raidcs_t *)p)->cs_mx);
    527 }
    528 
    529 /*ARGSUSED1*/
    530 static int
    531 raid_cbuf_constructor(void *p, void *d1, int d2)
    532 {
    533 	bioinit(&((md_raidcbuf_t *)p)->cbuf_bp);
    534 	return (0);
    535 }
    536 
    537 static void
    538 raid_cbuf_init(md_raidcbuf_t *cb)
    539 {
    540 	bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp));
    541 	md_bioreset(&cb->cbuf_bp);
    542 	cb->cbuf_magic = RAID_BUFMAGIC;
    543 	cb->cbuf_pwslot = -1;
    544 	cb->cbuf_flags = CBUF_WRITE;
    545 }
    546 
    547 /*ARGSUSED1*/
    548 static void
    549 raid_cbuf_destructor(void *p, void *d)
    550 {
    551 	biofini(&((md_raidcbuf_t *)p)->cbuf_bp);
    552 }
    553 
    554 /*
    555  * NAMES:	raid_run_queue
    556  * DESCRIPTION: spawn a backend processing daemon for RAID metadevice.
    557  * PARAMETERS:
    558  */
    559 /*ARGSUSED*/
    560 static void
    561 raid_run_queue(void *d)
    562 {
    563 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
    564 		md_daemon(1, &md_done_daemon);
    565 }
    566 
    567 /*
    568  * NAME:	raid_build_pwslot
    569  * DESCRIPTION: builds mr_pw_reserve for the column
    570  * PARAMETERS:	un is the pointer to the unit structure
    571  *		colindex is the column to create the structure for
    572  */
    573 int
    574 raid_build_pw_reservation(mr_unit_t *un, int colindex)
    575 {
    576 	mr_pw_reserve_t	*pw;
    577 	mr_scoreboard_t	*sb;
    578 	int		i;
    579 
    580 	pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) +
    581 	    (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP);
    582 	pw->pw_magic = RAID_PWMAGIC;
    583 	pw->pw_column = colindex;
    584 	pw->pw_free = un->un_pwcnt;
    585 	sb = &pw->pw_sb[0];
    586 	for (i = 0; i < un->un_pwcnt; i++) {
    587 		sb[i].sb_column = colindex;
    588 		sb[i].sb_flags = SB_UNUSED;
    589 		sb[i].sb_start_blk = 0;
    590 		sb[i].sb_last_blk = 0;
    591 		sb[i].sb_cs = NULL;
    592 	}
    593 	un->un_column_ic[colindex].un_pw_reserve = pw;
    594 	return (0);
    595 }
    596 /*
    597  * NAME:	raid_free_pw_reservation
    598  * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine
    599  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
    600  *		int colindex  - index of the column whose pre-write slot struct
    601  *			is to be destroyed.
    602  */
    603 void
    604 raid_free_pw_reservation(mr_unit_t *un, int colindex)
    605 {
    606 	mr_pw_reserve_t	*pw = un->un_column_ic[colindex].un_pw_reserve;
    607 
    608 	kmem_free(pw, sizeof (mr_pw_reserve_t) +
    609 	    (sizeof (mr_scoreboard_t) * un->un_pwcnt));
    610 }
    611 
    612 /*
    613  * NAME:	raid_cancel_pwslot
    614  * DESCRIPTION: RAID metadevice write routine
    615  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
    616  */
    617 static void
    618 raid_cancel_pwslot(md_raidcs_t *cs)
    619 {
    620 	mr_unit_t		*un = cs->cs_un;
    621 	mr_pw_reserve_t		*pw;
    622 	mr_scoreboard_t		*sb;
    623 	mr_column_ic_t		*col;
    624 	md_raidcbuf_t		*cbuf;
    625 	int			broadcast = 0;
    626 
    627 	if (cs->cs_ps->ps_flags & MD_RPS_READ)
    628 		return;
    629 	if (cs->cs_dpwslot != -1) {
    630 		col = &un->un_column_ic[cs->cs_dcolumn];
    631 		pw = col->un_pw_reserve;
    632 		sb = &pw->pw_sb[cs->cs_dpwslot];
    633 		sb->sb_flags = SB_AVAIL;
    634 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
    635 			broadcast++;
    636 		sb->sb_cs = NULL;
    637 	}
    638 
    639 	if (cs->cs_ppwslot != -1) {
    640 		col = &un->un_column_ic[cs->cs_pcolumn];
    641 		pw = col->un_pw_reserve;
    642 		sb = &pw->pw_sb[cs->cs_ppwslot];
    643 		sb->sb_flags = SB_AVAIL;
    644 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
    645 			broadcast++;
    646 		sb->sb_cs = NULL;
    647 	}
    648 
    649 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
    650 		if (cbuf->cbuf_pwslot == -1)
    651 			continue;
    652 		col = &un->un_column_ic[cbuf->cbuf_column];
    653 		pw = col->un_pw_reserve;
    654 		sb = &pw->pw_sb[cbuf->cbuf_pwslot];
    655 		sb->sb_flags = SB_AVAIL;
    656 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
    657 			broadcast++;
    658 		sb->sb_cs = NULL;
    659 	}
    660 	if (broadcast) {
    661 		cv_broadcast(&un->un_cv);
    662 		return;
    663 	}
    664 	mutex_enter(&un->un_mx);
    665 	if (un->un_rflags & MD_RFLAG_NEEDPW)
    666 		cv_broadcast(&un->un_cv);
    667 	mutex_exit(&un->un_mx);
    668 }
    669 
    670 static void
    671 raid_free_pwinvalidate(md_raidcs_t *cs)
    672 {
    673 	md_raidcbuf_t		*cbuf;
    674 	md_raidcbuf_t		*cbuf_to_free;
    675 	mr_unit_t		*un = cs->cs_un;
    676 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
    677 	mr_pw_reserve_t		*pw;
    678 	mr_scoreboard_t		*sb;
    679 	int			broadcast = 0;
    680 
    681 	cbuf = cs->cs_pw_inval_list;
    682 	ASSERT(cbuf);
    683 	mutex_enter(&un->un_linlck_mx);
    684 	while (cbuf) {
    685 		pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve;
    686 		sb = &pw->pw_sb[0];
    687 		ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND);
    688 		sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED;
    689 		sb[cbuf->cbuf_pwslot].sb_cs = NULL;
    690 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
    691 			broadcast++;
    692 		cbuf_to_free = cbuf;
    693 		cbuf = cbuf->cbuf_next;
    694 		kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize));
    695 		kmem_cache_free(raid_cbuf_cache, cbuf_to_free);
    696 	}
    697 	cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL;
    698 	/*
    699 	 * now that there is a free prewrite slot, check to see if there
    700 	 * are any io operations waiting first wake up the raid_io_startup
    701 	 * then signal the the processes waiting in raid_write.
    702 	 */
    703 	if (ui->ui_io_lock->io_list_front)
    704 		raid_io_startup(un);
    705 	mutex_exit(&un->un_linlck_mx);
    706 	if (broadcast) {
    707 		cv_broadcast(&un->un_cv);
    708 		return;
    709 	}
    710 	mutex_enter(&un->un_mx);
    711 	if (un->un_rflags & MD_RFLAG_NEEDPW)
    712 		cv_broadcast(&un->un_cv);
    713 	mutex_exit(&un->un_mx);
    714 }
    715 
    716 
    717 static int
    718 raid_get_pwslot(md_raidcs_t *cs, int column)
    719 {
    720 	mr_scoreboard_t	*sb;
    721 	mr_pw_reserve_t	*pw;
    722 	mr_unit_t	*un = cs->cs_un;
    723 	diskaddr_t	start_blk = cs->cs_blkno;
    724 	diskaddr_t	last_blk = cs->cs_lastblk;
    725 	int		i;
    726 	int		pwcnt = un->un_pwcnt;
    727 	int		avail = -1;
    728 	int		use = -1;
    729 	int		flags;
    730 
    731 
    732 	/* start with the data column */
    733 	pw = cs->cs_un->un_column_ic[column].un_pw_reserve;
    734 	sb = &pw->pw_sb[0];
    735 	ASSERT(pw->pw_free > 0);
    736 	for (i = 0; i < pwcnt; i++) {
    737 		flags = sb[i].sb_flags;
    738 		if (flags & SB_INVAL_PEND)
    739 			continue;
    740 
    741 		if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED)))
    742 			avail = i;
    743 
    744 		if ((start_blk > sb[i].sb_last_blk) ||
    745 		    (last_blk < sb[i].sb_start_blk))
    746 			continue;
    747 
    748 		/* OVERLAP */
    749 		ASSERT(! (sb[i].sb_flags & SB_INUSE));
    750 
    751 		/*
    752 		 * raid_invalidate_pwslot attempts to zero out prewrite entry
    753 		 * in parallel with other disk reads/writes related to current
    754 		 * transaction. however cs_frags accounting for this case is
    755 		 * broken because raid_write_io resets cs_frags i.e. ignoring
    756 		 * that it could have been been set to > 0 value by
    757 		 * raid_invalidate_pwslot. While this can be fixed an
    758 		 * additional problem is that we don't seem to handle
    759 		 * correctly the case of getting a disk error for prewrite
    760 		 * entry invalidation.
    761 		 * It does not look like we really need
    762 		 * to invalidate prewrite slots because raid_replay sorts
    763 		 * prewrite id's in ascending order and during recovery the
    764 		 * latest prewrite entry for the same block will be replay
    765 		 * last. That's why i ifdef'd out the call to
    766 		 * raid_invalidate_pwslot. --aguzovsk@east
    767 		 */
    768 
    769 		if (use == -1) {
    770 			use = i;
    771 		}
    772 	}
    773 
    774 	ASSERT(avail != -1);
    775 	pw->pw_free--;
    776 	if (use == -1)
    777 		use = avail;
    778 
    779 	ASSERT(! (sb[use].sb_flags & SB_INUSE));
    780 	sb[use].sb_flags = SB_INUSE;
    781 	sb[use].sb_cs = cs;
    782 	sb[use].sb_start_blk = start_blk;
    783 	sb[use].sb_last_blk = last_blk;
    784 	ASSERT((use >= 0) && (use < un->un_pwcnt));
    785 	return (use);
    786 }
    787 
    788 static int
    789 raid_check_pw(md_raidcs_t *cs)
    790 {
    791 
    792 	mr_unit_t	*un = cs->cs_un;
    793 	int		i;
    794 
    795 	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
    796 	/*
    797 	 * check to be sure there is a prewrite slot available
    798 	 * if not just return.
    799 	 */
    800 	if (cs->cs_flags & MD_RCS_LINE) {
    801 		for (i = 0; i < un->un_totalcolumncnt; i++)
    802 			if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0)
    803 				return (1);
    804 		return (0);
    805 	}
    806 
    807 	if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0)
    808 		return (1);
    809 	if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0)
    810 		return (1);
    811 	return (0);
    812 }
    813 static int
    814 raid_alloc_pwslot(md_raidcs_t *cs)
    815 {
    816 	mr_unit_t	*un = cs->cs_un;
    817 	md_raidcbuf_t	*cbuf;
    818 
    819 	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
    820 	if (raid_check_pw(cs))
    821 		return (1);
    822 
    823 	mutex_enter(&un->un_mx);
    824 	un->un_pwid++;
    825 	cs->cs_pwid = un->un_pwid;
    826 	mutex_exit(&un->un_mx);
    827 
    828 	cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn);
    829 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
    830 		cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column);
    831 	}
    832 	cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn);
    833 
    834 	cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS;
    835 
    836 	return (0);
    837 }
    838 
    839 /*
    840  * NAMES:	raid_build_incore
    841  * DESCRIPTION: RAID metadevice incore structure building routine
    842  * PARAMETERS:	void *p - pointer to a unit structure
    843  *		int snarfing - a flag to indicate snarfing is required
    844  */
    845 int
    846 raid_build_incore(void *p, int snarfing)
    847 {
    848 	mr_unit_t	*un = (mr_unit_t *)p;
    849 	minor_t		mnum = MD_SID(un);
    850 	mddb_recid_t	hs_recid = 0;
    851 	int		i;
    852 	int		preserve_flags;
    853 	mr_column_t	*column;
    854 	int		iosize;
    855 	md_dev64_t	hs, dev;
    856 	int		resync_cnt = 0, error_cnt = 0;
    857 
    858 	hs = NODEV64;
    859 	dev = NODEV64;
    860 
    861 	/* clear out bogus pointer incase we return(1) prior to alloc */
    862 	un->mr_ic = NULL;
    863 
    864 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
    865 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
    866 		return (1);
    867 	}
    868 
    869 	if (MD_UNIT(mnum) != NULL)
    870 		return (0);
    871 
    872 	if (snarfing)
    873 		MD_STATUS(un) = 0;
    874 
    875 	un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic),
    876 	    KM_SLEEP);
    877 
    878 	un->un_column_ic = (mr_column_ic_t *)
    879 	    kmem_zalloc(sizeof (mr_column_ic_t) *
    880 	    un->un_totalcolumncnt, KM_SLEEP);
    881 
    882 	for (i = 0; i < un->un_totalcolumncnt; i++) {
    883 
    884 		column	= &un->un_column[i];
    885 		preserve_flags = column->un_devflags &
    886 		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC);
    887 		column->un_devflags &=
    888 		    ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN |
    889 		    MD_RAID_WRITE_ALT);
    890 		if (raid_build_pw_reservation(un, i) != 0) {
    891 			/* could not build pwslot */
    892 			return (1);
    893 		}
    894 
    895 		if (snarfing) {
    896 			set_t		setno = MD_MIN2SET(mnum);
    897 			dev =  md_getdevnum(setno, mddb_getsidenum(setno),
    898 			    column->un_orig_key, MD_NOTRUST_DEVT);
    899 			/*
    900 			 * Comment out instead of remove so we have history
    901 			 * In the pre-SVM releases stored devt is used so
    902 			 * as long as there is one snarf is always happy
    903 			 * even the component is powered off.  This is not
    904 			 * the case in current SVM implementation.  NODEV64
    905 			 * can be returned and in this case since we resolve
    906 			 * the devt at 'open' time (first use of metadevice)
    907 			 * we will allow snarf continue.
    908 			 *
    909 			 * if (dev == NODEV64)
    910 			 *	return (1);
    911 			 */
    912 
    913 			/*
    914 			 * Setup un_orig_dev from device id info if the device
    915 			 * is valid (not NODEV64).
    916 			 */
    917 			if (dev != NODEV64)
    918 				column->un_orig_dev = dev;
    919 
    920 			if (column->un_devstate & RCS_RESYNC)
    921 				resync_cnt++;
    922 			if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))
    923 				error_cnt++;
    924 
    925 			if (HOTSPARED(un, i)) {
    926 				(void) md_hot_spare_ifc(HS_MKDEV,
    927 				    0, 0, 0, &column->un_hs_id, NULL,
    928 				    &hs, NULL);
    929 				/*
    930 				 * Same here
    931 				 *
    932 				 * if (hs == NODEV64)
    933 				 *	return (1);
    934 				 */
    935 			}
    936 
    937 			if (HOTSPARED(un, i)) {
    938 				if (column->un_devstate &
    939 				    (RCS_OKAY | RCS_LAST_ERRED)) {
    940 					column->un_dev = hs;
    941 					column->un_pwstart =
    942 					    column->un_hs_pwstart;
    943 					column->un_devstart =
    944 					    column->un_hs_devstart;
    945 					preserve_flags &=
    946 					    ~(MD_RAID_COPY_RESYNC |
    947 					    MD_RAID_REGEN_RESYNC);
    948 				} else  if (column->un_devstate & RCS_RESYNC) {
    949 					/*
    950 					 * if previous system was 4.0 set
    951 					 * the direction flags
    952 					 */
    953 					if ((preserve_flags &
    954 					    (MD_RAID_COPY_RESYNC |
    955 					    MD_RAID_REGEN_RESYNC)) == 0) {
    956 						if (column->un_alt_dev !=
    957 						    NODEV64)
    958 							preserve_flags |=
    959 							    MD_RAID_COPY_RESYNC;
    960 						else
    961 							preserve_flags |=
    962 							/* CSTYLED */
    963 							    MD_RAID_REGEN_RESYNC;
    964 					}
    965 				}
    966 			} else { /* no hot spares */
    967 				column->un_dev = dev;
    968 				column->un_pwstart = column->un_orig_pwstart;
    969 				column->un_devstart = column->un_orig_devstart;
    970 				if (column->un_devstate & RCS_RESYNC) {
    971 					preserve_flags |= MD_RAID_REGEN_RESYNC;
    972 					preserve_flags &= ~MD_RAID_COPY_RESYNC;
    973 				}
    974 			}
    975 			if (! (column->un_devstate & RCS_RESYNC)) {
    976 				preserve_flags &=
    977 				    ~(MD_RAID_REGEN_RESYNC |
    978 				    MD_RAID_COPY_RESYNC);
    979 			}
    980 
    981 			column->un_devflags = preserve_flags;
    982 			column->un_alt_dev = NODEV64;
    983 			column->un_alt_pwstart = 0;
    984 			column->un_alt_devstart = 0;
    985 			un->un_resync_line_index = 0;
    986 			un->un_resync_index = 0;
    987 			un->un_percent_done = 0;
    988 		}
    989 	}
    990 
    991 	if (resync_cnt && error_cnt) {
    992 		for (i = 0; i < un->un_totalcolumncnt; i++) {
    993 			column  = &un->un_column[i];
    994 			if (HOTSPARED(un, i) &&
    995 			    (column->un_devstate & RCS_RESYNC) &&
    996 			    (column->un_devflags & MD_RAID_COPY_RESYNC))
    997 				/* hotspare has data */
    998 				continue;
    999 
   1000 			if (HOTSPARED(un, i) &&
   1001 			    (column->un_devstate & RCS_RESYNC)) {
   1002 				/* hotspare does not have data */
   1003 				raid_hs_release(HS_FREE, un, &hs_recid, i);
   1004 				column->un_dev = column->un_orig_dev;
   1005 				column->un_pwstart = column->un_orig_pwstart;
   1006 				column->un_devstart = column->un_orig_devstart;
   1007 				mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM);
   1008 			}
   1009 
   1010 			if (column->un_devstate & RCS_ERRED)
   1011 				column->un_devstate = RCS_LAST_ERRED;
   1012 
   1013 			if (column->un_devstate & RCS_RESYNC)
   1014 				column->un_devstate = RCS_ERRED;
   1015 		}
   1016 	}
   1017 	mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
   1018 
   1019 	un->un_pwid = 1; /* or some other possible value */
   1020 	un->un_magic = RAID_UNMAGIC;
   1021 	iosize = un->un_iosize;
   1022 	un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
   1023 	un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
   1024 	mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL);
   1025 	cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL);
   1026 	un->un_linlck_chn = NULL;
   1027 
   1028 	/* place various information in the in-core data structures */
   1029 	md_nblocks_set(mnum, un->c.un_total_blocks);
   1030 	MD_UNIT(mnum) = un;
   1031 
   1032 	return (0);
   1033 }
   1034 
   1035 /*
   1036  * NAMES:	reset_raid
   1037  * DESCRIPTION: RAID metadevice reset routine
   1038  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
   1039  *		minor_t mnum - RAID metadevice minor number
   1040  *		int removing - a flag to imply removing device name from
   1041  *			MDDB database.
   1042  */
   1043 void
   1044 reset_raid(mr_unit_t *un, minor_t mnum, int removing)
   1045 {
   1046 	int		i, n = 0;
   1047 	sv_dev_t	*sv;
   1048 	mr_column_t	*column;
   1049 	int		column_cnt = un->un_totalcolumncnt;
   1050 	mddb_recid_t	*recids, vtoc_id;
   1051 	int		hserr;
   1052 
   1053 	ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) &&
   1054 	    (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL));
   1055 
   1056 	md_destroy_unit_incore(mnum, &raid_md_ops);
   1057 
   1058 	md_nblocks_set(mnum, -1ULL);
   1059 	MD_UNIT(mnum) = NULL;
   1060 
   1061 	if (un->un_pbuffer) {
   1062 		kmem_free(un->un_pbuffer, dbtob(un->un_iosize));
   1063 		un->un_pbuffer = NULL;
   1064 	}
   1065 	if (un->un_dbuffer) {
   1066 		kmem_free(un->un_dbuffer, dbtob(un->un_iosize));
   1067 		un->un_dbuffer = NULL;
   1068 	}
   1069 
   1070 	/* free all pre-write slots created during build incore */
   1071 	for (i = 0; i < un->un_totalcolumncnt; i++)
   1072 		raid_free_pw_reservation(un, i);
   1073 
   1074 	kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
   1075 	    un->un_totalcolumncnt);
   1076 
   1077 	kmem_free(un->mr_ic, sizeof (*un->mr_ic));
   1078 
   1079 	/*
   1080 	 * Attempt release of its minor node
   1081 	 */
   1082 	md_remove_minor_node(mnum);
   1083 
   1084 	if (!removing)
   1085 		return;
   1086 
   1087 	sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t),
   1088 	    KM_SLEEP);
   1089 
   1090 	recids = (mddb_recid_t *)
   1091 	    kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP);
   1092 
   1093 	for (i = 0; i < column_cnt; i++) {
   1094 		md_unit_t	*comp_un;
   1095 		md_dev64_t	comp_dev;
   1096 
   1097 		column = &un->un_column[i];
   1098 		sv[i].setno = MD_MIN2SET(mnum);
   1099 		sv[i].key = column->un_orig_key;
   1100 		if (HOTSPARED(un, i)) {
   1101 			if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))
   1102 				hserr = HS_BAD;
   1103 			else
   1104 				hserr = HS_FREE;
   1105 			raid_hs_release(hserr, un, &recids[n++], i);
   1106 		}
   1107 		/*
   1108 		 * deparent any metadevices.
   1109 		 * NOTE: currently soft partitions are the only metadevices
   1110 		 * allowed in RAID metadevices.
   1111 		 */
   1112 		comp_dev = column->un_dev;
   1113 		if (md_getmajor(comp_dev) == md_major) {
   1114 			comp_un = MD_UNIT(md_getminor(comp_dev));
   1115 			recids[n++] = MD_RECID(comp_un);
   1116 			md_reset_parent(comp_dev);
   1117 		}
   1118 	}
   1119 	/* decrement the reference count of the old hsp */
   1120 	if (un->un_hsp_id != -1)
   1121 		(void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
   1122 		    &recids[n++], NULL, NULL, NULL);
   1123 	recids[n] = 0;
   1124 	MD_STATUS(un) |= MD_UN_BEING_RESET;
   1125 	vtoc_id = un->c.un_vtoc_id;
   1126 
   1127 	raid_commit(un, recids);
   1128 
   1129 	/*
   1130 	 * Remove self from the namespace
   1131 	 */
   1132 	if (un->c.un_revision & MD_FN_META_DEV) {
   1133 		(void) md_rem_selfname(un->c.un_self_id);
   1134 	}
   1135 
   1136 	/* Remove the unit structure */
   1137 	mddb_deleterec_wrapper(un->c.un_record_id);
   1138 
   1139 	/* Remove the vtoc, if present */
   1140 	if (vtoc_id)
   1141 		mddb_deleterec_wrapper(vtoc_id);
   1142 	md_rem_names(sv, column_cnt);
   1143 	kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t));
   1144 	kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t));
   1145 
   1146 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
   1147 	    MD_MIN2SET(mnum), mnum);
   1148 }
   1149 
   1150 /*
   1151  * NAMES:	raid_error_parent
   1152  * DESCRIPTION: mark a parent structure in error
   1153  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
   1154  *		int	error - error value to set
   1155  * NOTE:	(TBR) - this routine currently is not in use.
   1156  */
   1157 static void
   1158 raid_error_parent(md_raidps_t *ps, int error)
   1159 {
   1160 	mutex_enter(&ps->ps_mx);
   1161 	ps->ps_flags |= MD_RPS_ERROR;
   1162 	ps->ps_error = error;
   1163 	mutex_exit(&ps->ps_mx);
   1164 }
   1165 
/*
 * The following defines are bit flags passed as the 'todo' argument of
 * raid_free_parent and may be or'ed together:
 *	RFP_RLS_LOCK		call md_io_readerexit() on the unit once
 *				ps_frags has dropped to zero.
 *	RFP_DECR_PWFRAGS	decrement ps_pwfrags
 *	RFP_DECR_FRAGS		decrement ps_frags
 *	RFP_DECR_READFRAGS	decrement both counters; read keeps FRAGS
 *				and PWFRAGS in lockstep
 */
#define	RFP_RLS_LOCK		0x00001
#define	RFP_DECR_PWFRAGS	0x00002
#define	RFP_DECR_FRAGS		0x00004
#define	RFP_DECR_READFRAGS	(RFP_DECR_PWFRAGS | RFP_DECR_FRAGS)
   1177 
/*
 * NAMES:	raid_free_parent
 * DESCRIPTION: drop fragment references on a parent structure and free
 *		it once the last fragment is gone.  The original buf is
 *		completed (biodone) when ps_pwfrags reaches zero, which
 *		may happen before the parent itself is freed.
 * PARAMETERS:	md_raidps_t *ps - pointer to parent structure
 *		int	todo - RFP_* flags indicating what needs to be done
 */
static void
raid_free_parent(md_raidps_t *ps, int todo)
{
	mdi_unit_t	*ui = ps->ps_ui;

	ASSERT(ps->ps_magic == RAID_PSMAGIC);
	ASSERT(ps->ps_flags & MD_RPS_INUSE);
	mutex_enter(&ps->ps_mx);
	if (todo & RFP_DECR_PWFRAGS) {
		ASSERT(ps->ps_pwfrags);
		ps->ps_pwfrags--;
		/*
		 * Last pwfrag: complete the caller's buf exactly once,
		 * MD_RPS_IODONE guards against doing it again.
		 */
		if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) {
			/* propagate an error recorded on the parent */
			if (ps->ps_flags & MD_RPS_ERROR) {
				ps->ps_bp->b_flags |= B_ERROR;
				ps->ps_bp->b_error = ps->ps_error;
			}
			md_kstat_done(ui, ps->ps_bp, 0);
			biodone(ps->ps_bp);
			ps->ps_flags |= MD_RPS_IODONE;
		}
	}

	if (todo & RFP_DECR_FRAGS) {
		ASSERT(ps->ps_frags);
		ps->ps_frags--;
	}

	/* other fragments are still outstanding; nothing more to do */
	if (ps->ps_frags != 0) {
		mutex_exit(&ps->ps_mx);
		return;
	}

	ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0));
	mutex_exit(&ps->ps_mx);

	if (todo & RFP_RLS_LOCK)
		md_io_readerexit(ui);

	/* while panicking only mark the parent done; it is not freed */
	if (panicstr) {
		ps->ps_flags |= MD_RPS_DONE;
		return;
	}

	/* a hot spare request was flagged on this parent */
	if (ps->ps_flags & MD_RPS_HSREQ)
		(void) raid_hotspares();

	ASSERT(todo & RFP_RLS_LOCK);
	ps->ps_flags &= ~MD_RPS_INUSE;

	md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id));

	kmem_cache_free(raid_parent_cache, ps);
}
   1237 
   1238 /*
   1239  * NAMES:	raid_free_child
   1240  * DESCRIPTION: free a parent structure
   1241  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
   1242  *		int drop_locks	- 0 for no locks held
   1243  * NOTE:	(TBR) - this routine currently is not in use.
   1244  */
   1245 static void
   1246 raid_free_child(md_raidcs_t *cs, int drop_locks)
   1247 {
   1248 	mr_unit_t	*un = cs->cs_un;
   1249 	md_raidcbuf_t	*cbuf, *cbuf1;
   1250 
   1251 	if (cs->cs_pw_inval_list)
   1252 		raid_free_pwinvalidate(cs);
   1253 
   1254 	if (drop_locks) {
   1255 		ASSERT(cs->cs_flags & MD_RCS_LLOCKD &&
   1256 		    (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER)));
   1257 		md_unit_readerexit(MDI_UNIT(MD_SID(un)));
   1258 		raid_line_exit(cs);
   1259 	} else {
   1260 		ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD));
   1261 	}
   1262 
   1263 	freebuffers(cs);
   1264 	cbuf = cs->cs_buflist;
   1265 	while (cbuf) {
   1266 		cbuf1 = cbuf->cbuf_next;
   1267 		kmem_cache_free(raid_cbuf_cache, cbuf);
   1268 		cbuf = cbuf1;
   1269 	}
   1270 	if (cs->cs_dbuf.b_flags & B_REMAPPED)
   1271 		bp_mapout(&cs->cs_dbuf);
   1272 	kmem_cache_free(raid_child_cache, cs);
   1273 }
   1274 
   1275 /*
   1276  * NAME:	raid_regen_parity
   1277  *
   1278  * DESCRIPTION:	This routine is used to regenerate the parity blocks
   1279  *		for the entire raid device.  It is called from
   1280  *		both the regen thread and the IO path.
   1281  *
   1282  *		On error the entire device is marked as in error by
   1283  *		placing the erroring device in error and all other
   1284  *		devices in last_errored.
   1285  *
   1286  * PARAMETERS:	md_raidcs_t	*cs
   1287  */
   1288 void
   1289 raid_regen_parity(md_raidcs_t *cs)
   1290 {
   1291 	mr_unit_t	*un = cs->cs_un;
   1292 	mdi_unit_t	*ui = MDI_UNIT(un->c.un_self_id);
   1293 	caddr_t		buffer;
   1294 	caddr_t		parity_buffer;
   1295 	buf_t		*bp;
   1296 	uint_t		*dbuf, *pbuf;
   1297 	uint_t		colcnt = un->un_totalcolumncnt;
   1298 	int		column;
   1299 	int		parity_column = cs->cs_pcolumn;
   1300 	size_t		bcount;
   1301 	int		j;
   1302 
   1303 	/*
   1304 	 * This routine uses the data and parity buffers allocated to a
   1305 	 * write.  In the case of a read the buffers are allocated and
   1306 	 * freed at the end.
   1307 	 */
   1308 
   1309 	ASSERT(IO_READER_HELD(un));
   1310 	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
   1311 	ASSERT(UNIT_READER_HELD(un));
   1312 
   1313 	if (raid_state_cnt(un, RCS_OKAY) != colcnt)
   1314 		return;
   1315 
   1316 	if (cs->cs_flags & MD_RCS_READER) {
   1317 		getpbuffer(cs);
   1318 		getdbuffer(cs);
   1319 	}
   1320 	ASSERT(cs->cs_dbuffer && cs->cs_pbuffer);
   1321 	bcount = cs->cs_bcount;
   1322 	buffer = cs->cs_dbuffer;
   1323 	parity_buffer = cs->cs_pbuffer;
   1324 	bzero(parity_buffer, bcount);
   1325 	bp = &cs->cs_dbuf;
   1326 	for (column = 0; column < colcnt; column++) {
   1327 		if (column == parity_column)
   1328 			continue;
   1329 		reset_buf(bp, B_READ | B_BUSY, bcount);
   1330 		bp->b_un.b_addr = buffer;
   1331 		bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
   1332 		bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart;
   1333 		bp->b_bcount = bcount;
   1334 		bp->b_bufsize = bcount;
   1335 		(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
   1336 		if (biowait(bp))
   1337 			goto bail;
   1338 		pbuf = (uint_t *)(void *)parity_buffer;
   1339 		dbuf = (uint_t *)(void *)buffer;
   1340 		for (j = 0; j < (bcount / (sizeof (uint_t))); j++) {
   1341 			*pbuf = *pbuf ^ *dbuf;
   1342 			pbuf++;
   1343 			dbuf++;
   1344 		}
   1345 	}
   1346 
   1347 	reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount);
   1348 	bp->b_un.b_addr = parity_buffer;
   1349 	bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev);
   1350 	bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart;
   1351 	bp->b_bcount = bcount;
   1352 	bp->b_bufsize = bcount;
   1353 	(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
   1354 	if (biowait(bp))
   1355 		goto bail;
   1356 
   1357 	if (cs->cs_flags & MD_RCS_READER) {
   1358 		freebuffers(cs);
   1359 		cs->cs_pbuffer = NULL;
   1360 		cs->cs_dbuffer = NULL;
   1361 	}
   1362 	bp->b_chain = (struct buf *)cs;
   1363 	return;
   1364 bail:
   1365 	if (cs->cs_flags & MD_RCS_READER) {
   1366 		freebuffers(cs);
   1367 		cs->cs_pbuffer = NULL;
   1368 		cs->cs_dbuffer = NULL;
   1369 	}
   1370 	md_unit_readerexit(ui);
   1371 	un = md_unit_writerlock(ui);
   1372 	raid_set_state(un, column, RCS_ERRED, 0);
   1373 	for (column = 0; column < colcnt; column++)
   1374 		raid_set_state(un, column, RCS_ERRED, 0);
   1375 	raid_commit(un, NULL);
   1376 	md_unit_writerexit(ui);
   1377 	un = md_unit_readerlock(ui);
   1378 	bp->b_chain = (struct buf *)cs;
   1379 }
   1380 
   1381 /*
   1382  * NAMES:	raid_error_state
   1383  * DESCRIPTION: check unit and column states' impact on I/O error
   1384  *		NOTE:	the state now may not be the state when the
   1385  *			I/O completed due to race conditions.
   1386  * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
   1387  *		md_raidcs_t *cs - pointer to child structure
   1388  *		buf_t	  *bp - pointer to buffer structure
   1389  */
   1390 static int
   1391 raid_error_state(mr_unit_t *un, buf_t *bp)
   1392 {
   1393 	int		column;
   1394 	int		i;
   1395 
   1396 	ASSERT(IO_READER_HELD(un));
   1397 	ASSERT(UNIT_WRITER_HELD(un));
   1398 
   1399 	column = -1;
   1400 	for (i = 0; i < un->un_totalcolumncnt; i++) {
   1401 		if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) {
   1402 			column = i;
   1403 			break;
   1404 		}
   1405 		if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) {
   1406 			column = i;
   1407 			break;
   1408 		}
   1409 	}
   1410 
   1411 	/* in case a replace snuck in while waiting on unit writer lock */
   1412 
   1413 	if (column == -1) {
   1414 		return (0);
   1415 	}
   1416 
   1417 	(void) raid_set_state(un, column, RCS_ERRED, 0);
   1418 	ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED));
   1419 
   1420 	raid_commit(un, NULL);
   1421 	if (un->un_state & RUS_ERRED) {
   1422 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
   1423 		    MD_UN2SET(un), MD_SID(un));
   1424 	} else if (un->un_state & RUS_LAST_ERRED) {
   1425 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
   1426 		    MD_UN2SET(un), MD_SID(un));
   1427 	}
   1428 
   1429 	return (EIO);
   1430 }
   1431 
   1432 /*
   1433  * NAME:	raid_mapin_buf
   1434  * DESCRIPTION:	wait for the input buffer header to be maped in
   1435  * PARAMETERS:	md_raidps_t *ps
   1436  */
   1437 static void
   1438 raid_mapin_buf(md_raidcs_t *cs)
   1439 {
   1440 	md_raidps_t	*ps = cs->cs_ps;
   1441 
   1442 	/*
   1443 	 * check to see if the buffer is maped.  If all is ok return the
   1444 	 * offset of the data and return.  Since it is expensive to grab
   1445 	 * a mutex this is only done if the mapin is not complete.
   1446 	 * Once the mutex is aquired it is possible that the mapin was
   1447 	 * not done so recheck and if necessary do the mapin.
   1448 	 */
   1449 	if (ps->ps_mapin > 0) {
   1450 		cs->cs_addr = ps->ps_addr + cs->cs_offset;
   1451 		return;
   1452 	}
   1453 	mutex_enter(&ps->ps_mapin_mx);
   1454 	if (ps->ps_mapin > 0) {
   1455 		cs->cs_addr = ps->ps_addr + cs->cs_offset;
   1456 		mutex_exit(&ps->ps_mapin_mx);
   1457 		return;
   1458 	}
   1459 	bp_mapin(ps->ps_bp);
   1460 	/*
   1461 	 * get the new b_addr out of the parent since bp_mapin just changed it
   1462 	 */
   1463 	ps->ps_addr = ps->ps_bp->b_un.b_addr;
   1464 	cs->cs_addr = ps->ps_addr + cs->cs_offset;
   1465 	ps->ps_mapin++;
   1466 	mutex_exit(&ps->ps_mapin_mx);
   1467 }
   1468 
   1469 /*
   1470  * NAMES:	raid_read_no_retry
   1471  * DESCRIPTION: I/O retry routine for a RAID metadevice read
   1472  *		read failed attempting to regenerate the data,
   1473  *		no retry possible, error occured in raid_raidregenloop().
   1474  * PARAMETERS:	mr_unit_t   *un - pointer to raid unit structure
   1475  *		md_raidcs_t *cs - pointer to child structure
   1476  */
   1477 /*ARGSUSED*/
   1478 static void
   1479 raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs)
   1480 {
   1481 	md_raidps_t	*ps = cs->cs_ps;
   1482 
   1483 	raid_error_parent(ps, EIO);
   1484 	raid_free_child(cs, 1);
   1485 
   1486 	/* decrement readfrags */
   1487 	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
   1488 }
   1489 
   1490 /*
   1491  * NAMES:	raid_read_retry
   1492  * DESCRIPTION: I/O retry routine for a RAID metadevice read
   1493  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
   1494  */
   1495 static void
   1496 raid_read_retry(mr_unit_t *un, md_raidcs_t *cs)
   1497 {
   1498 	/* re-initialize the buf_t structure for raid_read() */
   1499 	cs->cs_dbuf.b_chain = (struct buf *)cs;
   1500 	cs->cs_dbuf.b_back = &cs->cs_dbuf;
   1501 	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
   1502 	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
   1503 	cs->cs_dbuf.b_error = 0;	/* initialize error */
   1504 	cs->cs_dbuf.b_offset = -1;
   1505 	/* Initialize semaphores */
   1506 	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
   1507 	    SEMA_DEFAULT, NULL);
   1508 	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
   1509 	    SEMA_DEFAULT, NULL);
   1510 
   1511 	cs->cs_pbuf.b_chain = (struct buf *)cs;
   1512 	cs->cs_pbuf.b_back = &cs->cs_pbuf;
   1513 	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
   1514 	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
   1515 	cs->cs_pbuf.b_error = 0;	/* initialize error */
   1516 	cs->cs_pbuf.b_offset = -1;
   1517 	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
   1518 	    SEMA_DEFAULT, NULL);
   1519 	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
   1520 	    SEMA_DEFAULT, NULL);
   1521 
   1522 	cs->cs_flags &= ~MD_RCS_ERROR;	/* reset child error flag */
   1523 	cs->cs_flags |= MD_RCS_RECOVERY;  /* set RECOVERY flag */
   1524 
   1525 	/*
   1526 	 * re-scheduling I/O with raid_read_io() is simpler. basically,
   1527 	 * raid_read_io() is invoked again with same child structure.
   1528 	 * (NOTE: we aren`t supposed to do any error recovery when an I/O
   1529 	 * error occured in raid_raidregenloop().
   1530 	 */
   1531 	raid_mapin_buf(cs);
   1532 	raid_read_io(un, cs);
   1533 }
   1534 
   1535 /*
   1536  * NAMES:	raid_rderr
   1537  * DESCRIPTION: I/O error handling routine for a RAID metadevice read
   1538  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
   1539  * LOCKS:	must obtain unit writer lock while calling raid_error_state
   1540  *		since a unit or column state transition may take place.
   1541  *		must obtain unit reader lock to retry I/O.
   1542  */
   1543 /*ARGSUSED*/
   1544 static void
   1545 raid_rderr(md_raidcs_t *cs)
   1546 {
   1547 	md_raidps_t	*ps;
   1548 	mdi_unit_t	*ui;
   1549 	mr_unit_t	*un;
   1550 	int		error = 0;
   1551 
   1552 	ps = cs->cs_ps;
   1553 	ui = ps->ps_ui;
   1554 	un = (mr_unit_t *)md_unit_writerlock(ui);
   1555 	ASSERT(un != 0);
   1556 
   1557 	if (cs->cs_dbuf.b_flags & B_ERROR)
   1558 		error = raid_error_state(un, &cs->cs_dbuf);
   1559 	if (cs->cs_pbuf.b_flags & B_ERROR)
   1560 		error |= raid_error_state(un, &cs->cs_pbuf);
   1561 
   1562 	md_unit_writerexit(ui);
   1563 
   1564 	ps->ps_flags |= MD_RPS_HSREQ;
   1565 
   1566 	un = (mr_unit_t *)md_unit_readerlock(ui);
   1567 	ASSERT(un != 0);
   1568 	/* now attempt the appropriate retry routine */
   1569 	(*(cs->cs_retry_call))(un, cs);
   1570 }
   1571 
   1572 
   1573 /*
   1574  * NAMES:	raid_read_error
   1575  * DESCRIPTION: I/O error handling routine for a RAID metadevice read
   1576  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
   1577  */
   1578 /*ARGSUSED*/
   1579 static void
   1580 raid_read_error(md_raidcs_t *cs)
   1581 {
   1582 	md_raidps_t	*ps;
   1583 	mdi_unit_t	*ui;
   1584 	mr_unit_t	*un;
   1585 	set_t		setno;
   1586 
   1587 	ps = cs->cs_ps;
   1588 	ui = ps->ps_ui;
   1589 	un = cs->cs_un;
   1590 
   1591 	setno = MD_UN2SET(un);
   1592 
   1593 	if ((cs->cs_dbuf.b_flags & B_ERROR) &&
   1594 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
   1595 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
   1596 		cmn_err(CE_WARN, "md %s: read error on %s",
   1597 		    md_shortname(MD_SID(un)),
   1598 		    md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0));
   1599 
   1600 	if ((cs->cs_pbuf.b_flags & B_ERROR) &&
   1601 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
   1602 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
   1603 		cmn_err(CE_WARN, "md %s: read error on %s",
   1604 		    md_shortname(MD_SID(un)),
   1605 		    md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0));
   1606 
   1607 	md_unit_readerexit(ui);
   1608 
   1609 	ASSERT(cs->cs_frags == 0);
   1610 
   1611 	/* now schedule processing for possible state change */
   1612 	daemon_request(&md_mstr_daemon, raid_rderr,
   1613 	    (daemon_queue_t *)cs, REQ_OLD);
   1614 
   1615 }
   1616 
   1617 /*
   1618  * NAMES:	getdbuffer
   1619  * DESCRIPTION: data buffer allocation for a child structure
   1620  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
   1621  *
   1622  * NOTE: always get dbuffer before pbuffer
   1623  *	 and get both buffers before pwslot
   1624  *	 otherwise a deadlock could be introduced.
   1625  */
   1626 static void
   1627 getdbuffer(md_raidcs_t *cs)
   1628 {
   1629 	mr_unit_t	*un;
   1630 
   1631 	cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
   1632 	if (cs->cs_dbuffer != NULL)
   1633 		return;
   1634 	un = cs->cs_ps->ps_un;
   1635 	mutex_enter(&un->un_mx);
   1636 	while (un->un_dbuffer == NULL) {
   1637 		STAT_INC(data_buffer_waits);
   1638 		un->un_rflags |= MD_RFLAG_NEEDBUF;
   1639 		cv_wait(&un->un_cv, &un->un_mx);
   1640 	}
   1641 	cs->cs_dbuffer = un->un_dbuffer;
   1642 	cs->cs_flags |= MD_RCS_UNDBUF;
   1643 	un->un_dbuffer = NULL;
   1644 	mutex_exit(&un->un_mx);
   1645 }
   1646 
   1647 /*
   1648  * NAMES:	getpbuffer
   1649  * DESCRIPTION: parity buffer allocation for a child structure
   1650  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
   1651  *
   1652  * NOTE: always get dbuffer before pbuffer
   1653  *	 and get both buffers before pwslot
   1654  *	 otherwise a deadlock could be introduced.
   1655  */
   1656 static void
   1657 getpbuffer(md_raidcs_t *cs)
   1658 {
   1659 	mr_unit_t *un;
   1660 
   1661 	cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
   1662 	if (cs->cs_pbuffer != NULL)
   1663 		return;
   1664 	un = cs->cs_ps->ps_un;
   1665 	mutex_enter(&un->un_mx);
   1666 	while (un->un_pbuffer == NULL) {
   1667 		STAT_INC(parity_buffer_waits);
   1668 		un->un_rflags |= MD_RFLAG_NEEDBUF;
   1669 		cv_wait(&un->un_cv, &un->un_mx);
   1670 	}
   1671 	cs->cs_pbuffer = un->un_pbuffer;
   1672 	cs->cs_flags |= MD_RCS_UNPBUF;
   1673 	un->un_pbuffer = NULL;
   1674 	mutex_exit(&un->un_mx);
   1675 }
   1676 static void
   1677 getresources(md_raidcs_t *cs)
   1678 {
   1679 	md_raidcbuf_t	*cbuf;
   1680 	/*
   1681 	 * NOTE: always get dbuffer before pbuffer
   1682 	 *	 and get both buffers before pwslot
   1683 	 *	 otherwise a deadlock could be introduced.
   1684 	 */
   1685 	getdbuffer(cs);
   1686 	getpbuffer(cs);
   1687 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
   1688 		cbuf->cbuf_buffer =
   1689 		    kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP);
   1690 }
   1691 /*
   1692  * NAMES:	freebuffers
   1693  * DESCRIPTION: child structure buffer freeing routine
   1694  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
   1695  */
   1696 static void
   1697 freebuffers(md_raidcs_t *cs)
   1698 {
   1699 	mr_unit_t	*un;
   1700 	md_raidcbuf_t	*cbuf;
   1701 
   1702 	/* free buffers used for full line write */
   1703 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
   1704 		if (cbuf->cbuf_buffer == NULL)
   1705 			continue;
   1706 		kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE);
   1707 		cbuf->cbuf_buffer = NULL;
   1708 		cbuf->cbuf_bcount = 0;
   1709 	}
   1710 
   1711 	if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
   1712 		un = cs->cs_un;
   1713 		mutex_enter(&un->un_mx);
   1714 	}
   1715 	if (cs->cs_dbuffer) {
   1716 		if (cs->cs_flags & MD_RCS_UNDBUF)
   1717 			un->un_dbuffer = cs->cs_dbuffer;
   1718 		else
   1719 			kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE);
   1720 	}
   1721 	if (cs->cs_pbuffer) {
   1722 		if (cs->cs_flags & MD_RCS_UNPBUF)
   1723 			un->un_pbuffer = cs->cs_pbuffer;
   1724 		else
   1725 			kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE);
   1726 	}
   1727 	if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
   1728 		un->un_rflags &= ~MD_RFLAG_NEEDBUF;
   1729 		cv_broadcast(&un->un_cv);
   1730 		mutex_exit(&un->un_mx);
   1731 	}
   1732 }
   1733 
/*
 * NAMES:	raid_line_reader_lock, raid_line_writer_lock
 * DESCRIPTION: RAID metadevice line reader and writer lock routines.
 *		Lock holders are child structures linked on the unit's
 *		un_linlck_chn chain; conflicts are detected with
 *		raid_io_overlaps().
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 */

/*
 * raid_line_reader_lock: wait until no overlapping writer is on the
 * chain, then link cs at the head of the chain as a reader.
 * If resync_thread is non-zero, un_resync_line_index is also updated
 * from the last block covered by cs.
 */
void
raid_line_reader_lock(md_raidcs_t *cs, int resync_thread)
{
	mr_unit_t	*un;
	md_raidcs_t	*cs1;

	ASSERT(cs->cs_line != MD_DISKADDR_ERROR);
	un = cs->cs_un;
	cs->cs_flags |= MD_RCS_READER;
	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
	/* the chain mutex is not taken while panicking */
	if (!panicstr)
		mutex_enter(&un->un_linlck_mx);
	cs1 = un->un_linlck_chn;
	/*
	 * Each pass rescans the whole chain from its head.  The loop ends
	 * when a scan finishes without finding an overlapping writer
	 * (cs1 == NULL); overlapping readers do not block us.
	 */
	while (cs1 != NULL) {
		for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
			if (raid_io_overlaps(cs, cs1) == 1)
				if (cs1->cs_flags & MD_RCS_WRITER)
					break;

		if (cs1 != NULL) {
			/* cannot sleep on the cv while panicking */
			if (panicstr)
				panic("md; raid line write lock held");
			/* note that someone is waiting, then sleep */
			un->un_linlck_flg = 1;
			cv_wait(&un->un_linlck_cv, &un->un_linlck_mx);
			STAT_INC(raid_read_waits);
		}
	}
	STAT_MAX(raid_max_reader_locks, raid_reader_locks_active);
	STAT_INC(raid_reader_locks);
	/* insert cs at the head of the doubly linked chain */
	cs1 = un->un_linlck_chn;
	if (cs1 != NULL)
		cs1->cs_linlck_prev = cs;
	cs->cs_linlck_next = cs1;
	cs->cs_linlck_prev = NULL;
	un->un_linlck_chn = cs;
	cs->cs_flags |= MD_RCS_LLOCKD;
	if (resync_thread) {
		/* line index of the block following this I/O */
		diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
		diskaddr_t line = (lastblk + 1) / un->un_segsize;
		ASSERT(raid_state_cnt(un, RCS_RESYNC));
		mutex_enter(&un->un_mx);
		un->un_resync_line_index = line;
		mutex_exit(&un->un_mx);
	}
	if (!panicstr)
		mutex_exit(&un->un_linlck_mx);
}
   1788 
   1789 int
   1790 raid_line_writer_lock(md_raidcs_t *cs, int lock)
   1791 {
   1792 	mr_unit_t	*un;
   1793 	md_raidcs_t	*cs1;
   1794 
   1795 	ASSERT(cs->cs_line != MD_DISKADDR_ERROR);
   1796 	cs->cs_flags |= MD_RCS_WRITER;
   1797 	un = cs->cs_ps->ps_un;
   1798 
   1799 	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
   1800 	if (lock && !panicstr)
   1801 		mutex_enter(&un->un_linlck_mx);
   1802 	ASSERT(MUTEX_HELD(&un->un_linlck_mx));
   1803 
   1804 	cs1 = un->un_linlck_chn;
   1805 	for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
   1806 		if (raid_io_overlaps(cs, cs1))
   1807 			break;
   1808 
   1809 	if (cs1 != NULL) {
   1810 		if (panicstr)
   1811 			panic("md: line writer lock inaccessible");
   1812 		goto no_lock_exit;
   1813 	}
   1814 
   1815 	if (raid_alloc_pwslot(cs)) {
   1816 		if (panicstr)
   1817 			panic("md: no prewrite slots");
   1818 		STAT_INC(raid_prewrite_waits);
   1819 		goto no_lock_exit;
   1820 	}
   1821 
   1822 	cs1 = un->un_linlck_chn;
   1823 	if (cs1 != NULL)
   1824 		cs1->cs_linlck_prev = cs;
   1825 	cs->cs_linlck_next = cs1;
   1826 	cs->cs_linlck_prev = NULL;
   1827 	un->un_linlck_chn = cs;
   1828 	cs->cs_flags |= MD_RCS_LLOCKD;
   1829 	cs->cs_flags &= ~MD_RCS_WAITING;
   1830 	STAT_INC(raid_writer_locks);
   1831 	STAT_MAX(raid_max_write_locks, raid_write_locks_active);
   1832 	if (lock && !panicstr)
   1833 		mutex_exit(&un->un_linlck_mx);
   1834 	return (0);
   1835 
   1836 no_lock_exit:
   1837 	/* if this is already queued then do not requeue it */
   1838 	ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
   1839 	if (!lock || (cs->cs_flags & MD_RCS_WAITING))
   1840 		return (1);
   1841 	cs->cs_flags |= MD_RCS_WAITING;
   1842 	cs->cs_un = un;
   1843 	raid_enqueue(cs);
   1844 	if (lock && !panicstr)
   1845 		mutex_exit(&un->un_linlck_mx);
   1846 	return (1);
   1847 }
   1848 
   1849 static void
   1850 raid_startio(md_raidcs_t *cs)
   1851 {
   1852 	mdi_unit_t	*ui = cs->cs_ps->ps_ui;
   1853 	mr_unit_t	*un = cs->cs_un;
   1854 
   1855 	un = md_unit_readerlock(ui);
   1856 	raid_write_io(un, cs);
   1857 }
   1858 
   1859 void
   1860 raid_io_startup(mr_unit_t *un)
   1861 {
   1862 	md_raidcs_t	*waiting_list, *cs1;
   1863 	md_raidcs_t	*previous = NULL, *next = NULL;
   1864 	mdi_unit_t	*ui =  MDI_UNIT(un->c.un_self_id);
   1865 	kmutex_t	*io_list_mutex = &ui->ui_io_lock->io_list_mutex;
   1866 
   1867 	ASSERT(MUTEX_HELD(&un->un_linlck_mx));
   1868 	mutex_enter(io_list_mutex);
   1869 
   1870 	/*
   1871 	 * check to be sure there are no reader locks outstanding.  If
   1872 	 * there are not then pass on the writer lock.
   1873 	 */
   1874 	waiting_list = ui->ui_io_lock->io_list_front;
   1875 	while (waiting_list) {
   1876 		ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
   1877 		ASSERT(! (waiting_list->cs_flags & MD_RCS_LLOCKD));
   1878 		for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
   1879 			if (raid_io_overlaps(waiting_list, cs1) == 1)
   1880 				break;
   1881 		/*
   1882 		 * there was an IOs that overlaps this io so go onto
   1883 		 * the next io in the waiting list
   1884 		 */
   1885 		if (cs1) {
   1886 			previous = waiting_list;
   1887 			waiting_list = waiting_list->cs_linlck_next;
   1888 			continue;
   1889 		}
   1890 
   1891 		/*
   1892 		 * There are no IOs that overlap this, so remove it from
   1893 		 * the waiting queue, and start it
   1894 		 */
   1895 
   1896 		if (raid_check_pw(waiting_list)) {
   1897 			ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
   1898 			previous = waiting_list;
   1899 			waiting_list = waiting_list->cs_linlck_next;
   1900 			continue;
   1901 		}
   1902 		ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
   1903 
   1904 		next = waiting_list->cs_linlck_next;
   1905 		if (previous)
   1906 			previous->cs_linlck_next = next;
   1907 		else
   1908 			ui->ui_io_lock->io_list_front = next;
   1909 
   1910 		if (ui->ui_io_lock->io_list_front == NULL)
   1911 			ui->ui_io_lock->io_list_back = NULL;
   1912 
   1913 		if (ui->ui_io_lock->io_list_back == waiting_list)
   1914 			ui->ui_io_lock->io_list_back = previous;
   1915 
   1916 		waiting_list->cs_linlck_next = NULL;
   1917 		waiting_list->cs_flags &= ~MD_RCS_WAITING;
   1918 		STAT_DEC(raid_write_queue_length);
   1919 		if (raid_line_writer_lock(waiting_list, 0))
   1920 			panic("region locking corrupted");
   1921 
   1922 		ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD);
   1923 		daemon_request(&md_mstr_daemon, raid_startio,
   1924 		    (daemon_queue_t *)waiting_list, REQ_OLD);
   1925 		waiting_list = next;
   1926 
   1927 	}
   1928 	mutex_exit(io_list_mutex);
   1929 }
   1930 
   1931 void
   1932 raid_line_exit(md_raidcs_t *cs)
   1933 {
   1934 	mr_unit_t	*un;
   1935 
   1936 	un = cs->cs_ps->ps_un;
   1937 	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
   1938 	mutex_enter(&un->un_linlck_mx);
   1939 	if (cs->cs_flags & MD_RCS_READER)
   1940 		STAT_DEC(raid_reader_locks_active);
   1941 	else
   1942 		STAT_DEC(raid_write_locks_active);
   1943 
   1944 	if (cs->cs_linlck_prev)
   1945 		cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next;
   1946 	else
   1947 		un->un_linlck_chn = cs->cs_linlck_next;
   1948 	if (cs->cs_linlck_next)
   1949 		cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev;
   1950 
   1951 	cs->cs_flags &= ~MD_RCS_LLOCKD;
   1952 
   1953 	if (un->un_linlck_flg)
   1954 		cv_broadcast(&un->un_linlck_cv);
   1955 
   1956 	un->un_linlck_flg = 0;
   1957 	cs->cs_line = MD_DISKADDR_ERROR;
   1958 
   1959 	raid_cancel_pwslot(cs);
   1960 	/*
   1961 	 * now that the lock is droped go ahead and see if there are any
   1962 	 * other writes that can be started up
   1963 	 */
   1964 	raid_io_startup(un);
   1965 
   1966 	mutex_exit(&un->un_linlck_mx);
   1967 }
   1968 
   1969 /*
   1970  * NAMES:	raid_line, raid_pcolumn, raid_dcolumn
   1971  * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #,
   1972  *		data column # and parity column #.
   1973  * PARAMETERS:	int segment - segment number
   1974  *		mr_unit_t *un - pointer to an unit structure
   1975  * RETURNS:	raid_line returns line #
   1976  *		raid_dcolumn returns data column #
   1977  *		raid_pcolumn returns parity column #
   1978  */
   1979 static diskaddr_t
   1980 raid_line(diskaddr_t segment, mr_unit_t *un)
   1981 {
   1982 	diskaddr_t	adj_seg;
   1983 	diskaddr_t	line;
   1984 	diskaddr_t	max_orig_segment;
   1985 
   1986 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
   1987 	if (segment >= max_orig_segment) {
   1988 		adj_seg = segment - max_orig_segment;
   1989 		line = adj_seg % un->un_segsincolumn;
   1990 	} else {
   1991 		line = segment / (un->un_origcolumncnt - 1);
   1992 	}
   1993 	return (line);
   1994 }
   1995 
   1996 uint_t
   1997 raid_dcolumn(diskaddr_t segment, mr_unit_t *un)
   1998 {
   1999 	diskaddr_t	adj_seg;
   2000 	diskaddr_t	line;
   2001 	diskaddr_t	max_orig_segment;
   2002 	uint_t		column;
   2003 
   2004 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
   2005 	if (segment >= max_orig_segment) {
   2006 		adj_seg = segment - max_orig_segment;
   2007 		column = un->un_origcolumncnt  +
   2008 		    (uint_t)(adj_seg / un->un_segsincolumn);
   2009 	} else {
   2010 		line = segment / (un->un_origcolumncnt - 1);
   2011 		column = (uint_t)((segment %
   2012 		    (un->un_origcolumncnt - 1) + line) % un->un_origcolumncnt);
   2013 	}
   2014 	return (column);
   2015 }
   2016 
   2017 uint_t
   2018 raid_pcolumn(diskaddr_t segment, mr_unit_t *un)
   2019 {
   2020 	diskaddr_t	adj_seg;
   2021 	diskaddr_t	line;
   2022 	diskaddr_t	max_orig_segment;
   2023 	uint_t		column;
   2024 
   2025 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
   2026 	if (segment >= max_orig_segment) {
   2027 		adj_seg = segment - max_orig_segment;
   2028 		line = adj_seg % un->un_segsincolumn;
   2029 	} else {
   2030 		line = segment / (un->un_origcolumncnt - 1);
   2031 	}
   2032 	column = (uint_t)((line + (un->un_origcolumncnt - 1)) %
   2033 	    un->un_origcolumncnt);
   2034 	return (column);
   2035 }
   2036 
   2037 
   2038 /*
   2039  * Is called in raid_iosetup to probe each column to insure
   2040  * that all the columns are in 'okay' state and meet the
   2041  * 'full line' requirement.  If any column is in error,
   2042  * we don't want to enable the 'full line' flag.  Previously,
   2043  * we would do so and disable it only when a error is
   2044  * detected after the first 'full line' io which is too late
   2045  * and leads to the potential data corruption.
   2046  */
   2047 static int
   2048 raid_check_cols(mr_unit_t *un)
   2049 {
   2050 	buf_t		bp;
   2051 	char		*buf;
   2052 	mr_column_t	*colptr;
   2053 	minor_t		mnum = MD_SID(un);
   2054 	int		i;
   2055 	int		err = 0;
   2056 
   2057 	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);
   2058 
   2059 	for (i = 0; i < un->un_totalcolumncnt; i++) {
   2060 		md_dev64_t tmpdev;
   2061 
   2062 		colptr = &un->un_column[i];
   2063 
   2064 		tmpdev = colptr->un_dev;
   2065 		/*
   2066 		 * Open by device id
   2067 		 * If this device is hotspared
   2068 		 * use the hotspare key
   2069 		 */
   2070 		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
   2071 		    colptr->un_hs_key : colptr->un_orig_key);
   2072 
   2073 		if (tmpdev == NODEV64) {
   2074 			err = 1;
   2075 			break;
   2076 		}
   2077 
   2078 		colptr->un_dev = tmpdev;
   2079 
   2080 		bzero((caddr_t)&bp, sizeof (buf_t));
   2081 		bp.b_back = &bp;
   2082 		bp.b_forw = &bp;
   2083 		bp.b_flags = (B_READ | B_BUSY);
   2084 		sema_init(&bp.b_io, 0, NULL,
   2085 		    SEMA_DEFAULT, NULL);
   2086 		sema_init(&bp.b_sem, 0, NULL,
   2087 		    SEMA_DEFAULT, NULL);
   2088 		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
   2089 		bp.b_lblkno = colptr->un_pwstart;
   2090 		bp.b_bcount = DEV_BSIZE;
   2091 		bp.b_bufsize = DEV_BSIZE;
   2092 		bp.b_un.b_addr = (caddr_t)buf;
   2093 		(void) md_call_strategy(&bp, 0, NULL);
   2094 		if (biowait(&bp)) {
   2095 			err = 1;
   2096 			break;
   2097 		}
   2098 	}
   2099 
   2100 	kmem_free(buf, DEV_BSIZE);
   2101 	return (err);
   2102 }
   2103 
   2104 /*
   2105  * NAME:	raid_iosetup
   2106  * DESCRIPTION: RAID metadevice specific I/O set up routine which does
   2107  *		all the necessary calculations to determine the location
   2108  *		of the segement for the I/O.
   2109  * PARAMETERS:	mr_unit_t *un - unit number of RAID metadevice
   2110  *		diskaddr_t	blkno - block number of the I/O attempt
   2111  *		size_t		blkcnt - block count for this I/O
   2112  *		md_raidcs_t *cs - child structure for each segmented I/O
   2113  *
   2114  * NOTE:	The following is an example of a raid disk layer out:
   2115  *
   2116  *		Total Column = 5
   2117  *		Original Column = 4
   2118  *		Segment Per Column = 10
   2119  *
   2120  *			Col#0	Col#1	Col#2	Col#3	Col#4	Col#5	Col#6
   2121  *		-------------------------------------------------------------
   2122  *		line#0	Seg#0	Seg#1	Seg#2	Parity	Seg#30	Seg#40
   2123  *		line#1	Parity	Seg#3	Seg#4	Seg#5	Seg#31
   2124  *		line#2	Seg#8	Parity	Seg#6	Seg#7	Seg#32
   2125  *		line#3	Seg#10	Seg#11	Parity	Seg#9	Seg#33
   2126  *		line#4	Seg#12	Seg#13	Seg#14	Parity	Seg#34
   2127  *		line#5	Parity	Seg#15	Seg#16	Seg#17	Seg#35
   2128  *		line#6	Seg#20	Parity	Seg#18	Seg#19	Seg#36
   2129  *		line#7	Seg#22	Seg#23	Parity	Seg#21	Seg#37
   2130  *		line#8	Seg#24	Seg#25	Seg#26	Parity	Seg#38
   2131  *		line#9	Parity	Seg#27	Seg#28	Seg#29	Seg#39
   2132  */
   2133 static size_t
   2134 raid_iosetup(
   2135 	mr_unit_t	*un,
   2136 	diskaddr_t	blkno,
   2137 	size_t		blkcnt,
   2138 	md_raidcs_t	*cs
   2139 )
   2140 {
   2141 	diskaddr_t	segment;
   2142 	diskaddr_t	segstart;
   2143 	diskaddr_t	segoff;
   2144 	size_t		leftover;
   2145 	diskaddr_t	line;
   2146 	uint_t		iosize;
   2147 	uint_t		colcnt;
   2148 
   2149 	/* caculate the segment# and offset for the block */
   2150 	segment = blkno / un->un_segsize;
   2151 	segstart = segment * un->un_segsize;
   2152 	segoff = blkno - segstart;
   2153 	iosize = un->un_iosize - 1;
   2154 	colcnt = un->un_totalcolumncnt - 1;
   2155 	line = raid_line(segment, un);
   2156 	cs->cs_dcolumn = raid_dcolumn(segment, un);
   2157 	cs->cs_pcolumn = raid_pcolumn(segment, un);
   2158 	cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags;
   2159 	cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags;
   2160 	cs->cs_line = line;
   2161 
   2162 	if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) &&
   2163 	    (UNIT_STATE(un) & RCS_OKAY) &&
   2164 	    (segoff == 0) &&
   2165 	    (un->un_totalcolumncnt == un->un_origcolumncnt) &&
   2166 	    (un->un_segsize < un->un_iosize) &&
   2167 	    (un->un_iosize <= un->un_maxio) &&
   2168 	    (blkno == line * un->un_segsize * colcnt) &&
   2169 	    (blkcnt >= ((un->un_totalcolumncnt -1) * un->un_segsize)) &&
   2170 	    (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) &&
   2171 	    (raid_check_cols(un) == 0)) {
   2172 
   2173 		md_raidcbuf_t	**cbufp;
   2174 		md_raidcbuf_t	*cbuf;
   2175 		int		i, j;
   2176 
   2177 		STAT_INC(raid_full_line_writes);
   2178 		leftover = blkcnt - (un->un_segsize * colcnt);
   2179 		ASSERT(blkcnt >= (un->un_segsize * colcnt));
   2180 		cs->cs_blkno = line * un->un_segsize;
   2181 		cs->cs_blkcnt = un->un_segsize;
   2182 		cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
   2183 		cs->cs_bcount = dbtob(cs->cs_blkcnt);
   2184 		cs->cs_flags |= MD_RCS_LINE;
   2185 
   2186 		cbufp = &cs->cs_buflist;
   2187 		for (i = 0; i < un->un_totalcolumncnt; i++) {
   2188 			j = cs->cs_dcolumn + i;
   2189 			j = j % un->un_totalcolumncnt;
   2190 
   2191 			if ((j == cs->cs_dcolumn) || (j == cs->cs_pcolumn))
   2192 				continue;
   2193 			cbuf = kmem_cache_alloc(raid_cbuf_cache,
   2194 			    MD_ALLOCFLAGS);
   2195 			raid_cbuf_init(cbuf);
   2196 			cbuf->cbuf_un = cs->cs_un;
   2197 			cbuf->cbuf_ps = cs->cs_ps;
   2198 			cbuf->cbuf_column = j;
   2199 			cbuf->cbuf_bcount = dbtob(un->un_segsize);
   2200 			*cbufp = cbuf;
   2201 			cbufp = &cbuf->cbuf_next;
   2202 		}
   2203 		return (leftover);
   2204 	}
   2205 
   2206 	leftover = blkcnt - (un->un_segsize - segoff);
   2207 	if (blkcnt > (un->un_segsize - segoff))
   2208 		blkcnt -= leftover;
   2209 	else
   2210 		leftover = 0;
   2211 
   2212 	if (blkcnt > (size_t)iosize) {
   2213 		leftover += (blkcnt - iosize);
   2214 		blkcnt = iosize;
   2215 	}
   2216 
   2217 	/* calculate the line# and column# for the segment */
   2218 	cs->cs_flags &= ~MD_RCS_LINE;
   2219 	cs->cs_blkno = line * un->un_segsize + segoff;
   2220 	cs->cs_blkcnt = (uint_t)blkcnt;
   2221 	cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
   2222 	cs->cs_bcount = dbtob((uint_t)blkcnt);
   2223 	return (leftover);
   2224 }
   2225 
   2226 /*
   2227  * NAME:	raid_done
   2228  * DESCRIPTION: RAID metadevice I/O done interrupt routine
   2229  * PARAMETERS:	struct buf *bp - pointer to a buffer structure
   2230  */
   2231 static void
   2232 raid_done(struct buf *bp)
   2233 {
   2234 	md_raidcs_t	*cs;
   2235 	int		flags, frags;
   2236 
   2237 	sema_v(&bp->b_io);
   2238 	cs = (md_raidcs_t *)bp->b_chain;
   2239 
   2240 	ASSERT(cs != NULL);
   2241 
   2242 	mutex_enter(&cs->cs_mx);
   2243 	if (bp->b_flags & B_ERROR) {
   2244 		cs->cs_flags |= MD_RCS_ERROR;
   2245 		cs->cs_flags &= ~(MD_RCS_ISCALL);
   2246 	}
   2247 
   2248 	flags = cs->cs_flags;
   2249 	frags = --cs->cs_frags;
   2250 	mutex_exit(&cs->cs_mx);
   2251 	if (frags != 0) {
   2252 		return;
   2253 	}
   2254 
   2255 	if (flags & MD_RCS_ERROR) {
   2256 		if (cs->cs_error_call) {
   2257 			daemon_request(&md_done_daemon, cs->cs_error_call,
   2258 			    (daemon_queue_t *)cs, REQ_OLD);
   2259 		}
   2260 		return;
   2261 	}
   2262 
   2263 	if (flags & MD_RCS_ISCALL) {
   2264 		cs->cs_flags &= ~(MD_RCS_ISCALL);
   2265 		(*(cs->cs_call))(cs);
   2266 		return;
   2267 	}
   2268 	daemon_request(&md_done_daemon, cs->cs_call,
   2269 	    (daemon_queue_t *)cs, REQ_OLD);
   2270 }
   2271 /*
   2272  * the flag RIO_EXTRA is used when dealing with a column in the process
   2273  * of being resynced. During the resync, writes may have to take place
   2274  * on both the original component and a hotspare component.
   2275  */
   2276 #define	RIO_DATA	0x00100		/* use data buffer & data column */
   2277 #define	RIO_PARITY	0x00200		/* use parity buffer & parity column */
   2278 #define	RIO_WRITE	0x00400		/* issue a write */
   2279 #define	RIO_READ	0x00800		/* issue a read */
   2280 #define	RIO_PWIO	0x01000		/* do the I/O to the prewrite entry */
   2281 #define	RIO_ALT		0x02000		/* do write to alternate device */
   2282 #define	RIO_EXTRA	0x04000		/* use extra buffer */
   2283 
   2284 #define	RIO_COLMASK	0x000ff
   2285 
   2286 #define	RIO_PREWRITE	RIO_WRITE | RIO_PWIO
   2287 
   2288 /*
   2289  * NAME:	raidio
   2290  * DESCRIPTION: RAID metadevice write routine
   2291  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
   2292  */
   2293 static void
   2294 raidio(md_raidcs_t *cs, int flags)
   2295 {
   2296 	buf_t		*bp;
   2297 	int		column;
   2298 	int		flag;
   2299 	void		*private;
   2300 	mr_unit_t	*un;
   2301 	int		iosize;
   2302 	diskaddr_t	pwstart;
   2303 	diskaddr_t	devstart;
   2304 	md_dev64_t	dev;
   2305 
   2306 	un = cs->cs_un;
   2307 
   2308 	ASSERT(IO_READER_HELD(un));
   2309 	ASSERT(UNIT_READER_HELD(un));
   2310 
   2311 	if (flags & RIO_DATA) {
   2312 		if (flags & RIO_EXTRA)
   2313 			bp = &cs->cs_hbuf;
   2314 		else
   2315 			bp = &cs->cs_dbuf;
   2316 		bp->b_un.b_addr = cs->cs_dbuffer;
   2317 		column = cs->cs_dcolumn;
   2318 	} else {
   2319 		if (flags & RIO_EXTRA)
   2320 			bp = &cs->cs_hbuf;
   2321 		else
   2322 			bp = &cs->cs_pbuf;
   2323 		bp->b_un.b_addr = cs->cs_pbuffer;
   2324 		column = cs->cs_pcolumn;
   2325 	}
   2326 	if (flags & RIO_COLMASK)
   2327 		column = (flags & RIO_COLMASK) - 1;
   2328 
   2329 	bp->b_bcount = cs->cs_bcount;
   2330 	bp->b_bufsize = cs->cs_bcount;
   2331 	iosize = un->un_iosize;
   2332 
   2333 	/* check if the hotspared device will be used */
   2334 	if (flags & RIO_ALT && (flags & RIO_WRITE)) {
   2335 		pwstart = un->un_column[column].un_alt_pwstart;
   2336 		devstart = un->un_column[column].un_alt_devstart;
   2337 		dev = un->un_column[column].un_alt_dev;
   2338 	} else {
   2339 		pwstart = un->un_column[column].un_pwstart;
   2340 		devstart = un->un_column[column].un_devstart;
   2341 		dev = un->un_column[column].un_dev;
   2342 	}
   2343 
   2344 	/* if not writing to log skip log header */
   2345 	if ((flags & RIO_PWIO) == 0) {
   2346 		bp->b_lblkno = devstart + cs->cs_blkno;
   2347 		bp->b_un.b_addr += DEV_BSIZE;
   2348 	} else {
   2349 		bp->b_bcount += DEV_BSIZE;
   2350 		bp->b_bufsize = bp->b_bcount;
   2351 		if (flags & RIO_DATA) {
   2352 			bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart;
   2353 		} else { /* not DATA -> PARITY */
   2354 			bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart;
   2355 		}
   2356 	}
   2357 
   2358 	bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available);
   2359 	bp->b_flags |= B_BUSY;
   2360 	if (flags & RIO_READ) {
   2361 		bp->b_flags |= B_READ;
   2362 	} else {
   2363 		bp->b_flags |= B_WRITE;
   2364 		if ((nv_available && nv_parity && (flags & RIO_PARITY)) ||
   2365 		    (nv_available && nv_prewrite && (flags & RIO_PWIO)))
   2366 			bp->b_flags |= nv_available;
   2367 	}
   2368 	bp->b_iodone = (int (*)())raid_done;
   2369 	bp->b_edev = md_dev64_to_dev(dev);
   2370 
   2371 	ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV));
   2372 
   2373 	private = cs->cs_strategy_private;
   2374 	flag = cs->cs_strategy_flag;
   2375 
   2376 	md_call_strategy(bp, flag, private);
   2377 }
   2378 
   2379 /*
   2380  * NAME:	genstandardparity
   2381  * DESCRIPTION: This routine
   2382  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
   2383  */
   2384 static void
   2385 genstandardparity(md_raidcs_t *cs)
   2386 {
   2387 	uint_t		*dbuf, *pbuf;
   2388 	size_t		wordcnt;
   2389 	uint_t		dsum = 0;
   2390 	uint_t		psum = 0;
   2391 
   2392 	ASSERT((cs->cs_bcount & 0x3) == 0);
   2393 
   2394 	wordcnt = cs->cs_bcount / sizeof (uint_t);
   2395 
   2396 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
   2397 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
   2398 
   2399 	/* Word aligned */
   2400 	if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
   2401 		uint_t	*uwbuf = (uint_t *)(void *)(cs->cs_addr);
   2402 		uint_t	uval;
   2403 
   2404 		while (wordcnt--) {
   2405 			uval = *uwbuf++;
   2406 			psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval));
   2407 			++pbuf;
   2408 			*dbuf = uval;
   2409 			dsum ^= uval;
   2410 			++dbuf;
   2411 		}
   2412 	} else {
   2413 		uchar_t	*ubbuf = (uchar_t *)(cs->cs_addr);
   2414 		union {
   2415 			uint_t	wb;
   2416 			uchar_t	bb[4];
   2417 		} cb;
   2418 
   2419 		while (wordcnt--) {
   2420 			cb.bb[0] = *ubbuf++;
   2421 			cb.bb[1] = *ubbuf++;
   2422 			cb.bb[2] = *ubbuf++;
   2423 			cb.bb[3] = *ubbuf++;
   2424 			psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb));
   2425 			++pbuf;
   2426 			*dbuf = cb.wb;
   2427 			dsum ^= cb.wb;
   2428 			++dbuf;
   2429 		}
   2430 	}
   2431 
   2432 	RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn,
   2433 	    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
   2434 	    2, cs->cs_dcolumn, RAID_PWMAGIC);
   2435 
   2436 	RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn,
   2437 	    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
   2438 	    2, cs->cs_pcolumn, RAID_PWMAGIC);
   2439 }
   2440 
   2441 static void
   2442 genlineparity(md_raidcs_t *cs)
   2443 {
   2444 
   2445 	mr_unit_t	*un = cs->cs_un;
   2446 	md_raidcbuf_t	*cbuf;
   2447 	uint_t		*pbuf, *dbuf;
   2448 	uint_t		*uwbuf;
   2449 	uchar_t		*ubbuf;
   2450 	size_t		wordcnt;
   2451 	uint_t		psum = 0, dsum = 0;
   2452 	size_t		count = un->un_segsize * DEV_BSIZE;
   2453 	uint_t		col;
   2454 	buf_t		*bp;
   2455 
   2456 	ASSERT((cs->cs_bcount & 0x3) == 0);
   2457 
   2458 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
   2459 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
   2460 	uwbuf = (uint_t *)(void *)(cs->cs_addr);
   2461 	ubbuf = (uchar_t *)(void *)(cs->cs_addr);
   2462 
   2463 	wordcnt = count / sizeof (uint_t);
   2464 
   2465 	/* Word aligned */
   2466 	if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
   2467 		uint_t	 uval;
   2468 
   2469 		while (wordcnt--) {
   2470 			uval = *uwbuf++;
   2471 			*dbuf = uval;
   2472 			*pbuf = uval;
   2473 			dsum ^= uval;
   2474 			++pbuf;
   2475 			++dbuf;
   2476 		}
   2477 	} else {
   2478 		union {
   2479 			uint_t	wb;
   2480 			uchar_t	bb[4];
   2481 		} cb;
   2482 
   2483 		while (wordcnt--) {
   2484 			cb.bb[0] = *ubbuf++;
   2485 			cb.bb[1] = *ubbuf++;
   2486 			cb.bb[2] = *ubbuf++;
   2487 			cb.bb[3] = *ubbuf++;
   2488 			*dbuf = cb.wb;
   2489 			*pbuf = cb.wb;
   2490 			dsum ^= cb.wb;
   2491 			++pbuf;
   2492 			++dbuf;
   2493 		}
   2494 	}
   2495 
   2496 	RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn,
   2497 	    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
   2498 	    un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC);
   2499 
   2500 	raidio(cs, RIO_PREWRITE | RIO_DATA);
   2501 
   2502 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
   2503 
   2504 		dsum = 0;
   2505 		pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
   2506 		dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE);
   2507 
   2508 		wordcnt = count / sizeof (uint_t);
   2509 
   2510 		col = cbuf->cbuf_column;
   2511 
   2512 		/* Word aligned */
   2513 		if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
   2514 			uint_t	uval;
   2515 
   2516 			/*
   2517 			 * Only calculate psum when working on the last
   2518 			 * data buffer.
   2519 			 */
   2520 			if (cbuf->cbuf_next == NULL) {
   2521 				psum = 0;
   2522 				while (wordcnt--) {
   2523 					uval = *uwbuf++;
   2524 					*dbuf = uval;
   2525 					psum ^= (*pbuf ^= uval);
   2526 					dsum ^= uval;
   2527 					++dbuf;
   2528 					++pbuf;
   2529 				}
   2530 			} else {
   2531 				while (wordcnt--) {
   2532 					uval = *uwbuf++;
   2533 					*dbuf = uval;
   2534 					*pbuf ^= uval;
   2535 					dsum ^= uval;
   2536 					++dbuf;
   2537 					++pbuf;
   2538 				}
   2539 			}
   2540 		} else {
   2541 			union {
   2542 				uint_t	wb;
   2543 				uchar_t	bb[4];
   2544 			} cb;
   2545 
   2546 			/*
   2547 			 * Only calculate psum when working on the last
   2548 			 * data buffer.
   2549 			 */
   2550 			if (cbuf->cbuf_next == NULL) {
   2551 				psum = 0;
   2552 				while (wordcnt--) {
   2553 					cb.bb[0] = *ubbuf++;
   2554 					cb.bb[1] = *ubbuf++;
   2555 					cb.bb[2] = *ubbuf++;
   2556 					cb.bb[3] = *ubbuf++;
   2557 					*dbuf = cb.wb;
   2558 					psum ^= (*pbuf ^= cb.wb);
   2559 					dsum ^= cb.wb;
   2560 					++dbuf;
   2561 					++pbuf;
   2562 				}
   2563 			} else {
   2564 				while (wordcnt--) {
   2565 					cb.bb[0] = *ubbuf++;
   2566 					cb.bb[1] = *ubbuf++;
   2567 					cb.bb[2] = *ubbuf++;
   2568 					cb.bb[3] = *ubbuf++;
   2569 					*dbuf = cb.wb;
   2570 					*pbuf ^= cb.wb;
   2571 					dsum ^= cb.wb;
   2572 					++dbuf;
   2573 					++pbuf;
   2574 				}
   2575 			}
   2576 		}
   2577 		RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn,
   2578 		    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
   2579 		    un->un_totalcolumncnt, col, RAID_PWMAGIC);
   2580 
   2581 		/*
   2582 		 * fill in buffer for write to prewrite area
   2583 		 */
   2584 		bp = &cbuf->cbuf_bp;
   2585 		bp->b_un.b_addr = cbuf->cbuf_buffer;
   2586 		bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE;
   2587 		bp->b_bufsize = bp->b_bcount;
   2588 		bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) +
   2589 		    un->un_column[col].un_pwstart;
   2590 		bp->b_flags = B_WRITE | B_BUSY;
   2591 		if (nv_available && nv_prewrite)
   2592 			bp->b_flags |= nv_available;
   2593 		bp->b_iodone = (int (*)())raid_done;
   2594 		bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev);
   2595 		bp->b_chain = (struct buf *)cs;
   2596 		md_call_strategy(bp,
   2597 		    cs->cs_strategy_flag, cs->cs_strategy_private);
   2598 	}
   2599 
   2600 	RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn,
   2601 	    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
   2602 	    un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC);
   2603 
   2604 	raidio(cs, RIO_PREWRITE | RIO_PARITY);
   2605 }
   2606 
   2607 /*
   2608  * NAME:	raid_readregenloop
   2609  * DESCRIPTION: RAID metadevice write routine
   2610  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
   2611  */
   2612 static void
   2613 raid_readregenloop(md_raidcs_t *cs)
   2614 {
   2615 	mr_unit_t	*un;
   2616 	md_raidps_t	*ps;
   2617 	uint_t		*dbuf;
   2618 	uint_t		*pbuf;
   2619 	size_t		wordcnt;
   2620 
   2621 	un = cs->cs_un;
   2622 
   2623 	/*
   2624 	 * XOR the parity with data bytes, must skip the
   2625 	 * pre-write entry header in all data/parity buffers
   2626 	 */
   2627 	wordcnt = cs->cs_bcount / sizeof (uint_t);
   2628 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
   2629 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
   2630 	while (wordcnt--)
   2631 		*dbuf++ ^= *pbuf++;
   2632 
   2633 	/* bump up the loop count */
   2634 	cs->cs_loop++;
   2635 
   2636 	/* skip the errored component */
   2637 	if (cs->cs_loop == cs->cs_dcolumn)
   2638 		cs->cs_loop++;
   2639 
   2640 	if (cs->cs_loop != un->un_totalcolumncnt) {
   2641 		cs->cs_frags = 1;
   2642 		raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
   2643 		return;
   2644 	}
   2645 	/* reaching the end sof loop */
   2646 	ps = cs->cs_ps;
   2647 	bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount);
   2648 	raid_free_child(cs, 1);
   2649 
   2650 	/* decrement readfrags */
   2651 	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
   2652 }
   2653 
   2654 /*
   2655  * NAME:	raid_read_io
   2656  * DESCRIPTION: RAID metadevice read I/O routine
   2657  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
   2658  *		md_raidcs_t *cs - pointer to a child structure
   2659  */
   2660 static void
   2661 raid_read_io(mr_unit_t *un, md_raidcs_t *cs)
   2662 {
   2663 	int	flag;
   2664 	void	*private;
   2665 	buf_t	*bp;
   2666 	buf_t	*pb = cs->cs_ps->ps_bp;
   2667 	mr_column_t	*column;
   2668 
   2669 	flag = cs->cs_strategy_flag;
   2670 	private = cs->cs_strategy_private;
   2671 	column = &un->un_column[cs->cs_dcolumn];
   2672 
   2673 	/*
   2674 	 * The component to be read is good, simply set up bp structure
   2675 	 * and call low level md routine doing the read.
   2676 	 */
   2677 
   2678 	if (COLUMN_ISOKAY(un, cs->cs_dcolumn) ||
   2679 	    (COLUMN_ISLASTERR(un, cs->cs_dcolumn) &&
   2680 	    (cs->cs_flags & MD_RCS_RECOVERY) == 0)) {
   2681 		dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */
   2682 		ddi_dev = md_dev64_to_dev(column->un_dev);
   2683 
   2684 		bp = &cs->cs_dbuf;
   2685 		bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev,
   2686 		    column->un_devstart + cs->cs_blkno,
   2687 		    (int (*)())raid_done, bp, KM_NOSLEEP);
   2688 
   2689 		bp->b_chain = (buf_t *)cs;
   2690 
   2691 		cs->cs_frags = 1;
   2692 		cs->cs_error_call = raid_read_error;
   2693 		cs->cs_retry_call = raid_read_retry;
   2694 		cs->cs_flags |= MD_RCS_ISCALL;
   2695 		cs->cs_stage = RAID_READ_DONE;
   2696 		cs->cs_call = raid_stage;
   2697 
   2698 		ASSERT(bp->b_edev != 0);
   2699 
   2700 		md_call_strategy(bp, flag, private);
   2701 		return;
   2702 	}
   2703 
   2704 	/*
   2705 	 * The component to be read is bad, have to go through
   2706 	 * raid specific method to read data from other members.
   2707 	 */
   2708 	cs->cs_loop = 0;
   2709 	/*
   2710 	 * NOTE: always get dbuffer before pbuffer
   2711 	 *	 and get both buffers before pwslot
   2712 	 *	 otherwise a deadlock could be introduced.
   2713 	 */
   2714 	raid_mapin_buf(cs);
   2715 	getdbuffer(cs);
   2716 	getpbuffer(cs);
   2717 	if (cs->cs_loop == cs->cs_dcolumn)
   2718 		cs->cs_loop++;
   2719 
   2720 	/* zero out data buffer for use as a data sink */
   2721 	bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount);
   2722 	cs->cs_stage = RAID_NONE;
   2723 	cs->cs_call = raid_readregenloop;
   2724 	cs->cs_error_call = raid_read_error;
   2725 	cs->cs_retry_call = raid_read_no_retry;
   2726 	cs->cs_frags = 1;
   2727 
   2728 	/* use parity buffer to read other columns */
   2729 	raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
   2730 }
   2731 
   2732 /*
   2733  * NAME:	raid_read
   2734  * DESCRIPTION: RAID metadevice write routine
   2735  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
   2736  *		md_raidcs_t *cs - pointer to a child structure
   2737  */
   2738 static int
   2739 raid_read(mr_unit_t *un, md_raidcs_t *cs)
   2740 {
   2741 	int		error = 0;
   2742 	md_raidps_t	*ps;
   2743 	mdi_unit_t	*ui;
   2744 	minor_t		mnum;
   2745 
   2746 	ASSERT(IO_READER_HELD(un));
   2747 	ps = cs->cs_ps;
   2748 	ui = ps->ps_ui;
   2749 	raid_line_reader_lock(cs, 0);
   2750 	un = (mr_unit_t *)md_unit_readerlock(ui);
   2751 	ASSERT(UNIT_STATE(un) != RUS_INIT);
   2752 	mnum = MD_SID(un);
   2753 	cs->cs_un = un;
   2754 
   2755 	/* make sure the read doesn't go beyond the end of the column */
   2756 	if (cs->cs_blkno + cs->cs_blkcnt >
   2757 	    un->un_segsize * un->un_segsincolumn) {
   2758 		error = ENXIO;
   2759 	}
   2760 	if (error)
   2761 		goto rerror;
   2762 
   2763 	if (un->un_state & RUS_REGEN) {
   2764 		raid_regen_parity(cs);
   2765 		un = MD_UNIT(mnum);
   2766 		cs->cs_un = un;
   2767 	}
   2768 
   2769 	raid_read_io(un, cs);
   2770 	return (0);
   2771 
   2772 rerror:
   2773 	raid_error_parent(ps, error);
   2774 	raid_free_child(cs, 1);
   2775 	/* decrement readfrags */
   2776 	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
   2777 	return (0);
   2778 }
   2779 
   2780 /*
   2781  * NAME:	raid_write_err_retry
   2782  * DESCRIPTION: RAID metadevice write retry routine
   2783  *		write was for parity or data only;
   2784  *		complete write with error, no recovery possible
   2785  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
   2786  *		md_raidcs_t *cs - pointer to a child structure
   2787  */
   2788 /*ARGSUSED*/
   2789 static void
   2790 raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs)
   2791 {
   2792 	md_raidps_t	*ps = cs->cs_ps;
   2793 	int		flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;
   2794 
   2795 	/* decrement pwfrags if needed, and frags */
   2796 	if (!(cs->cs_flags & MD_RCS_PWDONE))
   2797 		flags |= RFP_DECR_PWFRAGS;
   2798 	raid_error_parent(ps, EIO);
   2799 	raid_free_child(cs, 1);
   2800 	raid_free_parent(ps, flags);
   2801 }
   2802 
   2803 /*
   2804  * NAME:	raid_write_err_retry
   2805  * DESCRIPTION: RAID metadevice write retry routine
   2806  *		 write is too far along to retry and parent
   2807  *		 has already been signaled with iodone.
   2808  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
   2809  *		md_raidcs_t *cs - pointer to a child structure
   2810  */
   2811 /*ARGSUSED*/
   2812 static void
   2813 raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs)
   2814 {
   2815 	md_raidps_t	*ps = cs->cs_ps;
   2816 	int		flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;
   2817 
   2818 	/* decrement pwfrags if needed, and frags */
   2819 	if (!(cs->cs_flags & MD_RCS_PWDONE))
   2820 		flags |= RFP_DECR_PWFRAGS;
   2821 	raid_free_child(cs, 1);
   2822 	raid_free_parent(ps, flags);
   2823 }
   2824 
   2825 /*
   2826  * NAME:	raid_write_retry
   2827  * DESCRIPTION: RAID metadevice write retry routine
   2828  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
   2829  *		md_raidcs_t *cs - pointer to a child structure
   2830  */
   2831 static void
   2832 raid_write_retry(mr_unit_t *un, md_raidcs_t *cs)
   2833 {
   2834 	md_raidps_t	*ps;
   2835 
   2836 	ps = cs->cs_ps;
   2837 
   2838 	/* re-initialize the buf_t structure for raid_write() */
   2839 	cs->cs_dbuf.b_chain = (struct buf *)cs;
   2840 	cs->cs_dbuf.b_back = &cs->cs_dbuf;
   2841 	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
   2842 	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
   2843 	cs->cs_dbuf.b_error = 0;	/* initialize error */
   2844 	cs->cs_dbuf.b_offset = -1;
   2845 	/* Initialize semaphores */
   2846 	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
   2847 	    SEMA_DEFAULT, NULL);
   2848 	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
   2849 	    SEMA_DEFAULT, NULL);
   2850 
   2851 	cs->cs_pbuf.b_chain = (struct buf *)cs;
   2852 	cs->cs_pbuf.b_back = &cs->cs_pbuf;
   2853 	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
   2854 	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
   2855 	cs->cs_pbuf.b_error = 0;	/* initialize error */
   2856 	cs->cs_pbuf.b_offset = -1;
   2857 	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
   2858 	    SEMA_DEFAULT, NULL);
   2859 	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
   2860 	    SEMA_DEFAULT, NULL);
   2861 
   2862 	cs->cs_hbuf.b_chain = (struct buf *)cs;
   2863 	cs->cs_hbuf.b_back = &cs->cs_hbuf;
   2864 	cs->cs_hbuf.b_forw = &cs->cs_hbuf;
   2865 	cs->cs_hbuf.b_flags = B_BUSY;	/* initialize flags */
   2866 	cs->cs_hbuf.b_error = 0;	/* initialize error */
   2867 	cs->cs_hbuf.b_offset = -1;
   2868 	sema_init(&cs->cs_hbuf.b_io, 0, NULL,
   2869 	    SEMA_DEFAULT, NULL);
   2870 	sema_init(&cs->cs_hbuf.b_sem, 0, NULL,
   2871 	    SEMA_DEFAULT, NULL);
   2872 
   2873 	cs->cs_flags &= ~(MD_RCS_ERROR);
   2874 	/*
   2875 	 * If we have already done'ed the i/o but have done prewrite
   2876 	 * on this child, then reset PWDONE flag and bump pwfrags before
   2877 	 * restarting i/o.
   2878 	 * If pwfrags is zero, we have already 'iodone'd the i/o so
   2879 	 * leave things alone.  We don't want to re-'done' it.
   2880 	 */
   2881 	mutex_enter(&ps->ps_mx);
   2882 	if (cs->cs_flags & MD_RCS_PWDONE) {
   2883 		cs->cs_flags &= ~MD_RCS_PWDONE;
   2884 		ps->ps_pwfrags++;
   2885 	}
   2886 	mutex_exit(&ps->ps_mx);
   2887 	raid_write_io(un, cs);
   2888 }
   2889 
   2890 /*
   2891  * NAME:	raid_wrerr
   2892  * DESCRIPTION: RAID metadevice write routine
   2893  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
   2894  * LOCKS:	must obtain unit writer lock while calling raid_error_state
   2895  *		since a unit or column state transition may take place.
   2896  *		must obtain unit reader lock to retry I/O.
   2897  */
   2898 static void
   2899 raid_wrerr(md_raidcs_t *cs)
   2900 {
   2901 	md_raidps_t	*ps;
   2902 	mdi_unit_t	*ui;
   2903 	mr_unit_t	*un;
   2904 	md_raidcbuf_t	*cbuf;
   2905 
   2906 	ps = cs->cs_ps;
   2907 	ui = ps->ps_ui;
   2908 
   2909 	un = (mr_unit_t *)md_unit_writerlock(ui);
   2910 	ASSERT(un != 0);
   2911 
   2912 	if (cs->cs_dbuf.b_flags & B_ERROR)
   2913 		(void) raid_error_state(un, &cs->cs_dbuf);
   2914 	if (cs->cs_pbuf.b_flags & B_ERROR)
   2915 		(void) raid_error_state(un, &cs->cs_pbuf);
   2916 	if (cs->cs_hbuf.b_flags & B_ERROR)
   2917 		(void) raid_error_state(un, &cs->cs_hbuf);
   2918 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
   2919 		if (cbuf->cbuf_bp.b_flags & B_ERROR)
   2920 			(void) raid_error_state(un, &cbuf->cbuf_bp);
   2921 
   2922 	md_unit_writerexit(ui);
   2923 
   2924 	ps->ps_flags |= MD_RPS_HSREQ;
   2925 
   2926 	un = (mr_unit_t *)md_unit_readerlock(ui);
   2927 
   2928 	/* now attempt the appropriate retry routine */
   2929 	(*(cs->cs_retry_call))(un, cs);
   2930 }
   2931 /*
   2932  * NAMES:	raid_write_error
   2933  * DESCRIPTION: I/O error handling routine for a RAID metadevice write
   2934  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
   2935  */
   2936 /*ARGSUSED*/
   2937 static void
   2938 raid_write_error(md_raidcs_t *cs)
   2939 {
   2940 	md_raidps_t	*ps;
   2941 	mdi_unit_t	*ui;
   2942 	mr_unit_t	*un;
   2943 	md_raidcbuf_t	*cbuf;
   2944 	set_t		setno;
   2945 
   2946 	ps = cs->cs_ps;
   2947 	ui = ps->ps_ui;
   2948 	un = cs->cs_un;
   2949 
   2950 	setno = MD_UN2SET(un);
   2951 
   2952 	/*
   2953 	 * locate each buf that is in error on this io and then
   2954 	 * output an error message
   2955 	 */
   2956 	if ((cs->cs_dbuf.b_flags & B_ERROR) &&
   2957 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
   2958 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
   2959 		cmn_err(CE_WARN, "md %s: write error on %s",
   2960 		    md_shortname(MD_SID(un)),
   2961 		    md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0));
   2962 
   2963 	if ((cs->cs_pbuf.b_flags & B_ERROR) &&
   2964 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
   2965 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
   2966 		cmn_err(CE_WARN, "md %s: write error on %s",
   2967 		    md_shortname(MD_SID(un)),
   2968 		    md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0));
   2969 
   2970 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
   2971 		if ((cbuf->cbuf_bp.b_flags & B_ERROR) &&
   2972 		    (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) &&
   2973 		    (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED))
   2974 			cmn_err(CE_WARN, "md %s: write error on %s",
   2975 			    md_shortname(MD_SID(un)),
   2976 			    md_devname(setno, md_expldev(cbuf->cbuf_bp.b_edev),
   2977 			    NULL, 0));
   2978 
   2979 	md_unit_readerexit(ui);
   2980 
   2981 	ASSERT(cs->cs_frags == 0);
   2982 
   2983 	/* now schedule processing for possible state change */
   2984 	daemon_request(&md_mstr_daemon, raid_wrerr,
   2985 	    (daemon_queue_t *)cs, REQ_OLD);
   2986 
   2987 }
   2988 
   2989 /*
   2990  * NAME:	raid_write_ponly
   2991  * DESCRIPTION: RAID metadevice write routine
   2992  *		in the case where only the parity column can be written
   2993  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
   2994  */
   2995 static void
   2996 raid_write_ponly(md_raidcs_t *cs)
   2997 {
   2998 	md_raidps_t	*ps;
   2999 	mr_unit_t	*un = cs->cs_un;
   3000 
   3001 	ps = cs->cs_ps;
   3002 	/* decrement pwfrags if needed, but not frags */
   3003 	ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
   3004 	raid_free_parent(ps, RFP_DECR_PWFRAGS);
   3005 	cs->cs_flags |= MD_RCS_PWDONE;
   3006 	cs->cs_frags = 1;
   3007 	cs->cs_stage = RAID_WRITE_PONLY_DONE;
   3008 	cs->cs_call = raid_stage;
   3009 	cs->cs_error_call = raid_write_error;
   3010 	cs->cs_retry_call = raid_write_no_retry;
   3011 	if (WRITE_ALT(un, cs->cs_pcolumn)) {
   3012 		cs->cs_frags++;
   3013 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE);
   3014 	}
   3015 	raidio(cs, RIO_PARITY | RIO_WRITE);
   3016 }
   3017 
   3018 /*
   3019  * NAME:	raid_write_ploop
   3020  * DESCRIPTION: RAID metadevice write routine, constructs parity from
   3021  *		data in other columns.
   3022  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
   3023  */
   3024 static void
   3025 raid_write_ploop(md_raidcs_t *cs)
   3026 {
   3027 	mr_unit_t *un = cs->cs_un;
   3028 	uint_t *dbuf;
   3029 	uint_t *pbuf;
   3030 	size_t wordcnt;
   3031 	uint_t psum = 0;
   3032 
   3033 	wordcnt = cs->cs_bcount / sizeof (uint_t);
   3034 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
   3035 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
   3036 	while (wordcnt--)
   3037 		*pbuf++ ^= *dbuf++;
   3038 	cs->cs_loop++;
   3039 
   3040 	/*
   3041 	 * build parity from scratch using new data,
   3042 	 * skip reading the data and parity columns.
   3043 	 */
   3044 	while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn)
   3045 		cs->cs_loop++;
   3046 
   3047 	if (cs->cs_loop != un->un_totalcolumncnt) {
   3048 		cs->cs_frags = 1;
   3049 		raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
   3050 		return;
   3051 	}
   3052 
   3053 	/* construct checksum for parity buffer */
   3054 	wordcnt = cs->cs_bcount / sizeof (uint_t);
   3055 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
   3056 	while (wordcnt--) {
   3057 		psum ^= *pbuf;
   3058 		pbuf++;
   3059 	}
   3060 	RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1,
   3061 	    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
   3062 	    1, cs->cs_pcolumn, RAID_PWMAGIC);
   3063 
   3064 	cs->cs_stage = RAID_NONE;
   3065 	cs->cs_call = raid_write_ponly;
   3066 	cs->cs_error_call = raid_write_error;
   3067 	cs->cs_retry_call = raid_write_err_retry;
   3068 	cs->cs_frags = 1;
   3069 	if (WRITE_ALT(un, cs->cs_pcolumn)) {
   3070 		cs->cs_frags++;
   3071 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
   3072 	}
   3073 	raidio(cs, RIO_PARITY | RIO_PREWRITE);
   3074 }
   3075 
   3076 /*
   3077  * NAME:	raid_write_donly
   3078  * DESCRIPTION: RAID metadevice write routine
   3079  *		Completed writing data to prewrite entry
   3080  *		in the case where only the data column can be written
   3081  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
   3082  */
   3083 static void
   3084 raid_write_donly(md_raidcs_t *cs)
   3085 {
   3086 	md_raidps_t	*ps;
   3087 	mr_unit_t	*un = cs->cs_un;
   3088 
   3089 	ps = cs->cs_ps;
   3090 	/* WARNING: don't release unit reader lock here... */
   3091 	/* decrement pwfrags if needed, but not frags */
   3092 	ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
   3093 	raid_free_parent(ps, RFP_DECR_PWFRAGS);
   3094 	cs->cs_flags |= MD_RCS_PWDONE;
   3095 	cs->cs_frags = 1;
   3096 	cs->cs_stage = RAID_WRITE_DONLY_DONE;
   3097 	cs->cs_call = raid_stage;
   3098 	cs->cs_error_call = raid_write_error;
   3099 	cs->cs_retry_call = raid_write_err_retry;
   3100 	if (WRITE_ALT(un, cs->cs_dcolumn)) {
   3101 		cs->cs_frags++;
   3102 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
   3103 	}
   3104 	raidio(cs, RIO_DATA | RIO_WRITE);
   3105 }
   3106 
   3107 /*
   3108  * NAME:	raid_write_got_old
   3109  * DESCRIPTION: RAID metadevice write routine
   3110  *		completed read of old data and old parity
   3111  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
   3112  */
   3113 static void
   3114 raid_write_got_old(md_raidcs_t *cs)
   3115 {
   3116 	mr_unit_t *un = cs->cs_un;
   3117 
   3118 	ASSERT(IO_READER_HELD(cs->cs_un));
   3119 	ASSERT(UNIT_READER_HELD(cs->cs_un));
   3120 
   3121 	raid_mapin_buf(cs);
   3122 	genstandardparity(cs);
   3123 	cs->cs_frags = 2;
   3124 	cs->cs_call = raid_stage;
   3125 	cs->cs_stage = RAID_PREWRITE_DONE;
   3126 	cs->cs_error_call = raid_write_error;
   3127 	cs->cs_retry_call = raid_write_retry;
   3128 
   3129 	if (WRITE_ALT(un, cs->cs_dcolumn)) {
   3130 		cs->cs_frags++;
   3131 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE);
   3132 	}
   3133 
   3134 	if (WRITE_ALT(un, cs->cs_pcolumn)) {
   3135 		cs->cs_frags++;
   3136 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
   3137 	}
   3138 	ASSERT(cs->cs_frags < 4);
   3139 	raidio(cs,  RIO_DATA | RIO_PREWRITE);
   3140 	raidio(cs,  RIO_PARITY | RIO_PREWRITE);
   3141 }
   3142 
   3143 /*
   3144  * NAME:	raid_write_io
   3145  * DESCRIPTION: RAID metadevice write I/O routine
   3146  * PARAMETERS:	mr_unit_t *un -  pointer to a unit structure
   3147  *		md_raidcs_t *cs - pointer to a child structure
   3148  */
   3149 
   3150 /*ARGSUSED*/
   3151 static void
   3152 raid_write_io(mr_unit_t *un, md_raidcs_t *cs)
   3153 {
   3154 	md_raidps_t	*ps = cs->cs_ps;
   3155 	uint_t		*dbuf;
   3156 	uint_t		*ubuf;
   3157 	size_t		wordcnt;
   3158 	uint_t		dsum = 0;
   3159 	int		pcheck;
   3160 	int		dcheck;
   3161 
   3162 	ASSERT((un->un_column[cs->cs_pcolumn].un_devstate &
   3163 	    RCS_INIT) == 0);
   3164 	ASSERT((un->un_column[cs->cs_dcolumn].un_devstate &
   3165 	    RCS_INIT) == 0);
   3166 	ASSERT(IO_READER_HELD(un));
   3167 	ASSERT(UNIT_READER_HELD(un));
   3168 	ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS);
   3169 	if (cs->cs_flags & MD_RCS_LINE) {
   3170 
   3171 		mr_unit_t	*un = cs->cs_un;
   3172 
   3173 		ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt);
   3174 		raid_mapin_buf(cs);
   3175 		cs->cs_frags = un->un_origcolumncnt;
   3176 		cs->cs_call = raid_stage;
   3177 		cs->cs_error_call = raid_write_error;
   3178 		cs->cs_retry_call = raid_write_no_retry;
   3179 		cs->cs_stage = RAID_LINE_PWDONE;
   3180 		genlineparity(cs);
   3181 		return;
   3182 	}
   3183 
   3184 	pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]);
   3185 	dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]);
   3186 	cs->cs_resync_check = pcheck << RCL_PARITY_OFFSET || dcheck;
   3187 
   3188 	if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) {
   3189 		int err = EIO;
   3190 
   3191 		if ((un->un_column[cs->cs_pcolumn].un_devstate ==
   3192 		    RCS_LAST_ERRED) ||
   3193 		    (un->un_column[cs->cs_dcolumn].un_devstate ==
   3194 		    RCS_LAST_ERRED))
   3195 			err = ENXIO;
   3196 		raid_error_parent(ps, err);
   3197 		ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
   3198 		raid_free_child(cs, 1);
   3199 		raid_free_parent(ps,  RFP_DECR_FRAGS
   3200 		    | RFP_RLS_LOCK | RFP_DECR_PWFRAGS);
   3201 		return;
   3202 	}
   3203 
   3204 	if (pcheck & RCL_ERRED) {
   3205 		/*
   3206 		 * handle case of only having data drive
   3207 		 */
   3208 		raid_mapin_buf(cs);
   3209 		wordcnt = cs->cs_bcount / sizeof (uint_t);
   3210 
   3211 		dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
   3212 		ubuf = (uint_t *)(void *)(cs->cs_addr);
   3213 
   3214 		while (wordcnt--) {
   3215 			*dbuf = *ubuf;
   3216 			dsum ^= *ubuf;
   3217 			dbuf++;
   3218 			ubuf++;
   3219 		}
   3220 		RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1,
   3221 		    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
   3222 		    1, cs->cs_dcolumn, RAID_PWMAGIC);
   3223 		cs->cs_frags = 1;
   3224 		cs->cs_stage = RAID_NONE;
   3225 		cs->cs_call = raid_write_donly;
   3226 		cs->cs_error_call = raid_write_error;
   3227 		cs->cs_retry_call = raid_write_err_retry;
   3228 		if (WRITE_ALT(un, cs->cs_dcolumn)) {
   3229 			cs->cs_frags++;
   3230 			raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA |
   3231 			    RIO_PREWRITE);
   3232 		}
   3233 		raidio(cs, RIO_DATA | RIO_PREWRITE);
   3234 		return;
   3235 	}
   3236 
   3237 	if (dcheck & RCL_ERRED) {
   3238 		/*
   3239 		 * handle case of only having parity drive
   3240 		 * build parity from scratch using new data,
   3241 		 * skip reading the data and parity columns.
   3242 		 */
   3243 		raid_mapin_buf(cs);
   3244 		cs->cs_loop = 0;
   3245 		while (cs->cs_loop == cs->cs_dcolumn ||
   3246 		    cs->cs_loop == cs->cs_pcolumn)
   3247 			cs->cs_loop++;
   3248 
   3249 		/* copy new data in to begin building parity */
   3250 		bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount);
   3251 		cs->cs_stage = RAID_NONE;
   3252 		cs->cs_call = raid_write_ploop;
   3253 		cs->cs_error_call = raid_write_error;
   3254 		cs->cs_retry_call = raid_write_err_retry;
   3255 		cs->cs_frags = 1;
   3256 		raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
   3257 		return;
   3258 	}
   3259 	/*
   3260 	 * handle normal cases
   3261 	 * read old data and old parity
   3262 	 */
   3263 	cs->cs_frags = 2;
   3264 	cs->cs_stage = RAID_NONE;
   3265 	cs->cs_call = raid_write_got_old;
   3266 	cs->cs_error_call = raid_write_error;
   3267 	cs->cs_retry_call = raid_write_retry;
   3268 	ASSERT(ps->ps_magic == RAID_PSMAGIC);
   3269 	raidio(cs, RIO_DATA | RIO_READ);
   3270 	raidio(cs, RIO_PARITY | RIO_READ);
   3271 }
   3272 
   3273 static void
   3274 raid_enqueue(md_raidcs_t *cs)
   3275 {
   3276 	mdi_unit_t	*ui = cs->cs_ps->ps_ui;
   3277 	kmutex_t	*io_list_mutex = &ui->ui_io_lock->io_list_mutex;
   3278 	md_raidcs_t	*cs1;
   3279 
   3280 	mutex_enter(io_list_mutex);
   3281 	ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
   3282 	if (ui->ui_io_lock->io_list_front == NULL) {
   3283 		ui->ui_io_lock->io_list_front = cs;
   3284 		ui->ui_io_lock->io_list_back = cs;
   3285 	} else {
   3286 		cs1 = ui->ui_io_lock->io_list_back;
   3287 		cs1->cs_linlck_next = cs;
   3288 		ui->ui_io_lock->io_list_back = cs;
   3289 	}
   3290 	STAT_INC(raid_write_waits);
   3291 	STAT_MAX(raid_max_write_q_length, raid_write_queue_length);
   3292 	cs->cs_linlck_next = NULL;
   3293 	mutex_exit(io_list_mutex);
   3294 }
   3295 
   3296 /*
   3297  * NAME:	raid_write
   3298  * DESCRIPTION: RAID metadevice write routine
   3299  * PARAMETERS:	mr_unit_t *un -  pointer to a unit structure
   3300  *		md_raidcs_t *cs - pointer to a child structure
   3301  */
   3302 
   3303 /*ARGSUSED*/
   3304 static int
   3305 raid_write(mr_unit_t *un, md_raidcs_t *cs)
   3306 {
   3307 	int		error = 0;
   3308 	md_raidps_t	*ps;
   3309 	mdi_unit_t	*ui;
   3310 	minor_t		mnum;
   3311 
   3312 	ASSERT(IO_READER_HELD(un));
   3313 	ps = cs->cs_ps;
   3314 	ui = ps->ps_ui;
   3315 
   3316 	ASSERT(UNIT_STATE(un) != RUS_INIT);
   3317 	if (UNIT_STATE(un) == RUS_LAST_ERRED)
   3318 		error = EIO;
   3319 
   3320 	/* make sure the write doesn't go beyond the column */
   3321 	if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn)
   3322 		error = ENXIO;
   3323 	if (error)
   3324 		goto werror;
   3325 
   3326 	getresources(cs);
   3327 
   3328 	/*
   3329 	 * this is an advisory loop that keeps the waiting lists short
   3330 	 * to reduce cpu time.  Since there is a race introduced by not
   3331 	 * aquiring all the correct mutexes, use a cv_timedwait to be
   3332 	 * sure the write always will wake up and start.
   3333 	 */
   3334 	while (raid_check_pw(cs)) {
   3335 		mutex_enter(&un->un_mx);
   3336 		un->un_rflags |= MD_RFLAG_NEEDPW;
   3337 		STAT_INC(raid_prewrite_waits);
   3338 		(void) cv_reltimedwait(&un->un_cv, &un->un_mx, md_wr_wait,
   3339 		    TR_CLOCK_TICK);
   3340 		un->un_rflags &= ~MD_RFLAG_NEEDPW;
   3341 		mutex_exit(&un->un_mx);
   3342 	}
   3343 
   3344 	if (raid_line_writer_lock(cs, 1))
   3345 		return (0);
   3346 
   3347 	un = (mr_unit_t *)md_unit_readerlock(ui);
   3348 	cs->cs_un = un;
   3349 	mnum = MD_SID(un);
   3350 
   3351 	if (un->un_state & RUS_REGEN) {
   3352 		raid_regen_parity(cs);
   3353 		un = MD_UNIT(mnum);
   3354 		cs->cs_un = un;
   3355 	}
   3356 
   3357 	raid_write_io(un, cs);
   3358 	return (0);
   3359 werror:
   3360 	/* aquire unit reader lock sinc raid_free_child always drops it */
   3361 	raid_error_parent(ps, error);
   3362 	raid_free_child(cs, 0);
   3363 	/* decrement both pwfrags and frags */
   3364 	raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK);
   3365 	return (0);
   3366 }
   3367 
   3368 
   3369 /*
   3370  * NAMES:	raid_stage
   3371  * DESCRIPTION: post-processing routine for a RAID metadevice
   3372  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
   3373  */
   3374 static void
   3375 raid_stage(md_raidcs_t *cs)
   3376 {
   3377 	md_raidps_t	*ps = cs->cs_ps;
   3378 	mr_unit_t	*un = cs->cs_un;
   3379 	md_raidcbuf_t	*cbuf;
   3380 	buf_t		*bp;
   3381 	void		*private;
   3382 	int		flag;
   3383 
   3384 	switch (cs->cs_stage) {
   3385 	case RAID_READ_DONE:
   3386 		raid_free_child(cs, 1);
   3387 		/* decrement readfrags */
   3388 		raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
   3389 		return;
   3390 
   3391 	case RAID_WRITE_DONE:
   3392 	case RAID_WRITE_PONLY_DONE:
   3393 	case RAID_WRITE_DONLY_DONE:
   3394 		/*
   3395 		 *  Completed writing real parity and/or data.
   3396 		 */
   3397 		ASSERT(cs->cs_flags & MD_RCS_PWDONE);
   3398 		raid_free_child(cs, 1);
   3399 		/* decrement frags but not pwfrags */
   3400 		raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK);
   3401 		return;
   3402 
   3403 	case RAID_PREWRITE_DONE:
   3404 		/*
   3405 		 * completed writing data and parity to prewrite entries
   3406 		 */
   3407 		/*
   3408 		 * WARNING: don't release unit reader lock here..
   3409 		 * decrement pwfrags but not frags
   3410 		 */
   3411 		raid_free_parent(ps, RFP_DECR_PWFRAGS);
   3412 		cs->cs_flags |= MD_RCS_PWDONE;
   3413 		cs->cs_frags = 2;
   3414 		cs->cs_stage = RAID_WRITE_DONE;
   3415 		cs->cs_call = raid_stage;
   3416 		cs->cs_error_call = raid_write_error;
   3417 		cs->cs_retry_call = raid_write_no_retry;
   3418 		if (WRITE_ALT(un, cs->cs_pcolumn)) {
   3419 			cs->cs_frags++;
   3420 			raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY |
   3421 			    RIO_WRITE);
   3422 		}
   3423 		if (WRITE_ALT(un, cs->cs_dcolumn)) {
   3424 			cs->cs_frags++;
   3425 			raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
   3426 		}
   3427 		ASSERT(cs->cs_frags < 4);
   3428 		raidio(cs, RIO_DATA | RIO_WRITE);
   3429 		raidio(cs, RIO_PARITY | RIO_WRITE);
   3430 		if (cs->cs_pw_inval_list) {
   3431 			raid_free_pwinvalidate(cs);
   3432 		}
   3433 		return;
   3434 
   3435 	case RAID_LINE_PWDONE:
   3436 		ASSERT(cs->cs_frags == 0);
   3437 		raid_free_parent(ps, RFP_DECR_PWFRAGS);
   3438 		cs->cs_flags |= MD_RCS_PWDONE;
   3439 		cs->cs_frags = un->un_origcolumncnt;
   3440 		cs->cs_call = raid_stage;
   3441 		cs->cs_error_call = raid_write_error;
   3442 		cs->cs_retry_call = raid_write_no_retry;
   3443 		cs->cs_stage = RAID_WRITE_DONE;
   3444 		for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
   3445 			/*
   3446 			 * fill in buffer for write to prewrite area
   3447 			 */
   3448 			bp = &cbuf->cbuf_bp;
   3449 			bp->b_back = bp;
   3450 			bp->b_forw = bp;
   3451 			bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE;
   3452 			bp->b_bcount = cbuf->cbuf_bcount;
   3453 			bp->b_bufsize = cbuf->cbuf_bcount;
   3454 			bp->b_lblkno =
   3455 			    un->un_column[cbuf->cbuf_column].un_devstart +
   3456 			    cs->cs_blkno;
   3457 			bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR);
   3458 			bp->b_flags &= ~nv_available;
   3459 			bp->b_flags |= B_WRITE | B_BUSY;
   3460 			bp->b_iodone = (int (*)())raid_done;
   3461 			bp->b_edev = md_dev64_to_dev(
   3462 			    un->un_column[cbuf->cbuf_column].un_dev);
   3463 			bp->b_chain = (struct buf *)cs;
   3464 			private = cs->cs_strategy_private;
   3465 			flag = cs->cs_strategy_flag;
   3466 			md_call_strategy(bp, flag, private);
   3467 		}
   3468 		raidio(cs, RIO_DATA | RIO_WRITE);
   3469 		raidio(cs, RIO_PARITY | RIO_WRITE);
   3470 		if (cs->cs_pw_inval_list) {
   3471 			raid_free_pwinvalidate(cs);
   3472 		}
   3473 		return;
   3474 
   3475 	default:
   3476 		ASSERT(0);
   3477 		break;
   3478 	}
   3479 }
   3480 /*
   3481  * NAME:	md_raid_strategy
   3482  * DESCRIPTION: RAID metadevice I/O oprations entry point.
   3483  * PARAMETERS:	buf_t	  *pb - pointer to a user I/O buffer
   3484  *		int	 flag - metadevice specific flag
   3485  *		void *private - carry over flag ??
   3486  *
   3487  */
   3488 
   3489 void
   3490 md_raid_strategy(buf_t *pb, int flag, void *private)
   3491 {
   3492 	md_raidps_t	*ps;
   3493 	md_raidcs_t	*cs;
   3494 	int		doing_writes;
   3495 	int		err;
   3496 	mr_unit_t	*un;
   3497 	mdi_unit_t	*ui;
   3498 	size_t		count;
   3499 	diskaddr_t	blkno;
   3500 	caddr_t		addr;
   3501 	off_t		offset;
   3502 	int		colcnt;
   3503 	minor_t		mnum;
   3504 	set_t		setno;
   3505 
   3506 	ui = MDI_UNIT(getminor(pb->b_edev));
   3507 	md_kstat_waitq_enter(ui);
   3508 	un = (mr_unit_t *)md_io_readerlock(ui);
   3509 	setno = MD_MIN2SET(getminor(pb->b_edev));
   3510 
   3511 	if ((flag & MD_NOBLOCK) == 0) {
   3512 		if (md_inc_iocount(setno) != 0) {
   3513 			pb->b_flags |= B_ERROR;
   3514 			pb->b_error = ENXIO;
   3515 			pb->b_resid = pb->b_bcount;
   3516 			md_kstat_waitq_exit(ui);
   3517 			md_io_readerexit(ui);
   3518 			biodone(pb);
   3519 			return;
   3520 		}
   3521 	} else {
   3522 		md_inc_iocount_noblock(setno);
   3523 	}
   3524 
   3525 	mnum = MD_SID(un);
   3526 	colcnt = un->un_totalcolumncnt - 1;
   3527 	count = pb->b_bcount;
   3528 
   3529 	STAT_CHECK(raid_512, count == 512);
   3530 	STAT_CHECK(raid_1024, count == 1024);
   3531 	STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192);
   3532 	STAT_CHECK(raid_8192, count == 8192);
   3533 	STAT_CHECK(raid_8192_bigger, count > 8192);
   3534 
   3535 	(void *) md_unit_readerlock(ui);
   3536 	if (!(flag & MD_STR_NOTTOP)) {
   3537 		err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */
   3538 		if (err != 0) {
   3539 			md_kstat_waitq_exit(ui);
   3540 			md_io_readerexit(ui);
   3541 			return;
   3542 		}
   3543 	}
   3544 	md_unit_readerexit(ui);
   3545 
   3546 	STAT_INC(raid_total_io);
   3547 
   3548 	/* allocate a parent structure for the user I/O */
   3549 	ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS);
   3550 	raid_parent_init(ps);
   3551 
   3552 	/*
   3553 	 * Save essential information from the original buffhdr
   3554 	 * in the md_save structure.
   3555 	 */
   3556 	ps->ps_un = un;
   3557 	ps->ps_ui = ui;
   3558 	ps->ps_bp = pb;
   3559 	ps->ps_addr = pb->b_un.b_addr;
   3560 
   3561 	if ((pb->b_flags & B_READ) == 0) {
   3562 		ps->ps_flags |= MD_RPS_WRITE;
   3563 		doing_writes = 1;
   3564 		STAT_INC(raid_writes);
   3565 	} else {
   3566 		ps->ps_flags |= MD_RPS_READ;
   3567 		doing_writes = 0;
   3568 		STAT_INC(raid_reads);
   3569 	}
   3570 
   3571 	count = lbtodb(pb->b_bcount);	/* transfer count (in blocks) */
   3572 	blkno = pb->b_lblkno;		/* block number on device */
   3573 	addr  = 0;
   3574 	offset = 0;
   3575 	ps->ps_pwfrags = 1;
   3576 	ps->ps_frags = 1;
   3577 	md_kstat_waitq_to_runq(ui);
   3578 
   3579 	do {
   3580 		cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS);
   3581 		raid_child_init(cs);
   3582 		cs->cs_ps = ps;
   3583 		cs->cs_un = un;
   3584 		cs->cs_mdunit = mnum;
   3585 		cs->cs_strategy_flag = flag;
   3586 		cs->cs_strategy_private = private;
   3587 		cs->cs_addr = addr;
   3588 		cs->cs_offset = offset;
   3589 		count = raid_iosetup(un, blkno, count, cs);
   3590 		if (cs->cs_flags & MD_RCS_LINE) {
   3591 			blkno += (cs->cs_blkcnt * colcnt);
   3592 			offset += (cs->cs_bcount * colcnt);
   3593 		} else {
   3594 			blkno +=  cs->cs_blkcnt;
   3595 			offset += cs->cs_bcount;
   3596 		}
   3597 		/* for each cs bump up the ps_pwfrags and ps_frags fields */
   3598 		if (count) {
   3599 			mutex_enter(&ps->ps_mx);
   3600 			ps->ps_pwfrags++;
   3601 			ps->ps_frags++;
   3602 			mutex_exit(&ps->ps_mx);
   3603 			if (doing_writes)
   3604 				(void) raid_write(un, cs);
   3605 			else
   3606 				(void) raid_read(un, cs);
   3607 		}
   3608 	} while (count);
   3609 	if (doing_writes) {
   3610 		(void) raid_write(un, cs);
   3611 	} else
   3612 		(void) raid_read(un, cs);
   3613 
   3614 	if (! (flag & MD_STR_NOTTOP) && panicstr) {
   3615 		while (! (ps->ps_flags & MD_RPS_DONE)) {
   3616 			md_daemon(1, &md_done_daemon);
   3617 			drv_usecwait(10);
   3618 		}
   3619 		kmem_cache_free(raid_parent_cache, ps);
   3620 	}
   3621 }
   3622 
   3623 /*
   3624  * NAMES:	raid_snarf
   3625  * DESCRIPTION: RAID metadevice SNARF entry point
   3626  * PARAMETERS:	md_snarfcmd_t cmd,
   3627  *		set_t setno
   3628  * RETURNS:
   3629  */
   3630 static int
   3631 raid_snarf(md_snarfcmd_t cmd, set_t setno)
   3632 {
   3633 	mr_unit_t	*un;
   3634 	mddb_recid_t	recid;
   3635 	int		gotsomething;
   3636 	int		all_raid_gotten;
   3637 	mddb_type_t	typ1;
   3638 	uint_t		ncol;
   3639 	mddb_de_ic_t	*dep;
   3640 	mddb_rb32_t	*rbp;
   3641 	size_t		newreqsize;
   3642 	mr_unit_t	*big_un;
   3643 	mr_unit32_od_t	*small_un;
   3644 
   3645 
   3646 	if (cmd == MD_SNARF_CLEANUP)
   3647 		return (0);
   3648 
   3649 	all_raid_gotten = 1;
   3650 	gotsomething = 0;
   3651 	typ1 = (mddb_type_t)md_getshared_key(setno,
   3652 	    raid_md_ops.md_driver.md_drivername);
   3653 	recid = mddb_makerecid(setno, 0);
   3654 
   3655 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
   3656 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) {
   3657 			continue;
   3658 		}
   3659 
   3660 		dep = mddb_getrecdep(recid);
   3661 		dep->de_flags = MDDB_F_RAID;
   3662 		rbp = dep->de_rb;
   3663 		switch (rbp->rb_revision) {
   3664 		case MDDB_REV_RB:
   3665 		case MDDB_REV_RBFN:
   3666 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
   3667 				/*
   3668 				 * This means, we have an old and small record
   3669 				 * and this record hasn't already been
   3670 				 * converted.  Before we create an incore
   3671 				 * metadevice from this we have to convert it to
   3672 				 * a big record.
   3673 				 */
   3674 				small_un =
   3675 				    (mr_unit32_od_t *)mddb_getrecaddr(recid);
   3676 				ncol = small_un->un_totalcolumncnt;
   3677 				newreqsize = sizeof (mr_unit_t) +
   3678 				    ((ncol - 1) * sizeof (mr_column_t));
   3679 				big_un = (mr_unit_t *)kmem_zalloc(newreqsize,
   3680 				    KM_SLEEP);
   3681 				raid_convert((caddr_t)small_un, (caddr_t)big_un,
   3682 				    SMALL_2_BIG);
   3683 				kmem_free(small_un, dep->de_reqsize);
   3684 				dep->de_rb_userdata = big_un;
   3685 				dep->de_reqsize = newreqsize;
   3686 				un = big_un;
   3687 				rbp->rb_private |= MD_PRV_CONVD;
   3688 			} else {
   3689 				/*
   3690 				 * Record has already been converted.  Just
   3691 				 * get its address.
   3692 				 */
   3693 				un = (mr_unit_t *)mddb_getrecaddr(recid);
   3694 			}
   3695 			un->c.un_revision &= ~MD_64BIT_META_DEV;
   3696 			break;
   3697 		case MDDB_REV_RB64:
   3698 		case MDDB_REV_RB64FN:
   3699 			/* Big device */
   3700 			un = (mr_unit_t *)mddb_getrecaddr(recid);
   3701 			un->c.un_revision |= MD_64BIT_META_DEV;
   3702 			un->c.un_flag |= MD_EFILABEL;
   3703 			break;
   3704 		}
   3705 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
   3706 
   3707 		/*
   3708 		 * Create minor device node for snarfed entry.
   3709 		 */
   3710 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
   3711 
   3712 		if (MD_UNIT(MD_SID(un)) != NULL) {
   3713 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
   3714 			continue;
   3715 		}
   3716 		all_raid_gotten = 0;
   3717 		if (raid_build_incore((void *)un, 1) == 0) {
   3718 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
   3719 			md_create_unit_incore(MD_SID(un), &raid_md_ops, 1);
   3720 			gotsomething = 1;
   3721 		} else if (un->mr_ic) {
   3722 			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
   3723 			    un->un_totalcolumncnt);
   3724 			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
   3725 		}
   3726 	}
   3727 
   3728 	if (!all_raid_gotten) {
   3729 		return (gotsomething);
   3730 	}
   3731 
   3732 	recid = mddb_makerecid(setno, 0);
   3733 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
   3734 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
   3735 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
   3736 
   3737 	return (0);
   3738 }
   3739 
   3740 /*
   3741  * NAMES:	raid_halt
   3742  * DESCRIPTION: RAID metadevice HALT entry point
   3743  * PARAMETERS:	md_haltcmd_t cmd -
   3744  *		set_t	setno -
   3745  * RETURNS:
   3746  */
   3747 static int
   3748 raid_halt(md_haltcmd_t cmd, set_t setno)
   3749 {
   3750 	set_t		i;
   3751 	mdi_unit_t	*ui;
   3752 	minor_t		mnum;
   3753 
   3754 	if (cmd == MD_HALT_CLOSE)
   3755 		return (0);
   3756 
   3757 	if (cmd == MD_HALT_OPEN)
   3758 		return (0);
   3759 
   3760 	if (cmd == MD_HALT_UNLOAD)
   3761 		return (0);
   3762 
   3763 	if (cmd == MD_HALT_CHECK) {
   3764 		for (i = 0; i < md_nunits; i++) {
   3765 			mnum = MD_MKMIN(setno, i);
   3766 			if ((ui = MDI_UNIT(mnum)) == NULL)
   3767 				continue;
   3768 			if (ui->ui_opsindex != raid_md_ops.md_selfindex)
   3769 				continue;
   3770 			if (md_unit_isopen(ui))
   3771 				return (1);
   3772 		}
   3773 		return (0);
   3774 	}
   3775 
   3776 	if (cmd != MD_HALT_DOIT)
   3777 		return (1);
   3778 
   3779 	for (i = 0; i < md_nunits; i++) {
   3780 		mnum = MD_MKMIN(setno, i);
   3781 		if ((ui = MDI_UNIT(mnum)) == NULL)
   3782 			continue;
   3783 		if (ui->ui_opsindex != raid_md_ops.md_selfindex)
   3784 			continue;
   3785 		reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0);
   3786 	}
   3787 	return (0);
   3788 }
   3789 
   3790 /*
   3791  * NAMES:	raid_close_all_devs
   3792  * DESCRIPTION: Close all the devices of the unit.
   3793  * PARAMETERS:	mr_unit_t *un - pointer to unit structure
   3794  * RETURNS:
   3795  */
   3796 void
   3797 raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags)
   3798 {
   3799 	int		i;
   3800 	mr_column_t	*device;
   3801 
   3802 	for (i = 0; i < un->un_totalcolumncnt; i++) {
   3803 		device = &un->un_column[i];
   3804 		if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
   3805 			ASSERT((device->un_dev != (md_dev64_t)0) &&
   3806 			    (device->un_dev != NODEV64));
   3807 			if ((device->un_devstate & RCS_OKAY) && init_pw)
   3808 				(void) init_pw_area(un, device->un_dev,
   3809 				    device->un_pwstart, i);
   3810 			md_layered_close(device->un_dev, md_cflags);
   3811 			device->un_devflags &= ~MD_RAID_DEV_ISOPEN;
   3812 		}
   3813 	}
   3814 }
   3815 
   3816 /*
   3817  * NAMES:	raid_open_all_devs
   3818  * DESCRIPTION: Open all the components (columns) of the device unit.
   3819  * PARAMETERS:	mr_unit_t *un - pointer to unit structure
   3820  * RETURNS:
   3821  */
   3822 static int
   3823 raid_open_all_devs(mr_unit_t *un, int md_oflags)
   3824 {
   3825 	minor_t		mnum = MD_SID(un);
   3826 	int		i;
   3827 	int		not_opened = 0;
   3828 	int		commit = 0;
   3829 	int		col = -1;
   3830 	mr_column_t	*device;
   3831 	set_t		setno = MD_MIN2SET(MD_SID(un));
   3832 	side_t		side = mddb_getsidenum(setno);
   3833 	mdkey_t		key;
   3834 	mdi_unit_t	*ui = MDI_UNIT(mnum);
   3835 
   3836 	ui->ui_tstate &= ~MD_INACCESSIBLE;
   3837 
   3838 	for (i = 0; i < un->un_totalcolumncnt; i++) {
   3839 		md_dev64_t tmpdev;
   3840 
   3841 		device = &un->un_column[i];
   3842 
   3843 		if (COLUMN_STATE(un, i) & RCS_ERRED) {
   3844 			not_opened++;
   3845 			continue;
   3846 		}
   3847 
   3848 		if (device->un_devflags & MD_RAID_DEV_ISOPEN)
   3849 			continue;
   3850 
   3851 		tmpdev = device->un_dev;
   3852 		/*
   3853 		 * Open by device id
   3854 		 */
   3855 		key = HOTSPARED(un, i) ?
   3856 		    device->un_hs_key : device->un_orig_key;
   3857 		if ((md_getmajor(tmpdev) != md_major) &&
   3858 		    md_devid_found(setno, side, key) == 1) {
   3859 			tmpdev = md_resolve_bydevid(mnum, tmpdev, key);
   3860 		}
   3861 		if (md_layered_open(mnum, &tmpdev, md_oflags)) {
   3862 			device->un_dev = tmpdev;
   3863 			not_opened++;
   3864 			continue;
   3865 		}
   3866 		device->un_dev = tmpdev;
   3867 		device->un_devflags |= MD_RAID_DEV_ISOPEN;
   3868 	}
   3869 
   3870 	/* if open errors and errored devices are 1 then device can run */
   3871 	if (not_opened > 1) {
   3872 		cmn_err(CE_WARN,
   3873 		    "md: %s failed to open. open error on %s\n",
   3874 		    md_shortname(MD_SID(un)),
   3875 		    md_devname(MD_UN2SET(un), device->un_orig_dev, NULL, 0));
   3876 
   3877 		ui->ui_tstate |= MD_INACCESSIBLE;
   3878 
   3879 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
   3880 		    MD_UN2SET(un), MD_SID(un));
   3881 
   3882 		return (not_opened > 1);
   3883 	}
   3884 
   3885 	for (i = 0; i < un->un_totalcolumncnt; i++) {
   3886 		device = &un->un_column[i];
   3887 		if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
   3888 			if (device->un_devstate & RCS_LAST_ERRED) {
   3889 			/*
   3890 			 * At this point in time there is a possibility
   3891 			 * that errors were the result of a controller
   3892 			 * failure with more than a single column on it
   3893 			 * so clear out last errored columns and let errors
   3894 			 * re-occur is necessary.
   3895 			 */
   3896 				raid_set_state(un, i, RCS_OKAY, 0);
   3897 				commit++;
   3898 			}
   3899 			continue;
   3900 		}
   3901 		ASSERT(col == -1);
   3902 		col = i;
   3903 	}
   3904 
   3905 	if (col != -1) {
   3906 		raid_set_state(un, col, RCS_ERRED, 0);
   3907 		commit++;
   3908 	}
   3909 
   3910 	if (commit)
   3911 		raid_commit(un, NULL);
   3912 
   3913 	if (col != -1) {
   3914 		if (COLUMN_STATE(un, col) & RCS_ERRED) {
   3915 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
   3916 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
   3917 		} else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
   3918 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
   3919 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
   3920 		}
   3921 	}
   3922 
   3923 	return (0);
   3924 }
   3925 
   3926 /*
   3927  * NAMES:	raid_internal_open
   3928  * DESCRIPTION: Do the actual RAID open
   3929  * PARAMETERS:	minor_t mnum - minor number of the RAID device
   3930  *		int flag -
   3931  *		int otyp -
   3932  *		int md_oflags - RAID open flags
   3933  * RETURNS:	0 if successful, nonzero otherwise
   3934  */
   3935 int
   3936 raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags)
   3937 {
   3938 	mr_unit_t	*un;
   3939 	mdi_unit_t	*ui;
   3940 	int		err = 0;
   3941 	int		replay_error = 0;
   3942 
   3943 	ui = MDI_UNIT(mnum);
   3944 	ASSERT(ui != NULL);
   3945 
   3946 	un = (mr_unit_t *)md_unit_openclose_enter(ui);
   3947 	/*
   3948 	 * this MUST be checked before md_unit_isopen is checked.
   3949 	 * raid_init_columns sets md_unit_isopen to block reset, halt.
   3950 	 */
   3951 	if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) &&
   3952 	    !(md_oflags & MD_OFLG_ISINIT)) {
   3953 		md_unit_openclose_exit(ui);
   3954 		return (EAGAIN);
   3955 	}
   3956 
   3957 	if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) {
   3958 		err = md_unit_incopen(mnum, flag, otyp);
   3959 		goto out;
   3960 	}
   3961 
   3962 	md_unit_readerexit(ui);
   3963 
   3964 	un = (mr_unit_t *)md_unit_writerlock(ui);
   3965 	if (raid_open_all_devs(un, md_oflags) == 0) {
   3966 		if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) {
   3967 			md_unit_writerexit(ui);
   3968 			un = (mr_unit_t *)md_unit_readerlock(ui);
   3969 			raid_close_all_devs(un, 0, md_oflags);
   3970 			goto out;
   3971 		}
   3972 	} else {
   3973 		/*
   3974 		 * if this unit contains more than two errored components
   3975 		 * should return error and close all opened devices
   3976 		 */
   3977 
   3978 		md_unit_writerexit(ui);
   3979 		un = (mr_unit_t *)md_unit_readerlock(ui);
   3980 		raid_close_all_devs(un, 0, md_oflags);
   3981 		md_unit_openclose_exit(ui);
   3982 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
   3983 		    MD_UN2SET(un), MD_SID(un));
   3984 		return (ENXIO);
   3985 	}
   3986 
   3987 	if (!(MD_STATUS(un) & MD_UN_REPLAYED)) {
   3988 		replay_error = raid_replay(un);
   3989 		MD_STATUS(un) |= MD_UN_REPLAYED;
   3990 	}
   3991 
   3992 	md_unit_writerexit(ui);
   3993 	un = (mr_unit_t *)md_unit_readerlock(ui);
   3994 
   3995 	if ((replay_error == RAID_RPLY_READONLY) &&
   3996 	    ((flag & (FREAD | FWRITE)) == FREAD)) {
   3997 		md_unit_openclose_exit(ui);
   3998 		return (0);
   3999 	}
   4000 
   4001 	/* allocate hotspare if possible */
   4002 	(void) raid_hotspares();
   4003 
   4004 
   4005 out:
   4006 	md_unit_openclose_exit(ui);
   4007 	return (err);
   4008 }
   4009 /*
   4010  * NAMES:	raid_open
   4011  * DESCRIPTION: RAID metadevice OPEN entry point
   4012  * PARAMETERS:	dev_t dev -
   4013  *		int flag -
   4014  *		int otyp -
   4015  *		cred_t * cred_p -
   4016  *		int md_oflags -
   4017  * RETURNS:
   4018  */
   4019 /*ARGSUSED1*/
   4020 static int
   4021 raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
   4022 {
   4023 	int		error = 0;
   4024 
   4025 	if (error = raid_internal_open(getminor(*dev), flag, otyp, md_oflags)) {
   4026 		return (error);
   4027 	}
   4028 	return (0);
   4029 }
   4030 
   4031 /*
   4032  * NAMES:	raid_internal_close
   4033  * DESCRIPTION: RAID metadevice CLOSE actual implementation
   4034  * PARAMETERS:	minor_t - minor number of the RAID device
   4035  *		int otyp -
   4036  *		int init_pw -
   4037  *		int md_cflags - RAID close flags
   4038  * RETURNS:	0 if successful, nonzero otherwise
   4039  */
   4040 /*ARGSUSED*/
   4041 int
   4042 raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags)
   4043 {
   4044 	mdi_unit_t	*ui = MDI_UNIT(mnum);
   4045 	mr_unit_t	*un;
   4046 	int		err = 0;
   4047 
   4048 	/* single thread */
   4049 	un = (mr_unit_t *)md_unit_openclose_enter(ui);
   4050 
   4051 	/* count closed */
   4052 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
   4053 		goto out;
   4054 	/* close devices, if necessary */
   4055 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
   4056 		raid_close_all_devs(un, init_pw, md_cflags);
   4057 	}
   4058 
   4059 	/* unlock, return success */
   4060 out:
   4061 	md_unit_openclose_exit(ui);
   4062 	return (err);
   4063 }
   4064 
   4065 /*
   4066  * NAMES:	raid_close
   4067  * DESCRIPTION: RAID metadevice close entry point
   4068  * PARAMETERS:	dev_t dev -
   4069  *		int flag -
   4070  *		int otyp -
   4071  *		cred_t * cred_p -
   4072  *		int md_oflags -
   4073  * RETURNS:
   4074  */
   4075 /*ARGSUSED1*/
   4076 static int
   4077 raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
   4078 {
   4079 	int retval;
   4080 
   4081 	(void) md_io_writerlock(MDI_UNIT(getminor(dev)));
   4082 	retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags);
   4083 	(void) md_io_writerexit(MDI_UNIT(getminor(dev)));
   4084 	return (retval);
   4085 }
   4086 
   4087 /*
   4088  * raid_probe_close_all_devs
   4089  */
   4090 void
   4091 raid_probe_close_all_devs(mr_unit_t *un)
   4092 {
   4093 	int		i;
   4094 	mr_column_t	*device;
   4095 
   4096 	for (i = 0; i < un->un_totalcolumncnt; i++) {
   4097 		device = &un->un_column[i];
   4098 
   4099 		if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
   4100 			md_layered_close(device->un_dev,
   4101 			    MD_OFLG_PROBEDEV);
   4102 			device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN;
   4103 		}
   4104 	}
   4105 }
   4106 /*
   4107  * Raid_probe_dev:
   4108  *
   4109  * On entry the unit writerlock is held
   4110  */
   4111 static int
   4112 raid_probe_dev(mdi_unit_t *ui, minor_t mnum)
   4113 {
   4114 	mr_unit_t	*un;
   4115 	int		i;
   4116 	int		not_opened = 0;
   4117 	int		commit = 0;
   4118 	int		col = -1;
   4119 	mr_column_t	*device;
   4120 	int		md_devopen = 0;
   4121 
   4122 	if (md_unit_isopen(ui))
   4123 		md_devopen++;
   4124 
   4125 	un = MD_UNIT(mnum);
   4126 	/*
   4127 	 * If the state has been set to LAST_ERRED because
   4128 	 * of an error when the raid device was open at some
   4129 	 * point in the past, don't probe. We really don't want
   4130 	 * to reset the state in this case.
   4131 	 */
   4132 	if (UNIT_STATE(un) == RUS_LAST_ERRED)
   4133 		return (0);
   4134 
   4135 	ui->ui_tstate &= ~MD_INACCESSIBLE;
   4136 
   4137 	for (i = 0; i < un->un_totalcolumncnt; i++) {
   4138 		md_dev64_t tmpdev;
   4139 
   4140 		device = &un->un_column[i];
   4141 		if (COLUMN_STATE(un, i) & RCS_ERRED) {
   4142 			not_opened++;
   4143 			continue;
   4144 		}
   4145 
   4146 		tmpdev = device->un_dev;
   4147 		/*
   4148 		 * Currently the flags passed are not needed since
   4149 		 * there cannot be an underlying metadevice. However
   4150 		 * they are kept here for consistency.
   4151 		 *
   4152 		 * Open by device id
   4153 		 */
   4154 		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i)?
   4155 		    device->un_hs_key : device->un_orig_key);
   4156 		if (md_layered_open(mnum, &tmpdev,
   4157 		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) {
   4158 			device->un_dev = tmpdev;
   4159 			not_opened++;
   4160 			continue;
   4161 		}
   4162 		device->un_dev = tmpdev;
   4163 
   4164 		device->un_devflags |= MD_RAID_DEV_PROBEOPEN;
   4165 	}
   4166 
   4167 	/*
   4168 	 * The code below is careful on setting the LAST_ERRED state.
   4169 	 *
   4170 	 * If open errors and exactly one device has failed we can run.
   4171 	 * If more then one device fails we have to figure out when to set
   4172 	 * LAST_ERRED state.  The rationale is to avoid unnecessary resyncs
   4173 	 * since they are painful and time consuming.
   4174 	 *
   4175 	 * When more than one component/column fails there are 2 scenerios.
   4176 	 *
   4177 	 * 1. Metadevice has NOT been opened: In this case, the behavior
   4178 	 *    mimics the open symantics. ie. Only the first failed device
   4179 	 *    is ERRED and LAST_ERRED is not set.
   4180 	 *
   4181 	 * 2. Metadevice has been opened: Here the read/write sematics are
   4182 	 *    followed. The first failed devicce is ERRED and on the next
   4183 	 *    failed device LAST_ERRED is set.
   4184 	 */
   4185 
   4186 	if (not_opened > 1 && !md_devopen) {
   4187 		cmn_err(CE_WARN,
   4188 		    "md: %s failed to open. open error on %s\n",
   4189 		    md_shortname(MD_SID(un)),
   4190 		    md_devname(MD_UN2SET(un), device->un_orig_dev, NULL, 0));
   4191 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
   4192 		    MD_UN2SET(un), MD_SID(un));
   4193 		raid_probe_close_all_devs(un);
   4194 		ui->ui_tstate |= MD_INACCESSIBLE;
   4195 		return (not_opened > 1);
   4196 	}
   4197 
   4198 	if (!md_devopen) {
   4199 		for (i = 0; i < un->un_totalcolumncnt; i++) {
   4200 			device = &un->un_column[i];
   4201 			if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
   4202 				if (device->un_devstate & RCS_LAST_ERRED) {
   4203 					/*
   4204 					 * At this point in time there is a
   4205 					 * possibility that errors were the
   4206 					 * result of a controller failure with
   4207 					 * more than a single column on it so
   4208 					 * clear out last errored columns and
   4209 					 * let errors re-occur is necessary.
   4210 					 */
   4211 					raid_set_state(un, i, RCS_OKAY, 0);
   4212 					commit++;
   4213 					}
   4214 				continue;
   4215 			}
   4216 			ASSERT(col == -1);
   4217 			/*
   4218 			 * note if multiple devices are failing then only
   4219 			 * the last one is marked as error
   4220 			 */
   4221 			col = i;
   4222 		}
   4223 
   4224 		if (col != -1) {
   4225 			raid_set_state(un, col, RCS_ERRED, 0);
   4226 			commit++;
   4227 		}
   4228 
   4229 	} else {
   4230 		for (i = 0; i < un->un_totalcolumncnt; i++) {
   4231 			device = &un->un_column[i];
   4232 
   4233 			/* if we have LAST_ERRED go ahead and commit. */
   4234 			if (un->un_state & RUS_LAST_ERRED)
   4235 				break;
   4236 			/*
   4237 			 * could not open the component
   4238 			 */
   4239 
   4240 			if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) {
   4241 				col = i;
   4242 				raid_set_state(un, col, RCS_ERRED, 0);
   4243 				commit++;
   4244 			}
   4245 		}
   4246 	}
   4247 
   4248 	if (commit)
   4249 		raid_commit(un, NULL);
   4250 
   4251 	if (col != -1) {
   4252 		if (COLUMN_STATE(un, col) & RCS_ERRED) {
   4253 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
   4254 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
   4255 		} else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
   4256 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
   4257 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
   4258 		}
   4259 	}
   4260 
   4261 	raid_probe_close_all_devs(un);
   4262 	return (0);
   4263 }
   4264 
   4265 static int
   4266 raid_imp_set(
   4267 	set_t	setno
   4268 )
   4269 {
   4270 	mddb_recid_t    recid;
   4271 	int		i, gotsomething;
   4272 	mddb_type_t	typ1;
   4273 	mddb_de_ic_t	*dep;
   4274 	mddb_rb32_t	*rbp;
   4275 	mr_unit_t	*un64;
   4276 	mr_unit32_od_t	*un32;
   4277 	md_dev64_t	self_devt;
   4278 	minor_t		*self_id;	/* minor needs to be updated */
   4279 	md_parent_t	*parent_id;	/* parent needs to be updated */
   4280 	mddb_recid_t	*record_id;	 /* record id needs to be updated */
   4281 	hsp_t		*hsp_id;
   4282 
   4283 	gotsomething = 0;
   4284 
   4285 	typ1 = (mddb_type_t)md_getshared_key(setno,
   4286 	    raid_md_ops.md_driver.md_drivername);
   4287 	recid = mddb_makerecid(setno, 0);
   4288 
   4289 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
   4290 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
   4291 			continue;
   4292 
   4293 		dep = mddb_getrecdep(recid);
   4294 		rbp = dep->de_rb;
   4295 
   4296 		switch (rbp->rb_revision) {
   4297 		case MDDB_REV_RB:
   4298 		case MDDB_REV_RBFN:
   4299 			/*
   4300 			 * Small device
   4301 			 */
   4302 			un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid);
   4303 			self_id = &(un32->c.un_self_id);
   4304 			parent_id = &(un32->c.un_parent);
   4305 			record_id = &(un32->c.un_record_id);
   4306 			hsp_id = &(un32->un_hsp_id);
   4307 
   4308 			for (i = 0; i < un32->un_totalcolumncnt; i++) {
   4309 				mr_column32_od_t *device;
   4310 
   4311 				device = &un32->un_column[i];
   4312 				if (!md_update_minor(setno, mddb_getsidenum
   4313 				    (setno), device->un_orig_key))
   4314 					goto out;
   4315 
   4316 				if (device->un_hs_id != 0)
   4317 					device->un_hs_id =
   4318 					    MAKERECID(setno, device->un_hs_id);
   4319 			}
   4320 			break;
   4321 		case MDDB_REV_RB64:
   4322 		case MDDB_REV_RB64FN:
   4323 			un64 = (mr_unit_t *)mddb_getrecaddr(recid);
   4324 			self_id = &(un64->c.un_self_id);
   4325 			parent_id = &(un64->c.un_parent);
   4326 			record_id = &(un64->c.un_record_id);
   4327 			hsp_id = &(un64->un_hsp_id);
   4328 
   4329 			for (i = 0; i < un64->un_totalcolumncnt; i++) {
   4330 				mr_column_t	*device;
   4331 
   4332 				device = &un64->un_column[i];
   4333 				if (!md_update_minor(setno, mddb_getsidenum
   4334 				    (setno), device->un_orig_key))
   4335 					goto out;
   4336 
   4337 				if (device->un_hs_id != 0)
   4338 					device->un_hs_id =
   4339 					    MAKERECID(setno, device->un_hs_id);
   4340 			}
   4341 			break;
   4342 		}
   4343 
   4344 		/*
   4345 		 * If this is a top level and a friendly name metadevice,
   4346 		 * update its minor in the namespace.
   4347 		 */
   4348 		if ((*parent_id == MD_NO_PARENT) &&
   4349 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
   4350 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
   4351 
   4352 			self_devt = md_makedevice(md_major, *self_id);
   4353 			if (!md_update_top_device_minor(setno,
   4354 			    mddb_getsidenum(setno), self_devt))
   4355 				goto out;
   4356 		}
   4357 
   4358 		/*
   4359 		 * Update unit with the imported setno
   4360 		 */
   4361 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
   4362 
   4363 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
   4364 
   4365 		if (*hsp_id != -1)
   4366 			*hsp_id = MAKERECID(setno, DBID(*hsp_id));
   4367 
   4368 		if (*parent_id != MD_NO_PARENT)
   4369 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
   4370 		*record_id = MAKERECID(setno, DBID(*record_id));
   4371 		gotsomething = 1;
   4372 	}
   4373 
   4374 out:
   4375 	return (gotsomething);
   4376 }
   4377 
/*
 * Table of named services offered by the RAID driver; it is the last
 * member of raid_md_ops below.  Each entry pairs a service routine with
 * the name string it is registered under (presumably the md framework
 * looks services up by that name - confirm against the framework code).
 * Routines whose type differs from the table's function pointer type are
 * cast to intptr_t (*)().  The {NULL, 0} entry terminates the table.
 */
static md_named_services_t raid_named_services[] = {
	{raid_hotspares,			"poke hotspares"	},
	{raid_rename_check,			MDRNM_CHECK		},
	{raid_rename_lock,			MDRNM_LOCK		},
	{(intptr_t (*)()) raid_rename_unlock,	MDRNM_UNLOCK		},
	{(intptr_t (*)()) raid_probe_dev,	"probe open test"	},
	{NULL,					0			}
};
   4386 
/*
 * Operations vector of the RAID driver.  Entry points the driver does
 * not provide are left NULL.  ui_opsindex of a unit is compared against
 * raid_md_ops.md_selfindex (see raid_halt) to recognize RAID units.
 */
md_ops_t raid_md_ops = {
	raid_open,		/* open */
	raid_close,		/* close */
	md_raid_strategy,	/* strategy */
	NULL,			/* print */
	NULL,			/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_raid_ioctl,		/* ioctl */
	raid_snarf,		/* snarf: initialize and clean up MDD records */
	raid_halt,		/* halt: check for open units / reset units */
	NULL,			/* aread */
	NULL,			/* awrite */
	raid_imp_set,		/* import set */
	raid_named_services	/* named services table */
};
   4403 
   4404 static void
   4405 init_init()
   4406 {
   4407 	/* default to a second */
   4408 	if (md_wr_wait == 0)
   4409 		md_wr_wait = md_hz >> 1;
   4410 
   4411 	raid_parent_cache = kmem_cache_create("md_raid_parent",
   4412 	    sizeof (md_raidps_t), 0, raid_parent_constructor,
   4413 	    raid_parent_destructor, raid_run_queue, NULL, NULL, 0);
   4414 	raid_child_cache = kmem_cache_create("md_raid_child",
   4415 	    sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0,
   4416 	    raid_child_constructor, raid_child_destructor,
   4417 	    raid_run_queue, NULL, NULL, 0);
   4418 	raid_cbuf_cache = kmem_cache_create("md_raid_cbufs",
   4419 	    sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor,
   4420 	    raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0);
   4421 }
   4422 
   4423 static void
   4424 fini_uninit()
   4425 {
   4426 	kmem_cache_destroy(raid_parent_cache);
   4427 	kmem_cache_destroy(raid_child_cache);
   4428 	kmem_cache_destroy(raid_cbuf_cache);
   4429 	raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL;
   4430 }
   4431 
/*
 * define the module linkage: the macro is given the module description
 * string plus the init_init() and fini_uninit() calls defined above
 * (presumably run at module load and unload - confirm against the macro
 * definition).
 */
MD_PLUGIN_MISC_MODULE("raid module", init_init(), fini_uninit())
   4434