Home | History | Annotate | Download | only in mirror
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/param.h>
     28 #include <sys/systm.h>
     29 #include <sys/conf.h>
     30 #include <sys/file.h>
     31 #include <sys/user.h>
     32 #include <sys/uio.h>
     33 #include <sys/t_lock.h>
     34 #include <sys/buf.h>
     35 #include <sys/dkio.h>
     36 #include <sys/vtoc.h>
     37 #include <sys/kmem.h>
     38 #include <vm/page.h>
     39 #include <sys/cmn_err.h>
     40 #include <sys/sysmacros.h>
     41 #include <sys/types.h>
     42 #include <sys/mkdev.h>
     43 #include <sys/stat.h>
     44 #include <sys/open.h>
     45 #include <sys/modctl.h>
     46 #include <sys/ddi.h>
     47 #include <sys/sunddi.h>
     48 #include <sys/debug.h>
     49 #include <sys/dklabel.h>
     50 #include <vm/hat.h>
     51 #include <sys/lvm/mdvar.h>
     52 #include <sys/lvm/md_mirror.h>
     53 #include <sys/lvm/md_convert.h>
     54 #include <sys/lvm/md_mddb.h>
     55 #include <sys/esunddi.h>
     56 
     57 #include <sys/sysevent/eventdefs.h>
     58 #include <sys/sysevent/svm.h>
     59 #include <sys/lvm/mdmn_commd.h>
     60 #include <sys/avl.h>
     61 
/*
 * Mirror driver's md_ops_t.  Its md_link_rw lock and md_head list are used
 * in this file to walk every mirror unit.
 */
md_ops_t		mirror_md_ops;
#ifndef	lint
/*
 * NOTE(review): presumably the module-dependency declaration read by the
 * kernel module loader -- confirm.
 */
char			_depends_on[] = "drv/md";
md_ops_t		*md_interface_ops = &mirror_md_ops;
#endif

/* Daemon queue anchors defined elsewhere. */
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_mstr_daemon;
extern mdq_anchor_t	md_mirror_daemon;
extern mdq_anchor_t	md_mirror_io_daemon;
extern mdq_anchor_t	md_mirror_rs_daemon;
extern mdq_anchor_t	md_mhs_daemon;

/* Unit/set limits and the per-set table, defined elsewhere. */
extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern clock_t		md_hz;

extern md_krwlock_t	md_unit_array_rw;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern int		md_mtioctl_cnt;

daemon_request_t	mirror_timeout;
/* Request block used by poke_hotspares() to queue check_4_hotspares(). */
static daemon_request_t	hotspare_request;
/* One request block per set, indexed by set number in send_poke_hotspares(). */
static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */

int	md_mirror_mcs_buf_off;

/* Flags for mdmn_ksend_message to allow debugging */
int	md_mirror_msg_flags;

#ifdef DEBUG
/* Flag to switch on debug messages */
int	mirror_debug_flag = 0;
#endif
    100 
/*
 * Struct used to hold the count of DMR reads and the timestamp of the last
 * DMR read.  It is used to verify, using a debugger, that the DMR read ioctl
 * has been executed.
 */
dmr_stats_t	mirror_dmr_stats = {0, 0};

/*
 * Mutex protecting list of non-failfast drivers.
 */
static kmutex_t	non_ff_drv_mutex;
extern char	**non_ff_drivers;

extern major_t	md_major;

/*
 * Write-On-Write memory pool.
 */
static void		copy_write_cont(wowhdr_t *wowhdr);
static kmem_cache_t	*mirror_wowblk_cache = NULL;
static int		md_wowbuf_size = 16384;
static size_t		md_wowblk_size;

/*
 * md_mirror_wow_flg is a flag that allows:
 *	- disabling the write-on-write mechanism.
 *	- logging occurrences of write-on-write
 *	- switching wow handling procedure processing
 * md_mirror_wow_cnt is a counter for occurrences of WOW.
 */
static uint_t	md_mirror_wow_flg = 0;
static int	md_mirror_wow_cnt = 0;

/*
 * Tunable to enable/disable dirty region
 * processing when closing down a mirror.
 */
static int	new_resync = 1;
/*
 * kmem caches.  NOTE(review): presumably these back the md_mps_t (parent)
 * and md_mcs_t (child) structures handled by the constructors in this
 * file -- confirm at the kmem_cache_create() call sites.
 */
kmem_cache_t	*mirror_parent_cache = NULL;
kmem_cache_t	*mirror_child_cache = NULL;

extern int	md_ff_disable;		/* disable failfast */
    143 
/* Forward declarations of file-local routines. */
static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
static void	mirror_read_strategy(buf_t *, int, void *);
static void	mirror_write_strategy(buf_t *, int, void *);
static void	become_owner(daemon_queue_t *);
static int	mirror_done(struct buf *cb);
static int	mirror_done_common(struct buf *cb);
static void	clear_retry_error(struct buf *cb);

/*
 * patchables
 */
int	md_min_rr_size	= 200;	/* 200 blocks, or 100k */
int	md_def_num_rr	= 1000;	/* Default number of dirty regions */

/*
 * patchable to change delay before rescheduling mirror ownership request.
 * Default is 0.5 seconds.
 * NOTE(review): the original comment called the unit "clock ticks", but
 * 500000 equalling 0.5 seconds implies microseconds -- confirm at the use
 * site.
 */
clock_t	md_mirror_owner_to = 500000;
    163 
    164 /*ARGSUSED1*/
    165 static int
    166 mirror_parent_constructor(void *p, void *d1, int d2)
    167 {
    168 	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
    169 	return (0);
    170 }
    171 
    172 static void
    173 mirror_parent_init(md_mps_t *ps)
    174 {
    175 	bzero(ps, offsetof(md_mps_t, ps_mx));
    176 	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
    177 }
    178 
    179 /*ARGSUSED1*/
    180 static void
    181 mirror_parent_destructor(void *p, void *d)
    182 {
    183 	mutex_destroy(&((md_mps_t *)p)->ps_mx);
    184 }
    185 
    186 /*ARGSUSED1*/
    187 static int
    188 mirror_child_constructor(void *p, void *d1, int d2)
    189 {
    190 	bioinit(&((md_mcs_t *)p)->cs_buf);
    191 	return (0);
    192 }
    193 
    194 void
    195 mirror_child_init(md_mcs_t *cs)
    196 {
    197 	cs->cs_ps = NULL;
    198 	cs->cs_mdunit = 0;
    199 	md_bioreset(&cs->cs_buf);
    200 }
    201 
    202 /*ARGSUSED1*/
    203 static void
    204 mirror_child_destructor(void *p, void *d)
    205 {
    206 	biofini(&((md_mcs_t *)p)->cs_buf);
    207 }
    208 
    209 static void
    210 mirror_wowblk_init(wowhdr_t *p)
    211 {
    212 	bzero(p, md_wowblk_size);
    213 }
    214 
    215 static void
    216 send_poke_hotspares_msg(daemon_request_t *drq)
    217 {
    218 	int			rval;
    219 	int			nretries = 0;
    220 	md_mn_msg_pokehsp_t	pokehsp;
    221 	md_mn_kresult_t		*kresult;
    222 	set_t			setno = (set_t)drq->dq.qlen;
    223 
    224 	pokehsp.pokehsp_setno = setno;
    225 
    226 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
    227 
    228 retry_sphmsg:
    229 	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
    230 	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
    231 	    sizeof (pokehsp), kresult);
    232 
    233 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
    234 		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
    235 		/* If we're shutting down already, pause things here. */
    236 		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
    237 			while (!md_mn_is_commd_present()) {
    238 				delay(md_hz);
    239 			}
    240 			/*
    241 			 * commd has become reachable again, so retry once.
    242 			 * If this fails we'll panic as the system is in an
    243 			 * unexpected state.
    244 			 */
    245 			if (nretries++ == 0)
    246 				goto retry_sphmsg;
    247 		}
    248 		cmn_err(CE_PANIC,
    249 		    "ksend_message failure: POKE_HOTSPARES");
    250 	}
    251 	kmem_free(kresult, sizeof (md_mn_kresult_t));
    252 
    253 	/* Allow further requests to use this set's queue structure */
    254 	mutex_enter(&drq->dr_mx);
    255 	drq->dr_pending = 0;
    256 	mutex_exit(&drq->dr_mx);
    257 }
    258 
    259 /*
    260  * Send a poke_hotspares message to the master node. To avoid swamping the
    261  * commd handler with requests we only send a message if there is not one
    262  * already outstanding. We punt the request to a separate thread context as
    263  * cannot afford to block waiting on the request to be serviced. This is
    264  * essential when a reconfig cycle is in progress as any open() of a multinode
    265  * metadevice may result in a livelock.
    266  */
    267 static void
    268 send_poke_hotspares(set_t setno)
    269 {
    270 	daemon_request_t	*drq = &mn_hs_request[setno];
    271 
    272 	mutex_enter(&drq->dr_mx);
    273 	if (drq->dr_pending == 0) {
    274 		drq->dr_pending = 1;
    275 		drq->dq.qlen = (int)setno;
    276 		daemon_request(&md_mhs_daemon,
    277 		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
    278 	}
    279 	mutex_exit(&drq->dr_mx);
    280 }
    281 
    282 void
    283 mirror_set_sm_state(
    284 	mm_submirror_t		*sm,
    285 	mm_submirror_ic_t	*smic,
    286 	sm_state_t		newstate,
    287 	int			force)
    288 {
    289 	int			compcnt;
    290 	int			i;
    291 	int			errcnt;
    292 	sm_state_t		origstate;
    293 	md_m_shared_t		*shared;
    294 
    295 	if (force) {
    296 		sm->sm_state = newstate;
    297 		uniqtime32(&sm->sm_timestamp);
    298 		return;
    299 	}
    300 
    301 	origstate = newstate;
    302 
    303 	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
    304 	for (i = 0, errcnt = 0; i < compcnt; i++) {
    305 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
    306 		    (sm->sm_dev, sm, i);
    307 		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
    308 			newstate |= SMS_COMP_ERRED;
    309 		if (shared->ms_state & (CS_RESYNC))
    310 			newstate |= SMS_COMP_RESYNC;
    311 		if (shared->ms_state & CS_ERRED)
    312 			errcnt++;
    313 	}
    314 
    315 	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
    316 		newstate &= ~origstate;
    317 
    318 	if (errcnt == compcnt)
    319 		newstate |= SMS_ALL_ERRED;
    320 	else
    321 		newstate &= ~SMS_ALL_ERRED;
    322 
    323 	sm->sm_state = newstate;
    324 	uniqtime32(&sm->sm_timestamp);
    325 }
    326 
    327 static int
    328 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
    329 							int frm_probe)
    330 {
    331 	mm_submirror_t		*sm;
    332 	mm_submirror_ic_t	*smic;
    333 	md_m_shared_t		*shared;
    334 	int			ci;
    335 	int			i;
    336 	int			compcnt;
    337 	int			open_comp; /* flag for open component */
    338 
    339 	for (i = *smi; i < NMIRROR; i++) {
    340 		sm = &un->un_sm[i];
    341 		smic = &un->un_smic[i];
    342 
    343 		if (!SMS_IS(sm, SMS_INUSE))
    344 			continue;
    345 
    346 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
    347 		for (ci = *cip; ci < compcnt; ci++) {
    348 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
    349 			    (sm->sm_dev, sm, ci);
    350 			/*
    351 			 * if called from any routine but probe, we check for
    352 			 * MDM_S_ISOPEN flag. Since probe does a pseduo open,
    353 			 * it sets MDM_S_PROBEOPEN flag and we test for this
    354 			 * flag. They are both exclusive tests.
    355 			 */
    356 			open_comp = (frm_probe) ?
    357 			    (shared->ms_flags & MDM_S_PROBEOPEN):
    358 			    (shared->ms_flags & MDM_S_ISOPEN);
    359 			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
    360 			    ((shared->ms_state == CS_OKAY) ||
    361 			    (shared->ms_state == CS_RESYNC))) {
    362 				if (clr_error) {
    363 					shared->ms_flags &= ~MDM_S_IOERR;
    364 				}
    365 				*cip = ci;
    366 				*smi = i;
    367 				return (1);
    368 			}
    369 
    370 			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
    371 				shared->ms_flags &= ~MDM_S_IOERR;
    372 			}
    373 		}
    374 
    375 		*cip = 0;
    376 	}
    377 	return (0);
    378 }
    379 
    380 /*ARGSUSED*/
    381 static void
    382 mirror_run_queue(void *d)
    383 {
    384 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
    385 		md_daemon(1, &md_done_daemon);
    386 }
    387 /*
    388  * check_comp_4_hotspares
    389  *
    390  * This function attempts to allocate a hotspare for this component if the
    391  * component is in error. In a MN set, the function can be called in 2 modes.
    392  * It can be called either when a component error has been detected or when a
    393  * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
    394  * in flags and the request is sent to all nodes.
    395  * The handler on each of the nodes then calls this function with
    396  * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
    397  *
    398  * For non-MN sets the function simply attempts to allocate a hotspare.
    399  *
    400  * On entry, the following locks are held
    401  *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
    402  *	md_unit_writerlock
    403  *
    404  * Returns	0 if ok
    405  *		1 if the unit containing the component has been cleared while
    406  *		  the mdmn_ksend_message() was being executed
    407  */
    408 extern int
    409 check_comp_4_hotspares(
    410 	mm_unit_t	*un,
    411 	int		smi,
    412 	int		ci,
    413 	uint_t		flags,
    414 	mddb_recid_t	hs_id,	/* Only used by MN disksets */
    415 	IOLOCK		*lockp	/* can be NULL */
    416 )
    417 {
    418 	mm_submirror_t		*sm;
    419 	mm_submirror_ic_t	*smic;
    420 	md_m_shared_t		*shared;
    421 	mddb_recid_t		recids[6];
    422 	minor_t			mnum;
    423 	intptr_t		(*hs_dev)();
    424 	void			(*hs_done)();
    425 	void			*hs_data;
    426 	md_error_t		mde = mdnullerror;
    427 	set_t			setno;
    428 	md_mn_msg_allochsp_t	allochspmsg;
    429 	md_mn_kresult_t		*kresult;
    430 	mm_unit_t		*new_un;
    431 	int			rval;
    432 	int			nretries = 0;
    433 
    434 	mnum = MD_SID(un);
    435 	setno = MD_UN2SET(un);
    436 	sm = &un->un_sm[smi];
    437 	smic = &un->un_smic[smi];
    438 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
    439 	    (sm->sm_dev, sm, ci);
    440 
    441 	if (shared->ms_state != CS_ERRED)
    442 		return (0);
    443 
    444 	/* Don't start a new component resync if a resync is already running. */
    445 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
    446 		return (0);
    447 
    448 	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
    449 		uint_t		msgflags;
    450 		md_mn_msgtype_t	msgtype;
    451 
    452 		/* Send allocate hotspare message to all nodes */
    453 
    454 		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
    455 		allochspmsg.msg_allochsp_sm = smi;
    456 		allochspmsg.msg_allochsp_comp = ci;
    457 		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
    458 
    459 		/*
    460 		 * Before calling mdmn_ksend_message(), release locks
    461 		 * Can never be in the context of an ioctl.
    462 		 */
    463 		md_unit_writerexit(MDI_UNIT(mnum));
    464 		if (flags & MD_HOTSPARE_LINKHELD)
    465 			rw_exit(&mirror_md_ops.md_link_rw.lock);
    466 #ifdef DEBUG
    467 		if (mirror_debug_flag)
    468 			printf("send alloc hotspare, flags="
    469 			    "0x%x %x, %x, %x, %x\n", flags,
    470 			    allochspmsg.msg_allochsp_mnum,
    471 			    allochspmsg.msg_allochsp_sm,
    472 			    allochspmsg.msg_allochsp_comp,
    473 			    allochspmsg.msg_allochsp_hs_id);
    474 #endif
    475 		if (flags & MD_HOTSPARE_WMUPDATE) {
    476 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
    477 			/*
    478 			 * When coming from an update of watermarks, there
    479 			 * must already be a message logged that triggered
    480 			 * this action. So, no need to log this message, too.
    481 			 */
    482 			msgflags = MD_MSGF_NO_LOG;
    483 		} else {
    484 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
    485 			msgflags = MD_MSGF_DEFAULT_FLAGS;
    486 		}
    487 
    488 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
    489 
    490 cc4hs_msg:
    491 		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
    492 		    (char *)&allochspmsg, sizeof (allochspmsg),
    493 		    kresult);
    494 
    495 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
    496 #ifdef DEBUG
    497 			if (mirror_debug_flag)
    498 				mdmn_ksend_show_error(rval, kresult,
    499 				    "ALLOCATE HOTSPARE");
    500 #endif
    501 			/*
    502 			 * If message is sent ok but exitval indicates an error
    503 			 * it must be because the mirror has been cleared. In
    504 			 * this case re-obtain lock and return an error
    505 			 */
    506 			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
    507 				if (flags & MD_HOTSPARE_LINKHELD) {
    508 					rw_enter(&mirror_md_ops.md_link_rw.lock,
    509 					    RW_READER);
    510 				}
    511 				kmem_free(kresult, sizeof (md_mn_kresult_t));
    512 				return (1);
    513 			}
    514 			/* If we're shutting down already, pause things here. */
    515 			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
    516 				while (!md_mn_is_commd_present()) {
    517 					delay(md_hz);
    518 				}
    519 				/*
    520 				 * commd has become reachable again, so retry
    521 				 * once. If this fails we'll panic as the
    522 				 * system is in an unexpected state.
    523 				 */
    524 				if (nretries++ == 0)
    525 					goto cc4hs_msg;
    526 			}
    527 			cmn_err(CE_PANIC,
    528 			    "ksend_message failure: ALLOCATE_HOTSPARE");
    529 		}
    530 		kmem_free(kresult, sizeof (md_mn_kresult_t));
    531 
    532 		/*
    533 		 * re-obtain the locks
    534 		 */
    535 		if (flags & MD_HOTSPARE_LINKHELD)
    536 			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
    537 		new_un = md_unit_writerlock(MDI_UNIT(mnum));
    538 
    539 		/*
    540 		 * As we had to release the locks in order to send the
    541 		 * message to all nodes, we need to check to see if the
    542 		 * unit has changed. If it has we release the writerlock
    543 		 * and return fail.
    544 		 */
    545 		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
    546 			md_unit_writerexit(MDI_UNIT(mnum));
    547 			return (1);
    548 		}
    549 	} else {
    550 		if (MD_MNSET_SETNO(setno)) {
    551 			/*
    552 			 * If 2 or more nodes simultaneously see a
    553 			 * component failure, these nodes will each
    554 			 * send an ALLOCATE_HOTSPARE[2] message.
    555 			 * The first message will allocate the hotspare
    556 			 * and the subsequent messages should do nothing.
    557 			 *
    558 			 * If a slave node doesn't have a hotspare allocated
    559 			 * at the time the message is initiated, then the
    560 			 * passed in hs_id will be 0.  If the node
    561 			 * executing this routine has a component shared
    562 			 * ms_hs_id of non-zero, but the message shows a
    563 			 * hs_id of 0, then just return since a hotspare
    564 			 * has already been allocated for this failing
    565 			 * component.  When the slave node returns from
    566 			 * the ksend_message the hotspare will have
    567 			 * already been allocated.
    568 			 *
    569 			 * If the slave node does send an hs_id of non-zero,
    570 			 * and the slave node's hs_id matches this node's
    571 			 * ms_hs_id, then the hotspare has error'd and
    572 			 * should be replaced.
    573 			 *
    574 			 * If the slave node sends an hs_id of non-zero and
    575 			 * this node has a different shared ms_hs_id, then
    576 			 * just return since this hotspare has already
    577 			 * been hotspared.
    578 			 */
    579 			if (shared->ms_hs_id != 0) {
    580 				if (hs_id == 0) {
    581 #ifdef DEBUG
    582 					if (mirror_debug_flag) {
    583 						printf("check_comp_4_hotspares"
    584 						    "(NOXMIT), short circuit "
    585 						    "hs_id=0x%x, "
    586 						    "ms_hs_id=0x%x\n",
    587 						    hs_id, shared->ms_hs_id);
    588 					}
    589 #endif
    590 					return (0);
    591 				}
    592 				if (hs_id != shared->ms_hs_id) {
    593 #ifdef DEBUG
    594 					if (mirror_debug_flag) {
    595 						printf("check_comp_4_hotspares"
    596 						    "(NOXMIT), short circuit2 "
    597 						    "hs_id=0x%x, "
    598 						    "ms_hs_id=0x%x\n",
    599 						    hs_id, shared->ms_hs_id);
    600 					}
    601 #endif
    602 					return (0);
    603 				}
    604 			}
    605 		}
    606 
    607 		sm = &un->un_sm[smi];
    608 		hs_dev = md_get_named_service(sm->sm_dev, 0,
    609 		    "hotspare device", 0);
    610 		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
    611 		    &hs_data) != 0)
    612 			return (0);
    613 
    614 		/*
    615 		 * set_sm_comp_state() commits the modified records.
    616 		 * As we don't transmit the changes, no need to drop the lock.
    617 		 */
    618 		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
    619 		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
    620 
    621 		(*hs_done)(sm->sm_dev, hs_data);
    622 
    623 		mirror_check_failfast(mnum);
    624 
    625 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
    626 		    setno, MD_SID(un));
    627 
    628 		/*
    629 		 * For a multi-node set we need to reset the un_rs_type,
    630 		 * un_rs_resync_done and un_rs_resync_2_do fields as the
    631 		 * hot-spare resync must copy all applicable data.
    632 		 */
    633 		if (MD_MNSET_SETNO(setno)) {
    634 			un->un_rs_type = MD_RS_NONE;
    635 			un->un_rs_resync_done = 0;
    636 			un->un_rs_resync_2_do = 0;
    637 		}
    638 
    639 		/*
    640 		 * Must drop writer lock since mirror_resync_unit will
    641 		 * open devices and must be able to grab readerlock.
    642 		 * Don't need to drop IOLOCK since any descendent routines
    643 		 * calling ksend_messages will drop the IOLOCK as needed.
    644 		 *
    645 		 */
    646 		if (lockp) {
    647 			md_ioctl_writerexit(lockp);
    648 		} else {
    649 			md_unit_writerexit(MDI_UNIT(mnum));
    650 		}
    651 
    652 		/* start resync */
    653 		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);
    654 
    655 		if (lockp) {
    656 			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
    657 		} else {
    658 			new_un = md_unit_writerlock(MDI_UNIT(mnum));
    659 		}
    660 	}
    661 	return (0);
    662 }
    663 
    664 /*
    665  * check_unit_4_hotspares
    666  *
    667  * For a given mirror, allocate hotspares, if available for any components
    668  * that are in error
    669  *
    670  * Returns	0 if ok
    671  *		1 if check_comp_4_hotspares returns non-zero. This will only
    672  *		  happen for a MN unit where the unit has been cleared while
    673  *		  the allocate hotspare message is sent to all nodes.
    674  */
    675 static int
    676 check_unit_4_hotspares(mm_unit_t *un, int flags)
    677 {
    678 	mm_submirror_t		*sm;
    679 	mm_submirror_ic_t	*smic;
    680 	int			ci;
    681 	int			i;
    682 	int			compcnt;
    683 
    684 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
    685 		return (0);
    686 
    687 	for (i = 0; i < NMIRROR; i++) {
    688 		sm = &un->un_sm[i];
    689 		smic = &un->un_smic[i];
    690 		if (!SMS_IS(sm, SMS_INUSE))
    691 			continue;
    692 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
    693 		for (ci = 0; ci < compcnt; ci++) {
    694 			md_m_shared_t		*shared;
    695 
    696 			shared = (md_m_shared_t *)
    697 			    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
    698 			/*
    699 			 * Never called from ioctl context, so pass in
    700 			 * (IOLOCK *)NULL.  Pass through flags from calling
    701 			 * routine, also setting XMIT flag.
    702 			 */
    703 			if (check_comp_4_hotspares(un, i, ci,
    704 			    (MD_HOTSPARE_XMIT | flags),
    705 			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
    706 				return (1);
    707 		}
    708 	}
    709 	return (0);
    710 }
    711 
    712 static void
    713 check_4_hotspares(daemon_request_t *drq)
    714 {
    715 	mdi_unit_t	*ui;
    716 	mm_unit_t	*un;
    717 	md_link_t	*next;
    718 	int		x;
    719 
    720 	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
    721 	drq->dr_pending = 0;		/* again in low level routine if */
    722 	mutex_exit(&drq->dr_mx);	/* something found to do	*/
    723 
    724 	/*
    725 	 * Used to have a problem here. The disksets weren't marked as being
    726 	 * MNHOLD. This opened a window where we could be searching for
    727 	 * hotspares and have the disk set unloaded (released) from under
    728 	 * us causing a panic in stripe_component_count().
    729 	 * The way to prevent that is to mark the set MNHOLD which prevents
    730 	 * any diskset from being released while we are scanning the mirrors,
    731 	 * submirrors and components.
    732 	 */
    733 
    734 	for (x = 0; x < md_nsets; x++)
    735 		md_holdset_enter(x);
    736 
    737 	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
    738 	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
    739 		ui = MDI_UNIT(next->ln_id);
    740 
    741 		un = (mm_unit_t *)md_unit_readerlock(ui);
    742 
    743 		/*
    744 		 * Only check the unit if we are the master for this set
    745 		 * For an MN set, poke_hotspares() is only effective on the
    746 		 * master
    747 		 */
    748 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
    749 		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
    750 			md_unit_readerexit(ui);
    751 			continue;
    752 		}
    753 		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
    754 			md_unit_readerexit(ui);
    755 			continue;
    756 		}
    757 		md_unit_readerexit(ui);
    758 
    759 		un = (mm_unit_t *)md_unit_writerlock(ui);
    760 		/*
    761 		 * check_unit_4_hotspares will exit 1 if the unit has been
    762 		 * removed during the process of allocating the hotspare.
    763 		 * This can only happen for a MN metadevice. If unit no longer
    764 		 * exists, no need to release writerlock
    765 		 */
    766 		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
    767 			md_unit_writerexit(ui);
    768 		else {
    769 			/*
    770 			 * If check_unit_4_hotspares failed, queue another
    771 			 * request and break out of this one
    772 			 */
    773 			(void) poke_hotspares();
    774 			break;
    775 		}
    776 	}
    777 	rw_exit(&mirror_md_ops.md_link_rw.lock);
    778 
    779 	for (x = 0; x < md_nsets; x++)
    780 		md_holdset_exit(x);
    781 }
    782 
    783 /*
    784  * poke_hotspares
    785  *
    786  * If there is not a pending poke_hotspares request pending, queue a requent
    787  * to call check_4_hotspares(). This will scan all mirrors and attempt to
    788  * allocate hotspares for all components in error.
    789  */
    790 int
    791 poke_hotspares()
    792 {
    793 	mutex_enter(&hotspare_request.dr_mx);
    794 	if (hotspare_request.dr_pending == 0) {
    795 		hotspare_request.dr_pending = 1;
    796 		daemon_request(&md_mhs_daemon,
    797 		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
    798 		    REQ_OLD);
    799 	}
    800 	mutex_exit(&hotspare_request.dr_mx);
    801 	return (0);
    802 }
    803 
    804 static void
    805 free_all_ecomps(err_comp_t *ecomp)
    806 {
    807 	err_comp_t	*d;
    808 
    809 	while (ecomp != NULL) {
    810 		d = ecomp;
    811 		ecomp = ecomp->ec_next;
    812 		kmem_free(d, sizeof (err_comp_t));
    813 	}
    814 }
    815 
    816 /*
    817  * NAME: mirror_openfail_console_info
    818  *
    819  * DESCRIPTION: Prints a informative message to the console when mirror
    820  *		cannot be opened.
    821  *
    822  * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
    823  *	       int		smi - submirror index
    824  *	       int		ci - component index
    825  */
    826 
    827 void
    828 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
    829 {
    830 	void (*get_dev)();
    831 	ms_cd_info_t cd;
    832 	md_dev64_t tmpdev;
    833 
    834 	tmpdev = un->un_sm[smi].sm_dev;
    835 	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
    836 	if (get_dev != NULL) {
    837 		(void) (*get_dev)(tmpdev, smi, ci, &cd);
    838 		cmn_err(CE_WARN, "md %s: open error on %s",
    839 		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
    840 		    cd.cd_dev, NULL, 0));
    841 	} else {
    842 		cmn_err(CE_WARN, "md %s: open error",
    843 		    md_shortname(MD_SID(un)));
    844 	}
    845 }
    846 
    847 static int
    848 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
    849 {
    850 	int i;
    851 	md_dev64_t dev;
    852 
    853 	for (i = 0; i < NMIRROR; i++) {
    854 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
    855 			continue;
    856 		dev = un->un_sm[i].sm_dev;
    857 		md_layered_close(dev, md_cflags);
    858 	}
    859 	return (0);
    860 }
    861 
    862 /*
    863  * Keep track of drivers that don't support failfast.  We use this so that
    864  * we only log one diagnostic message for each of these drivers, no matter
    865  * how many times we run the mirror_check_failfast function.
    866  * Return 1 if this is a new driver that does not support failfast,
    867  * return 0 if we have already seen this non-failfast driver.
    868  */
    869 static int
    870 new_non_ff_driver(const char *s)
    871 {
    872 	mutex_enter(&non_ff_drv_mutex);
    873 	if (non_ff_drivers == NULL) {
    874 		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
    875 		    KM_NOSLEEP);
    876 		if (non_ff_drivers == NULL) {
    877 			mutex_exit(&non_ff_drv_mutex);
    878 			return (1);
    879 		}
    880 
    881 		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
    882 		    KM_NOSLEEP);
    883 		if (non_ff_drivers[0] == NULL) {
    884 			kmem_free(non_ff_drivers, 2 * sizeof (char *));
    885 			non_ff_drivers = NULL;
    886 			mutex_exit(&non_ff_drv_mutex);
    887 			return (1);
    888 		}
    889 
    890 		(void) strcpy(non_ff_drivers[0], s);
    891 		non_ff_drivers[1] = NULL;
    892 
    893 	} else {
    894 		int i;
    895 		char **tnames;
    896 		char **tmp;
    897 
    898 		for (i = 0; non_ff_drivers[i] != NULL; i++) {
    899 			if (strcmp(s, non_ff_drivers[i]) == 0) {
    900 				mutex_exit(&non_ff_drv_mutex);
    901 				return (0);
    902 			}
    903 		}
    904 
    905 		/* allow for new element and null */
    906 		i += 2;
    907 		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
    908 		if (tnames == NULL) {
    909 			mutex_exit(&non_ff_drv_mutex);
    910 			return (1);
    911 		}
    912 
    913 		for (i = 0; non_ff_drivers[i] != NULL; i++)
    914 			tnames[i] = non_ff_drivers[i];
    915 
    916 		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
    917 		if (tnames[i] == NULL) {
    918 			/* adjust i so that it is the right count to free */
    919 			kmem_free(tnames, (i + 2) * sizeof (char *));
    920 			mutex_exit(&non_ff_drv_mutex);
    921 			return (1);
    922 		}
    923 
    924 		(void) strcpy(tnames[i++], s);
    925 		tnames[i] = NULL;
    926 
    927 		tmp = non_ff_drivers;
    928 		non_ff_drivers = tnames;
    929 		/* i now represents the count we previously alloced */
    930 		kmem_free(tmp, i * sizeof (char *));
    931 	}
    932 	mutex_exit(&non_ff_drv_mutex);
    933 
    934 	return (1);
    935 }
    936 
    937 /*
    938  * Check for the "ddi-failfast-supported" devtree property on each submirror
    939  * component to indicate if we should do I/O to that submirror with the
    940  * B_FAILFAST flag set or not.  This check is made at various state transitions
    941  * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
    942  * only need to check one drive (e.g. hotspare) but since the check is
    943  * fast and infrequent and sometimes needs to be done on all components we
    944  * just check all components on each call.
    945  */
    946 void
    947 mirror_check_failfast(minor_t mnum)
    948 {
    949 	int		i;
    950 	mm_unit_t	*un;
    951 
    952 	if (md_ff_disable)
    953 		return;
    954 
    955 	un = MD_UNIT(mnum);
    956 
    957 	for (i = 0; i < NMIRROR; i++) {
    958 		int			ci;
    959 		int			cnt;
    960 		int			ff = 1;
    961 		mm_submirror_t		*sm;
    962 		mm_submirror_ic_t	*smic;
    963 		void			(*get_dev)();
    964 
    965 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
    966 			continue;
    967 
    968 		sm = &un->un_sm[i];
    969 		smic = &un->un_smic[i];
    970 
    971 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
    972 		    "get device", 0);
    973 
    974 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
    975 		for (ci = 0; ci < cnt; ci++) {
    976 			int		found = 0;
    977 			dev_t		ci_dev;
    978 			major_t		major;
    979 			dev_info_t	*devi;
    980 			ms_cd_info_t	cd;
    981 
    982 			/*
    983 			 * this already returns the hs
    984 			 * dev if the device is spared
    985 			 */
    986 			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
    987 
    988 			ci_dev = md_dev64_to_dev(cd.cd_dev);
    989 			major = getmajor(ci_dev);
    990 
    991 			if (major == md_major) {
    992 				/*
    993 				 * this component must be a soft
    994 				 * partition; get the real dev
    995 				 */
    996 				minor_t	dev_mnum;
    997 				mdi_unit_t	*ui;
    998 				mp_unit_t	*un;
    999 				set_t	setno;
   1000 				side_t	side;
   1001 				md_dev64_t	tmpdev;
   1002 
   1003 				ui = MDI_UNIT(getminor(ci_dev));
   1004 
   1005 				/* grab necessary lock */
   1006 				un = (mp_unit_t *)md_unit_readerlock(ui);
   1007 
   1008 				dev_mnum = MD_SID(un);
   1009 				setno = MD_MIN2SET(dev_mnum);
   1010 				side = mddb_getsidenum(setno);
   1011 
   1012 				tmpdev = un->un_dev;
   1013 
   1014 				/* Get dev by device id */
   1015 				if (md_devid_found(setno, side,
   1016 				    un->un_key) == 1) {
   1017 					tmpdev = md_resolve_bydevid(dev_mnum,
   1018 					    tmpdev, un->un_key);
   1019 				}
   1020 
   1021 				md_unit_readerexit(ui);
   1022 
   1023 				ci_dev = md_dev64_to_dev(tmpdev);
   1024 				major = getmajor(ci_dev);
   1025 			}
   1026 
   1027 			if (ci_dev != NODEV32 &&
   1028 			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
   1029 			    != NULL) {
   1030 				ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
   1031 				int		propvalue = 0;
   1032 				int		proplength = sizeof (int);
   1033 				int		error;
   1034 				struct cb_ops	*cb;
   1035 
   1036 				if ((cb = devopsp[major]->devo_cb_ops) !=
   1037 				    NULL) {
   1038 					error = (*cb->cb_prop_op)
   1039 					    (DDI_DEV_T_ANY, devi, prop_op,
   1040 					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
   1041 					    "ddi-failfast-supported",
   1042 					    (caddr_t)&propvalue, &proplength);
   1043 
   1044 					if (error == DDI_PROP_SUCCESS)
   1045 						found = 1;
   1046 				}
   1047 
   1048 				if (!found && new_non_ff_driver(
   1049 				    ddi_driver_name(devi))) {
   1050 					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O"
   1051 					    "disabled on %s",
   1052 					    ddi_driver_name(devi));
   1053 				}
   1054 
   1055 				ddi_release_devi(devi);
   1056 			}
   1057 
   1058 			/*
   1059 			 * All components must support
   1060 			 * failfast in the submirror.
   1061 			 */
   1062 			if (!found) {
   1063 				ff = 0;
   1064 				break;
   1065 			}
   1066 		}
   1067 
   1068 		if (ff) {
   1069 			sm->sm_flags |= MD_SM_FAILFAST;
   1070 		} else {
   1071 			sm->sm_flags &= ~MD_SM_FAILFAST;
   1072 		}
   1073 	}
   1074 }
   1075 
   1076 /*
   1077  * Return true if the submirror is unavailable.
   1078  * If any of the submirror components are opened then the submirror cannot
   1079  * be unavailable (MD_INACCESSIBLE).
   1080  * If any of the components are already in the errored state, then the submirror
   1081  * cannot be unavailable (MD_INACCESSIBLE).
   1082  */
   1083 static bool_t
   1084 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
   1085 {
   1086 	mm_submirror_t		*sm;
   1087 	mm_submirror_ic_t	*smic;
   1088 	md_m_shared_t		*shared;
   1089 	int			ci;
   1090 	int			compcnt;
   1091 
   1092 	sm = &un->un_sm[smi];
   1093 	smic = &un->un_smic[smi];
   1094 
   1095 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
   1096 	for (ci = 0; ci < compcnt; ci++) {
   1097 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
   1098 		    (sm->sm_dev, sm, ci);
   1099 		if (from_probe) {
   1100 			if (shared->ms_flags & MDM_S_PROBEOPEN)
   1101 				return (B_FALSE);
   1102 		} else {
   1103 			if (shared->ms_flags & MDM_S_ISOPEN)
   1104 				return (B_FALSE);
   1105 		}
   1106 		if (shared->ms_state == CS_ERRED ||
   1107 		    shared->ms_state == CS_LAST_ERRED)
   1108 			return (B_FALSE);
   1109 	}
   1110 
   1111 	return (B_TRUE);
   1112 }
   1113 
/*
 * Open every in-use submirror of the mirror with minor number 'mnum' and
 * bring the MD_INACCESSIBLE state of the mirror and of each submirror in
 * line with the outcome.
 *
 * md_oflags is handed to md_layered_open() for the first open attempt (and
 * to mirror_close_all_devs() on the failure path).  When lockp is non-NULL
 * the md_ioctl_* lock routines are used for the lock transitions below,
 * otherwise the md_unit_* routines are used.
 *
 * The routine drops the unit reader lock, takes the writer lock while it
 * changes state, and re-takes the reader lock before each return.
 * NOTE(review): this implies the caller holds the reader lock on entry -
 * confirm against the callers.
 *
 * Returns 0 on success.  Returns ENXIO, after closing all devices again,
 * when mirror_other_sources() reports 1 for a component found by
 * mirror_geterror().
 */
static int
mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
{
	int		i;
	mm_unit_t	*un;
	mdi_unit_t	*ui;
	int		err;
	int		smi;
	int		ci;
	err_comp_t	*c;
	err_comp_t	*ecomps = NULL;
	int		smmask = 0;
	set_t		setno;
	int		sm_cnt;
	int		sm_unavail_cnt;

	mirror_check_failfast(mnum);

	un = MD_UNIT(mnum);
	ui = MDI_UNIT(mnum);
	setno = MD_UN2SET(un);

	/*
	 * First pass: try to open each in-use submirror and record the ones
	 * whose open returned non-zero in smmask.
	 */
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev = un->un_sm[i].sm_dev;

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;
		if (md_layered_open(mnum, &tmpdev, md_oflags))
			smmask |= SMI2BIT(i);
		/* md_layered_open() may have updated the device number */
		un->un_sm[i].sm_dev = tmpdev;
	}

	/*
	 * If smmask is clear, all submirrors are accessible. Clear the
	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
	 * mirror device.   If smmask is set, we have to determine which of the
	 * submirrors are in error. If no submirror is accessible we mark the
	 * whole mirror as MD_INACCESSIBLE.
	 */
	if (smmask == 0) {
		/* swap reader for writer lock around the ui_tstate update */
		if (lockp) {
			md_ioctl_readerexit(lockp);
			(void) md_ioctl_writerlock(lockp, ui);
		} else {
			md_unit_readerexit(ui);
			(void) md_unit_writerlock(ui);
		}
		ui->ui_tstate &= ~MD_INACCESSIBLE;
		if (lockp) {
			md_ioctl_writerexit(lockp);
			(void) md_ioctl_readerlock(lockp, ui);
		} else {
			md_unit_writerexit(ui);
			(void) md_unit_readerlock(ui);
		}

		/* clear the flag on each submirror under its own writer lock */
		for (i = 0; i < NMIRROR; i++) {
			md_dev64_t	tmpdev;
			mdi_unit_t	*sm_ui;

			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
				continue;

			tmpdev = un->un_sm[i].sm_dev;
			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
			(void) md_unit_writerlock(sm_ui);
			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
			md_unit_writerexit(sm_ui);
		}

		return (0);
	}

	/*
	 * Second pass: reopen only the submirrors that failed above, this
	 * time with MD_OFLG_CONT_ERRS; that open is expected to return 0.
	 */
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev;

		if (!(smmask & SMI2BIT(i)))
			continue;

		tmpdev = un->un_sm[i].sm_dev;
		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
		un->un_sm[i].sm_dev = tmpdev;
		ASSERT(err == 0);
	}

	/* switch to the writer lock; un is refreshed from the lock call */
	if (lockp) {
		md_ioctl_readerexit(lockp);
		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
	} else {
		md_unit_readerexit(ui);
		un = (mm_unit_t *)md_unit_writerlock(ui);
	}

	/*
	 * We want to make sure the unavailable flag is not masking a real
	 * error on the submirror.
	 * For each submirror,
	 *    if all of the submirror components couldn't be opened and there
	 *    are no errors on the submirror, then set the unavailable flag
	 *    otherwise, clear unavailable.
	 */
	sm_cnt = 0;
	sm_unavail_cnt = 0;
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t	tmpdev;
		mdi_unit_t	*sm_ui;

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;

		sm_cnt++;
		tmpdev = un->un_sm[i].sm_dev;
		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));

		(void) md_unit_writerlock(sm_ui);
		if (submirror_unavailable(un, i, 0)) {
			sm_ui->ui_tstate |= MD_INACCESSIBLE;
			sm_unavail_cnt++;
		} else {
			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
		}
		md_unit_writerexit(sm_ui);
	}

	/*
	 * If all of the submirrors are unavailable, the mirror is also
	 * unavailable.
	 */
	if (sm_cnt == sm_unavail_cnt) {
		ui->ui_tstate |= MD_INACCESSIBLE;
	} else {
		ui->ui_tstate &= ~MD_INACCESSIBLE;
	}

	/*
	 * Walk the components reported by mirror_geterror().  Each one is
	 * queued on ecomps for a state change unless mirror_other_sources()
	 * returns 1, in which case the whole open fails.
	 */
	smi = 0;
	ci = 0;
	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
		if (mirror_other_sources(un, smi, ci, 1) == 1) {

			/* undo: drop queued entries, close, notify, relock */
			free_all_ecomps(ecomps);
			(void) mirror_close_all_devs(un, md_oflags);
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
			mirror_openfail_console_info(un, smi, ci);
			if (lockp) {
				md_ioctl_writerexit(lockp);
				(void) md_ioctl_readerlock(lockp, ui);
			} else {
				md_unit_writerexit(ui);
				(void) md_unit_readerlock(ui);
			}
			return (ENXIO);
		}

		/* track all component states that need changing */
		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
		c->ec_next = ecomps;
		c->ec_smi = smi;
		c->ec_ci = ci;
		ecomps = c;
		/* resume the search after the component just recorded */
		ci++;
	}

	/* Make all state changes and commit them */
	for (c = ecomps; c != NULL; c = c->ec_next) {
		/*
		 * If lockp is set, then entering kernel through ioctl.
		 * For a MN set, the only ioctl path is via a commd message
		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
		 * being sent to each node.
		 * In this case, set NO_XMIT so that set_sm_comp_state
		 * won't attempt to send a message on a message.
		 *
		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
		 * which flag is passed.
		 */
		if (lockp) {
			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
			    MD_STATE_NO_XMIT, lockp);
		} else {
			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
		}
		/*
		 * For a MN set, the NOTIFY is done when the state change is
		 * processed on each node
		 */
		if (!MD_MNSET_SETNO(setno)) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
		}
	}

	/* state changes are done; go back to holding the reader lock */
	if (lockp) {
		md_ioctl_writerexit(lockp);
		(void) md_ioctl_readerlock(lockp, ui);
	} else {
		md_unit_writerexit(ui);
		(void) md_unit_readerlock(ui);
	}

	free_all_ecomps(ecomps);

	/* allocate hotspares for all errored components */
	if (MD_MNSET_SETNO(setno)) {
		/*
		 * If we're called from an ioctl (lockp set) then we cannot
		 * directly call send_poke_hotspares as this will block until
		 * the message gets despatched to all nodes. If the cluster is
		 * going through a reconfig cycle then the message will block
		 * until the cycle is complete, and as we originate from a
		 * service call from commd we will livelock.
		 */
		if (lockp == NULL) {
			/* the reader lock is released around the message */
			md_unit_readerexit(ui);
			send_poke_hotspares(setno);
			(void) md_unit_readerlock(ui);
		}
	} else {
		(void) poke_hotspares();
	}
	return (0);
}
   1337 
   1338 void
   1339 mirror_overlap_tree_remove(md_mps_t *ps)
   1340 {
   1341 	mm_unit_t	*un;
   1342 
   1343 	if (panicstr)
   1344 		return;
   1345 
   1346 	VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
   1347 	un = ps->ps_un;
   1348 
   1349 	mutex_enter(&un->un_overlap_tree_mx);
   1350 	avl_remove(&un->un_overlap_root, ps);
   1351 	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
   1352 	if (un->un_overlap_tree_flag != 0) {
   1353 		un->un_overlap_tree_flag = 0;
   1354 		cv_broadcast(&un->un_overlap_tree_cv);
   1355 	}
   1356 	mutex_exit(&un->un_overlap_tree_mx);
   1357 }
   1358 
   1359 
   1360 /*
   1361  * wait_for_overlaps:
   1362  * -----------------
   1363  * Check that given i/o request does not cause an overlap with already pending
   1364  * i/o. If it does, block until the overlapped i/o completes.
   1365  *
   1366  * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
   1367  * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
   1368  * it must not already be in the tree.
   1369  */
   1370 static void
   1371 wait_for_overlaps(md_mps_t *ps, int flags)
   1372 {
   1373 	mm_unit_t	*un;
   1374 	avl_index_t	where;
   1375 	md_mps_t	*ps1;
   1376 
   1377 	if (panicstr)
   1378 		return;
   1379 
   1380 	un = ps->ps_un;
   1381 	mutex_enter(&un->un_overlap_tree_mx);
   1382 	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
   1383 	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
   1384 		mutex_exit(&un->un_overlap_tree_mx);
   1385 		return;
   1386 	}
   1387 
   1388 	VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
   1389 
   1390 	do {
   1391 		ps1 = avl_find(&un->un_overlap_root, ps, &where);
   1392 		if (ps1 == NULL) {
   1393 			/*
   1394 			 * The candidate range does not overlap with any
   1395 			 * range in the tree.  Insert it and be done.
   1396 			 */
   1397 			avl_insert(&un->un_overlap_root, ps, where);
   1398 			ps->ps_flags |= MD_MPS_ON_OVERLAP;
   1399 		} else {
   1400 			/*
   1401 			 * The candidate range would overlap.  Set the flag
   1402 			 * indicating we need to be woken up, and sleep
   1403 			 * until another thread removes a range.  If upon
   1404 			 * waking up we find this mps was put on the tree
   1405 			 * by another thread, the loop terminates.
   1406 			 */
   1407 			un->un_overlap_tree_flag = 1;
   1408 			cv_wait(&un->un_overlap_tree_cv,
   1409 			    &un->un_overlap_tree_mx);
   1410 		}
   1411 	} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
   1412 	mutex_exit(&un->un_overlap_tree_mx);
   1413 }
   1414 
   1415 /*
   1416  * This function is called from mirror_done to check whether any pages have
   1417  * been modified while a mirrored write was in progress.  Returns 0 if
   1418  * all pages associated with bp are clean, 1 otherwise.
   1419  */
   1420 static int
   1421 any_pages_dirty(struct buf *bp)
   1422 {
   1423 	int	rval;
   1424 
   1425 	rval = biomodified(bp);
   1426 	if (rval == -1)
   1427 		rval = 0;
   1428 
   1429 	return (rval);
   1430 }
   1431 
   1432 #define	MAX_EXTRAS 10
   1433 
   1434 void
   1435 mirror_commit(
   1436 	mm_unit_t	*un,
   1437 	int		smmask,
   1438 	mddb_recid_t	*extras
   1439 )
   1440 {
   1441 	mm_submirror_t		*sm;
   1442 	md_unit_t		*su;
   1443 	int			i;
   1444 
   1445 	/* 2=mirror,null id */
   1446 	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];
   1447 
   1448 	int			ri = 0;
   1449 
   1450 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
   1451 		return;
   1452 
   1453 	/* Add two, this includes the mirror unit and the null recid */
   1454 	if (extras != NULL) {
   1455 		int	nrecids = 0;
   1456 		while (extras[nrecids] != 0) {
   1457 			nrecids++;
   1458 		}
   1459 		ASSERT(nrecids <= MAX_EXTRAS);
   1460 	}
   1461 
   1462 	if (un != NULL)
   1463 		recids[ri++] = un->c.un_record_id;
   1464 	for (i = 0;  i < NMIRROR; i++) {
   1465 		if (!(smmask & SMI2BIT(i)))
   1466 			continue;
   1467 		sm = &un->un_sm[i];
   1468 		if (!SMS_IS(sm, SMS_INUSE))
   1469 			continue;
   1470 		if (md_getmajor(sm->sm_dev) != md_major)
   1471 			continue;
   1472 		su =  MD_UNIT(md_getminor(sm->sm_dev));
   1473 		recids[ri++] = su->c.un_record_id;
   1474 	}
   1475 
   1476 	if (extras != NULL)
   1477 		while (*extras != 0) {
   1478 			recids[ri++] = *extras;
   1479 			extras++;
   1480 		}
   1481 
   1482 	if (ri == 0)
   1483 		return;
   1484 	recids[ri] = 0;
   1485 
   1486 	/*
   1487 	 * Ok to hold ioctl lock across record commit to mddb as
   1488 	 * long as the record(s) being committed aren't resync records.
   1489 	 */
   1490 	mddb_commitrecs_wrapper(recids);
   1491 }
   1492 
   1493 
   1494 /*
   1495  * This routine is used to set a bit in the writable_bm bitmap
   1496  * which represents each submirror in a metamirror which
   1497  * is writable. The first writable submirror index is assigned
   1498  * to the sm_index.  The number of writable submirrors are returned in nunits.
   1499  *
   1500  * This routine returns the submirror's unit number.
   1501  */
   1502 
   1503 static void
   1504 select_write_units(struct mm_unit *un, md_mps_t *ps)
   1505 {
   1506 
   1507 	int		i;
   1508 	unsigned	writable_bm = 0;
   1509 	unsigned	nunits = 0;
   1510 
   1511 	for (i = 0; i < NMIRROR; i++) {
   1512 		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
   1513 			/* set bit of all writable units */
   1514 			writable_bm |= SMI2BIT(i);
   1515 			nunits++;
   1516 		}
   1517 	}
   1518 	ps->ps_writable_sm = writable_bm;
   1519 	ps->ps_active_cnt = nunits;
   1520 	ps->ps_current_sm = 0;
   1521 }
   1522 
   1523 static
   1524 unsigned
   1525 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
   1526 {
   1527 
   1528 	int		i;
   1529 	unsigned	writable_bm = 0;
   1530 	unsigned	nunits = 0;
   1531 
   1532 	for (i = 0; i < NMIRROR; i++) {
   1533 		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
   1534 		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
   1535 			writable_bm |= SMI2BIT(i);
   1536 			nunits++;
   1537 		}
   1538 	}
   1539 	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
   1540 		writable_bm &= ~ps->ps_allfrom_sm;
   1541 		nunits--;
   1542 	}
   1543 	ps->ps_writable_sm = writable_bm;
   1544 	ps->ps_active_cnt = nunits;
   1545 	ps->ps_current_sm = 0;
   1546 	return (nunits);
   1547 }
   1548 
   1549 static md_dev64_t
   1550 select_read_unit(
   1551 	mm_unit_t	*un,
   1552 	diskaddr_t	blkno,
   1553 	u_longlong_t	reqcount,
   1554 	u_longlong_t	*cando,
   1555 	int		must_be_opened,
   1556 	md_m_shared_t	**shared,
   1557 	md_mcs_t	*cs)
   1558 {
   1559 	int			i;
   1560 	md_m_shared_t		*s;
   1561 	uint_t			lasterrcnt = 0;
   1562 	md_dev64_t		dev = 0;
   1563 	u_longlong_t		cnt;
   1564 	u_longlong_t		mincnt;
   1565 	mm_submirror_t		*sm;
   1566 	mm_submirror_ic_t	*smic;
   1567 	mdi_unit_t		*ui;
   1568 
   1569 	mincnt = reqcount;
   1570 	for (i = 0; i < NMIRROR; i++) {
   1571 		if (!SUBMIRROR_IS_READABLE(un, i))
   1572 			continue;
   1573 		sm = &un->un_sm[i];
   1574 		smic = &un->un_smic[i];
   1575 		cnt = reqcount;
   1576 
   1577 		/*
   1578 		 * If the current submirror is marked as inaccessible, do not
   1579 		 * try to access it.
   1580 		 */
   1581 		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
   1582 		(void) md_unit_readerlock(ui);
   1583 		if (ui->ui_tstate & MD_INACCESSIBLE) {
   1584 			md_unit_readerexit(ui);
   1585 			continue;
   1586 		}
   1587 		md_unit_readerexit(ui);
   1588 
   1589 		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
   1590 		    (sm->sm_dev, sm, blkno, &cnt);
   1591 
   1592 		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
   1593 			continue;
   1594 		if (s->ms_state == CS_OKAY) {
   1595 			*cando = cnt;
   1596 			if (shared != NULL)
   1597 				*shared = s;
   1598 
   1599 			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
   1600 			    cs != NULL) {
   1601 				cs->cs_buf.b_flags |= B_FAILFAST;
   1602 			}
   1603 
   1604 			return (un->un_sm[i].sm_dev);
   1605 		}
   1606 		if (s->ms_state != CS_LAST_ERRED)
   1607 			continue;
   1608 
   1609 		/* don't use B_FAILFAST since we're Last Erred */
   1610 
   1611 		if (mincnt > cnt)
   1612 			mincnt = cnt;
   1613 		if (s->ms_lasterrcnt > lasterrcnt) {
   1614 			lasterrcnt = s->ms_lasterrcnt;
   1615 			if (shared != NULL)
   1616 				*shared = s;
   1617 			dev = un->un_sm[i].sm_dev;
   1618 		}
   1619 	}
   1620 	*cando = mincnt;
   1621 	return (dev);
   1622 }
   1623 
   1624 /*
   1625  * Given a 32-bit bitmap, this routine will return the bit number
   1626  * of the nth bit set.	The nth bit set is passed via the index integer.
   1627  *
   1628  * This routine is used to run through the writable submirror bitmap
   1629  * and starting all of the writes.  See the value returned is the
   1630  * index to appropriate submirror structure, in the md_sm
   1631  * array for metamirrors.
   1632  */
   1633 static int
   1634 md_find_nth_unit(uint_t mask, int index)
   1635 {
   1636 	int	bit, nfound;
   1637 
   1638 	for (bit = -1, nfound = -1; nfound != index; bit++) {
   1639 		ASSERT(mask != 0);
   1640 		nfound += (mask & 1);
   1641 		mask >>= 1;
   1642 	}
   1643 	return (bit);
   1644 }
   1645 
   1646 static int
   1647 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
   1648 {
   1649 	mm_unit_t	*un;
   1650 	buf_t		*bp;
   1651 	int		i;
   1652 	unsigned	nunits = 0;
   1653 	int		iunit;
   1654 	uint_t		running_bm = 0;
   1655 	uint_t		sm_index;
   1656 
   1657 	bp = &cs->cs_buf;
   1658 	un = ps->ps_un;
   1659 
   1660 	for (i = 0; i < NMIRROR; i++) {
   1661 		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
   1662 			continue;
   1663 		running_bm |= SMI2BIT(i);
   1664 		nunits++;
   1665 	}
   1666 	if (nunits == 0)
   1667 		return (1);
   1668 
   1669 	/*
   1670 	 * For directed mirror read (DMR) we only use the specified side and
   1671 	 * do not compute the source of the read.
   1672 	 * If we're running with MD_MPS_DIRTY_RD set we always return the
   1673 	 * first mirror side (this prevents unnecessary ownership switching).
   1674 	 * Otherwise we return the submirror according to the mirror read option
   1675 	 */
   1676 	if (ps->ps_flags & MD_MPS_DMR) {
   1677 		sm_index = un->un_dmr_last_read;
   1678 	} else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
   1679 		sm_index = md_find_nth_unit(running_bm, 0);
   1680 	} else {
   1681 		/* Normal (non-DMR) operation */
   1682 		switch (un->un_read_option) {
   1683 		case RD_GEOMETRY:
   1684 			iunit = (int)(bp->b_lblkno /
   1685 			    howmany(un->c.un_total_blocks, nunits));
   1686 			sm_index = md_find_nth_unit(running_bm, iunit);
   1687 			break;
   1688 		case RD_FIRST:
   1689 			sm_index = md_find_nth_unit(running_bm, 0);
   1690 			break;
   1691 		case RD_LOAD_BAL:
   1692 			/* this is intentional to fall into the default */
   1693 		default:
   1694 			un->un_last_read = (un->un_last_read + 1) % nunits;
   1695 			sm_index = md_find_nth_unit(running_bm,
   1696 			    un->un_last_read);
   1697 			break;
   1698 		}
   1699 	}
   1700 	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
   1701 	ps->ps_allfrom_sm = SMI2BIT(sm_index);
   1702 
   1703 	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
   1704 		bp->b_flags |= B_FAILFAST;
   1705 	}
   1706 
   1707 	return (0);
   1708 }
   1709 
   1710 static
   1711 int
   1712 mirror_are_submirrors_available(mm_unit_t *un)
   1713 {
   1714 	int i;
   1715 	for (i = 0; i < NMIRROR; i++) {
   1716 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
   1717 
   1718 		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
   1719 		    md_getmajor(tmpdev) != md_major)
   1720 			continue;
   1721 
   1722 		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
   1723 		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
   1724 			return (0);
   1725 
   1726 		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
   1727 			return (0);
   1728 	}
   1729 	return (1);
   1730 }
   1731 
   1732 void
   1733 build_submirror(mm_unit_t *un, int i, int snarfing)
   1734 {
   1735 	struct mm_submirror	*sm;
   1736 	struct mm_submirror_ic	*smic;
   1737 	md_unit_t		*su;
   1738 	set_t			setno;
   1739 
   1740 	sm = &un->un_sm[i];
   1741 	smic = &un->un_smic[i];
   1742 
   1743 	sm->sm_flags = 0; /* sometime we may need to do more here */
   1744 
   1745 	setno = MD_UN2SET(un);
   1746 
   1747 	if (!SMS_IS(sm, SMS_INUSE))
   1748 		return;
   1749 	if (snarfing) {
   1750 		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
   1751 		    sm->sm_key, MD_NOTRUST_DEVT);
   1752 	} else {
   1753 		if (md_getmajor(sm->sm_dev) == md_major) {
   1754 			su = MD_UNIT(md_getminor(sm->sm_dev));
   1755 			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
   1756 			/* submirror can no longer be soft partitioned */
   1757 			MD_CAPAB(su) &= (~MD_CAN_SP);
   1758 		}
   1759 	}
   1760 	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
   1761 	    0, "shared by blk", 0);
   1762 	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
   1763 	    0, "shared by indx", 0);
   1764 	smic->sm_get_component_count = (int (*)())md_get_named_service(
   1765 	    sm->sm_dev, 0, "get component count", 0);
   1766 	smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
   1767 	    "get block count skip size", 0);
   1768 	sm->sm_state &= ~SMS_IGNORE;
   1769 	if (SMS_IS(sm, SMS_OFFLINE))
   1770 		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
   1771 	md_set_parent(sm->sm_dev, MD_SID(un));
   1772 }
   1773 
   1774 static void
   1775 mirror_cleanup(mm_unit_t *un)
   1776 {
   1777 	mddb_recid_t	recid;
   1778 	int		smi;
   1779 	sv_dev_t	sv[NMIRROR];
   1780 	int		nsv = 0;
   1781 
   1782 	/*
   1783 	 * If a MN diskset and this node is not the master, do
   1784 	 * not delete any records on snarf of the mirror records.
   1785 	 */
   1786 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
   1787 	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
   1788 		return;
   1789 	}
   1790 
   1791 	for (smi = 0; smi < NMIRROR; smi++) {
   1792 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
   1793 			continue;
   1794 		sv[nsv].setno = MD_UN2SET(un);
   1795 		sv[nsv++].key = un->un_sm[smi].sm_key;
   1796 	}
   1797 
   1798 	recid = un->un_rr_dirty_recid;
   1799 	mddb_deleterec_wrapper(un->c.un_record_id);
   1800 	if (recid > 0)
   1801 		mddb_deleterec_wrapper(recid);
   1802 
   1803 	md_rem_names(sv, nsv);
   1804 }
   1805 
   1806 /*
   1807  * Comparison function for the avl tree which tracks
   1808  * outstanding writes on submirrors.
   1809  *
   1810  * Returns:
   1811  *	-1: ps1 < ps2
   1812  *	 0: ps1 and ps2 overlap
   1813  *	 1: ps1 > ps2
   1814  */
   1815 static int
   1816 mirror_overlap_compare(const void *p1, const void *p2)
   1817 {
   1818 	const md_mps_t *ps1 = (md_mps_t *)p1;
   1819 	const md_mps_t *ps2 = (md_mps_t *)p2;
   1820 
   1821 	if (ps1->ps_firstblk < ps2->ps_firstblk) {
   1822 		if (ps1->ps_lastblk >= ps2->ps_firstblk)
   1823 			return (0);
   1824 		return (-1);
   1825 	}
   1826 
   1827 	if (ps1->ps_firstblk > ps2->ps_firstblk) {
   1828 		if (ps1->ps_firstblk <= ps2->ps_lastblk)
   1829 			return (0);
   1830 		return (1);
   1831 	}
   1832 
   1833 	return (0);
   1834 }
   1835 
   1836 /*
   1837  * Collapse any sparse submirror entries snarfed from the on-disk replica.
   1838  * Only the in-core entries are updated. The replica will be updated on-disk
   1839  * when the in-core replica is committed on shutdown of the SVM subsystem.
   1840  */
   1841 static void
   1842 collapse_submirrors(mm_unit_t *un)
   1843 {
   1844 	int			smi, nremovals, smiremove;
   1845 	mm_submirror_t		*sm, *new_sm, *old_sm;
   1846 	mm_submirror_ic_t	*smic;
   1847 	int			nsmidx = un->un_nsm - 1;
   1848 
   1849 rescan:
   1850 	nremovals = 0;
   1851 	smiremove = -1;
   1852 
   1853 	for (smi = 0; smi <= nsmidx; smi++) {
   1854 		sm = &un->un_sm[smi];
   1855 
   1856 		/*
   1857 		 * Check to see if this submirror is marked as in-use.
   1858 		 * If it isn't then it is a potential sparse entry and
   1859 		 * may need to be cleared from the configuration.
   1860 		 * The records should _already_ have been cleared by the
   1861 		 * original mirror_detach() code, but we need to shuffle
   1862 		 * any NULL entries in un_sm[] to the end of the array.
   1863 		 * Any NULL un_smic[] entries need to be reset to the underlying
   1864 		 * submirror/slice accessor functions.
   1865 		 */
   1866 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
   1867 			nremovals++;
   1868 			smiremove = smi;
   1869 			break;
   1870 		}
   1871 	}
   1872 
   1873 	if (nremovals == 0) {
   1874 		/*
   1875 		 * Ensure that we have a matching contiguous set of un_smic[]
   1876 		 * entries for the corresponding un_sm[] entries
   1877 		 */
   1878 		for (smi = 0; smi <= nsmidx; smi++) {
   1879 			smic = &un->un_smic[smi];
   1880 			sm = &un->un_sm[smi];
   1881 
   1882 			smic->sm_shared_by_blk =
   1883 			    md_get_named_service(sm->sm_dev, 0,
   1884 			    "shared by_blk", 0);
   1885 			smic->sm_shared_by_indx =
   1886 			    md_get_named_service(sm->sm_dev, 0,
   1887 			    "shared by indx", 0);
   1888 			smic->sm_get_component_count =
   1889 			    (int (*)())md_get_named_service(sm->sm_dev, 0,
   1890 			    "get component count", 0);
   1891 			smic->sm_get_bcss =
   1892 			    (int (*)())md_get_named_service(sm->sm_dev, 0,
   1893 			    "get block count skip size", 0);
   1894 		}
   1895 		return;
   1896 	}
   1897 
   1898 	/*
   1899 	 * Reshuffle the submirror devices so that we do not have a dead record
   1900 	 * in the middle of the array. Once we've done this we need to rescan
   1901 	 * the mirror to check for any other holes.
   1902 	 */
   1903 	for (smi = 0; smi < NMIRROR; smi++) {
   1904 		if (smi < smiremove)
   1905 			continue;
   1906 		if (smi > smiremove) {
   1907 			old_sm = &un->un_sm[smi];
   1908 			new_sm = &un->un_sm[smi - 1];
   1909 			bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
   1910 			bzero(old_sm, sizeof (mm_submirror_t));
   1911 		}
   1912 	}
   1913 
   1914 	/*
   1915 	 * Now we need to rescan the array to find the next potential dead
   1916 	 * entry.
   1917 	 */
   1918 	goto rescan;
   1919 }
   1920 
   1921 /* Return a -1 if optimized record unavailable and set should be released */
   1922 int
   1923 mirror_build_incore(mm_unit_t *un, int snarfing)
   1924 {
   1925 	int		i;
   1926 
   1927 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
   1928 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
   1929 		return (1);
   1930 	}
   1931 
   1932 	if (mirror_are_submirrors_available(un) == 0)
   1933 		return (1);
   1934 
   1935 	if (MD_UNIT(MD_SID(un)) != NULL)
   1936 		return (0);
   1937 
   1938 	MD_STATUS(un) = 0;
   1939 
   1940 	/* pre-4.1 didn't define CAN_META_CHILD capability */
   1941 	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
   1942 
   1943 	un->un_overlap_tree_flag = 0;
   1944 	avl_create(&un->un_overlap_root, mirror_overlap_compare,
   1945 	    sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
   1946 
   1947 	/*
   1948 	 * We need to collapse any sparse submirror entries into a non-sparse
   1949 	 * array. This is to cover the case where we have an old replica image
   1950 	 * which has not been updated (i.e. snarfed) since being modified.
   1951 	 * The new code expects all submirror access to be sequential (i.e.
   1952 	 * both the un_sm[] and un_smic[] entries correspond to non-empty
   1953 	 * submirrors.
   1954 	 */
   1955 
   1956 	collapse_submirrors(un);
   1957 
   1958 	for (i = 0; i < NMIRROR; i++)
   1959 		build_submirror(un, i, snarfing);
   1960 
   1961 	if (unit_setup_resync(un, snarfing) != 0) {
   1962 		if (snarfing) {
   1963 			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
   1964 			/*
   1965 			 * If a MN set and set is not stale, then return -1
   1966 			 * which will force the caller to unload the set.
   1967 			 * The MN diskset nodes will return failure if
   1968 			 * unit_setup_resync fails so that nodes won't
   1969 			 * get out of sync.
   1970 			 *
   1971 			 * If set is STALE, the master node can't allocate
   1972 			 * a resync record (if needed), but node needs to
   1973 			 * join the set so that user can delete broken mddbs.
   1974 			 * So, if set is STALE, just continue on.
   1975 			 */
   1976 			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
   1977 			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
   1978 				return (-1);
   1979 			}
   1980 		} else
   1981 			return (1);
   1982 	}
   1983 
   1984 	mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
   1985 	cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
   1986 
   1987 	un->un_suspend_wr_flag = 0;
   1988 	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
   1989 	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
   1990 
   1991 	/*
   1992 	 * Allocate mutexes for mirror-owner and resync-owner changes.
   1993 	 * All references to the owner message state field must be guarded
   1994 	 * by this mutex.
   1995 	 */
   1996 	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
   1997 
   1998 	/*
   1999 	 * Allocate mutex and condvar for resync thread manipulation. These
   2000 	 * will be used by mirror_resync_unit/mirror_ioctl_resync
   2001 	 */
   2002 	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
   2003 	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
   2004 
   2005 	/*
   2006 	 * Allocate mutex and condvar for resync progress thread manipulation.
   2007 	 * This allows resyncs to be continued across an intervening reboot.
   2008 	 */
   2009 	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
   2010 	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
   2011 
   2012 	/*
   2013 	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
   2014 	 * provides synchronization between a user-ioctl and the resulting
   2015 	 * strategy() call that performs the read().
   2016 	 */
   2017 	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
   2018 	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
   2019 
   2020 	/*
   2021 	 * Allocate rwlocks for un_pernode_dirty_bm accessing.
   2022 	 */
   2023 	for (i = 0; i < MD_MNMAXSIDES; i++) {
   2024 		rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
   2025 	}
   2026 
   2027 	/* place various information in the in-core data structures */
   2028 	md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
   2029 	MD_UNIT(MD_SID(un)) = un;
   2030 
   2031 	return (0);
   2032 }
   2033 
   2034 
   2035 void
   2036 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
   2037 {
   2038 	mddb_recid_t	recid, vtoc_id;
   2039 	size_t		bitcnt;
   2040 	size_t		shortcnt;
   2041 	int		smi;
   2042 	sv_dev_t	sv[NMIRROR];
   2043 	int		nsv = 0;
   2044 	uint_t		bits = 0;
   2045 	minor_t		selfid;
   2046 	md_unit_t	*su;
   2047 	int		i;
   2048 
   2049 	md_destroy_unit_incore(mnum, &mirror_md_ops);
   2050 
   2051 	shortcnt = un->un_rrd_num * sizeof (short);
   2052 	bitcnt = howmany(un->un_rrd_num, NBBY);
   2053 
   2054 	if (un->un_outstanding_writes)
   2055 		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
   2056 	if (un->un_goingclean_bm)
   2057 		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
   2058 	if (un->un_goingdirty_bm)
   2059 		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
   2060 	if (un->un_resync_bm)
   2061 		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
   2062 	if (un->un_pernode_dirty_sum)
   2063 		kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
   2064 
   2065 	/*
   2066 	 * Destroy the taskq for deferred processing of DRL clean requests.
   2067 	 * This taskq will only be present for Multi Owner mirrors.
   2068 	 */
   2069 	if (un->un_drl_task != NULL)
   2070 		ddi_taskq_destroy(un->un_drl_task);
   2071 
   2072 	md_nblocks_set(mnum, -1ULL);
   2073 	MD_UNIT(mnum) = NULL;
   2074 
   2075 	/*
   2076 	 * Attempt release of its minor node
   2077 	 */
   2078 	md_remove_minor_node(mnum);
   2079 
   2080 	if (!removing)
   2081 		return;
   2082 
   2083 	for (smi = 0; smi < NMIRROR; smi++) {
   2084 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
   2085 			continue;
   2086 		/* reallow soft partitioning of submirror and reset parent */
   2087 		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
   2088 		MD_CAPAB(su) |= MD_CAN_SP;
   2089 		md_reset_parent(un->un_sm[smi].sm_dev);
   2090 		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
   2091 
   2092 		sv[nsv].setno = MD_MIN2SET(mnum);
   2093 		sv[nsv++].key = un->un_sm[smi].sm_key;
   2094 		bits |= SMI2BIT(smi);
   2095 	}
   2096 
   2097 	MD_STATUS(un) |= MD_UN_BEING_RESET;
   2098 	recid = un->un_rr_dirty_recid;
   2099 	vtoc_id = un->c.un_vtoc_id;
   2100 	selfid = MD_SID(un);
   2101 
   2102 	mirror_commit(un, bits, 0);
   2103 
   2104 	avl_destroy(&un->un_overlap_root);
   2105 
   2106 	/* Destroy all mutexes and condvars before returning. */
   2107 	mutex_destroy(&un->un_suspend_wr_mx);
   2108 	cv_destroy(&un->un_suspend_wr_cv);
   2109 	mutex_destroy(&un->un_overlap_tree_mx);
   2110 	cv_destroy(&un->un_overlap_tree_cv);
   2111 	mutex_destroy(&un->un_owner_mx);
   2112 	mutex_destroy(&un->un_rs_thread_mx);
   2113 	cv_destroy(&un->un_rs_thread_cv);
   2114 	mutex_destroy(&un->un_rs_progress_mx);
   2115 	cv_destroy(&un->un_rs_progress_cv);
   2116 	mutex_destroy(&un->un_dmr_mx);
   2117 	cv_destroy(&un->un_dmr_cv);
   2118 
   2119 	for (i = 0; i < MD_MNMAXSIDES; i++) {
   2120 		rw_destroy(&un->un_pernode_dirty_mx[i]);
   2121 		if (un->un_pernode_dirty_bm[i])
   2122 			kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
   2123 	}
   2124 
   2125 	/*
   2126 	 * Remove self from the namespace
   2127 	 */
   2128 	if (un->c.un_revision & MD_FN_META_DEV) {
   2129 		(void) md_rem_selfname(un->c.un_self_id);
   2130 	}
   2131 
   2132 	/* This frees the unit structure. */
   2133 	mddb_deleterec_wrapper(un->c.un_record_id);
   2134 
   2135 	if (recid != 0)
   2136 		mddb_deleterec_wrapper(recid);
   2137 
   2138 	/* Remove the vtoc, if present */
   2139 	if (vtoc_id)
   2140 		mddb_deleterec_wrapper(vtoc_id);
   2141 
   2142 	md_rem_names(sv, nsv);
   2143 
   2144 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
   2145 	    MD_MIN2SET(selfid), selfid);
   2146 }
   2147 
   2148 int
   2149 mirror_internal_open(
   2150 	minor_t		mnum,
   2151 	int		flag,
   2152 	int		otyp,
   2153 	int		md_oflags,
   2154 	IOLOCK		*lockp		/* can be NULL */
   2155 )
   2156 {
   2157 	mdi_unit_t	*ui = MDI_UNIT(mnum);
   2158 	int		err = 0;
   2159 
   2160 tryagain:
   2161 	/* single thread */
   2162 	if (lockp) {
   2163 		/*
   2164 		 * If ioctl lock is held, use openclose_enter
   2165 		 * routine that will set the ioctl flag when
   2166 		 * grabbing the readerlock.
   2167 		 */
   2168 		(void) md_ioctl_openclose_enter(lockp, ui);
   2169 	} else {
   2170 		(void) md_unit_openclose_enter(ui);
   2171 	}
   2172 
   2173 	/*
   2174 	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
   2175 	 * message in a MN diskset and this requires that the openclose
   2176 	 * lock is dropped in order to send this message.  So, another
   2177 	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
   2178 	 * attempting an open while this thread has an open in progress.
   2179 	 * Call the *_lh version of the lock exit routines since the ui_mx
   2180 	 * mutex must be held from checking for OPENINPROGRESS until
   2181 	 * after the cv_wait call.
   2182 	 */
   2183 	mutex_enter(&ui->ui_mx);
   2184 	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
   2185 		if (lockp) {
   2186 			(void) md_ioctl_openclose_exit_lh(lockp);
   2187 		} else {
   2188 			md_unit_openclose_exit_lh(ui);
   2189 		}
   2190 		cv_wait(&ui->ui_cv, &ui->ui_mx);
   2191 		mutex_exit(&ui->ui_mx);
   2192 		goto tryagain;
   2193 	}
   2194 
   2195 	ui->ui_lock |= MD_UL_OPENINPROGRESS;
   2196 	mutex_exit(&ui->ui_mx);
   2197 
   2198 	/* open devices, if necessary */
   2199 	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
   2200 		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
   2201 			goto out;
   2202 	}
   2203 
   2204 	/* count open */
   2205 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
   2206 		goto out;
   2207 
   2208 	/* unlock, return success */
   2209 out:
   2210 	mutex_enter(&ui->ui_mx);
   2211 	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
   2212 	mutex_exit(&ui->ui_mx);
   2213 
   2214 	if (lockp) {
   2215 		/*
   2216 		 * If ioctl lock is held, use openclose_exit
   2217 		 * routine that will clear the lockp reader flag.
   2218 		 */
   2219 		(void) md_ioctl_openclose_exit(lockp);
   2220 	} else {
   2221 		md_unit_openclose_exit(ui);
   2222 	}
   2223 	return (err);
   2224 }
   2225 
   2226 int
   2227 mirror_internal_close(
   2228 	minor_t		mnum,
   2229 	int		otyp,
   2230 	int		md_cflags,
   2231 	IOLOCK		*lockp		/* can be NULL */
   2232 )
   2233 {
   2234 	mdi_unit_t	*ui = MDI_UNIT(mnum);
   2235 	mm_unit_t	*un;
   2236 	int		err = 0;
   2237 
   2238 	/* single thread */
   2239 	if (lockp) {
   2240 		/*
   2241 		 * If ioctl lock is held, use openclose_enter
   2242 		 * routine that will set the ioctl flag when
   2243 		 * grabbing the readerlock.
   2244 		 */
   2245 		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
   2246 	} else {
   2247 		un = (mm_unit_t *)md_unit_openclose_enter(ui);
   2248 	}
   2249 
   2250 	/* count closed */
   2251 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
   2252 		goto out;
   2253 
   2254 	/* close devices, if necessary */
   2255 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
   2256 		/*
   2257 		 * Clean up dirty bitmap for this unit. Do this
   2258 		 * before closing the underlying devices to avoid
   2259 		 * race conditions with reset_mirror() as a
   2260 		 * result of a 'metaset -r' command running in
   2261 		 * parallel. This might cause deallocation of
   2262 		 * dirty region bitmaps; with underlying metadevices
   2263 		 * in place this can't happen.
   2264 		 * Don't do this if a MN set and ABR not set
   2265 		 */
   2266 		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
   2267 			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
   2268 			    !(ui->ui_tstate & MD_ABR_CAP))
   2269 				mirror_process_unit_resync(un);
   2270 		}
   2271 		(void) mirror_close_all_devs(un, md_cflags);
   2272 
   2273 		/*
   2274 		 * For a MN set with transient capabilities (eg ABR/DMR) set,
   2275 		 * clear these capabilities on the last open in the cluster.
   2276 		 * To do this we send a message to all nodes to see of the
   2277 		 * device is open.
   2278 		 */
   2279 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
   2280 		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
   2281 			if (lockp) {
   2282 				(void) md_ioctl_openclose_exit(lockp);
   2283 			} else {
   2284 				md_unit_openclose_exit(ui);
   2285 			}
   2286 
   2287 			/*
   2288 			 * if we are in the context of an ioctl, drop the
   2289 			 * ioctl lock.
   2290 			 * Otherwise, no other locks should be held.
   2291 			 */
   2292 			if (lockp) {
   2293 				IOLOCK_RETURN_RELEASE(0, lockp);
   2294 			}
   2295 
   2296 			mdmn_clear_all_capabilities(mnum);
   2297 
   2298 			/* if dropped the lock previously, regain it */
   2299 			if (lockp) {
   2300 				IOLOCK_RETURN_REACQUIRE(lockp);
   2301 			}
   2302 			return (0);
   2303 		}
   2304 		/* unlock and return success */
   2305 	}
   2306 out:
   2307 	/* Call whether lockp is NULL or not. */
   2308 	if (lockp) {
   2309 		md_ioctl_openclose_exit(lockp);
   2310 	} else {
   2311 		md_unit_openclose_exit(ui);
   2312 	}
   2313 	return (err);
   2314 }
   2315 
   2316 /*
   2317  * When a component has completed resyncing and is now ok, check if the
   2318  * corresponding component in the other submirrors is in the Last Erred
   2319  * state.  If it is, we want to change that to the Erred state so we stop
   2320  * using that component and start using this good component instead.
   2321  *
   2322  * This is called from set_sm_comp_state and recursively calls
   2323  * set_sm_comp_state if it needs to change the Last Erred state.
   2324  */
   2325 static void
   2326 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
   2327 	IOLOCK *lockp)
   2328 {
   2329 	mm_submirror_t		*sm;
   2330 	mm_submirror_ic_t	*smic;
   2331 	int			ci;
   2332 	int			i;
   2333 	int			compcnt;
   2334 	int			changed = 0;
   2335 
   2336 	for (i = 0; i < NMIRROR; i++) {
   2337 		sm = &un->un_sm[i];
   2338 		smic = &un->un_smic[i];
   2339 
   2340 		if (!SMS_IS(sm, SMS_INUSE))
   2341 			continue;
   2342 
   2343 		/* ignore the submirror that we just made ok */
   2344 		if (i == smi)
   2345 			continue;
   2346 
   2347 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
   2348 		for (ci = 0; ci < compcnt; ci++) {
   2349 			md_m_shared_t	*shared;
   2350 
   2351 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
   2352 			    (sm->sm_dev, sm, ci);
   2353 
   2354 			if ((shared->ms_state & CS_LAST_ERRED) &&
   2355 			    !mirror_other_sources(un, i, ci, 1)) {
   2356 
   2357 				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
   2358 				    flags, lockp);
   2359 				changed = 1;
   2360 			}
   2361 		}
   2362 	}
   2363 
   2364 	/* maybe there is a hotspare for this newly erred component */
   2365 	if (changed) {
   2366 		set_t	setno;
   2367 
   2368 		setno = MD_UN2SET(un);
   2369 		if (MD_MNSET_SETNO(setno)) {
   2370 			send_poke_hotspares(setno);
   2371 		} else {
   2372 			(void) poke_hotspares();
   2373 		}
   2374 	}
   2375 }
   2376 
   2377 /*
   2378  * set_sm_comp_state
   2379  *
   2380  * Set the state of a submirror component to the specified new state.
   2381  * If the mirror is in a multi-node set, send messages to all nodes to
   2382  * block all writes to the mirror and then update the state and release the
   2383  * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
   2384  * MD_STATE_XMIT will be unset in 2 cases:
   2385  * 1. When the state is changed to CS_RESYNC as this state change
   2386  * will already have been updated on each node by the processing of the
   2387  * distributed metasync command, hence no need to xmit.
   2388  * 2. When the state is change to CS_OKAY after a resync has completed. Again
   2389  * the resync completion will already have been processed on each node by
   2390  * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
   2391  * resync, hence no need to xmit.
   2392  *
   2393  * In case we are called from the updates of a watermark,
   2394  * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to
   2395  * a metainit or similar. In this case the message that we sent to propagate
   2396  * the state change must not be a class1 message as that would deadlock with
   2397  * the metainit command that is still being processed.
   2398  * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2
   2399  * instead. This also makes the submessage generator to create a class2
   2400  * submessage rather than a class1 (which would also block)
   2401  *
   2402  * On entry, unit_writerlock is held
   2403  * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
   2404  * also held.
   2405  */
   2406 void
   2407 set_sm_comp_state(
   2408 	mm_unit_t	*un,
   2409 	int		smi,
   2410 	int		ci,
   2411 	int		newstate,
   2412 	mddb_recid_t	*extras,
   2413 	uint_t		flags,
   2414 	IOLOCK		*lockp
   2415 )
   2416 {
   2417 	mm_submirror_t		*sm;
   2418 	mm_submirror_ic_t	*smic;
   2419 	md_m_shared_t		*shared;
   2420 	int			origstate;
   2421 	void			(*get_dev)();
   2422 	ms_cd_info_t		cd;
   2423 	char			devname[MD_MAX_CTDLEN];
   2424 	int			err;
   2425 	set_t			setno = MD_UN2SET(un);
   2426 	md_mn_msg_stch_t	stchmsg;
   2427 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
   2428 	md_mn_kresult_t		*kresult;
   2429 	int			rval;
   2430 	uint_t			msgflags;
   2431 	md_mn_msgtype_t		msgtype;
   2432 	int			save_lock = 0;
   2433 	mdi_unit_t		*ui_sm;
   2434 	int			nretries = 0;
   2435 
   2436 	sm = &un->un_sm[smi];
   2437 	smic = &un->un_smic[smi];
   2438 
   2439 	/* If we have a real error status then turn off MD_INACCESSIBLE. */
   2440 	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
   2441 	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
   2442 	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
   2443 		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
   2444 	}
   2445 
   2446 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
   2447 	    (sm->sm_dev, sm, ci);
   2448 	origstate = shared->ms_state;
   2449 
   2450 	/*
   2451 	 * If the new state is an error and the old one wasn't, generate
   2452 	 * a console message. We do this before we send the state to other
   2453 	 * nodes in a MN set because the state change may change the component
   2454 	 * name  if a hotspare is allocated.
   2455 	 */
   2456 	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
   2457 	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {
   2458 
   2459 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
   2460 		    "get device", 0);
   2461 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
   2462 
   2463 		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
   2464 		    cd.cd_dev, devname, sizeof (devname));
   2465 
   2466 		if (err == ENOENT) {
   2467 			(void) md_devname(setno, cd.cd_dev, devname,
   2468 			    sizeof (devname));
   2469 		}
   2470 
   2471 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
   2472 		    md_shortname(md_getminor(sm->sm_dev)), devname);
   2473 
   2474 		if (newstate & CS_LAST_ERRED) {
   2475 			cmn_err(CE_WARN, "md: %s: %s last erred",
   2476 			    md_shortname(md_getminor(sm->sm_dev)),
   2477 			    devname);
   2478 
   2479 		} else if (shared->ms_flags & MDM_S_ISOPEN) {
   2480 			/*
   2481 			 * Close the broken device and clear the open flag on
   2482 			 * it.  Closing the device means the RCM framework will
   2483 			 * be able to unconfigure the device if required.
   2484 			 *
   2485 			 * We have to check that the device is open, otherwise
   2486 			 * the first open on it has resulted in the error that
   2487 			 * is being processed and the actual cd.cd_dev will be
   2488 			 * NODEV64.
   2489 			 *
   2490 			 * If this is a multi-node mirror, then the multinode
   2491 			 * state checks following this code will cause the
   2492 			 * slave nodes to close the mirror in the function
   2493 			 * mirror_set_state().
   2494 			 */
   2495 			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
   2496 			shared->ms_flags &= ~MDM_S_ISOPEN;
   2497 		}
   2498 
   2499 	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
   2500 	    (shared->ms_flags & MDM_S_ISOPEN)) {
   2501 		/*
   2502 		 * Similar to logic above except no log messages since we
   2503 		 * are just transitioning from Last Erred to Erred.
   2504 		 */
   2505 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
   2506 		    "get device", 0);
   2507 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
   2508 
   2509 		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
   2510 		shared->ms_flags &= ~MDM_S_ISOPEN;
   2511 	}
   2512 
   2513 	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
   2514 	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
   2515 		/*
   2516 		 * For a multi-node mirror, send the state change to the
   2517 		 * master, which broadcasts to all nodes, including this
   2518 		 * one. Once the message is received, the state is set
   2519 		 * in-core and the master commits the change to disk.
   2520 		 * There is a case, comp_replace,  where this function
   2521 		 * can be called from within an ioctl and therefore in this
   2522 		 * case, as the ioctl will already be called on each node,
   2523 		 * there is no need to xmit the state change to the master for
   2524 		 * distribution to the other nodes. MD_STATE_XMIT flag is used
   2525 		 * to indicate whether a xmit is required. The mirror's
   2526 		 * transient state is set to MD_ERR_PENDING to avoid sending
   2527 		 * multiple messages.
   2528 		 */
   2529 		if (newstate & (CS_ERRED|CS_LAST_ERRED))
   2530 			ui->ui_tstate |= MD_ERR_PENDING;
   2531 
   2532 		/*
   2533 		 * Send a state update message to all nodes. This message
   2534 		 * will generate 2 submessages, the first one to suspend
   2535 		 * all writes to the mirror and the second to update the
   2536 		 * state and resume writes.
   2537 		 */
   2538 		stchmsg.msg_stch_mnum = un->c.un_self_id;
   2539 		stchmsg.msg_stch_sm = smi;
   2540 		stchmsg.msg_stch_comp = ci;
   2541 		stchmsg.msg_stch_new_state = newstate;
   2542 		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
   2543 #ifdef DEBUG
   2544 		if (mirror_debug_flag)
   2545 			printf("send set state, %x, %x, %x, %x, %x\n",
   2546 			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
   2547 			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
   2548 			    stchmsg.msg_stch_hs_id);
   2549 #endif
   2550 		if (flags & MD_STATE_WMUPDATE) {
   2551 			msgtype  = MD_MN_MSG_STATE_UPDATE2;
   2552 			/*
   2553 			 * When coming from an update of watermarks, there
   2554 			 * must already be a message logged that triggered
   2555 			 * this action. So, no need to log this message, too.
   2556 			 */
   2557 			msgflags = MD_MSGF_NO_LOG;
   2558 		} else {
   2559 			msgtype  = MD_MN_MSG_STATE_UPDATE;
   2560 			msgflags = MD_MSGF_DEFAULT_FLAGS;
   2561 		}
   2562 
   2563 		/*
   2564 		 * If we are in the context of an ioctl, drop the ioctl lock.
   2565 		 * lockp holds the list of locks held.
   2566 		 *
   2567 		 * Otherwise, increment the appropriate reacquire counters.
   2568 		 * If openclose lock is *held, then must reacquire reader
   2569 		 * lock before releasing the openclose lock.
   2570 		 * Do not drop the ARRAY_WRITER lock as we may not be able
   2571 		 * to reacquire it.
   2572 		 */
   2573 		if (lockp) {
   2574 			if (lockp->l_flags & MD_ARRAY_WRITER) {
   2575 				save_lock = MD_ARRAY_WRITER;
   2576 				lockp->l_flags &= ~MD_ARRAY_WRITER;
   2577 			} else if (lockp->l_flags & MD_ARRAY_READER) {
   2578 				save_lock = MD_ARRAY_READER;
   2579 				lockp->l_flags &= ~MD_ARRAY_READER;
   2580 			}
   2581 			IOLOCK_RETURN_RELEASE(0, lockp);
   2582 		} else {
   2583 			if (flags & MD_STATE_OCHELD) {
   2584 				md_unit_writerexit(ui);
   2585 				(void) md_unit_readerlock(ui);
   2586 				md_unit_openclose_exit(ui);
   2587 			} else {
   2588 				md_unit_writerexit(ui);
   2589 			}
   2590 		}
   2591 
   2592 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
   2593 sscs_msg:
   2594 		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
   2595 		    (char *)&stchmsg, sizeof (stchmsg), kresult);
   2596 
   2597 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
   2598 			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
   2599 			/* If we're shutting down already, pause things here. */
   2600 			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
   2601 				while (!md_mn_is_commd_present()) {
   2602 					delay(md_hz);
   2603 				}
   2604 				/*
   2605 				 * commd is now available; retry the message
   2606 				 * one time. If that fails we fall through and
   2607 				 * panic as the system is in an unexpected state
   2608 				 */
   2609 				if (nretries++ == 0)
   2610 					goto sscs_msg;
   2611 			}
   2612 			cmn_err(CE_PANIC,
   2613 			    "ksend_message failure: STATE_UPDATE");
   2614 		}
   2615 		kmem_free(kresult, sizeof (md_mn_kresult_t));
   2616 
   2617 		/* if dropped the lock previously, regain it */
   2618 		if (lockp) {
   2619 			IOLOCK_RETURN_REACQUIRE(lockp);
   2620 			lockp->l_flags |= save_lock;
   2621 		} else {
   2622 			/*
   2623 			 * Reacquire dropped locks and update acquirecnts
   2624 			 * appropriately.
   2625 			 */
   2626 			if (flags & MD_STATE_OCHELD) {
   2627 				/*
   2628 				 * openclose also grabs readerlock.
   2629 				 */
   2630 				(void) md_unit_openclose_enter(ui);
   2631 				md_unit_readerexit(ui);
   2632 				(void) md_unit_writerlock(ui);
   2633 			} else {
   2634 				(void) md_unit_writerlock(ui);
   2635 			}
   2636 		}
   2637 
   2638 		ui->ui_tstate &= ~MD_ERR_PENDING;
   2639 	} else {
   2640 		shared->ms_state = newstate;
   2641 		uniqtime32(&shared->ms_timestamp);
   2642 
   2643 		if (newstate == CS_ERRED)
   2644 			shared->ms_flags |= MDM_S_NOWRITE;
   2645 		else
   2646 			shared->ms_flags &= ~MDM_S_NOWRITE;
   2647 
   2648 		shared->ms_flags &= ~MDM_S_IOERR;
   2649 		un->un_changecnt++;
   2650 		shared->ms_lasterrcnt = un->un_changecnt;
   2651 
   2652 		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
   2653 		mirror_commit(un, SMI2BIT(smi), extras);
   2654 	}
   2655 
   2656 	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
   2657 		/*
   2658 		 * Resetting the Last Erred state will recursively call back
   2659 		 * into this function (set_sm_comp_state) to update the state.
   2660 		 */
   2661 		reset_lasterred(un, smi, extras, flags, lockp);
   2662 	}
   2663 }
   2664 
   2665 static int
   2666 find_another_logical(
   2667 	mm_unit_t		*un,
   2668 	mm_submirror_t		*esm,
   2669 	diskaddr_t		blk,
   2670 	u_longlong_t		cnt,
   2671 	int			must_be_open,
   2672 	int			state,
   2673 	int			err_cnt)
   2674 {
   2675 	u_longlong_t	cando;
   2676 	md_dev64_t	dev;
   2677 	md_m_shared_t	*s;
   2678 
   2679 	esm->sm_state |= SMS_IGNORE;
   2680 	while (cnt != 0) {
   2681 		u_longlong_t	 mcnt;
   2682 
   2683 		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */
   2684 
   2685 		dev = select_read_unit(un, blk, mcnt, &cando,
   2686 		    must_be_open, &s, NULL);
   2687 		if (dev == (md_dev64_t)0)
   2688 			break;
   2689 
   2690 		if ((state == CS_LAST_ERRED) &&
   2691 		    (s->ms_state == CS_LAST_ERRED) &&
   2692 		    (err_cnt > s->ms_lasterrcnt))
   2693 			break;
   2694 
   2695 		cnt -= cando;
   2696 		blk += cando;
   2697 	}
   2698 	esm->sm_state &= ~SMS_IGNORE;
   2699 	return (cnt != 0);
   2700 }
   2701 
   2702 int
   2703 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
   2704 {
   2705 	mm_submirror_t		*sm;
   2706 	mm_submirror_ic_t	*smic;
   2707 	size_t			count;
   2708 	diskaddr_t		block;
   2709 	u_longlong_t		skip;
   2710 	u_longlong_t		size;
   2711 	md_dev64_t		dev;
   2712 	int			cnt;
   2713 	md_m_shared_t		*s;
   2714 	int			not_found;
   2715 
   2716 	sm = &un->un_sm[smi];
   2717 	smic = &un->un_smic[smi];
   2718 	dev = sm->sm_dev;
   2719 
   2720 	/*
   2721 	 * Make sure every component of the submirror
   2722 	 * has other sources.
   2723 	 */
   2724 	if (ci < 0) {
   2725 		/* Find the highest lasterrcnt */
   2726 		cnt = (*(smic->sm_get_component_count))(dev, sm);
   2727 		for (ci = 0; ci < cnt; ci++) {
   2728 			not_found = mirror_other_sources(un, smi, ci,
   2729 			    must_be_open);
   2730 			if (not_found)
   2731 				return (1);
   2732 		}
   2733 		return (0);
   2734 	}
   2735 
   2736 	/*
   2737 	 * Make sure this component has other sources
   2738 	 */
   2739 	(void) (*(smic->sm_get_bcss))
   2740 	    (dev, sm, ci, &block, &count, &skip, &size);
   2741 
   2742 	if (count == 0)
   2743 		return (1);
   2744 
   2745 	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
   2746 
   2747 	while (count--) {
   2748 		if (block >= un->c.un_total_blocks)
   2749 			return (0);
   2750 
   2751 		if ((block + size) > un->c.un_total_blocks)
   2752 			size = un->c.un_total_blocks - block;
   2753 
   2754 		not_found = find_another_logical(un, sm, block, size,
   2755 		    must_be_open, s->ms_state, s->ms_lasterrcnt);
   2756 		if (not_found)
   2757 			return (1);
   2758 
   2759 		block += size + skip;
   2760 	}
   2761 	return (0);
   2762 }
   2763 
   2764 static void
   2765 finish_error(md_mps_t *ps)
   2766 {
   2767 	struct buf	*pb;
   2768 	mm_unit_t	*un;
   2769 	mdi_unit_t	*ui;
   2770 	uint_t		new_str_flags;
   2771 
   2772 	pb = ps->ps_bp;
   2773 	un = ps->ps_un;
   2774 	ui = ps->ps_ui;
   2775 
   2776 	/*
   2777 	 * Must flag any error to the resync originator if we're performing
   2778 	 * a Write-after-Read. This corresponds to an i/o error on a resync
   2779 	 * target device and in this case we ought to abort the resync as there
   2780 	 * is nothing that can be done to recover from this without operator
   2781 	 * intervention. If we don't set the B_ERROR flag we will continue
   2782 	 * reading from the mirror but won't write to the target (as it will
   2783 	 * have been placed into an errored state).
   2784 	 * To handle the case of multiple components within a submirror we only
   2785 	 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
   2786 	 * The originator of the resync read will cause this bit to be set if
   2787 	 * the underlying component count is one for a submirror resync. All
   2788 	 * other resync types will have the flag set as there is no underlying
   2789 	 * resync which can be performed on a contained metadevice for these
   2790 	 * resync types (optimized or component).
   2791 	 */
   2792 
   2793 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
   2794 		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
   2795 			pb->b_flags |= B_ERROR;
   2796 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
   2797 		MPS_FREE(mirror_parent_cache, ps);
   2798 		md_unit_readerexit(ui);
   2799 		md_biodone(pb);
   2800 		return;
   2801 	}
   2802 	/*
   2803 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
   2804 	 * operation therefore this I/O request has already been counted,
   2805 	 * the I/O count variable will be decremented by mirror_done()'s
   2806 	 * call to md_biodone().
   2807 	 */
   2808 	if (ps->ps_changecnt != un->un_changecnt) {
   2809 		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
   2810 		if (ps->ps_flags & MD_MPS_WOW)
   2811 			new_str_flags |= MD_STR_WOW;
   2812 		if (ps->ps_flags & MD_MPS_MAPPED)
   2813 			new_str_flags |= MD_STR_MAPPED;
   2814 		/*
   2815 		 * If this I/O request was a read that was part of a resync,
   2816 		 * set MD_STR_WAR for the retried read to ensure that the
   2817 		 * resync write (i.e. write-after-read) will be performed
   2818 		 */
   2819 		if (ps->ps_flags & MD_MPS_RESYNC_READ)
   2820 			new_str_flags |= MD_STR_WAR;
   2821 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
   2822 		MPS_FREE(mirror_parent_cache, ps);
   2823 		md_unit_readerexit(ui);
   2824 		(void) md_mirror_strategy(pb, new_str_flags, NULL);
   2825 		return;
   2826 	}
   2827 
   2828 	pb->b_flags |= B_ERROR;
   2829 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
   2830 	MPS_FREE(mirror_parent_cache, ps);
   2831 	md_unit_readerexit(ui);
   2832 	md_biodone(pb);
   2833 }
   2834 
   2835 static void
   2836 error_update_unit(md_mps_t *ps)
   2837 {
   2838 	mm_unit_t		*un;
   2839 	mdi_unit_t		*ui;
   2840 	int			smi;	/* sub mirror index */
   2841 	int			ci;	/* errored component */
   2842 	set_t			setno;
   2843 	uint_t			flags;	/* for set_sm_comp_state() */
   2844 	uint_t			hspflags; /* for check_comp_4_hotspares() */
   2845 
   2846 	ui = ps->ps_ui;
   2847 	un = (mm_unit_t *)md_unit_writerlock(ui);
   2848 	setno = MD_UN2SET(un);
   2849 
   2850 	/* All of these updates have to propagated in case of MN set */
   2851 	flags = MD_STATE_XMIT;
   2852 	hspflags = MD_HOTSPARE_XMIT;
   2853 
   2854 	/* special treatment if we are called during updating watermarks */
   2855 	if (ps->ps_flags & MD_MPS_WMUPDATE) {
   2856 		flags |= MD_STATE_WMUPDATE;
   2857 		hspflags |= MD_HOTSPARE_WMUPDATE;
   2858 	}
   2859 	smi = 0;
   2860 	ci = 0;
   2861 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
   2862 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
   2863 
   2864 			/* Never called from ioctl context, so (IOLOCK *)NULL */
   2865 			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
   2866 			    (IOLOCK *)NULL);
   2867 			/*
   2868 			 * For a MN set, the NOTIFY is done when the state
   2869 			 * change is processed on each node
   2870 			 */
   2871 			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
   2872 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
   2873 				    SVM_TAG_METADEVICE, setno, MD_SID(un));
   2874 			}
   2875 			continue;
   2876 		}
   2877 		/* Never called from ioctl context, so (IOLOCK *)NULL */
   2878 		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
   2879 		    (IOLOCK *)NULL);
   2880 		/*
   2881 		 * For a MN set, the NOTIFY is done when the state
   2882 		 * change is processed on each node
   2883 		 */
   2884 		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
   2885 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
   2886 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
   2887 		}
   2888 		smi = 0;
   2889 		ci = 0;
   2890 	}
   2891 
   2892 	md_unit_writerexit(ui);
   2893 	if (MD_MNSET_SETNO(setno)) {
   2894 		send_poke_hotspares(setno);
   2895 	} else {
   2896 		(void) poke_hotspares();
   2897 	}
   2898 	(void) md_unit_readerlock(ui);
   2899 
   2900 	finish_error(ps);
   2901 }
   2902 
   2903 /*
   2904  * When we have a B_FAILFAST IO error on a Last Erred component we need to
   2905  * retry the IO without B_FAILFAST set so that we try to ensure that the
   2906  * component "sees" each IO.
   2907  */
   2908 static void
   2909 last_err_retry(md_mcs_t *cs)
   2910 {
   2911 	struct buf	*cb;
   2912 	md_mps_t	*ps;
   2913 	uint_t		flags;
   2914 
   2915 	cb = &cs->cs_buf;
   2916 	cb->b_flags &= ~B_FAILFAST;
   2917 
   2918 	/* if we're panicing just let this I/O error out */
   2919 	if (panicstr) {
   2920 		(void) mirror_done(cb);
   2921 		return;
   2922 	}
   2923 
   2924 	/* reissue the I/O */
   2925 
   2926 	ps = cs->cs_ps;
   2927 
   2928 	bioerror(cb, 0);
   2929 
   2930 	mutex_enter(&ps->ps_mx);
   2931 
   2932 	flags = MD_STR_NOTTOP;
   2933 	if (ps->ps_flags & MD_MPS_MAPPED)
   2934 		flags |= MD_STR_MAPPED;
   2935 	if (ps->ps_flags & MD_MPS_NOBLOCK)
   2936 		flags |= MD_NOBLOCK;
   2937 
   2938 	mutex_exit(&ps->ps_mx);
   2939 
   2940 	clear_retry_error(cb);
   2941 
   2942 	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
   2943 	    md_shortname(getminor(cb->b_edev)));
   2944 
   2945 	md_call_strategy(cb, flags, NULL);
   2946 }
   2947 
   2948 static void
   2949 mirror_error(md_mps_t *ps)
   2950 {
   2951 	int		smi;	/* sub mirror index */
   2952 	int		ci;	/* errored component */
   2953 
   2954 	if (panicstr) {
   2955 		finish_error(ps);
   2956 		return;
   2957 	}
   2958 
   2959 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
   2960 		mirror_overlap_tree_remove(ps);
   2961 
   2962 	smi = 0;
   2963 	ci = 0;
   2964 	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
   2965 		md_unit_readerexit(ps->ps_ui);
   2966 		daemon_request(&md_mstr_daemon, error_update_unit,
   2967 		    (daemon_queue_t *)ps, REQ_OLD);
   2968 		return;
   2969 	}
   2970 
   2971 	finish_error(ps);
   2972 }
   2973 
   2974 static int
   2975 copy_write_done(struct buf *cb)
   2976 {
   2977 	md_mps_t	*ps;
   2978 	buf_t		*pb;
   2979 	char		*wowbuf;
   2980 	wowhdr_t	*wowhdr;
   2981 	ssize_t		wow_resid;
   2982 
   2983 	/* get wowbuf ans save structure */
   2984 	wowbuf = cb->b_un.b_addr;
   2985 	wowhdr = WOWBUF_HDR(wowbuf);
   2986 	ps = wowhdr->wow_ps;
   2987 	pb = ps->ps_bp;
   2988 
   2989 	/* Save error information, then free cb */
   2990 	if (cb->b_flags & B_ERROR)
   2991 		pb->b_flags |= B_ERROR;
   2992 
   2993 	if (cb->b_flags & B_REMAPPED)
   2994 		bp_mapout(cb);
   2995 
   2996 	freerbuf(cb);
   2997 
   2998 	/* update residual and continue if needed */
   2999 	if ((pb->b_flags & B_ERROR) == 0) {
   3000 		wow_resid = pb->b_bcount - wowhdr->wow_offset;
   3001 		pb->b_resid = wow_resid;
   3002 		if (wow_resid > 0)  {
   3003 			daemon_request(&md_mstr_daemon, copy_write_cont,
   3004 			    (daemon_queue_t *)wowhdr, REQ_OLD);
   3005 			return (1);
   3006 		}
   3007 	}
   3008 
   3009 	/* Write is complete, release resources. */
   3010 	kmem_cache_free(mirror_wowblk_cache, wowhdr);
   3011 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
   3012 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
   3013 	MPS_FREE(mirror_parent_cache, ps);
   3014 	md_biodone(pb);
   3015 	return (0);
   3016 }
   3017 
   3018 static void
   3019 copy_write_cont(wowhdr_t *wowhdr)
   3020 {
   3021 	buf_t		*pb;
   3022 	buf_t		*cb;
   3023 	char		*wowbuf;
   3024 	int		wow_offset;
   3025 	size_t		wow_resid;
   3026 	diskaddr_t	wow_blkno;
   3027 
   3028 	wowbuf = WOWHDR_BUF(wowhdr);
   3029 	pb = wowhdr->wow_ps->ps_bp;
   3030 
   3031 	/* get data on current location */
   3032 	wow_offset = wowhdr->wow_offset;
   3033 	wow_resid = pb->b_bcount - wow_offset;
   3034 	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
   3035 
   3036 	/* setup child buffer */
   3037 	cb = getrbuf(KM_SLEEP);
   3038 	cb->b_flags = B_WRITE;
   3039 	cb->b_edev = pb->b_edev;
   3040 	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
   3041 	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
   3042 	cb->b_iodone = copy_write_done;
   3043 	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
   3044 	cb->b_lblkno = wow_blkno;
   3045 
   3046 	/* move offset to next section */
   3047 	wowhdr->wow_offset += cb->b_bcount;
   3048 
   3049 	/* copy and setup write for current section */
   3050 	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
   3051 
   3052 	/* do it */
   3053 	/*
   3054 	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
   3055 	 * that handles the WOW condition. The resultant increment on the
   3056 	 * I/O count variable is cleared by copy_write_done()'s call to
   3057 	 * md_biodone().
   3058 	 */
   3059 	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
   3060 	    | MD_STR_MAPPED, NULL);
   3061 }
   3062 
   3063 static void
   3064 md_mirror_copy_write(md_mps_t *ps)
   3065 {
   3066 	wowhdr_t	*wowhdr;
   3067 
   3068 	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
   3069 	mirror_wowblk_init(wowhdr);
   3070 	wowhdr->wow_ps = ps;
   3071 	wowhdr->wow_offset = 0;
   3072 	copy_write_cont(wowhdr);
   3073 }
   3074 
   3075 static void
   3076 handle_wow(md_mps_t *ps)
   3077 {
   3078 	buf_t		*pb;
   3079 
   3080 	pb = ps->ps_bp;
   3081 
   3082 	bp_mapin(pb);
   3083 
   3084 	md_mirror_wow_cnt++;
   3085 	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
   3086 		cmn_err(CE_NOTE,
   3087 		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
   3088 		    md_shortname(getminor(pb->b_edev)),
   3089 		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
   3090 	}
   3091 
   3092 	/*
   3093 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
   3094 	 * operation therefore this I/O request has already been counted,
   3095 	 * the I/O count variable will be decremented by mirror_done()'s
   3096 	 * call to md_biodone().
   3097 	 */
   3098 	if (md_mirror_wow_flg & WOW_NOCOPY)
   3099 		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
   3100 		    MD_STR_MAPPED | MD_IO_COUNTED, ps);
   3101 	else
   3102 		md_mirror_copy_write(ps);
   3103 }
   3104 
   3105 /*
   3106  * Return true if the specified submirror is either in the Last Erred
   3107  * state or is transitioning into the Last Erred state.
   3108  */
   3109 static bool_t
   3110 submirror_is_lasterred(mm_unit_t *un, int smi)
   3111 {
   3112 	mm_submirror_t		*sm;
   3113 	mm_submirror_ic_t	*smic;
   3114 	md_m_shared_t		*shared;
   3115 	int			ci;
   3116 	int			compcnt;
   3117 
   3118 	sm = &un->un_sm[smi];
   3119 	smic = &un->un_smic[smi];
   3120 
   3121 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
   3122 	for (ci = 0; ci < compcnt; ci++) {
   3123 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
   3124 		    (sm->sm_dev, sm, ci);
   3125 
   3126 		if (shared->ms_state == CS_LAST_ERRED)
   3127 			return (B_TRUE);
   3128 
   3129 		/*
   3130 		 * It is not currently Last Erred, check if entering Last Erred.
   3131 		 */
   3132 		if ((shared->ms_flags & MDM_S_IOERR) &&
   3133 		    ((shared->ms_state == CS_OKAY) ||
   3134 		    (shared->ms_state == CS_RESYNC))) {
   3135 			if (mirror_other_sources(un, smi, ci, 0) == 1)
   3136 				return (B_TRUE);
   3137 		}
   3138 	}
   3139 
   3140 	return (B_FALSE);
   3141 }
   3142 
   3143 
   3144 static int
   3145 mirror_done(struct buf *cb)
   3146 {
   3147 	md_mps_t	*ps;
   3148 	md_mcs_t	*cs;
   3149 
   3150 	/*LINTED*/
   3151 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
   3152 	ps = cs->cs_ps;
   3153 
   3154 	mutex_enter(&ps->ps_mx);
   3155 
   3156 	/* check if we need to retry an errored failfast I/O */
   3157 	if (cb->b_flags & B_ERROR) {
   3158 		struct buf *pb = ps->ps_bp;
   3159 
   3160 		if (cb->b_flags & B_FAILFAST) {
   3161 			int		i;
   3162 			mm_unit_t	*un = ps->ps_un;
   3163 
   3164 			for (i = 0; i < NMIRROR; i++) {
   3165 				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
   3166 					continue;
   3167 
   3168 				if (cb->b_edev ==
   3169 				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {
   3170 
   3171 					/*
   3172 					 * This is the submirror that had the
   3173 					 * error.  Check if it is Last Erred.
   3174 					 */
   3175 					if (submirror_is_lasterred(un, i)) {
   3176 						daemon_queue_t *dqp;
   3177 
   3178 						mutex_exit(&ps->ps_mx);
   3179 						dqp = (daemon_queue_t *)cs;
   3180 						dqp->dq_prev = NULL;
   3181 						dqp->dq_next = NULL;
   3182 						daemon_request(&md_done_daemon,
   3183 						    last_err_retry, dqp,
   3184 						    REQ_OLD);
   3185 						return (1);
   3186 					}
   3187 					break;
   3188 				}
   3189 			}
   3190 		}
   3191 
   3192 		/* continue to process the buf without doing a retry */
   3193 		ps->ps_flags |= MD_MPS_ERROR;
   3194 		pb->b_error = cb->b_error;
   3195 	}
   3196 
   3197 	return (mirror_done_common(cb));
   3198 }
   3199 
   3200 /*
   3201  * Split from the original mirror_done function so we can handle bufs after a
   3202  * retry.
   3203  * ps->ps_mx is already held in the caller of this function and the cb error
   3204  * has already been checked and handled in the caller.
   3205  */
   3206 static int
   3207 mirror_done_common(struct buf *cb)
   3208 {
   3209 	struct buf	*pb;
   3210 	mm_unit_t	*un;
   3211 	mdi_unit_t	*ui;
   3212 	md_mps_t	*ps;
   3213 	md_mcs_t	*cs;
   3214 	size_t		end_rr, start_rr, current_rr;
   3215 
   3216 	/*LINTED*/
   3217 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
   3218 	ps = cs->cs_ps;
   3219 	pb = ps->ps_bp;
   3220 
   3221 	if (cb->b_flags & B_REMAPPED)
   3222 		bp_mapout(cb);
   3223 
   3224 	ps->ps_frags--;
   3225 	if (ps->ps_frags != 0) {
   3226 		mutex_exit(&ps->ps_mx);
   3227 		kmem_cache_free(mirror_child_cache, cs);
   3228 		return (1);
   3229 	}
   3230 	un = ps->ps_un;
   3231 	ui = ps->ps_ui;
   3232 
   3233 	/*
   3234 	 * Do not update outstanding_writes if we're running with ABR
   3235 	 * set for this mirror or the write() was issued with MD_STR_ABR set.
   3236 	 * Also a resync initiated write() has no outstanding_writes update
   3237 	 * either.
   3238 	 */
   3239 	if (((cb->b_flags & B_READ) == 0) &&
   3240 	    (un->un_nsm >= 2) &&
   3241 	    (ps->ps_call == NULL) &&
   3242 	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
   3243 	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
   3244 		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
   3245 		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
   3246 		mutex_enter(&un->un_resync_mx);
   3247 		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
   3248 			un->un_outstanding_writes[current_rr]--;
   3249 		mutex_exit(&un->un_resync_mx);
   3250 	}
   3251 	kmem_cache_free(mirror_child_cache, cs);
   3252 	mutex_exit(&ps->ps_mx);
   3253 
   3254 	if (ps->ps_call != NULL) {
   3255 		daemon_request(&md_done_daemon, ps->ps_call,
   3256 		    (daemon_queue_t *)ps, REQ_OLD);
   3257 		return (1);
   3258 	}
   3259 
   3260 	if ((ps->ps_flags & MD_MPS_ERROR)) {
   3261 		daemon_request(&md_done_daemon, mirror_error,
   3262 		    (daemon_queue_t *)ps, REQ_OLD);
   3263 		return (1);
   3264 	}
   3265 
   3266 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
   3267 		mirror_overlap_tree_remove(ps);
   3268 
   3269 	/*
   3270 	 * Handle Write-on-Write problem.
   3271 	 * Skip In case of Raw and Direct I/O as they are
   3272 	 * handled earlier.
   3273 	 *
   3274 	 */
   3275 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
   3276 	    !(pb->b_flags & B_READ) &&
   3277 	    !(ps->ps_flags & MD_MPS_WOW) &&
   3278 	    !(pb->b_flags & B_PHYS) &&
   3279 	    any_pages_dirty(pb)) {
   3280 		md_unit_readerexit(ps->ps_ui);
   3281 		daemon_request(&md_mstr_daemon, handle_wow,
   3282 		    (daemon_queue_t *)ps, REQ_OLD);
   3283 		return (1);
   3284 	}
   3285 
   3286 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
   3287 	MPS_FREE(mirror_parent_cache, ps);
   3288 	md_unit_readerexit(ui);
   3289 	md_biodone(pb);
   3290 	return (0);
   3291 }
   3292 
   3293 /*
   3294  * Clear error state in submirror component if the retry worked after
   3295  * a failfast error.
   3296  */
   3297 static void
   3298 clear_retry_error(struct buf *cb)
   3299 {
   3300 	int			smi;
   3301 	md_mcs_t		*cs;
   3302 	mm_unit_t		*un;
   3303 	mdi_unit_t		*ui_sm;
   3304 	mm_submirror_t		*sm;
   3305 	mm_submirror_ic_t	*smic;
   3306 	u_longlong_t		cnt;
   3307 	md_m_shared_t		*shared;
   3308 
   3309 	/*LINTED*/
   3310 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
   3311 	un = cs->cs_ps->ps_un;
   3312 
   3313 	for (smi = 0; smi < NMIRROR; smi++) {
   3314 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
   3315 			continue;
   3316 
   3317 		if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
   3318 			break;
   3319 	}
   3320 
   3321 	if (smi >= NMIRROR)
   3322 		return;
   3323 
   3324 	sm = &un->un_sm[smi];
   3325 	smic = &un->un_smic[smi];
   3326 	cnt = cb->b_bcount;
   3327 
   3328 	ui_sm = MDI_UNIT(getminor(cb->b_edev));
   3329 	(void) md_unit_writerlock(ui_sm);
   3330 
   3331 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
   3332 	    cb->b_blkno, &cnt);
   3333 
   3334 	if (shared->ms_flags & MDM_S_IOERR) {
   3335 		shared->ms_flags &= ~MDM_S_IOERR;
   3336 
   3337 	} else {
   3338 		/* the buf spans components and the first one is not erred */
   3339 		int	cnt;
   3340 		int	i;
   3341 
   3342 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
   3343 		for (i = 0; i < cnt; i++) {
   3344 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
   3345 			    (sm->sm_dev, sm, i);
   3346 
   3347 			if (shared->ms_flags & MDM_S_IOERR &&
   3348 			    shared->ms_state == CS_OKAY) {
   3349 
   3350 				shared->ms_flags &= ~MDM_S_IOERR;
   3351 				break;
   3352 			}
   3353 		}
   3354 	}
   3355 
   3356 	md_unit_writerexit(ui_sm);
   3357 }
   3358 
   3359 static size_t
   3360 mirror_map_read(
   3361 	md_mps_t *ps,
   3362 	md_mcs_t *cs,
   3363 	diskaddr_t blkno,
   3364 	u_longlong_t	count
   3365 )
   3366 {
   3367 	mm_unit_t	*un;
   3368 	buf_t		*bp;
   3369 	u_longlong_t	cando;
   3370 
   3371 	bp = &cs->cs_buf;
   3372 	un = ps->ps_un;
   3373 
   3374 	bp->b_lblkno = blkno;
   3375 	if (fast_select_read_unit(ps, cs) == 0) {
   3376 		bp->b_bcount = ldbtob(count);
   3377 		return (0);
   3378 	}
   3379 	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
   3380 	    count, &cando, 0, NULL, cs));
   3381 	bp->b_bcount = ldbtob(cando);
   3382 	if (count != cando)
   3383 		return (cando);
   3384 	return (0);
   3385 }
   3386 
   3387 static void
   3388 write_after_read(md_mps_t *ps)
   3389 {
   3390 	struct buf	*pb;
   3391 	int		flags;
   3392 
   3393 	if (ps->ps_flags & MD_MPS_ERROR) {
   3394 		mirror_error(ps);
   3395 		return;
   3396 	}
   3397 
   3398 	pb = ps->ps_bp;
   3399 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
   3400 	ps->ps_call = NULL;
   3401 	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
   3402 	flags = MD_STR_NOTTOP | MD_STR_WAR;
   3403 	if (ps->ps_flags & MD_MPS_MAPPED)
   3404 		flags |= MD_STR_MAPPED;
   3405 	if (ps->ps_flags & MD_MPS_NOBLOCK)
   3406 		flags |= MD_NOBLOCK;
   3407 	if (ps->ps_flags & MD_MPS_DIRTY_RD)
   3408 		flags |= MD_STR_DIRTY_RD;
   3409 	(void) mirror_write_strategy(pb, flags, ps);
   3410 }
   3411 
   3412 static void
   3413 continue_serial(md_mps_t *ps)
   3414 {
   3415 	md_mcs_t	*cs;
   3416 	buf_t		*cb;
   3417 	mm_unit_t	*un;
   3418 	int		flags;
   3419 
   3420 	un = ps->ps_un;
   3421 	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
   3422 	mirror_child_init(cs);
   3423 	cb = &cs->cs_buf;
   3424 	ps->ps_call = NULL;
   3425 	ps->ps_frags = 1;
   3426 	(void) mirror_map_write(un, cs, ps, 0);
   3427 	flags = MD_STR_NOTTOP;
   3428 	if (ps->ps_flags & MD_MPS_MAPPED)
   3429 		flags |= MD_STR_MAPPED;
   3430 	md_call_strategy(cb, flags, NULL);
   3431 }
   3432 
   3433 static int
   3434 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
   3435 {
   3436 	int i;
   3437 	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
   3438 	buf_t		*cb;
   3439 	buf_t		*pb;
   3440 	diskaddr_t	blkno;
   3441 	size_t		bcount;
   3442 	off_t		offset;
   3443 
   3444 	pb = ps->ps_bp;
   3445 	cb = &cs->cs_buf;
   3446 	cs->cs_ps = ps;
   3447 
   3448 	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
   3449 
   3450 	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
   3451 
   3452 	blkno = pb->b_lblkno;
   3453 	bcount = pb->b_bcount;
   3454 	offset = 0;
   3455 	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
   3456 		blkno = DK_LABEL_LOC + 1;
   3457 		/*
   3458 		 * This handles the case where we're requesting
   3459 		 * a write to block 0 on a label partition
   3460 		 * and the request size was smaller than the
   3461 		 * size of the label.  If this is the case
   3462 		 * then we'll return -1.  Failure to do so will
   3463 		 * either cause the calling thread to hang due to
   3464 		 * an ssd bug, or worse if the bcount were allowed
   3465 		 * to go negative (ie large).
   3466 		 */
   3467 		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
   3468 			return (-1);
   3469 		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
   3470 		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
   3471 	}
   3472 
   3473 	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
   3474 	    cb, KM_NOSLEEP);
   3475 	if (war)
   3476 		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
   3477 
   3478 	/*
   3479 	 * If the submirror is in the erred stated, check if any component is
   3480 	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
   3481 	 * flag on the IO.
   3482 	 *
   3483 	 * Provide a fast path for the non-erred case (which should be the
   3484 	 * normal case).
   3485 	 */
   3486 	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
   3487 		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
   3488 			mm_submirror_t		*sm;
   3489 			mm_submirror_ic_t	*smic;
   3490 			int			ci;
   3491 			int			compcnt;
   3492 
   3493 			sm = &un->un_sm[i];
   3494 			smic = &un->un_smic[i];
   3495 
   3496 			compcnt = (*(smic->sm_get_component_count))
   3497 			    (sm->sm_dev, un);
   3498 			for (ci = 0; ci < compcnt; ci++) {
   3499 				md_m_shared_t	*shared;
   3500 
   3501 				shared = (md_m_shared_t *)
   3502 				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
   3503 				    sm, ci);
   3504 
   3505 				if (shared->ms_state == CS_LAST_ERRED)
   3506 					break;
   3507 			}
   3508 			if (ci >= compcnt)
   3509 				cb->b_flags |= B_FAILFAST;
   3510 
   3511 		} else {
   3512 			cb->b_flags |= B_FAILFAST;
   3513 		}
   3514 	}
   3515 
   3516 	ps->ps_current_sm++;
   3517 	if (ps->ps_current_sm != ps->ps_active_cnt) {
   3518 		if (un->un_write_option == WR_SERIAL) {
   3519 			ps->ps_call = continue_serial;
   3520 			return (0);
   3521 		}
   3522 		return (1);
   3523 	}
   3524 	return (0);
   3525 }
   3526 
   3527 /*
   3528  * directed_read_done:
   3529  * ------------------
   3530  * Completion routine called when a DMR request has been returned from the
   3531  * underlying driver. Wake-up the original ioctl() and return the data to
   3532  * the user.
   3533  */
   3534 static void
   3535 directed_read_done(md_mps_t *ps)
   3536 {
   3537 	mm_unit_t	*un;
   3538 	mdi_unit_t	*ui;
   3539 
   3540 	un = ps->ps_un;
   3541 	ui = ps->ps_ui;
   3542 
   3543 	md_unit_readerexit(ui);
   3544 	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
   3545 	ps->ps_call = NULL;
   3546 
   3547 	mutex_enter(&un->un_dmr_mx);
   3548 	cv_signal(&un->un_dmr_cv);
   3549 	mutex_exit(&un->un_dmr_mx);
   3550 
   3551 	/* release the parent structure */
   3552 	kmem_cache_free(mirror_parent_cache, ps);
   3553 }
   3554 
   3555 /*
   3556  * daemon_io:
   3557  * ------------
   3558  * Called to issue a mirror_write_strategy() or mirror_read_strategy
   3559  * call from a blockable context. NOTE: no mutex can be held on entry to this
   3560  * routine
   3561  */
   3562 static void
   3563 daemon_io(daemon_queue_t *dq)
   3564 {
   3565 	md_mps_t	*ps = (md_mps_t *)dq;
   3566 	int		flag = MD_STR_NOTTOP;
   3567 	buf_t		*pb = ps->ps_bp;
   3568 
   3569 	if (ps->ps_flags & MD_MPS_MAPPED)
   3570 		flag |= MD_STR_MAPPED;
   3571 	if (ps->ps_flags & MD_MPS_WOW)
   3572 		flag |= MD_STR_WOW;
   3573 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
   3574 		flag |= MD_STR_WAR;
   3575 	if (ps->ps_flags & MD_MPS_ABR)
   3576 		flag |= MD_STR_ABR;
   3577 	if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
   3578 		flag |= MD_STR_BLOCK_OK;
   3579 
   3580 	/*
   3581 	 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set
   3582 	 * MD_STR_WAR before calling mirror_read_strategy
   3583 	 */
   3584 	if (pb->b_flags & B_READ) {
   3585 		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
   3586 			flag |= MD_STR_WAR;
   3587 		mirror_read_strategy(pb, flag, ps);
   3588 	} else
   3589 		mirror_write_strategy(pb, flag, ps);
   3590 }
   3591 
   3592 /*
   3593  * update_resync:
   3594  * -------------
   3595  * Called to update the in-core version of the resync record with the latest
   3596  * version that was committed to disk when the previous mirror owner
   3597  * relinquished ownership. This call is likely to block as we must hold-off
   3598  * any current resync processing that may be occurring.
   3599  * On completion of the resync record update we issue the mirror_write_strategy
   3600  * call to complete the i/o that first started this sequence. To remove a race
   3601  * condition between a new write() request which is submitted and the resync
   3602  * record update we acquire the writerlock. This will hold off all i/o to the
   3603  * mirror until the resync update has completed.
   3604  * NOTE: no mutex can be held on entry to this routine
   3605  */
   3606 static void
   3607 update_resync(daemon_queue_t *dq)
   3608 {
   3609 	md_mps_t	*ps = (md_mps_t *)dq;
   3610 	buf_t		*pb = ps->ps_bp;
   3611 	mdi_unit_t	*ui = ps->ps_ui;
   3612 	mm_unit_t	*un = MD_UNIT(ui->ui_link.ln_id);
   3613 	set_t		setno;
   3614 	int		restart_resync;
   3615 
   3616 	mutex_enter(&un->un_rrp_inflight_mx);
   3617 	(void) md_unit_writerlock(ui);
   3618 	ps->ps_un = un;
   3619 	setno = MD_MIN2SET(getminor(pb->b_edev));
   3620 	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
   3621 		/*
   3622 		 * Synchronize our in-core view of what regions need to be
   3623 		 * resync'd with the on-disk version.
   3624 		 */
   3625 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
   3626 		    un->un_dirty_bm);
   3627 
   3628 		/* Region dirty map is now up to date */
   3629 	}
   3630 	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
   3631 	md_unit_writerexit(ui);
   3632 	mutex_exit(&un->un_rrp_inflight_mx);
   3633 
   3634 	/* Restart the resync thread if it was previously blocked */
   3635 	if (restart_resync) {
   3636 		mutex_enter(&un->un_rs_thread_mx);
   3637 		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
   3638 		cv_signal(&un->un_rs_thread_cv);
   3639 		mutex_exit(&un->un_rs_thread_mx);
   3640 	}
   3641 	/* Continue with original deferred i/o */
   3642 	daemon_io(dq);
   3643 }
   3644 
   3645 /*
   3646  * owner_timeout:
   3647  * -------------
   3648  * Called if the original mdmn_ksend_message() failed and the request is to be
   3649  * retried. Reattempt the original ownership change.
   3650  *
   3651  * NOTE: called at interrupt context (see timeout(9f)).
   3652  */
   3653 static void
   3654 owner_timeout(void *arg)
   3655 {
   3656 	daemon_queue_t	*dq = (daemon_queue_t *)arg;
   3657 
   3658 	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
   3659 }
   3660 
   3661 /*
   3662  * become_owner:
   3663  * ------------
   3664  * Called to issue RPC request to become the owner of the mirror
   3665  * associated with this i/o request. We assume that the ownership request
   3666  * is synchronous, so if it succeeds we will issue the request via
   3667  * mirror_write_strategy().
   3668  * If multiple i/o's are outstanding we will be called from the mirror_daemon
   3669  * service thread.
   3670  * NOTE: no mutex should be held on entry to this routine.
   3671  */
   3672 static void
   3673 become_owner(daemon_queue_t *dq)
   3674 {
   3675 	md_mps_t	*ps = (md_mps_t *)dq;
   3676 	mm_unit_t	*un = ps->ps_un;
   3677 	buf_t		*pb = ps->ps_bp;
   3678 	set_t		setno;
   3679 	md_mn_kresult_t	*kres;
   3680 	int		msg_flags = md_mirror_msg_flags;
   3681 	md_mps_t	*ps1;
   3682 
   3683 	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
   3684 
   3685 	/*
   3686 	 * If we're already the mirror owner we do not need to send a message
   3687 	 * but can simply process the i/o request immediately.
   3688 	 * If we've already sent the request to become owner we requeue the
   3689 	 * request as we're waiting for the synchronous ownership message to
   3690 	 * be processed.
   3691 	 */
   3692 	if (MD_MN_MIRROR_OWNER(un)) {
   3693 		/*
   3694 		 * As the strategy() call will potentially block we need to
   3695 		 * punt this to a separate thread and complete this request
   3696 		 * as quickly as possible. Note: if we're a read request
   3697 		 * this must be a resync, we cannot afford to be queued
   3698 		 * behind any intervening i/o requests. In this case we put the
   3699 		 * request on the md_mirror_rs_daemon queue.
   3700 		 */
   3701 		if (pb->b_flags & B_READ) {
   3702 			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
   3703 			    REQ_OLD);
   3704 		} else {
   3705 			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
   3706 			    REQ_OLD);
   3707 		}
   3708 	} else {
   3709 		mutex_enter(&un->un_owner_mx);
   3710 		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
   3711 			md_mn_req_owner_t	*msg;
   3712 			int			rval = 0;
   3713 
   3714 			/*
   3715 			 * Check to see that we haven't exceeded the maximum
   3716 			 * retry count. If we have we fail the i/o as the
   3717 			 * comms mechanism has become wedged beyond recovery.
   3718 			 */
   3719 			if (dq->qlen++ >= MD_OWNER_RETRIES) {
   3720 				mutex_exit(&un->un_owner_mx);
   3721 				cmn_err(CE_WARN,
   3722 				    "md_mirror: Request exhausted ownership "
   3723 				    "retry limit of %d attempts", dq->qlen);
   3724 				pb->b_error = EIO;
   3725 				pb->b_flags |= B_ERROR;
   3726 				pb->b_resid = pb->b_bcount;
   3727 				kmem_cache_free(mirror_parent_cache, ps);
   3728 				md_biodone(pb);
   3729 				return;
   3730 			}
   3731 
   3732 			/*
   3733 			 * Issue request to change ownership. The call is
   3734 			 * synchronous so when it returns we can complete the
   3735 			 * i/o (if successful), or enqueue it again so that
   3736 			 * the operation will be retried.
   3737 			 */
   3738 			un->un_owner_state |= MM_MN_OWNER_SENT;
   3739 			mutex_exit(&un->un_owner_mx);
   3740 
   3741 			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
   3742 			setno = MD_MIN2SET(getminor(pb->b_edev));
   3743 			msg->mnum = MD_SID(un);
   3744 			msg->owner = md_mn_mynode_id;
   3745 			msg_flags |= MD_MSGF_NO_LOG;
   3746 			/*
   3747 			 * If this IO is triggered by updating a watermark,
   3748 			 * it might be issued by the creation of a softpartition
   3749 			 * while the commd subsystem is suspended.
   3750 			 * We don't want this message to block.
   3751 			 */
   3752 			if (ps->ps_flags & MD_MPS_WMUPDATE) {
   3753 				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
   3754 			}
   3755 
   3756 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
   3757 			rval = mdmn_ksend_message(setno,
   3758 			    MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
   3759 			    (char *)msg, sizeof (md_mn_req_owner_t), kres);
   3760 
   3761 			kmem_free(msg, sizeof (md_mn_req_owner_t));
   3762 
   3763 			if (MDMN_KSEND_MSG_OK(rval, kres)) {
   3764 				dq->qlen = 0;
   3765 				/*
   3766 				 * Successfully changed owner, reread the
   3767 				 * resync record so that we have a valid idea of
   3768 				 * any previously committed incomplete write()s.
   3769 				 * NOTE: As we need to acquire the resync mutex
   3770 				 * this may block, so we defer it to a separate
   3771 				 * thread handler. This makes us (effectively)
   3772 				 * non-blocking once the ownership message
   3773 				 * handling has completed.
   3774 				 */
   3775 				mutex_enter(&un->un_owner_mx);
   3776 				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
   3777 					un->un_mirror_owner = md_mn_mynode_id;
   3778 					/* Sets owner of un_rr_dirty record */
   3779 					if (un->un_rr_dirty_recid)
   3780 						(void) mddb_setowner(
   3781 						    un->un_rr_dirty_recid,
   3782 						    md_mn_mynode_id);
   3783 					un->un_owner_state &=
   3784 					    ~MM_MN_BECOME_OWNER;
   3785 					/*
   3786 					 * Release the block on the current
   3787 					 * resync region if it is blocked
   3788 					 */
   3789 					ps1 = un->un_rs_prev_overlap;
   3790 					if ((ps1 != NULL) &&
   3791 					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
   3792 						mirror_overlap_tree_remove(ps1);
   3793 					mutex_exit(&un->un_owner_mx);
   3794 
   3795 					/*
   3796 					 * If we're a read, this must be a
   3797 					 * resync request, issue
   3798 					 * the i/o request on the
   3799 					 * md_mirror_rs_daemon queue. This is
   3800 					 * to avoid a deadlock between the
   3801 					 * resync_unit thread and
   3802 					 * subsequent i/o requests that may
   3803 					 * block on the resync region.
   3804 					 */
   3805 					if (pb->b_flags & B_READ) {
   3806 						daemon_request(
   3807 						    &md_mirror_rs_daemon,
   3808 						    update_resync, dq, REQ_OLD);
   3809 					} else {
   3810 						daemon_request(
   3811 						    &md_mirror_io_daemon,
   3812 						    update_resync, dq, REQ_OLD);
   3813 					}
   3814 					kmem_free(kres,
   3815 					    sizeof (md_mn_kresult_t));
   3816 					return;
   3817 				} else {
   3818 					/*
   3819 					 * Some other node has beaten us to
   3820 					 * obtain ownership. We need to
   3821 					 * reschedule our ownership request
   3822 					 */
   3823 					mutex_exit(&un->un_owner_mx);
   3824 				}
   3825 			} else {
   3826 				mdmn_ksend_show_error(rval, kres,
   3827 				    "MD_MN_MSG_REQUIRE_OWNER");
   3828 				/*
   3829 				 * Message transport failure is handled by the
   3830 				 * comms layer. If the ownership change request
   3831 				 * does not succeed we need to flag the error to
   3832 				 * the initiator of the i/o. This is handled by
   3833 				 * the retry logic above. As the request failed
   3834 				 * we do not know _who_ the owner of the mirror
   3835 				 * currently is. We reset our idea of the owner
   3836 				 * to None so that any further write()s will
   3837 				 * attempt to become the owner again. This stops
   3838 				 * multiple nodes writing to the same mirror
   3839 				 * simultaneously.
   3840 				 */
   3841 				mutex_enter(&un->un_owner_mx);
   3842 				un->un_owner_state &=
   3843 				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
   3844 				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
   3845 				mutex_exit(&un->un_owner_mx);
   3846 			}
   3847 			kmem_free(kres, sizeof (md_mn_kresult_t));
   3848 		} else
   3849 			mutex_exit(&un->un_owner_mx);
   3850 
   3851 		/*
   3852 		 * Re-enqueue this request on the deferred i/o list. Delay the
   3853 		 * request for md_mirror_owner_to usecs to stop thrashing.
   3854 		 */
   3855 		(void) timeout(owner_timeout, dq,
   3856 		    drv_usectohz(md_mirror_owner_to));
   3857 	}
   3858 }
   3859 
   3860 static void
   3861 mirror_write_strategy(buf_t *pb, int flag, void *private)
   3862 {
   3863 	md_mps_t	*ps;
   3864 	md_mcs_t	*cs;
   3865 	int		more;
   3866 	mm_unit_t	*un;
   3867 	mdi_unit_t	*ui;
   3868 	buf_t		*cb;		/* child buf pointer */
   3869 	set_t		setno;
   3870 	int		rs_on_overlap = 0;
   3871 
   3872 	ui = MDI_UNIT(getminor(pb->b_edev));
   3873 	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
   3874 
   3875 
   3876 	md_kstat_waitq_enter(ui);
   3877 
   3878 	/*
   3879 	 * If a state change is in progress for this mirror in a MN set,
   3880 	 * suspend all non-resync writes until the state change is complete.
   3881 	 * The objective of this suspend is to ensure that it is not
   3882 	 * possible for one node to read data from a submirror that another node
   3883 	 * has not written to because of the state change. Therefore we
   3884 	 * suspend all writes until the state change has been made. As it is
   3885 	 * not possible to read from the target of a resync, there is no need
   3886 	 * to suspend resync writes.
   3887 	 * Note that we only block here if the caller can handle a busy-wait.
   3888 	 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
   3889 	 */
   3890 
   3891 	if (!(flag & MD_STR_WAR)) {
   3892 		if (flag & MD_STR_BLOCK_OK) {
   3893 			mutex_enter(&un->un_suspend_wr_mx);
   3894 			while (un->un_suspend_wr_flag) {
   3895 				cv_wait(&un->un_suspend_wr_cv,
   3896 				    &un->un_suspend_wr_mx);
   3897 			}
   3898 			mutex_exit(&un->un_suspend_wr_mx);
   3899 		}
   3900 		(void) md_unit_readerlock(ui);
   3901 	}
   3902 
   3903 	if (!(flag & MD_STR_NOTTOP)) {
   3904 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
   3905 			md_kstat_waitq_exit(ui);
   3906 			return;
   3907 		}
   3908 	}
   3909 
   3910 	setno = MD_MIN2SET(getminor(pb->b_edev));
   3911 
   3912 	/* If an ABR write has been requested, set MD_STR_ABR flag */
   3913 	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
   3914 		flag |= MD_STR_ABR;
   3915 
   3916 	if (private == NULL) {
   3917 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
   3918 		mirror_parent_init(ps);
   3919 	} else {
   3920 		ps = private;
   3921 		private = NULL;
   3922 	}
   3923 	if (flag & MD_STR_MAPPED)
   3924 		ps->ps_flags |= MD_MPS_MAPPED;
   3925 
   3926 	if (flag & MD_STR_WOW)
   3927 		ps->ps_flags |= MD_MPS_WOW;
   3928 
   3929 	if (flag & MD_STR_ABR)
   3930 		ps->ps_flags |= MD_MPS_ABR;
   3931 
   3932 	if (flag & MD_STR_WMUPDATE)
   3933 		ps->ps_flags |= MD_MPS_WMUPDATE;
   3934 
   3935 	/*
   3936 	 * Save essential information from the original buffhdr
   3937 	 * in the md_save structure.
   3938 	 */
   3939 	ps->ps_un = un;
   3940 	ps->ps_ui = ui;
   3941 	ps->ps_bp = pb;
   3942 	ps->ps_addr = pb->b_un.b_addr;
   3943 	ps->ps_firstblk = pb->b_lblkno;
   3944 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
   3945 	ps->ps_changecnt = un->un_changecnt;
   3946 
   3947 	/*
   3948 	 * Check for suspended writes here. This is where we can defer the
   3949 	 * write request to the daemon_io queue which will then call us with
   3950 	 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
   3951 	 * the top of this routine.
   3952 	 */
   3953 	if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
   3954 		mutex_enter(&un->un_suspend_wr_mx);
   3955 		if (un->un_suspend_wr_flag) {
   3956 			ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
   3957 			mutex_exit(&un->un_suspend_wr_mx);
   3958 			md_unit_readerexit(ui);
   3959 			daemon_request(&md_mirror_daemon, daemon_io,
   3960 			    (daemon_queue_t *)ps, REQ_OLD);
   3961 			return;
   3962 		}
   3963 		mutex_exit(&un->un_suspend_wr_mx);
   3964 	}
   3965 
   3966 	/*
   3967 	 * If not MN owner and this is an ABR write, make sure the current
   3968 	 * resync region is in the overlaps tree
   3969 	 */
   3970 	mutex_enter(&un->un_owner_mx);
   3971 	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
   3972 	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
   3973 		md_mps_t	*ps1;
   3974 		/* Block the current resync region, if not already blocked */
   3975 		ps1 = un->un_rs_prev_overlap;
   3976 
   3977 		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
   3978 		    (ps1->ps_lastblk != 0))) {
   3979 			/* Drop locks to avoid deadlock */
   3980 			mutex_exit(&un->un_owner_mx);
   3981 			md_unit_readerexit(ui);
   3982 			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
   3983 			rs_on_overlap = 1;
   3984 			(void) md_unit_readerlock(ui);
   3985 			mutex_enter(&un->un_owner_mx);
   3986 			/*
   3987 			 * Check to see if we have obtained ownership
   3988 			 * while waiting for overlaps. If we have, remove
   3989 			 * the resync_region entry from the overlap tree
   3990 			 */
   3991 			if (MD_MN_MIRROR_OWNER(un) &&
   3992 			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
   3993 				mirror_overlap_tree_remove(ps1);
   3994 				rs_on_overlap = 0;
   3995 			}
   3996 		}
   3997 	}
   3998 	mutex_exit(&un->un_owner_mx);
   3999 
   4000 
   4001 	/*
   4002 	 * following keep write after read from writing to the
   4003 	 * source in the case where it all came from one place
   4004 	 */
   4005 	if (flag & MD_STR_WAR) {
   4006 		int	abort_write = 0;
   4007 		/*
   4008 		 * We are perfoming a write-after-read. This is either as a
   4009 		 * result of a resync read or as a result of a read in a
   4010 		 * dirty resync region when the optimized resync is not
   4011 		 * complete. If in a MN set and a resync generated i/o,
   4012 		 * if the current block is not in the current
   4013 		 * resync region terminate the write as another node must have
   4014 		 * completed this resync region
   4015 		 */
   4016 		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
   4017 		    (!flag & MD_STR_DIRTY_RD)) {
   4018 			if (!IN_RESYNC_REGION(un, ps))
   4019 				abort_write = 1;
   4020 		}
   4021 		if ((select_write_after_read_units(un, ps) == 0) ||
   4022 		    (abort_write)) {
   4023 #ifdef DEBUG
   4024 			if (mirror_debug_flag)
   4025 				printf("Abort resync write on %x, block %lld\n",
   4026 				    MD_SID(un), ps->ps_firstblk);
   4027 #endif
   4028 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
   4029 				mirror_overlap_tree_remove(ps);
   4030 			kmem_cache_free(mirror_parent_cache, ps);
   4031 			md_kstat_waitq_exit(ui);
   4032 			md_unit_readerexit(ui);
   4033 			md_biodone(pb);
   4034 			return;
   4035 		}
   4036 	} else {
   4037 		select_write_units(un, ps);
   4038 
   4039 		/* Drop readerlock to avoid deadlock */
   4040 		md_unit_readerexit(ui);
   4041 		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
   4042 		un = md_unit_readerlock(ui);
   4043 		/*
   4044 		 * For a MN set with an ABR write, if we are now the
   4045 		 * owner and we have a resync region in the overlap
   4046 		 * tree, remove the entry from overlaps and retry the write.
   4047 		 */
   4048 
   4049 		if (MD_MNSET_SETNO(setno) &&
   4050 		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
   4051 			mutex_enter(&un->un_owner_mx);
   4052 			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
   4053 				mirror_overlap_tree_remove(ps);
   4054 				md_kstat_waitq_exit(ui);
   4055 				mutex_exit(&un->un_owner_mx);
   4056 				md_unit_readerexit(ui);
   4057 				daemon_request(&md_mirror_daemon, daemon_io,
   4058 				    (daemon_queue_t *)ps, REQ_OLD);
   4059 				return;
   4060 			}
   4061 			mutex_exit(&un->un_owner_mx);
   4062 		}
   4063 	}
   4064 
   4065 	/*
   4066 	 * For Multinode mirrors with no owner and a Resync Region (not ABR)
   4067 	 * we need to become the mirror owner before continuing with the
   4068 	 * write(). For ABR mirrors we check that we 'own' the resync if
   4069 	 * we're in write-after-read mode. We do this _after_ ensuring that
   4070 	 * there are no overlaps to ensure that once we know that we are
   4071 	 * the owner, the readerlock will not be released until the write is
   4072 	 * complete. As a change of ownership in a MN set requires the
   4073 	 * writerlock, this ensures that ownership cannot be changed until
   4074 	 * the write is complete.
   4075 	 */
   4076 	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
   4077 	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
   4078 		if (MD_MN_NO_MIRROR_OWNER(un))  {
   4079 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
   4080 				mirror_overlap_tree_remove(ps);
   4081 			md_kstat_waitq_exit(ui);
   4082 			ASSERT(!(flag & MD_STR_WAR));
   4083 			md_unit_readerexit(ui);
   4084 			daemon_request(&md_mirror_daemon, become_owner,
   4085 			    (daemon_queue_t *)ps, REQ_OLD);
   4086 			return;
   4087 		}
   4088 	}
   4089 
   4090 	/*
   4091 	 * Mark resync region if mirror has a Resync Region _and_ we are not
   4092 	 * a resync initiated write(). Don't mark region if we're flagged as
   4093 	 * an ABR write.
   4094 	 */
   4095 	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
   4096 	    !(flag & MD_STR_WAR)) {
   4097 		if (mirror_mark_resync_region(un, ps->ps_firstblk,
   4098 		    ps->ps_lastblk, md_mn_mynode_id)) {
   4099 			pb->b_flags |= B_ERROR;
   4100 			pb->b_resid = pb->b_bcount;
   4101 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
   4102 				mirror_overlap_tree_remove(ps);
   4103 			kmem_cache_free(mirror_parent_cache, ps);
   4104 			md_kstat_waitq_exit(ui);
   4105 			md_unit_readerexit(ui);
   4106 			md_biodone(pb);
   4107 			return;
   4108 		}
   4109 	}
   4110 
   4111 	ps->ps_childbflags = pb->b_flags | B_WRITE;
   4112 	ps->ps_childbflags &= ~B_READ;
   4113 	if (flag & MD_STR_MAPPED)
   4114 		ps->ps_childbflags &= ~B_PAGEIO;
   4115 
   4116 	if (!(flag & MD_STR_NOTTOP) && panicstr)
   4117 		/* Disable WOW and don't free ps */
   4118 		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
   4119 
   4120 	md_kstat_waitq_to_runq(ui);
   4121 
   4122 	/*
   4123 	 * Treat Raw and Direct I/O as Write-on-Write always
   4124 	 */
   4125 
   4126 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
   4127 	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
   4128 	    (pb->b_flags & B_PHYS) &&
   4129 	    !(ps->ps_flags & MD_MPS_WOW)) {
   4130 		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
   4131 			mirror_overlap_tree_remove(ps);
   4132 		md_unit_readerexit(ui);
   4133 		daemon_request(&md_mstr_daemon, handle_wow,
   4134 		    (daemon_queue_t *)ps, REQ_OLD);
   4135 		return;
   4136 	}
   4137 
   4138 	ps->ps_frags = 1;
   4139 	do {
   4140 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
   4141 		mirror_child_init(cs);
   4142 		cb = &cs->cs_buf;
   4143 		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
   4144 
   4145 		/*
   4146 		 * This handles the case where we're requesting
   4147 		 * a write to block 0 on a label partition.  (more < 0)
   4148 		 * means that the request size was smaller than the
   4149 		 * size of the label.  If so this request is done.
   4150 		 */
   4151 		if (more < 0) {
   4152 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
   4153 				mirror_overlap_tree_remove(ps);
   4154 			md_kstat_runq_exit(ui);
   4155 			kmem_cache_free(mirror_child_cache, cs);
   4156 			kmem_cache_free(mirror_parent_cache, ps);
   4157 			md_unit_readerexit(ui);
   4158 			md_biodone(pb);
   4159 			return;
   4160 		}
   4161 		if (more) {
   4162 			mutex_enter(&ps->ps_mx);
   4163 			ps->ps_frags++;
   4164 			mutex_exit(&ps->ps_mx);
   4165 		}
   4166 		md_call_strategy(cb, flag, private);
   4167 	} while (more);
   4168 
   4169 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
   4170 		while (!(ps->ps_flags & MD_MPS_DONE)) {
   4171 			md_daemon(1, &md_done_daemon);
   4172 			drv_usecwait(10);
   4173 		}
   4174 		kmem_cache_free(mirror_parent_cache, ps);
   4175 	}
   4176 }
   4177 
   4178 static void
   4179 mirror_read_strategy(buf_t *pb, int flag, void *private)
   4180 {
   4181 	md_mps_t	*ps;
   4182 	md_mcs_t	*cs;
   4183 	size_t		more;
   4184 	mm_unit_t	*un;
   4185 	mdi_unit_t	*ui;
   4186 	size_t		current_count;
   4187 	diskaddr_t	current_blkno;
   4188 	off_t		current_offset;
   4189 	buf_t		*cb;		/* child buf pointer */
   4190 	set_t		setno;
   4191 
   4192 	ui = MDI_UNIT(getminor(pb->b_edev));
   4193 
   4194 	md_kstat_waitq_enter(ui);
   4195 
   4196 	un = (mm_unit_t *)md_unit_readerlock(ui);
   4197 
   4198 	if (!(flag & MD_STR_NOTTOP)) {
   4199 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
   4200 			md_kstat_waitq_exit(ui);
   4201 			return;
   4202 		}
   4203 	}
   4204 
   4205 	if (private == NULL) {
   4206 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
   4207 		mirror_parent_init(ps);
   4208 	} else {
   4209 		ps = private;
   4210 		private = NULL;
   4211 	}
   4212 
   4213 	if (flag & MD_STR_MAPPED)
   4214 		ps->ps_flags |= MD_MPS_MAPPED;
   4215 	if (flag & MD_NOBLOCK)
   4216 		ps->ps_flags |= MD_MPS_NOBLOCK;
   4217 	if (flag & MD_STR_WMUPDATE)
   4218 		ps->ps_flags |= MD_MPS_WMUPDATE;
   4219 
   4220 	/*
   4221 	 * Check to see if this is a DMR driven read. If so we need to use the
   4222 	 * specified side (in un->un_dmr_last_read) for the source of the data.
   4223 	 */
   4224 	if (flag & MD_STR_DMR)
   4225 		ps->ps_flags |= MD_MPS_DMR;
   4226 
   4227 	/*
   4228 	 * Save essential information from the original buffhdr
   4229 	 * in the md_save structure.
   4230 	 */
   4231 	ps->ps_un = un;
   4232 	ps->ps_ui = ui;
   4233 	ps->ps_bp = pb;
   4234 	ps->ps_addr = pb->b_un.b_addr;
   4235 	ps->ps_firstblk = pb->b_lblkno;
   4236 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
   4237 	ps->ps_changecnt = un->un_changecnt;
   4238 
   4239 	current_count = btodb(pb->b_bcount);
   4240 	current_blkno = pb->b_lblkno;
   4241 	current_offset = 0;
   4242 
   4243 	/*
   4244 	 * If flag has MD_STR_WAR set this means that the read is issued by a
   4245 	 * resync thread which may or may not be an optimised resync.
   4246 	 *
   4247 	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
   4248 	 * code has not completed; either a resync has not started since snarf,
   4249 	 * or there is an optimized resync in progress.
   4250 	 *
   4251 	 * We need to generate a write after this read in the following two
   4252 	 * cases,
   4253 	 *
   4254 	 * 1. Any Resync-Generated read
   4255 	 *
   4256 	 * 2. Any read to a DIRTY REGION if there is an optimized resync
   4257 	 *    pending or in progress.
   4258 	 *
   4259 	 * The write after read is done in these cases to ensure that all sides
   4260 	 * of the mirror are in sync with the read data and that it is not
   4261 	 * possible for an application to read the same block multiple times
   4262 	 * and get different data.
   4263 	 *
   4264 	 * This would be possible if the block was in a dirty region.
   4265 	 *
   4266 	 * If we're performing a directed read we don't write the data out as
   4267 	 * the application is responsible for restoring the mirror to a known
   4268 	 * state.
   4269 	 */
   4270 	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
   4271 	    !(flag & MD_STR_DMR)) {
   4272 		size_t	start_rr, i, end_rr;
   4273 		int	region_dirty = 1;
   4274 
   4275 		/*
   4276 		 * We enter here under three circumstances,
   4277 		 *
   4278 		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
   4279 		 * 0			1
   4280 		 * 1			0
   4281 		 * 1			1
   4282 		 *
   4283 		 * To be optimal we only care to explicitly check for dirty
   4284 		 * regions in the second case since if MD_STR_WAR is set we
   4285 		 * always do the write after read.
   4286 		 */
   4287 		if (!(flag & MD_STR_WAR)) {
   4288 			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
   4289 			BLK_TO_RR(start_rr, ps->ps_firstblk, un);
   4290 
   4291 			for (i = start_rr; i <= end_rr; i++)
   4292 				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
   4293 					break;
   4294 		}
   4295 
   4296 		if ((region_dirty) &&
   4297 		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
   4298 			ps->ps_call = write_after_read;
   4299 			/*
   4300 			 * Mark this as a RESYNC_READ in ps_flags.
   4301 			 * This is used if the read fails during a
   4302 			 * resync of a 3-way mirror to ensure that
   4303 			 * the retried read to the remaining
   4304 			 * good submirror has MD_STR_WAR set. This
   4305 			 * is needed to ensure that the resync write
   4306 			 * (write-after-read) takes place.
   4307 			 */
   4308 			ps->ps_flags |= MD_MPS_RESYNC_READ;
   4309 
   4310 			/*
   4311 			 * If MD_STR_FLAG_ERR is set in the flags we
   4312 			 * set MD_MPS_FLAG_ERROR so that an error on the resync
   4313 			 * write (issued by write_after_read) will be flagged
   4314 			 * to the biowait'ing resync thread. This allows us to
   4315 			 * avoid issuing further resync requests to a device
   4316 			 * that has had a write failure.
   4317 			 */
   4318 			if (flag & MD_STR_FLAG_ERR)
   4319 				ps->ps_flags |= MD_MPS_FLAG_ERROR;
   4320 
   4321 			setno = MD_UN2SET(un);
   4322 			/*
   4323 			 * Drop the readerlock to avoid
   4324 			 * deadlock
   4325 			 */
   4326 			md_unit_readerexit(ui);
   4327 			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
   4328 			un = md_unit_readerlock(ui);
   4329 			/*
   4330 			 * Ensure that we are owner
   4331 			 */
   4332 			if (MD_MNSET_SETNO(setno)) {
   4333 				/*
   4334 				 * For a non-resync read that requires a
   4335 				 * write-after-read to be done, set a flag
   4336 				 * in the parent structure, so that the
   4337 				 * write_strategy routine can omit the
   4338 				 * test that the write is still within the
   4339 				 * resync region
   4340 				 */
   4341 				if (!(flag & MD_STR_WAR))
   4342 					ps->ps_flags |= MD_MPS_DIRTY_RD;
   4343 
   4344 				/*
   4345 				 * Before reading the buffer, see if
   4346 				 * there is an owner.
   4347 				 */
   4348 				if (MD_MN_NO_MIRROR_OWNER(un))  {
   4349 					ps->ps_call = NULL;
   4350 					mirror_overlap_tree_remove(ps);
   4351 					md_kstat_waitq_exit(ui);
   4352 					md_unit_readerexit(ui);
   4353 					daemon_request(
   4354 					    &md_mirror_daemon,
   4355 					    become_owner,
   4356 					    (daemon_queue_t *)ps,
   4357 					    REQ_OLD);
   4358 					return;
   4359 				}
   4360 				/*
   4361 				 * For a resync read, check to see if I/O is
   4362 				 * outside of the current resync region, or
   4363 				 * the resync has finished. If so
   4364 				 * just terminate the I/O
   4365 				 */
   4366 				if ((flag & MD_STR_WAR) &&
   4367 				    (!(un->c.un_status & MD_UN_WAR) ||
   4368 				    (!IN_RESYNC_REGION(un, ps)))) {
   4369 #ifdef DEBUG
   4370 					if (mirror_debug_flag)
   4371 						printf("Abort resync read "
   4372 						    "%x: %lld\n",
   4373 						    MD_SID(un),
   4374 						    ps->ps_firstblk);
   4375 #endif
   4376 					mirror_overlap_tree_remove(ps);
   4377 					kmem_cache_free(mirror_parent_cache,
   4378 					    ps);
   4379 					md_kstat_waitq_exit(ui);
   4380 					md_unit_readerexit(ui);
   4381 					md_biodone(pb);
   4382 					return;
   4383 				}
   4384 			}
   4385 		}
   4386 	}
   4387 
   4388 	if (flag & MD_STR_DMR) {
   4389 		ps->ps_call = directed_read_done;
   4390 	}
   4391 
   4392 	if (!(flag & MD_STR_NOTTOP) && panicstr)
   4393 		ps->ps_flags |= MD_MPS_DONTFREE;
   4394 
   4395 	md_kstat_waitq_to_runq(ui);
   4396 
   4397 	ps->ps_frags++;
   4398 	do {
   4399 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
   4400 		mirror_child_init(cs);
   4401 		cb = &cs->cs_buf;
   4402 		cs->cs_ps = ps;
   4403 
   4404 		cb = md_bioclone(pb, current_offset, current_count, NODEV,
   4405 		    current_blkno, mirror_done, cb, KM_NOSLEEP);
   4406 
   4407 		more = mirror_map_read(ps, cs, current_blkno,
   4408 		    (u_longlong_t)current_count);
   4409 		if (more) {
   4410 			mutex_enter(&ps->ps_mx);
   4411 			ps->ps_frags++;
   4412 			mutex_exit(&ps->ps_mx);
   4413 		}
   4414 
   4415 		/*
   4416 		 * Do these calculations now,
   4417 		 *  so that we pickup a valid b_bcount from the chld_bp.
   4418 		 */
   4419 		current_count -= more;
   4420 		current_offset += cb->b_bcount;
   4421 		current_blkno +=  more;
   4422 		md_call_strategy(cb, flag, private);
   4423 	} while (more);
   4424 
   4425 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
   4426 		while (!(ps->ps_flags & MD_MPS_DONE)) {
   4427 			md_daemon(1, &md_done_daemon);
   4428 			drv_usecwait(10);
   4429 		}
   4430 		kmem_cache_free(mirror_parent_cache, ps);
   4431 	}
   4432 }
   4433 
   4434 void
   4435 md_mirror_strategy(buf_t *bp, int flag, void *private)
   4436 {
   4437 	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));
   4438 
   4439 	/*
   4440 	 * When doing IO to a multi owner meta device, check if set is halted.
   4441 	 * We do this check without the needed lock held, for performance
   4442 	 * reasons.
   4443 	 * If an IO just slips through while the set is locked via an
   4444 	 * MD_MN_SUSPEND_SET, we don't care about it.
   4445 	 * Only check for suspension if we are a top-level i/o request
   4446 	 * (MD_STR_NOTTOP is cleared in 'flag').
   4447 	 */
   4448 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
   4449 	    (MD_SET_HALTED | MD_SET_MNSET)) {
   4450 		if ((flag & MD_STR_NOTTOP) == 0) {
   4451 			mutex_enter(&md_mx);
   4452 			/* Here we loop until the set is no longer halted */
   4453 			while (md_set[setno].s_status & MD_SET_HALTED) {
   4454 				cv_wait(&md_cv, &md_mx);
   4455 			}
   4456 			mutex_exit(&md_mx);
   4457 		}
   4458 	}
   4459 
   4460 	if ((flag & MD_IO_COUNTED) == 0) {
   4461 		if ((flag & MD_NOBLOCK) == 0) {
   4462 			if (md_inc_iocount(setno) != 0) {
   4463 				bp->b_flags |= B_ERROR;
   4464 				bp->b_error = ENXIO;
   4465 				bp->b_resid = bp->b_bcount;
   4466 				biodone(bp);
   4467 				return;
   4468 			}
   4469 		} else {
   4470 			md_inc_iocount_noblock(setno);
   4471 		}
   4472 	}
   4473 
   4474 	if (bp->b_flags & B_READ)
   4475 		mirror_read_strategy(bp, flag, private);
   4476 	else
   4477 		mirror_write_strategy(bp, flag, private);
   4478 }
   4479 
   4480 /*
   4481  * mirror_directed_read:
   4482  * --------------------
   4483  * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
   4484  * so that the application can determine what (if any) resync needs to be
   4485  * performed. The data is copied out to the user-supplied buffer.
   4486  *
   4487  * Parameters:
   4488  *	mdev	- dev_t for the mirror device
   4489  *	vdr	- directed read parameters specifying location and submirror
   4490  *		  to perform the read from
   4491  *	mode	- used to ddi_copyout() any resulting data from the read
   4492  *
   4493  * Returns:
   4494  *	0	success
   4495  *	!0	error code
   4496  *		EINVAL - invalid request format
   4497  */
   4498 int
   4499 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
   4500 {
   4501 	buf_t		*bp;
   4502 	minor_t		mnum = getminor(mdev);
   4503 	mdi_unit_t	*ui = MDI_UNIT(mnum);
   4504 	mm_unit_t	*un;
   4505 	mm_submirror_t	*sm;
   4506 	char		*sm_nm;
   4507 	uint_t		next_side;
   4508 	void		*kbuffer;
   4509 
   4510 	if (ui == NULL)
   4511 		return (ENXIO);
   4512 
   4513 	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
   4514 		return (EINVAL);
   4515 	}
   4516 
   4517 	/* Check for aligned block access. We disallow non-aligned requests. */
   4518 	if (vdr->vdr_offset % DEV_BSIZE) {
   4519 		return (EINVAL);
   4520 	}
   4521 
   4522 	/*
   4523 	 * Allocate kernel buffer for target of read(). If we had a reliable
   4524 	 * (sorry functional) DDI this wouldn't be needed.
   4525 	 */
   4526 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
   4527 	if (kbuffer == NULL) {
   4528 		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
   4529 		    " bytes\n", vdr->vdr_nbytes);
   4530 		return (ENOMEM);
   4531 	}
   4532 
   4533 	bp = getrbuf(KM_SLEEP);
   4534 
   4535 	bp->b_un.b_addr = kbuffer;
   4536 	bp->b_flags = B_READ;
   4537 	bp->b_bcount = vdr->vdr_nbytes;
   4538 	bp->b_lblkno = lbtodb(vdr->vdr_offset);
   4539 	bp->b_edev = mdev;
   4540 
   4541 	un = md_unit_readerlock(ui);
   4542 
   4543 	/*
   4544 	 * If DKV_SIDE_INIT is set we need to determine the first available
   4545 	 * side to start reading from. If it isn't set we increment to the
   4546 	 * next readable submirror.
   4547 	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
   4548 	 * Note: we check for a readable submirror on completion of the i/o so
   4549 	 * we should _always_ have one available. If this becomes unavailable
   4550 	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
   4551 	 * a metadetach is made between the completion of one DKIOCDMR ioctl
   4552 	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
   4553 	 * The chance of this is small, but not non-existent.
   4554 	 */
   4555 	if (vdr->vdr_side == DKV_SIDE_INIT) {
   4556 		next_side = 0;
   4557 	} else {
   4558 		next_side = vdr->vdr_side + 1;
   4559 	}
   4560 	while ((next_side < NMIRROR) &&
   4561 	    !SUBMIRROR_IS_READABLE(un, next_side))
   4562 		next_side++;
   4563 	if (next_side >= NMIRROR) {
   4564 		vdr->vdr_flags |= DKV_DMR_ERROR;
   4565 		freerbuf(bp);
   4566 		vdr->vdr_bytesread = 0;
   4567 		md_unit_readerexit(ui);
   4568 		return (0);
   4569 	}
   4570 
   4571 	/* Set the side to read from */
   4572 	un->un_dmr_last_read = next_side;
   4573 
   4574 	md_unit_readerexit(ui);
   4575 
   4576 	/*
   4577 	 * Save timestamp for verification purposes. Can be read by debugger
   4578 	 * to verify that this ioctl has been executed and to find the number
   4579 	 * of DMR reads and the time of the last DMR read.
   4580 	 */
   4581 	uniqtime(&mirror_dmr_stats.dmr_timestamp);
   4582 	mirror_dmr_stats.dmr_count++;
   4583 
   4584 	/* Issue READ request and wait for completion */
   4585 	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
   4586 
   4587 	mutex_enter(&un->un_dmr_mx);
   4588 	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
   4589 	mutex_exit(&un->un_dmr_mx);
   4590 
   4591 	/*
   4592 	 * Check to see if we encountered an error during the read. If so we
   4593 	 * can make no guarantee about any possibly returned data.
   4594 	 */
   4595 	if ((bp->b_flags & B_ERROR) == 0) {
   4596 		vdr->vdr_flags &= ~DKV_DMR_ERROR;
   4597 		if (bp->b_resid) {
   4598 			vdr->vdr_flags |= DKV_DMR_SHORT;
   4599 			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
   4600 		} else {
   4601 			vdr->vdr_flags |= DKV_DMR_SUCCESS;
   4602 			vdr->vdr_bytesread = vdr->vdr_nbytes;
   4603 		}
   4604 		/* Copy the data read back out to the user supplied buffer */
   4605 		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
   4606 		    mode)) {
   4607 			kmem_free(kbuffer, vdr->vdr_nbytes);
   4608 			return (EFAULT);
   4609 		}
   4610 
   4611 	} else {
   4612 		/* Error out with DKV_DMR_ERROR */
   4613 		vdr->vdr_flags |= DKV_DMR_ERROR;
   4614 		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
   4615 	}
   4616 	/*
   4617 	 * Update the DMR parameters with the side and name of submirror that
   4618 	 * we have just read from (un->un_dmr_last_read)
   4619 	 */
   4620 	un = md_unit_readerlock(ui);
   4621 
   4622 	vdr->vdr_side = un->un_dmr_last_read;
   4623 	sm = &un->un_sm[un->un_dmr_last_read];
   4624 	sm_nm = md_shortname(md_getminor(sm->sm_dev));
   4625 
   4626 	(void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
   4627 
   4628 	/*
   4629 	 * Determine if we've completed the read cycle. This is true iff the
   4630 	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
   4631 	 * use un_nsm as we need to handle a sparse array of submirrors (which
   4632 	 * can occur if a submirror is metadetached).
   4633 	 */
   4634 	next_side = un->un_dmr_last_read + 1;
   4635 	while ((next_side < NMIRROR) &&
   4636 	    !SUBMIRROR_IS_READABLE(un, next_side))
   4637 		next_side++;
   4638 	if (next_side >= NMIRROR) {
   4639 		/* We've finished */
   4640 		vdr->vdr_flags |= DKV_DMR_DONE;
   4641 	}
   4642 
   4643 	md_unit_readerexit(ui);
   4644 	freerbuf(bp);
   4645 	kmem_free(kbuffer, vdr->vdr_nbytes);
   4646 
   4647 	return (0);
   4648 }
   4649 
   4650 /*
   4651  * mirror_resync_message:
   4652  * ---------------------
   4653  * Handle the multi-node resync messages that keep all nodes within a given
   4654  * disk-set in sync with their view of a mirror's resync status.
   4655  *
   4656  * The message types dealt with are:
   4657  * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
   4658  * MD_MN_MSG_RESYNC_NEXT	- specified next region to be resynced
   4659  * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
   4660  * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase, opt, submirror or comp
   4661  *
   4662  * Returns:
   4663  *	0	Success
   4664  *	>0	Failure error number
   4665  */
   4666 int
   4667 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
   4668 {
   4669 	mdi_unit_t		*ui;
   4670 	mm_unit_t		*un;
   4671 	set_t			setno;
   4672 	int			is_ABR;
   4673 	int			smi;
   4674 	int			ci;
   4675 	sm_state_t		state;
   4676 	int			broke_out;
   4677 	mm_submirror_t		*sm;
   4678 	mm_submirror_ic_t	*smic;
   4679 	md_m_shared_t		*shared;
   4680 	md_error_t		mde = mdnullerror;
   4681 	md_mps_t		*ps;
   4682 	int			rs_active;
   4683 	int			rr, rr_start, rr_end;
   4684 
   4685 	/* Check that the given device is part of a multi-node set */
   4686 	setno = MD_MIN2SET(p->mnum);
   4687 	if (setno >= md_nsets) {
   4688 		return (ENXIO);
   4689 	}
   4690 	if (!MD_MNSET_SETNO(setno)) {
   4691 		return (EINVAL);
   4692 	}
   4693 
   4694 	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
   4695 		return (EINVAL);
   4696 	if ((ui = MDI_UNIT(p->mnum)) == NULL)
   4697 		return (EINVAL);
   4698 	is_ABR = (ui->ui_tstate & MD_ABR_CAP);
   4699 
   4700 	/* Obtain the current resync status */
   4701 	(void) md_ioctl_readerlock(lockp, ui);
   4702 	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
   4703 	md_ioctl_readerexit(lockp);
   4704 
   4705 	switch ((md_mn_msgtype_t)p->msg_type) {
   4706 	case MD_MN_MSG_RESYNC_STARTING:
   4707 		/* Start the resync thread for the mirror */
   4708 		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
   4709 		break;
   4710 
   4711 	case MD_MN_MSG_RESYNC_NEXT:
   4712 		/*
   4713 		 * We have to release any previously marked overlap regions
   4714 		 * so that i/o can resume. Then we need to block the region
   4715 		 * from [rs_start..rs_start+rs_size) * so that no i/o is issued.
   4716 		 * Update un_rs_resync_done and un_rs_resync_2_do.
   4717 		 */
   4718 		(void) md_ioctl_readerlock(lockp, ui);
   4719 		/*
   4720 		 * Ignore the message if there is no active resync thread or
   4721 		 * if it is for a resync type that we have already completed.
   4722 		 * un_resync_completed is set to the last resync completed
   4723 		 * when processing a PHASE_DONE message.
   4724 		 */
   4725 		if (!rs_active || (p->rs_type == un->un_resync_completed))
   4726 			break;
   4727 		/*
   4728 		 * If this message is for the same resync and is for an earlier
   4729 		 * resync region, just ignore it. This can only occur if this
   4730 		 * node has progressed on to the next resync region before
   4731 		 * we receive this message. This can occur if the class for
   4732 		 * this message is busy and the originator has to retry thus
   4733 		 * allowing this node to move onto the next resync_region.
   4734 		 */
   4735 		if ((p->rs_type == un->un_rs_type) &&
   4736 		    (p->rs_start < un->un_resync_startbl))
   4737 			break;
   4738 		ps = un->un_rs_prev_overlap;
   4739 
   4740 		/* Allocate previous overlap reference if needed */
   4741 		if (ps == NULL) {
   4742 			ps = kmem_cache_alloc(mirror_parent_cache,
   4743 			    MD_ALLOCFLAGS);
   4744 			ps->ps_un = un;
   4745 			ps->ps_ui = ui;
   4746 			ps->ps_firstblk = 0;
   4747 			ps->ps_lastblk = 0;
   4748 			ps->ps_flags = 0;
   4749 			md_ioctl_readerexit(lockp);
   4750 			(void) md_ioctl_writerlock(lockp, ui);
   4751 			un->un_rs_prev_overlap = ps;
   4752 			md_ioctl_writerexit(lockp);
   4753 		} else
   4754 			md_ioctl_readerexit(lockp);
   4755 
   4756 		if (p->rs_originator != md_mn_mynode_id) {
   4757 			/*
   4758 			 * Clear our un_resync_bm for the regions completed.
   4759 			 * The owner (originator) will take care of itself.
   4760 			 */
   4761 			BLK_TO_RR(rr_end, ps->ps_lastblk, un);
   4762 			BLK_TO_RR(rr_start, p->rs_start, un);
   4763 			if (ps->ps_lastblk && rr_end < rr_start) {
   4764 				BLK_TO_RR(rr_start, ps->ps_firstblk, un);
   4765 				mutex_enter(&un->un_resync_mx);
   4766 				/*
   4767 				 * Update our resync bitmap to reflect that
   4768 				 * another node has synchronized this range.
   4769 				 */
   4770 				for (rr = rr_start; rr <= rr_end; rr++) {
   4771 					CLR_KEEPDIRTY(rr, un);
   4772 				}
   4773 				mutex_exit(&un->un_resync_mx);
   4774 			}
   4775 
   4776 			/*
   4777 			 * On all but the originating node, first update
   4778 			 * the resync state, then unblock the previous
   4779 			 * region and block the next one. No need
   4780 			 * to do this if the region is already blocked.
   4781 			 * Update the submirror state and flags from the
   4782 			 * originator. This keeps the cluster in sync with
   4783 			 * regards to the resync status.
   4784 			 */
   4785 
   4786 			(void) md_ioctl_writerlock(lockp, ui);
   4787 			un->un_rs_resync_done = p->rs_done;
   4788 			un->un_rs_resync_2_do = p->rs_2_do;
   4789 			un->un_rs_type = p->rs_type;
   4790 			un->un_resync_startbl = p->rs_start;
   4791 			md_ioctl_writerexit(lockp);
   4792 			/*
   4793 			 * Use un_owner_mx to ensure that an ownership change
   4794 			 * cannot happen at the same time as this message
   4795 			 */
   4796 			mutex_enter(&un->un_owner_mx);
   4797 			if (MD_MN_MIRROR_OWNER(un)) {
   4798 				ps->ps_firstblk = p->rs_start;
   4799 				ps->ps_lastblk = ps->ps_firstblk +
   4800 				    p->rs_size - 1;
   4801 			} else {
   4802 				if ((ps->ps_firstblk != p->rs_start) ||
   4803 				    (ps->ps_lastblk != p->rs_start +
   4804 				    p->rs_size - 1)) {
   4805 					/* Remove previous overlap range */
   4806 					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
   4807 						mirror_overlap_tree_remove(ps);
   4808 
   4809 					ps->ps_firstblk = p->rs_start;
   4810 					ps->ps_lastblk = ps->ps_firstblk +
   4811 					    p->rs_size - 1;
   4812 
   4813 					mutex_exit(&un->un_owner_mx);
   4814 					/* Block this range from all i/o. */
   4815 					if (ps->ps_firstblk != 0 ||
   4816 					    ps->ps_lastblk != 0)
   4817 						wait_for_overlaps(ps,
   4818 						    MD_OVERLAP_ALLOW_REPEAT);
   4819 					mutex_enter(&un->un_owner_mx);
   4820 					/*
   4821 					 * Check to see if we have obtained
   4822 					 * ownership while waiting for
   4823 					 * overlaps. If we have, remove
   4824 					 * the resync_region entry from the
   4825 					 * overlap tree
   4826 					 */
   4827 					if (MD_MN_MIRROR_OWNER(un) &&
   4828 					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
   4829 						mirror_overlap_tree_remove(ps);
   4830 				}
   4831 			}
   4832 			mutex_exit(&un->un_owner_mx);
   4833 
   4834 			/*
   4835 			 * If this is the first RESYNC_NEXT message (i.e.
   4836 			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
   4837 			 * issue RESYNC_START NOTIFY event
   4838 			 */
   4839 			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
   4840 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
   4841 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
   4842 				    MD_SID(un));
   4843 			}
   4844 
   4845 			/* Ensure that our local resync thread is running */
   4846 			if (un->un_rs_thread == NULL) {
   4847 				(void) mirror_resync_unit(p->mnum, NULL,
   4848 				    &p->mde, lockp);
   4849 			}
   4850 		}
   4851 
   4852 		break;
   4853 	case MD_MN_MSG_RESYNC_FINISH:
   4854 		/*
   4855 		 * Complete the resync by stopping the resync thread.
   4856 		 * Also release the previous overlap region field.
   4857 		 * Update the resync_progress_thread by cv_signal'ing it so
   4858 		 * that we mark the end of the resync as soon as possible. This
   4859 		 * stops an unnecessary delay should be panic after resync
   4860 		 * completion.
   4861 		 */
   4862 #ifdef DEBUG
   4863 		if (!rs_active) {
   4864 			if (mirror_debug_flag)
   4865 				printf("RESYNC_FINISH (mnum = %x), "
   4866 				    "Resync *NOT* active",
   4867 				    p->mnum);
   4868 		}
   4869 #endif
   4870 
   4871 		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
   4872 		    (p->rs_originator != md_mn_mynode_id)) {
   4873 			mutex_enter(&un->un_rs_thread_mx);
   4874 			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
   4875 			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
   4876 			un->un_rs_thread_flags &=
   4877 			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
   4878 			cv_signal(&un->un_rs_thread_cv);
   4879 			mutex_exit(&un->un_rs_thread_mx);
   4880 		}
   4881 		if (is_ABR) {
   4882 			/* Resync finished, if ABR set owner to NULL */
   4883 			mutex_enter(&un->un_owner_mx);
   4884 			un->un_mirror_owner = 0;
   4885 			mutex_exit(&un->un_owner_mx);
   4886 		}
   4887 		(void) md_ioctl_writerlock(lockp, ui);
   4888 		ps = un->un_rs_prev_overlap;
   4889 		if (ps != NULL) {
   4890 			/* Remove previous overlap range */
   4891 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
   4892 				mirror_overlap_tree_remove(ps);
   4893 			/*
   4894 			 * Release the overlap range reference
   4895 			 */
   4896 			un->un_rs_prev_overlap = NULL;
   4897 			kmem_cache_free(mirror_parent_cache,
   4898 			    ps);
   4899 		}
   4900 		md_ioctl_writerexit(lockp);
   4901 
   4902 		/* Mark the resync as complete in the metadb */
   4903 		un->un_rs_resync_done = p->rs_done;
   4904 		un->un_rs_resync_2_do = p->rs_2_do;
   4905 		un->un_rs_type = p->rs_type;
   4906 		mutex_enter(&un->un_rs_progress_mx);
   4907 		cv_signal(&un->un_rs_progress_cv);
   4908 		mutex_exit(&un->un_rs_progress_mx);
   4909 
   4910 		un = md_ioctl_writerlock(lockp, ui);
   4911 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
   4912 		/* Deal with any pending grow_unit */
   4913 		if (un->c.un_status & MD_UN_GROW_PENDING) {
   4914 			if ((mirror_grow_unit(un, &mde) != 0) ||
   4915 			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
   4916 				un->c.un_status &= ~MD_UN_GROW_PENDING;
   4917 			}
   4918 		}
   4919 		md_ioctl_writerexit(lockp);
   4920 		break;
   4921 
   4922 	case MD_MN_MSG_RESYNC_PHASE_DONE:
   4923 		/*
   4924 		 * A phase of the resync, optimized. component or
   4925 		 * submirror is complete. Update mirror status.
   4926 		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
   4927 		 * mirror owner is peforming a resync. If we have just snarfed
   4928 		 * this set, then we must clear any of the flags set at snarf
   4929 		 * time by unit_setup_resync().
   4930 		 * Note that unit_setup_resync() sets up these flags to
   4931 		 * indicate that an optimized resync is required. These flags
   4932 		 * need to be reset because if we get here,  the mirror owner
   4933 		 * will have handled the optimized resync.
   4934 		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
   4935 		 * MD_UN_WAR. In addition, for each submirror,
   4936 		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
   4937 		 * set to SMS_OFFLINE.
   4938 		 */
   4939 #ifdef DEBUG
   4940 		if (mirror_debug_flag)
   4941 			printf("phase done mess received from %d, mnum=%x,"
   4942 			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
   4943 			    p->rs_type, p->rs_flags);
   4944 #endif
   4945 		/*
   4946 		 * Ignore the message if there is no active resync thread.
   4947 		 */
   4948 		if (!rs_active)
   4949 			break;
   4950 
   4951 		broke_out = p->rs_flags & MD_MN_RS_ERR;
   4952 		switch (RS_TYPE(p->rs_type)) {
   4953 		case MD_RS_OPTIMIZED:
   4954 			un = md_ioctl_writerlock(lockp, ui);
   4955 			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
   4956 				/* If we are originator, just clear rs_type */
   4957 				if (p->rs_originator == md_mn_mynode_id) {
   4958 					SET_RS_TYPE_NONE(un->un_rs_type);
   4959 					md_ioctl_writerexit(lockp);
   4960 					break;
   4961 				}
   4962 				/*
   4963 				 * If CLEAR_OPT_NOT_DONE is set, only clear the
   4964 				 * flags if OPT_NOT_DONE is set *and* rs_type
   4965 				 * is MD_RS_NONE.
   4966 				 */
   4967 				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
   4968 				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
   4969 					/* No resync in progress */
   4970 					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
   4971 					un->c.un_status &= ~MD_UN_WAR;
   4972 				} else {
   4973 					/*
   4974 					 * We are in the middle of an
   4975 					 * optimized resync and this message
   4976 					 * should be ignored.
   4977 					 */
   4978 					md_ioctl_writerexit(lockp);
   4979 					break;
   4980 				}
   4981 			} else {
   4982 				/*
   4983 				 * This is the end of an optimized resync,
   4984 				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
   4985 				 */
   4986 
   4987 				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
   4988 				if (!broke_out)
   4989 					un->c.un_status &= ~MD_UN_WAR;
   4990 
   4991 				/*
   4992 				 * Clear our un_resync_bm for the regions
   4993 				 * completed.  The owner (originator) will
   4994 				 * take care of itself.
   4995 				 */
   4996 				if (p->rs_originator != md_mn_mynode_id &&
   4997 				    (ps = un->un_rs_prev_overlap) != NULL) {
   4998 					BLK_TO_RR(rr_start, ps->ps_firstblk,
   4999 					    un);
   5000 					BLK_TO_RR(rr_end, ps->ps_lastblk, un);
   5001 					mutex_enter(&un->un_resync_mx);
   5002 					for (rr = rr_start; rr <= rr_end;
   5003 					    rr++) {
   5004 						CLR_KEEPDIRTY(rr, un);
   5005 					}
   5006 					mutex_exit(&un->un_resync_mx);
   5007 				}
   5008 			}
   5009 
   5010 			/*
   5011 			 * Set resync_completed to last resync type and then
   5012 			 * clear resync_type to indicate no resync in progress
   5013 			 */
   5014 			un->un_resync_completed = un->un_rs_type;
   5015 			SET_RS_TYPE_NONE(un->un_rs_type);
   5016 
   5017 			/*
   5018 			 * If resync is as a result of a submirror ONLINE,
   5019 			 * reset the submirror state to SMS_RUNNING if the
   5020 			 * resync was ok else set back to SMS_OFFLINE.
   5021 			 */
   5022 			for (smi = 0; smi < NMIRROR; smi++) {
   5023 				un->un_sm[smi].sm_flags &=
   5024 				    ~MD_SM_RESYNC_TARGET;
   5025 				if (SMS_BY_INDEX_IS(un, smi,
   5026 				    SMS_OFFLINE_RESYNC)) {
   5027 					if (p->rs_flags &
   5028 					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
   5029 						state = SMS_OFFLINE;
   5030 					} else {
   5031 						state = (broke_out ?
   5032 						    SMS_OFFLINE : SMS_RUNNING);
   5033 					}
   5034 					mirror_set_sm_state(
   5035 					    &un->un_sm[smi],
   5036 					    &un->un_smic[smi], state,
   5037 					    broke_out);
   5038 					mirror_commit(un, NO_SUBMIRRORS,
   5039 					    0);
   5040 				}
   5041 				/*
   5042 				 * If we still have an offline submirror, reset
   5043 				 * the OFFLINE_SM flag in the mirror status
   5044 				 */
   5045 				if (SMS_BY_INDEX_IS(un, smi,
   5046 				    SMS_OFFLINE))
   5047 					un->c.un_status |=
   5048 					    MD_UN_OFFLINE_SM;
   5049 			}
   5050 			md_ioctl_writerexit(lockp);
   5051 			break;
   5052 		case MD_RS_SUBMIRROR:
   5053 			un = md_ioctl_writerlock(lockp, ui);
   5054 			smi = RS_SMI(p->rs_type);
   5055 			sm = &un->un_sm[smi];
   5056 			smic = &un->un_smic[smi];
   5057 			/* Clear RESYNC target */
   5058 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
   5059 			/*
   5060 			 * Set resync_completed to last resync type and then
   5061 			 * clear resync_type to indicate no resync in progress
   5062 			 */
   5063 			un->un_resync_completed = un->un_rs_type;
   5064 			SET_RS_TYPE_NONE(un->un_rs_type);
   5065 			/*
   5066 			 * If the resync completed ok reset the submirror
   5067 			 * state to SMS_RUNNING else reset it to SMS_ATTACHED
   5068 			 */
   5069 			state = (broke_out ?
   5070 			    SMS_ATTACHED : SMS_RUNNING);
   5071 			mirror_set_sm_state(sm, smic, state, broke_out);
   5072 			un->c.un_status &= ~MD_UN_WAR;
   5073 			mirror_commit(un, SMI2BIT(smi), 0);
   5074 			md_ioctl_writerexit(lockp);
   5075 			break;
   5076 		case MD_RS_COMPONENT:
   5077 			un = md_ioctl_writerlock(lockp, ui);
   5078 			smi = RS_SMI(p->rs_type);
   5079 			ci = RS_CI(p->rs_type);
   5080 			sm = &un->un_sm[smi];
   5081 			smic = &un->un_smic[smi];
   5082 			shared = (md_m_shared_t *)
   5083 			    (*(smic->sm_shared_by_indx))
   5084 			    (sm->sm_dev, sm, ci);
   5085 			un->c.un_status &= ~MD_UN_WAR;
   5086 			/* Clear RESYNC target */
   5087 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
   5088 			/*
   5089 			 * Set resync_completed to last resync type and then
   5090 			 * clear resync_type to indicate no resync in progress
   5091 			 */
   5092 			un->un_resync_completed = un->un_rs_type;
   5093 			SET_RS_TYPE_NONE(un->un_rs_type);
   5094 
   5095 			/*
   5096 			 * If the resync completed ok, set the component state
   5097 			 * to CS_OKAY.
   5098 			 */
   5099 			if (broke_out)
   5100 				shared->ms_flags |= MDM_S_RS_TRIED;
   5101 			else {
   5102 				/*
   5103 				 * As we don't transmit the changes,
   5104 				 * no need to drop the lock.
   5105 				 */
   5106 				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
   5107 				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
   5108 			}
   5109 			md_ioctl_writerexit(lockp);
   5110 		default:
   5111 			break;
   5112 		}
   5113 		/*
   5114 		 * If the purpose of this PHASE_DONE message is just to
   5115 		 * indicate to all other nodes that the optimized resync
   5116 		 * required (OPT_NOT_DONE) flag is to be cleared, there is
   5117 		 * no need to generate a notify event as there has not
   5118 		 * actually been a resync.
   5119 		 */
   5120 		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
   5121 			if (broke_out) {
   5122 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
   5123 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
   5124 				    MD_SID(un));
   5125 			} else {
   5126 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
   5127 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
   5128 				    MD_SID(un));
   5129 			}
   5130 		}
   5131 		break;
   5132 
   5133 	default:
   5134 #ifdef DEBUG
   5135 		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
   5136 		    " %x\n", p->msg_type);
   5137 #endif
   5138 		return (EINVAL);
   5139 	}
   5140 	return (0);
   5141 }
   5142 
   5143 /* Return a -1 if snarf of optimized record failed and set should be released */
   5144 static int
   5145 mirror_snarf(md_snarfcmd_t cmd, set_t setno)
   5146 {
   5147 	mddb_recid_t	recid;
   5148 	int		gotsomething;
   5149 	int		all_mirrors_gotten;
   5150 	mm_unit_t	*un;
   5151 	mddb_type_t	typ1;
   5152 	mddb_de_ic_t    *dep;
   5153 	mddb_rb32_t	*rbp;
   5154 	size_t		newreqsize;
   5155 	mm_unit_t	*big_un;
   5156 	mm_unit32_od_t	*small_un;
   5157 	int		retval;
   5158 	mdi_unit_t	*ui;
   5159 
   5160 	if (cmd == MD_SNARF_CLEANUP) {
   5161 		if (md_get_setstatus(setno) & MD_SET_STALE)
   5162 			return (0);
   5163 
   5164 		recid = mddb_makerecid(setno, 0);
   5165 		typ1 = (mddb_type_t)md_getshared_key(setno,
   5166 		    mirror_md_ops.md_driver.md_drivername);
   5167 		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
   5168 			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
   5169 				un = (mm_unit_t *)mddb_getrecaddr(recid);
   5170 				mirror_cleanup(un);
   5171 				recid = mddb_makerecid(setno, 0);
   5172 			}
   5173 		}
   5174 		return (0);
   5175 	}
   5176 
   5177 	all_mirrors_gotten = 1;
   5178 	gotsomething = 0;
   5179 
   5180 	recid = mddb_makerecid(setno, 0);
   5181 	typ1 = (mddb_type_t)md_getshared_key(setno,
   5182 	    mirror_md_ops.md_driver.md_drivername);
   5183 
   5184 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
   5185 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
   5186 			continue;
   5187 
   5188 		dep = mddb_getrecdep(recid);
   5189 		dep->de_flags = MDDB_F_MIRROR;
   5190 		rbp = dep->de_rb;
   5191 
   5192 		switch (rbp->rb_revision) {
   5193 		case MDDB_REV_RB:
   5194 		case MDDB_REV_RBFN:
   5195 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
   5196 				/*
   5197 				 * This means, we have an old and small
   5198 				 * record and this record hasn't already
   5199 				 * been converted.  Before we create an
   5200 				 * incore metadevice from this we have to
   5201 				 * convert it to a big record.
   5202 				 */
   5203 				small_un =
   5204 				    (mm_unit32_od_t *)mddb_getrecaddr(recid);
   5205 				newreqsize = sizeof (mm_unit_t);
   5206 				big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
   5207 				    KM_SLEEP);
   5208 				mirror_convert((caddr_t)small_un,
   5209 				    (caddr_t)big_un, SMALL_2_BIG);
   5210 				kmem_free(small_un, dep->de_reqsize);
   5211 
   5212 				/*
   5213 				 * Update userdata and incore userdata
   5214 				 * incores are at the end of un
   5215 				 */
   5216 				dep->de_rb_userdata_ic = big_un;
   5217 				dep->de_rb_userdata = big_un;
   5218 				dep->de_icreqsize = newreqsize;
   5219 				un = big_un;
   5220 				rbp->rb_private |= MD_PRV_CONVD;
   5221 			} else {
   5222 				/*
   5223 				 * Unit already converted, just get the
   5224 				 * record address.
   5225 				 */
   5226 				un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
   5227 				    sizeof (*un), 0);
   5228 			}
   5229 			un->c.un_revision &= ~MD_64BIT_META_DEV;
   5230 			break;
   5231 		case MDDB_REV_RB64:
   5232 		case MDDB_REV_RB64FN:
   5233 			/* Big device */
   5234 			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
   5235 			    sizeof (*un), 0);
   5236 			un->c.un_revision |= MD_64BIT_META_DEV;
   5237 			un->c.un_flag |= MD_EFILABEL;
   5238 			break;
   5239 		}
   5240 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
   5241 
   5242 		/*
   5243 		 * Create minor device node for snarfed entry.
   5244 		 */
   5245 		(void) md_create_minor_node(setno, MD_SID(un));
   5246 
   5247 		if (MD_UNIT(MD_SID(un)) != NULL) {
   5248 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
   5249 			continue;
   5250 		}
   5251 		all_mirrors_gotten = 0;
   5252 		retval = mirror_build_incore(un, 1);
   5253 		if (retval == 0) {
   5254 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
   5255 			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
   5256 			resync_start_timeout(setno);
   5257 			gotsomething = 1;
   5258 		} else {
   5259 			return (retval);
   5260 		}
   5261 		/*
   5262 		 * Set flag to indicate that the mirror has not yet
   5263 		 * been through a reconfig. This flag is used for MN sets
   5264 		 * when determining whether to update the mirror state from
   5265 		 * the Master node.
   5266 		 */
   5267 		if (MD_MNSET_SETNO(setno)) {
   5268 			ui = MDI_UNIT(MD_SID(un));
   5269 			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
   5270 		}
   5271 	}
   5272 
   5273 	if (!all_mirrors_gotten)
   5274 		return (gotsomething);
   5275 
   5276 	recid = mddb_makerecid(setno, 0);
   5277 	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
   5278 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
   5279 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
   5280 
   5281 	return (0);
   5282 }
   5283 
   5284 static int
   5285 mirror_halt(md_haltcmd_t cmd, set_t setno)
   5286 {
   5287 	unit_t		i;
   5288 	mdi_unit_t	*ui;
   5289 	minor_t		mnum;
   5290 	int		reset_mirror_flag = 0;
   5291 
   5292 	if (cmd == MD_HALT_CLOSE)
   5293 		return (0);
   5294 
   5295 	if (cmd == MD_HALT_OPEN)
   5296 		return (0);
   5297 
   5298 	if (cmd == MD_HALT_UNLOAD)
   5299 		return (0);
   5300 
   5301 	if (cmd == MD_HALT_CHECK) {
   5302 		for (i = 0; i < md_nunits; i++) {
   5303 			mnum = MD_MKMIN(setno, i);
   5304 			if ((ui = MDI_UNIT(mnum)) == NULL)
   5305 				continue;
   5306 			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
   5307 				continue;
   5308 			if (md_unit_isopen(ui))
   5309 				return (1);
   5310 		}
   5311 		return (0);
   5312 	}
   5313 
   5314 	if (cmd != MD_HALT_DOIT)
   5315 		return (1);
   5316 
   5317 	for (i = 0; i < md_nunits; i++) {
   5318 		mnum = MD_MKMIN(setno, i);
   5319 		if ((ui = MDI_UNIT(mnum)) == NULL)
   5320 			continue;
   5321 		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
   5322 			continue;
   5323 		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
   5324 
   5325 		/* Set a flag if there is at least one mirror metadevice. */
   5326 		reset_mirror_flag = 1;
   5327 	}
   5328 
   5329 	/*
   5330 	 * Only wait for the global dr_timeout to finish
   5331 	 *  - if there are mirror metadevices in this diskset or
   5332 	 *  - if this is the local set since an unload of the md_mirror
   5333 	 *    driver could follow a successful mirror halt in the local set.
   5334 	 */
   5335 	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
   5336 		while ((mirror_md_ops.md_head == NULL) &&
   5337 		    (mirror_timeout.dr_timeout_id != 0))
   5338 			delay(md_hz);
   5339 	}
   5340 
   5341 	return (0);
   5342 }
   5343 
   5344 /*ARGSUSED3*/
   5345 static int
   5346 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
   5347 {
   5348 	IOLOCK	lock;
   5349 	minor_t		mnum = getminor(*dev);
   5350 	set_t		setno;
   5351 
   5352 	/*
   5353 	 * When doing an open of a multi owner metadevice, check to see if this
   5354 	 * node is a starting node and if a reconfig cycle is underway.
   5355 	 * If so, the system isn't sufficiently set up enough to handle the
   5356 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
   5357 	 */
   5358 	setno = MD_MIN2SET(mnum);
   5359 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
   5360 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
   5361 			return (ENXIO);
   5362 	}
   5363 
   5364 	if (md_oflags & MD_OFLG_FROMIOCTL) {
   5365 		/*
   5366 		 * This indicates that the caller is an ioctl service routine.
   5367 		 * In this case we initialise our stack-based IOLOCK and pass
   5368 		 * this into the internal open routine. This allows multi-owner
   5369 		 * metadevices to avoid deadlocking if an error is encountered
   5370 		 * during the open() attempt. The failure case is:
   5371 		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
   5372 		 * this configuration would deadlock as the mirror code has to
   5373 		 * send a state-update to the other nodes when it detects the
   5374 		 * failure of the underlying submirror with an errored soft-part
   5375 		 * on it. As there is a class1 message in progress (metaclear)
   5376 		 * set_sm_comp_state() cannot send another class1 message;
   5377 		 * instead we do not send a state_update message as the
   5378 		 * metaclear is distributed and the failed submirror will be
   5379 		 * cleared from the configuration by the metaclear.
   5380 		 */
   5381 		IOLOCK_INIT(&lock);
   5382 		return (mirror_internal_open(getminor(*dev), flag, otyp,
   5383 		    md_oflags, &lock));
   5384 	} else {
   5385 		return (mirror_internal_open(getminor(*dev), flag, otyp,
   5386 		    md_oflags, (IOLOCK *)NULL));
   5387 	}
   5388 }
   5389 
   5390 
   5391 /*ARGSUSED1*/
   5392 static int
   5393 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
   5394 {
   5395 	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
   5396 	    (IOLOCK *)NULL));
   5397 }
   5398 
   5399 
   5400 /*
   5401  * This routine dumps memory to the disk.  It assumes that the memory has
   5402  * already been mapped into mainbus space.  It is called at disk interrupt
   5403  * priority when the system is in trouble.
   5404  *
   5405  */
   5406 static int
   5407 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
   5408 {
   5409 	mm_unit_t	*un;
   5410 	dev_t		mapdev;
   5411 	int		result;
   5412 	int		smi;
   5413 	int		any_succeed = 0;
   5414 	int		save_result = 0;
   5415 
   5416 	/*
   5417 	 * Don't need to grab the unit lock.
   5418 	 * Cause nothing else is suppose to be happenning.
   5419 	 * Also dump is not suppose to sleep.
   5420 	 */
   5421 	un = (mm_unit_t *)MD_UNIT(getminor(dev));
   5422 
   5423 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
   5424 		return (EINVAL);
   5425 
   5426 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
   5427 		return (EINVAL);
   5428 
   5429 	for (smi = 0; smi < NMIRROR; smi++) {
   5430 		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
   5431 			continue;
   5432 		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
   5433 		result = bdev_dump(mapdev, addr, blkno, nblk);
   5434 		if (result)
   5435 			save_result = result;
   5436 
   5437 		if (result == 0)
   5438 			any_succeed++;
   5439 	}
   5440 
   5441 	if (any_succeed)
   5442 		return (0);
   5443 
   5444 	return (save_result);
   5445 }
   5446 
   5447 /*
   5448  * NAME: mirror_probe_dev
   5449  *
   5450  * DESCRITPION: force opens every component of a mirror.
   5451  *
   5452  * On entry the unit writerlock is held
   5453  */
   5454 static int
   5455 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
   5456 {
   5457 	int		i;
   5458 	int		smi;
   5459 	int		ci;
   5460 	mm_unit_t	*un;
   5461 	int		md_devopen = 0;
   5462 	set_t		setno;
   5463 	int		sm_cnt;
   5464 	int		sm_unavail_cnt;
   5465 
   5466 	if (md_unit_isopen(ui))
   5467 		md_devopen++;
   5468 
   5469 	un = MD_UNIT(mnum);
   5470 	setno = MD_UN2SET(un);
   5471 
   5472 	sm_cnt = 0;
   5473 	sm_unavail_cnt = 0;
   5474 	for (i = 0; i < NMIRROR; i++) {
   5475 		md_dev64_t tmpdev;
   5476 		mdi_unit_t	*sm_ui;
   5477 
   5478 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
   5479 			continue;
   5480 		}
   5481 
   5482 		sm_cnt++;
   5483 		tmpdev = un->un_sm[i].sm_dev;
   5484 		(void) md_layered_open(mnum, &tmpdev,
   5485 		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
   5486 		un->un_sm[i].sm_dev = tmpdev;
   5487 
   5488 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
   5489 
   5490 		/*
   5491 		 * Logic similar to that in mirror_open_all_devs.  We set or
   5492 		 * clear the submirror Unavailable bit.
   5493 		 */
   5494 		(void) md_unit_writerlock(sm_ui);
   5495 		if (submirror_unavailable(un, i, 1)) {
   5496 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
   5497 			sm_unavail_cnt++;
   5498 		} else {
   5499 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
   5500 		}
   5501 		md_unit_writerexit(sm_ui);
   5502 	}
   5503 
   5504 	/*
   5505 	 * If all of the submirrors are unavailable, the mirror is also
   5506 	 * unavailable.
   5507 	 */
   5508 	if (sm_cnt == sm_unavail_cnt) {
   5509 		ui->ui_tstate |= MD_INACCESSIBLE;
   5510 	} else {
   5511 		ui->ui_tstate &= ~MD_INACCESSIBLE;
   5512 	}
   5513 
   5514 	/*
   5515 	 * Start checking from probe failures. If failures occur we
   5516 	 * set the appropriate erred state only if the metadevice is in
   5517 	 * use. This is specifically to prevent unnecessary resyncs.
   5518 	 * For instance if the disks were accidentally disconnected when
   5519 	 * the system booted up then until the metadevice is accessed
   5520 	 * (like file system mount) the user can shutdown, recable and
   5521 	 * reboot w/o incurring a potentially huge resync.
   5522 	 */
   5523 
   5524 	smi = 0;
   5525 	ci = 0;
   5526 	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
   5527 
   5528 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
   5529 			/*
   5530 			 * Note that for a MN set, there is no need to call
   5531 			 * SE_NOTIFY as that is done when processing the
   5532 			 * state change
   5533 			 */
   5534 			if (md_devopen) {
   5535 				/*
   5536 				 * Never called from ioctl context,
   5537 				 * so (IOLOCK *)NULL
   5538 				 */
   5539 				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
   5540 				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
   5541 				if (!MD_MNSET_SETNO(setno)) {
   5542 					SE_NOTIFY(EC_SVM_STATE,
   5543 					    ESC_SVM_LASTERRED,
   5544 					    SVM_TAG_METADEVICE, setno,
   5545 					    MD_SID(un));
   5546 				}
   5547 				continue;
   5548 			} else {
   5549 				(void) mirror_close_all_devs(un,
   5550 				    MD_OFLG_PROBEDEV);
   5551 				if (!MD_MNSET_SETNO(setno)) {
   5552 					SE_NOTIFY(EC_SVM_STATE,
   5553 					    ESC_SVM_OPEN_FAIL,
   5554 					    SVM_TAG_METADEVICE, setno,
   5555 					    MD_SID(un));
   5556 				}
   5557 				mirror_openfail_console_info(un, smi, ci);
   5558 				return (ENXIO);
   5559 			}
   5560 		}
   5561 
   5562 		/*
   5563 		 * Note that for a MN set, there is no need to call
   5564 		 * SE_NOTIFY as that is done when processing the
   5565 		 * state change
   5566 		 */
   5567 		if (md_devopen) {
   5568 			/* Never called from ioctl context, so (IOLOCK *)NULL */
   5569 			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
   5570 			    MD_STATE_XMIT, (IOLOCK *)NULL);
   5571 			if (!MD_MNSET_SETNO(setno)) {
   5572 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
   5573 				    SVM_TAG_METADEVICE, setno,
   5574 				    MD_SID(un));
   5575 			}
   5576 		}
   5577 		mirror_openfail_console_info(un, smi, ci);
   5578 		ci++;
   5579 	}
   5580 
   5581 	if (MD_MNSET_SETNO(setno)) {
   5582 		send_poke_hotspares(setno);
   5583 	} else {
   5584 		(void) poke_hotspares();
   5585 	}
   5586 	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
   5587 
   5588 	return (0);
   5589 }
   5590 
   5591 
   5592 static int
   5593 mirror_imp_set(
   5594 	set_t	setno
   5595 )
   5596 {
   5597 
   5598 	mddb_recid_t	recid;
   5599 	int		gotsomething, i;
   5600 	mddb_type_t	typ1;
   5601 	mddb_de_ic_t	*dep;
   5602 	mddb_rb32_t	*rbp;
   5603 	mm_unit32_od_t	*un32;
   5604 	mm_unit_t	*un64;
   5605 	md_dev64_t	self_devt;
   5606 	minor_t		*self_id;	/* minor needs to be updated */
   5607 	md_parent_t	*parent_id;	/* parent needs to be updated */
   5608 	mddb_recid_t	*record_id;	/* record id needs to be updated */
   5609 	mddb_recid_t	*optrec_id;
   5610 	md_dev64_t	tmpdev;
   5611 
   5612 
   5613 	gotsomething = 0;
   5614 
   5615 	typ1 = (mddb_type_t)md_getshared_key(setno,
   5616 	    mirror_md_ops.md_driver.md_drivername);
   5617 	recid = mddb_makerecid(setno, 0);
   5618 
   5619 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
   5620 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
   5621 			continue;
   5622 
   5623 		dep = mddb_getrecdep(recid);
   5624 		rbp = dep->de_rb;
   5625 
   5626 		switch (rbp->rb_revision) {
   5627 		case MDDB_REV_RB:
   5628 		case MDDB_REV_RBFN:
   5629 			/*
   5630 			 * Small device
   5631 			 */
   5632 			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
   5633 			self_id = &(un32->c.un_self_id);
   5634 			parent_id = &(un32->c.un_parent);
   5635 			record_id = &(un32->c.un_record_id);
   5636 			optrec_id = &(un32->un_rr_dirty_recid);
   5637 
   5638 			for (i = 0; i < un32->un_nsm; i++) {
   5639 				tmpdev = md_expldev(un32->un_sm[i].sm_dev);
   5640 				un32->un_sm[i].sm_dev = md_cmpldev
   5641 				    (md_makedevice(md_major, MD_MKMIN(setno,
   5642 				    MD_MIN2UNIT(md_getminor(tmpdev)))));
   5643 
   5644 				if (!md_update_minor(setno, mddb_getsidenum
   5645 				    (setno), un32->un_sm[i].sm_key))
   5646 				goto out;
   5647 			}
   5648 			break;
   5649 		case MDDB_REV_RB64:
   5650 		case MDDB_REV_RB64FN:
   5651 			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
   5652 			self_id = &(un64->c.un_self_id);
   5653 			parent_id = &(un64->c.un_parent);
   5654 			record_id = &(un64->c.un_record_id);
   5655 			optrec_id = &(un64->un_rr_dirty_recid);
   5656 
   5657 			for (i = 0; i < un64->un_nsm; i++) {
   5658 				tmpdev = un64->un_sm[i].sm_dev;
   5659 				un64->un_sm[i].sm_dev = md_makedevice
   5660 				    (md_major, MD_MKMIN(setno, MD_MIN2UNIT
   5661 				    (md_getminor(tmpdev))));
   5662 
   5663 				if (!md_update_minor(setno, mddb_getsidenum
   5664 				    (setno), un64->un_sm[i].sm_key))
   5665 				goto out;
   5666 			}
   5667 			break;
   5668 		}
   5669 
   5670 		/*
   5671 		 * If this is a top level and a friendly name metadevice,
   5672 		 * update its minor in the namespace.
   5673 		 */
   5674 		if ((*parent_id == MD_NO_PARENT) &&
   5675 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
   5676 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
   5677 
   5678 			self_devt = md_makedevice(md_major, *self_id);
   5679 			if (!md_update_top_device_minor(setno,
   5680 			    mddb_getsidenum(setno), self_devt))
   5681 				goto out;
   5682 		}
   5683 
   5684 		/*
   5685 		 * Update unit with the imported setno
   5686 		 *
   5687 		 */
   5688 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
   5689 
   5690 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
   5691 		if (*parent_id != MD_NO_PARENT)
   5692 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
   5693 		*record_id = MAKERECID(setno, DBID(*record_id));
   5694 		*optrec_id = MAKERECID(setno, DBID(*optrec_id));
   5695 
   5696 		gotsomething = 1;
   5697 	}
   5698 
   5699 out:
   5700 	return (gotsomething);
   5701 }
   5702 
   5703 /*
   5704  * NAME: mirror_check_offline
   5705  *
   5706  * DESCRIPTION: return offline_status = 1 if any submirrors are offline
   5707  *
   5708  * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
   5709  * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
   5710  * ioctl.
   5711  */
   5712 int
   5713 mirror_check_offline(md_dev64_t dev, int *offline_status)
   5714 {
   5715 	mm_unit_t		*un;
   5716 	md_error_t		mde = mdnullerror;
   5717 
   5718 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
   5719 		return (EINVAL);
   5720 	*offline_status = 0;
   5721 	if (un->c.un_status & MD_UN_OFFLINE_SM)
   5722 		*offline_status = 1;
   5723 	return (0);
   5724 }
   5725 
   5726 /*
   5727  * NAME: mirror_inc_abr_count
   5728  *
   5729  * DESCRIPTION: increment the count of layered soft parts with ABR set
   5730  *
   5731  * Called from ioctl, so access to un_abr_count is protected by the global
   5732  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
   5733  */
   5734 int
   5735 mirror_inc_abr_count(md_dev64_t dev)
   5736 {
   5737 	mm_unit_t		*un;
   5738 	md_error_t		mde = mdnullerror;
   5739 
   5740 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
   5741 		return (EINVAL);
   5742 	un->un_abr_count++;
   5743 	return (0);
   5744 }
   5745 
   5746 /*
   5747  * NAME: mirror_dec_abr_count
   5748  *
   5749  * DESCRIPTION: decrement the count of layered soft parts with ABR set
   5750  *
   5751  * Called from ioctl, so access to un_abr_count is protected by the global
   5752  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
   5753  */
   5754 int
   5755 mirror_dec_abr_count(md_dev64_t dev)
   5756 {
   5757 	mm_unit_t		*un;
   5758 	md_error_t		mde = mdnullerror;
   5759 
   5760 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
   5761 		return (EINVAL);
   5762 	un->un_abr_count--;
   5763 	return (0);
   5764 }
   5765 
   5766 static md_named_services_t mirror_named_services[] = {
   5767 	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
   5768 	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
   5769 	{mirror_rename_check,				MDRNM_CHECK	    },
   5770 	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
   5771 	{(intptr_t (*)()) mirror_exchange_parent_update_to,
   5772 			MDRNM_PARENT_UPDATE_TO},
   5773 	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
   5774 			MDRNM_SELF_UPDATE_FROM_DOWN },
   5775 	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
   5776 	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
   5777 	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
   5778 	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
   5779 	{ NULL,						0		    }
   5780 };
   5781 
/*
 * Operations vector of the mirror driver. The initializer is positional,
 * so the order of the entries must follow the member order of md_ops_t;
 * a NULL entry means the driver supplies no routine for that slot.
 */
md_ops_t mirror_md_ops = {
	mirror_open,		/* open */
	mirror_close,		/* close */
	md_mirror_strategy,	/* strategy */
	NULL,			/* print */
	mirror_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_mirror_ioctl,	/* ioctl */
	mirror_snarf,		/* snarf */
	mirror_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	mirror_imp_set,		/* import set */
	mirror_named_services	/* named services table */
};
   5798 
   5799 /* module specific initilization */
   5800 static void
   5801 init_init()
   5802 {
   5803 	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
   5804 
   5805 	/* Initialize the parent and child save memory pools */
   5806 	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
   5807 	    sizeof (md_mps_t), 0, mirror_parent_constructor,
   5808 	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
   5809 	    0);
   5810 
   5811 	mirror_child_cache = kmem_cache_create("md_mirror_child",
   5812 	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
   5813 	    mirror_child_constructor, mirror_child_destructor,
   5814 	    mirror_run_queue, NULL, NULL, 0);
   5815 
   5816 	/*
   5817 	 * Insure wowbuf_size is a multiple of DEV_BSIZE,
   5818 	 * then initialize wowbuf memory pool.
   5819 	 */
   5820 	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
   5821 	if (md_wowbuf_size <= 0)
   5822 		md_wowbuf_size = 2 * DEV_BSIZE;
   5823 	if (md_wowbuf_size > (32 * DEV_BSIZE))
   5824 		md_wowbuf_size = (32 * DEV_BSIZE);
   5825 
   5826 	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
   5827 	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
   5828 	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
   5829 
   5830 	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
   5831 	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
   5832 
   5833 	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
   5834 }
   5835 
   5836 /* module specific uninitilization (undo init_init()) */
   5837 static void
   5838 fini_uninit()
   5839 {
   5840 	kmem_cache_destroy(mirror_parent_cache);
   5841 	kmem_cache_destroy(mirror_child_cache);
   5842 	kmem_cache_destroy(mirror_wowblk_cache);
   5843 	mirror_parent_cache = mirror_child_cache =
   5844 	    mirror_wowblk_cache = NULL;
   5845 
   5846 	mutex_destroy(&mirror_timeout.dr_mx);
   5847 	mutex_destroy(&hotspare_request.dr_mx);
   5848 	mutex_destroy(&non_ff_drv_mutex);
   5849 }
   5850 
/*
 * Define the module linkage. The macro is given the module description
 * string and the calls to make for module specific setup (init_init())
 * and teardown (fini_uninit()).
 */
MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())
   5853