Home | History | Annotate | Download | only in md
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Driver for Virtual Disk.
     29  */
     30 #include <sys/param.h>
     31 #include <sys/systm.h>
     32 #include <sys/buf.h>
     33 #include <sys/conf.h>
     34 #include <sys/user.h>
     35 #include <sys/uio.h>
     36 #include <sys/proc.h>
     37 #include <sys/t_lock.h>
     38 #include <sys/dkio.h>
     39 #include <sys/kmem.h>
     40 #include <sys/debug.h>
     41 #include <sys/cmn_err.h>
     42 #include <sys/sysmacros.h>
     43 #include <sys/types.h>
     44 #include <sys/mkdev.h>
     45 #include <sys/vtoc.h>
     46 #include <sys/open.h>
     47 #include <sys/file.h>
     48 #include <vm/page.h>
     49 #include <sys/callb.h>
     50 #include <sys/disp.h>
     51 #include <sys/modctl.h>
     52 #include <sys/errno.h>
     53 #include <sys/door.h>
     54 #include <sys/lvm/mdmn_commd.h>
     55 #include <sys/lvm/md_hotspares.h>
     56 
     57 #include <sys/lvm/mdvar.h>
     58 #include <sys/lvm/md_names.h>
     59 
     60 #include <sys/ddi.h>
     61 #include <sys/proc.h>
     62 #include <sys/sunddi.h>
     63 #include <sys/esunddi.h>
     64 
     65 #include <sys/sysevent.h>
     66 #include <sys/sysevent/eventdefs.h>
     67 
     68 #include <sys/sysevent/svm.h>
     69 #include <sys/lvm/md_basic.h>
     70 
     71 
     72 /*
     73  * Machine specific Hertz is kept here
     74  */
     75 extern clock_t			md_hz;
     76 
     77 /*
     78  * Externs.
     79  */
     80 extern int			(*mdv_strategy_tstpnt)(buf_t *, int, void*);
     81 extern major_t			md_major;
     82 extern unit_t			md_nunits;
     83 extern set_t			md_nsets;
     84 extern md_set_t			md_set[];
     85 extern md_set_io_t		md_set_io[];
     86 extern md_ops_t			**md_ops;
     87 extern md_ops_t			*md_opslist;
     88 extern ddi_modhandle_t		*md_mods;
     89 extern dev_info_t		*md_devinfo;
     90 
     91 extern md_krwlock_t		md_unit_array_rw;
     92 extern kmutex_t			md_mx;
     93 extern kcondvar_t		md_cv;
     94 
     95 extern md_krwlock_t		hsp_rwlp;
     96 extern md_krwlock_t		ni_rwlp;
     97 
     98 extern int			md_num_daemons;
     99 extern int			md_status;
    100 extern int			md_ioctl_cnt;
    101 extern int			md_mtioctl_cnt;
    102 
    103 extern struct metatransops	metatransops;
    104 extern md_event_queue_t		*md_event_queue;
    105 extern md_resync_t		md_cpr_resync;
    106 extern int			md_done_daemon_threads;
    107 extern int			md_ff_daemon_threads;
    108 
    109 
    110 extern mddb_set_t	*mddb_setenter(set_t setno, int flag, int *errorcodep);
    111 extern void		mddb_setexit(mddb_set_t *s);
    112 extern void		*lookup_entry(struct nm_next_hdr *, set_t,
    113 				side_t, mdkey_t, md_dev64_t, int);
    114 extern struct nm_next_hdr	*get_first_record(set_t, int, int);
    115 extern dev_t		getrootdev(void);
    116 
    117 struct mdq_anchor	md_done_daemon; /* done request queue */
    118 struct mdq_anchor	md_mstr_daemon; /* mirror error, WOW requests */
    119 struct mdq_anchor	md_mhs_daemon;	/* mirror hotspare requests queue */
    120 struct mdq_anchor	md_hs_daemon;	/* raid hotspare requests queue */
    121 struct mdq_anchor	md_ff_daemonq;	/* failfast request queue */
    122 struct mdq_anchor	md_mirror_daemon; /* mirror owner queue */
    123 struct mdq_anchor	md_mirror_io_daemon; /* mirror owner i/o queue */
    124 struct mdq_anchor	md_mirror_rs_daemon; /* mirror resync done queue */
    125 struct mdq_anchor	md_sp_daemon;	/* soft-part error daemon queue */
    126 struct mdq_anchor	md_mto_daemon;	/* mirror timeout daemon queue */
    127 
/*
 * Per-queue daemon thread counts.  Each variable is referenced by address
 * from the md_daemon_queues table.  Note that md_mirror_daemon_threads is
 * shared by three table entries: md_mirror_daemon, md_mirror_io_daemon and
 * md_mirror_rs_daemon.
 */
int md_done_daemon_threads = 1;	/* threads for md_done_daemon requestq */
int md_mstr_daemon_threads = 1;	/* threads for md_mstr_daemon requestq */
int md_mhs_daemon_threads = 1;	/* threads for md_mhs_daemon requestq */
int md_hs_daemon_threads = 1;	/* threads for md_hs_daemon requestq */
int md_ff_daemon_threads = 3;	/* threads for md_ff_daemonq requestq */
int md_mirror_daemon_threads = 1; /* threads for the md_mirror_*daemon queues */
int md_sp_daemon_threads = 1;	/* threads for md_sp_daemon requestq */
int md_mto_daemon_threads = 1;	/* threads for md_mto_daemon requestq */
    136 
    137 #ifdef DEBUG
    138 /* Flag to switch on debug messages */
    139 int md_release_reacquire_debug = 0;	/* debug flag */
    140 #endif
    141 
/*
 *
 * md_daemon_queues is a table of pointers to request queues and to the
 * number of threads associated with each request queue.
 * When the number of threads is set to 1, the order of execution is
 * sequential.
 * The thread counts for all the queues are defined as global variables
 * to enable kernel tuning.
 *
 */
    152 
/*
 * Size of md_daemon_queues: ten request queues plus one all-zero entry.
 */
#define	MD_DAEMON_QUEUES 11

/*
 * Each entry pairs a request queue anchor with a pointer to the tunable
 * holding that queue's thread count.  The three mirror queues
 * (md_mirror_daemon, md_mirror_io_daemon, md_mirror_rs_daemon) all point at
 * md_mirror_daemon_threads.
 */
md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
	{&md_done_daemon, &md_done_daemon_threads},
	{&md_mstr_daemon, &md_mstr_daemon_threads},
	{&md_hs_daemon, &md_hs_daemon_threads},
	{&md_ff_daemonq, &md_ff_daemon_threads},
	{&md_mirror_daemon, &md_mirror_daemon_threads},
	{&md_mirror_io_daemon, &md_mirror_daemon_threads},
	{&md_mirror_rs_daemon, &md_mirror_daemon_threads},
	{&md_sp_daemon, &md_sp_daemon_threads},
	{&md_mhs_daemon, &md_mhs_daemon_threads},
	{&md_mto_daemon, &md_mto_daemon_threads},
	/* NOTE(review): presumably the end-of-table marker; confirm in users */
	{0, 0}
};
    168 
    169 /*
    170  * Number of times a message is retried before issuing a warning to the operator
    171  */
    172 #define	MD_MN_WARN_INTVL	10
    173 
    174 /*
    175  * Setting retry cnt to one (pre decremented) so that we actually do no
    176  * retries when committing/deleting a mddb rec. The underlying disk driver
    177  * does several retries to check if the disk is really dead or not so there
    178  * is no reason for us to retry on top of the drivers retries.
    179  */
    180 
    181 uint_t			md_retry_cnt = 1; /* global so it can be patched */
    182 
    183 /*
    184  * How many times to try to do the door_ki_upcall() in mdmn_ksend_message.
    185  * Again, made patchable here should it prove useful.
    186  */
    187 uint_t			md_send_retry_limit = 30;
    188 
    189 /*
    190  * Bug # 1212146
    191  * Before this change the user had to pass in a short aligned buffer because of
    192  * problems in some underlying device drivers.  This problem seems to have been
    193  * corrected in the underlying drivers so we will default to not requiring any
    194  * alignment.  If the user needs to check for a specific alignment,
    195  * md_uio_alignment_mask may be set in /etc/system to accomplish this.  To get
    196  * the behavior before this fix, the md_uio_alignment_mask would be set to 1,
    197  * to check for word alignment, it can be set to 3, for double word alignment,
    198  * it can be set to 7, etc.
    199  *
    200  * [Other part of fix is in function md_chk_uio()]
    201  */
    202 static int		md_uio_alignment_mask = 0;
    203 
    204 /*
    205  * for md_dev64_t translation
    206  */
    207 struct md_xlate_table		*md_tuple_table;
    208 struct md_xlate_major_table	*md_major_tuple_table;
    209 int				md_tuple_length;
    210 uint_t				md_majortab_len;
    211 
    212 /* Function declarations */
    213 
    214 static int md_create_probe_rqlist(md_probedev_impl_t *plist,
    215 			daemon_queue_t **hdr, intptr_t (*probe_test)());
    216 
    217 /*
    218  * manipulate global status
    219  */
    220 void
    221 md_set_status(int bits)
    222 {
    223 	mutex_enter(&md_mx);
    224 	md_status |= bits;
    225 	mutex_exit(&md_mx);
    226 }
    227 
    228 void
    229 md_clr_status(int bits)
    230 {
    231 	mutex_enter(&md_mx);
    232 	md_status &= ~bits;
    233 	mutex_exit(&md_mx);
    234 }
    235 
    236 int
    237 md_get_status()
    238 {
    239 	int result;
    240 	mutex_enter(&md_mx);
    241 	result = md_status;
    242 	mutex_exit(&md_mx);
    243 	return (result);
    244 }
    245 
    246 void
    247 md_set_setstatus(set_t setno, int bits)
    248 {
    249 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
    250 
    251 	mutex_enter(&md_mx);
    252 	md_set[setno].s_status |= bits;
    253 	mutex_exit(&md_mx);
    254 }
    255 
    256 void
    257 md_clr_setstatus(set_t setno, int bits)
    258 {
    259 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
    260 
    261 	mutex_enter(&md_mx);
    262 	md_set[setno].s_status &= ~bits;
    263 	mutex_exit(&md_mx);
    264 }
    265 
    266 uint_t
    267 md_get_setstatus(set_t setno)
    268 {
    269 	uint_t result;
    270 
    271 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
    272 
    273 	mutex_enter(&md_mx);
    274 	result = md_set[setno].s_status;
    275 	mutex_exit(&md_mx);
    276 	return (result);
    277 }
    278 
    279 /*
    280  * md_unit_readerlock_common:
    281  * -------------------------
    282  * Mark the given unit as having a reader reference. Spin waiting for any
    283  * writer references to be released.
    284  *
    285  * Input:
    286  *	ui		unit reference
    287  *	lock_held	0 => ui_mx needs to be grabbed
    288  *			1 => ui_mx already held
    289  * Output:
    290  *	mm_unit_t corresponding to unit structure
    291  *	ui->ui_readercnt incremented
    292  */
    293 static void *
    294 md_unit_readerlock_common(mdi_unit_t *ui, int lock_held)
    295 {
    296 	uint_t	flag = MD_UL_WRITER | MD_UL_WANABEWRITER;
    297 
    298 	if (!lock_held)
    299 		mutex_enter(&ui->ui_mx);
    300 	while (ui->ui_lock & flag) {
    301 		if (panicstr) {
    302 			if (ui->ui_lock & MD_UL_WRITER)
    303 				panic("md: writer lock is held");
    304 			break;
    305 		}
    306 		cv_wait(&ui->ui_cv, &ui->ui_mx);
    307 	}
    308 	ui->ui_readercnt++;
    309 	if (!lock_held)
    310 		mutex_exit(&ui->ui_mx);
    311 	return (MD_UNIT(ui->ui_link.ln_id));
    312 }
    313 
    314 void *
    315 md_unit_readerlock(mdi_unit_t *ui)
    316 {
    317 	return (md_unit_readerlock_common(ui, 0));
    318 }
    319 
    320 /*
    321  * md_unit_writerlock_common:
    322  * -------------------------
    323  * Acquire a unique writer reference. Causes previous readers to drain.
    324  * Spins if a writer reference already exists or if a previous reader/writer
    325  * dropped the lock to allow a ksend_message to be despatched.
    326  *
    327  * Input:
    328  *	ui		unit reference
    329  *	lock_held	0 => grab ui_mx
    330  *			1 => ui_mx already held on entry
    331  * Output:
    332  *	mm_unit_t reference
    333  */
    334 static void *
    335 md_unit_writerlock_common(mdi_unit_t *ui, int lock_held)
    336 {
    337 	uint_t	flag = MD_UL_WRITER;
    338 
    339 	if (panicstr)
    340 		panic("md: writer lock not allowed");
    341 
    342 	if (!lock_held)
    343 		mutex_enter(&ui->ui_mx);
    344 
    345 	while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) {
    346 		ui->ui_wanabecnt++;
    347 		ui->ui_lock |= MD_UL_WANABEWRITER;
    348 		cv_wait(&ui->ui_cv, &ui->ui_mx);
    349 		if (--ui->ui_wanabecnt == 0)
    350 			ui->ui_lock &= ~MD_UL_WANABEWRITER;
    351 	}
    352 	ui->ui_lock |= MD_UL_WRITER;
    353 	ui->ui_owner = curthread;
    354 
    355 	if (!lock_held)
    356 		mutex_exit(&ui->ui_mx);
    357 	return (MD_UNIT(ui->ui_link.ln_id));
    358 }
    359 
    360 void *
    361 md_unit_writerlock(mdi_unit_t *ui)
    362 {
    363 	return (md_unit_writerlock_common(ui, 0));
    364 }
    365 
    366 /*
    367  * md_unit_readerexit_common:
    368  * -------------------------
    369  * Release the readerlock for the specified unit. If the reader count reaches
    370  * zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up.
    371  *
    372  * Input:
    373  *	ui		unit reference
    374  *	lock_held	0 => ui_mx needs to be acquired
    375  *			1 => ui_mx already held
    376  */
    377 static void
    378 md_unit_readerexit_common(mdi_unit_t *ui, int lock_held)
    379 {
    380 	if (!lock_held)
    381 		mutex_enter(&ui->ui_mx);
    382 	ASSERT((ui->ui_lock & MD_UL_WRITER) == 0);
    383 	ASSERT(ui->ui_readercnt != 0);
    384 	ui->ui_readercnt--;
    385 	if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0))
    386 		cv_broadcast(&ui->ui_cv);
    387 
    388 	if (!lock_held)
    389 		mutex_exit(&ui->ui_mx);
    390 }
    391 
    392 void
    393 md_unit_readerexit(mdi_unit_t *ui)
    394 {
    395 	md_unit_readerexit_common(ui, 0);
    396 }
    397 
    398 /*
    399  * md_unit_writerexit_common:
    400  * -------------------------
    401  * Release the writerlock currently held on the unit. Wake any threads waiting
    402  * on becoming reader or writer (MD_UL_WANABEWRITER set).
    403  *
    404  * Input:
    405  *	ui		unit reference
    406  *	lock_held	0 => ui_mx to be acquired
    407  *			1 => ui_mx already held
    408  */
    409 static void
    410 md_unit_writerexit_common(mdi_unit_t *ui, int lock_held)
    411 {
    412 	if (!lock_held)
    413 		mutex_enter(&ui->ui_mx);
    414 	ASSERT((ui->ui_lock & MD_UL_WRITER) != 0);
    415 	ASSERT(ui->ui_readercnt == 0);
    416 	ui->ui_lock &= ~MD_UL_WRITER;
    417 	ui->ui_owner = NULL;
    418 
    419 	cv_broadcast(&ui->ui_cv);
    420 	if (!lock_held)
    421 		mutex_exit(&ui->ui_mx);
    422 }
    423 
    424 void
    425 md_unit_writerexit(mdi_unit_t *ui)
    426 {
    427 	md_unit_writerexit_common(ui, 0);
    428 }
    429 
    430 void *
    431 md_io_readerlock(mdi_unit_t *ui)
    432 {
    433 	md_io_lock_t	*io = ui->ui_io_lock;
    434 
    435 	ASSERT(io);  /* checks case where no io lock allocated */
    436 	mutex_enter(&io->io_mx);
    437 	while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) {
    438 		if (panicstr) {
    439 			if (io->io_lock & MD_UL_WRITER)
    440 				panic("md: writer lock is held");
    441 			break;
    442 		}
    443 		cv_wait(&io->io_cv, &io->io_mx);
    444 	}
    445 	io->io_readercnt++;
    446 	mutex_exit(&io->io_mx);
    447 	return (MD_UNIT(ui->ui_link.ln_id));
    448 }
    449 
    450 void *
    451 md_io_writerlock(mdi_unit_t *ui)
    452 {
    453 	md_io_lock_t	*io = ui->ui_io_lock;
    454 
    455 	ASSERT(io);  /* checks case where no io lock allocated */
    456 	if (panicstr)
    457 		panic("md: writer lock not allowed");
    458 
    459 	mutex_enter(&io->io_mx);
    460 	while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) {
    461 		io->io_wanabecnt++;
    462 		io->io_lock |= MD_UL_WANABEWRITER;
    463 		cv_wait(&io->io_cv, &io->io_mx);
    464 		if (--io->io_wanabecnt == 0)
    465 			io->io_lock &= ~MD_UL_WANABEWRITER;
    466 	}
    467 	io->io_lock |= MD_UL_WRITER;
    468 	io->io_owner = curthread;
    469 
    470 	mutex_exit(&io->io_mx);
    471 	return (MD_UNIT(ui->ui_link.ln_id));
    472 }
    473 
    474 void
    475 md_io_readerexit(mdi_unit_t *ui)
    476 {
    477 	md_io_lock_t	*io = ui->ui_io_lock;
    478 
    479 	mutex_enter(&io->io_mx);
    480 	ASSERT((io->io_lock & MD_UL_WRITER) == 0);
    481 	ASSERT(io->io_readercnt != 0);
    482 	io->io_readercnt--;
    483 	if ((io->io_wanabecnt != 0) && (io->io_readercnt == 0)) {
    484 		cv_broadcast(&io->io_cv);
    485 	}
    486 	mutex_exit(&io->io_mx);
    487 }
    488 
    489 void
    490 md_io_writerexit(mdi_unit_t *ui)
    491 {
    492 	md_io_lock_t	*io = ui->ui_io_lock;
    493 
    494 	mutex_enter(&io->io_mx);
    495 	ASSERT((io->io_lock & MD_UL_WRITER) != 0);
    496 	ASSERT(io->io_readercnt == 0);
    497 	io->io_lock &= ~MD_UL_WRITER;
    498 	io->io_owner = NULL;
    499 
    500 	cv_broadcast(&io->io_cv);
    501 	mutex_exit(&io->io_mx);
    502 }
    503 
    504 /*
    505  * Attempt to grab that set of locks defined as global.
    506  * A mask containing the set of global locks that are owned upon
    507  * entry is input.  Any additional global locks are then grabbed.
    508  * This keeps the caller from having to know the set of global
    509  * locks.
    510  */
    511 static int
    512 md_global_lock_enter(int global_locks_owned_mask)
    513 {
    514 
    515 	/*
    516 	 * The current implementation has been verified by inspection
    517 	 * and test to be deadlock free.  If another global lock is
    518 	 * added, changing the algorithm used by this function should
    519 	 * be considered.  With more than 2 locks it is difficult to
    520 	 * guarantee that locks are being acquired in the correct order.
    521 	 * The safe approach would be to drop all of the locks that are
    522 	 * owned at function entry and then reacquire all of the locks
    523 	 * in the order defined by the lock hierarchy.
    524 	 */
    525 	mutex_enter(&md_mx);
    526 	if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
    527 		while ((md_mtioctl_cnt != 0) ||
    528 		    (md_status & MD_GBL_IOCTL_LOCK)) {
    529 			if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
    530 				mutex_exit(&md_mx);
    531 				return (EINTR);
    532 			}
    533 		}
    534 		md_status |= MD_GBL_IOCTL_LOCK;
    535 		md_ioctl_cnt++;
    536 	}
    537 	if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) {
    538 		while (md_status & MD_GBL_HS_LOCK) {
    539 			if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
    540 				md_status &= ~MD_GBL_IOCTL_LOCK;
    541 				mutex_exit(&md_mx);
    542 				return (EINTR);
    543 			}
    544 		}
    545 		md_status |= MD_GBL_HS_LOCK;
    546 	}
    547 	mutex_exit(&md_mx);
    548 	return (0);
    549 }
    550 
    551 /*
    552  * Release the set of global locks that were grabbed in md_global_lock_enter
    553  * that were not already owned by the calling thread.  The set of previously
    554  * owned global locks is passed in as a mask parameter.
    555  */
    556 static int
    557 md_global_lock_exit(int global_locks_owned_mask, int code,
    558 	int flags, mdi_unit_t *ui)
    559 {
    560 	mutex_enter(&md_mx);
    561 
    562 	/* If MT ioctl decrement mt_ioctl_cnt */
    563 	if ((flags & MD_MT_IOCTL)) {
    564 		md_mtioctl_cnt--;
    565 	} else {
    566 		if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
    567 			/* clear the lock and decrement count */
    568 			ASSERT(md_ioctl_cnt == 1);
    569 			md_ioctl_cnt--;
    570 			md_status &= ~MD_GBL_IOCTL_LOCK;
    571 		}
    572 		if (!(global_locks_owned_mask & MD_GBL_HS_LOCK))
    573 			md_status &= ~MD_GBL_HS_LOCK;
    574 	}
    575 	if (flags & MD_READER_HELD)
    576 		md_unit_readerexit(ui);
    577 	if (flags & MD_WRITER_HELD)
    578 		md_unit_writerexit(ui);
    579 	if (flags & MD_IO_HELD)
    580 		md_io_writerexit(ui);
    581 	if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
    582 		rw_exit(&md_unit_array_rw.lock);
    583 	}
    584 	cv_broadcast(&md_cv);
    585 	mutex_exit(&md_mx);
    586 
    587 	return (code);
    588 }
    589 
    590 /*
    591  * The two functions, md_ioctl_lock_enter, and md_ioctl_lock_exit make
    592  * use of the md_global_lock_{enter|exit} functions to avoid duplication
    593  * of code.  They rely upon the fact that the locks that are specified in
    594  * the input mask are not acquired or freed.  If this algorithm changes
    595  * as described in the block comment at the beginning of md_global_lock_enter
    596  * then it will be necessary to change these 2 functions.  Otherwise these
    597  * functions will be grabbing and holding global locks unnecessarily.
    598  */
    599 int
    600 md_ioctl_lock_enter(void)
    601 {
    602 	/* grab only the ioctl lock */
    603 	return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK));
    604 }
    605 
    606 /*
    607  * If md_ioctl_lock_exit is being called at the end of an ioctl before
    608  * returning to user space, then ioctl_end is set to 1.
    609  * Otherwise, the ioctl lock is being dropped in the middle of handling
    610  * an ioctl and will be reacquired before the end of the ioctl.
    611  * Do not attempt to process the MN diskset mddb parse flags unless
    612  * ioctl_end is true - otherwise a deadlock situation could arise.
    613  */
    614 int
    615 md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end)
    616 {
    617 	int				ret_val;
    618 	uint_t				status;
    619 	mddb_set_t			*s;
    620 	int				i;
    621 	int				err;
    622 	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
    623 	md_mn_kresult_t			*kresult;
    624 	mddb_lb_t			*lbp;
    625 	int				rval = 1;
    626 	int				flag;
    627 
    628 	/* release only the ioctl lock */
    629 	ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
    630 
    631 	/*
    632 	 * If md_ioctl_lock_exit is being called with a possible lock held
    633 	 * (ioctl_end is 0), then don't check the MN disksets since the
    634 	 * call to mddb_setenter may cause a lock ordering deadlock.
    635 	 */
    636 	if (!ioctl_end)
    637 		return (ret_val);
    638 
    639 	/*
    640 	 * Walk through disksets to see if there is a MN diskset that
    641 	 * has messages that need to be sent.  Set must be snarfed and
    642 	 * be a MN diskset in order to be checked.
    643 	 *
    644 	 * In a MN diskset, this routine may send messages to the
    645 	 * rpc.mdcommd in order to have the slave nodes re-parse parts
    646 	 * of the mddb.  Messages can only be sent with no locks held,
    647 	 * so if mddb change occurred while the ioctl lock is held, this
    648 	 * routine must send the messages.
    649 	 */
    650 	for (i = 1; i < md_nsets; i++) {
    651 		status = md_get_setstatus(i);
    652 
    653 		/* Set must be snarfed and be a MN diskset */
    654 		if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) !=
    655 		    (MD_SET_SNARFED | MD_SET_MNSET))
    656 			continue;
    657 
    658 		/* Grab set lock so that set can't change */
    659 		if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL)
    660 			continue;
    661 
    662 		lbp = s->s_lbp;
    663 
    664 		/* Re-get set status now that lock is held */
    665 		status = md_get_setstatus(i);
    666 
    667 		/*
    668 		 * If MN parsing block flag is set - continue to next set.
    669 		 *
    670 		 * If s_mn_parseflags_sending is non-zero, then another thread
    671 		 * is already currently sending a parse message, so just
    672 		 * release the set mutex.  If this ioctl had caused an mddb
    673 		 * change that results in a parse message to be generated,
    674 		 * the thread that is currently sending a parse message would
    675 		 * generate the additional parse message.
    676 		 *
    677 		 * If s_mn_parseflags_sending is zero then loop until
    678 		 * s_mn_parseflags is 0 (until there are no more
    679 		 * messages to send).
    680 		 * While s_mn_parseflags is non-zero,
    681 		 *	put snapshot of parse_flags in s_mn_parseflags_sending
    682 		 *	set s_mn_parseflags to zero
    683 		 *	release set mutex
    684 		 *	send message
    685 		 *	re-grab set mutex
    686 		 *	set s_mn_parseflags_sending to zero
    687 		 *
    688 		 * If set is STALE, send message with NO_LOG flag so that
    689 		 * rpc.mdcommd won't attempt to log message to non-writeable
    690 		 * replica.
    691 		 */
    692 		mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
    693 		    KM_SLEEP);
    694 		while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
    695 		    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
    696 		    (!(status & MD_SET_MNPARSE_BLK))) {
    697 
    698 			/* Grab snapshot of parse flags */
    699 			s->s_mn_parseflags_sending = s->s_mn_parseflags;
    700 			s->s_mn_parseflags = 0;
    701 
    702 			mutex_exit(&md_set[(s)->s_setno].s_dbmx);
    703 
    704 			/*
    705 			 * Send the message to the slaves to re-parse
    706 			 * the indicated portions of the mddb. Send the status
    707 			 * of the 50 mddbs in this set so that slaves know
    708 			 * which mddbs that the master node thinks are 'good'.
    709 			 * Otherwise, slave may reparse, but from wrong
    710 			 * replica.
    711 			 */
    712 			mddb_parse_msg->msg_parse_flags =
    713 			    s->s_mn_parseflags_sending;
    714 
    715 			for (i = 0; i < MDDB_NLB; i++) {
    716 				mddb_parse_msg->msg_lb_flags[i] =
    717 				    lbp->lb_locators[i].l_flags;
    718 			}
    719 			kresult = kmem_alloc(sizeof (md_mn_kresult_t),
    720 			    KM_SLEEP);
    721 			while (rval != 0) {
    722 				flag = 0;
    723 				if (status & MD_SET_STALE)
    724 					flag |= MD_MSGF_NO_LOG;
    725 				rval = mdmn_ksend_message(s->s_setno,
    726 				    MD_MN_MSG_MDDB_PARSE, flag, 0,
    727 				    (char *)mddb_parse_msg,
    728 				    sizeof (md_mn_msg_mddb_parse_t), kresult);
    729 				/* if the node hasn't yet joined, it's Ok. */
    730 				if ((!MDMN_KSEND_MSG_OK(rval, kresult)) &&
    731 				    (kresult->kmmr_comm_state !=
    732 				    MDMNE_NOT_JOINED)) {
    733 					mdmn_ksend_show_error(rval, kresult,
    734 					    "MD_MN_MSG_MDDB_PARSE");
    735 					cmn_err(CE_WARN, "md_ioctl_lock_exit: "
    736 					    "Unable to send mddb update "
    737 					    "message to other nodes in "
    738 					    "diskset %s\n", s->s_setname);
    739 					rval = 1;
    740 				}
    741 			}
    742 			kmem_free(kresult, sizeof (md_mn_kresult_t));
    743 
    744 			/*
    745 			 * Re-grab mutex to clear sending field and to
    746 			 * see if another parse message needs to be generated.
    747 			 */
    748 			mutex_enter(&md_set[(s)->s_setno].s_dbmx);
    749 			s->s_mn_parseflags_sending = 0;
    750 		}
    751 		kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
    752 		mutex_exit(&md_set[(s)->s_setno].s_dbmx);
    753 	}
    754 	return (ret_val);
    755 }
    756 
    757 /*
    758  * Called when in an ioctl and need readerlock.
    759  */
    760 void *
    761 md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui)
    762 {
    763 	ASSERT(lock != NULL);
    764 	lock->l_ui = ui;
    765 	lock->l_flags |= MD_READER_HELD;
    766 	return (md_unit_readerlock_common(ui, 0));
    767 }
    768 
    769 /*
    770  * Called when in an ioctl and need writerlock.
    771  */
    772 void *
    773 md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui)
    774 {
    775 	ASSERT(lock != NULL);
    776 	lock->l_ui = ui;
    777 	lock->l_flags |= MD_WRITER_HELD;
    778 	return (md_unit_writerlock_common(ui, 0));
    779 }
    780 
    781 void *
    782 md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui)
    783 {
    784 	ASSERT(lock != NULL);
    785 	lock->l_ui = ui;
    786 	lock->l_flags |= MD_IO_HELD;
    787 	return (md_io_writerlock(ui));
    788 }
    789 
    790 void
    791 md_ioctl_readerexit(IOLOCK *lock)
    792 {
    793 	ASSERT(lock != NULL);
    794 	lock->l_flags &= ~MD_READER_HELD;
    795 	md_unit_readerexit(lock->l_ui);
    796 }
    797 
    798 void
    799 md_ioctl_writerexit(IOLOCK *lock)
    800 {
    801 	ASSERT(lock != NULL);
    802 	lock->l_flags &= ~MD_WRITER_HELD;
    803 	md_unit_writerexit(lock->l_ui);
    804 }
    805 
    806 void
    807 md_ioctl_io_exit(IOLOCK *lock)
    808 {
    809 	ASSERT(lock != NULL);
    810 	lock->l_flags &= ~MD_IO_HELD;
    811 	md_io_writerexit(lock->l_ui);
    812 }
    813 
    814 /*
    815  * md_ioctl_releaselocks:
    816  * --------------------
    817  * Release the unit locks that are held and stop subsequent
    818  * md_unit_reader/writerlock calls from progressing. This allows the caller
    819  * to send messages across the cluster when running in a multinode
    820  * environment.
    821  * ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are
    822  * allowed to progress as normal. This is required as these typically are
    823  * invoked by the message handler that may be called while a unit lock is
    824  * marked as released.
    825  *
    826  * On entry:
    827  *	variety of unit locks may be held including ioctl lock
    828  *
    829  * On exit:
    830  *      locks released and unit structure updated to prevent subsequent reader/
    831  *      writer locks being acquired until md_ioctl_reacquirelocks is called
    832  */
    833 void
    834 md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui)
    835 {
    836 	/* This actually releases the locks. */
    837 	(void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
    838 }
    839 
    840 /*
    841  * md_ioctl_reacquirelocks:
    842  * ----------------------
    843  * Reacquire the locks that were held when md_ioctl_releaselocks
    844  * was called.
    845  *
    846  * On entry:
    847  *      No unit locks held
    848  * On exit:
    849  *	locks held that were held at md_ioctl_releaselocks time including
    850  *	the ioctl lock.
    851  */
    852 void
    853 md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui)
    854 {
    855 	if (flags & MD_MT_IOCTL) {
    856 		mutex_enter(&md_mx);
    857 		md_mtioctl_cnt++;
    858 		mutex_exit(&md_mx);
    859 	} else {
    860 		while (md_ioctl_lock_enter() == EINTR)
    861 			;
    862 	}
    863 	if (flags & MD_ARRAY_WRITER) {
    864 		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
    865 	} else if (flags & MD_ARRAY_READER) {
    866 		rw_enter(&md_unit_array_rw.lock, RW_READER);
    867 	}
    868 	if (ui != (mdi_unit_t *)NULL) {
    869 		if (flags & MD_IO_HELD) {
    870 			(void) md_io_writerlock(ui);
    871 		}
    872 
    873 		mutex_enter(&ui->ui_mx);
    874 		if (flags & MD_READER_HELD) {
    875 			(void) md_unit_readerlock_common(ui, 1);
    876 		} else if (flags & MD_WRITER_HELD) {
    877 			(void) md_unit_writerlock_common(ui, 1);
    878 		}
    879 		/* Wake up any blocked readerlock() calls */
    880 		cv_broadcast(&ui->ui_cv);
    881 		mutex_exit(&ui->ui_mx);
    882 	}
    883 }
    884 
    885 void
    886 md_ioctl_droplocks(IOLOCK *lock)
    887 {
    888 	mdi_unit_t	*ui;
    889 	int		flags;
    890 
    891 	ASSERT(lock != NULL);
    892 	ui = lock->l_ui;
    893 	flags = lock->l_flags;
    894 	if (flags & MD_READER_HELD) {
    895 		lock->l_flags &= ~MD_READER_HELD;
    896 		md_unit_readerexit(ui);
    897 	}
    898 	if (flags & MD_WRITER_HELD) {
    899 		lock->l_flags &= ~MD_WRITER_HELD;
    900 		md_unit_writerexit(ui);
    901 	}
    902 	if (flags & MD_IO_HELD) {
    903 		lock->l_flags &= ~MD_IO_HELD;
    904 		md_io_writerexit(ui);
    905 	}
    906 	if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
    907 		lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER);
    908 		rw_exit(&md_unit_array_rw.lock);
    909 	}
    910 }
    911 
    912 void
    913 md_array_writer(IOLOCK *lock)
    914 {
    915 	ASSERT(lock != NULL);
    916 	lock->l_flags |= MD_ARRAY_WRITER;
    917 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
    918 }
    919 
    920 void
    921 md_array_reader(IOLOCK *lock)
    922 {
    923 	ASSERT(lock != NULL);
    924 	lock->l_flags |= MD_ARRAY_READER;
    925 	rw_enter(&md_unit_array_rw.lock, RW_READER);
    926 }
    927 
    928 /*
    929  * Called when in an ioctl and need opencloselock.
    930  * Sets flags in lockp for READER_HELD.
    931  */
    932 void *
    933 md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui)
    934 {
    935 	void	*un;
    936 
    937 	ASSERT(lockp != NULL);
    938 	mutex_enter(&ui->ui_mx);
    939 	while (ui->ui_lock & MD_UL_OPENORCLOSE)
    940 		cv_wait(&ui->ui_cv, &ui->ui_mx);
    941 	ui->ui_lock |= MD_UL_OPENORCLOSE;
    942 
    943 	/* Maintain mutex across the readerlock call */
    944 	lockp->l_ui = ui;
    945 	lockp->l_flags |= MD_READER_HELD;
    946 	un = md_unit_readerlock_common(ui, 1);
    947 	mutex_exit(&ui->ui_mx);
    948 
    949 	return (un);
    950 }
    951 
    952 /*
    953  * Clears reader lock using md_ioctl instead of md_unit
    954  * and updates lockp.
    955  */
    956 void
    957 md_ioctl_openclose_exit(IOLOCK *lockp)
    958 {
    959 	mdi_unit_t	*ui;
    960 
    961 	ASSERT(lockp != NULL);
    962 	ui = lockp->l_ui;
    963 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
    964 
    965 	md_ioctl_readerexit(lockp);
    966 
    967 	mutex_enter(&ui->ui_mx);
    968 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
    969 
    970 	cv_broadcast(&ui->ui_cv);
    971 	mutex_exit(&ui->ui_mx);
    972 }
    973 
    974 /*
    975  * Clears reader lock using md_ioctl instead of md_unit
    976  * and updates lockp.
    977  * Does not acquire or release the ui_mx lock since the calling
    978  * routine has already acquired this lock.
    979  */
    980 void
    981 md_ioctl_openclose_exit_lh(IOLOCK *lockp)
    982 {
    983 	mdi_unit_t	*ui;
    984 
    985 	ASSERT(lockp != NULL);
    986 	ui = lockp->l_ui;
    987 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
    988 
    989 	lockp->l_flags &= ~MD_READER_HELD;
    990 	md_unit_readerexit_common(lockp->l_ui, 1);
    991 
    992 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
    993 	cv_broadcast(&ui->ui_cv);
    994 }
    995 
    996 void *
    997 md_unit_openclose_enter(mdi_unit_t *ui)
    998 {
    999 	void	*un;
   1000 
   1001 	mutex_enter(&ui->ui_mx);
   1002 	while (ui->ui_lock & (MD_UL_OPENORCLOSE))
   1003 		cv_wait(&ui->ui_cv, &ui->ui_mx);
   1004 	ui->ui_lock |= MD_UL_OPENORCLOSE;
   1005 
   1006 	/* Maintain mutex across the readerlock call */
   1007 	un = md_unit_readerlock_common(ui, 1);
   1008 	mutex_exit(&ui->ui_mx);
   1009 
   1010 	return (un);
   1011 }
   1012 
   1013 void
   1014 md_unit_openclose_exit(mdi_unit_t *ui)
   1015 {
   1016 	md_unit_readerexit(ui);
   1017 
   1018 	mutex_enter(&ui->ui_mx);
   1019 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
   1020 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
   1021 
   1022 	cv_broadcast(&ui->ui_cv);
   1023 	mutex_exit(&ui->ui_mx);
   1024 }
   1025 
   1026 /*
   1027  * Drop the openclose and readerlocks without acquiring or
   1028  * releasing the ui_mx lock since the calling routine has
   1029  * already acquired this lock.
   1030  */
   1031 void
   1032 md_unit_openclose_exit_lh(mdi_unit_t *ui)
   1033 {
   1034 	md_unit_readerexit_common(ui, 1);
   1035 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
   1036 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
   1037 	cv_broadcast(&ui->ui_cv);
   1038 }
   1039 
   1040 int
   1041 md_unit_isopen(
   1042 	mdi_unit_t	*ui
   1043 )
   1044 {
   1045 	int		isopen;
   1046 
   1047 	/* check status */
   1048 	mutex_enter(&ui->ui_mx);
   1049 	isopen = ((ui->ui_lock & MD_UL_OPEN) ? 1 : 0);
   1050 	mutex_exit(&ui->ui_mx);
   1051 	return (isopen);
   1052 }
   1053 
   1054 int
   1055 md_unit_incopen(
   1056 	minor_t		mnum,
   1057 	int		flag,
   1058 	int		otyp
   1059 )
   1060 {
   1061 	mdi_unit_t	*ui = MDI_UNIT(mnum);
   1062 	int		err = 0;
   1063 
   1064 	/* check type and flags */
   1065 	ASSERT(ui != NULL);
   1066 	mutex_enter(&ui->ui_mx);
   1067 	if ((otyp < 0) || (otyp >= OTYPCNT)) {
   1068 		err = EINVAL;
   1069 		goto out;
   1070 	}
   1071 	if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) ||
   1072 	    (ui->ui_lock & MD_UL_EXCL)) {
   1073 		err = EBUSY;
   1074 		goto out;
   1075 	}
   1076 
   1077 	/* count and flag open */
   1078 	ui->ui_ocnt[otyp]++;
   1079 	ui->ui_lock |= MD_UL_OPEN;
   1080 	if (flag & FEXCL)
   1081 		ui->ui_lock |= MD_UL_EXCL;
   1082 
   1083 	/* setup kstat, return success */
   1084 	mutex_exit(&ui->ui_mx);
   1085 	md_kstat_init(mnum);
   1086 	return (0);
   1087 
   1088 	/* return error */
   1089 out:
   1090 	mutex_exit(&ui->ui_mx);
   1091 	return (err);
   1092 }
   1093 
   1094 int
   1095 md_unit_decopen(
   1096 	minor_t		mnum,
   1097 	int		otyp
   1098 )
   1099 {
   1100 	mdi_unit_t	*ui = MDI_UNIT(mnum);
   1101 	int		err = 0;
   1102 	unsigned	i;
   1103 
   1104 	/* check type and flags */
   1105 	ASSERT(ui != NULL);
   1106 	mutex_enter(&ui->ui_mx);
   1107 	if ((otyp < 0) || (otyp >= OTYPCNT)) {
   1108 		err = EINVAL;
   1109 		goto out;
   1110 	} else if (ui->ui_ocnt[otyp] == 0) {
   1111 		err = ENXIO;
   1112 		goto out;
   1113 	}
   1114 
   1115 	/* count and flag closed */
   1116 	if (otyp == OTYP_LYR)
   1117 		ui->ui_ocnt[otyp]--;
   1118 	else
   1119 		ui->ui_ocnt[otyp] = 0;
   1120 	ui->ui_lock &= ~MD_UL_OPEN;
   1121 	for (i = 0; (i < OTYPCNT); ++i)
   1122 		if (ui->ui_ocnt[i] != 0)
   1123 			ui->ui_lock |= MD_UL_OPEN;
   1124 	if (! (ui->ui_lock & MD_UL_OPEN))
   1125 		ui->ui_lock &= ~MD_UL_EXCL;
   1126 
   1127 	/* teardown kstat, return success */
   1128 	if (! (ui->ui_lock & MD_UL_OPEN)) {
   1129 
   1130 		/*
   1131 		 * We have a race condition inherited from specfs between
   1132 		 * open() and close() calls. This results in the kstat
   1133 		 * for a pending I/O being torn down, and then a panic.
   1134 		 * To avoid this, only tear the kstat down if there are
   1135 		 * no other readers on this device.
   1136 		 */
   1137 		if (ui->ui_readercnt > 1) {
   1138 			mutex_exit(&ui->ui_mx);
   1139 		} else {
   1140 			mutex_exit(&ui->ui_mx);
   1141 			md_kstat_destroy(mnum);
   1142 		}
   1143 		return (0);
   1144 	}
   1145 
   1146 	/* return success */
   1147 out:
   1148 	mutex_exit(&ui->ui_mx);
   1149 	return (err);
   1150 }
   1151 
   1152 md_dev64_t
   1153 md_xlate_targ_2_mini(md_dev64_t targ_devt)
   1154 {
   1155 	dev32_t		mini_32_devt, targ_32_devt;
   1156 	int		i;
   1157 
   1158 	/*
   1159 	 * check to see if we're in an upgrade situation
   1160 	 * if we are not in upgrade just return the input device
   1161 	 */
   1162 
   1163 	if (!MD_UPGRADE)
   1164 		return (targ_devt);
   1165 
   1166 	targ_32_devt = md_cmpldev(targ_devt);
   1167 
   1168 	i = 0;
   1169 	while (i != md_tuple_length) {
   1170 		if (md_tuple_table[i].targ_devt == targ_32_devt) {
   1171 			mini_32_devt = md_tuple_table[i].mini_devt;
   1172 			return (md_expldev((md_dev64_t)mini_32_devt));
   1173 		}
   1174 		i++;
   1175 	}
   1176 	return (NODEV64);
   1177 }
   1178 
   1179 md_dev64_t
   1180 md_xlate_mini_2_targ(md_dev64_t mini_devt)
   1181 {
   1182 	dev32_t		mini_32_devt, targ_32_devt;
   1183 	int		i;
   1184 
   1185 	if (!MD_UPGRADE)
   1186 		return (mini_devt);
   1187 
   1188 	mini_32_devt = md_cmpldev(mini_devt);
   1189 
   1190 	i = 0;
   1191 	while (i != md_tuple_length) {
   1192 		if (md_tuple_table[i].mini_devt == mini_32_devt) {
   1193 			targ_32_devt = md_tuple_table[i].targ_devt;
   1194 			return (md_expldev((md_dev64_t)targ_32_devt));
   1195 		}
   1196 		i++;
   1197 	}
   1198 	return (NODEV64);
   1199 }
   1200 
   1201 void
   1202 md_xlate_free(int size)
   1203 {
   1204 	kmem_free(md_tuple_table, size);
   1205 }
   1206 
   1207 char *
   1208 md_targ_major_to_name(major_t maj)
   1209 {
   1210 	char *drv_name = NULL;
   1211 	int	i;
   1212 
   1213 	if (!MD_UPGRADE)
   1214 		return (ddi_major_to_name(maj));
   1215 
   1216 	for (i = 0; i < md_majortab_len; i++) {
   1217 		if (md_major_tuple_table[i].targ_maj == maj) {
   1218 			drv_name = md_major_tuple_table[i].drv_name;
   1219 			break;
   1220 		}
   1221 	}
   1222 	return (drv_name);
   1223 }
   1224 
   1225 major_t
   1226 md_targ_name_to_major(char *drv_name)
   1227 {
   1228 	major_t maj;
   1229 	int	i;
   1230 
   1231 	maj = md_getmajor(NODEV64);
   1232 	if (!MD_UPGRADE)
   1233 		return (ddi_name_to_major(drv_name));
   1234 
   1235 	for (i = 0; i < md_majortab_len; i++) {
   1236 		if ((strcmp(md_major_tuple_table[i].drv_name,
   1237 		    drv_name)) == 0) {
   1238 			maj = md_major_tuple_table[i].targ_maj;
   1239 			break;
   1240 		}
   1241 	}
   1242 
   1243 	return (maj);
   1244 }
   1245 
   1246 void
   1247 md_majortab_free()
   1248 {
   1249 	size_t	sz;
   1250 	int	i;
   1251 
   1252 	for (i = 0; i < md_majortab_len; i++) {
   1253 		freestr(md_major_tuple_table[i].drv_name);
   1254 	}
   1255 
   1256 	sz = md_majortab_len * sizeof (struct md_xlate_major_table);
   1257 	kmem_free(md_major_tuple_table, sz);
   1258 }
   1259 
   1260 /* functions return a pointer to a function which returns an int */
   1261 
   1262 intptr_t (*
   1263 md_get_named_service(md_dev64_t dev, int modindex, char *name,
   1264 	intptr_t (*Default)()))()
   1265 {
   1266 	mdi_unit_t		*ui;
   1267 	md_named_services_t	*sp;
   1268 	int			i;
   1269 
   1270 	/*
   1271 	 * Return the first named service found.
   1272 	 * Use this path when it is known that there is only
   1273 	 * one named service possible (e.g., hotspare interface)
   1274 	 */
   1275 	if ((dev == NODEV64) && (modindex == ANY_SERVICE)) {
   1276 		for (i = 0; i < MD_NOPS; i++) {
   1277 			if (md_ops[i] == NULL) {
   1278 				continue;
   1279 			}
   1280 			sp = md_ops[i]->md_services;
   1281 			if (sp == NULL)
   1282 				continue;
   1283 			while (sp->md_service != NULL) {
   1284 				if (strcmp(name, sp->md_name) == 0)
   1285 					return (sp->md_service);
   1286 				sp++;
   1287 			}
   1288 		}
   1289 		return (Default);
   1290 	}
   1291 
   1292 	/*
   1293 	 * Return the named service for the given modindex.
   1294 	 * This is used if there are multiple possible named services
   1295 	 * and each one needs to be called (e.g., poke hotspares)
   1296 	 */
   1297 	if (dev == NODEV64) {
   1298 		if (modindex >= MD_NOPS)
   1299 			return (Default);
   1300 
   1301 		if (md_ops[modindex] == NULL)
   1302 			return (Default);
   1303 
   1304 		sp = md_ops[modindex]->md_services;
   1305 		if (sp == NULL)
   1306 			return (Default);
   1307 
   1308 		while (sp->md_service != NULL) {
   1309 			if (strcmp(name, sp->md_name) == 0)
   1310 				return (sp->md_service);
   1311 			sp++;
   1312 		}
   1313 		return (Default);
   1314 	}
   1315 
   1316 	/*
   1317 	 * Return the named service for this md_dev64_t
   1318 	 */
   1319 	if (md_getmajor(dev) != md_major)
   1320 		return (Default);
   1321 
   1322 	if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
   1323 	    (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
   1324 		return (NULL);
   1325 
   1326 
   1327 	if ((ui = MDI_UNIT(md_getminor(dev))) == NULL)
   1328 		return (NULL);
   1329 
   1330 	sp = md_ops[ui->ui_opsindex]->md_services;
   1331 	if (sp == NULL)
   1332 		return (Default);
   1333 	while (sp->md_service != NULL) {
   1334 		if (strcmp(name, sp->md_name) == 0)
   1335 			return (sp->md_service);
   1336 		sp++;
   1337 	}
   1338 	return (Default);
   1339 }
   1340 
   1341 /*
   1342  * md_daemon callback routine
   1343  */
   1344 boolean_t
   1345 callb_md_cpr(void *arg, int code)
   1346 {
   1347 	callb_cpr_t *cp = (callb_cpr_t *)arg;
   1348 	int ret = 0;				/* assume success */
   1349 	clock_t delta;
   1350 
   1351 	mutex_enter(cp->cc_lockp);
   1352 
   1353 	switch (code) {
   1354 	case CB_CODE_CPR_CHKPT:
   1355 		/*
   1356 		 * Check for active resync threads
   1357 		 */
   1358 		mutex_enter(&md_cpr_resync.md_resync_mutex);
   1359 		if ((md_cpr_resync.md_mirror_resync > 0) ||
   1360 		    (md_cpr_resync.md_raid_resync > 0)) {
   1361 			mutex_exit(&md_cpr_resync.md_resync_mutex);
   1362 			cmn_err(CE_WARN, "There are Solaris Volume Manager "
   1363 			    "synchronization threads running.");
   1364 			cmn_err(CE_WARN, "Please try system suspension at "
   1365 			    "a later time.");
   1366 			ret = -1;
   1367 			break;
   1368 		}
   1369 		mutex_exit(&md_cpr_resync.md_resync_mutex);
   1370 
   1371 		cp->cc_events |= CALLB_CPR_START;
   1372 		delta = CPR_KTHREAD_TIMEOUT_SEC * hz;
   1373 		while (!(cp->cc_events & CALLB_CPR_SAFE))
   1374 			/* cv_reltimedwait() returns -1 if it times out. */
   1375 			if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
   1376 			    cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1)
   1377 				break;
   1378 			break;
   1379 
   1380 	case CB_CODE_CPR_RESUME:
   1381 		cp->cc_events &= ~CALLB_CPR_START;
   1382 		cv_signal(&cp->cc_stop_cv);
   1383 		break;
   1384 	}
   1385 	mutex_exit(cp->cc_lockp);
   1386 	return (ret != -1);
   1387 }
   1388 
   1389 void
   1390 md_daemon(int pass_thru, mdq_anchor_t *anchor)
   1391 {
   1392 	daemon_queue_t  *dq;
   1393 	callb_cpr_t	cprinfo;
   1394 
   1395 	if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE))
   1396 		return;
   1397 	/*
   1398 	 * Register cpr callback
   1399 	 */
   1400 	CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon");
   1401 
   1402 	/*CONSTCOND*/
   1403 	while (1) {
   1404 		mutex_enter(&anchor->a_mx);
   1405 		while ((dq = anchor->dq.dq_next) == &(anchor->dq)) {
   1406 			if (pass_thru) {
   1407 				/*
   1408 				 * CALLB_CPR_EXIT Will do
   1409 				 * mutex_exit(&anchor->a_mx)
   1410 				 */
   1411 				CALLB_CPR_EXIT(&cprinfo);
   1412 				return;
   1413 			}
   1414 			if (md_get_status() & MD_GBL_DAEMONS_DIE) {
   1415 				mutex_exit(&anchor->a_mx);
   1416 				mutex_enter(&md_mx);
   1417 				md_num_daemons--;
   1418 				mutex_exit(&md_mx);
   1419 				/*
   1420 				 * CALLB_CPR_EXIT will do
   1421 				 * mutex_exit(&anchor->a_mx)
   1422 				 */
   1423 				mutex_enter(&anchor->a_mx);
   1424 				CALLB_CPR_EXIT(&cprinfo);
   1425 				thread_exit();
   1426 			}
   1427 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
   1428 			cv_wait(&anchor->a_cv, &anchor->a_mx);
   1429 			CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx);
   1430 		}
   1431 		dq->dq_prev->dq_next = dq->dq_next;
   1432 		dq->dq_next->dq_prev = dq->dq_prev;
   1433 		dq->dq_prev = dq->dq_next = NULL;
   1434 		anchor->dq.qlen--;
   1435 		mutex_exit(&anchor->a_mx);
   1436 		(*(dq->dq_call))(dq);
   1437 	}
   1438 	/*NOTREACHED*/
   1439 }
   1440 
   1441 /*
   1442  * daemon_request:
   1443  *
   1444  * Adds requests to appropriate requestq which is
   1445  * anchored by *anchor.
   1446  * The request is the first element of a doubly linked circular list.
   1447  * When the request is a single element, the forward and backward
   1448  * pointers MUST point to the element itself.
   1449  */
   1450 
   1451 void
   1452 daemon_request(mdq_anchor_t *anchor, void (*func)(),
   1453 				daemon_queue_t *request, callstyle_t style)
   1454 {
   1455 	daemon_queue_t *rqtp;
   1456 	int i = 0;
   1457 
   1458 	rqtp = request;
   1459 	if (style == REQ_OLD) {
   1460 		ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL));
   1461 		/* set it to the new style */
   1462 		rqtp->dq_prev = rqtp->dq_next = rqtp;
   1463 	}
   1464 	ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL));
   1465 
   1466 	/* scan the list and add the function to each element */
   1467 
   1468 	do {
   1469 		rqtp->dq_call = func;
   1470 		i++;
   1471 		rqtp = rqtp->dq_next;
   1472 	} while (rqtp != request);
   1473 
   1474 	/* save pointer to tail of the request list */
   1475 	rqtp = request->dq_prev;
   1476 
   1477 	mutex_enter(&anchor->a_mx);
   1478 	/* stats */
   1479 	anchor->dq.qlen += i;
   1480 	anchor->dq.treqs += i;
   1481 	anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ?
   1482 	    anchor->dq.qlen : anchor->dq.maxq_len;
   1483 
   1484 	/* now add the list to request queue */
   1485 	request->dq_prev = anchor->dq.dq_prev;
   1486 	rqtp->dq_next = &anchor->dq;
   1487 	anchor->dq.dq_prev->dq_next = request;
   1488 	anchor->dq.dq_prev = rqtp;
   1489 	cv_broadcast(&anchor->a_cv);
   1490 	mutex_exit(&anchor->a_mx);
   1491 }
   1492 
   1493 void
   1494 mddb_commitrec_wrapper(mddb_recid_t recid)
   1495 {
   1496 	int sent_log = 0;
   1497 	uint_t retry = md_retry_cnt;
   1498 	set_t	setno;
   1499 
   1500 	while (mddb_commitrec(recid)) {
   1501 		if (! sent_log) {
   1502 			cmn_err(CE_WARN,
   1503 			    "md: state database commit failed");
   1504 			sent_log = 1;
   1505 		}
   1506 		delay(md_hz);
   1507 
   1508 		/*
   1509 		 * Setting retry cnt to one (pre decremented) so that we
   1510 		 * actually do no retries when committing/deleting a mddb rec.
   1511 		 * The underlying disk driver does several retries to check
   1512 		 * if the disk is really dead or not so there
   1513 		 * is no reason for us to retry on top of the drivers retries.
   1514 		 */
   1515 
   1516 		if (--retry == 0) {
   1517 			setno = mddb_getsetnum(recid);
   1518 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
   1519 				panic(
   1520 				    "md: Panic due to lack of DiskSuite state\n"
   1521 				    " database replicas. Fewer than 50%% of "
   1522 				    "the total were available,\n so panic to "
   1523 				    "ensure data integrity.");
   1524 			} else {
   1525 				panic("md: state database problem");
   1526 			}
   1527 			/*NOTREACHED*/
   1528 		}
   1529 	}
   1530 }
   1531 
   1532 void
   1533 mddb_commitrecs_wrapper(mddb_recid_t *recids)
   1534 {
   1535 	int sent_log = 0;
   1536 	uint_t retry = md_retry_cnt;
   1537 	set_t	setno;
   1538 
   1539 	while (mddb_commitrecs(recids)) {
   1540 		if (! sent_log) {
   1541 			cmn_err(CE_WARN,
   1542 			    "md: state database commit failed");
   1543 			sent_log = 1;
   1544 		}
   1545 		delay(md_hz);
   1546 
   1547 		/*
   1548 		 * Setting retry cnt to one (pre decremented) so that we
   1549 		 * actually do no retries when committing/deleting a mddb rec.
   1550 		 * The underlying disk driver does several retries to check
   1551 		 * if the disk is really dead or not so there
   1552 		 * is no reason for us to retry on top of the drivers retries.
   1553 		 */
   1554 
   1555 		if (--retry == 0) {
   1556 			/*
   1557 			 * since all the records are part of the same set
   1558 			 * use the first one to get setno
   1559 			 */
   1560 			setno = mddb_getsetnum(*recids);
   1561 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
   1562 				panic(
   1563 				    "md: Panic due to lack of DiskSuite state\n"
   1564 				    " database replicas. Fewer than 50%% of "
   1565 				    "the total were available,\n so panic to "
   1566 				    "ensure data integrity.");
   1567 			} else {
   1568 				panic("md: state database problem");
   1569 			}
   1570 			/*NOTREACHED*/
   1571 		}
   1572 	}
   1573 }
   1574 
   1575 void
   1576 mddb_deleterec_wrapper(mddb_recid_t recid)
   1577 {
   1578 	int sent_log = 0;
   1579 	uint_t retry = md_retry_cnt;
   1580 	set_t	setno;
   1581 
   1582 	while (mddb_deleterec(recid)) {
   1583 		if (! sent_log) {
   1584 			cmn_err(CE_WARN,
   1585 			    "md: state database delete failed");
   1586 			sent_log = 1;
   1587 		}
   1588 		delay(md_hz);
   1589 
   1590 		/*
   1591 		 * Setting retry cnt to one (pre decremented) so that we
   1592 		 * actually do no retries when committing/deleting a mddb rec.
   1593 		 * The underlying disk driver does several retries to check
   1594 		 * if the disk is really dead or not so there
   1595 		 * is no reason for us to retry on top of the drivers retries.
   1596 		 */
   1597 
   1598 		if (--retry == 0) {
   1599 			setno = mddb_getsetnum(recid);
   1600 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
   1601 				panic(
   1602 				    "md: Panic due to lack of DiskSuite state\n"
   1603 				    " database replicas. Fewer than 50%% of "
   1604 				    "the total were available,\n so panic to "
   1605 				    "ensure data integrity.");
   1606 			} else {
   1607 				panic("md: state database problem");
   1608 			}
   1609 			/*NOTREACHED*/
   1610 		}
   1611 	}
   1612 }
   1613 
   1614 /*
   1615  * md_holdset_enter is called in order to hold the set in its
   1616  * current state (loaded, unloaded, snarfed, unsnarfed, etc)
   1617  * until md_holdset_exit is called.  This is used by the mirror
   1618  * code to mark the set as HOLD so that the set won't be
   1619  * unloaded while hotspares are being allocated in check_4_hotspares.
   1620  * The original fix to the mirror code to hold the set was to call
   1621  * md_haltsnarf_enter, but this will block all ioctls and ioctls
   1622  * must work for a MN diskset while hotspares are allocated.
   1623  */
   1624 void
   1625 md_holdset_enter(set_t setno)
   1626 {
   1627 	mutex_enter(&md_mx);
   1628 	while (md_set[setno].s_status & MD_SET_HOLD)
   1629 		cv_wait(&md_cv, &md_mx);
   1630 	md_set[setno].s_status |= MD_SET_HOLD;
   1631 	mutex_exit(&md_mx);
   1632 }
   1633 
   1634 void
   1635 md_holdset_exit(set_t setno)
   1636 {
   1637 	mutex_enter(&md_mx);
   1638 	md_set[setno].s_status &= ~MD_SET_HOLD;
   1639 	cv_broadcast(&md_cv);
   1640 	mutex_exit(&md_mx);
   1641 }
   1642 
   1643 /*
   1644  * Returns a 0 if this thread marked the set as HOLD (success),
   1645  * returns a -1 if set was already marked HOLD (failure).
   1646  * Used by the release_set code to see if set is marked HOLD.
   1647  * HOLD is set by a daemon when hotspares are being allocated
   1648  * to mirror units.
   1649  */
   1650 int
   1651 md_holdset_testandenter(set_t setno)
   1652 {
   1653 	mutex_enter(&md_mx);
   1654 	if (md_set[setno].s_status & MD_SET_HOLD) {
   1655 		mutex_exit(&md_mx);
   1656 		return (-1);
   1657 	}
   1658 	md_set[setno].s_status |= MD_SET_HOLD;
   1659 	mutex_exit(&md_mx);
   1660 	return (0);
   1661 }
   1662 
   1663 void
   1664 md_haltsnarf_enter(set_t setno)
   1665 {
   1666 	mutex_enter(&md_mx);
   1667 	while (md_set[setno].s_status & MD_SET_SNARFING)
   1668 		cv_wait(&md_cv, &md_mx);
   1669 
   1670 	md_set[setno].s_status |= MD_SET_SNARFING;
   1671 	mutex_exit(&md_mx);
   1672 }
   1673 
   1674 void
   1675 md_haltsnarf_exit(set_t setno)
   1676 {
   1677 	mutex_enter(&md_mx);
   1678 	md_set[setno].s_status &= ~MD_SET_SNARFING;
   1679 	cv_broadcast(&md_cv);
   1680 	mutex_exit(&md_mx);
   1681 }
   1682 
   1683 void
   1684 md_haltsnarf_wait(set_t setno)
   1685 {
   1686 	mutex_enter(&md_mx);
   1687 	while (md_set[setno].s_status & MD_SET_SNARFING)
   1688 		cv_wait(&md_cv, &md_mx);
   1689 	mutex_exit(&md_mx);
   1690 }
   1691 
   1692 /*
   1693  * ASSUMED that the md_unit_array_rw WRITER lock is held.
   1694  */
   1695 int
   1696 md_halt_set(set_t setno, enum md_haltcmd cmd)
   1697 {
   1698 	int	i, err;
   1699 
   1700 	if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) {
   1701 		return (0);
   1702 	}
   1703 
   1704 	if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) {
   1705 		for (i = 0; i < MD_NOPS; i++) {
   1706 			if (md_ops[i] == NULL)
   1707 				continue;
   1708 			if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) {
   1709 				for (--i; i > 0; --i) {
   1710 					if (md_ops[i] == NULL)
   1711 						continue;
   1712 					(void) (*(md_ops[i]->md_halt))
   1713 					    (MD_HALT_OPEN, setno);
   1714 				}
   1715 				return (EBUSY);
   1716 			}
   1717 		}
   1718 
   1719 		for (i = 0; i < MD_NOPS; i++) {
   1720 			if (md_ops[i] == NULL)
   1721 				continue;
   1722 			if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) {
   1723 				for (i = 0; i < MD_NOPS; i++) {
   1724 					if (md_ops[i] == NULL)
   1725 						continue;
   1726 					(void) (*(md_ops[i]->md_halt))
   1727 					    (MD_HALT_OPEN, setno);
   1728 				}
   1729 				return (EBUSY);
   1730 			}
   1731 		}
   1732 	}
   1733 
   1734 	if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) {
   1735 		for (i = 0; i < MD_NOPS; i++) {
   1736 			if (md_ops[i] == NULL)
   1737 				continue;
   1738 			err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno);
   1739 			if (err != 0)
   1740 				cmn_err(CE_NOTE,
   1741 				    "md: halt failed for %s, error %d",
   1742 				    md_ops[i]->md_driver.md_drivername, err);
   1743 		}
   1744 
   1745 		/*
   1746 		 * Unload the devid namespace if it is loaded
   1747 		 */
   1748 		md_unload_namespace(setno, NM_DEVID);
   1749 		md_unload_namespace(setno, 0L);
   1750 		md_clr_setstatus(setno, MD_SET_SNARFED);
   1751 	}
   1752 
   1753 	return (0);
   1754 }
   1755 
   1756 int
   1757 md_halt(int global_locks_owned_mask)
   1758 {
   1759 	set_t			i, j;
   1760 	int			err;
   1761 	int			init_queues;
   1762 	md_requestq_entry_t	*rqp;
   1763 	md_ops_t		**pops, *ops, *lops;
   1764 	ddi_modhandle_t		mod;
   1765 	char			*name;
   1766 
   1767 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
   1768 
   1769 	/*
   1770 	 * Grab the all of the global locks that are not
   1771 	 * already owned to ensure that there isn't another
   1772 	 * thread trying to access a global resource
   1773 	 * while the halt is in progress
   1774 	 */
   1775 	if (md_global_lock_enter(global_locks_owned_mask) == EINTR)
   1776 		return (EINTR);
   1777 
   1778 	for (i = 0; i < md_nsets; i++)
   1779 		md_haltsnarf_enter(i);
   1780 
   1781 	/*
   1782 	 * Kill the daemon threads.
   1783 	 */
   1784 	init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE);
   1785 	md_clr_status(MD_GBL_DAEMONS_LIVE);
   1786 	md_set_status(MD_GBL_DAEMONS_DIE);
   1787 
   1788 	rqp = &md_daemon_queues[0];
   1789 	i = 0;
   1790 	while (!NULL_REQUESTQ_ENTRY(rqp)) {
   1791 		cv_broadcast(&rqp->dispq_headp->a_cv);
   1792 		rqp = &md_daemon_queues[++i];
   1793 	}
   1794 
   1795 	mutex_enter(&md_mx);
   1796 	while (md_num_daemons != 0) {
   1797 		mutex_exit(&md_mx);
   1798 		delay(md_hz);
   1799 		mutex_enter(&md_mx);
   1800 	}
   1801 	mutex_exit(&md_mx);
   1802 	md_clr_status(MD_GBL_DAEMONS_DIE);
   1803 
   1804 	for (i = 0; i < md_nsets; i++)
   1805 		/*
   1806 		 * Only call into md_halt_set if s_un / s_ui are both set.
   1807 		 * If they are NULL this set hasn't been accessed, so its
   1808 		 * pointless performing the call.
   1809 		 */
   1810 		if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
   1811 			if (md_halt_set(i, MD_HALT_CHECK)) {
   1812 				if (md_start_daemons(init_queues))
   1813 					cmn_err(CE_WARN,
   1814 					    "md: restart of daemon threads "
   1815 					    "failed");
   1816 				for (j = 0; j < md_nsets; j++)
   1817 					md_haltsnarf_exit(j);
   1818 
   1819 				return (md_global_lock_exit(
   1820 				    global_locks_owned_mask, EBUSY,
   1821 				    MD_ARRAY_WRITER, NULL));
   1822 			}
   1823 		}
   1824 
   1825 	/*
   1826 	 * if we get here we are going to do it
   1827 	 */
   1828 	for (i = 0; i < md_nsets; i++) {
   1829 		/*
   1830 		 * Only call into md_halt_set if s_un / s_ui are both set.
   1831 		 * If they are NULL this set hasn't been accessed, so its
   1832 		 * pointless performing the call.
   1833 		 */
   1834 		if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
   1835 			err = md_halt_set(i, MD_HALT_DOIT);
   1836 			if (err != 0)
   1837 				cmn_err(CE_NOTE,
   1838 				    "md: halt failed set %u, error %d",
   1839 				    (unsigned)i, err);
   1840 		}
   1841 	}
   1842 
   1843 	/*
   1844 	 * issue a halt unload to each module to indicate that it
   1845 	 * is about to be unloaded.  Each module is called once, set
   1846 	 * has no meaning at this point in time.
   1847 	 */
   1848 	for (i = 0; i < MD_NOPS; i++) {
   1849 		if (md_ops[i] == NULL)
   1850 			continue;
   1851 		err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0);
   1852 		if (err != 0)
   1853 			cmn_err(CE_NOTE,
   1854 			    "md: halt failed for %s, error %d",
   1855 			    md_ops[i]->md_driver.md_drivername, err);
   1856 	}
   1857 
   1858 	/* ddi_modclose the submodules */
   1859 	for (i = 0; i < MD_NOPS; i++) {
   1860 		/* skip if not open */
   1861 		if ((md_ops[i] == NULL) || (md_mods[i] == NULL))
   1862 			continue;
   1863 
   1864 		/* find and unlink from md_opslist */
   1865 		ops = md_ops[i];
   1866 		mod = md_mods[i];
   1867 		pops = &md_opslist;
   1868 		for (lops = *pops; lops;
   1869 		    pops = &lops->md_next, lops = *pops) {
   1870 			if (lops == ops) {
   1871 				*pops = ops->md_next;
   1872 				ops->md_next = NULL;
   1873 				break;
   1874 			}
   1875 		}
   1876 
   1877 		/* uninitialize */
   1878 		name = ops->md_driver.md_drivername;
   1879 		md_ops[i] = NULL;
   1880 		md_mods[i] = NULL;
   1881 		ops->md_selfindex = 0;
   1882 		ops->md_driver.md_drivername[0] = '\0';
   1883 		rw_destroy(&ops->md_link_rw.lock);
   1884 
   1885 		/* close */
   1886 		err = ddi_modclose(mod);
   1887 		if (err != 0)
   1888 			cmn_err(CE_NOTE,
   1889 			    "md: halt close failed for %s, error %d",
   1890 			    name ? name : "UNKNOWN", err);
   1891 	}
   1892 
   1893 	/* Unload the database */
   1894 	mddb_unload();
   1895 
   1896 	md_set_status(MD_GBL_HALTED);	/* we are ready to be unloaded */
   1897 
   1898 	for (i = 0; i < md_nsets; i++)
   1899 		md_haltsnarf_exit(i);
   1900 
   1901 	return (md_global_lock_exit(global_locks_owned_mask, 0,
   1902 	    MD_ARRAY_WRITER, NULL));
   1903 }
   1904 
   1905 /*
   1906  * md_layered_open() is an internal routine only for SVM modules.
   1907  * So the input device will be a md_dev64_t, because all SVM modules internally
   1908  * work with that device type.
   1909  * ddi routines on the other hand work with dev_t. So, if we call any ddi
   1910  * routines from here we first have to convert that device into a dev_t.
   1911  */
   1912 
   1913 int
   1914 md_layered_open(
   1915 	minor_t		mnum,
   1916 	md_dev64_t	*dev,
   1917 	int		md_oflags
   1918 )
   1919 {
   1920 	int		flag = (FREAD | FWRITE);
   1921 	cred_t		*cred_p = kcred;
   1922 	major_t		major;
   1923 	int		err;
   1924 	dev_t		ddi_dev = md_dev64_to_dev(*dev);
   1925 
   1926 	if (ddi_dev == NODEV)
   1927 		return (ENODEV);
   1928 
   1929 	major = getmajor(ddi_dev);
   1930 
   1931 	/* metadevice */
   1932 	if (major == md_major) {
   1933 		mdi_unit_t	*ui;
   1934 
   1935 		/* open underlying driver */
   1936 		mnum = getminor(ddi_dev);
   1937 
   1938 		ui = MDI_UNIT(mnum);
   1939 		if (md_ops[ui->ui_opsindex]->md_open != NULL) {
   1940 			int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev,
   1941 			    flag, OTYP_LYR, cred_p, md_oflags);
   1942 			/*
   1943 			 * As open() may change the device,
   1944 			 * send this info back to the caller.
   1945 			 */
   1946 			*dev = md_expldev(ddi_dev);
   1947 			return (ret);
   1948 		}
   1949 
   1950 		/* or do it ourselves */
   1951 		(void) md_unit_openclose_enter(ui);
   1952 		err = md_unit_incopen(mnum, flag, OTYP_LYR);
   1953 		md_unit_openclose_exit(ui);
   1954 		/* convert our ddi_dev back to the dev we were given */
   1955 		*dev = md_expldev(ddi_dev);
   1956 		return (err);
   1957 	}
   1958 
   1959 	/*
   1960 	 * Open regular device, since open() may change dev_t give new dev_t
   1961 	 * back to the caller.
   1962 	 */
   1963 	err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p);
   1964 	*dev = md_expldev(ddi_dev);
   1965 	return (err);
   1966 }
   1967 
   1968 /*
   1969  * md_layered_close() is an internal routine only for SVM modules.
   1970  * So the input device will be a md_dev64_t, because all SVM modules internally
   1971  * work with that device type.
   1972  * ddi routines on the other hand work with dev_t. So, if we call any ddi
   1973  * routines from here we first have to convert that device into a dev_t.
   1974  */
   1975 void
   1976 md_layered_close(
   1977 	md_dev64_t	dev,
   1978 	int		md_cflags
   1979 )
   1980 {
   1981 	int		flag = (FREAD | FWRITE);
   1982 	cred_t		*cred_p = kcred;
   1983 	dev_t		ddi_dev = md_dev64_to_dev(dev);
   1984 	major_t		major = getmajor(ddi_dev);
   1985 	minor_t		mnum = getminor(ddi_dev);
   1986 
   1987 	/* metadevice */
   1988 	if (major == md_major) {
   1989 		mdi_unit_t	*ui = MDI_UNIT(mnum);
   1990 
   1991 		/* close underlying driver */
   1992 		if (md_ops[ui->ui_opsindex]->md_close != NULL) {
   1993 			(*md_ops[ui->ui_opsindex]->md_close)
   1994 			    (ddi_dev, flag, OTYP_LYR, cred_p, md_cflags);
   1995 			return;
   1996 		}
   1997 
   1998 		/* or do it ourselves */
   1999 		(void) md_unit_openclose_enter(ui);
   2000 		(void) md_unit_decopen(mnum, OTYP_LYR);
   2001 		md_unit_openclose_exit(ui);
   2002 		return;
   2003 	}
   2004 
   2005 	/* close regular device */
   2006 	(void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p);
   2007 }
   2008 
   2009 /*
   2010  * saves a little code in mdstrategy
   2011  */
   2012 int
   2013 errdone(mdi_unit_t *ui, struct buf *bp, int err)
   2014 {
   2015 	if ((bp->b_error = err) != 0)
   2016 		bp->b_flags |= B_ERROR;
   2017 	else
   2018 		bp->b_resid = bp->b_bcount;
   2019 	md_unit_readerexit(ui);
   2020 	md_biodone(bp);
   2021 	return (1);
   2022 }
   2023 
   2024 static int	md_write_label = 0;
   2025 
   2026 int
   2027 md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp)
   2028 {
   2029 	diskaddr_t endblk;
   2030 	set_t	setno = MD_UN2SET(un);
   2031 
   2032 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
   2033 	    (! (bp->b_flags & B_READ)))
   2034 		return (errdone(ui, bp, EROFS));
   2035 	/*
   2036 	 * Check early for unreasonable block number.
   2037 	 *
   2038 	 * b_blkno is defined as adaddr_t which is typedef'd to a long.
   2039 	 * A problem occurs if b_blkno has bit 31 set and un_total_blocks
   2040 	 * doesn't, b_blkno is then compared as a negative number which is
   2041 	 * always less than a positive.
   2042 	 */
   2043 	if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks)
   2044 		return (errdone(ui, bp, EINVAL));
   2045 
   2046 	if (bp->b_lblkno == un->c.un_total_blocks)
   2047 		return (errdone(ui, bp, 0));
   2048 
   2049 	/*
   2050 	 * make sure we don't clobber any labels
   2051 	 */
   2052 	if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) &&
   2053 	    (un->c.un_flag & MD_LABELED) && (! md_write_label)) {
   2054 		cmn_err(CE_NOTE, "md: %s: write to label",
   2055 		    md_shortname(getminor(bp->b_edev)));
   2056 		return (errdone(ui, bp, EINVAL));
   2057 	}
   2058 
   2059 	bp->b_resid = 0;
   2060 	endblk = (diskaddr_t)(bp->b_lblkno +
   2061 	    howmany(bp->b_bcount, DEV_BSIZE) - 1);
   2062 
   2063 	if (endblk > (un->c.un_total_blocks - 1)) {
   2064 		bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1));
   2065 		endblk = un->c.un_total_blocks - 1;
   2066 		bp->b_bcount -= bp->b_resid;
   2067 	}
   2068 	return (0);
   2069 }
   2070 
   2071 /*
   2072  * init_request_queue: initializes the request queues and creates the threads.
   2073  *	return value =  0  :invalid num_threads
   2074  *		     =  n   : n is the number of threads created.
   2075  */
   2076 
   2077 int
   2078 init_requestq(
   2079 	md_requestq_entry_t *rq, /* request queue info */
   2080 	void (*threadfn)(),	 /* function to start the thread */
   2081 	caddr_t threadfn_args,	 /* args to the function */
   2082 	int pri,		 /* thread priority */
   2083 	int init_queue)		 /* flag to init queues */
   2084 {
   2085 	struct mdq_anchor *rqhead;
   2086 	int	i;
   2087 	int	num_threads;
   2088 
   2089 
   2090 	num_threads = *(rq->num_threadsp);
   2091 	rqhead = rq->dispq_headp;
   2092 
   2093 	if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0)
   2094 		return (0);
   2095 
   2096 	if (init_queue) {
   2097 		rqhead->dq.maxq_len = 0;
   2098 		rqhead->dq.treqs = 0;
   2099 		rqhead->dq.dq_next = &rqhead->dq;
   2100 		rqhead->dq.dq_prev = &rqhead->dq;
   2101 		cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL);
   2102 		mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL);
   2103 	}
   2104 	for (i = 0; i < num_threads; i++) {
   2105 		(void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0,
   2106 		    TS_RUN, pri);
   2107 	}
   2108 	return (i);
   2109 }
   2110 
   2111 static void
   2112 start_daemon(struct mdq_anchor *q)
   2113 {
   2114 	md_daemon(0, q);
   2115 	ASSERT(0);
   2116 }
   2117 
   2118 /*
   2119  * Creates all the md daemons.
   2120  * Global:
   2121  *	md_num_daemons is set to number of daemons.
   2122  *	MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active.
   2123  *
   2124  * Return value: 0  success
   2125  *		 1  failure
   2126  */
   2127 int
   2128 md_start_daemons(int init_queue)
   2129 {
   2130 	md_requestq_entry_t	*rqp;
   2131 	int	cnt;
   2132 	int	i;
   2133 	int	retval = 0;
   2134 
   2135 
   2136 	if (md_get_status() & MD_GBL_DAEMONS_LIVE) {
   2137 		return (retval);
   2138 	}
   2139 	md_clr_status(MD_GBL_DAEMONS_DIE);
   2140 
   2141 	rqp = &md_daemon_queues[0];
   2142 	i = 0;
   2143 	while (!NULL_REQUESTQ_ENTRY(rqp)) {
   2144 		cnt = init_requestq(rqp, start_daemon,
   2145 		    (caddr_t)rqp->dispq_headp, minclsyspri, init_queue);
   2146 
   2147 		if (cnt && cnt != *rqp->num_threadsp) {
   2148 			retval = 1;
   2149 			break;
   2150 		}
   2151 		/*
   2152 		 * initialize variables
   2153 		 */
   2154 		md_num_daemons += cnt;
   2155 		rqp = &md_daemon_queues[++i];
   2156 	}
   2157 
   2158 	md_set_status(MD_GBL_DAEMONS_LIVE);
   2159 	return (retval);
   2160 }
   2161 
   2162 int
   2163 md_loadsubmod(set_t setno, char *name, int drvrid)
   2164 {
   2165 	ddi_modhandle_t	mod;
   2166 	md_ops_t	**pops, *ops;
   2167 	int		i, err;
   2168 
   2169 	/*
   2170 	 * See if the submodule is mdopened. If not, i is the index of the
   2171 	 * next empty slot.
   2172 	 */
   2173 	for (i = 0; md_ops[i] != NULL; i++) {
   2174 		if (strncmp(name, md_ops[i]->md_driver.md_drivername,
   2175 		    MD_DRIVERNAMELEN) == 0)
   2176 			return (i);
   2177 
   2178 		if (i == (MD_NOPS - 1))
   2179 			return (-1);
   2180 	}
   2181 
   2182 	if (drvrid < 0) {
   2183 		/* Do not try to add any records to the DB when stale. */
   2184 		if (md_get_setstatus(setno) & MD_SET_STALE)
   2185 			return (-1);
   2186 		drvrid = md_setshared_name(setno, name, 0L);
   2187 	}
   2188 
   2189 	if (drvrid < 0)
   2190 		return (-1);
   2191 
   2192 	/* open and import the md_ops of the submodules */
   2193 	mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err);
   2194 	if (mod == NULL) {
   2195 		cmn_err(CE_WARN, "md_loadsubmod: "
   2196 		    "unable to ddi_modopen %s, error %d\n", name, err);
   2197 		return (-1);
   2198 	}
   2199 	pops = ddi_modsym(mod, "md_interface_ops", &err);
   2200 	if (pops == NULL) {
   2201 		cmn_err(CE_WARN, "md_loadsubmod: "
   2202 		    "unable to import md_interface_ops from %s, error %d\n",
   2203 		    name, err);
   2204 		(void) ddi_modclose(mod);
   2205 		return (-1);
   2206 	}
   2207 
   2208 	/* ddi_modsym returns pointer to md_interface_ops in submod */
   2209 	ops = *pops;
   2210 
   2211 	/* initialize */
   2212 	ops->md_selfindex = i;
   2213 	rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL);
   2214 	(void) strncpy(ops->md_driver.md_drivername, name,
   2215 	    MD_DRIVERNAMELEN);
   2216 
   2217 	/* plumb */
   2218 	md_ops[i] = ops;
   2219 	md_mods[i] = mod;
   2220 	ops->md_next = md_opslist;
   2221 	md_opslist = ops;
   2222 
   2223 	/* return index */
   2224 	return (i);
   2225 }
   2226 
   2227 int
   2228 md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired)
   2229 {
   2230 	int	i;
   2231 	int	modindex;
   2232 	char	*name = driver->md_drivername;
   2233 	set_t	setno = driver->md_setno;
   2234 	int	drvid;
   2235 	int	local_dont_load;
   2236 
   2237 	if (setno >= md_nsets)
   2238 		return (-1);
   2239 
   2240 	for (i = 0; name[i] != 0; i++)
   2241 		if (i == (MD_DRIVERNAMELEN -1))
   2242 			return (-1);
   2243 
   2244 	/*
   2245 	 * If set is STALE, set local_dont_load to 1 since no records
   2246 	 * should be added to DB when stale.
   2247 	 */
   2248 	if (md_get_setstatus(setno) & MD_SET_STALE) {
   2249 		local_dont_load = 1;
   2250 	} else {
   2251 		local_dont_load = dont_load;
   2252 	}
   2253 
   2254 	/*
   2255 	 * Single thread ioctl module binding with respect to
   2256 	 * similar code executed in md_loadsubmod that is called
   2257 	 * from md_snarf_db_set (which is where that path does
   2258 	 * its md_haltsnarf_enter call).
   2259 	 */
   2260 	md_haltsnarf_enter(setno);
   2261 
   2262 	/* See if the submodule is already ddi_modopened. */
   2263 	for (i = 0; md_ops[i] != NULL; i++) {
   2264 		if (strncmp(name, md_ops[i]->md_driver.md_drivername,
   2265 		    MD_DRIVERNAMELEN) == 0) {
   2266 			if (! local_dont_load &&
   2267 			    (md_getshared_key(setno, name) == MD_KEYBAD)) {
   2268 				if (md_setshared_name(setno, name, 0L)
   2269 				    == MD_KEYBAD) {
   2270 					if (!db_notrequired)
   2271 						goto err;
   2272 				}
   2273 			}
   2274 			md_haltsnarf_exit(setno);
   2275 			return (i);
   2276 		}
   2277 
   2278 		if (i == (MD_NOPS -1))
   2279 			break;
   2280 	}
   2281 
   2282 	if (local_dont_load)
   2283 		goto err;
   2284 
   2285 	drvid = ((db_notrequired) ? 0 : (int)md_getshared_key(setno, name));
   2286 
   2287 	/* ddi_modopen the submodule */
   2288 	modindex = md_loadsubmod(setno, name, drvid);
   2289 	if (modindex < 0)
   2290 		goto err;
   2291 
   2292 	if (md_ops[modindex]->md_snarf != NULL)
   2293 		(*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno);
   2294 
   2295 	md_haltsnarf_exit(setno);
   2296 	return (modindex);
   2297 
   2298 err:	md_haltsnarf_exit(setno);
   2299 	return (-1);
   2300 }
   2301 
   2302 void
   2303 md_call_strategy(buf_t *bp, int flags, void *private)
   2304 {
   2305 	mdi_unit_t	*ui;
   2306 
   2307 	if (mdv_strategy_tstpnt)
   2308 		if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0)
   2309 			return;
   2310 	if (getmajor(bp->b_edev) != md_major) {
   2311 		(void) bdev_strategy(bp);
   2312 		return;
   2313 	}
   2314 
   2315 	flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP;
   2316 	ui = MDI_UNIT(getminor(bp->b_edev));
   2317 	ASSERT(ui != NULL);
   2318 	(*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private);
   2319 }
   2320 
   2321 /*
   2322  * md_call_ioctl:
   2323  * -------------
   2324  * Issue the specified ioctl to the device associated with the given md_dev64_t
   2325  *
   2326  * Arguments:
   2327  *	dev	- underlying device [md_dev64_t]
   2328  *	cmd	- ioctl to perform
   2329  *	data	- arguments / result location
   2330  *	mode	- read/write/layered ioctl
   2331  *	lockp	- lock reference
   2332  *
   2333  * Returns:
   2334  *	0	success
   2335  *	!=0	Failure (error code)
   2336  */
   2337 int
   2338 md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp)
   2339 {
   2340 	dev_t		device = md_dev64_to_dev(dev);
   2341 	int		rval;
   2342 	mdi_unit_t	*ui;
   2343 
   2344 	/*
   2345 	 * See if device is a metadevice. If not call cdev_ioctl(), otherwise
   2346 	 * call the ioctl entry-point in the metadevice.
   2347 	 */
   2348 	if (md_getmajor(dev) != md_major) {
   2349 		int	rv;
   2350 		rval = cdev_ioctl(device, cmd, (intptr_t)data, mode,
   2351 		    ddi_get_cred(), &rv);
   2352 	} else {
   2353 		ui = MDI_UNIT(md_getminor(dev));
   2354 		ASSERT(ui != NULL);
   2355 		rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data,
   2356 		    mode, lockp);
   2357 	}
   2358 	return (rval);
   2359 }
   2360 
   2361 void
   2362 md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head)
   2363 {
   2364 	md_link_t	*next;
   2365 	md_link_t	**pprev;
   2366 
   2367 	rw_enter(rw, RW_WRITER);
   2368 
   2369 	next = *head;
   2370 	pprev = head;
   2371 	while (next) {
   2372 		if ((next->ln_setno == setno) && (next->ln_id == id)) {
   2373 			*pprev = next->ln_next;
   2374 			rw_exit(rw);
   2375 			return;
   2376 		}
   2377 		pprev = &next->ln_next;
   2378 		next = next->ln_next;
   2379 	}
   2380 
   2381 	rw_exit(rw);
   2382 }
   2383 
   2384 int
   2385 md_dev_exists(md_dev64_t dev)
   2386 {
   2387 
   2388 	if (dev == NODEV64)
   2389 		return (0);
   2390 
   2391 	if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0)
   2392 		return (1);
   2393 
   2394 	if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
   2395 	    (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
   2396 		return (0);
   2397 
   2398 	if (MDI_UNIT(md_getminor(dev)) != NULL)
   2399 		return (1);
   2400 
   2401 	return (0);
   2402 }
   2403 
   2404 md_parent_t
   2405 md_get_parent(md_dev64_t dev)
   2406 {
   2407 	md_unit_t	*un;
   2408 	mdi_unit_t	*ui;
   2409 	md_parent_t	parent;
   2410 
   2411 	if (md_getmajor(dev) != md_major)
   2412 		return (MD_NO_PARENT);
   2413 
   2414 	ui = MDI_UNIT(md_getminor(dev));
   2415 
   2416 	un = (md_unit_t *)md_unit_readerlock(ui);
   2417 	parent = un->c.un_parent;
   2418 	md_unit_readerexit(ui);
   2419 
   2420 	return (parent);
   2421 }
   2422 
   2423 void
   2424 md_set_parent(md_dev64_t dev, md_parent_t parent)
   2425 {
   2426 	md_unit_t	*un;
   2427 	mdi_unit_t	*ui;
   2428 
   2429 	if (md_getmajor(dev) != md_major)
   2430 		return;
   2431 
   2432 	ui = MDI_UNIT(md_getminor(dev));
   2433 
   2434 	un = (md_unit_t *)md_unit_readerlock(ui);
   2435 	un->c.un_parent = parent;
   2436 	md_unit_readerexit(ui);
   2437 }
   2438 
   2439 void
   2440 md_reset_parent(md_dev64_t dev)
   2441 {
   2442 	md_unit_t	*un;
   2443 	mdi_unit_t	*ui;
   2444 
   2445 	if (md_getmajor(dev) != md_major)
   2446 		return;
   2447 
   2448 	ui = MDI_UNIT(md_getminor(dev));
   2449 
   2450 	un = (md_unit_t *)md_unit_readerlock(ui);
   2451 	un->c.un_parent = MD_NO_PARENT;
   2452 	md_unit_readerexit(ui);
   2453 }
   2454 
   2455 
   2456 static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL;
   2457 
   2458 int
   2459 md_hot_spare_ifc(
   2460 	hs_cmds_t	cmd,
   2461 	mddb_recid_t	id,
   2462 	u_longlong_t	size,
   2463 	int		labeled,
   2464 	mddb_recid_t	*hs_id,
   2465 	mdkey_t		*key,
   2466 	md_dev64_t	*dev,
   2467 	diskaddr_t	*sblock)
   2468 {
   2469 	int		err;
   2470 
   2471 	/*
   2472 	 * RW lock on hot_spare_interface. We don't want it to change from
   2473 	 * underneath us. If hot_spare_interface is NULL we're going to
   2474 	 * need to set it. So we need to upgrade to a WRITER lock. If that
   2475 	 * doesn't work, we drop the lock and reenter as WRITER. This leaves
   2476 	 * a small hole during which hot_spare_interface could be modified
   2477 	 * so we check it for NULL again. What a pain. Then if still null
   2478 	 * load from md_get_named_service.
   2479 	 */
   2480 
   2481 	rw_enter(&hsp_rwlp.lock, RW_READER);
   2482 	if (hot_spare_interface == NULL) {
   2483 		if (rw_tryupgrade(&hsp_rwlp.lock) == 0) {
   2484 			rw_exit(&hsp_rwlp.lock);
   2485 			rw_enter(&hsp_rwlp.lock, RW_WRITER);
   2486 			if (hot_spare_interface != NULL) {
   2487 				err = ((*hot_spare_interface)
   2488 				    (cmd, id, size, labeled, hs_id, key, dev,
   2489 				    sblock));
   2490 				rw_exit(&hsp_rwlp.lock);
   2491 				return (err);
   2492 			}
   2493 		}
   2494 		hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE,
   2495 		    "hot spare interface", 0);
   2496 		rw_downgrade(&hsp_rwlp.lock);
   2497 	}
   2498 
   2499 	if (hot_spare_interface == NULL) {
   2500 		cmn_err(CE_WARN, "md: no hotspare interface");
   2501 		rw_exit(&hsp_rwlp.lock);
   2502 		return (0);
   2503 	}
   2504 
   2505 	err = ((*hot_spare_interface)
   2506 	    (cmd, id, size, labeled, hs_id, key, dev, sblock));
   2507 	rw_exit(&hsp_rwlp.lock);
   2508 	return (err);
   2509 }
   2510 
   2511 void
   2512 md_clear_hot_spare_interface()
   2513 {
   2514 	rw_enter(&hsp_rwlp.lock, RW_WRITER);
   2515 	hot_spare_interface = NULL;
   2516 	rw_exit(&hsp_rwlp.lock);
   2517 }
   2518 
   2519 
   2520 static intptr_t (*notify_interface)() = (intptr_t (*)())NULL;
   2521 
   2522 int
   2523 md_notify_interface(
   2524 	md_event_cmds_t cmd,
   2525 	md_tags_t	tag,
   2526 	set_t		set,
   2527 	md_dev64_t	dev,
   2528 	md_event_type_t event
   2529 )
   2530 {
   2531 	int		err;
   2532 
   2533 	if (md_event_queue == NULL)
   2534 		return (0);
   2535 	rw_enter(&ni_rwlp.lock, RW_READER);
   2536 	if (notify_interface == NULL) {
   2537 		if (rw_tryupgrade(&ni_rwlp.lock) == 0) {
   2538 			rw_exit(&ni_rwlp.lock);
   2539 			rw_enter(&ni_rwlp.lock, RW_WRITER);
   2540 			if (notify_interface != NULL) {
   2541 				err = ((*notify_interface)
   2542 				    (cmd, tag, set, dev, event));
   2543 				rw_exit(&ni_rwlp.lock);
   2544 				return (err);
   2545 			}
   2546 		}
   2547 		notify_interface = md_get_named_service(NODEV64, ANY_SERVICE,
   2548 		    "notify interface", 0);
   2549 		rw_downgrade(&ni_rwlp.lock);
   2550 	}
   2551 	if (notify_interface == NULL) {
   2552 		cmn_err(CE_WARN, "md: no notify interface");
   2553 		rw_exit(&ni_rwlp.lock);
   2554 		return (0);
   2555 	}
   2556 	err = ((*notify_interface)(cmd, tag, set, dev, event));
   2557 	rw_exit(&ni_rwlp.lock);
   2558 	return (err);
   2559 }
   2560 
   2561 char *
   2562 obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev)
   2563 {
   2564 	char		*setname;
   2565 	char		name[MD_MAX_CTDLEN];
   2566 	minor_t		mnum = md_getminor(dev);
   2567 	major_t		maj = md_getmajor(dev);
   2568 	int		rtn = 0;
   2569 
   2570 	/*
   2571 	 * Verify that the passed dev_t refers to a valid metadevice.
   2572 	 * If it doesn't we can make no assumptions as to what the device
   2573 	 * name is. Return NULL in these cases.
   2574 	 */
   2575 	if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) ||
   2576 	    (MD_MIN2SET(mnum) >= md_nsets)) {
   2577 		return (NULL);
   2578 	}
   2579 
   2580 	setname = NULL;
   2581 	name[0] = '\0';
   2582 	switch (tag) {
   2583 	case SVM_TAG_HSP:
   2584 		if (setno == 0) {
   2585 			rtn = snprintf(name, sizeof (name), "hsp%u",
   2586 			    (unsigned)MD_MIN2UNIT(mnum));
   2587 		} else {
   2588 			setname = mddb_getsetname(setno);
   2589 			if (setname != NULL) {
   2590 				rtn = snprintf(name, sizeof (name), "%s/hsp%u",
   2591 				    setname, (unsigned)MD_MIN2UNIT(mnum));
   2592 			}
   2593 		}
   2594 		break;
   2595 	case SVM_TAG_DRIVE:
   2596 		(void) sprintf(name, "drive");
   2597 		break;
   2598 	case SVM_TAG_HOST:
   2599 		(void) sprintf(name, "host");
   2600 		break;
   2601 	case SVM_TAG_SET:
   2602 		rtn = snprintf(name, sizeof (name), "%s",
   2603 		    mddb_getsetname(setno));
   2604 		if ((name[0] == '\0') || (rtn >= sizeof (name))) {
   2605 			(void) sprintf(name, "diskset");
   2606 			rtn = 0;
   2607 		}
   2608 		break;
   2609 	default:
   2610 		rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum));
   2611 		break;
   2612 	}
   2613 
   2614 	/* Check if we got any rubbish for any of the snprintf's */
   2615 	if ((name[0] == '\0') || (rtn >= sizeof (name))) {
   2616 		return (NULL);
   2617 	}
   2618 
   2619 	return (md_strdup(name));
   2620 }
   2621 
   2622 /* Sysevent subclass and mdnotify event type pairs */
   2623 struct node {
   2624 	char		*se_ev;
   2625 	md_event_type_t	md_ev;
   2626 };
   2627 
   2628 /*
   2629  * Table must be sorted in case sensitive ascending order of
   2630  * the sysevents values
   2631  */
   2632 static struct node ev_table[] = {
   2633 	{ ESC_SVM_ADD,			EQ_ADD },
   2634 	{ ESC_SVM_ATTACH,		EQ_ATTACH },
   2635 	{ ESC_SVM_ATTACHING,		EQ_ATTACHING },
   2636 	{ ESC_SVM_CHANGE,		EQ_CHANGE },
   2637 	{ ESC_SVM_CREATE,		EQ_CREATE },
   2638 	{ ESC_SVM_DELETE,		EQ_DELETE },
   2639 	{ ESC_SVM_DETACH,		EQ_DETACH },
   2640 	{ ESC_SVM_DETACHING,		EQ_DETACHING },
   2641 	{ ESC_SVM_DRIVE_ADD,		EQ_DRIVE_ADD },
   2642 	{ ESC_SVM_DRIVE_DELETE,		EQ_DRIVE_DELETE },
   2643 	{ ESC_SVM_ENABLE,		EQ_ENABLE },
   2644 	{ ESC_SVM_ERRED,		EQ_ERRED },
   2645 	{ ESC_SVM_EXCHANGE,		EQ_EXCHANGE },
   2646 	{ ESC_SVM_GROW,			EQ_GROW },
   2647 	{ ESC_SVM_HS_CHANGED,		EQ_HS_CHANGED },
   2648 	{ ESC_SVM_HS_FREED,		EQ_HS_FREED },
   2649 	{ ESC_SVM_HOST_ADD,		EQ_HOST_ADD },
   2650 	{ ESC_SVM_HOST_DELETE,		EQ_HOST_DELETE },
   2651 	{ ESC_SVM_HOTSPARED,		EQ_HOTSPARED },
   2652 	{ ESC_SVM_INIT_FAILED,		EQ_INIT_FAILED },
   2653 	{ ESC_SVM_INIT_FATAL,		EQ_INIT_FATAL },
   2654 	{ ESC_SVM_INIT_START,		EQ_INIT_START },
   2655 	{ ESC_SVM_INIT_SUCCESS,		EQ_INIT_SUCCESS },
   2656 	{ ESC_SVM_IOERR,		EQ_IOERR },
   2657 	{ ESC_SVM_LASTERRED,		EQ_LASTERRED },
   2658 	{ ESC_SVM_MEDIATOR_ADD,		EQ_MEDIATOR_ADD },
   2659 	{ ESC_SVM_MEDIATOR_DELETE,	EQ_MEDIATOR_DELETE },
   2660 	{ ESC_SVM_OFFLINE,		EQ_OFFLINE },
   2661 	{ ESC_SVM_OK,			EQ_OK },
   2662 	{ ESC_SVM_ONLINE,		EQ_ONLINE },
   2663 	{ ESC_SVM_OPEN_FAIL,		EQ_OPEN_FAIL },
   2664 	{ ESC_SVM_REGEN_DONE,		EQ_REGEN_DONE },
   2665 	{ ESC_SVM_REGEN_FAILED,		EQ_REGEN_FAILED },
   2666 	{ ESC_SVM_REGEN_START,		EQ_REGEN_START },
   2667 	{ ESC_SVM_RELEASE,		EQ_RELEASE },
   2668 	{ ESC_SVM_REMOVE,		EQ_REMOVE },
   2669 	{ ESC_SVM_RENAME_DST,		EQ_RENAME_DST },
   2670 	{ ESC_SVM_RENAME_SRC,		EQ_RENAME_SRC },
   2671 	{ ESC_SVM_REPLACE,		EQ_REPLACE },
   2672 	{ ESC_SVM_RESYNC_DONE,		EQ_RESYNC_DONE },
   2673 	{ ESC_SVM_RESYNC_FAILED,	EQ_RESYNC_FAILED },
   2674 	{ ESC_SVM_RESYNC_START,		EQ_RESYNC_START },
   2675 	{ ESC_SVM_RESYNC_SUCCESS,	EQ_RESYNC_SUCCESS },
   2676 	{ ESC_SVM_TAKEOVER,		EQ_TAKEOVER }
   2677 };
   2678 
   2679 static md_tags_t md_tags[] = {
   2680 	TAG_UNK,
   2681 	TAG_METADEVICE,
   2682 	TAG_UNK,
   2683 	TAG_UNK,
   2684 	TAG_UNK,
   2685 	TAG_UNK,
   2686 	TAG_REPLICA,
   2687 	TAG_HSP,
   2688 	TAG_HS,
   2689 	TAG_SET,
   2690 	TAG_DRIVE,
   2691 	TAG_HOST,
   2692 	TAG_MEDIATOR
   2693 };
   2694 
   2695 md_event_type_t
   2696 ev_get(char *subclass)
   2697 {
   2698 	int	high, mid, low, p;
   2699 
   2700 	low = 0;
   2701 	high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1;
   2702 	while (low <= high) {
   2703 		mid = (high + low) / 2;
   2704 		p = strcmp(subclass, ev_table[mid].se_ev);
   2705 		if (p == 0) {
   2706 			return (ev_table[mid].md_ev);
   2707 		} else if (p < 0) {
   2708 			high = mid - 1;
   2709 		} else {
   2710 			low = mid + 1;
   2711 		}
   2712 	}
   2713 
   2714 	return (EQ_EMPTY);
   2715 }
   2716 
   2717 /*
   2718  * Log mdnotify event
   2719  */
   2720 void
   2721 do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid)
   2722 {
   2723 	md_event_type_t	ev_type;
   2724 	md_tags_t	md_tag;
   2725 
   2726 	/* Translate sysevent into mdnotify event */
   2727 	ev_type = ev_get(se_subclass);
   2728 
   2729 	if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) {
   2730 		md_tag = TAG_UNK;
   2731 	} else {
   2732 		md_tag = md_tags[tag];
   2733 	}
   2734 
   2735 	NOTIFY_MD(md_tag, setno, devid, ev_type);
   2736 }
   2737 
   2738 /*
   2739  * Log SVM sys events
   2740  */
   2741 void
   2742 svm_gen_sysevent(
   2743 	char		*se_class,
   2744 	char		*se_subclass,
   2745 	uint32_t	tag,
   2746 	set_t		setno,
   2747 	md_dev64_t	devid
   2748 )
   2749 {
   2750 	nvlist_t		*attr_list;
   2751 	sysevent_id_t		eid;
   2752 	int			err = DDI_SUCCESS;
   2753 	char			*devname;
   2754 	extern dev_info_t	*md_devinfo;
   2755 
   2756 	/* Raise the mdnotify event before anything else */
   2757 	do_mdnotify(se_subclass, tag, setno, devid);
   2758 
   2759 	if (md_devinfo == NULL) {
   2760 		return;
   2761 	}
   2762 
   2763 	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP);
   2764 
   2765 	if (err == DDI_SUCCESS) {
   2766 		/* Add the version numver */
   2767 		err = nvlist_add_uint32(attr_list, SVM_VERSION_NO,
   2768 		    (uint32_t)SVM_VERSION);
   2769 		if (err != DDI_SUCCESS) {
   2770 			goto fail;
   2771 		}
   2772 
   2773 		/* Add the tag attribute */
   2774 		err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag);
   2775 		if (err != DDI_SUCCESS) {
   2776 			goto fail;
   2777 		}
   2778 
   2779 		/* Add the set number attribute */
   2780 		err = nvlist_add_uint32(attr_list, SVM_SET_NO, (uint32_t)setno);
   2781 		if (err != DDI_SUCCESS) {
   2782 			goto fail;
   2783 		}
   2784 
   2785 		/* Add the device id attribute */
   2786 		err = nvlist_add_uint64(attr_list, SVM_DEV_ID, (uint64_t)devid);
   2787 		if (err != DDI_SUCCESS) {
   2788 			goto fail;
   2789 		}
   2790 
   2791 		/* Add the device name attribute */
   2792 		devname = obj2devname(tag, setno, devid);
   2793 		if (devname != NULL) {
   2794 			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
   2795 			    devname);
   2796 			freestr(devname);
   2797 		} else {
   2798 			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
   2799 			    "unspecified");
   2800 		}
   2801 		if (err != DDI_SUCCESS) {
   2802 			goto fail;
   2803 		}
   2804 
   2805 		/* Attempt to post event */
   2806 		err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class,
   2807 		    se_subclass, attr_list, &eid, DDI_SLEEP);
   2808 
   2809 		nvlist_free(attr_list);
   2810 		if (err != DDI_SUCCESS) {
   2811 			cmn_err(CE_WARN, "Failed to log event for %s, %s,"
   2812 			    " err=%x", se_class, se_subclass, err);
   2813 		}
   2814 	}
   2815 
   2816 	return;
   2817 
   2818 fail:
   2819 	nvlist_free(attr_list);
   2820 	cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x",
   2821 	    se_class, se_subclass, err);
   2822 }
   2823 
   2824 void
   2825 md_clear_named_service()
   2826 {
   2827 	rw_enter(&ni_rwlp.lock, RW_WRITER);
   2828 	notify_interface = NULL;
   2829 	rw_exit(&ni_rwlp.lock);
   2830 }
   2831 
   2832 void
   2833 md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock)
   2834 {
   2835 	mdi_unit_t	*ui;
   2836 	set_t		setno = MD_MIN2SET(mnum);
   2837 
   2838 	ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP);
   2839 	ui->ui_opsindex = ops->md_selfindex;
   2840 
   2841 	/* initialize all the incore conditional variables */
   2842 	mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
   2843 	cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);
   2844 
   2845 	if (alloc_lock) {
   2846 		ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
   2847 		mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
   2848 		cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
   2849 		mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
   2850 		    MUTEX_DEFAULT, NULL);
   2851 		ui->ui_io_lock->io_list_front = NULL;
   2852 		ui->ui_io_lock->io_list_back = NULL;
   2853 	}
   2854 	if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
   2855 		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
   2856 		MDI_VOIDUNIT(mnum) = (void *) ui;
   2857 		rw_exit(&md_unit_array_rw.lock);
   2858 	} else
   2859 		MDI_VOIDUNIT(mnum) = (void *) ui;
   2860 
   2861 	rw_enter(&ops->md_link_rw.lock, RW_WRITER);
   2862 	ui->ui_link.ln_next = ops->md_head;
   2863 	ui->ui_link.ln_setno = setno;
   2864 	ui->ui_link.ln_id = mnum;
   2865 	ops->md_head = &ui->ui_link;
   2866 	/* setup the unavailable field */
   2867 #if defined(_ILP32)
   2868 	if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
   2869 		ui->ui_tstate |= MD_64MD_ON_32KERNEL;
   2870 		cmn_err(CE_NOTE, "d%d is unavailable because 64 bit "
   2871 		    "metadevices are not accessible on a 32 bit kernel",
   2872 		    mnum);
   2873 	}
   2874 #endif
   2875 
   2876 	rw_exit(&ops->md_link_rw.lock);
   2877 }
   2878 
   2879 void
   2880 md_destroy_unit_incore(minor_t mnum, md_ops_t *ops)
   2881 {
   2882 	mdi_unit_t	*ui;
   2883 
   2884 	/*
   2885 	 * ASSUMPTION: md_unit_array_rw WRITER lock is held.
   2886 	 */
   2887 	ui = MDI_UNIT(mnum);
   2888 	if (ui == NULL)
   2889 		return;
   2890 
   2891 	md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock,
   2892 	    &ops->md_head);
   2893 
   2894 	/* destroy the io lock if one is being used */
   2895 	if (ui->ui_io_lock) {
   2896 		mutex_destroy(&ui->ui_io_lock->io_mx);
   2897 		cv_destroy(&ui->ui_io_lock->io_cv);
   2898 		kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t));
   2899 	}
   2900 
   2901 	/* teardown kstat */
   2902 	md_kstat_destroy(mnum);
   2903 
   2904 	/* destroy all the incore conditional variables */
   2905 	mutex_destroy(&ui->ui_mx);
   2906 	cv_destroy(&ui->ui_cv);
   2907 
   2908 	kmem_free(ui, sizeof (mdi_unit_t));
   2909 	MDI_VOIDUNIT(mnum) = (void *) NULL;
   2910 }
   2911 
   2912 void
   2913 md_rem_names(sv_dev_t *sv, int nsv)
   2914 {
   2915 	int	i, s;
   2916 	int	max_sides;
   2917 
   2918 	if (nsv == 0)
   2919 		return;
   2920 
   2921 	/* All entries removed are in the same diskset */
   2922 	if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET)
   2923 		max_sides = MD_MNMAXSIDES;
   2924 	else
   2925 		max_sides = MD_MAXSIDES;
   2926 
   2927 	for (i = 0; i < nsv; i++)
   2928 		for (s = 0; s < max_sides; s++)
   2929 			(void) md_remdevname(sv[i].setno, s, sv[i].key);
   2930 }
   2931 
   2932 /*
   2933  * Checking user args before we get into physio - returns 0 for ok, else errno
   2934  * We do a lot of checking against illegal arguments here because some of the
   2935  * real disk drivers don't like certain kinds of arguments. (e.g xy doesn't
   2936  * like odd address user buffer.) Those drivers capture bad arguments in
   2937  * xxread and xxwrite. But since meta-driver calls their strategy routines
   2938  * directly, two bad scenario might happen:
   2939  *	1. the real strategy doesn't like it and panic.
   2940  *	2. the real strategy doesn't like it and set B_ERROR.
   2941  *
   2942  * The second case is no better than the first one, since the meta-driver
   2943  * will treat it as a media-error and off line the mirror metapartition.
   2944  * (Too bad there is no way to tell what error it is.)
   2945  *
   2946  */
   2947 int
   2948 md_chk_uio(struct uio *uio)
   2949 {
   2950 	int	i;
   2951 	struct iovec *iov;
   2952 
   2953 	/*
   2954 	 * Check for negative or not block-aligned offset
   2955 	 */
   2956 	if ((uio->uio_loffset < 0) ||
   2957 	    ((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) {
   2958 		return (EINVAL);
   2959 	}
   2960 	iov = uio->uio_iov;
   2961 	i = uio->uio_iovcnt;
   2962 
   2963 	while (i--) {
   2964 		if ((iov->iov_len & (DEV_BSIZE - 1)) != 0)
   2965 			return (EINVAL);
   2966 		/*
   2967 		 * Bug # 1212146
   2968 		 * The default is to not check alignment, but we can now check
   2969 		 * for a larger number of alignments if desired.
   2970 		 */
   2971 		if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask)
   2972 			return (EINVAL);
   2973 		iov++;
   2974 	}
   2975 	return (0);
   2976 }
   2977 
   2978 char *
   2979 md_shortname(
   2980 	minor_t		mnum
   2981 )
   2982 {
   2983 	static char	buf[MAXPATHLEN];
   2984 	char		*devname;
   2985 	char		*invalid = " (Invalid minor number %u) ";
   2986 	char		*metaname;
   2987 	mdc_unit_t	*un;
   2988 	side_t		side;
   2989 	set_t		setno = MD_MIN2SET(mnum);
   2990 	unit_t		unit = MD_MIN2UNIT(mnum);
   2991 
   2992 	if ((un = MD_UNIT(mnum)) == NULL) {
   2993 		(void) snprintf(buf, sizeof (buf), invalid, mnum);
   2994 		return (buf);
   2995 	}
   2996 
   2997 	/*
   2998 	 * If unit is not a friendly name unit, derive the name from the
   2999 	 * minor number.
   3000 	 */
   3001 	if ((un->un_revision & MD_FN_META_DEV) == 0) {
   3002 		/* This is a traditional metadevice */
   3003 		if (setno == MD_LOCAL_SET) {
   3004 			(void) snprintf(buf, sizeof (buf), "d%u",
   3005 			    (unsigned)unit);
   3006 		} else {
   3007 			(void) snprintf(buf, sizeof (buf), "%s/d%u",
   3008 			    mddb_getsetname(setno), (unsigned)unit);
   3009 		}
   3010 		return (buf);
   3011 	}
   3012 
   3013 	/*
   3014 	 * It is a friendly name metadevice, so we need to get its name.
   3015 	 */
   3016 	side = mddb_getsidenum(setno);
   3017 	devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP);
   3018 	if (md_getdevname(setno, side, MD_KEYWILD,
   3019 	    md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) {
   3020 		/*
   3021 		 * md_getdevname has given us either /dev/md/dsk/<metaname>
   3022 		 * or /dev/md/<setname>/dsk/<metname> depending on whether
   3023 		 * or not we are in the local set.  Thus, we'll pull the
   3024 		 * metaname from this string.
   3025 		 */
   3026 		if ((metaname = strrchr(devname, '/')) == NULL) {
   3027 			(void) snprintf(buf, sizeof (buf), invalid, mnum);
   3028 			goto out;
   3029 		}
   3030 		metaname++;	/* move past slash */
   3031 		if (setno == MD_LOCAL_SET) {
   3032 			/* No set name. */
   3033 			(void) snprintf(buf, sizeof (buf), "%s", metaname);
   3034 		} else {
   3035 			/* Include setname */
   3036 			(void) snprintf(buf, sizeof (buf), "%s/%s",
   3037 			    mddb_getsetname(setno), metaname);
   3038 		}
   3039 	} else {
   3040 		/* We couldn't find the name. */
   3041 		(void) snprintf(buf, sizeof (buf), invalid, mnum);
   3042 	}
   3043 
   3044 out:
   3045 	kmem_free(devname, MAXPATHLEN);
   3046 	return (buf);
   3047 }
   3048 
   3049 char *
   3050 md_devname(
   3051 	set_t		setno,
   3052 	md_dev64_t	dev,
   3053 	char		*buf,
   3054 	size_t		size
   3055 )
   3056 {
   3057 	static char	mybuf[MD_MAX_CTDLEN];
   3058 	int		err;
   3059 
   3060 	if (buf == NULL) {
   3061 		buf = mybuf;
   3062 		size = sizeof (mybuf);
   3063 	} else {
   3064 		ASSERT(size >= MD_MAX_CTDLEN);
   3065 	}
   3066 
   3067 	err = md_getdevname_common(setno, mddb_getsidenum(setno),
   3068 	    0, dev, buf, size, MD_NOWAIT_LOCK);
   3069 	if (err) {
   3070 		if (err == ENOENT) {
   3071 			(void) sprintf(buf, "(Unavailable)");
   3072 		} else {
   3073 			(void) sprintf(buf, "(%u.%u)",
   3074 			    md_getmajor(dev), md_getminor(dev));
   3075 		}
   3076 	}
   3077 
   3078 	return (buf);
   3079 }
   3080 void
   3081 md_minphys(buf_t *pb)
   3082 {
   3083 	extern unsigned md_maxbcount;
   3084 
   3085 	if (pb->b_bcount > md_maxbcount)
   3086 		pb->b_bcount = md_maxbcount;
   3087 }
   3088 
   3089 void
   3090 md_bioinit(struct buf *bp)
   3091 {
   3092 	ASSERT(bp);
   3093 
   3094 	bioinit(bp);
   3095 	bp->b_back = bp;
   3096 	bp->b_forw = bp;
   3097 	bp->b_flags = B_BUSY;	/* initialize flags */
   3098 }
   3099 
   3100 void
   3101 md_bioreset(struct buf *bp)
   3102 {
   3103 	ASSERT(bp);
   3104 
   3105 	bioreset(bp);
   3106 	bp->b_back = bp;
   3107 	bp->b_forw = bp;
   3108 	bp->b_flags = B_BUSY;	/* initialize flags */
   3109 }
   3110 
   3111 /*
   3112  * md_bioclone is needed as long as the real bioclone only takes a daddr_t
   3113  * as block number.
   3114  * We simply call bioclone with all input parameters but blkno, and set the
   3115  * correct blkno afterwards.
   3116  * Caveat Emptor: bp_mem must not be NULL!
   3117  */
   3118 buf_t *
   3119 md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno,
   3120 		int (*iodone)(buf_t *), buf_t *bp_mem, int sleep)
   3121 {
   3122 	(void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep);
   3123 	bp_mem->b_lblkno = blkno;
   3124 	return (bp_mem);
   3125 }
   3126 
   3127 
   3128 /*
   3129  * kstat stuff
   3130  */
   3131 void
   3132 md_kstat_init_ui(
   3133 	minor_t		 mnum,
   3134 	mdi_unit_t	*ui
   3135 )
   3136 {
   3137 	if ((ui != NULL) && (ui->ui_kstat == NULL)) {
   3138 		set_t	setno = MD_MIN2SET(mnum);
   3139 		unit_t  unit = MD_MIN2UNIT(mnum);
   3140 		char	module[KSTAT_STRLEN];
   3141 		char	*p = module;
   3142 
   3143 		if (setno != MD_LOCAL_SET) {
   3144 			char	buf[64];
   3145 			char	*s = buf;
   3146 			char	*e = module + sizeof (module) - 4;
   3147 
   3148 			(void) sprintf(buf, "%u", setno);
   3149 			while ((p < e) && (*s != '\0'))
   3150 				*p++ = *s++;
   3151 			*p++ = '/';
   3152 		}
   3153 		*p++ = 'm';
   3154 		*p++ = 'd';
   3155 		*p = '\0';
   3156 		if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk",
   3157 		    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
   3158 			ui->ui_kstat->ks_lock = &ui->ui_mx;
   3159 			kstat_install(ui->ui_kstat);
   3160 		}
   3161 	}
   3162 }
   3163 
   3164 void
   3165 md_kstat_init(
   3166 	minor_t		mnum
   3167 )
   3168 {
   3169 	md_kstat_init_ui(mnum, MDI_UNIT(mnum));
   3170 }
   3171 
   3172 void
   3173 md_kstat_destroy_ui(
   3174 	mdi_unit_t	*ui
   3175 )
   3176 {
   3177 	/*
   3178 	 * kstat_delete() interface has it's own locking mechanism and
   3179 	 * does not allow holding of kstat lock (ks_lock).
   3180 	 * Note: ks_lock == ui_mx from the md_kstat_init_ui().
   3181 	 */
   3182 	if ((ui != NULL) && (ui->ui_kstat != NULL)) {
   3183 		kstat_delete(ui->ui_kstat);
   3184 		ui->ui_kstat = NULL;
   3185 	}
   3186 }
   3187 
   3188 void
   3189 md_kstat_destroy(
   3190 	minor_t		mnum
   3191 )
   3192 {
   3193 	md_kstat_destroy_ui(MDI_UNIT(mnum));
   3194 }
   3195 
/*
 * In the routines that follow, ui_mx is acquired before ui_kstat is
 * checked.  This makes sure that we don't trip over a NULL ui_kstat.
 */
   3201 
   3202 void
   3203 md_kstat_waitq_enter(
   3204 	mdi_unit_t	*ui
   3205 )
   3206 {
   3207 	mutex_enter(&ui->ui_mx);
   3208 	if (ui->ui_kstat != NULL)
   3209 		kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat));
   3210 	mutex_exit(&ui->ui_mx);
   3211 }
   3212 
   3213 void
   3214 md_kstat_waitq_to_runq(
   3215 	mdi_unit_t	*ui
   3216 )
   3217 {
   3218 	mutex_enter(&ui->ui_mx);
   3219 	if (ui->ui_kstat != NULL)
   3220 		kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat));
   3221 	mutex_exit(&ui->ui_mx);
   3222 }
   3223 
   3224 void
   3225 md_kstat_waitq_exit(
   3226 	mdi_unit_t	*ui
   3227 )
   3228 {
   3229 	mutex_enter(&ui->ui_mx);
   3230 	if (ui->ui_kstat != NULL)
   3231 		kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat));
   3232 	mutex_exit(&ui->ui_mx);
   3233 }
   3234 
   3235 void
   3236 md_kstat_runq_enter(
   3237 	mdi_unit_t	*ui
   3238 )
   3239 {
   3240 	mutex_enter(&ui->ui_mx);
   3241 	if (ui->ui_kstat != NULL)
   3242 		kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat));
   3243 	mutex_exit(&ui->ui_mx);
   3244 }
   3245 
   3246 void
   3247 md_kstat_runq_exit(
   3248 	mdi_unit_t	*ui
   3249 )
   3250 {
   3251 	mutex_enter(&ui->ui_mx);
   3252 	if (ui->ui_kstat != NULL)
   3253 		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
   3254 	mutex_exit(&ui->ui_mx);
   3255 }
   3256 
   3257 void
   3258 md_kstat_done(
   3259 	mdi_unit_t	*ui,
   3260 	buf_t		*bp,
   3261 	int		war
   3262 )
   3263 {
   3264 	size_t  n_done;
   3265 
   3266 	/* check for end of device */
   3267 	if ((bp->b_resid != 0) && (! (bp->b_flags & B_ERROR))) {
   3268 		n_done = bp->b_bcount;
   3269 	} else if (bp->b_bcount < bp->b_resid) {
   3270 		n_done = 0;
   3271 	} else {
   3272 		n_done = bp->b_bcount - bp->b_resid;
   3273 	}
   3274 
   3275 	/* do accounting */
   3276 	mutex_enter(&ui->ui_mx);
   3277 	if (ui->ui_kstat != NULL) {
   3278 		if ((! war) && (bp->b_flags & B_READ)) {
   3279 			KSTAT_IO_PTR(ui->ui_kstat)->reads++;
   3280 			KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done;
   3281 		} else {
   3282 			KSTAT_IO_PTR(ui->ui_kstat)->writes++;
   3283 			KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done;
   3284 		}
   3285 		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
   3286 	}
   3287 	mutex_exit(&ui->ui_mx);
   3288 }
   3289 
   3290 pid_t
   3291 md_getpid()
   3292 {
   3293 	pid_t valuep;
   3294 	if (drv_getparm(PPID, (pid_t *)&valuep) != 0) {
   3295 		ASSERT(0);
   3296 		return ((pid_t)0);
   3297 	} else {
   3298 		ASSERT(valuep);
   3299 		return (valuep);
   3300 	}
   3301 }
   3302 
   3303 
   3304 proc_t *
   3305 md_getproc()
   3306 {
   3307 	proc_t  *valuep;
   3308 	if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) {
   3309 		ASSERT(0);
   3310 		return ((proc_t *)NULL);
   3311 	} else {
   3312 		ASSERT(valuep);
   3313 		return (valuep);
   3314 	}
   3315 }
   3316 
   3317 extern kmutex_t pidlock;
   3318 
   3319 /*
   3320  * this check to see if a process pid pair are still running.  For the
   3321  * disk set lock when both pid/proc are zero then the locks is not
   3322  * currently held.
   3323  */
   3324 int
   3325 md_checkpid(pid_t pid, proc_t *proc)
   3326 {
   3327 	int	retval = 1;
   3328 
   3329 	if (pid == 0 && proc == NULL)
   3330 		return (0);
   3331 
   3332 	mutex_enter(&pidlock);
   3333 	if (prfind(pid)  != proc)
   3334 		retval = 0;
   3335 	mutex_exit(&pidlock);
   3336 	return (retval);
   3337 }
   3338 
   3339 /*
   3340  * NAME: md_init_probereq
   3341  *
   3342  * DESCRIPTION: initializes a probe request. Parcels out the mnums such that
   3343  *		they can be dispatched to multiple daemon threads.
   3344  *
   3345  * PARAMETERS: struct md_probedev *p	pointer ioctl input
   3346  *
   3347  * RETURN VALUE: Returns errno
   3348  *
   3349  */
   3350 
   3351 int
   3352 md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp)
   3353 {
   3354 	int		err = 0;
   3355 	int		modindx;
   3356 	intptr_t	(*probe_test)();
   3357 
   3358 	/*
   3359 	 * Initialize the semaphores and mutex
   3360 	 * for the request
   3361 	 */
   3362 
   3363 	p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP);
   3364 
   3365 	p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP);
   3366 	sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL);
   3367 	mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL);
   3368 
   3369 	modindx = md_getmodindex(&(p->probe.md_driver), 1, 1);
   3370 	probe_test = md_get_named_service(NODEV64, modindx,
   3371 	    p->probe.test_name, 0);
   3372 	if (probe_test == NULL) {
   3373 		err = EINVAL;
   3374 		goto err_out;
   3375 	}
   3376 
   3377 	err = md_create_probe_rqlist(p, hdrpp, probe_test);
   3378 err_out:
   3379 	return (err);
   3380 }
   3381 
   3382 /*
   3383  * NAME: md_probe_one
   3384  *
   3385  * DESCRIPTION: Generic routine for probing disks. This is called from the
   3386  *		daemon.
   3387  *
   3388  * PARAMETERS: probe_req_t	*reqp	pointer to the probe request structure.
   3389  *
   3390  */
   3391 
   3392 void
   3393 md_probe_one(probe_req_t *reqp)
   3394 {
   3395 	mdi_unit_t		*ui;
   3396 	md_probedev_impl_t	*p;
   3397 	int			err = 0;
   3398 	set_t			setno;
   3399 
   3400 	p = (md_probedev_impl_t *)reqp->private_handle;
   3401 	/*
   3402 	 * Validate the unit while holding the global ioctl lock, then
   3403 	 * obtain the unit_writerlock. Once the writerlock has been obtained
   3404 	 * we can release the global lock. As long as we hold one of these
   3405 	 * locks this will prevent a metaclear operation being performed
   3406 	 * on the metadevice because metaclear takes the readerlock (via
   3407 	 * openclose lock).
   3408 	 * To avoid a potential deadlock with the probe_fcn() causing i/o to
   3409 	 * be issued to the writerlock'd metadevice we only grab the writerlock
   3410 	 * if the unit is not an SVM root device.
   3411 	 */
   3412 	while (md_ioctl_lock_enter() == EINTR)
   3413 		;
   3414 	setno = MD_MIN2SET(reqp->mnum);
   3415 	ui = MDI_UNIT(reqp->mnum);
   3416 	if (ui != NULL) {
   3417 		int	writer_grabbed;
   3418 		dev_t	svm_root;
   3419 
   3420 		if ((setno == MD_LOCAL_SET) && root_is_svm) {
   3421 			svm_root = getrootdev();
   3422 
   3423 			if (getminor(svm_root) == reqp->mnum) {
   3424 				writer_grabbed = 0;
   3425 			} else {
   3426 				writer_grabbed = 1;
   3427 				(void) md_unit_writerlock_common(ui, 0);
   3428 			}
   3429 		} else {
   3430 			writer_grabbed = 1;
   3431 			(void) md_unit_writerlock_common(ui, 0);
   3432 		}
   3433 		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
   3434 		err = (*reqp->probe_fcn)(ui, reqp->mnum);
   3435 		if (writer_grabbed) {
   3436 			md_unit_writerexit(ui);
   3437 		}
   3438 	} else {
   3439 		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
   3440 	}
   3441 
   3442 	/* update the info in the probe structure */
   3443 
   3444 	mutex_enter(PROBE_MX(p));
   3445 	if (err != 0) {
   3446 		cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err,
   3447 		    reqp->mnum);
   3448 		(void) mdsyserror(&(p->probe.mde), err);
   3449 	}
   3450 
   3451 	mutex_exit(PROBE_MX(p));
   3452 	sema_v(PROBE_SEMA(p));
   3453 
   3454 	kmem_free(reqp, sizeof (probe_req_t));
   3455 }
   3456 char *
   3457 md_strdup(char *cp)
   3458 {
   3459 	char *new_cp = NULL;
   3460 
   3461 	new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP);
   3462 
   3463 	return (strcpy(new_cp, cp));
   3464 }
   3465 
   3466 void
   3467 freestr(char *cp)
   3468 {
   3469 	kmem_free(cp, strlen(cp) + 1);
   3470 }
   3471 
   3472 /*
   3473  * Validate the list and skip invalid devices. Then create
   3474  * a doubly linked circular list of devices to probe.
   3475  * The hdr points to the head and tail of this list.
   3476  */
   3477 
   3478 static int
   3479 md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr,
   3480 			intptr_t (*probe_test)())
   3481 {
   3482 	int i, err, nodevcnt;
   3483 	probe_req_t *tp;
   3484 	daemon_queue_t *hp;
   3485 	minor_t mnum;
   3486 
   3487 	nodevcnt = 0;
   3488 
   3489 	hp = NULL;
   3490 
   3491 	for (i = 0; i <  plist->probe.nmdevs; i++) {
   3492 		mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i];
   3493 		if (MDI_UNIT(mnum) == NULL) {
   3494 			cmn_err(CE_WARN, "md: Cannot probe %s since it does "
   3495 			    "not exist", md_shortname(mnum));
   3496 			nodevcnt++;
   3497 			continue;
   3498 		}
   3499 		tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP);
   3500 		tp->mnum = mnum;
   3501 		tp->private_handle = (void *)plist;
   3502 		tp->probe_fcn = probe_test;
   3503 		if (hp == NULL) {
   3504 			hp = (daemon_queue_t *)tp;
   3505 			hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp;
   3506 		} else {
   3507 			tp->dq.dq_next = hp;
   3508 			tp->dq.dq_prev = hp->dq_prev;
   3509 			hp->dq_prev->dq_next = (daemon_queue_t *)tp;
   3510 			hp->dq_prev = (daemon_queue_t *)tp;
   3511 		}
   3512 	}
   3513 
   3514 	*hdr = hp;
   3515 	if (nodevcnt > 0)
   3516 		plist->probe.nmdevs -= nodevcnt;
   3517 
   3518 	/*
   3519 	 * If there are no devices to be probed because they were
   3520 	 * incorrect, then return an error.
   3521 	 */
   3522 	err = (plist->probe.nmdevs == 0) ? ENODEV : 0;
   3523 
   3524 	return (err);
   3525 }
   3526 
   3527 /*
   3528  * This routine increments the I/O count for set I/O operations.  This
   3529  * value is used to determine if an I/O can done.  If a release is in
   3530  * process this will return an error and cause the I/O to be errored.
   3531  */
   3532 int
   3533 md_inc_iocount(set_t setno)
   3534 {
   3535 	int	rc = 0;
   3536 
   3537 	if (setno == 0)
   3538 		return (0);
   3539 
   3540 	mutex_enter(&md_set_io[setno].md_io_mx);
   3541 	if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) {
   3542 		rc = EIO;
   3543 		goto out;
   3544 	}
   3545 
   3546 	ASSERT(md_set_io[setno].io_cnt >= 0);
   3547 	md_set_io[setno].io_cnt++;
   3548 
   3549 out:	mutex_exit(&md_set_io[setno].md_io_mx);
   3550 	return (rc);
   3551 }
   3552 
   3553 void
   3554 md_inc_iocount_noblock(set_t setno)
   3555 {
   3556 
   3557 	if (setno == 0)
   3558 		return;
   3559 
   3560 	mutex_enter(&md_set_io[setno].md_io_mx);
   3561 	md_set_io[setno].io_cnt++;
   3562 	mutex_exit(&md_set_io[setno].md_io_mx);
   3563 }
   3564 void
   3565 md_dec_iocount(set_t setno)
   3566 {
   3567 
   3568 	if (setno == 0)
   3569 		return;
   3570 
   3571 	mutex_enter(&md_set_io[setno].md_io_mx);
   3572 	md_set_io[setno].io_cnt--;
   3573 	ASSERT(md_set_io[setno].io_cnt >= 0);
   3574 	if ((md_set_io[setno].io_state & MD_SET_RELEASE) &&
   3575 	    (md_set_io[setno].io_cnt == 0))
   3576 		cv_broadcast(&md_set_io[setno].md_io_cv);
   3577 	mutex_exit(&md_set_io[setno].md_io_mx);
   3578 }
   3579 
   3580 int
   3581 md_isblock_setio(set_t setno)
   3582 {
   3583 	int	rc = 0;
   3584 
   3585 	if (setno == 0)
   3586 		return (0);
   3587 
   3588 	mutex_enter(&md_set_io[setno].md_io_mx);
   3589 	if (md_set_io[setno].io_state & MD_SET_RELEASE)
   3590 		rc = 1;
   3591 
   3592 	mutex_exit(&md_set_io[setno].md_io_mx);
   3593 	return (rc);
   3594 }
   3595 
   3596 int
   3597 md_block_setio(set_t setno)
   3598 {
   3599 	int	rc = 0;
   3600 
   3601 	if (setno == 0)
   3602 		return (1);
   3603 
   3604 	mutex_enter(&md_set_io[setno].md_io_mx);
   3605 	md_set_io[setno].io_state = MD_SET_RELEASE;
   3606 
   3607 	while (md_set_io[setno].io_cnt > 0) {
   3608 		cv_wait(&md_set_io[setno].md_io_cv,
   3609 		    &md_set_io[setno].md_io_mx);
   3610 	}
   3611 	rc = 1;
   3612 
   3613 
   3614 	ASSERT(md_set_io[setno].io_cnt == 0);
   3615 	mutex_exit(&md_set_io[setno].md_io_mx);
   3616 
   3617 	return (rc);
   3618 }
   3619 
   3620 void
   3621 md_clearblock_setio(set_t setno)
   3622 {
   3623 	if (setno == 0)
   3624 		return;
   3625 
   3626 	mutex_enter(&md_set_io[setno].md_io_mx);
   3627 	md_set_io[setno].io_state = MD_SET_ACTIVE;
   3628 	mutex_exit(&md_set_io[setno].md_io_mx);
   3629 }
   3630 
   3631 void
   3632 md_unblock_setio(set_t setno)
   3633 {
   3634 	if (setno == 0)
   3635 		return;
   3636 
   3637 	mutex_enter(&md_set_io[setno].md_io_mx);
   3638 #ifdef DEBUG
   3639 	if (md_set_io[setno].io_cnt != 0) {
   3640 		cmn_err(CE_NOTE, "set %d count was %ld at take",
   3641 		    setno, md_set_io[setno].io_cnt);
   3642 	}
   3643 #endif /* DEBUG */
   3644 
   3645 	md_set_io[setno].io_state = MD_SET_ACTIVE;
   3646 	md_set_io[setno].io_cnt = 0;
   3647 	mutex_exit(&md_set_io[setno].md_io_mx);
   3648 }
   3649 
   3650 /*
   3651  * Test and set version of the md_block_setio.
   3652  * Set the io_state to keep new I/O from being issued.
   3653  * If there is I/O currently in progress, then set io_state to active
   3654  * and return failure.  Otherwise, return a 1 for success.
   3655  *
   3656  * Used in a MN diskset since the commd must be suspended before
   3657  * this node can attempt to withdraw from a diskset.  But, with commd
   3658  * suspended, I/O may have been issued that can never finish until
   3659  * commd is resumed (allocation of hotspare, etc). So, if I/O is
   3660  * outstanding after diskset io_state is marked RELEASE, then set diskset
   3661  * io_state back to ACTIVE and return failure.
   3662  */
   3663 int
   3664 md_tas_block_setio(set_t setno)
   3665 {
   3666 	int	rc;
   3667 
   3668 	if (setno == 0)
   3669 		return (1);
   3670 
   3671 	mutex_enter(&md_set_io[setno].md_io_mx);
   3672 	md_set_io[setno].io_state = MD_SET_RELEASE;
   3673 
   3674 	if (md_set_io[setno].io_cnt > 0) {
   3675 		md_set_io[setno].io_state = MD_SET_ACTIVE;
   3676 		rc = 0;
   3677 	} else {
   3678 		rc = 1;
   3679 	}
   3680 
   3681 	mutex_exit(&md_set_io[setno].md_io_mx);
   3682 
   3683 	return (rc);
   3684 }
   3685 
   3686 void
   3687 md_biodone(struct buf *pb)
   3688 {
   3689 	minor_t	mnum;
   3690 	set_t	setno;
   3691 	mdi_unit_t	*ui;
   3692 
   3693 	mnum = getminor(pb->b_edev);
   3694 	setno = MD_MIN2SET(mnum);
   3695 
   3696 	if (setno == 0) {
   3697 		biodone(pb);
   3698 		return;
   3699 	}
   3700 
   3701 #ifdef DEBUG
   3702 	ui = MDI_UNIT(mnum);
   3703 	if (!md_unit_isopen(ui))
   3704 		cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum));
   3705 #endif /* DEBUG */
   3706 
   3707 	/*
   3708 	 * Handle the local diskset
   3709 	 */
   3710 	if (md_set_io[setno].io_cnt > 0)
   3711 		md_dec_iocount(setno);
   3712 
   3713 #ifdef DEBUG
   3714 	/*
   3715 	 * this is being done after the lock is dropped so there
   3716 	 * are cases it may be invalid.  It is advisory.
   3717 	 */
   3718 	if (md_set_io[setno].io_state & MD_SET_RELEASE) {
   3719 		/* Only display this error once for this metadevice */
   3720 		if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) {
   3721 			cmn_err(CE_NOTE,
   3722 			    "I/O to %s attempted during set RELEASE\n",
   3723 			    md_shortname(mnum));
   3724 			ui->ui_tstate |= MD_RELEASE_IOERR_DONE;
   3725 		}
   3726 	}
   3727 #endif /* DEBUG */
   3728 
   3729 	biodone(pb);
   3730 }
   3731 
   3732 
   3733 /*
   3734  * Driver special private devt handling routine
   3735  * INPUT:  md_dev64_t
   3736  * OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel.
   3737  */
   3738 dev_t
   3739 md_dev64_to_dev(md_dev64_t dev)
   3740 {
   3741 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
   3742 	minor_t minor = (minor_t)(dev & MAXMIN64);
   3743 
   3744 	return (makedevice(major, minor));
   3745 
   3746 }
   3747 
   3748 /*
   3749  * Driver private makedevice routine
   3750  * INPUT:  major_t major, minor_t minor
   3751  * OUTPUT: md_dev64_t, no matter if on 32 bit or 64 bit kernel.
   3752  */
   3753 md_dev64_t
   3754 md_makedevice(major_t major, minor_t minor)
   3755 {
   3756 	return (((md_dev64_t)major << NBITSMINOR64) | minor);
   3757 
   3758 }
   3759 
   3760 
   3761 /*
   3762  * Driver private devt md_getmajor routine
   3763  * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
   3764  * OUTPUT: the appropriate major number
   3765  */
   3766 major_t
   3767 md_getmajor(md_dev64_t dev)
   3768 {
   3769 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
   3770 
   3771 	if (major == 0) {
   3772 		/* Here we were given a 32bit dev */
   3773 		major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32;
   3774 	}
   3775 	return (major);
   3776 }
   3777 
   3778 /*
   3779  * Driver private devt md_getminor routine
   3780  * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
   3781  * OUTPUT: the appropriate minor number
   3782  */
   3783 minor_t
   3784 md_getminor(md_dev64_t dev)
   3785 {
   3786 	minor_t minor;
   3787 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
   3788 
   3789 	if (major == 0) {
   3790 		/* Here we were given a 32bit dev */
   3791 		minor = (minor_t)(dev & MAXMIN32);
   3792 	} else {
   3793 		minor = (minor_t)(dev & MAXMIN64);
   3794 	}
   3795 	return (minor);
   3796 }
   3797 
   3798 int
   3799 md_check_ioctl_against_unit(int cmd, mdc_unit_t c)
   3800 {
   3801 	/*
   3802 	 * If the metadevice is an old style device, it has a vtoc,
   3803 	 *	in that case all reading EFI ioctls are not applicable.
   3804 	 * If the metadevice has an EFI label, reading vtoc and geom ioctls
   3805 	 *	are not supposed to work.
   3806 	 */
   3807 	switch (cmd) {
   3808 		case DKIOCGGEOM:
   3809 		case DKIOCGAPART:
   3810 			/* if > 2 TB then fail */
   3811 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
   3812 				return (ENOTSUP);
   3813 			}
   3814 			break;
   3815 		case DKIOCGVTOC:
   3816 			/* if > 2 TB then fail */
   3817 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
   3818 				return (ENOTSUP);
   3819 			}
   3820 
   3821 			/* if > 1 TB but < 2TB return overflow */
   3822 			if (c.un_revision & MD_64BIT_META_DEV) {
   3823 				return (EOVERFLOW);
   3824 			}
   3825 			break;
   3826 		case DKIOCGEXTVTOC:
   3827 			/* if > 2 TB then fail */
   3828 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
   3829 				return (ENOTSUP);
   3830 			}
   3831 			break;
   3832 		case DKIOCGETEFI:
   3833 		case DKIOCPARTITION:
   3834 			if ((c.un_flag & MD_EFILABEL) == 0) {
   3835 				return (ENOTSUP);
   3836 			}
   3837 			break;
   3838 
   3839 		case DKIOCSETEFI:
   3840 		/* setting an EFI label should always be ok */
   3841 			return (0);
   3842 
   3843 		case DKIOCSVTOC:
   3844 			/* if > 2 TB then fail */
   3845 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
   3846 				return (ENOTSUP);
   3847 			}
   3848 
   3849 			/* if > 1 TB but < 2TB return overflow */
   3850 			if (c.un_revision & MD_64BIT_META_DEV) {
   3851 				return (EOVERFLOW);
   3852 			}
   3853 			break;
   3854 		case DKIOCSEXTVTOC:
   3855 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
   3856 				return (ENOTSUP);
   3857 			}
   3858 			break;
   3859 	}
   3860 	return (0);
   3861 }
   3862 
   3863 /*
   3864  * md_vtoc_to_efi_record()
   3865  * Input:  record id of the vtoc record
   3866  * Output: record id of the efi record
   3867  * Function:
   3868  *	- reads the  volume name from the vtoc record
   3869  *	- converts the volume name to a format, libefi understands
   3870  *	- creates a new record of size MD_EFI_PARTNAME_BYTES
   3871  *	- stores the volname in that record,
   3872  *	- commits that record
   3873  *	- returns the recid of the efi record.
   3874  * Caveat Emptor:
   3875  *	The calling routine must do something like
   3876  *	- un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid)
   3877  *	- commit(un)
   3878  *	- delete(vtoc_recid)
   3879  *	in order to keep the mddb consistent in case of a panic in the middle.
   3880  * Errors:
   3881  *	- returns 0 on any error
   3882  */
   3883 mddb_recid_t
   3884 md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno)
   3885 {
   3886 	struct vtoc	*vtoc;
   3887 	ushort_t	*v;
   3888 	mddb_recid_t	efi_recid;
   3889 	int		i;
   3890 
   3891 	if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) {
   3892 		return (0);
   3893 	}
   3894 	vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid);
   3895 	efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0,
   3896 	    MD_CRO_32BIT, setno);
   3897 	if (efi_recid < 0) {
   3898 		return (0);
   3899 	}
   3900 	v = (ushort_t *)mddb_getrecaddr(efi_recid);
   3901 
   3902 	/* This for loop read, converts and writes */
   3903 	for (i = 0; i < LEN_DKL_VVOL; i++) {
   3904 		v[i] = LE_16((uint16_t)vtoc->v_volume[i]);
   3905 	}
   3906 	/* commit the new record */
   3907 	mddb_commitrec_wrapper(efi_recid);
   3908 
   3909 	return (efi_recid);
   3910 }
   3911 
/*
 * Send a kernel message.
 * The caller has to provide an allocated result structure.
 * If the door handler disappears we retry, emitting warnings every so often.
 *
 * Parameters:
 *	setno, type, flags, recipient
 *		- copied verbatim into the message header
 *	data, size
 *		- payload; size bytes are copied into the message.  size may
 *		  not exceed MDMN_MAX_KMSG_DATA.
 *	result	- filled in by the door upcall; pre-set to a failure state
 *		  before anything else is done (see below).
 *
 * Returns ENOMEM if size is larger than MDMN_MAX_KMSG_DATA, and 0 in every
 * other case.  A return of 0 does not mean the message was delivered: the
 * caller has to inspect result->kmmr_comm_state.
 *
 * The recipient argument is almost always unused, and is therefore typically
 * set to zero, as zero is an invalid cluster nodeid.  The exceptions are the
 * marking and clearing of the DRL from a node that is not currently the
 * owner.  In these cases, the recipient argument will be the nodeid of the
 * mirror owner, and MD_MSGF_DIRECTED will be set in the flags.  Non-owner
 * nodes will not receive these messages.
 *
 * For the case where md_mn_is_commd_present() is false, we simply pre-set
 * the result->kmmr_comm_state to MDMNE_RPC_FAIL.
 * This covers the case where the service mdcommd has been killed and so we do
 * not get a 'new' result structure copied back. Instead we return with the
 * supplied result field, and we need to flag a failure to the caller.
 */
int
mdmn_ksend_message(
	set_t		setno,
	md_mn_msgtype_t	type,
	uint_t		flags,
	md_mn_nodeid_t	recipient,
	char		*data,
	int		size,
	md_mn_kresult_t	*result)
{
	door_arg_t	da;
	md_mn_kmsg_t	*kmsg;
	uint_t		send_try_cnt = 0;
	uint_t		retry_noise_cnt = 0;
	int		rval;
	k_sigset_t	oldmask, newmask;

	/*
	 * Ensure that we default to a recoverable failure state if the
	 * door upcall cannot pass the request on to rpc.mdcommd.
	 * This may occur when shutting the node down while there is still
	 * a mirror resync or metadevice state update occurring.
	 */
	result->kmmr_comm_state = MDMNE_RPC_FAIL;
	result->kmmr_exitval = ~0;

	/* Reject an oversized payload before anything is allocated. */
	if (size > MDMN_MAX_KMSG_DATA)
		return (ENOMEM);
	kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP);
	kmsg->kmsg_flags = flags;
	kmsg->kmsg_setno = setno;
	kmsg->kmsg_recipient = recipient;
	kmsg->kmsg_type	= type;
	kmsg->kmsg_size	= size;
	bcopy(data, &(kmsg->kmsg_data), size);

	/*
	 * Wait for the door handle to be established.
	 * There is no upper bound on this wait; a warning is logged every
	 * MD_MN_WARN_INTVL iterations.
	 */
	while (mdmn_door_did == -1) {
		if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
			cmn_err(CE_WARN, "door handle not yet ready. "
			    "Check if /usr/lib/lvm/mddoors is running");
		}
		delay(md_hz);
	}

	/*
	 * If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we
	 * do not fail if the user process receives a signal while we're
	 * active in the door interface.
	 */
	if (flags & MD_MSGF_BLK_SIGNAL) {
		sigfillset(&newmask);
		sigreplace(&newmask, &oldmask);
	}

	/*
	 * If message failed with an RPC_FAILURE when rpc.mdcommd had
	 * been gracefully shutdown (md_mn_is_commd_present returns FALSE)
	 * then don't retry the message anymore.  If message
	 * failed due to any other reason, then retry; the number of
	 * attempts is bounded through md_send_retry_limit in the loops
	 * below, which should allow a shutting down system time to
	 * notify the kernel of a graceful shutdown of rpc.mdcommd.
	 *
	 * Caller of this routine will need to check the md_mn_commd_present
	 * flag and the failure error in order to determine whether to panic
	 * or not.  If md_mn_commd_present is set to 0 and failure error
	 * is RPC_FAILURE, the calling routine should not panic since the
	 * system is in the process of being shutdown.
	 *
	 */

	/* Both counters start afresh; the door-handle wait above used one. */
	retry_noise_cnt = send_try_cnt = 0;
	while (md_mn_is_commd_present_lite()) {
		/*
		 * data_ptr and data_size are initialized here because on
		 * return from the upcall, they contain data duplicated from
		 * rbuf and rsize.  This causes subsequent upcalls to fail.
		 */
		da.data_ptr = (char *)(kmsg);
		da.data_size = sizeof (md_mn_kmsg_t);
		da.desc_ptr = NULL;
		da.desc_num = 0;
		da.rbuf = (char *)result;
		da.rsize = sizeof (*result);

		/*
		 * Inner loop: repeat the upcall itself for as long as it
		 * returns an error, logging every MD_MN_WARN_INTVL failures
		 * and giving up once send_try_cnt reaches
		 * md_send_retry_limit.
		 */
		while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da,
		    NULL, SIZE_MAX, 0)) != 0) {
			if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
				if (rval == EAGAIN)  {
					cmn_err(CE_WARN,
					    "md: door_upcall failed. "
					    "Check if mddoors is running.");
				} else if (rval == EINTR) {
					cmn_err(CE_WARN,
					    "md: door_upcall failed. "
					    "Check if rpc.mdcommd is running.");
				} else {
					cmn_err(CE_WARN,
					    "md: door_upcall failed. "
					    "Returned %d",
					    rval);
				}
			}
			if (++send_try_cnt >= md_send_retry_limit)
				break;

			delay(md_hz);

			/*
			 * data_ptr and data_size are re-initialized here
			 * because on return from the upcall, they contain
			 * data duplicated from rbuf and rsize.  This causes
			 * subsequent upcalls to fail.
			 */
			da.data_ptr = (char *)(kmsg);
			da.data_size = sizeof (md_mn_kmsg_t);
			da.desc_ptr = NULL;
			da.desc_num = 0;
			da.rbuf = (char *)result;
			da.rsize = sizeof (*result);
		}


		/*
		 * If:
		 * - the send succeeded (MDMNE_ACK)
		 * - we had an MDMNE_RPC_FAIL and commd is now gone
		 *   (note: since the outer loop is commd-dependent,
		 *   checking MDMN_RPC_FAIL here is meaningless)
		 * - we were told not to retry
		 * - we exceeded the RPC failure send limit
		 * punch out of the outer loop prior to the delay()
		 *
		 * Note that the test below increments send_try_cnt once
		 * more (unless an earlier condition already matched), on
		 * top of any increments made in the inner loop above.
		 */
		if (result->kmmr_comm_state == MDMNE_ACK ||
		    (flags & MD_MSGF_KSEND_NORETRY) ||
		    (++send_try_cnt % md_send_retry_limit) == 0 ||
		    !md_mn_is_commd_present())
			break;
		delay(md_hz);
	}

	/* Restore the signal mask that was saved before the upcalls. */
	if (flags & MD_MSGF_BLK_SIGNAL) {
		sigreplace(&oldmask, (k_sigset_t *)NULL);
	}
	kmem_free(kmsg, sizeof (md_mn_kmsg_t));

	/* Delivery status is reported through *result, not the return. */
	return (0);
}
   4080 
   4081 /*
   4082  * Called to propagate the capability of a metadevice to all nodes in the set.
   4083  *
   4084  * On entry, lockp is set if the function has been called from within an ioctl.
   4085  *
   4086  * IOLOCK_RETURN_RELEASE, which drops the md_ioctl_lock is called in this
   4087  * routine to enable other mdioctls to enter the kernel while this
   4088  * thread of execution waits on the completion of mdmn_ksend_message. When
   4089  * the message is completed the thread continues and md_ioctl_lock must be
   4090  * reacquired.  Even though md_ioctl_lock is interruptable, we choose to
   4091  * ignore EINTR as we must not return without acquiring md_ioctl_lock.
   4092  */
   4093 
   4094 int
   4095 mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp)
   4096 {
   4097 	md_mn_msg_setcap_t	msg;
   4098 	md_mn_kresult_t		*kres;
   4099 	mdi_unit_t		*ui = MDI_UNIT(mnum);
   4100 	int			ret;
   4101 	k_sigset_t		oldmask, newmask;
   4102 
   4103 	(void) strncpy((char *)&msg.msg_setcap_driver,
   4104 	    md_ops[ui->ui_opsindex]->md_driver.md_drivername, MD_DRIVERNAMELEN);
   4105 	msg.msg_setcap_mnum = mnum;
   4106 	msg.msg_setcap_set = vc.vc_set;
   4107 
   4108 	if (lockp)
   4109 		IOLOCK_RETURN_RELEASE(0, lockp);
   4110 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
   4111 
   4112 	/*
   4113 	 * Mask signals for the mdmd_ksend_message call.  This keeps the door
   4114 	 * interface from failing if the user process receives a signal while
   4115 	 * in mdmn_ksend_message.
   4116 	 */
   4117 	sigfillset(&newmask);
   4118 	sigreplace(&newmask, &oldmask);
   4119 	ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
   4120 	    MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t),
   4121 	    kres));
   4122 	sigreplace(&oldmask, (k_sigset_t *)NULL);
   4123 
   4124 	if (!MDMN_KSEND_MSG_OK(ret, kres)) {
   4125 		mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
   4126 		ret = EIO;
   4127 	}
   4128 	kmem_free(kres, sizeof (md_mn_kresult_t));
   4129 
   4130 	if (lockp) {
   4131 		IOLOCK_RETURN_REACQUIRE(lockp);
   4132 	}
   4133 	return (ret);
   4134 }
   4135 
   4136 /*
   4137  * Called to clear all of the transient capabilities for a metadevice when it is
   4138  * not open on any node in the cluster
   4139  * Called from close for mirror and sp.
   4140  */
   4141 
   4142 void
   4143 mdmn_clear_all_capabilities(minor_t mnum)
   4144 {
   4145 	md_isopen_t	clumsg;
   4146 	int		ret;
   4147 	md_mn_kresult_t	*kresult;
   4148 	volcap_t	vc;
   4149 	k_sigset_t	oldmask, newmask;
   4150 
   4151 	clumsg.dev = md_makedevice(md_major, mnum);
   4152 	clumsg.mde = mdnullerror;
   4153 	/*
   4154 	 * The check open message doesn't have to be logged, nor should the
   4155 	 * result be stored in the MCT. We want an up-to-date state.
   4156 	 */
   4157 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
   4158 
   4159 	/*
   4160 	 * Mask signals for the mdmd_ksend_message call.  This keeps the door
   4161 	 * interface from failing if the user process receives a signal while
   4162 	 * in mdmn_ksend_message.
   4163 	 */
   4164 	sigfillset(&newmask);
   4165 	sigreplace(&newmask, &oldmask);
   4166 	ret = mdmn_ksend_message(MD_MIN2SET(mnum),
   4167 	    MD_MN_MSG_CLU_CHECK,
   4168 	    MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0,
   4169 	    (char *)&clumsg, sizeof (clumsg), kresult);
   4170 	sigreplace(&oldmask, (k_sigset_t *)NULL);
   4171 
   4172 	if ((ret == 0) && (kresult->kmmr_exitval == 0)) {
   4173 		/*
   4174 		 * Not open on any node, clear all capabilities, eg ABR and
   4175 		 * DMR
   4176 		 */
   4177 		vc.vc_set = 0;
   4178 		(void) mdmn_send_capability_message(mnum, vc, NULL);
   4179 	}
   4180 	kmem_free(kresult, sizeof (md_mn_kresult_t));
   4181 }
   4182 
   4183 /*
   4184  * mdmn_ksend_show_error:
   4185  * ---------------------
   4186  * Called to display the error contents of a failing mdmn_ksend_message() result
   4187  *
   4188  * Input:
   4189  *	rv	- return value from mdmn_ksend_message()
   4190  *	kres	- pointer to result structure filled in by mdmn_ksend_message
   4191  *	s	- Informative message to identify failing condition (e.g.
   4192  *		  "Ownership change") This string will be displayed with
   4193  *		  cmn_err(CE_WARN, "%s *FAILED*",...) to alert the system
   4194  *		  administrator
   4195  */
   4196 void
   4197 mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s)
   4198 {
   4199 	if (rv == 0) {
   4200 		cmn_err(CE_WARN, "%s *FAILED*", s);
   4201 		cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, failing_node"
   4202 		    " = %d", kres->kmmr_exitval, kres->kmmr_comm_state,
   4203 		    kres->kmmr_failing_node);
   4204 	} else {
   4205 		cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv);
   4206 	}
   4207 }
   4208 
   4209 /*
   4210  * Callback routine for resync thread. If requested to suspend we mark the
   4211  * commd as not being present.
   4212  */
   4213 boolean_t
   4214 callb_md_mrs_cpr(void *arg, int code)
   4215 {
   4216 	callb_cpr_t *cp = (callb_cpr_t *)arg;
   4217 	int ret = 0;				/* assume success */
   4218 	clock_t delta;
   4219 
   4220 	mutex_enter(cp->cc_lockp);
   4221 
   4222 	switch (code) {
   4223 	case CB_CODE_CPR_CHKPT:
   4224 		/*
   4225 		 * Mark the rpc.mdcommd as no longer present. We are trying to
   4226 		 * suspend the system and so we should expect RPC failures to
   4227 		 * occur.
   4228 		 */
   4229 		md_mn_clear_commd_present();
   4230 		cp->cc_events |= CALLB_CPR_START;
   4231 		delta = CPR_KTHREAD_TIMEOUT_SEC * hz;
   4232 		while (!(cp->cc_events & CALLB_CPR_SAFE))
   4233 			/* cv_timedwait() returns -1 if it times out. */
   4234 			if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
   4235 			    cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1)
   4236 				break;
   4237 			break;
   4238 
   4239 	case CB_CODE_CPR_RESUME:
   4240 		cp->cc_events &= ~CALLB_CPR_START;
   4241 		cv_signal(&cp->cc_stop_cv);
   4242 		break;
   4243 	}
   4244 	mutex_exit(cp->cc_lockp);
   4245 	return (ret != -1);
   4246 }
   4247 
   4248 
   4249 void
   4250 md_rem_hspname(set_t setno, mdkey_t n_key)
   4251 {
   4252 	int	s;
   4253 	int	max_sides;
   4254 
   4255 
   4256 	/* All entries removed are in the same diskset */
   4257 	if (md_get_setstatus(setno) & MD_SET_MNSET)
   4258 		max_sides = MD_MNMAXSIDES;
   4259 	else
   4260 		max_sides = MD_MAXSIDES;
   4261 
   4262 	for (s = 0; s < max_sides; s++)
   4263 		(void) md_remdevname(setno, s, n_key);
   4264 }
   4265 
   4266 
   4267 int
   4268 md_rem_selfname(minor_t selfid)
   4269 {
   4270 	int	s;
   4271 	set_t	setno = MD_MIN2SET(selfid);
   4272 	int	max_sides;
   4273 	md_dev64_t	dev;
   4274 	struct nm_next_hdr	*nh;
   4275 	struct nm_name	*n;
   4276 	mdkey_t key;
   4277 
   4278 	/*
   4279 	 * Get the key since remove routine expects it
   4280 	 */
   4281 	dev = md_makedevice(md_major, selfid);
   4282 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
   4283 		return (ENOENT);
   4284 	}
   4285 
   4286 	if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD,
   4287 	    MD_KEYWILD, dev, 0L)) == NULL) {
   4288 		return (ENOENT);
   4289 	}
   4290 
   4291 	/* All entries removed are in the same diskset */
   4292 	key = n->n_key;
   4293 	if (md_get_setstatus(setno) & MD_SET_MNSET)
   4294 		max_sides = MD_MNMAXSIDES;
   4295 	else
   4296 		max_sides = MD_MAXSIDES;
   4297 
   4298 	for (s = 0; s < max_sides; s++)
   4299 		(void) md_remdevname(setno, s, key);
   4300 
   4301 	return (0);
   4302 }
   4303 
   4304 void
   4305 md_upd_set_unnext(set_t setno, unit_t un)
   4306 {
   4307 	if (un < md_set[setno].s_un_next) {
   4308 		md_set[setno].s_un_next = un;
   4309 	}
   4310 }
   4311 
   4312 struct hot_spare_pool *
   4313 find_hot_spare_pool(set_t setno, int hsp_id)
   4314 {
   4315 	hot_spare_pool_t *hsp;
   4316 
   4317 	hsp = (hot_spare_pool_t *)md_set[setno].s_hsp;
   4318 	while (hsp != NULL) {
   4319 		if (hsp->hsp_self_id == hsp_id)
   4320 			return (hsp);
   4321 		hsp = hsp->hsp_next;
   4322 	}
   4323 
   4324 	return ((hot_spare_pool_t *)0);
   4325 }
   4326 
   4327 /*
   4328  * md_create_taskq:
   4329  *
   4330  * Create a kernel taskq for the given set/unit combination. This is typically
   4331  * used to complete a RR_CLEAN request when the callee is unable to obtain the
   4332  * mutex / condvar access required to update the DRL safely.
   4333  */
   4334 void *
   4335 md_create_taskq(set_t setno, minor_t mnum)
   4336 {
   4337 	char			name[20];
   4338 	ddi_taskq_t		*tqp;
   4339 
   4340 	(void) snprintf(name, 20, "%d/d%d", setno, MD_MIN2UNIT(mnum));
   4341 
   4342 	tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0);
   4343 
   4344 	return ((void *)tqp);
   4345 }
   4346