Home | History | Annotate | Download | only in md
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Md - is the meta-disk driver.   It sits below the UFS file system
     29  * but above the 'real' disk drivers, xy, id, sd etc.
     30  *
     31  * To the UFS software, md looks like a normal driver, since it has
     32  * the normal kinds of entries in the bdevsw and cdevsw arrays. So
     33  * UFS accesses md in the usual ways.  In particular, the strategy
     34  * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
     35  * and ufs_writelbn().
     36  *
     37  * Md maintains an array of minor devices (meta-partitions).   Each
     38  * meta partition stands for a matrix of real partitions, in rows
     39  * which are not necessarily of equal length.	Md maintains a table,
     40  * with one entry for each meta-partition,  which lists the rows and
     41  * columns of actual partitions, and the job of the strategy routine
     42  * is to translate from the meta-partition device and block numbers
     43  * known to UFS into the actual partitions' device and block numbers.
     44  *
     45  * See below, in mdstrategy(), mdreal(), and mddone() for details of
     46  * this translation.
     47  */
     48 
     49 /*
     50  * Driver for Virtual Disk.
     51  */
     52 
     53 #include <sys/user.h>
     54 #include <sys/sysmacros.h>
     55 #include <sys/conf.h>
     56 #include <sys/stat.h>
     57 #include <sys/errno.h>
     58 #include <sys/param.h>
     59 #include <sys/systm.h>
     60 #include <sys/file.h>
     61 #include <sys/open.h>
     62 #include <sys/dkio.h>
     63 #include <sys/vtoc.h>
     64 #include <sys/cmn_err.h>
     65 #include <sys/ddi.h>
     66 #include <sys/sunddi.h>
     67 #include <sys/debug.h>
     68 #include <sys/utsname.h>
     69 #include <sys/lvm/mdvar.h>
     70 #include <sys/lvm/md_names.h>
     71 #include <sys/lvm/md_mddb.h>
     72 #include <sys/lvm/md_sp.h>
     73 #include <sys/types.h>
     74 #include <sys/kmem.h>
     75 #include <sys/cladm.h>
     76 #include <sys/priv_names.h>
     77 #include <sys/modhash.h>
     78 
#ifndef	lint
/*
 * Module dependency string.
 * NOTE(review): presumably consumed by the kernel module loader so that
 * strmod/rpcmod is loaded before md - confirm.
 */
char 		_depends_on[] = "strmod/rpcmod";
#endif	/* lint */
/*
 * Module binding debug.  Incremented by mdattach() when the "md_init_debug"
 * property is non-zero; enables the attach-time warning messages.
 */
int		md_init_debug	= 0;	/* module binding debug */

/*
 * Tunable to turn off the failfast behavior.  Incremented by mdattach()
 * when the "md_ff_disable" property is non-zero.
 */
int		md_ff_disable = 0;

/*
 * Dynamically allocated, NULL-terminated list of non FF driver names -
 * each string and the list itself are freed in mddetach().
 */
char	**non_ff_drivers = NULL;

md_krwlock_t	md_unit_array_rw;	/* protects all unit arrays */
md_krwlock_t	nm_lock;		/* protects all the name spaces */

/* its md_resync_mutex is set up/torn down by md_global_alloc_free() */
md_resync_t	md_cpr_resync;

/*
 * Boot path; mdattach() compares its prefix with SVM_PSEUDO_STR to decide
 * whether the root metadevice minor node must be created at attach time.
 */
extern char	svm_bootpath[];
#define	SVM_PSEUDO_STR	"/pseudo/md@0:"

/*
 * VERSION is the value mdattach() expects in the "md_xlate_ver" property;
 * VERSION_LENGTH is the size of the buffer that property is read into.
 */
#define		VERSION_LENGTH	6
#define		VERSION		"1.0"

/*
 * Keep track of possible 'orphan' entries in the name space
 */
int		*md_nm_snarfed = NULL;

/*
 * Global tunable giving the percentage of free space left in replica during
 * conversion of non-devid style replica to devid style replica.
 */
int		md_conv_perc = MDDB_DEVID_CONV_PERC;
    116 
    117 #ifdef	DEBUG
    118 /* debug code to verify framework exclusion guarantees */
    119 int		md_in;
    120 kmutex_t	md_in_mx;			/* used to md global stuff */
    121 #define	IN_INIT		0x01
    122 #define	IN_FINI		0x02
    123 #define	IN_ATTACH	0x04
    124 #define	IN_DETACH	0x08
    125 #define	IN_OPEN		0x10
    126 #define	MD_SET_IN(x) {						\
    127 	mutex_enter(&md_in_mx);					\
    128 	if (md_in)						\
    129 		debug_enter("MD_SET_IN exclusion lost");	\
    130 	if (md_in & x)						\
    131 		debug_enter("MD_SET_IN already set");		\
    132 	md_in |= x;						\
    133 	mutex_exit(&md_in_mx);					\
    134 }
    135 
    136 #define	MD_CLR_IN(x) {						\
    137 	mutex_enter(&md_in_mx);					\
    138 	if (md_in & ~(x))					\
    139 		debug_enter("MD_CLR_IN exclusion lost");	\
    140 	if (!(md_in & x))					\
    141 		debug_enter("MD_CLR_IN already clr");		\
    142 	md_in &= ~x;						\
    143 	mutex_exit(&md_in_mx);					\
    144 }
    145 #else	/* DEBUG */
    146 #define	MD_SET_IN(x)
    147 #define	MD_CLR_IN(x)
    148 #endif	/* DEBUG */
hrtime_t savetime1, savetime2;


/*
 * list things protected by md_mx even if they aren't
 * used in this file.
 */
kmutex_t	md_mx;			/* lock for md global stuff */
kcondvar_t	md_cv;			/* md_status events */
int		md_status = 0;		/* global status for the meta-driver */
int		md_num_daemons = 0;
int		md_ioctl_cnt = 0;
int		md_mtioctl_cnt = 0;	/* multithreaded ioctl cnt */
uint_t		md_mdelay = 10;		/* variable so can be patched */

int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);

/*
 * md_major is set in _init() from ddi_name_to_major("md"); md_major_targ is
 * set in mdattach() (md_targ_name_to_major("md") during an upgrade, else 0)
 * and cleared in mddetach().
 */
major_t		md_major, md_major_targ;

/* md_nunits/md_nsets are zeroed by mddetach() and restored by mdattach() */
unit_t		md_nunits = MD_MAXUNITS;
set_t		md_nsets = MD_MAXSETS;
/* "md_nmedh" property; MED_DEF_HOSTS when absent or out of range */
int		md_nmedh = 0;
/* "md_med_trans_lst" property, "tcp" by default; freed in mddetach() */
char		*md_med_trans_lst = NULL;
md_set_t	md_set[MD_MAXSETS];
md_set_io_t	md_set_io[MD_MAXSETS];

md_krwlock_t	hsp_rwlp;		/* protects hot_spare_interface */
md_krwlock_t	ni_rwlp;		/* protects notify_interface */
/* MD_NOPS-entry tables allocated in mdattach(), freed in mddetach() */
md_ops_t	**md_ops = NULL;
ddi_modhandle_t	*md_mods = NULL;
md_ops_t	*md_opslist;
/* clock ticks per second: drv_usectohz(NUM_USEC_IN_SEC), set in _init() */
clock_t		md_hz;
md_event_queue_t	*md_event_queue = NULL;

/*
 * Reset to 0 at the start of every mdattach() and then loaded from the
 * md_xlate / "md_keep_repl_state" / "md_devid_destroy" properties.
 */
int		md_in_upgrade;
int		md_keep_repl_state;
int		md_devid_destroy;

/* for sending messages thru a door to userland */
door_handle_t	mdmn_door_handle = NULL;
int		mdmn_door_did = -1;

/* devinfo node saved by mdattach(); NULL while the driver is not attached */
dev_info_t		*md_devinfo = NULL;

md_mn_nodeid_t	md_mn_mynode_id = ~0u;	/* My node id (for multi-node sets) */

/* NOTE(review): presumably per-open-type open counts - confirm with mdopen */
static	uint_t		md_ocnt[OTYPCNT];
    196 
/*
 * Forward declarations of the driver entry points referenced by the
 * md_cb_ops and md_devops tables below.
 */
static int		mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int		mdattach(dev_info_t *, ddi_attach_cmd_t);
static int		mddetach(dev_info_t *, ddi_detach_cmd_t);
static int		mdopen(dev_t *, int, int, cred_t *);
static int		mdclose(dev_t, int, int, cred_t *);
static int		mddump(dev_t, caddr_t, daddr_t, int);
static int		mdread(dev_t, struct uio *, cred_t *);
static int		mdwrite(dev_t, struct uio *, cred_t *);
static int		mdaread(dev_t, struct aio_req *, cred_t *);
static int		mdawrite(dev_t, struct aio_req *, cred_t *);
static int		mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int		mdprop_op(dev_t, dev_info_t *,
				ddi_prop_op_t, int, char *, caddr_t, int *);
    210 
/*
 * Character/block entry point table for the md driver.  Entry points md
 * does not provide (print, devmap, mmap, segmap) are filled with nulldev
 * or nodev cast to the signature the slot expects.
 */
static struct cb_ops md_cb_ops = {
	mdopen,			/* open */
	mdclose,		/* close */
	mdstrategy,		/* strategy */
				/* print routine -- none yet */
	(int(*)(dev_t, char *))nulldev,
	mddump,			/* dump */
	mdread,			/* read */
	mdwrite,		/* write */
	mdioctl,		/* ioctl */
				/* devmap */
	(int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
			uint_t))nodev,
				/* mmap */
	(int(*)(dev_t, off_t, int))nodev,
				/* segmap */
	(int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
		unsigned, unsigned, cred_t *))nodev,
	nochpoll,		/* poll */
	mdprop_op,		/* prop_op */
	0,			/* streamtab */
	(D_64BIT|D_MP|D_NEW),	/* driver compatibility flag */
	CB_REV,			/* cb_ops version */
	mdaread,		/* aread */
	mdawrite,		/* awrite */
};
    237 
/*
 * Device operations table for the md driver: wires up the info, attach and
 * detach routines and points at md_cb_ops for the cb entry points.  md has
 * no bus operations; reset and power management are not supported (nodev).
 */
static struct dev_ops md_devops = {
	DEVO_REV,		/* dev_ops version */
	0,			/* device reference count */
	mdinfo,			/* info routine */
	nulldev,		/* identify routine */
	nulldev,		/* probe - not defined */
	mdattach,		/* attach routine */
	mddetach,		/* detach routine */
	nodev,			/* reset - not defined */
	&md_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* power management */
	ddi_quiesce_not_needed,		/* quiesce */
};
    252 
    253 /*
    254  * loadable module wrapper
    255  */
    256 #include <sys/modctl.h>
    257 
/* driver linkage structure: describes md as a (pseudo) device driver */
static struct modldrv modldrv = {
	&mod_driverops,			/* type of module -- a pseudodriver */
	"Solaris Volume Manager base module", /* name of the module */
	&md_devops,			/* driver ops */
};
    263 
/*
 * Module linkage handed to mod_install(), mod_remove() and mod_info() by
 * _init(), _fini() and _info(); the list holds modldrv and ends with NULL.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
    269 
    270 
/* md_medd.c */
extern	void	med_init(void);		/* called from mdattach() */
extern	void	med_fini(void);		/* called from mddetach() */
extern  void	md_devid_cleanup(set_t, uint_t);

/* md_names.c */
extern void			*lookup_entry(struct nm_next_hdr *, set_t,
					side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr	*get_first_record(set_t, int, int);
extern int			remove_entry(struct nm_next_hdr *,
					side_t, mdkey_t, int);

/*
 * I/O size tunables.  A value of 0 means "not patched": _init() then
 * defaults md_maxphys to maxphys and md_maxbcount to MD_MAXBCOUNT.
 */
int		md_maxphys	= 0;	/* maximum io size in bytes */
#define		MD_MAXBCOUNT	(1024 * 1024)
unsigned	md_maxbcount	= 0;	/* maximum physio size in bytes */
    286 
    287 /*
    288  * Some md ioctls trigger io framework device tree operations.  An
    289  * example is md ioctls that call md_resolve_bydevid(): which uses the
    290  * io framework to resolve a devid. Such operations result in acquiring
    291  * io framework locks (like ndi_devi_enter() of "/") while holding
    292  * driver locks (like md_unit_writerlock()).
    293  *
    294  * The prop_op(9E) entry point is called from the devinfo driver with
    295  * an active ndi_devi_enter of "/". To avoid deadlock, md's prop_op
    296  * implementation must avoid taking a lock that is held per above md
    297  * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock()
    298  * without risking deadlock.
    299  *
    300  * To service "size" requests without risking deadlock, we maintain a
    301  * "mnum->nblocks" sizemap (protected by a short-term global mutex).
    302  */
/* short-term lock taken by md_nblocks_set()/md_nblocks_get() */
static kmutex_t		md_nblocks_mutex;
/* id hash created in mdattach() and destroyed in mddetach() */
static mod_hash_t	*md_nblocksmap;		/* mnum -> nblocks */
/* size argument passed to mod_hash_create_idhash() for md_nblocksmap */
int			md_nblocksmap_size = 512;
    306 
    307 /*
    308  * Maintain "mnum->nblocks" sizemap for mdprop_op use:
    309  *
    310  * Create: any code that establishes a unit's un_total_blocks needs the
    311  * following type of call to establish nblocks for mdprop_op():
 *	md_nblocks_set(mnum, un->c.un_total_blocks);
    313  *	NOTE: locate via cscope md_create_minor_node/md_create_unit_incore
    314  *		...or  "MD_UNIT..*="
    315  *
    316  * Change: any code that changes a unit's un_total_blocks needs the
    317  * following type of call to sync nblocks for mdprop_op():
 *	md_nblocks_set(mnum, un->c.un_total_blocks);
    319  *	NOTE: locate via cscope for "un_total_blocks[ \t]*="
    320  *
    321  * Destroy: any code that deletes a unit needs the following type of call
    322  * to sync nblocks for mdprop_op():
    323  *	md_nblocks_set(mnum, -1ULL);
    324  *	NOTE: locate via cscope md_remove_minor_node/md_destroy_unit_incore
    325  *		...or  "MD_UNIT..*="
    326  */
    327 void
    328 md_nblocks_set(minor_t mnum, uint64_t nblocks)
    329 {
    330 	mutex_enter(&md_nblocks_mutex);
    331 	if (nblocks == -1ULL)
    332 		(void) mod_hash_destroy(md_nblocksmap,
    333 		    (mod_hash_key_t)(intptr_t)mnum);
    334 	else
    335 		(void) mod_hash_replace(md_nblocksmap,
    336 		    (mod_hash_key_t)(intptr_t)mnum,
    337 		    (mod_hash_val_t)(intptr_t)nblocks);
    338 	mutex_exit(&md_nblocks_mutex);
    339 }
    340 
    341 /* get the size of a mnum from "mnum->nblocks" sizemap */
    342 uint64_t
    343 md_nblocks_get(minor_t mnum)
    344 {
    345 	mod_hash_val_t	hv;
    346 
    347 	mutex_enter(&md_nblocks_mutex);
    348 	if (mod_hash_find(md_nblocksmap,
    349 	    (mod_hash_key_t)(intptr_t)mnum, &hv) == 0) {
    350 		mutex_exit(&md_nblocks_mutex);
    351 		return ((uint64_t)(intptr_t)hv);
    352 	}
    353 	mutex_exit(&md_nblocks_mutex);
    354 	return (0);
    355 }
    356 
    357 /* allocate/free dynamic space associated with driver globals */
    358 void
    359 md_global_alloc_free(int alloc)
    360 {
    361 	set_t	s;
    362 
    363 	if (alloc) {
    364 		/* initialize driver global locks */
    365 		cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
    366 		mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
    367 		rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
    368 		rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
    369 		rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
    370 		rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
    371 		mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
    372 		    MUTEX_DEFAULT, NULL);
    373 		mutex_init(&md_nblocks_mutex, NULL, MUTEX_DEFAULT, NULL);
    374 
    375 		/* initialize per set driver global locks */
    376 		for (s = 0; s < MD_MAXSETS; s++) {
    377 			/* initialize per set driver globals locks */
    378 			mutex_init(&md_set[s].s_dbmx,
    379 			    NULL, MUTEX_DEFAULT, NULL);
    380 			mutex_init(&md_set_io[s].md_io_mx,
    381 			    NULL, MUTEX_DEFAULT, NULL);
    382 			cv_init(&md_set_io[s].md_io_cv,
    383 			    NULL, CV_DEFAULT, NULL);
    384 		}
    385 	} else {
    386 		/* destroy per set driver global locks */
    387 		for (s = 0; s < MD_MAXSETS; s++) {
    388 			cv_destroy(&md_set_io[s].md_io_cv);
    389 			mutex_destroy(&md_set_io[s].md_io_mx);
    390 			mutex_destroy(&md_set[s].s_dbmx);
    391 		}
    392 
    393 		/* destroy driver global locks */
    394 		mutex_destroy(&md_nblocks_mutex);
    395 		mutex_destroy(&md_cpr_resync.md_resync_mutex);
    396 		rw_destroy(&hsp_rwlp.lock);
    397 		rw_destroy(&ni_rwlp.lock);
    398 		rw_destroy(&nm_lock.lock);
    399 		rw_destroy(&md_unit_array_rw.lock);
    400 		mutex_destroy(&md_mx);
    401 		cv_destroy(&md_cv);
    402 	}
    403 }
    404 
    405 int
    406 _init(void)
    407 {
    408 	set_t	s;
    409 	int	err;
    410 
    411 	MD_SET_IN(IN_INIT);
    412 
    413 	/* allocate dynamic space associated with driver globals */
    414 	md_global_alloc_free(1);
    415 
    416 	/* initialize driver globals */
    417 	md_major = ddi_name_to_major("md");
    418 	md_hz = drv_usectohz(NUM_USEC_IN_SEC);
    419 
    420 	/* initialize tunable globals */
    421 	if (md_maxphys == 0)		/* maximum io size in bytes */
    422 		md_maxphys = maxphys;
    423 	if (md_maxbcount == 0)		/* maximum physio size in bytes */
    424 		md_maxbcount = MD_MAXBCOUNT;
    425 
    426 	/* initialize per set driver globals */
    427 	for (s = 0; s < MD_MAXSETS; s++)
    428 		md_set_io[s].io_state = MD_SET_ACTIVE;
    429 
    430 	/*
    431 	 * NOTE: the framework does not currently guarantee exclusion
    432 	 * between _init and attach after calling mod_install.
    433 	 */
    434 	MD_CLR_IN(IN_INIT);
    435 	if ((err = mod_install(&modlinkage))) {
    436 		MD_SET_IN(IN_INIT);
    437 		md_global_alloc_free(0);	/* free dynamic space */
    438 		MD_CLR_IN(IN_INIT);
    439 	}
    440 	return (err);
    441 }
    442 
    443 int
    444 _fini(void)
    445 {
    446 	int	err;
    447 
    448 	/*
    449 	 * NOTE: the framework currently does not guarantee exclusion
    450 	 * with attach until after mod_remove returns 0.
    451 	 */
    452 	if ((err = mod_remove(&modlinkage)))
    453 		return (err);
    454 
    455 	MD_SET_IN(IN_FINI);
    456 	md_global_alloc_free(0);	/* free dynamic space */
    457 	MD_CLR_IN(IN_FINI);
    458 	return (err);
    459 }
    460 
/*
 * _info(9E): module information entry point.  Passes this module's
 * linkage and the caller's modinfop to mod_info() and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
    466 
    467 /* ARGSUSED */
    468 static int
    469 mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
    470 {
    471 	int	len;
    472 	unit_t	i;
    473 	size_t	sz;
    474 	char	ver[VERSION_LENGTH];
    475 	char	**maj_str_array;
    476 	char	*str, *str2;
    477 
    478 	MD_SET_IN(IN_ATTACH);
    479 	md_in_upgrade = 0;
    480 	md_keep_repl_state = 0;
    481 	md_devid_destroy = 0;
    482 
    483 	if (cmd != DDI_ATTACH) {
    484 		MD_CLR_IN(IN_ATTACH);
    485 		return (DDI_FAILURE);
    486 	}
    487 
    488 	if (md_devinfo != NULL) {
    489 		MD_CLR_IN(IN_ATTACH);
    490 		return (DDI_FAILURE);
    491 	}
    492 
    493 	mddb_init();
    494 
    495 	if (md_start_daemons(TRUE)) {
    496 		MD_CLR_IN(IN_ATTACH);
    497 		mddb_unload();		/* undo mddb_init() allocations */
    498 		return (DDI_FAILURE);
    499 	}
    500 
    501 	/* clear the halted state */
    502 	md_clr_status(MD_GBL_HALTED);
    503 
    504 	/* see if the diagnostic switch is on */
    505 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
    506 	    DDI_PROP_DONTPASS, "md_init_debug", 0))
    507 		md_init_debug++;
    508 
    509 	/* see if the failfast disable switch is on */
    510 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
    511 	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
    512 		md_ff_disable++;
    513 
    514 	/* try and get the md_nmedh property */
    515 	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
    516 	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
    517 	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
    518 		md_nmedh = MED_DEF_HOSTS;
    519 
    520 	/* try and get the md_med_trans_lst property */
    521 	len = 0;
    522 	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
    523 	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
    524 	    len == 0) {
    525 		md_med_trans_lst = md_strdup("tcp");
    526 	} else {
    527 		md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
    528 		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
    529 		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
    530 		    DDI_PROP_SUCCESS) {
    531 			kmem_free(md_med_trans_lst, (size_t)len);
    532 			md_med_trans_lst = md_strdup("tcp");
    533 		}
    534 	}
    535 
    536 	/*
    537 	 * Must initialize the internal data structures before the
    538 	 * any possible calls to 'goto attach_failure' as _fini
    539 	 * routine references them.
    540 	 */
    541 	med_init();
    542 
    543 	md_ops = (md_ops_t **)kmem_zalloc(
    544 	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
    545 	md_mods = (ddi_modhandle_t *)kmem_zalloc(
    546 	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);
    547 
    548 	/* try and get the md_xlate property */
    549 	/* Should we only do this if upgrade? */
    550 	len = sizeof (char) * 5;
    551 	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
    552 	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
    553 		if (strcmp(ver, VERSION) == 0) {
    554 			len = 0;
    555 			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
    556 			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
    557 			    (caddr_t)&md_tuple_table, &len) !=
    558 			    DDI_PROP_SUCCESS) {
    559 				if (md_init_debug)
    560 					cmn_err(CE_WARN,
    561 					    "md_xlate ddi_prop_op failed");
    562 				goto attach_failure;
    563 			} else {
    564 				md_tuple_length =
    565 				    len/(2 * ((int)sizeof (dev32_t)));
    566 				md_in_upgrade = 1;
    567 			}
    568 
    569 			/* Get target's name to major table */
    570 			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
    571 			    dip, DDI_PROP_DONTPASS,
    572 			    "md_targ_nm_table", &maj_str_array,
    573 			    &md_majortab_len) != DDI_PROP_SUCCESS) {
    574 				md_majortab_len = 0;
    575 				if (md_init_debug)
    576 					cmn_err(CE_WARN, "md_targ_nm_table "
    577 					    "ddi_prop_lookup_string_array "
    578 					    "failed");
    579 				goto attach_failure;
    580 			}
    581 
    582 			md_major_tuple_table =
    583 			    (struct md_xlate_major_table *)
    584 			    kmem_zalloc(md_majortab_len *
    585 			    sizeof (struct md_xlate_major_table), KM_SLEEP);
    586 
    587 			for (i = 0; i < md_majortab_len; i++) {
    588 				/* Getting major name */
    589 				str = strchr(maj_str_array[i], ' ');
    590 				if (str == NULL)
    591 					continue;
    592 				*str = '\0';
    593 				md_major_tuple_table[i].drv_name =
    594 				    md_strdup(maj_str_array[i]);
    595 
    596 				/* Simplified atoi to get major number */
    597 				str2 = str + 1;
    598 				md_major_tuple_table[i].targ_maj = 0;
    599 				while ((*str2 >= '0') && (*str2 <= '9')) {
    600 					md_major_tuple_table[i].targ_maj *= 10;
    601 					md_major_tuple_table[i].targ_maj +=
    602 					    *str2++ - '0';
    603 				}
    604 				*str = ' ';
    605 			}
    606 			ddi_prop_free((void *)maj_str_array);
    607 		} else {
    608 			if (md_init_debug)
    609 				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
    610 			goto attach_failure;
    611 		}
    612 	}
    613 
    614 	/*
    615 	 * Check for properties:
    616 	 * 	md_keep_repl_state and md_devid_destroy
    617 	 * and set globals if these exist.
    618 	 */
    619 	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
    620 	    0, "md_keep_repl_state", 0);
    621 
    622 	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
    623 	    0, "md_devid_destroy", 0);
    624 
    625 	if (MD_UPGRADE)
    626 		md_major_targ = md_targ_name_to_major("md");
    627 	else
    628 		md_major_targ = 0;
    629 
    630 	/* allocate admin device node */
    631 	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
    632 	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
    633 		goto attach_failure;
    634 
    635 	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
    636 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
    637 		goto attach_failure;
    638 
    639 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
    640 	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
    641 		goto attach_failure;
    642 
    643 	/* these could have been cleared by a detach */
    644 	md_nunits = MD_MAXUNITS;
    645 	md_nsets = MD_MAXSETS;
    646 
    647 	sz = sizeof (void *) * MD_MAXUNITS;
    648 	if (md_set[0].s_un == NULL)
    649 		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
    650 	if (md_set[0].s_ui == NULL)
    651 		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);
    652 
    653 	md_devinfo = dip;
    654 
    655 	/*
    656 	 * Only allocate device node for root mirror metadevice.
    657 	 * Don't pre-allocate unnecessary device nodes (thus slowing down a
    658 	 * boot when we attach).
    659 	 * We can't read the mddbs in attach.  The mddbs will be read
    660 	 * by metainit during the boot process when it is doing the
    661 	 * auto-take processing and any other minor nodes will be
    662 	 * allocated at that point.
    663 	 *
    664 	 * There are two scenarios to be aware of here:
    665 	 * 1) when we are booting from a mirrored root we need the root
    666 	 *    metadevice to exist very early (during vfs_mountroot processing)
    667 	 * 2) we need all of the nodes to be created so that any mnttab entries
    668 	 *    will succeed (handled by metainit reading the mddb during boot).
    669 	 */
    670 	if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
    671 	    == 0) {
    672 		char *p;
    673 		int mnum = 0;
    674 
    675 		/*
    676 		 * The svm_bootpath string looks something like
    677 		 * /pseudo/md@0:0,150,blk where 150 is the minor number
    678 		 * in this example so we need to set the pointer p onto
    679 		 * the first digit of the minor number and convert it
    680 		 * from ascii.
    681 		 */
    682 		for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
    683 		    *p >= '0' && *p <= '9'; p++) {
    684 			mnum *= 10;
    685 			mnum += *p - '0';
    686 		}
    687 
    688 		if (md_create_minor_node(0, mnum)) {
    689 			kmem_free(md_set[0].s_un, sz);
    690 			kmem_free(md_set[0].s_ui, sz);
    691 			goto attach_failure;
    692 		}
    693 	}
    694 
    695 	/* create the hash to store the meta device sizes */
    696 	md_nblocksmap = mod_hash_create_idhash("md_nblocksmap",
    697 	    md_nblocksmap_size, mod_hash_null_valdtor);
    698 
    699 	MD_CLR_IN(IN_ATTACH);
    700 	return (DDI_SUCCESS);
    701 
    702 attach_failure:
    703 	/*
    704 	 * Use our own detach routine to toss any stuff we allocated above.
    705 	 * NOTE: detach will call md_halt to free the mddb_init allocations.
    706 	 */
    707 	MD_CLR_IN(IN_ATTACH);
    708 	if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
    709 		cmn_err(CE_WARN, "detach from attach failed");
    710 	return (DDI_FAILURE);
    711 }
    712 
    713 /* ARGSUSED */
    714 static int
    715 mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
    716 {
    717 	extern int	check_active_locators();
    718 	set_t		s;
    719 	size_t		sz;
    720 	int		len;
    721 
    722 	MD_SET_IN(IN_DETACH);
    723 
    724 	/* check command */
    725 	if (cmd != DDI_DETACH) {
    726 		MD_CLR_IN(IN_DETACH);
    727 		return (DDI_FAILURE);
    728 	}
    729 
    730 	/*
    731 	 * if we have not already halted yet we have no active config
    732 	 * then automatically initiate a halt so we can detach.
    733 	 */
    734 	if (!(md_get_status() & MD_GBL_HALTED)) {
    735 		if (check_active_locators() == 0) {
    736 			/*
    737 			 * NOTE: a successful md_halt will have done the
    738 			 * mddb_unload to free allocations done in mddb_init
    739 			 */
    740 			if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
    741 				cmn_err(CE_NOTE, "md:detach: "
    742 				    "Could not halt Solaris Volume Manager");
    743 				MD_CLR_IN(IN_DETACH);
    744 				return (DDI_FAILURE);
    745 			}
    746 		}
    747 
    748 		/* fail detach if we have not halted */
    749 		if (!(md_get_status() & MD_GBL_HALTED)) {
    750 			MD_CLR_IN(IN_DETACH);
    751 			return (DDI_FAILURE);
    752 		}
    753 	}
    754 
    755 	/* must be in halted state, this will be cleared on next attach */
    756 	ASSERT(md_get_status() & MD_GBL_HALTED);
    757 
    758 	/* cleanup attach allocations and initializations */
    759 	md_major_targ = 0;
    760 
    761 	sz = sizeof (void *) * md_nunits;
    762 	for (s = 0; s < md_nsets; s++) {
    763 		if (md_set[s].s_un != NULL) {
    764 			kmem_free(md_set[s].s_un, sz);
    765 			md_set[s].s_un = NULL;
    766 		}
    767 
    768 		if (md_set[s].s_ui != NULL) {
    769 			kmem_free(md_set[s].s_ui, sz);
    770 			md_set[s].s_ui = NULL;
    771 		}
    772 	}
    773 	md_nunits = 0;
    774 	md_nsets = 0;
    775 	md_nmedh = 0;
    776 
    777 	if (non_ff_drivers != NULL) {
    778 		int	i;
    779 
    780 		for (i = 0; non_ff_drivers[i] != NULL; i++)
    781 			kmem_free(non_ff_drivers[i],
    782 			    strlen(non_ff_drivers[i]) + 1);
    783 
    784 		/* free i+1 entries because there is a null entry at list end */
    785 		kmem_free(non_ff_drivers, (i + 1) * sizeof (char *));
    786 		non_ff_drivers = NULL;
    787 	}
    788 
    789 	if (md_med_trans_lst != NULL) {
    790 		kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
    791 		md_med_trans_lst = NULL;
    792 	}
    793 
    794 	if (md_mods != NULL) {
    795 		kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
    796 		md_mods = NULL;
    797 	}
    798 
    799 	if (md_ops != NULL) {
    800 		kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
    801 		md_ops = NULL;
    802 	}
    803 
    804 	if (MD_UPGRADE) {
    805 		len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
    806 		md_in_upgrade = 0;
    807 		md_xlate_free(len);
    808 		md_majortab_free();
    809 	}
    810 
    811 	/*
    812 	 * Undo what we did in mdattach, freeing resources
    813 	 * and removing things we installed.  The system
    814 	 * framework guarantees we are not active with this devinfo
    815 	 * node in any other entry points at this time.
    816 	 */
    817 	ddi_prop_remove_all(dip);
    818 	ddi_remove_minor_node(dip, NULL);
    819 
    820 	med_fini();
    821 
    822 	mod_hash_destroy_idhash(md_nblocksmap);
    823 
    824 	md_devinfo = NULL;
    825 
    826 	MD_CLR_IN(IN_DETACH);
    827 	return (DDI_SUCCESS);
    828 }
    829 
    830 
    831 /*
    832  * Given the device number return the devinfo pointer
    833  * given to md via md_attach
    834  */
    835 /*ARGSUSED*/
    836 static int
    837 mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
    838 {
    839 	int		error = DDI_FAILURE;
    840 
    841 	switch (infocmd) {
    842 	case DDI_INFO_DEVT2DEVINFO:
    843 		if (md_devinfo) {
    844 			*result = (void *)md_devinfo;
    845 			error = DDI_SUCCESS;
    846 		}
    847 		break;
    848 
    849 	case DDI_INFO_DEVT2INSTANCE:
    850 		*result = (void *)0;
    851 		error = DDI_SUCCESS;
    852 		break;
    853 	}
    854 	return (error);
    855 }
    856 
    857 /*
    858  * property operation routine.  return the number of blocks for the partition
    859  * in question or forward the request to the property facilities.
    860  */
    861 static int
    862 mdprop_op(
    863 	dev_t dev,		/* device number associated with device */
    864 	dev_info_t *dip,	/* device info struct for this device */
    865 	ddi_prop_op_t prop_op,	/* property operator */
    866 	int mod_flags,		/* property flags */
    867 	char *name,		/* name of property */
    868 	caddr_t valuep,		/* where to put property value */
    869 	int *lengthp)		/* put length of property here */
    870 {
    871 	return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
    872 	    name, valuep, lengthp, md_nblocks_get(getminor(dev))));
    873 }
    874 
    875 static void
    876 snarf_user_data(set_t setno)
    877 {
    878 	mddb_recid_t		recid;
    879 	mddb_recstatus_t	status;
    880 
    881 	recid = mddb_makerecid(setno, 0);
    882 	while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
    883 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
    884 			continue;
    885 
    886 		status = mddb_getrecstatus(recid);
    887 		if (status == MDDB_STALE)
    888 			continue;
    889 
    890 		if (status == MDDB_NODATA) {
    891 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
    892 			continue;
    893 		}
    894 
    895 		ASSERT(status == MDDB_OK);
    896 
    897 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
    898 	}
    899 }
    900 
    901 static void
    902 md_print_block_usage(mddb_set_t *s, uint_t blks)
    903 {
    904 	uint_t		ib;
    905 	int		li;
    906 	mddb_mb_ic_t	*mbip;
    907 	uint_t		max_blk_needed;
    908 	mddb_lb_t	*lbp;
    909 	mddb_sidelocator_t	*slp;
    910 	int		drv_index;
    911 	md_splitname	sn;
    912 	char		*name;
    913 	char		*suffix;
    914 	size_t		prefixlen;
    915 	size_t		suffixlen;
    916 	int		alloc_sz;
    917 
    918 
    919 	max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;
    920 
    921 	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
    922 	    "            Additional Blocks Needed:            %d\n\n"
    923 	    "            Increase size of following replicas for\n"
    924 	    "            device relocatability by deleting listed\n"
    925 	    "            replica and re-adding replica with\n"
    926 	    "            increased size (see metadb(1M)):\n"
    927 	    "                Replica                   Increase By",
    928 	    s->s_totalblkcnt, (blks - s->s_freeblkcnt));
    929 
    930 	lbp = s->s_lbp;
    931 
    932 	for (li = 0; li < lbp->lb_loccnt; li++) {
    933 		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
    934 			continue;
    935 		ib = 0;
    936 		for (mbip = s->s_mbiarray[li]; mbip != NULL;
    937 		    mbip = mbip->mbi_next) {
    938 			ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
    939 		}
    940 		if (ib == 0)
    941 			continue;
    942 		if (ib < max_blk_needed) {
    943 			slp = &lbp->lb_sidelocators[s->s_sideno][li];
    944 			drv_index = slp->l_drvnm_index;
    945 			mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
    946 			    &sn);
    947 			prefixlen = SPN_PREFIX(&sn).pre_len;
    948 			suffixlen = SPN_SUFFIX(&sn).suf_len;
    949 			alloc_sz = (int)(prefixlen + suffixlen + 2);
    950 			name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
    951 			(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
    952 			    prefixlen);
    953 			name[prefixlen] = '/';
    954 			suffix = name + (prefixlen + 1);
    955 			(void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
    956 			    suffixlen);
    957 			name[prefixlen + suffixlen + 1] = '\0';
    958 			cmn_err(CE_WARN,
    959 			    "  %s (%s:%d:%d)   %d blocks",
    960 			    name, lbp->lb_drvnm[drv_index].dn_data,
    961 			    slp->l_mnum, lbp->lb_locators[li].l_blkno,
    962 			    (max_blk_needed - ib));
    963 			kmem_free(name, alloc_sz);
    964 		}
    965 	}
    966 }
    967 
    968 /*
    969  * md_create_minor_node:
    970  *	Create the minor device for the given set and un_self_id.
    971  *
    972  * Input:
    973  *	setno	- set number
    974  *	mnum	- selfID of unit
    975  *
    976  * Output:
    977  *	None.
    978  *
    979  * Returns 0 for success, 1 for failure.
    980  *
    981  * Side-effects:
    982  *	None.
    983  */
    984 int
    985 md_create_minor_node(set_t setno, minor_t mnum)
    986 {
    987 	char		name[20];
    988 
    989 	/* Check for valid arguments */
    990 	if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
    991 		return (1);
    992 
    993 	(void) snprintf(name, 20, "%u,%u,blk",
    994 	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
    995 
    996 	if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
    997 	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
    998 		return (1);
    999 
   1000 	(void) snprintf(name, 20, "%u,%u,raw",
   1001 	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
   1002 
   1003 	if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
   1004 	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
   1005 		return (1);
   1006 
   1007 	return (0);
   1008 }
   1009 
   1010 /*
   1011  * For a given key check if it is an orphaned record.
   1012  * The following conditions are used to determine an orphan.
   1013  * 1. The device associated with that key is not a metadevice.
   1014  * 2. If DEVID_STYLE then the physical device does not have a device Id
   1015  * associated with it.
   1016  *
   1017  * If a key does not have an entry in the devid namespace it could be
   1018  * a device that does not support device ids. Hence the record is not
   1019  * deleted.
   1020  */
   1021 
   1022 static int
   1023 md_verify_orphaned_record(set_t setno, mdkey_t key)
   1024 {
   1025 	md_dev64_t	odev; /* orphaned dev */
   1026 	mddb_set_t	*s;
   1027 	side_t		side = 0;
   1028 	struct nm_next_hdr	*did_nh = NULL;
   1029 
   1030 	s = (mddb_set_t *)md_set[setno].s_db;
   1031 	if ((did_nh = get_first_record(setno, 1,  (NM_DEVID | NM_NOTSHARED)))
   1032 	    == NULL)
   1033 		return (0);
   1034 	/*
   1035 	 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
   1036 	 */
   1037 	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
   1038 		odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
   1039 		if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
   1040 			return (0);
   1041 		if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
   1042 		    NULL)
   1043 			return (1);
   1044 	}
   1045 	return (0);
   1046 }
   1047 
   1048 int
   1049 md_snarf_db_set(set_t setno, md_error_t *ep)
   1050 {
   1051 	int			err = 0;
   1052 	int			i;
   1053 	mddb_recid_t		recid;
   1054 	mddb_type_t		drvrid;
   1055 	mddb_recstatus_t	status;
   1056 	md_ops_t		*ops;
   1057 	uint_t			privat;
   1058 	mddb_set_t		*s;
   1059 	uint_t			cvt_blks;
   1060 	struct nm_next_hdr	*nh;
   1061 	mdkey_t			key = MD_KEYWILD;
   1062 	side_t			side = 0;
   1063 	int			size;
   1064 	int			devid_flag;
   1065 	int			retval;
   1066 	uint_t			un;
   1067 	int			un_next_set = 0;
   1068 
   1069 	md_haltsnarf_enter(setno);
   1070 
   1071 	mutex_enter(&md_mx);
   1072 	if (md_set[setno].s_status & MD_SET_SNARFED) {
   1073 		mutex_exit(&md_mx);
   1074 		md_haltsnarf_exit(setno);
   1075 		return (0);
   1076 	}
   1077 	mutex_exit(&md_mx);
   1078 
   1079 	if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) {
   1080 		if (md_start_daemons(TRUE)) {
   1081 			if (ep != NULL)
   1082 				(void) mdsyserror(ep, ENXIO);
   1083 			err = -1;
   1084 			goto out;
   1085 		}
   1086 	}
   1087 
   1088 
   1089 	/*
   1090 	 * Load the devid name space if it exists
   1091 	 */
   1092 	(void) md_load_namespace(setno, NULL, NM_DEVID);
   1093 	if (!md_load_namespace(setno, ep, 0L)) {
   1094 		/*
   1095 		 * Unload the devid namespace
   1096 		 */
   1097 		(void) md_unload_namespace(setno, NM_DEVID);
   1098 		err = -1;
   1099 		goto out;
   1100 	}
   1101 
   1102 	/*
   1103 	 * If replica is in non-devid state, convert if:
   1104 	 * 	- not in probe during upgrade (md_keep_repl_state = 0)
   1105 	 * 	- enough space available in replica
   1106 	 *	- local set
   1107 	 *	- not a multi-node diskset
   1108 	 *	- clustering is not present (for non-local set)
   1109 	 */
   1110 	s = (mddb_set_t *)md_set[setno].s_db;
   1111 	devid_flag = 0;
   1112 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
   1113 		devid_flag = 1;
   1114 	if (cluster_bootflags & CLUSTER_CONFIGURED)
   1115 		if (setno != MD_LOCAL_SET)
   1116 			devid_flag = 0;
   1117 	if (MD_MNSET_SETNO(setno))
   1118 		devid_flag = 0;
   1119 	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
   1120 		devid_flag = 0;
   1121 
   1122 	/*
   1123 	 * if we weren't devid style before and md_keep_repl_state=1
   1124 	 * we need to stay non-devid
   1125 	 */
   1126 	if ((md_keep_repl_state == 1) &&
   1127 	    ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
   1128 		devid_flag = 0;
   1129 	if (devid_flag) {
   1130 		/*
   1131 		 * Determine number of free blocks needed to convert
   1132 		 * entire replica to device id format - locator blocks
   1133 		 * and namespace.
   1134 		 */
   1135 		cvt_blks = 0;
   1136 		if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
   1137 			if (ep != NULL)
   1138 				(void) mdsyserror(ep, EIO);
   1139 			err = -1;
   1140 			goto out;
   1141 
   1142 		}
   1143 		cvt_blks += md_nm_did_chkspace(setno);
   1144 
   1145 		/* add MDDB_DEVID_CONV_PERC% */
   1146 		if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
   1147 			cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
   1148 		}
   1149 
   1150 		if (cvt_blks <= s->s_freeblkcnt) {
   1151 			if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
   1152 				if (ep != NULL)
   1153 					(void) mdsyserror(ep, EIO);
   1154 				err = -1;
   1155 				goto out;
   1156 			}
   1157 
   1158 		} else {
   1159 			/*
   1160 			 * Print message that replica can't be converted for
   1161 			 * lack of space.   No failure - just continue to
   1162 			 * run without device ids.
   1163 			 */
   1164 			cmn_err(CE_WARN,
   1165 			    "Unable to add Solaris Volume Manager device "
   1166 			    "relocation data.\n"
   1167 			    "          To use device relocation feature:\n"
   1168 			    "          - Increase size of listed replicas\n"
   1169 			    "          - Reboot");
   1170 			md_print_block_usage(s, cvt_blks);
   1171 			cmn_err(CE_WARN,
   1172 			    "Loading set without device relocation data.\n"
   1173 			    "          Solaris Volume Manager disk movement "
   1174 			    "not tracked in local set.");
   1175 		}
   1176 	}
   1177 
   1178 	/*
   1179 	 * go through and load any modules referenced in
   1180 	 * data base
   1181 	 */
   1182 	recid = mddb_makerecid(setno, 0);
   1183 	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
   1184 		status = mddb_getrecstatus(recid);
   1185 		if (status == MDDB_STALE) {
   1186 			if (! (md_get_setstatus(setno) & MD_SET_STALE)) {
   1187 				md_set_setstatus(setno, MD_SET_STALE);
   1188 				cmn_err(CE_WARN,
   1189 				    "md: state database is stale");
   1190 			}
   1191 		} else if (status == MDDB_NODATA) {
   1192 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
   1193 			continue;
   1194 		}
   1195 		drvrid = mddb_getrectype1(recid);
   1196 		if (drvrid < MDDB_FIRST_MODID)
   1197 			continue;
   1198 		if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
   1199 		    drvrid) < 0) {
   1200 			cmn_err(CE_NOTE, "md: could not load misc/%s",
   1201 			    md_getshared_name(setno, drvrid));
   1202 		}
   1203 	}
   1204 
   1205 	if (recid < 0)
   1206 		goto out;
   1207 
   1208 	snarf_user_data(setno);
   1209 
   1210 	/*
   1211 	 * Initialize the md_nm_snarfed array
   1212 	 * this array is indexed by the key and
   1213 	 * is set by md_getdevnum during the snarf time
   1214 	 */
   1215 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
   1216 		size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
   1217 		    r_next_key) * (sizeof (int)));
   1218 		md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
   1219 	}
   1220 
   1221 	/*
   1222 	 * go through and snarf until nothing gets added
   1223 	 */
   1224 	do {
   1225 		i = 0;
   1226 		for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
   1227 			if (ops->md_snarf != NULL) {
   1228 				retval = ops->md_snarf(MD_SNARF_DOIT, setno);
   1229 				if (retval == -1) {
   1230 					err = -1;
   1231 					/* Don't know the failed unit */
   1232 					(void) mdmderror(ep, MDE_RR_ALLOC_ERROR,
   1233 					    0);
   1234 					(void) md_halt_set(setno, MD_HALT_ALL);
   1235 					(void) mddb_unload_set(setno);
   1236 					md_haltsnarf_exit(setno);
   1237 					return (err);
   1238 				} else {
   1239 					i += retval;
   1240 				}
   1241 			}
   1242 		}
   1243 	} while (i);
   1244 
   1245 	/*
   1246 	 * Set the first available slot and availability
   1247 	 */
   1248 	md_set[setno].s_un_avail = 0;
   1249 	for (un = 0; un < MD_MAXUNITS; un++) {
   1250 		if (md_set[setno].s_un[un] != NULL) {
   1251 			continue;
   1252 		} else {
   1253 			if (!un_next_set) {
   1254 				md_set[setno].s_un_next = un;
   1255 				un_next_set = 1;
   1256 			}
   1257 			md_set[setno].s_un_avail++;
   1258 		}
   1259 	}
   1260 
   1261 	md_set_setstatus(setno, MD_SET_SNARFED);
   1262 
   1263 	recid = mddb_makerecid(setno, 0);
   1264 	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
   1265 		privat = mddb_getrecprivate(recid);
   1266 		if (privat & MD_PRV_COMMIT) {
   1267 			if (mddb_commitrec(recid)) {
   1268 				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
   1269 					md_set_setstatus(setno, MD_SET_STALE);
   1270 					cmn_err(CE_WARN,
   1271 					    "md: state database is stale");
   1272 				}
   1273 			}
   1274 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
   1275 		}
   1276 	}
   1277 
   1278 	/* Deletes must happen after all the commits */
   1279 	recid = mddb_makerecid(setno, 0);
   1280 	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
   1281 		privat = mddb_getrecprivate(recid);
   1282 		if (privat & MD_PRV_DELETE) {
   1283 			if (mddb_deleterec(recid)) {
   1284 				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
   1285 					md_set_setstatus(setno, MD_SET_STALE);
   1286 					cmn_err(CE_WARN,
   1287 					    "md: state database is stale");
   1288 				}
   1289 				mddb_setrecprivate(recid, MD_PRV_GOTIT);
   1290 			}
   1291 			recid = mddb_makerecid(setno, 0);
   1292 		}
   1293 	}
   1294 
   1295 	/*
   1296 	 * go through and clean up records until nothing gets cleaned up.
   1297 	 */
   1298 	do {
   1299 		i = 0;
   1300 		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
   1301 			if (ops->md_snarf != NULL)
   1302 				i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
   1303 	} while (i);
   1304 
   1305 	if (md_nm_snarfed != NULL &&
   1306 	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
   1307 		/*
   1308 		 * go thru and cleanup the namespace and the device id
   1309 		 * name space
   1310 		 */
   1311 		for (key = 1;
   1312 		    key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
   1313 		    key++) {
   1314 			/*
   1315 			 * Is the entry an 'orphan'?
   1316 			 */
   1317 			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
   1318 			    NULL) {
   1319 				/*
   1320 				 * If the value is not set then apparently
   1321 				 * it is not part of the current configuration,
   1322 				 * remove it this can happen when system panic
   1323 				 * between the primary name space update and
   1324 				 * the device id name space update
   1325 				 */
   1326 				if (md_nm_snarfed[key] == 0) {
   1327 					if (md_verify_orphaned_record(setno,
   1328 					    key) == 1)
   1329 						(void) remove_entry(nh,
   1330 						    side, key, 0L);
   1331 				}
   1332 			}
   1333 		}
   1334 	}
   1335 
   1336 	if (md_nm_snarfed != NULL) {
   1337 		/*
   1338 		 * Done and free the memory
   1339 		 */
   1340 		kmem_free(md_nm_snarfed, size);
   1341 		md_nm_snarfed = NULL;
   1342 	}
   1343 
   1344 	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
   1345 	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
   1346 		/*
   1347 		 * if the destroy flag has been set and
   1348 		 * the MD_SET_DIDCLUP bit is not set in
   1349 		 * the set's status field, cleanup the
   1350 		 * entire device id namespace
   1351 		 */
   1352 		if (md_devid_destroy &&
   1353 		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
   1354 			(void) md_devid_cleanup(setno, 1);
   1355 			md_set_setstatus(setno, MD_SET_DIDCLUP);
   1356 		} else
   1357 			(void) md_devid_cleanup(setno, 0);
   1358 	}
   1359 
   1360 	/*
   1361 	 * clear single threading on snarf, return success or error
   1362 	 */
   1363 out:
   1364 	md_haltsnarf_exit(setno);
   1365 	return (err);
   1366 }
   1367 
   1368 void
   1369 get_minfo(struct dk_minfo *info, minor_t mnum)
   1370 {
   1371 	md_unit_t	*un;
   1372 	mdi_unit_t	*ui;
   1373 
   1374 	info->dki_capacity = 0;
   1375 	info->dki_lbsize = 0;
   1376 	info->dki_media_type = 0;
   1377 
   1378 	if ((ui = MDI_UNIT(mnum)) == NULL) {
   1379 		return;
   1380 	}
   1381 	un = (md_unit_t *)md_unit_readerlock(ui);
   1382 	info->dki_capacity = un->c.un_total_blocks;
   1383 	md_unit_readerexit(ui);
   1384 	info->dki_lbsize = DEV_BSIZE;
   1385 	info->dki_media_type = DK_UNKNOWN;
   1386 }
   1387 
   1388 
   1389 void
   1390 get_info(struct dk_cinfo *info, minor_t mnum)
   1391 {
   1392 	/*
   1393 	 * Controller Information
   1394 	 */
   1395 	info->dki_ctype = DKC_MD;
   1396 	info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
   1397 	(void) strcpy(info->dki_cname,
   1398 	    ddi_get_name(ddi_get_parent(md_devinfo)));
   1399 	/*
   1400 	 * Unit Information
   1401 	 */
   1402 	info->dki_unit = mnum;
   1403 	info->dki_slave = 0;
   1404 	(void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
   1405 	info->dki_flags = 0;
   1406 	info->dki_partition = 0;
   1407 	info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);
   1408 
   1409 	/*
   1410 	 * We can't get from here to there yet
   1411 	 */
   1412 	info->dki_addr = 0;
   1413 	info->dki_space = 0;
   1414 	info->dki_prio = 0;
   1415 	info->dki_vec = 0;
   1416 }
   1417 
   1418 /*
   1419  * open admin device
   1420  */
   1421 static int
   1422 mdadminopen(
   1423 	int	flag,
   1424 	int	otyp)
   1425 {
   1426 	int	err = 0;
   1427 
   1428 	/* single thread */
   1429 	mutex_enter(&md_mx);
   1430 
   1431 	/* check type and flags */
   1432 	if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
   1433 		err = EINVAL;
   1434 		goto out;
   1435 	}
   1436 	if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
   1437 	    (md_status & MD_GBL_EXCL)) {
   1438 		err = EBUSY;
   1439 		goto out;
   1440 	}
   1441 
   1442 	/* count and flag open */
   1443 	md_ocnt[otyp]++;
   1444 	md_status |= MD_GBL_OPEN;
   1445 	if (flag & FEXCL)
   1446 		md_status |= MD_GBL_EXCL;
   1447 
   1448 	/* unlock return success */
   1449 out:
   1450 	mutex_exit(&md_mx);
   1451 	return (err);
   1452 }
   1453 
   1454 /*
   1455  * open entry point
   1456  */
   1457 static int
   1458 mdopen(
   1459 	dev_t		*dev,
   1460 	int		flag,
   1461 	int		otyp,
   1462 	cred_t		*cred_p)
   1463 {
   1464 	minor_t		mnum = getminor(*dev);
   1465 	unit_t		unit = MD_MIN2UNIT(mnum);
   1466 	set_t		setno = MD_MIN2SET(mnum);
   1467 	mdi_unit_t	*ui = NULL;
   1468 	int		err = 0;
   1469 	md_parent_t	parent;
   1470 
   1471 	/* dispatch admin device opens */
   1472 	if (mnum == MD_ADM_MINOR)
   1473 		return (mdadminopen(flag, otyp));
   1474 
   1475 	/* lock, check status */
   1476 	rw_enter(&md_unit_array_rw.lock, RW_READER);
   1477 
   1478 tryagain:
   1479 	if (md_get_status() & MD_GBL_HALTED)  {
   1480 		err = ENODEV;
   1481 		goto out;
   1482 	}
   1483 
   1484 	/* check minor */
   1485 	if ((setno >= md_nsets) || (unit >= md_nunits)) {
   1486 		err = ENXIO;
   1487 		goto out;
   1488 	}
   1489 
   1490 	/* make sure we're snarfed */
   1491 	if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
   1492 		if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
   1493 			err = ENODEV;
   1494 			goto out;
   1495 		}
   1496 	}
   1497 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
   1498 		err = ENODEV;
   1499 		goto out;
   1500 	}
   1501 
   1502 	/* check unit */
   1503 	if ((ui = MDI_UNIT(mnum)) == NULL) {
   1504 		err = ENXIO;
   1505 		goto out;
   1506 	}
   1507 
   1508 	/*
   1509 	 * The softpart open routine may do an I/O during the open, in
   1510 	 * which case the open routine will set the OPENINPROGRESS flag
   1511 	 * and drop all locks during the I/O.  If this thread sees
   1512 	 * the OPENINPROGRESS flag set, if should wait until the flag
   1513 	 * is reset before calling the driver's open routine.  It must
   1514 	 * also revalidate the world after it grabs the unit_array lock
   1515 	 * since the set may have been released or the metadevice cleared
   1516 	 * during the sleep.
   1517 	 */
   1518 	if (MD_MNSET_SETNO(setno)) {
   1519 		mutex_enter(&ui->ui_mx);
   1520 		if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
   1521 			rw_exit(&md_unit_array_rw.lock);
   1522 			cv_wait(&ui->ui_cv, &ui->ui_mx);
   1523 			rw_enter(&md_unit_array_rw.lock, RW_READER);
   1524 			mutex_exit(&ui->ui_mx);
   1525 			goto tryagain;
   1526 		}
   1527 		mutex_exit(&ui->ui_mx);
   1528 	}
   1529 
   1530 	/* Test if device is openable */
   1531 	if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
   1532 		err = ENXIO;
   1533 		goto out;
   1534 	}
   1535 
   1536 	/* don't allow opens w/WRITE flag if stale */
   1537 	if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
   1538 		err = EROFS;
   1539 		goto out;
   1540 	}
   1541 
   1542 	/* don't allow writes to subdevices */
   1543 	parent = md_get_parent(md_expldev(*dev));
   1544 	if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
   1545 		err = EROFS;
   1546 		goto out;
   1547 	}
   1548 
   1549 	/* open underlying driver */
   1550 	if (md_ops[ui->ui_opsindex]->md_open != NULL) {
   1551 		if ((err = (*md_ops[ui->ui_opsindex]->md_open)
   1552 		    (dev, flag, otyp, cred_p, 0)) != 0)
   1553 			goto out;
   1554 	}
   1555 
   1556 	/* or do it ourselves */
   1557 	else {
   1558 		/* single thread */
   1559 		(void) md_unit_openclose_enter(ui);
   1560 		err = md_unit_incopen(mnum, flag, otyp);
   1561 		md_unit_openclose_exit(ui);
   1562 		if (err != 0)
   1563 			goto out;
   1564 	}
   1565 
   1566 	/* unlock, return status */
   1567 out:
   1568 	rw_exit(&md_unit_array_rw.lock);
   1569 	return (err);
   1570 }
   1571 
   1572 /*
   1573  * close admin device
   1574  */
   1575 static int
   1576 mdadminclose(
   1577 	int	otyp)
   1578 {
   1579 	int	i;
   1580 	int	err = 0;
   1581 
   1582 	/* single thread */
   1583 	mutex_enter(&md_mx);
   1584 
   1585 	/* check type and flags */
   1586 	if ((otyp < 0) || (otyp >= OTYPCNT)) {
   1587 		err = EINVAL;
   1588 		goto out;
   1589 	} else if (md_ocnt[otyp] == 0) {
   1590 		err = ENXIO;
   1591 		goto out;
   1592 	}
   1593 
   1594 	/* count and flag closed */
   1595 	if (otyp == OTYP_LYR)
   1596 		md_ocnt[otyp]--;
   1597 	else
   1598 		md_ocnt[otyp] = 0;
   1599 	md_status &= ~MD_GBL_OPEN;
   1600 	for (i = 0; (i < OTYPCNT); ++i)
   1601 		if (md_ocnt[i] != 0)
   1602 			md_status |= MD_GBL_OPEN;
   1603 	if (! (md_status & MD_GBL_OPEN))
   1604 		md_status &= ~MD_GBL_EXCL;
   1605 
   1606 	/* unlock return success */
   1607 out:
   1608 	mutex_exit(&md_mx);
   1609 	return (err);
   1610 }
   1611 
   1612 /*
   1613  * close entry point
   1614  */
   1615 static int
   1616 mdclose(
   1617 	dev_t		dev,
   1618 	int		flag,
   1619 	int		otyp,
   1620 	cred_t		*cred_p)
   1621 {
   1622 	minor_t		mnum = getminor(dev);
   1623 	set_t		setno = MD_MIN2SET(mnum);
   1624 	unit_t		unit = MD_MIN2UNIT(mnum);
   1625 	mdi_unit_t	*ui = NULL;
   1626 	int		err = 0;
   1627 
   1628 	/* dispatch admin device closes */
   1629 	if (mnum == MD_ADM_MINOR)
   1630 		return (mdadminclose(otyp));
   1631 
   1632 	/* check minor */
   1633 	if ((setno >= md_nsets) || (unit >= md_nunits) ||
   1634 	    ((ui = MDI_UNIT(mnum)) == NULL)) {
   1635 		err = ENXIO;
   1636 		goto out;
   1637 	}
   1638 
   1639 	/* close underlying driver */
   1640 	if (md_ops[ui->ui_opsindex]->md_close != NULL) {
   1641 		if ((err = (*md_ops[ui->ui_opsindex]->md_close)
   1642 		    (dev, flag, otyp, cred_p, 0)) != 0)
   1643 			goto out;
   1644 	}
   1645 
   1646 	/* or do it ourselves */
   1647 	else {
   1648 		/* single thread */
   1649 		(void) md_unit_openclose_enter(ui);
   1650 		err = md_unit_decopen(mnum, otyp);
   1651 		md_unit_openclose_exit(ui);
   1652 		if (err != 0)
   1653 			goto out;
   1654 	}
   1655 
   1656 	/* return success */
   1657 out:
   1658 	return (err);
   1659 }
   1660 
   1661 
   1662 /*
   1663  * This routine performs raw read operations.  It is called from the
   1664  * device switch at normal priority.
   1665  *
   1666  * The main catch is that the *uio struct which is passed to us may
   1667  * specify a read which spans two buffers, which would be contiguous
   1668  * on a single partition,  but not on a striped partition. This will
   1669  * be handled by mdstrategy.
   1670  */
   1671 /*ARGSUSED*/
   1672 static int
   1673 mdread(dev_t dev, struct uio *uio, cred_t *credp)
   1674 {
   1675 	minor_t		mnum;
   1676 	mdi_unit_t	*ui;
   1677 	int		error;
   1678 
   1679 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
   1680 	    (MD_MIN2SET(mnum) >= md_nsets) ||
   1681 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
   1682 	    ((ui = MDI_UNIT(mnum)) == NULL))
   1683 		return (ENXIO);
   1684 
   1685 	if (md_ops[ui->ui_opsindex]->md_read  != NULL)
   1686 		return ((*md_ops[ui->ui_opsindex]->md_read)
   1687 		    (dev, uio, credp));
   1688 
   1689 	if ((error = md_chk_uio(uio)) != 0)
   1690 		return (error);
   1691 
   1692 	return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
   1693 }
   1694 
   1695 /*
   1696  * This routine performs async raw read operations.  It is called from the
   1697  * device switch at normal priority.
   1698  *
   1699  * The main catch is that the *aio struct which is passed to us may
   1700  * specify a read which spans two buffers, which would be contiguous
   1701  * on a single partition,  but not on a striped partition. This will
   1702  * be handled by mdstrategy.
   1703  */
   1704 /*ARGSUSED*/
   1705 static int
   1706 mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
   1707 {
   1708 	minor_t		mnum;
   1709 	mdi_unit_t	*ui;
   1710 	int		error;
   1711 
   1712 
   1713 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
   1714 	    (MD_MIN2SET(mnum) >= md_nsets) ||
   1715 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
   1716 	    ((ui = MDI_UNIT(mnum)) == NULL))
   1717 		return (ENXIO);
   1718 
   1719 	if (md_ops[ui->ui_opsindex]->md_aread  != NULL)
   1720 		return ((*md_ops[ui->ui_opsindex]->md_aread)
   1721 		    (dev, aio, credp));
   1722 
   1723 	if ((error = md_chk_uio(aio->aio_uio)) != 0)
   1724 		return (error);
   1725 
   1726 	return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
   1727 }
   1728 
   1729 /*
   1730  * This routine performs raw write operations.	It is called from the
   1731  * device switch at normal priority.
   1732  *
   1733  * The main catch is that the *uio struct which is passed to us may
   1734  * specify a write which spans two buffers, which would be contiguous
   1735  * on a single partition,  but not on a striped partition. This is
   1736  * handled by mdstrategy.
   1737  *
   1738  */
   1739 /*ARGSUSED*/
   1740 static int
   1741 mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
   1742 {
   1743 	minor_t		mnum;
   1744 	mdi_unit_t	*ui;
   1745 	int		error;
   1746 
   1747 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
   1748 	    (MD_MIN2SET(mnum) >= md_nsets) ||
   1749 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
   1750 	    ((ui = MDI_UNIT(mnum)) == NULL))
   1751 		return (ENXIO);
   1752 
   1753 	if (md_ops[ui->ui_opsindex]->md_write  != NULL)
   1754 		return ((*md_ops[ui->ui_opsindex]->md_write)
   1755 		    (dev, uio, credp));
   1756 
   1757 	if ((error = md_chk_uio(uio)) != 0)
   1758 		return (error);
   1759 
   1760 	return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
   1761 }
   1762 
   1763 /*
   1764  * This routine performs async raw write operations.  It is called from the
   1765  * device switch at normal priority.
   1766  *
   1767  * The main catch is that the *aio struct which is passed to us may
   1768  * specify a write which spans two buffers, which would be contiguous
   1769  * on a single partition,  but not on a striped partition. This is
   1770  * handled by mdstrategy.
   1771  *
   1772  */
   1773 /*ARGSUSED*/
   1774 static int
   1775 mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
   1776 {
   1777 	minor_t		mnum;
   1778 	mdi_unit_t	*ui;
   1779 	int		error;
   1780 
   1781 
   1782 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
   1783 	    (MD_MIN2SET(mnum) >= md_nsets) ||
   1784 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
   1785 	    ((ui = MDI_UNIT(mnum)) == NULL))
   1786 		return (ENXIO);
   1787 
   1788 	if (md_ops[ui->ui_opsindex]->md_awrite  != NULL)
   1789 		return ((*md_ops[ui->ui_opsindex]->md_awrite)
   1790 		    (dev, aio, credp));
   1791 
   1792 	if ((error = md_chk_uio(aio->aio_uio)) != 0)
   1793 		return (error);
   1794 
   1795 	return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
   1796 }
   1797 
   1798 int
   1799 mdstrategy(struct buf *bp)
   1800 {
   1801 	minor_t		mnum;
   1802 	mdi_unit_t	*ui;
   1803 
   1804 	ASSERT((bp->b_flags & B_DONE) == 0);
   1805 
   1806 	if (panicstr)
   1807 		md_clr_status(MD_GBL_DAEMONS_LIVE);
   1808 
   1809 	if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) ||
   1810 	    (MD_MIN2SET(mnum) >= md_nsets) ||
   1811 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
   1812 	    ((ui = MDI_UNIT(mnum)) == NULL)) {
   1813 		bp->b_flags |= B_ERROR;
   1814 		bp->b_error = ENXIO;
   1815 		bp->b_resid = bp->b_bcount;
   1816 		biodone(bp);
   1817 		return (0);
   1818 	}
   1819 
   1820 	bp->b_flags &= ~(B_ERROR | B_DONE);
   1821 	if (md_ops[ui->ui_opsindex]->md_strategy  != NULL) {
   1822 		(*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL);
   1823 	} else {
   1824 		(void) errdone(ui, bp, ENXIO);
   1825 	}
   1826 	return (0);
   1827 }
   1828 
   1829 /*
   1830  * Return true if the ioctl is allowed to be multithreaded.
   1831  * All the ioctls with MN are sent only from the message handlers through
   1832  * rpc.mdcommd, which (via it's own locking mechanism) takes care that not two
   1833  * ioctl for the same metadevice are issued at the same time.
   1834  * So we are safe here.
   1835  * The other ioctls do not mess with any metadevice structures and therefor
   1836  * are harmless too, if called multiple times at the same time.
   1837  */
   1838 static boolean_t
   1839 is_mt_ioctl(int cmd) {
   1840 
   1841 	switch (cmd) {
   1842 	case MD_IOCGUNIQMSGID:
   1843 	case MD_IOCGVERSION:
   1844 	case MD_IOCISOPEN:
   1845 	case MD_MN_SET_MM_OWNER:
   1846 	case MD_MN_SET_STATE:
   1847 	case MD_MN_SUSPEND_WRITES:
   1848 	case MD_MN_ALLOCATE_HOTSPARE:
   1849 	case MD_MN_SET_SETFLAGS:
   1850 	case MD_MN_GET_SETFLAGS:
   1851 	case MD_MN_MDDB_OPTRECFIX:
   1852 	case MD_MN_MDDB_PARSE:
   1853 	case MD_MN_MDDB_BLOCK:
   1854 	case MD_MN_DB_USERREQ:
   1855 	case MD_IOC_SPSTATUS:
   1856 	case MD_MN_COMMD_ERR:
   1857 	case MD_MN_SET_COMMD_RUNNING:
   1858 	case MD_MN_RESYNC:
   1859 	case MD_MN_SETSYNC:
   1860 	case MD_MN_POKE_HOTSPARES:
   1861 	case MD_MN_RR_DIRTY:
   1862 	case MD_MN_RR_CLEAN:
   1863 	case MD_MN_IOC_SPUPDATEWM:
   1864 		return (1);
   1865 	default:
   1866 		return (0);
   1867 	}
   1868 }
   1869 
   1870 /*
   1871  * This routine implements the ioctl calls for the Virtual Disk System.
   1872  * It is called from the device switch at normal priority.
   1873  */
   1874 /* ARGSUSED */
   1875 static int
   1876 mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
   1877 	int *rval_p)
   1878 {
   1879 	minor_t		mnum = getminor(dev);
   1880 	mdi_unit_t	*ui;
   1881 	IOLOCK		lock;
   1882 	int		err;
   1883 
   1884 	/*
   1885 	 * For multinode disksets  number of ioctls are allowed to be
   1886 	 * multithreaded.
   1887 	 * A fundamental assumption made in this implementation is that
   1888 	 * ioctls either do not interact with other md structures  or the
   1889 	 * ioctl to the admin device can only occur if the metadevice
   1890 	 * device is open. i.e. avoid a race between metaclear and the
   1891 	 * progress of a multithreaded ioctl.
   1892 	 */
   1893 
   1894 	if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
   1895 		return (EINTR);
   1896 	}
   1897 
   1898 	/*
   1899 	 * initialize lock tracker
   1900 	 */
   1901 	IOLOCK_INIT(&lock);
   1902 
   1903 	/* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */
   1904 
   1905 	if (is_mt_ioctl(cmd)) {
   1906 		/* increment the md_mtioctl_cnt */
   1907 		mutex_enter(&md_mx);
   1908 		md_mtioctl_cnt++;
   1909 		mutex_exit(&md_mx);
   1910 		lock.l_flags |= MD_MT_IOCTL;
   1911 	}
   1912 
   1913 	/*
   1914 	 * this has been added to prevent notification from re-snarfing
   1915 	 * so metaunload will work.  It may interfere with other modules
   1916 	 * halt process.
   1917 	 */
   1918 	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
   1919 		return (IOLOCK_RETURN(ENXIO, &lock));
   1920 
   1921 	/*
   1922 	 * admin device ioctls
   1923 	 */
   1924 	if (mnum == MD_ADM_MINOR) {
   1925 		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
   1926 		    mode, &lock);
   1927 	}
   1928 
   1929 	/*
   1930 	 * metadevice ioctls
   1931 	 */
   1932 	else if ((MD_MIN2SET(mnum) >= md_nsets) ||
   1933 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
   1934 	    (md_set[MD_MIN2SET(mnum)].s_ui == NULL) ||
   1935 	    ((ui = MDI_UNIT(mnum)) == NULL)) {
   1936 		err = ENXIO;
   1937 	} else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
   1938 		err = ENOTTY;
   1939 	} else {
   1940 		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
   1941 		    (dev, cmd, (void *) data, mode, &lock);
   1942 	}
   1943 
   1944 	/*
   1945 	 * drop any locks we grabbed
   1946 	 */
   1947 	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
   1948 }
   1949 
   1950 static int
   1951 mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
   1952 {
   1953 	minor_t		mnum;
   1954 	set_t		setno;
   1955 	mdi_unit_t	*ui;
   1956 
   1957 	if ((mnum = getminor(dev)) == MD_ADM_MINOR)
   1958 		return (ENXIO);
   1959 
   1960 	setno = MD_MIN2SET(mnum);
   1961 
   1962 	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
   1963 	    ((ui = MDI_UNIT(mnum)) == NULL))
   1964 		return (ENXIO);
   1965 
   1966 
   1967 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
   1968 		return (ENXIO);
   1969 
   1970 	if (md_ops[ui->ui_opsindex]->md_dump  != NULL)
   1971 		return ((*md_ops[ui->ui_opsindex]->md_dump)
   1972 		    (dev, addr, blkno, nblk));
   1973 
   1974 	return (ENXIO);
   1975 }
   1976 
   1977 /*
   1978  * Metadevice unit number dispatcher
   1979  * When this routine is called it will scan the
   1980  * incore unit array and return the avail slot
   1981  * hence the unit number to the caller
   1982  *
   1983  * Return -1 if there is nothing available
   1984  */
   1985 unit_t
   1986 md_get_nextunit(set_t setno)
   1987 {
   1988 	unit_t	un, start;
   1989 
   1990 	/*
   1991 	 * If nothing available
   1992 	 */
   1993 	if (md_set[setno].s_un_avail == 0) {
   1994 		return (MD_UNITBAD);
   1995 	}
   1996 
   1997 	mutex_enter(&md_mx);
   1998 	start = un = md_set[setno].s_un_next;
   1999 
   2000 	/* LINTED: E_CONSTANT_CONDITION */
   2001 	while (1) {
   2002 		if (md_set[setno].s_un[un] == NULL) {
   2003 			/*
   2004 			 * Advance the starting index for the next
   2005 			 * md_get_nextunit call
   2006 			 */
   2007 			if (un == MD_MAXUNITS - 1) {
   2008 				md_set[setno].s_un_next = 0;
   2009 			} else {
   2010 				md_set[setno].s_un_next = un + 1;
   2011 			}
   2012 			break;
   2013 		}
   2014 
   2015 		un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);
   2016 
   2017 		if (un == start) {
   2018 			un = MD_UNITBAD;
   2019 			break;
   2020 		}
   2021 
   2022 	}
   2023 
   2024 	mutex_exit(&md_mx);
   2025 	return (un);
   2026 }
   2027