Home | History | Annotate | Download | only in md
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/conf.h>
     29 #include <sys/time.h>
     30 #include <sys/uio.h>
     31 #include <sys/param.h>
     32 #include <sys/systm.h>
     33 #include <sys/systeminfo.h>
     34 #include <sys/sysmacros.h>
     35 #include <sys/buf.h>
     36 #include <sys/kmem.h>
     37 #include <sys/file.h>
     38 #include <sys/open.h>
     39 #include <sys/debug.h>
     40 #include <sys/stat.h>
     41 #include <sys/lvm/mdvar.h>
     42 #include <sys/lvm/md_crc.h>
     43 #include <sys/lvm/md_convert.h>
     44 #include <sys/types.h>
     45 #include <sys/kmem.h>
     46 #include <sys/lvm/mdmn_commd.h>
     47 #include <sys/cladm.h>
     48 
/*
 * Default mhd_mhiargs_t values: one scalar followed by a three-element
 * aggregate.
 * NOTE(review): the meaning and units of each member come from the
 * mhd_mhiargs_t declaration, which is not in this file -- confirm there
 * before changing any of these numbers.
 */
mhd_mhiargs_t	defmhiargs = {
	1000,
	{ 6000, 6000, 30000 }
};
     53 
     54 #define	MDDB
     55 
     56 #include <sys/lvm/mdvar.h>
     57 #include <sys/lvm/mdmed.h>
     58 #include <sys/lvm/md_names.h>
     59 #include <sys/cred.h>
     60 #include <sys/ddi.h>
     61 #include <sys/sunddi.h>
     62 #include <sys/esunddi.h>
     63 
     64 #include <sys/sysevent/eventdefs.h>
     65 #include <sys/sysevent/svm.h>
     66 
/* Defined outside this file. */
extern char svm_bootpath[];

/*
 * Global (md_maxbootlist) and file-local tunables.  Each one is given a
 * compile-time default here.
 */
int			md_maxbootlist = MAXBOOTLIST;
static ulong_t		mddb_maxblocks = 0;	/* tune for small records */
static int		mddb_maxbufheaders = 50;
static uint_t		mddb_maxcopies = MDDB_NLB;

/*
 * If this is set, more detailed messages about DB init will be given, instead
 * of just the MDE_DB_NODB.
 */
static int		mddb_db_err_detail = 0;

/*
 * This lock is used to single-thread load/unload of all sets
 */
static kmutex_t		mddb_lock;

/*
 * You really do NOT want to change this boolean.
 * It can be VERY dangerous to do so.  Loss of
 * data may occur. USE AT YOUR OWN RISK!!!!
 */
static int		mddb_allow_half = 0;
/*
 * For a mirrored root, allow reboot with only half the replicas available.
 * Flag inserted for the Santa Fe project.
 */
int mirrored_root_flag;
     96 
/*
 * Character classification helpers.  Both evaluate their argument more
 * than once, so do not pass an expression with side effects.
 */
/* non-zero when c is a space, tab, carriage return or newline */
#define	ISWHITE(c)	(((c) == ' ') || ((c) == '\t') || \
			    ((c) == '\r') || ((c) == '\n'))
/* non-zero when c is one of the characters '0' through '9' */
#define	ISNUM(c)	(((c) >= '0') && ((c) <= '9'))

/* address of the s_dbmx mutex in the md_set[] entry for setno */
#define	SETMUTEX(setno)	(&md_set[setno].s_dbmx)
    102 
/*
 * Symbols defined outside this file.  The trailing comment names the
 * defining file where the original author recorded it.
 */
extern md_krwlock_t	md_unit_array_rw;	/* md.c */
extern set_t		md_nsets;		/* md.c */
extern int		md_nmedh;		/* md.c */
extern md_set_t		md_set[];		/* md.c */
extern int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);
extern dev_info_t	*md_devinfo;
extern int		md_init_debug;
extern int		md_status;
extern md_ops_t		*md_opslist;
extern md_krwlock_t	nm_lock;

/*
 * Forward declaration of a file-local routine.
 */
static int 		update_locatorblock(mddb_set_t *s, md_dev64_t dev,
				ddi_devid_t didptr, ddi_devid_t old_didptr);
    116 
/*
 * Defines for crc calculation for records
 * rec_crcgen generates a crc checksum for a record block
 * rec_crcchk checks the crc checksum for a record block
 *
 * Both are thin wrappers around rec_crcfunc(); REC_CRCGEN and REC_CRCCHK
 * are the values passed as its final "check" argument.  rec_crcgen
 * discards rec_crcfunc's return value, rec_crcchk evaluates to it.
 */
#define	REC_CRCGEN	0
#define	REC_CRCCHK	1
#define	rec_crcgen(s, dep, rbp) \
	(void) rec_crcfunc(s, dep, rbp, REC_CRCGEN)
#define	rec_crcchk(s, dep, rbp) \
	rec_crcfunc(s, dep, rbp, REC_CRCCHK)
    128 
    129 /*
    130  * During upgrade, SVM basically runs with the devt from the target
    131  * being upgraded.  Translations are made from the target devt to the
    132  * miniroot devt when writing data out to the disk.  This is done by
    133  * the following routines:
    134  *	wrtblklst
    135  *	writeblks
    136  *	readblklst
    137  *	readblks
    138  *	dt_read
    139  *
    140  * The following routines are used by the routines listed above and
    141  * expect a translated (aka miniroot) devt:
    142  *	getblks
    143  * 	getmasters
    144  *
    145  * Also, when calling any system routines, such as ddi_lyr_get_devid,
    146  * the translated (aka miniroot) devt must be used.
    147  *
    148  * By the same token, the major number and major name conversion operations
    149  * need to use the name_to_major file from the target system instead
    150  * of the name_to_major file on the miniroot.  So, calls to
    151  * ddi_name_to_major must be replaced with calls to md_targ_name_to_major
    152  * when running on an upgrade.  Same is true with calls to
    153  * ddi_major_to_name.
    154  */
    155 
    156 
    157 #ifndef MDDB_FAKE
    158 
    159 static int
    160 mddb_rwdata(
    161 	mddb_set_t	*s,	/* incore db set structure */
    162 	int		flag,	/* B_ASYNC, B_FAILFAST or 0 passed in here */
    163 	buf_t		*bp
    164 )
    165 {
    166 	int		err = 0;
    167 
    168 	bp->b_flags = (flag | B_BUSY) & (~B_ASYNC);
    169 
    170 	mutex_exit(SETMUTEX(s->s_setno));
    171 	if (mdv_strategy_tstpnt == NULL ||
    172 	    (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0)
    173 		(void) bdev_strategy(bp);
    174 
    175 	if (flag & B_ASYNC) {
    176 		mutex_enter(SETMUTEX(s->s_setno));
    177 		return (0);
    178 	}
    179 
    180 	err = biowait(bp);
    181 	mutex_enter(SETMUTEX(s->s_setno));
    182 	return (err);
    183 }
    184 
    185 static void
    186 setidentifier(
    187 	mddb_set_t	*s,
    188 	identifier_t	*ident
    189 )
    190 {
    191 	if (s->s_setno == MD_LOCAL_SET)
    192 		(void) strcpy(&ident->serial[0], s->s_ident.serial);
    193 	else
    194 		ident->createtime = s->s_ident.createtime;
    195 }
    196 
    197 static int
    198 cmpidentifier(
    199 	mddb_set_t	*s,
    200 	identifier_t	*ident
    201 )
    202 {
    203 	if (s->s_setno == MD_LOCAL_SET)
    204 		return (strcmp(ident->serial, s->s_ident.serial));
    205 	else
    206 		return (timercmp(&ident->createtime,
    207 		    /*CSTYLED*/
    208 		    &s->s_ident.createtime, !=));
    209 }
    210 
    211 static int
    212 mddb_devopen(
    213 	md_dev64_t	dev
    214 )
    215 {
    216 	dev_t		ddi_dev = md_dev64_to_dev(dev);
    217 
    218 	if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0)
    219 		return (0);
    220 	return (1);
    221 }
    222 
    223 static void
    224 mddb_devclose(
    225 	md_dev64_t	dev
    226 )
    227 {
    228 	(void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred);
    229 }
    230 
    231 /*
    232  * stripe_skip_ts
    233  *
    234  * Returns a list of fields to be skipped in the stripe record structure.
    235  * These fields are ms_timestamp in the component structure.
    236  * Used to skip these fields when calculating the checksum.
    237  */
    238 static crc_skip_t *
    239 stripe_skip_ts(void *un, uint_t revision)
    240 {
    241 	struct ms_row32_od	*small_mdr;
    242 	struct ms_row		*big_mdr;
    243 	uint_t			row, comp, ncomps, compoff;
    244 	crc_skip_t		*skip;
    245 	crc_skip_t		*skip_prev;
    246 	crc_skip_t		skip_start = {0, 0, 0};
    247 	ms_unit_t		*big_un;
    248 	ms_unit32_od_t		*small_un;
    249 	uint_t			rb_off = offsetof(mddb_rb32_t, rb_data[0]);
    250 
    251 	switch (revision) {
    252 	case MDDB_REV_RB:
    253 	case MDDB_REV_RBFN:
    254 		small_un = (ms_unit32_od_t *)un;
    255 		skip_prev = &skip_start;
    256 
    257 		if (small_un->un_nrows == 0)
    258 			return (NULL);
    259 		/*
    260 		 * walk through all rows to find the total number
    261 		 * of components
    262 		 */
    263 		small_mdr   = &small_un->un_row[0];
    264 		ncomps = 0;
    265 		for (row = 0; (row < small_un->un_nrows); row++) {
    266 			ncomps += small_mdr[row].un_ncomp;
    267 		}
    268 
    269 		/* Now walk through the components */
    270 		compoff = small_un->un_ocomp + rb_off;
    271 		for (comp = 0; (comp < ncomps); ++comp) {
    272 			uint_t	mdcp = compoff +
    273 			    (comp * sizeof (ms_comp32_od_t));
    274 			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
    275 			    KM_SLEEP);
    276 			skip->skip_offset = mdcp +
    277 			    offsetof(ms_comp32_od_t, un_mirror.ms_timestamp);
    278 			skip->skip_size = sizeof (md_timeval32_t);
    279 			skip_prev->skip_next = skip;
    280 			skip_prev = skip;
    281 		}
    282 		break;
    283 	case MDDB_REV_RB64:
    284 	case MDDB_REV_RB64FN:
    285 		big_un = (ms_unit_t *)un;
    286 		skip_prev = &skip_start;
    287 
    288 		if (big_un->un_nrows == 0)
    289 			return (NULL);
    290 		/*
    291 		 * walk through all rows to find the total number
    292 		 * of components
    293 		 */
    294 		big_mdr   = &big_un->un_row[0];
    295 		ncomps = 0;
    296 		for (row = 0; (row < big_un->un_nrows); row++) {
    297 			ncomps += big_mdr[row].un_ncomp;
    298 		}
    299 
    300 		/* Now walk through the components */
    301 		compoff = big_un->un_ocomp + rb_off;
    302 		for (comp = 0; (comp < ncomps); ++comp) {
    303 			uint_t	mdcp = compoff +
    304 			    (comp * sizeof (ms_comp_t));
    305 			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
    306 			    KM_SLEEP);
    307 			skip->skip_offset = mdcp +
    308 			    offsetof(ms_comp_t, un_mirror.ms_timestamp);
    309 			skip->skip_size = sizeof (md_timeval32_t);
    310 			skip_prev->skip_next = skip;
    311 			skip_prev = skip;
    312 		}
    313 		break;
    314 	}
    315 	/* Return the start of the list of fields to skip */
    316 	return (skip_start.skip_next);
    317 }
    318 
    319 /*
    320  * mirror_skip_ts
    321  *
    322  * Returns a list of fields to be skipped in the mirror record structure.
    323  * This includes un_last_read and sm_timestamp for each submirror
    324  * Used to skip these fields when calculating the checksum.
    325  */
    326 static crc_skip_t *
    327 mirror_skip_ts(uint_t revision)
    328 {
    329 	int		i;
    330 	crc_skip_t	*skip;
    331 	crc_skip_t	*skip_prev;
    332 	crc_skip_t	skip_start = {0, 0, 0};
    333 	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
    334 
    335 	skip_prev = &skip_start;
    336 
    337 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
    338 	switch (revision) {
    339 	case MDDB_REV_RB:
    340 	case MDDB_REV_RBFN:
    341 		skip->skip_offset = offsetof(mm_unit32_od_t,
    342 		    un_last_read) + rb_off;
    343 		break;
    344 	case MDDB_REV_RB64:
    345 	case MDDB_REV_RB64FN:
    346 		skip->skip_offset = offsetof(mm_unit_t,
    347 		    un_last_read) + rb_off;
    348 		break;
    349 	}
    350 	skip->skip_size = sizeof (int);
    351 	skip_prev->skip_next = skip;
    352 	skip_prev = skip;
    353 
    354 	for (i = 0; i < NMIRROR; i++) {
    355 		skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
    356 		switch (revision) {
    357 		case MDDB_REV_RB:
    358 		case MDDB_REV_RBFN:
    359 			skip->skip_offset = offsetof(mm_unit32_od_t,
    360 			    un_sm[i].sm_timestamp) + rb_off;
    361 			break;
    362 		case MDDB_REV_RB64:
    363 		case MDDB_REV_RB64FN:
    364 			skip->skip_offset = offsetof(mm_unit_t,
    365 			    un_sm[i].sm_timestamp) + rb_off;
    366 			break;
    367 		}
    368 		skip->skip_size = sizeof (md_timeval32_t);
    369 		skip_prev->skip_next = skip;
    370 		skip_prev = skip;
    371 	}
    372 	/* Return the start of the list of fields to skip */
    373 	return (skip_start.skip_next);
    374 }
    375 
    376 /*
    377  * hotspare_skip_ts
    378  *
    379  * Returns a list of the timestamp fields in the hotspare record structure.
    380  * Used to skip these fields when calculating the checksum.
    381  */
    382 static crc_skip_t *
    383 hotspare_skip_ts(uint_t revision)
    384 {
    385 	crc_skip_t	*skip;
    386 	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
    387 
    388 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
    389 	switch (revision) {
    390 	case MDDB_REV_RB:
    391 	case MDDB_REV_RBFN:
    392 		skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) +
    393 		    rb_off;
    394 		break;
    395 	case MDDB_REV_RB64:
    396 	case MDDB_REV_RB64FN:
    397 		skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) +
    398 		    rb_off;
    399 		break;
    400 	}
    401 	skip->skip_size = sizeof (md_timeval32_t);
    402 	return (skip);
    403 }
    404 
    405 /*
    406  * rec_crcfunc
    407  *
    408  * Calculate or check the checksum for a record
    409  * Calculate the crc if check == 0, Check the crc if check == 1
    410  *
    411  * Record block may be written by different nodes in a multi-owner diskset
    412  * (in case of master change), the function rec_crcchk excludes timestamp
    413  * fields in crc computation of record data.
    414  * Otherwise, timestamp fields will cause each node to have a different
    415  * checksum for same record block causing the exclusive-or of all record block
    416  * checksums and data block record sums to be non-zero after new master writes
    417  * at least one record block.
    418  */
    419 static uint_t
    420 rec_crcfunc(
    421 	mddb_set_t	*s,
    422 	mddb_de_ic_t	*dep,
    423 	mddb_rb32_t	*rbp,
    424 	int		check
    425 )
    426 {
    427 	crc_skip_t	*skip;
    428 	crc_skip_t	*skip_tail;
    429 	mddb_type_t	type = dep->de_type1;
    430 	uint_t		ret;
    431 
    432 	/*
    433 	 * Generate a list of the areas to be skipped when calculating
    434 	 * the checksum.
    435 	 * First skip rb_checksum, rb_private and rb_userdata.
    436 	 */
    437 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
    438 	skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle);
    439 	skip->skip_size = 3 * sizeof (uint_t);
    440 	skip_tail = skip;
    441 	if (MD_MNSET_SETNO(s->s_setno)) {
    442 		/* For a MN set, skip rb_timestamp */
    443 		skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
    444 		    KM_SLEEP);
    445 		skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp);
    446 		skip_tail->skip_size = sizeof (md_timeval32_t);
    447 		skip->skip_next = skip_tail;
    448 
    449 		/* Now add a list of timestamps to be skipped */
    450 		if (type >= MDDB_FIRST_MODID) {
    451 			switch (dep->de_flags) {
    452 				case MDDB_F_STRIPE:
    453 					skip_tail->skip_next =
    454 					    stripe_skip_ts((void *)rbp->rb_data,
    455 					    rbp->rb_revision);
    456 					break;
    457 				case MDDB_F_MIRROR:
    458 					skip_tail->skip_next =
    459 					    mirror_skip_ts(rbp->rb_revision);
    460 					break;
    461 				case MDDB_F_HOTSPARE:
    462 					skip_tail->skip_next =
    463 					    hotspare_skip_ts(rbp->rb_revision);
    464 					break;
    465 				default:
    466 					break;
    467 			}
    468 		}
    469 	}
    470 
    471 	if (check) {
    472 		ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
    473 	} else {
    474 		crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
    475 		ret = rbp->rb_checksum;
    476 	}
    477 	while (skip) {
    478 		crc_skip_t	*skip_save = skip;
    479 
    480 		skip = skip->skip_next;
    481 		kmem_free(skip_save, sizeof (crc_skip_t));
    482 	}
    483 	return (ret);
    484 }
    485 
    486 static mddb_bf_t *
    487 allocbuffer(
    488 	mddb_set_t	*s,
    489 	int		sleepflag
    490 )
    491 {
    492 	mddb_bf_t	*bfp;
    493 
    494 	while ((bfp = s->s_freebufhead) == NULL) {
    495 		if (sleepflag == MDDB_NOSLEEP)
    496 			return ((mddb_bf_t *)NULL);
    497 		++s->s_bufmisses;
    498 #ifdef	DEBUG
    499 		if (s->s_bufmisses == 1)
    500 			cmn_err(CE_NOTE,
    501 			    "md: mddb: set %u sleeping for buffer", s->s_setno);
    502 #endif
    503 		s->s_bufwakeup = 1;
    504 		cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno));
    505 	}
    506 	s->s_freebufhead = bfp->bf_next;
    507 	bzero((caddr_t)bfp, sizeof (*bfp));
    508 	bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf;
    509 	bfp->bf_buf.b_flags = B_BUSY;	/* initialize flags */
    510 	return (bfp);
    511 }
    512 
    513 static void
    514 freebuffer(
    515 	mddb_set_t		*s,
    516 	mddb_bf_t	*bfp
    517 )
    518 {
    519 	bfp->bf_next = s->s_freebufhead;
    520 	s->s_freebufhead = bfp;
    521 	if (s->s_bufwakeup) {
    522 		cv_broadcast(&s->s_buf_cv);
    523 		s->s_bufwakeup = 0;
    524 	}
    525 }
    526 
    527 
    528 static void
    529 blkbusy(
    530 	mddb_set_t	*s,
    531 	mddb_block_t	blk
    532 )
    533 {
    534 	int		bit, byte;
    535 
    536 	s->s_freeblkcnt--;
    537 	byte = blk / 8;
    538 	bit = 1 << (blk & 7);
    539 	ASSERT(! (s->s_freebitmap[byte] & bit));
    540 	s->s_freebitmap[byte] |= bit;
    541 }
    542 
    543 static void
    544 blkfree(
    545 	mddb_set_t	*s,
    546 	mddb_block_t	blk
    547 )
    548 {
    549 	int		bit, byte;
    550 
    551 	s->s_freeblkcnt++;
    552 	byte = blk / 8;
    553 	bit = 1 << (blk & 7);
    554 	ASSERT(s->s_freebitmap[byte] & bit);
    555 	s->s_freebitmap[byte] &= ~bit;
    556 }
    557 
    558 static int
    559 blkcheck(
    560 	mddb_set_t	*s,
    561 	mddb_block_t	blk
    562 )
    563 {
    564 	int		bit, byte;
    565 
    566 	byte = blk / 8;
    567 	bit = 1 << (blk & 7);
    568 	return (s->s_freebitmap[byte] & bit);
    569 }
    570 
    571 /*
    572  * not fast but simple
    573  */
    574 static mddb_block_t
    575 getfreeblks(
    576 	mddb_set_t	*s,
    577 	size_t		count
    578 )
    579 {
    580 	int		i;
    581 	size_t		contig;
    582 
    583 	contig = 0;
    584 	for (i = 0; i < s->s_totalblkcnt; i++) {
    585 		if (blkcheck(s, i)) {
    586 			contig = 0;
    587 		} else {
    588 			contig++;
    589 			if (contig == count) {
    590 				contig = i - count + 1;
    591 				for (i = (int)contig; i < contig + count; i++)
    592 					blkbusy(s, i);
    593 				return ((mddb_block_t)contig);
    594 			}
    595 		}
    596 	}
    597 	return (0);
    598 }
    599 
    600 static void
    601 computefreeblks(
    602 	mddb_set_t	*s
    603 )
    604 {
    605 	mddb_db_t	*dbp;
    606 	mddb_de_ic_t	*dep;
    607 	int		i;
    608 	int		minblks;
    609 	int		freeblks;
    610 	mddb_mb_ic_t	*mbip;
    611 	mddb_lb_t	*lbp;
    612 	mddb_block_t	maxblk;
    613 	mddb_did_db_t	*did_dbp;
    614 	int		nblks;
    615 
    616 	minblks = 0;
    617 	lbp = s->s_lbp;
    618 	maxblk = 0;
    619 
    620 	/*
    621 	 * Determine the max number of blocks.
    622 	 */
    623 	nblks = (lbp->lb_flags & MDDB_MNSET) ? MDDB_MN_MAXBLKS : MDDB_MAXBLKS;
    624 	/*
    625 	 * go through and find highest logical block
    626 	 */
    627 	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
    628 		if (dbp->db_blknum > maxblk)
    629 			maxblk = dbp->db_blknum;
    630 		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
    631 			for (i = 0; i < dep->de_blkcount; i++)
    632 				if (dep->de_blks[i] > maxblk)
    633 					maxblk = dep->de_blks[i];
    634 	}
    635 
    636 	for (i = 0; i < lbp->lb_loccnt; i++) {
    637 		mddb_locator_t	*lp = &lbp->lb_locators[i];
    638 
    639 		if ((lp->l_flags & MDDB_F_DELETED) ||
    640 		    (lp->l_flags & MDDB_F_EMASTER))
    641 			continue;
    642 
    643 		freeblks = 0;
    644 		for (mbip = s->s_mbiarray[i]; mbip != NULL;
    645 		    mbip = mbip->mbi_next) {
    646 			freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
    647 		}
    648 		if (freeblks == 0)	/* this happen when there is no */
    649 			continue;	/*	master blk		*/
    650 
    651 		if (freeblks <= maxblk) {
    652 			lp->l_flags |= MDDB_F_TOOSMALL;
    653 			lp->l_flags &= ~MDDB_F_ACTIVE;
    654 		}
    655 
    656 		if (freeblks < minblks || minblks == 0)
    657 			minblks = freeblks;
    658 	}
    659 	/*
    660 	 * set up reasonable freespace if no
    661 	 * data bases exist
    662 	 */
    663 	if (minblks == 0)
    664 		minblks = 100;
    665 	if (minblks > nblks)
    666 		minblks = nblks;
    667 	s->s_freeblkcnt = minblks;
    668 	s->s_totalblkcnt = minblks;
    669 	if (! s->s_freebitmapsize) {
    670 		s->s_freebitmapsize = nblks / 8;
    671 		s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize,
    672 		    KM_SLEEP);
    673 	}
    674 	bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
    675 
    676 	/* locator block sectors */
    677 	for (i = 0; i < s->s_lbp->lb_blkcnt; i++)
    678 		blkbusy(s, i);
    679 
    680 	/* locator name sectors */
    681 	for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++)
    682 		blkbusy(s, (s->s_lbp->lb_lnfirstblk + i));
    683 
    684 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
    685 		/* locator block device id information */
    686 		for (i = 0; i < s->s_lbp->lb_didblkcnt; i++)
    687 			blkbusy(s, (s->s_lbp->lb_didfirstblk + i));
    688 
    689 		/* disk blocks containing actual device ids */
    690 		did_dbp = s->s_did_icp->did_ic_dbp;
    691 		while (did_dbp) {
    692 			for (i = 0; i < did_dbp->db_blkcnt; i++) {
    693 				blkbusy(s, did_dbp->db_firstblk + i);
    694 			}
    695 			did_dbp = did_dbp->db_next;
    696 		}
    697 	}
    698 
    699 	/* Only use data tags if not a MN set */
    700 	if (!(lbp->lb_flags & MDDB_MNSET)) {
    701 		/* Found a bad tag, do NOT mark the data tag blks busy here */
    702 		if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) {
    703 			for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++)
    704 				blkbusy(s, (s->s_lbp->lb_dtfirstblk + i));
    705 		}
    706 	}
    707 
    708 	/* directory block/entry sectors */
    709 	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
    710 		blkbusy(s, dbp->db_blknum);
    711 		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
    712 			for (i = 0; i < dep->de_blkcount; i++)
    713 				blkbusy(s, dep->de_blks[i]);
    714 	}
    715 }
    716 
    717 /*
    718  * Add free space to the device id incore free list.
    719  * Called:
    720  *    - During startup when all devid blocks are temporarily placed on the
    721  *       free list
    722  *    - After a devid has been deleted via the metadb command.
    723  *    - When mddb_devid_free_get adds unused space from a disk block
    724  *       to free list
    725  */
    726 static int
    727 mddb_devid_free_add(
    728 	mddb_set_t *s,
    729 	uint_t firstblk,
    730 	uint_t offset,
    731 	uint_t length
    732 )
    733 {
    734 	mddb_did_free_t	*did_freep;
    735 
    736 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
    737 		return (0);
    738 	}
    739 
    740 	did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t),
    741 	    KM_SLEEP);
    742 	did_freep->free_blk = firstblk;
    743 	did_freep->free_offset = offset;
    744 	did_freep->free_length = length;
    745 	did_freep->free_next = s->s_did_icp->did_ic_freep;
    746 	s->s_did_icp->did_ic_freep = did_freep;
    747 
    748 	return (0);
    749 }
    750 
    751 /*
    752  * Remove specific free space from the device id incore free list.
    753  * Called at startup (after all devid blocks have been placed on
    754  * free list) in order to remove the free space from the list that
    755  * contains actual devids.
    756  * Returns 0 if area successfully removed.
    757  * Returns 1 if no matching area is found - so nothing removed.
    758  */
    759 static int
    760 mddb_devid_free_delete(
    761 	mddb_set_t *s,
    762 	uint_t firstblk,
    763 	uint_t offset,
    764 	uint_t length
    765 )
    766 {
    767 	int		block_found = 0;
    768 	mddb_did_free_t	*did_freep1;		/* next free block */
    769 	mddb_did_free_t	*did_freep2 = 0;	/* previous free block */
    770 	mddb_did_free_t *did_freep_before;	/* area before offset, len */
    771 	mddb_did_free_t	*did_freep_after;	/* area after offset, len */
    772 	uint_t		old_length;
    773 
    774 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
    775 		return (1);
    776 	}
    777 
    778 	/* find free block for this devid */
    779 	did_freep1 = s->s_did_icp->did_ic_freep;
    780 	while (did_freep1) {
    781 		/*
    782 		 * Look through free list of <block, offset, length> to
    783 		 * find our entry in the free list.  Our entry should
    784 		 * exist since the entire devid block was placed into
    785 		 * this free list at startup.  This code is just removing
    786 		 * the non-free (in-use) portions of the devid block so
    787 		 * that the remaining linked list does indeed just
    788 		 * contain a free list.
    789 		 *
    790 		 * Our entry has been found if
    791 		 *   - the blocks match,
    792 		 *   - the offset (starting address) in the free list is
    793 		 *	less than the offset of our entry and
    794 		 *   - the length+offset (ending address) in the free list is
    795 		 *	greater than the length+offset of our entry.
    796 		 */
    797 		if ((did_freep1->free_blk == firstblk) &&
    798 		    (did_freep1->free_offset <= offset) &&
    799 		    ((did_freep1->free_length + did_freep1->free_offset) >=
    800 		    (length + offset))) {
    801 			/* Have found our entry - remove from list */
    802 			block_found = 1;
    803 			did_freep_before = did_freep1;
    804 			old_length = did_freep1->free_length;
    805 			/* did_freep1 - pts to next free block */
    806 			did_freep1 = did_freep1->free_next;
    807 			if (did_freep2) {
    808 				did_freep2->free_next = did_freep1;
    809 			} else {
    810 				s->s_did_icp->did_ic_freep = did_freep1;
    811 			}
    812 
    813 			/*
    814 			 * did_freep_before points to area in block before
    815 			 * offset, length.
    816 			 */
    817 			did_freep_before->free_length = offset -
    818 			    did_freep_before->free_offset;
    819 			/*
    820 			 * did_freep_after points to area in block after
    821 			 * offset, length.
    822 			 */
    823 			did_freep_after = (mddb_did_free_t *)kmem_zalloc
    824 			    (sizeof (mddb_did_free_t), KM_SLEEP);
    825 			did_freep_after->free_blk = did_freep_before->free_blk;
    826 			did_freep_after->free_offset = offset + length;
    827 			did_freep_after->free_length = old_length - length -
    828 			    did_freep_before->free_length;
    829 			/*
    830 			 * Add before and after areas to free list
    831 			 * If area before or after offset, length has length
    832 			 * of 0, that entry is not added.
    833 			 */
    834 			if (did_freep_after->free_length) {
    835 				did_freep_after->free_next = did_freep1;
    836 				if (did_freep2) {
    837 					did_freep2->free_next =
    838 					    did_freep_after;
    839 				} else {
    840 					s->s_did_icp->did_ic_freep =
    841 					    did_freep_after;
    842 				}
    843 				did_freep1 = did_freep_after;
    844 			} else {
    845 				kmem_free(did_freep_after,
    846 				    sizeof (mddb_did_free_t));
    847 			}
    848 
    849 			if (did_freep_before->free_length) {
    850 				did_freep_before->free_next = did_freep1;
    851 				if (did_freep2) {
    852 					did_freep2->free_next =
    853 					    did_freep_before;
    854 				} else {
    855 					s->s_did_icp->did_ic_freep =
    856 					    did_freep_before;
    857 				}
    858 			} else {
    859 				kmem_free(did_freep_before,
    860 				    sizeof (mddb_did_free_t));
    861 			}
    862 			break;
    863 		} else {
    864 			did_freep2 = did_freep1;
    865 			did_freep1 = did_freep1->free_next;
    866 		}
    867 	}
    868 	if (block_found == 0) {
    869 		return (1);
    870 	} else {
    871 		return (0);
    872 	}
    873 }
    874 
    875 /*
    876  * Find free space of devid length and remove free space from list.
    877  * Return a pointer to the previously free area.
    878  *
    879  * If there's not enough free space on the free list, get an empty
    880  * disk block, put the empty disk block on the did_ic_dbp linked list,
    881  * and add the disk block space not used for devid to the free list.
    882  *
    883  * Return pointer to address (inside disk block) of free area for devid.
    884  * Return 0 if error.
    885  */
    886 static caddr_t
    887 mddb_devid_free_get(
    888 	mddb_set_t *s,
    889 	uint_t len,
    890 	uint_t *blk,
    891 	uint_t *cnt,
    892 	uint_t *offset
    893 )
    894 {
    895 	mddb_did_free_t	*freep, *freep2;
    896 	mddb_did_db_t	*dbp;
    897 	uint_t		blk_cnt, blk_num;
    898 	ddi_devid_t	devid_ptr = NULL;
    899 
    900 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
    901 		return (0);
    902 	}
    903 
    904 	freep = s->s_did_icp->did_ic_freep;
    905 	freep2 = (mddb_did_free_t *)NULL;
    906 	while (freep) {
    907 		/* found a free area - remove from free list */
    908 		if (len <= freep->free_length) {
    909 			*blk = freep->free_blk;
    910 			*offset = freep->free_offset;
    911 			/* find disk block pointer that contains free area */
    912 			dbp = s->s_did_icp->did_ic_dbp;
    913 			while (dbp) {
    914 				if (dbp->db_firstblk == *blk)
    915 					break;
    916 				else
    917 					dbp = dbp->db_next;
    918 			}
    919 			/*
    920 			 * If a disk block pointer can't be found - something
    921 			 * is wrong, so don't use this free space.
    922 			 */
    923 			if (dbp == NULL) {
    924 				freep2 = freep;
    925 				freep = freep->free_next;
    926 				continue;
    927 			}
    928 
    929 			devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset);
    930 			*cnt = dbp->db_blkcnt;
    931 
    932 			/* Update free list information */
    933 			freep->free_offset += len;
    934 			freep->free_length -= len;
    935 			if (freep->free_length == 0) {
    936 				if (freep2) {
    937 					freep2->free_next =
    938 					    freep->free_next;
    939 				} else {
    940 					s->s_did_icp->did_ic_freep =
    941 					    freep->free_next;
    942 				}
    943 				kmem_free(freep, sizeof (mddb_did_free_t));
    944 			}
    945 			break;
    946 		}
    947 		freep2 = freep;
    948 		freep = freep->free_next;
    949 	}
    950 
    951 	/* Didn't find a free spot */
    952 	if (freep == NULL) {
    953 		/* get free logical disk blk in replica */
    954 		blk_cnt = btodb(len + (MDDB_BSIZE - 1));
    955 		blk_num = getfreeblks(s, blk_cnt);
    956 		if (blk_num == 0)
    957 			return (0);
    958 
    959 		/* Add disk block to disk block linked list */
    960 		dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP);
    961 		dbp->db_firstblk = blk_num;
    962 		dbp->db_blkcnt = blk_cnt;
    963 		dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP);
    964 		dbp->db_next = s->s_did_icp->did_ic_dbp;
    965 		s->s_did_icp->did_ic_dbp = dbp;
    966 		devid_ptr = (ddi_devid_t)dbp->db_ptr;
    967 
    968 		/* Update return values */
    969 		*blk = blk_num;
    970 		*offset = 0;
    971 		*cnt = blk_cnt;
    972 
    973 		/* Add unused part of block to free list */
    974 		(void) mddb_devid_free_add(s, blk_num,
    975 		    len, (dbtob(blk_cnt) - len));
    976 	}
    977 
    978 	return ((caddr_t)devid_ptr);
    979 }
    980 
    981 /*
    982  * Add device id information for locator index to device id area in set.
    983  * Get free area to store device id from free list.   Update checksum
    984  * for mddb_did_blk.
    985  *
    986  * This routine does not write any data out to disk.
    987  * After this routine has been called, the routine, writelocall, should
    988  * be called to write both the locator block and device id area out
    989  * to disk.
    990  */
    991 static int
    992 mddb_devid_add(
    993 	mddb_set_t	*s,
    994 	uint_t		index,
    995 	ddi_devid_t	devid,
    996 	char		*minor_name
    997 )
    998 {
    999 	uint_t		devid_len;
   1000 	uint_t		blk, offset;
   1001 	ddi_devid_t	devid_ptr;
   1002 	mddb_did_info_t	*did_info;
   1003 	uint_t		blkcnt, i;
   1004 	mddb_did_blk_t	*did_blk;
   1005 
   1006 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
   1007 		return (1);
   1008 	}
   1009 	if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1))
   1010 		return (1);
   1011 
   1012 	/* Check if device id has already been added */
   1013 	did_blk = s->s_did_icp->did_ic_blkp;
   1014 	did_info = &(did_blk->blk_info[index]);
   1015 	if (did_info->info_flags & MDDB_DID_EXISTS)
   1016 		return (0);
   1017 
   1018 	devid_len = ddi_devid_sizeof(devid);
   1019 	devid_ptr = (ddi_devid_t)mddb_devid_free_get(s,
   1020 	    devid_len, &blk, &blkcnt, &offset);
   1021 
   1022 	if (devid_ptr == NULL) {
   1023 		return (1);
   1024 	}
   1025 
   1026 	/* Copy devid into devid free area */
   1027 	for (i = 0; i < devid_len; i++)
   1028 		((char *)devid_ptr)[i] = ((char *)devid)[i];
   1029 
   1030 	/* Update mddb_did_info area for new device id */
   1031 	did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID;
   1032 
   1033 	/*
   1034 	 * Only set UPDATED flag for non-replicated import cases.
   1035 	 * This allows the side locator driver name index to get
   1036 	 * updated in load_old_replicas.
   1037 	 */
   1038 	if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT))
   1039 		did_info->info_flags |= MDDB_DID_UPDATED;
   1040 
   1041 	did_info->info_firstblk = blk;
   1042 	did_info->info_blkcnt = blkcnt;
   1043 	did_info->info_offset = offset;
   1044 	did_info->info_length = devid_len;
   1045 	(void) strcpy(did_info->info_minor_name, minor_name);
   1046 	crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL);
   1047 
   1048 	/* Add device id pointer to did_ic_devid array */
   1049 	s->s_did_icp->did_ic_devid[index] = devid_ptr;
   1050 
   1051 	return (0);
   1052 }
   1053 
   1054 
   1055 /*
   1056  * Delete device id information for locator index from device id area in set.
   1057  * Add device id space to free area.
   1058  *
   1059  * This routine does not write any data out to disk.
   1060  * After this routine has been called, the routine, writelocall, should
   1061  * be called to write both the locator block and device id area out
   1062  * to disk.
   1063  */
   1064 static int
   1065 mddb_devid_delete(mddb_set_t *s, uint_t index)
   1066 {
   1067 	mddb_did_info_t	*did_info;
   1068 	mddb_did_blk_t	*did_blk;
   1069 
   1070 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
   1071 		return (1);
   1072 	}
   1073 
   1074 	/* Get device id information from mddb_did_blk */
   1075 	did_blk = s->s_did_icp->did_ic_blkp;
   1076 	did_info = &(did_blk->blk_info[index]);
   1077 
   1078 	/*
   1079 	 * Ensure that the underlying device supports device ids
   1080 	 * before arbitrarily removing them.
   1081 	 */
   1082 	if (!(did_info->info_flags & MDDB_DID_EXISTS)) {
   1083 		return (1);
   1084 	}
   1085 
   1086 	/* Remove device id information from mddb_did_blk */
   1087 	did_info->info_flags = 0;
   1088 
   1089 	/* Remove device id from incore area */
   1090 	s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL;
   1091 
   1092 	/* Add new free space in disk block to free list */
   1093 	(void) mddb_devid_free_add(s, did_info->info_firstblk,
   1094 	    did_info->info_offset, did_info->info_length);
   1095 
   1096 	return (0);
   1097 }
   1098 
   1099 /*
   1100  * Check if there is a device id for a locator index.
   1101  *
   1102  * Caller of this routine should not free devid or minor_name since
   1103  * these will point to internal data structures that should not
   1104  * be freed.
   1105  */
   1106 static int
   1107 mddb_devid_get(
   1108 	mddb_set_t *s,
   1109 	uint_t index,
   1110 	ddi_devid_t *devid,
   1111 	char **minor_name
   1112 )
   1113 {
   1114 	mddb_did_info_t	*did_info;
   1115 
   1116 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
   1117 		return (0);
   1118 	}
   1119 	did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);
   1120 
   1121 	if (did_info->info_flags & MDDB_DID_EXISTS) {
   1122 		*devid = s->s_did_icp->did_ic_devid[index];
   1123 		*minor_name =
   1124 		    s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name;
   1125 		return (1);
   1126 	} else
   1127 		return (0);
   1128 
   1129 
   1130 }
   1131 
   1132 /*
   1133  * Check if device id is valid on current system.
   1134  * Needs devid, previously known dev_t and current minor_name.
   1135  *
   1136  * Success:
   1137  * 	Returns 0 if valid device id is found and updates
   1138  * 	dev_t if the dev_t associated with the device id is
   1139  *	different than dev_t.
   1140  * Failure:
   1141  * 	Returns 1 if device id not valid on current system.
   1142  */
   1143 static int
   1144 mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name)
   1145 {
   1146 	int		retndevs;
   1147 	dev_t		*ddi_devs;
   1148 	int		devid_flag = 0;
   1149 	int 		cnt;
   1150 
   1151 	if (dev == 0)
   1152 		return (1);
   1153 	/*
   1154 	 * See if devid is valid in the current system.
   1155 	 * If so, set dev to match the devid.
   1156 	 */
   1157 	if (ddi_lyr_devid_to_devlist(devid, minor_name,
   1158 	    &retndevs, &ddi_devs) == DDI_SUCCESS) {
   1159 		if (retndevs > 0) {
   1160 			/* devid is valid to use */
   1161 			devid_flag = 1;
   1162 			/* does dev_t in list match dev */
   1163 			cnt = 0;
   1164 			while (cnt < retndevs) {
   1165 				if (*dev == md_expldev(ddi_devs[cnt]))
   1166 					break;
   1167 				cnt++;
   1168 			}
   1169 			/*
   1170 			 * If a different dev_t, then setup
   1171 			 * new dev and new major name
   1172 			 */
   1173 			if (cnt == retndevs) {
   1174 				*dev = md_expldev(ddi_devs[0]);
   1175 			}
   1176 			ddi_lyr_free_devlist(ddi_devs, retndevs);
   1177 		}
   1178 	}
   1179 	if (devid_flag)
   1180 		return (0);
   1181 	else
   1182 		return (1);
   1183 }
   1184 
   1185 
   1186 /*
   1187  * Free the devid incore data areas
   1188  */
   1189 static void
   1190 mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp)
   1191 {
   1192 	mddb_did_free_t	*did_freep1, *did_freep2;
   1193 	mddb_did_db_t	*did_dbp1, *did_dbp2;
   1194 	mddb_did_ic_t	*icp = *did_icp;
   1195 
   1196 	if (icp) {
   1197 		if (icp->did_ic_blkp) {
   1198 			kmem_free((caddr_t)icp->did_ic_blkp,
   1199 			    dbtob(lbp->lb_didblkcnt));
   1200 			icp->did_ic_blkp = (mddb_did_blk_t *)NULL;
   1201 		}
   1202 
   1203 		if (icp->did_ic_dbp) {
   1204 			did_dbp1 = icp->did_ic_dbp;
   1205 			while (did_dbp1) {
   1206 				did_dbp2 = did_dbp1->db_next;
   1207 				kmem_free((caddr_t)did_dbp1->db_ptr,
   1208 				    dbtob(did_dbp1->db_blkcnt));
   1209 				kmem_free((caddr_t)did_dbp1,
   1210 				    sizeof (mddb_did_db_t));
   1211 				did_dbp1 = did_dbp2;
   1212 			}
   1213 		}
   1214 
   1215 		if (icp->did_ic_freep) {
   1216 			did_freep1 = icp->did_ic_freep;
   1217 			while (did_freep1) {
   1218 				did_freep2 = did_freep1->free_next;
   1219 				kmem_free((caddr_t)did_freep1,
   1220 				    sizeof (mddb_did_free_t));
   1221 				did_freep1 = did_freep2;
   1222 			}
   1223 		}
   1224 
   1225 		kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t));
   1226 		*did_icp = (mddb_did_ic_t *)NULL;
   1227 	}
   1228 
   1229 }
   1230 
   1231 static daddr_t
   1232 getphysblk(
   1233 	mddb_block_t		blk,
   1234 	mddb_mb_ic_t		*mbip
   1235 )
   1236 {
   1237 	mddb_mb_t	*mbp = &(mbip->mbi_mddb_mb);
   1238 
   1239 	while (blk >= mbp->mb_blkcnt) {
   1240 		if (! mbip->mbi_next)
   1241 			return ((daddr_t)-1);	/* no such block */
   1242 		blk -= mbp->mb_blkcnt;
   1243 		mbip = mbip->mbi_next;
   1244 		mbp = &(mbip->mbi_mddb_mb);
   1245 	}
   1246 
   1247 	if (blk >= mbp->mb_blkmap.m_consecutive)
   1248 		return ((daddr_t)-1);	/* no such block */
   1249 
   1250 	return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk));
   1251 }
   1252 
   1253 /*
   1254  * when a buf header is passed in the new buffer must be
   1255  * put on the front of the chain. writerec counts on it
   1256  */
   1257 static int
   1258 putblks(
   1259 	mddb_set_t	*s,		/* incore db set structure */
   1260 	caddr_t		buffer,		/* adr of buffer to be written */
   1261 	daddr_t		blk,		/* block number for first block */
   1262 	int		cnt,		/* number of blocks to be written */
   1263 	md_dev64_t	device,		/* device to be written to */
   1264 	mddb_bf_t	**bufhead	/* if non-zero then ASYNC I/O */
   1265 					/*    and put buf address here */
   1266 )
   1267 {
   1268 	buf_t		*bp;
   1269 	mddb_bf_t	*bfp;
   1270 	int		err = 0;
   1271 
   1272 	bfp = allocbuffer(s, MDDB_SLEEPOK);
   1273 	bp = &bfp->bf_buf;
   1274 	bp->b_bcount = MDDB_BSIZE * cnt;
   1275 	bp->b_un.b_addr = buffer;
   1276 	bp->b_blkno = blk;
   1277 	bp->b_edev = md_dev64_to_dev(device);
   1278 	/*
   1279 	 * if a header for a buf chain is passed in this is async io.
   1280 	 * currently only done for optimize  records
   1281 	 */
   1282 	if (bufhead) {
   1283 		bfp->bf_next = *bufhead;
   1284 		*bufhead = bfp;
   1285 		(void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp);
   1286 		return (0);
   1287 	}
   1288 	err = mddb_rwdata(s, B_WRITE, bp);
   1289 	freebuffer(s, bfp);
   1290 	if (err) {
   1291 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
   1292 		    s->s_setno, device);
   1293 		return (MDDB_F_EWRITE);
   1294 	}
   1295 	return (0);
   1296 }
   1297 
   1298 /*
   1299  * wrtblklst - takes an array of logical block numbers
   1300  *		and writes the buffer to those blocks (scatter).
   1301  * If called during upgrade, this routine expects a
   1302  * non-translated (aka target) dev.
   1303  */
   1304 static int
   1305 wrtblklst(
   1306 	mddb_set_t	*s,		/* incore set structure */
   1307 	caddr_t		buffer,		/* buffer to be written (record blk) */
   1308 	mddb_block_t	blka[],		/* list of logical blks for record */
   1309 	daddr_t		cnt,		/* number of logical blks */
   1310 	const int	li,		/* locator index */
   1311 	mddb_bf_t	**bufhead,	/* if non-zero then ASYNC I/O */
   1312 					/*    and put buf address here */
   1313 	int		master_only	/* allow only master node to write */
   1314 )
   1315 {
   1316 	daddr_t		blk;
   1317 	daddr_t		blk1;
   1318 	int		err = 0;
   1319 	int		cons;
   1320 	mddb_lb_t	*lbp = s->s_lbp;
   1321 	mddb_locator_t	*lp = &lbp->lb_locators[li];
   1322 	md_dev64_t	dev;
   1323 	mddb_mb_ic_t	*mbip = s->s_mbiarray[li];
   1324 
   1325 	/*
   1326 	 * If a MN diskset and only the master can write,
   1327 	 * then a non-master node will just return success.
   1328 	 */
   1329 	if (lbp->lb_flags & MDDB_MNSET) {
   1330 		if (master_only == MDDB_WR_ONLY_MASTER) {
   1331 			/* return successfully if we aren't the master */
   1332 			if (!(md_set[s->s_setno].s_am_i_master)) {
   1333 				return (0);
   1334 			}
   1335 		}
   1336 		if (mbip == NULL)
   1337 			return (MDDB_F_EWRITE);
   1338 	}
   1339 
   1340 	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
   1341 	if (dev == NODEV64) {
   1342 		return (1);
   1343 	}
   1344 
   1345 	blk = getphysblk(blka[0], mbip);
   1346 	ASSERT(blk >= 0);
   1347 
   1348 	cons = 1;
   1349 	while (cnt) {
   1350 		if (cons != cnt) {
   1351 			blk1 = getphysblk(blka[cons], mbip);
   1352 			ASSERT(blk1 >= 0);
   1353 			if ((blk + cons) == blk1) {
   1354 				cons++;
   1355 				continue;
   1356 			}
   1357 		}
   1358 		if (err = putblks(s, buffer, blk, cons, dev, bufhead)) {
   1359 			/*
   1360 			 * If an MN diskset and any_node_can_write
   1361 			 * then this request is coming from writeoptrecord
   1362 			 * and l_flags field should not be updated.
   1363 			 * l_flags will be updated as a result of sending
   1364 			 * a class1 message to the master.  Setting l_flags
   1365 			 * here will cause slave to be out of sync with
   1366 			 * master.
   1367 			 *
   1368 			 * Otherwise, set the error in l_flags
   1369 			 * (this occurs if this is not a MN diskset or
   1370 			 * only_master_can_write is set).
   1371 			 */
   1372 			if ((!(lbp->lb_flags & MDDB_MNSET)) ||
   1373 			    (master_only == MDDB_WR_ONLY_MASTER)) {
   1374 				lp->l_flags |= MDDB_F_EWRITE;
   1375 			}
   1376 			return (err);
   1377 		}
   1378 		if (bufhead)
   1379 			(*bufhead)->bf_locator = lp;
   1380 
   1381 		buffer += MDDB_BSIZE * cons;
   1382 		cnt -= cons;
   1383 		blka += cons;
   1384 		if (cnt) {
   1385 			blk = getphysblk(blka[0], mbip);
   1386 			ASSERT(blk >= 0);
   1387 		}
   1388 		cons = 1;
   1389 	}
   1390 
   1391 	return (0);
   1392 }
   1393 
   1394 /*
   1395  * writeblks - takes a logical block number/block count pair
   1396  * 		and writes the buffer to those contiguous logical blocks.
   1397  * If called during upgrade, this routine expects a non-translated
   1398  * (aka target) dev.
   1399  */
   1400 static int
   1401 writeblks(
   1402 	mddb_set_t	*s,		/* incore set structure */
   1403 	caddr_t		buffer,		/* buffer to be written */
   1404 	mddb_block_t	blk,		/* starting logical block number */
   1405 	int		cnt,		/* number of log blocks to be written */
   1406 	const int	li,		/* locator index */
   1407 	int		master_only	/* allow only master node to write */
   1408 )
   1409 {
   1410 	daddr_t		physblk;
   1411 	int		err = 0;
   1412 	int		i;
   1413 	mddb_lb_t	*lbp = s->s_lbp;
   1414 	mddb_locator_t	*lp = &lbp->lb_locators[li];
   1415 	md_dev64_t	dev;
   1416 	mddb_block_t	*blkarray;
   1417 	int		size;
   1418 	int		ret;
   1419 
   1420 	/*
   1421 	 * If a MN diskset and only the master can write,
   1422 	 * then a non-master node will just return success.
   1423 	 */
   1424 	if ((lbp->lb_flags & MDDB_MNSET) &&
   1425 	    (master_only == MDDB_WR_ONLY_MASTER)) {
   1426 		/* return successfully if we aren't the master */
   1427 		if (!(md_set[s->s_setno].s_am_i_master)) {
   1428 			return (0);
   1429 		}
   1430 	}
   1431 
   1432 	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
   1433 	if (dev == NODEV64) {
   1434 		return (1);
   1435 	}
   1436 
   1437 	if (cnt > 1) {
   1438 		size = sizeof (mddb_block_t) * cnt;
   1439 		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
   1440 		for (i = 0; i < cnt; i++)
   1441 			blkarray[i] = blk + i;
   1442 		ret = wrtblklst(s, buffer, blkarray, cnt,
   1443 		    li, 0, MDDB_WR_ONLY_MASTER);
   1444 		kmem_free(blkarray, size);
   1445 		return (ret);
   1446 	}
   1447 	physblk = getphysblk(blk, s->s_mbiarray[li]);
   1448 	ASSERT(physblk > 0);
   1449 	if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) {
   1450 		lp->l_flags |= MDDB_F_EWRITE;
   1451 		return (err);
   1452 	}
   1453 	return (0);
   1454 }
   1455 
   1456 /*
   1457  * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas.
   1458  */
   1459 static int
   1460 writeall(
   1461 	mddb_set_t	*s,		/* incore set structure */
   1462 	caddr_t		buffer,		/* buffer to be written */
   1463 	mddb_block_t	block,		/* starting logical block number */
   1464 	int		cnt,		/* number of log blocks to be written */
   1465 	int		master_only	/* allow only master node to write */
   1466 )
   1467 {
   1468 	int		li;
   1469 	int		err = 0;
   1470 	mddb_lb_t	*lbp = s->s_lbp;
   1471 
   1472 	for (li = 0; li < lbp->lb_loccnt; li++) {
   1473 		mddb_locator_t	*lp = &lbp->lb_locators[li];
   1474 
   1475 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
   1476 		    (lp->l_flags & MDDB_F_EWRITE))
   1477 			continue;
   1478 
   1479 		err |= writeblks(s, buffer, block, cnt, li, master_only);
   1480 	}
   1481 
   1482 	return (err);
   1483 }
   1484 
   1485 /*
   1486  * writelocall - write the locator block and device id information (if
   1487  * replica is in device id format) to all ACTIVE/NON-ERRORER replicas.
   1488  *
   1489  * Increments the locator block's commitcnt.  Updates the device id area's
   1490  * commitcnt if the replica is in device id format.  Regenerates the
   1491  * checksums after updating the commitcnt(s).
   1492  */
   1493 static int
   1494 writelocall(
   1495 	mddb_set_t	*s	/* incore set structure */
   1496 )
   1497 {
   1498 	int		li;
   1499 	int		err = 0;
   1500 	mddb_lb_t	*lbp = s->s_lbp;
   1501 	mddb_did_blk_t	*did_blk;
   1502 	mddb_did_db_t	*did_dbp;
   1503 
   1504 	s->s_lbp->lb_commitcnt++;
   1505 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   1506 		did_blk = s->s_did_icp->did_ic_blkp;
   1507 		did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt;
   1508 		crcgen(did_blk, &did_blk->blk_checksum,
   1509 		    dbtob(lbp->lb_didblkcnt), NULL);
   1510 	}
   1511 	crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
   1512 
   1513 	for (li = 0; li < lbp->lb_loccnt; li++) {
   1514 		mddb_locator_t	*lp = &lbp->lb_locators[li];
   1515 
   1516 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
   1517 		    (lp->l_flags & MDDB_F_EWRITE))
   1518 			continue;
   1519 
   1520 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   1521 			/* write out blocks containing actual device ids */
   1522 			did_dbp = s->s_did_icp->did_ic_dbp;
   1523 			while (did_dbp) {
   1524 				err |= writeblks(s, (caddr_t)did_dbp->db_ptr,
   1525 				    did_dbp->db_firstblk,
   1526 				    did_dbp->db_blkcnt, li,
   1527 				    MDDB_WR_ONLY_MASTER);
   1528 				did_dbp = did_dbp->db_next;
   1529 			}
   1530 
   1531 			/* write out device id area block */
   1532 			err |= writeblks(s, (caddr_t)did_blk,
   1533 			    lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
   1534 			    MDDB_WR_ONLY_MASTER);
   1535 		}
   1536 		/* write out locator block */
   1537 		err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
   1538 		    MDDB_WR_ONLY_MASTER);
   1539 	}
   1540 
   1541 	/*
   1542 	 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag
   1543 	 * in the mddb_set structure to show that the locator block has
   1544 	 * been changed.
   1545 	 */
   1546 
   1547 	if ((lbp->lb_flags & MDDB_MNSET) &&
   1548 	    (md_set[s->s_setno].s_am_i_master)) {
   1549 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
   1550 	}
   1551 	return (err);
   1552 }
   1553 
   1554 /*
   1555  * If called during upgrade, this routine expects a translated
   1556  * (aka miniroot) dev.
   1557  */
   1558 static int
   1559 getblks(
   1560 	mddb_set_t	*s,	/* incore db set structure */
   1561 	caddr_t		buffer,	/* buffer to read data into */
   1562 	md_dev64_t	device,	/* device to read from */
   1563 	daddr_t		blk,	/* physical block number to read */
   1564 	int		cnt,	/* number of blocks to read */
   1565 	int		flag	/* flags for I/O */
   1566 )
   1567 {
   1568 	buf_t		*bp;
   1569 	mddb_bf_t	*bfp;
   1570 	int		err = 0;
   1571 
   1572 	bfp = allocbuffer(s, MDDB_SLEEPOK);	/* this will never sleep */
   1573 	bp = &bfp->bf_buf;
   1574 	bp->b_bcount = MDDB_BSIZE * cnt;
   1575 	bp->b_un.b_addr = buffer;
   1576 	bp->b_blkno = blk;
   1577 	bp->b_edev = md_dev64_to_dev(device);
   1578 	err = mddb_rwdata(s, (B_READ | flag), bp);
   1579 	freebuffer(s, bfp);
   1580 	if (err) {
   1581 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
   1582 		    s->s_setno, device);
   1583 		return (MDDB_F_EREAD);
   1584 	}
   1585 	return (0);
   1586 }
   1587 
   1588 /*
   1589  * readblklst - takes an array of logical block numbers
   1590  * 		and reads those blocks (gather) into the buffer.
   1591  * If called during upgrade, this routine expects a non-translated
   1592  * (aka target) dev.
   1593  */
   1594 static int
   1595 readblklst(
   1596 	mddb_set_t	*s,	/* incore set structure */
   1597 	caddr_t		buffer,	/* buffer to be read (record block) */
   1598 	mddb_block_t	blka[],	/* list of logical blocks to be read */
   1599 	daddr_t		cnt,	/* number of logical blocks */
   1600 	int		li,	/* locator index */
   1601 	int		flag	/* flags for I/O */
   1602 )
   1603 {
   1604 	daddr_t		blk;
   1605 	daddr_t		blk1;
   1606 	int		err = 0;
   1607 	int		cons;
   1608 	md_dev64_t	dev;
   1609 	mddb_mb_ic_t	*mbip;
   1610 
   1611 	mbip = s->s_mbiarray[li];
   1612 	dev = md_expldev(s->s_lbp->lb_locators[li].l_dev);
   1613 	dev = md_xlate_targ_2_mini(dev);
   1614 	if (dev == NODEV64) {
   1615 		return (1);
   1616 	}
   1617 
   1618 	blk = getphysblk(blka[0], mbip);
   1619 	ASSERT(blk >= 0);
   1620 
   1621 	cons = 1;
   1622 	while (cnt) {
   1623 		if (cons != cnt) {
   1624 			blk1 = getphysblk(blka[cons], mbip);
   1625 			ASSERT(blk1 >= 0);
   1626 			if ((blk + cons) == blk1) {
   1627 				cons++;
   1628 				continue;
   1629 			}
   1630 		}
   1631 		if (err = getblks(s, buffer, dev, blk, cons, flag))
   1632 			return (err);
   1633 		buffer += MDDB_BSIZE * cons;
   1634 		cnt -= cons;
   1635 		blka += cons;
   1636 		if (cnt) {
   1637 			blk = getphysblk(blka[0], mbip);
   1638 			ASSERT(blk >= 0);
   1639 		}
   1640 		cons = 1;
   1641 	}
   1642 	return (0);
   1643 }
   1644 
   1645 /*
   1646  * readblks - takes a logical block number/block count pair
   1647  * 		and reads those contiguous logical blocks into the buffer.
   1648  * If called during upgrade, this routine expects a non-translated
   1649  * (aka target) dev.
   1650  */
   1651 static int
   1652 readblks(
   1653 	mddb_set_t	*s,	/* incore set structure */
   1654 	caddr_t		buffer,	/* buffer to be read into */
   1655 	mddb_block_t	blk,	/* logical block number to be read */
   1656 	int		cnt,	/* number of logical blocks to be read */
   1657 	int		li	/* locator index */
   1658 )
   1659 {
   1660 	daddr_t		physblk;
   1661 	md_dev64_t	device;
   1662 	int		i;
   1663 	mddb_block_t	*blkarray;
   1664 	int		size;
   1665 	int		ret;
   1666 
   1667 	if (cnt > 1) {
   1668 		size = sizeof (mddb_block_t) * cnt;
   1669 		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
   1670 		for (i = 0; i < cnt; i++)
   1671 			blkarray[i] = blk + i;
   1672 		ret = readblklst(s, buffer, blkarray, cnt, li, 0);
   1673 		kmem_free(blkarray, size);
   1674 		return (ret);
   1675 	}
   1676 	physblk = getphysblk(blk, s->s_mbiarray[li]);
   1677 	ASSERT(physblk > 0);
   1678 	device = md_expldev(s->s_lbp->lb_locators[li].l_dev);
   1679 	device = md_xlate_targ_2_mini(device);
   1680 	if (device == NODEV64) {
   1681 		return (1);
   1682 	}
   1683 	return (getblks(s, buffer, device, physblk, 1, 0));
   1684 }
   1685 
   1686 static void
   1687 single_thread_start(
   1688 	mddb_set_t	*s
   1689 )
   1690 {
   1691 	while (s->s_singlelockgotten) {
   1692 		s->s_singlelockwanted++;
   1693 		cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno));
   1694 	}
   1695 	s->s_singlelockgotten++;
   1696 }
   1697 
   1698 static void
   1699 single_thread_end(
   1700 	mddb_set_t	*s
   1701 )
   1702 {
   1703 	ASSERT(s->s_singlelockgotten);
   1704 	s->s_singlelockgotten = 0;
   1705 	if (s->s_singlelockwanted) {
   1706 		s->s_singlelockwanted = 0;
   1707 		cv_broadcast(&s->s_single_thread_cv);
   1708 	}
   1709 }
   1710 
   1711 static size_t
   1712 sizeofde(
   1713 	mddb_de_ic_t	*dep
   1714 )
   1715 {
   1716 	size_t		size;
   1717 
   1718 	size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) +
   1719 	    sizeof (mddb_block_t) * dep->de_blkcount;
   1720 	return (size);
   1721 }
   1722 
   1723 static size_t
   1724 sizeofde32(
   1725 	mddb_de32_t	*dep
   1726 )
   1727 {
   1728 	size_t		size;
   1729 
   1730 	size = sizeof (*dep) - sizeof (dep->de32_blks) +
   1731 	    sizeof (mddb_block_t) * dep->de32_blkcount;
   1732 	return (size);
   1733 }
   1734 
   1735 static mddb_de32_t *
   1736 nextentry(
   1737 	mddb_de32_t	*dep
   1738 )
   1739 {
   1740 	mddb_de32_t	*ret;
   1741 
   1742 	ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep)));
   1743 	return (ret);
   1744 }
   1745 
   1746 static void
   1747 create_db32rec(
   1748 	mddb_db32_t *db32p,
   1749 	mddb_db_t *dbp
   1750 )
   1751 {
   1752 	mddb_de_ic_t *dep;
   1753 	mddb_de32_t *de32p;
   1754 
   1755 #if defined(_ILP32) && !defined(lint)
   1756 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
   1757 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
   1758 #endif
   1759 
   1760 	dbtodb32(dbp, db32p);
   1761 	if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0))
   1762 		db32p->db32_firstentry = 0x4;
   1763 	de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry)
   1764 	    + sizeof (db32p->db32_firstentry)));
   1765 	for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
   1766 		detode32(dep, de32p);
   1767 		if ((dep->de_next != NULL) && (de32p->de32_next == 0))
   1768 			de32p->de32_next = 0x4;
   1769 		de32p = nextentry(de32p);
   1770 	}
   1771 	ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE);
   1772 }
   1773 
   1774 /*
   1775  * If called during upgrade, this routine expects a translated
   1776  * (aka miniroot) dev.
   1777  * If master blocks are found, set the mn_set parameter to 1 if the
   1778  * the master block revision number is MDDB_REV_MNMB; otherwise,
   1779  * set it to 0.
   1780  * If master blocks are not found, do not change the mnset parameter.
   1781  */
   1782 static mddb_mb_ic_t *
   1783 getmasters(
   1784 	mddb_set_t	*s,
   1785 	md_dev64_t	dev,
   1786 	daddr_t		blkno,
   1787 	uint_t		*flag,
   1788 	int		*mn_set
   1789 )
   1790 {
   1791 	mddb_mb_ic_t	*mbi = NULL;
   1792 	mddb_mb_t	*mb;
   1793 	int		error = 0;
   1794 	ddi_devid_t	devid;
   1795 
   1796 
   1797 	if (mddb_devopen(dev)) {
   1798 		if (flag)
   1799 			*flag |= MDDB_F_EMASTER;
   1800 		return ((mddb_mb_ic_t *)NULL);
   1801 	}
   1802 
   1803 
   1804 	mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP);
   1805 	mb = &(mbi->mbi_mddb_mb);
   1806 	if (error = getblks(s, (caddr_t)mb, dev, blkno,
   1807 	    btodb(MDDB_BSIZE), 0)) {
   1808 		error |= MDDB_F_EMASTER;
   1809 	}
   1810 	if (mb->mb_magic != MDDB_MAGIC_MB) {
   1811 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
   1812 	}
   1813 	/* Check for MDDB_REV_MNMB and lower */
   1814 	if (revchk(MDDB_REV_MNMB, mb->mb_revision)) {
   1815 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
   1816 	}
   1817 	if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) {
   1818 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
   1819 	}
   1820 
   1821 	if (!(md_get_setstatus(s->s_setno) &
   1822 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
   1823 	    (mb->mb_setno != s->s_setno)) {
   1824 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
   1825 	}
   1826 	if (mb->mb_blkno != blkno) {
   1827 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
   1828 	}
   1829 	mb->mb_next = NULL;
   1830 	mbi->mbi_next = NULL;
   1831 
   1832 	if (error)
   1833 		goto out;
   1834 
   1835 	/*
   1836 	 * Check the md_devid_destroy and md_keep_repl_state flags
   1837 	 * to see if we need to regen the devid or not.
   1838 	 *
   1839 	 * Don't care about devid in local set since it is not used
   1840 	 * and this should not be part of set importing
   1841 	 */
   1842 	if ((s->s_setno != MD_LOCAL_SET) &&
   1843 	    !(md_get_setstatus(s->s_setno) &
   1844 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) {
   1845 		/*
   1846 		 * Now check the destroy flag. We also need to handle
   1847 		 * the case where the destroy flag is reset after the
   1848 		 * destroy
   1849 		 */
   1850 		if (md_devid_destroy || (mb->mb_devid_len == 0)) {
   1851 
   1852 			if (md_devid_destroy) {
   1853 				bzero(mb->mb_devid, mb->mb_devid_len);
   1854 				mb->mb_devid_len = 0;
   1855 			}
   1856 
   1857 			/*
   1858 			 * Try to regenerate it if the 'keep' flag is not set
   1859 			 */
   1860 			if (!md_keep_repl_state) {
   1861 				if (ddi_lyr_get_devid(md_dev64_to_dev(dev),
   1862 				    &devid) == DDI_SUCCESS) {
   1863 					mb->mb_devid_len =
   1864 					    ddi_devid_sizeof(devid);
   1865 					bcopy(devid, mb->mb_devid,
   1866 					    mb->mb_devid_len);
   1867 					ddi_devid_free(devid);
   1868 				} else {
   1869 					error = MDDB_F_EFMT | MDDB_F_EMASTER;
   1870 				}
   1871 			}
   1872 
   1873 			crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
   1874 
   1875 			/*
   1876 			 * Push
   1877 			 */
   1878 			if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) {
   1879 				error = MDDB_F_EFMT | MDDB_F_EMASTER;
   1880 			}
   1881 		}
   1882 	}
   1883 
   1884 	if (! error) {
   1885 		/* Set mn_set parameter to 1 if a MN set */
   1886 		if (mb->mb_revision == MDDB_REV_MNMB)
   1887 			*mn_set = 1;
   1888 		else
   1889 			*mn_set = 0;
   1890 		return (mbi);
   1891 	}
   1892 
   1893 out:
   1894 	/* Error Out */
   1895 	if (flag)
   1896 		*flag |= error;
   1897 
   1898 	kmem_free((caddr_t)mbi, MDDB_IC_BSIZE);
   1899 	mddb_devclose(dev);
   1900 	return ((mddb_mb_ic_t *)NULL);
   1901 }
   1902 
   1903 static int
   1904 getrecord(
   1905 	mddb_set_t	*s,
   1906 	mddb_de_ic_t	*dep,
   1907 	int		li
   1908 )
   1909 {
   1910 	int		err = 0;
   1911 	mddb_rb32_t	*rbp;
   1912 
   1913 #if defined(_ILP32) && !defined(lint)
   1914 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
   1915 #endif
   1916 
   1917 
   1918 	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP);
   1919 	rbp = dep->de_rb;
   1920 
   1921 	err = readblklst(s, (caddr_t)rbp, dep->de_blks,
   1922 	    dep->de_blkcount, li, 0);
   1923 	if (err) {
   1924 		return (MDDB_F_EDATA | err);
   1925 	}
   1926 	if (rbp->rb_magic != MDDB_MAGIC_RB) {
   1927 		return (MDDB_F_EFMT | MDDB_F_EDATA);
   1928 	}
   1929 	if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) &&
   1930 	    (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0) &&
   1931 	    (revchk(MDDB_REV_RBFN, rbp->rb_revision) != 0) &&
   1932 	    (revchk(MDDB_REV_RB64FN, rbp->rb_revision) != 0)) {
   1933 		return (MDDB_F_EFMT | MDDB_F_EDATA);
   1934 	}
   1935 	/* Check crc for this record */
   1936 	if (rec_crcchk(s, dep, rbp)) {
   1937 		return (MDDB_F_EFMT | MDDB_F_EDATA);
   1938 	}
   1939 	return (0);
   1940 }
   1941 
   1942 /*
   1943  * Code to read in the locator name information
   1944  */
   1945 static int
   1946 readlocnames(
   1947 	mddb_set_t	*s,
   1948 	int		li
   1949 )
   1950 {
   1951 	mddb_ln_t	*lnp;
   1952 	int		err = 0;
   1953 	mddb_block_t	ln_blkcnt, ln_blkno;
   1954 
   1955 	/*
   1956 	 * read in the locator name blocks
   1957 	 */
   1958 	s->s_lnp = NULL;
   1959 
   1960 	ln_blkno = s->s_lbp->lb_lnfirstblk;
   1961 	ln_blkcnt = s->s_lbp->lb_lnblkcnt;
   1962 	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP);
   1963 
   1964 	err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li);
   1965 	if (err) {
   1966 		err |= MDDB_F_EDATA;
   1967 		goto out;
   1968 	}
   1969 	if (lnp->ln_magic != MDDB_MAGIC_LN) {
   1970 		err = MDDB_F_EDATA | MDDB_F_EFMT;
   1971 		goto out;
   1972 	}
   1973 	if (s->s_lbp->lb_flags & MDDB_MNSET) {
   1974 		if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) {
   1975 			err = MDDB_F_EDATA | MDDB_F_EFMT;
   1976 			goto out;
   1977 		}
   1978 	} else {
   1979 		if (revchk(MDDB_REV_LN, lnp->ln_revision)) {
   1980 			err = MDDB_F_EDATA | MDDB_F_EFMT;
   1981 			goto out;
   1982 		}
   1983 	}
   1984 	if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) {
   1985 		err = MDDB_F_EDATA | MDDB_F_EFMT;
   1986 		goto out;
   1987 	}
   1988 out:
   1989 	/*
   1990 	 *	if error occurred in locator name blocks free them
   1991 	 *	and return
   1992 	 */
   1993 	if (err) {
   1994 		kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
   1995 		return (err);
   1996 	}
   1997 	s->s_lnp = lnp;
   1998 	return (0);
   1999 }
   2000 
   2001 /*
   2002  * code to read in a copy of the database.
   2003  */
   2004 
   2005 static int
   2006 readcopy(
   2007 	mddb_set_t	*s,
   2008 	int		li
   2009 )
   2010 {
   2011 	uint_t		blk;
   2012 	mddb_db_t	*dbp, *dbp1, *dbhp;
   2013 	mddb_db32_t	*db32p;
   2014 	mddb_de_ic_t	*dep, *dep2;
   2015 	mddb_de32_t	*de32p, *de32p2;
   2016 	int		err = 0;
   2017 	uint_t		checksum;
   2018 
   2019 
   2020 #if defined(_ILP32) && !defined(lint)
   2021 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
   2022 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
   2023 #endif
   2024 
   2025 	dbp = NULL;
   2026 	dbhp = NULL;
   2027 	/*
   2028 	 *	read in all the directory blocks
   2029 	 */
   2030 	blk = s->s_lbp->lb_dbfirstblk;
   2031 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
   2032 
   2033 	for (; blk != 0; blk = dbp->db_nextblk) {
   2034 		dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
   2035 		if (! dbhp) {
   2036 			dbhp = dbp1;
   2037 		} else {
   2038 			dbp->db_next = dbp1;
   2039 		}
   2040 		dbp = dbp1;
   2041 
   2042 		err = readblks(s, (caddr_t)db32p, blk, 1, li);
   2043 		if (err) {
   2044 			err |= MDDB_F_EDATA;
   2045 			break;
   2046 		}
   2047 		db32todb(db32p, dbp);
   2048 		if (db32p->db32_magic != MDDB_MAGIC_DB) {
   2049 			err = MDDB_F_EDATA | MDDB_F_EFMT;
   2050 			break;
   2051 		}
   2052 		if (revchk(MDDB_REV_DB, db32p->db32_revision)) {
   2053 			err = MDDB_F_EDATA | MDDB_F_EFMT;
   2054 			break;
   2055 		}
   2056 		if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) {
   2057 			err = MDDB_F_EDATA | MDDB_F_EFMT;
   2058 			break;
   2059 		}
   2060 		/*
   2061 		 * first go through and fix up all de_next pointers
   2062 		 */
   2063 		if (dbp->db_firstentry) {
   2064 
   2065 			de32p = (mddb_de32_t *)
   2066 			    ((void *) ((caddr_t)(&db32p->db32_firstentry)
   2067 			    + sizeof (db32p->db32_firstentry)));
   2068 
   2069 			dep = (mddb_de_ic_t *)
   2070 			    kmem_zalloc(sizeof (mddb_de_ic_t) -
   2071 			    sizeof (mddb_block_t) +
   2072 			    sizeof (mddb_block_t) * de32p->de32_blkcount,
   2073 			    KM_SLEEP);
   2074 			de32tode(de32p, dep);
   2075 
   2076 			dbp->db_firstentry = dep;
   2077 			while (de32p && de32p->de32_next) {
   2078 
   2079 				de32p2 = nextentry(de32p);
   2080 
   2081 				dep2 = (mddb_de_ic_t *)kmem_zalloc(
   2082 				    sizeof (mddb_de_ic_t) -
   2083 				    sizeof (mddb_block_t) +
   2084 				    sizeof (mddb_block_t) *
   2085 				    de32p2->de32_blkcount, KM_SLEEP);
   2086 
   2087 				de32tode(de32p2, dep2);
   2088 
   2089 				dep->de_next = dep2;
   2090 				dep = dep2;
   2091 				de32p = de32p2;
   2092 			}
   2093 		}
   2094 		/*
   2095 		 * go through and make all of the pointer to record blocks
   2096 		 * are null;
   2097 		 */
   2098 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
   2099 			dep->de_rb = NULL;
   2100 	}
   2101 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
   2102 	dbp->db_next = NULL;
   2103 	/*
   2104 	 *	if error occurred in directory blocks free them
   2105 	 *	and return
   2106 	 */
   2107 	if (err) {
   2108 		dbp = dbhp;
   2109 		while (dbp) {
   2110 			dep = dbp->db_firstentry;
   2111 			while (dep) {
   2112 				/* No mddb_rb32_t structures yet */
   2113 				dep2 = dep->de_next;
   2114 				kmem_free((caddr_t)dep, sizeofde(dep));
   2115 				dep = dep2;
   2116 			}
   2117 			dbp1 = dbp->db_next;
   2118 			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
   2119 			dbp = dbp1;
   2120 		}
   2121 		s->s_dbp = NULL;
   2122 		return (err);
   2123 
   2124 	}
   2125 	/*
   2126 	 */
   2127 	err = 0;
   2128 	checksum = MDDB_GLOBAL_XOR;
   2129 	for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) {
   2130 		checksum ^= dbp->db_recsum;
   2131 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
   2132 			if (dep->de_flags & MDDB_F_OPT)
   2133 				continue;
   2134 			err = getrecord(s, dep, li);
   2135 			if (err)
   2136 				break;
   2137 			/* Don't include CHANGELOG in big XOR */
   2138 			if (dep->de_flags & MDDB_F_CHANGELOG)
   2139 				continue;
   2140 			checksum ^= dep->de_rb->rb_checksum;
   2141 			checksum ^= dep->de_rb->rb_checksum_fiddle;
   2142 		}
   2143 		if (err)
   2144 			break;
   2145 	}
   2146 	if (checksum) {
   2147 		if (! err)
   2148 			err = MDDB_F_EDATA | MDDB_F_EFMT;
   2149 	}
   2150 	if (err) {
   2151 		dbp = dbhp;
   2152 		dbhp = NULL;
   2153 		while (dbp) {
   2154 			dep = dbp->db_firstentry;
   2155 			while (dep) {
   2156 				if (dep->de_rb)
   2157 					kmem_free((caddr_t)dep->de_rb,
   2158 					    dep->de_recsize);
   2159 				dep2 = dep->de_next;
   2160 				kmem_free((caddr_t)dep, sizeofde(dep));
   2161 				dep = dep2;
   2162 			}
   2163 			dbp1 = dbp->db_next;
   2164 			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
   2165 			dbp = dbp1;
   2166 		}
   2167 	}
   2168 	s->s_dbp = dbhp;
   2169 	return (err);
   2170 }
   2171 
   2172 static int
   2173 getoptcnt(
   2174 	mddb_set_t	*s,
   2175 	int		li)
   2176 {
   2177 	int		result;
   2178 	mddb_de_ic_t	*dep;
   2179 	mddb_db_t	*dbp;
   2180 
   2181 #if defined(_ILP32) && !defined(lint)
   2182 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
   2183 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
   2184 #endif
   2185 
   2186 	result = 0;
   2187 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
   2188 		dep = dbp->db_firstentry;
   2189 		for (; dep != NULL; dep = dep->de_next) {
   2190 			if (! (dep->de_flags & MDDB_F_OPT))
   2191 				continue;
   2192 			if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) &&
   2193 			    (li == dep->de_optinfo[0].o_li)) ||
   2194 			    ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) &&
   2195 			    (li == dep->de_optinfo[1].o_li)))
   2196 			result++;
   2197 		}
   2198 	}
   2199 	return (result);
   2200 }
   2201 
   2202 static void
   2203 getoptdev(
   2204 	mddb_set_t	*s,
   2205 	mddb_de_ic_t	*rdep,
   2206 	int		opti
   2207 )
   2208 {
   2209 	mddb_lb_t	*lbp;
   2210 	mddb_locator_t	*lp;
   2211 	mddb_optinfo_t	*otherop;
   2212 	mddb_optinfo_t	*resultop;
   2213 	int		li;
   2214 	dev_t		otherdev;
   2215 	int		blkonly = 0;
   2216 	int		mincnt;
   2217 	int		thiscnt;
   2218 
   2219 	lbp = s->s_lbp;
   2220 
   2221 	resultop = &rdep->de_optinfo[opti];
   2222 	otherop = &rdep->de_optinfo[1-opti];
   2223 
   2224 	resultop->o_flags = 0;
   2225 
   2226 	/*
   2227 	 * scan through and see if data bases have to vary by only device
   2228 	 */
   2229 
   2230 	if (otherop->o_flags & MDDB_F_ACTIVE) {
   2231 		blkonly = 1;
   2232 		otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev);
   2233 		for (li = 0; li < lbp->lb_loccnt; li++) {
   2234 			lp = &lbp->lb_locators[li];
   2235 			if (! (lp->l_flags & MDDB_F_ACTIVE))
   2236 				continue;
   2237 			if (expldev(lp->l_dev) != otherdev) {
   2238 				blkonly = 0;
   2239 				break;
   2240 			}
   2241 		}
   2242 	}
   2243 
   2244 	mincnt = 999999;
   2245 	for (li = 0; li < lbp->lb_loccnt; li++) {
   2246 		dev_info_t	*devi;
   2247 		int		removable = 0;
   2248 
   2249 		lp = &lbp->lb_locators[li];
   2250 		if (! (lp->l_flags & MDDB_F_ACTIVE))
   2251 			continue;
   2252 		if (otherop->o_flags & MDDB_F_ACTIVE) {
   2253 			if (blkonly) {
   2254 				if (otherop->o_li == li)
   2255 					continue;
   2256 			} else {
   2257 				if (otherdev == expldev(lp->l_dev))
   2258 					continue;
   2259 			}
   2260 		}
   2261 
   2262 		/*
   2263 		 * Check if this is a removable device.  If it is we
   2264 		 * assume it is something like a USB flash disk, a zip disk
   2265 		 * or even a floppy that is being used to help maintain
   2266 		 * mddb quorum.  We don't want to put any optimized resync
   2267 		 * records on these kinds of disks since they are usually
   2268 		 * slower or don't have the same read/write lifetimes as
   2269 		 * a regular fixed disk.
   2270 		 */
   2271 		if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) {
   2272 			int		error;
   2273 			struct cb_ops	*cb;
   2274 			ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
   2275 			int		propvalue = 0;
   2276 			int		proplength = sizeof (int);
   2277 
   2278 			if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops)
   2279 			    != NULL) {
   2280 				error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi,
   2281 				    prop_op, DDI_PROP_NOTPROM |
   2282 				    DDI_PROP_DONTPASS, "removable-media",
   2283 				    (caddr_t)&propvalue, &proplength);
   2284 
   2285 				if (error == DDI_PROP_SUCCESS)
   2286 					removable = 1;
   2287 			}
   2288 
   2289 			ddi_release_devi(devi);
   2290 		}
   2291 
   2292 		if (removable)
   2293 			continue;
   2294 
   2295 		thiscnt = getoptcnt(s, li);
   2296 		if (thiscnt < mincnt) {
   2297 			resultop->o_li  = li;
   2298 			mincnt = thiscnt;
   2299 			resultop->o_flags = MDDB_F_ACTIVE;
   2300 		}
   2301 	}
   2302 }
   2303 
   2304 static void
   2305 allocuserdata(
   2306 	mddb_de_ic_t	*dep
   2307 )
   2308 {
   2309 	mddb_rb32_t	*rbp;
   2310 
   2311 #if defined(_ILP32) && !defined(lint)
   2312 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
   2313 #endif
   2314 
   2315 	rbp = dep->de_rb;
   2316 	rbp->rb_private = 0;
   2317 	dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP);
   2318 	rbp->rb_userdata = 0x4;	/* Make sure this is non-zero */
   2319 	bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize);
   2320 }
   2321 
   2322 
   2323 static void
   2324 getuserdata(
   2325 	set_t		setno,
   2326 	mddb_de_ic_t	*dep
   2327 )
   2328 {
   2329 	mddb_rb32_t	 *rbp;
   2330 
   2331 
   2332 	mddb_type_t	type = dep->de_type1;
   2333 	caddr_t		data, udata;
   2334 
   2335 #if defined(_ILP32) && !defined(lint)
   2336 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
   2337 #endif
   2338 	rbp = dep->de_rb;
   2339 	data = (caddr_t)rbp->rb_data;
   2340 	udata = (caddr_t)dep->de_rb_userdata;
   2341 
   2342 	/*
   2343 	 * If it's a driver record, and an old style record, and not a DRL
   2344 	 * record, we must convert it because it was incore as a 64 bit
   2345 	 * structure but its on disk layout has only 32 bit for block sizes
   2346 	 */
   2347 	if (!(md_get_setstatus(setno) &
   2348 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
   2349 	    (type >= MDDB_FIRST_MODID) &&
   2350 	    ((rbp->rb_revision == MDDB_REV_RB) ||
   2351 	    (rbp->rb_revision == MDDB_REV_RBFN))) {
   2352 
   2353 		switch (dep->de_flags) {
   2354 
   2355 			case MDDB_F_STRIPE:
   2356 				stripe_convert(data, udata, BIG_2_SMALL);
   2357 				break;
   2358 
   2359 			case MDDB_F_MIRROR:
   2360 				mirror_convert(data, udata, BIG_2_SMALL);
   2361 				break;
   2362 
   2363 			case MDDB_F_RAID:
   2364 				raid_convert(data, udata, BIG_2_SMALL);
   2365 				break;
   2366 
   2367 			case MDDB_F_SOFTPART:
   2368 				softpart_convert(data, udata, BIG_2_SMALL);
   2369 				break;
   2370 
   2371 			case MDDB_F_TRANS_MASTER:
   2372 				trans_master_convert(data, udata, BIG_2_SMALL);
   2373 				break;
   2374 
   2375 			case MDDB_F_TRANS_LOG:
   2376 				trans_log_convert(data, udata, BIG_2_SMALL);
   2377 				break;
   2378 
   2379 			case MDDB_F_HOTSPARE:
   2380 				hs_convert(data, udata, BIG_2_SMALL);
   2381 				break;
   2382 
   2383 			case MDDB_F_OPT:
   2384 			default:
   2385 				bcopy(udata, data, dep->de_reqsize);
   2386 		}
   2387 	} else {
   2388 		bcopy(udata, data, dep->de_reqsize);
   2389 	}
   2390 }
   2391 
   2392 static void
   2393 getoptrecord(
   2394 	mddb_set_t	*s,
   2395 	mddb_de_ic_t	*dep
   2396 )
   2397 {
   2398 	mddb_lb_t	*lbp;
   2399 	mddb_locator_t	*lp;
   2400 	mddb_rb32_t	*rbp, *crbp;
   2401 	int		li;
   2402 	int		i;
   2403 	int		err = 0;
   2404 	size_t		recsize;
   2405 
   2406 #if defined(_ILP32) && !defined(lint)
   2407 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
   2408 #endif
   2409 
   2410 	lbp = s->s_lbp;
   2411 
   2412 	recsize = dep->de_recsize;
   2413 	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
   2414 	rbp = dep->de_rb;
   2415 	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
   2416 
   2417 	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
   2418 	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
   2419 
   2420 	for (i = 0; i < 2; i++) {
   2421 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
   2422 			continue;
   2423 		li = dep->de_optinfo[i].o_li;
   2424 		lp = &lbp->lb_locators[li];
   2425 
   2426 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
   2427 		    (lp->l_flags & MDDB_F_EMASTER))
   2428 			continue;
   2429 
   2430 		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
   2431 		    dep->de_blkcount, li, 0);
   2432 
   2433 		if (err)
   2434 			continue;
   2435 
   2436 		if (rbp->rb_magic != MDDB_MAGIC_RB)
   2437 			continue;
   2438 
   2439 		if (revchk(MDDB_REV_RB, rbp->rb_revision))
   2440 			continue;
   2441 
   2442 		/* Check the crc for this record */
   2443 		if (rec_crcchk(s, dep, rbp)) {
   2444 			continue;
   2445 		}
   2446 
   2447 		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
   2448 
   2449 		if (rbp == crbp) {
   2450 			if (rbp->rb_checksum != crbp->rb_checksum)
   2451 				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
   2452 			break;
   2453 		}
   2454 		rbp = crbp;
   2455 	}
   2456 
   2457 	if (rbp == crbp) {
   2458 		rbp->rb_private = 0;
   2459 		kmem_free((caddr_t)crbp, recsize);
   2460 		return;
   2461 	}
   2462 	bzero((caddr_t)rbp, recsize);
   2463 	rbp->rb_magic = MDDB_MAGIC_RB;
   2464 	rbp->rb_revision = MDDB_REV_RB;
   2465 	uniqtime32(&rbp->rb_timestamp);
   2466 	/* Generate the crc for this record */
   2467 	rec_crcgen(s, dep, rbp);
   2468 	kmem_free((caddr_t)crbp, recsize);
   2469 }
   2470 
   2471 /*
   2472  * writeoptrecord writes out an optimized record.
   2473  */
   2474 static int
   2475 writeoptrecord(
   2476 	mddb_set_t	*s,
   2477 	mddb_de_ic_t	*dep
   2478 )
   2479 {
   2480 	mddb_rb32_t	*rbp;
   2481 	int		li;
   2482 	int		err = 0, wrt_err = 0;
   2483 	mddb_bf_t	*bufhead, *bfp;
   2484 	mddb_lb_t	*lbp = s->s_lbp;
   2485 	mddb_locator_t	*lp;
   2486 	int		i;
   2487 
   2488 #if defined(_ILP32) && !defined(lint)
   2489 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
   2490 #endif
   2491 
   2492 	bufhead = NULL;
   2493 	err = 0;
   2494 
   2495 	while (s->s_opthavequeuinglck) {
   2496 		s->s_optwantqueuinglck++;
   2497 		cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno));
   2498 	}
   2499 	s->s_opthavequeuinglck++;
   2500 	rbp = dep->de_rb;
   2501 	for (i = 0; i < 2; i++) {
   2502 		/*
   2503 		 * only possible error is xlate. This can
   2504 		 * occur if a replica was off line and came
   2505 		 * back. During the mean time the database grew
   2506 		 * large than the now on line replica can store
   2507 		 */
   2508 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
   2509 			continue;
   2510 		li = dep->de_optinfo[i].o_li;
   2511 		/*
   2512 		 * In a MN diskset, any node can write optimized record(s).
   2513 		 */
   2514 		wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
   2515 		    dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
   2516 		/*
   2517 		 * For MN diskset, set error in optinfo structure so
   2518 		 * that mddb_commitrec knows which replica failed.
   2519 		 */
   2520 		if ((MD_MNSET_SETNO(s->s_setno)) &&
   2521 		    (wrt_err & MDDB_F_EWRITE)) {
   2522 			dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE;
   2523 		}
   2524 		err |= wrt_err;
   2525 	}
   2526 	s->s_opthavequeuinglck = 0;
   2527 	if (s->s_optwantqueuinglck) {
   2528 		s->s_optwantqueuinglck = 0;
   2529 		cv_broadcast(&s->s_optqueuing_cv);
   2530 	}
   2531 	for (bfp = bufhead; bfp; bfp = bufhead) {
   2532 		mutex_exit(SETMUTEX(s->s_setno));
   2533 		(void) biowait(&bfp->bf_buf);
   2534 		mutex_enter(SETMUTEX(s->s_setno));
   2535 		if (bfp->bf_buf.b_flags & B_ERROR) {
   2536 			/*
   2537 			 * If an MN diskset, don't set replica
   2538 			 * in error since this hasn't been set in master.
   2539 			 * Setting replica in error before master could
   2540 			 * leave the nodes with different views of the
   2541 			 * world since a class 1 configuration change
   2542 			 * could occur in mddb_commitrec as soon as
   2543 			 * all locks are dropped.  Must keep this
   2544 			 * node the same as master and can't afford a
   2545 			 * failure from the class 1 config change
   2546 			 * if master succeeded.
   2547 			 */
   2548 			if (!(MD_MNSET_SETNO(s->s_setno))) {
   2549 				bfp->bf_locator->l_flags |= MDDB_F_EWRITE;
   2550 			} else {
   2551 				/*
   2552 				 * Find which de_optinfo (which replica)
   2553 				 * had a failure and set the failure in
   2554 				 * the o_flags field.
   2555 				 */
   2556 				lp = &lbp->lb_locators[dep->de_optinfo[0].o_li];
   2557 				if (lp == bfp->bf_locator) {
   2558 					dep->de_optinfo[0].o_flags |=
   2559 					    MDDB_F_EWRITE;
   2560 				} else {
   2561 					dep->de_optinfo[1].o_flags |=
   2562 					    MDDB_F_EWRITE;
   2563 				}
   2564 			}
   2565 			err |= MDDB_F_EWRITE;
   2566 		}
   2567 		bufhead = bfp->bf_next;
   2568 		freebuffer(s, bfp);
   2569 	}
   2570 	return (err);
   2571 }
   2572 
   2573 /*
   2574  * Fix up the optimized resync record.  Used in the traditional and local
   2575  * disksets to move an optimized record from a failed or deleted mddb
   2576  * to an active one.
   2577  *
   2578  * In a MN diskset, the fixing of the optimized record is split between
   2579  * the master and slave nodes.  If the master node moves the optimized
   2580  * resync record, then the master node will send a MDDB_PARSE_OPTRECS
   2581  * message to the slave nodes causing the slave nodes to reget the
   2582  * directory entry containing the location of the optimized resync record.
   2583  * After the record is reread from disk, then writeoptrecord is called
   2584  * if the location of the optimized resync record or flags have changed.
   2585  * When writeoptrecord is called, the node that is the owner of this record
   2586  * will write the optimized record to the location specified in the directory
   2587  * entry.  Since the master node uses the highest class message (PARSE)
   2588  * the record owner node is guaranteed to already have an updated
   2589  * directory entry incore.
   2590  *
   2591  * The other difference between the traditional/local set and MN diskset
   2592  * is that the directory entry can be written to disk before the optimized
   2593  * record in a MN diskset if the record is owned by a slave node.  So,
   2594  * the users of an optimized record must handle the failure case when no
   2595  * data is available from an optimized record since the master node could
   2596  * have failed during the relocation of the optimized record to another mddb.
   2597  */
   2598 static int
   2599 fixoptrecord(
   2600 	mddb_set_t	*s,
   2601 	mddb_de_ic_t	*dep,
   2602 	mddb_db_t	*dbp
   2603 )
   2604 {
   2605 	int		changed;
   2606 	int		writedata;
   2607 	int		err = 0;
   2608 	int		i;
   2609 	mddb_lb_t	*lbp;
   2610 	mddb_optinfo_t	*op;
   2611 	mddb_db32_t	*db32p;
   2612 	int		rec_owner;	/* Is node owner of record? */
   2613 
   2614 #if defined(_ILP32) && !defined(lint)
   2615 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
   2616 #endif
   2617 
   2618 	lbp = s->s_lbp;
   2619 	changed = 0;
   2620 	writedata = 0;
   2621 	for (i = 0; i < 2; i++) {
   2622 		op = &dep->de_optinfo[i];
   2623 
   2624 		if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE))
   2625 			op->o_flags = 0;
   2626 
   2627 		/*
   2628 		 * If optimized record has seen a replica failure,
   2629 		 * assign new replica to record and re-write data
   2630 		 * to new record.
   2631 		 */
   2632 		if (! (op->o_flags & MDDB_F_ACTIVE)) {
   2633 			getoptdev(s, dep, i);
   2634 			writedata++;
   2635 			changed++;
   2636 			/* Set flag for slaves to reread dep and write rec */
   2637 			if (lbp->lb_flags & MDDB_MNSET) {
   2638 				s->s_mn_parseflags |= MDDB_PARSE_OPTRECS;
   2639 			}
   2640 		}
   2641 
   2642 		/*
   2643 		 * If just an error in the data was seen, set
   2644 		 * the optimized record's replica flag to active (ok)
   2645 		 * and try again.
   2646 		 */
   2647 		if (op->o_flags & MDDB_F_EDATA) {
   2648 			dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE;
   2649 			writedata++;
   2650 		}
   2651 	}
   2652 
   2653 	rec_owner = 0;
   2654 	if (lbp->lb_flags & MDDB_MNSET) {
   2655 		/*
   2656 		 * If a MN diskset then check the owner of optimized record.
   2657 		 * If the master node owns the record or if there is
   2658 		 * no owner of the record, then the master can write the
   2659 		 * optimized record to disk.
   2660 		 * Master node can write the optimized record now, but
   2661 		 * slave nodes write their records during handling of
   2662 		 * the MDDB_PARSE_OPTRECS message.
   2663 		 */
   2664 		if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) ||
   2665 		    (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) {
   2666 			rec_owner = 1;
   2667 		}
   2668 	} else {
   2669 		/*
   2670 		 * In traditional diskset and local set, this node
   2671 		 * is always the record owner and always the master.
   2672 		 */
   2673 		rec_owner = 1;
   2674 	}
   2675 
   2676 	/*
   2677 	 * If this node is the record owner, write out record.
   2678 	 */
   2679 	if ((writedata) && (rec_owner)) {
   2680 		if (err = writeoptrecord(s, dep)) {
   2681 			return (err);
   2682 		}
   2683 	}
   2684 	if (! changed)
   2685 		return (0);
   2686 	uniqtime32(&dbp->db_timestamp);
   2687 	dbp->db_revision = MDDB_REV_DB;
   2688 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
   2689 	create_db32rec(db32p, dbp);
   2690 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
   2691 	err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
   2692 	    1, MDDB_WR_ONLY_MASTER);
   2693 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
   2694 	return (err);
   2695 }
   2696 
   2697 static int
   2698 fixoptrecords(
   2699 	mddb_set_t		*s
   2700 )
   2701 {
   2702 	mddb_de_ic_t	*dep;
   2703 	mddb_db_t	*dbp;
   2704 	int		err = 0;
   2705 	set_t		setno;
   2706 
   2707 	/*
   2708 	 * In a MN diskset, the master node is the only node that runs
   2709 	 * fixoptrecords.  If the master node changes anything, then the
   2710 	 * master node sends PARSE message to the slave nodes.  The slave
   2711 	 * nodes will then re-read in the locator block or re-read in the
   2712 	 * directory blocks and re-write the optimized resync records.
   2713 	 */
   2714 	setno = s->s_setno;
   2715 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
   2716 	    (md_set[setno].s_am_i_master == 0)) {
   2717 		return (0);
   2718 	}
   2719 
   2720 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
   2721 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
   2722 			if (! (dep->de_flags & MDDB_F_OPT))
   2723 				continue;
   2724 			err = fixoptrecord(s, dep, dbp);
   2725 			if (err != 0)
   2726 				return (err);
   2727 		}
   2728 	}
   2729 	return (0);
   2730 }
   2731 
   2732 /*
   2733  * Checks incore version of mddb data to mddb data ondisk.
   2734  *
   2735  * Returns:
   2736  *	- 0 if the data was successfully read and is good.
   2737  *	- MDDB_F_EREAD if a read error occurred.
   2738  *	- 1 if the data read is bad (checksum failed, etc)
   2739  */
   2740 static int
   2741 checkcopy
   2742 (
   2743 	mddb_set_t	*s,
   2744 	int		li
   2745 )
   2746 {
   2747 	mddb_db_t	*dbp;
   2748 	mddb_db32_t	*cdb32p;
   2749 	mddb_de_ic_t	*dep;
   2750 	mddb_de32_t	*cde32p;
   2751 	mddb_rb32_t	*rbp, *crbp;
   2752 	size_t		size;
   2753 	int		i;
   2754 	int		retval = 1;
   2755 
   2756 #if defined(_ILP32) && !defined(lint)
   2757 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
   2758 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
   2759 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
   2760 #endif
   2761 
   2762 	if (s->s_databuffer_size == 0) {
   2763 		size_t maxrecsize = MDDB_BSIZE;
   2764 
   2765 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
   2766 			for (dep = dbp->db_firstentry; dep; dep = dep->de_next)
   2767 				if (! (dep->de_flags & MDDB_F_OPT) &&
   2768 				    dep->de_recsize > maxrecsize)
   2769 					maxrecsize = dep->de_recsize;
   2770 
   2771 		s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
   2772 		s->s_databuffer_size = maxrecsize;
   2773 	}
   2774 
   2775 	cdb32p = (mddb_db32_t *)s->s_databuffer;
   2776 
   2777 	/*
   2778 	 * first go through and make sure all directory stuff
   2779 	 * is the same
   2780 	 */
   2781 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
   2782 		if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) {
   2783 			retval = MDDB_F_EREAD;
   2784 			goto err;
   2785 		}
   2786 		if (cdb32p->db32_magic != MDDB_MAGIC_DB)
   2787 			goto err;
   2788 		if (revchk(MDDB_REV_DB, cdb32p->db32_revision))
   2789 			goto err;
   2790 		if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL))
   2791 			goto err;
   2792 		if (cdb32p->db32_nextblk != dbp->db_nextblk)
   2793 			goto err;
   2794 		if (cdb32p->db32_recsum != dbp->db_recsum)
   2795 			goto err;
   2796 		if (cdb32p->db32_firstentry) {
   2797 			cde32p = (mddb_de32_t *)
   2798 			    ((void *)((caddr_t)(&cdb32p->db32_firstentry)
   2799 			    + sizeof (cdb32p->db32_firstentry)));
   2800 		} else
   2801 			cde32p = NULL;
   2802 
   2803 		dep = dbp->db_firstentry;
   2804 		/*
   2805 		 * check if all directory entries are identical
   2806 		 */
   2807 		while (dep && cde32p) {
   2808 			if (dep->de_recid != cde32p->de32_recid)
   2809 				goto err;
   2810 			if (dep->de_type1 != cde32p->de32_type1)
   2811 				goto err;
   2812 			if (dep->de_type2 != cde32p->de32_type2)
   2813 				goto err;
   2814 			if (dep->de_reqsize != cde32p->de32_reqsize)
   2815 				goto err;
   2816 			if (dep->de_flags != cde32p->de32_flags)
   2817 				goto err;
   2818 
   2819 			for (i = 0; i < 2; i++) {
   2820 				if (dep->de_optinfo[i].o_li !=
   2821 				    cde32p->de32_optinfo[i].o_li)
   2822 					break;
   2823 			}
   2824 			if (i != 2)
   2825 				goto err;
   2826 			size = sizeof (mddb_block_t) * dep->de_blkcount;
   2827 			if (bcmp((caddr_t)dep->de_blks,
   2828 			    (caddr_t)cde32p->de32_blks, size))
   2829 				goto err;
   2830 			dep = dep->de_next;
   2831 			if (cde32p->de32_next)
   2832 				cde32p = nextentry(cde32p);
   2833 			else
   2834 				cde32p = NULL;
   2835 		}
   2836 		if (dep || cde32p)
   2837 			goto err;
   2838 	}
   2839 	/*
   2840 	 * If here, all directories are functionally identical
   2841 	 * check to make sure all records are identical
   2842 	 * the reason the records are not just bcmped is that the
   2843 	 * lock flag does not want to be compared.
   2844 	 */
   2845 	crbp = (mddb_rb32_t *)cdb32p;
   2846 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
   2847 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
   2848 			if ((dep->de_flags & MDDB_F_OPT) ||
   2849 			    (dep->de_flags & MDDB_F_CHANGELOG))
   2850 				continue;
   2851 			rbp = (mddb_rb32_t *)dep->de_rb;
   2852 			if (readblklst(s, (caddr_t)crbp, dep->de_blks,
   2853 			    dep->de_blkcount, li, 0)) {
   2854 				retval = MDDB_F_EREAD;
   2855 				goto err;
   2856 			}
   2857 			/* Check the crc for this record */
   2858 			if (rec_crcchk(s, dep, crbp))
   2859 				goto err;
   2860 
   2861 			if (rbp->rb_checksum != crbp->rb_checksum ||
   2862 			    rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle)
   2863 				goto err;
   2864 		}
   2865 	}
   2866 	return (0);
   2867 err:
   2868 	return (retval);
   2869 }
   2870 
   2871 /*
   2872  * Determine if the location information for two mddbs is the same.
   2873  * The device slice and block offset should match.  If both have devids then
   2874  * use that for the comparison, otherwise we compare the dev_ts.
   2875  * Comparing with the devid allows us to handle the case where a mddb was
   2876  * relocated to a dead mddbs dev_t.  The live mddb will have the dev_t of
   2877  * the dead mddb but the devid comparison will catch this and not match.
   2878  *
   2879  * Return 1 if the location of the two mddbs match, 0 if not.
   2880  */
   2881 static int
   2882 match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev,
   2883 	daddr32_t blkno)
   2884 {
   2885 	if (rip->ri_flags & MDDB_F_EMASTER) {
   2886 		/*
   2887 		 * If this element is errored then we don't try to match on it.
   2888 		 * If we try to match we could erroneously match on the dev_t
   2889 		 * of a relocated disk.
   2890 		 */
   2891 		return (0);
   2892 	}
   2893 
   2894 	if (rip->ri_devid && devid && minor) {
   2895 		/*
   2896 		 * If old devid exists, then this is a replicated diskset
   2897 		 * and both old and new devids must be checked.
   2898 		 */
   2899 		if (rip->ri_old_devid) {
   2900 			if (((ddi_devid_compare(rip->ri_devid, devid) != 0) &&
   2901 			    (ddi_devid_compare(rip->ri_old_devid,
   2902 			    devid) != 0)) ||
   2903 			    (strcmp(rip->ri_minor_name, minor) != 0))
   2904 				return (0);
   2905 		} else {
   2906 			if (ddi_devid_compare(rip->ri_devid, devid) != 0 ||
   2907 			    strcmp(rip->ri_minor_name, minor) != 0)
   2908 				return (0);
   2909 		}
   2910 	} else {
   2911 		if (rip->ri_dev != dev)
   2912 			return (0);
   2913 	}
   2914 
   2915 	if (rip->ri_blkno != blkno)
   2916 		return (0);
   2917 
   2918 	return (1);
   2919 }
   2920 
   2921 static int
   2922 ridev(
   2923 	mddb_ri_t	**rip,
   2924 	mddb_cfg_loc_t	*clp,
   2925 	dev32_t		*dev_2b_fixed,
   2926 	int		flag)
   2927 {
   2928 	mddb_ri_t	*r, *r1;
   2929 	md_dev64_t	ldev, ndev;
   2930 	major_t		majordev;
   2931 	int		sz;
   2932 
   2933 	if (MD_UPGRADE) {
   2934 		ldev = md_makedevice(md_targ_name_to_major(clp->l_driver),
   2935 		    clp->l_mnum);
   2936 	} else {
   2937 		if (ddi_name_to_major(clp->l_driver) == (major_t)-1)
   2938 			return (EINVAL);
   2939 
   2940 		ldev = md_makedevice(ddi_name_to_major(clp->l_driver),
   2941 		    clp->l_mnum);
   2942 	}
   2943 
   2944 	if (clp->l_devid != 0) {
   2945 		/*
   2946 		 * Get dev associated with device id and minor name.
   2947 		 * Setup correct driver name if dev is now different.
   2948 		 * Don't change driver name if during upgrade.
   2949 		 */
   2950 		ndev = ldev;
   2951 		if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid,
   2952 		    &ndev, clp->l_minor_name)) {
   2953 			if ((ndev != ldev) && (!(MD_UPGRADE))) {
   2954 				majordev = md_getmajor(ndev);
   2955 				(void) strcpy(clp->l_driver,
   2956 				    ddi_major_to_name(majordev));
   2957 				clp->l_mnum = md_getminor(ndev);
   2958 				clp->l_devid_flags |= MDDB_DEVID_VALID;
   2959 				ldev = ndev;
   2960 			}
   2961 		} else {
   2962 			/* Mark as invalid */
   2963 			clp->l_devid_flags &= ~MDDB_DEVID_VALID;
   2964 		}
   2965 	}
   2966 
   2967 	clp->l_dev = md_cmpldev(ldev);
   2968 	if (dev_2b_fixed)
   2969 		*dev_2b_fixed = clp->l_dev;
   2970 	r = *rip;
   2971 
   2972 	while (r) {
   2973 		if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid,
   2974 		    clp->l_minor_name, ldev, clp->l_blkno)) {
   2975 			if ((clp->l_devid != 0) &&
   2976 			    !(clp->l_devid_flags & MDDB_DEVID_VALID)) {
   2977 				r->ri_flags |= MDDB_F_EMASTER;
   2978 			} else {
   2979 				r->ri_flags |= flag;
   2980 			}
   2981 			return (0);	/* already entered return success */
   2982 		}
   2983 		r = r->ri_next;
   2984 	}
   2985 
   2986 	/*
   2987 	 * This replica not represented in the current rip list,
   2988 	 * so add it to the list.
   2989 	 */
   2990 	r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP);
   2991 	r->ri_dev = ldev;
   2992 	r->ri_blkno = clp->l_blkno;
   2993 	(void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM);
   2994 	if (strlen(clp->l_driver) >= MD_MAXDRVNM) {
   2995 		r->ri_driver[(MD_MAXDRVNM -1)] = '\0';
   2996 	}
   2997 	if (clp->l_devname != NULL) {
   2998 		(void) strcpy(r->ri_devname, clp->l_devname);
   2999 	}
   3000 	r->ri_flags |= flag;
   3001 	if (clp->l_devid != 0) {
   3002 		sz = clp->l_devid_sz;
   3003 		r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP);
   3004 		bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz);
   3005 
   3006 		if (clp->l_old_devid != NULL) {
   3007 			sz = clp->l_old_devid_sz;
   3008 			r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz,
   3009 			    KM_SLEEP);
   3010 			bcopy((char *)(uintptr_t)clp->l_old_devid,
   3011 			    (char *)r->ri_old_devid, sz);
   3012 		} else {
   3013 			r->ri_old_devid = 0;
   3014 		}
   3015 		if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX)
   3016 			(void) strcpy(r->ri_minor_name, clp->l_minor_name);
   3017 
   3018 		if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) {
   3019 			/*
   3020 			 * Devid is present, but not valid.  This could
   3021 			 * happen if device has been powered off or if
   3022 			 * the device has been removed.  Mark the device in
   3023 			 * error.  Don't allow any writes to this device
   3024 			 * based on the dev_t since another device could
   3025 			 * have been placed in its spot and be responding to
   3026 			 * the dev_t accesses.
   3027 			 */
   3028 			r->ri_flags |= MDDB_F_EMASTER;
   3029 		}
   3030 	} else {
   3031 		r->ri_devid = 0;
   3032 		r->ri_old_devid = 0;
   3033 	}
   3034 
   3035 	/*
   3036 	 * If the rip list is empty then this entry
   3037 	 * is the list.
   3038 	 */
   3039 	if (*rip == NULL) {
   3040 		*rip = r;
   3041 		return (0);
   3042 	}
   3043 
   3044 	/*
   3045 	 * Add this entry to the end of the rip list
   3046 	 */
   3047 	r1 = *rip;
   3048 	while (r1->ri_next)
   3049 		r1 = r1->ri_next;
   3050 	r1->ri_next = r;
   3051 	return (0);
   3052 }
   3053 
   3054 /*
   3055  * writecopy writes the incore data blocks out to all of the replicas.
   3056  * This is called from writestart
   3057  *	- when a diskset is started or
   3058  *	- when an error has been enountered during the write to a mddb.
   3059  * and from newdev when a new mddb is being added.
   3060  *
   3061  * flag can be 2 values:
   3062  *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
   3063  *		always used for traditional and local disksets.
   3064  *		For MN diskset:
   3065  *			All nodes can call writecopy, but only the
   3066  *			master node actually writes data to the disk
   3067  *			except for optimized resync records.
   3068  *			An optimized resync record can only be written to
   3069  *			by the record owner.
   3070  *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
   3071  *		master has been chosen, the new master may need to
   3072  * 		write its incore mddb to disk (this is the case where the
   3073  *		old master had executed a message but hadn't relayed it
   3074  *		to this slave yet).  New master should not write the
   3075  *		change log records since new master would be overwriting
   3076  *		valuable data.  Only used during a reconfig cycle.
   3077  */
   3078 static int
   3079 writecopy(
   3080 	mddb_set_t	*s,
   3081 	int		li,
   3082 	int		flag
   3083 )
   3084 {
   3085 	mddb_db_t	*dbp;
   3086 	mddb_db32_t	*db32p;
   3087 	mddb_de_ic_t	*dep;
   3088 	mddb_rb32_t	*rbp;
   3089 	uint_t		checksum;
   3090 	int		err = 0;
   3091 
   3092 #if defined(_ILP32) && !defined(lint)
   3093 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
   3094 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
   3095 #endif
   3096 
   3097 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
   3098 		db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
   3099 		create_db32rec(db32p, dbp);
   3100 		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
   3101 		err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li,
   3102 		    MDDB_WR_ONLY_MASTER);
   3103 		kmem_free((caddr_t)db32p, MDDB_BSIZE);
   3104 		if (err)
   3105 			return (err);
   3106 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
   3107 			/*
   3108 			 * In a multinode diskset, when a new master is
   3109 			 * chosen the new master may need to write its
   3110 			 * incore copy of the mddb to disk.  In this case,
   3111 			 * don't want to overwrite the change log records
   3112 			 * so new master sets flag to MDDB_WRITECOPY_SYNC.
   3113 			 */
   3114 			if (flag == MDDB_WRITECOPY_SYNC) {
   3115 				if (dep->de_flags & MDDB_F_CHANGELOG)
   3116 					continue;
   3117 			}
   3118 			/*
   3119 			 * In a multinode diskset, don't write out optimized
   3120 			 * resync resyncs since only the mirror owner node
   3121 			 * will have the correct data.  If writecopy is
   3122 			 * being called from writestart as a result of
   3123 			 * an mddb failure, then writestart will handle
   3124 			 * the optimized records when it calls fixoptrecords.
   3125 			 */
   3126 			if ((MD_MNSET_SETNO(s->s_setno)) &&
   3127 			    (dep->de_flags & MDDB_F_OPT)) {
   3128 				continue;
   3129 			}
   3130 
   3131 			rbp = dep->de_rb;
   3132 			checksum = rbp->rb_checksum_fiddle;
   3133 			checksum ^= rbp->rb_checksum;
   3134 			/* Generate the crc for this record */
   3135 			rec_crcgen(s, dep, rbp);
   3136 			checksum ^= rbp->rb_checksum;
   3137 			rbp->rb_checksum_fiddle = checksum;
   3138 			if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
   3139 			    dep->de_blkcount, li, (mddb_bf_t **)0,
   3140 			    MDDB_WR_ONLY_MASTER))
   3141 				return (err);
   3142 		}
   3143 	}
   3144 	return (0);
   3145 }
   3146 
   3147 static int
   3148 upd_med(
   3149 	mddb_set_t	*s,
   3150 	char		*tag
   3151 )
   3152 {
   3153 	med_data_t	meddb;
   3154 	int		medok;
   3155 	mddb_lb_t	*lbp = s->s_lbp;
   3156 	set_t		setno = s->s_setno;
   3157 	int		li;
   3158 	int		alc;
   3159 	int		lc;
   3160 
   3161 
   3162 	/* If no mediator hosts, nothing to do */
   3163 	if (s->s_med.n_cnt == 0)
   3164 		return (0);
   3165 
   3166 	/*
   3167 	 * If this is a MN set and we are not the master, then don't
   3168 	 * update mediator hosts or mark mediator as golden since
   3169 	 * only master node should do that.
   3170 	 */
   3171 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
   3172 	    (md_set[setno].s_am_i_master == 0)) {
   3173 		return (0);
   3174 	}
   3175 
   3176 	bzero((char *)&meddb, sizeof (med_data_t));
   3177 	meddb.med_dat_mag = MED_DATA_MAGIC;
   3178 	meddb.med_dat_rev = MED_DATA_REV;
   3179 	meddb.med_dat_fl = 0;
   3180 	meddb.med_dat_sn = setno;
   3181 	meddb.med_dat_cc = lbp->lb_commitcnt;
   3182 	TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime);
   3183 	crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
   3184 
   3185 	/* count accessible mediators */
   3186 	medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
   3187 
   3188 	/* count accessible and existing replicas */
   3189 	for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) {
   3190 		mddb_locator_t	*lp = &lbp->lb_locators[li];
   3191 
   3192 		if (lp->l_flags & MDDB_F_DELETED)
   3193 			continue;
   3194 
   3195 		lc++;
   3196 
   3197 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
   3198 		    (lp->l_flags & MDDB_F_EMASTER) ||
   3199 		    (lp->l_flags & MDDB_F_EWRITE))
   3200 			continue;
   3201 
   3202 		alc++;
   3203 	}
   3204 
   3205 	/*
   3206 	 * Mediator update quorum is >= 50%: check for less than
   3207 	 * "mediator update" quorum.
   3208 	 */
   3209 	if ((medok * 2) < s->s_med.n_cnt) {
   3210 		/* panic if <= 50% of all replicas are accessible */
   3211 		if ((lc > 0) && ((alc * 2) <= lc)) {
   3212 			cmn_err(CE_PANIC,
   3213 			    "md: Update of 50%% of the mediator hosts failed");
   3214 			/* NOTREACHED */
   3215 		}
   3216 
   3217 		cmn_err(CE_WARN,
   3218 		    "md: Update of 50%% of the mediator hosts failed");
   3219 	}
   3220 
   3221 	/*
   3222 	 * If we have mediator update quorum and exactly 50% of the replicas
   3223 	 * are accessible then mark the mediator as golden.
   3224 	 */
   3225 	if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) &&
   3226 	    ((alc * 2) == lc)) {
   3227 		meddb.med_dat_fl = MED_DFL_GOLDEN;
   3228 		crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
   3229 		(void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
   3230 	}
   3231 
   3232 	return (0);
   3233 }
   3234 
   3235 static int
   3236 push_lb(mddb_set_t *s)
   3237 {
   3238 	mddb_lb_t	*lbp = s->s_lbp;
   3239 
   3240 	/* push the change to all the replicas */
   3241 	uniqtime32(&lbp->lb_timestamp);
   3242 	if (MD_MNSET_SETNO(s->s_setno)) {
   3243 		lbp->lb_revision = MDDB_REV_MNLB;
   3244 	} else {
   3245 		lbp->lb_revision = MDDB_REV_LB;
   3246 	}
   3247 	/*
   3248 	 * The updates to the mediator hosts are done
   3249 	 * by the callers of this function.
   3250 	 */
   3251 	return (writelocall(s));
   3252 }
   3253 
   3254 /* Should not call for MN diskset since data tags are not supported */
   3255 static int
   3256 dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp)
   3257 {
   3258 	int 		diff = 0;
   3259 
   3260 	diff = (int)(odtp->dt_setno - ndtp->dt_setno);
   3261 	if (diff)
   3262 		return (diff);
   3263 
   3264 	diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN);
   3265 	if (diff)
   3266 		return (diff);
   3267 
   3268 	diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1);
   3269 	if (diff)
   3270 		return (diff);
   3271 
   3272 	/*CSTYLED*/
   3273 	return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=));
   3274 }
   3275 
   3276 /* Should not call for MN diskset since data tags are not supported */
   3277 static int
   3278 dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp)
   3279 {
   3280 	int		nextid = 0;
   3281 	mddb_dtag_lst_t **dtlpp = &s->s_dtlp;
   3282 
   3283 	/* Run to the end of the list */
   3284 	for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) {
   3285 		if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0)
   3286 			return (0);
   3287 		nextid++;
   3288 	}
   3289 
   3290 	/* Add the new member */
   3291 	*dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP);
   3292 
   3293 	/* Update the dtag portion of the list */
   3294 	bcopy((caddr_t)ndtp, (caddr_t)&((*dtlpp)->dtl_dt),
   3295 	    sizeof (mddb_dtag_t));
   3296 
   3297 	/* Fix up the id value */
   3298 	(*dtlpp)->dtl_dt.dt_id = ++nextid;
   3299 
   3300 	return (0);
   3301 }
   3302 
   3303 /*
   3304  * Even though data tags are not supported in MN disksets, dt_cntl may
   3305  * be called for a MN diskset since this routine is called even before
   3306  * it is known the kind of diskset being read in from disk.
   3307  * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned.
   3308  */
   3309 static int
   3310 dtl_cntl(mddb_set_t *s)
   3311 {
   3312 	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
   3313 	int		ndt = 0;
   3314 
   3315 	while (dtlp != NULL) {
   3316 		ndt++;
   3317 		dtlp = dtlp->dtl_nx;
   3318 	}
   3319 
   3320 	return (ndt);
   3321 }
   3322 
   3323 /*
   3324  * Even though data tags are not supported in MN disksets, dt_cntl may
   3325  * be called for a MN diskset since this routine is called even before
   3326  * it is known the kind of diskset being read in from disk.
   3327  * For a MNdiskset, s_dtlp is 0 so a 0 is returned.
   3328  */
   3329 static mddb_dtag_t *
   3330 dtl_findl(mddb_set_t *s, int id)
   3331 {
   3332 	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
   3333 
   3334 	while (dtlp != NULL) {
   3335 		if (dtlp->dtl_dt.dt_id == id)
   3336 			return (&dtlp->dtl_dt);
   3337 		dtlp = dtlp->dtl_nx;
   3338 	}
   3339 	return ((mddb_dtag_t *)NULL);
   3340 }
   3341 
   3342 /* Should not call for MN diskset since data tags are not supported */
   3343 static void
   3344 dtl_freel(mddb_dtag_lst_t **dtlpp)
   3345 {
   3346 	mddb_dtag_lst_t	*dtlp;
   3347 	mddb_dtag_lst_t	*tdtlp;
   3348 
   3349 
   3350 	for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) {
   3351 		dtlp = tdtlp->dtl_nx;
   3352 		kmem_free(tdtlp, sizeof (mddb_dtag_lst_t));
   3353 	}
   3354 	*dtlpp = (mddb_dtag_lst_t *)NULL;
   3355 }
   3356 
   3357 /*
   3358  * Even though data tags are not supported in MN disksets, dt_setup will
   3359  * be called for a MN diskset since this routine is called even before
   3360  * it is known the kind of diskset being read in from disk.
   3361  * Once this set is known as a MN diskset, the dtp area will be freed.
   3362  */
   3363 static void
   3364 dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp)
   3365 {
   3366 	mddb_dt_t	*dtp;
   3367 	set_t		setno = s->s_setno;
   3368 
   3369 
   3370 	if (md_set[setno].s_dtp == (mddb_dt_t *)NULL)
   3371 		md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
   3372 	else if (dtagp == (mddb_dtag_t *)NULL)
   3373 		bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
   3374 
   3375 	/* shorthand */
   3376 	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
   3377 
   3378 	dtp->dt_mag = MDDB_MAGIC_DT;
   3379 	dtp->dt_rev = MDDB_REV_DT;
   3380 
   3381 	if (dtagp != NULL)
   3382 		dtp->dt_dtag = *dtagp;		/* structure assignment */
   3383 
   3384 	/* Initialize the setno */
   3385 	dtp->dt_dtag.dt_setno = setno;
   3386 
   3387 	/* Clear the id and flags, this is only used in user land */
   3388 	dtp->dt_dtag.dt_id = 0;
   3389 
   3390 	/* Checksum it */
   3391 	crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
   3392 }
   3393 
   3394 /* Should not call for MN diskset since data tags are not supported */
   3395 static int
   3396 set_dtag(mddb_set_t *s, md_error_t *ep)
   3397 {
   3398 	mddb_lb_t	*lbp = s->s_lbp;
   3399 	mddb_dtag_t	tag;
   3400 
   3401 	if (lbp->lb_dtblkcnt == 0) {
   3402 		/* Data tags not used in a MN set - so no failure returned */
   3403 		if (lbp->lb_flags & MDDB_MNSET)
   3404 			return (0);
   3405 
   3406 		cmn_err(CE_WARN,
   3407 		    "No tag record allocated, unable to tag data");
   3408 		(void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno);
   3409 		return (1);
   3410 	}
   3411 
   3412 	/* Clear the stack variable */
   3413 	bzero((caddr_t)&tag, sizeof (mddb_dtag_t));
   3414 
   3415 	/* Get the HW serial number for this host */
   3416 	(void) snprintf(tag.dt_sn, MDDB_SN_LEN, "%u", zone_get_hostid(NULL));
   3417 	tag.dt_sn[MDDB_SN_LEN - 1] = '\0';
   3418 
   3419 	/* Get the nodename that this host goes by */
   3420 	(void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME);
   3421 	tag.dt_hn[MD_MAX_NODENAME] = '\0';
   3422 
   3423 	/* Get a time stamp for NOW */
   3424 	uniqtime32(&tag.dt_tv);
   3425 
   3426 	/* Setup the data tag record */
   3427 	dt_setup(s, &tag);
   3428 
   3429 	/* Free any list of tags if they exist */
   3430 	dtl_freel(&s->s_dtlp);
   3431 
   3432 	/* Put the new tag onto the tag list */
   3433 	(void) dtl_addl(s, &tag);
   3434 
   3435 	return (0);
   3436 }
   3437 
   3438 /*
   3439  * If called during upgrade, this routine expects a non-translated
   3440  * (aka target) dev.
   3441  * Should not call for MN diskset since data tags are not supported.
   3442  */
   3443 static int
   3444 dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip)
   3445 {
   3446 	int		err = 0;
   3447 	md_dev64_t	dev;
   3448 	caddr_t		tbuf;
   3449 	daddr_t		physblk;
   3450 	mddb_block_t	blk;
   3451 	mddb_dt_t	*dtp;
   3452 	mddb_dtag_t	*dtagp;
   3453 	set_t		setno = s->s_setno;
   3454 
   3455 	/* If have not allocated a data tag record, there is nothing to do */
   3456 	if (lbp->lb_dtblkcnt == 0)
   3457 		return (1);
   3458 
   3459 	dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
   3460 
   3461 	if (dtp == (mddb_dt_t *)NULL)
   3462 		return (1);
   3463 
   3464 	/* shorthand */
   3465 	dev = md_xlate_targ_2_mini(rip->ri_dev);
   3466 	if (dev == NODEV64) {
   3467 		return (1);
   3468 	}
   3469 
   3470 	tbuf = (caddr_t)rip->ri_dtp;
   3471 
   3472 	for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) {
   3473 		physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip);
   3474 		err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0);
   3475 		/* error reading the tag */
   3476 		if (err) {
   3477 			err = 1;
   3478 			goto out;
   3479 		}
   3480 		tbuf += MDDB_BSIZE;
   3481 	}
   3482 
   3483 	/* magic is valid? */
   3484 	if (dtp->dt_mag != MDDB_MAGIC_DT) {
   3485 		err = 1;
   3486 		goto out;
   3487 	}
   3488 
   3489 	/* revision is valid? */
   3490 	if (revchk(MDDB_REV_DT, dtp->dt_rev)) {
   3491 		err = 1;
   3492 		goto out;
   3493 	}
   3494 
   3495 	/* crc is valid? */
   3496 	if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) {
   3497 		err = 1;
   3498 		goto out;
   3499 	}
   3500 
   3501 	/* shorthand */
   3502 	dtagp = &dtp->dt_dtag;
   3503 
   3504 	/* set number match? */
   3505 	if (dtagp->dt_setno != setno) {
   3506 		err = 1;
   3507 		goto out;
   3508 	}
   3509 
   3510 	/* tag is not empty? */
   3511 	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
   3512 	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
   3513 	    dtagp->dt_id == 0) {
   3514 		err = 2;
   3515 		goto out;
   3516 	}
   3517 
   3518 	/* Mark the locator as having tagged data */
   3519 	rip->ri_flags |= MDDB_F_TAGDATA;
   3520 
   3521 out:
   3522 	if (err) {
   3523 		if (err == 1) {
   3524 			md_set_setstatus(setno, MD_SET_BADTAG);
   3525 			rip->ri_flags |= MDDB_F_BADTAG;
   3526 		}
   3527 		if (dtp != NULL) {
   3528 			kmem_free(dtp, MDDB_DT_BYTES);
   3529 			rip->ri_dtp = (mddb_dt_t *)NULL;
   3530 		}
   3531 	}
   3532 
   3533 	return (err);
   3534 }
   3535 
   3536 /* Should not call for MN diskset since data tags are not supported */
   3537 static int
   3538 dt_write(mddb_set_t *s)
   3539 {
   3540 	int		li;
   3541 	int		err = 0;
   3542 	int		werr;
   3543 	int		empty_tag = 0;
   3544 	mddb_dtag_t	*dtagp;
   3545 	mddb_dt_t	*dtp;
   3546 	mddb_lb_t	*lbp = s->s_lbp;
   3547 	set_t		setno = s->s_setno;
   3548 	uint_t		set_status = md_get_setstatus(setno);
   3549 
   3550 
   3551 	ASSERT(md_set[setno].s_dtp != NULL);
   3552 
   3553 	/* Nowhere to write to */
   3554 	if (lbp->lb_dtblkcnt == 0)
   3555 		return (err);
   3556 
   3557 	if (set_status & MD_SET_BADTAG)
   3558 		return (err);
   3559 
   3560 	/* shorthand */
   3561 	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
   3562 	dtagp = &dtp->dt_dtag;
   3563 
   3564 	/* See if the tag is empty. */
   3565 	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
   3566 	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
   3567 	    dtagp->dt_id == 0)
   3568 		empty_tag = 1;
   3569 
   3570 	/* Write the tag to the locators and reset appropriate flags. */
   3571 	for (li = 0; li < lbp->lb_loccnt; li++) {
   3572 		mddb_locator_t	*lp = &lbp->lb_locators[li];
   3573 
   3574 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
   3575 		    (lp->l_flags & MDDB_F_DELETED) ||
   3576 		    (lp->l_flags & MDDB_F_EWRITE))
   3577 			continue;
   3578 
   3579 		werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk,
   3580 		    MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);
   3581 
   3582 		if (werr) {
   3583 			err |= werr;
   3584 			continue;
   3585 		}
   3586 
   3587 		if (empty_tag)
   3588 			lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA);
   3589 		else {
   3590 			lp->l_flags |= MDDB_F_TAGDATA;
   3591 			lp->l_flags &= ~MDDB_F_BADTAG;
   3592 		}
   3593 	}
   3594 
   3595 	if (err)
   3596 		return (err);
   3597 
   3598 
   3599 	/* If the tags were written, check to see if any tags remain. */
   3600 	for (li = 0; li < lbp->lb_loccnt; li++) {
   3601 		mddb_locator_t	*lp = &lbp->lb_locators[li];
   3602 
   3603 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
   3604 		    (lp->l_flags & MDDB_F_DELETED) ||
   3605 		    (lp->l_flags & MDDB_F_EWRITE))
   3606 			continue;
   3607 
   3608 		if (lp->l_flags & MDDB_F_TAGDATA)
   3609 			break;
   3610 	}
   3611 
   3612 	/* If there are no tags, then clear CLRTAG and TAGDATA */
   3613 	if (li == lbp->lb_loccnt) {
   3614 		md_clr_setstatus(setno, MD_SET_CLRTAG);
   3615 		md_clr_setstatus(setno, MD_SET_TAGDATA);
   3616 	}
   3617 
   3618 	return (err);
   3619 }
   3620 
   3621 /* Should not call for MN diskset since data tags are not supported */
   3622 static int
   3623 dt_alloc_if_needed(mddb_set_t *s)
   3624 {
   3625 	int		i;
   3626 	int		li;
   3627 	int		moveit = 0;
   3628 	mddb_lb_t	*lbp = s->s_lbp;
   3629 	mddb_block_t	blkcnt = lbp->lb_dtblkcnt;
   3630 	set_t		setno = s->s_setno;
   3631 	uint_t		set_status = md_get_setstatus(setno);
   3632 
   3633 	/*
   3634 	 * If the data tag record is allocated (blkcnt != 0) and a bad tag was
   3635 	 * not detected, there is nothing to do.
   3636 	 */
   3637 	if (blkcnt != 0 && ! (set_status & MD_SET_BADTAG))
   3638 		return (0);
   3639 
   3640 	/* Bitmap not setup, checks can't be done */
   3641 	if (s->s_totalblkcnt == 0)
   3642 		return (0);
   3643 
   3644 	/* While reading the tag(s) an invalid tag data record was seen */
   3645 	if (set_status & MD_SET_BADTAG)
   3646 		/* See if the invalid tag needs to be moved */
   3647 		for (i = 0; i < MDDB_DT_BLOCKS; i++)
   3648 			if (blkcheck(s, (i + lbp->lb_dtfirstblk))) {
   3649 				moveit = 1;
   3650 				break;
   3651 			}
   3652 
   3653 	/* Need to move or allocate the tag data record */
   3654 	if (moveit || blkcnt == 0) {
   3655 		lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
   3656 		if (lbp->lb_dtfirstblk == 0) {
   3657 			cmn_err(CE_WARN,
   3658 			    "Unable to allocate data tag record");
   3659 			return (0);
   3660 		}
   3661 		lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;
   3662 
   3663 		/* Mark the locators so that they get written to disk. */
   3664 		for (li = 0; li < lbp->lb_loccnt; li++) {
   3665 			mddb_locator_t	*lp = &lbp->lb_locators[li];
   3666 
   3667 			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
   3668 			    (lp->l_flags & MDDB_F_DELETED) ||
   3669 			    (lp->l_flags & MDDB_F_EWRITE))
   3670 				continue;
   3671 
   3672 			lp->l_flags |= MDDB_F_BADTAG;
   3673 		}
   3674 		return (1);
   3675 	}
   3676 
   3677 	/*
   3678 	 * Make sure the blocks are owned, since the calculation in
   3679 	 * computefreeblks() is bypassed when MD_SET_BADTAG is set.
   3680 	 */
   3681 	for (i = 0; i < MDDB_DT_BLOCKS; i++)
   3682 		blkbusy(s, (i + lbp->lb_dtfirstblk));
   3683 
   3684 	return (1);
   3685 }
   3686 
   3687 /*
   3688  * Writestart writes the incore mddb out to all of the replicas.
   3689  * This is called when a diskset is started and when an error has
   3690  * been enountered during the write to a mddb.
   3691  *
   3692  * flag can be 2 values:
   3693  *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
   3694  *		always used for traditional and local disksets.
   3695  *		This is the normal path for MN disksets since the slave
   3696  *		nodes aren't actually allowed to write to disk.
   3697  *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
   3698  *		master has been chosen, the new master may need to
   3699  * 		write its incore mddb to disk (this is the case where the
   3700  *		old master had executed a message but hadn't relayed it
   3701  *		to this slave yet).  New master should not write the
   3702  *		change log records since new master would be overwriting
   3703  *		valuable data.  Only used during a reconfig cycle.
   3704  */
   3705 static int
   3706 writestart(
   3707 	mddb_set_t	*s,
   3708 	int		flag
   3709 )
   3710 {
   3711 	int		li;
   3712 	mddb_locator_t	*lp;
   3713 	mddb_lb_t	*lbp;
   3714 	mddb_ln_t	*lnp;
   3715 	int		err = 0;
   3716 	uint_t		set_status;
   3717 
   3718 	lbp = s->s_lbp;
   3719 
   3720 	for (li = 0; li < lbp->lb_loccnt; li++) {
   3721 		lp = &lbp->lb_locators[li];
   3722 		if (! (lp->l_flags & MDDB_F_ACTIVE))
   3723 			continue;
   3724 		if (! (lp->l_flags & MDDB_F_SUSPECT))
   3725 			continue;
   3726 		if (writecopy(s, li, flag))
   3727 			return (1);
   3728 		lp->l_flags |= MDDB_F_UP2DATE;
   3729 	}
   3730 
   3731 	for (li = 0; li < lbp->lb_loccnt; li++) {
   3732 		lp = &lbp->lb_locators[li];
   3733 		if (! (lp->l_flags & MDDB_F_ACTIVE))
   3734 			continue;
   3735 		if ((lp->l_flags & MDDB_F_UP2DATE))
   3736 			continue;
   3737 		if (checkcopy(s, li))
   3738 			if (err = writecopy(s, li, flag))
   3739 				return (1);
   3740 		lp->l_flags |= MDDB_F_UP2DATE;
   3741 	}
   3742 
   3743 	/*
   3744 	 * Call fixoptrecord even during a reconfig cycle since a replica
   3745 	 * failure may force the master to re-assign the optimized
   3746 	 * resync record to another replica.
   3747 	 */
   3748 	if (fixoptrecords(s))
   3749 		return (1);
   3750 
   3751 	set_status = md_get_setstatus(s->s_setno);
   3752 
   3753 	/* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */
   3754 	for (li = 0; li < lbp->lb_loccnt; li++) {
   3755 		lp = &lbp->lb_locators[li];
   3756 
   3757 		if (lp->l_flags & MDDB_F_DELETED)
   3758 			continue;
   3759 
   3760 		if (((lp->l_flags & MDDB_F_ACTIVE) != 0 &&
   3761 		    (lp->l_flags & MDDB_F_OLDACT) == 0) ||
   3762 		    ((lp->l_flags & MDDB_F_ACTIVE) == 0 &&
   3763 		    (lp->l_flags & MDDB_F_OLDACT) != 0))
   3764 			break;
   3765 
   3766 		if ((set_status & MD_SET_TAGDATA) ||
   3767 		    (set_status & MD_SET_CLRTAG))
   3768 			if ((lp->l_flags & MDDB_F_TAGDATA) ||
   3769 			    (lp->l_flags & MDDB_F_BADTAG))
   3770 				break;
   3771 	}
   3772 
   3773 	/*
   3774 	 * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT)
   3775 	 * the lbp identifier and the set identifier doesn't match.
   3776 	 */
   3777 	if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) {
   3778 
   3779 		/* Only call for traditional and local sets */
   3780 		if (!(lbp->lb_flags & MDDB_MNSET))
   3781 			(void) dt_write(s);
   3782 
   3783 		setidentifier(s, &lbp->lb_ident);
   3784 
   3785 		if (err = push_lb(s)) {
   3786 			(void) upd_med(s, "writestart(0)");
   3787 			return (err);
   3788 		}
   3789 
   3790 		(void) upd_med(s, "writestart(0)");
   3791 
   3792 		if (err = push_lb(s)) {
   3793 			(void) upd_med(s, "writestart(1)");
   3794 			return (err);
   3795 		}
   3796 
   3797 		(void) upd_med(s, "writestart(1)");
   3798 
   3799 		lnp = s->s_lnp;
   3800 		uniqtime32(&lnp->ln_timestamp);
   3801 		if (lbp->lb_flags & MDDB_MNSET)
   3802 			lnp->ln_revision = MDDB_REV_MNLN;
   3803 		else
   3804 			lnp->ln_revision = MDDB_REV_LN;
   3805 		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
   3806 		err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
   3807 		    lbp->lb_lnblkcnt, 0);
   3808 		/*
   3809 		 * If a MN diskset and this is the master, set the PARSE_LOCNM
   3810 		 * flag in the mddb_set structure to show that the locator
   3811 		 * names have changed.
   3812 		 * Don't set parseflags as a result of a new master sync
   3813 		 * during reconfig cycle since slaves nodes are already
   3814 		 * in-sync with the new master.
   3815 		 */
   3816 
   3817 		if ((lbp->lb_flags & MDDB_MNSET) &&
   3818 		    (md_set[s->s_setno].s_am_i_master) &&
   3819 		    (flag != MDDB_WRITECOPY_SYNC)) {
   3820 			s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
   3821 		}
   3822 
   3823 		if (err)
   3824 			return (err);
   3825 	}
   3826 
   3827 	for (li = 0; li < lbp->lb_loccnt; li++) {
   3828 		lp = &lbp->lb_locators[li];
   3829 		if (lp->l_flags & MDDB_F_DELETED)
   3830 			continue;
   3831 		if (lp->l_flags & MDDB_F_ACTIVE) {
   3832 			lp->l_flags |= MDDB_F_OLDACT;
   3833 		} else {
   3834 			lp->l_flags &= ~MDDB_F_OLDACT;
   3835 		}
   3836 	}
   3837 
   3838 	md_clr_setstatus(s->s_setno, MD_SET_STALE);
   3839 
   3840 	return (0);
   3841 }
   3842 
   3843 /*
   3844  * selectreplicas selects the working replicas and may write the incore
   3845  * version of the mddb out to the replicas ondisk.
   3846  *
   3847  * flag can be 3 values:
   3848  *	MDDB_RETRYSCAN - quick scan to see if there is an error.
   3849  *			If no new error, returns without writing mddb
   3850  *			to disks.  If a new error is seen, writes out
   3851  *			mddb to disks.
   3852  *	MDDB_SCANALL  - lengthy scan to check out mddbs and always writes
   3853  *			out mddb to the replica ondisk.  Calls writecopy
   3854  *			with MDDB_WRITECOPY_ALL flag which writes out
   3855  *			all records to the replicas ondisk.
   3856  *	MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore
   3857  *			and ondisk mddbs by writing incore values to disk.
   3858  *			Calls writecopy with MDDB_WRITECOPY_SYNC flag so
   3859  *			that change log records are not written out.
   3860  *			Only used by MN disksets.
   3861  *
   3862  * Returns:
   3863  *	0 - Successful
   3864  *	1 - Unable to write incore mddb data to disk since < 50% replicas.
   3865  */
   3866 int
   3867 selectreplicas(
   3868 	mddb_set_t	*s,
   3869 	int		flag
   3870 )
   3871 {
   3872 	int		li;
   3873 	int		alc;
   3874 	int		lc;
   3875 	mddb_locator_t	*lp;
   3876 	mddb_lb_t	*lbp = s->s_lbp;
   3877 	set_t		setno = s->s_setno;
   3878 	int		wc_flag;
   3879 
   3880 	/*
   3881 	 * can never transition from stale to not stale
   3882 	 */
   3883 	if (md_get_setstatus(setno) & MD_SET_STALE) {
   3884 		for (li = 0; li < lbp->lb_loccnt; li++) {
   3885 			lp = &lbp->lb_locators[li];
   3886 			if (lp->l_flags & MDDB_F_DELETED)
   3887 				continue;
   3888 			if (! (lp->l_flags & MDDB_F_EMASTER)) {
   3889 				lp->l_flags |= MDDB_F_ACTIVE;
   3890 			} else {
   3891 				lp->l_flags &= ~MDDB_F_ACTIVE;
   3892 			}
   3893 		}
   3894 		return (1);
   3895 	}
   3896 
   3897 	if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) {
   3898 		for (li = 0; li < lbp->lb_loccnt; li++) {
   3899 			lp = &lbp->lb_locators[li];
   3900 			if (lp->l_flags & MDDB_F_DELETED)
   3901 				continue;
   3902 			if (lp->l_flags & MDDB_F_ACTIVE) {
   3903 				lp->l_flags |= MDDB_F_OLDACT;
   3904 				lp->l_flags &= ~MDDB_F_SUSPECT;
   3905 			} else {
   3906 				lp->l_flags |= MDDB_F_SUSPECT;
   3907 				lp->l_flags &= ~MDDB_F_OLDACT;
   3908 			}
   3909 
   3910 			if (! (lp->l_flags & MDDB_F_EMASTER)) {
   3911 				lp->l_flags |= MDDB_F_ACTIVE;
   3912 				lp->l_flags &= ~MDDB_F_EWRITE;
   3913 				lp->l_flags &= ~MDDB_F_TOOSMALL;
   3914 			} else {
   3915 				lp->l_flags &= ~MDDB_F_ACTIVE;
   3916 			}
   3917 		}
   3918 		computefreeblks(s); /* set up free block bits */
   3919 	} else {
   3920 		for (li = 0; li < lbp->lb_loccnt; li++) {
   3921 			lp = &lbp->lb_locators[li];
   3922 			if (! (lp->l_flags & MDDB_F_ACTIVE))
   3923 				continue;
   3924 			if (lp->l_flags & MDDB_F_EWRITE)
   3925 				break;
   3926 		}
   3927 
   3928 		/*
   3929 		 * if there are no errors this is error has already
   3930 		 * been processed return current state
   3931 		 */
   3932 		if (li == lbp->lb_loccnt)
   3933 			return (md_get_setstatus(setno) & MD_SET_TOOFEW);
   3934 
   3935 		lp->l_flags &= ~MDDB_F_ACTIVE;
   3936 		do {
   3937 			lp = &lbp->lb_locators[li];
   3938 			lp->l_flags &= ~MDDB_F_UP2DATE;
   3939 		} while (++li < lbp->lb_loccnt);
   3940 	}
   3941 
   3942 	alc = 0;
   3943 	lc = 0;
   3944 	for (li = 0; li < lbp->lb_loccnt; li++) {
   3945 		lp = &lbp->lb_locators[li];
   3946 		if (lp->l_flags & MDDB_F_DELETED)
   3947 			continue;
   3948 		lc++;
   3949 		if (! (lp->l_flags & MDDB_F_ACTIVE))
   3950 			continue;
   3951 		alc++;
   3952 	}
   3953 
   3954 	if (alc < ((lc + 1) / 2)) {
   3955 		md_set_setstatus(setno, MD_SET_TOOFEW);
   3956 		return (1);
   3957 	}
   3958 
   3959 	/* Set wc_flag based on flag passed in. */
   3960 	if (flag == MDDB_SCANALLSYNC)
   3961 		wc_flag = MDDB_WRITECOPY_SYNC;
   3962 	else
   3963 		wc_flag = MDDB_WRITECOPY_ALL;
   3964 
   3965 	do {
   3966 		if (! writestart(s, wc_flag)) {
   3967 			md_clr_setstatus(setno, MD_SET_TOOFEW);
   3968 			return (0);
   3969 		}
   3970 		alc  = 0;
   3971 		for (li = 0; li < lbp->lb_loccnt; li++) {
   3972 			lp = &lbp->lb_locators[li];
   3973 			if ((lp->l_flags & MDDB_F_DELETED) ||
   3974 			    (lp->l_flags & MDDB_F_EMASTER))
   3975 				continue;
   3976 
   3977 			if (lp->l_flags & MDDB_F_EWRITE) {
   3978 				lp->l_flags &= ~MDDB_F_ACTIVE;
   3979 				lp->l_flags &= ~MDDB_F_UP2DATE;
   3980 				continue;
   3981 			}
   3982 			alc++;
   3983 		}
   3984 	} while (alc >= ((lc + 1) / 2));
   3985 	md_set_setstatus(setno, MD_SET_TOOFEW);
   3986 	return (1);
   3987 }
   3988 
   3989 static int
   3990 checkstate(
   3991 	mddb_set_t	*s,
   3992 	int		probe
   3993 )
   3994 {
   3995 	int		error;
   3996 	uint_t		set_status = md_get_setstatus(s->s_setno);
   3997 
   3998 	ASSERT(s != NULL);
   3999 
   4000 	if (! (set_status & MD_SET_STALE) && ! (set_status & MD_SET_TOOFEW))
   4001 		return (0);
   4002 
   4003 	if (probe == MDDB_NOPROBE)
   4004 		return (1);
   4005 
   4006 	single_thread_start(s);
   4007 	error = selectreplicas(s, MDDB_SCANALL);
   4008 	single_thread_end(s);
   4009 
   4010 	if (error == 0 && s->s_zombie != 0) {
   4011 		mutex_exit(SETMUTEX(s->s_setno));
   4012 		error = mddb_deleterec(s->s_zombie);
   4013 		mutex_enter(SETMUTEX(s->s_setno));
   4014 		if (error == 0)
   4015 			s->s_zombie = 0;
   4016 	}
   4017 	return (error);
   4018 }
   4019 
   4020 static int
   4021 writeretry(
   4022 	mddb_set_t	*s
   4023 )
   4024 {
   4025 	if (selectreplicas(s, MDDB_RETRYSCAN))
   4026 		if (selectreplicas(s, MDDB_SCANALL))
   4027 			return (1);
   4028 	return (0);
   4029 }
   4030 
   4031 static void
   4032 free_mbipp(mddb_mb_ic_t **mbipp)
   4033 {
   4034 	mddb_mb_ic_t	*mbip1, *mbip2;
   4035 
   4036 	for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) {
   4037 		mbip2 = mbip1->mbi_next;
   4038 		kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE);
   4039 	}
   4040 	*mbipp = (mddb_mb_ic_t *)NULL;
   4041 }
   4042 
   4043 static mddb_ri_t *
   4044 save_rip(mddb_set_t *s)
   4045 {
   4046 	mddb_ri_t	*trip = s->s_rip;
   4047 	mddb_ri_t	*nrip = NULL;
   4048 	mddb_ri_t	**nripp = &nrip;
   4049 	mddb_ri_t	*rip;
   4050 
   4051 	while (trip) {
   4052 		/* Run to the end of the list */
   4053 		for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next)
   4054 			/* void */;
   4055 
   4056 		/* Add the new member */
   4057 		*nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP);
   4058 
   4059 		ASSERT(*nripp != NULL);
   4060 
   4061 		/* shorthand */
   4062 		rip = *nripp;
   4063 
   4064 		*rip = *trip;			/* structure assignment */
   4065 
   4066 		/* Clear the stuff that is not needed for hints */
   4067 		rip->ri_flags = 0;
   4068 		rip->ri_commitcnt = 0;
   4069 		rip->ri_transplant = 0;
   4070 		rip->ri_mbip = (mddb_mb_ic_t *)NULL;
   4071 		rip->ri_dtp = (mddb_dt_t *)NULL;
   4072 		rip->ri_lbp = (mddb_lb_t *)NULL;
   4073 		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
   4074 		rip->ri_devid = (ddi_devid_t)NULL;
   4075 		rip->ri_old_devid = (ddi_devid_t)NULL;
   4076 		rip->ri_next = (mddb_ri_t *)NULL;
   4077 
   4078 		trip = trip->ri_next;
   4079 	}
   4080 	return (nrip);
   4081 }
   4082 
   4083 static void
   4084 free_rip(mddb_ri_t **ripp)
   4085 {
   4086 	mddb_ri_t	*rip;
   4087 	mddb_ri_t	*arip;
   4088 
   4089 	for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) {
   4090 		arip = rip->ri_next;
   4091 		if (rip->ri_devid != (ddi_devid_t)NULL) {
   4092 			ddi_devid_free(rip->ri_devid);
   4093 			rip->ri_devid = (ddi_devid_t)NULL;
   4094 		}
   4095 		if (rip->ri_old_devid != (ddi_devid_t)NULL) {
   4096 			ddi_devid_free(rip->ri_old_devid);
   4097 			rip->ri_old_devid = (ddi_devid_t)NULL;
   4098 		}
   4099 		kmem_free((caddr_t)rip, sizeof (*rip));
   4100 	}
   4101 	*ripp = (mddb_ri_t *)NULL;
   4102 }
   4103 
/*
 * This routine selects the correct replica to use.
 * The rules are as follows:
 *	1.	if all replicas have the same init time, select the one
 *		with the highest commit count
 *	2.	if some but not all replicas are from another hostid,
 *		discard them.
 *	3.	find which init time is present in the most replicas
 *	4.	discard all replicas which do not match that init time
 *	5.	select the replica with the highest commit count
 */
   4114 
   4115 static mddb_lb_t *
   4116 selectlocator(
   4117 	mddb_set_t	*s
   4118 )
   4119 {
   4120 	mddb_ri_t	*rip = s->s_rip;
   4121 	mddb_ri_t	*r, *r1;
   4122 	mddb_lb_t	*lbp;
   4123 	struct timeval32 *tp = (struct timeval32 *)NULL;
   4124 	int		different;
   4125 	int		same;
   4126 	int		count;
   4127 	int		maxcount;
   4128 	set_t		setno = s->s_setno;
   4129 	size_t		sz;
   4130 	int		mn_set = 0;
   4131 
   4132 	/* Clear the ri_transplant flag on all the rip entries. */
   4133 	/* Set ri_commitcnt to locator's commitcnt - if available */
   4134 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
   4135 		r->ri_transplant = 0;
   4136 		if (r->ri_lbp != (mddb_lb_t *)NULL) {
   4137 			r->ri_commitcnt = r->ri_lbp->lb_commitcnt;
   4138 			/* If any locators have MN bit set, set flag */
   4139 			if (r->ri_lbp->lb_flags & MDDB_MNSET)
   4140 				mn_set = 1;
   4141 		}
   4142 	}
   4143 
   4144 	/*
   4145 	 * A data tag is being used, so use it to limit the selection first.
   4146 	 * Data tags not used in MN diskset.
   4147 	 */
   4148 	if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) {
   4149 		mddb_dt_t	*dtp = (mddb_dt_t *)md_set[setno].s_dtp;
   4150 
   4151 		/*
   4152 		 * now toss any locators that have a different data tag
   4153 		 */
   4154 		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
   4155 			if (r->ri_lbp == (mddb_lb_t *)NULL)
   4156 				continue;
   4157 
   4158 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
   4159 				/* If same tag, keep it */
   4160 				if (dtl_cmp(&dtp->dt_dtag,
   4161 				    &r->ri_dtp->dt_dtag) == 0)
   4162 					continue;
   4163 			}
   4164 
   4165 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
   4166 				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
   4167 				r->ri_dtp = (mddb_dt_t *)NULL;
   4168 			}
   4169 
   4170 			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
   4171 			if (!(md_get_setstatus(setno) &
   4172 			    MD_SET_REPLICATED_IMPORT)) {
   4173 				if (r->ri_old_devid != (ddi_devid_t)NULL) {
   4174 					sz = ddi_devid_sizeof(r->ri_old_devid);
   4175 					kmem_free((caddr_t)r->ri_old_devid, sz);
   4176 					r->ri_old_devid = (ddi_devid_t)NULL;
   4177 				}
   4178 			}
   4179 
   4180 			kmem_free((caddr_t)r->ri_lbp,
   4181 			    dbtob(r->ri_lbp->lb_blkcnt));
   4182 			r->ri_lbp = (mddb_lb_t *)NULL;
   4183 
   4184 			r->ri_transplant = 1;
   4185 		}
   4186 
   4187 		/* Tag used, clear the bit */
   4188 		md_clr_setstatus(s->s_setno, MD_SET_USETAG);
   4189 
   4190 		if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) {
   4191 			/*
   4192 			 * Get rid of the list of tags.
   4193 			 */
   4194 			dtl_freel(&s->s_dtlp);
   4195 
   4196 			/*
   4197 			 * Re-create the list with the tag used.
   4198 			 */
   4199 			(void) dtl_addl(s, &dtp->dt_dtag);
   4200 		}
   4201 	}
   4202 
   4203 	/*
   4204 	 * scan to see if all replicas have same time
   4205 	 */
   4206 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
   4207 		if (r->ri_lbp == (mddb_lb_t *)NULL)
   4208 			continue;
   4209 		if (tp == NULL) {
   4210 			tp = &r->ri_lbp->lb_inittime;
   4211 			continue;
   4212 		}
   4213 		/* CSTYLED */
   4214 		if (timercmp(tp, &r->ri_lbp->lb_inittime, !=))
   4215 			break;
   4216 	}
   4217 
   4218 	/*
   4219 	 * if r == NULL then they were all them same. Choose highest
   4220 	 * commit count
   4221 	 */
   4222 	if (r == (mddb_ri_t *)NULL)
   4223 		goto out;
   4224 
   4225 	/*
   4226 	 * If here, a bogus replica is present and at least 1 lb_inittime
   4227 	 * did not match.
   4228 	 */
   4229 
   4230 	/*
   4231 	 * look and see if any but not all are from different id
   4232 	 */
   4233 
   4234 	different = 0;
   4235 	same = 0;
   4236 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
   4237 		if (r->ri_lbp == (mddb_lb_t *)NULL)
   4238 			continue;
   4239 		if (cmpidentifier(s, &r->ri_lbp->lb_ident))
   4240 			different = 1;
   4241 		else
   4242 			same = 1;
   4243 	}
   4244 
   4245 	/*
   4246 	 * now go through and throw out different if there are some
   4247 	 * that are the same
   4248 	 */
   4249 	if (different != 0 && same != 0) {
   4250 		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
   4251 			if (r->ri_lbp == (mddb_lb_t *)NULL)
   4252 				continue;
   4253 
   4254 			if (!cmpidentifier(s, &r->ri_lbp->lb_ident))
   4255 				continue;
   4256 
   4257 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
   4258 				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
   4259 				r->ri_dtp = (mddb_dt_t *)NULL;
   4260 			}
   4261 
   4262 			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
   4263 			if (!(md_get_setstatus(setno) &
   4264 			    MD_SET_REPLICATED_IMPORT)) {
   4265 				if (r->ri_old_devid != (ddi_devid_t)NULL) {
   4266 					sz = ddi_devid_sizeof(r->ri_old_devid);
   4267 					kmem_free((caddr_t)r->ri_old_devid, sz);
   4268 					r->ri_old_devid = (ddi_devid_t)NULL;
   4269 				}
   4270 			}
   4271 
   4272 			kmem_free((caddr_t)r->ri_lbp,
   4273 			    dbtob(r->ri_lbp->lb_blkcnt));
   4274 			r->ri_lbp = (mddb_lb_t *)NULL;
   4275 
   4276 			r->ri_transplant = 1;
   4277 		}
   4278 	}
   4279 
   4280 	/*
   4281 	 * go through and pick highest. Use n square because it is
   4282 	 * simple and 40 some is max possible
   4283 	 */
   4284 	maxcount = 0;
   4285 	lbp = (mddb_lb_t *)NULL;
   4286 	for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) {
   4287 		if (r1->ri_lbp == (mddb_lb_t *)NULL)
   4288 			continue;
   4289 		count = 0;
   4290 		for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) {
   4291 			if (r->ri_lbp == (mddb_lb_t *)NULL)
   4292 				continue;
   4293 			if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */
   4294 			    &r->ri_lbp->lb_inittime, ==))
   4295 				count++;
   4296 		}
   4297 		if (count > maxcount) {
   4298 			maxcount = count;
   4299 			lbp = r1->ri_lbp;
   4300 		}
   4301 	}
   4302 
   4303 	/*
   4304 	 * now go though and toss any that are of a different time stamp
   4305 	 */
   4306 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
   4307 		if (r->ri_lbp == (mddb_lb_t *)NULL)
   4308 			continue;
   4309 		if (timercmp(&lbp->lb_inittime, /* CSTYLED */
   4310 		    &r->ri_lbp->lb_inittime, ==))
   4311 			continue;
   4312 
   4313 		if (r->ri_dtp != (mddb_dt_t *)NULL) {
   4314 			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
   4315 			r->ri_dtp = (mddb_dt_t *)NULL;
   4316 		}
   4317 
   4318 		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
   4319 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
   4320 			if (r->ri_old_devid != (ddi_devid_t)NULL) {
   4321 				sz = ddi_devid_sizeof(r->ri_old_devid);
   4322 				kmem_free((caddr_t)r->ri_old_devid, sz);
   4323 				r->ri_old_devid = (ddi_devid_t)NULL;
   4324 			}
   4325 		}
   4326 
   4327 		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
   4328 		r->ri_lbp = (mddb_lb_t *)NULL;
   4329 
   4330 		r->ri_transplant = 1;
   4331 	}
   4332 
   4333 out:
   4334 	/*
   4335 	 * Find the locator with the highest commit count, and make it the
   4336 	 * "chosen" one.
   4337 	 */
   4338 	lbp = (mddb_lb_t *)NULL;
   4339 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
   4340 		if (r->ri_lbp == (mddb_lb_t *)NULL)
   4341 			continue;
   4342 
   4343 		if (lbp == NULL) {
   4344 			lbp = r->ri_lbp;
   4345 			continue;
   4346 		}
   4347 
   4348 		if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt)
   4349 			lbp = r->ri_lbp;
   4350 	}
   4351 
   4352 	/* Toss all locator blocks, except the "chosen" one. */
   4353 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
   4354 		if (r->ri_lbp == (mddb_lb_t *)NULL)
   4355 			continue;
   4356 
   4357 		/* Get rid of all dtp's */
   4358 		if (r->ri_dtp != (mddb_dt_t *)NULL) {
   4359 			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
   4360 			r->ri_dtp = (mddb_dt_t *)NULL;
   4361 		}
   4362 
   4363 		if (r->ri_lbp == lbp)
   4364 			continue;
   4365 
   4366 		/* Get rid of extra locator devid block info */
   4367 		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
   4368 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
   4369 			if (r->ri_old_devid != (ddi_devid_t)NULL) {
   4370 				sz = ddi_devid_sizeof(r->ri_old_devid);
   4371 				kmem_free((caddr_t)r->ri_old_devid, sz);
   4372 				r->ri_old_devid = (ddi_devid_t)NULL;
   4373 			}
   4374 		}
   4375 
   4376 		/* Get rid of extra locators */
   4377 		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
   4378 		r->ri_lbp = (mddb_lb_t *)NULL;
   4379 	}
   4380 	return (lbp);
   4381 }
   4382 
   4383 static void
   4384 locator2cfgloc(
   4385 	mddb_lb_t		*lbp,
   4386 	mddb_cfg_loc_t		*clp,
   4387 	int			li,
   4388 	side_t			sideno,
   4389 	mddb_did_ic_t		*did_icp
   4390 )
   4391 {
   4392 	mddb_drvnm_t		*dn;
   4393 	mddb_locator_t		*lp = &lbp->lb_locators[li];
   4394 	mddb_sidelocator_t	*slp;
   4395 	mddb_mnsidelocator_t	*mnslp;
   4396 	mddb_did_info_t		*did_info;
   4397 	int 			i, sz, szalloc;
   4398 	int			mn_set = 0;
   4399 	mddb_mnlb_t		*mnlbp;
   4400 
   4401 	if (lbp->lb_flags & MDDB_MNSET) {
   4402 		mn_set = 1;
   4403 		mnlbp = (mddb_mnlb_t *)lbp;
   4404 		for (i = 0; i < MD_MNMAXSIDES; i++) {
   4405 			mnslp = &mnlbp->lb_mnsidelocators[i][li];
   4406 			if (mnslp->mnl_sideno == sideno)
   4407 				break;
   4408 		}
   4409 		if (i == MD_MNMAXSIDES)
   4410 			return;
   4411 	} else {
   4412 		slp = &lbp->lb_sidelocators[sideno][li];
   4413 	}
   4414 
   4415 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   4416 		did_info = &(did_icp->did_ic_blkp->blk_info[li]);
   4417 		if (did_info->info_flags & MDDB_DID_EXISTS) {
   4418 			sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
   4419 			if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
   4420 				/*
   4421 				 * copy device id from mddb to
   4422 				 * cfg_loc structure
   4423 				 */
   4424 				szalloc = clp->l_devid_sz;
   4425 				if (sz <= szalloc) {
   4426 					for (i = 0; i < sz; i++) {
   4427 						((char *)(uintptr_t)
   4428 						    clp->l_devid)[i] =
   4429 						    ((char *)did_icp->
   4430 						    did_ic_devid[li])[i];
   4431 					}
   4432 					clp->l_devid_flags |= MDDB_DEVID_VALID;
   4433 					(void) strcpy(clp->l_minor_name,
   4434 					    did_info->info_minor_name);
   4435 				} else {
   4436 					clp->l_devid_flags |=
   4437 					    MDDB_DEVID_NOSPACE;
   4438 				}
   4439 			} else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
   4440 				clp->l_devid_flags = MDDB_DEVID_SZ;
   4441 				clp->l_devid_sz = sz;
   4442 			}
   4443 		}
   4444 	}
   4445 
   4446 	/*
   4447 	 * Even if a devid exists, use the dev, drvnm and mnum in the locators
   4448 	 * and sidelocators.  During startup, the dev, drvnm and mnum in
   4449 	 * these structures may not match the devid (the locators and
   4450 	 * sidelocators will be updated to match the devid by the routine
   4451 	 * load_old_replicas).  Using out-of-sync values won't cause any
   4452 	 * problems since ridev will re-derive these from the devid and mnum.
   4453 	 * After startup, the dev, drvnm and mnum in these structures have
   4454 	 * been updated and can be used.
   4455 	 */
   4456 
   4457 	clp->l_blkno = lp->l_blkno;
   4458 	clp->l_flags = lp->l_flags;
   4459 	clp->l_dev = lp->l_dev;
   4460 
   4461 	if (mn_set) {
   4462 		dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
   4463 		clp->l_mnum = mnslp->mnl_mnum;
   4464 	} else {
   4465 		dn = &lbp->lb_drvnm[slp->l_drvnm_index];
   4466 		clp->l_mnum = slp->l_mnum;
   4467 	}
   4468 	(void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM);
   4469 }
   4470 
/*
 * Find the index into the mnsidelocator where entry will go.
 * Then index can be fed into both splitname2locatorblocks and
 * cfgloc2locator so that those entries can be kept in sync.
 *
 * Returns:
 *	0 if a traditional diskset
 *	-1 if a MN diskset and no matching or unused slot was found
 *	index, if successful in a MN diskset (0 <= index < MD_MNMAXSIDES)
 */
   4480 static int
   4481 checklocator(
   4482 	mddb_lb_t		*lbp,
   4483 	int			li,
   4484 	side_t			sideno
   4485 )
   4486 {
   4487 	uchar_t			i;
   4488 	mddb_mnsidelocator_t	*mnslp;
   4489 	mddb_mnlb_t		*mnlbp;
   4490 	int			index = -1;
   4491 
   4492 	if (lbp->lb_flags & MDDB_MNSET) {
   4493 		/*
   4494 		 * Checking side locator structure.  First, check if
   4495 		 * there is already an entry for this side.  If so,
   4496 		 * then use that entry.  Otherwise, find an entry
   4497 		 * that has a sideno of 0.
   4498 		 */
   4499 		mnlbp = (mddb_mnlb_t *)lbp;
   4500 		for (i = 0; i < MD_MNMAXSIDES; i++) {
   4501 			mnslp = &mnlbp->lb_mnsidelocators[i][li];
   4502 			if (mnslp->mnl_sideno == sideno) {
   4503 				/* Found a match - stop looking */
   4504 				index = i;
   4505 				break;
   4506 			} else if ((mnslp->mnl_sideno == 0) && (index == -1)) {
   4507 				/* Set first empty slot, but keep looking */
   4508 				index = i;
   4509 			}
   4510 		}
   4511 		/* Didn't find empty slot or previously used slot */
   4512 		if ((i == MD_MNMAXSIDES) && (index == -1)) {
   4513 			return (-1);
   4514 		}
   4515 		return (index);
   4516 	} else
   4517 		return (0);
   4518 }
   4519 
   4520 /*
   4521  * Takes locator information (driver name, minor number, sideno) and
   4522  * stores it in the locator block.
   4523  * For traditional diskset, the sideno is the index into the sidelocator
   4524  * array in the locator block.
   4525  * For the MN diskset, the sideno is the nodeid which can be any number,
   4526  * so the index passed in is the index into the mnsidelocator array
   4527  * in the locator block.
   4528  */
   4529 static int
   4530 cfgloc2locator(
   4531 	mddb_lb_t		*lbp,
   4532 	mddb_cfg_loc_t		*clp,
   4533 	int			li,
   4534 	side_t			sideno,
   4535 	int			index	/* Only useful in MNsets when > 1 */
   4536 )
   4537 {
   4538 	uchar_t			i;
   4539 	mddb_sidelocator_t	*slp;
   4540 	mddb_mnsidelocator_t	*mnslp;
   4541 	mddb_set_t		*s;
   4542 	int			mn_set = 0;
   4543 	mddb_mnlb_t		*mnlbp;
   4544 
   4545 	if (lbp->lb_flags & MDDB_MNSET) {
   4546 		mnlbp = (mddb_mnlb_t *)lbp;
   4547 		mn_set = 1;
   4548 		/*
   4549 		 * Index will be the slot that has the given sideno or
   4550 		 * the first empty slot if no match is found.
   4551 		 * This was pre-checked out in check locator.
   4552 		 */
   4553 		mnslp = &mnlbp->lb_mnsidelocators[index][li];
   4554 	} else {
   4555 		slp = &lbp->lb_sidelocators[sideno][li];
   4556 	}
   4557 
   4558 	/*
   4559 	 * Look for the driver name
   4560 	 */
   4561 	for (i = 0; i < MDDB_DRVNMCNT; i++) {
   4562 		if (lbp->lb_drvnm[i].dn_len == 0)
   4563 			continue;
   4564 		if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver,
   4565 		    MD_MAXDRVNM) == 0)
   4566 			break;
   4567 	}
   4568 
   4569 	/*
   4570 	 * Didn't find one, add a new one
   4571 	 */
   4572 	if (i == MDDB_DRVNMCNT) {
   4573 		for (i = 0; i < MDDB_DRVNMCNT; i++) {
   4574 			if (lbp->lb_drvnm[i].dn_len == 0)
   4575 				break;
   4576 		}
   4577 		if (i == MDDB_DRVNMCNT)
   4578 			return (1);
   4579 		(void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver,
   4580 		    MD_MAXDRVNM);
   4581 		lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver);
   4582 	}
   4583 
   4584 	/* Fill in the drvnm index */
   4585 	if (mn_set) {
   4586 		mnslp->mnl_drvnm_index = i;
   4587 		mnslp->mnl_mnum = clp->l_mnum;
   4588 		mnslp->mnl_sideno = sideno;
   4589 	} else {
   4590 		slp->l_drvnm_index = i;
   4591 		slp->l_mnum = clp->l_mnum;
   4592 	}
   4593 
   4594 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   4595 		/*
   4596 		 * This device id could already be associated with this index
   4597 		 * if this is not the first side added to the set.
   4598 		 * If device id is 0, there is no device id for this device.
   4599 		 */
   4600 		if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0)
   4601 			return (0);
   4602 		s = (mddb_set_t *)md_set[lbp->lb_setno].s_db;
   4603 		if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid,
   4604 		    clp->l_minor_name)) {
   4605 			return (1);
   4606 		}
   4607 	}
   4608 
   4609 	return (0);
   4610 }
   4611 
   4612 /*
   4613  * See if there are mediator hosts and try to use the data.
   4614  */
   4615 static int
   4616 mediate(
   4617 	mddb_set_t	*s
   4618 )
   4619 {
   4620 	mddb_lb_t	*lbp = s->s_lbp;
   4621 	med_data_lst_t	*meddlp = NULL;
   4622 	med_data_lst_t	*tmeddlp = NULL;
   4623 	med_data_t	*meddp;
   4624 	int		medok = 0;
   4625 	int		medacc = 0;
   4626 	uint_t		maxcc;
   4627 	int		golden = 0;
   4628 	int		err = 1;
   4629 	set_t		setno = s->s_setno;
   4630 
   4631 	/* Do not have a mediator, then the state is stale */
   4632 	if (s->s_med.n_cnt == 0)
   4633 		return (err);
   4634 
   4635 	/* Contact the mediator hosts for the data */
   4636 	meddlp = get_med_host_data(&s->s_med, s->s_setname, setno);
   4637 
   4638 	/* No mediator data, stale */
   4639 	if (meddlp == NULL)
   4640 		return (err);
   4641 
   4642 	/* Mark all the mediator data that is not for this set as errored */
   4643 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
   4644 		struct timeval32 tmptime;
   4645 		meddp = tmeddlp->mdl_med;
   4646 
   4647 		/* Count the number of mediators contacted */
   4648 		medacc++;
   4649 
   4650 		/* Paranoid check */
   4651 		if (meddp->med_dat_sn != setno)
   4652 			meddp->med_dat_fl |= MED_DFL_ERROR;
   4653 
   4654 		TIMEVAL_TO_TIMEVAL32(&tmptime, &meddp->med_dat_id);
   4655 
   4656 		/*CSTYLED*/
   4657 		if (timercmp(&tmptime, &lbp->lb_ident.createtime, !=))
   4658 			meddp->med_dat_fl |= MED_DFL_ERROR;
   4659 	}
   4660 
   4661 	/* Get the max commitcount */
   4662 	maxcc = 0;
   4663 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
   4664 		meddp = tmeddlp->mdl_med;
   4665 		if (meddp->med_dat_fl & MED_DFL_ERROR)
   4666 			continue;
   4667 		if (meddp->med_dat_cc > maxcc)
   4668 			maxcc = meddp->med_dat_cc;
   4669 	}
   4670 
   4671 	/* Now mark the records that don't have the highest cc as errored */
   4672 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
   4673 		meddp = tmeddlp->mdl_med;
   4674 		if (meddp->med_dat_fl & MED_DFL_ERROR)
   4675 			continue;
   4676 		if (meddp->med_dat_cc != maxcc)
   4677 			meddp->med_dat_fl |= MED_DFL_ERROR;
   4678 	}
   4679 
   4680 	/* Now mark the records that don't match the lb commitcnt as errored */
   4681 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
   4682 		meddp = tmeddlp->mdl_med;
   4683 		if (meddp->med_dat_fl & MED_DFL_ERROR)
   4684 			continue;
   4685 		if (meddp->med_dat_cc != lbp->lb_commitcnt)
   4686 			meddp->med_dat_fl |= MED_DFL_ERROR;
   4687 	}
   4688 
   4689 	/* Is there a "golden" copy and how many valid mediators */
   4690 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
   4691 		meddp = tmeddlp->mdl_med;
   4692 		if (meddp->med_dat_fl & MED_DFL_ERROR)
   4693 			continue;
   4694 
   4695 		if (meddp->med_dat_fl & MED_DFL_GOLDEN)
   4696 			golden++;
   4697 
   4698 		medok++;
   4699 	}
   4700 
   4701 	/* No survivors, stale */
   4702 	if (medok == 0)
   4703 		goto out;
   4704 
   4705 	/* No mediator quorum and no golden copies, stale */
   4706 	if (medacc < ((s->s_med.n_cnt / 2) + 1) && ! golden) {
   4707 		/* Skip odd numbers, no exact 50% */
   4708 		if (s->s_med.n_cnt & 1)
   4709 			goto out;
   4710 		/* Have 50%, allow an accept */
   4711 		if (medacc == (s->s_med.n_cnt / 2))
   4712 			md_set_setstatus(setno, MD_SET_ACCOK);
   4713 		goto out;
   4714 	}
   4715 
   4716 	/* We either have a quorum or a golden copy, or both */
   4717 	err = 0;
   4718 
   4719 out:
   4720 	if (meddlp) {
   4721 		for (/* void */; meddlp != NULL; meddlp = tmeddlp) {
   4722 			tmeddlp = meddlp->mdl_nx;
   4723 			kmem_free(meddlp->mdl_med, sizeof (med_data_t));
   4724 			kmem_free(meddlp, sizeof (med_data_lst_t));
   4725 		}
   4726 	}
   4727 
   4728 	return (err);
   4729 }
   4730 
/*
 *	1. read masterblks and locator blocks for all known database locations
 *		a. keep track of which have good master blks
 *		b. keep track of which have good locators
 *
 */
   4737 static int
   4738 get_mbs_n_lbs(
   4739 	mddb_set_t	*s,
   4740 	int		*write_lb
   4741 )
   4742 {
   4743 	mddb_lb_t	*lbp = NULL;		/* pointer to locator block */
   4744 						/* May be cast to mddb_mnlb_t */
   4745 						/* if accessing sidenames in */
   4746 						/* MN set */
   4747 	mddb_did_ic_t	*did_icp = NULL;	/* ptr to Device ID incore */
   4748 	mddb_did_blk_t	*did_blkp = 0;
   4749 	int		did_blkp_sz = 0;
   4750 	mddb_did_db_t	*did_dbp;
   4751 	mddb_did_info_t	*did_info;
   4752 	caddr_t		did_block;
   4753 	mddb_ri_t	*rip;
   4754 	mddb_dtag_lst_t	*dtlp;
   4755 	mddb_locator_t	*lp;
   4756 	daddr_t		physblk;
   4757 	int		li;
   4758 	uint_t		blk;
   4759 	md_dev64_t	dev;
   4760 	caddr_t		buffer;
   4761 	uint_t		lb_blkcnt;
   4762 	int		retval = 0;
   4763 	int		err = 0;
   4764 	int		lb_ok = 0;
   4765 	int		lb_total = 0;
   4766 	int		lb_tagged = 0;
   4767 	int		lb_tags;
   4768 	set_t		setno = s->s_setno;
   4769 	int		cont_flag, i;
   4770 	mddb_did_db_t	*did_dbp1, *did_dbp2;
   4771 	int		mn_set = 0;
   4772 	mddb_cfg_loc_t	*cl;
   4773 
   4774 	/*
   4775 	 * read in master blocks and locator block for all known locators.
   4776 	 * lb_blkcnt will be set correctly for MN set later once getmasters
   4777 	 * has determined that the set is a MN set.
   4778 	 */
   4779 	lb_blkcnt = ((setno == MD_LOCAL_SET) ? MDDB_LOCAL_LBCNT : MDDB_LBCNT);
   4780 
   4781 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
   4782 		rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
   4783 		    MDDB_F_EMASTER);
   4784 		rip->ri_lbp = (mddb_lb_t *)NULL;
   4785 		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
   4786 
   4787 		/*
   4788 		 * Translated dev is only used in calls to getmasters and
   4789 		 * getblks which expect a translated (aka miniroot) dev.
   4790 		 */
   4791 		dev = md_xlate_targ_2_mini(rip->ri_dev);
   4792 		if (dev == NODEV64) {
   4793 			/* Set error flag that getmasters would have set */
   4794 			/* if getmasters had been allowed to fail */
   4795 			rip->ri_flags |= MDDB_F_EMASTER;
   4796 		}
   4797 
   4798 		/*
   4799 		 * Invalid device id on system (due to failed or
   4800 		 * removed device) or invalid devt during upgrade
   4801 		 * (due to powered off device) will cause this
   4802 		 * replica to be marked in error and not used.
   4803 		 */
   4804 		if (rip->ri_flags & MDDB_F_EMASTER)
   4805 			continue;
   4806 
   4807 		/* get all master blocks, does mddb_devopen() */
   4808 		rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
   4809 		    &rip->ri_flags, &mn_set);
   4810 
   4811 		/* if invalid master block - try next replica */
   4812 		if (! rip->ri_mbip)
   4813 			continue;
   4814 
   4815 		/*
   4816 		 * If lbp alloc'd to wrong size - reset it.
   4817 		 * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
   4818 		 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
   4819 		 */
   4820 		if (lbp) {
   4821 			if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
   4822 			    ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
   4823 				kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
   4824 				lbp = (mddb_lb_t *)NULL;
   4825 			}
   4826 		}
   4827 
   4828 		if (lbp == (mddb_lb_t *)NULL) {
   4829 			/* If a MN set, set lb_blkcnt for MN loc blk size */
   4830 			if (mn_set)
   4831 				lb_blkcnt = MDDB_MNLBCNT;
   4832 			lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
   4833 			    KM_SLEEP);
   4834 		}
   4835 
   4836 		/*
   4837 		 * Read in all the sectors for the locator block
   4838 		 * NOTE: Need to use getblks, rather than readblklst.
   4839 		 *	because it is too early and things are
   4840 		 *	NOT set up yet for read*()'s
   4841 		 */
   4842 		buffer = (caddr_t)lbp;
   4843 		for (blk = 0; blk < lb_blkcnt; blk++) {
   4844 			physblk = getphysblk(blk, rip->ri_mbip);
   4845 			err = getblks(s, buffer, dev, physblk,
   4846 			    btodb(MDDB_BSIZE), 0);
   4847 			if (err) {
   4848 				rip->ri_flags |= err;
   4849 				break;
   4850 			}
   4851 			buffer += MDDB_BSIZE;
   4852 		}
   4853 
   4854 		if (err)
   4855 			continue;
   4856 
   4857 		/* Verify the locator block */
   4858 		if (blk != lb_blkcnt)
   4859 			continue;
   4860 		if (lbp->lb_magic != MDDB_MAGIC_LB)
   4861 			continue;
   4862 		if (lbp->lb_blkcnt != lb_blkcnt)
   4863 			continue;
   4864 		if (mn_set) {
   4865 			/* If a MN set, check for MNLB revision in lb. */
   4866 			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
   4867 				continue;
   4868 		} else {
   4869 			/* If not a MN set, check for LB revision in lb. */
   4870 			if (revchk(MDDB_REV_LB, lbp->lb_revision))
   4871 				continue;
   4872 		}
   4873 		if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
   4874 			continue;
   4875 
   4876 		/*
   4877 		 * With the addition of MultiNode Disksets, we must make sure
   4878 		 * to verify that this is the correct set.  A node could
   4879 		 * have been out of the config for awhile and this disk could
   4880 		 * have been moved to a different diskset and we don't want
   4881 		 * to accidentally start the wrong set.
   4882 		 *
   4883 		 * We don't do this check if we're in the middle of
   4884 		 * importing a set.
   4885 		 */
   4886 		if (!(md_get_setstatus(s->s_setno) &
   4887 		    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
   4888 		    (lbp->lb_setno != s->s_setno))
   4889 			continue;
   4890 
   4891 		rip->ri_flags |= MDDB_F_LOCACC;
   4892 
   4893 		/*
   4894 		 * a commit count of zero means this locator has been deleted
   4895 		 */
   4896 		if (lbp->lb_commitcnt == 0)
   4897 			continue;
   4898 
   4899 		/*
   4900 		 * If replica is in the device ID style and md_devid_destroy
   4901 		 * flag is set, turn off device id style.  This is only to be
   4902 		 * used in a catastrophic failure case.  Examples would be
   4903 		 * where the device id of all drives in the system
   4904 		 * (especially the mirror'd root drives) had been changed
   4905 		 * by firmware upgrade or by a patch to an existing disk
   4906 		 * driver.  Another example would be in the case of non-unique
   4907 		 * device ids due to a bug.  The device id would be valid on
   4908 		 * the system, but would return the wrong dev_t.
   4909 		 */
   4910 		if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
   4911 			lbp->lb_flags &= ~MDDB_DEVID_STYLE;
   4912 			lbp->lb_didfirstblk = 0;
   4913 			lbp->lb_didblkcnt = 0;
   4914 			*write_lb = 1;
   4915 		}
   4916 
   4917 
   4918 		/*
   4919 		 * If replica is in device ID style, read in device ID
   4920 		 * block and verify device ID block information.
   4921 		 */
   4922 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   4923 
   4924 			/* Read in device ID block */
   4925 			if (did_icp == NULL) {
   4926 				did_icp = (mddb_did_ic_t *)
   4927 				    kmem_zalloc(sizeof (mddb_did_ic_t),
   4928 				    KM_SLEEP);
   4929 			} else {
   4930 				/* Reuse did_icp, but clear out data */
   4931 				if (did_icp->did_ic_blkp !=
   4932 				    (mddb_did_blk_t *)NULL) {
   4933 					kmem_free((caddr_t)did_icp->did_ic_blkp,
   4934 					    did_blkp_sz);
   4935 					did_blkp = (mddb_did_blk_t *)NULL;
   4936 					did_icp->did_ic_blkp =
   4937 					    (mddb_did_blk_t *)NULL;
   4938 				}
   4939 				if (did_icp->did_ic_dbp !=
   4940 				    (mddb_did_db_t *)NULL) {
   4941 					did_dbp1 = did_icp->did_ic_dbp;
   4942 					while (did_dbp1) {
   4943 						did_dbp2 = did_dbp1->db_next;
   4944 						kmem_free((caddr_t)
   4945 						    did_dbp1->db_ptr,
   4946 						    dbtob(did_dbp1->db_blkcnt));
   4947 						kmem_free((caddr_t)did_dbp1,
   4948 						    sizeof (mddb_did_db_t));
   4949 						did_dbp1 = did_dbp2;
   4950 					}
   4951 					did_icp->did_ic_dbp =
   4952 					    (mddb_did_db_t *)NULL;
   4953 				}
   4954 				for (i = 0; i < MDDB_NLB; i++) {
   4955 					did_icp->did_ic_devid[i] =
   4956 					    (ddi_devid_t)NULL;
   4957 				}
   4958 			}
   4959 
   4960 			/* Can't reuse blkp since size could be different */
   4961 			if (did_blkp != (mddb_did_blk_t *)NULL) {
   4962 				kmem_free(did_blkp, did_blkp_sz);
   4963 			}
   4964 			did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
   4965 			did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
   4966 			    KM_SLEEP);
   4967 			did_icp->did_ic_blkp = did_blkp;
   4968 			buffer = (caddr_t)did_blkp;
   4969 			for (blk = lbp->lb_didfirstblk;
   4970 			    blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
   4971 			    blk++) {
   4972 				physblk = getphysblk(blk, rip->ri_mbip);
   4973 				err = getblks(s, buffer, dev, physblk,
   4974 				    btodb(MDDB_BSIZE), 0);
   4975 				if (err) {
   4976 					rip->ri_flags |= err;
   4977 					break;
   4978 				}
   4979 				buffer += MDDB_BSIZE;
   4980 			}
   4981 			if (err)
   4982 				continue;
   4983 
   4984 			/* Verify the Device ID block */
   4985 			if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
   4986 				continue;
   4987 			if (did_blkp->blk_magic != MDDB_MAGIC_DI)
   4988 				continue;
   4989 			if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
   4990 				continue;
   4991 			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
   4992 				continue;
   4993 			if (crcchk(did_blkp, &did_blkp->blk_checksum,
   4994 			    dbtob(lbp->lb_didblkcnt), NULL))
   4995 				continue;
   4996 
   4997 			/*
   4998 			 * Check if device ID block is out of sync with the
   4999 			 * Locator Block by checking if the locator block
   5000 			 * commitcnt does not match the device id block
   5001 			 * commitcnt.  If an 'out of sync' condition
   5002 			 * exists, discard this replica since it has
   5003 			 * inconsistent data and can't be used in
   5004 			 * determining the best replica.
   5005 			 *
   5006 			 * An 'out of sync' condition could happen if old
   5007 			 * SDS code was running with new devid style replicas
   5008 			 * or if a failure occurred between the writing of
   5009 			 * the locator block's commitcnt and the device
   5010 			 * id block's commitcnt.
   5011 			 *
   5012 			 * If old SDS code had been running, the upgrade
   5013 			 * process should detect this situation and
   5014 			 * have removed all of the device id information
   5015 			 * via the md_devid_destroy flag in md.conf.
   5016 			 */
   5017 			if (did_blkp->blk_commitcnt !=
   5018 			    lbp->lb_commitcnt) {
   5019 				continue;
   5020 			}
   5021 		}
   5022 
   5023 
   5024 		/*
   5025 		 * If replica is still in device ID style, read in all
   5026 		 * of the device IDs, verify the checksum of the device IDs.
   5027 		 */
   5028 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   5029 			/*
   5030 			 * Reset valid bit in device id info block flags. This
   5031 			 * flag is stored on disk, but the valid bit is reset
   5032 			 * when reading in the replica.  If the corresponding
   5033 			 * device id is valid (aka meaning that the system
   5034 			 * knows about this device id), the valid bit will
   5035 			 * be set at a later time.  The valid bit for this
   5036 			 * replica's device ID will be set in this routine.
   5037 			 * The valid bits for the rest of the device id's
   5038 			 * will be set after the 'best' replica has
   5039 			 * been selected in routine load_old_replicas.
   5040 			 * Reset updated bit in device id info block flags.
   5041 			 * This flag is also stored on disk, reset when read
   5042 			 * in and set when the locators and side locators
   5043 			 * have been updated to match this valid device
   5044 			 * id information.
   5045 			 */
   5046 			for (li = 0; li < lbp->lb_loccnt; li++) {
   5047 				did_info = &did_blkp->blk_info[li];
   5048 				if (did_info->info_flags & MDDB_DID_EXISTS)
   5049 					did_info->info_flags &=
   5050 					    ~(MDDB_DID_VALID |
   5051 					    MDDB_DID_UPDATED);
   5052 			}
   5053 
   5054 			cont_flag = 0;
   5055 			for (li = 0; li < lbp->lb_loccnt; li++) {
   5056 				did_info = &did_blkp->blk_info[li];
   5057 				did_block = (caddr_t)NULL;
   5058 				if (did_info->info_flags & MDDB_DID_EXISTS) {
   5059 					/*
   5060 					 * Check if block has
   5061 					 * already been read in
   5062 					 */
   5063 					did_dbp = did_icp->did_ic_dbp;
   5064 					while (did_dbp != 0) {
   5065 						if (did_dbp->db_firstblk ==
   5066 						    did_info->info_firstblk)
   5067 							break;
   5068 						else
   5069 							did_dbp =
   5070 							    did_dbp->db_next;
   5071 					}
   5072 					/* if block not found, read it in */
   5073 					if (did_dbp == NULL) {
   5074 						did_block = (caddr_t)
   5075 						    (kmem_zalloc(dbtob(
   5076 						    did_info->info_blkcnt),
   5077 						    KM_SLEEP));
   5078 						buffer = (caddr_t)did_block;
   5079 						for (blk =
   5080 						    did_info->info_firstblk;
   5081 						    blk < (did_info->
   5082 						    info_firstblk +
   5083 						    did_info->info_blkcnt);
   5084 						    blk++) {
   5085 							physblk =
   5086 							    getphysblk(blk,
   5087 							    rip->ri_mbip);
   5088 							err = getblks(s,
   5089 							    buffer, dev,
   5090 							    physblk, btodb(
   5091 							    MDDB_BSIZE), 0);
   5092 							if (err) {
   5093 								rip->ri_flags |=
   5094 								    err;
   5095 								break;
   5096 							}
   5097 							buffer += MDDB_BSIZE;
   5098 						}
   5099 						if (err) {
   5100 							kmem_free(did_block,
   5101 							    dbtob(did_info->
   5102 							    info_blkcnt));
   5103 							did_block =
   5104 							    (caddr_t)NULL;
   5105 							cont_flag = 1;
   5106 							break;
   5107 						}
   5108 
   5109 						/*
   5110 						 * Block read in -
   5111 						 * alloc Disk Block area
   5112 						 */
   5113 						did_dbp = (mddb_did_db_t *)
   5114 						    kmem_zalloc(
   5115 						    sizeof (mddb_did_db_t),
   5116 						    KM_SLEEP);
   5117 						did_dbp->db_ptr = did_block;
   5118 						did_dbp->db_firstblk =
   5119 						    did_info->info_firstblk;
   5120 						did_dbp->db_blkcnt =
   5121 						    did_info->info_blkcnt;
   5122 
   5123 						/* Add to front of dbp list */
   5124 						did_dbp->db_next =
   5125 						    did_icp->did_ic_dbp;
   5126 						did_icp->did_ic_dbp = did_dbp;
   5127 					}
   5128 					/* Check validity of devid in block */
   5129 					if (crcchk(((char *)did_dbp->db_ptr +
   5130 					    did_info->info_offset),
   5131 					    &did_info->info_checksum,
   5132 					    did_info->info_length, NULL)) {
   5133 						cont_flag = 1;
   5134 						break;
   5135 					}
   5136 
   5137 					/* Block now pointed to by did_dbp */
   5138 					did_icp->did_ic_devid[li] =
   5139 					    (ddi_devid_t)((char *)
   5140 					    did_dbp->db_ptr +
   5141 					    did_info->info_offset);
   5142 				}
   5143 			}
   5144 			if (cont_flag)
   5145 				continue;
   5146 		}
   5147 
   5148 		/*
   5149 		 * All blocks containing devids are now in core.
   5150 		 */
   5151 
   5152 		/*
   5153 		 * If we're doing a replicated import (also known as
   5154 		 * remote copy import), the device id in the locator
   5155 		 * block is incorrect and we need to fix it up here
   5156 		 * along with the l_dev, otherwise we run into lots of
   5157 		 * trouble later on.
   5158 		 */
   5159 		if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
   5160 			mddb_ri_t	*trip;
   5161 			for (li = 0; li < lbp->lb_loccnt; li++) {
   5162 				did_info = &did_blkp->blk_info[li];
   5163 				lp = &lbp->lb_locators[li];
   5164 
   5165 				if (lp->l_flags & MDDB_F_DELETED)
   5166 					continue;
   5167 
   5168 				if (!(did_info->info_flags & MDDB_DID_EXISTS))
   5169 					continue;
   5170 
   5171 				if (did_icp->did_ic_devid[li] == NULL)
   5172 					continue;
   5173 
   5174 				for (trip = s->s_rip; trip != NULL;
   5175 				    trip = trip->ri_next) {
   5176 					if (trip->ri_old_devid == NULL)
   5177 						continue;
   5178 					if (ddi_devid_compare(
   5179 					    trip->ri_old_devid,
   5180 					    did_icp->did_ic_devid[li]) != 0) {
   5181 						continue;
   5182 					}
   5183 
   5184 					/* update l_dev and side mnum */
   5185 					lp->l_dev = md_cmpldev(trip->ri_dev);
   5186 					lbp->lb_sidelocators[0][li].l_mnum =
   5187 					    md_getminor(trip->ri_dev);
   5188 				}
   5189 			}
   5190 		}
   5191 
   5192 		/*
   5193 		 * If there is a valid devid, verify that this locator
   5194 		 * block has information about itself by checking the
   5195 		 * device ID, minor_name and block
   5196 		 * number from this replica's incore data structure
   5197 		 * against the locator block information that has just
   5198 		 * been read in from disk.
   5199 		 *
   5200 		 * If not a valid devid, verify that this locator block
   5201 		 * has information about itself by checking the minor
   5202 		 * number, block number and driver name from this
   5203 		 * replica's incore data structure against the locator
   5204 		 * block information that has just been read in from disk.
   5205 		 */
   5206 		if ((rip->ri_devid != NULL) &&
   5207 		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
   5208 			/*
   5209 			 * This locator block MUST have locator (replica)
   5210 			 * information about itself.  Check against devid,
   5211 			 * slice part of minor number, and block number.
   5212 			 */
   5213 			for (li = 0; li < lbp->lb_loccnt; li++) {
   5214 				did_info = &did_blkp->blk_info[li];
   5215 				lp = &lbp->lb_locators[li];
   5216 				if (lp->l_flags & MDDB_F_DELETED)
   5217 					continue;
   5218 
   5219 				if (!(did_info->info_flags & MDDB_DID_EXISTS))
   5220 					continue;
   5221 
   5222 				if (((md_get_setstatus(setno) &
   5223 				    MD_SET_REPLICATED_IMPORT)) &&
   5224 				    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
   5225 					if (ddi_devid_compare(rip->ri_old_devid,
   5226 					    did_icp->did_ic_devid[li]) != 0)
   5227 						continue;
   5228 				} else {
   5229 					if (ddi_devid_compare(rip->ri_devid,
   5230 					    did_icp->did_ic_devid[li]) != 0)
   5231 						continue;
   5232 				}
   5233 
   5234 				if (strcmp(rip->ri_minor_name,
   5235 				    did_info->info_minor_name) != 0)
   5236 					continue;
   5237 
   5238 				if (lp->l_blkno == rip->ri_blkno)
   5239 					break;
   5240 			}
   5241 		} else {
   5242 			/*
   5243 			 * This locator block MUST have locator (replica)
   5244 			 * information about itself.
   5245 			 */
   5246 			if (!mn_set) {
   5247 				for (li = 0; li < lbp->lb_loccnt; li++) {
   5248 					mddb_drvnm_t		*dn;
   5249 					mddb_sidelocator_t	*slp;
   5250 
   5251 					lp = &lbp->lb_locators[li];
   5252 					slp = &lbp->
   5253 					    lb_sidelocators[s->s_sideno][li];
   5254 					if (lp->l_flags & MDDB_F_DELETED)
   5255 						continue;
   5256 					if (slp->l_mnum != md_getminor(
   5257 					    rip->ri_dev))
   5258 						continue;
   5259 					if (lp->l_blkno != rip->ri_blkno)
   5260 						continue;
   5261 					dn = &lbp->lb_drvnm[slp->l_drvnm_index];
   5262 					if (strncmp(dn->dn_data,
   5263 					    rip->ri_driver, MD_MAXDRVNM) == 0)
   5264 						break;
   5265 				}
   5266 			} else {
   5267 				for (li = 0; li < lbp->lb_loccnt; li++) {
   5268 					mddb_drvnm_t		*dn;
   5269 					mddb_mnsidelocator_t	*mnslp;
   5270 					mddb_mnlb_t		*mnlbp;
   5271 					int			i;
   5272 
   5273 					/*
   5274 					 * Check all possible locators looking
   5275 					 * for a match to the currently read-in
   5276 					 * locator, must match on:
   5277 					 *	- blkno
   5278 					 *	- side locator for this
   5279 					 *	  node's side
   5280 					 *	- side locator minor number
   5281 					 *	- side locator driver name
   5282 					 */
   5283 
   5284 					/*
   5285 					 * Looking at sidelocs:
   5286 					 * cast lbp -> mnlbp
   5287 					 */
   5288 					mnlbp = (mddb_mnlb_t *)lbp;
   5289 					lp = &mnlbp->lb_locators[li];
   5290 					if (lp->l_flags & MDDB_F_DELETED)
   5291 						continue;
   5292 					if (lp->l_blkno != rip->ri_blkno)
   5293 						continue;
   5294 
   5295 					for (i = 0; i < MD_MNMAXSIDES; i++) {
   5296 						mnslp = &mnlbp->
   5297 						    lb_mnsidelocators[i][li];
   5298 						if (mnslp->mnl_sideno ==
   5299 						    s->s_sideno) {
   5300 							break;
   5301 						}
   5302 					}
   5303 					/* No matching side found */
   5304 					if (i == MD_MNMAXSIDES)
   5305 						continue;
   5306 					if (mnslp->mnl_mnum !=
   5307 					    md_getminor(rip->ri_dev))
   5308 						continue;
   5309 					dn = &lbp->
   5310 					    lb_drvnm[mnslp->mnl_drvnm_index];
   5311 					if (strncmp(dn->dn_data,
   5312 					    rip->ri_driver, MD_MAXDRVNM) == 0)
   5313 						break;
   5314 				}
   5315 			}
   5316 		}
   5317 
   5318 		/*
   5319 		 * Didn't find ourselves in this locator block, which means
   5320 		 * the locator block is a stale transplant, probably from
   5321 		 * a user doing a dd.
   5322 		 */
   5323 		if (li == lbp->lb_loccnt)
   5324 			continue;
   5325 
   5326 		/*
   5327 		 * Keep track of the number of accessed and valid
   5328 		 * locator blocks.
   5329 		 */
   5330 		lb_ok++;
   5331 
   5332 		/*
   5333 		 * Read the tag in, skips invalid or blank tags.
   5334 		 * Only valid tags allocate storage
   5335 		 * Data tags are not used in MN disksets.
   5336 		 */
   5337 		if ((!mn_set) && (! dt_read(s, lbp, rip))) {
   5338 			/*
   5339 			 * Keep track of the number of tagged
   5340 			 * locator blocks.
   5341 			 */
   5342 			lb_tagged++;
   5343 
   5344 			/* Keep a list of unique tags. */
   5345 			(void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
   5346 		}
   5347 
   5348 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
   5349 			/*
   5350 			 * go through locator block and add any other
   5351 			 * locations of the data base.
   5352 			 * For the replicated import case, this was done earlier
   5353 			 * and we really don't need or want to do so again
   5354 			 */
   5355 			cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
   5356 			for (li = 0; li < lbp->lb_loccnt; li++) {
   5357 				lp = &lbp->lb_locators[li];
   5358 				if (lp->l_flags & MDDB_F_DELETED)
   5359 					continue;
   5360 
   5361 				cl->l_devid_flags = MDDB_DEVID_GETSZ;
   5362 				cl->l_devid = (uint64_t)0;
   5363 				cl->l_devid_sz = 0;
   5364 				cl->l_old_devid = (uint64_t)0;
   5365 				cl->l_old_devid_sz = 0;
   5366 				cl->l_minor_name[0] = '\0';
   5367 				locator2cfgloc(lbp, cl, li, s->s_sideno,
   5368 				    did_icp);
   5369 
   5370 				if (cl->l_devid_flags & MDDB_DEVID_SZ) {
   5371 					if ((cl->l_devid = (uintptr_t)kmem_alloc
   5372 					    (cl->l_devid_sz, KM_SLEEP))
   5373 					    == NULL) {
   5374 						continue;
   5375 					} else {
   5376 						cl->l_devid_flags =
   5377 						    MDDB_DEVID_SPACE;
   5378 					}
   5379 				}
   5380 				locator2cfgloc(lbp, cl, li, s->s_sideno,
   5381 				    did_icp);
   5382 
   5383 				(void) ridev(&s->s_rip, cl, &lp->l_dev, 0);
   5384 
   5385 				if (cl->l_devid_flags & MDDB_DEVID_SPACE)
   5386 					kmem_free((caddr_t)(uintptr_t)
   5387 					    cl->l_devid, cl->l_devid_sz);
   5388 			}
   5389 			kmem_free(cl, sizeof (mddb_cfg_loc_t));
   5390 		}
   5391 
   5392 		/* Save LB for later */
   5393 		rip->ri_lbp = lbp;
   5394 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   5395 			rip->ri_did_icp = did_icp;
   5396 			did_icp = (mddb_did_ic_t *)NULL;
   5397 			did_blkp = (mddb_did_blk_t *)NULL;
   5398 		} else
   5399 			rip->ri_did_icp = NULL;
   5400 		lbp = (mddb_lb_t *)NULL;
   5401 	}
   5402 
   5403 	if (lbp != (mddb_lb_t *)NULL)
   5404 		kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
   5405 
   5406 	if (did_icp != (mddb_did_ic_t *)NULL) {
   5407 		if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
   5408 			kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
   5409 			did_blkp = (mddb_did_blk_t *)NULL;
   5410 		}
   5411 		if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
   5412 			mddb_did_db_t	*did_dbp1, *did_dbp2;
   5413 
   5414 			did_dbp1 = did_icp->did_ic_dbp;
   5415 			while (did_dbp1) {
   5416 				did_dbp2 = did_dbp1->db_next;
   5417 				kmem_free((caddr_t)did_dbp1->db_ptr,
   5418 				    dbtob(did_dbp1->db_blkcnt));
   5419 				kmem_free((caddr_t)did_dbp1,
   5420 				    sizeof (mddb_did_db_t));
   5421 				did_dbp1 = did_dbp2;
   5422 			}
   5423 		}
   5424 		kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
   5425 	}
   5426 
   5427 	if (did_blkp != (mddb_did_blk_t *)NULL) {
   5428 		kmem_free((caddr_t)did_blkp, did_blkp_sz);
   5429 	}
   5430 
   5431 	/* No locator blocks were ok */
   5432 	if (lb_ok == 0)
   5433 		goto out;
   5434 
   5435 	/* No tagged data was found - will be 0 for MN diskset */
   5436 	if (lb_tagged == 0)
   5437 		goto out;
   5438 
   5439 	/* Find the highest non-deleted replica count */
   5440 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
   5441 		int		lb_tot = 0;
   5442 
   5443 		if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
   5444 			continue;
   5445 
   5446 		if (rip->ri_lbp == (mddb_lb_t *)NULL)
   5447 			continue;
   5448 
   5449 		for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
   5450 			lp = &rip->ri_lbp->lb_locators[li];
   5451 			if (lp->l_flags & MDDB_F_DELETED)
   5452 				continue;
   5453 			lb_tot++;
   5454 		}
   5455 
   5456 		if (lb_tot > lb_total)
   5457 			lb_total = lb_tot;
   5458 	}
   5459 
   5460 	/* Count the number of unique tags */
   5461 	for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
   5462 		lb_tags++;
   5463 
   5464 	/* Should have at least one tag at this point */
   5465 	ASSERT(lb_tags > 0);
   5466 
   5467 
   5468 	/*
   5469 	 * If the number of tagged locators is not the same as the number of
   5470 	 * OK locators OR more than one tag exists, then make sure the
   5471 	 * selected tag will be written out later.
   5472 	 */
   5473 	if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
   5474 		md_set_setstatus(setno, MD_SET_TAGDATA);
   5475 
   5476 	/* Only a single tag, take the tagged data */
   5477 	if (lb_tags == 1) {
   5478 		dt_setup(s, &s->s_dtlp->dtl_dt);
   5479 		md_set_setstatus(setno, MD_SET_USETAG);
   5480 		goto out;
   5481 	}
   5482 
   5483 	/* Multiple tags, not selecting a tag, tag mode is on */
   5484 	if (! (md_get_setstatus(setno) & MD_SET_USETAG))
   5485 		retval = MDDB_E_TAGDATA;
   5486 
   5487 out:
   5488 
   5489 	return (retval);
   5490 }
   5491 
   5492 /*
   5493  *	1. Select a locator.
   5494  *	2. Check if enough locators now have current copies.
   5495  *	3. Read in the database from one of the latest copies.
   5496  *	4. If known to have the latest, make all database copies the same.
   5497  *	5. If the configuration has changed, rewrite the locators.
   5498  *
   5499  * Parameters:
   5500  * 	s - pointer to mddb_set structure
   5501  *	flag - used in MN disksets to tell if this node is being joined to
   5502  *		a diskset that is in the STALE state.  If the flag is
   5503  *		MDDB_MN_STALE, then this node should be marked in the STALE
   5504  *		state even if > 50% mddbs are available.  (The diskset can
   5505  *		only change from STALE->OK if all nodes withdraw from the
   5506  *		MN diskset and then rejoin).
   5507  */
   5508 static int
   5509 load_old_replicas(
   5510 	mddb_set_t	*s,
   5511 	int		flag
   5512 )
   5513 {
   5514 	mddb_lb_t	*lbp = NULL;
   5515 	mddb_mnlb_t	*mnlbp = NULL;
   5516 	mddb_ri_t	*rip;
   5517 	mddb_locator_t	*lp;
   5518 	mddb_db_t	*dbp;
   5519 	mddb_de_ic_t	*dep;
   5520 	int		li;
   5521 	int		alc;
   5522 	int		lc;
   5523 	int		tlc;
   5524 	int		retval = 0;
   5525 	caddr_t		p;
   5526 	size_t		maxrecsize;
   5527 	set_t		setno = s->s_setno;
   5528 	mddb_did_db_t	*did_dbp1;
   5529 	mddb_did_info_t	*did_info;
   5530 	mddb_did_ic_t	*did_icp = NULL;
   5531 	md_dev64_t	*newdev;
   5532 	mddb_sidelocator_t	*slp = 0;
   5533 	mddb_mnsidelocator_t	*mnslp = 0;
   5534 	uchar_t		i;
   5535 	char		*name;
   5536 	ddi_devid_t	ret_devid;
   5537 	md_dev64_t	dev;
   5538 	uint_t		len, sz;
   5539 	char		*minor_name;
   5540 	int		write_lb = 0;
   5541 	int		rval;
   5542 	int		stale_rtn = 0;
   5543 
   5544 	/* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */
   5545 	if (retval = get_mbs_n_lbs(s, &write_lb))
   5546 		goto errout;
   5547 
   5548 	if ((lbp = s->s_lbp = selectlocator(s)) == NULL) {
   5549 		retval = MDDB_E_NOLOCBLK;
   5550 		goto errout;
   5551 	}
   5552 
   5553 	/* If a multi-node set, then set md_set.s_status flag */
   5554 	if (lbp->lb_flags & MDDB_MNSET) {
   5555 		md_set_setstatus(setno, MD_SET_MNSET);
   5556 		/*
   5557 		 * If data tag area had been allocated before set type was
   5558 		 * known - free it now.
   5559 		 */
   5560 		if (md_set[setno].s_dtp) {
   5561 			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
   5562 			md_set[setno].s_dtp = NULL;
   5563 		}
   5564 	}
   5565 
   5566 	/*
   5567 	 * If the replica is in devid format, setup the devid incore ptr.
   5568 	 */
   5569 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   5570 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
   5571 			if (rip->ri_lbp == s->s_lbp) {
   5572 				did_icp = s->s_did_icp = rip->ri_did_icp;
   5573 				break;
   5574 			}
   5575 		}
   5576 		/*
   5577 		 * If no devid incore info found - something has gone
   5578 		 * wrong so errout.
   5579 		 */
   5580 		if (rip == NULL) {
   5581 			retval = MDDB_E_NODEVID;
   5582 			goto errout;
   5583 		}
   5584 
   5585 		/*
   5586 		 * Add all blocks containing devids to free list.
   5587 		 * Then remove addresses that actually contain devids.
   5588 		 */
   5589 		did_dbp1 = did_icp->did_ic_dbp;
   5590 		while (did_dbp1) {
   5591 			if (mddb_devid_free_add(s, did_dbp1->db_firstblk,
   5592 			    0, dbtob(did_dbp1->db_blkcnt))) {
   5593 				retval = MDDB_E_NOSPACE;
   5594 				goto errout;
   5595 			}
   5596 
   5597 			did_dbp1 = did_dbp1->db_next;
   5598 		}
   5599 		for (li = 0; li < lbp->lb_loccnt; li++) {
   5600 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
   5601 			if (!(did_info->info_flags & MDDB_DID_EXISTS))
   5602 				continue;
   5603 
   5604 			if (mddb_devid_free_delete(s, did_info->info_firstblk,
   5605 			    did_info->info_offset, did_info->info_length)) {
   5606 				/* unable to find disk block */
   5607 				retval = MDDB_E_NODEVID;
   5608 				goto errout;
   5609 			}
   5610 		}
   5611 	}
   5612 
   5613 	/*
   5614 	 * Fill in s_mbiarray, count all locators and active locators.
   5615 	 */
   5616 	alc = 0;
   5617 	lc = 0;
   5618 	for (li = 0; li < lbp->lb_loccnt; li++) {
   5619 		ddi_devid_t	li_devid;
   5620 
   5621 		lp = &lbp->lb_locators[li];
   5622 
   5623 		if (lp->l_flags & MDDB_F_DELETED)
   5624 			continue;
   5625 
   5626 		/* Count non-deleted replicas */
   5627 		lc++;
   5628 
   5629 		/*
   5630 		 * Use the devid of this locator to compare with the rip
   5631 		 * list.  The scenario to watch out for here is that this
   5632 		 * locator could be on a disk that is dead and there could
   5633 		 * be a valid entry in the rip list for a different disk
   5634 		 * that has been moved to the dead disks dev_t.  We don't
   5635 		 * want to match with the moved disk.
   5636 		 */
   5637 		li_devid = NULL;
   5638 		(void) mddb_devid_get(s, li, &li_devid, &minor_name);
   5639 
   5640 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
   5641 			if (match_mddb(rip, li_devid, minor_name,
   5642 			    md_expldev(lp->l_dev), lp->l_blkno)) {
   5643 				break;
   5644 			}
   5645 		}
   5646 		if (rip == NULL) {
   5647 			/*
   5648 			 * If rip not found, then mark error in master block
   5649 			 * so that no writes are later attempted to this
   5650 			 * replica.  rip may not be setup if ridev
   5651 			 * failed due to un-found driver name.
   5652 			 */
   5653 			lp->l_flags |= MDDB_F_EMASTER;
   5654 			continue;
   5655 		}
   5656 
   5657 		s->s_mbiarray[li] = rip->ri_mbip;
   5658 
   5659 		lp->l_flags &= MDDB_F_ACTIVE;
   5660 		lp->l_flags |= (int)rip->ri_flags;
   5661 
   5662 		if (rip->ri_transplant)
   5663 			lp->l_flags &= ~MDDB_F_ACTIVE;
   5664 
   5665 		if (lp->l_flags & MDDB_F_LOCACC)
   5666 			alc++;
   5667 	}
   5668 
   5669 	/* Save on a divide - calculate half of lc (rounded up) up front */
   5670 	tlc = ((lc + 1) / 2);
   5671 
   5672 	if (alc > tlc) {		/* alc > tlc		- OK */
   5673 		md_clr_setstatus(setno, MD_SET_STALE);
   5674 	} else if (alc < tlc) {		/* alc < tlc		- stale */
   5675 		md_set_setstatus(setno, MD_SET_STALE);
   5676 	} else if (lc & 1) {		/* alc == tlc && odd	- OK */
   5677 		md_clr_setstatus(setno, MD_SET_STALE);
   5678 	} else {			/* alc == tlc && even	- ? */
   5679 		/* Can do an accept, and are */
   5680 		if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) {
   5681 			md_clr_setstatus(setno, MD_SET_STALE);
   5682 		} else {		/* possibly has a mediator */
   5683 			if (mediate(s)) {
   5684 				md_set_setstatus(setno, MD_SET_STALE);
   5685 			} else {
   5686 				md_clr_setstatus(setno, MD_SET_STALE);
   5687 			}
   5688 		}
   5689 
   5690 		/*
   5691 		 * The mirrored_root_flag allows the sysadmin to decide to
   5692 		 * start the local set in a read/write (non-stale) mode
   5693 		 * when there are only 50% available mddbs on the system and
   5694 		 * when the root file system is on a mirror.  This is useful
   5695 		 * in a 2 disk system where 1 disk failure would cause an mddb
   5696 		 * quorum failure and subsequent boot failures since the root
   5697 		 * filesystem would be in a read-only state.
   5698 		 */
   5699 		if (mirrored_root_flag == 1 && setno == 0 &&
   5700 		    svm_bootpath[0] != 0) {
   5701 			md_clr_setstatus(setno, MD_SET_STALE);
   5702 		} else {
   5703 			if (md_get_setstatus(setno) & MD_SET_STALE) {
   5704 				/* Allow half mode - CAREFUL! */
   5705 				if (mddb_allow_half)
   5706 					md_clr_setstatus(setno, MD_SET_STALE);
   5707 			}
   5708 		}
   5709 
   5710 		/*
   5711 		 * In a MN diskset,
   5712 		 *	- if 50% mddbs are unavailable and this
   5713 		 *		has been marked STALE above
   5714 		 * 	- master node isn't in the STALE state
   5715 		 *	- this node isn't the master node (this node
   5716 		 *		isn't the first node to join the set)
   5717 		 * then clear the STALE state and set TOOFEW.
   5718 		 *
   5719 		 * If this node is the master node and set was marked STALE,
   5720 		 * then the set stays STALE.
   5721 		 *
   5722 		 * If this node is not the master and this node's state is
   5723 		 * STALE and the master node is not marked STALE,
   5724 		 * then master node must be in the TOOFEW state or the
   5725 		 * master is panic'ing.  A MN diskset can only be placed into
   5726 		 * the STALE state by having the first node join the set
   5727 		 * with <= 50% mddbs.  There's no way for a MN diskset to
   5728 		 * transition between STALE and not-STALE states unless all
   5729 		 * nodes are withdrawn from the diskset or all nodes in the
   5730 		 * diskset are rebooted at the same time.
   5731 		 *
   5732 		 * So, mark this node's state as TOOFEW instead of STALE.
   5733 		 */
   5734 		if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE))
   5735 		    == (MD_SET_MNSET | MD_SET_STALE)) &&
   5736 		    ((flag & MDDB_MN_STALE) == 0) &&
   5737 		    (!(md_set[setno].s_am_i_master))) {
   5738 			md_clr_setstatus(setno, MD_SET_STALE);
   5739 			md_set_setstatus(setno, MD_SET_TOOFEW);
   5740 		}
   5741 	}
   5742 
   5743 	/*
   5744 	 * If a MN set is marked STALE on the other nodes,
   5745 	 * mark it stale here.  Override all other considerations
   5746 	 * such as a mediator or > 50% mddbs available.
   5747 	 */
   5748 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
   5749 		if (flag & MDDB_MN_STALE)
   5750 			md_set_setstatus(setno, MD_SET_STALE);
   5751 	}
   5752 
   5753 	/*
   5754 	 * Read a good copy of the locator names.
   5755 	 * If an error occurs reading what is supposed
   5756 	 * to be a good copy, continue looking for another
   5757 	 * good copy.
   5758 	 */
   5759 	s->s_lnp = NULL;
   5760 	for (li = 0; li < lbp->lb_loccnt; li++) {
   5761 		lp = &lbp->lb_locators[li];
   5762 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
   5763 		    (lp->l_flags & MDDB_F_EMASTER))
   5764 			continue;
   5765 
   5766 		/* Find rip entry for this locator if one exists */
   5767 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
   5768 			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
   5769 			    lp->l_blkno))
   5770 				break;
   5771 		}
   5772 
   5773 		if (rip == NULL) {
   5774 			continue;
   5775 		}
   5776 
   5777 		/*
   5778 		 * Use the rip commitcnt since the commitcnt in lbp could have
   5779 		 * been cleared by selectlocator.  Looking for a replica with
   5780 		 * the same commitcnt as the 'golden' copy in order to
   5781 		 * get the same data.
   5782 		 */
   5783 		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
   5784 			continue;
   5785 		}
   5786 
   5787 		/*
   5788 		 * Now have a copy of the database that is equivalent
   5789 		 * to the chosen locator block with respect to
   5790 		 * inittime, identifier and commitcnt.   Trying the
   5791 		 * equivalent databases in the order that they were
   5792 		 * written will provide the most up to date data.
   5793 		 */
   5794 		lp->l_flags |= readlocnames(s, li);
   5795 		if (s->s_lnp)
   5796 			break;
   5797 	}
   5798 
   5799 	if (s->s_lnp == NULL) {
   5800 		retval = MDDB_E_NOLOCNMS;
   5801 		goto errout;
   5802 	}
   5803 
   5804 	/*
   5805 	 * Read a good copy of the data base.
   5806 	 * If an error occurs reading what is supposed
   5807 	 * to be a good copy, continue looking for another
   5808 	 * good copy.
   5809 	 */
   5810 
   5811 	s->s_dbp = NULL;
   5812 	for (li = 0; li < lbp->lb_loccnt; li++) {
   5813 		lp = &lbp->lb_locators[li];
   5814 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
   5815 		    (lp->l_flags & MDDB_F_EMASTER))
   5816 			continue;
   5817 
   5818 		/* Find rip entry for this locator if one exists */
   5819 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
   5820 			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
   5821 			    lp->l_blkno))
   5822 				break;
   5823 		}
   5824 
   5825 		if (rip == NULL) {
   5826 			continue;
   5827 		}
   5828 
   5829 		/*
   5830 		 * Use the rip commitcnt since the commitcnt in lbp could have
   5831 		 * been cleared by selectlocator.  Looking for a replica with
   5832 		 * the same commitcnt as the 'golden' copy in order to
   5833 		 * get the same data.
   5834 		 */
   5835 		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
   5836 			continue;
   5837 		}
   5838 
   5839 		/*
   5840 		 * Now have a copy of the database that is equivalent
   5841 		 * to the chosen locator block with respect to
   5842 		 * inittime, identifier and commitcnt.   Trying the
   5843 		 * equivalent databases in the order that they were
   5844 		 * written will provide the most up to date data.
   5845 		 */
   5846 		lp->l_flags |= readcopy(s, li);
   5847 
   5848 		if (s->s_dbp)
   5849 			break;
   5850 	}
   5851 
   5852 	if (s->s_dbp == NULL) {
   5853 		retval = MDDB_E_NODIRBLK;
   5854 		goto errout;
   5855 	}
   5856 
   5857 	lp->l_flags |= MDDB_F_MASTER;
   5858 	lp->l_flags |= MDDB_F_UP2DATE;
   5859 
   5860 	/*
   5861 	 * go through and find largest record;
   5862 	 * Also fixup the user data area's
   5863 	 */
   5864 	maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size);
   5865 
   5866 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
   5867 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
   5868 			if (dep->de_flags & MDDB_F_OPT)
   5869 				getoptrecord(s, dep);
   5870 			else {
   5871 				allocuserdata(dep);
   5872 				maxrecsize = MAX(dep->de_recsize, maxrecsize);
   5873 			}
   5874 
   5875 	if (maxrecsize > s->s_databuffer_size) {
   5876 		p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
   5877 		if (s->s_databuffer_size)
   5878 			kmem_free(s->s_databuffer, s->s_databuffer_size);
   5879 		s->s_databuffer = p;
   5880 		s->s_databuffer_size = maxrecsize;
   5881 	}
   5882 
   5883 	/* If we can clear the tag data record, do it now. */
   5884 	/* Data tags not supported on MN sets */
   5885 	if ((md_get_setstatus(setno) & MD_SET_CLRTAG) &&
   5886 	    (!(md_get_setstatus(setno) & MD_SET_MNSET)))
   5887 		dt_setup(s, NULL);
   5888 
   5889 	/* This will return non-zero if STALE or TOOFEW */
   5890 	/* This will write out chosen replica image to all replicas */
   5891 	stale_rtn = selectreplicas(s, MDDB_SCANALL);
   5892 
   5893 	if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
   5894 		ddi_devid_t	devidptr;
   5895 
   5896 		/*
   5897 		 * ignore the return value from selectreplicas because we
   5898 		 * may have a STALE or TOOFEW set in the case of a partial
   5899 		 * replicated diskset. We will fix that up later.
   5900 		 */
   5901 
   5902 		lbp = s->s_lbp;
   5903 		for (li = 0; li < lbp->lb_loccnt; li++) {
   5904 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
   5905 
   5906 			if (did_info->info_flags & MDDB_DID_EXISTS) {
   5907 				devidptr = s->s_did_icp->did_ic_devid[li];
   5908 				lp = &lbp->lb_locators[li];
   5909 				for (rip = s->s_rip; rip != NULL;
   5910 				    rip = rip->ri_next) {
   5911 					if (rip->ri_old_devid == 0)
   5912 						continue;
   5913 					if (ddi_devid_compare(rip->ri_old_devid,
   5914 					    devidptr) != 0) {
   5915 						continue;
   5916 					}
   5917 					if (update_locatorblock(s,
   5918 					    md_expldev(lp->l_dev),
   5919 					    rip->ri_devid, rip->ri_old_devid)) {
   5920 						goto errout;
   5921 					}
   5922 				}
   5923 			}
   5924 		}
   5925 	} else {
   5926 		if (stale_rtn)
   5927 			goto errout;
   5928 	}
   5929 
   5930 	/*
   5931 	 * If the replica is in device id style - validate the device id's,
   5932 	 * if present, in the locator block devid area.
   5933 	 */
   5934 	newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP);
   5935 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   5936 		for (li = 0; li < lbp->lb_loccnt; li++) {
   5937 			newdev[li] = 0;
   5938 			lp = &lbp->lb_locators[li];
   5939 			if (lp->l_flags & MDDB_F_DELETED)
   5940 				continue;
   5941 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
   5942 			dev = md_expldev(lp->l_dev);
   5943 			if (did_info->info_flags & MDDB_DID_EXISTS) {
   5944 				/* Validate device id on current system */
   5945 				newdev[li] = dev;
   5946 				if (mddb_devid_validate(
   5947 				    did_icp->did_ic_devid[li],
   5948 				    &(newdev[li]),
   5949 				    did_info->info_minor_name) == 0) {
   5950 					/* Set valid flag */
   5951 					did_info->info_flags |= MDDB_DID_VALID;
   5952 				} else {
   5953 					lp->l_flags |= MDDB_F_EMASTER;
   5954 				}
   5955 			} else if (!(MD_UPGRADE)) {
   5956 				/*
   5957 				 * If a device doesn't have a device id,
   5958 				 * check if there is now a device ID
   5959 				 * associated with device.  If one exists,
   5960 				 * add it to the locator block devid area.
   5961 				 * If there's not enough space to add it,
   5962 				 * print a warning.
   5963 				 * Don't do this during upgrade.
   5964 				 */
   5965 				dev_t ddi_dev = md_dev64_to_dev(dev);
   5966 				if (ddi_lyr_get_devid(ddi_dev, &ret_devid) ==
   5967 				    DDI_SUCCESS) {
   5968 					if (ddi_lyr_get_minor_name(ddi_dev,
   5969 					    S_IFBLK, &minor_name)
   5970 					    == DDI_SUCCESS) {
   5971 						if (mddb_devid_add(s, li,
   5972 						    ret_devid, minor_name)) {
   5973 							cmn_err(CE_WARN,
   5974 							    "Not enough space"
   5975 							    " in metadevice"
   5976 							    " state"
   5977 							    " database\n");
   5978 							cmn_err(CE_WARN,
   5979 							    "to add relocation"
   5980 							    " information for"
   5981 							    " device:\n");
   5982 							cmn_err(CE_WARN,
   5983 							    " major = %d, "
   5984 							    " minor = %d\n",
   5985 							    getmajor(ddi_dev),
   5986 							    getminor(ddi_dev));
   5987 						} else {
   5988 							write_lb = 1;
   5989 						}
   5990 						kmem_free(minor_name,
   5991 						    strlen(minor_name) + 1);
   5992 					}
   5993 					ddi_devid_free(ret_devid);
   5994 				}
   5995 			}
   5996 		}
   5997 
   5998 		/*
   5999 		 * If a device has a valid device id and if the dev_t
   6000 		 * associated with the device id has changed, update the
   6001 		 * driver name, minor num and dev_t in the local and side
   6002 		 * locators to match the dev_t that the system currently
   6003 		 * associates with the device id.
   6004 		 *
   6005 		 * Don't do this during upgrade.
   6006 		 */
   6007 		if (!(MD_UPGRADE)) {
   6008 			for (li = 0; li < lbp->lb_loccnt; li++) {
   6009 				lp = &lbp->lb_locators[li];
   6010 				if (lp->l_flags & MDDB_F_DELETED)
   6011 					continue;
   6012 				did_info = &(did_icp->did_ic_blkp->blk_info
   6013 				    [li]);
   6014 				if ((did_info->info_flags & MDDB_DID_VALID) &&
   6015 				    !(did_info->info_flags &
   6016 				    MDDB_DID_UPDATED)) {
   6017 					if (lbp->lb_flags & MDDB_MNSET) {
   6018 						int j;
   6019 						int index = -1;
   6020 						mnlbp = (mddb_mnlb_t *)lbp;
   6021 						for (j = 0; j < MD_MNMAXSIDES;
   6022 						    j++) {
   6023 							mnslp = &mnlbp->
   6024 							    lb_mnsidelocators[j]
   6025 							    [li];
   6026 							if (mnslp->mnl_sideno ==
   6027 							    s->s_sideno)
   6028 								break;
   6029 							if (mnslp->mnl_sideno ==
   6030 							    0)
   6031 								index = j;
   6032 						}
   6033 						if (j == MD_MNMAXSIDES) {
   6034 							/*
   6035 							 * No match found; take
   6036 							 * empty
   6037 							 */
   6038 							mnslp = &mnlbp->
   6039 							    lb_mnsidelocators
   6040 							    [index][li];
   6041 							write_lb = 1;
   6042 							mnslp->mnl_mnum =
   6043 							    md_getminor(newdev
   6044 							    [li]);
   6045 						} else if (mnslp->mnl_mnum !=
   6046 						    md_getminor(newdev[li])) {
   6047 							write_lb = 1;
   6048 							mnslp->mnl_mnum =
   6049 							    md_getminor(newdev
   6050 							    [li]);
   6051 						}
   6052 					} else {
   6053 						slp = &lbp->
   6054 						    lb_sidelocators[s->s_sideno]
   6055 						    [li];
   6056 						if (slp->l_mnum !=
   6057 						    md_getminor(newdev[li])) {
   6058 							write_lb = 1;
   6059 							slp->l_mnum =
   6060 							    md_getminor(newdev
   6061 							    [li]);
   6062 						}
   6063 					}
   6064 					name = ddi_major_to_name(md_getmajor(
   6065 					    newdev[li]));
   6066 					if (lbp->lb_flags & MDDB_MNSET)
   6067 						i = mnslp->mnl_drvnm_index;
   6068 					else
   6069 						i = slp->l_drvnm_index;
   6070 					if (strncmp(lbp->lb_drvnm[i].dn_data,
   6071 					    name, lbp->lb_drvnm[i].dn_len) !=
   6072 					    0) {
   6073 						/* Driver name has changed */
   6074 						len = strlen(name);
   6075 						/* Look for the driver name */
   6076 						for (i = 0; i < MDDB_DRVNMCNT;
   6077 						    i++) {
   6078 							if (lbp->lb_drvnm[i].
   6079 							    dn_len != len)
   6080 								continue;
   6081 							if (strncmp(lbp->
   6082 							    lb_drvnm[i].dn_data,
   6083 							    name, len) == 0)
   6084 								break;
   6085 						}
   6086 						/* Didn't find one, add it */
   6087 						if (i == MDDB_DRVNMCNT) {
   6088 							for (i = 0; i <
   6089 							    MDDB_DRVNMCNT;
   6090 							    i++) {
   6091 								if (lbp->
   6092 								    lb_drvnm[i].
   6093 								    dn_len == 0)
   6094 									break;
   6095 							}
   6096 							if (i ==
   6097 							    MDDB_DRVNMCNT) {
   6098 								cmn_err(CE_WARN,
   6099 								    "Unable to "
   6100 								    " update "
   6101 								    "driver "
   6102 								    " name for "
   6103 								    "dev:  "
   6104 								    "major = %d"
   6105 								    ", minor = "
   6106 								    "%d\n",
   6107 								    md_getmajor(
   6108 								    newdev[li]),
   6109 								    md_getminor(
   6110 								    newdev
   6111 								    [li]));
   6112 								continue;
   6113 							}
   6114 							(void) strncpy(lbp->
   6115 							    lb_drvnm[i].dn_data,
   6116 							    name, MD_MAXDRVNM);
   6117 							lbp->lb_drvnm[i].
   6118 							    dn_len = (uchar_t)
   6119 							    strlen(name);
   6120 						}
   6121 						/* Fill in the drvnm index */
   6122 						if (lbp->lb_flags &
   6123 						    MDDB_MNSET)
   6124 							mnslp->mnl_drvnm_index =
   6125 							    i;
   6126 						else
   6127 							slp->l_drvnm_index = i;
   6128 						write_lb = 1;
   6129 					}
   6130 					did_info->info_flags |=
   6131 					    MDDB_DID_UPDATED;
   6132 				}
   6133 			}
   6134 		}
   6135 	}
   6136 	kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB);
   6137 
   6138 	/*
   6139 	 * If locator block has been changed by get_mbs_n_lbs,
   6140 	 * by addition of new device id, by updated minor name or
   6141 	 * by updated driver name - write out locator block.
   6142 	 */
   6143 	if (write_lb) {
   6144 		rval = push_lb(s);
   6145 		(void) upd_med(s, "load_old_replicas(0)");
   6146 		if (rval)
   6147 			goto errout;
   6148 	}
   6149 
   6150 	/*
   6151 	 * If the tag was moved, allocated, or a BADTAG was seen for some other
   6152 	 * reason, then make sure tags are written to all the replicas.
   6153 	 * Data tags not supported on MN sets.
   6154 	 */
   6155 	if (!(md_get_setstatus(setno) & MD_SET_MNSET)) {
   6156 		if (! (lc = dt_alloc_if_needed(s))) {
   6157 			for (li = 0; li < lbp->lb_loccnt; li++) {
   6158 				lp = &lbp->lb_locators[li];
   6159 
   6160 				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
   6161 				    (lp->l_flags & MDDB_F_EMASTER))
   6162 					continue;
   6163 
   6164 				if (lp->l_flags & MDDB_F_BADTAG) {
   6165 					lc = 1;
   6166 					break;
   6167 				}
   6168 			}
   6169 		}
   6170 
   6171 		if (lc) {
   6172 			md_set_setstatus(setno, MD_SET_TAGDATA);
   6173 			md_clr_setstatus(setno, MD_SET_BADTAG);
   6174 			(void) selectreplicas(s, MDDB_SCANALL);
   6175 		}
   6176 	}
   6177 
   6178 errout:
   6179 
   6180 	/* Free extraneous rip components. */
   6181 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
   6182 		/* Get rid of lbp's and dtp's */
   6183 
   6184 		if (rip->ri_lbp != lbp) {
   6185 			if (rip->ri_dtp != (mddb_dt_t *)NULL) {
   6186 				kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES);
   6187 				rip->ri_dtp = (mddb_dt_t *)NULL;
   6188 			}
   6189 
   6190 			if (rip->ri_devid != (ddi_devid_t)NULL) {
   6191 				sz = (int)ddi_devid_sizeof(rip->ri_devid);
   6192 				kmem_free((caddr_t)rip->ri_devid, sz);
   6193 				rip->ri_devid = (ddi_devid_t)NULL;
   6194 			}
   6195 			if (rip->ri_old_devid != (ddi_devid_t)NULL) {
   6196 				sz = (int)ddi_devid_sizeof(rip->ri_old_devid);
   6197 				kmem_free((caddr_t)rip->ri_old_devid, sz);
   6198 				rip->ri_old_devid = (ddi_devid_t)NULL;
   6199 			}
   6200 
   6201 			if (rip->ri_lbp != (mddb_lb_t *)NULL) {
   6202 				mddb_devid_icp_free(&rip->ri_did_icp,
   6203 				    rip->ri_lbp);
   6204 
   6205 				kmem_free((caddr_t)rip->ri_lbp,
   6206 				    dbtob(rip->ri_lbp->lb_blkcnt));
   6207 				rip->ri_lbp = (mddb_lb_t *)NULL;
   6208 			}
   6209 		}
   6210 
   6211 		if (lbp != NULL) {
   6212 			for (li = 0; li < lbp->lb_loccnt; li++) {
   6213 				lp = &lbp->lb_locators[li];
   6214 				if (lp->l_flags & MDDB_F_DELETED)
   6215 					continue;
   6216 				if (rip->ri_dev == md_expldev(lp->l_dev) &&
   6217 				    rip->ri_blkno == lp->l_blkno)
   6218 					break;
   6219 			}
   6220 			if (li < lbp->lb_loccnt)
   6221 				continue;
   6222 		}
   6223 
   6224 		/*
   6225 		 * Get rid of mbp's:
   6226 		 *	if lbp, those out of lb_loccnt bounds
   6227 		 *	if !lbp,  all of them.
   6228 		 */
   6229 		if (rip->ri_mbip) {
   6230 			md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev);
   6231 			if (dev64 != NODEV64)
   6232 				mddb_devclose(dev64);
   6233 
   6234 			free_mbipp(&rip->ri_mbip);
   6235 		}
   6236 		/*
   6237 		 * Turn off MDDB_F_EMASTER flag in a diskset since diskset
   6238 		 * code always ends up calling ridev for all replicas
   6239 		 * before calling load_old_replicas.  ridev will reset
   6240 		 * MDDB_F_EMASTER flag if flag was due to unresolved devid.
   6241 		 */
   6242 		if (setno != MD_LOCAL_SET)
   6243 			rip->ri_flags &= ~MDDB_F_EMASTER;
   6244 	}
   6245 	return (retval);
   6246 }
   6247 
   6248 /*
   6249  * Given the devt from the md.conf info, get the devid for the device.
   6250  */
   6251 static void
   6252 lookup_db_devid(mddb_cfg_loc_t *cl)
   6253 {
   6254 	dev_t		ldev;
   6255 	ddi_devid_t	devid;
   6256 	char		*minor;
   6257 
   6258 	if (ddi_name_to_major(cl->l_driver) == (major_t)-1) {
   6259 		cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver);
   6260 		return;
   6261 	}
   6262 
   6263 	ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum);
   6264 	if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) {
   6265 		cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x",
   6266 		    cl->l_driver, cl->l_mnum);
   6267 		return;
   6268 	}
   6269 
   6270 	if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) {
   6271 		cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x",
   6272 		    cl->l_mnum);
   6273 		return;
   6274 	}
   6275 
   6276 	cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ;
   6277 	cl->l_devid_sz = (int)ddi_devid_sizeof(devid);
   6278 	cl->l_devid = (uint64_t)(uintptr_t)devid;
   6279 	(void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX);
   6280 
   6281 	kmem_free(minor, strlen(minor) + 1);
   6282 }
   6283 
   6284 /*
   6285  * grab driver name, minor, block and devid out of
   6286  * strings like "driver:minor:block:devid"
   6287  */
   6288 static int
   6289 parse_db_loc(
   6290 	char		*str,
   6291 	mddb_cfg_loc_t	*clp
   6292 )
   6293 {
   6294 	char		*p, *e;
   6295 	char		*minor_name;
   6296 	ddi_devid_t	ret_devid;
   6297 
   6298 	clp->l_dev = 0;
   6299 	p = clp->l_driver;
   6300 	e = p + sizeof (clp->l_driver) - 1;
   6301 	while ((*str != ':') && (*str != '\0') && (p < e))
   6302 		*p++ = *str++;
   6303 	*p = '\0';
   6304 	if (*str++ != ':')
   6305 		return (-1);
   6306 	clp->l_mnum = 0;
   6307 	while (ISNUM(*str)) {
   6308 		clp->l_mnum *= 10;
   6309 		clp->l_mnum += *str++ - '0';
   6310 	}
   6311 	if (*str++ != ':')
   6312 		return (-1);
   6313 	clp->l_blkno = 0;
   6314 	while (ISNUM(*str)) {
   6315 		clp->l_blkno *= 10;
   6316 		clp->l_blkno += *str++ - '0';
   6317 	}
   6318 	if (*str++ != ':')
   6319 		return (-1);
   6320 
   6321 	/*
   6322 	 * If the md_devid_destroy flag is set, ignore the device ids.
   6323 	 * This is only to used in a catastrophic failure case.  Examples
   6324 	 * would be where the device id of all drives in the system
   6325 	 * (especially the mirror'd root drives) had been changed
   6326 	 * by firmware upgrade or by a patch to an existing disk
   6327 	 * driver.  Another example would be in the case of non-unique
   6328 	 * device ids due to a bug.  The device id would be valid on
   6329 	 * the system, but would return the wrong dev_t.
   6330 	 */
   6331 	if (md_devid_destroy) {
   6332 		clp->l_devid_flags = 0;
   6333 		clp->l_devid = (uint64_t)NULL;
   6334 		clp->l_devid_sz = 0;
   6335 		clp->l_old_devid = (uint64_t)NULL;
   6336 		clp->l_old_devid_sz = 0;
   6337 		clp->l_minor_name[0] = '\0';
   6338 		return (0);
   6339 	}
   6340 
   6341 	if (ddi_devid_str_decode(str,
   6342 	    (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE)
   6343 		return (-1);
   6344 
   6345 	clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
   6346 	clp->l_devid_flags = 0;
   6347 	clp->l_old_devid = (uint64_t)NULL;
   6348 	clp->l_old_devid_sz = 0;
   6349 
   6350 	/* If no device id associated with device, just return */
   6351 	if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) {
   6352 		clp->l_devid_sz = 0;
   6353 		clp->l_minor_name[0] = '\0';
   6354 		if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 &&
   6355 		    md_keep_repl_state == 0) {
   6356 			/*
   6357 			 * No devid in md.conf; we're in recovery mode so
   6358 			 * lookup the devid for the device as specified by
   6359 			 * the devt in md.conf.
   6360 			 */
   6361 			lookup_db_devid(clp);
   6362 		}
   6363 		return (0);
   6364 	}
   6365 
   6366 	clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID |
   6367 	    MDDB_DEVID_SZ;
   6368 	clp->l_devid_sz = (int)ddi_devid_sizeof(
   6369 	    (ddi_devid_t)(uintptr_t)clp->l_devid);
   6370 	(void) strcpy(clp->l_minor_name, minor_name);
   6371 	kmem_free(minor_name, strlen(minor_name) + 1);
   6372 
   6373 	return (0);
   6374 }
   6375 
   6376 /*
   6377  * grab driver name, minor, and block out of
   6378  * strings like "driver:minor:block:devid driver:minor:block:devid ..."
   6379  */
   6380 static void
   6381 parse_db_string(
   6382 	char		*str
   6383 )
   6384 {
   6385 	char		*p, *e;
   6386 	mddb_cfg_loc_t	*cl;
   6387 	char		restore_space;
   6388 
   6389 	/* CSTYLED */
   6390 	cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
   6391 	for (p = str; (*p != '\0'); ) {
   6392 		for (; ((*p != '\0') && (ISWHITE(*p))); ++p)
   6393 			;
   6394 		if (*p == '\0')
   6395 			break;
   6396 		for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e)
   6397 			;
   6398 		/*
   6399 		 * Only give parse_db_loc 1 entry, so stuff a null into
   6400 		 * the string if we're not at the end.  We need to save this
   6401 		 * char and restore it after call.
   6402 		 */
   6403 		restore_space = '\0';
   6404 		if (*e != '\0') {
   6405 			restore_space = *e;
   6406 			*e = '\0';
   6407 		}
   6408 		if (parse_db_loc(p, cl) != 0) {
   6409 			cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p);
   6410 		} else {
   6411 			(void) ridev(
   6412 			    &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip,
   6413 			    cl, NULL, MDDB_F_PTCHED);
   6414 			if (cl->l_devid_flags & MDDB_DEVID_SPACE) {
   6415 				kmem_free((caddr_t)(uintptr_t)cl->l_devid,
   6416 				    cl->l_devid_sz);
   6417 			}
   6418 		}
   6419 		if (restore_space != '\0') {
   6420 			*e = restore_space;
   6421 		}
   6422 		p = e;
   6423 	}
   6424 	kmem_free(cl, sizeof (mddb_cfg_loc_t));
   6425 }
   6426 
   6427 /*
   6428  * grab database locations supplied by md.conf as properties
   6429  */
   6430 static void
   6431 parse_db_strings(void)
   6432 {
   6433 	int		bootlist_id;
   6434 	int		proplen;
   6435 	/*
   6436 	 * size of _bootlist_name should match uses of line and entry in
   6437 	 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c)
   6438 	 */
   6439 	char 		_bootlist_name[MDDB_BOOTLIST_MAX_LEN];
   6440 	char		*bootlist_name;
   6441 	caddr_t		prop;
   6442 
   6443 /*
   6444  * Step through the bootlist properties one at a time by forming the
   6445  * correct name, fetching the property, parsing the property and
   6446  * then freeing the memory.  If a property does not exist or returns
   6447  * some form of error just ignore it.  There is no guarantee that
   6448  * the properties will always exist in sequence, for example
   6449  * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with
   6450  * mddb_bootlist3 existing.
   6451  */
   6452 	bootlist_name = &_bootlist_name[0];
   6453 	for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) {
   6454 
   6455 		proplen = 0;
   6456 		(void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id);
   6457 
   6458 		if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo,
   6459 		    DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop,
   6460 		    &proplen) != DDI_PROP_SUCCESS)
   6461 			continue;
   6462 
   6463 		if (proplen <= 0)
   6464 			continue;
   6465 
   6466 		if (md_init_debug)
   6467 			cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop);
   6468 
   6469 		parse_db_string(prop);
   6470 		kmem_free(prop, proplen);
   6471 	}
   6472 }
   6473 
        /*
         * Initialize the in-core state database for set `setno'.
         *
         * mddb_lock and the set mutex are held for the whole call, with
         * single_thread_start()/single_thread_end() bracketing the work.
         * load_old_replicas() is tried first; if it succeeds nothing more is
         * done.  If it fails and MDDB_MUSTEXIST is set in flag, what was set
         * up here is torn down and an error is returned.  Otherwise a new,
         * empty locator block, locator names block, (optionally) device id
         * block and first directory block are built in core.
         *
         * Returns 0 on success (including when s_lbp is already set on entry),
         * MDDB_E_NOTNOW if the set has no s_db, and on a MDDB_MUSTEXIST
         * failure either the load_old_replicas() error (for MDDB_E_TAGDATA,
         * or when mddb_db_err_detail is set) or MDDB_E_NODB.
         */
   6474 static int
   6475 initit(
   6476 	set_t		setno,
   6477 	int		flag
   6478 )
   6479 {
   6480 	int		i;
   6481 	mddb_set_t	*s;
   6482 	mddb_lb_t	*lbp;		/* pointer to locator block */
   6483 	mddb_ln_t	*lnp;		/* pointer to locator names */
   6484 	mddb_db_t	*dbp;		/* pointer to directory block */
   6485 	mddb_did_blk_t	*did_blkp;	/* pointer to Device ID block */
   6486 	mddb_did_ic_t	*did_icp;	/* pointer to Device ID incore area */
   6487 	mddb_bf_t	*bfp;
   6488 	side_t		sideno;
   6489 	side_t		maxsides;
   6490 	mddb_block_t	lb_blkcnt;
   6491 	int		retval = 0;
   6492 	md_dev64_t	dev;
   6493 	mddb_mnlb_t	*mnlbp;
   6494 	int		devid_flag;
   6495 
   6496 	/* single thread's all loads/unloads of set's */
   6497 	mutex_enter(&mddb_lock);
   6498 	mutex_enter(SETMUTEX(setno));
   6499 
   6500 	if (((mddb_set_t *)md_set[setno].s_db) == NULL) {
   6501 		mutex_exit(SETMUTEX(setno));
   6502 		mutex_exit(&mddb_lock);
   6503 		return (MDDB_E_NOTNOW);
   6504 	}
   6505 
   6506 	s = (mddb_set_t *)md_set[setno].s_db;
   6507 
   6508 	single_thread_start(s);
   6509 
   6510 	/*
   6511 	 * init is already underway, block. Return success.
   6512 	 */
   6513 	if (s->s_lbp) {
   6514 		single_thread_end(s);
   6515 		mutex_exit(SETMUTEX(setno));
   6516 		mutex_exit(&mddb_lock);
   6517 		return (0);
   6518 	}
   6519 
   6520 	uniqtime32(&s->s_inittime);
   6521 
   6522 	/* grab database locations patched by /etc/system */
   6523 	if (setno == MD_LOCAL_SET)
   6524 		parse_db_strings();
   6525 
   6526 	s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc(
   6527 	    sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP);
   6528 
   6529 	s->s_zombie = 0;
   6530 	s->s_staledeletes = 0;
   6531 	s->s_optcmtcnt = 0;
   6532 	s->s_opthavelck = 0;
   6533 	s->s_optwantlck = 0;
   6534 	s->s_optwaiterr = 0;
   6535 	s->s_opthungerr = 0;
   6536 
   6537 	/*
   6538 	 * KEEPTAG can never be set for a MN diskset since no tags are
   6539 	 * allowed to be stored in a MN diskset.  No way to check
   6540 	 * if this is a MN diskset or not at this point since the mddb
   6541 	 * hasn't been read in from disk yet.  (flag will only have
   6542 	 * MULTINODE bit set if a new set is being created.)
   6543 	 */
   6544 	if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG))
   6545 		dt_setup(s, NULL);
   6546 
   6547 	md_clr_setstatus(s->s_setno, MD_SET_TOOFEW);
   6548 
        	/*
        	 * Allocate mddb_maxbufheaders buffer headers and hand each one to
        	 * freebuffer().
        	 */
   6549 	for (i = 0; i <	mddb_maxbufheaders; i++) {
   6550 		bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP);
   6551 		sema_init(&bfp->bf_buf.b_io, 0, NULL,
   6552 		    SEMA_DEFAULT, NULL);
   6553 		sema_init(&bfp->bf_buf.b_sem, 0, NULL,
   6554 		    SEMA_DEFAULT, NULL);
   6555 		bfp->bf_buf.b_offset = -1;
   6556 		freebuffer(s, bfp);
   6557 	}
   6558 
   6559 	retval = load_old_replicas(s, flag);
   6560 	/* If 0 return value - success */
   6561 	if (! retval) {
   6562 		single_thread_end(s);
   6563 		mutex_exit(SETMUTEX(setno));
   6564 		mutex_exit(&mddb_lock);
   6565 		return (0);
   6566 	}
   6567 
   6568 	/*
   6569 	 * If here, then the load_old_replicas() failed
   6570 	 */
   6571 
   6572 
   6573 	/* If the database was supposed to exist. */
   6574 	if (flag & MDDB_MUSTEXIST) {
        		/*
        		 * Tear down what was set up above and return an error.
        		 *
        		 * NOTE(review): s->s_lbp is dereferenced below for the
        		 * s_mbiarray entries and for the s_lnp size before it is
        		 * tested against NULL further down - confirm those fields
        		 * can only be populated once s_lbp is set.
        		 */
   6575 		if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) {
   6576 			for (i = 0; i < mddb_maxcopies;	 i++) {
   6577 				if (! s->s_mbiarray[i])
   6578 					continue;
   6579 				dev = md_expldev(
   6580 				    s->s_lbp->lb_locators[i].l_dev);
   6581 				dev = md_xlate_targ_2_mini(dev);
   6582 				if (dev != NODEV64)
   6583 					mddb_devclose(dev);
   6584 
   6585 				free_mbipp(&s->s_mbiarray[i]);
   6586 			}
   6587 
   6588 			kmem_free((caddr_t)s->s_mbiarray,
   6589 			    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
   6590 			s->s_mbiarray = NULL;
   6591 		}
   6592 
   6593 		if (s->s_lnp != (mddb_ln_t *)NULL) {
   6594 			kmem_free((caddr_t)s->s_lnp,
   6595 			    dbtob(s->s_lbp->lb_lnblkcnt));
   6596 			s->s_lnp = (mddb_ln_t *)NULL;
   6597 		}
   6598 
   6599 		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
   6600 
   6601 		if (s->s_lbp != (mddb_lb_t *)NULL) {
   6602 			kmem_free((caddr_t)s->s_lbp,
   6603 			    dbtob(s->s_lbp->lb_blkcnt));
   6604 			s->s_lbp = (mddb_lb_t *)NULL;
   6605 		}
   6606 
        		/*
        		 * Free every buffer header allocbuffer() will still give
        		 * back without sleeping.
        		 */
   6607 		while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
   6608 			kmem_free((caddr_t)bfp, sizeof (*bfp));
   6609 
   6610 		single_thread_end(s);
   6611 		mutex_exit(SETMUTEX(setno));
   6612 		mutex_exit(&mddb_lock);
   6613 
   6614 		if (retval == MDDB_E_TAGDATA)
   6615 			return (retval);
   6616 
   6617 		/* Want a bit more detailed error messages */
   6618 		if (mddb_db_err_detail)
   6619 			return (retval);
   6620 
   6621 		return (MDDB_E_NODB);
   6622 	}
   6623 
   6624 
   6625 	/*
   6626 	 * MDDB_NOOLDOK set - Creating a new database, so do
   6627 	 * more initialization.
   6628 	 */
   6629 
        	/* Locator block size depends on local set / diskset / MN diskset */
   6630 	lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
   6631 	    MDDB_LOCAL_LBCNT : MDDB_LBCNT);
   6632 	if (flag & MDDB_MULTINODE) {
   6633 		lb_blkcnt = MDDB_MNLBCNT;
   6634 	}
   6635 
   6636 	if (s->s_lbp == NULL)
   6637 		s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP);
   6638 	lbp = s->s_lbp;
   6639 
   6640 	bzero((caddr_t)lbp, dbtob(lb_blkcnt));
   6641 	lbp->lb_setno = setno;
   6642 	lbp->lb_magic = MDDB_MAGIC_LB;
   6643 	if (flag & MDDB_MULTINODE) {
   6644 		lbp->lb_revision = MDDB_REV_MNLB;
   6645 	} else {
   6646 		lbp->lb_revision = MDDB_REV_LB;
   6647 	}
   6648 	lbp->lb_inittime = s->s_inittime;
        	/* Mark every side locator slot as unused (minor = NODEV32) */
   6649 	if (flag & MDDB_MULTINODE) {
   6650 		mnlbp = (mddb_mnlb_t *)lbp;
   6651 		for (i = 0; i < MDDB_NLB; i++) {
   6652 			for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) {
   6653 				mddb_mnsidelocator_t	*mnslp;
   6654 				mnslp = &mnlbp->lb_mnsidelocators[sideno][i];
   6655 				mnslp->mnl_mnum = NODEV32;
   6656 				mnslp->mnl_sideno = 0;
   6657 				mnslp->mnl_drvnm_index = 0;
   6658 			}
   6659 		}
   6660 	} else {
   6661 		maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES);
   6662 		for (i = 0; i < MDDB_NLB; i++) {
   6663 			for (sideno = 0; sideno < maxsides; sideno++) {
   6664 				mddb_sidelocator_t	*slp;
   6665 				slp = &lbp->lb_sidelocators[sideno][i];
   6666 				slp->l_mnum = NODEV32;
   6667 			}
   6668 		}
   6669 	}
   6670 	lbp->lb_blkcnt = lb_blkcnt;
   6671 
   6672 	/* lb starts on block 0 */
   6673 	/* locator names starts after locator block */
   6674 	lbp->lb_lnfirstblk = lb_blkcnt;
   6675 	if (flag & MDDB_MULTINODE) {
   6676 		lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT;
   6677 	} else {
   6678 		lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
   6679 		    MDDB_LOCAL_LNCNT : MDDB_LNCNT);
   6680 	}
   6681 
   6682 	if (flag & MDDB_MULTINODE) {
   6683 		/* Creating a multinode diskset */
   6684 		md_set_setstatus(setno, MD_SET_MNSET);
   6685 		lbp->lb_flags |= MDDB_MNSET;
   6686 	}
   6687 
   6688 	/* Data portion of mddb located after locator names */
   6689 	lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt;
   6690 
   6691 	/* the btodb that follows is converting the directory block size */
   6692 	/* Data tag part of mddb located after first block of mddb data */
   6693 	lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk +
   6694 	    btodb(MDDB_BSIZE));
   6695 	/* Data tags are not used in MN diskset - so set count to 0 */
   6696 	if (flag & MDDB_MULTINODE)
   6697 		lbp->lb_dtblkcnt = (mddb_block_t)0;
   6698 	else
   6699 		lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS;
   6700 
   6701 
        	/* Build an empty in-core locator names block */
   6702 	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP);
   6703 	lnp->ln_magic = MDDB_MAGIC_LN;
   6704 	if (flag & MDDB_MULTINODE) {
   6705 		lnp->ln_revision = MDDB_REV_MNLN;
   6706 	} else {
   6707 		lnp->ln_revision = MDDB_REV_LN;
   6708 	}
   6709 	s->s_lnp = lnp;
   6710 
   6711 	/*
   6712 	 * Set up Device ID portion of Locator Block.
   6713 	 * Do not set locator to device id style if
   6714 	 * md_devid_destroy is 1 and md_keep_repl_state is 1
   6715 	 * (destroy all device id data and keep replica in
   6716 	 * non device id mode).
   6717 	 *
   6718 	 * This is logically equivalent to set locator to
   6719 	 * device id style if md_devid_destroy is 0 or
   6720 	 * md_keep_repl_state is 0.
   6721 	 *
   6722 	 * In SunCluster environment, device id mode is disabled
   6723 	 * which means diskset will be run in non-devid mode.  For
   6724 	 * localset, the behavior will remain intact and run in
   6725 	 * device id mode.
   6726 	 *
   6727 	 * In multinode diskset devids are turned off.
   6728 	 */
   6729 	devid_flag = 1;
   6730 	if (cluster_bootflags & CLUSTER_CONFIGURED)
   6731 		if (setno != MD_LOCAL_SET)
   6732 			devid_flag = 0;
   6733 	if (flag & MDDB_MULTINODE)
   6734 		devid_flag = 0;
   6735 	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
   6736 		devid_flag = 0;
   6737 	/*
   6738 	 * if we weren't devid style before and md_keep_repl_state=1
   6739 	 * we need to stay non-devid
   6740 	 */
        	/*
        	 * NOTE(review): lbp was bzero'd above and only MDDB_MNSET has been
        	 * or'ed into lb_flags since, so the MDDB_DEVID_STYLE test below
        	 * looks like it is always true here and the condition reduces to
        	 * md_keep_repl_state == 1 - confirm this is the intended behavior.
        	 */
   6741 	if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) &&
   6742 	    (md_keep_repl_state == 1))
   6743 		devid_flag = 0;
   6744 	if (devid_flag) {
        		/* Device id blocks follow the data tag blocks */
   6745 		lbp->lb_didfirstblk = lbp->lb_dtfirstblk +
   6746 		    lbp->lb_dtblkcnt;
   6747 		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
   6748 		lbp->lb_flags |= MDDB_DEVID_STYLE;
   6749 
   6750 		did_icp = (mddb_did_ic_t *)kmem_zalloc
   6751 		    (sizeof (mddb_did_ic_t), KM_SLEEP);
   6752 		did_blkp = (mddb_did_blk_t *)
   6753 		    kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
   6754 		did_blkp->blk_magic = MDDB_MAGIC_DI;
   6755 		did_blkp->blk_revision = MDDB_REV_DI;
   6756 		did_icp->did_ic_blkp = did_blkp;
   6757 		s->s_did_icp = did_icp;
   6758 	}
   6759 
   6760 	setidentifier(s, &lbp->lb_ident);
   6761 	uniqtime32(&lbp->lb_timestamp);
        	/* Build the first, empty directory block at lb_dbfirstblk */
   6762 	dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
   6763 	dbp->db_magic = MDDB_MAGIC_DB;
   6764 	dbp->db_revision = MDDB_REV_DB;
   6765 	uniqtime32(&dbp->db_timestamp);
   6766 	dbp->db_nextblk = 0;
   6767 	dbp->db_firstentry = NULL;
   6768 	dbp->db_blknum = lbp->lb_dbfirstblk;
   6769 	dbp->db_recsum = MDDB_GLOBAL_XOR;
   6770 	s->s_dbp = dbp;
   6771 	single_thread_end(s);
   6772 	mutex_exit(SETMUTEX(setno));
   6773 	mutex_exit(&mddb_lock);
   6774 	return (0);
   6775 }
   6776 
   6777 mddb_set_t *
   6778 mddb_setenter(
   6779 	set_t		setno,
   6780 	int		flag,
   6781 	int		*errorcodep
   6782 )
   6783 {
   6784 	mddb_set_t	*s;
   6785 	int		err = 0;
   6786 	size_t		sz = sizeof (void *) * MD_MAXUNITS;
   6787 
   6788 	mutex_enter(SETMUTEX(setno));
   6789 	if (! md_set[setno].s_db) {
   6790 		mutex_exit(SETMUTEX(setno));
   6791 		if (errorcodep != NULL)
   6792 			*errorcodep = MDDB_E_NOTOWNER;
   6793 		return (NULL);
   6794 	}
   6795 
   6796 	/* Allocate s_un and s_ui arrays if not already present. */
   6797 	if (md_set[setno].s_un == NULL) {
   6798 		md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP);
   6799 		if (md_set[setno].s_un == NULL) {
   6800 			mutex_exit(SETMUTEX(setno));
   6801 			if (errorcodep != NULL)
   6802 				*errorcodep = MDDB_E_NOTOWNER;
   6803 			return (NULL);
   6804 		}
   6805 	}
   6806 	if (md_set[setno].s_ui == NULL) {
   6807 		md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP);
   6808 		if (md_set[setno].s_ui == NULL) {
   6809 			mutex_exit(&md_set[setno].s_dbmx);
   6810 			kmem_free(md_set[setno].s_un, sz);
   6811 			md_set[setno].s_un = NULL;
   6812 			if (errorcodep != NULL)
   6813 				*errorcodep = MDDB_E_NOTOWNER;
   6814 			return (NULL);
   6815 		}
   6816 	}
   6817 	s = (mddb_set_t *)md_set[setno].s_db;
   6818 	if (s->s_lbp)
   6819 		return (s);
   6820 
   6821 	if (flag & MDDB_NOINIT)
   6822 		return (s);
   6823 
   6824 	/*
   6825 	 * Release the set mutex - it will be acquired and released in
   6826 	 * initit after acquiring the mddb_lock.  This is done to assure
   6827 	 * that mutexes are always acquired in the same order to prevent
   6828 	 * possible deadlock
   6829 	 */
   6830 	mutex_exit(SETMUTEX(setno));
   6831 
   6832 	if ((err = initit(setno, flag)) != 0) {
   6833 		if (errorcodep != NULL)
   6834 			*errorcodep = err;
   6835 		return (NULL);
   6836 	}
   6837 
   6838 	mutex_enter(SETMUTEX(setno));
   6839 	return ((mddb_set_t *)md_set[setno].s_db);
   6840 }
   6841 
   6842 /*
   6843  * Release the set lock for a given set.
   6844  *
   6845  * In a MN diskset, this routine may send messages to the rpc.mdcommd
   6846  * in order to have the slave nodes re-parse parts of the mddb.
   6847  * Messages are only sent if the global ioctl lock is not held.
   6848  *
   6849  * With the introduction of multi-threaded ioctls, there is no way
   6850  * to determine which thread(s) are holding the ioctl lock.  So, if
   6851  * the ioctl lock is held (by process X) process X will send the
   6852  * messages to the slave nodes when process X releases the ioctl lock.
   6853  */
   6854 void
   6855 mddb_setexit(
   6856 	mddb_set_t	*s
   6857 )
   6858 {
   6859 	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
   6860 	md_mn_kresult_t			*kresult;
   6861 	mddb_lb_t			*lbp = s->s_lbp;
   6862 	int				i;
   6863 	int				rval = 1;
   6864 
   6865 	/*
   6866 	 * If not a MN diskset OR
   6867 	 * a MN diskset but this node isn't master,
   6868 	 * then release the mutex.
   6869 	 */
   6870 	if (!(MD_MNSET_SETNO(s->s_setno)) ||
   6871 	    ((MD_MNSET_SETNO(s->s_setno)) &&
   6872 	    (!md_set[s->s_setno].s_am_i_master))) {
   6873 		mutex_exit(SETMUTEX(s->s_setno));
   6874 		return;
   6875 	}
   6876 
   6877 	/*
   6878 	 * If global ioctl lock is held, then send no messages,
   6879 	 * just release mutex and return.
   6880 	 *
   6881 	 */
   6882 	if (md_status & MD_GBL_IOCTL_LOCK) {
   6883 		mutex_exit(SETMUTEX(s->s_setno));
   6884 		return;
   6885 	}
   6886 
   6887 	/*
   6888 	 * This thread is not holding the ioctl lock, so drop the set
   6889 	 * lock, send messages to slave nodes to reparse portions
   6890 	 * of the mddb and return.
   6891 	 *
   6892 	 * If the block parse flag is set, do not send parse messages.
   6893 	 * This flag is set when master is adding a new mddb that would
   6894 	 * cause parse messages to be sent to the slaves, but the slaves
   6895 	 * don't have knowledge of the new mddb yet since the mddb add
   6896 	 * operation hasn't been run on the slave nodes yet.  When the
   6897 	 * master unblocks the parse flag, the parse messages will be
   6898 	 * generated.
   6899 	 *
   6900 	 * If s_mn_parseflags_sending is non-zero, then another thread
   6901 	 * is already currently sending a parse message, so just release
   6902 	 * the mutex and return.  If an mddb change occurred that results
   6903 	 * in a parse message to be generated, the thread that is currently
   6904 	 * sending a parse message would generate the additional parse message.
   6905 	 *
   6906 	 * If s_mn_parseflags_sending is zero and parsing is not blocked,
   6907 	 * then loop until s_mn_parseflags is 0 (until there are no more
   6908 	 * messages to send).
   6909 	 * While s_mn_parseflags is non-zero,
   6910 	 * 	put snapshot of parse_flags in s_mn_parseflags_sending
   6911 	 * 	set s_mn_parseflags to zero
   6912 	 *	release mutex
   6913 	 *	send message
   6914 	 *	re-grab mutex
   6915 	 *	set s_mn_parseflags_sending to zero
   6916 	 */
   6917 	mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP);
   6918 	while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
   6919 	    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
   6920 	    (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) {
   6921 		/* Grab snapshot of parse flags */
   6922 		s->s_mn_parseflags_sending = s->s_mn_parseflags;
   6923 		s->s_mn_parseflags = 0;
   6924 
   6925 		mutex_exit(SETMUTEX(s->s_setno));
   6926 
   6927 		/*
   6928 		 * Send the message to the slaves to re-parse
   6929 		 * the indicated portions of the mddb. Send the status
   6930 		 * of the 50 mddbs in this set so that slaves know which
   6931 		 * mddbs that the master node thinks are 'good'.
   6932 		 * Otherwise, slave may reparse, but from wrong replica.
   6933 		 */
   6934 		mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending;
   6935 		for (i = 0; i < MDDB_NLB; i++) {
   6936 			mddb_parse_msg->msg_lb_flags[i] =
   6937 			    lbp->lb_locators[i].l_flags;
   6938 		}
   6939 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
   6940 		while (rval != 0) {
   6941 			rval = mdmn_ksend_message(s->s_setno,
   6942 			    MD_MN_MSG_MDDB_PARSE, 0, 0,
   6943 			    (char *)mddb_parse_msg,
   6944 			    sizeof (md_mn_msg_mddb_parse_t), kresult);
   6945 			if (rval != 0)
   6946 				cmn_err(CE_WARN, "mddb_setexit: Unable to send "
   6947 				    "mddb update message to other nodes in "
   6948 				    "diskset %s\n", s->s_setname);
   6949 		}
   6950 		kmem_free(kresult, sizeof (md_mn_kresult_t));
   6951 
   6952 		/*
   6953 		 * Re-grab mutex to clear sending field and to
   6954 		 * see if another parse message needs to be generated.
   6955 		 */
   6956 		mutex_enter(SETMUTEX(s->s_setno));
   6957 		s->s_mn_parseflags_sending = 0;
   6958 	}
   6959 	kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
   6960 	mutex_exit(SETMUTEX(s->s_setno));
   6961 }
   6962 
   6963 static void
   6964 mddb_setexit_no_parse(
   6965 	mddb_set_t	*s
   6966 )
   6967 {
   6968 	mutex_exit(SETMUTEX(s->s_setno));
   6969 }
   6970 
   6971 uint_t
   6972 mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt)
   6973 {
   6974 	uint_t			li;
   6975 	mddb_lb_t		*lbp = s->s_lbp;
   6976 	mddb_locator_t		*lp;
   6977 	ddi_devid_t		ret_devid;
   6978 	uint_t			devid_len;
   6979 	dev_t			ddi_dev;
   6980 	mddb_did_ic_t		*did_icp;
   6981 	mddb_did_blk_t		*did_blkp;
   6982 	char			*minor_name;
   6983 	size_t			sz;
   6984 	int			retval;
   6985 	int			err;
   6986 	md_dev64_t		dev64; /* tmp var to make code look better */
   6987 
   6988 
   6989 	/* Need disk block(s) to hold mddb_did_blk_t */
   6990 	*blk_cnt = MDDB_DID_BLOCKS;
   6991 
   6992 	if (doit) {
   6993 		/*
   6994 		 * Alloc mddb_did_blk_t disk block and fill in header area.
   6995 		 * Don't fill in did magic number until end of routine so
   6996 		 * if machine panics in the middle of conversion, the
   6997 		 * device id information will be thrown away at the
   6998 		 * next snarfing of this set.
   6999 		 * Need to set DEVID_STYLE so that mddb_devid_add will
   7000 		 * function properly.
   7001 		 */
   7002 		/* grab the mutex */
   7003 		if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) {
   7004 			return (1);
   7005 		}
   7006 		single_thread_start(s);
   7007 		lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS);
   7008 		if (lbp->lb_didfirstblk == 0) {
   7009 			single_thread_end(s);
   7010 			mddb_setexit(s);
   7011 			return (1);
   7012 		}
   7013 		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
   7014 		did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t),
   7015 		    KM_SLEEP);
   7016 		did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES,
   7017 		    KM_SLEEP);
   7018 
   7019 		did_blkp->blk_revision = MDDB_REV_DI;
   7020 		did_icp->did_ic_blkp = did_blkp;
   7021 		s->s_did_icp = did_icp;
   7022 		lbp->lb_flags |= MDDB_DEVID_STYLE;
   7023 	}
   7024 
   7025 	/* Fill in information in mddb_did_info_t array */
   7026 	for (li = 0; li < lbp->lb_loccnt; li++) {
   7027 		lp = &lbp->lb_locators[li];
   7028 		if (lp->l_flags & MDDB_F_DELETED)
   7029 			continue;
   7030 
   7031 		dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
   7032 		ddi_dev = md_dev64_to_dev(dev64);
   7033 		if (ddi_dev == NODEV) {
   7034 			/*
   7035 			 * No translation available for replica.
   7036 			 * Could fail conversion to device id replica,
   7037 			 * but instead will just continue with next
   7038 			 * replica in list.
   7039 			 */
   7040 			continue;
   7041 		}
   7042 		if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) {
   7043 			/*
   7044 			 * Just count each devid as at least 1 block.  This
   7045 			 * is conservative since several device id's may fit
   7046 			 * into 1 disk block, but it's better to overestimate
   7047 			 * the number of blocks needed than to underestimate.
   7048 			 */
   7049 			devid_len = (int)ddi_devid_sizeof(ret_devid);
   7050 			*blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1));
   7051 			if (doit) {
   7052 				if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK,
   7053 				    &minor_name) == DDI_SUCCESS) {
   7054 					if (mddb_devid_add(s, li, ret_devid,
   7055 					    minor_name)) {
   7056 						cmn_err(CE_WARN,
   7057 						    "Not enough space in metadb"
   7058 						    " to add device id for"
   7059 						    "  dev: major = %d, "
   7060 						    "minor = %d\n",
   7061 						    getmajor(ddi_dev),
   7062 						    getminor(ddi_dev));
   7063 					}
   7064 					sz = strlen(minor_name) + 1;
   7065 					kmem_free(minor_name, sz);
   7066 				}
   7067 			}
   7068 			ddi_devid_free(ret_devid);
   7069 		}
   7070 	}
   7071 
   7072 	if (doit) {
   7073 		did_blkp->blk_magic = MDDB_MAGIC_DI;
   7074 		retval = push_lb(s);
   7075 		(void) upd_med(s, "mddb_lb_did_convert(0)");
   7076 		single_thread_end(s);
   7077 		mddb_setexit(s);
   7078 		if (retval != 0)
   7079 			return (1);
   7080 	}
   7081 
   7082 	return (0);
   7083 }
   7084 
   7085 static mddb_set_t *
   7086 init_set(
   7087 	mddb_config_t	*cp,
   7088 	int		flag,
   7089 	int		*errp
   7090 )
   7091 {
   7092 	mddb_set_t	*s;
   7093 	char		*setname = NULL;
   7094 	set_t		setno = MD_LOCAL_SET;
   7095 	side_t		sideno = 0;
   7096 	struct timeval32 *created = NULL;
   7097 
   7098 	if (cp != NULL) {
   7099 		setname = cp->c_setname;
   7100 		setno = cp->c_setno;
   7101 		sideno = cp->c_sideno;
   7102 		created = &cp->c_timestamp;
   7103 	}
   7104 
   7105 	if (setno >= MD_MAXSETS)
   7106 		return ((mddb_set_t *)NULL);
   7107 
   7108 	if (md_set[setno].s_db)
   7109 		return (mddb_setenter(setno, flag, errp));
   7110 
   7111 	s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP);
   7112 
   7113 	cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL);
   7114 	cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL);
   7115 	cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL);
   7116 	cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL);
   7117 	cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL);
   7118 
   7119 	s->s_setno = setno;
   7120 	s->s_sideno = sideno;
   7121 	if (setno == MD_LOCAL_SET) {
   7122 		(void) snprintf(s->s_ident.serial, sizeof (s->s_ident.serial),
   7123 		    "%u", zone_get_hostid(NULL));
   7124 	} else {
   7125 		s->s_ident.createtime = *created;
   7126 		s->s_setname = (char *)kmem_alloc(strlen(setname) + 1,
   7127 		    KM_SLEEP);
   7128 		(void) strcpy(s->s_setname, setname);
   7129 	}
   7130 
   7131 	/* have a config struct,  copy mediator information */
   7132 	if (cp != NULL)
   7133 		s->s_med = cp->c_med;		/* structure assignment */
   7134 
   7135 	md_set[setno].s_db = (void *) s;
   7136 
   7137 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64);
   7138 
   7139 	return (mddb_setenter(setno, flag, errp));
   7140 }
   7141 
   7142 void
   7143 mddb_unload_set(
   7144 	set_t		setno
   7145 )
   7146 {
   7147 
   7148 	mddb_set_t	*s;
   7149 	mddb_db_t	*dbp, *adbp = NULL;
   7150 	mddb_de_ic_t	*dep, *dep2;
   7151 	mddb_bf_t	*bfp;
   7152 	int		i;
   7153 	md_dev64_t	dev;
   7154 
   7155 	if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL)
   7156 		return;
   7157 
   7158 	single_thread_start(s);
   7159 
   7160 	s->s_opthavequeuinglck = 0;
   7161 	s->s_optwantqueuinglck = 0;
   7162 
   7163 	for (dbp = s->s_dbp; dbp != 0; dbp = adbp) {
   7164 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) {
   7165 			if (dep->de_rb_userdata != NULL) {
   7166 				if (dep->de_icreqsize)
   7167 					kmem_free(dep->de_rb_userdata_ic,
   7168 					    dep->de_icreqsize);
   7169 				else
   7170 					kmem_free(dep->de_rb_userdata,
   7171 					    dep->de_reqsize);
   7172 			}
   7173 			kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
   7174 			dep2 = dep->de_next;
   7175 			kmem_free((caddr_t)dep, sizeofde(dep));
   7176 		}
   7177 		adbp = dbp->db_next;
   7178 		kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
   7179 	}
   7180 	s->s_dbp = (mddb_db_t *)NULL;
   7181 
   7182 	free_rip(&s->s_rip);
   7183 
   7184 	for (i = 0; i < mddb_maxcopies;	 i++) {
   7185 		if (! s->s_mbiarray)
   7186 			break;
   7187 
   7188 		if (! s->s_mbiarray[i])
   7189 			continue;
   7190 
   7191 		dev = md_expldev(s->s_lbp->lb_locators[i].l_dev);
   7192 		dev = md_xlate_targ_2_mini(dev);
   7193 		if (dev != NODEV64)
   7194 			mddb_devclose(dev);
   7195 
   7196 		free_mbipp(&s->s_mbiarray[i]);
   7197 	}
   7198 
   7199 	if (s->s_mbiarray) {
   7200 		kmem_free((caddr_t)s->s_mbiarray,
   7201 		    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
   7202 		s->s_mbiarray = (mddb_mb_ic_t **)NULL;
   7203 	}
   7204 
   7205 	if (s->s_lnp) {
   7206 		kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt));
   7207 		s->s_lnp = (mddb_ln_t *)NULL;
   7208 	}
   7209 
   7210 	if (s->s_lbp) {
   7211 		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
   7212 		kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt));
   7213 		s->s_lbp = (mddb_lb_t *)NULL;
   7214 	}
   7215 
   7216 	if (s->s_freebitmap) {
   7217 		kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
   7218 		s->s_freebitmap = NULL;
   7219 		s->s_freebitmapsize = 0;
   7220 	}
   7221 
   7222 	while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
   7223 		kmem_free((caddr_t)bfp, sizeof (*bfp));
   7224 
   7225 	if (s->s_databuffer_size) {
   7226 		kmem_free(s->s_databuffer, s->s_databuffer_size);
   7227 		s->s_databuffer_size = 0;
   7228 	}
   7229 
   7230 	if (s->s_setname != NULL)
   7231 		kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1);
   7232 
   7233 	/* Data tags not supported on MN sets. */
   7234 	if (!(md_get_setstatus(setno) & MD_SET_MNSET))
   7235 		dtl_freel(&s->s_dtlp);
   7236 
   7237 	md_set[setno].s_db = NULL;
   7238 	ASSERT(s->s_singlelockwanted == 0);
   7239 	kmem_free(s, sizeof (mddb_set_t));
   7240 
   7241 	/* Take care of things setup in the md_set array */
   7242 	if (! (md_get_setstatus(setno) & MD_SET_KEEPTAG)) {
   7243 		if (md_set[setno].s_dtp) {
   7244 			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
   7245 			md_set[setno].s_dtp = NULL;
   7246 		}
   7247 	}
   7248 
   7249 	md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
   7250 	    MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE |
   7251 	    MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET |
   7252 	    MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC |
   7253 	    MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
   7254 
   7255 	mutex_exit(SETMUTEX(setno));
   7256 }
   7257 
   7258 /*
   7259  * returns 0 if name can be put into locator block
   7260  * returns 1 if locator block prefixes are all used
   7261  *
   7262  * Takes splitname (suffix, prefix, sideno) and
   7263  * stores it in the locator name structure.
   7264  * For traditional diskset, the sideno is the index into the suffixes
   7265  * array in the locator name structure.
   7266  * For the MN diskset, the sideno is the nodeid which can be any number,
   7267  * so the index passed in is the index into the mnsuffixes array
   7268  * in the locator structure.  This index was computed by the
   7269  * routine checklocator which basically checked the locator block
   7270  * mnside locator structure.
   7271  */
   7272 static int
   7273 splitname2locatorblock(
   7274 	md_splitname	*spn,
   7275 	mddb_ln_t	*lnp,
   7276 	int		li,
   7277 	side_t		sideno,
   7278 	int		index
   7279 )
   7280 {
   7281 	uchar_t			i;
   7282 	md_name_suffix		*sn;
   7283 	md_mnname_suffix_t	*mnsn;
   7284 	mddb_mnln_t		*mnlnp;
   7285 
   7286 	for (i = 0; i < MDDB_PREFIXCNT; i++) {
   7287 		if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len)
   7288 			continue;
   7289 		if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data,
   7290 		    SPN_PREFIX(spn).pre_len) == 0)
   7291 			break;
   7292 	}
   7293 	if (i == MDDB_PREFIXCNT) {
   7294 		for (i = 0; i < MDDB_PREFIXCNT; i++) {
   7295 			if (lnp->ln_prefixes[i].pre_len == 0)
   7296 				break;
   7297 		}
   7298 		if (i == MDDB_PREFIXCNT)
   7299 			return (1);
   7300 		bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data,
   7301 		    SPN_PREFIX(spn).pre_len);
   7302 		lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len;
   7303 	}
   7304 
   7305 	if (lnp->ln_revision == MDDB_REV_MNLN) {
   7306 		/* If a MN diskset, use index */
   7307 		mnlnp = (mddb_mnln_t *)lnp;
   7308 		mnsn = &mnlnp->ln_mnsuffixes[index][li];
   7309 		mnsn->mn_ln_sideno = sideno;
   7310 		mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len;
   7311 		mnsn->mn_ln_suffix.suf_prefix = i;
   7312 		bcopy(SPN_SUFFIX(spn).suf_data,
   7313 		    mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len);
   7314 	} else {
   7315 		sn = &lnp->ln_suffixes[sideno][li];
   7316 		sn->suf_len = SPN_SUFFIX(spn).suf_len;
   7317 		sn->suf_prefix = i;
   7318 		bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data,
   7319 		    SPN_SUFFIX(spn).suf_len);
   7320 	}
   7321 	return (0);
   7322 }
   7323 
   7324 /*
   7325  * Find the locator name for the given sideno and convert the locator name
   7326  * information into a splitname structure.
   7327  */
   7328 void
   7329 mddb_locatorblock2splitname(
   7330 	mddb_ln_t	*lnp,
   7331 	int		li,
   7332 	side_t		sideno,
   7333 	md_splitname	*spn
   7334 )
   7335 {
   7336 	int			iprefix;
   7337 	md_name_suffix		*sn;
   7338 	md_mnname_suffix_t	*mnsn;
   7339 	int			i;
   7340 	mddb_mnln_t		*mnlnp;
   7341 
   7342 	if (lnp->ln_revision == MDDB_REV_MNLN) {
   7343 		mnlnp = (mddb_mnln_t *)lnp;
   7344 		for (i = 0; i < MD_MNMAXSIDES; i++) {
   7345 			mnsn = &mnlnp->ln_mnsuffixes[i][li];
   7346 			if (mnsn->mn_ln_sideno == sideno)
   7347 				break;
   7348 		}
   7349 		if (i == MD_MNMAXSIDES)
   7350 			return;
   7351 
   7352 		SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len;
   7353 		bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data,
   7354 		    SPN_SUFFIX(spn).suf_len);
   7355 		iprefix = mnsn->mn_ln_suffix.suf_prefix;
   7356 	} else {
   7357 		sn = &lnp->ln_suffixes[sideno][li];
   7358 		SPN_SUFFIX(spn).suf_len = sn->suf_len;
   7359 		bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data,
   7360 		    SPN_SUFFIX(spn).suf_len);
   7361 		iprefix = sn->suf_prefix;
   7362 	}
   7363 	SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len;
   7364 	bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data,
   7365 	    SPN_PREFIX(spn).pre_len);
   7366 }
   7367 
   7368 static int
   7369 getdeldev(
   7370 	mddb_config_t	*cp,
   7371 	int		command,
   7372 	md_error_t	*ep
   7373 )
   7374 {
   7375 	mddb_set_t	*s;
   7376 	mddb_lb_t	*lbp;
   7377 	mddb_locator_t	*locators;
   7378 	uint_t		loccnt;
   7379 	mddb_mb_ic_t	*mbip;
   7380 	mddb_block_t	blk;
   7381 	int		err = 0;
   7382 	int		i, j;
   7383 	int		li;
   7384 	uint_t		commitcnt;
   7385 	set_t		setno = cp->c_setno;
   7386 	uint_t		set_status;
   7387 	md_dev64_t	dev;
   7388 	int		flags = MDDB_MUSTEXIST;
   7389 	mddb_ri_t	*rip;
   7390 
   7391 	cp->c_dbmax = MDDB_NLB;
   7392 
   7393 	/*
   7394 	 * Data checking
   7395 	 */
   7396 	if (setno >= md_nsets || cp->c_id < 0 ||
   7397 	    cp->c_id > cp->c_dbmax) {
   7398 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
   7399 	}
   7400 
   7401 	if (cp->c_flags & MDDB_C_STALE)
   7402 		flags |= MDDB_MN_STALE;
   7403 
   7404 	if ((s = mddb_setenter(setno, flags, &err)) == NULL)
   7405 		return (mddbstatus2error(ep, err, NODEV32, setno));
   7406 
   7407 	cp->c_flags = 0;
   7408 
   7409 	lbp = s->s_lbp;
   7410 	loccnt = lbp->lb_loccnt;
   7411 	locators = lbp->lb_locators;
   7412 
   7413 	/* shorthand */
   7414 	set_status = md_get_setstatus(setno);
   7415 
   7416 	if (set_status & MD_SET_STALE)
   7417 		cp->c_flags |= MDDB_C_STALE;
   7418 
   7419 	if (set_status & MD_SET_TOOFEW)
   7420 		cp->c_flags |= MDDB_C_TOOFEW;
   7421 
   7422 	cp->c_sideno = s->s_sideno;
   7423 
   7424 	cp->c_dbcnt = 0;
   7425 	/*
   7426 	 * go through and count active entries
   7427 	 */
   7428 	for (i = 0; i < loccnt;	 i++) {
   7429 		if (locators[i].l_flags & MDDB_F_DELETED)
   7430 			continue;
   7431 		cp->c_dbcnt++;
   7432 	}
   7433 
   7434 	/*
   7435 	 * add the ability to accept a locator block index
   7436 	 * which is not relative to previously deleted replicas.  This
   7437 	 * is for support of MD_DEBUG=STAT in metastat since it asks for
   7438 	 * replica information specifically for each of the mirror resync
   7439 	 * records.  MDDB_CONFIG_SUBCMD uses one of the pad spares in
   7440 	 * the mddb_config_t type.
   7441 	 */
   7442 	if (cp->c_subcmd == MDDB_CONFIG_ABS) {
   7443 		if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) {
   7444 			mddb_setexit(s);
   7445 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
   7446 			    setno));
   7447 		}
   7448 		li = cp->c_id;
   7449 	} else {
   7450 		if (cp->c_id >= cp->c_dbcnt) {
   7451 			mddb_setexit(s);
   7452 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
   7453 			    setno));
   7454 		}
   7455 
   7456 		/* CSTYLED */
   7457 		for (li = 0, j = 0; /* void */; li++) {
   7458 			if (locators[li].l_flags & MDDB_F_DELETED)
   7459 				continue;
   7460 			j++;
   7461 			if (j > cp->c_id)
   7462 				break;
   7463 		}
   7464 	}
   7465 
   7466 	if (command == MDDB_ENDDEV) {
   7467 		daddr_t ib = 0, jb;
   7468 
   7469 		blk = 0;
   7470 		if ((s != NULL) && s->s_mbiarray[li]) {
   7471 			mbip = s->s_mbiarray[li];
   7472 			while ((jb = getphysblk(blk++, mbip)) > 0) {
   7473 				if (jb > ib)
   7474 					ib = jb;
   7475 			}
   7476 			cp->c_dbend = (int)ib;
   7477 		} else {
   7478 			cp->c_dbend = 0;
   7479 		}
   7480 	}
   7481 
   7482 	locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp);
   7483 	mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname);
   7484 
   7485 	if (command != MDDB_DELDEV) {
   7486 		mddb_setexit(s);
   7487 		return (0);
   7488 	}
   7489 
   7490 	/* Currently don't allow addition/deletion of sides during upgrade */
   7491 	if (MD_UPGRADE) {
   7492 		cmn_err(CE_WARN,
   7493 		    "Deletion of replica not allowed during upgrade.\n");
   7494 		mddb_setexit(s);
   7495 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
   7496 	}
   7497 
   7498 	/*
   7499 	 * If here, replica delete in progress.
   7500 	 */
   7501 	single_thread_start(s);
   7502 
   7503 	if ((! (locators[li].l_flags & MDDB_F_EMASTER)) &&
   7504 	    (locators[li].l_flags & MDDB_F_ACTIVE)) {
   7505 		commitcnt = lbp->lb_commitcnt;
   7506 		lbp->lb_commitcnt = 0;
   7507 		setidentifier(s, &lbp->lb_ident);
   7508 		crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
   7509 		/*
   7510 		 * Don't need to write out device id area, since locator
   7511 		 * block on this replica is being deleted by setting the
   7512 		 * commitcnt to 0.
   7513 		 */
   7514 		(void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
   7515 		    MDDB_WR_ONLY_MASTER);
   7516 		lbp->lb_commitcnt = commitcnt;
   7517 	}
   7518 
   7519 	if (s->s_mbiarray[li]) {
   7520 		/* A freed mbi pointer still exists in the mddb_ri_t */
   7521 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
   7522 			if (rip->ri_mbip == s->s_mbiarray[li])
   7523 				rip->ri_mbip = NULL;
   7524 		}
   7525 		free_mbipp(&s->s_mbiarray[li]);
   7526 	}
   7527 
   7528 	if (! (locators[li].l_flags & MDDB_F_EMASTER)) {
   7529 		dev = md_expldev(locators[li].l_dev);
   7530 		dev = md_xlate_targ_2_mini(dev);
   7531 		if (dev != NODEV64)
   7532 			mddb_devclose(dev);
   7533 	}
   7534 
   7535 	s->s_mbiarray[li] = 0;
   7536 	lbp->lb_locators[li].l_flags = MDDB_F_DELETED;
   7537 
   7538 	/* Only support data tags for traditional and local sets */
   7539 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
   7540 	    (!(lbp->lb_flags & MDDB_MNSET)) &&
   7541 	    setno != MD_LOCAL_SET)
   7542 		if (set_dtag(s, ep))
   7543 			mdclrerror(ep);
   7544 
   7545 	/* Write data tags to all accessible devices */
   7546 	/* Only support data tags for traditional and local sets */
   7547 	if (!(lbp->lb_flags & MDDB_MNSET)) {
   7548 		(void) dt_write(s);
   7549 	}
   7550 
   7551 	/* Delete device id of deleted replica */
   7552 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   7553 		(void) mddb_devid_delete(s, li);
   7554 	}
   7555 	/* write new locator to all devices */
   7556 	err = writelocall(s);
   7557 
   7558 	(void) upd_med(s, "getdeldev(0)");
   7559 
   7560 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno,
   7561 	    md_expldev(locators[li].l_dev));
   7562 
   7563 	computefreeblks(s); /* recompute always it may be larger */
   7564 	cp->c_dbcnt--;
   7565 	err |= fixoptrecords(s);
   7566 	if (err) {
   7567 		if (writeretry(s)) {
   7568 			single_thread_end(s);
   7569 			mddb_setexit(s);
   7570 			return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno));
   7571 		}
   7572 	}
   7573 
   7574 	single_thread_end(s);
   7575 	mddb_setexit(s);
   7576 	return (0);
   7577 }
   7578 
   7579 static int
   7580 getdriver(
   7581 	mddb_cfg_loc_t	*clp
   7582 )
   7583 {
   7584 	major_t		majordev;
   7585 
   7586 	/*
   7587 	 * Data checking
   7588 	 */
   7589 	if (clp->l_dev <= 0)
   7590 		return (EINVAL);
   7591 
   7592 	majordev = getmajor(expldev(clp->l_dev));
   7593 
   7594 	if (ddi_major_to_name(majordev) == (char *)NULL)
   7595 		return (EINVAL);
   7596 
   7597 	if (MD_UPGRADE)
   7598 		(void) strcpy(clp->l_driver, md_targ_major_to_name(majordev));
   7599 	else
   7600 		(void) strcpy(clp->l_driver, ddi_major_to_name(majordev));
   7601 	return (0);
   7602 }
   7603 
   7604 /*
   7605  * update_valid_replica - updates the locator block namespace (prefix
   7606  * 	and/or suffix) with new pathname and devname.
   7607  *	RETURN
   7608  *		1	Error
   7609  *		0	Success
   7610  */
   7611 static int
   7612 update_valid_replica(
   7613 	side_t		side,
   7614 	mddb_locator_t	*lp,
   7615 	mddb_set_t	*s,
   7616 	int		li,
   7617 	char		*devname,
   7618 	char		*pathname,
   7619 	md_dev64_t	devt
   7620 )
   7621 {
   7622 	uchar_t		pre_len, suf_len;
   7623 	md_name_suffix	*sn;
   7624 	mddb_ln_t	*lnp;
   7625 	uchar_t		pre_index;
   7626 	uchar_t		i;
   7627 
   7628 	if (md_expldev(lp->l_dev) != devt) {
   7629 		return (0);
   7630 	}
   7631 
   7632 	if (pathname[strlen(pathname) - 1] == '/')
   7633 		pathname[strlen(pathname) - 1] = '\0';
   7634 
   7635 	pre_len = (uchar_t)strlen(pathname);
   7636 	suf_len = (uchar_t)strlen(devname);
   7637 
   7638 	if ((pre_len > MD_MAXPREFIX) || (suf_len > MD_MAXSUFFIX))
   7639 		return (1);
   7640 
   7641 	lnp = s->s_lnp;
   7642 
   7643 	/*
   7644 	 * Future note:  Need to do something here for the MN diskset case
   7645 	 * when device ids are supported in disksets.
   7646 	 * Can't add until merging devids_in_diskset code into code base
   7647 	 * Currently only called with side of 0.
   7648 	 */
   7649 
   7650 	sn = &lnp->ln_suffixes[side][li];
   7651 
   7652 	/*
   7653 	 * Check if prefix (Ex: /dev/dsk) needs to be changed.
   7654 	 * If new prefix is the same as the previous prefix - no change.
   7655 	 *
   7656 	 * If new prefix is not the same, check if new prefix
   7657 	 * matches an existing one.  If so, use that one.
   7658 	 *
   7659 	 * If new prefix doesn't exist, add a new prefix.  If not enough
   7660 	 * space, return failure.
   7661 	 */
   7662 	pre_index = sn->suf_prefix;
   7663 	/* Check if new prefix is the same as the old prefix. */
   7664 	if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) ||
   7665 	    (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname,
   7666 	    pre_len) != 0)) {
   7667 		/* Check if new prefix is an already known prefix. */
   7668 		for (i = 0; i < MDDB_PREFIXCNT; i++) {
   7669 			if (lnp->ln_prefixes[i].pre_len != pre_len) {
   7670 				continue;
   7671 			}
   7672 			if (bcmp(lnp->ln_prefixes[i].pre_data, pathname,
   7673 			    pre_len) == 0) {
   7674 				break;
   7675 			}
   7676 		}
   7677 		/* If no match found for new prefix - add the new prefix */
   7678 		if (i == MDDB_PREFIXCNT) {
   7679 			for (i = 0; i < MDDB_PREFIXCNT; i++) {
   7680 				if (lnp->ln_prefixes[i].pre_len == 0)
   7681 					break;
   7682 			}
   7683 			/* No space to add new prefix - return failure */
   7684 			if (i == MDDB_PREFIXCNT) {
   7685 				return (1);
   7686 			}
   7687 			bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len);
   7688 			lnp->ln_prefixes[i].pre_len = pre_len;
   7689 		}
   7690 		sn->suf_prefix = i;
   7691 	}
   7692 
   7693 	/* Now, update the suffix (Ex: c0t0d0s0) if needed */
   7694 	if ((sn->suf_len != suf_len) ||
   7695 	    (bcmp(sn->suf_data, devname, suf_len) != 0)) {
   7696 		bcopy(devname, sn->suf_data, suf_len);
   7697 		sn->suf_len = suf_len;
   7698 	}
   7699 	return (0);
   7700 }
   7701 
   7702 
   7703 /*
   7704  * md_update_locator_namespace - If in devid style and active and the devid's
   7705  *		exist and are valid update the locator namespace pathname
   7706  *		and devname.
   7707  *	RETURN
   7708  *		1	Error
   7709  *		0	Success
   7710  */
   7711 int
   7712 md_update_locator_namespace(
   7713 	set_t		setno,		/* which set to get name from */
   7714 	side_t		side,
   7715 	char		*dname,
   7716 	char		*pname,
   7717 	md_dev64_t	devt
   7718 )
   7719 {
   7720 	mddb_set_t	*s;
   7721 	mddb_lb_t	*lbp;
   7722 	int		li;
   7723 	uint_t		flg;
   7724 	int		err = 0;
   7725 	mddb_ln_t	*lnp;
   7726 
   7727 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
   7728 		return (1);
   7729 	single_thread_start(s);
   7730 	lbp = s->s_lbp;
   7731 	/* must be DEVID_STYLE */
   7732 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   7733 		for (li = 0; li < lbp->lb_loccnt; li++) {
   7734 			mddb_locator_t *lp = &lbp->lb_locators[li];
   7735 
   7736 			if (lp->l_flags & MDDB_F_DELETED) {
   7737 				continue;
   7738 			}
   7739 
   7740 			/* replica also must be active */
   7741 			if (lp->l_flags & MDDB_F_ACTIVE) {
   7742 				flg = s->s_did_icp->did_ic_blkp->
   7743 				    blk_info[li].info_flags;
   7744 				/* only update if did exists and is valid */
   7745 				if ((flg & MDDB_DID_EXISTS) &&
   7746 				    (flg & MDDB_DID_VALID)) {
   7747 					if (update_valid_replica(side, lp, s,
   7748 					    li, dname, pname, devt)) {
   7749 						err = 1;
   7750 						goto out;
   7751 					}
   7752 				}
   7753 			}
   7754 		}
   7755 	}
   7756 	lnp = s->s_lnp;
   7757 	uniqtime32(&lnp->ln_timestamp);
   7758 	if (lbp->lb_flags & MDDB_MNSET)
   7759 		lnp->ln_revision = MDDB_REV_MNLN;
   7760 	else
   7761 		lnp->ln_revision = MDDB_REV_LN;
   7762 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
   7763 	err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
   7764 	    lbp->lb_lnblkcnt, 0);
   7765 	/*
   7766 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
   7767 	 * flag in the mddb_set structure to show that the locator
   7768 	 * names have changed.
   7769 	 */
   7770 
   7771 	if ((lbp->lb_flags & MDDB_MNSET) &&
   7772 	    (md_set[s->s_setno].s_am_i_master)) {
   7773 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
   7774 	}
   7775 out:
   7776 	single_thread_end(s);
   7777 	mddb_setexit(s);
   7778 	if (err)
   7779 		return (1);
   7780 	return (0);
   7781 }
   7782 
   7783 /*
   7784  * update_locatorblock - for active entries in the locator block, check
   7785  *		the devt to see if it matches the given devt. If so, and
   7786  *		there is an associated device id which is not the same
   7787  *		as the passed in devid, delete old devid and add a new one.
   7788  *
   7789  *		During import of replicated disksets, old_didptr contains
   7790  *		the original disk's device id.  Use this device id in
   7791  *		addition to the devt to determine if an entry is a match
   7792  *		and should be updated with the new device id of the
   7793  *		replicated disk.  Specifically, this is the case being handled:
   7794  *
   7795  *		Original_disk	Replicated_disk	Disk_Available_During_Import
   7796  *		c1t1d0		c1t3d0		no - so old name c1t1d0 shown
   7797  *		c1t2d0		c1t1d0		yes - name is c1t1d0
   7798  *		c1t3d0		c1t2d0		yes - name is c1t2d0
   7799  *
   7800  *		Can't just match on devt since devt for the first and third
   7801  *		disks will be the same, but the original disk's device id
   7802  *		is known and can be used to distinguish which disk's
   7803  *		replicated device id should be updated.
   7804  *	RETURN
   7805  *		MDDB_E_NODEVID
   7806  *		MDDB_E_NOLOCBLK
   7807  *		1	Error
   7808  *		0	Success
   7809  */
   7810 static int
   7811 update_locatorblock(
   7812 	mddb_set_t	*s,
   7813 	md_dev64_t	dev,
   7814 	ddi_devid_t	didptr,
   7815 	ddi_devid_t	old_didptr
   7816 )
   7817 {
   7818 	mddb_lb_t	*lbp = NULL;
   7819 	mddb_locator_t	*lp;
   7820 	int		li;
   7821 	uint_t		flg;
   7822 	ddi_devid_t	devid_ptr;
   7823 	int		retval = 0;
   7824 	char		*minor_name;
   7825 	int		repl_import_flag;
   7826 
   7827 	/* Set replicated flag if this is a replicated import */
   7828 	repl_import_flag = md_get_setstatus(s->s_setno) &
   7829 	    MD_SET_REPLICATED_IMPORT;
   7830 
   7831 	lbp = s->s_lbp;
   7832 	/* find replicas that haven't been deleted */
   7833 	for (li = 0; li < lbp->lb_loccnt; li++) {
   7834 		lp = &lbp->lb_locators[li];
   7835 
   7836 		if ((lp->l_flags & MDDB_F_DELETED)) {
   7837 			continue;
   7838 		}
   7839 		/*
   7840 		 * check to see if locator devt matches given dev
   7841 		 * and if there is a device ID associated with it
   7842 		 */
   7843 		flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags;
   7844 		if ((md_expldev(lp->l_dev) == dev) &&
   7845 		    (flg & MDDB_DID_EXISTS)) {
   7846 			if (flg & MDDB_DID_VALID) {
   7847 				continue; /* cont to nxt active entry */
   7848 			}
   7849 			devid_ptr = s->s_did_icp->did_ic_devid[li];
   7850 			if (devid_ptr == NULL) {
   7851 				return (MDDB_E_NODEVID);
   7852 			}
   7853 
   7854 			/*
   7855 			 * During a replicated import the old_didptr
   7856 			 * must match the current devid before the
   7857 			 * devid can be updated.
   7858 			 */
   7859 			if (repl_import_flag) {
   7860 				if (ddi_devid_compare(devid_ptr,
   7861 				    old_didptr) != 0)
   7862 					continue;
   7863 			}
   7864 
   7865 			if (ddi_devid_compare(devid_ptr, didptr) != 0) {
   7866 				/*
   7867 				 * devid's not equal so
   7868 				 * delete and add
   7869 				 */
   7870 				if (ddi_lyr_get_minor_name(
   7871 				    md_dev64_to_dev(dev),
   7872 				    S_IFBLK, &minor_name) == DDI_SUCCESS) {
   7873 					(void) mddb_devid_delete(s, li);
   7874 					(void) mddb_devid_add(s, li, didptr,
   7875 					    minor_name);
   7876 					kmem_free(minor_name,
   7877 					    strlen(minor_name)+1);
   7878 					break;
   7879 				} else {
   7880 					retval = 1;
   7881 					goto err_out;
   7882 				}
   7883 			}
   7884 		}
   7885 	} /* end for */
   7886 	retval = push_lb(s);
   7887 	(void) upd_med(s, "update_locatorblock(0)");
   7888 err_out:
   7889 	return (retval);
   7890 }
   7891 
   7892 static int
   7893 update_mb_devid(
   7894 	mddb_set_t	*s,
   7895 	mddb_ri_t	*rip,
   7896 	ddi_devid_t	devidptr
   7897 )
   7898 {
   7899 	mddb_mb_ic_t	*mbip;
   7900 	mddb_mb_t	*mb = NULL;
   7901 	daddr_t		blkno;
   7902 	md_dev64_t	device;
   7903 	uint_t		sz;
   7904 	int		mb2free = 0;
   7905 	int		err = 0;
   7906 
   7907 
   7908 	/*
   7909 	 * There is case where a disk may not have mddb,
   7910 	 * and only has dummy mddb which contains
   7911 	 * a valid devid we like to update and in this
   7912 	 * case, the rip_lbp will be NULL but we still
   7913 	 * like to update the devid embedded in the
   7914 	 * dummy mb block.
   7915 	 *
   7916 	 */
   7917 	if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) {
   7918 		mbip = rip->ri_mbip;
   7919 		mb = &mbip->mbi_mddb_mb;
   7920 	} else {
   7921 		/*
   7922 		 * Done if it is non-replicated set
   7923 		 */
   7924 		if (devidptr != (ddi_devid_t)NULL) {
   7925 			mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE,
   7926 			    KM_SLEEP);
   7927 			mb->mb_magic = MDDB_MAGIC_DU;
   7928 			mb->mb_revision = MDDB_REV_MB;
   7929 			mb2free = 1;
   7930 		} else {
   7931 			goto out;
   7932 		}
   7933 	}
   7934 
   7935 	blkno = rip->ri_blkno;
   7936 	device = rip->ri_dev;
   7937 	/*
   7938 	 * Replace the mb_devid with the new/valid one
   7939 	 */
   7940 	if (devidptr != (ddi_devid_t)NULL) {
   7941 		/*
   7942 		 * Zero out what we have previously
   7943 		 */
   7944 		if (mb->mb_devid_len)
   7945 			bzero(mb->mb_devid, mb->mb_devid_len);
   7946 		sz = ddi_devid_sizeof(devidptr);
   7947 		bcopy((char *)devidptr, (char *)mb->mb_devid, sz);
   7948 		mb->mb_devid_len = sz;
   7949 	}
   7950 
   7951 	mb->mb_setno = s->s_setno;
   7952 	uniqtime32(&mb->mb_timestamp);
   7953 	crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
   7954 	/*
   7955 	 * putblks will
   7956 	 *
   7957 	 *	- drop the s_dbmx lock
   7958 	 *	- biowait
   7959 	 *	- regain the s_dbmx lock
   7960 	 *
   7961 	 * Need to update this if we wants to handle
   7962 	 * mb_next != NULL which it is unlikely will happen
   7963 	 */
   7964 	err = putblks(s, (caddr_t)mb, blkno, 1, device, 0);
   7965 
   7966 	if (mb2free) {
   7967 		kmem_free(mb, MDDB_BSIZE);
   7968 	}
   7969 out:
   7970 	return (err);
   7971 }
   7972 
   7973 static int
   7974 setdid(
   7975 	mddb_config_t		*cp
   7976 )
   7977 {
   7978 	ddi_devid_t		devidp;
   7979 	dev_t			ddi_dev;
   7980 	mddb_set_t		*s;
   7981 	int			err = 0;
   7982 	mddb_ri_t		*rip;
   7983 
   7984 	/*
   7985 	 * Data integrity check
   7986 	 */
   7987 	if (cp->c_setno >= md_nsets || cp->c_devt <= 0)
   7988 		return (EINVAL);
   7989 
   7990 	if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE))
   7991 		return (0);
   7992 
   7993 	ddi_dev = md_dev64_to_dev(cp->c_devt);
   7994 	if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) {
   7995 		return (-1);
   7996 	}
   7997 	if (devidp == NULL) {
   7998 		return (-1);
   7999 	}
   8000 
   8001 	if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, &err)) == NULL)
   8002 		return (-1);
   8003 	single_thread_start(s);
   8004 
   8005 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
   8006 		if (rip->ri_lbp == (mddb_lb_t *)NULL)
   8007 			continue;
   8008 		/*
   8009 		 * We only update what is asked
   8010 		 */
   8011 		if (rip->ri_dev == cp->c_devt) {
   8012 			if (update_mb_devid(s, rip, devidp) != 0) {
   8013 				err = -1;
   8014 				goto out;
   8015 			}
   8016 		}
   8017 	}
   8018 
   8019 	if (update_locatorblock(s, cp->c_devt, devidp, NULL)) {
   8020 		err = -1;
   8021 		goto out;
   8022 	}
   8023 
   8024 out:
   8025 	single_thread_end(s);
   8026 	mddb_setexit(s);
   8027 	ddi_devid_free(devidp);
   8028 	return (err);
   8029 }
   8030 
   8031 static int
   8032 delnewside(
   8033 	mddb_config_t		*cp,
   8034 	int			command,
   8035 	md_error_t		*ep
   8036 )
   8037 {
   8038 	mddb_set_t		*s;
   8039 	int			li;
   8040 	mddb_lb_t		*lbp;		/* pointer to locator block */
   8041 	mddb_ln_t		*lnp;		/* pointer to locator names */
   8042 	mddb_mnln_t		*mnlnp;		/* pointer to locator names */
   8043 	mddb_locator_t		*lp;
   8044 	mddb_sidelocator_t	*slp;
   8045 	mddb_cfg_loc_t		*clp;
   8046 	int			err = 0;
   8047 	set_t			setno = cp->c_setno;
   8048 	ddi_devid_t		devid;
   8049 	ddi_devid_t		ret_devid = NULL;
   8050 	char			*minor_name;
   8051 	uint_t			use_devid = 0;
   8052 	dev_t			ddi_dev;
   8053 	md_mnname_suffix_t	*mnsn;
   8054 	mddb_mnlb_t		*mnlbp;
   8055 	mddb_mnsidelocator_t	*mnslp;
   8056 
   8057 	/* Currently don't allow addition/deletion of sides during upgrade */
   8058 	if (MD_UPGRADE) {
   8059 		cmn_err(CE_WARN,
   8060 		    "Addition and deletion of sides not allowed"
   8061 		    " during upgrade. \n");
   8062 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
   8063 	}
   8064 
   8065 	/*
   8066 	 * Data integrity check
   8067 	 */
   8068 	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
   8069 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
   8070 
   8071 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
   8072 		return (mddbstatus2error(ep, err, NODEV32, setno));
   8073 
   8074 	single_thread_start(s);
   8075 	clp = &cp->c_locator;
   8076 
   8077 	lbp = s->s_lbp;
   8078 
   8079 	if (lbp->lb_setno != setno) {
   8080 		single_thread_end(s);
   8081 		mddb_setexit(s);
   8082 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
   8083 	}
   8084 
   8085 	/*
   8086 	 * Find this device/blkno pair
   8087 	 */
   8088 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
   8089 		ddi_dev = md_dev64_to_dev(clp->l_dev);
   8090 		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
   8091 		    (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name)
   8092 		    == DDI_SUCCESS)) {
   8093 			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
   8094 				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
   8095 				use_devid = 1;
   8096 				(void) strcpy(clp->l_minor_name, minor_name);
   8097 			}
   8098 			kmem_free(minor_name, strlen(minor_name)+1);
   8099 		}
   8100 		if (use_devid != 1 && ret_devid != NULL)
   8101 			ddi_devid_free(ret_devid);
   8102 	}
   8103 	for (li = 0; li < lbp->lb_loccnt; li++) {
   8104 		lp = &lbp->lb_locators[li];
   8105 		if (lp->l_flags & MDDB_F_DELETED)
   8106 			continue;
   8107 		if (use_devid) {
   8108 			if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0)
   8109 				continue;
   8110 			if ((ddi_devid_compare(devid,
   8111 			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
   8112 			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
   8113 			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
   8114 				break;
   8115 			}
   8116 		} else {
   8117 			if (lp->l_dev == clp->l_dev &&
   8118 			    (daddr_t)lp->l_blkno == clp->l_blkno) {
   8119 				break;
   8120 			}
   8121 		}
   8122 	}
   8123 
   8124 	if (li == lbp->lb_loccnt) {
   8125 		if (use_devid)
   8126 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
   8127 		single_thread_end(s);
   8128 		mddb_setexit(s);
   8129 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
   8130 	}
   8131 
   8132 	lnp = s->s_lnp;
   8133 	if (command == MDDB_NEWSIDE) {
   8134 		int 	index = 0;
   8135 		/*
   8136 		 * If a MN diskset, need to find the index where the new
   8137 		 * locator information is to be stored in the mnsidelocator
   8138 		 * field of the locator block so that the locator name can
   8139 		 * be stored at the same array index in the mnsuffixes
   8140 		 * field of the locator names structure.
   8141 		 */
   8142 		if (lbp->lb_flags & MDDB_MNSET) {
   8143 			if ((index = checklocator(lbp, li,
   8144 			    cp->c_sideno)) == -1) {
   8145 				if (use_devid) {
   8146 					ddi_devid_free((ddi_devid_t)
   8147 					    (uintptr_t)clp->l_devid);
   8148 				}
   8149 				single_thread_end(s);
   8150 				mddb_setexit(s);
   8151 				return (mdmddberror(ep, MDE_DB_TOOSMALL,
   8152 				    NODEV32, setno));
   8153 			}
   8154 		}
   8155 
   8156 		/*
   8157 		 * Store the locator name before the sidelocator information
   8158 		 * in case a panic occurs between these 2 steps.  Must have
   8159 		 * the locator name information in order to print reasonable
   8160 		 * error information.
   8161 		 */
   8162 		if (splitname2locatorblock(&cp->c_devname, lnp, li,
   8163 		    cp->c_sideno, index)) {
   8164 			if (use_devid)
   8165 				ddi_devid_free(
   8166 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
   8167 			single_thread_end(s);
   8168 			mddb_setexit(s);
   8169 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
   8170 			    setno));
   8171 		}
   8172 
   8173 		if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) {
   8174 			if (use_devid)
   8175 				ddi_devid_free(
   8176 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
   8177 			single_thread_end(s);
   8178 			mddb_setexit(s);
   8179 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
   8180 			    setno));
   8181 		}
   8182 	}
   8183 
   8184 	if (use_devid)
   8185 		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
   8186 
   8187 	if (command == MDDB_DELSIDE) {
   8188 		int i;
   8189 		for (i = 0; i < lbp->lb_loccnt; i++) {
   8190 			if (lbp->lb_flags & MDDB_MNSET) {
   8191 				int	j;
   8192 				mnlbp = (mddb_mnlb_t *)lbp;
   8193 				for (j = 0; j < MD_MNMAXSIDES; j++) {
   8194 					mnslp = &mnlbp->lb_mnsidelocators[j][i];
   8195 					if (mnslp->mnl_sideno == cp->c_sideno)
   8196 						break;
   8197 				}
   8198 				if (j < MD_MNMAXSIDES) {
   8199 					mnslp->mnl_mnum = NODEV32;
   8200 					mnslp->mnl_sideno = 0;
   8201 					mnlnp = (mddb_mnln_t *)lnp;
   8202 					mnsn = &(mnlnp->ln_mnsuffixes[j][i]);
   8203 					bzero((caddr_t)mnsn,
   8204 					    sizeof (md_mnname_suffix_t));
   8205 				}
   8206 			} else {
   8207 				slp = &lbp->lb_sidelocators[cp->c_sideno][i];
   8208 				bzero((caddr_t)&lnp->ln_suffixes
   8209 				    [cp->c_sideno][i], sizeof (md_name_suffix));
   8210 				slp->l_mnum = NODEV32;
   8211 			}
   8212 		}
   8213 	}
   8214 
   8215 	/* write new locator names to all devices */
   8216 	uniqtime32(&lnp->ln_timestamp);
   8217 	if (lbp->lb_flags & MDDB_MNSET)
   8218 		lnp->ln_revision = MDDB_REV_MNLN;
   8219 	else
   8220 		lnp->ln_revision = MDDB_REV_LN;
   8221 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
   8222 	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
   8223 	    lbp->lb_lnblkcnt, 0);
   8224 	/*
   8225 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
   8226 	 * flag in the mddb_set structure to show that the locator
   8227 	 * names have changed.
   8228 	 */
   8229 
   8230 	if ((lbp->lb_flags & MDDB_MNSET) &&
   8231 	    (md_set[s->s_setno].s_am_i_master)) {
   8232 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
   8233 	}
   8234 	if (err) {
   8235 		if (writeretry(s)) {
   8236 			single_thread_end(s);
   8237 			mddb_setexit(s);
   8238 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
   8239 		}
   8240 	}
   8241 
   8242 	uniqtime32(&lbp->lb_timestamp);
   8243 	/* write new locator to all devices */
   8244 	err = writelocall(s);
   8245 
   8246 	(void) upd_med(s, "delnewside(0)");
   8247 
   8248 	computefreeblks(s); /* recompute always it may be larger */
   8249 	if (err) {
   8250 		if (writeretry(s)) {
   8251 			single_thread_end(s);
   8252 			mddb_setexit(s);
   8253 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
   8254 		}
   8255 	}
   8256 
   8257 	single_thread_end(s);
   8258 	mddb_setexit(s);
   8259 
   8260 	return (0);
   8261 }
   8262 
   8263 static int
   8264 newdev(
   8265 	mddb_config_t	*cp,
   8266 	int		command,
   8267 	md_error_t	*ep
   8268 )
   8269 {
   8270 	mddb_set_t	*s;
   8271 	mddb_mb_ic_t	*mbip, *mbip1;
   8272 	int		i, j;
   8273 	int		li;
   8274 	mddb_lb_t	*lbp;		/* pointer to locator block */
   8275 	mddb_ln_t	*lnp;		/* pointer to locator names */
   8276 	mddb_locator_t	*lp;
   8277 	mddb_cfg_loc_t	*clp;
   8278 	int		err