Home | History | Annotate | Download | only in common
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Database location balancing code.
     28  */
     29 
     30 #include <meta.h>
     31 #include <sys/lvm/md_mddb.h>
     32 #include <sdssc.h>
     33 
     34 #define	MD_MINBALREP	2
     35 
     36 /*
     37  * Stuff for DB balancing.
     38  */
     39 enum md_ctlr_ops_t {
     40 	DRV_NOP = 0,
     41 	DRV_ADD = 1,
     42 	DRV_DEL = 2
     43 };
     44 typedef enum md_ctlr_ops_t md_ctlr_ops_t;
     45 
     46 /* drive flag fields */
     47 #define	DRV_F_ERROR	0x1
     48 #define	DRV_F_INDISKSET	0x2
     49 
     50 struct md_ctlr_drv_t {
     51 	md_ctlr_ops_t drv_op;
     52 	int drv_flags;
     53 	int drv_dbcnt;
     54 	int drv_new_dbcnt;
     55 	daddr_t drv_dbsize;
     56 	mddrivename_t *drv_dnp;
     57 	struct md_ctlr_drv_t *drv_next;
     58 };
     59 typedef struct md_ctlr_drv_t md_ctlr_drv_t;
     60 
     61 struct md_ctlr_ctl_t {
     62 	mdcinfo_t *ctl_cinfop;
     63 	int ctl_dbcnt;
     64 	int ctl_drcnt;
     65 	md_ctlr_drv_t *ctl_drvs;
     66 	struct md_ctlr_ctl_t *ctl_next;
     67 };
     68 typedef struct md_ctlr_ctl_t md_ctlr_ctl_t;
     69 
     70 static int
     71 add_replica(
     72 	mdsetname_t		*sp,
     73 	mddrivename_t		*dnp,
     74 	int			dbcnt,
     75 	daddr_t			dbsize,
     76 	md_error_t		*ep
     77 )
     78 {
     79 	mdnamelist_t		*nlp = NULL;
     80 	mdname_t		*np;
     81 	md_set_desc		*sd;
     82 	uint_t			rep_slice;
     83 
     84 	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
     85 		return (-1);
     86 
     87 	if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
     88 		return (-1);
     89 
     90 	(void) metanamelist_append(&nlp, np);
     91 
     92 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
     93 		metafreenamelist(nlp);
     94 		return (-1);
     95 	}
     96 
     97 	if (meta_db_attach(sp, nlp, (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
     98 	    (&sd->sd_ctime), dbcnt, dbsize, NULL, ep) == -1) {
     99 		metafreenamelist(nlp);
    100 		return (-1);
    101 	}
    102 
    103 	metafreenamelist(nlp);
    104 	return (0);
    105 }
    106 
    107 static int
    108 del_replica(
    109 	mdsetname_t		*sp,
    110 	mddrivename_t		*dnp,
    111 	md_error_t		*ep
    112 )
    113 {
    114 	mdnamelist_t		*nlp = NULL;
    115 	mdname_t		*np;
    116 	uint_t			rep_slice;
    117 
    118 	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
    119 		return (-1);
    120 
    121 	if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
    122 		return (-1);
    123 
    124 	(void) metanamelist_append(&nlp, np);
    125 
    126 	if (meta_db_detach(sp, nlp, (MDFORCE_DS | MDFORCE_SET_LOCKED),
    127 	    NULL, ep) == -1) {
    128 		metafreenamelist(nlp);
    129 		return (-1);
    130 	}
    131 
    132 	metafreenamelist(nlp);
    133 	return (0);
    134 }
    135 
    136 static int
    137 rep_has_err(md_replicalist_t *rlp, mdname_t *np)
    138 {
    139 	md_replicalist_t	*rl;
    140 
    141 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
    142 		md_replica_t	*r = rl->rl_repp;
    143 
    144 		if (strcmp(r->r_namep->cname, np->cname) != 0)
    145 			continue;
    146 
    147 		if (r->r_flags & (MDDB_F_EREAD | MDDB_F_EFMT | MDDB_F_EDATA |
    148 		    MDDB_F_EMASTER | MDDB_F_EWRITE))
    149 			return (1);
    150 
    151 	}
    152 	return (0);
    153 }
    154 
/*
 * Record drive dnp in the controller list headed by *clpp.
 *
 *	clpp		list head (must not be NULL); a controller entry is
 *			appended when no existing entry matches
 *	rlp		current replica list, consulted for replica errors
 *	dnp		drive to record
 *	dbcnt		number of replicas to account to the drive
 *	dbsize		replica size to remember for the drive
 *	cinfop		controller info; when NULL it is looked up from the
 *			drive's replica slice and the drive is also checked
 *			for errors
 *	indiskset	non-zero when the drive is already part of the set
 *	with_bus	non-zero to treat differing bus values as distinct
 *			controllers
 *	errored		non-zero when the caller already knows of an error
 *	ep		error return
 *
 * A drive seen for the first time is entered as DRV_NOP (in the set) or
 * DRV_ADD (not in the set).  A drive that is already in the list is
 * switched to DRV_DEL and its counts are removed from the controller.
 *
 * Returns 0 on success, or when the drive is skipped because its replica
 * slice could not be determined, and -1 on failure.
 */
static int
add_drv_to_ctl_lst(
	md_ctlr_ctl_t		**clpp,
	md_replicalist_t	*rlp,
	mddrivename_t		*dnp,
	int			dbcnt,
	daddr_t			dbsize,
	mdcinfo_t		*cinfop,
	int			indiskset,
	int			with_bus,
	int			errored,
	md_error_t		*ep
)
{
	md_ctlr_drv_t		**dpp;
	mdname_t		*np;
	mdcinfo_t		*tcinfop;
	char			*cmp_name_1, *cmp_name_2;
	int			not_found;

	/*
	 * The user must pass in a list head.
	 */
	assert(clpp != NULL);

	if (cinfop == NULL) {
		uint_t	rep_slice;

		if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
			/*
			 * A failure to get the slice information can occur
			 * because the drive has failed, if this is the
			 * case then there is nothing that can be done
			 * with this drive, so do not include it in the
			 * list of drives. Clear the error and return.
			 */
			mdclrerror(ep);
			return (0);
		}

		if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
			return (-1);

		if ((tcinfop = metagetcinfo(np, ep)) == NULL)
			return (-1);

		/*
		 * An unreadable vtoc or a replica error only marks the
		 * drive as errored; ep is cleared further down on the
		 * paths that tolerate an errored drive.
		 */
		if (metagetvtoc(np, FALSE, NULL, ep) == NULL)
			errored = 1;

		if (rep_has_err(rlp, np))
			errored = 1;
	} else
		tcinfop = cinfop;

	for (/* void */; *clpp != NULL; clpp = &(*clpp)->ctl_next) {
		/*
		 * Try to locate ctlr.
		 */
		(void) sdssc_convert_cluster_path(tcinfop->cname, &cmp_name_1);
		(void) sdssc_convert_cluster_path((*clpp)->ctl_cinfop->cname,
		    &cmp_name_2);

		/*
		 * Same controller means same type, number and (first 16
		 * characters of the) converted name, plus the same bus
		 * when with_bus is set.
		 */
		if (tcinfop->ctype != (*clpp)->ctl_cinfop->ctype ||
		    tcinfop->cnum != (*clpp)->ctl_cinfop->cnum ||
		    strncmp(cmp_name_1, cmp_name_2, 16) != 0 ||
		    (with_bus && tcinfop->bus != (*clpp)->ctl_cinfop->bus)) {
			not_found = 1;
		} else
			not_found = 0;


		sdssc_convert_path_free(cmp_name_1);
		sdssc_convert_path_free(cmp_name_2);

		if (not_found)
			continue;

		/*
		 * Found ctlr, try to locate the drive.
		 */
		for (dpp = &(*clpp)->ctl_drvs; *dpp != NULL;
		    dpp = &(*dpp)->drv_next) {
			(void) sdssc_convert_cluster_path(
			    (*dpp)->drv_dnp->cname, &cmp_name_1);
			(void) sdssc_convert_cluster_path(dnp->cname,
			    &cmp_name_2);

			not_found = strcmp(cmp_name_1, cmp_name_2);

			sdssc_convert_path_free(cmp_name_1);
			sdssc_convert_path_free(cmp_name_2);

			if (not_found)
				continue;

			/*
			 * Found drive, must be deleting.
			 */
			(*dpp)->drv_op = DRV_DEL;
			if (indiskset)
				(*dpp)->drv_flags |= DRV_F_INDISKSET;
			if (errored) {
				mdclrerror(ep);
				(*dpp)->drv_flags |= DRV_F_ERROR;
			}
			/*
			 * A drive being deleted no longer counts towards
			 * the controller totals.
			 */
			(*clpp)->ctl_dbcnt -= (*dpp)->drv_dbcnt;
			(*clpp)->ctl_drcnt--;
			return (0);
		}
		/*
		 * The ctlr was found, but not the drive, so add
		 * the drive.  dpp points at the terminating link of the
		 * drive list here, so the new entry is appended.
		 */
		(*dpp) = Zalloc(sizeof (**dpp));


		if (indiskset) {
			(*dpp)->drv_op = DRV_NOP;
			(*dpp)->drv_flags |= DRV_F_INDISKSET;
			if (errored) {
				mdclrerror(ep);
				(*dpp)->drv_flags |= DRV_F_ERROR;
			}
		} else {
			(*dpp)->drv_op = DRV_ADD;
			if (errored) {
				/*
				 * An errored drive cannot be added.  The
				 * new entry stays linked into the list
				 * with only drv_op and drv_flags set here.
				 */
				(*dpp)->drv_flags |= DRV_F_ERROR;
				return (-1);
			}
			assert(dbsize != 0);
		}
		(*dpp)->drv_dbcnt = dbcnt;
		(*dpp)->drv_dbsize = dbsize;
		(*dpp)->drv_dnp = dnp;
		(*clpp)->ctl_dbcnt += dbcnt;
		(*clpp)->ctl_drcnt++;
		return (0);
	}
	/*
	 * No ctlr was located, so add the ctlr, then recurse to add the
	 * drive to the ctlr.  clpp now points at the new entry and tcinfop
	 * is passed down, so the recursive call matches it immediately and
	 * does not repeat the lookups above.
	 */
	(*clpp) = Zalloc(sizeof (**clpp));

	(*clpp)->ctl_cinfop = tcinfop;

	return (add_drv_to_ctl_lst(clpp, rlp, dnp, dbcnt, dbsize, tcinfop,
	    indiskset, with_bus, errored, ep));
}
    304 
/*
 * Add one replica to some drive on controller c, keeping the replicas
 * spread evenly across the controller's drives.
 *
 * A drive only receives a replica while its count is below the current
 * level (maxdb); when no drive qualifies the level is raised, up to
 * minimum_replicas.  Because replicas are attached per drive, a drive
 * that already holds replicas has them all deleted and then n+1
 * re-added.
 *
 * Returns 1 when a replica was added, 0 when no drive could take one
 * (or the controller has no usable drives), and -1 on error.
 */
static int
add_replica_to_ctl(
	mdsetname_t		*sp,
	md_ctlr_ctl_t		*c,
	int			minimum_replicas,
	md_error_t		*ep
)
{
	md_ctlr_drv_t		*d;
	int			maxdb = 0;

	/*
	 * If this ctrl has no "usable" drives, assert() or just return if
	 * assert()'s are turned off.
	 */
	if (c->ctl_drcnt == 0) {
		assert(0);
		return (0);
	}

	/*
	 * Determine the largest DB count on a drive.
	 */
	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
		if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL)
			maxdb = d->drv_dbcnt;

	/*
	 * Make sure we start at a reasonable number
	 */
	if (maxdb == 0)
		maxdb = 1;

	/*
	 * Add a replica to a drive on this ctrl.
	 */
	/*CONSTCOND*/
	while (1) {
		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
			/*
			 * If this drive is being deleted, skip it.
			 */
			if (d->drv_op == DRV_DEL)
				continue;

			/*
			 * Errored drives are not given more replicas.
			 */
			if (d->drv_flags & DRV_F_ERROR)
				continue;
			/*
			 * Make sure that the replicas are distributed across
			 * the drives.
			 */
			if (d->drv_dbcnt >= maxdb)
				continue;
			/*
			 * See if the drive already has replicas,
			 * if it does, then delete the existing
			 * replica(s) and re-add n+1 replicas to the drive.
			 */
			/* ==== Vulnerability - no DB's start ==== */
			if (d->drv_dbcnt > 0) {
				if (del_replica(sp, d->drv_dnp, ep) == -1) {
					/*
					 * A failure is fatal only for a
					 * drive that is not yet in the
					 * diskset; otherwise the drive is
					 * marked errored and passed over.
					 */
					d->drv_flags |= DRV_F_ERROR;
					if (! (d->drv_flags & DRV_F_INDISKSET))
						return (-1);
					mdclrerror(ep);
					continue;
				}
			}
			if (add_replica(sp, d->drv_dnp, (d->drv_dbcnt + 1),
			    d->drv_dbsize, ep) == -1) {
				/*
				 * Separate error struct so that the recovery
				 * attempt does not overwrite the original
				 * failure held in ep.
				 */
				md_error_t nep = mdnullerror;

				if (d->drv_dbcnt) {
					/*
					 * We have to bring the replica
					 * in the drive to the previous
					 * status by adding the original no
					 * of replicas to the drive since
					 * the addition of (drv_dbcnt+1) no
					 * of replicas has failed. If we
					 * leave it at this state, we might
					 * end up having no replicas at
					 * all for the diskset.
					 */
					if (add_replica(sp, d->drv_dnp,
					    d->drv_dbcnt, d->drv_dbsize,
					    &nep) == -1) {
						/*
						 * Recovery failed too: the
						 * drive now holds nothing.
						 */
						c->ctl_dbcnt -= d->drv_dbcnt;
						d->drv_dbcnt = 0;
						mdclrerror(&nep);
					}
				}

				if (mdismddberror(ep, MDE_TOOMANY_REPLICAS))
					return (-1);

				/*
				 * NOTE(review): ep is not cleared before
				 * moving on to the next drive here, unlike
				 * the other continue paths - confirm that
				 * this is intended.
				 */
				if (mdismddberror(ep, MDE_REPLICA_TOOSMALL))
					continue;

				d->drv_flags |= DRV_F_ERROR;
				if (! (d->drv_flags & DRV_F_INDISKSET))
					return (-1);
				mdclrerror(ep);
				continue;
			}

			d->drv_dbcnt++;
			c->ctl_dbcnt++;
			/* ==== Vulnerability - no DB's end ==== */
			return (1);
		}
		/*
		 * No drive was below the current level; raise the level
		 * and retry, but never beyond minimum_replicas per drive.
		 */
		maxdb++;
		if (maxdb > minimum_replicas)
			return (0);
	}
	/*NOTREACHED*/
}
    422 
/*
 * Remove one replica from some drive on controller c, taking it from a
 * drive that holds the most replicas so that the remainder stays spread
 * evenly.
 *
 * Because replicas are detached per drive, all replicas on the chosen
 * drive are deleted and then n-1 are re-added.
 *
 * Returns 1 when a replica was removed, 0 when there was nothing that
 * could be removed (or the controller has no usable drives), and -1 on
 * error.
 */
static int
del_replica_from_ctl(
	mdsetname_t		*sp,
	md_ctlr_ctl_t		*c,
	md_error_t		*ep
)
{
	md_ctlr_drv_t		*d;
	int			maxdb = 0;

	/*
	 * If this ctrl has no "usable" drives, assert() or just return if
	 * assert()'s are turned off.
	 */
	if (c->ctl_drcnt == 0) {
		assert(0);
		return (0);
	}

	/*
	 * Determine the largest DB count on a drive.
	 */
	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
		if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL)
			maxdb = d->drv_dbcnt;

	/*
	 * No drive that stays in the set holds a replica.
	 */
	if (maxdb == 0)
		return (0);

	/*
	 * Delete a replica from a drive on this ctrl.
	 */
	/*CONSTCOND*/
	while (1) {
		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
			/*
			 * If this drive is being deleted, skip it.
			 */
			if (d->drv_op == DRV_DEL)
				continue;

			/*
			 * Make sure that there are replicas on this drive to
			 * delete.
			 */
			if (d->drv_dbcnt == 0)
				continue;

			/*
			 * Errored drives are left alone here.
			 */
			if (d->drv_flags & DRV_F_ERROR)
				continue;

			/*
			 * We need to keep the DB's distributed across the
			 * drives.
			 */
			if (d->drv_dbcnt < maxdb)
				continue;

			/*
			 * Delete all the replicas on the drive.
			 */
			/* ==== Vulnerability - no DB's start ==== */
			if (del_replica(sp, d->drv_dnp, ep) == -1) {
				/*
				 * A failure is fatal only for a drive that
				 * is not yet in the diskset; otherwise the
				 * drive is marked errored and passed over.
				 */
				d->drv_flags |= DRV_F_ERROR;
				if (! (d->drv_flags & DRV_F_INDISKSET))
					return (-1);
				mdclrerror(ep);
				continue;
			}
			d->drv_dbcnt--;
			c->ctl_dbcnt--;
			/*
			 * If there is still a dbcnt for this drive, then add
			 * back the needed DB's.
			 */
			if (d->drv_dbcnt > 0) {
				if (add_replica(sp, d->drv_dnp, d->drv_dbcnt,
				    d->drv_dbsize, ep) == -1) {
					/*
					 * The re-add failed, so the drive
					 * now holds no replicas at all.
					 */
					c->ctl_dbcnt -= d->drv_dbcnt;
					d->drv_dbcnt = 0;

					if (mdismddberror(ep,
					    MDE_TOOMANY_REPLICAS))
						return (-1);

					d->drv_flags |= DRV_F_ERROR;
					if (! (d->drv_flags & DRV_F_INDISKSET))
						return (-1);
					mdclrerror(ep);
					continue;
				}
			}
			/* ==== Vulnerability - no DB's end ==== */
			return (1);
		}
		/*
		 * No drive sat at the current level; lower it and retry.
		 */
		maxdb--;
		if (maxdb <= 0)
			return (0);
	}
	/*NOTREACHED*/
}
    524 
    525 static int
    526 del_replicas(mdsetname_t *sp, md_ctlr_ctl_t *clp, md_error_t *ep)
    527 {
    528 	md_ctlr_ctl_t		*c;
    529 	md_ctlr_drv_t		*d;
    530 	mdnamelist_t		*nlp;
    531 	mdname_t		*np;
    532 
    533 	for (c = clp; c != NULL; c = c->ctl_next) {
    534 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
    535 			uint_t	rep_slice;
    536 
    537 			if (! (d->drv_flags & DRV_F_ERROR) &&
    538 			    (d->drv_op != DRV_DEL))
    539 				continue;
    540 
    541 			if (d->drv_dbcnt == 0)
    542 				continue;
    543 
    544 			if (meta_replicaslice(d->drv_dnp,
    545 			    &rep_slice, ep) != 0)
    546 				return (-1);
    547 
    548 			np = metaslicename(d->drv_dnp, rep_slice, ep);
    549 			if (np == NULL)
    550 				return (-1);
    551 
    552 			nlp = NULL;
    553 			(void) metanamelist_append(&nlp, np);
    554 
    555 			/*
    556 			 * Delete the replicas listed.
    557 			 */
    558 			if (meta_db_detach(sp, nlp,
    559 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
    560 			    ep) == -1) {
    561 				metafreenamelist(nlp);
    562 				if (d->drv_flags & DRV_F_INDISKSET) {
    563 					mdclrerror(ep);
    564 					continue;
    565 				}
    566 				return (-1);
    567 			}
    568 			metafreenamelist(nlp);
    569 		}
    570 	}
    571 
    572 	return (0);
    573 }
    574 
    575 static void
    576 free_ctlr_lst(md_ctlr_ctl_t **clpp)
    577 {
    578 	md_ctlr_ctl_t		*c, *tc = NULL;
    579 	md_ctlr_drv_t		*d, *td = NULL;
    580 
    581 	for (c = *clpp; c != NULL; c = tc) {
    582 		tc = c->ctl_next;
    583 		for (d = c->ctl_drvs; d != NULL; d = td) {
    584 			td = d->drv_next;
    585 			Free(d);
    586 		}
    587 		Free(c);
    588 	}
    589 	*clpp = NULL;
    590 }
    591 
    592 static int
    593 build_ctlr_lst(
    594 	mdsetname_t		*sp,
    595 	md_ctlr_ctl_t		**clpp,
    596 	md_drive_desc		*opdd,
    597 	md_drive_desc		*curdd,
    598 	int			with_bus,
    599 	daddr_t			dbsize,
    600 	md_error_t		*ep
    601 )
    602 {
    603 	md_drive_desc			*d;
    604 	md_set_desc			*sd;
    605 	daddr_t				nblks;
    606 	md_replicalist_t		*rlp = NULL;
    607 	static	daddr_t			min_dbsize = 0;
    608 
    609 	if (min_dbsize == 0) {
    610 		if ((nblks = meta_db_minreplica(sp, ep)) < 0) {
    611 			min_dbsize = MD_DBSIZE;
    612 
    613 			if (! metaislocalset(sp)) {
    614 				if ((sd = metaget_setdesc(sp, ep)) == NULL)
    615 					return (-1);
    616 
    617 				if (MD_MNSET_DESC(sd))
    618 					min_dbsize = MD_MN_DBSIZE;
    619 			}
    620 			mdclrerror(ep);
    621 		} else
    622 			min_dbsize = nblks;
    623 	}
    624 
    625 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
    626 		if (! mdismddberror(ep, MDE_DB_NODB) &&
    627 		    ! mdismddberror(ep, MDE_DB_NOTOWNER))
    628 			return (-1);
    629 		mdclrerror(ep);
    630 	}
    631 
    632 	/*
    633 	 * Add drives currently in the set to the ctlr list.
    634 	 */
    635 	for (d = curdd; d != NULL; d = d->dd_next) {
    636 		daddr_t	this_dbsize = d->dd_dbsize;
    637 
    638 		if (this_dbsize == 0)
    639 			this_dbsize = min_dbsize;
    640 
    641 		if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, d->dd_dbcnt,
    642 		    this_dbsize, NULL, TRUE, with_bus, 0, ep) == -1)
    643 			return (-1);
    644 	}
    645 
    646 	/*
    647 	 * Add the drives that are being operated on to the ctlr list.
    648 	 */
    649 	for (d = opdd; d != NULL; d = d->dd_next)
    650 		if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, 0, dbsize, NULL,
    651 		    FALSE, with_bus, 0, ep) == -1)
    652 			return (-1);
    653 
    654 	metafreereplicalist(rlp);
    655 	return (0);
    656 }
    657 
    658 static int
    659 count_replica_on_ctl(
    660 	md_ctlr_ctl_t		*c,
    661 	int			adding,
    662 	int			*db_cnt,
    663 	int			minimum_replicas
    664 )
    665 {
    666 	md_ctlr_drv_t		*d;
    667 	int			maxdb = 0;
    668 
    669 	/*
    670 	 * If this ctrl has no "usable" drives, nothing to do.
    671 	 */
    672 	if (c->ctl_drcnt == 0)
    673 		return (0);
    674 
    675 	/*
    676 	 * Determine the largest DB count on a drive.
    677 	 */
    678 	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
    679 		if (d->drv_new_dbcnt > maxdb && d->drv_op != DRV_DEL)
    680 			maxdb = d->drv_new_dbcnt;
    681 
    682 	/*
    683 	 * Make sure we start at a reasonable number
    684 	 */
    685 	if (maxdb == 0) {
    686 		if (!adding)
    687 			return (0);
    688 		maxdb = 1;
    689 	}
    690 
    691 	/*
    692 	 * Count or Un-Count replicas that would be
    693 	 * added or deleted respectively.
    694 	 */
    695 	/*CONSTCOND*/
    696 	while (1) {
    697 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
    698 			/*
    699 			 * If this drive is being deleted, skip it.
    700 			 */
    701 			if (d->drv_op == DRV_DEL)
    702 				continue;
    703 
    704 			/*
    705 			 * If the drive is errored and adding, skip it.
    706 			 */
    707 			if (adding && (d->drv_flags & DRV_F_ERROR))
    708 				continue;
    709 
    710 			/*
    711 			 * Make sure that the replicas are distributed across
    712 			 * the drives.
    713 			 */
    714 			if (adding) {
    715 				if (d->drv_new_dbcnt >= maxdb)
    716 					continue;
    717 			} else {
    718 				if (d->drv_new_dbcnt == 0)
    719 					continue;
    720 				if (d->drv_new_dbcnt < maxdb)
    721 					continue;
    722 			}
    723 
    724 			/*
    725 			 * Count or Un-Count replicas here.
    726 			 */
    727 			if (adding) {
    728 				mdpart_t	*partp;
    729 				uint_t		rep_slice;
    730 				md_error_t	mde = mdnullerror;
    731 
    732 				if (meta_replicaslice(d->drv_dnp,
    733 				    &rep_slice, &mde) != 0) {
    734 					mdclrerror(&mde);
    735 					continue;
    736 				}
    737 
    738 				partp = &d->drv_dnp->vtoc.parts[rep_slice];
    739 				if (! partp)
    740 					continue;
    741 
    742 				if (((d->drv_new_dbcnt + 1) * d->drv_dbsize) >
    743 				    (partp->size - 16))
    744 					continue;
    745 				(*db_cnt)++;
    746 				d->drv_new_dbcnt++;
    747 			} else {
    748 				(*db_cnt)--;
    749 				d->drv_new_dbcnt--;
    750 			}
    751 			return (0);
    752 		}
    753 
    754 		/*
    755 		 * This should make sure they get spread
    756 		 * around.  This is to emulate the {add,del}_replica
    757 		 * routines.
    758 		 */
    759 		if (adding) {
    760 			maxdb++;
    761 			if (maxdb > minimum_replicas)
    762 				return (-1);
    763 		} else {
    764 			maxdb--;
    765 			if (maxdb <= 0)
    766 				return (-1);
    767 		}
    768 	}
    769 	/*NOTREACHED*/
    770 }
    771 
    772 static int
    773 count_replicas(
    774 	md_ctlr_ctl_t		*clp,
    775 	int			min_reps
    776 )
    777 {
    778 	md_ctlr_ctl_t		*c;
    779 	md_ctlr_drv_t		*d;
    780 	int			db_cnt;
    781 	int			uctlrs = 0;
    782 	int			total_cnt = 0;
    783 
    784 	/*
    785 	 * Count the number of controllers,
    786 	 * counting the replicas is slightly different based
    787 	 * on the controller count.
    788 	 */
    789 	for (c = clp; c != NULL; c = c->ctl_next)
    790 		if (c->ctl_drcnt > 0) {
    791 			uctlrs++;
    792 			for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
    793 				d->drv_new_dbcnt = d->drv_dbcnt;
    794 		}
    795 
    796 	if (uctlrs > 2) {
    797 		for (c = clp; c != NULL; c = c->ctl_next) {
    798 			if (c->ctl_drcnt == 0)
    799 				continue;
    800 
    801 			db_cnt = c->ctl_dbcnt;
    802 			/*
    803 			 * Count the replicas that would be added.
    804 			 */
    805 			while (db_cnt < min_reps)
    806 				if (count_replica_on_ctl(c, TRUE,
    807 				    &db_cnt, min_reps))
    808 					return (-1);
    809 
    810 			/*
    811 			 * Un-Count the replicas that would be deleted.
    812 			 */
    813 			while (db_cnt > min_reps)
    814 				if (count_replica_on_ctl(c, FALSE,
    815 				    &db_cnt, min_reps))
    816 					return (-1);
    817 			total_cnt += db_cnt;
    818 		}
    819 	} else {
    820 		for (c = clp; c != NULL; c = c->ctl_next) {
    821 			if (c->ctl_drcnt == 0)
    822 				continue;
    823 
    824 			db_cnt = c->ctl_dbcnt;
    825 			/*
    826 			 * Count the replicas that woud be added.
    827 			 */
    828 			while (db_cnt < (min_reps * c->ctl_drcnt))
    829 				if (count_replica_on_ctl(c, TRUE,
    830 				    &db_cnt, min_reps))
    831 					return (-1);
    832 
    833 			total_cnt += db_cnt;
    834 		}
    835 	}
    836 
    837 	return (total_cnt);
    838 }
    839 
    840 static int
    841 balance_replicas(
    842 	mdsetname_t		*sp,
    843 	md_ctlr_ctl_t		**clpp,
    844 	md_drive_desc		*opdd,
    845 	md_drive_desc		*curdd,
    846 	daddr_t			dbsize,
    847 	int			*minimum_replicas,
    848 	md_error_t		*ep
    849 )
    850 {
    851 	int			n;
    852 	int			rctlrs = 0;
    853 	int			uctlrs;
    854 	int			ructlrs;
    855 	int			octlrs;
    856 	int			save_done;
    857 	int			prevcnt = 0, issame = 1;
    858 	uint_t			drvcnt = ~0U;
    859 	uint_t			save_cnum;
    860 	mhd_ctlrtype_t		save_ctype;
    861 	char			save_cname[16];
    862 	char			*cmp_name_1, *cmp_name_2;
    863 	int			reps;
    864 	md_ctlr_ctl_t		*c;
    865 
    866 	/*
    867 	 * Build a ctlr list with SSA-100 busses NOT as separate controllers.
    868 	 */
    869 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
    870 		return (-1);
    871 
    872 	/*
    873 	 * Determine what controllers are usable in the sense of being able to
    874 	 * add a replica to a drive on the controller.
    875 	 * Also find the minimum number of drives on a controller.
    876 	 */
    877 	for (c = *clpp; c != NULL; c = c->ctl_next) {
    878 		if (c->ctl_drcnt > 0) {
    879 			rctlrs++;
    880 			drvcnt = min(drvcnt, c->ctl_drcnt);
    881 			if (prevcnt == 0)
    882 				prevcnt = c->ctl_drcnt;
    883 			else if (prevcnt != c->ctl_drcnt)
    884 				issame = 0;
    885 		}
    886 	}
    887 
    888 	if ((rctlrs <= 2) || (issame && (drvcnt >= 30)))
    889 		goto cont;
    890 
    891 	/*
    892 	 * If here: Handling 3 or more controllers most
    893 	 *	    likely with non-symmetrical number of
    894 	 *	    disks. The number of replicas will be
    895 	 *	    the minimum number of disks on a controller.
    896 	 *
    897 	 *	    The main point is to insure that a
    898 	 *	    controller does not have more than half
    899 	 *	    of the replicas.
    900 	 */
    901 	drvcnt = min(drvcnt, 12);
    902 	drvcnt = max(drvcnt, MD_MINBALREP);
    903 
    904 	/*
    905 	 * Can we find fewer than the maximum replicas by reducing the
    906 	 * number of replicas per drive.
    907 	 */
    908 	for (n = drvcnt; n > 0; n--) {
    909 		reps = count_replicas(*clpp, n);
    910 		if (reps > 0 && reps <= MDDB_NLB) {
    911 			*minimum_replicas = n;
    912 			return (0);
    913 		}
    914 	}
    915 
    916 cont:
    917 	free_ctlr_lst(clpp);
    918 
    919 	/*
    920 	 * Build a ctlr list with SSA-100 busses as separate controllers.
    921 	 *
    922 	 * If Here: Try to put 2 replicas per controller/bus
    923 	 *	    If that doesn't work put 1 replica per controller/bus
    924 	 */
    925 	if (build_ctlr_lst(sp, clpp, opdd, curdd, TRUE, dbsize, ep) == -1)
    926 		return (-1);
    927 
    928 	/*
    929 	 * If the number of "real" controllers is 2, special handling may be
    930 	 * needed.
    931 	 */
    932 	if (rctlrs != 2) {
    933 		drvcnt = MD_MINBALREP;
    934 		goto other;
    935 	}
    936 
    937 	/*
    938 	 * Determine what controllers are usable in the sense of being able to
    939 	 * add a replica to a drive on the controller.
    940 	 * Also find the minimum number of drives on a controller.
    941 	 */
    942 	drvcnt = ~0U;
    943 	uctlrs = 0;
    944 	for (c = *clpp; c != NULL; c = c->ctl_next) {
    945 		if (c->ctl_drcnt > 0) {
    946 			uctlrs++;
    947 			drvcnt = min(drvcnt, c->ctl_drcnt);
    948 		}
    949 	}
    950 
    951 	/*
    952 	 * If the number of controllers is not changed, continue with original
    953 	 * strategy.
    954 	 */
    955 	if (uctlrs == rctlrs) {
    956 		drvcnt = MD_MINBALREP;
    957 		goto other;
    958 	}
    959 
    960 	/*
    961 	 * Check the distribution of bus ctlrs across real controllers.
    962 	 */
    963 	ructlrs = 0;
    964 	octlrs = 0;
    965 	save_done = 0;
    966 	for (c = *clpp; c != NULL; c = c->ctl_next) {
    967 		if (c->ctl_drcnt == 0)
    968 			continue;
    969 
    970 		if (! save_done) {
    971 			save_cnum = c->ctl_cinfop->cnum;
    972 			save_ctype = c->ctl_cinfop->ctype;
    973 			(void) strncpy(save_cname, c->ctl_cinfop->cname, 16);
    974 			save_done = 1;
    975 		}
    976 
    977 		(void) sdssc_convert_cluster_path(c->ctl_cinfop->cname,
    978 		    &cmp_name_1);
    979 		(void) sdssc_convert_cluster_path(save_cname, &cmp_name_2);
    980 
    981 		if (save_ctype != c->ctl_cinfop->ctype ||
    982 		    save_cnum != c->ctl_cinfop->cnum ||
    983 		    strncmp(cmp_name_1, cmp_name_2, 16) != 0)
    984 			octlrs++;
    985 		else
    986 			ructlrs++;
    987 
    988 		sdssc_convert_path_free(cmp_name_1);
    989 		sdssc_convert_path_free(cmp_name_2);
    990 	}
    991 
    992 	/*
    993 	 * Take the largest of the counts
    994 	 */
    995 	ructlrs = max(ructlrs, octlrs);
    996 
    997 	/*
    998 	 * If the distribution of bus controlers is half of the total, then
    999 	 * this layout strategy will work, doit.
   1000 	 */
   1001 	if ((uctlrs / 2) == ructlrs) {
   1002 		drvcnt = MD_MINBALREP;
   1003 		goto other;
   1004 	}
   1005 
   1006 	/*
   1007 	 * If here, there is a distribution of bus controllers that will cause
   1008 	 * the real controller distribution to be unbalanced, so a different
   1009 	 * strategy is used.
   1010 	 */
   1011 	free_ctlr_lst(clpp);
   1012 
   1013 	/*
   1014 	 * Build the ctlr list with SSA-100 busses NOT as separate controllers.
   1015 	 */
   1016 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
   1017 		return (-1);
   1018 
   1019 	/*
   1020 	 * Make ctl_drcnt limit the number of replicas
   1021 	 */
   1022 	for (c = *clpp; c != NULL; c = c->ctl_next)
   1023 		c->ctl_drcnt = min(drvcnt, c->ctl_drcnt);
   1024 
   1025 	/*
   1026 	 * Try at least MD_MINBALREP's per controller after changing ctl_drcnt
   1027 	 */
   1028 	drvcnt = MD_MINBALREP;
   1029 
   1030 other:
   1031 	/*
   1032 	 * Can we find fewer than the maximum replicas by reducing the number
   1033 	 * of replicas per drive.
   1034 	 */
   1035 	for (n = drvcnt; n > 0; n--) {
   1036 		reps = count_replicas(*clpp, n);
   1037 		if (reps > 0 && reps <= MDDB_NLB) {
   1038 			*minimum_replicas = n;
   1039 			return (0);
   1040 		}
   1041 	}
   1042 
   1043 	free_ctlr_lst(clpp);
   1044 
   1045 	/*
   1046 	 * Build a ctlr list with SSA-100 busses NOT as separate controllers.
   1047 	 *
   1048 	 * If Here: Try to put 2 replicas per controller (not on busses)
   1049 	 *	    If that doesn't work put 1 replica per controller
   1050 	 */
   1051 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
   1052 		return (-1);
   1053 
   1054 	/*
   1055 	 * Can we find fewer than the maximum replicas by reducing the
   1056 	 * number of replicas per drive.
   1057 	 */
   1058 	for (n = MD_MINBALREP; n > 0; n--) {
   1059 		reps = count_replicas(*clpp, n);
   1060 		if (reps > 0 && reps <= MDDB_NLB) {
   1061 			*minimum_replicas = n;
   1062 			return (0);
   1063 		}
   1064 	}
   1065 
   1066 	/*
   1067 	 * Return a ctrl list that does not include the SSA-100 buses as
   1068 	 * separate controllers.  This will create fewer separate controllers.
   1069 	 */
   1070 	*minimum_replicas = 1;
   1071 	return (0);
   1072 }
   1073 
   1074 static int
   1075 morethan2_ctl_balance(
   1076 	mdsetname_t		*sp,
   1077 	md_ctlr_ctl_t		*clp,
   1078 	int			min_reps,
   1079 	md_error_t		*ep
   1080 )
   1081 {
   1082 	md_ctlr_ctl_t		*c;
   1083 	int			err;
   1084 	int			multiple_reps = 0;
   1085 	md_ctlr_drv_t		*d;
   1086 
   1087 	for (c = clp; c != NULL; c = c->ctl_next) {
   1088 		if (c->ctl_drcnt == 0)
   1089 			continue;
   1090 
   1091 		/*
   1092 		 * check for multiple databases on a disk and compensate
   1093 		 */
   1094 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
   1095 			if (d->drv_dbcnt)
   1096 				multiple_reps += d->drv_dbcnt - 1;
   1097 		}
   1098 
   1099 		/*
   1100 		 * remove the number of multiple databases count from the
   1101 		 * total db count. This enables us to rebalance if one of
   1102 		 * the disks has a large enough slice for 2 metadb's. If we
   1103 		 * then add a disk with a smaller slice into the set, we want
   1104 		 * that disk to get a replica on it. If we just compare to
   1105 		 * ctl_dbcnt, it won't.
   1106 		 */
   1107 		while ((c->ctl_dbcnt - multiple_reps) <
   1108 		    min_reps) {
   1109 			if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0)
   1110 				return (-1);
   1111 			if (err == 0)
   1112 				break;
   1113 		}
   1114 
   1115 		while (c->ctl_dbcnt > min_reps) {
   1116 			if ((err = del_replica_from_ctl(sp, c, ep)) < 0)
   1117 				return (-1);
   1118 			if (err == 0)
   1119 				break;
   1120 		}
   1121 	}
   1122 
   1123 	return (0);
   1124 }
   1125 
   1126 static int
   1127 lessthan3_ctl_balance(
   1128 	mdsetname_t		*sp,
   1129 	md_ctlr_ctl_t		*clp,
   1130 	int			min_reps,
   1131 	md_error_t		*ep
   1132 )
   1133 {
   1134 	md_ctlr_ctl_t		*c;
   1135 	int			err;
   1136 	int			multiple_reps = 0;
   1137 	md_ctlr_drv_t		*d;
   1138 
   1139 	for (c = clp; c != NULL; c = c->ctl_next) {
   1140 		if (c->ctl_drcnt == 0)
   1141 			continue;
   1142 
   1143 		/*
   1144 		 * check for multiple databases on a disk and compensate
   1145 		 */
   1146 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
   1147 			if (d->drv_dbcnt)
   1148 				multiple_reps += d->drv_dbcnt - 1;
   1149 		}
   1150 
   1151 		/*
   1152 		 * remove the number of multiple databases count from the
   1153 		 * total db count. This enables us to rebalance if one of
   1154 		 * the disks has a large enough slice for 2 metadb's. If we
   1155 		 * then add a disk with a smaller slice into the set, we want
   1156 		 * that disk to get a replica on it. If we just compare to
   1157 		 * ctl_dbcnt, it won't.
   1158 		 */
   1159 		while ((c->ctl_dbcnt - multiple_reps) <
   1160 		    (min_reps * c->ctl_drcnt)) {
   1161 			if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0)
   1162 				return (-1);
   1163 			if (err == 0)
   1164 				break;
   1165 		}
   1166 
   1167 		while (c->ctl_dbcnt > (min_reps * c->ctl_drcnt)) {
   1168 			if ((err = del_replica_from_ctl(sp, c, ep)) < 0)
   1169 				return (-1);
   1170 			if (err == 0)
   1171 				break;
   1172 		}
   1173 	}
   1174 
   1175 	return (0);
   1176 }
   1177 
   1178 static int
   1179 try_again(
   1180 	md_ctlr_ctl_t	*clp,
   1181 	md_error_t	*ep
   1182 )
   1183 {
   1184 	md_ctlr_ctl_t	*c;
   1185 	md_ctlr_drv_t	*d;
   1186 
   1187 	if (mdismddberror(ep, MDE_TOOMANY_REPLICAS))
   1188 		return (TRUE);
   1189 
   1190 	/*
   1191 	 * retry if all the errored drives are already in the diskset.
   1192 	 */
   1193 	for (c = clp; c != NULL; c = c->ctl_next) {
   1194 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
   1195 			if ((d->drv_flags & (DRV_F_INDISKSET|DRV_F_ERROR))
   1196 			    == DRV_F_ERROR)
   1197 				return (FALSE);
   1198 		}
   1199 	}
   1200 	return (TRUE);
   1201 }
   1202 
   1203 int
   1204 meta_db_balance(
   1205 	mdsetname_t		*sp,
   1206 	md_drive_desc		*opdd,
   1207 	md_drive_desc		*curdd,
   1208 	daddr_t			dbsize,
   1209 	md_error_t		*ep
   1210 )
   1211 {
   1212 	int			min_reps;
   1213 	md_ctlr_ctl_t		*c, *cl = NULL;
   1214 	int			uctlrs = 0;
   1215 	int			retry = 0;
   1216 	int			rval = 0;
   1217 
   1218 	if (balance_replicas(sp, &cl, opdd, curdd, dbsize, &min_reps, ep) == -1)
   1219 		return (-1);
   1220 
   1221 	/*
   1222 	 * Determine what controllers are usable in the sense of being able to
   1223 	 * add a replica to a drive on the controller.
   1224 	 */
   1225 	for (c = cl; c != NULL; c = c->ctl_next)
   1226 		if (c->ctl_drcnt > 0)
   1227 			uctlrs++;
   1228 
   1229 	/*
   1230 	 * Add replicas to achieve a balance.
   1231 	 */
   1232 	if (uctlrs > 2)
   1233 		rval = morethan2_ctl_balance(sp, cl, min_reps, ep);
   1234 	else
   1235 		rval = lessthan3_ctl_balance(sp, cl, min_reps, ep);
   1236 
   1237 	if (rval) {
   1238 		if ((retry = try_again(cl, ep)) == TRUE) {
   1239 			mdclrerror(ep);
   1240 			rval = 0;
   1241 		}
   1242 	}
   1243 
   1244 	/*
   1245 	 * Delete all the replicas from drives that are so marked.
   1246 	 */
   1247 	if (! rval)
   1248 		rval = del_replicas(sp, cl, ep);
   1249 
   1250 	if (retry) {
   1251 		if (uctlrs > 2)
   1252 			rval = morethan2_ctl_balance(sp, cl, min_reps, ep);
   1253 		else
   1254 			rval = lessthan3_ctl_balance(sp, cl, min_reps, ep);
   1255 
   1256 		if (rval && mdismddberror(ep, MDE_TOOMANY_REPLICAS)) {
   1257 			mdclrerror(ep);
   1258 			rval = 0;
   1259 		}
   1260 	}
   1261 
   1262 	/*
   1263 	 * Free up the ctlr list.
   1264 	 */
   1265 	free_ctlr_lst(&cl);
   1266 
   1267 	return (rval);
   1268 }
   1269