1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Database location balancing code. 28 */ 29 30 #include <meta.h> 31 #include <sys/lvm/md_mddb.h> 32 #include <sdssc.h> 33 34 #define MD_MINBALREP 2 35 36 /* 37 * Stuff for DB balancing. 38 */ 39 enum md_ctlr_ops_t { 40 DRV_NOP = 0, 41 DRV_ADD = 1, 42 DRV_DEL = 2 43 }; 44 typedef enum md_ctlr_ops_t md_ctlr_ops_t; 45 46 /* drive flag fields */ 47 #define DRV_F_ERROR 0x1 48 #define DRV_F_INDISKSET 0x2 49 50 struct md_ctlr_drv_t { 51 md_ctlr_ops_t drv_op; 52 int drv_flags; 53 int drv_dbcnt; 54 int drv_new_dbcnt; 55 daddr_t drv_dbsize; 56 mddrivename_t *drv_dnp; 57 struct md_ctlr_drv_t *drv_next; 58 }; 59 typedef struct md_ctlr_drv_t md_ctlr_drv_t; 60 61 struct md_ctlr_ctl_t { 62 mdcinfo_t *ctl_cinfop; 63 int ctl_dbcnt; 64 int ctl_drcnt; 65 md_ctlr_drv_t *ctl_drvs; 66 struct md_ctlr_ctl_t *ctl_next; 67 }; 68 typedef struct md_ctlr_ctl_t md_ctlr_ctl_t; 69 70 static int 71 add_replica( 72 mdsetname_t *sp, 73 mddrivename_t *dnp, 74 int dbcnt, 75 daddr_t dbsize, 76 md_error_t *ep 77 ) 78 { 79 mdnamelist_t *nlp = NULL; 80 mdname_t *np; 81 md_set_desc *sd; 82 uint_t rep_slice; 83 84 if (meta_replicaslice(dnp, &rep_slice, ep) != 0) 85 return (-1); 86 87 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) 88 return (-1); 89 90 (void) metanamelist_append(&nlp, np); 91 92 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 93 metafreenamelist(nlp); 94 return (-1); 95 } 96 97 if (meta_db_attach(sp, nlp, (MDCHK_DRVINSET | MDCHK_SET_LOCKED), 98 (&sd->sd_ctime), dbcnt, dbsize, NULL, ep) == -1) { 99 metafreenamelist(nlp); 100 return (-1); 101 } 102 103 metafreenamelist(nlp); 104 return (0); 105 } 106 107 static int 108 del_replica( 109 mdsetname_t *sp, 110 mddrivename_t *dnp, 111 md_error_t *ep 112 ) 113 { 114 mdnamelist_t *nlp = NULL; 115 mdname_t *np; 116 uint_t rep_slice; 117 118 if (meta_replicaslice(dnp, &rep_slice, ep) != 0) 119 return (-1); 120 121 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) 122 return (-1); 123 124 (void) metanamelist_append(&nlp, np); 125 126 if (meta_db_detach(sp, nlp, (MDFORCE_DS | MDFORCE_SET_LOCKED), 127 NULL, ep) == -1) { 128 metafreenamelist(nlp); 129 return (-1); 130 } 131 132 metafreenamelist(nlp); 133 return (0); 134 } 135 136 static int 137 rep_has_err(md_replicalist_t *rlp, mdname_t *np) 138 { 139 md_replicalist_t *rl; 140 141 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 142 md_replica_t *r = rl->rl_repp; 143 144 if (strcmp(r->r_namep->cname, np->cname) != 0) 145 continue; 146 147 if (r->r_flags & (MDDB_F_EREAD | MDDB_F_EFMT | MDDB_F_EDATA | 148 MDDB_F_EMASTER | MDDB_F_EWRITE)) 149 return (1); 150 151 } 152 return (0); 153 } 154 155 static int 156 add_drv_to_ctl_lst( 157 md_ctlr_ctl_t **clpp, 158 md_replicalist_t *rlp, 159 mddrivename_t *dnp, 160 int dbcnt, 161 daddr_t dbsize, 162 mdcinfo_t *cinfop, 163 int indiskset, 164 int with_bus, 165 int errored, 166 md_error_t *ep 167 ) 168 { 169 md_ctlr_drv_t **dpp; 170 mdname_t *np; 171 mdcinfo_t *tcinfop; 172 char *cmp_name_1, *cmp_name_2; 173 int not_found; 174 175 /* 176 * The user must pass in a list head. 177 */ 178 assert(clpp != NULL); 179 180 if (cinfop == NULL) { 181 uint_t rep_slice; 182 183 if (meta_replicaslice(dnp, &rep_slice, ep) != 0) { 184 /* 185 * A failure to get the slice information can occur 186 * because the drive has failed, if this is the 187 * case then there is nothing that can be done 188 * with this drive, so do not include it in the 189 * list of drives. Clear the error and return. 190 */ 191 mdclrerror(ep); 192 return (0); 193 } 194 195 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) 196 return (-1); 197 198 if ((tcinfop = metagetcinfo(np, ep)) == NULL) 199 return (-1); 200 201 if (metagetvtoc(np, FALSE, NULL, ep) == NULL) 202 errored = 1; 203 204 if (rep_has_err(rlp, np)) 205 errored = 1; 206 } else 207 tcinfop = cinfop; 208 209 for (/* void */; *clpp != NULL; clpp = &(*clpp)->ctl_next) { 210 /* 211 * Try to locate ctlr. 212 */ 213 (void) sdssc_convert_cluster_path(tcinfop->cname, &cmp_name_1); 214 (void) sdssc_convert_cluster_path((*clpp)->ctl_cinfop->cname, 215 &cmp_name_2); 216 217 if (tcinfop->ctype != (*clpp)->ctl_cinfop->ctype || 218 tcinfop->cnum != (*clpp)->ctl_cinfop->cnum || 219 strncmp(cmp_name_1, cmp_name_2, 16) != 0 || 220 (with_bus && tcinfop->bus != (*clpp)->ctl_cinfop->bus)) { 221 not_found = 1; 222 } else 223 not_found = 0; 224 225 226 sdssc_convert_path_free(cmp_name_1); 227 sdssc_convert_path_free(cmp_name_2); 228 229 if (not_found) 230 continue; 231 232 /* 233 * Found ctlr, try to locate the drive. 234 */ 235 for (dpp = &(*clpp)->ctl_drvs; *dpp != NULL; 236 dpp = &(*dpp)->drv_next) { 237 (void) sdssc_convert_cluster_path( 238 (*dpp)->drv_dnp->cname, &cmp_name_1); 239 (void) sdssc_convert_cluster_path(dnp->cname, 240 &cmp_name_2); 241 242 not_found = strcmp(cmp_name_1, cmp_name_2); 243 244 sdssc_convert_path_free(cmp_name_1); 245 sdssc_convert_path_free(cmp_name_2); 246 247 if (not_found) 248 continue; 249 250 /* 251 * Found drive, must be deleting. 252 */ 253 (*dpp)->drv_op = DRV_DEL; 254 if (indiskset) 255 (*dpp)->drv_flags |= DRV_F_INDISKSET; 256 if (errored) { 257 mdclrerror(ep); 258 (*dpp)->drv_flags |= DRV_F_ERROR; 259 } 260 (*clpp)->ctl_dbcnt -= (*dpp)->drv_dbcnt; 261 (*clpp)->ctl_drcnt--; 262 return (0); 263 } 264 /* 265 * The ctlr was found, but not the drive, so add 266 * the drive 267 */ 268 (*dpp) = Zalloc(sizeof (**dpp)); 269 270 271 if (indiskset) { 272 (*dpp)->drv_op = DRV_NOP; 273 (*dpp)->drv_flags |= DRV_F_INDISKSET; 274 if (errored) { 275 mdclrerror(ep); 276 (*dpp)->drv_flags |= DRV_F_ERROR; 277 } 278 } else { 279 (*dpp)->drv_op = DRV_ADD; 280 if (errored) { 281 (*dpp)->drv_flags |= DRV_F_ERROR; 282 return (-1); 283 } 284 assert(dbsize != 0); 285 } 286 (*dpp)->drv_dbcnt = dbcnt; 287 (*dpp)->drv_dbsize = dbsize; 288 (*dpp)->drv_dnp = dnp; 289 (*clpp)->ctl_dbcnt += dbcnt; 290 (*clpp)->ctl_drcnt++; 291 return (0); 292 } 293 /* 294 * No ctlr was located, so add the ctlr, then recurse to add the 295 * drive to the ctlr. 296 */ 297 (*clpp) = Zalloc(sizeof (**clpp)); 298 299 (*clpp)->ctl_cinfop = tcinfop; 300 301 return (add_drv_to_ctl_lst(clpp, rlp, dnp, dbcnt, dbsize, tcinfop, 302 indiskset, with_bus, errored, ep)); 303 } 304 305 static int 306 add_replica_to_ctl( 307 mdsetname_t *sp, 308 md_ctlr_ctl_t *c, 309 int minimum_replicas, 310 md_error_t *ep 311 ) 312 { 313 md_ctlr_drv_t *d; 314 int maxdb = 0; 315 316 /* 317 * If this ctrl has no "usable" drives, assert() or just return if 318 * assert()'s are turned off. 319 */ 320 if (c->ctl_drcnt == 0) { 321 assert(0); 322 return (0); 323 } 324 325 /* 326 * Determine the largest DB count on a drive. 327 */ 328 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) 329 if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL) 330 maxdb = d->drv_dbcnt; 331 332 /* 333 * Make sure we start at a reasonable number 334 */ 335 if (maxdb == 0) 336 maxdb = 1; 337 338 /* 339 * Add a replica to a drive on this ctrl. 340 */ 341 /*CONSTCOND*/ 342 while (1) { 343 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 344 /* 345 * If this drive is being deleted, skip it. 346 */ 347 if (d->drv_op == DRV_DEL) 348 continue; 349 350 if (d->drv_flags & DRV_F_ERROR) 351 continue; 352 /* 353 * Make sure that the replicas are distributed across 354 * the drives. 355 */ 356 if (d->drv_dbcnt >= maxdb) 357 continue; 358 /* 359 * See if the drive already has replicas, 360 * if it does, then delete the exisiting 361 * replica(s) and re-add n+1 replicas to the drive. 362 */ 363 /* ==== Vulnerability - no DB's start ==== */ 364 if (d->drv_dbcnt > 0) { 365 if (del_replica(sp, d->drv_dnp, ep) == -1) { 366 d->drv_flags |= DRV_F_ERROR; 367 if (! (d->drv_flags & DRV_F_INDISKSET)) 368 return (-1); 369 mdclrerror(ep); 370 continue; 371 } 372 } 373 if (add_replica(sp, d->drv_dnp, (d->drv_dbcnt + 1), 374 d->drv_dbsize, ep) == -1) { 375 md_error_t nep = mdnullerror; 376 377 if (d->drv_dbcnt) { 378 /* 379 * We have to to bring the replica 380 * in the drive to the previous 381 * status by adding the original no 382 * of replicas to the drive since 383 * the addition of (drv_dbcnt+1) no 384 * of replicas has failed. If we 385 * leave it at this state, we might 386 * end up having no replicas at 387 * all for the diskset. 388 */ 389 if (add_replica(sp, d->drv_dnp, 390 d->drv_dbcnt, d->drv_dbsize, 391 &nep) == -1) { 392 c->ctl_dbcnt -= d->drv_dbcnt; 393 d->drv_dbcnt = 0; 394 mdclrerror(&nep); 395 } 396 } 397 398 if (mdismddberror(ep, MDE_TOOMANY_REPLICAS)) 399 return (-1); 400 401 if (mdismddberror(ep, MDE_REPLICA_TOOSMALL)) 402 continue; 403 404 d->drv_flags |= DRV_F_ERROR; 405 if (! (d->drv_flags & DRV_F_INDISKSET)) 406 return (-1); 407 mdclrerror(ep); 408 continue; 409 } 410 411 d->drv_dbcnt++; 412 c->ctl_dbcnt++; 413 /* ==== Vulnerability - no DB's end ==== */ 414 return (1); 415 } 416 maxdb++; 417 if (maxdb > minimum_replicas) 418 return (0); 419 } 420 /*NOTREACHED*/ 421 } 422 423 static int 424 del_replica_from_ctl( 425 mdsetname_t *sp, 426 md_ctlr_ctl_t *c, 427 md_error_t *ep 428 ) 429 { 430 md_ctlr_drv_t *d; 431 int maxdb = 0; 432 433 /* 434 * If this ctrl has no "usable" drives, assert() or just return if 435 * assert()'s are turned off. 436 */ 437 if (c->ctl_drcnt == 0) { 438 assert(0); 439 return (0); 440 } 441 442 /* 443 * Determine the largest DB count on a drive. 444 */ 445 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) 446 if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL) 447 maxdb = d->drv_dbcnt; 448 449 if (maxdb == 0) 450 return (0); 451 452 /* 453 * Delete a replica from a drive on this ctrl. 454 */ 455 /*CONSTCOND*/ 456 while (1) { 457 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 458 /* 459 * If this drive is being deleted, skip it. 460 */ 461 if (d->drv_op == DRV_DEL) 462 continue; 463 464 /* 465 * Make sure that there are replicas on this drive to 466 * delete. 467 */ 468 if (d->drv_dbcnt == 0) 469 continue; 470 471 if (d->drv_flags & DRV_F_ERROR) 472 continue; 473 474 /* 475 * We need to keep the DB's distributed across the 476 * drives. 477 */ 478 if (d->drv_dbcnt < maxdb) 479 continue; 480 481 /* 482 * Delete all the replicas on the drive. 483 */ 484 /* ==== Vulnerability - no DB's start ==== */ 485 if (del_replica(sp, d->drv_dnp, ep) == -1) { 486 d->drv_flags |= DRV_F_ERROR; 487 if (! (d->drv_flags & DRV_F_INDISKSET)) 488 return (-1); 489 mdclrerror(ep); 490 continue; 491 } 492 d->drv_dbcnt--; 493 c->ctl_dbcnt--; 494 /* 495 * If there is still a dbcnt for this drive, then add 496 * back the needed DB's. 497 */ 498 if (d->drv_dbcnt > 0) { 499 if (add_replica(sp, d->drv_dnp, d->drv_dbcnt, 500 d->drv_dbsize, ep) == -1) { 501 c->ctl_dbcnt -= d->drv_dbcnt; 502 d->drv_dbcnt = 0; 503 504 if (mdismddberror(ep, 505 MDE_TOOMANY_REPLICAS)) 506 return (-1); 507 508 d->drv_flags |= DRV_F_ERROR; 509 if (! (d->drv_flags & DRV_F_INDISKSET)) 510 return (-1); 511 mdclrerror(ep); 512 continue; 513 } 514 } 515 /* ==== Vulnerability - no DB's end ==== */ 516 return (1); 517 } 518 maxdb--; 519 if (maxdb <= 0) 520 return (0); 521 } 522 /*NOTREACHED*/ 523 } 524 525 static int 526 del_replicas(mdsetname_t *sp, md_ctlr_ctl_t *clp, md_error_t *ep) 527 { 528 md_ctlr_ctl_t *c; 529 md_ctlr_drv_t *d; 530 mdnamelist_t *nlp; 531 mdname_t *np; 532 533 for (c = clp; c != NULL; c = c->ctl_next) { 534 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 535 uint_t rep_slice; 536 537 if (! (d->drv_flags & DRV_F_ERROR) && 538 (d->drv_op != DRV_DEL)) 539 continue; 540 541 if (d->drv_dbcnt == 0) 542 continue; 543 544 if (meta_replicaslice(d->drv_dnp, 545 &rep_slice, ep) != 0) 546 return (-1); 547 548 np = metaslicename(d->drv_dnp, rep_slice, ep); 549 if (np == NULL) 550 return (-1); 551 552 nlp = NULL; 553 (void) metanamelist_append(&nlp, np); 554 555 /* 556 * Delete the replicas listed. 557 */ 558 if (meta_db_detach(sp, nlp, 559 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, 560 ep) == -1) { 561 metafreenamelist(nlp); 562 if (d->drv_flags & DRV_F_INDISKSET) { 563 mdclrerror(ep); 564 continue; 565 } 566 return (-1); 567 } 568 metafreenamelist(nlp); 569 } 570 } 571 572 return (0); 573 } 574 575 static void 576 free_ctlr_lst(md_ctlr_ctl_t **clpp) 577 { 578 md_ctlr_ctl_t *c, *tc = NULL; 579 md_ctlr_drv_t *d, *td = NULL; 580 581 for (c = *clpp; c != NULL; c = tc) { 582 tc = c->ctl_next; 583 for (d = c->ctl_drvs; d != NULL; d = td) { 584 td = d->drv_next; 585 Free(d); 586 } 587 Free(c); 588 } 589 *clpp = NULL; 590 } 591 592 static int 593 build_ctlr_lst( 594 mdsetname_t *sp, 595 md_ctlr_ctl_t **clpp, 596 md_drive_desc *opdd, 597 md_drive_desc *curdd, 598 int with_bus, 599 daddr_t dbsize, 600 md_error_t *ep 601 ) 602 { 603 md_drive_desc *d; 604 md_set_desc *sd; 605 daddr_t nblks; 606 md_replicalist_t *rlp = NULL; 607 static daddr_t min_dbsize = 0; 608 609 if (min_dbsize == 0) { 610 if ((nblks = meta_db_minreplica(sp, ep)) < 0) { 611 min_dbsize = MD_DBSIZE; 612 613 if (! metaislocalset(sp)) { 614 if ((sd = metaget_setdesc(sp, ep)) == NULL) 615 return (-1); 616 617 if (MD_MNSET_DESC(sd)) 618 min_dbsize = MD_MN_DBSIZE; 619 } 620 mdclrerror(ep); 621 } else 622 min_dbsize = nblks; 623 } 624 625 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) { 626 if (! mdismddberror(ep, MDE_DB_NODB) && 627 ! mdismddberror(ep, MDE_DB_NOTOWNER)) 628 return (-1); 629 mdclrerror(ep); 630 } 631 632 /* 633 * Add drives currently in the set to the ctlr list. 634 */ 635 for (d = curdd; d != NULL; d = d->dd_next) { 636 daddr_t this_dbsize = d->dd_dbsize; 637 638 if (this_dbsize == 0) 639 this_dbsize = min_dbsize; 640 641 if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, d->dd_dbcnt, 642 this_dbsize, NULL, TRUE, with_bus, 0, ep) == -1) 643 return (-1); 644 } 645 646 /* 647 * Add the drives that are being operated on to the ctlr list. 648 */ 649 for (d = opdd; d != NULL; d = d->dd_next) 650 if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, 0, dbsize, NULL, 651 FALSE, with_bus, 0, ep) == -1) 652 return (-1); 653 654 metafreereplicalist(rlp); 655 return (0); 656 } 657 658 static int 659 count_replica_on_ctl( 660 md_ctlr_ctl_t *c, 661 int adding, 662 int *db_cnt, 663 int minimum_replicas 664 ) 665 { 666 md_ctlr_drv_t *d; 667 int maxdb = 0; 668 669 /* 670 * If this ctrl has no "usable" drives, nothing to do. 671 */ 672 if (c->ctl_drcnt == 0) 673 return (0); 674 675 /* 676 * Determine the largest DB count on a drive. 677 */ 678 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) 679 if (d->drv_new_dbcnt > maxdb && d->drv_op != DRV_DEL) 680 maxdb = d->drv_new_dbcnt; 681 682 /* 683 * Make sure we start at a reasonable number 684 */ 685 if (maxdb == 0) { 686 if (!adding) 687 return (0); 688 maxdb = 1; 689 } 690 691 /* 692 * Count or Un-Count replicas that would be 693 * added or deleted respectively. 694 */ 695 /*CONSTCOND*/ 696 while (1) { 697 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 698 /* 699 * If this drive is being deleted, skip it. 700 */ 701 if (d->drv_op == DRV_DEL) 702 continue; 703 704 /* 705 * If the drive is errored and adding, skip it. 706 */ 707 if (adding && (d->drv_flags & DRV_F_ERROR)) 708 continue; 709 710 /* 711 * Make sure that the replicas are distributed across 712 * the drives. 713 */ 714 if (adding) { 715 if (d->drv_new_dbcnt >= maxdb) 716 continue; 717 } else { 718 if (d->drv_new_dbcnt == 0) 719 continue; 720 if (d->drv_new_dbcnt < maxdb) 721 continue; 722 } 723 724 /* 725 * Count or Un-Count replicas here. 726 */ 727 if (adding) { 728 mdpart_t *partp; 729 uint_t rep_slice; 730 md_error_t mde = mdnullerror; 731 732 if (meta_replicaslice(d->drv_dnp, 733 &rep_slice, &mde) != 0) { 734 mdclrerror(&mde); 735 continue; 736 } 737 738 partp = &d->drv_dnp->vtoc.parts[rep_slice]; 739 if (! partp) 740 continue; 741 742 if (((d->drv_new_dbcnt + 1) * d->drv_dbsize) > 743 (partp->size - 16)) 744 continue; 745 (*db_cnt)++; 746 d->drv_new_dbcnt++; 747 } else { 748 (*db_cnt)--; 749 d->drv_new_dbcnt--; 750 } 751 return (0); 752 } 753 754 /* 755 * This should make sure they get spread 756 * around. This is to emulate the {add,del}_replica 757 * routines. 758 */ 759 if (adding) { 760 maxdb++; 761 if (maxdb > minimum_replicas) 762 return (-1); 763 } else { 764 maxdb--; 765 if (maxdb <= 0) 766 return (-1); 767 } 768 } 769 /*NOTREACHED*/ 770 } 771 772 static int 773 count_replicas( 774 md_ctlr_ctl_t *clp, 775 int min_reps 776 ) 777 { 778 md_ctlr_ctl_t *c; 779 md_ctlr_drv_t *d; 780 int db_cnt; 781 int uctlrs = 0; 782 int total_cnt = 0; 783 784 /* 785 * Count the number of controllers, 786 * counting the replicas is slightly different based 787 * on the controller count. 788 */ 789 for (c = clp; c != NULL; c = c->ctl_next) 790 if (c->ctl_drcnt > 0) { 791 uctlrs++; 792 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) 793 d->drv_new_dbcnt = d->drv_dbcnt; 794 } 795 796 if (uctlrs > 2) { 797 for (c = clp; c != NULL; c = c->ctl_next) { 798 if (c->ctl_drcnt == 0) 799 continue; 800 801 db_cnt = c->ctl_dbcnt; 802 /* 803 * Count the replicas that would be added. 804 */ 805 while (db_cnt < min_reps) 806 if (count_replica_on_ctl(c, TRUE, 807 &db_cnt, min_reps)) 808 return (-1); 809 810 /* 811 * Un-Count the replicas that would be deleted. 812 */ 813 while (db_cnt > min_reps) 814 if (count_replica_on_ctl(c, FALSE, 815 &db_cnt, min_reps)) 816 return (-1); 817 total_cnt += db_cnt; 818 } 819 } else { 820 for (c = clp; c != NULL; c = c->ctl_next) { 821 if (c->ctl_drcnt == 0) 822 continue; 823 824 db_cnt = c->ctl_dbcnt; 825 /* 826 * Count the replicas that woud be added. 827 */ 828 while (db_cnt < (min_reps * c->ctl_drcnt)) 829 if (count_replica_on_ctl(c, TRUE, 830 &db_cnt, min_reps)) 831 return (-1); 832 833 total_cnt += db_cnt; 834 } 835 } 836 837 return (total_cnt); 838 } 839 840 static int 841 balance_replicas( 842 mdsetname_t *sp, 843 md_ctlr_ctl_t **clpp, 844 md_drive_desc *opdd, 845 md_drive_desc *curdd, 846 daddr_t dbsize, 847 int *minimum_replicas, 848 md_error_t *ep 849 ) 850 { 851 int n; 852 int rctlrs = 0; 853 int uctlrs; 854 int ructlrs; 855 int octlrs; 856 int save_done; 857 int prevcnt = 0, issame = 1; 858 uint_t drvcnt = ~0U; 859 uint_t save_cnum; 860 mhd_ctlrtype_t save_ctype; 861 char save_cname[16]; 862 char *cmp_name_1, *cmp_name_2; 863 int reps; 864 md_ctlr_ctl_t *c; 865 866 /* 867 * Build a ctlr list with SSA-100 busses NOT as separate controllers. 868 */ 869 if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1) 870 return (-1); 871 872 /* 873 * Determine what controllers are usable in the sense of being able to 874 * add a replica to a drive on the controller. 875 * Also find the minimum number of drives on a controller. 876 */ 877 for (c = *clpp; c != NULL; c = c->ctl_next) { 878 if (c->ctl_drcnt > 0) { 879 rctlrs++; 880 drvcnt = min(drvcnt, c->ctl_drcnt); 881 if (prevcnt == 0) 882 prevcnt = c->ctl_drcnt; 883 else if (prevcnt != c->ctl_drcnt) 884 issame = 0; 885 } 886 } 887 888 if ((rctlrs <= 2) || (issame && (drvcnt >= 30))) 889 goto cont; 890 891 /* 892 * If here: Handling 3 or more controllers most 893 * likely with non-symmetrical number of 894 * disks. The number of replicas will be 895 * the minimum number of disks on a controller. 896 * 897 * The main point is to insure that a 898 * controller does not have more than half 899 * of the replicas. 900 */ 901 drvcnt = min(drvcnt, 12); 902 drvcnt = max(drvcnt, MD_MINBALREP); 903 904 /* 905 * Can we find fewer than the maximum replicas by reducing the 906 * number of replicas per drive. 907 */ 908 for (n = drvcnt; n > 0; n--) { 909 reps = count_replicas(*clpp, n); 910 if (reps > 0 && reps <= MDDB_NLB) { 911 *minimum_replicas = n; 912 return (0); 913 } 914 } 915 916 cont: 917 free_ctlr_lst(clpp); 918 919 /* 920 * Build a ctlr list with SSA-100 busses as separate controllers. 921 * 922 * If Here: Try to put 2 replicas per controller/bus 923 * If that doesn't work put 1 replica per controller/bus 924 */ 925 if (build_ctlr_lst(sp, clpp, opdd, curdd, TRUE, dbsize, ep) == -1) 926 return (-1); 927 928 /* 929 * If the number of "real" controllers is 2, special handling may be 930 * needed. 931 */ 932 if (rctlrs != 2) { 933 drvcnt = MD_MINBALREP; 934 goto other; 935 } 936 937 /* 938 * Determine what controllers are usable in the sense of being able to 939 * add a replica to a drive on the controller. 940 * Also find the minimum number of drives on a controller. 941 */ 942 drvcnt = ~0U; 943 uctlrs = 0; 944 for (c = *clpp; c != NULL; c = c->ctl_next) { 945 if (c->ctl_drcnt > 0) { 946 uctlrs++; 947 drvcnt = min(drvcnt, c->ctl_drcnt); 948 } 949 } 950 951 /* 952 * If the number of controllers is not changed, continue with original 953 * strategy. 954 */ 955 if (uctlrs == rctlrs) { 956 drvcnt = MD_MINBALREP; 957 goto other; 958 } 959 960 /* 961 * Check the distribution of bus ctlrs across real controllers. 962 */ 963 ructlrs = 0; 964 octlrs = 0; 965 save_done = 0; 966 for (c = *clpp; c != NULL; c = c->ctl_next) { 967 if (c->ctl_drcnt == 0) 968 continue; 969 970 if (! save_done) { 971 save_cnum = c->ctl_cinfop->cnum; 972 save_ctype = c->ctl_cinfop->ctype; 973 (void) strncpy(save_cname, c->ctl_cinfop->cname, 16); 974 save_done = 1; 975 } 976 977 (void) sdssc_convert_cluster_path(c->ctl_cinfop->cname, 978 &cmp_name_1); 979 (void) sdssc_convert_cluster_path(save_cname, &cmp_name_2); 980 981 if (save_ctype != c->ctl_cinfop->ctype || 982 save_cnum != c->ctl_cinfop->cnum || 983 strncmp(cmp_name_1, cmp_name_2, 16) != 0) 984 octlrs++; 985 else 986 ructlrs++; 987 988 sdssc_convert_path_free(cmp_name_1); 989 sdssc_convert_path_free(cmp_name_2); 990 } 991 992 /* 993 * Take the largest of the counts 994 */ 995 ructlrs = max(ructlrs, octlrs); 996 997 /* 998 * If the distribution of bus controlers is half of the total, then 999 * this layout strategy will work, doit. 1000 */ 1001 if ((uctlrs / 2) == ructlrs) { 1002 drvcnt = MD_MINBALREP; 1003 goto other; 1004 } 1005 1006 /* 1007 * If here, there is a distribution of bus controllers that will cause 1008 * the real controller distribution to be unbalanced, so a different 1009 * strategy is used. 1010 */ 1011 free_ctlr_lst(clpp); 1012 1013 /* 1014 * Build the ctlr list with SSA-100 busses NOT as separate controllers. 1015 */ 1016 if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1) 1017 return (-1); 1018 1019 /* 1020 * Make ctl_drcnt limit the number of replicas 1021 */ 1022 for (c = *clpp; c != NULL; c = c->ctl_next) 1023 c->ctl_drcnt = min(drvcnt, c->ctl_drcnt); 1024 1025 /* 1026 * Try at least MD_MINBALREP's per controller after changing ctl_drcnt 1027 */ 1028 drvcnt = MD_MINBALREP; 1029 1030 other: 1031 /* 1032 * Can we find fewer than the maximum replicas by reducing the number 1033 * of replicas per drive. 1034 */ 1035 for (n = drvcnt; n > 0; n--) { 1036 reps = count_replicas(*clpp, n); 1037 if (reps > 0 && reps <= MDDB_NLB) { 1038 *minimum_replicas = n; 1039 return (0); 1040 } 1041 } 1042 1043 free_ctlr_lst(clpp); 1044 1045 /* 1046 * Build a ctlr list with SSA-100 busses NOT as separate controllers. 1047 * 1048 * If Here: Try to put 2 replicas per controller (not on busses) 1049 * If that doesn't work put 1 replica per controller 1050 */ 1051 if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1) 1052 return (-1); 1053 1054 /* 1055 * Can we find fewer than the maximum replicas by reducing the 1056 * number of replicas per drive. 1057 */ 1058 for (n = MD_MINBALREP; n > 0; n--) { 1059 reps = count_replicas(*clpp, n); 1060 if (reps > 0 && reps <= MDDB_NLB) { 1061 *minimum_replicas = n; 1062 return (0); 1063 } 1064 } 1065 1066 /* 1067 * Return a ctrl list that does not include the SSA-100 buses as 1068 * separate controllers. This will create fewer separate controllers. 1069 */ 1070 *minimum_replicas = 1; 1071 return (0); 1072 } 1073 1074 static int 1075 morethan2_ctl_balance( 1076 mdsetname_t *sp, 1077 md_ctlr_ctl_t *clp, 1078 int min_reps, 1079 md_error_t *ep 1080 ) 1081 { 1082 md_ctlr_ctl_t *c; 1083 int err; 1084 int multiple_reps = 0; 1085 md_ctlr_drv_t *d; 1086 1087 for (c = clp; c != NULL; c = c->ctl_next) { 1088 if (c->ctl_drcnt == 0) 1089 continue; 1090 1091 /* 1092 * check for multiple databases on a disk and compensate 1093 */ 1094 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 1095 if (d->drv_dbcnt) 1096 multiple_reps += d->drv_dbcnt - 1; 1097 } 1098 1099 /* 1100 * remove the number of multiple databases count from the 1101 * total db count. This enables us to rebalance if one of 1102 * the disks has a large enough slice for 2 metadb's. If we 1103 * then add a disk with a smaller slice into the set, we want 1104 * that disk to get a replica on it. If we just compare to 1105 * ctl_dbcnt, it won't. 1106 */ 1107 while ((c->ctl_dbcnt - multiple_reps) < 1108 min_reps) { 1109 if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0) 1110 return (-1); 1111 if (err == 0) 1112 break; 1113 } 1114 1115 while (c->ctl_dbcnt > min_reps) { 1116 if ((err = del_replica_from_ctl(sp, c, ep)) < 0) 1117 return (-1); 1118 if (err == 0) 1119 break; 1120 } 1121 } 1122 1123 return (0); 1124 } 1125 1126 static int 1127 lessthan3_ctl_balance( 1128 mdsetname_t *sp, 1129 md_ctlr_ctl_t *clp, 1130 int min_reps, 1131 md_error_t *ep 1132 ) 1133 { 1134 md_ctlr_ctl_t *c; 1135 int err; 1136 int multiple_reps = 0; 1137 md_ctlr_drv_t *d; 1138 1139 for (c = clp; c != NULL; c = c->ctl_next) { 1140 if (c->ctl_drcnt == 0) 1141 continue; 1142 1143 /* 1144 * check for multiple databases on a disk and compensate 1145 */ 1146 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 1147 if (d->drv_dbcnt) 1148 multiple_reps += d->drv_dbcnt - 1; 1149 } 1150 1151 /* 1152 * remove the number of multiple databases count from the 1153 * total db count. This enables us to rebalance if one of 1154 * the disks has a large enough slice for 2 metadb's. If we 1155 * then add a disk with a smaller slice into the set, we want 1156 * that disk to get a replica on it. If we just compare to 1157 * ctl_dbcnt, it won't. 1158 */ 1159 while ((c->ctl_dbcnt - multiple_reps) < 1160 (min_reps * c->ctl_drcnt)) { 1161 if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0) 1162 return (-1); 1163 if (err == 0) 1164 break; 1165 } 1166 1167 while (c->ctl_dbcnt > (min_reps * c->ctl_drcnt)) { 1168 if ((err = del_replica_from_ctl(sp, c, ep)) < 0) 1169 return (-1); 1170 if (err == 0) 1171 break; 1172 } 1173 } 1174 1175 return (0); 1176 } 1177 1178 static int 1179 try_again( 1180 md_ctlr_ctl_t *clp, 1181 md_error_t *ep 1182 ) 1183 { 1184 md_ctlr_ctl_t *c; 1185 md_ctlr_drv_t *d; 1186 1187 if (mdismddberror(ep, MDE_TOOMANY_REPLICAS)) 1188 return (TRUE); 1189 1190 /* 1191 * retry if all the errored drives are already in the diskset. 1192 */ 1193 for (c = clp; c != NULL; c = c->ctl_next) { 1194 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 1195 if ((d->drv_flags & (DRV_F_INDISKSET|DRV_F_ERROR)) 1196 == DRV_F_ERROR) 1197 return (FALSE); 1198 } 1199 } 1200 return (TRUE); 1201 } 1202 1203 int 1204 meta_db_balance( 1205 mdsetname_t *sp, 1206 md_drive_desc *opdd, 1207 md_drive_desc *curdd, 1208 daddr_t dbsize, 1209 md_error_t *ep 1210 ) 1211 { 1212 int min_reps; 1213 md_ctlr_ctl_t *c, *cl = NULL; 1214 int uctlrs = 0; 1215 int retry = 0; 1216 int rval = 0; 1217 1218 if (balance_replicas(sp, &cl, opdd, curdd, dbsize, &min_reps, ep) == -1) 1219 return (-1); 1220 1221 /* 1222 * Determine what controllers are usable in the sense of being able to 1223 * add a replica to a drive on the controller. 1224 */ 1225 for (c = cl; c != NULL; c = c->ctl_next) 1226 if (c->ctl_drcnt > 0) 1227 uctlrs++; 1228 1229 /* 1230 * Add replicas to achieve a balance. 1231 */ 1232 if (uctlrs > 2) 1233 rval = morethan2_ctl_balance(sp, cl, min_reps, ep); 1234 else 1235 rval = lessthan3_ctl_balance(sp, cl, min_reps, ep); 1236 1237 if (rval) { 1238 if ((retry = try_again(cl, ep)) == TRUE) { 1239 mdclrerror(ep); 1240 rval = 0; 1241 } 1242 } 1243 1244 /* 1245 * Delete all the replicas from drives that are so marked. 1246 */ 1247 if (! rval) 1248 rval = del_replicas(sp, cl, ep); 1249 1250 if (retry) { 1251 if (uctlrs > 2) 1252 rval = morethan2_ctl_balance(sp, cl, min_reps, ep); 1253 else 1254 rval = lessthan3_ctl_balance(sp, cl, min_reps, ep); 1255 1256 if (rval && mdismddberror(ep, MDE_TOOMANY_REPLICAS)) { 1257 mdclrerror(ep); 1258 rval = 0; 1259 } 1260 } 1261 1262 /* 1263 * Free up the ctlr list. 1264 */ 1265 free_ctlr_lst(&cl); 1266 1267 return (rval); 1268 } 1269