1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/conf.h> 29 #include <sys/time.h> 30 #include <sys/uio.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/systeminfo.h> 34 #include <sys/sysmacros.h> 35 #include <sys/buf.h> 36 #include <sys/kmem.h> 37 #include <sys/file.h> 38 #include <sys/open.h> 39 #include <sys/debug.h> 40 #include <sys/stat.h> 41 #include <sys/lvm/mdvar.h> 42 #include <sys/lvm/md_crc.h> 43 #include <sys/lvm/md_convert.h> 44 #include <sys/types.h> 45 #include <sys/kmem.h> 46 #include <sys/lvm/mdmn_commd.h> 47 #include <sys/cladm.h> 48 49 mhd_mhiargs_t defmhiargs = { 50 1000, 51 { 6000, 6000, 30000 } 52 }; 53 54 #define MDDB 55 56 #include <sys/lvm/mdvar.h> 57 #include <sys/lvm/mdmed.h> 58 #include <sys/lvm/md_names.h> 59 #include <sys/cred.h> 60 #include <sys/ddi.h> 61 #include <sys/sunddi.h> 62 #include <sys/esunddi.h> 63 64 #include <sys/sysevent/eventdefs.h> 65 #include <sys/sysevent/svm.h> 66 67 extern char svm_bootpath[]; 68 69 int md_maxbootlist = MAXBOOTLIST; 70 static 
ulong_t mddb_maxblocks = 0; /* tune for small records */ 71 static int mddb_maxbufheaders = 50; 72 static uint_t mddb_maxcopies = MDDB_NLB; 73 74 /* 75 * If this is set, more detailed messages about DB init will be given, instead 76 * of just the MDE_DB_NODB. 77 */ 78 static int mddb_db_err_detail = 0; 79 80 /* 81 * This lock is used to single-thread load/unload of all sets 82 */ 83 static kmutex_t mddb_lock; 84 85 /* 86 * You really do NOT want to change this boolean. 87 * It can be VERY dangerous to do so. Loss of 88 * data may occur. USE AT YOUR OWN RISK!!!! 89 */ 90 static int mddb_allow_half = 0; 91 /* 92 * For mirrored root allow reboot with only half the replicas available 93 * Flag inserted for Santa Fe project. 94 */ 95 int mirrored_root_flag; 96 97 #define ISWHITE(c) (((c) == ' ') || ((c) == '\t') || \ 98 ((c) == '\r') || ((c) == '\n')) 99 #define ISNUM(c) (((c) >= '0') && ((c) <= '9')) 100 101 #define SETMUTEX(setno) (&md_set[setno].s_dbmx) 102 103 extern md_krwlock_t md_unit_array_rw; /* md.c */ 104 extern set_t md_nsets; /* md.c */ 105 extern int md_nmedh; /* md.c */ 106 extern md_set_t md_set[]; /* md.c */ 107 extern int (*mdv_strategy_tstpnt)(buf_t *, int, void*); 108 extern dev_info_t *md_devinfo; 109 extern int md_init_debug; 110 extern int md_status; 111 extern md_ops_t *md_opslist; 112 extern md_krwlock_t nm_lock; 113 114 static int update_locatorblock(mddb_set_t *s, md_dev64_t dev, 115 ddi_devid_t didptr, ddi_devid_t old_didptr); 116 117 /* 118 * Defines for crc calculation for records 119 * rec_crcgen generates a crc checksum for a record block 120 * rec_crcchk checks the crc checksum for a record block 121 */ 122 #define REC_CRCGEN 0 123 #define REC_CRCCHK 1 124 #define rec_crcgen(s, dep, rbp) \ 125 (void) rec_crcfunc(s, dep, rbp, REC_CRCGEN) 126 #define rec_crcchk(s, dep, rbp) \ 127 rec_crcfunc(s, dep, rbp, REC_CRCCHK) 128 129 /* 130 * During upgrade, SVM basically runs with the devt from the target 131 * being upgraded. 
Translations are made from the target devt to the 132 * miniroot devt when writing data out to the disk. This is done by 133 * the following routines: 134 * wrtblklst 135 * writeblks 136 * readblklst 137 * readblks 138 * dt_read 139 * 140 * The following routines are used by the routines listed above and 141 * expect a translated (aka miniroot) devt: 142 * getblks 143 * getmasters 144 * 145 * Also, when calling any system routines, such as ddi_lyr_get_devid, 146 * the translated (aka miniroot) devt must be used. 147 * 148 * By the same token, the major number and major name conversion operations 149 * need to use the name_to_major file from the target system instead 150 * of the name_to_major file on the miniroot. So, calls to 151 * ddi_name_to_major must be replaced with calls to md_targ_name_to_major 152 * when running on an upgrade. Same is true with calls to 153 * ddi_major_to_name. 154 */ 155 156 157 #ifndef MDDB_FAKE 158 159 static int 160 mddb_rwdata( 161 mddb_set_t *s, /* incore db set structure */ 162 int flag, /* B_ASYNC, B_FAILFAST or 0 passed in here */ 163 buf_t *bp 164 ) 165 { 166 int err = 0; 167 168 bp->b_flags = (flag | B_BUSY) & (~B_ASYNC); 169 170 mutex_exit(SETMUTEX(s->s_setno)); 171 if (mdv_strategy_tstpnt == NULL || 172 (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0) 173 (void) bdev_strategy(bp); 174 175 if (flag & B_ASYNC) { 176 mutex_enter(SETMUTEX(s->s_setno)); 177 return (0); 178 } 179 180 err = biowait(bp); 181 mutex_enter(SETMUTEX(s->s_setno)); 182 return (err); 183 } 184 185 static void 186 setidentifier( 187 mddb_set_t *s, 188 identifier_t *ident 189 ) 190 { 191 if (s->s_setno == MD_LOCAL_SET) 192 (void) strcpy(&ident->serial[0], s->s_ident.serial); 193 else 194 ident->createtime = s->s_ident.createtime; 195 } 196 197 static int 198 cmpidentifier( 199 mddb_set_t *s, 200 identifier_t *ident 201 ) 202 { 203 if (s->s_setno == MD_LOCAL_SET) 204 return (strcmp(ident->serial, s->s_ident.serial)); 205 else 206 return (timercmp(&ident->createtime, 
207 /*CSTYLED*/ 208 &s->s_ident.createtime, !=)); 209 } 210 211 static int 212 mddb_devopen( 213 md_dev64_t dev 214 ) 215 { 216 dev_t ddi_dev = md_dev64_to_dev(dev); 217 218 if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0) 219 return (0); 220 return (1); 221 } 222 223 static void 224 mddb_devclose( 225 md_dev64_t dev 226 ) 227 { 228 (void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred); 229 } 230 231 /* 232 * stripe_skip_ts 233 * 234 * Returns a list of fields to be skipped in the stripe record structure. 235 * These fields are ms_timestamp in the component structure. 236 * Used to skip these fields when calculating the checksum. 237 */ 238 static crc_skip_t * 239 stripe_skip_ts(void *un, uint_t revision) 240 { 241 struct ms_row32_od *small_mdr; 242 struct ms_row *big_mdr; 243 uint_t row, comp, ncomps, compoff; 244 crc_skip_t *skip; 245 crc_skip_t *skip_prev; 246 crc_skip_t skip_start = {0, 0, 0}; 247 ms_unit_t *big_un; 248 ms_unit32_od_t *small_un; 249 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 250 251 switch (revision) { 252 case MDDB_REV_RB: 253 case MDDB_REV_RBFN: 254 small_un = (ms_unit32_od_t *)un; 255 skip_prev = &skip_start; 256 257 if (small_un->un_nrows == 0) 258 return (NULL); 259 /* 260 * walk through all rows to find the total number 261 * of components 262 */ 263 small_mdr = &small_un->un_row[0]; 264 ncomps = 0; 265 for (row = 0; (row < small_un->un_nrows); row++) { 266 ncomps += small_mdr[row].un_ncomp; 267 } 268 269 /* Now walk through the components */ 270 compoff = small_un->un_ocomp + rb_off; 271 for (comp = 0; (comp < ncomps); ++comp) { 272 uint_t mdcp = compoff + 273 (comp * sizeof (ms_comp32_od_t)); 274 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 275 KM_SLEEP); 276 skip->skip_offset = mdcp + 277 offsetof(ms_comp32_od_t, un_mirror.ms_timestamp); 278 skip->skip_size = sizeof (md_timeval32_t); 279 skip_prev->skip_next = skip; 280 skip_prev = skip; 281 } 282 break; 283 case MDDB_REV_RB64: 284 case 
MDDB_REV_RB64FN: 285 big_un = (ms_unit_t *)un; 286 skip_prev = &skip_start; 287 288 if (big_un->un_nrows == 0) 289 return (NULL); 290 /* 291 * walk through all rows to find the total number 292 * of components 293 */ 294 big_mdr = &big_un->un_row[0]; 295 ncomps = 0; 296 for (row = 0; (row < big_un->un_nrows); row++) { 297 ncomps += big_mdr[row].un_ncomp; 298 } 299 300 /* Now walk through the components */ 301 compoff = big_un->un_ocomp + rb_off; 302 for (comp = 0; (comp < ncomps); ++comp) { 303 uint_t mdcp = compoff + 304 (comp * sizeof (ms_comp_t)); 305 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 306 KM_SLEEP); 307 skip->skip_offset = mdcp + 308 offsetof(ms_comp_t, un_mirror.ms_timestamp); 309 skip->skip_size = sizeof (md_timeval32_t); 310 skip_prev->skip_next = skip; 311 skip_prev = skip; 312 } 313 break; 314 } 315 /* Return the start of the list of fields to skip */ 316 return (skip_start.skip_next); 317 } 318 319 /* 320 * mirror_skip_ts 321 * 322 * Returns a list of fields to be skipped in the mirror record structure. 323 * This includes un_last_read and sm_timestamp for each submirror 324 * Used to skip these fields when calculating the checksum. 
325 */ 326 static crc_skip_t * 327 mirror_skip_ts(uint_t revision) 328 { 329 int i; 330 crc_skip_t *skip; 331 crc_skip_t *skip_prev; 332 crc_skip_t skip_start = {0, 0, 0}; 333 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 334 335 skip_prev = &skip_start; 336 337 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 338 switch (revision) { 339 case MDDB_REV_RB: 340 case MDDB_REV_RBFN: 341 skip->skip_offset = offsetof(mm_unit32_od_t, 342 un_last_read) + rb_off; 343 break; 344 case MDDB_REV_RB64: 345 case MDDB_REV_RB64FN: 346 skip->skip_offset = offsetof(mm_unit_t, 347 un_last_read) + rb_off; 348 break; 349 } 350 skip->skip_size = sizeof (int); 351 skip_prev->skip_next = skip; 352 skip_prev = skip; 353 354 for (i = 0; i < NMIRROR; i++) { 355 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 356 switch (revision) { 357 case MDDB_REV_RB: 358 case MDDB_REV_RBFN: 359 skip->skip_offset = offsetof(mm_unit32_od_t, 360 un_sm[i].sm_timestamp) + rb_off; 361 break; 362 case MDDB_REV_RB64: 363 case MDDB_REV_RB64FN: 364 skip->skip_offset = offsetof(mm_unit_t, 365 un_sm[i].sm_timestamp) + rb_off; 366 break; 367 } 368 skip->skip_size = sizeof (md_timeval32_t); 369 skip_prev->skip_next = skip; 370 skip_prev = skip; 371 } 372 /* Return the start of the list of fields to skip */ 373 return (skip_start.skip_next); 374 } 375 376 /* 377 * hotspare_skip_ts 378 * 379 * Returns a list of the timestamp fields in the hotspare record structure. 380 * Used to skip these fields when calculating the checksum. 
381 */ 382 static crc_skip_t * 383 hotspare_skip_ts(uint_t revision) 384 { 385 crc_skip_t *skip; 386 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 387 388 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 389 switch (revision) { 390 case MDDB_REV_RB: 391 case MDDB_REV_RBFN: 392 skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) + 393 rb_off; 394 break; 395 case MDDB_REV_RB64: 396 case MDDB_REV_RB64FN: 397 skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) + 398 rb_off; 399 break; 400 } 401 skip->skip_size = sizeof (md_timeval32_t); 402 return (skip); 403 } 404 405 /* 406 * rec_crcfunc 407 * 408 * Calculate or check the checksum for a record 409 * Calculate the crc if check == 0, Check the crc if check == 1 410 * 411 * Record block may be written by different nodes in a multi-owner diskset 412 * (in case of master change), the function rec_crcchk excludes timestamp 413 * fields in crc computation of record data. 414 * Otherwise, timestamp fields will cause each node to have a different 415 * checksum for same record block causing the exclusive-or of all record block 416 * checksums and data block record sums to be non-zero after new master writes 417 * at least one record block. 418 */ 419 static uint_t 420 rec_crcfunc( 421 mddb_set_t *s, 422 mddb_de_ic_t *dep, 423 mddb_rb32_t *rbp, 424 int check 425 ) 426 { 427 crc_skip_t *skip; 428 crc_skip_t *skip_tail; 429 mddb_type_t type = dep->de_type1; 430 uint_t ret; 431 432 /* 433 * Generate a list of the areas to be skipped when calculating 434 * the checksum. 435 * First skip rb_checksum, rb_private and rb_userdata. 
436 */ 437 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 438 skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle); 439 skip->skip_size = 3 * sizeof (uint_t); 440 skip_tail = skip; 441 if (MD_MNSET_SETNO(s->s_setno)) { 442 /* For a MN set, skip rb_timestamp */ 443 skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 444 KM_SLEEP); 445 skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp); 446 skip_tail->skip_size = sizeof (md_timeval32_t); 447 skip->skip_next = skip_tail; 448 449 /* Now add a list of timestamps to be skipped */ 450 if (type >= MDDB_FIRST_MODID) { 451 switch (dep->de_flags) { 452 case MDDB_F_STRIPE: 453 skip_tail->skip_next = 454 stripe_skip_ts((void *)rbp->rb_data, 455 rbp->rb_revision); 456 break; 457 case MDDB_F_MIRROR: 458 skip_tail->skip_next = 459 mirror_skip_ts(rbp->rb_revision); 460 break; 461 case MDDB_F_HOTSPARE: 462 skip_tail->skip_next = 463 hotspare_skip_ts(rbp->rb_revision); 464 break; 465 default: 466 break; 467 } 468 } 469 } 470 471 if (check) { 472 ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip); 473 } else { 474 crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip); 475 ret = rbp->rb_checksum; 476 } 477 while (skip) { 478 crc_skip_t *skip_save = skip; 479 480 skip = skip->skip_next; 481 kmem_free(skip_save, sizeof (crc_skip_t)); 482 } 483 return (ret); 484 } 485 486 static mddb_bf_t * 487 allocbuffer( 488 mddb_set_t *s, 489 int sleepflag 490 ) 491 { 492 mddb_bf_t *bfp; 493 494 while ((bfp = s->s_freebufhead) == NULL) { 495 if (sleepflag == MDDB_NOSLEEP) 496 return ((mddb_bf_t *)NULL); 497 ++s->s_bufmisses; 498 #ifdef DEBUG 499 if (s->s_bufmisses == 1) 500 cmn_err(CE_NOTE, 501 "md: mddb: set %u sleeping for buffer", s->s_setno); 502 #endif 503 s->s_bufwakeup = 1; 504 cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno)); 505 } 506 s->s_freebufhead = bfp->bf_next; 507 bzero((caddr_t)bfp, sizeof (*bfp)); 508 bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf; 509 bfp->bf_buf.b_flags = 
B_BUSY; /* initialize flags */ 510 return (bfp); 511 } 512 513 static void 514 freebuffer( 515 mddb_set_t *s, 516 mddb_bf_t *bfp 517 ) 518 { 519 bfp->bf_next = s->s_freebufhead; 520 s->s_freebufhead = bfp; 521 if (s->s_bufwakeup) { 522 cv_broadcast(&s->s_buf_cv); 523 s->s_bufwakeup = 0; 524 } 525 } 526 527 528 static void 529 blkbusy( 530 mddb_set_t *s, 531 mddb_block_t blk 532 ) 533 { 534 int bit, byte; 535 536 s->s_freeblkcnt--; 537 byte = blk / 8; 538 bit = 1 << (blk & 7); 539 ASSERT(! (s->s_freebitmap[byte] & bit)); 540 s->s_freebitmap[byte] |= bit; 541 } 542 543 static void 544 blkfree( 545 mddb_set_t *s, 546 mddb_block_t blk 547 ) 548 { 549 int bit, byte; 550 551 s->s_freeblkcnt++; 552 byte = blk / 8; 553 bit = 1 << (blk & 7); 554 ASSERT(s->s_freebitmap[byte] & bit); 555 s->s_freebitmap[byte] &= ~bit; 556 } 557 558 static int 559 blkcheck( 560 mddb_set_t *s, 561 mddb_block_t blk 562 ) 563 { 564 int bit, byte; 565 566 byte = blk / 8; 567 bit = 1 << (blk & 7); 568 return (s->s_freebitmap[byte] & bit); 569 } 570 571 /* 572 * not fast but simple 573 */ 574 static mddb_block_t 575 getfreeblks( 576 mddb_set_t *s, 577 size_t count 578 ) 579 { 580 int i; 581 size_t contig; 582 583 contig = 0; 584 for (i = 0; i < s->s_totalblkcnt; i++) { 585 if (blkcheck(s, i)) { 586 contig = 0; 587 } else { 588 contig++; 589 if (contig == count) { 590 contig = i - count + 1; 591 for (i = (int)contig; i < contig + count; i++) 592 blkbusy(s, i); 593 return ((mddb_block_t)contig); 594 } 595 } 596 } 597 return (0); 598 } 599 600 static void 601 computefreeblks( 602 mddb_set_t *s 603 ) 604 { 605 mddb_db_t *dbp; 606 mddb_de_ic_t *dep; 607 int i; 608 int minblks; 609 int freeblks; 610 mddb_mb_ic_t *mbip; 611 mddb_lb_t *lbp; 612 mddb_block_t maxblk; 613 mddb_did_db_t *did_dbp; 614 int nblks; 615 616 minblks = 0; 617 lbp = s->s_lbp; 618 maxblk = 0; 619 620 /* 621 * Determine the max number of blocks. 622 */ 623 nblks = (lbp->lb_flags & MDDB_MNSET) ? 
MDDB_MN_MAXBLKS : MDDB_MAXBLKS; 624 /* 625 * go through and find highest logical block 626 */ 627 for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) { 628 if (dbp->db_blknum > maxblk) 629 maxblk = dbp->db_blknum; 630 for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next) 631 for (i = 0; i < dep->de_blkcount; i++) 632 if (dep->de_blks[i] > maxblk) 633 maxblk = dep->de_blks[i]; 634 } 635 636 for (i = 0; i < lbp->lb_loccnt; i++) { 637 mddb_locator_t *lp = &lbp->lb_locators[i]; 638 639 if ((lp->l_flags & MDDB_F_DELETED) || 640 (lp->l_flags & MDDB_F_EMASTER)) 641 continue; 642 643 freeblks = 0; 644 for (mbip = s->s_mbiarray[i]; mbip != NULL; 645 mbip = mbip->mbi_next) { 646 freeblks += mbip->mbi_mddb_mb.mb_blkcnt; 647 } 648 if (freeblks == 0) /* this happen when there is no */ 649 continue; /* master blk */ 650 651 if (freeblks <= maxblk) { 652 lp->l_flags |= MDDB_F_TOOSMALL; 653 lp->l_flags &= ~MDDB_F_ACTIVE; 654 } 655 656 if (freeblks < minblks || minblks == 0) 657 minblks = freeblks; 658 } 659 /* 660 * set up reasonable freespace if no 661 * data bases exist 662 */ 663 if (minblks == 0) 664 minblks = 100; 665 if (minblks > nblks) 666 minblks = nblks; 667 s->s_freeblkcnt = minblks; 668 s->s_totalblkcnt = minblks; 669 if (! 
s->s_freebitmapsize) { 670 s->s_freebitmapsize = nblks / 8; 671 s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize, 672 KM_SLEEP); 673 } 674 bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize); 675 676 /* locator block sectors */ 677 for (i = 0; i < s->s_lbp->lb_blkcnt; i++) 678 blkbusy(s, i); 679 680 /* locator name sectors */ 681 for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++) 682 blkbusy(s, (s->s_lbp->lb_lnfirstblk + i)); 683 684 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 685 /* locator block device id information */ 686 for (i = 0; i < s->s_lbp->lb_didblkcnt; i++) 687 blkbusy(s, (s->s_lbp->lb_didfirstblk + i)); 688 689 /* disk blocks containing actual device ids */ 690 did_dbp = s->s_did_icp->did_ic_dbp; 691 while (did_dbp) { 692 for (i = 0; i < did_dbp->db_blkcnt; i++) { 693 blkbusy(s, did_dbp->db_firstblk + i); 694 } 695 did_dbp = did_dbp->db_next; 696 } 697 } 698 699 /* Only use data tags if not a MN set */ 700 if (!(lbp->lb_flags & MDDB_MNSET)) { 701 /* Found a bad tag, do NOT mark the data tag blks busy here */ 702 if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) { 703 for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++) 704 blkbusy(s, (s->s_lbp->lb_dtfirstblk + i)); 705 } 706 } 707 708 /* directory block/entry sectors */ 709 for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) { 710 blkbusy(s, dbp->db_blknum); 711 for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next) 712 for (i = 0; i < dep->de_blkcount; i++) 713 blkbusy(s, dep->de_blks[i]); 714 } 715 } 716 717 /* 718 * Add free space to the device id incore free list. 719 * Called: 720 * - During startup when all devid blocks are temporarily placed on the 721 * free list 722 * - After a devid has been deleted via the metadb command. 
723 * - When mddb_devid_free_get adds unused space from a disk block 724 * to free list 725 */ 726 static int 727 mddb_devid_free_add( 728 mddb_set_t *s, 729 uint_t firstblk, 730 uint_t offset, 731 uint_t length 732 ) 733 { 734 mddb_did_free_t *did_freep; 735 736 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 737 return (0); 738 } 739 740 did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t), 741 KM_SLEEP); 742 did_freep->free_blk = firstblk; 743 did_freep->free_offset = offset; 744 did_freep->free_length = length; 745 did_freep->free_next = s->s_did_icp->did_ic_freep; 746 s->s_did_icp->did_ic_freep = did_freep; 747 748 return (0); 749 } 750 751 /* 752 * Remove specific free space from the device id incore free list. 753 * Called at startup (after all devid blocks have been placed on 754 * free list) in order to remove the free space from the list that 755 * contains actual devids. 756 * Returns 0 if area successfully removed. 757 * Returns 1 if no matching area is found - so nothing removed. 758 */ 759 static int 760 mddb_devid_free_delete( 761 mddb_set_t *s, 762 uint_t firstblk, 763 uint_t offset, 764 uint_t length 765 ) 766 { 767 int block_found = 0; 768 mddb_did_free_t *did_freep1; /* next free block */ 769 mddb_did_free_t *did_freep2 = 0; /* previous free block */ 770 mddb_did_free_t *did_freep_before; /* area before offset, len */ 771 mddb_did_free_t *did_freep_after; /* area after offset, len */ 772 uint_t old_length; 773 774 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 775 return (1); 776 } 777 778 /* find free block for this devid */ 779 did_freep1 = s->s_did_icp->did_ic_freep; 780 while (did_freep1) { 781 /* 782 * Look through free list of <block, offset, length> to 783 * find our entry in the free list. Our entry should 784 * exist since the entire devid block was placed into 785 * this free list at startup. 
This code is just removing 786 * the non-free (in-use) portions of the devid block so 787 * that the remaining linked list does indeed just 788 * contain a free list. 789 * 790 * Our entry has been found if 791 * - the blocks match, 792 * - the offset (starting address) in the free list is 793 * less than the offset of our entry and 794 * - the length+offset (ending address) in the free list is 795 * greater than the length+offset of our entry. 796 */ 797 if ((did_freep1->free_blk == firstblk) && 798 (did_freep1->free_offset <= offset) && 799 ((did_freep1->free_length + did_freep1->free_offset) >= 800 (length + offset))) { 801 /* Have found our entry - remove from list */ 802 block_found = 1; 803 did_freep_before = did_freep1; 804 old_length = did_freep1->free_length; 805 /* did_freep1 - pts to next free block */ 806 did_freep1 = did_freep1->free_next; 807 if (did_freep2) { 808 did_freep2->free_next = did_freep1; 809 } else { 810 s->s_did_icp->did_ic_freep = did_freep1; 811 } 812 813 /* 814 * did_freep_before points to area in block before 815 * offset, length. 816 */ 817 did_freep_before->free_length = offset - 818 did_freep_before->free_offset; 819 /* 820 * did_freep_after points to area in block after 821 * offset, length. 822 */ 823 did_freep_after = (mddb_did_free_t *)kmem_zalloc 824 (sizeof (mddb_did_free_t), KM_SLEEP); 825 did_freep_after->free_blk = did_freep_before->free_blk; 826 did_freep_after->free_offset = offset + length; 827 did_freep_after->free_length = old_length - length - 828 did_freep_before->free_length; 829 /* 830 * Add before and after areas to free list 831 * If area before or after offset, length has length 832 * of 0, that entry is not added. 
833 */ 834 if (did_freep_after->free_length) { 835 did_freep_after->free_next = did_freep1; 836 if (did_freep2) { 837 did_freep2->free_next = 838 did_freep_after; 839 } else { 840 s->s_did_icp->did_ic_freep = 841 did_freep_after; 842 } 843 did_freep1 = did_freep_after; 844 } else { 845 kmem_free(did_freep_after, 846 sizeof (mddb_did_free_t)); 847 } 848 849 if (did_freep_before->free_length) { 850 did_freep_before->free_next = did_freep1; 851 if (did_freep2) { 852 did_freep2->free_next = 853 did_freep_before; 854 } else { 855 s->s_did_icp->did_ic_freep = 856 did_freep_before; 857 } 858 } else { 859 kmem_free(did_freep_before, 860 sizeof (mddb_did_free_t)); 861 } 862 break; 863 } else { 864 did_freep2 = did_freep1; 865 did_freep1 = did_freep1->free_next; 866 } 867 } 868 if (block_found == 0) { 869 return (1); 870 } else { 871 return (0); 872 } 873 } 874 875 /* 876 * Find free space of devid length and remove free space from list. 877 * Return a pointer to the previously free area. 878 * 879 * If there's not enough free space on the free list, get an empty 880 * disk block, put the empty disk block on the did_ic_dbp linked list, 881 * and add the disk block space not used for devid to the free list. 882 * 883 * Return pointer to address (inside disk block) of free area for devid. 884 * Return 0 if error. 
885 */ 886 static caddr_t 887 mddb_devid_free_get( 888 mddb_set_t *s, 889 uint_t len, 890 uint_t *blk, 891 uint_t *cnt, 892 uint_t *offset 893 ) 894 { 895 mddb_did_free_t *freep, *freep2; 896 mddb_did_db_t *dbp; 897 uint_t blk_cnt, blk_num; 898 ddi_devid_t devid_ptr = NULL; 899 900 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 901 return (0); 902 } 903 904 freep = s->s_did_icp->did_ic_freep; 905 freep2 = (mddb_did_free_t *)NULL; 906 while (freep) { 907 /* found a free area - remove from free list */ 908 if (len <= freep->free_length) { 909 *blk = freep->free_blk; 910 *offset = freep->free_offset; 911 /* find disk block pointer that contains free area */ 912 dbp = s->s_did_icp->did_ic_dbp; 913 while (dbp) { 914 if (dbp->db_firstblk == *blk) 915 break; 916 else 917 dbp = dbp->db_next; 918 } 919 /* 920 * If a disk block pointer can't be found - something 921 * is wrong, so don't use this free space. 922 */ 923 if (dbp == NULL) { 924 freep2 = freep; 925 freep = freep->free_next; 926 continue; 927 } 928 929 devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset); 930 *cnt = dbp->db_blkcnt; 931 932 /* Update free list information */ 933 freep->free_offset += len; 934 freep->free_length -= len; 935 if (freep->free_length == 0) { 936 if (freep2) { 937 freep2->free_next = 938 freep->free_next; 939 } else { 940 s->s_did_icp->did_ic_freep = 941 freep->free_next; 942 } 943 kmem_free(freep, sizeof (mddb_did_free_t)); 944 } 945 break; 946 } 947 freep2 = freep; 948 freep = freep->free_next; 949 } 950 951 /* Didn't find a free spot */ 952 if (freep == NULL) { 953 /* get free logical disk blk in replica */ 954 blk_cnt = btodb(len + (MDDB_BSIZE - 1)); 955 blk_num = getfreeblks(s, blk_cnt); 956 if (blk_num == 0) 957 return (0); 958 959 /* Add disk block to disk block linked list */ 960 dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP); 961 dbp->db_firstblk = blk_num; 962 dbp->db_blkcnt = blk_cnt; 963 dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP); 964 dbp->db_next = 
s->s_did_icp->did_ic_dbp; 965 s->s_did_icp->did_ic_dbp = dbp; 966 devid_ptr = (ddi_devid_t)dbp->db_ptr; 967 968 /* Update return values */ 969 *blk = blk_num; 970 *offset = 0; 971 *cnt = blk_cnt; 972 973 /* Add unused part of block to free list */ 974 (void) mddb_devid_free_add(s, blk_num, 975 len, (dbtob(blk_cnt) - len)); 976 } 977 978 return ((caddr_t)devid_ptr); 979 } 980 981 /* 982 * Add device id information for locator index to device id area in set. 983 * Get free area to store device id from free list. Update checksum 984 * for mddb_did_blk. 985 * 986 * This routine does not write any data out to disk. 987 * After this routine has been called, the routine, writelocall, should 988 * be called to write both the locator block and device id area out 989 * to disk. 990 */ 991 static int 992 mddb_devid_add( 993 mddb_set_t *s, 994 uint_t index, 995 ddi_devid_t devid, 996 char *minor_name 997 ) 998 { 999 uint_t devid_len; 1000 uint_t blk, offset; 1001 ddi_devid_t devid_ptr; 1002 mddb_did_info_t *did_info; 1003 uint_t blkcnt, i; 1004 mddb_did_blk_t *did_blk; 1005 1006 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 1007 return (1); 1008 } 1009 if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1)) 1010 return (1); 1011 1012 /* Check if device id has already been added */ 1013 did_blk = s->s_did_icp->did_ic_blkp; 1014 did_info = &(did_blk->blk_info[index]); 1015 if (did_info->info_flags & MDDB_DID_EXISTS) 1016 return (0); 1017 1018 devid_len = ddi_devid_sizeof(devid); 1019 devid_ptr = (ddi_devid_t)mddb_devid_free_get(s, 1020 devid_len, &blk, &blkcnt, &offset); 1021 1022 if (devid_ptr == NULL) { 1023 return (1); 1024 } 1025 1026 /* Copy devid into devid free area */ 1027 for (i = 0; i < devid_len; i++) 1028 ((char *)devid_ptr)[i] = ((char *)devid)[i]; 1029 1030 /* Update mddb_did_info area for new device id */ 1031 did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID; 1032 1033 /* 1034 * Only set UPDATED flag for non-replicated import cases. 
1035 * This allows the side locator driver name index to get 1036 * updated in load_old_replicas. 1037 */ 1038 if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT)) 1039 did_info->info_flags |= MDDB_DID_UPDATED; 1040 1041 did_info->info_firstblk = blk; 1042 did_info->info_blkcnt = blkcnt; 1043 did_info->info_offset = offset; 1044 did_info->info_length = devid_len; 1045 (void) strcpy(did_info->info_minor_name, minor_name); 1046 crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL); 1047 1048 /* Add device id pointer to did_ic_devid array */ 1049 s->s_did_icp->did_ic_devid[index] = devid_ptr; 1050 1051 return (0); 1052 } 1053 1054 1055 /* 1056 * Delete device id information for locator index from device id area in set. 1057 * Add device id space to free area. 1058 * 1059 * This routine does not write any data out to disk. 1060 * After this routine has been called, the routine, writelocall, should 1061 * be called to write both the locator block and device id area out 1062 * to disk. 1063 */ 1064 static int 1065 mddb_devid_delete(mddb_set_t *s, uint_t index) 1066 { 1067 mddb_did_info_t *did_info; 1068 mddb_did_blk_t *did_blk; 1069 1070 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 1071 return (1); 1072 } 1073 1074 /* Get device id information from mddb_did_blk */ 1075 did_blk = s->s_did_icp->did_ic_blkp; 1076 did_info = &(did_blk->blk_info[index]); 1077 1078 /* 1079 * Ensure that the underlying device supports device ids 1080 * before arbitrarily removing them. 
1081 */ 1082 if (!(did_info->info_flags & MDDB_DID_EXISTS)) { 1083 return (1); 1084 } 1085 1086 /* Remove device id information from mddb_did_blk */ 1087 did_info->info_flags = 0; 1088 1089 /* Remove device id from incore area */ 1090 s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL; 1091 1092 /* Add new free space in disk block to free list */ 1093 (void) mddb_devid_free_add(s, did_info->info_firstblk, 1094 did_info->info_offset, did_info->info_length); 1095 1096 return (0); 1097 } 1098 1099 /* 1100 * Check if there is a device id for a locator index. 1101 * 1102 * Caller of this routine should not free devid or minor_name since 1103 * these will point to internal data structures that should not 1104 * be freed. 1105 */ 1106 static int 1107 mddb_devid_get( 1108 mddb_set_t *s, 1109 uint_t index, 1110 ddi_devid_t *devid, 1111 char **minor_name 1112 ) 1113 { 1114 mddb_did_info_t *did_info; 1115 1116 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 1117 return (0); 1118 } 1119 did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]); 1120 1121 if (did_info->info_flags & MDDB_DID_EXISTS) { 1122 *devid = s->s_did_icp->did_ic_devid[index]; 1123 *minor_name = 1124 s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name; 1125 return (1); 1126 } else 1127 return (0); 1128 1129 1130 } 1131 1132 /* 1133 * Check if device id is valid on current system. 1134 * Needs devid, previously known dev_t and current minor_name. 1135 * 1136 * Success: 1137 * Returns 0 if valid device id is found and updates 1138 * dev_t if the dev_t associated with the device id is 1139 * different than dev_t. 1140 * Failure: 1141 * Returns 1 if device id not valid on current system. 1142 */ 1143 static int 1144 mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name) 1145 { 1146 int retndevs; 1147 dev_t *ddi_devs; 1148 int devid_flag = 0; 1149 int cnt; 1150 1151 if (dev == 0) 1152 return (1); 1153 /* 1154 * See if devid is valid in the current system. 
* If so, set dev to match the devid.
	 */
	if (ddi_lyr_devid_to_devlist(devid, minor_name,
	    &retndevs, &ddi_devs) == DDI_SUCCESS) {
		if (retndevs > 0) {
			/* devid is valid to use */
			devid_flag = 1;
			/* does dev_t in list match dev */
			cnt = 0;
			while (cnt < retndevs) {
				if (*dev == md_expldev(ddi_devs[cnt]))
					break;
				cnt++;
			}
			/*
			 * If a different dev_t, then setup
			 * new dev and new major name
			 */
			if (cnt == retndevs) {
				*dev = md_expldev(ddi_devs[0]);
			}
			ddi_lyr_free_devlist(ddi_devs, retndevs);
		}
	}
	if (devid_flag)
		return (0);
	else
		return (1);
}


/*
 * Free the devid incore data areas
 *
 * Releases everything hanging off *did_icp: the device id block
 * (sized by lbp->lb_didblkcnt), every node of the did_ic_dbp list
 * together with the data each node points at, every node of the
 * did_ic_freep list, and finally the mddb_did_ic_t itself.
 * *did_icp is set to NULL afterwards.  Nothing is done when *did_icp
 * is already NULL.
 */
static void
mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp)
{
	mddb_did_free_t	*did_freep1, *did_freep2;
	mddb_did_db_t	*did_dbp1, *did_dbp2;
	mddb_did_ic_t	*icp = *did_icp;

	if (icp) {
		if (icp->did_ic_blkp) {
			kmem_free((caddr_t)icp->did_ic_blkp,
			    dbtob(lbp->lb_didblkcnt));
			icp->did_ic_blkp = (mddb_did_blk_t *)NULL;
		}

		if (icp->did_ic_dbp) {
			did_dbp1 = icp->did_ic_dbp;
			while (did_dbp1) {
				/* fetch the link before the node is freed */
				did_dbp2 = did_dbp1->db_next;
				kmem_free((caddr_t)did_dbp1->db_ptr,
				    dbtob(did_dbp1->db_blkcnt));
				kmem_free((caddr_t)did_dbp1,
				    sizeof (mddb_did_db_t));
				did_dbp1 = did_dbp2;
			}
		}

		if (icp->did_ic_freep) {
			did_freep1 = icp->did_ic_freep;
			while (did_freep1) {
				/* fetch the link before the node is freed */
				did_freep2 = did_freep1->free_next;
				kmem_free((caddr_t)did_freep1,
				    sizeof (mddb_did_free_t));
				did_freep1 = did_freep2;
			}
		}

		kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t));
		*did_icp = (mddb_did_ic_t *)NULL;
	}

}

/*
 * getphysblk - translate a logical block number into a physical one.
 *
 * Walks the chain of incore master blocks starting at mbip.  Each
 * master block accounts for mb_blkcnt logical blocks; blk is reduced
 * by that amount for every master block that is skipped.  The result
 * is mb_blkmap.m_firstblk + blk for the master block that holds blk.
 *
 * Returns (daddr_t)-1 when the chain ends before blk is reached or
 * when blk lies beyond mb_blkmap.m_consecutive.
 */
static daddr_t
getphysblk(
	mddb_block_t	blk,
	mddb_mb_ic_t	*mbip
)
{
	mddb_mb_t	*mbp = &(mbip->mbi_mddb_mb);

	while (blk >= mbp->mb_blkcnt) {
		if (! mbip->mbi_next)
			return ((daddr_t)-1);	/* no such block */
		blk -= mbp->mb_blkcnt;
		mbip = mbip->mbi_next;
		mbp = &(mbip->mbi_mddb_mb);
	}

	if (blk >= mbp->mb_blkmap.m_consecutive)
		return ((daddr_t)-1);	/* no such block */

	return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk));
}

/*
 * when a buf header is passed in the new buffer must be
 * put on the front of the chain. writerec counts on it
 *
 * putblks - write cnt blocks from buffer to device starting at blk.
 *
 * With bufhead == NULL the write is synchronous: the buffer header is
 * released as soon as mddb_rwdata() returns, and a failure posts an
 * ESC_SVM_ERRED sysevent and returns MDDB_F_EWRITE.
 *
 * With bufhead != NULL the write is issued with B_ASYNC, the buffer
 * header is pushed on the front of *bufhead and is not freed here,
 * the return value of mddb_rwdata() is discarded and 0 is returned.
 */
static int
putblks(
	mddb_set_t	*s,		/* incore db set structure */
	caddr_t		buffer,		/* adr of buffer to be written */
	daddr_t		blk,		/* block number for first block */
	int		cnt,		/* number of blocks to be written */
	md_dev64_t	device,		/* device to be written to */
	mddb_bf_t	**bufhead	/* if non-zero then ASYNC I/O */
					/* and put buf address here */
)
{
	buf_t		*bp;
	mddb_bf_t	*bfp;
	int		err = 0;

	bfp = allocbuffer(s, MDDB_SLEEPOK);
	bp = &bfp->bf_buf;
	bp->b_bcount = MDDB_BSIZE * cnt;
	bp->b_un.b_addr = buffer;
	bp->b_blkno = blk;
	bp->b_edev = md_dev64_to_dev(device);
	/*
	 * if a header for a buf chain is passed in this is async io.
	 * currently only done for optimize records
	 */
	if (bufhead) {
		bfp->bf_next = *bufhead;
		*bufhead = bfp;
		(void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp);
		return (0);
	}
	err = mddb_rwdata(s, B_WRITE, bp);
	freebuffer(s, bfp);
	if (err) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
		    s->s_setno, device);
		return (MDDB_F_EWRITE);
	}
	return (0);
}

/*
 * wrtblklst - takes an array of logical block numbers
 * and writes the buffer to those blocks (scatter).
 * If called during upgrade, this routine expects a
 * non-translated (aka target) dev.
*/
static int
wrtblklst(
	mddb_set_t	*s,		/* incore set structure */
	caddr_t		buffer,		/* buffer to be written (record blk) */
	mddb_block_t	blka[],		/* list of logical blks for record */
	daddr_t		cnt,		/* number of logical blks */
	const int	li,		/* locator index */
	mddb_bf_t	**bufhead,	/* if non-zero then ASYNC I/O */
					/* and put buf address here */
	int		master_only	/* allow only master node to write */
)
{
	daddr_t		blk;
	daddr_t		blk1;
	int		err = 0;
	int		cons;
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_locator_t	*lp = &lbp->lb_locators[li];
	md_dev64_t	dev;
	mddb_mb_ic_t	*mbip = s->s_mbiarray[li];

	/*
	 * If a MN diskset and only the master can write,
	 * then a non-master node will just return success.
	 */
	if (lbp->lb_flags & MDDB_MNSET) {
		if (master_only == MDDB_WR_ONLY_MASTER) {
			/* return successfully if we aren't the master */
			if (!(md_set[s->s_setno].s_am_i_master)) {
				return (0);
			}
		}
		/* no incore master blocks for this replica */
		if (mbip == NULL)
			return (MDDB_F_EWRITE);
	}

	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
	if (dev == NODEV64) {
		return (1);
	}

	blk = getphysblk(blka[0], mbip);
	ASSERT(blk >= 0);

	/*
	 * blk is the first physical block of the current run and cons
	 * is the run length so far.  The run grows while the next
	 * logical block maps to the physical block right after it;
	 * each completed run is written with a single putblks() call.
	 */
	cons = 1;
	while (cnt) {
		if (cons != cnt) {
			blk1 = getphysblk(blka[cons], mbip);
			ASSERT(blk1 >= 0);
			if ((blk + cons) == blk1) {
				cons++;
				continue;
			}
		}
		if (err = putblks(s, buffer, blk, cons, dev, bufhead)) {
			/*
			 * If an MN diskset and any_node_can_write
			 * then this request is coming from writeoptrecord
			 * and l_flags field should not be updated.
			 * l_flags will be updated as a result of sending
			 * a class1 message to the master.  Setting l_flags
			 * here will cause slave to be out of sync with
			 * master.
			 *
			 * Otherwise, set the error in l_flags
			 * (this occurs if this is not a MN diskset or
			 * only_master_can_write is set).
			 */
			if ((!(lbp->lb_flags & MDDB_MNSET)) ||
			    (master_only == MDDB_WR_ONLY_MASTER)) {
				lp->l_flags |= MDDB_F_EWRITE;
			}
			return (err);
		}
		/*
		 * putblks() put the new buffer header on the front of the
		 * chain; record which locator it was issued against.
		 */
		if (bufhead)
			(*bufhead)->bf_locator = lp;

		buffer += MDDB_BSIZE * cons;
		cnt -= cons;
		blka += cons;
		if (cnt) {
			blk = getphysblk(blka[0], mbip);
			ASSERT(blk >= 0);
		}
		cons = 1;
	}

	return (0);
}

/*
 * writeblks - takes a logical block number/block count pair
 * and writes the buffer to those contiguous logical blocks.
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
 *
 * Returns 0 on success, and also when this is a MN diskset, master_only
 * is MDDB_WR_ONLY_MASTER and this node is not the master.  Returns 1
 * when the locator's device cannot be translated (NODEV64).  A failed
 * single-block write sets MDDB_F_EWRITE in the locator's l_flags and
 * returns the putblks() error.
 */
static int
writeblks(
	mddb_set_t	*s,		/* incore set structure */
	caddr_t		buffer,		/* buffer to be written */
	mddb_block_t	blk,		/* starting logical block number */
	int		cnt,		/* number of log blocks to be written */
	const int	li,		/* locator index */
	int		master_only	/* allow only master node to write */
)
{
	daddr_t		physblk;
	int		err = 0;
	int		i;
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_locator_t	*lp = &lbp->lb_locators[li];
	md_dev64_t	dev;
	mddb_block_t	*blkarray;
	int		size;
	int		ret;

	/*
	 * If a MN diskset and only the master can write,
	 * then a non-master node will just return success.
	 */
	if ((lbp->lb_flags & MDDB_MNSET) &&
	    (master_only == MDDB_WR_ONLY_MASTER)) {
		/* return successfully if we aren't the master */
		if (!(md_set[s->s_setno].s_am_i_master)) {
			return (0);
		}
	}

	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
	if (dev == NODEV64) {
		return (1);
	}

	if (cnt > 1) {
		/* expand the range into an explicit list for wrtblklst() */
		size = sizeof (mddb_block_t) * cnt;
		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
		for (i = 0; i < cnt; i++)
			blkarray[i] = blk + i;
		/*
		 * NOTE(review): MDDB_WR_ONLY_MASTER is passed here rather
		 * than the caller's master_only.  On a non-master node of
		 * a MN diskset called with any other master_only value,
		 * wrtblklst() returns 0 without writing, while the
		 * single-block path below does write.  Confirm whether
		 * this is intended.
		 */
		ret = wrtblklst(s, buffer, blkarray, cnt,
		    li, 0, MDDB_WR_ONLY_MASTER);
		kmem_free(blkarray, size);
		return (ret);
	}
	physblk = getphysblk(blk, s->s_mbiarray[li]);
	ASSERT(physblk > 0);
	if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) {
		lp->l_flags |= MDDB_F_EWRITE;
		return (err);
	}
	return (0);
}

/*
 * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas.
 *
 * Locators that are not MDDB_F_ACTIVE or that already carry
 * MDDB_F_EWRITE are skipped.  The return value is the bitwise OR of
 * the writeblks() results for every replica that was tried.
 */
static int
writeall(
	mddb_set_t	*s,		/* incore set structure */
	caddr_t		buffer,		/* buffer to be written */
	mddb_block_t	block,		/* starting logical block number */
	int		cnt,		/* number of log blocks to be written */
	int		master_only	/* allow only master node to write */
)
{
	int		li;
	int		err = 0;
	mddb_lb_t	*lbp = s->s_lbp;

	for (li = 0; li < lbp->lb_loccnt; li++) {
		mddb_locator_t	*lp = &lbp->lb_locators[li];

		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
		    (lp->l_flags & MDDB_F_EWRITE))
			continue;

		err |= writeblks(s, buffer, block, cnt, li, master_only);
	}

	return (err);
}

/*
 * writelocall - write the locator block and device id information (if
 * replica is in device id format) to all ACTIVE/NON-ERRORED replicas.
 *
 * Increments the locator block's commitcnt.  Updates the device id area's
 * commitcnt if the replica is in device id format.
Regenerates the 1491 * checksums after updating the commitcnt(s). 1492 */ 1493 static int 1494 writelocall( 1495 mddb_set_t *s /* incore set structure */ 1496 ) 1497 { 1498 int li; 1499 int err = 0; 1500 mddb_lb_t *lbp = s->s_lbp; 1501 mddb_did_blk_t *did_blk; 1502 mddb_did_db_t *did_dbp; 1503 1504 s->s_lbp->lb_commitcnt++; 1505 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 1506 did_blk = s->s_did_icp->did_ic_blkp; 1507 did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt; 1508 crcgen(did_blk, &did_blk->blk_checksum, 1509 dbtob(lbp->lb_didblkcnt), NULL); 1510 } 1511 crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL); 1512 1513 for (li = 0; li < lbp->lb_loccnt; li++) { 1514 mddb_locator_t *lp = &lbp->lb_locators[li]; 1515 1516 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 1517 (lp->l_flags & MDDB_F_EWRITE)) 1518 continue; 1519 1520 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 1521 /* write out blocks containing actual device ids */ 1522 did_dbp = s->s_did_icp->did_ic_dbp; 1523 while (did_dbp) { 1524 err |= writeblks(s, (caddr_t)did_dbp->db_ptr, 1525 did_dbp->db_firstblk, 1526 did_dbp->db_blkcnt, li, 1527 MDDB_WR_ONLY_MASTER); 1528 did_dbp = did_dbp->db_next; 1529 } 1530 1531 /* write out device id area block */ 1532 err |= writeblks(s, (caddr_t)did_blk, 1533 lbp->lb_didfirstblk, lbp->lb_didblkcnt, li, 1534 MDDB_WR_ONLY_MASTER); 1535 } 1536 /* write out locator block */ 1537 err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, 1538 MDDB_WR_ONLY_MASTER); 1539 } 1540 1541 /* 1542 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag 1543 * in the mddb_set structure to show that the locator block has 1544 * been changed. 1545 */ 1546 1547 if ((lbp->lb_flags & MDDB_MNSET) && 1548 (md_set[s->s_setno].s_am_i_master)) { 1549 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; 1550 } 1551 return (err); 1552 } 1553 1554 /* 1555 * If called during upgrade, this routine expects a translated 1556 * (aka miniroot) dev. 
1557 */ 1558 static int 1559 getblks( 1560 mddb_set_t *s, /* incore db set structure */ 1561 caddr_t buffer, /* buffer to read data into */ 1562 md_dev64_t device, /* device to read from */ 1563 daddr_t blk, /* physical block number to read */ 1564 int cnt, /* number of blocks to read */ 1565 int flag /* flags for I/O */ 1566 ) 1567 { 1568 buf_t *bp; 1569 mddb_bf_t *bfp; 1570 int err = 0; 1571 1572 bfp = allocbuffer(s, MDDB_SLEEPOK); /* this will never sleep */ 1573 bp = &bfp->bf_buf; 1574 bp->b_bcount = MDDB_BSIZE * cnt; 1575 bp->b_un.b_addr = buffer; 1576 bp->b_blkno = blk; 1577 bp->b_edev = md_dev64_to_dev(device); 1578 err = mddb_rwdata(s, (B_READ | flag), bp); 1579 freebuffer(s, bfp); 1580 if (err) { 1581 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA, 1582 s->s_setno, device); 1583 return (MDDB_F_EREAD); 1584 } 1585 return (0); 1586 } 1587 1588 /* 1589 * readblklst - takes an array of logical block numbers 1590 * and reads those blocks (gather) into the buffer. 1591 * If called during upgrade, this routine expects a non-translated 1592 * (aka target) dev. 
1593 */ 1594 static int 1595 readblklst( 1596 mddb_set_t *s, /* incore set structure */ 1597 caddr_t buffer, /* buffer to be read (record block) */ 1598 mddb_block_t blka[], /* list of logical blocks to be read */ 1599 daddr_t cnt, /* number of logical blocks */ 1600 int li, /* locator index */ 1601 int flag /* flags for I/O */ 1602 ) 1603 { 1604 daddr_t blk; 1605 daddr_t blk1; 1606 int err = 0; 1607 int cons; 1608 md_dev64_t dev; 1609 mddb_mb_ic_t *mbip; 1610 1611 mbip = s->s_mbiarray[li]; 1612 dev = md_expldev(s->s_lbp->lb_locators[li].l_dev); 1613 dev = md_xlate_targ_2_mini(dev); 1614 if (dev == NODEV64) { 1615 return (1); 1616 } 1617 1618 blk = getphysblk(blka[0], mbip); 1619 ASSERT(blk >= 0); 1620 1621 cons = 1; 1622 while (cnt) { 1623 if (cons != cnt) { 1624 blk1 = getphysblk(blka[cons], mbip); 1625 ASSERT(blk1 >= 0); 1626 if ((blk + cons) == blk1) { 1627 cons++; 1628 continue; 1629 } 1630 } 1631 if (err = getblks(s, buffer, dev, blk, cons, flag)) 1632 return (err); 1633 buffer += MDDB_BSIZE * cons; 1634 cnt -= cons; 1635 blka += cons; 1636 if (cnt) { 1637 blk = getphysblk(blka[0], mbip); 1638 ASSERT(blk >= 0); 1639 } 1640 cons = 1; 1641 } 1642 return (0); 1643 } 1644 1645 /* 1646 * readblks - takes a logical block number/block count pair 1647 * and reads those contiguous logical blocks into the buffer. 1648 * If called during upgrade, this routine expects a non-translated 1649 * (aka target) dev. 
1650 */ 1651 static int 1652 readblks( 1653 mddb_set_t *s, /* incore set structure */ 1654 caddr_t buffer, /* buffer to be read into */ 1655 mddb_block_t blk, /* logical block number to be read */ 1656 int cnt, /* number of logical blocks to be read */ 1657 int li /* locator index */ 1658 ) 1659 { 1660 daddr_t physblk; 1661 md_dev64_t device; 1662 int i; 1663 mddb_block_t *blkarray; 1664 int size; 1665 int ret; 1666 1667 if (cnt > 1) { 1668 size = sizeof (mddb_block_t) * cnt; 1669 blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP); 1670 for (i = 0; i < cnt; i++) 1671 blkarray[i] = blk + i; 1672 ret = readblklst(s, buffer, blkarray, cnt, li, 0); 1673 kmem_free(blkarray, size); 1674 return (ret); 1675 } 1676 physblk = getphysblk(blk, s->s_mbiarray[li]); 1677 ASSERT(physblk > 0); 1678 device = md_expldev(s->s_lbp->lb_locators[li].l_dev); 1679 device = md_xlate_targ_2_mini(device); 1680 if (device == NODEV64) { 1681 return (1); 1682 } 1683 return (getblks(s, buffer, device, physblk, 1, 0)); 1684 } 1685 1686 static void 1687 single_thread_start( 1688 mddb_set_t *s 1689 ) 1690 { 1691 while (s->s_singlelockgotten) { 1692 s->s_singlelockwanted++; 1693 cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno)); 1694 } 1695 s->s_singlelockgotten++; 1696 } 1697 1698 static void 1699 single_thread_end( 1700 mddb_set_t *s 1701 ) 1702 { 1703 ASSERT(s->s_singlelockgotten); 1704 s->s_singlelockgotten = 0; 1705 if (s->s_singlelockwanted) { 1706 s->s_singlelockwanted = 0; 1707 cv_broadcast(&s->s_single_thread_cv); 1708 } 1709 } 1710 1711 static size_t 1712 sizeofde( 1713 mddb_de_ic_t *dep 1714 ) 1715 { 1716 size_t size; 1717 1718 size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) + 1719 sizeof (mddb_block_t) * dep->de_blkcount; 1720 return (size); 1721 } 1722 1723 static size_t 1724 sizeofde32( 1725 mddb_de32_t *dep 1726 ) 1727 { 1728 size_t size; 1729 1730 size = sizeof (*dep) - sizeof (dep->de32_blks) + 1731 sizeof (mddb_block_t) * dep->de32_blkcount; 1732 return (size); 1733 } 
1734 1735 static mddb_de32_t * 1736 nextentry( 1737 mddb_de32_t *dep 1738 ) 1739 { 1740 mddb_de32_t *ret; 1741 1742 ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep))); 1743 return (ret); 1744 } 1745 1746 static void 1747 create_db32rec( 1748 mddb_db32_t *db32p, 1749 mddb_db_t *dbp 1750 ) 1751 { 1752 mddb_de_ic_t *dep; 1753 mddb_de32_t *de32p; 1754 1755 #if defined(_ILP32) && !defined(lint) 1756 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 1757 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 1758 #endif 1759 1760 dbtodb32(dbp, db32p); 1761 if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0)) 1762 db32p->db32_firstentry = 0x4; 1763 de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry) 1764 + sizeof (db32p->db32_firstentry))); 1765 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 1766 detode32(dep, de32p); 1767 if ((dep->de_next != NULL) && (de32p->de32_next == 0)) 1768 de32p->de32_next = 0x4; 1769 de32p = nextentry(de32p); 1770 } 1771 ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE); 1772 } 1773 1774 /* 1775 * If called during upgrade, this routine expects a translated 1776 * (aka miniroot) dev. 1777 * If master blocks are found, set the mn_set parameter to 1 if the 1778 * the master block revision number is MDDB_REV_MNMB; otherwise, 1779 * set it to 0. 1780 * If master blocks are not found, do not change the mnset parameter. 
*/
static mddb_mb_ic_t *
getmasters(
	mddb_set_t	*s,
	md_dev64_t	dev,
	daddr_t		blkno,
	uint_t		*flag,
	int		*mn_set
)
{
	mddb_mb_ic_t	*mbi = NULL;
	mddb_mb_t	*mb;
	int		error = 0;
	ddi_devid_t	devid;


	/* open failure: report MDDB_F_EMASTER through flag (if given) */
	if (mddb_devopen(dev)) {
		if (flag)
			*flag |= MDDB_F_EMASTER;
		return ((mddb_mb_ic_t *)NULL);
	}


	mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP);
	mb = &(mbi->mbi_mddb_mb);
	if (error = getblks(s, (caddr_t)mb, dev, blkno,
	    btodb(MDDB_BSIZE), 0)) {
		error |= MDDB_F_EMASTER;
	}
	/*
	 * Each validation below overwrites error rather than OR-ing
	 * into it, so a failed check replaces any read error recorded
	 * above with MDDB_F_EFMT | MDDB_F_EMASTER.
	 */
	if (mb->mb_magic != MDDB_MAGIC_MB) {
		error = MDDB_F_EFMT | MDDB_F_EMASTER;
	}
	/* Check for MDDB_REV_MNMB and lower */
	if (revchk(MDDB_REV_MNMB, mb->mb_revision)) {
		error = MDDB_F_EFMT | MDDB_F_EMASTER;
	}
	if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) {
		error = MDDB_F_EFMT | MDDB_F_EMASTER;
	}

	/* the set number is only checked when the set is not being imported */
	if (!(md_get_setstatus(s->s_setno) &
	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
	    (mb->mb_setno != s->s_setno)) {
		error = MDDB_F_EFMT | MDDB_F_EMASTER;
	}
	if (mb->mb_blkno != blkno) {
		error = MDDB_F_EFMT | MDDB_F_EMASTER;
	}
	mb->mb_next = NULL;
	mbi->mbi_next = NULL;

	if (error)
		goto out;

	/*
	 * Check the md_devid_destroy and md_keep_repl_state flags
	 * to see if we need to regen the devid or not.
	 *
	 * Don't care about devid in local set since it is not used
	 * and this should not be part of set importing
	 */
	if ((s->s_setno != MD_LOCAL_SET) &&
	    !(md_get_setstatus(s->s_setno) &
	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) {
		/*
		 * Now check the destroy flag.  We also need to handle
		 * the case where the destroy flag is reset after the
		 * destroy
		 */
		if (md_devid_destroy || (mb->mb_devid_len == 0)) {

			if (md_devid_destroy) {
				bzero(mb->mb_devid, mb->mb_devid_len);
				mb->mb_devid_len = 0;
			}

			/*
			 * Try to regenerate it if the 'keep' flag is not set
			 */
			if (!md_keep_repl_state) {
				if (ddi_lyr_get_devid(md_dev64_to_dev(dev),
				    &devid) == DDI_SUCCESS) {
					mb->mb_devid_len =
					    ddi_devid_sizeof(devid);
					bcopy(devid, mb->mb_devid,
					    mb->mb_devid_len);
					ddi_devid_free(devid);
				} else {
					error = MDDB_F_EFMT | MDDB_F_EMASTER;
				}
			}

			/* the devid area changed, so restamp the checksum */
			crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);

			/*
			 * Push
			 *
			 * The master block is written back even when the
			 * devid could not be regenerated above.
			 */
			if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) {
				error = MDDB_F_EFMT | MDDB_F_EMASTER;
			}
		}
	}

	if (! error) {
		/* Set mn_set parameter to 1 if a MN set */
		if (mb->mb_revision == MDDB_REV_MNMB)
			*mn_set = 1;
		else
			*mn_set = 0;
		return (mbi);
	}

out:
	/* Error Out */
	if (flag)
		*flag |= error;

	kmem_free((caddr_t)mbi, MDDB_IC_BSIZE);
	mddb_devclose(dev);
	return ((mddb_mb_ic_t *)NULL);
}

/*
 * getrecord - read the record block for directory entry dep from the
 * replica with locator index li.
 *
 * A zeroed buffer of de_recsize bytes is allocated and stored in
 * dep->de_rb, then filled with readblklst().  The record is rejected
 * when the read fails, when its magic number is not MDDB_MAGIC_RB,
 * when its revision fails revchk() against all four of MDDB_REV_RB,
 * MDDB_REV_RB64, MDDB_REV_RBFN and MDDB_REV_RB64FN, or when its crc
 * does not verify.
 *
 * Returns 0 on success, otherwise MDDB_F_EDATA combined with either
 * the read error or MDDB_F_EFMT.  dep->de_rb is not freed here on
 * failure.
 */
static int
getrecord(
	mddb_set_t	*s,
	mddb_de_ic_t	*dep,
	int		li
)
{
	int		err = 0;
	mddb_rb32_t	*rbp;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif


	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP);
	rbp = dep->de_rb;

	err = readblklst(s, (caddr_t)rbp, dep->de_blks,
	    dep->de_blkcount, li, 0);
	if (err) {
		return (MDDB_F_EDATA | err);
	}
	if (rbp->rb_magic != MDDB_MAGIC_RB) {
		return (MDDB_F_EFMT | MDDB_F_EDATA);
	}
	if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) &&
	    (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0) &&
	    (revchk(MDDB_REV_RBFN, rbp->rb_revision) != 0) &&
	    (revchk(MDDB_REV_RB64FN, rbp->rb_revision) != 0)) {
		return (MDDB_F_EFMT | MDDB_F_EDATA);
	}
	/* Check crc for this record */
	if (rec_crcchk(s, dep, rbp)) {
		return (MDDB_F_EFMT | MDDB_F_EDATA);
	}
	return (0);
}

/*
 * Code to read in the locator name information
 *
 * Reads the lb_lnblkcnt locator name blocks that start at
 * lb_lnfirstblk from the replica with locator index li.  The magic
 * number, the revision (MDDB_REV_MNLN for a MN diskset, MDDB_REV_LN
 * otherwise) and the checksum are verified.
 *
 * On success s->s_lnp points at the newly allocated blocks and 0 is
 * returned.  On failure the blocks are freed, s->s_lnp is left NULL
 * and an MDDB_F_EDATA based error is returned.
 */
static int
readlocnames(
	mddb_set_t	*s,
	int		li
)
{
	mddb_ln_t	*lnp;
	int		err = 0;
	mddb_block_t	ln_blkcnt, ln_blkno;

	/*
	 * read in the locator name blocks
	 */
	s->s_lnp = NULL;

	ln_blkno = s->s_lbp->lb_lnfirstblk;
	ln_blkcnt = s->s_lbp->lb_lnblkcnt;
	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP);

	err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li);
	if (err) {
		err |= MDDB_F_EDATA;
		goto out;
	}
	if (lnp->ln_magic != MDDB_MAGIC_LN) {
		err = MDDB_F_EDATA | MDDB_F_EFMT;
		goto out;
	}
	if (s->s_lbp->lb_flags & MDDB_MNSET) {
		if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) {
			err = MDDB_F_EDATA | MDDB_F_EFMT;
			goto out;
		}
	} else {
		if (revchk(MDDB_REV_LN, lnp->ln_revision)) {
			err = MDDB_F_EDATA | MDDB_F_EFMT;
			goto out;
		}
	}
	if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) {
		err = MDDB_F_EDATA | MDDB_F_EFMT;
		goto out;
	}
out:
	/*
	 * if error occurred in locator name blocks free them
	 * and return
	 */
	if (err) {
		kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
		return (err);
	}
	s->s_lnp = lnp;
	return (0);
}

/*
 * code to read in a copy of the database.
2003 */ 2004 2005 static int 2006 readcopy( 2007 mddb_set_t *s, 2008 int li 2009 ) 2010 { 2011 uint_t blk; 2012 mddb_db_t *dbp, *dbp1, *dbhp; 2013 mddb_db32_t *db32p; 2014 mddb_de_ic_t *dep, *dep2; 2015 mddb_de32_t *de32p, *de32p2; 2016 int err = 0; 2017 uint_t checksum; 2018 2019 2020 #if defined(_ILP32) && !defined(lint) 2021 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 2022 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2023 #endif 2024 2025 dbp = NULL; 2026 dbhp = NULL; 2027 /* 2028 * read in all the directory blocks 2029 */ 2030 blk = s->s_lbp->lb_dbfirstblk; 2031 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 2032 2033 for (; blk != 0; blk = dbp->db_nextblk) { 2034 dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP); 2035 if (! dbhp) { 2036 dbhp = dbp1; 2037 } else { 2038 dbp->db_next = dbp1; 2039 } 2040 dbp = dbp1; 2041 2042 err = readblks(s, (caddr_t)db32p, blk, 1, li); 2043 if (err) { 2044 err |= MDDB_F_EDATA; 2045 break; 2046 } 2047 db32todb(db32p, dbp); 2048 if (db32p->db32_magic != MDDB_MAGIC_DB) { 2049 err = MDDB_F_EDATA | MDDB_F_EFMT; 2050 break; 2051 } 2052 if (revchk(MDDB_REV_DB, db32p->db32_revision)) { 2053 err = MDDB_F_EDATA | MDDB_F_EFMT; 2054 break; 2055 } 2056 if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) { 2057 err = MDDB_F_EDATA | MDDB_F_EFMT; 2058 break; 2059 } 2060 /* 2061 * first go through and fix up all de_next pointers 2062 */ 2063 if (dbp->db_firstentry) { 2064 2065 de32p = (mddb_de32_t *) 2066 ((void *) ((caddr_t)(&db32p->db32_firstentry) 2067 + sizeof (db32p->db32_firstentry))); 2068 2069 dep = (mddb_de_ic_t *) 2070 kmem_zalloc(sizeof (mddb_de_ic_t) - 2071 sizeof (mddb_block_t) + 2072 sizeof (mddb_block_t) * de32p->de32_blkcount, 2073 KM_SLEEP); 2074 de32tode(de32p, dep); 2075 2076 dbp->db_firstentry = dep; 2077 while (de32p && de32p->de32_next) { 2078 2079 de32p2 = nextentry(de32p); 2080 2081 dep2 = (mddb_de_ic_t *)kmem_zalloc( 2082 sizeof (mddb_de_ic_t) - 2083 sizeof (mddb_block_t) + 
2084 sizeof (mddb_block_t) * 2085 de32p2->de32_blkcount, KM_SLEEP); 2086 2087 de32tode(de32p2, dep2); 2088 2089 dep->de_next = dep2; 2090 dep = dep2; 2091 de32p = de32p2; 2092 } 2093 } 2094 /* 2095 * go through and make all of the pointer to record blocks 2096 * are null; 2097 */ 2098 for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) 2099 dep->de_rb = NULL; 2100 } 2101 kmem_free((caddr_t)db32p, MDDB_BSIZE); 2102 dbp->db_next = NULL; 2103 /* 2104 * if error occurred in directory blocks free them 2105 * and return 2106 */ 2107 if (err) { 2108 dbp = dbhp; 2109 while (dbp) { 2110 dep = dbp->db_firstentry; 2111 while (dep) { 2112 /* No mddb_rb32_t structures yet */ 2113 dep2 = dep->de_next; 2114 kmem_free((caddr_t)dep, sizeofde(dep)); 2115 dep = dep2; 2116 } 2117 dbp1 = dbp->db_next; 2118 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 2119 dbp = dbp1; 2120 } 2121 s->s_dbp = NULL; 2122 return (err); 2123 2124 } 2125 /* 2126 */ 2127 err = 0; 2128 checksum = MDDB_GLOBAL_XOR; 2129 for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) { 2130 checksum ^= dbp->db_recsum; 2131 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 2132 if (dep->de_flags & MDDB_F_OPT) 2133 continue; 2134 err = getrecord(s, dep, li); 2135 if (err) 2136 break; 2137 /* Don't include CHANGELOG in big XOR */ 2138 if (dep->de_flags & MDDB_F_CHANGELOG) 2139 continue; 2140 checksum ^= dep->de_rb->rb_checksum; 2141 checksum ^= dep->de_rb->rb_checksum_fiddle; 2142 } 2143 if (err) 2144 break; 2145 } 2146 if (checksum) { 2147 if (! 
err) 2148 err = MDDB_F_EDATA | MDDB_F_EFMT; 2149 } 2150 if (err) { 2151 dbp = dbhp; 2152 dbhp = NULL; 2153 while (dbp) { 2154 dep = dbp->db_firstentry; 2155 while (dep) { 2156 if (dep->de_rb) 2157 kmem_free((caddr_t)dep->de_rb, 2158 dep->de_recsize); 2159 dep2 = dep->de_next; 2160 kmem_free((caddr_t)dep, sizeofde(dep)); 2161 dep = dep2; 2162 } 2163 dbp1 = dbp->db_next; 2164 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 2165 dbp = dbp1; 2166 } 2167 } 2168 s->s_dbp = dbhp; 2169 return (err); 2170 } 2171 2172 static int 2173 getoptcnt( 2174 mddb_set_t *s, 2175 int li) 2176 { 2177 int result; 2178 mddb_de_ic_t *dep; 2179 mddb_db_t *dbp; 2180 2181 #if defined(_ILP32) && !defined(lint) 2182 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 2183 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2184 #endif 2185 2186 result = 0; 2187 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 2188 dep = dbp->db_firstentry; 2189 for (; dep != NULL; dep = dep->de_next) { 2190 if (! (dep->de_flags & MDDB_F_OPT)) 2191 continue; 2192 if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) && 2193 (li == dep->de_optinfo[0].o_li)) || 2194 ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) && 2195 (li == dep->de_optinfo[1].o_li))) 2196 result++; 2197 } 2198 } 2199 return (result); 2200 } 2201 2202 static void 2203 getoptdev( 2204 mddb_set_t *s, 2205 mddb_de_ic_t *rdep, 2206 int opti 2207 ) 2208 { 2209 mddb_lb_t *lbp; 2210 mddb_locator_t *lp; 2211 mddb_optinfo_t *otherop; 2212 mddb_optinfo_t *resultop; 2213 int li; 2214 dev_t otherdev; 2215 int blkonly = 0; 2216 int mincnt; 2217 int thiscnt; 2218 2219 lbp = s->s_lbp; 2220 2221 resultop = &rdep->de_optinfo[opti]; 2222 otherop = &rdep->de_optinfo[1-opti]; 2223 2224 resultop->o_flags = 0; 2225 2226 /* 2227 * scan through and see if data bases have to vary by only device 2228 */ 2229 2230 if (otherop->o_flags & MDDB_F_ACTIVE) { 2231 blkonly = 1; 2232 otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev); 2233 for (li = 0; li < lbp->lb_loccnt; 
li++) { 2234 lp = &lbp->lb_locators[li]; 2235 if (! (lp->l_flags & MDDB_F_ACTIVE)) 2236 continue; 2237 if (expldev(lp->l_dev) != otherdev) { 2238 blkonly = 0; 2239 break; 2240 } 2241 } 2242 } 2243 2244 mincnt = 999999; 2245 for (li = 0; li < lbp->lb_loccnt; li++) { 2246 dev_info_t *devi; 2247 int removable = 0; 2248 2249 lp = &lbp->lb_locators[li]; 2250 if (! (lp->l_flags & MDDB_F_ACTIVE)) 2251 continue; 2252 if (otherop->o_flags & MDDB_F_ACTIVE) { 2253 if (blkonly) { 2254 if (otherop->o_li == li) 2255 continue; 2256 } else { 2257 if (otherdev == expldev(lp->l_dev)) 2258 continue; 2259 } 2260 } 2261 2262 /* 2263 * Check if this is a removable device. If it is we 2264 * assume it is something like a USB flash disk, a zip disk 2265 * or even a floppy that is being used to help maintain 2266 * mddb quorum. We don't want to put any optimized resync 2267 * records on these kinds of disks since they are usually 2268 * slower or don't have the same read/write lifetimes as 2269 * a regular fixed disk. 
2270 */ 2271 if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) { 2272 int error; 2273 struct cb_ops *cb; 2274 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; 2275 int propvalue = 0; 2276 int proplength = sizeof (int); 2277 2278 if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops) 2279 != NULL) { 2280 error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, 2281 prop_op, DDI_PROP_NOTPROM | 2282 DDI_PROP_DONTPASS, "removable-media", 2283 (caddr_t)&propvalue, &proplength); 2284 2285 if (error == DDI_PROP_SUCCESS) 2286 removable = 1; 2287 } 2288 2289 ddi_release_devi(devi); 2290 } 2291 2292 if (removable) 2293 continue; 2294 2295 thiscnt = getoptcnt(s, li); 2296 if (thiscnt < mincnt) { 2297 resultop->o_li = li; 2298 mincnt = thiscnt; 2299 resultop->o_flags = MDDB_F_ACTIVE; 2300 } 2301 } 2302 } 2303 2304 static void 2305 allocuserdata( 2306 mddb_de_ic_t *dep 2307 ) 2308 { 2309 mddb_rb32_t *rbp; 2310 2311 #if defined(_ILP32) && !defined(lint) 2312 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2313 #endif 2314 2315 rbp = dep->de_rb; 2316 rbp->rb_private = 0; 2317 dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP); 2318 rbp->rb_userdata = 0x4; /* Make sure this is non-zero */ 2319 bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize); 2320 } 2321 2322 2323 static void 2324 getuserdata( 2325 set_t setno, 2326 mddb_de_ic_t *dep 2327 ) 2328 { 2329 mddb_rb32_t *rbp; 2330 2331 2332 mddb_type_t type = dep->de_type1; 2333 caddr_t data, udata; 2334 2335 #if defined(_ILP32) && !defined(lint) 2336 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2337 #endif 2338 rbp = dep->de_rb; 2339 data = (caddr_t)rbp->rb_data; 2340 udata = (caddr_t)dep->de_rb_userdata; 2341 2342 /* 2343 * If it's a driver record, and an old style record, and not a DRL 2344 * record, we must convert it because it was incore as a 64 bit 2345 * structure but its on disk layout has only 32 bit for block sizes 2346 */ 2347 if (!(md_get_setstatus(setno) & 2348 (MD_SET_IMPORT | 
MD_SET_REPLICATED_IMPORT)) && 2349 (type >= MDDB_FIRST_MODID) && 2350 ((rbp->rb_revision == MDDB_REV_RB) || 2351 (rbp->rb_revision == MDDB_REV_RBFN))) { 2352 2353 switch (dep->de_flags) { 2354 2355 case MDDB_F_STRIPE: 2356 stripe_convert(data, udata, BIG_2_SMALL); 2357 break; 2358 2359 case MDDB_F_MIRROR: 2360 mirror_convert(data, udata, BIG_2_SMALL); 2361 break; 2362 2363 case MDDB_F_RAID: 2364 raid_convert(data, udata, BIG_2_SMALL); 2365 break; 2366 2367 case MDDB_F_SOFTPART: 2368 softpart_convert(data, udata, BIG_2_SMALL); 2369 break; 2370 2371 case MDDB_F_TRANS_MASTER: 2372 trans_master_convert(data, udata, BIG_2_SMALL); 2373 break; 2374 2375 case MDDB_F_TRANS_LOG: 2376 trans_log_convert(data, udata, BIG_2_SMALL); 2377 break; 2378 2379 case MDDB_F_HOTSPARE: 2380 hs_convert(data, udata, BIG_2_SMALL); 2381 break; 2382 2383 case MDDB_F_OPT: 2384 default: 2385 bcopy(udata, data, dep->de_reqsize); 2386 } 2387 } else { 2388 bcopy(udata, data, dep->de_reqsize); 2389 } 2390 } 2391 2392 static void 2393 getoptrecord( 2394 mddb_set_t *s, 2395 mddb_de_ic_t *dep 2396 ) 2397 { 2398 mddb_lb_t *lbp; 2399 mddb_locator_t *lp; 2400 mddb_rb32_t *rbp, *crbp; 2401 int li; 2402 int i; 2403 int err = 0; 2404 size_t recsize; 2405 2406 #if defined(_ILP32) && !defined(lint) 2407 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2408 #endif 2409 2410 lbp = s->s_lbp; 2411 2412 recsize = dep->de_recsize; 2413 dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 2414 rbp = dep->de_rb; 2415 crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 2416 2417 dep->de_optinfo[0].o_flags |= MDDB_F_EDATA; 2418 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 2419 2420 for (i = 0; i < 2; i++) { 2421 if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) 2422 continue; 2423 li = dep->de_optinfo[i].o_li; 2424 lp = &lbp->lb_locators[li]; 2425 2426 if (! 
(lp->l_flags & MDDB_F_ACTIVE) || 2427 (lp->l_flags & MDDB_F_EMASTER)) 2428 continue; 2429 2430 err = readblklst(s, (caddr_t)rbp, dep->de_blks, 2431 dep->de_blkcount, li, 0); 2432 2433 if (err) 2434 continue; 2435 2436 if (rbp->rb_magic != MDDB_MAGIC_RB) 2437 continue; 2438 2439 if (revchk(MDDB_REV_RB, rbp->rb_revision)) 2440 continue; 2441 2442 /* Check the crc for this record */ 2443 if (rec_crcchk(s, dep, rbp)) { 2444 continue; 2445 } 2446 2447 dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE; 2448 2449 if (rbp == crbp) { 2450 if (rbp->rb_checksum != crbp->rb_checksum) 2451 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 2452 break; 2453 } 2454 rbp = crbp; 2455 } 2456 2457 if (rbp == crbp) { 2458 rbp->rb_private = 0; 2459 kmem_free((caddr_t)crbp, recsize); 2460 return; 2461 } 2462 bzero((caddr_t)rbp, recsize); 2463 rbp->rb_magic = MDDB_MAGIC_RB; 2464 rbp->rb_revision = MDDB_REV_RB; 2465 uniqtime32(&rbp->rb_timestamp); 2466 /* Generate the crc for this record */ 2467 rec_crcgen(s, dep, rbp); 2468 kmem_free((caddr_t)crbp, recsize); 2469 } 2470 2471 /* 2472 * writeoptrecord writes out an optimized record. 2473 */ 2474 static int 2475 writeoptrecord( 2476 mddb_set_t *s, 2477 mddb_de_ic_t *dep 2478 ) 2479 { 2480 mddb_rb32_t *rbp; 2481 int li; 2482 int err = 0, wrt_err = 0; 2483 mddb_bf_t *bufhead, *bfp; 2484 mddb_lb_t *lbp = s->s_lbp; 2485 mddb_locator_t *lp; 2486 int i; 2487 2488 #if defined(_ILP32) && !defined(lint) 2489 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2490 #endif 2491 2492 bufhead = NULL; 2493 err = 0; 2494 2495 while (s->s_opthavequeuinglck) { 2496 s->s_optwantqueuinglck++; 2497 cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno)); 2498 } 2499 s->s_opthavequeuinglck++; 2500 rbp = dep->de_rb; 2501 for (i = 0; i < 2; i++) { 2502 /* 2503 * only possible error is xlate. This can 2504 * occur if a replica was off line and came 2505 * back. During the mean time the database grew 2506 * large than the now on line replica can store 2507 */ 2508 if (! 
(dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) 2509 continue; 2510 li = dep->de_optinfo[i].o_li; 2511 /* 2512 * In a MN diskset, any node can write optimized record(s). 2513 */ 2514 wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, 2515 dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE); 2516 /* 2517 * For MN diskset, set error in optinfo structure so 2518 * that mddb_commitrec knows which replica failed. 2519 */ 2520 if ((MD_MNSET_SETNO(s->s_setno)) && 2521 (wrt_err & MDDB_F_EWRITE)) { 2522 dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE; 2523 } 2524 err |= wrt_err; 2525 } 2526 s->s_opthavequeuinglck = 0; 2527 if (s->s_optwantqueuinglck) { 2528 s->s_optwantqueuinglck = 0; 2529 cv_broadcast(&s->s_optqueuing_cv); 2530 } 2531 for (bfp = bufhead; bfp; bfp = bufhead) { 2532 mutex_exit(SETMUTEX(s->s_setno)); 2533 (void) biowait(&bfp->bf_buf); 2534 mutex_enter(SETMUTEX(s->s_setno)); 2535 if (bfp->bf_buf.b_flags & B_ERROR) { 2536 /* 2537 * If an MN diskset, don't set replica 2538 * in error since this hasn't been set in master. 2539 * Setting replica in error before master could 2540 * leave the nodes with different views of the 2541 * world since a class 1 configuration change 2542 * could occur in mddb_commitrec as soon as 2543 * all locks are dropped. Must keep this 2544 * node the same as master and can't afford a 2545 * failure from the class 1 config change 2546 * if master succeeded. 2547 */ 2548 if (!(MD_MNSET_SETNO(s->s_setno))) { 2549 bfp->bf_locator->l_flags |= MDDB_F_EWRITE; 2550 } else { 2551 /* 2552 * Find which de_optinfo (which replica) 2553 * had a failure and set the failure in 2554 * the o_flags field. 
2555 */ 2556 lp = &lbp->lb_locators[dep->de_optinfo[0].o_li]; 2557 if (lp == bfp->bf_locator) { 2558 dep->de_optinfo[0].o_flags |= 2559 MDDB_F_EWRITE; 2560 } else { 2561 dep->de_optinfo[1].o_flags |= 2562 MDDB_F_EWRITE; 2563 } 2564 } 2565 err |= MDDB_F_EWRITE; 2566 } 2567 bufhead = bfp->bf_next; 2568 freebuffer(s, bfp); 2569 } 2570 return (err); 2571 } 2572 2573 /* 2574 * Fix up the optimized resync record. Used in the traditional and local 2575 * disksets to move an optimized record from a failed or deleted mddb 2576 * to an active one. 2577 * 2578 * In a MN diskset, the fixing of the optimized record is split between 2579 * the master and slave nodes. If the master node moves the optimized 2580 * resync record, then the master node will send a MDDB_PARSE_OPTRECS 2581 * message to the slave nodes causing the slave nodes to reget the 2582 * directory entry containing the location of the optimized resync record. 2583 * After the record is reread from disk, then writeoptrecord is called 2584 * if the location of the optimized resync record or flags have changed. 2585 * When writeoptrecord is called, the node that is the owner of this record 2586 * will write the optimized record to the location specified in the directory 2587 * entry. Since the master node uses the highest class message (PARSE) 2588 * the record owner node is guaranteed to already have an updated 2589 * directory entry incore. 2590 * 2591 * The other difference between the traditional/local set and MN diskset 2592 * is that the directory entry can be written to disk before the optimized 2593 * record in a MN diskset if the record is owned by a slave node. So, 2594 * the users of an optimized record must handle the failure case when no 2595 * data is available from an optimized record since the master node could 2596 * have failed during the relocation of the optimized record to another mddb. 
2597 */ 2598 static int 2599 fixoptrecord( 2600 mddb_set_t *s, 2601 mddb_de_ic_t *dep, 2602 mddb_db_t *dbp 2603 ) 2604 { 2605 int changed; 2606 int writedata; 2607 int err = 0; 2608 int i; 2609 mddb_lb_t *lbp; 2610 mddb_optinfo_t *op; 2611 mddb_db32_t *db32p; 2612 int rec_owner; /* Is node owner of record? */ 2613 2614 #if defined(_ILP32) && !defined(lint) 2615 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2616 #endif 2617 2618 lbp = s->s_lbp; 2619 changed = 0; 2620 writedata = 0; 2621 for (i = 0; i < 2; i++) { 2622 op = &dep->de_optinfo[i]; 2623 2624 if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE)) 2625 op->o_flags = 0; 2626 2627 /* 2628 * If optimized record has seen a replica failure, 2629 * assign new replica to record and re-write data 2630 * to new record. 2631 */ 2632 if (! (op->o_flags & MDDB_F_ACTIVE)) { 2633 getoptdev(s, dep, i); 2634 writedata++; 2635 changed++; 2636 /* Set flag for slaves to reread dep and write rec */ 2637 if (lbp->lb_flags & MDDB_MNSET) { 2638 s->s_mn_parseflags |= MDDB_PARSE_OPTRECS; 2639 } 2640 } 2641 2642 /* 2643 * If just an error in the data was seen, set 2644 * the optimized record's replica flag to active (ok) 2645 * and try again. 2646 */ 2647 if (op->o_flags & MDDB_F_EDATA) { 2648 dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE; 2649 writedata++; 2650 } 2651 } 2652 2653 rec_owner = 0; 2654 if (lbp->lb_flags & MDDB_MNSET) { 2655 /* 2656 * If a MN diskset then check the owner of optimized record. 2657 * If the master node owns the record or if there is 2658 * no owner of the record, then the master can write the 2659 * optimized record to disk. 2660 * Master node can write the optimized record now, but 2661 * slave nodes write their records during handling of 2662 * the MDDB_PARSE_OPTRECS message. 
2663 */ 2664 if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) || 2665 (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) { 2666 rec_owner = 1; 2667 } 2668 } else { 2669 /* 2670 * In traditional diskset and local set, this node 2671 * is always the record owner and always the master. 2672 */ 2673 rec_owner = 1; 2674 } 2675 2676 /* 2677 * If this node is the record owner, write out record. 2678 */ 2679 if ((writedata) && (rec_owner)) { 2680 if (err = writeoptrecord(s, dep)) { 2681 return (err); 2682 } 2683 } 2684 if (! changed) 2685 return (0); 2686 uniqtime32(&dbp->db_timestamp); 2687 dbp->db_revision = MDDB_REV_DB; 2688 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 2689 create_db32rec(db32p, dbp); 2690 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 2691 err = writeall(s, (caddr_t)db32p, db32p->db32_blknum, 2692 1, MDDB_WR_ONLY_MASTER); 2693 kmem_free((caddr_t)db32p, MDDB_BSIZE); 2694 return (err); 2695 } 2696 2697 static int 2698 fixoptrecords( 2699 mddb_set_t *s 2700 ) 2701 { 2702 mddb_de_ic_t *dep; 2703 mddb_db_t *dbp; 2704 int err = 0; 2705 set_t setno; 2706 2707 /* 2708 * In a MN diskset, the master node is the only node that runs 2709 * fixoptrecords. If the master node changes anything, then the 2710 * master node sends PARSE message to the slave nodes. The slave 2711 * nodes will then re-read in the locator block or re-read in the 2712 * directory blocks and re-write the optimized resync records. 2713 */ 2714 setno = s->s_setno; 2715 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) && 2716 (md_set[setno].s_am_i_master == 0)) { 2717 return (0); 2718 } 2719 2720 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 2721 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 2722 if (! (dep->de_flags & MDDB_F_OPT)) 2723 continue; 2724 err = fixoptrecord(s, dep, dbp); 2725 if (err != 0) 2726 return (err); 2727 } 2728 } 2729 return (0); 2730 } 2731 2732 /* 2733 * Checks incore version of mddb data to mddb data ondisk. 
2734 * 2735 * Returns: 2736 * - 0 if the data was successfully read and is good. 2737 * - MDDB_F_EREAD if a read error occurred. 2738 * - 1 if the data read is bad (checksum failed, etc) 2739 */ 2740 static int 2741 checkcopy 2742 ( 2743 mddb_set_t *s, 2744 int li 2745 ) 2746 { 2747 mddb_db_t *dbp; 2748 mddb_db32_t *cdb32p; 2749 mddb_de_ic_t *dep; 2750 mddb_de32_t *cde32p; 2751 mddb_rb32_t *rbp, *crbp; 2752 size_t size; 2753 int i; 2754 int retval = 1; 2755 2756 #if defined(_ILP32) && !defined(lint) 2757 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 2758 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2759 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2760 #endif 2761 2762 if (s->s_databuffer_size == 0) { 2763 size_t maxrecsize = MDDB_BSIZE; 2764 2765 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) 2766 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) 2767 if (! (dep->de_flags & MDDB_F_OPT) && 2768 dep->de_recsize > maxrecsize) 2769 maxrecsize = dep->de_recsize; 2770 2771 s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP); 2772 s->s_databuffer_size = maxrecsize; 2773 } 2774 2775 cdb32p = (mddb_db32_t *)s->s_databuffer; 2776 2777 /* 2778 * first go through and make sure all directory stuff 2779 * is the same 2780 */ 2781 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 2782 if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) { 2783 retval = MDDB_F_EREAD; 2784 goto err; 2785 } 2786 if (cdb32p->db32_magic != MDDB_MAGIC_DB) 2787 goto err; 2788 if (revchk(MDDB_REV_DB, cdb32p->db32_revision)) 2789 goto err; 2790 if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL)) 2791 goto err; 2792 if (cdb32p->db32_nextblk != dbp->db_nextblk) 2793 goto err; 2794 if (cdb32p->db32_recsum != dbp->db_recsum) 2795 goto err; 2796 if (cdb32p->db32_firstentry) { 2797 cde32p = (mddb_de32_t *) 2798 ((void *)((caddr_t)(&cdb32p->db32_firstentry) 2799 + sizeof (cdb32p->db32_firstentry))); 2800 } else 2801 cde32p = NULL; 2802 2803 dep = 
dbp->db_firstentry; 2804 /* 2805 * check if all directory entries are identical 2806 */ 2807 while (dep && cde32p) { 2808 if (dep->de_recid != cde32p->de32_recid) 2809 goto err; 2810 if (dep->de_type1 != cde32p->de32_type1) 2811 goto err; 2812 if (dep->de_type2 != cde32p->de32_type2) 2813 goto err; 2814 if (dep->de_reqsize != cde32p->de32_reqsize) 2815 goto err; 2816 if (dep->de_flags != cde32p->de32_flags) 2817 goto err; 2818 2819 for (i = 0; i < 2; i++) { 2820 if (dep->de_optinfo[i].o_li != 2821 cde32p->de32_optinfo[i].o_li) 2822 break; 2823 } 2824 if (i != 2) 2825 goto err; 2826 size = sizeof (mddb_block_t) * dep->de_blkcount; 2827 if (bcmp((caddr_t)dep->de_blks, 2828 (caddr_t)cde32p->de32_blks, size)) 2829 goto err; 2830 dep = dep->de_next; 2831 if (cde32p->de32_next) 2832 cde32p = nextentry(cde32p); 2833 else 2834 cde32p = NULL; 2835 } 2836 if (dep || cde32p) 2837 goto err; 2838 } 2839 /* 2840 * If here, all directories are functionally identical 2841 * check to make sure all records are identical 2842 * the reason the records are not just bcmped is that the 2843 * lock flag does not want to be compared. 2844 */ 2845 crbp = (mddb_rb32_t *)cdb32p; 2846 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 2847 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 2848 if ((dep->de_flags & MDDB_F_OPT) || 2849 (dep->de_flags & MDDB_F_CHANGELOG)) 2850 continue; 2851 rbp = (mddb_rb32_t *)dep->de_rb; 2852 if (readblklst(s, (caddr_t)crbp, dep->de_blks, 2853 dep->de_blkcount, li, 0)) { 2854 retval = MDDB_F_EREAD; 2855 goto err; 2856 } 2857 /* Check the crc for this record */ 2858 if (rec_crcchk(s, dep, crbp)) 2859 goto err; 2860 2861 if (rbp->rb_checksum != crbp->rb_checksum || 2862 rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle) 2863 goto err; 2864 } 2865 } 2866 return (0); 2867 err: 2868 return (retval); 2869 } 2870 2871 /* 2872 * Determine if the location information for two mddbs is the same. 2873 * The device slice and block offset should match. 
If both have devids then 2874 * use that for the comparison, otherwise we compare the dev_ts. 2875 * Comparing with the devid allows us to handle the case where a mddb was 2876 * relocated to a dead mddbs dev_t. The live mddb will have the dev_t of 2877 * the dead mddb but the devid comparison will catch this and not match. 2878 * 2879 * Return 1 if the location of the two mddbs match, 0 if not. 2880 */ 2881 static int 2882 match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev, 2883 daddr32_t blkno) 2884 { 2885 if (rip->ri_flags & MDDB_F_EMASTER) { 2886 /* 2887 * If this element is errored then we don't try to match on it. 2888 * If we try to match we could erroneously match on the dev_t 2889 * of a relocated disk. 2890 */ 2891 return (0); 2892 } 2893 2894 if (rip->ri_devid && devid && minor) { 2895 /* 2896 * If old devid exists, then this is a replicated diskset 2897 * and both old and new devids must be checked. 2898 */ 2899 if (rip->ri_old_devid) { 2900 if (((ddi_devid_compare(rip->ri_devid, devid) != 0) && 2901 (ddi_devid_compare(rip->ri_old_devid, 2902 devid) != 0)) || 2903 (strcmp(rip->ri_minor_name, minor) != 0)) 2904 return (0); 2905 } else { 2906 if (ddi_devid_compare(rip->ri_devid, devid) != 0 || 2907 strcmp(rip->ri_minor_name, minor) != 0) 2908 return (0); 2909 } 2910 } else { 2911 if (rip->ri_dev != dev) 2912 return (0); 2913 } 2914 2915 if (rip->ri_blkno != blkno) 2916 return (0); 2917 2918 return (1); 2919 } 2920 2921 static int 2922 ridev( 2923 mddb_ri_t **rip, 2924 mddb_cfg_loc_t *clp, 2925 dev32_t *dev_2b_fixed, 2926 int flag) 2927 { 2928 mddb_ri_t *r, *r1; 2929 md_dev64_t ldev, ndev; 2930 major_t majordev; 2931 int sz; 2932 2933 if (MD_UPGRADE) { 2934 ldev = md_makedevice(md_targ_name_to_major(clp->l_driver), 2935 clp->l_mnum); 2936 } else { 2937 if (ddi_name_to_major(clp->l_driver) == (major_t)-1) 2938 return (EINVAL); 2939 2940 ldev = md_makedevice(ddi_name_to_major(clp->l_driver), 2941 clp->l_mnum); 2942 } 2943 2944 if 
(clp->l_devid != 0) { 2945 /* 2946 * Get dev associated with device id and minor name. 2947 * Setup correct driver name if dev is now different. 2948 * Don't change driver name if during upgrade. 2949 */ 2950 ndev = ldev; 2951 if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid, 2952 &ndev, clp->l_minor_name)) { 2953 if ((ndev != ldev) && (!(MD_UPGRADE))) { 2954 majordev = md_getmajor(ndev); 2955 (void) strcpy(clp->l_driver, 2956 ddi_major_to_name(majordev)); 2957 clp->l_mnum = md_getminor(ndev); 2958 clp->l_devid_flags |= MDDB_DEVID_VALID; 2959 ldev = ndev; 2960 } 2961 } else { 2962 /* Mark as invalid */ 2963 clp->l_devid_flags &= ~MDDB_DEVID_VALID; 2964 } 2965 } 2966 2967 clp->l_dev = md_cmpldev(ldev); 2968 if (dev_2b_fixed) 2969 *dev_2b_fixed = clp->l_dev; 2970 r = *rip; 2971 2972 while (r) { 2973 if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid, 2974 clp->l_minor_name, ldev, clp->l_blkno)) { 2975 if ((clp->l_devid != 0) && 2976 !(clp->l_devid_flags & MDDB_DEVID_VALID)) { 2977 r->ri_flags |= MDDB_F_EMASTER; 2978 } else { 2979 r->ri_flags |= flag; 2980 } 2981 return (0); /* already entered return success */ 2982 } 2983 r = r->ri_next; 2984 } 2985 2986 /* 2987 * This replica not represented in the current rip list, 2988 * so add it to the list. 
2989 */ 2990 r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP); 2991 r->ri_dev = ldev; 2992 r->ri_blkno = clp->l_blkno; 2993 (void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM); 2994 if (strlen(clp->l_driver) >= MD_MAXDRVNM) { 2995 r->ri_driver[(MD_MAXDRVNM -1)] = '\0'; 2996 } 2997 if (clp->l_devname != NULL) { 2998 (void) strcpy(r->ri_devname, clp->l_devname); 2999 } 3000 r->ri_flags |= flag; 3001 if (clp->l_devid != 0) { 3002 sz = clp->l_devid_sz; 3003 r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP); 3004 bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz); 3005 3006 if (clp->l_old_devid != NULL) { 3007 sz = clp->l_old_devid_sz; 3008 r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz, 3009 KM_SLEEP); 3010 bcopy((char *)(uintptr_t)clp->l_old_devid, 3011 (char *)r->ri_old_devid, sz); 3012 } else { 3013 r->ri_old_devid = 0; 3014 } 3015 if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX) 3016 (void) strcpy(r->ri_minor_name, clp->l_minor_name); 3017 3018 if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) { 3019 /* 3020 * Devid is present, but not valid. This could 3021 * happen if device has been powered off or if 3022 * the device has been removed. Mark the device in 3023 * error. Don't allow any writes to this device 3024 * based on the dev_t since another device could 3025 * have been placed in its spot and be responding to 3026 * the dev_t accesses. 3027 */ 3028 r->ri_flags |= MDDB_F_EMASTER; 3029 } 3030 } else { 3031 r->ri_devid = 0; 3032 r->ri_old_devid = 0; 3033 } 3034 3035 /* 3036 * If the rip list is empty then this entry 3037 * is the list. 3038 */ 3039 if (*rip == NULL) { 3040 *rip = r; 3041 return (0); 3042 } 3043 3044 /* 3045 * Add this entry to the end of the rip list 3046 */ 3047 r1 = *rip; 3048 while (r1->ri_next) 3049 r1 = r1->ri_next; 3050 r1->ri_next = r; 3051 return (0); 3052 } 3053 3054 /* 3055 * writecopy writes the incore data blocks out to all of the replicas. 
 * This is called from writestart
 *	- when a diskset is started or
 *	- when an error has been encountered during the write to a mddb.
 * and from newdev when a new mddb is being added.
 *
 * flag can be 2 values:
 *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
 *		always used for traditional and local disksets.
 *		For MN diskset:
 *			All nodes can call writecopy, but only the
 *			master node actually writes data to the disk
 *			except for optimized resync records.
 *			An optimized resync record can only be written to
 *			by the record owner.
 *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
 *		master has been chosen, the new master may need to
 *		write its incore mddb to disk (this is the case where the
 *		old master had executed a message but hadn't relayed it
 *		to this slave yet).  New master should not write the
 *		change log records since new master would be overwriting
 *		valuable data.  Only used during a reconfig cycle.
3077 */ 3078 static int 3079 writecopy( 3080 mddb_set_t *s, 3081 int li, 3082 int flag 3083 ) 3084 { 3085 mddb_db_t *dbp; 3086 mddb_db32_t *db32p; 3087 mddb_de_ic_t *dep; 3088 mddb_rb32_t *rbp; 3089 uint_t checksum; 3090 int err = 0; 3091 3092 #if defined(_ILP32) && !defined(lint) 3093 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 3094 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 3095 #endif 3096 3097 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 3098 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 3099 create_db32rec(db32p, dbp); 3100 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 3101 err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li, 3102 MDDB_WR_ONLY_MASTER); 3103 kmem_free((caddr_t)db32p, MDDB_BSIZE); 3104 if (err) 3105 return (err); 3106 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 3107 /* 3108 * In a multinode diskset, when a new master is 3109 * chosen the new master may need to write its 3110 * incore copy of the mddb to disk. In this case, 3111 * don't want to overwrite the change log records 3112 * so new master sets flag to MDDB_WRITECOPY_SYNC. 3113 */ 3114 if (flag == MDDB_WRITECOPY_SYNC) { 3115 if (dep->de_flags & MDDB_F_CHANGELOG) 3116 continue; 3117 } 3118 /* 3119 * In a multinode diskset, don't write out optimized 3120 * resync resyncs since only the mirror owner node 3121 * will have the correct data. If writecopy is 3122 * being called from writestart as a result of 3123 * an mddb failure, then writestart will handle 3124 * the optimized records when it calls fixoptrecords. 
3125 */ 3126 if ((MD_MNSET_SETNO(s->s_setno)) && 3127 (dep->de_flags & MDDB_F_OPT)) { 3128 continue; 3129 } 3130 3131 rbp = dep->de_rb; 3132 checksum = rbp->rb_checksum_fiddle; 3133 checksum ^= rbp->rb_checksum; 3134 /* Generate the crc for this record */ 3135 rec_crcgen(s, dep, rbp); 3136 checksum ^= rbp->rb_checksum; 3137 rbp->rb_checksum_fiddle = checksum; 3138 if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, 3139 dep->de_blkcount, li, (mddb_bf_t **)0, 3140 MDDB_WR_ONLY_MASTER)) 3141 return (err); 3142 } 3143 } 3144 return (0); 3145 } 3146 3147 static int 3148 upd_med( 3149 mddb_set_t *s, 3150 char *tag 3151 ) 3152 { 3153 med_data_t meddb; 3154 int medok; 3155 mddb_lb_t *lbp = s->s_lbp; 3156 set_t setno = s->s_setno; 3157 int li; 3158 int alc; 3159 int lc; 3160 3161 3162 /* If no mediator hosts, nothing to do */ 3163 if (s->s_med.n_cnt == 0) 3164 return (0); 3165 3166 /* 3167 * If this is a MN set and we are not the master, then don't 3168 * update mediator hosts or mark mediator as golden since 3169 * only master node should do that. 3170 */ 3171 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) && 3172 (md_set[setno].s_am_i_master == 0)) { 3173 return (0); 3174 } 3175 3176 bzero((char *)&meddb, sizeof (med_data_t)); 3177 meddb.med_dat_mag = MED_DATA_MAGIC; 3178 meddb.med_dat_rev = MED_DATA_REV; 3179 meddb.med_dat_fl = 0; 3180 meddb.med_dat_sn = setno; 3181 meddb.med_dat_cc = lbp->lb_commitcnt; 3182 TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime); 3183 crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL); 3184 3185 /* count accessible mediators */ 3186 medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag); 3187 3188 /* count accessible and existing replicas */ 3189 for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) { 3190 mddb_locator_t *lp = &lbp->lb_locators[li]; 3191 3192 if (lp->l_flags & MDDB_F_DELETED) 3193 continue; 3194 3195 lc++; 3196 3197 if (! 
(lp->l_flags & MDDB_F_ACTIVE) || 3198 (lp->l_flags & MDDB_F_EMASTER) || 3199 (lp->l_flags & MDDB_F_EWRITE)) 3200 continue; 3201 3202 alc++; 3203 } 3204 3205 /* 3206 * Mediator update quorum is >= 50%: check for less than 3207 * "mediator update" quorum. 3208 */ 3209 if ((medok * 2) < s->s_med.n_cnt) { 3210 /* panic if <= 50% of all replicas are accessible */ 3211 if ((lc > 0) && ((alc * 2) <= lc)) { 3212 cmn_err(CE_PANIC, 3213 "md: Update of 50%% of the mediator hosts failed"); 3214 /* NOTREACHED */ 3215 } 3216 3217 cmn_err(CE_WARN, 3218 "md: Update of 50%% of the mediator hosts failed"); 3219 } 3220 3221 /* 3222 * If we have mediator update quorum and exactly 50% of the replicas 3223 * are accessible then mark the mediator as golden. 3224 */ 3225 if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) && 3226 ((alc * 2) == lc)) { 3227 meddb.med_dat_fl = MED_DFL_GOLDEN; 3228 crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL); 3229 (void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag); 3230 } 3231 3232 return (0); 3233 } 3234 3235 static int 3236 push_lb(mddb_set_t *s) 3237 { 3238 mddb_lb_t *lbp = s->s_lbp; 3239 3240 /* push the change to all the replicas */ 3241 uniqtime32(&lbp->lb_timestamp); 3242 if (MD_MNSET_SETNO(s->s_setno)) { 3243 lbp->lb_revision = MDDB_REV_MNLB; 3244 } else { 3245 lbp->lb_revision = MDDB_REV_LB; 3246 } 3247 /* 3248 * The updates to the mediator hosts are done 3249 * by the callers of this function. 
3250 */ 3251 return (writelocall(s)); 3252 } 3253 3254 /* Should not call for MN diskset since data tags are not supported */ 3255 static int 3256 dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp) 3257 { 3258 int diff = 0; 3259 3260 diff = (int)(odtp->dt_setno - ndtp->dt_setno); 3261 if (diff) 3262 return (diff); 3263 3264 diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN); 3265 if (diff) 3266 return (diff); 3267 3268 diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1); 3269 if (diff) 3270 return (diff); 3271 3272 /*CSTYLED*/ 3273 return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=)); 3274 } 3275 3276 /* Should not call for MN diskset since data tags are not supported */ 3277 static int 3278 dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp) 3279 { 3280 int nextid = 0; 3281 mddb_dtag_lst_t **dtlpp = &s->s_dtlp; 3282 3283 /* Run to the end of the list */ 3284 for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) { 3285 if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0) 3286 return (0); 3287 nextid++; 3288 } 3289 3290 /* Add the new member */ 3291 *dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP); 3292 3293 /* Update the dtag portion of the list */ 3294 bcopy((caddr_t)ndtp, (caddr_t)&((*dtlpp)->dtl_dt), 3295 sizeof (mddb_dtag_t)); 3296 3297 /* Fix up the id value */ 3298 (*dtlpp)->dtl_dt.dt_id = ++nextid; 3299 3300 return (0); 3301 } 3302 3303 /* 3304 * Even though data tags are not supported in MN disksets, dt_cntl may 3305 * be called for a MN diskset since this routine is called even before 3306 * it is known the kind of diskset being read in from disk. 3307 * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned. 
3308 */ 3309 static int 3310 dtl_cntl(mddb_set_t *s) 3311 { 3312 mddb_dtag_lst_t *dtlp = s->s_dtlp; 3313 int ndt = 0; 3314 3315 while (dtlp != NULL) { 3316 ndt++; 3317 dtlp = dtlp->dtl_nx; 3318 } 3319 3320 return (ndt); 3321 } 3322 3323 /* 3324 * Even though data tags are not supported in MN disksets, dt_cntl may 3325 * be called for a MN diskset since this routine is called even before 3326 * it is known the kind of diskset being read in from disk. 3327 * For a MNdiskset, s_dtlp is 0 so a 0 is returned. 3328 */ 3329 static mddb_dtag_t * 3330 dtl_findl(mddb_set_t *s, int id) 3331 { 3332 mddb_dtag_lst_t *dtlp = s->s_dtlp; 3333 3334 while (dtlp != NULL) { 3335 if (dtlp->dtl_dt.dt_id == id) 3336 return (&dtlp->dtl_dt); 3337 dtlp = dtlp->dtl_nx; 3338 } 3339 return ((mddb_dtag_t *)NULL); 3340 } 3341 3342 /* Should not call for MN diskset since data tags are not supported */ 3343 static void 3344 dtl_freel(mddb_dtag_lst_t **dtlpp) 3345 { 3346 mddb_dtag_lst_t *dtlp; 3347 mddb_dtag_lst_t *tdtlp; 3348 3349 3350 for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) { 3351 dtlp = tdtlp->dtl_nx; 3352 kmem_free(tdtlp, sizeof (mddb_dtag_lst_t)); 3353 } 3354 *dtlpp = (mddb_dtag_lst_t *)NULL; 3355 } 3356 3357 /* 3358 * Even though data tags are not supported in MN disksets, dt_setup will 3359 * be called for a MN diskset since this routine is called even before 3360 * it is known the kind of diskset being read in from disk. 3361 * Once this set is known as a MN diskset, the dtp area will be freed. 
3362 */ 3363 static void 3364 dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp) 3365 { 3366 mddb_dt_t *dtp; 3367 set_t setno = s->s_setno; 3368 3369 3370 if (md_set[setno].s_dtp == (mddb_dt_t *)NULL) 3371 md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP); 3372 else if (dtagp == (mddb_dtag_t *)NULL) 3373 bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 3374 3375 /* shorthand */ 3376 dtp = (mddb_dt_t *)md_set[setno].s_dtp; 3377 3378 dtp->dt_mag = MDDB_MAGIC_DT; 3379 dtp->dt_rev = MDDB_REV_DT; 3380 3381 if (dtagp != NULL) 3382 dtp->dt_dtag = *dtagp; /* structure assignment */ 3383 3384 /* Initialize the setno */ 3385 dtp->dt_dtag.dt_setno = setno; 3386 3387 /* Clear the id and flags, this is only used in user land */ 3388 dtp->dt_dtag.dt_id = 0; 3389 3390 /* Checksum it */ 3391 crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL); 3392 } 3393 3394 /* Should not call for MN diskset since data tags are not supported */ 3395 static int 3396 set_dtag(mddb_set_t *s, md_error_t *ep) 3397 { 3398 mddb_lb_t *lbp = s->s_lbp; 3399 mddb_dtag_t tag; 3400 3401 if (lbp->lb_dtblkcnt == 0) { 3402 /* Data tags not used in a MN set - so no failure returned */ 3403 if (lbp->lb_flags & MDDB_MNSET) 3404 return (0); 3405 3406 cmn_err(CE_WARN, 3407 "No tag record allocated, unable to tag data"); 3408 (void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno); 3409 return (1); 3410 } 3411 3412 /* Clear the stack variable */ 3413 bzero((caddr_t)&tag, sizeof (mddb_dtag_t)); 3414 3415 /* Get the HW serial number for this host */ 3416 (void) snprintf(tag.dt_sn, MDDB_SN_LEN, "%u", zone_get_hostid(NULL)); 3417 tag.dt_sn[MDDB_SN_LEN - 1] = '\0'; 3418 3419 /* Get the nodename that this host goes by */ 3420 (void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME); 3421 tag.dt_hn[MD_MAX_NODENAME] = '\0'; 3422 3423 /* Get a time stamp for NOW */ 3424 uniqtime32(&tag.dt_tv); 3425 3426 /* Setup the data tag record */ 3427 dt_setup(s, &tag); 3428 3429 /* Free any list of tags if they exist */ 
3430 dtl_freel(&s->s_dtlp); 3431 3432 /* Put the new tag onto the tag list */ 3433 (void) dtl_addl(s, &tag); 3434 3435 return (0); 3436 } 3437 3438 /* 3439 * If called during upgrade, this routine expects a non-translated 3440 * (aka target) dev. 3441 * Should not call for MN diskset since data tags are not supported. 3442 */ 3443 static int 3444 dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip) 3445 { 3446 int err = 0; 3447 md_dev64_t dev; 3448 caddr_t tbuf; 3449 daddr_t physblk; 3450 mddb_block_t blk; 3451 mddb_dt_t *dtp; 3452 mddb_dtag_t *dtagp; 3453 set_t setno = s->s_setno; 3454 3455 /* If have not allocated a data tag record, there is nothing to do */ 3456 if (lbp->lb_dtblkcnt == 0) 3457 return (1); 3458 3459 dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP); 3460 3461 if (dtp == (mddb_dt_t *)NULL) 3462 return (1); 3463 3464 /* shorthand */ 3465 dev = md_xlate_targ_2_mini(rip->ri_dev); 3466 if (dev == NODEV64) { 3467 return (1); 3468 } 3469 3470 tbuf = (caddr_t)rip->ri_dtp; 3471 3472 for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) { 3473 physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip); 3474 err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0); 3475 /* error reading the tag */ 3476 if (err) { 3477 err = 1; 3478 goto out; 3479 } 3480 tbuf += MDDB_BSIZE; 3481 } 3482 3483 /* magic is valid? */ 3484 if (dtp->dt_mag != MDDB_MAGIC_DT) { 3485 err = 1; 3486 goto out; 3487 } 3488 3489 /* revision is valid? */ 3490 if (revchk(MDDB_REV_DT, dtp->dt_rev)) { 3491 err = 1; 3492 goto out; 3493 } 3494 3495 /* crc is valid? */ 3496 if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) { 3497 err = 1; 3498 goto out; 3499 } 3500 3501 /* shorthand */ 3502 dtagp = &dtp->dt_dtag; 3503 3504 /* set number match? */ 3505 if (dtagp->dt_setno != setno) { 3506 err = 1; 3507 goto out; 3508 } 3509 3510 /* tag is not empty? 
*/
	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
	    dtagp->dt_id == 0) {
		err = 2;
		goto out;
	}

	/* Mark the locator as having tagged data */
	rip->ri_flags |= MDDB_F_TAGDATA;

out:
	if (err) {
		if (err == 1) {
			md_set_setstatus(setno, MD_SET_BADTAG);
			rip->ri_flags |= MDDB_F_BADTAG;
		}
		if (dtp != NULL) {
			kmem_free(dtp, MDDB_DT_BYTES);
			rip->ri_dtp = (mddb_dt_t *)NULL;
		}
	}

	return (err);
}

/*
 * dt_write - write the set's incore data tag record to the replicas.
 *
 * Should not call for MN diskset since data tags are not supported.
 *
 * The record at md_set[setno].s_dtp is written with writeblks() to every
 * locator that is active, not deleted and not flagged with a write error.
 * After each successful write the locator's tag flags are brought in line
 * with the record: an empty tag clears both MDDB_F_BADTAG and
 * MDDB_F_TAGDATA, a non-empty tag sets MDDB_F_TAGDATA and clears
 * MDDB_F_BADTAG.  If every write succeeded and no usable locator is left
 * with MDDB_F_TAGDATA, MD_SET_CLRTAG and MD_SET_TAGDATA are cleared on the
 * set.
 *
 * Returns:
 *	0	- nothing to write (no data tag blocks allocated, or the set
 *		  has MD_SET_BADTAG), or every write succeeded.
 *	!= 0	- the bitwise OR of the values returned by the failing
 *		  writeblks() calls.
 */
static int
dt_write(mddb_set_t *s)
{
	int		li;
	int		err = 0;
	int		werr;
	int		empty_tag = 0;
	mddb_dtag_t	*dtagp;
	mddb_dt_t	*dtp;
	mddb_lb_t	*lbp = s->s_lbp;
	set_t		setno = s->s_setno;
	uint_t		set_status = md_get_setstatus(setno);


	ASSERT(md_set[setno].s_dtp != NULL);

	/* Nowhere to write to */
	if (lbp->lb_dtblkcnt == 0)
		return (err);

	/* A bad tag record was seen; do not write over it here */
	if (set_status & MD_SET_BADTAG)
		return (err);

	/* shorthand */
	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
	dtagp = &dtp->dt_dtag;

	/* See if the tag is empty. */
	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
	    dtagp->dt_id == 0)
		empty_tag = 1;

	/* Write the tag to the locators and reset appropriate flags. */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		mddb_locator_t	*lp = &lbp->lb_locators[li];

		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
		    (lp->l_flags & MDDB_F_DELETED) ||
		    (lp->l_flags & MDDB_F_EWRITE))
			continue;

		werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk,
		    MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);

		/* Remember the failure, but still try the other locators */
		if (werr) {
			err |= werr;
			continue;
		}

		if (empty_tag)
			lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA);
		else {
			lp->l_flags |= MDDB_F_TAGDATA;
			lp->l_flags &= ~MDDB_F_BADTAG;
		}
	}

	if (err)
		return (err);


	/* If the tags were written, check to see if any tags remain. */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		mddb_locator_t	*lp = &lbp->lb_locators[li];

		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
		    (lp->l_flags & MDDB_F_DELETED) ||
		    (lp->l_flags & MDDB_F_EWRITE))
			continue;

		if (lp->l_flags & MDDB_F_TAGDATA)
			break;
	}

	/* If there are no tags, then clear CLRTAG and TAGDATA */
	if (li == lbp->lb_loccnt) {
		md_clr_setstatus(setno, MD_SET_CLRTAG);
		md_clr_setstatus(setno, MD_SET_TAGDATA);
	}

	return (err);
}

/*
 * dt_alloc_if_needed - make sure the locator block has a data tag record.
 *
 * Should not call for MN diskset since data tags are not supported.
 *
 * Returns:
 *	0 - nothing was done: the record is already allocated and no bad
 *	    tag was seen, the block bitmap is not set up yet
 *	    (s_totalblkcnt == 0), or no free blocks could be allocated.
 *	1 - a record was allocated or moved (lb_dtfirstblk/lb_dtblkcnt were
 *	    updated and the usable locators marked MDDB_F_BADTAG), or the
 *	    existing record's blocks were marked busy.
 */
static int
dt_alloc_if_needed(mddb_set_t *s)
{
	int		i;
	int		li;
	int		moveit = 0;
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_block_t	blkcnt = lbp->lb_dtblkcnt;
	set_t		setno = s->s_setno;
	uint_t		set_status = md_get_setstatus(setno);

	/*
	 * If the data tag record is allocated (blkcnt != 0) and a bad tag was
	 * not detected, there is nothing to do.
	 */
	if (blkcnt != 0 && ! (set_status & MD_SET_BADTAG))
		return (0);

	/* Bitmap not setup, checks can't be done */
	if (s->s_totalblkcnt == 0)
		return (0);

	/* While reading the tag(s) an invalid tag data record was seen */
	if (set_status & MD_SET_BADTAG)
		/* See if the invalid tag needs to be moved */
		for (i = 0; i < MDDB_DT_BLOCKS; i++)
			if (blkcheck(s, (i + lbp->lb_dtfirstblk))) {
				moveit = 1;
				break;
			}

	/* Need to move or allocate the tag data record */
	if (moveit || blkcnt == 0) {
		/*
		 * NOTE(review): on failure lb_dtfirstblk has already been
		 * overwritten with 0 while lb_dtblkcnt keeps its old value;
		 * confirm that callers tolerate this when moveit was set.
		 */
		lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
		if (lbp->lb_dtfirstblk == 0) {
			cmn_err(CE_WARN,
			    "Unable to allocate data tag record");
			return (0);
		}
		lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;

		/* Mark the locators so that they get written to disk. */
		for (li = 0; li < lbp->lb_loccnt; li++) {
			mddb_locator_t	*lp = &lbp->lb_locators[li];

			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
			    (lp->l_flags & MDDB_F_DELETED) ||
			    (lp->l_flags & MDDB_F_EWRITE))
				continue;

			lp->l_flags |= MDDB_F_BADTAG;
		}
		return (1);
	}

	/*
	 * Make sure the blocks are owned, since the calculation in
	 * computefreeblks() is bypassed when MD_SET_BADTAG is set.
	 */
	for (i = 0; i < MDDB_DT_BLOCKS; i++)
		blkbusy(s, (i + lbp->lb_dtfirstblk));

	return (1);
}

/*
 * Writestart writes the incore mddb out to all of the replicas.
 * This is called when a diskset is started and when an error has
 * been encountered during the write to a mddb.
 *
 * flag can be 2 values:
 *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
 *		always used for traditional and local disksets.
 *		This is the normal path for MN disksets since the slave
 *		nodes aren't actually allowed to write to disk.
 *	MDDB_WRITECOPY_SYNC - special case for MN diskset.
When a new 3698 * master has been chosen, the new master may need to 3699 * write its incore mddb to disk (this is the case where the 3700 * old master had executed a message but hadn't relayed it 3701 * to this slave yet). New master should not write the 3702 * change log records since new master would be overwriting 3703 * valuable data. Only used during a reconfig cycle. 3704 */ 3705 static int 3706 writestart( 3707 mddb_set_t *s, 3708 int flag 3709 ) 3710 { 3711 int li; 3712 mddb_locator_t *lp; 3713 mddb_lb_t *lbp; 3714 mddb_ln_t *lnp; 3715 int err = 0; 3716 uint_t set_status; 3717 3718 lbp = s->s_lbp; 3719 3720 for (li = 0; li < lbp->lb_loccnt; li++) { 3721 lp = &lbp->lb_locators[li]; 3722 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3723 continue; 3724 if (! (lp->l_flags & MDDB_F_SUSPECT)) 3725 continue; 3726 if (writecopy(s, li, flag)) 3727 return (1); 3728 lp->l_flags |= MDDB_F_UP2DATE; 3729 } 3730 3731 for (li = 0; li < lbp->lb_loccnt; li++) { 3732 lp = &lbp->lb_locators[li]; 3733 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3734 continue; 3735 if ((lp->l_flags & MDDB_F_UP2DATE)) 3736 continue; 3737 if (checkcopy(s, li)) 3738 if (err = writecopy(s, li, flag)) 3739 return (1); 3740 lp->l_flags |= MDDB_F_UP2DATE; 3741 } 3742 3743 /* 3744 * Call fixoptrecord even during a reconfig cycle since a replica 3745 * failure may force the master to re-assign the optimized 3746 * resync record to another replica. 
3747 */ 3748 if (fixoptrecords(s)) 3749 return (1); 3750 3751 set_status = md_get_setstatus(s->s_setno); 3752 3753 /* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */ 3754 for (li = 0; li < lbp->lb_loccnt; li++) { 3755 lp = &lbp->lb_locators[li]; 3756 3757 if (lp->l_flags & MDDB_F_DELETED) 3758 continue; 3759 3760 if (((lp->l_flags & MDDB_F_ACTIVE) != 0 && 3761 (lp->l_flags & MDDB_F_OLDACT) == 0) || 3762 ((lp->l_flags & MDDB_F_ACTIVE) == 0 && 3763 (lp->l_flags & MDDB_F_OLDACT) != 0)) 3764 break; 3765 3766 if ((set_status & MD_SET_TAGDATA) || 3767 (set_status & MD_SET_CLRTAG)) 3768 if ((lp->l_flags & MDDB_F_TAGDATA) || 3769 (lp->l_flags & MDDB_F_BADTAG)) 3770 break; 3771 } 3772 3773 /* 3774 * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) 3775 * the lbp identifier and the set identifier doesn't match. 3776 */ 3777 if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) { 3778 3779 /* Only call for traditional and local sets */ 3780 if (!(lbp->lb_flags & MDDB_MNSET)) 3781 (void) dt_write(s); 3782 3783 setidentifier(s, &lbp->lb_ident); 3784 3785 if (err = push_lb(s)) { 3786 (void) upd_med(s, "writestart(0)"); 3787 return (err); 3788 } 3789 3790 (void) upd_med(s, "writestart(0)"); 3791 3792 if (err = push_lb(s)) { 3793 (void) upd_med(s, "writestart(1)"); 3794 return (err); 3795 } 3796 3797 (void) upd_med(s, "writestart(1)"); 3798 3799 lnp = s->s_lnp; 3800 uniqtime32(&lnp->ln_timestamp); 3801 if (lbp->lb_flags & MDDB_MNSET) 3802 lnp->ln_revision = MDDB_REV_MNLN; 3803 else 3804 lnp->ln_revision = MDDB_REV_LN; 3805 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 3806 err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 3807 lbp->lb_lnblkcnt, 0); 3808 /* 3809 * If a MN diskset and this is the master, set the PARSE_LOCNM 3810 * flag in the mddb_set structure to show that the locator 3811 * names have changed. 
3812 * Don't set parseflags as a result of a new master sync 3813 * during reconfig cycle since slaves nodes are already 3814 * in-sync with the new master. 3815 */ 3816 3817 if ((lbp->lb_flags & MDDB_MNSET) && 3818 (md_set[s->s_setno].s_am_i_master) && 3819 (flag != MDDB_WRITECOPY_SYNC)) { 3820 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 3821 } 3822 3823 if (err) 3824 return (err); 3825 } 3826 3827 for (li = 0; li < lbp->lb_loccnt; li++) { 3828 lp = &lbp->lb_locators[li]; 3829 if (lp->l_flags & MDDB_F_DELETED) 3830 continue; 3831 if (lp->l_flags & MDDB_F_ACTIVE) { 3832 lp->l_flags |= MDDB_F_OLDACT; 3833 } else { 3834 lp->l_flags &= ~MDDB_F_OLDACT; 3835 } 3836 } 3837 3838 md_clr_setstatus(s->s_setno, MD_SET_STALE); 3839 3840 return (0); 3841 } 3842 3843 /* 3844 * selectreplicas selects the working replicas and may write the incore 3845 * version of the mddb out to the replicas ondisk. 3846 * 3847 * flag can be 3 values: 3848 * MDDB_RETRYSCAN - quick scan to see if there is an error. 3849 * If no new error, returns without writing mddb 3850 * to disks. If a new error is seen, writes out 3851 * mddb to disks. 3852 * MDDB_SCANALL - lengthy scan to check out mddbs and always writes 3853 * out mddb to the replica ondisk. Calls writecopy 3854 * with MDDB_WRITECOPY_ALL flag which writes out 3855 * all records to the replicas ondisk. 3856 * MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore 3857 * and ondisk mddbs by writing incore values to disk. 3858 * Calls writecopy with MDDB_WRITECOPY_SYNC flag so 3859 * that change log records are not written out. 3860 * Only used by MN disksets. 3861 * 3862 * Returns: 3863 * 0 - Successful 3864 * 1 - Unable to write incore mddb data to disk since < 50% replicas. 
3865 */ 3866 int 3867 selectreplicas( 3868 mddb_set_t *s, 3869 int flag 3870 ) 3871 { 3872 int li; 3873 int alc; 3874 int lc; 3875 mddb_locator_t *lp; 3876 mddb_lb_t *lbp = s->s_lbp; 3877 set_t setno = s->s_setno; 3878 int wc_flag; 3879 3880 /* 3881 * can never transition from stale to not stale 3882 */ 3883 if (md_get_setstatus(setno) & MD_SET_STALE) { 3884 for (li = 0; li < lbp->lb_loccnt; li++) { 3885 lp = &lbp->lb_locators[li]; 3886 if (lp->l_flags & MDDB_F_DELETED) 3887 continue; 3888 if (! (lp->l_flags & MDDB_F_EMASTER)) { 3889 lp->l_flags |= MDDB_F_ACTIVE; 3890 } else { 3891 lp->l_flags &= ~MDDB_F_ACTIVE; 3892 } 3893 } 3894 return (1); 3895 } 3896 3897 if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) { 3898 for (li = 0; li < lbp->lb_loccnt; li++) { 3899 lp = &lbp->lb_locators[li]; 3900 if (lp->l_flags & MDDB_F_DELETED) 3901 continue; 3902 if (lp->l_flags & MDDB_F_ACTIVE) { 3903 lp->l_flags |= MDDB_F_OLDACT; 3904 lp->l_flags &= ~MDDB_F_SUSPECT; 3905 } else { 3906 lp->l_flags |= MDDB_F_SUSPECT; 3907 lp->l_flags &= ~MDDB_F_OLDACT; 3908 } 3909 3910 if (! (lp->l_flags & MDDB_F_EMASTER)) { 3911 lp->l_flags |= MDDB_F_ACTIVE; 3912 lp->l_flags &= ~MDDB_F_EWRITE; 3913 lp->l_flags &= ~MDDB_F_TOOSMALL; 3914 } else { 3915 lp->l_flags &= ~MDDB_F_ACTIVE; 3916 } 3917 } 3918 computefreeblks(s); /* set up free block bits */ 3919 } else { 3920 for (li = 0; li < lbp->lb_loccnt; li++) { 3921 lp = &lbp->lb_locators[li]; 3922 if (! 
(lp->l_flags & MDDB_F_ACTIVE)) 3923 continue; 3924 if (lp->l_flags & MDDB_F_EWRITE) 3925 break; 3926 } 3927 3928 /* 3929 * if there are no errors this is error has already 3930 * been processed return current state 3931 */ 3932 if (li == lbp->lb_loccnt) 3933 return (md_get_setstatus(setno) & MD_SET_TOOFEW); 3934 3935 lp->l_flags &= ~MDDB_F_ACTIVE; 3936 do { 3937 lp = &lbp->lb_locators[li]; 3938 lp->l_flags &= ~MDDB_F_UP2DATE; 3939 } while (++li < lbp->lb_loccnt); 3940 } 3941 3942 alc = 0; 3943 lc = 0; 3944 for (li = 0; li < lbp->lb_loccnt; li++) { 3945 lp = &lbp->lb_locators[li]; 3946 if (lp->l_flags & MDDB_F_DELETED) 3947 continue; 3948 lc++; 3949 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3950 continue; 3951 alc++; 3952 } 3953 3954 if (alc < ((lc + 1) / 2)) { 3955 md_set_setstatus(setno, MD_SET_TOOFEW); 3956 return (1); 3957 } 3958 3959 /* Set wc_flag based on flag passed in. */ 3960 if (flag == MDDB_SCANALLSYNC) 3961 wc_flag = MDDB_WRITECOPY_SYNC; 3962 else 3963 wc_flag = MDDB_WRITECOPY_ALL; 3964 3965 do { 3966 if (! writestart(s, wc_flag)) { 3967 md_clr_setstatus(setno, MD_SET_TOOFEW); 3968 return (0); 3969 } 3970 alc = 0; 3971 for (li = 0; li < lbp->lb_loccnt; li++) { 3972 lp = &lbp->lb_locators[li]; 3973 if ((lp->l_flags & MDDB_F_DELETED) || 3974 (lp->l_flags & MDDB_F_EMASTER)) 3975 continue; 3976 3977 if (lp->l_flags & MDDB_F_EWRITE) { 3978 lp->l_flags &= ~MDDB_F_ACTIVE; 3979 lp->l_flags &= ~MDDB_F_UP2DATE; 3980 continue; 3981 } 3982 alc++; 3983 } 3984 } while (alc >= ((lc + 1) / 2)); 3985 md_set_setstatus(setno, MD_SET_TOOFEW); 3986 return (1); 3987 } 3988 3989 static int 3990 checkstate( 3991 mddb_set_t *s, 3992 int probe 3993 ) 3994 { 3995 int error; 3996 uint_t set_status = md_get_setstatus(s->s_setno); 3997 3998 ASSERT(s != NULL); 3999 4000 if (! (set_status & MD_SET_STALE) && ! 
(set_status & MD_SET_TOOFEW)) 4001 return (0); 4002 4003 if (probe == MDDB_NOPROBE) 4004 return (1); 4005 4006 single_thread_start(s); 4007 error = selectreplicas(s, MDDB_SCANALL); 4008 single_thread_end(s); 4009 4010 if (error == 0 && s->s_zombie != 0) { 4011 mutex_exit(SETMUTEX(s->s_setno)); 4012 error = mddb_deleterec(s->s_zombie); 4013 mutex_enter(SETMUTEX(s->s_setno)); 4014 if (error == 0) 4015 s->s_zombie = 0; 4016 } 4017 return (error); 4018 } 4019 4020 static int 4021 writeretry( 4022 mddb_set_t *s 4023 ) 4024 { 4025 if (selectreplicas(s, MDDB_RETRYSCAN)) 4026 if (selectreplicas(s, MDDB_SCANALL)) 4027 return (1); 4028 return (0); 4029 } 4030 4031 static void 4032 free_mbipp(mddb_mb_ic_t **mbipp) 4033 { 4034 mddb_mb_ic_t *mbip1, *mbip2; 4035 4036 for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) { 4037 mbip2 = mbip1->mbi_next; 4038 kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE); 4039 } 4040 *mbipp = (mddb_mb_ic_t *)NULL; 4041 } 4042 4043 static mddb_ri_t * 4044 save_rip(mddb_set_t *s) 4045 { 4046 mddb_ri_t *trip = s->s_rip; 4047 mddb_ri_t *nrip = NULL; 4048 mddb_ri_t **nripp = &nrip; 4049 mddb_ri_t *rip; 4050 4051 while (trip) { 4052 /* Run to the end of the list */ 4053 for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next) 4054 /* void */; 4055 4056 /* Add the new member */ 4057 *nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP); 4058 4059 ASSERT(*nripp != NULL); 4060 4061 /* shorthand */ 4062 rip = *nripp; 4063 4064 *rip = *trip; /* structure assignment */ 4065 4066 /* Clear the stuff that is not needed for hints */ 4067 rip->ri_flags = 0; 4068 rip->ri_commitcnt = 0; 4069 rip->ri_transplant = 0; 4070 rip->ri_mbip = (mddb_mb_ic_t *)NULL; 4071 rip->ri_dtp = (mddb_dt_t *)NULL; 4072 rip->ri_lbp = (mddb_lb_t *)NULL; 4073 rip->ri_did_icp = (mddb_did_ic_t *)NULL; 4074 rip->ri_devid = (ddi_devid_t)NULL; 4075 rip->ri_old_devid = (ddi_devid_t)NULL; 4076 rip->ri_next = (mddb_ri_t *)NULL; 4077 4078 trip = trip->ri_next; 4079 } 4080 return (nrip); 4081 } 4082 4083 
static void 4084 free_rip(mddb_ri_t **ripp) 4085 { 4086 mddb_ri_t *rip; 4087 mddb_ri_t *arip; 4088 4089 for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) { 4090 arip = rip->ri_next; 4091 if (rip->ri_devid != (ddi_devid_t)NULL) { 4092 ddi_devid_free(rip->ri_devid); 4093 rip->ri_devid = (ddi_devid_t)NULL; 4094 } 4095 if (rip->ri_old_devid != (ddi_devid_t)NULL) { 4096 ddi_devid_free(rip->ri_old_devid); 4097 rip->ri_old_devid = (ddi_devid_t)NULL; 4098 } 4099 kmem_free((caddr_t)rip, sizeof (*rip)); 4100 } 4101 *ripp = (mddb_ri_t *)NULL; 4102 } 4103 4104 /* 4105 * this routine selects the correct replica to use 4106 * the rules are as follows 4107 * 1. if all replica has same init time select highest commit count 4108 * 2. if some but not all replicas are from another hostid discard 4109 * them. 4110 * 3. find which init time is present is most replicas 4111 * 4. discard all replicas which do not match most init times 4112 * 5. select replica with highest commit count 4113 */ 4114 4115 static mddb_lb_t * 4116 selectlocator( 4117 mddb_set_t *s 4118 ) 4119 { 4120 mddb_ri_t *rip = s->s_rip; 4121 mddb_ri_t *r, *r1; 4122 mddb_lb_t *lbp; 4123 struct timeval32 *tp = (struct timeval32 *)NULL; 4124 int different; 4125 int same; 4126 int count; 4127 int maxcount; 4128 set_t setno = s->s_setno; 4129 size_t sz; 4130 int mn_set = 0; 4131 4132 /* Clear the ri_transplant flag on all the rip entries. */ 4133 /* Set ri_commitcnt to locator's commitcnt - if available */ 4134 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4135 r->ri_transplant = 0; 4136 if (r->ri_lbp != (mddb_lb_t *)NULL) { 4137 r->ri_commitcnt = r->ri_lbp->lb_commitcnt; 4138 /* If any locators have MN bit set, set flag */ 4139 if (r->ri_lbp->lb_flags & MDDB_MNSET) 4140 mn_set = 1; 4141 } 4142 } 4143 4144 /* 4145 * A data tag is being used, so use it to limit the selection first. 4146 * Data tags not used in MN diskset. 
4147 */ 4148 if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) { 4149 mddb_dt_t *dtp = (mddb_dt_t *)md_set[setno].s_dtp; 4150 4151 /* 4152 * now toss any locators that have a different data tag 4153 */ 4154 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4155 if (r->ri_lbp == (mddb_lb_t *)NULL) 4156 continue; 4157 4158 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4159 /* If same tag, keep it */ 4160 if (dtl_cmp(&dtp->dt_dtag, 4161 &r->ri_dtp->dt_dtag) == 0) 4162 continue; 4163 } 4164 4165 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4166 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4167 r->ri_dtp = (mddb_dt_t *)NULL; 4168 } 4169 4170 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4171 if (!(md_get_setstatus(setno) & 4172 MD_SET_REPLICATED_IMPORT)) { 4173 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4174 sz = ddi_devid_sizeof(r->ri_old_devid); 4175 kmem_free((caddr_t)r->ri_old_devid, sz); 4176 r->ri_old_devid = (ddi_devid_t)NULL; 4177 } 4178 } 4179 4180 kmem_free((caddr_t)r->ri_lbp, 4181 dbtob(r->ri_lbp->lb_blkcnt)); 4182 r->ri_lbp = (mddb_lb_t *)NULL; 4183 4184 r->ri_transplant = 1; 4185 } 4186 4187 /* Tag used, clear the bit */ 4188 md_clr_setstatus(s->s_setno, MD_SET_USETAG); 4189 4190 if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) { 4191 /* 4192 * Get rid of the list of tags. 4193 */ 4194 dtl_freel(&s->s_dtlp); 4195 4196 /* 4197 * Re-create the list with the tag used. 4198 */ 4199 (void) dtl_addl(s, &dtp->dt_dtag); 4200 } 4201 } 4202 4203 /* 4204 * scan to see if all replicas have same time 4205 */ 4206 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4207 if (r->ri_lbp == (mddb_lb_t *)NULL) 4208 continue; 4209 if (tp == NULL) { 4210 tp = &r->ri_lbp->lb_inittime; 4211 continue; 4212 } 4213 /* CSTYLED */ 4214 if (timercmp(tp, &r->ri_lbp->lb_inittime, !=)) 4215 break; 4216 } 4217 4218 /* 4219 * if r == NULL then they were all them same. 
Choose highest 4220 * commit count 4221 */ 4222 if (r == (mddb_ri_t *)NULL) 4223 goto out; 4224 4225 /* 4226 * If here, a bogus replica is present and at least 1 lb_inittime 4227 * did not match. 4228 */ 4229 4230 /* 4231 * look and see if any but not all are from different id 4232 */ 4233 4234 different = 0; 4235 same = 0; 4236 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4237 if (r->ri_lbp == (mddb_lb_t *)NULL) 4238 continue; 4239 if (cmpidentifier(s, &r->ri_lbp->lb_ident)) 4240 different = 1; 4241 else 4242 same = 1; 4243 } 4244 4245 /* 4246 * now go through and throw out different if there are some 4247 * that are the same 4248 */ 4249 if (different != 0 && same != 0) { 4250 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4251 if (r->ri_lbp == (mddb_lb_t *)NULL) 4252 continue; 4253 4254 if (!cmpidentifier(s, &r->ri_lbp->lb_ident)) 4255 continue; 4256 4257 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4258 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4259 r->ri_dtp = (mddb_dt_t *)NULL; 4260 } 4261 4262 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4263 if (!(md_get_setstatus(setno) & 4264 MD_SET_REPLICATED_IMPORT)) { 4265 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4266 sz = ddi_devid_sizeof(r->ri_old_devid); 4267 kmem_free((caddr_t)r->ri_old_devid, sz); 4268 r->ri_old_devid = (ddi_devid_t)NULL; 4269 } 4270 } 4271 4272 kmem_free((caddr_t)r->ri_lbp, 4273 dbtob(r->ri_lbp->lb_blkcnt)); 4274 r->ri_lbp = (mddb_lb_t *)NULL; 4275 4276 r->ri_transplant = 1; 4277 } 4278 } 4279 4280 /* 4281 * go through and pick highest. 
Use n square because it is 4282 * simple and 40 some is max possible 4283 */ 4284 maxcount = 0; 4285 lbp = (mddb_lb_t *)NULL; 4286 for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) { 4287 if (r1->ri_lbp == (mddb_lb_t *)NULL) 4288 continue; 4289 count = 0; 4290 for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4291 if (r->ri_lbp == (mddb_lb_t *)NULL) 4292 continue; 4293 if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */ 4294 &r->ri_lbp->lb_inittime, ==)) 4295 count++; 4296 } 4297 if (count > maxcount) { 4298 maxcount = count; 4299 lbp = r1->ri_lbp; 4300 } 4301 } 4302 4303 /* 4304 * now go though and toss any that are of a different time stamp 4305 */ 4306 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4307 if (r->ri_lbp == (mddb_lb_t *)NULL) 4308 continue; 4309 if (timercmp(&lbp->lb_inittime, /* CSTYLED */ 4310 &r->ri_lbp->lb_inittime, ==)) 4311 continue; 4312 4313 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4314 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4315 r->ri_dtp = (mddb_dt_t *)NULL; 4316 } 4317 4318 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4319 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 4320 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4321 sz = ddi_devid_sizeof(r->ri_old_devid); 4322 kmem_free((caddr_t)r->ri_old_devid, sz); 4323 r->ri_old_devid = (ddi_devid_t)NULL; 4324 } 4325 } 4326 4327 kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); 4328 r->ri_lbp = (mddb_lb_t *)NULL; 4329 4330 r->ri_transplant = 1; 4331 } 4332 4333 out: 4334 /* 4335 * Find the locator with the highest commit count, and make it the 4336 * "chosen" one. 4337 */ 4338 lbp = (mddb_lb_t *)NULL; 4339 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4340 if (r->ri_lbp == (mddb_lb_t *)NULL) 4341 continue; 4342 4343 if (lbp == NULL) { 4344 lbp = r->ri_lbp; 4345 continue; 4346 } 4347 4348 if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt) 4349 lbp = r->ri_lbp; 4350 } 4351 4352 /* Toss all locator blocks, except the "chosen" one. 
*/ 4353 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4354 if (r->ri_lbp == (mddb_lb_t *)NULL) 4355 continue; 4356 4357 /* Get rid of all dtp's */ 4358 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4359 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4360 r->ri_dtp = (mddb_dt_t *)NULL; 4361 } 4362 4363 if (r->ri_lbp == lbp) 4364 continue; 4365 4366 /* Get rid of extra locator devid block info */ 4367 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4368 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 4369 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4370 sz = ddi_devid_sizeof(r->ri_old_devid); 4371 kmem_free((caddr_t)r->ri_old_devid, sz); 4372 r->ri_old_devid = (ddi_devid_t)NULL; 4373 } 4374 } 4375 4376 /* Get rid of extra locators */ 4377 kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); 4378 r->ri_lbp = (mddb_lb_t *)NULL; 4379 } 4380 return (lbp); 4381 } 4382 4383 static void 4384 locator2cfgloc( 4385 mddb_lb_t *lbp, 4386 mddb_cfg_loc_t *clp, 4387 int li, 4388 side_t sideno, 4389 mddb_did_ic_t *did_icp 4390 ) 4391 { 4392 mddb_drvnm_t *dn; 4393 mddb_locator_t *lp = &lbp->lb_locators[li]; 4394 mddb_sidelocator_t *slp; 4395 mddb_mnsidelocator_t *mnslp; 4396 mddb_did_info_t *did_info; 4397 int i, sz, szalloc; 4398 int mn_set = 0; 4399 mddb_mnlb_t *mnlbp; 4400 4401 if (lbp->lb_flags & MDDB_MNSET) { 4402 mn_set = 1; 4403 mnlbp = (mddb_mnlb_t *)lbp; 4404 for (i = 0; i < MD_MNMAXSIDES; i++) { 4405 mnslp = &mnlbp->lb_mnsidelocators[i][li]; 4406 if (mnslp->mnl_sideno == sideno) 4407 break; 4408 } 4409 if (i == MD_MNMAXSIDES) 4410 return; 4411 } else { 4412 slp = &lbp->lb_sidelocators[sideno][li]; 4413 } 4414 4415 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 4416 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 4417 if (did_info->info_flags & MDDB_DID_EXISTS) { 4418 sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]); 4419 if (clp->l_devid_flags & MDDB_DEVID_SPACE) { 4420 /* 4421 * copy device id from mddb to 4422 * cfg_loc structure 4423 */ 4424 szalloc 
= clp->l_devid_sz; 4425 if (sz <= szalloc) { 4426 for (i = 0; i < sz; i++) { 4427 ((char *)(uintptr_t) 4428 clp->l_devid)[i] = 4429 ((char *)did_icp-> 4430 did_ic_devid[li])[i]; 4431 } 4432 clp->l_devid_flags |= MDDB_DEVID_VALID; 4433 (void) strcpy(clp->l_minor_name, 4434 did_info->info_minor_name); 4435 } else { 4436 clp->l_devid_flags |= 4437 MDDB_DEVID_NOSPACE; 4438 } 4439 } else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) { 4440 clp->l_devid_flags = MDDB_DEVID_SZ; 4441 clp->l_devid_sz = sz; 4442 } 4443 } 4444 } 4445 4446 /* 4447 * Even if a devid exists, use the dev, drvnm and mnum in the locators 4448 * and sidelocators. During startup, the dev, drvnm and mnum in 4449 * these structures may not match the devid (the locators and 4450 * sidelocators will be updated to match the devid by the routine 4451 * load_old_replicas). Using out-of-sync values won't cause any 4452 * problems since ridev will re-derive these from the devid and mnum. 4453 * After startup, the dev, drvnm and mnum in these structures have 4454 * been updated and can be used. 4455 */ 4456 4457 clp->l_blkno = lp->l_blkno; 4458 clp->l_flags = lp->l_flags; 4459 clp->l_dev = lp->l_dev; 4460 4461 if (mn_set) { 4462 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 4463 clp->l_mnum = mnslp->mnl_mnum; 4464 } else { 4465 dn = &lbp->lb_drvnm[slp->l_drvnm_index]; 4466 clp->l_mnum = slp->l_mnum; 4467 } 4468 (void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM); 4469 } 4470 4471 /* 4472 * Find the index into the mnsidelocator where entry will go. 4473 * Then index can be fed into both splitname2locatorblocks and 4474 * cfgloc2locator so that those entries can be kept in sync. 
4475 * 4476 * Returns: 4477 * -1 if failed to find unused slot or if a traditional diskset 4478 * index, if successful (0 <= index <= MD_MNMAXSIDES) 4479 */ 4480 static int 4481 checklocator( 4482 mddb_lb_t *lbp, 4483 int li, 4484 side_t sideno 4485 ) 4486 { 4487 uchar_t i; 4488 mddb_mnsidelocator_t *mnslp; 4489 mddb_mnlb_t *mnlbp; 4490 int index = -1; 4491 4492 if (lbp->lb_flags & MDDB_MNSET) { 4493 /* 4494 * Checking side locator structure. First, check if 4495 * there is already an entry for this side. If so, 4496 * then use that entry. Otherwise, find an entry 4497 * that has a sideno of 0. 4498 */ 4499 mnlbp = (mddb_mnlb_t *)lbp; 4500 for (i = 0; i < MD_MNMAXSIDES; i++) { 4501 mnslp = &mnlbp->lb_mnsidelocators[i][li]; 4502 if (mnslp->mnl_sideno == sideno) { 4503 /* Found a match - stop looking */ 4504 index = i; 4505 break; 4506 } else if ((mnslp->mnl_sideno == 0) && (index == -1)) { 4507 /* Set first empty slot, but keep looking */ 4508 index = i; 4509 } 4510 } 4511 /* Didn't find empty slot or previously used slot */ 4512 if ((i == MD_MNMAXSIDES) && (index == -1)) { 4513 return (-1); 4514 } 4515 return (index); 4516 } else 4517 return (0); 4518 } 4519 4520 /* 4521 * Takes locator information (driver name, minor number, sideno) and 4522 * stores it in the locator block. 4523 * For traditional diskset, the sideno is the index into the sidelocator 4524 * array in the locator block. 4525 * For the MN diskset, the sideno is the nodeid which can be any number, 4526 * so the index passed in is the index into the mnsidelocator array 4527 * in the locator block. 
4528 */ 4529 static int 4530 cfgloc2locator( 4531 mddb_lb_t *lbp, 4532 mddb_cfg_loc_t *clp, 4533 int li, 4534 side_t sideno, 4535 int index /* Only useful in MNsets when > 1 */ 4536 ) 4537 { 4538 uchar_t i; 4539 mddb_sidelocator_t *slp; 4540 mddb_mnsidelocator_t *mnslp; 4541 mddb_set_t *s; 4542 int mn_set = 0; 4543 mddb_mnlb_t *mnlbp; 4544 4545 if (lbp->lb_flags & MDDB_MNSET) { 4546 mnlbp = (mddb_mnlb_t *)lbp; 4547 mn_set = 1; 4548 /* 4549 * Index will be the slot that has the given sideno or 4550 * the first empty slot if no match is found. 4551 * This was pre-checked out in check locator. 4552 */ 4553 mnslp = &mnlbp->lb_mnsidelocators[index][li]; 4554 } else { 4555 slp = &lbp->lb_sidelocators[sideno][li]; 4556 } 4557 4558 /* 4559 * Look for the driver name 4560 */ 4561 for (i = 0; i < MDDB_DRVNMCNT; i++) { 4562 if (lbp->lb_drvnm[i].dn_len == 0) 4563 continue; 4564 if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver, 4565 MD_MAXDRVNM) == 0) 4566 break; 4567 } 4568 4569 /* 4570 * Didn't find one, add a new one 4571 */ 4572 if (i == MDDB_DRVNMCNT) { 4573 for (i = 0; i < MDDB_DRVNMCNT; i++) { 4574 if (lbp->lb_drvnm[i].dn_len == 0) 4575 break; 4576 } 4577 if (i == MDDB_DRVNMCNT) 4578 return (1); 4579 (void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver, 4580 MD_MAXDRVNM); 4581 lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver); 4582 } 4583 4584 /* Fill in the drvnm index */ 4585 if (mn_set) { 4586 mnslp->mnl_drvnm_index = i; 4587 mnslp->mnl_mnum = clp->l_mnum; 4588 mnslp->mnl_sideno = sideno; 4589 } else { 4590 slp->l_drvnm_index = i; 4591 slp->l_mnum = clp->l_mnum; 4592 } 4593 4594 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 4595 /* 4596 * This device id could already be associated with this index 4597 * if this is not the first side added to the set. 4598 * If device id is 0, there is no device id for this device. 
4599 */ 4600 if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0) 4601 return (0); 4602 s = (mddb_set_t *)md_set[lbp->lb_setno].s_db; 4603 if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid, 4604 clp->l_minor_name)) { 4605 return (1); 4606 } 4607 } 4608 4609 return (0); 4610 } 4611 4612 /* 4613 * See if there are mediator hosts and try to use the data. 4614 */ 4615 static int 4616 mediate( 4617 mddb_set_t *s 4618 ) 4619 { 4620 mddb_lb_t *lbp = s->s_lbp; 4621 med_data_lst_t *meddlp = NULL; 4622 med_data_lst_t *tmeddlp = NULL; 4623 med_data_t *meddp; 4624 int medok = 0; 4625 int medacc = 0; 4626 uint_t maxcc; 4627 int golden = 0; 4628 int err = 1; 4629 set_t setno = s->s_setno; 4630 4631 /* Do not have a mediator, then the state is stale */ 4632 if (s->s_med.n_cnt == 0) 4633 return (err); 4634 4635 /* Contact the mediator hosts for the data */ 4636 meddlp = get_med_host_data(&s->s_med, s->s_setname, setno); 4637 4638 /* No mediator data, stale */ 4639 if (meddlp == NULL) 4640 return (err); 4641 4642 /* Mark all the mediator data that is not for this set as errored */ 4643 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4644 struct timeval32 tmptime; 4645 meddp = tmeddlp->mdl_med; 4646 4647 /* Count the number of mediators contacted */ 4648 medacc++; 4649 4650 /* Paranoid check */ 4651 if (meddp->med_dat_sn != setno) 4652 meddp->med_dat_fl |= MED_DFL_ERROR; 4653 4654 TIMEVAL_TO_TIMEVAL32(&tmptime, &meddp->med_dat_id); 4655 4656 /*CSTYLED*/ 4657 if (timercmp(&tmptime, &lbp->lb_ident.createtime, !=)) 4658 meddp->med_dat_fl |= MED_DFL_ERROR; 4659 } 4660 4661 /* Get the max commitcount */ 4662 maxcc = 0; 4663 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4664 meddp = tmeddlp->mdl_med; 4665 if (meddp->med_dat_fl & MED_DFL_ERROR) 4666 continue; 4667 if (meddp->med_dat_cc > maxcc) 4668 maxcc = meddp->med_dat_cc; 4669 } 4670 4671 /* Now mark the records that don't have the highest cc as errored */ 4672 for (tmeddlp = meddlp; 
tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4673 meddp = tmeddlp->mdl_med; 4674 if (meddp->med_dat_fl & MED_DFL_ERROR) 4675 continue; 4676 if (meddp->med_dat_cc != maxcc) 4677 meddp->med_dat_fl |= MED_DFL_ERROR; 4678 } 4679 4680 /* Now mark the records that don't match the lb commitcnt as errored */ 4681 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4682 meddp = tmeddlp->mdl_med; 4683 if (meddp->med_dat_fl & MED_DFL_ERROR) 4684 continue; 4685 if (meddp->med_dat_cc != lbp->lb_commitcnt) 4686 meddp->med_dat_fl |= MED_DFL_ERROR; 4687 } 4688 4689 /* Is there a "golden" copy and how many valid mediators */ 4690 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4691 meddp = tmeddlp->mdl_med; 4692 if (meddp->med_dat_fl & MED_DFL_ERROR) 4693 continue; 4694 4695 if (meddp->med_dat_fl & MED_DFL_GOLDEN) 4696 golden++; 4697 4698 medok++; 4699 } 4700 4701 /* No survivors, stale */ 4702 if (medok == 0) 4703 goto out; 4704 4705 /* No mediator quorum and no golden copies, stale */ 4706 if (medacc < ((s->s_med.n_cnt / 2) + 1) && ! golden) { 4707 /* Skip odd numbers, no exact 50% */ 4708 if (s->s_med.n_cnt & 1) 4709 goto out; 4710 /* Have 50%, allow an accept */ 4711 if (medacc == (s->s_med.n_cnt / 2)) 4712 md_set_setstatus(setno, MD_SET_ACCOK); 4713 goto out; 4714 } 4715 4716 /* We either have a quorum or a golden copy, or both */ 4717 err = 0; 4718 4719 out: 4720 if (meddlp) { 4721 for (/* void */; meddlp != NULL; meddlp = tmeddlp) { 4722 tmeddlp = meddlp->mdl_nx; 4723 kmem_free(meddlp->mdl_med, sizeof (med_data_t)); 4724 kmem_free(meddlp, sizeof (med_data_lst_t)); 4725 } 4726 } 4727 4728 return (err); 4729 } 4730 4731 /* 4732 * 1. read masterblks and locator blocks for all know database locations 4733 * a. keep track of which have good master blks 4734 * b. 
keep track of which have good locators 4735 * 4736 */ 4737 static int 4738 get_mbs_n_lbs( 4739 mddb_set_t *s, 4740 int *write_lb 4741 ) 4742 { 4743 mddb_lb_t *lbp = NULL; /* pointer to locator block */ 4744 /* May be cast to mddb_mnlb_t */ 4745 /* if accessing sidenames in */ 4746 /* MN set */ 4747 mddb_did_ic_t *did_icp = NULL; /* ptr to Device ID incore */ 4748 mddb_did_blk_t *did_blkp = 0; 4749 int did_blkp_sz = 0; 4750 mddb_did_db_t *did_dbp; 4751 mddb_did_info_t *did_info; 4752 caddr_t did_block; 4753 mddb_ri_t *rip; 4754 mddb_dtag_lst_t *dtlp; 4755 mddb_locator_t *lp; 4756 daddr_t physblk; 4757 int li; 4758 uint_t blk; 4759 md_dev64_t dev; 4760 caddr_t buffer; 4761 uint_t lb_blkcnt; 4762 int retval = 0; 4763 int err = 0; 4764 int lb_ok = 0; 4765 int lb_total = 0; 4766 int lb_tagged = 0; 4767 int lb_tags; 4768 set_t setno = s->s_setno; 4769 int cont_flag, i; 4770 mddb_did_db_t *did_dbp1, *did_dbp2; 4771 int mn_set = 0; 4772 mddb_cfg_loc_t *cl; 4773 4774 /* 4775 * read in master blocks and locator block for all known locators. 4776 * lb_blkcnt will be set correctly for MN set later once getmasters 4777 * has determined that the set is a MN set. 4778 */ 4779 lb_blkcnt = ((setno == MD_LOCAL_SET) ? MDDB_LOCAL_LBCNT : MDDB_LBCNT); 4780 4781 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 4782 rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL | 4783 MDDB_F_EMASTER); 4784 rip->ri_lbp = (mddb_lb_t *)NULL; 4785 rip->ri_did_icp = (mddb_did_ic_t *)NULL; 4786 4787 /* 4788 * Translated dev is only used in calls to getmasters and 4789 * getblks which expect a translated (aka miniroot) dev. 
4790 */ 4791 dev = md_xlate_targ_2_mini(rip->ri_dev); 4792 if (dev == NODEV64) { 4793 /* Set error flag that getmasters would have set */ 4794 /* if getmasters had been allowed to fail */ 4795 rip->ri_flags |= MDDB_F_EMASTER; 4796 } 4797 4798 /* 4799 * Invalid device id on system (due to failed or 4800 * removed device) or invalid devt during upgrade 4801 * (due to powered off device) will cause this 4802 * replica to be marked in error and not used. 4803 */ 4804 if (rip->ri_flags & MDDB_F_EMASTER) 4805 continue; 4806 4807 /* get all master blocks, does mddb_devopen() */ 4808 rip->ri_mbip = getmasters(s, dev, rip->ri_blkno, 4809 &rip->ri_flags, &mn_set); 4810 4811 /* if invalid master block - try next replica */ 4812 if (! rip->ri_mbip) 4813 continue; 4814 4815 /* 4816 * If lbp alloc'd to wrong size - reset it. 4817 * If MN set, lb_blkcnt must be MDDB_MNLBCNT. 4818 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT. 4819 */ 4820 if (lbp) { 4821 if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) || 4822 ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) { 4823 kmem_free((caddr_t)lbp, dbtob(lb_blkcnt)); 4824 lbp = (mddb_lb_t *)NULL; 4825 } 4826 } 4827 4828 if (lbp == (mddb_lb_t *)NULL) { 4829 /* If a MN set, set lb_blkcnt for MN loc blk size */ 4830 if (mn_set) 4831 lb_blkcnt = MDDB_MNLBCNT; 4832 lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt), 4833 KM_SLEEP); 4834 } 4835 4836 /* 4837 * Read in all the sectors for the locator block 4838 * NOTE: Need to use getblks, rather than readblklst. 
4839 * because it is too early and things are 4840 * NOT set up yet for read*()'s 4841 */ 4842 buffer = (caddr_t)lbp; 4843 for (blk = 0; blk < lb_blkcnt; blk++) { 4844 physblk = getphysblk(blk, rip->ri_mbip); 4845 err = getblks(s, buffer, dev, physblk, 4846 btodb(MDDB_BSIZE), 0); 4847 if (err) { 4848 rip->ri_flags |= err; 4849 break; 4850 } 4851 buffer += MDDB_BSIZE; 4852 } 4853 4854 if (err) 4855 continue; 4856 4857 /* Verify the locator block */ 4858 if (blk != lb_blkcnt) 4859 continue; 4860 if (lbp->lb_magic != MDDB_MAGIC_LB) 4861 continue; 4862 if (lbp->lb_blkcnt != lb_blkcnt) 4863 continue; 4864 if (mn_set) { 4865 /* If a MN set, check for MNLB revision in lb. */ 4866 if (revchk(MDDB_REV_MNLB, lbp->lb_revision)) 4867 continue; 4868 } else { 4869 /* If not a MN set, check for LB revision in lb. */ 4870 if (revchk(MDDB_REV_LB, lbp->lb_revision)) 4871 continue; 4872 } 4873 if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL)) 4874 continue; 4875 4876 /* 4877 * With the addition of MultiNode Disksets, we must make sure 4878 * to verify that this is the correct set. A node could 4879 * have been out of the config for awhile and this disk could 4880 * have been moved to a different diskset and we don't want 4881 * to accidentally start the wrong set. 4882 * 4883 * We don't do this check if we're in the middle of 4884 * importing a set. 4885 */ 4886 if (!(md_get_setstatus(s->s_setno) & 4887 (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) && 4888 (lbp->lb_setno != s->s_setno)) 4889 continue; 4890 4891 rip->ri_flags |= MDDB_F_LOCACC; 4892 4893 /* 4894 * a commit count of zero means this locator has been deleted 4895 */ 4896 if (lbp->lb_commitcnt == 0) 4897 continue; 4898 4899 /* 4900 * If replica is in the device ID style and md_devid_destroy 4901 * flag is set, turn off device id style. This is only to be 4902 * used in a catastrophic failure case. 
Examples would be 4903 * where the device id of all drives in the system 4904 * (especially the mirror'd root drives) had been changed 4905 * by firmware upgrade or by a patch to an existing disk 4906 * driver. Another example would be in the case of non-unique 4907 * device ids due to a bug. The device id would be valid on 4908 * the system, but would return the wrong dev_t. 4909 */ 4910 if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) { 4911 lbp->lb_flags &= ~MDDB_DEVID_STYLE; 4912 lbp->lb_didfirstblk = 0; 4913 lbp->lb_didblkcnt = 0; 4914 *write_lb = 1; 4915 } 4916 4917 4918 /* 4919 * If replica is in device ID style, read in device ID 4920 * block and verify device ID block information. 4921 */ 4922 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 4923 4924 /* Read in device ID block */ 4925 if (did_icp == NULL) { 4926 did_icp = (mddb_did_ic_t *) 4927 kmem_zalloc(sizeof (mddb_did_ic_t), 4928 KM_SLEEP); 4929 } else { 4930 /* Reuse did_icp, but clear out data */ 4931 if (did_icp->did_ic_blkp != 4932 (mddb_did_blk_t *)NULL) { 4933 kmem_free((caddr_t)did_icp->did_ic_blkp, 4934 did_blkp_sz); 4935 did_blkp = (mddb_did_blk_t *)NULL; 4936 did_icp->did_ic_blkp = 4937 (mddb_did_blk_t *)NULL; 4938 } 4939 if (did_icp->did_ic_dbp != 4940 (mddb_did_db_t *)NULL) { 4941 did_dbp1 = did_icp->did_ic_dbp; 4942 while (did_dbp1) { 4943 did_dbp2 = did_dbp1->db_next; 4944 kmem_free((caddr_t) 4945 did_dbp1->db_ptr, 4946 dbtob(did_dbp1->db_blkcnt)); 4947 kmem_free((caddr_t)did_dbp1, 4948 sizeof (mddb_did_db_t)); 4949 did_dbp1 = did_dbp2; 4950 } 4951 did_icp->did_ic_dbp = 4952 (mddb_did_db_t *)NULL; 4953 } 4954 for (i = 0; i < MDDB_NLB; i++) { 4955 did_icp->did_ic_devid[i] = 4956 (ddi_devid_t)NULL; 4957 } 4958 } 4959 4960 /* Can't reuse blkp since size could be different */ 4961 if (did_blkp != (mddb_did_blk_t *)NULL) { 4962 kmem_free(did_blkp, did_blkp_sz); 4963 } 4964 did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt); 4965 did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz, 4966 
KM_SLEEP); 4967 did_icp->did_ic_blkp = did_blkp; 4968 buffer = (caddr_t)did_blkp; 4969 for (blk = lbp->lb_didfirstblk; 4970 blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk); 4971 blk++) { 4972 physblk = getphysblk(blk, rip->ri_mbip); 4973 err = getblks(s, buffer, dev, physblk, 4974 btodb(MDDB_BSIZE), 0); 4975 if (err) { 4976 rip->ri_flags |= err; 4977 break; 4978 } 4979 buffer += MDDB_BSIZE; 4980 } 4981 if (err) 4982 continue; 4983 4984 /* Verify the Device ID block */ 4985 if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk)) 4986 continue; 4987 if (did_blkp->blk_magic != MDDB_MAGIC_DI) 4988 continue; 4989 if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS) 4990 continue; 4991 if (revchk(MDDB_REV_DI, did_blkp->blk_revision)) 4992 continue; 4993 if (crcchk(did_blkp, &did_blkp->blk_checksum, 4994 dbtob(lbp->lb_didblkcnt), NULL)) 4995 continue; 4996 4997 /* 4998 * Check if device ID block is out of sync with the 4999 * Locator Block by checking if the locator block 5000 * commitcnt does not match the device id block 5001 * commitcnt. If an 'out of sync' condition 5002 * exists, discard this replica since it has 5003 * inconsistent data and can't be used in 5004 * determining the best replica. 5005 * 5006 * An 'out of sync' condition could happen if old 5007 * SDS code was running with new devid style replicas 5008 * or if a failure occurred between the writing of 5009 * the locator block's commitcnt and the device 5010 * id block's commitcnt. 5011 * 5012 * If old SDS code had been running, the upgrade 5013 * process should detect this situation and 5014 * have removed all of the device id information 5015 * via the md_devid_destroy flag in md.conf. 5016 */ 5017 if (did_blkp->blk_commitcnt != 5018 lbp->lb_commitcnt) { 5019 continue; 5020 } 5021 } 5022 5023 5024 /* 5025 * If replica is still in device ID style, read in all 5026 * of the device IDs, verify the checksum of the device IDs. 
5027 */ 5028 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 5029 /* 5030 * Reset valid bit in device id info block flags. This 5031 * flag is stored on disk, but the valid bit is reset 5032 * when reading in the replica. If the corresponding 5033 * device id is valid (aka meaning that the system 5034 * knows about this device id), the valid bit will 5035 * be set at a later time. The valid bit for this 5036 * replica's device ID will be set in this routine. 5037 * The valid bits for the rest of the device id's 5038 * will be set after the 'best' replica has 5039 * been selected in routine load_old_replicas. 5040 * Reset updated bit in device id info block flags. 5041 * This flag is also stored on disk, reset when read 5042 * in and set when the locators and side locators 5043 * have been updated to match this valid device 5044 * id information. 5045 */ 5046 for (li = 0; li < lbp->lb_loccnt; li++) { 5047 did_info = &did_blkp->blk_info[li]; 5048 if (did_info->info_flags & MDDB_DID_EXISTS) 5049 did_info->info_flags &= 5050 ~(MDDB_DID_VALID | 5051 MDDB_DID_UPDATED); 5052 } 5053 5054 cont_flag = 0; 5055 for (li = 0; li < lbp->lb_loccnt; li++) { 5056 did_info = &did_blkp->blk_info[li]; 5057 did_block = (caddr_t)NULL; 5058 if (did_info->info_flags & MDDB_DID_EXISTS) { 5059 /* 5060 * Check if block has 5061 * already been read in 5062 */ 5063 did_dbp = did_icp->did_ic_dbp; 5064 while (did_dbp != 0) { 5065 if (did_dbp->db_firstblk == 5066 did_info->info_firstblk) 5067 break; 5068 else 5069 did_dbp = 5070 did_dbp->db_next; 5071 } 5072 /* if block not found, read it in */ 5073 if (did_dbp == NULL) { 5074 did_block = (caddr_t) 5075 (kmem_zalloc(dbtob( 5076 did_info->info_blkcnt), 5077 KM_SLEEP)); 5078 buffer = (caddr_t)did_block; 5079 for (blk = 5080 did_info->info_firstblk; 5081 blk < (did_info-> 5082 info_firstblk + 5083 did_info->info_blkcnt); 5084 blk++) { 5085 physblk = 5086 getphysblk(blk, 5087 rip->ri_mbip); 5088 err = getblks(s, 5089 buffer, dev, 5090 physblk, btodb( 5091 
MDDB_BSIZE), 0); 5092 if (err) { 5093 rip->ri_flags |= 5094 err; 5095 break; 5096 } 5097 buffer += MDDB_BSIZE; 5098 } 5099 if (err) { 5100 kmem_free(did_block, 5101 dbtob(did_info-> 5102 info_blkcnt)); 5103 did_block = 5104 (caddr_t)NULL; 5105 cont_flag = 1; 5106 break; 5107 } 5108 5109 /* 5110 * Block read in - 5111 * alloc Disk Block area 5112 */ 5113 did_dbp = (mddb_did_db_t *) 5114 kmem_zalloc( 5115 sizeof (mddb_did_db_t), 5116 KM_SLEEP); 5117 did_dbp->db_ptr = did_block; 5118 did_dbp->db_firstblk = 5119 did_info->info_firstblk; 5120 did_dbp->db_blkcnt = 5121 did_info->info_blkcnt; 5122 5123 /* Add to front of dbp list */ 5124 did_dbp->db_next = 5125 did_icp->did_ic_dbp; 5126 did_icp->did_ic_dbp = did_dbp; 5127 } 5128 /* Check validity of devid in block */ 5129 if (crcchk(((char *)did_dbp->db_ptr + 5130 did_info->info_offset), 5131 &did_info->info_checksum, 5132 did_info->info_length, NULL)) { 5133 cont_flag = 1; 5134 break; 5135 } 5136 5137 /* Block now pointed to by did_dbp */ 5138 did_icp->did_ic_devid[li] = 5139 (ddi_devid_t)((char *) 5140 did_dbp->db_ptr + 5141 did_info->info_offset); 5142 } 5143 } 5144 if (cont_flag) 5145 continue; 5146 } 5147 5148 /* 5149 * All blocks containing devids are now in core. 5150 */ 5151 5152 /* 5153 * If we're doing a replicated import (also known as 5154 * remote copy import), the device id in the locator 5155 * block is incorrect and we need to fix it up here 5156 * alongwith the l_dev otherwise we run into lots of 5157 * trouble later on. 
5158 */ 5159 if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 5160 mddb_ri_t *trip; 5161 for (li = 0; li < lbp->lb_loccnt; li++) { 5162 did_info = &did_blkp->blk_info[li]; 5163 lp = &lbp->lb_locators[li]; 5164 5165 if (lp->l_flags & MDDB_F_DELETED) 5166 continue; 5167 5168 if (!(did_info->info_flags & MDDB_DID_EXISTS)) 5169 continue; 5170 5171 if (did_icp->did_ic_devid[li] == NULL) 5172 continue; 5173 5174 for (trip = s->s_rip; trip != NULL; 5175 trip = trip->ri_next) { 5176 if (trip->ri_old_devid == NULL) 5177 continue; 5178 if (ddi_devid_compare( 5179 trip->ri_old_devid, 5180 did_icp->did_ic_devid[li]) != 0) { 5181 continue; 5182 } 5183 5184 /* update l_dev and side mnum */ 5185 lp->l_dev = md_cmpldev(trip->ri_dev); 5186 lbp->lb_sidelocators[0][li].l_mnum = 5187 md_getminor(trip->ri_dev); 5188 } 5189 } 5190 } 5191 5192 /* 5193 * If there is a valid devid, verify that this locator 5194 * block has information about itself by checking the 5195 * device ID, minor_name and block 5196 * number from this replica's incore data structure 5197 * against the locator block information that has just 5198 * been read in from disk. 5199 * 5200 * If not a valid devid, verify that this locator block 5201 * has information about itself by checking the minor 5202 * number, block number and driver name from this 5203 * replica's incore data structure against the locator 5204 * block information that has just been read in from disk. 5205 */ 5206 if ((rip->ri_devid != NULL) && 5207 (lbp->lb_flags & MDDB_DEVID_STYLE)) { 5208 /* 5209 * This locator block MUST have locator (replica) 5210 * information about itself. Check against devid, 5211 * slice part of minor number, and block number. 
5212 */ 5213 for (li = 0; li < lbp->lb_loccnt; li++) { 5214 did_info = &did_blkp->blk_info[li]; 5215 lp = &lbp->lb_locators[li]; 5216 if (lp->l_flags & MDDB_F_DELETED) 5217 continue; 5218 5219 if (!(did_info->info_flags & MDDB_DID_EXISTS)) 5220 continue; 5221 5222 if (((md_get_setstatus(setno) & 5223 MD_SET_REPLICATED_IMPORT)) && 5224 (rip->ri_old_devid != (ddi_devid_t)NULL)) { 5225 if (ddi_devid_compare(rip->ri_old_devid, 5226 did_icp->did_ic_devid[li]) != 0) 5227 continue; 5228 } else { 5229 if (ddi_devid_compare(rip->ri_devid, 5230 did_icp->did_ic_devid[li]) != 0) 5231 continue; 5232 } 5233 5234 if (strcmp(rip->ri_minor_name, 5235 did_info->info_minor_name) != 0) 5236 continue; 5237 5238 if (lp->l_blkno == rip->ri_blkno) 5239 break; 5240 } 5241 } else { 5242 /* 5243 * This locator block MUST have locator (replica) 5244 * information about itself. 5245 */ 5246 if (!mn_set) { 5247 for (li = 0; li < lbp->lb_loccnt; li++) { 5248 mddb_drvnm_t *dn; 5249 mddb_sidelocator_t *slp; 5250 5251 lp = &lbp->lb_locators[li]; 5252 slp = &lbp-> 5253 lb_sidelocators[s->s_sideno][li]; 5254 if (lp->l_flags & MDDB_F_DELETED) 5255 continue; 5256 if (slp->l_mnum != md_getminor( 5257 rip->ri_dev)) 5258 continue; 5259 if (lp->l_blkno != rip->ri_blkno) 5260 continue; 5261 dn = &lbp->lb_drvnm[slp->l_drvnm_index]; 5262 if (strncmp(dn->dn_data, 5263 rip->ri_driver, MD_MAXDRVNM) == 0) 5264 break; 5265 } 5266 } else { 5267 for (li = 0; li < lbp->lb_loccnt; li++) { 5268 mddb_drvnm_t *dn; 5269 mddb_mnsidelocator_t *mnslp; 5270 mddb_mnlb_t *mnlbp; 5271 int i; 5272 5273 /* 5274 * Check all possible locators locking 5275 * for match to the currently read-in 5276 * locator, must match on: 5277 * - blkno 5278 * - side locator for this 5279 * node's side 5280 * - side locator minor number 5281 * - side locator driver name 5282 */ 5283 5284 /* 5285 * Looking at sidelocs: 5286 * cast lbp -> mnlbp 5287 */ 5288 mnlbp = (mddb_mnlb_t *)lbp; 5289 lp = &mnlbp->lb_locators[li]; 5290 if (lp->l_flags & 
MDDB_F_DELETED) 5291 continue; 5292 if (lp->l_blkno != rip->ri_blkno) 5293 continue; 5294 5295 for (i = 0; i < MD_MNMAXSIDES; i++) { 5296 mnslp = &mnlbp-> 5297 lb_mnsidelocators[i][li]; 5298 if (mnslp->mnl_sideno == 5299 s->s_sideno) { 5300 break; 5301 } 5302 } 5303 /* No matching side found */ 5304 if (i == MD_MNMAXSIDES) 5305 continue; 5306 if (mnslp->mnl_mnum != 5307 md_getminor(rip->ri_dev)) 5308 continue; 5309 dn = &lbp-> 5310 lb_drvnm[mnslp->mnl_drvnm_index]; 5311 if (strncmp(dn->dn_data, 5312 rip->ri_driver, MD_MAXDRVNM) == 0) 5313 break; 5314 } 5315 } 5316 } 5317 5318 /* 5319 * Didn't find ourself in this locator block it means 5320 * the locator block is a stale transplant. Probably from 5321 * a user doing a dd. 5322 */ 5323 if (li == lbp->lb_loccnt) 5324 continue; 5325 5326 /* 5327 * Keep track of the number of accessed and valid 5328 * locator blocks. 5329 */ 5330 lb_ok++; 5331 5332 /* 5333 * Read the tag in, skips invalid or blank tags. 5334 * Only valid tags allocate storage 5335 * Data tags are not used in MN disksets. 5336 */ 5337 if ((!mn_set) && (! dt_read(s, lbp, rip))) { 5338 /* 5339 * Keep track of the number of tagged 5340 * locator blocks. 5341 */ 5342 lb_tagged++; 5343 5344 /* Keep a list of unique tags. */ 5345 (void) dtl_addl(s, &rip->ri_dtp->dt_dtag); 5346 } 5347 5348 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 5349 /* 5350 * go through locator block and add any other 5351 * locations of the data base. 
5352 * For the replicated import case, this was done earlier 5353 * and we really don't need or want to do so again 5354 */ 5355 cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP); 5356 for (li = 0; li < lbp->lb_loccnt; li++) { 5357 lp = &lbp->lb_locators[li]; 5358 if (lp->l_flags & MDDB_F_DELETED) 5359 continue; 5360 5361 cl->l_devid_flags = MDDB_DEVID_GETSZ; 5362 cl->l_devid = (uint64_t)0; 5363 cl->l_devid_sz = 0; 5364 cl->l_old_devid = (uint64_t)0; 5365 cl->l_old_devid_sz = 0; 5366 cl->l_minor_name[0] = '\0'; 5367 locator2cfgloc(lbp, cl, li, s->s_sideno, 5368 did_icp); 5369 5370 if (cl->l_devid_flags & MDDB_DEVID_SZ) { 5371 if ((cl->l_devid = (uintptr_t)kmem_alloc 5372 (cl->l_devid_sz, KM_SLEEP)) 5373 == NULL) { 5374 continue; 5375 } else { 5376 cl->l_devid_flags = 5377 MDDB_DEVID_SPACE; 5378 } 5379 } 5380 locator2cfgloc(lbp, cl, li, s->s_sideno, 5381 did_icp); 5382 5383 (void) ridev(&s->s_rip, cl, &lp->l_dev, 0); 5384 5385 if (cl->l_devid_flags & MDDB_DEVID_SPACE) 5386 kmem_free((caddr_t)(uintptr_t) 5387 cl->l_devid, cl->l_devid_sz); 5388 } 5389 kmem_free(cl, sizeof (mddb_cfg_loc_t)); 5390 } 5391 5392 /* Save LB for later */ 5393 rip->ri_lbp = lbp; 5394 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 5395 rip->ri_did_icp = did_icp; 5396 did_icp = (mddb_did_ic_t *)NULL; 5397 did_blkp = (mddb_did_blk_t *)NULL; 5398 } else 5399 rip->ri_did_icp = NULL; 5400 lbp = (mddb_lb_t *)NULL; 5401 } 5402 5403 if (lbp != (mddb_lb_t *)NULL) 5404 kmem_free((caddr_t)lbp, dbtob(lb_blkcnt)); 5405 5406 if (did_icp != (mddb_did_ic_t *)NULL) { 5407 if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) { 5408 kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz); 5409 did_blkp = (mddb_did_blk_t *)NULL; 5410 } 5411 if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) { 5412 mddb_did_db_t *did_dbp1, *did_dbp2; 5413 5414 did_dbp1 = did_icp->did_ic_dbp; 5415 while (did_dbp1) { 5416 did_dbp2 = did_dbp1->db_next; 5417 kmem_free((caddr_t)did_dbp1->db_ptr, 5418 dbtob(did_dbp1->db_blkcnt)); 5419 
kmem_free((caddr_t)did_dbp1, 5420 sizeof (mddb_did_db_t)); 5421 did_dbp1 = did_dbp2; 5422 } 5423 } 5424 kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t)); 5425 } 5426 5427 if (did_blkp != (mddb_did_blk_t *)NULL) { 5428 kmem_free((caddr_t)did_blkp, did_blkp_sz); 5429 } 5430 5431 /* No locator blocks were ok */ 5432 if (lb_ok == 0) 5433 goto out; 5434 5435 /* No tagged data was found - will be 0 for MN diskset */ 5436 if (lb_tagged == 0) 5437 goto out; 5438 5439 /* Find the highest non-deleted replica count */ 5440 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5441 int lb_tot = 0; 5442 5443 if (rip->ri_mbip == (mddb_mb_ic_t *)NULL) 5444 continue; 5445 5446 if (rip->ri_lbp == (mddb_lb_t *)NULL) 5447 continue; 5448 5449 for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) { 5450 lp = &rip->ri_lbp->lb_locators[li]; 5451 if (lp->l_flags & MDDB_F_DELETED) 5452 continue; 5453 lb_tot++; 5454 } 5455 5456 if (lb_tot > lb_total) 5457 lb_total = lb_tot; 5458 } 5459 5460 /* Count the number of unique tags */ 5461 for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx) 5462 lb_tags++; 5463 5464 /* Should have at least one tag at this point */ 5465 ASSERT(lb_tags > 0); 5466 5467 5468 /* 5469 * If the number of tagged locators is not the same as the number of 5470 * OK locators OR more than one tag exists, then make sure the 5471 * selected tag will be written out later. 5472 */ 5473 if ((lb_tagged - lb_ok) != 0 || lb_tags > 1) 5474 md_set_setstatus(setno, MD_SET_TAGDATA); 5475 5476 /* Only a single tag, take the tagged data */ 5477 if (lb_tags == 1) { 5478 dt_setup(s, &s->s_dtlp->dtl_dt); 5479 md_set_setstatus(setno, MD_SET_USETAG); 5480 goto out; 5481 } 5482 5483 /* Multiple tags, not selecting a tag, tag mode is on */ 5484 if (! (md_get_setstatus(setno) & MD_SET_USETAG)) 5485 retval = MDDB_E_TAGDATA; 5486 5487 out: 5488 5489 return (retval); 5490 } 5491 5492 /* 5493 * 1. Select a locator. 5494 * 2. check if enough locators now have current copies 5495 * 3. 
read in database from one of latest 5496 * 4. if known to have latest make all database the same 5497 * 5. if configuration has changed rewrite locators 5498 * 5499 * Parameters: 5500 * s - pointer to mddb_set structure 5501 * flag - used in MN disksets to tell if this node is being joined to 5502 * a diskset that is in the STALE state. If the flag is 5503 * MDDB_MN_STALE, then this node should be marked in the STALE 5504 * state even if > 50% mddbs are available. (The diskset can 5505 * only change from STALE->OK if all nodes withdraw from the 5506 * MN diskset and then rejoin). 5507 */ 5508 static int 5509 load_old_replicas( 5510 mddb_set_t *s, 5511 int flag 5512 ) 5513 { 5514 mddb_lb_t *lbp = NULL; 5515 mddb_mnlb_t *mnlbp = NULL; 5516 mddb_ri_t *rip; 5517 mddb_locator_t *lp; 5518 mddb_db_t *dbp; 5519 mddb_de_ic_t *dep; 5520 int li; 5521 int alc; 5522 int lc; 5523 int tlc; 5524 int retval = 0; 5525 caddr_t p; 5526 size_t maxrecsize; 5527 set_t setno = s->s_setno; 5528 mddb_did_db_t *did_dbp1; 5529 mddb_did_info_t *did_info; 5530 mddb_did_ic_t *did_icp = NULL; 5531 md_dev64_t *newdev; 5532 mddb_sidelocator_t *slp = 0; 5533 mddb_mnsidelocator_t *mnslp = 0; 5534 uchar_t i; 5535 char *name; 5536 ddi_devid_t ret_devid; 5537 md_dev64_t dev; 5538 uint_t len, sz; 5539 char *minor_name; 5540 int write_lb = 0; 5541 int rval; 5542 int stale_rtn = 0; 5543 5544 /* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */ 5545 if (retval = get_mbs_n_lbs(s, &write_lb)) 5546 goto errout; 5547 5548 if ((lbp = s->s_lbp = selectlocator(s)) == NULL) { 5549 retval = MDDB_E_NOLOCBLK; 5550 goto errout; 5551 } 5552 5553 /* If a multi-node set, then set md_set.s_status flag */ 5554 if (lbp->lb_flags & MDDB_MNSET) { 5555 md_set_setstatus(setno, MD_SET_MNSET); 5556 /* 5557 * If data tag area had been allocated before set type was 5558 * known - free it now. 
5559 */ 5560 if (md_set[setno].s_dtp) { 5561 kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 5562 md_set[setno].s_dtp = NULL; 5563 } 5564 } 5565 5566 /* 5567 * If the replica is in devid format, setup the devid incore ptr. 5568 */ 5569 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 5570 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5571 if (rip->ri_lbp == s->s_lbp) { 5572 did_icp = s->s_did_icp = rip->ri_did_icp; 5573 break; 5574 } 5575 } 5576 /* 5577 * If no devid incore info found - something has gone 5578 * wrong so errout. 5579 */ 5580 if (rip == NULL) { 5581 retval = MDDB_E_NODEVID; 5582 goto errout; 5583 } 5584 5585 /* 5586 * Add all blocks containing devids to free list. 5587 * Then remove addresses that actually contain devids. 5588 */ 5589 did_dbp1 = did_icp->did_ic_dbp; 5590 while (did_dbp1) { 5591 if (mddb_devid_free_add(s, did_dbp1->db_firstblk, 5592 0, dbtob(did_dbp1->db_blkcnt))) { 5593 retval = MDDB_E_NOSPACE; 5594 goto errout; 5595 } 5596 5597 did_dbp1 = did_dbp1->db_next; 5598 } 5599 for (li = 0; li < lbp->lb_loccnt; li++) { 5600 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5601 if (!(did_info->info_flags & MDDB_DID_EXISTS)) 5602 continue; 5603 5604 if (mddb_devid_free_delete(s, did_info->info_firstblk, 5605 did_info->info_offset, did_info->info_length)) { 5606 /* unable to find disk block */ 5607 retval = MDDB_E_NODEVID; 5608 goto errout; 5609 } 5610 } 5611 } 5612 5613 /* 5614 * create mddb_mbaray, count all locators and active locators. 5615 */ 5616 alc = 0; 5617 lc = 0; 5618 for (li = 0; li < lbp->lb_loccnt; li++) { 5619 ddi_devid_t li_devid; 5620 5621 lp = &lbp->lb_locators[li]; 5622 5623 if (lp->l_flags & MDDB_F_DELETED) 5624 continue; 5625 5626 /* Count non-deleted replicas */ 5627 lc++; 5628 5629 /* 5630 * Use the devid of this locator to compare with the rip 5631 * list. 
The scenario to watch out for here is that this 5632 * locator could be on a disk that is dead and there could 5633 * be a valid entry in the rip list for a different disk 5634 * that has been moved to the dead disks dev_t. We don't 5635 * want to match with the moved disk. 5636 */ 5637 li_devid = NULL; 5638 (void) mddb_devid_get(s, li, &li_devid, &minor_name); 5639 5640 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5641 if (match_mddb(rip, li_devid, minor_name, 5642 md_expldev(lp->l_dev), lp->l_blkno)) { 5643 break; 5644 } 5645 } 5646 if (rip == NULL) { 5647 /* 5648 * If rip not found, then mark error in master block 5649 * so that no writes are later attempted to this 5650 * replica. rip may not be setup if ridev 5651 * failed due to un-found driver name. 5652 */ 5653 lp->l_flags |= MDDB_F_EMASTER; 5654 continue; 5655 } 5656 5657 s->s_mbiarray[li] = rip->ri_mbip; 5658 5659 lp->l_flags &= MDDB_F_ACTIVE; 5660 lp->l_flags |= (int)rip->ri_flags; 5661 5662 if (rip->ri_transplant) 5663 lp->l_flags &= ~MDDB_F_ACTIVE; 5664 5665 if (lp->l_flags & MDDB_F_LOCACC) 5666 alc++; 5667 } 5668 5669 /* Save on a divide - calculate 50% + 1 up front */ 5670 tlc = ((lc + 1) / 2); 5671 5672 if (alc > tlc) { /* alc > tlc - OK */ 5673 md_clr_setstatus(setno, MD_SET_STALE); 5674 } else if (alc < tlc) { /* alc < tlc - stale */ 5675 md_set_setstatus(setno, MD_SET_STALE); 5676 } else if (lc & 1) { /* alc == tlc && odd - OK */ 5677 md_clr_setstatus(setno, MD_SET_STALE); 5678 } else { /* alc == tlc && even - ? 
*/ 5679 /* Can do an accept, and are */ 5680 if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) { 5681 md_clr_setstatus(setno, MD_SET_STALE); 5682 } else { /* possibly has a mediator */ 5683 if (mediate(s)) { 5684 md_set_setstatus(setno, MD_SET_STALE); 5685 } else { 5686 md_clr_setstatus(setno, MD_SET_STALE); 5687 } 5688 } 5689 5690 /* 5691 * The mirrored_root_flag allows the sysadmin to decide to 5692 * start the local set in a read/write (non-stale) mode 5693 * when there are only 50% available mddbs on the system and 5694 * when the root file system is on a mirror. This is useful 5695 * in a 2 disk system where 1 disk failure would cause an mddb 5696 * quorum failure and subsequent boot failures since the root 5697 * filesystem would be in a read-only state. 5698 */ 5699 if (mirrored_root_flag == 1 && setno == 0 && 5700 svm_bootpath[0] != 0) { 5701 md_clr_setstatus(setno, MD_SET_STALE); 5702 } else { 5703 if (md_get_setstatus(setno) & MD_SET_STALE) { 5704 /* Allow half mode - CAREFUL! */ 5705 if (mddb_allow_half) 5706 md_clr_setstatus(setno, MD_SET_STALE); 5707 } 5708 } 5709 5710 /* 5711 * In a MN diskset, 5712 * - if 50% mddbs are unavailable and this 5713 * has been marked STALE above 5714 * - master node isn't in the STALE state 5715 * - this node isn't the master node (this node 5716 * isn't the first node to join the set) 5717 * then clear the STALE state and set TOOFEW. 5718 * 5719 * If this node is the master node and set was marked STALE, 5720 * then the set stays STALE. 5721 * 5722 * If this node is not the master and this node's state is 5723 * STALE and the master node is not marked STALE, 5724 * then master node must be in the TOOFEW state or the 5725 * master is panic'ing. A MN diskset can only be placed into 5726 * the STALE state by having the first node join the set 5727 * with <= 50% mddbs. 
There's no way for a MN diskset to 5728 * transition between STALE and not-STALE states unless all 5729 * nodes are withdrawn from the diskset or all nodes in the 5730 * diskset are rebooted at the same time. 5731 * 5732 * So, mark this node's state as TOOFEW instead of STALE. 5733 */ 5734 if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE)) 5735 == (MD_SET_MNSET | MD_SET_STALE)) && 5736 ((flag & MDDB_MN_STALE) == 0) && 5737 (!(md_set[setno].s_am_i_master))) { 5738 md_clr_setstatus(setno, MD_SET_STALE); 5739 md_set_setstatus(setno, MD_SET_TOOFEW); 5740 } 5741 } 5742 5743 /* 5744 * If a MN set is marked STALE on the other nodes, 5745 * mark it stale here. Override all other considerations 5746 * such as a mediator or > 50% mddbs available. 5747 */ 5748 if (md_get_setstatus(setno) & MD_SET_MNSET) { 5749 if (flag & MDDB_MN_STALE) 5750 md_set_setstatus(setno, MD_SET_STALE); 5751 } 5752 5753 /* 5754 * read a good copy of the locator names 5755 * if an error occurs reading what is suppose 5756 * to be a good copy continue looking for another 5757 * good copy 5758 */ 5759 s->s_lnp = NULL; 5760 for (li = 0; li < lbp->lb_loccnt; li++) { 5761 lp = &lbp->lb_locators[li]; 5762 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 5763 (lp->l_flags & MDDB_F_EMASTER)) 5764 continue; 5765 5766 /* Find rip entry for this locator if one exists */ 5767 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5768 if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev), 5769 lp->l_blkno)) 5770 break; 5771 } 5772 5773 if (rip == NULL) { 5774 continue; 5775 } 5776 5777 /* 5778 * Use the rip commitcnt since the commitcnt in lbp could 5779 * been cleared by selectlocator. Looking for a replica with 5780 * the same commitcnt as the 'golden' copy in order to 5781 * get the same data. 
5782 */ 5783 if (rip->ri_commitcnt != lbp->lb_commitcnt) { 5784 continue; 5785 } 5786 5787 /* 5788 * Now have a copy of the database that is equivalent 5789 * to the chosen locator block with respect to 5790 * inittime, identifier and commitcnt. Trying the 5791 * equivalent databases in the order that they were 5792 * written will provide the most up to date data. 5793 */ 5794 lp->l_flags |= readlocnames(s, li); 5795 if (s->s_lnp) 5796 break; 5797 } 5798 5799 if (s->s_lnp == NULL) { 5800 retval = MDDB_E_NOLOCNMS; 5801 goto errout; 5802 } 5803 5804 /* 5805 * read a good copy of the data base 5806 * if an error occurs reading what is suppose 5807 * to be a good copy continue looking for another 5808 * good copy 5809 */ 5810 5811 s->s_dbp = NULL; 5812 for (li = 0; li < lbp->lb_loccnt; li++) { 5813 lp = &lbp->lb_locators[li]; 5814 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 5815 (lp->l_flags & MDDB_F_EMASTER)) 5816 continue; 5817 5818 /* Find rip entry for this locator if one exists */ 5819 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5820 if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev), 5821 lp->l_blkno)) 5822 break; 5823 } 5824 5825 if (rip == NULL) { 5826 continue; 5827 } 5828 5829 /* 5830 * Use the rip commitcnt since the commitcnt in lbp could 5831 * been cleared by selectlocator. Looking for a replica with 5832 * the same commitcnt as the 'golden' copy in order to 5833 * get the same data. 5834 */ 5835 if (rip->ri_commitcnt != lbp->lb_commitcnt) { 5836 continue; 5837 } 5838 5839 /* 5840 * Now have a copy of the database that is equivalent 5841 * to the chosen locator block with respect to 5842 * inittime, identifier and commitcnt. Trying the 5843 * equivalent databases in the order that they were 5844 * written will provide the most up to date data. 
5845 */ 5846 lp->l_flags |= readcopy(s, li); 5847 5848 if (s->s_dbp) 5849 break; 5850 } 5851 5852 if (s->s_dbp == NULL) { 5853 retval = MDDB_E_NODIRBLK; 5854 goto errout; 5855 } 5856 5857 lp->l_flags |= MDDB_F_MASTER; 5858 lp->l_flags |= MDDB_F_UP2DATE; 5859 5860 /* 5861 * go through and find largest record; 5862 * Also fixup the user data area's 5863 */ 5864 maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size); 5865 5866 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) 5867 for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) 5868 if (dep->de_flags & MDDB_F_OPT) 5869 getoptrecord(s, dep); 5870 else { 5871 allocuserdata(dep); 5872 maxrecsize = MAX(dep->de_recsize, maxrecsize); 5873 } 5874 5875 if (maxrecsize > s->s_databuffer_size) { 5876 p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP); 5877 if (s->s_databuffer_size) 5878 kmem_free(s->s_databuffer, s->s_databuffer_size); 5879 s->s_databuffer = p; 5880 s->s_databuffer_size = maxrecsize; 5881 } 5882 5883 /* If we can clear the tag data record, do it now. */ 5884 /* Data tags not supported on MN sets */ 5885 if ((md_get_setstatus(setno) & MD_SET_CLRTAG) && 5886 (!(md_get_setstatus(setno) & MD_SET_MNSET))) 5887 dt_setup(s, NULL); 5888 5889 /* This will return non-zero if STALE or TOOFEW */ 5890 /* This will write out chosen replica image to all replicas */ 5891 stale_rtn = selectreplicas(s, MDDB_SCANALL); 5892 5893 if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 5894 ddi_devid_t devidptr; 5895 5896 /* 5897 * ignore the return value from selectreplicas because we 5898 * may have a STALE or TOOFEW set in the case of a partial 5899 * replicated diskset. We will fix that up later. 
5900 */ 5901 5902 lbp = s->s_lbp; 5903 for (li = 0; li < lbp->lb_loccnt; li++) { 5904 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5905 5906 if (did_info->info_flags & MDDB_DID_EXISTS) { 5907 devidptr = s->s_did_icp->did_ic_devid[li]; 5908 lp = &lbp->lb_locators[li]; 5909 for (rip = s->s_rip; rip != NULL; 5910 rip = rip->ri_next) { 5911 if (rip->ri_old_devid == 0) 5912 continue; 5913 if (ddi_devid_compare(rip->ri_old_devid, 5914 devidptr) != 0) { 5915 continue; 5916 } 5917 if (update_locatorblock(s, 5918 md_expldev(lp->l_dev), 5919 rip->ri_devid, rip->ri_old_devid)) { 5920 goto errout; 5921 } 5922 } 5923 } 5924 } 5925 } else { 5926 if (stale_rtn) 5927 goto errout; 5928 } 5929 5930 /* 5931 * If the replica is in device id style - validate the device id's, 5932 * if present, in the locator block devid area. 5933 */ 5934 newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP); 5935 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 5936 for (li = 0; li < lbp->lb_loccnt; li++) { 5937 newdev[li] = 0; 5938 lp = &lbp->lb_locators[li]; 5939 if (lp->l_flags & MDDB_F_DELETED) 5940 continue; 5941 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5942 dev = md_expldev(lp->l_dev); 5943 if (did_info->info_flags & MDDB_DID_EXISTS) { 5944 /* Validate device id on current system */ 5945 newdev[li] = dev; 5946 if (mddb_devid_validate( 5947 did_icp->did_ic_devid[li], 5948 &(newdev[li]), 5949 did_info->info_minor_name) == 0) { 5950 /* Set valid flag */ 5951 did_info->info_flags |= MDDB_DID_VALID; 5952 } else { 5953 lp->l_flags |= MDDB_F_EMASTER; 5954 } 5955 } else if (!(MD_UPGRADE)) { 5956 /* 5957 * If a device doesn't have a device id, 5958 * check if there is now a device ID 5959 * associated with device. If one exists, 5960 * add it to the locator block devid area. 5961 * If there's not enough space to add it, 5962 * print a warning. 5963 * Don't do this during upgrade. 
5964 */ 5965 dev_t ddi_dev = md_dev64_to_dev(dev); 5966 if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == 5967 DDI_SUCCESS) { 5968 if (ddi_lyr_get_minor_name(ddi_dev, 5969 S_IFBLK, &minor_name) 5970 == DDI_SUCCESS) { 5971 if (mddb_devid_add(s, li, 5972 ret_devid, minor_name)) { 5973 cmn_err(CE_WARN, 5974 "Not enough space" 5975 " in metadevice" 5976 " state" 5977 " database\n"); 5978 cmn_err(CE_WARN, 5979 "to add relocation" 5980 " information for" 5981 " device:\n"); 5982 cmn_err(CE_WARN, 5983 " major = %d, " 5984 " minor = %d\n", 5985 getmajor(ddi_dev), 5986 getminor(ddi_dev)); 5987 } else { 5988 write_lb = 1; 5989 } 5990 kmem_free(minor_name, 5991 strlen(minor_name) + 1); 5992 } 5993 ddi_devid_free(ret_devid); 5994 } 5995 } 5996 } 5997 5998 /* 5999 * If a device has a valid device id and if the dev_t 6000 * associated with the device id has changed, update the 6001 * driver name, minor num and dev_t in the local and side 6002 * locators to match the dev_t that the system currently 6003 * associates with the device id. 6004 * 6005 * Don't do this during upgrade. 
6006 */ 6007 if (!(MD_UPGRADE)) { 6008 for (li = 0; li < lbp->lb_loccnt; li++) { 6009 lp = &lbp->lb_locators[li]; 6010 if (lp->l_flags & MDDB_F_DELETED) 6011 continue; 6012 did_info = &(did_icp->did_ic_blkp->blk_info 6013 [li]); 6014 if ((did_info->info_flags & MDDB_DID_VALID) && 6015 !(did_info->info_flags & 6016 MDDB_DID_UPDATED)) { 6017 if (lbp->lb_flags & MDDB_MNSET) { 6018 int j; 6019 int index = -1; 6020 mnlbp = (mddb_mnlb_t *)lbp; 6021 for (j = 0; j < MD_MNMAXSIDES; 6022 j++) { 6023 mnslp = &mnlbp-> 6024 lb_mnsidelocators[j] 6025 [li]; 6026 if (mnslp->mnl_sideno == 6027 s->s_sideno) 6028 break; 6029 if (mnslp->mnl_sideno == 6030 0) 6031 index = j; 6032 } 6033 if (j == MD_MNMAXSIDES) { 6034 /* 6035 * No match found; take 6036 * empty 6037 */ 6038 mnslp = &mnlbp-> 6039 lb_mnsidelocators 6040 [index][li]; 6041 write_lb = 1; 6042 mnslp->mnl_mnum = 6043 md_getminor(newdev 6044 [li]); 6045 } else if (mnslp->mnl_mnum != 6046 md_getminor(newdev[li])) { 6047 write_lb = 1; 6048 mnslp->mnl_mnum = 6049 md_getminor(newdev 6050 [li]); 6051 } 6052 } else { 6053 slp = &lbp-> 6054 lb_sidelocators[s->s_sideno] 6055 [li]; 6056 if (slp->l_mnum != 6057 md_getminor(newdev[li])) { 6058 write_lb = 1; 6059 slp->l_mnum = 6060 md_getminor(newdev 6061 [li]); 6062 } 6063 } 6064 name = ddi_major_to_name(md_getmajor( 6065 newdev[li])); 6066 if (lbp->lb_flags & MDDB_MNSET) 6067 i = mnslp->mnl_drvnm_index; 6068 else 6069 i = slp->l_drvnm_index; 6070 if (strncmp(lbp->lb_drvnm[i].dn_data, 6071 name, lbp->lb_drvnm[i].dn_len) != 6072 0) { 6073 /* Driver name has changed */ 6074 len = strlen(name); 6075 /* Look for the driver name */ 6076 for (i = 0; i < MDDB_DRVNMCNT; 6077 i++) { 6078 if (lbp->lb_drvnm[i]. 6079 dn_len != len) 6080 continue; 6081 if (strncmp(lbp-> 6082 lb_drvnm[i].dn_data, 6083 name, len) == 0) 6084 break; 6085 } 6086 /* Didn't find one, add it */ 6087 if (i == MDDB_DRVNMCNT) { 6088 for (i = 0; i < 6089 MDDB_DRVNMCNT; 6090 i++) { 6091 if (lbp-> 6092 lb_drvnm[i]. 
6093 dn_len == 0) 6094 break; 6095 } 6096 if (i == 6097 MDDB_DRVNMCNT) { 6098 cmn_err(CE_WARN, 6099 "Unable to " 6100 " update " 6101 "driver " 6102 " name for " 6103 "dev: " 6104 "major = %d" 6105 ", minor = " 6106 "%d\n", 6107 md_getmajor( 6108 newdev[li]), 6109 md_getminor( 6110 newdev 6111 [li])); 6112 continue; 6113 } 6114 (void) strncpy(lbp-> 6115 lb_drvnm[i].dn_data, 6116 name, MD_MAXDRVNM); 6117 lbp->lb_drvnm[i]. 6118 dn_len = (uchar_t) 6119 strlen(name); 6120 } 6121 /* Fill in the drvnm index */ 6122 if (lbp->lb_flags & 6123 MDDB_MNSET) 6124 mnslp->mnl_drvnm_index = 6125 i; 6126 else 6127 slp->l_drvnm_index = i; 6128 write_lb = 1; 6129 } 6130 did_info->info_flags |= 6131 MDDB_DID_UPDATED; 6132 } 6133 } 6134 } 6135 } 6136 kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB); 6137 6138 /* 6139 * If locator block has been changed by get_mbs_n_lbs, 6140 * by addition of new device id, by updated minor name or 6141 * by updated driver name - write out locator block. 6142 */ 6143 if (write_lb) { 6144 rval = push_lb(s); 6145 (void) upd_med(s, "load_old_replicas(0)"); 6146 if (rval) 6147 goto errout; 6148 } 6149 6150 /* 6151 * If the tag was moved, allocated, or a BADTAG was seen for some other 6152 * reason, then make sure tags are written to all the replicas. 6153 * Data tags not supported on MN sets. 6154 */ 6155 if (!(md_get_setstatus(setno) & MD_SET_MNSET)) { 6156 if (! (lc = dt_alloc_if_needed(s))) { 6157 for (li = 0; li < lbp->lb_loccnt; li++) { 6158 lp = &lbp->lb_locators[li]; 6159 6160 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 6161 (lp->l_flags & MDDB_F_EMASTER)) 6162 continue; 6163 6164 if (lp->l_flags & MDDB_F_BADTAG) { 6165 lc = 1; 6166 break; 6167 } 6168 } 6169 } 6170 6171 if (lc) { 6172 md_set_setstatus(setno, MD_SET_TAGDATA); 6173 md_clr_setstatus(setno, MD_SET_BADTAG); 6174 (void) selectreplicas(s, MDDB_SCANALL); 6175 } 6176 } 6177 6178 errout: 6179 6180 /* Free extraneous rip components. 
*/ 6181 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 6182 /* Get rid of lbp's and dtp's */ 6183 6184 if (rip->ri_lbp != lbp) { 6185 if (rip->ri_dtp != (mddb_dt_t *)NULL) { 6186 kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES); 6187 rip->ri_dtp = (mddb_dt_t *)NULL; 6188 } 6189 6190 if (rip->ri_devid != (ddi_devid_t)NULL) { 6191 sz = (int)ddi_devid_sizeof(rip->ri_devid); 6192 kmem_free((caddr_t)rip->ri_devid, sz); 6193 rip->ri_devid = (ddi_devid_t)NULL; 6194 } 6195 if (rip->ri_old_devid != (ddi_devid_t)NULL) { 6196 sz = (int)ddi_devid_sizeof(rip->ri_old_devid); 6197 kmem_free((caddr_t)rip->ri_old_devid, sz); 6198 rip->ri_old_devid = (ddi_devid_t)NULL; 6199 } 6200 6201 if (rip->ri_lbp != (mddb_lb_t *)NULL) { 6202 mddb_devid_icp_free(&rip->ri_did_icp, 6203 rip->ri_lbp); 6204 6205 kmem_free((caddr_t)rip->ri_lbp, 6206 dbtob(rip->ri_lbp->lb_blkcnt)); 6207 rip->ri_lbp = (mddb_lb_t *)NULL; 6208 } 6209 } 6210 6211 if (lbp != NULL) { 6212 for (li = 0; li < lbp->lb_loccnt; li++) { 6213 lp = &lbp->lb_locators[li]; 6214 if (lp->l_flags & MDDB_F_DELETED) 6215 continue; 6216 if (rip->ri_dev == md_expldev(lp->l_dev) && 6217 rip->ri_blkno == lp->l_blkno) 6218 break; 6219 } 6220 if (li < lbp->lb_loccnt) 6221 continue; 6222 } 6223 6224 /* 6225 * Get rid of mbp's: 6226 * if lbp, those out of lb_loccnt bounds 6227 * if !lbp, all of them. 6228 */ 6229 if (rip->ri_mbip) { 6230 md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev); 6231 if (dev64 != NODEV64) 6232 mddb_devclose(dev64); 6233 6234 free_mbipp(&rip->ri_mbip); 6235 } 6236 /* 6237 * Turn off MDDB_F_EMASTER flag in a diskset since diskset 6238 * code always ends up calling ridev for all replicas 6239 * before calling load_old_replicas. ridev will reset 6240 * MDDB_F_EMASTER flag if flag was due to unresolved devid. 6241 */ 6242 if (setno != MD_LOCAL_SET) 6243 rip->ri_flags &= ~MDDB_F_EMASTER; 6244 } 6245 return (retval); 6246 } 6247 6248 /* 6249 * Given the devt from the md.conf info, get the devid for the device. 
6250 */ 6251 static void 6252 lookup_db_devid(mddb_cfg_loc_t *cl) 6253 { 6254 dev_t ldev; 6255 ddi_devid_t devid; 6256 char *minor; 6257 6258 if (ddi_name_to_major(cl->l_driver) == (major_t)-1) { 6259 cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver); 6260 return; 6261 } 6262 6263 ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum); 6264 if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) { 6265 cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x", 6266 cl->l_driver, cl->l_mnum); 6267 return; 6268 } 6269 6270 if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) { 6271 cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x", 6272 cl->l_mnum); 6273 return; 6274 } 6275 6276 cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ; 6277 cl->l_devid_sz = (int)ddi_devid_sizeof(devid); 6278 cl->l_devid = (uint64_t)(uintptr_t)devid; 6279 (void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX); 6280 6281 kmem_free(minor, strlen(minor) + 1); 6282 } 6283 6284 /* 6285 * grab driver name, minor, block and devid out of 6286 * strings like "driver:minor:block:devid" 6287 */ 6288 static int 6289 parse_db_loc( 6290 char *str, 6291 mddb_cfg_loc_t *clp 6292 ) 6293 { 6294 char *p, *e; 6295 char *minor_name; 6296 ddi_devid_t ret_devid; 6297 6298 clp->l_dev = 0; 6299 p = clp->l_driver; 6300 e = p + sizeof (clp->l_driver) - 1; 6301 while ((*str != ':') && (*str != '\0') && (p < e)) 6302 *p++ = *str++; 6303 *p = '\0'; 6304 if (*str++ != ':') 6305 return (-1); 6306 clp->l_mnum = 0; 6307 while (ISNUM(*str)) { 6308 clp->l_mnum *= 10; 6309 clp->l_mnum += *str++ - '0'; 6310 } 6311 if (*str++ != ':') 6312 return (-1); 6313 clp->l_blkno = 0; 6314 while (ISNUM(*str)) { 6315 clp->l_blkno *= 10; 6316 clp->l_blkno += *str++ - '0'; 6317 } 6318 if (*str++ != ':') 6319 return (-1); 6320 6321 /* 6322 * If the md_devid_destroy flag is set, ignore the device ids. 6323 * This is only to used in a catastrophic failure case. 
Examples 6324 * would be where the device id of all drives in the system 6325 * (especially the mirror'd root drives) had been changed 6326 * by firmware upgrade or by a patch to an existing disk 6327 * driver. Another example would be in the case of non-unique 6328 * device ids due to a bug. The device id would be valid on 6329 * the system, but would return the wrong dev_t. 6330 */ 6331 if (md_devid_destroy) { 6332 clp->l_devid_flags = 0; 6333 clp->l_devid = (uint64_t)NULL; 6334 clp->l_devid_sz = 0; 6335 clp->l_old_devid = (uint64_t)NULL; 6336 clp->l_old_devid_sz = 0; 6337 clp->l_minor_name[0] = '\0'; 6338 return (0); 6339 } 6340 6341 if (ddi_devid_str_decode(str, 6342 (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE) 6343 return (-1); 6344 6345 clp->l_devid = (uint64_t)(uintptr_t)ret_devid; 6346 clp->l_devid_flags = 0; 6347 clp->l_old_devid = (uint64_t)NULL; 6348 clp->l_old_devid_sz = 0; 6349 6350 /* If no device id associated with device, just return */ 6351 if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) { 6352 clp->l_devid_sz = 0; 6353 clp->l_minor_name[0] = '\0'; 6354 if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 && 6355 md_keep_repl_state == 0) { 6356 /* 6357 * No devid in md.conf; we're in recovery mode so 6358 * lookup the devid for the device as specified by 6359 * the devt in md.conf. 6360 */ 6361 lookup_db_devid(clp); 6362 } 6363 return (0); 6364 } 6365 6366 clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | 6367 MDDB_DEVID_SZ; 6368 clp->l_devid_sz = (int)ddi_devid_sizeof( 6369 (ddi_devid_t)(uintptr_t)clp->l_devid); 6370 (void) strcpy(clp->l_minor_name, minor_name); 6371 kmem_free(minor_name, strlen(minor_name) + 1); 6372 6373 return (0); 6374 } 6375 6376 /* 6377 * grab driver name, minor, and block out of 6378 * strings like "driver:minor:block:devid driver:minor:block:devid ..." 
6379 */ 6380 static void 6381 parse_db_string( 6382 char *str 6383 ) 6384 { 6385 char *p, *e; 6386 mddb_cfg_loc_t *cl; 6387 char restore_space; 6388 6389 /* CSTYLED */ 6390 cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP); 6391 for (p = str; (*p != '\0'); ) { 6392 for (; ((*p != '\0') && (ISWHITE(*p))); ++p) 6393 ; 6394 if (*p == '\0') 6395 break; 6396 for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e) 6397 ; 6398 /* 6399 * Only give parse_db_loc 1 entry, so stuff a null into 6400 * the string if we're not at the end. We need to save this 6401 * char and restore it after call. 6402 */ 6403 restore_space = '\0'; 6404 if (*e != '\0') { 6405 restore_space = *e; 6406 *e = '\0'; 6407 } 6408 if (parse_db_loc(p, cl) != 0) { 6409 cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p); 6410 } else { 6411 (void) ridev( 6412 &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip, 6413 cl, NULL, MDDB_F_PTCHED); 6414 if (cl->l_devid_flags & MDDB_DEVID_SPACE) { 6415 kmem_free((caddr_t)(uintptr_t)cl->l_devid, 6416 cl->l_devid_sz); 6417 } 6418 } 6419 if (restore_space != '\0') { 6420 *e = restore_space; 6421 } 6422 p = e; 6423 } 6424 kmem_free(cl, sizeof (mddb_cfg_loc_t)); 6425 } 6426 6427 /* 6428 * grab database locations supplied by md.conf as properties 6429 */ 6430 static void 6431 parse_db_strings(void) 6432 { 6433 int bootlist_id; 6434 int proplen; 6435 /* 6436 * size of _bootlist_name should match uses of line and entry in 6437 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c) 6438 */ 6439 char _bootlist_name[MDDB_BOOTLIST_MAX_LEN]; 6440 char *bootlist_name; 6441 caddr_t prop; 6442 6443 /* 6444 * Step through the bootlist properties one at a time by forming the 6445 * correct name, fetching the property, parsing the property and 6446 * then freeing the memory. If a property does not exist or returns 6447 * some form of error just ignore it. 
There is no guarantee that 6448 * the properties will always exist in sequence, for example 6449 * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with 6450 * mddb_bootlist3 existing. 6451 */ 6452 bootlist_name = &_bootlist_name[0]; 6453 for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) { 6454 6455 proplen = 0; 6456 (void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id); 6457 6458 if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo, 6459 DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop, 6460 &proplen) != DDI_PROP_SUCCESS) 6461 continue; 6462 6463 if (proplen <= 0) 6464 continue; 6465 6466 if (md_init_debug) 6467 cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop); 6468 6469 parse_db_string(prop); 6470 kmem_free(prop, proplen); 6471 } 6472 } 6473 6474 static int 6475 initit( 6476 set_t setno, 6477 int flag 6478 ) 6479 { 6480 int i; 6481 mddb_set_t *s; 6482 mddb_lb_t *lbp; /* pointer to locator block */ 6483 mddb_ln_t *lnp; /* pointer to locator names */ 6484 mddb_db_t *dbp; /* pointer to directory block */ 6485 mddb_did_blk_t *did_blkp; /* pointer to Device ID block */ 6486 mddb_did_ic_t *did_icp; /* pointer to Device ID incore area */ 6487 mddb_bf_t *bfp; 6488 side_t sideno; 6489 side_t maxsides; 6490 mddb_block_t lb_blkcnt; 6491 int retval = 0; 6492 md_dev64_t dev; 6493 mddb_mnlb_t *mnlbp; 6494 int devid_flag; 6495 6496 /* single thread's all loads/unloads of set's */ 6497 mutex_enter(&mddb_lock); 6498 mutex_enter(SETMUTEX(setno)); 6499 6500 if (((mddb_set_t *)md_set[setno].s_db) == NULL) { 6501 mutex_exit(SETMUTEX(setno)); 6502 mutex_exit(&mddb_lock); 6503 return (MDDB_E_NOTNOW); 6504 } 6505 6506 s = (mddb_set_t *)md_set[setno].s_db; 6507 6508 single_thread_start(s); 6509 6510 /* 6511 * init is already underway, block. Return success. 
6512 */ 6513 if (s->s_lbp) { 6514 single_thread_end(s); 6515 mutex_exit(SETMUTEX(setno)); 6516 mutex_exit(&mddb_lock); 6517 return (0); 6518 } 6519 6520 uniqtime32(&s->s_inittime); 6521 6522 /* grab database locations patched by /etc/system */ 6523 if (setno == MD_LOCAL_SET) 6524 parse_db_strings(); 6525 6526 s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc( 6527 sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP); 6528 6529 s->s_zombie = 0; 6530 s->s_staledeletes = 0; 6531 s->s_optcmtcnt = 0; 6532 s->s_opthavelck = 0; 6533 s->s_optwantlck = 0; 6534 s->s_optwaiterr = 0; 6535 s->s_opthungerr = 0; 6536 6537 /* 6538 * KEEPTAG can never be set for a MN diskset since no tags are 6539 * allowed to be stored in a MN diskset. No way to check 6540 * if this is a MN diskset or not at this point since the mddb 6541 * hasn't been read in from disk yet. (flag will only have 6542 * MUTLINODE bit set if a new set is being created.) 6543 */ 6544 if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG)) 6545 dt_setup(s, NULL); 6546 6547 md_clr_setstatus(s->s_setno, MD_SET_TOOFEW); 6548 6549 for (i = 0; i < mddb_maxbufheaders; i++) { 6550 bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP); 6551 sema_init(&bfp->bf_buf.b_io, 0, NULL, 6552 SEMA_DEFAULT, NULL); 6553 sema_init(&bfp->bf_buf.b_sem, 0, NULL, 6554 SEMA_DEFAULT, NULL); 6555 bfp->bf_buf.b_offset = -1; 6556 freebuffer(s, bfp); 6557 } 6558 6559 retval = load_old_replicas(s, flag); 6560 /* If 0 return value - success */ 6561 if (! retval) { 6562 single_thread_end(s); 6563 mutex_exit(SETMUTEX(setno)); 6564 mutex_exit(&mddb_lock); 6565 return (0); 6566 } 6567 6568 /* 6569 * If here, then the load_old_replicas() failed 6570 */ 6571 6572 6573 /* If the database was supposed to exist. */ 6574 if (flag & MDDB_MUSTEXIST) { 6575 if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) { 6576 for (i = 0; i < mddb_maxcopies; i++) { 6577 if (! 
s->s_mbiarray[i]) 6578 continue; 6579 dev = md_expldev( 6580 s->s_lbp->lb_locators[i].l_dev); 6581 dev = md_xlate_targ_2_mini(dev); 6582 if (dev != NODEV64) 6583 mddb_devclose(dev); 6584 6585 free_mbipp(&s->s_mbiarray[i]); 6586 } 6587 6588 kmem_free((caddr_t)s->s_mbiarray, 6589 sizeof (mddb_mb_ic_t *) * mddb_maxcopies); 6590 s->s_mbiarray = NULL; 6591 } 6592 6593 if (s->s_lnp != (mddb_ln_t *)NULL) { 6594 kmem_free((caddr_t)s->s_lnp, 6595 dbtob(s->s_lbp->lb_lnblkcnt)); 6596 s->s_lnp = (mddb_ln_t *)NULL; 6597 } 6598 6599 mddb_devid_icp_free(&s->s_did_icp, s->s_lbp); 6600 6601 if (s->s_lbp != (mddb_lb_t *)NULL) { 6602 kmem_free((caddr_t)s->s_lbp, 6603 dbtob(s->s_lbp->lb_blkcnt)); 6604 s->s_lbp = (mddb_lb_t *)NULL; 6605 } 6606 6607 while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL) 6608 kmem_free((caddr_t)bfp, sizeof (*bfp)); 6609 6610 single_thread_end(s); 6611 mutex_exit(SETMUTEX(setno)); 6612 mutex_exit(&mddb_lock); 6613 6614 if (retval == MDDB_E_TAGDATA) 6615 return (retval); 6616 6617 /* Want a bit more detailed error messages */ 6618 if (mddb_db_err_detail) 6619 return (retval); 6620 6621 return (MDDB_E_NODB); 6622 } 6623 6624 6625 /* 6626 * MDDB_NOOLDOK set - Creating a new database, so do 6627 * more initialization. 6628 */ 6629 6630 lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ? 
6631 MDDB_LOCAL_LBCNT : MDDB_LBCNT); 6632 if (flag & MDDB_MULTINODE) { 6633 lb_blkcnt = MDDB_MNLBCNT; 6634 } 6635 6636 if (s->s_lbp == NULL) 6637 s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP); 6638 lbp = s->s_lbp; 6639 6640 bzero((caddr_t)lbp, dbtob(lb_blkcnt)); 6641 lbp->lb_setno = setno; 6642 lbp->lb_magic = MDDB_MAGIC_LB; 6643 if (flag & MDDB_MULTINODE) { 6644 lbp->lb_revision = MDDB_REV_MNLB; 6645 } else { 6646 lbp->lb_revision = MDDB_REV_LB; 6647 } 6648 lbp->lb_inittime = s->s_inittime; 6649 if (flag & MDDB_MULTINODE) { 6650 mnlbp = (mddb_mnlb_t *)lbp; 6651 for (i = 0; i < MDDB_NLB; i++) { 6652 for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) { 6653 mddb_mnsidelocator_t *mnslp; 6654 mnslp = &mnlbp->lb_mnsidelocators[sideno][i]; 6655 mnslp->mnl_mnum = NODEV32; 6656 mnslp->mnl_sideno = 0; 6657 mnslp->mnl_drvnm_index = 0; 6658 } 6659 } 6660 } else { 6661 maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES); 6662 for (i = 0; i < MDDB_NLB; i++) { 6663 for (sideno = 0; sideno < maxsides; sideno++) { 6664 mddb_sidelocator_t *slp; 6665 slp = &lbp->lb_sidelocators[sideno][i]; 6666 slp->l_mnum = NODEV32; 6667 } 6668 } 6669 } 6670 lbp->lb_blkcnt = lb_blkcnt; 6671 6672 /* lb starts on block 0 */ 6673 /* locator names starts after locator block */ 6674 lbp->lb_lnfirstblk = lb_blkcnt; 6675 if (flag & MDDB_MULTINODE) { 6676 lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT; 6677 } else { 6678 lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ? 
6679 MDDB_LOCAL_LNCNT : MDDB_LNCNT); 6680 } 6681 6682 if (flag & MDDB_MULTINODE) { 6683 /* Creating a multinode diskset */ 6684 md_set_setstatus(setno, MD_SET_MNSET); 6685 lbp->lb_flags |= MDDB_MNSET; 6686 } 6687 6688 /* Data portion of mddb located after locator names */ 6689 lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt; 6690 6691 /* the btodb that follows is converting the directory block size */ 6692 /* Data tag part of mddb located after first block of mddb data */ 6693 lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk + 6694 btodb(MDDB_BSIZE)); 6695 /* Data tags are not used in MN diskset - so set count to 0 */ 6696 if (flag & MDDB_MULTINODE) 6697 lbp->lb_dtblkcnt = (mddb_block_t)0; 6698 else 6699 lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS; 6700 6701 6702 lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP); 6703 lnp->ln_magic = MDDB_MAGIC_LN; 6704 if (flag & MDDB_MULTINODE) { 6705 lnp->ln_revision = MDDB_REV_MNLN; 6706 } else { 6707 lnp->ln_revision = MDDB_REV_LN; 6708 } 6709 s->s_lnp = lnp; 6710 6711 /* 6712 * Set up Device ID portion of Locator Block. 6713 * Do not set locator to device id style if 6714 * md_devid_destroy is 1 and md_keep_repl_state is 1 6715 * (destroy all device id data and keep replica in 6716 * non device id mode). 6717 * 6718 * This is logically equivalent to set locator to 6719 * device id style if md_devid_destroy is 0 or 6720 * md_keep_repl_state is 0. 6721 * 6722 * In SunCluster environment, device id mode is disabled 6723 * which means diskset will be run in non-devid mode. For 6724 * localset, the behavior will remain intact and run in 6725 * device id mode. 6726 * 6727 * In multinode diskset devids are turned off. 
6728 */ 6729 devid_flag = 1; 6730 if (cluster_bootflags & CLUSTER_CONFIGURED) 6731 if (setno != MD_LOCAL_SET) 6732 devid_flag = 0; 6733 if (flag & MDDB_MULTINODE) 6734 devid_flag = 0; 6735 if ((md_devid_destroy == 1) && (md_keep_repl_state == 1)) 6736 devid_flag = 0; 6737 /* 6738 * if we weren't devid style before and md_keep_repl_state=1 6739 * we need to stay non-devid 6740 */ 6741 if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) && 6742 (md_keep_repl_state == 1)) 6743 devid_flag = 0; 6744 if (devid_flag) { 6745 lbp->lb_didfirstblk = lbp->lb_dtfirstblk + 6746 lbp->lb_dtblkcnt; 6747 lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS; 6748 lbp->lb_flags |= MDDB_DEVID_STYLE; 6749 6750 did_icp = (mddb_did_ic_t *)kmem_zalloc 6751 (sizeof (mddb_did_ic_t), KM_SLEEP); 6752 did_blkp = (mddb_did_blk_t *) 6753 kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP); 6754 did_blkp->blk_magic = MDDB_MAGIC_DI; 6755 did_blkp->blk_revision = MDDB_REV_DI; 6756 did_icp->did_ic_blkp = did_blkp; 6757 s->s_did_icp = did_icp; 6758 } 6759 6760 setidentifier(s, &lbp->lb_ident); 6761 uniqtime32(&lbp->lb_timestamp); 6762 dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP); 6763 dbp->db_magic = MDDB_MAGIC_DB; 6764 dbp->db_revision = MDDB_REV_DB; 6765 uniqtime32(&dbp->db_timestamp); 6766 dbp->db_nextblk = 0; 6767 dbp->db_firstentry = NULL; 6768 dbp->db_blknum = lbp->lb_dbfirstblk; 6769 dbp->db_recsum = MDDB_GLOBAL_XOR; 6770 s->s_dbp = dbp; 6771 single_thread_end(s); 6772 mutex_exit(SETMUTEX(setno)); 6773 mutex_exit(&mddb_lock); 6774 return (0); 6775 } 6776 6777 mddb_set_t * 6778 mddb_setenter( 6779 set_t setno, 6780 int flag, 6781 int *errorcodep 6782 ) 6783 { 6784 mddb_set_t *s; 6785 int err = 0; 6786 size_t sz = sizeof (void *) * MD_MAXUNITS; 6787 6788 mutex_enter(SETMUTEX(setno)); 6789 if (! 
md_set[setno].s_db) { 6790 mutex_exit(SETMUTEX(setno)); 6791 if (errorcodep != NULL) 6792 *errorcodep = MDDB_E_NOTOWNER; 6793 return (NULL); 6794 } 6795 6796 /* Allocate s_un and s_ui arrays if not already present. */ 6797 if (md_set[setno].s_un == NULL) { 6798 md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP); 6799 if (md_set[setno].s_un == NULL) { 6800 mutex_exit(SETMUTEX(setno)); 6801 if (errorcodep != NULL) 6802 *errorcodep = MDDB_E_NOTOWNER; 6803 return (NULL); 6804 } 6805 } 6806 if (md_set[setno].s_ui == NULL) { 6807 md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP); 6808 if (md_set[setno].s_ui == NULL) { 6809 mutex_exit(&md_set[setno].s_dbmx); 6810 kmem_free(md_set[setno].s_un, sz); 6811 md_set[setno].s_un = NULL; 6812 if (errorcodep != NULL) 6813 *errorcodep = MDDB_E_NOTOWNER; 6814 return (NULL); 6815 } 6816 } 6817 s = (mddb_set_t *)md_set[setno].s_db; 6818 if (s->s_lbp) 6819 return (s); 6820 6821 if (flag & MDDB_NOINIT) 6822 return (s); 6823 6824 /* 6825 * Release the set mutex - it will be acquired and released in 6826 * initit after acquiring the mddb_lock. This is done to assure 6827 * that mutexes are always acquired in the same order to prevent 6828 * possible deadlock 6829 */ 6830 mutex_exit(SETMUTEX(setno)); 6831 6832 if ((err = initit(setno, flag)) != 0) { 6833 if (errorcodep != NULL) 6834 *errorcodep = err; 6835 return (NULL); 6836 } 6837 6838 mutex_enter(SETMUTEX(setno)); 6839 return ((mddb_set_t *)md_set[setno].s_db); 6840 } 6841 6842 /* 6843 * Release the set lock for a given set. 6844 * 6845 * In a MN diskset, this routine may send messages to the rpc.mdcommd 6846 * in order to have the slave nodes re-parse parts of the mddb. 6847 * Messages are only sent if the global ioctl lock is not held. 6848 * 6849 * With the introduction of multi-threaded ioctls, there is no way 6850 * to determine which thread(s) are holding the ioctl lock. 
So, if 6851 * the ioctl lock is held (by process X) process X will send the 6852 * messages to the slave nodes when process X releases the ioctl lock. 6853 */ 6854 void 6855 mddb_setexit( 6856 mddb_set_t *s 6857 ) 6858 { 6859 md_mn_msg_mddb_parse_t *mddb_parse_msg; 6860 md_mn_kresult_t *kresult; 6861 mddb_lb_t *lbp = s->s_lbp; 6862 int i; 6863 int rval = 1; 6864 6865 /* 6866 * If not a MN diskset OR 6867 * a MN diskset but this node isn't master, 6868 * then release the mutex. 6869 */ 6870 if (!(MD_MNSET_SETNO(s->s_setno)) || 6871 ((MD_MNSET_SETNO(s->s_setno)) && 6872 (!md_set[s->s_setno].s_am_i_master))) { 6873 mutex_exit(SETMUTEX(s->s_setno)); 6874 return; 6875 } 6876 6877 /* 6878 * If global ioctl lock is held, then send no messages, 6879 * just release mutex and return. 6880 * 6881 */ 6882 if (md_status & MD_GBL_IOCTL_LOCK) { 6883 mutex_exit(SETMUTEX(s->s_setno)); 6884 return; 6885 } 6886 6887 /* 6888 * This thread is not holding the ioctl lock, so drop the set 6889 * lock, send messages to slave nodes to reparse portions 6890 * of the mddb and return. 6891 * 6892 * If the block parse flag is set, do not send parse messages. 6893 * This flag is set when master is adding a new mddb that would 6894 * cause parse messages to be sent to the slaves, but the slaves 6895 * don't have knowledge of the new mddb yet since the mddb add 6896 * operation hasn't been run on the slave nodes yet. When the 6897 * master unblocks the parse flag, the parse messages will be 6898 * generated. 6899 * 6900 * If s_mn_parseflags_sending is non-zero, then another thread 6901 * is already currently sending a parse message, so just release 6902 * the mutex and return. If an mddb change occurred that results 6903 * in a parse message to be generated, the thread that is currently 6904 * sending a parse message would generate the additional parse message. 
6905 * 6906 * If s_mn_parseflags_sending is zero and parsing is not blocked, 6907 * then loop until s_mn_parseflags is 0 (until there are no more 6908 * messages to send). 6909 * While s_mn_parseflags is non-zero, 6910 * put snapshot of parse_flags in s_mn_parseflags_sending 6911 * set s_mn_parseflags to zero 6912 * release mutex 6913 * send message 6914 * re-grab mutex 6915 * set s_mn_parseflags_sending to zero 6916 */ 6917 mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP); 6918 while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) && 6919 (s->s_mn_parseflags & MDDB_PARSE_MASK) && 6920 (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) { 6921 /* Grab snapshot of parse flags */ 6922 s->s_mn_parseflags_sending = s->s_mn_parseflags; 6923 s->s_mn_parseflags = 0; 6924 6925 mutex_exit(SETMUTEX(s->s_setno)); 6926 6927 /* 6928 * Send the message to the slaves to re-parse 6929 * the indicated portions of the mddb. Send the status 6930 * of the 50 mddbs in this set so that slaves know which 6931 * mddbs that the master node thinks are 'good'. 6932 * Otherwise, slave may reparse, but from wrong replica. 6933 */ 6934 mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending; 6935 for (i = 0; i < MDDB_NLB; i++) { 6936 mddb_parse_msg->msg_lb_flags[i] = 6937 lbp->lb_locators[i].l_flags; 6938 } 6939 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 6940 while (rval != 0) { 6941 rval = mdmn_ksend_message(s->s_setno, 6942 MD_MN_MSG_MDDB_PARSE, 0, 0, 6943 (char *)mddb_parse_msg, 6944 sizeof (md_mn_msg_mddb_parse_t), kresult); 6945 if (rval != 0) 6946 cmn_err(CE_WARN, "mddb_setexit: Unable to send " 6947 "mddb update message to other nodes in " 6948 "diskset %s\n", s->s_setname); 6949 } 6950 kmem_free(kresult, sizeof (md_mn_kresult_t)); 6951 6952 /* 6953 * Re-grab mutex to clear sending field and to 6954 * see if another parse message needs to be generated. 
6955 */ 6956 mutex_enter(SETMUTEX(s->s_setno)); 6957 s->s_mn_parseflags_sending = 0; 6958 } 6959 kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t)); 6960 mutex_exit(SETMUTEX(s->s_setno)); 6961 } 6962 6963 static void 6964 mddb_setexit_no_parse( 6965 mddb_set_t *s 6966 ) 6967 { 6968 mutex_exit(SETMUTEX(s->s_setno)); 6969 } 6970 6971 uint_t 6972 mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt) 6973 { 6974 uint_t li; 6975 mddb_lb_t *lbp = s->s_lbp; 6976 mddb_locator_t *lp; 6977 ddi_devid_t ret_devid; 6978 uint_t devid_len; 6979 dev_t ddi_dev; 6980 mddb_did_ic_t *did_icp; 6981 mddb_did_blk_t *did_blkp; 6982 char *minor_name; 6983 size_t sz; 6984 int retval; 6985 int err; 6986 md_dev64_t dev64; /* tmp var to make code look better */ 6987 6988 6989 /* Need disk block(s) to hold mddb_did_blk_t */ 6990 *blk_cnt = MDDB_DID_BLOCKS; 6991 6992 if (doit) { 6993 /* 6994 * Alloc mddb_did_blk_t disk block and fill in header area. 6995 * Don't fill in did magic number until end of routine so 6996 * if machine panics in the middle of conversion, the 6997 * device id information will be thrown away at the 6998 * next snarfing of this set. 6999 * Need to set DEVID_STYLE so that mddb_devid_add will 7000 * function properly. 
7001 */ 7002 /* grab the mutex */ 7003 if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) { 7004 return (1); 7005 } 7006 single_thread_start(s); 7007 lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS); 7008 if (lbp->lb_didfirstblk == 0) { 7009 single_thread_end(s); 7010 mddb_setexit(s); 7011 return (1); 7012 } 7013 lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS; 7014 did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t), 7015 KM_SLEEP); 7016 did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES, 7017 KM_SLEEP); 7018 7019 did_blkp->blk_revision = MDDB_REV_DI; 7020 did_icp->did_ic_blkp = did_blkp; 7021 s->s_did_icp = did_icp; 7022 lbp->lb_flags |= MDDB_DEVID_STYLE; 7023 } 7024 7025 /* Fill in information in mddb_did_info_t array */ 7026 for (li = 0; li < lbp->lb_loccnt; li++) { 7027 lp = &lbp->lb_locators[li]; 7028 if (lp->l_flags & MDDB_F_DELETED) 7029 continue; 7030 7031 dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev)); 7032 ddi_dev = md_dev64_to_dev(dev64); 7033 if (ddi_dev == NODEV) { 7034 /* 7035 * No translation available for replica. 7036 * Could fail conversion to device id replica, 7037 * but instead will just continue with next 7038 * replica in list. 7039 */ 7040 continue; 7041 } 7042 if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) { 7043 /* 7044 * Just count each devid as at least 1 block. This 7045 * is conservative since several device id's may fit 7046 * into 1 disk block, but it's better to overestimate 7047 * the number of blocks needed than to underestimate. 
7048 */ 7049 devid_len = (int)ddi_devid_sizeof(ret_devid); 7050 *blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1)); 7051 if (doit) { 7052 if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, 7053 &minor_name) == DDI_SUCCESS) { 7054 if (mddb_devid_add(s, li, ret_devid, 7055 minor_name)) { 7056 cmn_err(CE_WARN, 7057 "Not enough space in metadb" 7058 " to add device id for" 7059 " dev: major = %d, " 7060 "minor = %d\n", 7061 getmajor(ddi_dev), 7062 getminor(ddi_dev)); 7063 } 7064 sz = strlen(minor_name) + 1; 7065 kmem_free(minor_name, sz); 7066 } 7067 } 7068 ddi_devid_free(ret_devid); 7069 } 7070 } 7071 7072 if (doit) { 7073 did_blkp->blk_magic = MDDB_MAGIC_DI; 7074 retval = push_lb(s); 7075 (void) upd_med(s, "mddb_lb_did_convert(0)"); 7076 single_thread_end(s); 7077 mddb_setexit(s); 7078 if (retval != 0) 7079 return (1); 7080 } 7081 7082 return (0); 7083 } 7084 7085 static mddb_set_t * 7086 init_set( 7087 mddb_config_t *cp, 7088 int flag, 7089 int *errp 7090 ) 7091 { 7092 mddb_set_t *s; 7093 char *setname = NULL; 7094 set_t setno = MD_LOCAL_SET; 7095 side_t sideno = 0; 7096 struct timeval32 *created = NULL; 7097 7098 if (cp != NULL) { 7099 setname = cp->c_setname; 7100 setno = cp->c_setno; 7101 sideno = cp->c_sideno; 7102 created = &cp->c_timestamp; 7103 } 7104 7105 if (setno >= MD_MAXSETS) 7106 return ((mddb_set_t *)NULL); 7107 7108 if (md_set[setno].s_db) 7109 return (mddb_setenter(setno, flag, errp)); 7110 7111 s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP); 7112 7113 cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL); 7114 cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL); 7115 cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL); 7116 cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL); 7117 cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL); 7118 7119 s->s_setno = setno; 7120 s->s_sideno = sideno; 7121 if (setno == MD_LOCAL_SET) { 7122 (void) snprintf(s->s_ident.serial, sizeof (s->s_ident.serial), 7123 "%u", zone_get_hostid(NULL)); 7124 } else { 7125 
s->s_ident.createtime = *created; 7126 s->s_setname = (char *)kmem_alloc(strlen(setname) + 1, 7127 KM_SLEEP); 7128 (void) strcpy(s->s_setname, setname); 7129 } 7130 7131 /* have a config struct, copy mediator information */ 7132 if (cp != NULL) 7133 s->s_med = cp->c_med; /* structure assignment */ 7134 7135 md_set[setno].s_db = (void *) s; 7136 7137 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64); 7138 7139 return (mddb_setenter(setno, flag, errp)); 7140 } 7141 7142 void 7143 mddb_unload_set( 7144 set_t setno 7145 ) 7146 { 7147 7148 mddb_set_t *s; 7149 mddb_db_t *dbp, *adbp = NULL; 7150 mddb_de_ic_t *dep, *dep2; 7151 mddb_bf_t *bfp; 7152 int i; 7153 md_dev64_t dev; 7154 7155 if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL) 7156 return; 7157 7158 single_thread_start(s); 7159 7160 s->s_opthavequeuinglck = 0; 7161 s->s_optwantqueuinglck = 0; 7162 7163 for (dbp = s->s_dbp; dbp != 0; dbp = adbp) { 7164 for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) { 7165 if (dep->de_rb_userdata != NULL) { 7166 if (dep->de_icreqsize) 7167 kmem_free(dep->de_rb_userdata_ic, 7168 dep->de_icreqsize); 7169 else 7170 kmem_free(dep->de_rb_userdata, 7171 dep->de_reqsize); 7172 } 7173 kmem_free((caddr_t)dep->de_rb, dep->de_recsize); 7174 dep2 = dep->de_next; 7175 kmem_free((caddr_t)dep, sizeofde(dep)); 7176 } 7177 adbp = dbp->db_next; 7178 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 7179 } 7180 s->s_dbp = (mddb_db_t *)NULL; 7181 7182 free_rip(&s->s_rip); 7183 7184 for (i = 0; i < mddb_maxcopies; i++) { 7185 if (! s->s_mbiarray) 7186 break; 7187 7188 if (! 
s->s_mbiarray[i]) 7189 continue; 7190 7191 dev = md_expldev(s->s_lbp->lb_locators[i].l_dev); 7192 dev = md_xlate_targ_2_mini(dev); 7193 if (dev != NODEV64) 7194 mddb_devclose(dev); 7195 7196 free_mbipp(&s->s_mbiarray[i]); 7197 } 7198 7199 if (s->s_mbiarray) { 7200 kmem_free((caddr_t)s->s_mbiarray, 7201 sizeof (mddb_mb_ic_t *) * mddb_maxcopies); 7202 s->s_mbiarray = (mddb_mb_ic_t **)NULL; 7203 } 7204 7205 if (s->s_lnp) { 7206 kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt)); 7207 s->s_lnp = (mddb_ln_t *)NULL; 7208 } 7209 7210 if (s->s_lbp) { 7211 mddb_devid_icp_free(&s->s_did_icp, s->s_lbp); 7212 kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt)); 7213 s->s_lbp = (mddb_lb_t *)NULL; 7214 } 7215 7216 if (s->s_freebitmap) { 7217 kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize); 7218 s->s_freebitmap = NULL; 7219 s->s_freebitmapsize = 0; 7220 } 7221 7222 while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL) 7223 kmem_free((caddr_t)bfp, sizeof (*bfp)); 7224 7225 if (s->s_databuffer_size) { 7226 kmem_free(s->s_databuffer, s->s_databuffer_size); 7227 s->s_databuffer_size = 0; 7228 } 7229 7230 if (s->s_setname != NULL) 7231 kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1); 7232 7233 /* Data tags not supported on MN sets. */ 7234 if (!(md_get_setstatus(setno) & MD_SET_MNSET)) 7235 dtl_freel(&s->s_dtlp); 7236 7237 md_set[setno].s_db = NULL; 7238 ASSERT(s->s_singlelockwanted == 0); 7239 kmem_free(s, sizeof (mddb_set_t)); 7240 7241 /* Take care of things setup in the md_set array */ 7242 if (! 
(md_get_setstatus(setno) & MD_SET_KEEPTAG)) { 7243 if (md_set[setno].s_dtp) { 7244 kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 7245 md_set[setno].s_dtp = NULL; 7246 } 7247 } 7248 7249 md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT | 7250 MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE | 7251 MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET | 7252 MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC | 7253 MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT); 7254 7255 mutex_exit(SETMUTEX(setno)); 7256 } 7257 7258 /* 7259 * returns 0 if name can be put into locator block 7260 * returns 1 if locator block prefixes are all used 7261 * 7262 * Takes splitname (suffix, prefix, sideno) and 7263 * stores it in the locator name structure. 7264 * For traditional diskset, the sideno is the index into the suffixes 7265 * array in the locator name structure. 7266 * For the MN diskset, the sideno is the nodeid which can be any number, 7267 * so the index passed in is the index into the mnsuffixes array 7268 * in the locator structure. This index was computed by the 7269 * routine checklocator which basically checked the locator block 7270 * mnside locator structure. 
7271 */ 7272 static int 7273 splitname2locatorblock( 7274 md_splitname *spn, 7275 mddb_ln_t *lnp, 7276 int li, 7277 side_t sideno, 7278 int index 7279 ) 7280 { 7281 uchar_t i; 7282 md_name_suffix *sn; 7283 md_mnname_suffix_t *mnsn; 7284 mddb_mnln_t *mnlnp; 7285 7286 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7287 if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len) 7288 continue; 7289 if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data, 7290 SPN_PREFIX(spn).pre_len) == 0) 7291 break; 7292 } 7293 if (i == MDDB_PREFIXCNT) { 7294 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7295 if (lnp->ln_prefixes[i].pre_len == 0) 7296 break; 7297 } 7298 if (i == MDDB_PREFIXCNT) 7299 return (1); 7300 bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data, 7301 SPN_PREFIX(spn).pre_len); 7302 lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len; 7303 } 7304 7305 if (lnp->ln_revision == MDDB_REV_MNLN) { 7306 /* If a MN diskset, use index */ 7307 mnlnp = (mddb_mnln_t *)lnp; 7308 mnsn = &mnlnp->ln_mnsuffixes[index][li]; 7309 mnsn->mn_ln_sideno = sideno; 7310 mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len; 7311 mnsn->mn_ln_suffix.suf_prefix = i; 7312 bcopy(SPN_SUFFIX(spn).suf_data, 7313 mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len); 7314 } else { 7315 sn = &lnp->ln_suffixes[sideno][li]; 7316 sn->suf_len = SPN_SUFFIX(spn).suf_len; 7317 sn->suf_prefix = i; 7318 bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data, 7319 SPN_SUFFIX(spn).suf_len); 7320 } 7321 return (0); 7322 } 7323 7324 /* 7325 * Find the locator name for the given sideno and convert the locator name 7326 * information into a splitname structure. 
7327 */ 7328 void 7329 mddb_locatorblock2splitname( 7330 mddb_ln_t *lnp, 7331 int li, 7332 side_t sideno, 7333 md_splitname *spn 7334 ) 7335 { 7336 int iprefix; 7337 md_name_suffix *sn; 7338 md_mnname_suffix_t *mnsn; 7339 int i; 7340 mddb_mnln_t *mnlnp; 7341 7342 if (lnp->ln_revision == MDDB_REV_MNLN) { 7343 mnlnp = (mddb_mnln_t *)lnp; 7344 for (i = 0; i < MD_MNMAXSIDES; i++) { 7345 mnsn = &mnlnp->ln_mnsuffixes[i][li]; 7346 if (mnsn->mn_ln_sideno == sideno) 7347 break; 7348 } 7349 if (i == MD_MNMAXSIDES) 7350 return; 7351 7352 SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len; 7353 bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data, 7354 SPN_SUFFIX(spn).suf_len); 7355 iprefix = mnsn->mn_ln_suffix.suf_prefix; 7356 } else { 7357 sn = &lnp->ln_suffixes[sideno][li]; 7358 SPN_SUFFIX(spn).suf_len = sn->suf_len; 7359 bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data, 7360 SPN_SUFFIX(spn).suf_len); 7361 iprefix = sn->suf_prefix; 7362 } 7363 SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len; 7364 bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data, 7365 SPN_PREFIX(spn).pre_len); 7366 } 7367 7368 static int 7369 getdeldev( 7370 mddb_config_t *cp, 7371 int command, 7372 md_error_t *ep 7373 ) 7374 { 7375 mddb_set_t *s; 7376 mddb_lb_t *lbp; 7377 mddb_locator_t *locators; 7378 uint_t loccnt; 7379 mddb_mb_ic_t *mbip; 7380 mddb_block_t blk; 7381 int err = 0; 7382 int i, j; 7383 int li; 7384 uint_t commitcnt; 7385 set_t setno = cp->c_setno; 7386 uint_t set_status; 7387 md_dev64_t dev; 7388 int flags = MDDB_MUSTEXIST; 7389 mddb_ri_t *rip; 7390 7391 cp->c_dbmax = MDDB_NLB; 7392 7393 /* 7394 * Data checking 7395 */ 7396 if (setno >= md_nsets || cp->c_id < 0 || 7397 cp->c_id > cp->c_dbmax) { 7398 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 7399 } 7400 7401 if (cp->c_flags & MDDB_C_STALE) 7402 flags |= MDDB_MN_STALE; 7403 7404 if ((s = mddb_setenter(setno, flags, &err)) == NULL) 7405 return (mddbstatus2error(ep, err, NODEV32, setno)); 7406 7407 
cp->c_flags = 0; 7408 7409 lbp = s->s_lbp; 7410 loccnt = lbp->lb_loccnt; 7411 locators = lbp->lb_locators; 7412 7413 /* shorthand */ 7414 set_status = md_get_setstatus(setno); 7415 7416 if (set_status & MD_SET_STALE) 7417 cp->c_flags |= MDDB_C_STALE; 7418 7419 if (set_status & MD_SET_TOOFEW) 7420 cp->c_flags |= MDDB_C_TOOFEW; 7421 7422 cp->c_sideno = s->s_sideno; 7423 7424 cp->c_dbcnt = 0; 7425 /* 7426 * go through and count active entries 7427 */ 7428 for (i = 0; i < loccnt; i++) { 7429 if (locators[i].l_flags & MDDB_F_DELETED) 7430 continue; 7431 cp->c_dbcnt++; 7432 } 7433 7434 /* 7435 * add the ability to accept a locator block index 7436 * which is not relative to previously deleted replicas. This 7437 * is for support of MD_DEBUG=STAT in metastat since it asks for 7438 * replica information specifically for each of the mirror resync 7439 * records. MDDB_CONFIG_SUBCMD uses one of the pad spares in 7440 * the mddb_config_t type. 7441 */ 7442 if (cp->c_subcmd == MDDB_CONFIG_ABS) { 7443 if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) { 7444 mddb_setexit(s); 7445 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, 7446 setno)); 7447 } 7448 li = cp->c_id; 7449 } else { 7450 if (cp->c_id >= cp->c_dbcnt) { 7451 mddb_setexit(s); 7452 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, 7453 setno)); 7454 } 7455 7456 /* CSTYLED */ 7457 for (li = 0, j = 0; /* void */; li++) { 7458 if (locators[li].l_flags & MDDB_F_DELETED) 7459 continue; 7460 j++; 7461 if (j > cp->c_id) 7462 break; 7463 } 7464 } 7465 7466 if (command == MDDB_ENDDEV) { 7467 daddr_t ib = 0, jb; 7468 7469 blk = 0; 7470 if ((s != NULL) && s->s_mbiarray[li]) { 7471 mbip = s->s_mbiarray[li]; 7472 while ((jb = getphysblk(blk++, mbip)) > 0) { 7473 if (jb > ib) 7474 ib = jb; 7475 } 7476 cp->c_dbend = (int)ib; 7477 } else { 7478 cp->c_dbend = 0; 7479 } 7480 } 7481 7482 locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp); 7483 mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname); 7484 7485 if 
(command != MDDB_DELDEV) { 7486 mddb_setexit(s); 7487 return (0); 7488 } 7489 7490 /* Currently don't allow addition/deletion of sides during upgrade */ 7491 if (MD_UPGRADE) { 7492 cmn_err(CE_WARN, 7493 "Deletion of replica not allowed during upgrade.\n"); 7494 mddb_setexit(s); 7495 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 7496 } 7497 7498 /* 7499 * If here, replica delete in progress. 7500 */ 7501 single_thread_start(s); 7502 7503 if ((! (locators[li].l_flags & MDDB_F_EMASTER)) && 7504 (locators[li].l_flags & MDDB_F_ACTIVE)) { 7505 commitcnt = lbp->lb_commitcnt; 7506 lbp->lb_commitcnt = 0; 7507 setidentifier(s, &lbp->lb_ident); 7508 crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL); 7509 /* 7510 * Don't need to write out device id area, since locator 7511 * block on this replica is being deleted by setting the 7512 * commitcnt to 0. 7513 */ 7514 (void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, 7515 MDDB_WR_ONLY_MASTER); 7516 lbp->lb_commitcnt = commitcnt; 7517 } 7518 7519 if (s->s_mbiarray[li]) { 7520 /* A freed mbi pointer still exists in the mddb_ri_t */ 7521 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 7522 if (rip->ri_mbip == s->s_mbiarray[li]) 7523 rip->ri_mbip = NULL; 7524 } 7525 free_mbipp(&s->s_mbiarray[li]); 7526 } 7527 7528 if (! 
(locators[li].l_flags & MDDB_F_EMASTER)) { 7529 dev = md_expldev(locators[li].l_dev); 7530 dev = md_xlate_targ_2_mini(dev); 7531 if (dev != NODEV64) 7532 mddb_devclose(dev); 7533 } 7534 7535 s->s_mbiarray[li] = 0; 7536 lbp->lb_locators[li].l_flags = MDDB_F_DELETED; 7537 7538 /* Only support data tags for traditional and local sets */ 7539 if ((md_get_setstatus(setno) & MD_SET_STALE) && 7540 (!(lbp->lb_flags & MDDB_MNSET)) && 7541 setno != MD_LOCAL_SET) 7542 if (set_dtag(s, ep)) 7543 mdclrerror(ep); 7544 7545 /* Write data tags to all accessible devices */ 7546 /* Only support data tags for traditional and local sets */ 7547 if (!(lbp->lb_flags & MDDB_MNSET)) { 7548 (void) dt_write(s); 7549 } 7550 7551 /* Delete device id of deleted replica */ 7552 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 7553 (void) mddb_devid_delete(s, li); 7554 } 7555 /* write new locator to all devices */ 7556 err = writelocall(s); 7557 7558 (void) upd_med(s, "getdeldev(0)"); 7559 7560 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno, 7561 md_expldev(locators[li].l_dev)); 7562 7563 computefreeblks(s); /* recompute always it may be larger */ 7564 cp->c_dbcnt--; 7565 err |= fixoptrecords(s); 7566 if (err) { 7567 if (writeretry(s)) { 7568 single_thread_end(s); 7569 mddb_setexit(s); 7570 return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno)); 7571 } 7572 } 7573 7574 single_thread_end(s); 7575 mddb_setexit(s); 7576 return (0); 7577 } 7578 7579 static int 7580 getdriver( 7581 mddb_cfg_loc_t *clp 7582 ) 7583 { 7584 major_t majordev; 7585 7586 /* 7587 * Data checking 7588 */ 7589 if (clp->l_dev <= 0) 7590 return (EINVAL); 7591 7592 majordev = getmajor(expldev(clp->l_dev)); 7593 7594 if (ddi_major_to_name(majordev) == (char *)NULL) 7595 return (EINVAL); 7596 7597 if (MD_UPGRADE) 7598 (void) strcpy(clp->l_driver, md_targ_major_to_name(majordev)); 7599 else 7600 (void) strcpy(clp->l_driver, ddi_major_to_name(majordev)); 7601 return (0); 7602 } 7603 7604 /* 7605 * update_valid_replica - 
updates the locator block namespace (prefix 7606 * and/or suffix) with new pathname and devname. 7607 * RETURN 7608 * 1 Error 7609 * 0 Success 7610 */ 7611 static int 7612 update_valid_replica( 7613 side_t side, 7614 mddb_locator_t *lp, 7615 mddb_set_t *s, 7616 int li, 7617 char *devname, 7618 char *pathname, 7619 md_dev64_t devt 7620 ) 7621 { 7622 uchar_t pre_len, suf_len; 7623 md_name_suffix *sn; 7624 mddb_ln_t *lnp; 7625 uchar_t pre_index; 7626 uchar_t i; 7627 7628 if (md_expldev(lp->l_dev) != devt) { 7629 return (0); 7630 } 7631 7632 if (pathname[strlen(pathname) - 1] == '/') 7633 pathname[strlen(pathname) - 1] = '\0'; 7634 7635 pre_len = (uchar_t)strlen(pathname); 7636 suf_len = (uchar_t)strlen(devname); 7637 7638 if ((pre_len > MD_MAXPREFIX) || (suf_len > MD_MAXSUFFIX)) 7639 return (1); 7640 7641 lnp = s->s_lnp; 7642 7643 /* 7644 * Future note: Need to do something here for the MN diskset case 7645 * when device ids are supported in disksets. 7646 * Can't add until merging devids_in_diskset code into code base 7647 * Currently only called with side of 0. 7648 */ 7649 7650 sn = &lnp->ln_suffixes[side][li]; 7651 7652 /* 7653 * Check if prefix (Ex: /dev/dsk) needs to be changed. 7654 * If new prefix is the same as the previous prefix - no change. 7655 * 7656 * If new prefix is not the same, check if new prefix 7657 * matches an existing one. If so, use that one. 7658 * 7659 * If new prefix doesn't exist, add a new prefix. If not enough 7660 * space, return failure. 7661 */ 7662 pre_index = sn->suf_prefix; 7663 /* Check if new prefix is the same as the old prefix. */ 7664 if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) || 7665 (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname, 7666 pre_len) != 0)) { 7667 /* Check if new prefix is an already known prefix. 
*/ 7668 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7669 if (lnp->ln_prefixes[i].pre_len != pre_len) { 7670 continue; 7671 } 7672 if (bcmp(lnp->ln_prefixes[i].pre_data, pathname, 7673 pre_len) == 0) { 7674 break; 7675 } 7676 } 7677 /* If no match found for new prefix - add the new prefix */ 7678 if (i == MDDB_PREFIXCNT) { 7679 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7680 if (lnp->ln_prefixes[i].pre_len == 0) 7681 break; 7682 } 7683 /* No space to add new prefix - return failure */ 7684 if (i == MDDB_PREFIXCNT) { 7685 return (1); 7686 } 7687 bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len); 7688 lnp->ln_prefixes[i].pre_len = pre_len; 7689 } 7690 sn->suf_prefix = i; 7691 } 7692 7693 /* Now, update the suffix (Ex: c0t0d0s0) if needed */ 7694 if ((sn->suf_len != suf_len) || 7695 (bcmp(sn->suf_data, devname, suf_len) != 0)) { 7696 bcopy(devname, sn->suf_data, suf_len); 7697 sn->suf_len = suf_len; 7698 } 7699 return (0); 7700 } 7701 7702 7703 /* 7704 * md_update_locator_namespace - If in devid style and active and the devid's 7705 * exist and are valid update the locator namespace pathname 7706 * and devname. 
7707 * RETURN 7708 * 1 Error 7709 * 0 Success 7710 */ 7711 int 7712 md_update_locator_namespace( 7713 set_t setno, /* which set to get name from */ 7714 side_t side, 7715 char *dname, 7716 char *pname, 7717 md_dev64_t devt 7718 ) 7719 { 7720 mddb_set_t *s; 7721 mddb_lb_t *lbp; 7722 int li; 7723 uint_t flg; 7724 int err = 0; 7725 mddb_ln_t *lnp; 7726 7727 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 7728 return (1); 7729 single_thread_start(s); 7730 lbp = s->s_lbp; 7731 /* must be DEVID_STYLE */ 7732 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 7733 for (li = 0; li < lbp->lb_loccnt; li++) { 7734 mddb_locator_t *lp = &lbp->lb_locators[li]; 7735 7736 if (lp->l_flags & MDDB_F_DELETED) { 7737 continue; 7738 } 7739 7740 /* replica also must be active */ 7741 if (lp->l_flags & MDDB_F_ACTIVE) { 7742 flg = s->s_did_icp->did_ic_blkp-> 7743 blk_info[li].info_flags; 7744 /* only update if did exists and is valid */ 7745 if ((flg & MDDB_DID_EXISTS) && 7746 (flg & MDDB_DID_VALID)) { 7747 if (update_valid_replica(side, lp, s, 7748 li, dname, pname, devt)) { 7749 err = 1; 7750 goto out; 7751 } 7752 } 7753 } 7754 } 7755 } 7756 lnp = s->s_lnp; 7757 uniqtime32(&lnp->ln_timestamp); 7758 if (lbp->lb_flags & MDDB_MNSET) 7759 lnp->ln_revision = MDDB_REV_MNLN; 7760 else 7761 lnp->ln_revision = MDDB_REV_LN; 7762 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 7763 err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 7764 lbp->lb_lnblkcnt, 0); 7765 /* 7766 * If a MN diskset and this is the master, set the PARSE_LOCNM 7767 * flag in the mddb_set structure to show that the locator 7768 * names have changed. 
7769 */ 7770 7771 if ((lbp->lb_flags & MDDB_MNSET) && 7772 (md_set[s->s_setno].s_am_i_master)) { 7773 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 7774 } 7775 out: 7776 single_thread_end(s); 7777 mddb_setexit(s); 7778 if (err) 7779 return (1); 7780 return (0); 7781 } 7782 7783 /* 7784 * update_locatorblock - for active entries in the locator block, check 7785 * the devt to see if it matches the given devt. If so, and 7786 * there is an associated device id which is not the same 7787 * as the passed in devid, delete old devid and add a new one. 7788 * 7789 * During import of replicated disksets, old_didptr contains 7790 * the original disk's device id. Use this device id in 7791 * addition to the devt to determine if an entry is a match 7792 * and should be updated with the new device id of the 7793 * replicated disk. Specifically, this is the case being handled: 7794 * 7795 * Original_disk Replicated_disk Disk_Available_During_Import 7796 * c1t1d0 c1t3d0 no - so old name c1t1d0 shown 7797 * c1t2d0 c1t1d0 yes - name is c1t1d0 7798 * c1t3d0 c1t2d0 yes - name is c1t2d0 7799 * 7800 * Can't just match on devt since devt for the first and third 7801 * disks will be the same, but the original disk's device id 7802 * is known and can be used to distinguish which disk's 7803 * replicated device id should be updated. 
7804 * RETURN 7805 * MDDB_E_NODEVID 7806 * MDDB_E_NOLOCBLK 7807 * 1 Error 7808 * 0 Success 7809 */ 7810 static int 7811 update_locatorblock( 7812 mddb_set_t *s, 7813 md_dev64_t dev, 7814 ddi_devid_t didptr, 7815 ddi_devid_t old_didptr 7816 ) 7817 { 7818 mddb_lb_t *lbp = NULL; 7819 mddb_locator_t *lp; 7820 int li; 7821 uint_t flg; 7822 ddi_devid_t devid_ptr; 7823 int retval = 0; 7824 char *minor_name; 7825 int repl_import_flag; 7826 7827 /* Set replicated flag if this is a replicated import */ 7828 repl_import_flag = md_get_setstatus(s->s_setno) & 7829 MD_SET_REPLICATED_IMPORT; 7830 7831 lbp = s->s_lbp; 7832 /* find replicas that haven't been deleted */ 7833 for (li = 0; li < lbp->lb_loccnt; li++) { 7834 lp = &lbp->lb_locators[li]; 7835 7836 if ((lp->l_flags & MDDB_F_DELETED)) { 7837 continue; 7838 } 7839 /* 7840 * check to see if locator devt matches given dev 7841 * and if there is a device ID associated with it 7842 */ 7843 flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags; 7844 if ((md_expldev(lp->l_dev) == dev) && 7845 (flg & MDDB_DID_EXISTS)) { 7846 if (flg & MDDB_DID_VALID) { 7847 continue; /* cont to nxt active entry */ 7848 } 7849 devid_ptr = s->s_did_icp->did_ic_devid[li]; 7850 if (devid_ptr == NULL) { 7851 return (MDDB_E_NODEVID); 7852 } 7853 7854 /* 7855 * During a replicated import the old_didptr 7856 * must match the current devid before the 7857 * devid can be updated. 
7858 */ 7859 if (repl_import_flag) { 7860 if (ddi_devid_compare(devid_ptr, 7861 old_didptr) != 0) 7862 continue; 7863 } 7864 7865 if (ddi_devid_compare(devid_ptr, didptr) != 0) { 7866 /* 7867 * devid's not equal so 7868 * delete and add 7869 */ 7870 if (ddi_lyr_get_minor_name( 7871 md_dev64_to_dev(dev), 7872 S_IFBLK, &minor_name) == DDI_SUCCESS) { 7873 (void) mddb_devid_delete(s, li); 7874 (void) mddb_devid_add(s, li, didptr, 7875 minor_name); 7876 kmem_free(minor_name, 7877 strlen(minor_name)+1); 7878 break; 7879 } else { 7880 retval = 1; 7881 goto err_out; 7882 } 7883 } 7884 } 7885 } /* end for */ 7886 retval = push_lb(s); 7887 (void) upd_med(s, "update_locatorblock(0)"); 7888 err_out: 7889 return (retval); 7890 } 7891 7892 static int 7893 update_mb_devid( 7894 mddb_set_t *s, 7895 mddb_ri_t *rip, 7896 ddi_devid_t devidptr 7897 ) 7898 { 7899 mddb_mb_ic_t *mbip; 7900 mddb_mb_t *mb = NULL; 7901 daddr_t blkno; 7902 md_dev64_t device; 7903 uint_t sz; 7904 int mb2free = 0; 7905 int err = 0; 7906 7907 7908 /* 7909 * There is case where a disk may not have mddb, 7910 * and only has dummy mddb which contains 7911 * a valid devid we like to update and in this 7912 * case, the rip_lbp will be NULL but we still 7913 * like to update the devid embedded in the 7914 * dummy mb block. 
7915 * 7916 */ 7917 if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) { 7918 mbip = rip->ri_mbip; 7919 mb = &mbip->mbi_mddb_mb; 7920 } else { 7921 /* 7922 * Done if it is non-replicated set 7923 */ 7924 if (devidptr != (ddi_devid_t)NULL) { 7925 mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE, 7926 KM_SLEEP); 7927 mb->mb_magic = MDDB_MAGIC_DU; 7928 mb->mb_revision = MDDB_REV_MB; 7929 mb2free = 1; 7930 } else { 7931 goto out; 7932 } 7933 } 7934 7935 blkno = rip->ri_blkno; 7936 device = rip->ri_dev; 7937 /* 7938 * Replace the mb_devid with the new/valid one 7939 */ 7940 if (devidptr != (ddi_devid_t)NULL) { 7941 /* 7942 * Zero out what we have previously 7943 */ 7944 if (mb->mb_devid_len) 7945 bzero(mb->mb_devid, mb->mb_devid_len); 7946 sz = ddi_devid_sizeof(devidptr); 7947 bcopy((char *)devidptr, (char *)mb->mb_devid, sz); 7948 mb->mb_devid_len = sz; 7949 } 7950 7951 mb->mb_setno = s->s_setno; 7952 uniqtime32(&mb->mb_timestamp); 7953 crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL); 7954 /* 7955 * putblks will 7956 * 7957 * - drop the s_dbmx lock 7958 * - biowait 7959 * - regain the s_dbmx lock 7960 * 7961 * Need to update this if we wants to handle 7962 * mb_next != NULL which it is unlikely will happen 7963 */ 7964 err = putblks(s, (caddr_t)mb, blkno, 1, device, 0); 7965 7966 if (mb2free) { 7967 kmem_free(mb, MDDB_BSIZE); 7968 } 7969 out: 7970 return (err); 7971 } 7972 7973 static int 7974 setdid( 7975 mddb_config_t *cp 7976 ) 7977 { 7978 ddi_devid_t devidp; 7979 dev_t ddi_dev; 7980 mddb_set_t *s; 7981 int err = 0; 7982 mddb_ri_t *rip; 7983 7984 /* 7985 * Data integrity check 7986 */ 7987 if (cp->c_setno >= md_nsets || cp->c_devt <= 0) 7988 return (EINVAL); 7989 7990 if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE)) 7991 return (0); 7992 7993 ddi_dev = md_dev64_to_dev(cp->c_devt); 7994 if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) { 7995 return (-1); 7996 } 7997 if (devidp == NULL) { 7998 return (-1); 7999 } 8000 8001 if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, 
&err)) == NULL) 8002 return (-1); 8003 single_thread_start(s); 8004 8005 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 8006 if (rip->ri_lbp == (mddb_lb_t *)NULL) 8007 continue; 8008 /* 8009 * We only update what is asked 8010 */ 8011 if (rip->ri_dev == cp->c_devt) { 8012 if (update_mb_devid(s, rip, devidp) != 0) { 8013 err = -1; 8014 goto out; 8015 } 8016 } 8017 } 8018 8019 if (update_locatorblock(s, cp->c_devt, devidp, NULL)) { 8020 err = -1; 8021 goto out; 8022 } 8023 8024 out: 8025 single_thread_end(s); 8026 mddb_setexit(s); 8027 ddi_devid_free(devidp); 8028 return (err); 8029 } 8030 8031 static int 8032 delnewside( 8033 mddb_config_t *cp, 8034 int command, 8035 md_error_t *ep 8036 ) 8037 { 8038 mddb_set_t *s; 8039 int li; 8040 mddb_lb_t *lbp; /* pointer to locator block */ 8041 mddb_ln_t *lnp; /* pointer to locator names */ 8042 mddb_mnln_t *mnlnp; /* pointer to locator names */ 8043 mddb_locator_t *lp; 8044 mddb_sidelocator_t *slp; 8045 mddb_cfg_loc_t *clp; 8046 int err = 0; 8047 set_t setno = cp->c_setno; 8048 ddi_devid_t devid; 8049 ddi_devid_t ret_devid = NULL; 8050 char *minor_name; 8051 uint_t use_devid = 0; 8052 dev_t ddi_dev; 8053 md_mnname_suffix_t *mnsn; 8054 mddb_mnlb_t *mnlbp; 8055 mddb_mnsidelocator_t *mnslp; 8056 8057 /* Currently don't allow addition/deletion of sides during upgrade */ 8058 if (MD_UPGRADE) { 8059 cmn_err(CE_WARN, 8060 "Addition and deletion of sides not allowed" 8061 " during upgrade. 
\n"); 8062 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8063 } 8064 8065 /* 8066 * Data integrity check 8067 */ 8068 if (setno >= md_nsets || cp->c_locator.l_dev <= 0) 8069 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 8070 8071 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 8072 return (mddbstatus2error(ep, err, NODEV32, setno)); 8073 8074 single_thread_start(s); 8075 clp = &cp->c_locator; 8076 8077 lbp = s->s_lbp; 8078 8079 if (lbp->lb_setno != setno) { 8080 single_thread_end(s); 8081 mddb_setexit(s); 8082 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); 8083 } 8084 8085 /* 8086 * Find this device/blkno pair 8087 */ 8088 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 8089 ddi_dev = md_dev64_to_dev(clp->l_dev); 8090 if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) && 8091 (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name) 8092 == DDI_SUCCESS)) { 8093 if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) { 8094 clp->l_devid = (uint64_t)(uintptr_t)ret_devid; 8095 use_devid = 1; 8096 (void) strcpy(clp->l_minor_name, minor_name); 8097 } 8098 kmem_free(minor_name, strlen(minor_name)+1); 8099 } 8100 if (use_devid != 1 && ret_devid != NULL) 8101 ddi_devid_free(ret_devid); 8102 } 8103 for (li = 0; li < lbp->lb_loccnt; li++) { 8104 lp = &lbp->lb_locators[li]; 8105 if (lp->l_flags & MDDB_F_DELETED) 8106 continue; 8107 if (use_devid) { 8108 if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0) 8109 continue; 8110 if ((ddi_devid_compare(devid, 8111 (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) && 8112 (strcmp(clp->l_minor_name, minor_name) == 0) && 8113 ((daddr_t)lp->l_blkno == clp->l_blkno)) { 8114 break; 8115 } 8116 } else { 8117 if (lp->l_dev == clp->l_dev && 8118 (daddr_t)lp->l_blkno == clp->l_blkno) { 8119 break; 8120 } 8121 } 8122 } 8123 8124 if (li == lbp->lb_loccnt) { 8125 if (use_devid) 8126 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8127 single_thread_end(s); 8128 mddb_setexit(s); 8129 return (mdmddberror(ep, 
MDE_DB_INVALID, NODEV32, setno)); 8130 } 8131 8132 lnp = s->s_lnp; 8133 if (command == MDDB_NEWSIDE) { 8134 int index = 0; 8135 /* 8136 * If a MN diskset, need to find the index where the new 8137 * locator information is to be stored in the mnsidelocator 8138 * field of the locator block so that the locator name can 8139 * be stored at the same array index in the mnsuffixes 8140 * field of the locator names structure. 8141 */ 8142 if (lbp->lb_flags & MDDB_MNSET) { 8143 if ((index = checklocator(lbp, li, 8144 cp->c_sideno)) == -1) { 8145 if (use_devid) { 8146 ddi_devid_free((ddi_devid_t) 8147 (uintptr_t)clp->l_devid); 8148 } 8149 single_thread_end(s); 8150 mddb_setexit(s); 8151 return (mdmddberror(ep, MDE_DB_TOOSMALL, 8152 NODEV32, setno)); 8153 } 8154 } 8155 8156 /* 8157 * Store the locator name before the sidelocator information 8158 * in case a panic occurs between these 2 steps. Must have 8159 * the locator name information in order to print reasonable 8160 * error information. 8161 */ 8162 if (splitname2locatorblock(&cp->c_devname, lnp, li, 8163 cp->c_sideno, index)) { 8164 if (use_devid) 8165 ddi_devid_free( 8166 (ddi_devid_t)(uintptr_t)clp->l_devid); 8167 single_thread_end(s); 8168 mddb_setexit(s); 8169 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, 8170 setno)); 8171 } 8172 8173 if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) { 8174 if (use_devid) 8175 ddi_devid_free( 8176 (ddi_devid_t)(uintptr_t)clp->l_devid); 8177 single_thread_end(s); 8178 mddb_setexit(s); 8179 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, 8180 setno)); 8181 } 8182 } 8183 8184 if (use_devid) 8185 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8186 8187 if (command == MDDB_DELSIDE) { 8188 int i; 8189 for (i = 0; i < lbp->lb_loccnt; i++) { 8190 if (lbp->lb_flags & MDDB_MNSET) { 8191 int j; 8192 mnlbp = (mddb_mnlb_t *)lbp; 8193 for (j = 0; j < MD_MNMAXSIDES; j++) { 8194 mnslp = &mnlbp->lb_mnsidelocators[j][i]; 8195 if (mnslp->mnl_sideno == cp->c_sideno) 8196 break; 8197 
} 8198 if (j < MD_MNMAXSIDES) { 8199 mnslp->mnl_mnum = NODEV32; 8200 mnslp->mnl_sideno = 0; 8201 mnlnp = (mddb_mnln_t *)lnp; 8202 mnsn = &(mnlnp->ln_mnsuffixes[j][i]); 8203 bzero((caddr_t)mnsn, 8204 sizeof (md_mnname_suffix_t)); 8205 } 8206 } else { 8207 slp = &lbp->lb_sidelocators[cp->c_sideno][i]; 8208 bzero((caddr_t)&lnp->ln_suffixes 8209 [cp->c_sideno][i], sizeof (md_name_suffix)); 8210 slp->l_mnum = NODEV32; 8211 } 8212 } 8213 } 8214 8215 /* write new locator names to all devices */ 8216 uniqtime32(&lnp->ln_timestamp); 8217 if (lbp->lb_flags & MDDB_MNSET) 8218 lnp->ln_revision = MDDB_REV_MNLN; 8219 else 8220 lnp->ln_revision = MDDB_REV_LN; 8221 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 8222 err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 8223 lbp->lb_lnblkcnt, 0); 8224 /* 8225 * If a MN diskset and this is the master, set the PARSE_LOCNM 8226 * flag in the mddb_set structure to show that the locator 8227 * names have changed. 8228 */ 8229 8230 if ((lbp->lb_flags & MDDB_MNSET) && 8231 (md_set[s->s_setno].s_am_i_master)) { 8232 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 8233 } 8234 if (err) { 8235 if (writeretry(s)) { 8236 single_thread_end(s); 8237 mddb_setexit(s); 8238 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8239 } 8240 } 8241 8242 uniqtime32(&lbp->lb_timestamp); 8243 /* write new locator to all devices */ 8244 err = writelocall(s); 8245 8246 (void) upd_med(s, "delnewside(0)"); 8247 8248 computefreeblks(s); /* recompute always it may be larger */ 8249 if (err) { 8250 if (writeretry(s)) { 8251 single_thread_end(s); 8252 mddb_setexit(s); 8253 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8254 } 8255 } 8256 8257 single_thread_end(s); 8258 mddb_setexit(s); 8259 8260 return (0); 8261 } 8262 8263 static int 8264 newdev( 8265 mddb_config_t *cp, 8266 int command, 8267 md_error_t *ep 8268 ) 8269 { 8270 mddb_set_t *s; 8271 mddb_mb_ic_t *mbip, *mbip1; 8272 int i, j; 8273 int li; 8274 mddb_lb_t *lbp; /* pointer to locator 
block */ 8275 mddb_ln_t *lnp; /* pointer to locator names */ 8276 mddb_locator_t *lp; 8277 mddb_cfg_loc_t *clp; 8278 int err