1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * mirror operations 30 */ 31 32 #include <meta.h> 33 #include <sys/lvm/md_mirror.h> 34 #include <thread.h> 35 36 extern int md_in_daemon; 37 extern md_mn_client_list_t *mdmn_clients; 38 39 /* 40 * chain of mirrors 41 */ 42 typedef struct mm_unit_list { 43 struct mm_unit_list *next; /* next in chain */ 44 mdname_t *namep; /* mirror name */ 45 mm_pass_num_t pass; /* pass number */ 46 uint_t done; /* resync done */ 47 } mm_unit_list_t; 48 49 /* 50 * resync mirror 51 * meta_lock for this set should be held on entry. 52 */ 53 int 54 meta_mirror_resync( 55 mdsetname_t *sp, 56 mdname_t *mirnp, 57 daddr_t size, 58 md_error_t *ep, 59 md_resync_cmd_t cmd /* Start/Block/Unblock/Kill */ 60 ) 61 { 62 char *miscname; 63 md_resync_ioctl_t ri; 64 65 /* should have a set */ 66 assert(sp != NULL); 67 assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev))); 68 69 /* make sure we have a mirror */ 70 if ((miscname = metagetmiscname(mirnp, ep)) == NULL) 71 return (-1); 72 if (strcmp(miscname, MD_MIRROR) != 0) { 73 return (mdmderror(ep, MDE_NOT_MM, meta_getminor(mirnp->dev), 74 mirnp->cname)); 75 } 76 77 /* start resync */ 78 (void) memset(&ri, 0, sizeof (ri)); 79 MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno); 80 ri.ri_mnum = meta_getminor(mirnp->dev); 81 ri.ri_copysize = size; 82 switch (cmd) { 83 case MD_RESYNC_FORCE_MNSTART: 84 ri.ri_flags |= MD_RI_RESYNC_FORCE_MNSTART; 85 break; 86 case MD_RESYNC_START: 87 ri.ri_flags = 0; 88 break; 89 case MD_RESYNC_BLOCK: 90 ri.ri_flags = MD_RI_BLOCK; 91 break; 92 case MD_RESYNC_UNBLOCK: 93 ri.ri_flags = MD_RI_UNBLOCK; 94 break; 95 case MD_RESYNC_KILL: 96 ri.ri_flags = MD_RI_KILL; 97 break; 98 case MD_RESYNC_KILL_NO_WAIT: 99 ri.ri_flags = MD_RI_KILL | MD_RI_NO_WAIT; 100 break; 101 default: 102 /* TODO: Add new error MDE_BAD_RESYNC_FLAGS */ 103 return (mderror(ep, MDE_BAD_RESYNC_OPT, mirnp->cname)); 104 } 105 106 if (metaioctl(MD_IOCSETSYNC, &ri, &ri.mde, mirnp->cname) != 0) 107 return (mdstealerror(ep, &ri.mde)); 108 109 /* return success */ 110 return (0); 111 } 112 113 /* 114 * free units 115 */ 116 static void 117 free_units( 118 mm_unit_list_t *mirrors[MD_PASS_MAX + 1] 119 ) 120 { 121 uint_t i; 122 123 for (i = 0; (i < (MD_PASS_MAX + 1)); ++i) { 124 mm_unit_list_t *p, *n; 125 126 for (p = mirrors[i], n = NULL; (p != NULL); p = n) { 127 n = p->next; 128 Free(p); 129 } 130 mirrors[i] = NULL; 131 } 132 } 133 134 /* 135 * setup_units: build lists of units for each pass 136 */ 137 static int 138 setup_units( 139 mdsetname_t *sp, 140 mm_unit_list_t *mirrors[MD_PASS_MAX + 1], 141 md_error_t *ep 142 ) 143 { 144 mdnamelist_t *mirrornlp = NULL; 145 mdnamelist_t *p; 146 int rval = 0; 147 148 /* should have a set */ 149 assert(sp != NULL); 150 151 /* for each mirror */ 152 if (meta_get_mirror_names(sp, &mirrornlp, 0, ep) < 0) 153 return (-1); 154 for (p = mirrornlp; (p != NULL); p = p->next) { 155 md_mirror_t *mirrorp; 156 mm_unit_list_t *lp; 157 158 /* get unit structure */ 159 if ((mirrorp = meta_get_mirror(sp, p->namep, ep)) == NULL) { 160 rval = -1; /* record, but ignore errors */ 161 continue; 162 } 163 164 /* save info */ 165 lp = Zalloc(sizeof (*lp)); 166 lp->namep = p->namep; 167 lp->pass = mirrorp->pass_num; 168 if ((lp->pass < 0) || (lp->pass > MD_PASS_MAX)) 169 lp->pass = MD_PASS_MAX; 170 171 /* put on list */ 172 lp->next = mirrors[lp->pass]; 173 mirrors[lp->pass] = lp; 174 } 175 176 /* cleanup, return error */ 177 metafreenamelist(mirrornlp); 178 return (rval); 179 } 180 181 /* 182 * resync all mirrors (in background) 183 */ 184 int 185 meta_mirror_resync_all( 186 mdsetname_t *sp, 187 daddr_t size, 188 md_error_t *ep 189 ) 190 { 191 mm_unit_list_t *mirrors[MD_PASS_MAX + 1]; 192 mm_pass_num_t pass, max_pass; 193 int rval = 0, fval; 194 195 /* should have a set */ 196 assert(sp != NULL); 197 198 /* get mirrors */ 199 (void) memset(mirrors, 0, sizeof (mirrors)); 200 if (setup_units(sp, mirrors, ep) != 0) 201 return (-1); 202 203 /* fork a process */ 204 if ((fval = md_daemonize(sp, ep)) != 0) { 205 /* 206 * md_daemonize will fork off a process. The is the 207 * parent or error. 208 */ 209 if (fval > 0) { 210 free_units(mirrors); 211 return (0); 212 } 213 mdclrerror(ep); 214 } 215 /* 216 * Closing stdin/out/err here. 217 * In case this was called thru rsh, the calling process on the other 218 * side will know, it doesn't have to wait until all the resyncs have 219 * finished. 220 * Also initialise the rpc client pool so that this process will use 221 * a unique pool of clients. If we don't do this, all of the forked 222 * clients will end up using the same pool of clients which can result 223 * in hung clients. 224 */ 225 if (meta_is_mn_set(sp, ep)) { 226 (void) close(0); 227 (void) close(1); 228 (void) close(2); 229 mdmn_clients = NULL; 230 } 231 assert((fval == 0) || (fval == -1)); 232 233 /* 234 * Determine which pass level is the highest that contains mirrors to 235 * resync. We only need to wait for completion of earlier levels below 236 * this high watermark. If all mirrors are at the same pass level 237 * there is no requirement to wait for completion. 238 */ 239 240 max_pass = 1; 241 for (pass = MD_PASS_MAX; pass > 1; --pass) { 242 if (mirrors[pass] != NULL) { 243 max_pass = pass; 244 break; 245 } 246 } 247 248 /* 249 * max_pass now contains the highest pass-level with resyncable mirrors 250 */ 251 252 /* do passes */ 253 for (pass = 1; (pass <= MD_PASS_MAX); ++pass) { 254 int dispatched = 0; 255 unsigned howlong = 1; 256 mm_unit_list_t *lp; 257 258 /* skip empty passes */ 259 if (mirrors[pass] == NULL) 260 continue; 261 262 /* dispatch all resyncs in pass */ 263 for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) { 264 if (meta_is_mn_set(sp, ep)) { 265 if (meta_mn_send_setsync(sp, lp->namep, 266 size, ep) != 0) { 267 rval = -1; 268 lp->done = 1; 269 } else { 270 ++dispatched; 271 } 272 } else { 273 if (meta_mirror_resync(sp, lp->namep, size, ep, 274 MD_RESYNC_START) != 0) { 275 rval = -1; 276 lp->done = 1; 277 } else { 278 ++dispatched; 279 } 280 } 281 } 282 283 /* 284 * Wait for them to finish iff we are at a level lower than 285 * max_pass. This orders the resyncs into distinct levels. 286 * I.e. level 2 resyncs won't start until all level 1 ones 287 * have completed. 288 */ 289 if (pass == max_pass) 290 continue; 291 292 howlong = 1; 293 while (dispatched > 0) { 294 295 /* wait a while */ 296 (void) sleep(howlong); 297 298 /* see if any finished */ 299 for (lp = mirrors[pass]; lp != NULL; lp = lp->next) { 300 md_resync_ioctl_t ri; 301 302 if (lp->done) 303 continue; 304 305 (void) memset(&ri, '\0', sizeof (ri)); 306 ri.ri_mnum = meta_getminor(lp->namep->dev); 307 MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno); 308 if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde, 309 lp->namep->cname) != 0) { 310 (void) mdstealerror(ep, &ri.mde); 311 rval = -1; 312 lp->done = 1; 313 --dispatched; 314 } else if (! (ri.ri_flags & MD_RI_INPROGRESS)) { 315 lp->done = 1; 316 --dispatched; 317 } 318 } 319 320 /* wait a little longer next time */ 321 if (howlong < 10) 322 ++howlong; 323 } 324 } 325 326 /* cleanup, return success */ 327 free_units(mirrors); 328 if (fval == 0) /* we are the child process so exit */ 329 exit(0); 330 return (rval); 331 } 332 333 /* 334 * meta_mn_mirror_resync_all: 335 * ------------------------- 336 * Resync all mirrors associated with given set (arg). Called when master 337 * node is adding a node to a diskset. Only want to initiate the resync on 338 * the current node. 339 */ 340 void * 341 meta_mn_mirror_resync_all(void *arg) 342 { 343 set_t setno = *((set_t *)arg); 344 mdsetname_t *sp; 345 mm_unit_list_t *mirrors[MD_PASS_MAX + 1]; 346 mm_pass_num_t pass, max_pass; 347 md_error_t mde = mdnullerror; 348 int fval; 349 350 351 /* should have a set */ 352 assert(setno != NULL); 353 354 if ((sp = metasetnosetname(setno, &mde)) == NULL) { 355 mde_perror(&mde, ""); 356 return (NULL); 357 } 358 359 if (!(meta_is_mn_set(sp, &mde))) { 360 mde_perror(&mde, ""); 361 return (NULL); 362 } 363 364 /* fork a process */ 365 if ((fval = md_daemonize(sp, &mde)) != 0) { 366 /* 367 * md_daemonize will fork off a process. The is the 368 * parent or error. 369 */ 370 if (fval > 0) { 371 return (NULL); 372 } 373 mde_perror(&mde, ""); 374 return (NULL); 375 } 376 /* 377 * Child process should never return back to rpc.metad, but 378 * should exit. 379 * Flush all internally cached data inherited from parent process 380 * since cached data will be cleared when parent process RPC request 381 * has completed (which is possibly before this child process 382 * can complete). 383 * Child process can retrieve and cache its own copy of data from 384 * rpc.metad that won't be changed by the parent process. 385 * 386 * Reset md_in_daemon since this child will be a client of rpc.metad 387 * not part of the rpc.metad daemon itself. 388 * md_in_daemon is used by rpc.metad so that libmeta can tell if 389 * this thread is rpc.metad or any other thread. (If this thread 390 * was rpc.metad it could use some short circuit code to get data 391 * directly from rpc.metad instead of doing an RPC call to rpc.metad). 392 */ 393 md_in_daemon = 0; 394 metaflushsetname(sp); 395 sr_cache_flush_setno(setno); 396 if ((sp = metasetnosetname(setno, &mde)) == NULL) { 397 mde_perror(&mde, ""); 398 md_exit(sp, 1); 399 } 400 401 if (meta_lock(sp, TRUE, &mde) != 0) { 402 mde_perror(&mde, ""); 403 md_exit(sp, 1); 404 } 405 406 /* 407 * Closing stdin/out/err here. 408 */ 409 (void) close(0); 410 (void) close(1); 411 (void) close(2); 412 assert(fval == 0); 413 414 /* get mirrors */ 415 (void) memset(mirrors, 0, sizeof (mirrors)); 416 if (setup_units(sp, mirrors, &mde) != 0) { 417 (void) meta_unlock(sp, &mde); 418 md_exit(sp, 1); 419 } 420 421 /* 422 * Determine which pass level is the highest that contains mirrors to 423 * resync. We only need to wait for completion of earlier levels below 424 * this high watermark. If all mirrors are at the same pass level 425 * there is no requirement to wait for completion. 426 */ 427 max_pass = 1; 428 for (pass = MD_PASS_MAX; pass > 1; --pass) { 429 if (mirrors[pass] != NULL) { 430 max_pass = pass; 431 break; 432 } 433 } 434 435 /* 436 * max_pass now contains the highest pass-level with resyncable mirrors 437 */ 438 /* do passes */ 439 for (pass = 1; (pass <= MD_PASS_MAX); ++pass) { 440 int dispatched = 0; 441 unsigned howlong = 1; 442 mm_unit_list_t *lp; 443 444 /* skip empty passes */ 445 if (mirrors[pass] == NULL) 446 continue; 447 448 /* dispatch all resyncs in pass */ 449 for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) { 450 if (meta_mirror_resync(sp, lp->namep, 0, &mde, 451 MD_RESYNC_FORCE_MNSTART) != 0) { 452 mdclrerror(&mde); 453 lp->done = 1; 454 } else { 455 ++dispatched; 456 } 457 } 458 459 /* 460 * Wait for them to finish iff we are at a level lower than 461 * max_pass. This orders the resyncs into distinct levels. 462 * I.e. level 2 resyncs won't start until all level 1 ones 463 * have completed. 464 */ 465 if (pass == max_pass) 466 continue; 467 468 howlong = 1; 469 while (dispatched > 0) { 470 471 /* wait a while */ 472 (void) sleep(howlong); 473 474 /* see if any finished */ 475 for (lp = mirrors[pass]; lp != NULL; lp = lp->next) { 476 md_resync_ioctl_t ri; 477 478 if (lp->done) 479 continue; 480 481 (void) memset(&ri, '\0', sizeof (ri)); 482 ri.ri_mnum = meta_getminor(lp->namep->dev); 483 MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno); 484 if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde, 485 lp->namep->cname) != 0) { 486 mdclrerror(&mde); 487 lp->done = 1; 488 --dispatched; 489 } else if (! (ri.ri_flags & MD_RI_INPROGRESS)) { 490 lp->done = 1; 491 --dispatched; 492 } 493 } 494 495 /* wait a little longer next time */ 496 if (howlong < 10) 497 ++howlong; 498 } 499 } 500 501 /* cleanup, return success */ 502 free_units(mirrors); 503 (void) meta_unlock(sp, &mde); 504 md_exit(sp, 0); 505 /*NOTREACHED*/ 506 return (NULL); 507 } 508 509 /* 510 * meta_mirror_resync_process: 511 * -------------------------- 512 * Modify any resync that is in progress on this node for the given set. 513 * 514 * Input Parameters: 515 * sp setname to scan for mirrors 516 * cmd action to take: 517 * MD_RESYNC_KILL - kill all resync threads 518 * MD_RESYNC_BLOCK - block all resync threads 519 * MD_RESYNC_UNBLOCK - resume all resync threads 520 * Output Parameters 521 * ep error return structure 522 * 523 * meta_lock for this set should be held on entry. 524 */ 525 static void 526 meta_mirror_resync_process(mdsetname_t *sp, md_error_t *ep, md_resync_cmd_t cmd) 527 { 528 mm_unit_list_t *mirrors[MD_PASS_MAX + 1]; 529 mm_pass_num_t pass; 530 531 /* Grab all the mirrors from the set (if any) */ 532 (void) memset(mirrors, 0, sizeof (mirrors)); 533 if (setup_units(sp, mirrors, ep) != 0) 534 return; 535 536 /* do passes */ 537 for (pass = 1; (pass <= MD_PASS_MAX); ++pass) { 538 mm_unit_list_t *lp; 539 540 /* skip empty passes */ 541 if (mirrors[pass] == NULL) 542 continue; 543 544 /* Process all resyncs in pass */ 545 for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) { 546 (void) meta_mirror_resync(sp, lp->namep, 0, ep, 547 cmd); 548 } 549 } 550 551 /* Clear up mirror units */ 552 free_units(mirrors); 553 } 554 555 /* 556 * meta_mirror_resync_process_all: 557 * ------------------------------ 558 * Issue the given resync command to all mirrors contained in all multi-node 559 * sets. 560 * 561 * Input Parameters: 562 * cmd - MD_RESYNC_KILL, MD_RESYNC_BLOCK, MD_RESYNC_UNBLOCK 563 */ 564 static void 565 meta_mirror_resync_process_all(md_resync_cmd_t cmd) 566 { 567 set_t setno, max_sets; 568 md_error_t mde = mdnullerror; 569 mdsetname_t *this_sp; 570 md_set_desc *sd; 571 572 /* 573 * Traverse all sets looking for multi-node capable ones. 574 */ 575 max_sets = get_max_sets(&mde); 576 for (setno = 1; setno < max_sets; setno++) { 577 mde = mdnullerror; 578 if (this_sp = metasetnosetname(setno, &mde)) { 579 if ((sd = metaget_setdesc(this_sp, &mde)) == NULL) 580 continue; 581 if (!MD_MNSET_DESC(sd)) 582 continue; 583 584 if (meta_lock(this_sp, TRUE, &mde)) { 585 continue; 586 } 587 meta_mirror_resync_process(this_sp, &mde, cmd); 588 (void) meta_unlock(this_sp, &mde); 589 } 590 } 591 } 592 593 /* 594 * meta_mirror_resync_kill_all: 595 * --------------------------- 596 * Abort any resync that is in progress on this node. Scan all sets for all 597 * mirrors. 598 * Note: this routine is provided for future use. For example to kill all 599 * resyncs on a node this could be used as long as the 600 * mddoors / rpc.mdcommd tuple is running on all members of the cluster. 601 */ 602 void 603 meta_mirror_resync_kill_all(void) 604 { 605 meta_mirror_resync_process_all(MD_RESYNC_KILL); 606 } 607 608 /* 609 * meta_mirror_resync_block_all: 610 * ---------------------------- 611 * Block all resyncs that are in progress. This causes the resync state to 612 * freeze on this machine, and can be resumed by calling 613 * meta_mirror_resync_unblock_all. 614 */ 615 void 616 meta_mirror_resync_block_all(void) 617 { 618 meta_mirror_resync_process_all(MD_RESYNC_BLOCK); 619 } 620 621 /* 622 * meta_mirror_resync_unblock_all: 623 * ------------------------------ 624 * Unblock all previously blocked resync threads on this node. 625 */ 626 void 627 meta_mirror_resync_unblock_all(void) 628 { 629 meta_mirror_resync_process_all(MD_RESYNC_UNBLOCK); 630 } 631 632 /* 633 * meta_mirror_resync_unblock: 634 * -------------------------- 635 * Unblock any previously blocked resync threads for the given set. 636 * meta_lock for this set should be held on entry. 637 */ 638 void 639 meta_mirror_resync_unblock(mdsetname_t *sp) 640 { 641 md_error_t mde = mdnullerror; 642 643 meta_mirror_resync_process(sp, &mde, MD_RESYNC_UNBLOCK); 644 } 645 646 /* 647 * meta_mirror_resync_kill: 648 * ----------------------- 649 * Kill any resync threads running on mirrors in the given set. 650 * Called when releasing a set (meta_set_prv.c`halt_set) 651 */ 652 void 653 meta_mirror_resync_kill(mdsetname_t *sp) 654 { 655 md_error_t mde = mdnullerror; 656 657 meta_mirror_resync_process(sp, &mde, MD_RESYNC_KILL); 658 } 659