1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Driver for Virtual Disk. 
29 */ 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/buf.h> 33 #include <sys/conf.h> 34 #include <sys/user.h> 35 #include <sys/uio.h> 36 #include <sys/proc.h> 37 #include <sys/t_lock.h> 38 #include <sys/dkio.h> 39 #include <sys/kmem.h> 40 #include <sys/debug.h> 41 #include <sys/cmn_err.h> 42 #include <sys/sysmacros.h> 43 #include <sys/types.h> 44 #include <sys/mkdev.h> 45 #include <sys/vtoc.h> 46 #include <sys/open.h> 47 #include <sys/file.h> 48 #include <vm/page.h> 49 #include <sys/callb.h> 50 #include <sys/disp.h> 51 #include <sys/modctl.h> 52 #include <sys/errno.h> 53 #include <sys/door.h> 54 #include <sys/lvm/mdmn_commd.h> 55 #include <sys/lvm/md_hotspares.h> 56 57 #include <sys/lvm/mdvar.h> 58 #include <sys/lvm/md_names.h> 59 60 #include <sys/ddi.h> 61 #include <sys/proc.h> 62 #include <sys/sunddi.h> 63 #include <sys/esunddi.h> 64 65 #include <sys/sysevent.h> 66 #include <sys/sysevent/eventdefs.h> 67 68 #include <sys/sysevent/svm.h> 69 #include <sys/lvm/md_basic.h> 70 71 72 /* 73 * Machine specific Hertz is kept here 74 */ 75 extern clock_t md_hz; 76 77 /* 78 * Externs. 
*/
extern int (*mdv_strategy_tstpnt)(buf_t *, int, void*);
extern major_t md_major;
extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];
extern md_set_io_t md_set_io[];
extern md_ops_t **md_ops;
extern md_ops_t *md_opslist;
extern ddi_modhandle_t *md_mods;
extern dev_info_t *md_devinfo;

extern md_krwlock_t md_unit_array_rw;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;

extern md_krwlock_t hsp_rwlp;
extern md_krwlock_t ni_rwlp;

extern int md_num_daemons;
extern int md_status;
extern int md_ioctl_cnt;
extern int md_mtioctl_cnt;

extern struct metatransops metatransops;
extern md_event_queue_t *md_event_queue;
extern md_resync_t md_cpr_resync;
extern int md_done_daemon_threads;
extern int md_ff_daemon_threads;


extern mddb_set_t *mddb_setenter(set_t setno, int flag, int *errorcodep);
extern void mddb_setexit(mddb_set_t *s);
extern void *lookup_entry(struct nm_next_hdr *, set_t,
	side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr *get_first_record(set_t, int, int);
extern dev_t getrootdev(void);

struct mdq_anchor	md_done_daemon;		/* done request queue */
struct mdq_anchor	md_mstr_daemon;		/* mirror error, WOW requests */
struct mdq_anchor	md_mhs_daemon;	/* mirror hotspare requests queue */
struct mdq_anchor	md_hs_daemon;	/* raid hotspare requests queue */
struct mdq_anchor	md_ff_daemonq;		/* failfast request queue */
struct mdq_anchor	md_mirror_daemon;	/* mirror owner queue */
struct mdq_anchor	md_mirror_io_daemon;	/* mirror owner i/o queue */
struct mdq_anchor	md_mirror_rs_daemon;	/* mirror resync done queue */
struct mdq_anchor	md_sp_daemon;	/* soft-part error daemon queue */
struct mdq_anchor	md_mto_daemon;	/* mirror timeout daemon queue */

int md_done_daemon_threads = 1;	/* threads for md_done_daemon requestq */
int md_mstr_daemon_threads = 1;	/* threads for md_mstr_daemon requestq */
int md_mhs_daemon_threads = 1;	/* threads for md_mhs_daemon requestq */
int md_hs_daemon_threads = 1;	/* threads for md_hs_daemon requestq */
int md_ff_daemon_threads = 3;	/* threads for md_ff_daemon requestq */
int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */
int md_sp_daemon_threads = 1;	/* threads for md_sp_daemon requestq */
int md_mto_daemon_threads = 1;	/* threads for md_mto_daemon requestq */

#ifdef DEBUG
/* Flag to switch on debug messages */
int md_release_reacquire_debug = 0;	/* debug flag */
#endif

/*
 *
 * md_daemon_queues is a table that pairs each request queue with a pointer
 * to the variable holding the number of threads that service that queue.
 * When the number of threads is set to 1, then the order of execution is
 * sequential.
 * The thread counts for all the queues are defined as global variables to
 * enable kernel tuning.  The three mirror owner queues (md_mirror_daemon,
 * md_mirror_io_daemon and md_mirror_rs_daemon) share the single
 * md_mirror_daemon_threads tunable.
 *
 * The table is terminated by a {0, 0} entry; MD_DAEMON_QUEUES counts the
 * ten real queues plus that terminator and must be kept in step with the
 * initializer below.
 *
 */

#define	MD_DAEMON_QUEUES 11

md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
	{&md_done_daemon, &md_done_daemon_threads},
	{&md_mstr_daemon, &md_mstr_daemon_threads},
	{&md_hs_daemon, &md_hs_daemon_threads},
	{&md_ff_daemonq, &md_ff_daemon_threads},
	{&md_mirror_daemon, &md_mirror_daemon_threads},
	{&md_mirror_io_daemon, &md_mirror_daemon_threads},
	{&md_mirror_rs_daemon, &md_mirror_daemon_threads},
	{&md_sp_daemon, &md_sp_daemon_threads},
	{&md_mhs_daemon, &md_mhs_daemon_threads},
	{&md_mto_daemon, &md_mto_daemon_threads},
	{0, 0}
};

/*
 * Number of times a message is retried before issuing a warning to the operator
 */
#define	MD_MN_WARN_INTVL	10

/*
 * Setting retry cnt to one (pre decremented) so that we actually do no
 * retries when committing/deleting a mddb rec. The underlying disk driver
 * does several retries to check if the disk is really dead or not so there
 * is no reason for us to retry on top of the driver's retries.
 */

uint_t md_retry_cnt = 1; /* global so it can be patched */

/*
 * How many times to try to do the door_ki_upcall() in mdmn_ksend_message.
 * Again, made patchable here should it prove useful.
 */
uint_t	md_send_retry_limit = 30;

/*
 * Bug # 1212146
 * Before this change the user had to pass in a short aligned buffer because of
 * problems in some underlying device drivers.  This problem seems to have been
 * corrected in the underlying drivers so we will default to not requiring any
 * alignment.  If the user needs to check for a specific alignment,
 * md_uio_alignment_mask may be set in /etc/system to accomplish this.  To get
 * the behavior before this fix, the md_uio_alignment_mask would be set to 1,
 * to check for word alignment, it can be set to 3, for double word alignment,
 * it can be set to 7, etc.
199 * 200 * [Other part of fix is in function md_chk_uio()] 201 */ 202 static int md_uio_alignment_mask = 0; 203 204 /* 205 * for md_dev64_t translation 206 */ 207 struct md_xlate_table *md_tuple_table; 208 struct md_xlate_major_table *md_major_tuple_table; 209 int md_tuple_length; 210 uint_t md_majortab_len; 211 212 /* Function declarations */ 213 214 static int md_create_probe_rqlist(md_probedev_impl_t *plist, 215 daemon_queue_t **hdr, intptr_t (*probe_test)()); 216 217 /* 218 * manipulate global status 219 */ 220 void 221 md_set_status(int bits) 222 { 223 mutex_enter(&md_mx); 224 md_status |= bits; 225 mutex_exit(&md_mx); 226 } 227 228 void 229 md_clr_status(int bits) 230 { 231 mutex_enter(&md_mx); 232 md_status &= ~bits; 233 mutex_exit(&md_mx); 234 } 235 236 int 237 md_get_status() 238 { 239 int result; 240 mutex_enter(&md_mx); 241 result = md_status; 242 mutex_exit(&md_mx); 243 return (result); 244 } 245 246 void 247 md_set_setstatus(set_t setno, int bits) 248 { 249 ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS); 250 251 mutex_enter(&md_mx); 252 md_set[setno].s_status |= bits; 253 mutex_exit(&md_mx); 254 } 255 256 void 257 md_clr_setstatus(set_t setno, int bits) 258 { 259 ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS); 260 261 mutex_enter(&md_mx); 262 md_set[setno].s_status &= ~bits; 263 mutex_exit(&md_mx); 264 } 265 266 uint_t 267 md_get_setstatus(set_t setno) 268 { 269 uint_t result; 270 271 ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS); 272 273 mutex_enter(&md_mx); 274 result = md_set[setno].s_status; 275 mutex_exit(&md_mx); 276 return (result); 277 } 278 279 /* 280 * md_unit_readerlock_common: 281 * ------------------------- 282 * Mark the given unit as having a reader reference. Spin waiting for any 283 * writer references to be released. 
284 * 285 * Input: 286 * ui unit reference 287 * lock_held 0 => ui_mx needs to be grabbed 288 * 1 => ui_mx already held 289 * Output: 290 * mm_unit_t corresponding to unit structure 291 * ui->ui_readercnt incremented 292 */ 293 static void * 294 md_unit_readerlock_common(mdi_unit_t *ui, int lock_held) 295 { 296 uint_t flag = MD_UL_WRITER | MD_UL_WANABEWRITER; 297 298 if (!lock_held) 299 mutex_enter(&ui->ui_mx); 300 while (ui->ui_lock & flag) { 301 if (panicstr) { 302 if (ui->ui_lock & MD_UL_WRITER) 303 panic("md: writer lock is held"); 304 break; 305 } 306 cv_wait(&ui->ui_cv, &ui->ui_mx); 307 } 308 ui->ui_readercnt++; 309 if (!lock_held) 310 mutex_exit(&ui->ui_mx); 311 return (MD_UNIT(ui->ui_link.ln_id)); 312 } 313 314 void * 315 md_unit_readerlock(mdi_unit_t *ui) 316 { 317 return (md_unit_readerlock_common(ui, 0)); 318 } 319 320 /* 321 * md_unit_writerlock_common: 322 * ------------------------- 323 * Acquire a unique writer reference. Causes previous readers to drain. 324 * Spins if a writer reference already exists or if a previous reader/writer 325 * dropped the lock to allow a ksend_message to be despatched. 
326 * 327 * Input: 328 * ui unit reference 329 * lock_held 0 => grab ui_mx 330 * 1 => ui_mx already held on entry 331 * Output: 332 * mm_unit_t reference 333 */ 334 static void * 335 md_unit_writerlock_common(mdi_unit_t *ui, int lock_held) 336 { 337 uint_t flag = MD_UL_WRITER; 338 339 if (panicstr) 340 panic("md: writer lock not allowed"); 341 342 if (!lock_held) 343 mutex_enter(&ui->ui_mx); 344 345 while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) { 346 ui->ui_wanabecnt++; 347 ui->ui_lock |= MD_UL_WANABEWRITER; 348 cv_wait(&ui->ui_cv, &ui->ui_mx); 349 if (--ui->ui_wanabecnt == 0) 350 ui->ui_lock &= ~MD_UL_WANABEWRITER; 351 } 352 ui->ui_lock |= MD_UL_WRITER; 353 ui->ui_owner = curthread; 354 355 if (!lock_held) 356 mutex_exit(&ui->ui_mx); 357 return (MD_UNIT(ui->ui_link.ln_id)); 358 } 359 360 void * 361 md_unit_writerlock(mdi_unit_t *ui) 362 { 363 return (md_unit_writerlock_common(ui, 0)); 364 } 365 366 /* 367 * md_unit_readerexit_common: 368 * ------------------------- 369 * Release the readerlock for the specified unit. If the reader count reaches 370 * zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up. 371 * 372 * Input: 373 * ui unit reference 374 * lock_held 0 => ui_mx needs to be acquired 375 * 1 => ui_mx already held 376 */ 377 static void 378 md_unit_readerexit_common(mdi_unit_t *ui, int lock_held) 379 { 380 if (!lock_held) 381 mutex_enter(&ui->ui_mx); 382 ASSERT((ui->ui_lock & MD_UL_WRITER) == 0); 383 ASSERT(ui->ui_readercnt != 0); 384 ui->ui_readercnt--; 385 if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0)) 386 cv_broadcast(&ui->ui_cv); 387 388 if (!lock_held) 389 mutex_exit(&ui->ui_mx); 390 } 391 392 void 393 md_unit_readerexit(mdi_unit_t *ui) 394 { 395 md_unit_readerexit_common(ui, 0); 396 } 397 398 /* 399 * md_unit_writerexit_common: 400 * ------------------------- 401 * Release the writerlock currently held on the unit. Wake any threads waiting 402 * on becoming reader or writer (MD_UL_WANABEWRITER set). 
403 * 404 * Input: 405 * ui unit reference 406 * lock_held 0 => ui_mx to be acquired 407 * 1 => ui_mx already held 408 */ 409 static void 410 md_unit_writerexit_common(mdi_unit_t *ui, int lock_held) 411 { 412 if (!lock_held) 413 mutex_enter(&ui->ui_mx); 414 ASSERT((ui->ui_lock & MD_UL_WRITER) != 0); 415 ASSERT(ui->ui_readercnt == 0); 416 ui->ui_lock &= ~MD_UL_WRITER; 417 ui->ui_owner = NULL; 418 419 cv_broadcast(&ui->ui_cv); 420 if (!lock_held) 421 mutex_exit(&ui->ui_mx); 422 } 423 424 void 425 md_unit_writerexit(mdi_unit_t *ui) 426 { 427 md_unit_writerexit_common(ui, 0); 428 } 429 430 void * 431 md_io_readerlock(mdi_unit_t *ui) 432 { 433 md_io_lock_t *io = ui->ui_io_lock; 434 435 ASSERT(io); /* checks case where no io lock allocated */ 436 mutex_enter(&io->io_mx); 437 while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) { 438 if (panicstr) { 439 if (io->io_lock & MD_UL_WRITER) 440 panic("md: writer lock is held"); 441 break; 442 } 443 cv_wait(&io->io_cv, &io->io_mx); 444 } 445 io->io_readercnt++; 446 mutex_exit(&io->io_mx); 447 return (MD_UNIT(ui->ui_link.ln_id)); 448 } 449 450 void * 451 md_io_writerlock(mdi_unit_t *ui) 452 { 453 md_io_lock_t *io = ui->ui_io_lock; 454 455 ASSERT(io); /* checks case where no io lock allocated */ 456 if (panicstr) 457 panic("md: writer lock not allowed"); 458 459 mutex_enter(&io->io_mx); 460 while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) { 461 io->io_wanabecnt++; 462 io->io_lock |= MD_UL_WANABEWRITER; 463 cv_wait(&io->io_cv, &io->io_mx); 464 if (--io->io_wanabecnt == 0) 465 io->io_lock &= ~MD_UL_WANABEWRITER; 466 } 467 io->io_lock |= MD_UL_WRITER; 468 io->io_owner = curthread; 469 470 mutex_exit(&io->io_mx); 471 return (MD_UNIT(ui->ui_link.ln_id)); 472 } 473 474 void 475 md_io_readerexit(mdi_unit_t *ui) 476 { 477 md_io_lock_t *io = ui->ui_io_lock; 478 479 mutex_enter(&io->io_mx); 480 ASSERT((io->io_lock & MD_UL_WRITER) == 0); 481 ASSERT(io->io_readercnt != 0); 482 io->io_readercnt--; 483 if ((io->io_wanabecnt 
!= 0) && (io->io_readercnt == 0)) { 484 cv_broadcast(&io->io_cv); 485 } 486 mutex_exit(&io->io_mx); 487 } 488 489 void 490 md_io_writerexit(mdi_unit_t *ui) 491 { 492 md_io_lock_t *io = ui->ui_io_lock; 493 494 mutex_enter(&io->io_mx); 495 ASSERT((io->io_lock & MD_UL_WRITER) != 0); 496 ASSERT(io->io_readercnt == 0); 497 io->io_lock &= ~MD_UL_WRITER; 498 io->io_owner = NULL; 499 500 cv_broadcast(&io->io_cv); 501 mutex_exit(&io->io_mx); 502 } 503 504 /* 505 * Attempt to grab that set of locks defined as global. 506 * A mask containing the set of global locks that are owned upon 507 * entry is input. Any additional global locks are then grabbed. 508 * This keeps the caller from having to know the set of global 509 * locks. 510 */ 511 static int 512 md_global_lock_enter(int global_locks_owned_mask) 513 { 514 515 /* 516 * The current implementation has been verified by inspection 517 * and test to be deadlock free. If another global lock is 518 * added, changing the algorithm used by this function should 519 * be considered. With more than 2 locks it is difficult to 520 * guarantee that locks are being acquired in the correct order. 521 * The safe approach would be to drop all of the locks that are 522 * owned at function entry and then reacquire all of the locks 523 * in the order defined by the lock hierarchy. 
524 */ 525 mutex_enter(&md_mx); 526 if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) { 527 while ((md_mtioctl_cnt != 0) || 528 (md_status & MD_GBL_IOCTL_LOCK)) { 529 if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) { 530 mutex_exit(&md_mx); 531 return (EINTR); 532 } 533 } 534 md_status |= MD_GBL_IOCTL_LOCK; 535 md_ioctl_cnt++; 536 } 537 if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) { 538 while (md_status & MD_GBL_HS_LOCK) { 539 if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) { 540 md_status &= ~MD_GBL_IOCTL_LOCK; 541 mutex_exit(&md_mx); 542 return (EINTR); 543 } 544 } 545 md_status |= MD_GBL_HS_LOCK; 546 } 547 mutex_exit(&md_mx); 548 return (0); 549 } 550 551 /* 552 * Release the set of global locks that were grabbed in md_global_lock_enter 553 * that were not already owned by the calling thread. The set of previously 554 * owned global locks is passed in as a mask parameter. 555 */ 556 static int 557 md_global_lock_exit(int global_locks_owned_mask, int code, 558 int flags, mdi_unit_t *ui) 559 { 560 mutex_enter(&md_mx); 561 562 /* If MT ioctl decrement mt_ioctl_cnt */ 563 if ((flags & MD_MT_IOCTL)) { 564 md_mtioctl_cnt--; 565 } else { 566 if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) { 567 /* clear the lock and decrement count */ 568 ASSERT(md_ioctl_cnt == 1); 569 md_ioctl_cnt--; 570 md_status &= ~MD_GBL_IOCTL_LOCK; 571 } 572 if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) 573 md_status &= ~MD_GBL_HS_LOCK; 574 } 575 if (flags & MD_READER_HELD) 576 md_unit_readerexit(ui); 577 if (flags & MD_WRITER_HELD) 578 md_unit_writerexit(ui); 579 if (flags & MD_IO_HELD) 580 md_io_writerexit(ui); 581 if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) { 582 rw_exit(&md_unit_array_rw.lock); 583 } 584 cv_broadcast(&md_cv); 585 mutex_exit(&md_mx); 586 587 return (code); 588 } 589 590 /* 591 * The two functions, md_ioctl_lock_enter, and md_ioctl_lock_exit make 592 * use of the md_global_lock_{enter|exit} functions to avoid duplication 593 * of code. 
They rely upon the fact that the locks that are specified in 594 * the input mask are not acquired or freed. If this algorithm changes 595 * as described in the block comment at the beginning of md_global_lock_enter 596 * then it will be necessary to change these 2 functions. Otherwise these 597 * functions will be grabbing and holding global locks unnecessarily. 598 */ 599 int 600 md_ioctl_lock_enter(void) 601 { 602 /* grab only the ioctl lock */ 603 return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK)); 604 } 605 606 /* 607 * If md_ioctl_lock_exit is being called at the end of an ioctl before 608 * returning to user space, then ioctl_end is set to 1. 609 * Otherwise, the ioctl lock is being dropped in the middle of handling 610 * an ioctl and will be reacquired before the end of the ioctl. 611 * Do not attempt to process the MN diskset mddb parse flags unless 612 * ioctl_end is true - otherwise a deadlock situation could arise. 613 */ 614 int 615 md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end) 616 { 617 int ret_val; 618 uint_t status; 619 mddb_set_t *s; 620 int i; 621 int err; 622 md_mn_msg_mddb_parse_t *mddb_parse_msg; 623 md_mn_kresult_t *kresult; 624 mddb_lb_t *lbp; 625 int rval = 1; 626 int flag; 627 628 /* release only the ioctl lock */ 629 ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui); 630 631 /* 632 * If md_ioctl_lock_exit is being called with a possible lock held 633 * (ioctl_end is 0), then don't check the MN disksets since the 634 * call to mddb_setenter may cause a lock ordering deadlock. 635 */ 636 if (!ioctl_end) 637 return (ret_val); 638 639 /* 640 * Walk through disksets to see if there is a MN diskset that 641 * has messages that need to be sent. Set must be snarfed and 642 * be a MN diskset in order to be checked. 643 * 644 * In a MN diskset, this routine may send messages to the 645 * rpc.mdcommd in order to have the slave nodes re-parse parts 646 * of the mddb. 
Messages can only be sent with no locks held, 647 * so if mddb change occurred while the ioctl lock is held, this 648 * routine must send the messages. 649 */ 650 for (i = 1; i < md_nsets; i++) { 651 status = md_get_setstatus(i); 652 653 /* Set must be snarfed and be a MN diskset */ 654 if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) != 655 (MD_SET_SNARFED | MD_SET_MNSET)) 656 continue; 657 658 /* Grab set lock so that set can't change */ 659 if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL) 660 continue; 661 662 lbp = s->s_lbp; 663 664 /* Re-get set status now that lock is held */ 665 status = md_get_setstatus(i); 666 667 /* 668 * If MN parsing block flag is set - continue to next set. 669 * 670 * If s_mn_parseflags_sending is non-zero, then another thread 671 * is already currently sending a parse message, so just 672 * release the set mutex. If this ioctl had caused an mddb 673 * change that results in a parse message to be generated, 674 * the thread that is currently sending a parse message would 675 * generate the additional parse message. 676 * 677 * If s_mn_parseflags_sending is zero then loop until 678 * s_mn_parseflags is 0 (until there are no more 679 * messages to send). 680 * While s_mn_parseflags is non-zero, 681 * put snapshot of parse_flags in s_mn_parseflags_sending 682 * set s_mn_parseflags to zero 683 * release set mutex 684 * send message 685 * re-grab set mutex 686 * set s_mn_parseflags_sending to zero 687 * 688 * If set is STALE, send message with NO_LOG flag so that 689 * rpc.mdcommd won't attempt to log message to non-writeable 690 * replica. 
691 */ 692 mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), 693 KM_SLEEP); 694 while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) && 695 (s->s_mn_parseflags & MDDB_PARSE_MASK) && 696 (!(status & MD_SET_MNPARSE_BLK))) { 697 698 /* Grab snapshot of parse flags */ 699 s->s_mn_parseflags_sending = s->s_mn_parseflags; 700 s->s_mn_parseflags = 0; 701 702 mutex_exit(&md_set[(s)->s_setno].s_dbmx); 703 704 /* 705 * Send the message to the slaves to re-parse 706 * the indicated portions of the mddb. Send the status 707 * of the 50 mddbs in this set so that slaves know 708 * which mddbs that the master node thinks are 'good'. 709 * Otherwise, slave may reparse, but from wrong 710 * replica. 711 */ 712 mddb_parse_msg->msg_parse_flags = 713 s->s_mn_parseflags_sending; 714 715 for (i = 0; i < MDDB_NLB; i++) { 716 mddb_parse_msg->msg_lb_flags[i] = 717 lbp->lb_locators[i].l_flags; 718 } 719 kresult = kmem_alloc(sizeof (md_mn_kresult_t), 720 KM_SLEEP); 721 while (rval != 0) { 722 flag = 0; 723 if (status & MD_SET_STALE) 724 flag |= MD_MSGF_NO_LOG; 725 rval = mdmn_ksend_message(s->s_setno, 726 MD_MN_MSG_MDDB_PARSE, flag, 0, 727 (char *)mddb_parse_msg, 728 sizeof (md_mn_msg_mddb_parse_t), kresult); 729 /* if the node hasn't yet joined, it's Ok. */ 730 if ((!MDMN_KSEND_MSG_OK(rval, kresult)) && 731 (kresult->kmmr_comm_state != 732 MDMNE_NOT_JOINED)) { 733 mdmn_ksend_show_error(rval, kresult, 734 "MD_MN_MSG_MDDB_PARSE"); 735 cmn_err(CE_WARN, "md_ioctl_lock_exit: " 736 "Unable to send mddb update " 737 "message to other nodes in " 738 "diskset %s\n", s->s_setname); 739 rval = 1; 740 } 741 } 742 kmem_free(kresult, sizeof (md_mn_kresult_t)); 743 744 /* 745 * Re-grab mutex to clear sending field and to 746 * see if another parse message needs to be generated. 
747 */ 748 mutex_enter(&md_set[(s)->s_setno].s_dbmx); 749 s->s_mn_parseflags_sending = 0; 750 } 751 kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t)); 752 mutex_exit(&md_set[(s)->s_setno].s_dbmx); 753 } 754 return (ret_val); 755 } 756 757 /* 758 * Called when in an ioctl and need readerlock. 759 */ 760 void * 761 md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui) 762 { 763 ASSERT(lock != NULL); 764 lock->l_ui = ui; 765 lock->l_flags |= MD_READER_HELD; 766 return (md_unit_readerlock_common(ui, 0)); 767 } 768 769 /* 770 * Called when in an ioctl and need writerlock. 771 */ 772 void * 773 md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui) 774 { 775 ASSERT(lock != NULL); 776 lock->l_ui = ui; 777 lock->l_flags |= MD_WRITER_HELD; 778 return (md_unit_writerlock_common(ui, 0)); 779 } 780 781 void * 782 md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui) 783 { 784 ASSERT(lock != NULL); 785 lock->l_ui = ui; 786 lock->l_flags |= MD_IO_HELD; 787 return (md_io_writerlock(ui)); 788 } 789 790 void 791 md_ioctl_readerexit(IOLOCK *lock) 792 { 793 ASSERT(lock != NULL); 794 lock->l_flags &= ~MD_READER_HELD; 795 md_unit_readerexit(lock->l_ui); 796 } 797 798 void 799 md_ioctl_writerexit(IOLOCK *lock) 800 { 801 ASSERT(lock != NULL); 802 lock->l_flags &= ~MD_WRITER_HELD; 803 md_unit_writerexit(lock->l_ui); 804 } 805 806 void 807 md_ioctl_io_exit(IOLOCK *lock) 808 { 809 ASSERT(lock != NULL); 810 lock->l_flags &= ~MD_IO_HELD; 811 md_io_writerexit(lock->l_ui); 812 } 813 814 /* 815 * md_ioctl_releaselocks: 816 * -------------------- 817 * Release the unit locks that are held and stop subsequent 818 * md_unit_reader/writerlock calls from progressing. This allows the caller 819 * to send messages across the cluster when running in a multinode 820 * environment. 821 * ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are 822 * allowed to progress as normal. 
This is required as these typically are 823 * invoked by the message handler that may be called while a unit lock is 824 * marked as released. 825 * 826 * On entry: 827 * variety of unit locks may be held including ioctl lock 828 * 829 * On exit: 830 * locks released and unit structure updated to prevent subsequent reader/ 831 * writer locks being acquired until md_ioctl_reacquirelocks is called 832 */ 833 void 834 md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui) 835 { 836 /* This actually releases the locks. */ 837 (void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui); 838 } 839 840 /* 841 * md_ioctl_reacquirelocks: 842 * ---------------------- 843 * Reacquire the locks that were held when md_ioctl_releaselocks 844 * was called. 845 * 846 * On entry: 847 * No unit locks held 848 * On exit: 849 * locks held that were held at md_ioctl_releaselocks time including 850 * the ioctl lock. 851 */ 852 void 853 md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui) 854 { 855 if (flags & MD_MT_IOCTL) { 856 mutex_enter(&md_mx); 857 md_mtioctl_cnt++; 858 mutex_exit(&md_mx); 859 } else { 860 while (md_ioctl_lock_enter() == EINTR) 861 ; 862 } 863 if (flags & MD_ARRAY_WRITER) { 864 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 865 } else if (flags & MD_ARRAY_READER) { 866 rw_enter(&md_unit_array_rw.lock, RW_READER); 867 } 868 if (ui != (mdi_unit_t *)NULL) { 869 if (flags & MD_IO_HELD) { 870 (void) md_io_writerlock(ui); 871 } 872 873 mutex_enter(&ui->ui_mx); 874 if (flags & MD_READER_HELD) { 875 (void) md_unit_readerlock_common(ui, 1); 876 } else if (flags & MD_WRITER_HELD) { 877 (void) md_unit_writerlock_common(ui, 1); 878 } 879 /* Wake up any blocked readerlock() calls */ 880 cv_broadcast(&ui->ui_cv); 881 mutex_exit(&ui->ui_mx); 882 } 883 } 884 885 void 886 md_ioctl_droplocks(IOLOCK *lock) 887 { 888 mdi_unit_t *ui; 889 int flags; 890 891 ASSERT(lock != NULL); 892 ui = lock->l_ui; 893 flags = lock->l_flags; 894 if (flags & MD_READER_HELD) { 895 lock->l_flags &= 
~MD_READER_HELD; 896 md_unit_readerexit(ui); 897 } 898 if (flags & MD_WRITER_HELD) { 899 lock->l_flags &= ~MD_WRITER_HELD; 900 md_unit_writerexit(ui); 901 } 902 if (flags & MD_IO_HELD) { 903 lock->l_flags &= ~MD_IO_HELD; 904 md_io_writerexit(ui); 905 } 906 if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) { 907 lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER); 908 rw_exit(&md_unit_array_rw.lock); 909 } 910 } 911 912 void 913 md_array_writer(IOLOCK *lock) 914 { 915 ASSERT(lock != NULL); 916 lock->l_flags |= MD_ARRAY_WRITER; 917 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 918 } 919 920 void 921 md_array_reader(IOLOCK *lock) 922 { 923 ASSERT(lock != NULL); 924 lock->l_flags |= MD_ARRAY_READER; 925 rw_enter(&md_unit_array_rw.lock, RW_READER); 926 } 927 928 /* 929 * Called when in an ioctl and need opencloselock. 930 * Sets flags in lockp for READER_HELD. 931 */ 932 void * 933 md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui) 934 { 935 void *un; 936 937 ASSERT(lockp != NULL); 938 mutex_enter(&ui->ui_mx); 939 while (ui->ui_lock & MD_UL_OPENORCLOSE) 940 cv_wait(&ui->ui_cv, &ui->ui_mx); 941 ui->ui_lock |= MD_UL_OPENORCLOSE; 942 943 /* Maintain mutex across the readerlock call */ 944 lockp->l_ui = ui; 945 lockp->l_flags |= MD_READER_HELD; 946 un = md_unit_readerlock_common(ui, 1); 947 mutex_exit(&ui->ui_mx); 948 949 return (un); 950 } 951 952 /* 953 * Clears reader lock using md_ioctl instead of md_unit 954 * and updates lockp. 955 */ 956 void 957 md_ioctl_openclose_exit(IOLOCK *lockp) 958 { 959 mdi_unit_t *ui; 960 961 ASSERT(lockp != NULL); 962 ui = lockp->l_ui; 963 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); 964 965 md_ioctl_readerexit(lockp); 966 967 mutex_enter(&ui->ui_mx); 968 ui->ui_lock &= ~MD_UL_OPENORCLOSE; 969 970 cv_broadcast(&ui->ui_cv); 971 mutex_exit(&ui->ui_mx); 972 } 973 974 /* 975 * Clears reader lock using md_ioctl instead of md_unit 976 * and updates lockp. 
977 * Does not acquire or release the ui_mx lock since the calling 978 * routine has already acquired this lock. 979 */ 980 void 981 md_ioctl_openclose_exit_lh(IOLOCK *lockp) 982 { 983 mdi_unit_t *ui; 984 985 ASSERT(lockp != NULL); 986 ui = lockp->l_ui; 987 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); 988 989 lockp->l_flags &= ~MD_READER_HELD; 990 md_unit_readerexit_common(lockp->l_ui, 1); 991 992 ui->ui_lock &= ~MD_UL_OPENORCLOSE; 993 cv_broadcast(&ui->ui_cv); 994 } 995 996 void * 997 md_unit_openclose_enter(mdi_unit_t *ui) 998 { 999 void *un; 1000 1001 mutex_enter(&ui->ui_mx); 1002 while (ui->ui_lock & (MD_UL_OPENORCLOSE)) 1003 cv_wait(&ui->ui_cv, &ui->ui_mx); 1004 ui->ui_lock |= MD_UL_OPENORCLOSE; 1005 1006 /* Maintain mutex across the readerlock call */ 1007 un = md_unit_readerlock_common(ui, 1); 1008 mutex_exit(&ui->ui_mx); 1009 1010 return (un); 1011 } 1012 1013 void 1014 md_unit_openclose_exit(mdi_unit_t *ui) 1015 { 1016 md_unit_readerexit(ui); 1017 1018 mutex_enter(&ui->ui_mx); 1019 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); 1020 ui->ui_lock &= ~MD_UL_OPENORCLOSE; 1021 1022 cv_broadcast(&ui->ui_cv); 1023 mutex_exit(&ui->ui_mx); 1024 } 1025 1026 /* 1027 * Drop the openclose and readerlocks without acquiring or 1028 * releasing the ui_mx lock since the calling routine has 1029 * already acquired this lock. 1030 */ 1031 void 1032 md_unit_openclose_exit_lh(mdi_unit_t *ui) 1033 { 1034 md_unit_readerexit_common(ui, 1); 1035 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); 1036 ui->ui_lock &= ~MD_UL_OPENORCLOSE; 1037 cv_broadcast(&ui->ui_cv); 1038 } 1039 1040 int 1041 md_unit_isopen( 1042 mdi_unit_t *ui 1043 ) 1044 { 1045 int isopen; 1046 1047 /* check status */ 1048 mutex_enter(&ui->ui_mx); 1049 isopen = ((ui->ui_lock & MD_UL_OPEN) ? 
1 : 0); 1050 mutex_exit(&ui->ui_mx); 1051 return (isopen); 1052 } 1053 1054 int 1055 md_unit_incopen( 1056 minor_t mnum, 1057 int flag, 1058 int otyp 1059 ) 1060 { 1061 mdi_unit_t *ui = MDI_UNIT(mnum); 1062 int err = 0; 1063 1064 /* check type and flags */ 1065 ASSERT(ui != NULL); 1066 mutex_enter(&ui->ui_mx); 1067 if ((otyp < 0) || (otyp >= OTYPCNT)) { 1068 err = EINVAL; 1069 goto out; 1070 } 1071 if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) || 1072 (ui->ui_lock & MD_UL_EXCL)) { 1073 err = EBUSY; 1074 goto out; 1075 } 1076 1077 /* count and flag open */ 1078 ui->ui_ocnt[otyp]++; 1079 ui->ui_lock |= MD_UL_OPEN; 1080 if (flag & FEXCL) 1081 ui->ui_lock |= MD_UL_EXCL; 1082 1083 /* setup kstat, return success */ 1084 mutex_exit(&ui->ui_mx); 1085 md_kstat_init(mnum); 1086 return (0); 1087 1088 /* return error */ 1089 out: 1090 mutex_exit(&ui->ui_mx); 1091 return (err); 1092 } 1093 1094 int 1095 md_unit_decopen( 1096 minor_t mnum, 1097 int otyp 1098 ) 1099 { 1100 mdi_unit_t *ui = MDI_UNIT(mnum); 1101 int err = 0; 1102 unsigned i; 1103 1104 /* check type and flags */ 1105 ASSERT(ui != NULL); 1106 mutex_enter(&ui->ui_mx); 1107 if ((otyp < 0) || (otyp >= OTYPCNT)) { 1108 err = EINVAL; 1109 goto out; 1110 } else if (ui->ui_ocnt[otyp] == 0) { 1111 err = ENXIO; 1112 goto out; 1113 } 1114 1115 /* count and flag closed */ 1116 if (otyp == OTYP_LYR) 1117 ui->ui_ocnt[otyp]--; 1118 else 1119 ui->ui_ocnt[otyp] = 0; 1120 ui->ui_lock &= ~MD_UL_OPEN; 1121 for (i = 0; (i < OTYPCNT); ++i) 1122 if (ui->ui_ocnt[i] != 0) 1123 ui->ui_lock |= MD_UL_OPEN; 1124 if (! (ui->ui_lock & MD_UL_OPEN)) 1125 ui->ui_lock &= ~MD_UL_EXCL; 1126 1127 /* teardown kstat, return success */ 1128 if (! (ui->ui_lock & MD_UL_OPEN)) { 1129 1130 /* 1131 * We have a race condition inherited from specfs between 1132 * open() and close() calls. This results in the kstat 1133 * for a pending I/O being torn down, and then a panic. 
1134 * To avoid this, only tear the kstat down if there are 1135 * no other readers on this device. 1136 */ 1137 if (ui->ui_readercnt > 1) { 1138 mutex_exit(&ui->ui_mx); 1139 } else { 1140 mutex_exit(&ui->ui_mx); 1141 md_kstat_destroy(mnum); 1142 } 1143 return (0); 1144 } 1145 1146 /* return success */ 1147 out: 1148 mutex_exit(&ui->ui_mx); 1149 return (err); 1150 } 1151 1152 md_dev64_t 1153 md_xlate_targ_2_mini(md_dev64_t targ_devt) 1154 { 1155 dev32_t mini_32_devt, targ_32_devt; 1156 int i; 1157 1158 /* 1159 * check to see if we're in an upgrade situation 1160 * if we are not in upgrade just return the input device 1161 */ 1162 1163 if (!MD_UPGRADE) 1164 return (targ_devt); 1165 1166 targ_32_devt = md_cmpldev(targ_devt); 1167 1168 i = 0; 1169 while (i != md_tuple_length) { 1170 if (md_tuple_table[i].targ_devt == targ_32_devt) { 1171 mini_32_devt = md_tuple_table[i].mini_devt; 1172 return (md_expldev((md_dev64_t)mini_32_devt)); 1173 } 1174 i++; 1175 } 1176 return (NODEV64); 1177 } 1178 1179 md_dev64_t 1180 md_xlate_mini_2_targ(md_dev64_t mini_devt) 1181 { 1182 dev32_t mini_32_devt, targ_32_devt; 1183 int i; 1184 1185 if (!MD_UPGRADE) 1186 return (mini_devt); 1187 1188 mini_32_devt = md_cmpldev(mini_devt); 1189 1190 i = 0; 1191 while (i != md_tuple_length) { 1192 if (md_tuple_table[i].mini_devt == mini_32_devt) { 1193 targ_32_devt = md_tuple_table[i].targ_devt; 1194 return (md_expldev((md_dev64_t)targ_32_devt)); 1195 } 1196 i++; 1197 } 1198 return (NODEV64); 1199 } 1200 1201 void 1202 md_xlate_free(int size) 1203 { 1204 kmem_free(md_tuple_table, size); 1205 } 1206 1207 char * 1208 md_targ_major_to_name(major_t maj) 1209 { 1210 char *drv_name = NULL; 1211 int i; 1212 1213 if (!MD_UPGRADE) 1214 return (ddi_major_to_name(maj)); 1215 1216 for (i = 0; i < md_majortab_len; i++) { 1217 if (md_major_tuple_table[i].targ_maj == maj) { 1218 drv_name = md_major_tuple_table[i].drv_name; 1219 break; 1220 } 1221 } 1222 return (drv_name); 1223 } 1224 1225 major_t 1226 
md_targ_name_to_major(char *drv_name) 1227 { 1228 major_t maj; 1229 int i; 1230 1231 maj = md_getmajor(NODEV64); 1232 if (!MD_UPGRADE) 1233 return (ddi_name_to_major(drv_name)); 1234 1235 for (i = 0; i < md_majortab_len; i++) { 1236 if ((strcmp(md_major_tuple_table[i].drv_name, 1237 drv_name)) == 0) { 1238 maj = md_major_tuple_table[i].targ_maj; 1239 break; 1240 } 1241 } 1242 1243 return (maj); 1244 } 1245 1246 void 1247 md_majortab_free() 1248 { 1249 size_t sz; 1250 int i; 1251 1252 for (i = 0; i < md_majortab_len; i++) { 1253 freestr(md_major_tuple_table[i].drv_name); 1254 } 1255 1256 sz = md_majortab_len * sizeof (struct md_xlate_major_table); 1257 kmem_free(md_major_tuple_table, sz); 1258 } 1259 1260 /* functions return a pointer to a function which returns an int */ 1261 1262 intptr_t (* 1263 md_get_named_service(md_dev64_t dev, int modindex, char *name, 1264 intptr_t (*Default)()))() 1265 { 1266 mdi_unit_t *ui; 1267 md_named_services_t *sp; 1268 int i; 1269 1270 /* 1271 * Return the first named service found. 1272 * Use this path when it is known that there is only 1273 * one named service possible (e.g., hotspare interface) 1274 */ 1275 if ((dev == NODEV64) && (modindex == ANY_SERVICE)) { 1276 for (i = 0; i < MD_NOPS; i++) { 1277 if (md_ops[i] == NULL) { 1278 continue; 1279 } 1280 sp = md_ops[i]->md_services; 1281 if (sp == NULL) 1282 continue; 1283 while (sp->md_service != NULL) { 1284 if (strcmp(name, sp->md_name) == 0) 1285 return (sp->md_service); 1286 sp++; 1287 } 1288 } 1289 return (Default); 1290 } 1291 1292 /* 1293 * Return the named service for the given modindex. 
1294 * This is used if there are multiple possible named services 1295 * and each one needs to be called (e.g., poke hotspares) 1296 */ 1297 if (dev == NODEV64) { 1298 if (modindex >= MD_NOPS) 1299 return (Default); 1300 1301 if (md_ops[modindex] == NULL) 1302 return (Default); 1303 1304 sp = md_ops[modindex]->md_services; 1305 if (sp == NULL) 1306 return (Default); 1307 1308 while (sp->md_service != NULL) { 1309 if (strcmp(name, sp->md_name) == 0) 1310 return (sp->md_service); 1311 sp++; 1312 } 1313 return (Default); 1314 } 1315 1316 /* 1317 * Return the named service for this md_dev64_t 1318 */ 1319 if (md_getmajor(dev) != md_major) 1320 return (Default); 1321 1322 if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) || 1323 (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits)) 1324 return (NULL); 1325 1326 1327 if ((ui = MDI_UNIT(md_getminor(dev))) == NULL) 1328 return (NULL); 1329 1330 sp = md_ops[ui->ui_opsindex]->md_services; 1331 if (sp == NULL) 1332 return (Default); 1333 while (sp->md_service != NULL) { 1334 if (strcmp(name, sp->md_name) == 0) 1335 return (sp->md_service); 1336 sp++; 1337 } 1338 return (Default); 1339 } 1340 1341 /* 1342 * md_daemon callback routine 1343 */ 1344 boolean_t 1345 callb_md_cpr(void *arg, int code) 1346 { 1347 callb_cpr_t *cp = (callb_cpr_t *)arg; 1348 int ret = 0; /* assume success */ 1349 clock_t delta; 1350 1351 mutex_enter(cp->cc_lockp); 1352 1353 switch (code) { 1354 case CB_CODE_CPR_CHKPT: 1355 /* 1356 * Check for active resync threads 1357 */ 1358 mutex_enter(&md_cpr_resync.md_resync_mutex); 1359 if ((md_cpr_resync.md_mirror_resync > 0) || 1360 (md_cpr_resync.md_raid_resync > 0)) { 1361 mutex_exit(&md_cpr_resync.md_resync_mutex); 1362 cmn_err(CE_WARN, "There are Solaris Volume Manager " 1363 "synchronization threads running."); 1364 cmn_err(CE_WARN, "Please try system suspension at " 1365 "a later time."); 1366 ret = -1; 1367 break; 1368 } 1369 mutex_exit(&md_cpr_resync.md_resync_mutex); 1370 1371 cp->cc_events |= CALLB_CPR_START; 
1372 delta = CPR_KTHREAD_TIMEOUT_SEC * hz; 1373 while (!(cp->cc_events & CALLB_CPR_SAFE)) 1374 /* cv_reltimedwait() returns -1 if it times out. */ 1375 if ((ret = cv_reltimedwait(&cp->cc_callb_cv, 1376 cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1) 1377 break; 1378 break; 1379 1380 case CB_CODE_CPR_RESUME: 1381 cp->cc_events &= ~CALLB_CPR_START; 1382 cv_signal(&cp->cc_stop_cv); 1383 break; 1384 } 1385 mutex_exit(cp->cc_lockp); 1386 return (ret != -1); 1387 } 1388 1389 void 1390 md_daemon(int pass_thru, mdq_anchor_t *anchor) 1391 { 1392 daemon_queue_t *dq; 1393 callb_cpr_t cprinfo; 1394 1395 if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE)) 1396 return; 1397 /* 1398 * Register cpr callback 1399 */ 1400 CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon"); 1401 1402 /*CONSTCOND*/ 1403 while (1) { 1404 mutex_enter(&anchor->a_mx); 1405 while ((dq = anchor->dq.dq_next) == &(anchor->dq)) { 1406 if (pass_thru) { 1407 /* 1408 * CALLB_CPR_EXIT Will do 1409 * mutex_exit(&anchor->a_mx) 1410 */ 1411 CALLB_CPR_EXIT(&cprinfo); 1412 return; 1413 } 1414 if (md_get_status() & MD_GBL_DAEMONS_DIE) { 1415 mutex_exit(&anchor->a_mx); 1416 mutex_enter(&md_mx); 1417 md_num_daemons--; 1418 mutex_exit(&md_mx); 1419 /* 1420 * CALLB_CPR_EXIT will do 1421 * mutex_exit(&anchor->a_mx) 1422 */ 1423 mutex_enter(&anchor->a_mx); 1424 CALLB_CPR_EXIT(&cprinfo); 1425 thread_exit(); 1426 } 1427 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1428 cv_wait(&anchor->a_cv, &anchor->a_mx); 1429 CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx); 1430 } 1431 dq->dq_prev->dq_next = dq->dq_next; 1432 dq->dq_next->dq_prev = dq->dq_prev; 1433 dq->dq_prev = dq->dq_next = NULL; 1434 anchor->dq.qlen--; 1435 mutex_exit(&anchor->a_mx); 1436 (*(dq->dq_call))(dq); 1437 } 1438 /*NOTREACHED*/ 1439 } 1440 1441 /* 1442 * daemon_request: 1443 * 1444 * Adds requests to appropriate requestq which is 1445 * anchored by *anchor. 1446 * The request is the first element of a doubly linked circular list. 
1447 * When the request is a single element, the forward and backward 1448 * pointers MUST point to the element itself. 1449 */ 1450 1451 void 1452 daemon_request(mdq_anchor_t *anchor, void (*func)(), 1453 daemon_queue_t *request, callstyle_t style) 1454 { 1455 daemon_queue_t *rqtp; 1456 int i = 0; 1457 1458 rqtp = request; 1459 if (style == REQ_OLD) { 1460 ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL)); 1461 /* set it to the new style */ 1462 rqtp->dq_prev = rqtp->dq_next = rqtp; 1463 } 1464 ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL)); 1465 1466 /* scan the list and add the function to each element */ 1467 1468 do { 1469 rqtp->dq_call = func; 1470 i++; 1471 rqtp = rqtp->dq_next; 1472 } while (rqtp != request); 1473 1474 /* save pointer to tail of the request list */ 1475 rqtp = request->dq_prev; 1476 1477 mutex_enter(&anchor->a_mx); 1478 /* stats */ 1479 anchor->dq.qlen += i; 1480 anchor->dq.treqs += i; 1481 anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ? 1482 anchor->dq.qlen : anchor->dq.maxq_len; 1483 1484 /* now add the list to request queue */ 1485 request->dq_prev = anchor->dq.dq_prev; 1486 rqtp->dq_next = &anchor->dq; 1487 anchor->dq.dq_prev->dq_next = request; 1488 anchor->dq.dq_prev = rqtp; 1489 cv_broadcast(&anchor->a_cv); 1490 mutex_exit(&anchor->a_mx); 1491 } 1492 1493 void 1494 mddb_commitrec_wrapper(mddb_recid_t recid) 1495 { 1496 int sent_log = 0; 1497 uint_t retry = md_retry_cnt; 1498 set_t setno; 1499 1500 while (mddb_commitrec(recid)) { 1501 if (! sent_log) { 1502 cmn_err(CE_WARN, 1503 "md: state database commit failed"); 1504 sent_log = 1; 1505 } 1506 delay(md_hz); 1507 1508 /* 1509 * Setting retry cnt to one (pre decremented) so that we 1510 * actually do no retries when committing/deleting a mddb rec. 1511 * The underlying disk driver does several retries to check 1512 * if the disk is really dead or not so there 1513 * is no reason for us to retry on top of the drivers retries. 
1514 */ 1515 1516 if (--retry == 0) { 1517 setno = mddb_getsetnum(recid); 1518 if (md_get_setstatus(setno) & MD_SET_TOOFEW) { 1519 panic( 1520 "md: Panic due to lack of DiskSuite state\n" 1521 " database replicas. Fewer than 50%% of " 1522 "the total were available,\n so panic to " 1523 "ensure data integrity."); 1524 } else { 1525 panic("md: state database problem"); 1526 } 1527 /*NOTREACHED*/ 1528 } 1529 } 1530 } 1531 1532 void 1533 mddb_commitrecs_wrapper(mddb_recid_t *recids) 1534 { 1535 int sent_log = 0; 1536 uint_t retry = md_retry_cnt; 1537 set_t setno; 1538 1539 while (mddb_commitrecs(recids)) { 1540 if (! sent_log) { 1541 cmn_err(CE_WARN, 1542 "md: state database commit failed"); 1543 sent_log = 1; 1544 } 1545 delay(md_hz); 1546 1547 /* 1548 * Setting retry cnt to one (pre decremented) so that we 1549 * actually do no retries when committing/deleting a mddb rec. 1550 * The underlying disk driver does several retries to check 1551 * if the disk is really dead or not so there 1552 * is no reason for us to retry on top of the drivers retries. 1553 */ 1554 1555 if (--retry == 0) { 1556 /* 1557 * since all the records are part of the same set 1558 * use the first one to get setno 1559 */ 1560 setno = mddb_getsetnum(*recids); 1561 if (md_get_setstatus(setno) & MD_SET_TOOFEW) { 1562 panic( 1563 "md: Panic due to lack of DiskSuite state\n" 1564 " database replicas. Fewer than 50%% of " 1565 "the total were available,\n so panic to " 1566 "ensure data integrity."); 1567 } else { 1568 panic("md: state database problem"); 1569 } 1570 /*NOTREACHED*/ 1571 } 1572 } 1573 } 1574 1575 void 1576 mddb_deleterec_wrapper(mddb_recid_t recid) 1577 { 1578 int sent_log = 0; 1579 uint_t retry = md_retry_cnt; 1580 set_t setno; 1581 1582 while (mddb_deleterec(recid)) { 1583 if (! 
sent_log) { 1584 cmn_err(CE_WARN, 1585 "md: state database delete failed"); 1586 sent_log = 1; 1587 } 1588 delay(md_hz); 1589 1590 /* 1591 * Setting retry cnt to one (pre decremented) so that we 1592 * actually do no retries when committing/deleting a mddb rec. 1593 * The underlying disk driver does several retries to check 1594 * if the disk is really dead or not so there 1595 * is no reason for us to retry on top of the drivers retries. 1596 */ 1597 1598 if (--retry == 0) { 1599 setno = mddb_getsetnum(recid); 1600 if (md_get_setstatus(setno) & MD_SET_TOOFEW) { 1601 panic( 1602 "md: Panic due to lack of DiskSuite state\n" 1603 " database replicas. Fewer than 50%% of " 1604 "the total were available,\n so panic to " 1605 "ensure data integrity."); 1606 } else { 1607 panic("md: state database problem"); 1608 } 1609 /*NOTREACHED*/ 1610 } 1611 } 1612 } 1613 1614 /* 1615 * md_holdset_enter is called in order to hold the set in its 1616 * current state (loaded, unloaded, snarfed, unsnarfed, etc) 1617 * until md_holdset_exit is called. This is used by the mirror 1618 * code to mark the set as HOLD so that the set won't be 1619 * unloaded while hotspares are being allocated in check_4_hotspares. 1620 * The original fix to the mirror code to hold the set was to call 1621 * md_haltsnarf_enter, but this will block all ioctls and ioctls 1622 * must work for a MN diskset while hotspares are allocated. 1623 */ 1624 void 1625 md_holdset_enter(set_t setno) 1626 { 1627 mutex_enter(&md_mx); 1628 while (md_set[setno].s_status & MD_SET_HOLD) 1629 cv_wait(&md_cv, &md_mx); 1630 md_set[setno].s_status |= MD_SET_HOLD; 1631 mutex_exit(&md_mx); 1632 } 1633 1634 void 1635 md_holdset_exit(set_t setno) 1636 { 1637 mutex_enter(&md_mx); 1638 md_set[setno].s_status &= ~MD_SET_HOLD; 1639 cv_broadcast(&md_cv); 1640 mutex_exit(&md_mx); 1641 } 1642 1643 /* 1644 * Returns a 0 if this thread marked the set as HOLD (success), 1645 * returns a -1 if set was already marked HOLD (failure). 
1646 * Used by the release_set code to see if set is marked HOLD. 1647 * HOLD is set by a daemon when hotspares are being allocated 1648 * to mirror units. 1649 */ 1650 int 1651 md_holdset_testandenter(set_t setno) 1652 { 1653 mutex_enter(&md_mx); 1654 if (md_set[setno].s_status & MD_SET_HOLD) { 1655 mutex_exit(&md_mx); 1656 return (-1); 1657 } 1658 md_set[setno].s_status |= MD_SET_HOLD; 1659 mutex_exit(&md_mx); 1660 return (0); 1661 } 1662 1663 void 1664 md_haltsnarf_enter(set_t setno) 1665 { 1666 mutex_enter(&md_mx); 1667 while (md_set[setno].s_status & MD_SET_SNARFING) 1668 cv_wait(&md_cv, &md_mx); 1669 1670 md_set[setno].s_status |= MD_SET_SNARFING; 1671 mutex_exit(&md_mx); 1672 } 1673 1674 void 1675 md_haltsnarf_exit(set_t setno) 1676 { 1677 mutex_enter(&md_mx); 1678 md_set[setno].s_status &= ~MD_SET_SNARFING; 1679 cv_broadcast(&md_cv); 1680 mutex_exit(&md_mx); 1681 } 1682 1683 void 1684 md_haltsnarf_wait(set_t setno) 1685 { 1686 mutex_enter(&md_mx); 1687 while (md_set[setno].s_status & MD_SET_SNARFING) 1688 cv_wait(&md_cv, &md_mx); 1689 mutex_exit(&md_mx); 1690 } 1691 1692 /* 1693 * ASSUMED that the md_unit_array_rw WRITER lock is held. 
1694 */ 1695 int 1696 md_halt_set(set_t setno, enum md_haltcmd cmd) 1697 { 1698 int i, err; 1699 1700 if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) { 1701 return (0); 1702 } 1703 1704 if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) { 1705 for (i = 0; i < MD_NOPS; i++) { 1706 if (md_ops[i] == NULL) 1707 continue; 1708 if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) { 1709 for (--i; i > 0; --i) { 1710 if (md_ops[i] == NULL) 1711 continue; 1712 (void) (*(md_ops[i]->md_halt)) 1713 (MD_HALT_OPEN, setno); 1714 } 1715 return (EBUSY); 1716 } 1717 } 1718 1719 for (i = 0; i < MD_NOPS; i++) { 1720 if (md_ops[i] == NULL) 1721 continue; 1722 if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) { 1723 for (i = 0; i < MD_NOPS; i++) { 1724 if (md_ops[i] == NULL) 1725 continue; 1726 (void) (*(md_ops[i]->md_halt)) 1727 (MD_HALT_OPEN, setno); 1728 } 1729 return (EBUSY); 1730 } 1731 } 1732 } 1733 1734 if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) { 1735 for (i = 0; i < MD_NOPS; i++) { 1736 if (md_ops[i] == NULL) 1737 continue; 1738 err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno); 1739 if (err != 0) 1740 cmn_err(CE_NOTE, 1741 "md: halt failed for %s, error %d", 1742 md_ops[i]->md_driver.md_drivername, err); 1743 } 1744 1745 /* 1746 * Unload the devid namespace if it is loaded 1747 */ 1748 md_unload_namespace(setno, NM_DEVID); 1749 md_unload_namespace(setno, 0L); 1750 md_clr_setstatus(setno, MD_SET_SNARFED); 1751 } 1752 1753 return (0); 1754 } 1755 1756 int 1757 md_halt(int global_locks_owned_mask) 1758 { 1759 set_t i, j; 1760 int err; 1761 int init_queues; 1762 md_requestq_entry_t *rqp; 1763 md_ops_t **pops, *ops, *lops; 1764 ddi_modhandle_t mod; 1765 char *name; 1766 1767 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 1768 1769 /* 1770 * Grab the all of the global locks that are not 1771 * already owned to ensure that there isn't another 1772 * thread trying to access a global resource 1773 * while the halt is in progress 1774 */ 1775 if 
(md_global_lock_enter(global_locks_owned_mask) == EINTR) 1776 return (EINTR); 1777 1778 for (i = 0; i < md_nsets; i++) 1779 md_haltsnarf_enter(i); 1780 1781 /* 1782 * Kill the daemon threads. 1783 */ 1784 init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE); 1785 md_clr_status(MD_GBL_DAEMONS_LIVE); 1786 md_set_status(MD_GBL_DAEMONS_DIE); 1787 1788 rqp = &md_daemon_queues[0]; 1789 i = 0; 1790 while (!NULL_REQUESTQ_ENTRY(rqp)) { 1791 cv_broadcast(&rqp->dispq_headp->a_cv); 1792 rqp = &md_daemon_queues[++i]; 1793 } 1794 1795 mutex_enter(&md_mx); 1796 while (md_num_daemons != 0) { 1797 mutex_exit(&md_mx); 1798 delay(md_hz); 1799 mutex_enter(&md_mx); 1800 } 1801 mutex_exit(&md_mx); 1802 md_clr_status(MD_GBL_DAEMONS_DIE); 1803 1804 for (i = 0; i < md_nsets; i++) 1805 /* 1806 * Only call into md_halt_set if s_un / s_ui are both set. 1807 * If they are NULL this set hasn't been accessed, so its 1808 * pointless performing the call. 1809 */ 1810 if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) { 1811 if (md_halt_set(i, MD_HALT_CHECK)) { 1812 if (md_start_daemons(init_queues)) 1813 cmn_err(CE_WARN, 1814 "md: restart of daemon threads " 1815 "failed"); 1816 for (j = 0; j < md_nsets; j++) 1817 md_haltsnarf_exit(j); 1818 1819 return (md_global_lock_exit( 1820 global_locks_owned_mask, EBUSY, 1821 MD_ARRAY_WRITER, NULL)); 1822 } 1823 } 1824 1825 /* 1826 * if we get here we are going to do it 1827 */ 1828 for (i = 0; i < md_nsets; i++) { 1829 /* 1830 * Only call into md_halt_set if s_un / s_ui are both set. 1831 * If they are NULL this set hasn't been accessed, so its 1832 * pointless performing the call. 1833 */ 1834 if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) { 1835 err = md_halt_set(i, MD_HALT_DOIT); 1836 if (err != 0) 1837 cmn_err(CE_NOTE, 1838 "md: halt failed set %u, error %d", 1839 (unsigned)i, err); 1840 } 1841 } 1842 1843 /* 1844 * issue a halt unload to each module to indicate that it 1845 * is about to be unloaded. 
Each module is called once, set 1846 * has no meaning at this point in time. 1847 */ 1848 for (i = 0; i < MD_NOPS; i++) { 1849 if (md_ops[i] == NULL) 1850 continue; 1851 err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0); 1852 if (err != 0) 1853 cmn_err(CE_NOTE, 1854 "md: halt failed for %s, error %d", 1855 md_ops[i]->md_driver.md_drivername, err); 1856 } 1857 1858 /* ddi_modclose the submodules */ 1859 for (i = 0; i < MD_NOPS; i++) { 1860 /* skip if not open */ 1861 if ((md_ops[i] == NULL) || (md_mods[i] == NULL)) 1862 continue; 1863 1864 /* find and unlink from md_opslist */ 1865 ops = md_ops[i]; 1866 mod = md_mods[i]; 1867 pops = &md_opslist; 1868 for (lops = *pops; lops; 1869 pops = &lops->md_next, lops = *pops) { 1870 if (lops == ops) { 1871 *pops = ops->md_next; 1872 ops->md_next = NULL; 1873 break; 1874 } 1875 } 1876 1877 /* uninitialize */ 1878 name = ops->md_driver.md_drivername; 1879 md_ops[i] = NULL; 1880 md_mods[i] = NULL; 1881 ops->md_selfindex = 0; 1882 ops->md_driver.md_drivername[0] = '\0'; 1883 rw_destroy(&ops->md_link_rw.lock); 1884 1885 /* close */ 1886 err = ddi_modclose(mod); 1887 if (err != 0) 1888 cmn_err(CE_NOTE, 1889 "md: halt close failed for %s, error %d", 1890 name ? name : "UNKNOWN", err); 1891 } 1892 1893 /* Unload the database */ 1894 mddb_unload(); 1895 1896 md_set_status(MD_GBL_HALTED); /* we are ready to be unloaded */ 1897 1898 for (i = 0; i < md_nsets; i++) 1899 md_haltsnarf_exit(i); 1900 1901 return (md_global_lock_exit(global_locks_owned_mask, 0, 1902 MD_ARRAY_WRITER, NULL)); 1903 } 1904 1905 /* 1906 * md_layered_open() is an internal routine only for SVM modules. 1907 * So the input device will be a md_dev64_t, because all SVM modules internally 1908 * work with that device type. 1909 * ddi routines on the other hand work with dev_t. So, if we call any ddi 1910 * routines from here we first have to convert that device into a dev_t. 
1911 */ 1912 1913 int 1914 md_layered_open( 1915 minor_t mnum, 1916 md_dev64_t *dev, 1917 int md_oflags 1918 ) 1919 { 1920 int flag = (FREAD | FWRITE); 1921 cred_t *cred_p = kcred; 1922 major_t major; 1923 int err; 1924 dev_t ddi_dev = md_dev64_to_dev(*dev); 1925 1926 if (ddi_dev == NODEV) 1927 return (ENODEV); 1928 1929 major = getmajor(ddi_dev); 1930 1931 /* metadevice */ 1932 if (major == md_major) { 1933 mdi_unit_t *ui; 1934 1935 /* open underlying driver */ 1936 mnum = getminor(ddi_dev); 1937 1938 ui = MDI_UNIT(mnum); 1939 if (md_ops[ui->ui_opsindex]->md_open != NULL) { 1940 int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev, 1941 flag, OTYP_LYR, cred_p, md_oflags); 1942 /* 1943 * As open() may change the device, 1944 * send this info back to the caller. 1945 */ 1946 *dev = md_expldev(ddi_dev); 1947 return (ret); 1948 } 1949 1950 /* or do it ourselves */ 1951 (void) md_unit_openclose_enter(ui); 1952 err = md_unit_incopen(mnum, flag, OTYP_LYR); 1953 md_unit_openclose_exit(ui); 1954 /* convert our ddi_dev back to the dev we were given */ 1955 *dev = md_expldev(ddi_dev); 1956 return (err); 1957 } 1958 1959 /* 1960 * Open regular device, since open() may change dev_t give new dev_t 1961 * back to the caller. 1962 */ 1963 err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p); 1964 *dev = md_expldev(ddi_dev); 1965 return (err); 1966 } 1967 1968 /* 1969 * md_layered_close() is an internal routine only for SVM modules. 1970 * So the input device will be a md_dev64_t, because all SVM modules internally 1971 * work with that device type. 1972 * ddi routines on the other hand work with dev_t. So, if we call any ddi 1973 * routines from here we first have to convert that device into a dev_t. 
1974 */ 1975 void 1976 md_layered_close( 1977 md_dev64_t dev, 1978 int md_cflags 1979 ) 1980 { 1981 int flag = (FREAD | FWRITE); 1982 cred_t *cred_p = kcred; 1983 dev_t ddi_dev = md_dev64_to_dev(dev); 1984 major_t major = getmajor(ddi_dev); 1985 minor_t mnum = getminor(ddi_dev); 1986 1987 /* metadevice */ 1988 if (major == md_major) { 1989 mdi_unit_t *ui = MDI_UNIT(mnum); 1990 1991 /* close underlying driver */ 1992 if (md_ops[ui->ui_opsindex]->md_close != NULL) { 1993 (*md_ops[ui->ui_opsindex]->md_close) 1994 (ddi_dev, flag, OTYP_LYR, cred_p, md_cflags); 1995 return; 1996 } 1997 1998 /* or do it ourselves */ 1999 (void) md_unit_openclose_enter(ui); 2000 (void) md_unit_decopen(mnum, OTYP_LYR); 2001 md_unit_openclose_exit(ui); 2002 return; 2003 } 2004 2005 /* close regular device */ 2006 (void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p); 2007 } 2008 2009 /* 2010 * saves a little code in mdstrategy 2011 */ 2012 int 2013 errdone(mdi_unit_t *ui, struct buf *bp, int err) 2014 { 2015 if ((bp->b_error = err) != 0) 2016 bp->b_flags |= B_ERROR; 2017 else 2018 bp->b_resid = bp->b_bcount; 2019 md_unit_readerexit(ui); 2020 md_biodone(bp); 2021 return (1); 2022 } 2023 2024 static int md_write_label = 0; 2025 2026 int 2027 md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp) 2028 { 2029 diskaddr_t endblk; 2030 set_t setno = MD_UN2SET(un); 2031 2032 if ((md_get_setstatus(setno) & MD_SET_STALE) && 2033 (! (bp->b_flags & B_READ))) 2034 return (errdone(ui, bp, EROFS)); 2035 /* 2036 * Check early for unreasonable block number. 2037 * 2038 * b_blkno is defined as adaddr_t which is typedef'd to a long. 2039 * A problem occurs if b_blkno has bit 31 set and un_total_blocks 2040 * doesn't, b_blkno is then compared as a negative number which is 2041 * always less than a positive. 
2042 */ 2043 if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks) 2044 return (errdone(ui, bp, EINVAL)); 2045 2046 if (bp->b_lblkno == un->c.un_total_blocks) 2047 return (errdone(ui, bp, 0)); 2048 2049 /* 2050 * make sure we don't clobber any labels 2051 */ 2052 if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) && 2053 (un->c.un_flag & MD_LABELED) && (! md_write_label)) { 2054 cmn_err(CE_NOTE, "md: %s: write to label", 2055 md_shortname(getminor(bp->b_edev))); 2056 return (errdone(ui, bp, EINVAL)); 2057 } 2058 2059 bp->b_resid = 0; 2060 endblk = (diskaddr_t)(bp->b_lblkno + 2061 howmany(bp->b_bcount, DEV_BSIZE) - 1); 2062 2063 if (endblk > (un->c.un_total_blocks - 1)) { 2064 bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1)); 2065 endblk = un->c.un_total_blocks - 1; 2066 bp->b_bcount -= bp->b_resid; 2067 } 2068 return (0); 2069 } 2070 2071 /* 2072 * init_request_queue: initializes the request queues and creates the threads. 2073 * return value = 0 :invalid num_threads 2074 * = n : n is the number of threads created. 
2075 */ 2076 2077 int 2078 init_requestq( 2079 md_requestq_entry_t *rq, /* request queue info */ 2080 void (*threadfn)(), /* function to start the thread */ 2081 caddr_t threadfn_args, /* args to the function */ 2082 int pri, /* thread priority */ 2083 int init_queue) /* flag to init queues */ 2084 { 2085 struct mdq_anchor *rqhead; 2086 int i; 2087 int num_threads; 2088 2089 2090 num_threads = *(rq->num_threadsp); 2091 rqhead = rq->dispq_headp; 2092 2093 if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0) 2094 return (0); 2095 2096 if (init_queue) { 2097 rqhead->dq.maxq_len = 0; 2098 rqhead->dq.treqs = 0; 2099 rqhead->dq.dq_next = &rqhead->dq; 2100 rqhead->dq.dq_prev = &rqhead->dq; 2101 cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL); 2102 mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL); 2103 } 2104 for (i = 0; i < num_threads; i++) { 2105 (void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0, 2106 TS_RUN, pri); 2107 } 2108 return (i); 2109 } 2110 2111 static void 2112 start_daemon(struct mdq_anchor *q) 2113 { 2114 md_daemon(0, q); 2115 ASSERT(0); 2116 } 2117 2118 /* 2119 * Creates all the md daemons. 2120 * Global: 2121 * md_num_daemons is set to number of daemons. 2122 * MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active. 
2123 * 2124 * Return value: 0 success 2125 * 1 failure 2126 */ 2127 int 2128 md_start_daemons(int init_queue) 2129 { 2130 md_requestq_entry_t *rqp; 2131 int cnt; 2132 int i; 2133 int retval = 0; 2134 2135 2136 if (md_get_status() & MD_GBL_DAEMONS_LIVE) { 2137 return (retval); 2138 } 2139 md_clr_status(MD_GBL_DAEMONS_DIE); 2140 2141 rqp = &md_daemon_queues[0]; 2142 i = 0; 2143 while (!NULL_REQUESTQ_ENTRY(rqp)) { 2144 cnt = init_requestq(rqp, start_daemon, 2145 (caddr_t)rqp->dispq_headp, minclsyspri, init_queue); 2146 2147 if (cnt && cnt != *rqp->num_threadsp) { 2148 retval = 1; 2149 break; 2150 } 2151 /* 2152 * initialize variables 2153 */ 2154 md_num_daemons += cnt; 2155 rqp = &md_daemon_queues[++i]; 2156 } 2157 2158 md_set_status(MD_GBL_DAEMONS_LIVE); 2159 return (retval); 2160 } 2161 2162 int 2163 md_loadsubmod(set_t setno, char *name, int drvrid) 2164 { 2165 ddi_modhandle_t mod; 2166 md_ops_t **pops, *ops; 2167 int i, err; 2168 2169 /* 2170 * See if the submodule is mdopened. If not, i is the index of the 2171 * next empty slot. 2172 */ 2173 for (i = 0; md_ops[i] != NULL; i++) { 2174 if (strncmp(name, md_ops[i]->md_driver.md_drivername, 2175 MD_DRIVERNAMELEN) == 0) 2176 return (i); 2177 2178 if (i == (MD_NOPS - 1)) 2179 return (-1); 2180 } 2181 2182 if (drvrid < 0) { 2183 /* Do not try to add any records to the DB when stale. 
*/ 2184 if (md_get_setstatus(setno) & MD_SET_STALE) 2185 return (-1); 2186 drvrid = md_setshared_name(setno, name, 0L); 2187 } 2188 2189 if (drvrid < 0) 2190 return (-1); 2191 2192 /* open and import the md_ops of the submodules */ 2193 mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err); 2194 if (mod == NULL) { 2195 cmn_err(CE_WARN, "md_loadsubmod: " 2196 "unable to ddi_modopen %s, error %d\n", name, err); 2197 return (-1); 2198 } 2199 pops = ddi_modsym(mod, "md_interface_ops", &err); 2200 if (pops == NULL) { 2201 cmn_err(CE_WARN, "md_loadsubmod: " 2202 "unable to import md_interface_ops from %s, error %d\n", 2203 name, err); 2204 (void) ddi_modclose(mod); 2205 return (-1); 2206 } 2207 2208 /* ddi_modsym returns pointer to md_interface_ops in submod */ 2209 ops = *pops; 2210 2211 /* initialize */ 2212 ops->md_selfindex = i; 2213 rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL); 2214 (void) strncpy(ops->md_driver.md_drivername, name, 2215 MD_DRIVERNAMELEN); 2216 2217 /* plumb */ 2218 md_ops[i] = ops; 2219 md_mods[i] = mod; 2220 ops->md_next = md_opslist; 2221 md_opslist = ops; 2222 2223 /* return index */ 2224 return (i); 2225 } 2226 2227 int 2228 md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired) 2229 { 2230 int i; 2231 int modindex; 2232 char *name = driver->md_drivername; 2233 set_t setno = driver->md_setno; 2234 int drvid; 2235 int local_dont_load; 2236 2237 if (setno >= md_nsets) 2238 return (-1); 2239 2240 for (i = 0; name[i] != 0; i++) 2241 if (i == (MD_DRIVERNAMELEN -1)) 2242 return (-1); 2243 2244 /* 2245 * If set is STALE, set local_dont_load to 1 since no records 2246 * should be added to DB when stale. 
2247 */ 2248 if (md_get_setstatus(setno) & MD_SET_STALE) { 2249 local_dont_load = 1; 2250 } else { 2251 local_dont_load = dont_load; 2252 } 2253 2254 /* 2255 * Single thread ioctl module binding with respect to 2256 * similar code executed in md_loadsubmod that is called 2257 * from md_snarf_db_set (which is where that path does 2258 * its md_haltsnarf_enter call). 2259 */ 2260 md_haltsnarf_enter(setno); 2261 2262 /* See if the submodule is already ddi_modopened. */ 2263 for (i = 0; md_ops[i] != NULL; i++) { 2264 if (strncmp(name, md_ops[i]->md_driver.md_drivername, 2265 MD_DRIVERNAMELEN) == 0) { 2266 if (! local_dont_load && 2267 (md_getshared_key(setno, name) == MD_KEYBAD)) { 2268 if (md_setshared_name(setno, name, 0L) 2269 == MD_KEYBAD) { 2270 if (!db_notrequired) 2271 goto err; 2272 } 2273 } 2274 md_haltsnarf_exit(setno); 2275 return (i); 2276 } 2277 2278 if (i == (MD_NOPS -1)) 2279 break; 2280 } 2281 2282 if (local_dont_load) 2283 goto err; 2284 2285 drvid = ((db_notrequired) ? 0 : (int)md_getshared_key(setno, name)); 2286 2287 /* ddi_modopen the submodule */ 2288 modindex = md_loadsubmod(setno, name, drvid); 2289 if (modindex < 0) 2290 goto err; 2291 2292 if (md_ops[modindex]->md_snarf != NULL) 2293 (*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno); 2294 2295 md_haltsnarf_exit(setno); 2296 return (modindex); 2297 2298 err: md_haltsnarf_exit(setno); 2299 return (-1); 2300 } 2301 2302 void 2303 md_call_strategy(buf_t *bp, int flags, void *private) 2304 { 2305 mdi_unit_t *ui; 2306 2307 if (mdv_strategy_tstpnt) 2308 if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0) 2309 return; 2310 if (getmajor(bp->b_edev) != md_major) { 2311 (void) bdev_strategy(bp); 2312 return; 2313 } 2314 2315 flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP; 2316 ui = MDI_UNIT(getminor(bp->b_edev)); 2317 ASSERT(ui != NULL); 2318 (*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private); 2319 } 2320 2321 /* 2322 * md_call_ioctl: 2323 * ------------- 2324 * Issue the specified ioctl 
to the device associated with the given md_dev64_t 2325 * 2326 * Arguments: 2327 * dev - underlying device [md_dev64_t] 2328 * cmd - ioctl to perform 2329 * data - arguments / result location 2330 * mode - read/write/layered ioctl 2331 * lockp - lock reference 2332 * 2333 * Returns: 2334 * 0 success 2335 * !=0 Failure (error code) 2336 */ 2337 int 2338 md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp) 2339 { 2340 dev_t device = md_dev64_to_dev(dev); 2341 int rval; 2342 mdi_unit_t *ui; 2343 2344 /* 2345 * See if device is a metadevice. If not call cdev_ioctl(), otherwise 2346 * call the ioctl entry-point in the metadevice. 2347 */ 2348 if (md_getmajor(dev) != md_major) { 2349 int rv; 2350 rval = cdev_ioctl(device, cmd, (intptr_t)data, mode, 2351 ddi_get_cred(), &rv); 2352 } else { 2353 ui = MDI_UNIT(md_getminor(dev)); 2354 ASSERT(ui != NULL); 2355 rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data, 2356 mode, lockp); 2357 } 2358 return (rval); 2359 } 2360 2361 void 2362 md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head) 2363 { 2364 md_link_t *next; 2365 md_link_t **pprev; 2366 2367 rw_enter(rw, RW_WRITER); 2368 2369 next = *head; 2370 pprev = head; 2371 while (next) { 2372 if ((next->ln_setno == setno) && (next->ln_id == id)) { 2373 *pprev = next->ln_next; 2374 rw_exit(rw); 2375 return; 2376 } 2377 pprev = &next->ln_next; 2378 next = next->ln_next; 2379 } 2380 2381 rw_exit(rw); 2382 } 2383 2384 int 2385 md_dev_exists(md_dev64_t dev) 2386 { 2387 2388 if (dev == NODEV64) 2389 return (0); 2390 2391 if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0) 2392 return (1); 2393 2394 if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) || 2395 (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits)) 2396 return (0); 2397 2398 if (MDI_UNIT(md_getminor(dev)) != NULL) 2399 return (1); 2400 2401 return (0); 2402 } 2403 2404 md_parent_t 2405 md_get_parent(md_dev64_t dev) 2406 { 2407 md_unit_t *un; 2408 mdi_unit_t *ui; 2409 md_parent_t 
parent; 2410 2411 if (md_getmajor(dev) != md_major) 2412 return (MD_NO_PARENT); 2413 2414 ui = MDI_UNIT(md_getminor(dev)); 2415 2416 un = (md_unit_t *)md_unit_readerlock(ui); 2417 parent = un->c.un_parent; 2418 md_unit_readerexit(ui); 2419 2420 return (parent); 2421 } 2422 2423 void 2424 md_set_parent(md_dev64_t dev, md_parent_t parent) 2425 { 2426 md_unit_t *un; 2427 mdi_unit_t *ui; 2428 2429 if (md_getmajor(dev) != md_major) 2430 return; 2431 2432 ui = MDI_UNIT(md_getminor(dev)); 2433 2434 un = (md_unit_t *)md_unit_readerlock(ui); 2435 un->c.un_parent = parent; 2436 md_unit_readerexit(ui); 2437 } 2438 2439 void 2440 md_reset_parent(md_dev64_t dev) 2441 { 2442 md_unit_t *un; 2443 mdi_unit_t *ui; 2444 2445 if (md_getmajor(dev) != md_major) 2446 return; 2447 2448 ui = MDI_UNIT(md_getminor(dev)); 2449 2450 un = (md_unit_t *)md_unit_readerlock(ui); 2451 un->c.un_parent = MD_NO_PARENT; 2452 md_unit_readerexit(ui); 2453 } 2454 2455 2456 static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL; 2457 2458 int 2459 md_hot_spare_ifc( 2460 hs_cmds_t cmd, 2461 mddb_recid_t id, 2462 u_longlong_t size, 2463 int labeled, 2464 mddb_recid_t *hs_id, 2465 mdkey_t *key, 2466 md_dev64_t *dev, 2467 diskaddr_t *sblock) 2468 { 2469 int err; 2470 2471 /* 2472 * RW lock on hot_spare_interface. We don't want it to change from 2473 * underneath us. If hot_spare_interface is NULL we're going to 2474 * need to set it. So we need to upgrade to a WRITER lock. If that 2475 * doesn't work, we drop the lock and reenter as WRITER. This leaves 2476 * a small hole during which hot_spare_interface could be modified 2477 * so we check it for NULL again. What a pain. Then if still null 2478 * load from md_get_named_service. 
2479 */ 2480 2481 rw_enter(&hsp_rwlp.lock, RW_READER); 2482 if (hot_spare_interface == NULL) { 2483 if (rw_tryupgrade(&hsp_rwlp.lock) == 0) { 2484 rw_exit(&hsp_rwlp.lock); 2485 rw_enter(&hsp_rwlp.lock, RW_WRITER); 2486 if (hot_spare_interface != NULL) { 2487 err = ((*hot_spare_interface) 2488 (cmd, id, size, labeled, hs_id, key, dev, 2489 sblock)); 2490 rw_exit(&hsp_rwlp.lock); 2491 return (err); 2492 } 2493 } 2494 hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE, 2495 "hot spare interface", 0); 2496 rw_downgrade(&hsp_rwlp.lock); 2497 } 2498 2499 if (hot_spare_interface == NULL) { 2500 cmn_err(CE_WARN, "md: no hotspare interface"); 2501 rw_exit(&hsp_rwlp.lock); 2502 return (0); 2503 } 2504 2505 err = ((*hot_spare_interface) 2506 (cmd, id, size, labeled, hs_id, key, dev, sblock)); 2507 rw_exit(&hsp_rwlp.lock); 2508 return (err); 2509 } 2510 2511 void 2512 md_clear_hot_spare_interface() 2513 { 2514 rw_enter(&hsp_rwlp.lock, RW_WRITER); 2515 hot_spare_interface = NULL; 2516 rw_exit(&hsp_rwlp.lock); 2517 } 2518 2519 2520 static intptr_t (*notify_interface)() = (intptr_t (*)())NULL; 2521 2522 int 2523 md_notify_interface( 2524 md_event_cmds_t cmd, 2525 md_tags_t tag, 2526 set_t set, 2527 md_dev64_t dev, 2528 md_event_type_t event 2529 ) 2530 { 2531 int err; 2532 2533 if (md_event_queue == NULL) 2534 return (0); 2535 rw_enter(&ni_rwlp.lock, RW_READER); 2536 if (notify_interface == NULL) { 2537 if (rw_tryupgrade(&ni_rwlp.lock) == 0) { 2538 rw_exit(&ni_rwlp.lock); 2539 rw_enter(&ni_rwlp.lock, RW_WRITER); 2540 if (notify_interface != NULL) { 2541 err = ((*notify_interface) 2542 (cmd, tag, set, dev, event)); 2543 rw_exit(&ni_rwlp.lock); 2544 return (err); 2545 } 2546 } 2547 notify_interface = md_get_named_service(NODEV64, ANY_SERVICE, 2548 "notify interface", 0); 2549 rw_downgrade(&ni_rwlp.lock); 2550 } 2551 if (notify_interface == NULL) { 2552 cmn_err(CE_WARN, "md: no notify interface"); 2553 rw_exit(&ni_rwlp.lock); 2554 return (0); 2555 } 2556 err = 
((*notify_interface)(cmd, tag, set, dev, event)); 2557 rw_exit(&ni_rwlp.lock); 2558 return (err); 2559 } 2560 2561 char * 2562 obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev) 2563 { 2564 char *setname; 2565 char name[MD_MAX_CTDLEN]; 2566 minor_t mnum = md_getminor(dev); 2567 major_t maj = md_getmajor(dev); 2568 int rtn = 0; 2569 2570 /* 2571 * Verify that the passed dev_t refers to a valid metadevice. 2572 * If it doesn't we can make no assumptions as to what the device 2573 * name is. Return NULL in these cases. 2574 */ 2575 if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) || 2576 (MD_MIN2SET(mnum) >= md_nsets)) { 2577 return (NULL); 2578 } 2579 2580 setname = NULL; 2581 name[0] = '\0'; 2582 switch (tag) { 2583 case SVM_TAG_HSP: 2584 if (setno == 0) { 2585 rtn = snprintf(name, sizeof (name), "hsp%u", 2586 (unsigned)MD_MIN2UNIT(mnum)); 2587 } else { 2588 setname = mddb_getsetname(setno); 2589 if (setname != NULL) { 2590 rtn = snprintf(name, sizeof (name), "%s/hsp%u", 2591 setname, (unsigned)MD_MIN2UNIT(mnum)); 2592 } 2593 } 2594 break; 2595 case SVM_TAG_DRIVE: 2596 (void) sprintf(name, "drive"); 2597 break; 2598 case SVM_TAG_HOST: 2599 (void) sprintf(name, "host"); 2600 break; 2601 case SVM_TAG_SET: 2602 rtn = snprintf(name, sizeof (name), "%s", 2603 mddb_getsetname(setno)); 2604 if ((name[0] == '\0') || (rtn >= sizeof (name))) { 2605 (void) sprintf(name, "diskset"); 2606 rtn = 0; 2607 } 2608 break; 2609 default: 2610 rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum)); 2611 break; 2612 } 2613 2614 /* Check if we got any rubbish for any of the snprintf's */ 2615 if ((name[0] == '\0') || (rtn >= sizeof (name))) { 2616 return (NULL); 2617 } 2618 2619 return (md_strdup(name)); 2620 } 2621 2622 /* Sysevent subclass and mdnotify event type pairs */ 2623 struct node { 2624 char *se_ev; 2625 md_event_type_t md_ev; 2626 }; 2627 2628 /* 2629 * Table must be sorted in case sensitive ascending order of 2630 * the sysevents values 2631 */ 2632 
static struct node ev_table[] = { 2633 { ESC_SVM_ADD, EQ_ADD }, 2634 { ESC_SVM_ATTACH, EQ_ATTACH }, 2635 { ESC_SVM_ATTACHING, EQ_ATTACHING }, 2636 { ESC_SVM_CHANGE, EQ_CHANGE }, 2637 { ESC_SVM_CREATE, EQ_CREATE }, 2638 { ESC_SVM_DELETE, EQ_DELETE }, 2639 { ESC_SVM_DETACH, EQ_DETACH }, 2640 { ESC_SVM_DETACHING, EQ_DETACHING }, 2641 { ESC_SVM_DRIVE_ADD, EQ_DRIVE_ADD }, 2642 { ESC_SVM_DRIVE_DELETE, EQ_DRIVE_DELETE }, 2643 { ESC_SVM_ENABLE, EQ_ENABLE }, 2644 { ESC_SVM_ERRED, EQ_ERRED }, 2645 { ESC_SVM_EXCHANGE, EQ_EXCHANGE }, 2646 { ESC_SVM_GROW, EQ_GROW }, 2647 { ESC_SVM_HS_CHANGED, EQ_HS_CHANGED }, 2648 { ESC_SVM_HS_FREED, EQ_HS_FREED }, 2649 { ESC_SVM_HOST_ADD, EQ_HOST_ADD }, 2650 { ESC_SVM_HOST_DELETE, EQ_HOST_DELETE }, 2651 { ESC_SVM_HOTSPARED, EQ_HOTSPARED }, 2652 { ESC_SVM_INIT_FAILED, EQ_INIT_FAILED }, 2653 { ESC_SVM_INIT_FATAL, EQ_INIT_FATAL }, 2654 { ESC_SVM_INIT_START, EQ_INIT_START }, 2655 { ESC_SVM_INIT_SUCCESS, EQ_INIT_SUCCESS }, 2656 { ESC_SVM_IOERR, EQ_IOERR }, 2657 { ESC_SVM_LASTERRED, EQ_LASTERRED }, 2658 { ESC_SVM_MEDIATOR_ADD, EQ_MEDIATOR_ADD }, 2659 { ESC_SVM_MEDIATOR_DELETE, EQ_MEDIATOR_DELETE }, 2660 { ESC_SVM_OFFLINE, EQ_OFFLINE }, 2661 { ESC_SVM_OK, EQ_OK }, 2662 { ESC_SVM_ONLINE, EQ_ONLINE }, 2663 { ESC_SVM_OPEN_FAIL, EQ_OPEN_FAIL }, 2664 { ESC_SVM_REGEN_DONE, EQ_REGEN_DONE }, 2665 { ESC_SVM_REGEN_FAILED, EQ_REGEN_FAILED }, 2666 { ESC_SVM_REGEN_START, EQ_REGEN_START }, 2667 { ESC_SVM_RELEASE, EQ_RELEASE }, 2668 { ESC_SVM_REMOVE, EQ_REMOVE }, 2669 { ESC_SVM_RENAME_DST, EQ_RENAME_DST }, 2670 { ESC_SVM_RENAME_SRC, EQ_RENAME_SRC }, 2671 { ESC_SVM_REPLACE, EQ_REPLACE }, 2672 { ESC_SVM_RESYNC_DONE, EQ_RESYNC_DONE }, 2673 { ESC_SVM_RESYNC_FAILED, EQ_RESYNC_FAILED }, 2674 { ESC_SVM_RESYNC_START, EQ_RESYNC_START }, 2675 { ESC_SVM_RESYNC_SUCCESS, EQ_RESYNC_SUCCESS }, 2676 { ESC_SVM_TAKEOVER, EQ_TAKEOVER } 2677 }; 2678 2679 static md_tags_t md_tags[] = { 2680 TAG_UNK, 2681 TAG_METADEVICE, 2682 TAG_UNK, 2683 TAG_UNK, 2684 TAG_UNK, 2685 TAG_UNK, 2686 
TAG_REPLICA, 2687 TAG_HSP, 2688 TAG_HS, 2689 TAG_SET, 2690 TAG_DRIVE, 2691 TAG_HOST, 2692 TAG_MEDIATOR 2693 }; 2694 2695 md_event_type_t 2696 ev_get(char *subclass) 2697 { 2698 int high, mid, low, p; 2699 2700 low = 0; 2701 high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1; 2702 while (low <= high) { 2703 mid = (high + low) / 2; 2704 p = strcmp(subclass, ev_table[mid].se_ev); 2705 if (p == 0) { 2706 return (ev_table[mid].md_ev); 2707 } else if (p < 0) { 2708 high = mid - 1; 2709 } else { 2710 low = mid + 1; 2711 } 2712 } 2713 2714 return (EQ_EMPTY); 2715 } 2716 2717 /* 2718 * Log mdnotify event 2719 */ 2720 void 2721 do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid) 2722 { 2723 md_event_type_t ev_type; 2724 md_tags_t md_tag; 2725 2726 /* Translate sysevent into mdnotify event */ 2727 ev_type = ev_get(se_subclass); 2728 2729 if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) { 2730 md_tag = TAG_UNK; 2731 } else { 2732 md_tag = md_tags[tag]; 2733 } 2734 2735 NOTIFY_MD(md_tag, setno, devid, ev_type); 2736 } 2737 2738 /* 2739 * Log SVM sys events 2740 */ 2741 void 2742 svm_gen_sysevent( 2743 char *se_class, 2744 char *se_subclass, 2745 uint32_t tag, 2746 set_t setno, 2747 md_dev64_t devid 2748 ) 2749 { 2750 nvlist_t *attr_list; 2751 sysevent_id_t eid; 2752 int err = DDI_SUCCESS; 2753 char *devname; 2754 extern dev_info_t *md_devinfo; 2755 2756 /* Raise the mdnotify event before anything else */ 2757 do_mdnotify(se_subclass, tag, setno, devid); 2758 2759 if (md_devinfo == NULL) { 2760 return; 2761 } 2762 2763 err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP); 2764 2765 if (err == DDI_SUCCESS) { 2766 /* Add the version numver */ 2767 err = nvlist_add_uint32(attr_list, SVM_VERSION_NO, 2768 (uint32_t)SVM_VERSION); 2769 if (err != DDI_SUCCESS) { 2770 goto fail; 2771 } 2772 2773 /* Add the tag attribute */ 2774 err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag); 2775 if (err != DDI_SUCCESS) { 2776 goto fail; 2777 } 2778 2779 
/* Add the set number attribute */ 2780 err = nvlist_add_uint32(attr_list, SVM_SET_NO, (uint32_t)setno); 2781 if (err != DDI_SUCCESS) { 2782 goto fail; 2783 } 2784 2785 /* Add the device id attribute */ 2786 err = nvlist_add_uint64(attr_list, SVM_DEV_ID, (uint64_t)devid); 2787 if (err != DDI_SUCCESS) { 2788 goto fail; 2789 } 2790 2791 /* Add the device name attribute */ 2792 devname = obj2devname(tag, setno, devid); 2793 if (devname != NULL) { 2794 err = nvlist_add_string(attr_list, SVM_DEV_NAME, 2795 devname); 2796 freestr(devname); 2797 } else { 2798 err = nvlist_add_string(attr_list, SVM_DEV_NAME, 2799 "unspecified"); 2800 } 2801 if (err != DDI_SUCCESS) { 2802 goto fail; 2803 } 2804 2805 /* Attempt to post event */ 2806 err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class, 2807 se_subclass, attr_list, &eid, DDI_SLEEP); 2808 2809 nvlist_free(attr_list); 2810 if (err != DDI_SUCCESS) { 2811 cmn_err(CE_WARN, "Failed to log event for %s, %s," 2812 " err=%x", se_class, se_subclass, err); 2813 } 2814 } 2815 2816 return; 2817 2818 fail: 2819 nvlist_free(attr_list); 2820 cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x", 2821 se_class, se_subclass, err); 2822 } 2823 2824 void 2825 md_clear_named_service() 2826 { 2827 rw_enter(&ni_rwlp.lock, RW_WRITER); 2828 notify_interface = NULL; 2829 rw_exit(&ni_rwlp.lock); 2830 } 2831 2832 void 2833 md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock) 2834 { 2835 mdi_unit_t *ui; 2836 set_t setno = MD_MIN2SET(mnum); 2837 2838 ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP); 2839 ui->ui_opsindex = ops->md_selfindex; 2840 2841 /* initialize all the incore conditional variables */ 2842 mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL); 2843 cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL); 2844 2845 if (alloc_lock) { 2846 ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP); 2847 mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL); 2848 cv_init(&ui->ui_io_lock->io_cv, 
NULL, CV_DEFAULT, NULL); 2849 mutex_init(&ui->ui_io_lock->io_list_mutex, NULL, 2850 MUTEX_DEFAULT, NULL); 2851 ui->ui_io_lock->io_list_front = NULL; 2852 ui->ui_io_lock->io_list_back = NULL; 2853 } 2854 if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) { 2855 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 2856 MDI_VOIDUNIT(mnum) = (void *) ui; 2857 rw_exit(&md_unit_array_rw.lock); 2858 } else 2859 MDI_VOIDUNIT(mnum) = (void *) ui; 2860 2861 rw_enter(&ops->md_link_rw.lock, RW_WRITER); 2862 ui->ui_link.ln_next = ops->md_head; 2863 ui->ui_link.ln_setno = setno; 2864 ui->ui_link.ln_id = mnum; 2865 ops->md_head = &ui->ui_link; 2866 /* setup the unavailable field */ 2867 #if defined(_ILP32) 2868 if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) { 2869 ui->ui_tstate |= MD_64MD_ON_32KERNEL; 2870 cmn_err(CE_NOTE, "d%d is unavailable because 64 bit " 2871 "metadevices are not accessible on a 32 bit kernel", 2872 mnum); 2873 } 2874 #endif 2875 2876 rw_exit(&ops->md_link_rw.lock); 2877 } 2878 2879 void 2880 md_destroy_unit_incore(minor_t mnum, md_ops_t *ops) 2881 { 2882 mdi_unit_t *ui; 2883 2884 /* 2885 * ASSUMPTION: md_unit_array_rw WRITER lock is held. 
2886 */ 2887 ui = MDI_UNIT(mnum); 2888 if (ui == NULL) 2889 return; 2890 2891 md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock, 2892 &ops->md_head); 2893 2894 /* destroy the io lock if one is being used */ 2895 if (ui->ui_io_lock) { 2896 mutex_destroy(&ui->ui_io_lock->io_mx); 2897 cv_destroy(&ui->ui_io_lock->io_cv); 2898 kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t)); 2899 } 2900 2901 /* teardown kstat */ 2902 md_kstat_destroy(mnum); 2903 2904 /* destroy all the incore conditional variables */ 2905 mutex_destroy(&ui->ui_mx); 2906 cv_destroy(&ui->ui_cv); 2907 2908 kmem_free(ui, sizeof (mdi_unit_t)); 2909 MDI_VOIDUNIT(mnum) = (void *) NULL; 2910 } 2911 2912 void 2913 md_rem_names(sv_dev_t *sv, int nsv) 2914 { 2915 int i, s; 2916 int max_sides; 2917 2918 if (nsv == 0) 2919 return; 2920 2921 /* All entries removed are in the same diskset */ 2922 if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET) 2923 max_sides = MD_MNMAXSIDES; 2924 else 2925 max_sides = MD_MAXSIDES; 2926 2927 for (i = 0; i < nsv; i++) 2928 for (s = 0; s < max_sides; s++) 2929 (void) md_remdevname(sv[i].setno, s, sv[i].key); 2930 } 2931 2932 /* 2933 * Checking user args before we get into physio - returns 0 for ok, else errno 2934 * We do a lot of checking against illegal arguments here because some of the 2935 * real disk drivers don't like certain kinds of arguments. (e.g xy doesn't 2936 * like odd address user buffer.) Those drivers capture bad arguments in 2937 * xxread and xxwrite. But since meta-driver calls their strategy routines 2938 * directly, two bad scenario might happen: 2939 * 1. the real strategy doesn't like it and panic. 2940 * 2. the real strategy doesn't like it and set B_ERROR. 2941 * 2942 * The second case is no better than the first one, since the meta-driver 2943 * will treat it as a media-error and off line the mirror metapartition. 2944 * (Too bad there is no way to tell what error it is.) 
2945 * 2946 */ 2947 int 2948 md_chk_uio(struct uio *uio) 2949 { 2950 int i; 2951 struct iovec *iov; 2952 2953 /* 2954 * Check for negative or not block-aligned offset 2955 */ 2956 if ((uio->uio_loffset < 0) || 2957 ((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) { 2958 return (EINVAL); 2959 } 2960 iov = uio->uio_iov; 2961 i = uio->uio_iovcnt; 2962 2963 while (i--) { 2964 if ((iov->iov_len & (DEV_BSIZE - 1)) != 0) 2965 return (EINVAL); 2966 /* 2967 * Bug # 1212146 2968 * The default is to not check alignment, but we can now check 2969 * for a larger number of alignments if desired. 2970 */ 2971 if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask) 2972 return (EINVAL); 2973 iov++; 2974 } 2975 return (0); 2976 } 2977 2978 char * 2979 md_shortname( 2980 minor_t mnum 2981 ) 2982 { 2983 static char buf[MAXPATHLEN]; 2984 char *devname; 2985 char *invalid = " (Invalid minor number %u) "; 2986 char *metaname; 2987 mdc_unit_t *un; 2988 side_t side; 2989 set_t setno = MD_MIN2SET(mnum); 2990 unit_t unit = MD_MIN2UNIT(mnum); 2991 2992 if ((un = MD_UNIT(mnum)) == NULL) { 2993 (void) snprintf(buf, sizeof (buf), invalid, mnum); 2994 return (buf); 2995 } 2996 2997 /* 2998 * If unit is not a friendly name unit, derive the name from the 2999 * minor number. 3000 */ 3001 if ((un->un_revision & MD_FN_META_DEV) == 0) { 3002 /* This is a traditional metadevice */ 3003 if (setno == MD_LOCAL_SET) { 3004 (void) snprintf(buf, sizeof (buf), "d%u", 3005 (unsigned)unit); 3006 } else { 3007 (void) snprintf(buf, sizeof (buf), "%s/d%u", 3008 mddb_getsetname(setno), (unsigned)unit); 3009 } 3010 return (buf); 3011 } 3012 3013 /* 3014 * It is a friendly name metadevice, so we need to get its name. 
3015 */ 3016 side = mddb_getsidenum(setno); 3017 devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP); 3018 if (md_getdevname(setno, side, MD_KEYWILD, 3019 md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) { 3020 /* 3021 * md_getdevname has given us either /dev/md/dsk/<metaname> 3022 * or /dev/md/<setname>/dsk/<metname> depending on whether 3023 * or not we are in the local set. Thus, we'll pull the 3024 * metaname from this string. 3025 */ 3026 if ((metaname = strrchr(devname, '/')) == NULL) { 3027 (void) snprintf(buf, sizeof (buf), invalid, mnum); 3028 goto out; 3029 } 3030 metaname++; /* move past slash */ 3031 if (setno == MD_LOCAL_SET) { 3032 /* No set name. */ 3033 (void) snprintf(buf, sizeof (buf), "%s", metaname); 3034 } else { 3035 /* Include setname */ 3036 (void) snprintf(buf, sizeof (buf), "%s/%s", 3037 mddb_getsetname(setno), metaname); 3038 } 3039 } else { 3040 /* We couldn't find the name. */ 3041 (void) snprintf(buf, sizeof (buf), invalid, mnum); 3042 } 3043 3044 out: 3045 kmem_free(devname, MAXPATHLEN); 3046 return (buf); 3047 } 3048 3049 char * 3050 md_devname( 3051 set_t setno, 3052 md_dev64_t dev, 3053 char *buf, 3054 size_t size 3055 ) 3056 { 3057 static char mybuf[MD_MAX_CTDLEN]; 3058 int err; 3059 3060 if (buf == NULL) { 3061 buf = mybuf; 3062 size = sizeof (mybuf); 3063 } else { 3064 ASSERT(size >= MD_MAX_CTDLEN); 3065 } 3066 3067 err = md_getdevname_common(setno, mddb_getsidenum(setno), 3068 0, dev, buf, size, MD_NOWAIT_LOCK); 3069 if (err) { 3070 if (err == ENOENT) { 3071 (void) sprintf(buf, "(Unavailable)"); 3072 } else { 3073 (void) sprintf(buf, "(%u.%u)", 3074 md_getmajor(dev), md_getminor(dev)); 3075 } 3076 } 3077 3078 return (buf); 3079 } 3080 void 3081 md_minphys(buf_t *pb) 3082 { 3083 extern unsigned md_maxbcount; 3084 3085 if (pb->b_bcount > md_maxbcount) 3086 pb->b_bcount = md_maxbcount; 3087 } 3088 3089 void 3090 md_bioinit(struct buf *bp) 3091 { 3092 ASSERT(bp); 3093 3094 bioinit(bp); 3095 bp->b_back = bp; 3096 bp->b_forw 
= bp; 3097 bp->b_flags = B_BUSY; /* initialize flags */ 3098 } 3099 3100 void 3101 md_bioreset(struct buf *bp) 3102 { 3103 ASSERT(bp); 3104 3105 bioreset(bp); 3106 bp->b_back = bp; 3107 bp->b_forw = bp; 3108 bp->b_flags = B_BUSY; /* initialize flags */ 3109 } 3110 3111 /* 3112 * md_bioclone is needed as long as the real bioclone only takes a daddr_t 3113 * as block number. 3114 * We simply call bioclone with all input parameters but blkno, and set the 3115 * correct blkno afterwards. 3116 * Caveat Emptor: bp_mem must not be NULL! 3117 */ 3118 buf_t * 3119 md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno, 3120 int (*iodone)(buf_t *), buf_t *bp_mem, int sleep) 3121 { 3122 (void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep); 3123 bp_mem->b_lblkno = blkno; 3124 return (bp_mem); 3125 } 3126 3127 3128 /* 3129 * kstat stuff 3130 */ 3131 void 3132 md_kstat_init_ui( 3133 minor_t mnum, 3134 mdi_unit_t *ui 3135 ) 3136 { 3137 if ((ui != NULL) && (ui->ui_kstat == NULL)) { 3138 set_t setno = MD_MIN2SET(mnum); 3139 unit_t unit = MD_MIN2UNIT(mnum); 3140 char module[KSTAT_STRLEN]; 3141 char *p = module; 3142 3143 if (setno != MD_LOCAL_SET) { 3144 char buf[64]; 3145 char *s = buf; 3146 char *e = module + sizeof (module) - 4; 3147 3148 (void) sprintf(buf, "%u", setno); 3149 while ((p < e) && (*s != '\0')) 3150 *p++ = *s++; 3151 *p++ = '/'; 3152 } 3153 *p++ = 'm'; 3154 *p++ = 'd'; 3155 *p = '\0'; 3156 if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk", 3157 KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) { 3158 ui->ui_kstat->ks_lock = &ui->ui_mx; 3159 kstat_install(ui->ui_kstat); 3160 } 3161 } 3162 } 3163 3164 void 3165 md_kstat_init( 3166 minor_t mnum 3167 ) 3168 { 3169 md_kstat_init_ui(mnum, MDI_UNIT(mnum)); 3170 } 3171 3172 void 3173 md_kstat_destroy_ui( 3174 mdi_unit_t *ui 3175 ) 3176 { 3177 /* 3178 * kstat_delete() interface has it's own locking mechanism and 3179 * does not allow holding of kstat lock (ks_lock). 
3180 * Note: ks_lock == ui_mx from the md_kstat_init_ui(). 3181 */ 3182 if ((ui != NULL) && (ui->ui_kstat != NULL)) { 3183 kstat_delete(ui->ui_kstat); 3184 ui->ui_kstat = NULL; 3185 } 3186 } 3187 3188 void 3189 md_kstat_destroy( 3190 minor_t mnum 3191 ) 3192 { 3193 md_kstat_destroy_ui(MDI_UNIT(mnum)); 3194 } 3195 3196 /* 3197 * In the following subsequent routines, locks are held before checking the 3198 * validity of ui_kstat. This is done to make sure that we don't trip over 3199 * a NULL ui_kstat anymore. 3200 */ 3201 3202 void 3203 md_kstat_waitq_enter( 3204 mdi_unit_t *ui 3205 ) 3206 { 3207 mutex_enter(&ui->ui_mx); 3208 if (ui->ui_kstat != NULL) 3209 kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat)); 3210 mutex_exit(&ui->ui_mx); 3211 } 3212 3213 void 3214 md_kstat_waitq_to_runq( 3215 mdi_unit_t *ui 3216 ) 3217 { 3218 mutex_enter(&ui->ui_mx); 3219 if (ui->ui_kstat != NULL) 3220 kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat)); 3221 mutex_exit(&ui->ui_mx); 3222 } 3223 3224 void 3225 md_kstat_waitq_exit( 3226 mdi_unit_t *ui 3227 ) 3228 { 3229 mutex_enter(&ui->ui_mx); 3230 if (ui->ui_kstat != NULL) 3231 kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat)); 3232 mutex_exit(&ui->ui_mx); 3233 } 3234 3235 void 3236 md_kstat_runq_enter( 3237 mdi_unit_t *ui 3238 ) 3239 { 3240 mutex_enter(&ui->ui_mx); 3241 if (ui->ui_kstat != NULL) 3242 kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat)); 3243 mutex_exit(&ui->ui_mx); 3244 } 3245 3246 void 3247 md_kstat_runq_exit( 3248 mdi_unit_t *ui 3249 ) 3250 { 3251 mutex_enter(&ui->ui_mx); 3252 if (ui->ui_kstat != NULL) 3253 kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat)); 3254 mutex_exit(&ui->ui_mx); 3255 } 3256 3257 void 3258 md_kstat_done( 3259 mdi_unit_t *ui, 3260 buf_t *bp, 3261 int war 3262 ) 3263 { 3264 size_t n_done; 3265 3266 /* check for end of device */ 3267 if ((bp->b_resid != 0) && (! 
(bp->b_flags & B_ERROR))) { 3268 n_done = bp->b_bcount; 3269 } else if (bp->b_bcount < bp->b_resid) { 3270 n_done = 0; 3271 } else { 3272 n_done = bp->b_bcount - bp->b_resid; 3273 } 3274 3275 /* do accounting */ 3276 mutex_enter(&ui->ui_mx); 3277 if (ui->ui_kstat != NULL) { 3278 if ((! war) && (bp->b_flags & B_READ)) { 3279 KSTAT_IO_PTR(ui->ui_kstat)->reads++; 3280 KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done; 3281 } else { 3282 KSTAT_IO_PTR(ui->ui_kstat)->writes++; 3283 KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done; 3284 } 3285 kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat)); 3286 } 3287 mutex_exit(&ui->ui_mx); 3288 } 3289 3290 pid_t 3291 md_getpid() 3292 { 3293 pid_t valuep; 3294 if (drv_getparm(PPID, (pid_t *)&valuep) != 0) { 3295 ASSERT(0); 3296 return ((pid_t)0); 3297 } else { 3298 ASSERT(valuep); 3299 return (valuep); 3300 } 3301 } 3302 3303 3304 proc_t * 3305 md_getproc() 3306 { 3307 proc_t *valuep; 3308 if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) { 3309 ASSERT(0); 3310 return ((proc_t *)NULL); 3311 } else { 3312 ASSERT(valuep); 3313 return (valuep); 3314 } 3315 } 3316 3317 extern kmutex_t pidlock; 3318 3319 /* 3320 * this check to see if a process pid pair are still running. For the 3321 * disk set lock when both pid/proc are zero then the locks is not 3322 * currently held. 3323 */ 3324 int 3325 md_checkpid(pid_t pid, proc_t *proc) 3326 { 3327 int retval = 1; 3328 3329 if (pid == 0 && proc == NULL) 3330 return (0); 3331 3332 mutex_enter(&pidlock); 3333 if (prfind(pid) != proc) 3334 retval = 0; 3335 mutex_exit(&pidlock); 3336 return (retval); 3337 } 3338 3339 /* 3340 * NAME: md_init_probereq 3341 * 3342 * DESCRIPTION: initializes a probe request. Parcels out the mnums such that 3343 * they can be dispatched to multiple daemon threads. 
3344 * 3345 * PARAMETERS: struct md_probedev *p pointer ioctl input 3346 * 3347 * RETURN VALUE: Returns errno 3348 * 3349 */ 3350 3351 int 3352 md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp) 3353 { 3354 int err = 0; 3355 int modindx; 3356 intptr_t (*probe_test)(); 3357 3358 /* 3359 * Initialize the semaphores and mutex 3360 * for the request 3361 */ 3362 3363 p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP); 3364 3365 p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP); 3366 sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL); 3367 mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL); 3368 3369 modindx = md_getmodindex(&(p->probe.md_driver), 1, 1); 3370 probe_test = md_get_named_service(NODEV64, modindx, 3371 p->probe.test_name, 0); 3372 if (probe_test == NULL) { 3373 err = EINVAL; 3374 goto err_out; 3375 } 3376 3377 err = md_create_probe_rqlist(p, hdrpp, probe_test); 3378 err_out: 3379 return (err); 3380 } 3381 3382 /* 3383 * NAME: md_probe_one 3384 * 3385 * DESCRIPTION: Generic routine for probing disks. This is called from the 3386 * daemon. 3387 * 3388 * PARAMETERS: probe_req_t *reqp pointer to the probe request structure. 3389 * 3390 */ 3391 3392 void 3393 md_probe_one(probe_req_t *reqp) 3394 { 3395 mdi_unit_t *ui; 3396 md_probedev_impl_t *p; 3397 int err = 0; 3398 set_t setno; 3399 3400 p = (md_probedev_impl_t *)reqp->private_handle; 3401 /* 3402 * Validate the unit while holding the global ioctl lock, then 3403 * obtain the unit_writerlock. Once the writerlock has been obtained 3404 * we can release the global lock. As long as we hold one of these 3405 * locks this will prevent a metaclear operation being performed 3406 * on the metadevice because metaclear takes the readerlock (via 3407 * openclose lock). 3408 * To avoid a potential deadlock with the probe_fcn() causing i/o to 3409 * be issued to the writerlock'd metadevice we only grab the writerlock 3410 * if the unit is not an SVM root device. 
3411 */ 3412 while (md_ioctl_lock_enter() == EINTR) 3413 ; 3414 setno = MD_MIN2SET(reqp->mnum); 3415 ui = MDI_UNIT(reqp->mnum); 3416 if (ui != NULL) { 3417 int writer_grabbed; 3418 dev_t svm_root; 3419 3420 if ((setno == MD_LOCAL_SET) && root_is_svm) { 3421 svm_root = getrootdev(); 3422 3423 if (getminor(svm_root) == reqp->mnum) { 3424 writer_grabbed = 0; 3425 } else { 3426 writer_grabbed = 1; 3427 (void) md_unit_writerlock_common(ui, 0); 3428 } 3429 } else { 3430 writer_grabbed = 1; 3431 (void) md_unit_writerlock_common(ui, 0); 3432 } 3433 (void) md_ioctl_lock_exit(0, 0, 0, FALSE); 3434 err = (*reqp->probe_fcn)(ui, reqp->mnum); 3435 if (writer_grabbed) { 3436 md_unit_writerexit(ui); 3437 } 3438 } else { 3439 (void) md_ioctl_lock_exit(0, 0, 0, FALSE); 3440 } 3441 3442 /* update the info in the probe structure */ 3443 3444 mutex_enter(PROBE_MX(p)); 3445 if (err != 0) { 3446 cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err, 3447 reqp->mnum); 3448 (void) mdsyserror(&(p->probe.mde), err); 3449 } 3450 3451 mutex_exit(PROBE_MX(p)); 3452 sema_v(PROBE_SEMA(p)); 3453 3454 kmem_free(reqp, sizeof (probe_req_t)); 3455 } 3456 char * 3457 md_strdup(char *cp) 3458 { 3459 char *new_cp = NULL; 3460 3461 new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP); 3462 3463 return (strcpy(new_cp, cp)); 3464 } 3465 3466 void 3467 freestr(char *cp) 3468 { 3469 kmem_free(cp, strlen(cp) + 1); 3470 } 3471 3472 /* 3473 * Validate the list and skip invalid devices. Then create 3474 * a doubly linked circular list of devices to probe. 3475 * The hdr points to the head and tail of this list. 
3476 */ 3477 3478 static int 3479 md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr, 3480 intptr_t (*probe_test)()) 3481 { 3482 int i, err, nodevcnt; 3483 probe_req_t *tp; 3484 daemon_queue_t *hp; 3485 minor_t mnum; 3486 3487 nodevcnt = 0; 3488 3489 hp = NULL; 3490 3491 for (i = 0; i < plist->probe.nmdevs; i++) { 3492 mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i]; 3493 if (MDI_UNIT(mnum) == NULL) { 3494 cmn_err(CE_WARN, "md: Cannot probe %s since it does " 3495 "not exist", md_shortname(mnum)); 3496 nodevcnt++; 3497 continue; 3498 } 3499 tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP); 3500 tp->mnum = mnum; 3501 tp->private_handle = (void *)plist; 3502 tp->probe_fcn = probe_test; 3503 if (hp == NULL) { 3504 hp = (daemon_queue_t *)tp; 3505 hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp; 3506 } else { 3507 tp->dq.dq_next = hp; 3508 tp->dq.dq_prev = hp->dq_prev; 3509 hp->dq_prev->dq_next = (daemon_queue_t *)tp; 3510 hp->dq_prev = (daemon_queue_t *)tp; 3511 } 3512 } 3513 3514 *hdr = hp; 3515 if (nodevcnt > 0) 3516 plist->probe.nmdevs -= nodevcnt; 3517 3518 /* 3519 * If there are no devices to be probed because they were 3520 * incorrect, then return an error. 3521 */ 3522 err = (plist->probe.nmdevs == 0) ? ENODEV : 0; 3523 3524 return (err); 3525 } 3526 3527 /* 3528 * This routine increments the I/O count for set I/O operations. This 3529 * value is used to determine if an I/O can done. If a release is in 3530 * process this will return an error and cause the I/O to be errored. 
3531 */ 3532 int 3533 md_inc_iocount(set_t setno) 3534 { 3535 int rc = 0; 3536 3537 if (setno == 0) 3538 return (0); 3539 3540 mutex_enter(&md_set_io[setno].md_io_mx); 3541 if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) { 3542 rc = EIO; 3543 goto out; 3544 } 3545 3546 ASSERT(md_set_io[setno].io_cnt >= 0); 3547 md_set_io[setno].io_cnt++; 3548 3549 out: mutex_exit(&md_set_io[setno].md_io_mx); 3550 return (rc); 3551 } 3552 3553 void 3554 md_inc_iocount_noblock(set_t setno) 3555 { 3556 3557 if (setno == 0) 3558 return; 3559 3560 mutex_enter(&md_set_io[setno].md_io_mx); 3561 md_set_io[setno].io_cnt++; 3562 mutex_exit(&md_set_io[setno].md_io_mx); 3563 } 3564 void 3565 md_dec_iocount(set_t setno) 3566 { 3567 3568 if (setno == 0) 3569 return; 3570 3571 mutex_enter(&md_set_io[setno].md_io_mx); 3572 md_set_io[setno].io_cnt--; 3573 ASSERT(md_set_io[setno].io_cnt >= 0); 3574 if ((md_set_io[setno].io_state & MD_SET_RELEASE) && 3575 (md_set_io[setno].io_cnt == 0)) 3576 cv_broadcast(&md_set_io[setno].md_io_cv); 3577 mutex_exit(&md_set_io[setno].md_io_mx); 3578 } 3579 3580 int 3581 md_isblock_setio(set_t setno) 3582 { 3583 int rc = 0; 3584 3585 if (setno == 0) 3586 return (0); 3587 3588 mutex_enter(&md_set_io[setno].md_io_mx); 3589 if (md_set_io[setno].io_state & MD_SET_RELEASE) 3590 rc = 1; 3591 3592 mutex_exit(&md_set_io[setno].md_io_mx); 3593 return (rc); 3594 } 3595 3596 int 3597 md_block_setio(set_t setno) 3598 { 3599 int rc = 0; 3600 3601 if (setno == 0) 3602 return (1); 3603 3604 mutex_enter(&md_set_io[setno].md_io_mx); 3605 md_set_io[setno].io_state = MD_SET_RELEASE; 3606 3607 while (md_set_io[setno].io_cnt > 0) { 3608 cv_wait(&md_set_io[setno].md_io_cv, 3609 &md_set_io[setno].md_io_mx); 3610 } 3611 rc = 1; 3612 3613 3614 ASSERT(md_set_io[setno].io_cnt == 0); 3615 mutex_exit(&md_set_io[setno].md_io_mx); 3616 3617 return (rc); 3618 } 3619 3620 void 3621 md_clearblock_setio(set_t setno) 3622 { 3623 if (setno == 0) 3624 return; 3625 3626 
mutex_enter(&md_set_io[setno].md_io_mx); 3627 md_set_io[setno].io_state = MD_SET_ACTIVE; 3628 mutex_exit(&md_set_io[setno].md_io_mx); 3629 } 3630 3631 void 3632 md_unblock_setio(set_t setno) 3633 { 3634 if (setno == 0) 3635 return; 3636 3637 mutex_enter(&md_set_io[setno].md_io_mx); 3638 #ifdef DEBUG 3639 if (md_set_io[setno].io_cnt != 0) { 3640 cmn_err(CE_NOTE, "set %d count was %ld at take", 3641 setno, md_set_io[setno].io_cnt); 3642 } 3643 #endif /* DEBUG */ 3644 3645 md_set_io[setno].io_state = MD_SET_ACTIVE; 3646 md_set_io[setno].io_cnt = 0; 3647 mutex_exit(&md_set_io[setno].md_io_mx); 3648 } 3649 3650 /* 3651 * Test and set version of the md_block_setio. 3652 * Set the io_state to keep new I/O from being issued. 3653 * If there is I/O currently in progress, then set io_state to active 3654 * and return failure. Otherwise, return a 1 for success. 3655 * 3656 * Used in a MN diskset since the commd must be suspended before 3657 * this node can attempt to withdraw from a diskset. But, with commd 3658 * suspended, I/O may have been issued that can never finish until 3659 * commd is resumed (allocation of hotspare, etc). So, if I/O is 3660 * outstanding after diskset io_state is marked RELEASE, then set diskset 3661 * io_state back to ACTIVE and return failure. 
3662 */ 3663 int 3664 md_tas_block_setio(set_t setno) 3665 { 3666 int rc; 3667 3668 if (setno == 0) 3669 return (1); 3670 3671 mutex_enter(&md_set_io[setno].md_io_mx); 3672 md_set_io[setno].io_state = MD_SET_RELEASE; 3673 3674 if (md_set_io[setno].io_cnt > 0) { 3675 md_set_io[setno].io_state = MD_SET_ACTIVE; 3676 rc = 0; 3677 } else { 3678 rc = 1; 3679 } 3680 3681 mutex_exit(&md_set_io[setno].md_io_mx); 3682 3683 return (rc); 3684 } 3685 3686 void 3687 md_biodone(struct buf *pb) 3688 { 3689 minor_t mnum; 3690 set_t setno; 3691 mdi_unit_t *ui; 3692 3693 mnum = getminor(pb->b_edev); 3694 setno = MD_MIN2SET(mnum); 3695 3696 if (setno == 0) { 3697 biodone(pb); 3698 return; 3699 } 3700 3701 #ifdef DEBUG 3702 ui = MDI_UNIT(mnum); 3703 if (!md_unit_isopen(ui)) 3704 cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum)); 3705 #endif /* DEBUG */ 3706 3707 /* 3708 * Handle the local diskset 3709 */ 3710 if (md_set_io[setno].io_cnt > 0) 3711 md_dec_iocount(setno); 3712 3713 #ifdef DEBUG 3714 /* 3715 * this is being done after the lock is dropped so there 3716 * are cases it may be invalid. It is advisory. 3717 */ 3718 if (md_set_io[setno].io_state & MD_SET_RELEASE) { 3719 /* Only display this error once for this metadevice */ 3720 if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) { 3721 cmn_err(CE_NOTE, 3722 "I/O to %s attempted during set RELEASE\n", 3723 md_shortname(mnum)); 3724 ui->ui_tstate |= MD_RELEASE_IOERR_DONE; 3725 } 3726 } 3727 #endif /* DEBUG */ 3728 3729 biodone(pb); 3730 } 3731 3732 3733 /* 3734 * Driver special private devt handling routine 3735 * INPUT: md_dev64_t 3736 * OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel. 
3737 */ 3738 dev_t 3739 md_dev64_to_dev(md_dev64_t dev) 3740 { 3741 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 3742 minor_t minor = (minor_t)(dev & MAXMIN64); 3743 3744 return (makedevice(major, minor)); 3745 3746 } 3747 3748 /* 3749 * Driver private makedevice routine 3750 * INPUT: major_t major, minor_t minor 3751 * OUTPUT: md_dev64_t, no matter if on 32 bit or 64 bit kernel. 3752 */ 3753 md_dev64_t 3754 md_makedevice(major_t major, minor_t minor) 3755 { 3756 return (((md_dev64_t)major << NBITSMINOR64) | minor); 3757 3758 } 3759 3760 3761 /* 3762 * Driver private devt md_getmajor routine 3763 * INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device 3764 * OUTPUT: the appropriate major number 3765 */ 3766 major_t 3767 md_getmajor(md_dev64_t dev) 3768 { 3769 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 3770 3771 if (major == 0) { 3772 /* Here we were given a 32bit dev */ 3773 major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32; 3774 } 3775 return (major); 3776 } 3777 3778 /* 3779 * Driver private devt md_getminor routine 3780 * INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device 3781 * OUTPUT: the appropriate minor number 3782 */ 3783 minor_t 3784 md_getminor(md_dev64_t dev) 3785 { 3786 minor_t minor; 3787 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 3788 3789 if (major == 0) { 3790 /* Here we were given a 32bit dev */ 3791 minor = (minor_t)(dev & MAXMIN32); 3792 } else { 3793 minor = (minor_t)(dev & MAXMIN64); 3794 } 3795 return (minor); 3796 } 3797 3798 int 3799 md_check_ioctl_against_unit(int cmd, mdc_unit_t c) 3800 { 3801 /* 3802 * If the metadevice is an old style device, it has a vtoc, 3803 * in that case all reading EFI ioctls are not applicable. 3804 * If the metadevice has an EFI label, reading vtoc and geom ioctls 3805 * are not supposed to work. 
3806 */ 3807 switch (cmd) { 3808 case DKIOCGGEOM: 3809 case DKIOCGAPART: 3810 /* if > 2 TB then fail */ 3811 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) { 3812 return (ENOTSUP); 3813 } 3814 break; 3815 case DKIOCGVTOC: 3816 /* if > 2 TB then fail */ 3817 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) { 3818 return (ENOTSUP); 3819 } 3820 3821 /* if > 1 TB but < 2TB return overflow */ 3822 if (c.un_revision & MD_64BIT_META_DEV) { 3823 return (EOVERFLOW); 3824 } 3825 break; 3826 case DKIOCGEXTVTOC: 3827 /* if > 2 TB then fail */ 3828 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) { 3829 return (ENOTSUP); 3830 } 3831 break; 3832 case DKIOCGETEFI: 3833 case DKIOCPARTITION: 3834 if ((c.un_flag & MD_EFILABEL) == 0) { 3835 return (ENOTSUP); 3836 } 3837 break; 3838 3839 case DKIOCSETEFI: 3840 /* setting an EFI label should always be ok */ 3841 return (0); 3842 3843 case DKIOCSVTOC: 3844 /* if > 2 TB then fail */ 3845 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) { 3846 return (ENOTSUP); 3847 } 3848 3849 /* if > 1 TB but < 2TB return overflow */ 3850 if (c.un_revision & MD_64BIT_META_DEV) { 3851 return (EOVERFLOW); 3852 } 3853 break; 3854 case DKIOCSEXTVTOC: 3855 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) { 3856 return (ENOTSUP); 3857 } 3858 break; 3859 } 3860 return (0); 3861 } 3862 3863 /* 3864 * md_vtoc_to_efi_record() 3865 * Input: record id of the vtoc record 3866 * Output: record id of the efi record 3867 * Function: 3868 * - reads the volume name from the vtoc record 3869 * - converts the volume name to a format, libefi understands 3870 * - creates a new record of size MD_EFI_PARTNAME_BYTES 3871 * - stores the volname in that record, 3872 * - commits that record 3873 * - returns the recid of the efi record. 
3874 * Caveat Emptor: 3875 * The calling routine must do something like 3876 * - un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid) 3877 * - commit(un) 3878 * - delete(vtoc_recid) 3879 * in order to keep the mddb consistent in case of a panic in the middle. 3880 * Errors: 3881 * - returns 0 on any error 3882 */ 3883 mddb_recid_t 3884 md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno) 3885 { 3886 struct vtoc *vtoc; 3887 ushort_t *v; 3888 mddb_recid_t efi_recid; 3889 int i; 3890 3891 if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) { 3892 return (0); 3893 } 3894 vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid); 3895 efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0, 3896 MD_CRO_32BIT, setno); 3897 if (efi_recid < 0) { 3898 return (0); 3899 } 3900 v = (ushort_t *)mddb_getrecaddr(efi_recid); 3901 3902 /* This for loop read, converts and writes */ 3903 for (i = 0; i < LEN_DKL_VVOL; i++) { 3904 v[i] = LE_16((uint16_t)vtoc->v_volume[i]); 3905 } 3906 /* commit the new record */ 3907 mddb_commitrec_wrapper(efi_recid); 3908 3909 return (efi_recid); 3910 } 3911 3912 /* 3913 * Send a kernel message. 3914 * user has to provide for an allocated result structure 3915 * If the door handler disappears we retry, emitting warnings every so often. 3916 * 3917 * The recipient argument is almost always unused, and is therefore typically 3918 * set to zero, as zero is an invalid cluster nodeid. The exceptions are the 3919 * marking and clearing of the DRL from a node that is not currently the 3920 * owner. In these cases, the recipient argument will be the nodeid of the 3921 * mirror owner, and MD_MSGF_DIRECTED will be set in the flags. Non-owner 3922 * nodes will not receive these messages. 3923 * 3924 * For the case where md_mn_is_commd_present() is false, we simply pre-set 3925 * the result->kmmr_comm_state to MDMNE_RPC_FAIL. 3926 * This covers the case where the service mdcommd has been killed and so we do 3927 * not get a 'new' result structure copied back. 
Instead we return with the 3928 * supplied result field, and we need to flag a failure to the caller. 3929 */ 3930 int 3931 mdmn_ksend_message( 3932 set_t setno, 3933 md_mn_msgtype_t type, 3934 uint_t flags, 3935 md_mn_nodeid_t recipient, 3936 char *data, 3937 int size, 3938 md_mn_kresult_t *result) 3939 { 3940 door_arg_t da; 3941 md_mn_kmsg_t *kmsg; 3942 uint_t send_try_cnt = 0; 3943 uint_t retry_noise_cnt = 0; 3944 int rval; 3945 k_sigset_t oldmask, newmask; 3946 3947 /* 3948 * Ensure that we default to a recoverable failure state if the 3949 * door upcall cannot pass the request on to rpc.mdcommd. 3950 * This may occur when shutting the node down while there is still 3951 * a mirror resync or metadevice state update occurring. 3952 */ 3953 result->kmmr_comm_state = MDMNE_RPC_FAIL; 3954 result->kmmr_exitval = ~0; 3955 3956 if (size > MDMN_MAX_KMSG_DATA) 3957 return (ENOMEM); 3958 kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP); 3959 kmsg->kmsg_flags = flags; 3960 kmsg->kmsg_setno = setno; 3961 kmsg->kmsg_recipient = recipient; 3962 kmsg->kmsg_type = type; 3963 kmsg->kmsg_size = size; 3964 bcopy(data, &(kmsg->kmsg_data), size); 3965 3966 /* 3967 * Wait for the door handle to be established. 3968 */ 3969 while (mdmn_door_did == -1) { 3970 if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) { 3971 cmn_err(CE_WARN, "door handle not yet ready. " 3972 "Check if /usr/lib/lvm/mddoors is running"); 3973 } 3974 delay(md_hz); 3975 } 3976 3977 /* 3978 * If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we 3979 * do not fail if the user process receives a signal while we're 3980 * active in the door interface. 3981 */ 3982 if (flags & MD_MSGF_BLK_SIGNAL) { 3983 sigfillset(&newmask); 3984 sigreplace(&newmask, &oldmask); 3985 } 3986 3987 /* 3988 * If message failed with an RPC_FAILURE when rpc.mdcommd had 3989 * been gracefully shutdown (md_mn_is_commd_present returns FALSE) 3990 * then don't retry the message anymore. 
If message 3991 * failed due to any other reason, then retry up to MD_MN_WARN_INTVL 3992 * times which should allow a shutting down system time to 3993 * notify the kernel of a graceful shutdown of rpc.mdcommd. 3994 * 3995 * Caller of this routine will need to check the md_mn_commd_present 3996 * flag and the failure error in order to determine whether to panic 3997 * or not. If md_mn_commd_present is set to 0 and failure error 3998 * is RPC_FAILURE, the calling routine should not panic since the 3999 * system is in the process of being shutdown. 4000 * 4001 */ 4002 4003 retry_noise_cnt = send_try_cnt = 0; 4004 while (md_mn_is_commd_present_lite()) { 4005 /* 4006 * data_ptr and data_size are initialized here because on 4007 * return from the upcall, they contain data duplicated from 4008 * rbuf and rsize. This causes subsequent upcalls to fail. 4009 */ 4010 da.data_ptr = (char *)(kmsg); 4011 da.data_size = sizeof (md_mn_kmsg_t); 4012 da.desc_ptr = NULL; 4013 da.desc_num = 0; 4014 da.rbuf = (char *)result; 4015 da.rsize = sizeof (*result); 4016 4017 while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da, 4018 NULL, SIZE_MAX, 0)) != 0) { 4019 if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) { 4020 if (rval == EAGAIN) { 4021 cmn_err(CE_WARN, 4022 "md: door_upcall failed. " 4023 "Check if mddoors is running."); 4024 } else if (rval == EINTR) { 4025 cmn_err(CE_WARN, 4026 "md: door_upcall failed. " 4027 "Check if rpc.mdcommd is running."); 4028 } else { 4029 cmn_err(CE_WARN, 4030 "md: door_upcall failed. " 4031 "Returned %d", 4032 rval); 4033 } 4034 } 4035 if (++send_try_cnt >= md_send_retry_limit) 4036 break; 4037 4038 delay(md_hz); 4039 4040 /* 4041 * data_ptr and data_size are re-initialized here 4042 * because on return from the upcall, they contain 4043 * data duplicated from rbuf and rsize. This causes 4044 * subsequent upcalls to fail. 
4045 */ 4046 da.data_ptr = (char *)(kmsg); 4047 da.data_size = sizeof (md_mn_kmsg_t); 4048 da.desc_ptr = NULL; 4049 da.desc_num = 0; 4050 da.rbuf = (char *)result; 4051 da.rsize = sizeof (*result); 4052 } 4053 4054 4055 /* 4056 * If: 4057 * - the send succeeded (MDMNE_ACK) 4058 * - we had an MDMNE_RPC_FAIL and commd is now gone 4059 * (note: since the outer loop is commd-dependent, 4060 * checking MDMN_RPC_FAIL here is meaningless) 4061 * - we were told not to retry 4062 * - we exceeded the RPC failure send limit 4063 * punch out of the outer loop prior to the delay() 4064 */ 4065 if (result->kmmr_comm_state == MDMNE_ACK || 4066 (flags & MD_MSGF_KSEND_NORETRY) || 4067 (++send_try_cnt % md_send_retry_limit) == 0 || 4068 !md_mn_is_commd_present()) 4069 break; 4070 delay(md_hz); 4071 } 4072 4073 if (flags & MD_MSGF_BLK_SIGNAL) { 4074 sigreplace(&oldmask, (k_sigset_t *)NULL); 4075 } 4076 kmem_free(kmsg, sizeof (md_mn_kmsg_t)); 4077 4078 return (0); 4079 } 4080 4081 /* 4082 * Called to propagate the capability of a metadevice to all nodes in the set. 4083 * 4084 * On entry, lockp is set if the function has been called from within an ioctl. 4085 * 4086 * IOLOCK_RETURN_RELEASE, which drops the md_ioctl_lock is called in this 4087 * routine to enable other mdioctls to enter the kernel while this 4088 * thread of execution waits on the completion of mdmn_ksend_message. When 4089 * the message is completed the thread continues and md_ioctl_lock must be 4090 * reacquired. Even though md_ioctl_lock is interruptable, we choose to 4091 * ignore EINTR as we must not return without acquiring md_ioctl_lock. 
4092 */ 4093 4094 int 4095 mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp) 4096 { 4097 md_mn_msg_setcap_t msg; 4098 md_mn_kresult_t *kres; 4099 mdi_unit_t *ui = MDI_UNIT(mnum); 4100 int ret; 4101 k_sigset_t oldmask, newmask; 4102 4103 (void) strncpy((char *)&msg.msg_setcap_driver, 4104 md_ops[ui->ui_opsindex]->md_driver.md_drivername, MD_DRIVERNAMELEN); 4105 msg.msg_setcap_mnum = mnum; 4106 msg.msg_setcap_set = vc.vc_set; 4107 4108 if (lockp) 4109 IOLOCK_RETURN_RELEASE(0, lockp); 4110 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 4111 4112 /* 4113 * Mask signals for the mdmd_ksend_message call. This keeps the door 4114 * interface from failing if the user process receives a signal while 4115 * in mdmn_ksend_message. 4116 */ 4117 sigfillset(&newmask); 4118 sigreplace(&newmask, &oldmask); 4119 ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP, 4120 MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t), 4121 kres)); 4122 sigreplace(&oldmask, (k_sigset_t *)NULL); 4123 4124 if (!MDMN_KSEND_MSG_OK(ret, kres)) { 4125 mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP"); 4126 ret = EIO; 4127 } 4128 kmem_free(kres, sizeof (md_mn_kresult_t)); 4129 4130 if (lockp) { 4131 IOLOCK_RETURN_REACQUIRE(lockp); 4132 } 4133 return (ret); 4134 } 4135 4136 /* 4137 * Called to clear all of the transient capabilities for a metadevice when it is 4138 * not open on any node in the cluster 4139 * Called from close for mirror and sp. 4140 */ 4141 4142 void 4143 mdmn_clear_all_capabilities(minor_t mnum) 4144 { 4145 md_isopen_t clumsg; 4146 int ret; 4147 md_mn_kresult_t *kresult; 4148 volcap_t vc; 4149 k_sigset_t oldmask, newmask; 4150 4151 clumsg.dev = md_makedevice(md_major, mnum); 4152 clumsg.mde = mdnullerror; 4153 /* 4154 * The check open message doesn't have to be logged, nor should the 4155 * result be stored in the MCT. We want an up-to-date state. 
4156 */ 4157 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 4158 4159 /* 4160 * Mask signals for the mdmd_ksend_message call. This keeps the door 4161 * interface from failing if the user process receives a signal while 4162 * in mdmn_ksend_message. 4163 */ 4164 sigfillset(&newmask); 4165 sigreplace(&newmask, &oldmask); 4166 ret = mdmn_ksend_message(MD_MIN2SET(mnum), 4167 MD_MN_MSG_CLU_CHECK, 4168 MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0, 4169 (char *)&clumsg, sizeof (clumsg), kresult); 4170 sigreplace(&oldmask, (k_sigset_t *)NULL); 4171 4172 if ((ret == 0) && (kresult->kmmr_exitval == 0)) { 4173 /* 4174 * Not open on any node, clear all capabilities, eg ABR and 4175 * DMR 4176 */ 4177 vc.vc_set = 0; 4178 (void) mdmn_send_capability_message(mnum, vc, NULL); 4179 } 4180 kmem_free(kresult, sizeof (md_mn_kresult_t)); 4181 } 4182 4183 /* 4184 * mdmn_ksend_show_error: 4185 * --------------------- 4186 * Called to display the error contents of a failing mdmn_ksend_message() result 4187 * 4188 * Input: 4189 * rv - return value from mdmn_ksend_message() 4190 * kres - pointer to result structure filled in by mdmn_ksend_message 4191 * s - Informative message to identify failing condition (e.g. 4192 * "Ownership change") This string will be displayed with 4193 * cmn_err(CE_WARN, "%s *FAILED*",...) to alert the system 4194 * administrator 4195 */ 4196 void 4197 mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s) 4198 { 4199 if (rv == 0) { 4200 cmn_err(CE_WARN, "%s *FAILED*", s); 4201 cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, failing_node" 4202 " = %d", kres->kmmr_exitval, kres->kmmr_comm_state, 4203 kres->kmmr_failing_node); 4204 } else { 4205 cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv); 4206 } 4207 } 4208 4209 /* 4210 * Callback routine for resync thread. If requested to suspend we mark the 4211 * commd as not being present. 
4212 */ 4213 boolean_t 4214 callb_md_mrs_cpr(void *arg, int code) 4215 { 4216 callb_cpr_t *cp = (callb_cpr_t *)arg; 4217 int ret = 0; /* assume success */ 4218 clock_t delta; 4219 4220 mutex_enter(cp->cc_lockp); 4221 4222 switch (code) { 4223 case CB_CODE_CPR_CHKPT: 4224 /* 4225 * Mark the rpc.mdcommd as no longer present. We are trying to 4226 * suspend the system and so we should expect RPC failures to 4227 * occur. 4228 */ 4229 md_mn_clear_commd_present(); 4230 cp->cc_events |= CALLB_CPR_START; 4231 delta = CPR_KTHREAD_TIMEOUT_SEC * hz; 4232 while (!(cp->cc_events & CALLB_CPR_SAFE)) 4233 /* cv_timedwait() returns -1 if it times out. */ 4234 if ((ret = cv_reltimedwait(&cp->cc_callb_cv, 4235 cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1) 4236 break; 4237 break; 4238 4239 case CB_CODE_CPR_RESUME: 4240 cp->cc_events &= ~CALLB_CPR_START; 4241 cv_signal(&cp->cc_stop_cv); 4242 break; 4243 } 4244 mutex_exit(cp->cc_lockp); 4245 return (ret != -1); 4246 } 4247 4248 4249 void 4250 md_rem_hspname(set_t setno, mdkey_t n_key) 4251 { 4252 int s; 4253 int max_sides; 4254 4255 4256 /* All entries removed are in the same diskset */ 4257 if (md_get_setstatus(setno) & MD_SET_MNSET) 4258 max_sides = MD_MNMAXSIDES; 4259 else 4260 max_sides = MD_MAXSIDES; 4261 4262 for (s = 0; s < max_sides; s++) 4263 (void) md_remdevname(setno, s, n_key); 4264 } 4265 4266 4267 int 4268 md_rem_selfname(minor_t selfid) 4269 { 4270 int s; 4271 set_t setno = MD_MIN2SET(selfid); 4272 int max_sides; 4273 md_dev64_t dev; 4274 struct nm_next_hdr *nh; 4275 struct nm_name *n; 4276 mdkey_t key; 4277 4278 /* 4279 * Get the key since remove routine expects it 4280 */ 4281 dev = md_makedevice(md_major, selfid); 4282 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) { 4283 return (ENOENT); 4284 } 4285 4286 if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD, 4287 MD_KEYWILD, dev, 0L)) == NULL) { 4288 return (ENOENT); 4289 } 4290 4291 /* All entries removed are in the same diskset */ 4292 key = 
n->n_key; 4293 if (md_get_setstatus(setno) & MD_SET_MNSET) 4294 max_sides = MD_MNMAXSIDES; 4295 else 4296 max_sides = MD_MAXSIDES; 4297 4298 for (s = 0; s < max_sides; s++) 4299 (void) md_remdevname(setno, s, key); 4300 4301 return (0); 4302 } 4303 4304 void 4305 md_upd_set_unnext(set_t setno, unit_t un) 4306 { 4307 if (un < md_set[setno].s_un_next) { 4308 md_set[setno].s_un_next = un; 4309 } 4310 } 4311 4312 struct hot_spare_pool * 4313 find_hot_spare_pool(set_t setno, int hsp_id) 4314 { 4315 hot_spare_pool_t *hsp; 4316 4317 hsp = (hot_spare_pool_t *)md_set[setno].s_hsp; 4318 while (hsp != NULL) { 4319 if (hsp->hsp_self_id == hsp_id) 4320 return (hsp); 4321 hsp = hsp->hsp_next; 4322 } 4323 4324 return ((hot_spare_pool_t *)0); 4325 } 4326 4327 /* 4328 * md_create_taskq: 4329 * 4330 * Create a kernel taskq for the given set/unit combination. This is typically 4331 * used to complete a RR_CLEAN request when the callee is unable to obtain the 4332 * mutex / condvar access required to update the DRL safely. 4333 */ 4334 void * 4335 md_create_taskq(set_t setno, minor_t mnum) 4336 { 4337 char name[20]; 4338 ddi_taskq_t *tqp; 4339 4340 (void) snprintf(name, 20, "%d/d%d", setno, MD_MIN2UNIT(mnum)); 4341 4342 tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0); 4343 4344 return ((void *)tqp); 4345 } 4346