1 // 2 // CDDL HEADER START 3 // 4 // The contents of this file are subject to the terms of the 5 // Common Development and Distribution License (the License). 6 // You may not use this file except in compliance with the License. 7 // 8 // You can obtain a copy of the license at usr/src/CDDL.txt 9 // or http://www.opensolaris.org/os/licensing. 10 // See the License for the specific language governing permissions 11 // and limitations under the License. 12 // 13 // When distributing Covered Code, include this CDDL HEADER in each 14 // file and include the License file at usr/src/CDDL.txt. 15 // If applicable, add the following below this CDDL HEADER, with the 16 // fields enclosed by brackets [] replaced with your own identifying 17 // information: Portions Copyright [yyyy] [name of copyright owner] 18 // 19 // CDDL HEADER END 20 // 21 22 // 23 // Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 // Use is subject to license terms. 25 // 26 27 #pragma ident "@(#)mount_server_impl.cc 1.83 08/05/20 SMI" 28 29 #include <sys/errno.h> 30 31 #include <sys/types.h> 32 #include <sys/thread.h> 33 #include <sys/file.h> 34 #include <sys/mount.h> 35 #include <sys/pathname.h> 36 #include <sys/sysmacros.h> 37 38 #include <sys/os.h> 39 #include <sys/sol_conv.h> 40 #include <solobj/solobj_impl.h> 41 42 #include <pxfs/mount/mount_server_impl.h> 43 #include <pxfs/mount/mount_replica_impl.h> 44 #include <pxfs/mount/mount_debug.h> 45 #include <pxfs/server/fobj_impl.h> 46 47 // 48 // The number of seconds devlock waits for, before timing out. 49 // This is an undocumented tunable. 
50 // 51 int pxfs_devlock_timeout = 5; 52 53 // VP to IDL interface version mapping for Mount subsystem 54 extern mount_ver_map_t 55 mount_vp_to_idl[MOUNT_VP_MAX_MAJOR +1][MOUNT_VP_MAX_MINOR +1]; 56 57 //lint -e1512 58 // 59 // Warning(1512) destructor for base class is not virtual -- In a 60 // final pass through all the classes, we have found a class that is 61 // the base class of a derivation and has a destructor but the 62 // destructor is not virtual. It is conventional for inherited classes 63 // to have virtual destructors so that is it safe to 'delete' a 64 // pointer to a base class. 65 // 66 // The classes prov_common_iter and prov_common_setin the file 67 // prov_common.h have to be changed to have virtual destructors. 68 // 69 70 // 71 // fs_elem methods 72 // 73 74 fs_elem::fs_elem(pxfs_v1::filesystem_ptr fs_p, const pxfs_v1::fs_info &finfo, 75 const sol::mounta &md, const char *options, bool is_ha, const char *name, 76 const sol::nodeid_seq_t &nids) : 77 _DList::ListElem(this), 78 fs_v1_info(finfo), 79 ma(md), 80 dev_is_ha(is_ha), 81 dev_nids(nids), 82 fs_elem_ver(VERSION_1) 83 { 84 ASSERT(!CORBA::is_nil(fs_p)); 85 fs_v1_ptr = pxfs_v1::filesystem::_duplicate(fs_p); 86 87 // Make a copy, don't use String_var constructor. 88 mntoptions = options; 89 dev_name = name; 90 } 91 92 fs_elem::~fs_elem() 93 { 94 } 95 96 // 97 // devlock_elem methods 98 // 99 100 // 101 // Constructor for "device lock". 102 // 103 devlock_elem::devlock_elem(fs::mount_client_ptr c, sol::nodeid_t nodeid, 104 const char *name) : 105 _SList::ListElem(this) 106 { 107 owner = fs::mount_client::_duplicate(c); 108 spec = os::strdup(name); 109 nwaiters = 0; 110 ownerid = nodeid; 111 unlocked = false; 112 } 113 114 devlock_elem::~devlock_elem() 115 { 116 delete [] spec; 117 } 118 119 // 120 // mount_client_elem methods 121 // 122 123 // 124 // Create a list element to save the mount_client reference and nodeid. 125 // Primary constructor. 
126 // 127 mount_client_elem::mount_client_elem(mount_server_impl *server, 128 fs::mount_client_ptr client, sol::nodeid_t nid, 129 mount_replica_impl *replp) : 130 mc_replica_of<fs::mount_client_died>(replp), 131 _SList::ListElem(this) 132 { 133 serverp = server; 134 clientptr = fs::mount_client::_duplicate(client); 135 nodeid = nid; 136 shutdown = false; 137 MOUNT_DBPRINTF( 138 MOUNT_TRACE_SERVER, 139 MOUNT_GREEN, 140 ("mount_client_elem newP %p\n", this)); 141 } 142 143 // 144 // Create a list element to save the mount_client reference and nodeid. 145 // Secondary constructor. 146 // 147 mount_client_elem::mount_client_elem(mount_server_impl *server, 148 fs::mount_client_ptr client, sol::nodeid_t nid, bool is_shutdown, 149 fs::mount_client_died_ptr obj) : 150 mc_replica_of<fs::mount_client_died>(obj), 151 _SList::ListElem(this) 152 { 153 serverp = server; 154 clientptr = fs::mount_client::_duplicate(client); 155 nodeid = nid; 156 shutdown = is_shutdown; 157 MOUNT_DBPRINTF( 158 MOUNT_TRACE_SERVER, 159 MOUNT_GREEN, 160 ("mount_client_elem newS %p s %d\n", 161 this, shutdown)); 162 } 163 164 mount_client_elem::~mount_client_elem() 165 { 166 MOUNT_DBPRINTF( 167 MOUNT_TRACE_SERVER, 168 MOUNT_GREEN, 169 ("~mount_client_elem %p\n", this)); 170 serverp = NULL; // for lint 171 } 172 173 // 174 // This is called if the mount client crashes or is halted. 175 // 176 void 177 mount_client_elem::_unreferenced(unref_t arg) 178 { 179 if (!_last_unref(arg)) { 180 // _last_unref() should always be true since we don't use 0->1. 181 ASSERT(0); 182 return; 183 } 184 if (!CORBA::is_nil(clientptr)) { 185 // 186 // Note: we don't delete ourself here 187 // (see mount_server_impl::client_died()). 188 // 189 serverp->client_died(this); 190 } else { 191 // 192 // mount_server::remove_client() has already removed 193 // this from the list so just delete ourself. 194 // 195 delete this; 196 } 197 } 198 199 // 200 // mount_state methods 201 // 202 203 // 204 // Constructor for mount_state. 
205 // 206 mount_state::mount_state(const sol::mounta &md, fs::mount_client_ptr client_p, 207 mount_server_impl &srvr, mount_ver_t ver) : 208 ma(md), 209 error(0), 210 is_remount(false), 211 mount_ver(ver), 212 server(srvr) 213 { 214 ASSERT(!CORBA::is_nil(client_p)); 215 mountpoint_lock_c = fs::mount_client::_duplicate(client_p); 216 fs_v1_info.fsflag = 0; // for lint 217 } 218 219 mount_state::~mount_state() 220 { 221 } 222 223 // 224 // The client has crashed so there will be no retry of the mount(). 225 // Unlock the mount points we have already locked. 226 // 227 void 228 mount_state::orphaned(Environment &env) 229 { 230 (void) server.mount_orphaned(this, true, env); 231 } 232 233 // 234 // This is called on the secondary to complete the mount transaction. 235 // 236 void 237 mount_state::committed() 238 { 239 mountpoint_lock_c = fs::mount_client::_nil(); 240 } 241 242 // 243 // unmount_state methods 244 // 245 246 // 247 // Constructor for unmount_state for recording unmounts during remove_client(). 248 // Note that client can be nil. 249 // 250 unmount_state::unmount_state(pxfs_v1::filesystem_ptr fs_p, 251 int32_t umflags, 252 solobj::cred_ptr cred_p, 253 mount_server_impl &srvr, 254 fs::mount_client_ptr client_p) : 255 flags(umflags), 256 state(START), 257 error(0), 258 unmount_ver(VERSION_1), 259 server(srvr) 260 { 261 ASSERT(!CORBA::is_nil(fs_p)); 262 fs_v1_obj = pxfs_v1::filesystem::_duplicate(fs_p); 263 264 skip = fs::mount_client::_duplicate(client_p); 265 credobj = solobj::cred::_duplicate(cred_p); 266 } 267 268 unmount_state::~unmount_state() 269 { 270 } 271 272 // 273 // This is called on the (new) primary after a failover of the mount_server 274 // and a crash of the mount_client node. 275 // 276 void 277 unmount_state::orphaned(Environment &e) 278 { 279 server.unmount_orphaned(this, false, e); 280 } 281 282 // 283 // This is called on the secondary when add_commit() is called on the primary. 
284 // 285 void 286 unmount_state::committed() 287 { 288 ASSERT(CORBA::is_nil(fs_v1_obj)); 289 } 290 291 // 292 // dc_callback_impl methods 293 // 294 295 // 296 // Primary constructor. 297 // 298 dc_callback_impl::dc_callback_impl(mount_server_impl &srvr, 299 mount_replica_impl *serverp) : 300 mc_replica_of<fs::dc_callback>(serverp), 301 server(srvr) 302 { 303 // LINTED: Call to virtual function within a contructor or destructor. 304 _handler()->set_cookie((void *)this); 305 } 306 307 // 308 // Secondary constructor. 309 // 310 dc_callback_impl::dc_callback_impl(mount_server_impl &srvr, 311 fs::dc_callback_ptr obj) : 312 mc_replica_of<fs::dc_callback>(obj), 313 server(srvr) 314 { 315 // LINTED: Call to virtual function within a contructor or destructor. 316 _handler()->set_cookie((void *)this); 317 } 318 319 dc_callback_impl::~dc_callback_impl() 320 { 321 } 322 323 void 324 dc_callback_impl::_unreferenced(unref_t arg) 325 { 326 if (!_last_unref(arg)) { 327 // _last_unref() should always be true since we don't use 0->1. 328 ASSERT(0); 329 return; 330 } 331 delete this; 332 } 333 334 // dc_callback_impl(fs::dc_callback::notify_change) 335 void 336 dc_callback_impl::notify_change(sol::dev_t gdev, 337 const sol::nodeid_seq_t &nodes, Environment &_environment) 338 { 339 server.notify_change(gdev, nodes, _environment); 340 } 341 342 // dc_callback_impl(fs::dc_callback::still_active, _environment) 343 bool 344 dc_callback_impl::still_active(sol::dev_t gdev, Environment &) 345 { 346 return (server.still_active(gdev)); 347 } 348 349 // 350 // mount_server_impl methods 351 // 352 353 // 354 // Return a CORBA pointer (no CORBA::release() required) 355 // to the checkpoint interface. 356 // 357 repl_pxfs::mount_replica_ptr 358 mount_server_impl::get_checkpoint() 359 { 360 return ((mount_replica_impl*)(get_provider()))-> 361 get_checkpoint_mount_replica(); 362 } 363 364 // 365 // Primary constructor. 
366 // 367 mount_server_impl::mount_server_impl(mount_replica_impl *serverp) : 368 mc_replica_of<fs::mount_server>(serverp) 369 { 370 repl_serverp = serverp; 371 primary = true; 372 frozen = false; 373 currentmnt = (char *)NULL; 374 } 375 376 // 377 // Secondary constructor. 378 // 379 mount_server_impl::mount_server_impl(mount_replica_impl *serverp, 380 fs::mount_server_ptr obj) : 381 mc_replica_of<fs::mount_server>(obj) 382 { 383 repl_serverp = serverp; 384 primary = false; 385 frozen = false; 386 currentmnt = (char *)NULL; 387 } 388 389 // 390 // Destructor. 391 // 392 mount_server_impl::~mount_server_impl() 393 { 394 ASSERT(client_list.empty()); 395 ASSERT(fs_list.empty()); 396 } 397 398 void 399 mount_server_impl::_unreferenced(unref_t arg) 400 { 401 if (!_last_unref(arg)) { 402 // _last_unref() should always be true since we don't use 0->1. 403 ASSERT(0); 404 return; 405 } 406 // 407 // XXX Should wait for all _unreferenced() from 408 // mount_client_elem and dc_callback_impl but since this service 409 // is never shut down, _unreferenced() should never get called. 410 // 411 MOUNT_DBPRINTF( 412 MOUNT_TRACE_SERVER, 413 MOUNT_GREEN, 414 ("server:_unreferenced\n")); 415 delete this; 416 } 417 418 // 419 // This is called to get a new reference. Doing get_objref() here would 420 // get the highest reference version that was compiled. We want the 421 // highest reference which is currently committed so we use this indirect 422 // way. 423 // 424 void 425 mount_server_impl::_generic_method(CORBA::octet_seq_t &, 426 CORBA::object_seq_t &objs, Environment &e) 427 { 428 objs[0] = repl_serverp->get_root_obj(e); 429 } 430 431 432 // 433 // Called from mount_replica_impl when switching to primary. 
434 // 435 void 436 mount_server_impl::convert_to_primary() 437 { 438 MOUNT_DBPRINTF( 439 MOUNT_TRACE_SERVER, 440 MOUNT_GREEN, 441 ("server: primary\n")); 442 443 primary = true; 444 445 #ifdef DEBUG 446 // 447 // There should be no threads waiting for a device lock 448 // (see comment for devunlock()). 449 // 450 devlock_elem *dep; 451 for (devlock_list.atfirst(); 452 (dep = devlock_list.get_current()) != NULL; 453 devlock_list.advance()) { 454 ASSERT(dep->nwaiters == 0); 455 } 456 #endif 457 } 458 459 // 460 // Called from mount_replica_impl when switching to secondary. 461 // 462 void 463 mount_server_impl::convert_to_secondary() 464 { 465 MOUNT_DBPRINTF( 466 MOUNT_TRACE_SERVER, 467 MOUNT_GREEN, 468 ("server: secondary\n")); 469 470 primary = false; 471 472 #ifdef DEBUG 473 // 474 // There should be no threads waiting for a device lock 475 // (see comment for devunlock()). 476 // 477 devlock_elem *dep; 478 for (devlock_list.atfirst(); 479 (dep = devlock_list.get_current()) != NULL; 480 devlock_list.advance()) { 481 ASSERT(dep->nwaiters == 0); 482 } 483 #endif 484 } 485 486 // 487 // Called from mount_replica_impl when switching to spare. 488 // 489 void 490 mount_server_impl::convert_to_spare() 491 { 492 MOUNT_DBPRINTF( 493 MOUNT_TRACE_SERVER, 494 MOUNT_GREEN, 495 ("server: spare\n")); 496 497 if (!CORBA::is_nil(dc_callback_obj)) { 498 dc_callback_impl *dc_callbackp = (dc_callback_impl *) 499 dc_callback_obj->_handler()->get_cookie(); 500 dc_callback_obj = fs::dc_callback::_nil(); 501 delete dc_callbackp; 502 } 503 504 client_list.dispose(); 505 fs_list.dispose(); 506 devlock_list.dispose(); 507 delete this; 508 } 509 510 // 511 // This is called if a mount_client dies 512 // (called from mount_client_elem::_unreferenced()). 
513 // 514 void 515 mount_server_impl::client_died(mount_client_elem *cep) 516 { 517 devlock_elem *dep; 518 519 MOUNT_DBPRINTF( 520 MOUNT_TRACE_SERVER, 521 MOUNT_GREEN, 522 ("server:client_died: %p nid %d\n", 523 (void *)cep, cep->nodeid)); 524 525 if (!primary) { 526 // 527 // _unreferenced() and checkpoints are synchronized on 528 // the secondary so we don't need to lock the list. 529 // 530 // Remove any locks the client held. 531 // 532 devlock_list.atfirst(); 533 while ((dep = devlock_list.get_current()) != NULL) { 534 devlock_list.advance(); 535 if (cep->clientptr->_equiv(dep->owner)) { 536 MOUNT_DBPRINTF( 537 MOUNT_TRACE_SERVER, 538 MOUNT_AMBER, 539 ("server:client_died " 540 "unlock %p waiters %d\n", 541 dep, dep->nwaiters)); 542 (void) devlock_list.erase(dep); 543 delete dep; 544 } 545 } 546 (void) client_list.erase(cep); 547 delete cep; 548 return; 549 } 550 551 FAULTPT_PXFS(FAULTNUM_PXFS_CLIENT_DIED, FaultFunctions::generic); 552 553 // 554 // Remove any locks the client held. 555 // 556 devlock_list_lock.wrlock(); 557 devlock_list.atfirst(); 558 while ((dep = devlock_list.get_current()) != NULL) { 559 devlock_list.advance(); 560 if (cep->clientptr->_equiv(dep->owner)) { 561 // 562 // No need to checkpoint this since the 563 // secondary will get _unreferenced() too. 564 // 565 MOUNT_DBPRINTF( 566 MOUNT_TRACE_SERVER, 567 MOUNT_AMBER, 568 ("server:client_died unlock %p waiters %d\n", 569 dep, dep->nwaiters)); 570 (void) devlock_list.erase(dep); 571 572 dep->waiter_lock.lock(); 573 if (dep->nwaiters != 0) { 574 // This wakes up all waiting threads. 575 dep->unlocked = true; 576 dep->waiter_cv.broadcast(); 577 dep->waiter_lock.unlock(); 578 579 // The last waiter will do the delete. 580 } else { 581 dep->waiter_lock.unlock(); 582 delete dep; 583 } 584 } 585 } 586 devlock_list_lock.unlock(); 587 588 // 589 // Note: there is no checkpoint since the secondary gets 590 // _unreferenced() when we delete the object. 
591 // 592 client_list_lock.wrlock(); 593 (void) client_list.erase(cep); 594 delete cep; 595 client_list_lock.unlock(); 596 } 597 598 // 599 // Upgrade mount_client references during Rolling Upgrade commit. 600 // 601 void 602 mount_server_impl::upgrade_client_reference(Environment &_environment) 603 { 604 CORBA::octet_seq_t data; 605 CORBA::object_seq_t objs(1, 1); 606 Environment e; 607 mount_client_elem *cep; 608 devlock_elem *dep; 609 610 // 611 // Create a primary context so the provider can send 612 // checkpoints while the service is frozen. 613 // XXX change the primary_ctx::invoke_env type. 614 // 615 primary_ctx ctx(NULL, primary_ctx::ADD_SECONDARY_CKPT, 616 _environment); 617 618 // Update the client reference in client list. 619 client_list_lock.wrlock(); 620 621 client_list.atfirst(); 622 while ((cep = client_list.get_current()) != NULL) { 623 client_list.advance(); 624 cep->clientptr->_generic_method(data, objs, e); 625 if (e.exception()) { 626 MOUNT_DBPRINTF( 627 MOUNT_TRACE_SERVER, 628 MOUNT_RED, 629 ("server:upgrade_client_reference" 630 "exception when upgrading client reference" 631 "%p in client list on %d errno %d\n", 632 (void *)cep, cep->nodeid, 633 e.exception()->exception_enum())); 634 e.clear(); 635 } else { 636 cep->clientptr = fs::mount_client::_narrow(objs[0]); 637 638 // 639 // Send a checkpoint to the secondaries telling 640 // them to use the new version mount_client object 641 // reference. 642 // 643 get_checkpoint()-> 644 ckpt_upgrade_client_list(cep->clientptr, 645 cep->nodeid, _environment); 646 ASSERT(_environment.exception() == NULL); 647 648 // 649 // Upgrade the mount_client to use the new version 650 // of the mount_server. 
651 // 652 cep->clientptr->upgrade_mount_client(get_objref(), e); 653 654 if (e.exception()) { 655 MOUNT_DBPRINTF( 656 MOUNT_TRACE_SERVER, 657 MOUNT_RED, 658 ("server:upgrade_client_reference" 659 "exception when upgrading server reference" 660 "for %p on %d error %d\n", 661 (void *)cep, cep->nodeid, 662 e.exception()->exception_enum())); 663 e.clear(); 664 } 665 } 666 } 667 668 client_list_lock.unlock(); 669 670 // Update the client reference in device lock list. 671 devlock_list_lock.wrlock(); 672 673 devlock_list.atfirst(); 674 while ((dep = devlock_list.get_current()) != NULL) { 675 devlock_list.advance(); 676 dep->owner->_generic_method(data, objs, e); 677 if (e.exception()) { 678 MOUNT_DBPRINTF( 679 MOUNT_TRACE_SERVER, 680 MOUNT_RED, 681 ("server:upgrade_client_reference" 682 "exception when upgrading client reference" 683 "%p in devlock list on %d errno %d\n", 684 (void *)dep, dep->ownerid, 685 e.exception()->exception_enum())); 686 e.clear(); 687 } else { 688 dep->owner = fs::mount_client::_narrow(objs[0]); 689 690 // 691 // Send a checkpoint to the secondaries telling 692 // them to use the new version mount_client object 693 // reference. 694 // 695 get_checkpoint()-> 696 ckpt_upgrade_devlock_list(dep->spec, dep->owner, 697 _environment); 698 ASSERT(_environment.exception() == NULL); 699 700 // 701 // Upgrade the mount_client to use the new version 702 // of the mount_server. The mount_client may have 703 // already been upgraded. This invocation is 704 // idempotent. So that is not a problem. 
705 // 706 dep->owner->upgrade_mount_client(get_objref(), e); 707 708 if (e.exception()) { 709 MOUNT_DBPRINTF( 710 MOUNT_TRACE_SERVER, 711 MOUNT_RED, 712 ("server:upgrade_client_reference" 713 "exception when upgrading server reference" 714 "for %p on %d error %d\n", 715 (void *)dep, dep->ownerid, 716 e.exception()->exception_enum())); 717 e.clear(); 718 } 719 } 720 } 721 722 devlock_list_lock.unlock(); 723 724 _environment.trans_ctxp = NULL; 725 } 726 727 // 728 // Add a client to the list of mount_server clients, replaying the extant set 729 // of global mounts to bring it into consistency with the rest of the cluster. 730 // This operation should be idempotent since it can be retried on a new primary. 731 // 732 void 733 mount_server_impl::add_client(fs::mount_client_ptr client_p, 734 sol::nodeid_t nodeid, 735 fs::mount_client_died_out clobj, Environment &_environment) 736 { 737 mount_client_elem *ncep = new mount_client_elem(this, 738 client_p, nodeid, repl_serverp); 739 mount_client_elem *cep; 740 fs::mount_client_died_ptr clobjp; 741 fs_elem *fep; 742 fs_elem *ofep; 743 Environment e; 744 sol::error_t err; 745 bool need_fs_status; 746 bool attempt_unmount; 747 uint32_t i; 748 solobj::cred_var credobj = solobj_impl::conv(kcred); 749 750 // Check to see if we have saved state. 751 primary_ctx *ctxp = primary_ctx::extract_from(_environment); 752 unmount_state *statep; 753 if (ctxp != NULL && 754 (statep = (unmount_state *)ctxp->get_saved_state()) != NULL) { 755 // 756 // Since we have saved state, we know the original 757 // primary sent the ckpt_unmount_start() checkpoint. 758 // We finish the unmount process from where we left off. 759 // 760 unmount_orphaned(statep, false, _environment); 761 } 762 763 client_list_lock.wrlock(); 764 765 // 766 // Check to see if the mount client we are adding is already in 767 // the list. If it is there, it means this is a retry after 768 // a mount server failover. 
Since we got the checkpoint, we 769 // know that the mounts have been replayed on the client. 770 // Note that we may have an old entry for the same node 771 // in the list until mount_client_elem::_unreferenced() 772 // is processed (which is why we search the list by object 773 // reference rather than nodeid). 774 // 775 if ((cep = find_client(client_p)) != NULL) { 776 MOUNT_DBPRINTF( 777 MOUNT_TRACE_SERVER, 778 MOUNT_GREEN, 779 ("server:add_client found %p\n", cep)); 780 clobj = cep->get_objref(); 781 goto done; 782 } 783 784 fs_list_lock.wrlock(); 785 786 // 787 // If the joining node has a direct connection to the device for a 788 // a filesystem which is currently globally mounted, but that 789 // filesystem is not currently mounted locally (NOT_AVAILABLE), 790 // then we attempt to unmount that filesystem (anticipating that 791 // it will be mounted locally by the joining node). 792 // 793 // An HA filesystem is considered AVAILABLE if there is a 794 // node with a primary or secondary filesystem replica. 795 // 796 // A non-HA filesytem is considered AVAILABLE if the node with 797 // the connection to the device is already running in the cluster. 798 // So if the joining node has a direct connection we attempt an 799 // unmount. 800 // 801 for (fs_list.atlast(); (fep = fs_list.get_current()) != NULL; ) { 802 // 803 // Move the 'current' pointer in the list away from 'fep', 804 // so that the fs_list.erase() in 'unmount_common' does not 805 // move the pointer. 806 // 807 fs_list.retreat(); 808 809 // Check for joining node being connected to the device. 
810 need_fs_status = false; 811 attempt_unmount = false; 812 if (fep->dev_is_ha) { 813 for (i = 0; i < fep->dev_nids.length(); i++) { 814 if (nodeid == fep->dev_nids[i]) { 815 need_fs_status = true; 816 break; 817 } 818 } 819 } else { 820 if (nodeid == fep->dev_nids[0]) 821 attempt_unmount = true; 822 } 823 824 if (attempt_unmount || (need_fs_status && 825 (get_fs_status(fep) == NOT_AVAILABLE))) { 826 MOUNT_DBPRINTF( 827 MOUNT_TRACE_SERVER, 828 MOUNT_AMBER, 829 ("server:add_client unmount %s\n", 830 (const char *)fep->ma.dir)); 831 832 // 833 // We try to unmount as many file systems as possible 834 // but don't return an error at the end so the client 835 // isn't removed from the global name space. 836 // 837 ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1); 838 // 839 // At this point we will not do a forced 840 // unmount. So the flags are empty. 841 // 842 get_checkpoint()->ckpt_unmount_start_v1( 843 fep->fs_v1_ptr, 0, credobj, 844 fs::mount_client::_nil(), false, 845 _environment); 846 ASSERT(_environment.exception() == NULL); 847 err = unmount_common_1(fs::mount_client::_nil(), 848 fep, 0, credobj, unmount_state::START, 0, 849 NULL, _environment); 850 if (err != 0) { 851 MOUNT_DBPRINTF( 852 MOUNT_TRACE_SERVER, 853 MOUNT_RED, 854 ("server:add_client unmount %s err %d\n", 855 (const char *)fep->ma.dir, err)); 856 } 857 } 858 } 859 860 // 861 // Replay the extant set of mounts on the new client. 862 // If any replayed mount fails, return an exception. 863 // Note that we count on the list being properly ordered, so that 864 // mount dependencies are respected. 865 // 866 for (fs_list.atfirst(); 867 (fep = fs_list.get_current()) != NULL; 868 fs_list.advance()) { 869 // 870 // Compute whether an HA replica needs to be started. 
871 // 872 bool is_ha_repl = false; 873 if (fep->dev_is_ha) { 874 for (i = 0; i < fep->dev_nids.length(); i++) { 875 if (nodeid == fep->dev_nids[i]) { 876 is_ha_repl = true; 877 } 878 } 879 } 880 881 MOUNT_DBPRINTF( 882 MOUNT_TRACE_SERVER, 883 MOUNT_GREEN, 884 ("server:add_client add_notify %s ha %d\n", 885 (const char *)fep->ma.dir, is_ha_repl)); 886 887 ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1); 888 client_p->add_notify_v1(fep->ma, fep->mntoptions, 889 is_ha_repl, fep->dev_name, 890 fep->fs_v1_ptr, fep->fs_v1_info, e); 891 if (e.exception() == NULL) { 892 continue; 893 } 894 895 // 896 // We have to undo the work done so far. 897 // The list of filesystems is traversed in reverse order. 898 // 899 sol::error_t error = pxfslib::get_err(e); 900 e.clear(); 901 MOUNT_DBPRINTF( 902 MOUNT_TRACE_SERVER, 903 MOUNT_RED, 904 ("server:add_client add_notify %s error %d\n", 905 (const char *)fep->ma.dir, error)); 906 ofep = fep; 907 while (fs_list.retreat(), 908 (fep = fs_list.get_current()) != NULL) { 909 // 910 // Recompute whether an HA replica was started. 911 // 912 is_ha_repl = false; 913 if (fep->dev_is_ha) { 914 for (i = 0; i < fep->dev_nids.length(); i++) { 915 if (nodeid == fep->dev_nids[i]) { 916 is_ha_repl = true; 917 } 918 } 919 } 920 921 MOUNT_DBPRINTF( 922 MOUNT_TRACE_SERVER, 923 MOUNT_GREEN, 924 ("server:add_client remove_client %s ha %d\n", 925 (const char *)fep->ma.dir, is_ha_repl)); 926 ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1); 927 client_p->remove_client_v1(fep->ma.dir, 928 fep->ma.spec, is_ha_repl, fep->dev_name, 929 fep->fs_v1_ptr, credobj, e); 930 if (e.exception()) { 931 // 932 // We added it but can't remove it. The 933 // exception should be node died 934 // or EBUSY for remove_client problems. The 935 // latter will be resolved with the 936 // implementation of forced unmount. 
937 // 938 if (CORBA::COMM_FAILURE::_exnarrow( 939 e.exception())) { 940 MOUNT_DBPRINTF( 941 MOUNT_TRACE_SERVER, 942 MOUNT_RED, 943 ("server:add_client " 944 "exception when removing " 945 "%s on %d COMM\n", 946 (const char *)fep->ma.dir, 947 ncep->nodeid)); 948 } else { 949 err = pxfslib::get_err(e); 950 MOUNT_DBPRINTF( 951 MOUNT_TRACE_SERVER, 952 MOUNT_RED, 953 ("server:add_client " 954 "exception when removing " 955 "%s on %d errno %d\n", 956 (const char *)fep->ma.dir, 957 ncep->nodeid, err)); 958 } 959 e.clear(); 960 } 961 } 962 963 // 964 // Propagate the exception back to our caller. 965 // 966 fs_list_lock.unlock(); 967 _environment.exception(new mount_err(error, ofep->ma.dir)); 968 clobj = fs::mount_client_died::_nil(); 969 goto done; 970 } 971 972 fs_list_lock.unlock(); 973 974 // 975 // Checkpoint this operation so that the secondary can 976 // create a mount_client_elem too (see ckpt_add_client() below). 977 // Everything else is OK to repeat. 978 // Note: we insert onto the head of the list so the most recent 979 // entry is the valid entry (a stale entry could exist until 980 // mount_client_elem::_unreferenced() is processed). 981 // 982 MOUNT_DBPRINTF( 983 MOUNT_TRACE_SERVER, 984 MOUNT_GREEN, 985 ("server:add_client %p nid %d\n", 986 ncep, ncep->nodeid)); 987 client_list.prepend(ncep); 988 cep = ncep; 989 ncep = NULL; 990 clobjp = cep->get_objref(); 991 get_checkpoint()->ckpt_add_client(clobjp, client_p, nodeid, false, 992 _environment); 993 ASSERT(_environment.exception() == NULL); 994 995 // 996 // Return a reference to the client so we can detect if it crashes. 997 // 998 clobj = clobjp; 999 1000 done: 1001 client_list_lock.unlock(); 1002 1003 if (ncep != NULL) { 1004 delete ncep; 1005 } 1006 } 1007 1008 // 1009 // Add a client to the list of mount_server clients. 1010 // Helper function for checkpointing state on a secondary. 
1011 // 1012 void 1013 mount_server_impl::ckpt_add_client(fs::mount_client_died_ptr clobj, 1014 fs::mount_client_ptr client_p, sol::nodeid_t nodeid, bool shutdown) 1015 { 1016 // 1017 // Check to see if the mount client we are adding is already in 1018 // the list. 1019 // 1020 if (find_client(client_p) != NULL) { 1021 return; 1022 } 1023 1024 mount_client_elem *cep = new mount_client_elem(this, client_p, 1025 nodeid, shutdown, clobj); 1026 client_list.prepend(cep); 1027 MOUNT_DBPRINTF( 1028 MOUNT_TRACE_SERVER, 1029 MOUNT_GREEN, 1030 ("server:ckpt_add_client %p nid %d shut %d\n", 1031 cep, cep->nodeid, cep->shutdown)); 1032 } 1033 1034 // 1035 // This method is never called. 1036 // 1037 void 1038 mount_server_impl::remove_client(fs::mount_client_ptr, 1039 solobj::cred_ptr, Environment &) 1040 { 1041 } 1042 1043 // 1044 // Remove a client from the list of mount_server clients. 1045 // Helper function for checkpointing state on a secondary. 1046 // 1047 void 1048 mount_server_impl::ckpt_remove_client(fs::mount_client_ptr client_p) 1049 { 1050 mount_client_elem *cep; 1051 1052 if ((cep = find_client(client_p)) != NULL) { 1053 // 1054 // We found the guy we're looking for. 1055 // 1056 (void) client_list.erase(cep); 1057 cep->clientptr = fs::mount_client::_nil(); 1058 } 1059 } 1060 1061 // 1062 // Create a proxy file system, link it into the global name space, 1063 // and unlock the mount point on all other nodes. 1064 // This operation should be idempotent since it can be retried on a new primary. 
1065 // 1066 void 1067 mount_server_impl::mount(const sol::mounta &ma, sol::uintptr_t mvp, 1068 solobj::cred_ptr credobj, fs::mount_client_ptr client_p, bool dev_is_ha, 1069 const char *dev_name, const sol::nodeid_seq_t &dev_nids, 1070 fs::filesystem_out fs_obj, fs::fs_info &fsinfo, 1071 CORBA::String_out mntoptions, Environment &_environment) 1072 { 1073 CL_PANIC(0); 1074 } 1075 1076 // 1077 // Create a proxy file system, link it into the global name space, 1078 // and unlock the mount point on all other nodes. 1079 // This operation should be idempotent since it can be retried on a new primary. 1080 // 1081 void 1082 mount_server_impl::mount_v1(const sol::mounta &ma, sol::uintptr_t mvp, 1083 solobj::cred_ptr credobj, fs::mount_client_ptr client_p, bool dev_is_ha, 1084 const char *dev_name, const sol::nodeid_seq_t &dev_nids, 1085 pxfs_v1::filesystem_out fs_obj, pxfs_v1::fs_info &fsinfo, 1086 CORBA::String_out mntoptions, Environment &_environment) 1087 { 1088 ASSERT(ma.flags & MS_SYSSPACE); 1089 1090 // Check to see if we have saved state. 1091 primary_ctx *ctxp = primary_ctx::extract_from(_environment); 1092 mount_state *statep; 1093 if (ctxp != NULL && 1094 (statep = (mount_state *)ctxp->get_saved_state()) != NULL) { 1095 // 1096 // This is a retry on a new primary after a failover. 1097 // If the previous mount() was committed, we are done. 1098 // 1099 ASSERT(statep->mount_ver == mount_state::VERSION_1); 1100 if (mount_orphaned(statep, false, _environment)) { 1101 if (_environment.exception() == NULL) { 1102 // Return values from the saved state. 
1103 ASSERT(!CORBA::is_nil(statep->fs_v1_ptr)); 1104 fs_obj = pxfs_v1::filesystem::_duplicate( 1105 statep->fs_v1_ptr); 1106 fsinfo = statep->fs_v1_info; 1107 mntoptions = os::strdup(statep->mntoptions); 1108 } else { 1109 fs_obj = pxfs_v1::filesystem::_nil(); 1110 mntoptions = (char *)NULL; 1111 } 1112 return; 1113 } 1114 1115 // 1116 // Since we have saved state, we know the original 1117 // primary sent the start checkpoint 1118 // so we don't need to do it again here. 1119 // 1120 ASSERT(statep->mountpoint_lock_c->_equiv(client_p)); 1121 1122 client_list_lock.wrlock(); 1123 } else { 1124 // 1125 // This is the start of a new mount, not a retry. 1126 // Checkpoint the start of locking the mount points so 1127 // we can clean up if both the client and primary fail. 1128 // 1129 client_list_lock.wrlock(); 1130 get_checkpoint()->ckpt_mount_start_v1(ma, client_p, 1131 _environment); 1132 if (_environment.exception()) { 1133 client_list_lock.unlock(); 1134 fs_obj = pxfs_v1::filesystem::_nil(); 1135 mntoptions = (char *)NULL; 1136 MOUNT_DBPRINTF( 1137 MOUNT_TRACE_SERVER, 1138 MOUNT_RED, 1139 ("server:mount ckpt failed\n")); 1140 _environment.clear(); 1141 pxfslib::throw_exception(_environment, EIO); 1142 return; 1143 } 1144 } 1145 1146 MOUNT_DBPRINTF( 1147 MOUNT_TRACE_SERVER, 1148 MOUNT_GREEN, 1149 ("server:mount %s %s c %p\n", 1150 (const char *)ma.spec, (const char *)ma.dir, (void *)client_p)); 1151 1152 // 1153 // Check to see if the special device is already mounted. 1154 // 1155 fs_elem *fep; 1156 sol::error_t error; 1157 mount_client_elem *cep; 1158 Environment e; 1159 1160 fs_list_lock.wrlock(); 1161 if ((const char *)ma.spec != NULL) { 1162 // 1163 // Remove the device lock entry. 1164 // Note that a failed mount unlocks the device too. 
1165 // 1166 devlock_list_lock.wrlock(); 1167 devlock_elem *dep = find_devlock(ma.spec); 1168 if (dep != NULL) { 1169 MOUNT_DBPRINTF( 1170 MOUNT_TRACE_SERVER, 1171 MOUNT_AMBER, 1172 ("server:mount unlock %p waiters %d\n", 1173 dep, dep->nwaiters)); 1174 (void) devlock_list.erase(dep); 1175 get_checkpoint()->ckpt_devunlock(ma.spec, _environment); 1176 _environment.clear(); 1177 1178 dep->waiter_lock.lock(); 1179 if (dep->nwaiters != 0) { 1180 // This wakes up all waiting threads. 1181 dep->unlocked = true; 1182 dep->waiter_cv.broadcast(); 1183 dep->waiter_lock.unlock(); 1184 1185 // The last waiter will do the delete. 1186 } else { 1187 dep->waiter_lock.unlock(); 1188 delete dep; 1189 } 1190 } 1191 devlock_list_lock.unlock(); 1192 1193 if ((fep = find_fs(ma.spec)) != NULL) { 1194 MOUNT_DBPRINTF( 1195 MOUNT_TRACE_SERVER, 1196 MOUNT_RED, 1197 ("server:mount found %s\n", 1198 (const char *)ma.spec)); 1199 error = EBUSY; 1200 goto err; 1201 } 1202 } 1203 1204 // 1205 // Lock the mount point on all client nodes except the 1206 // requesting node (since it already has the mount point locked). 1207 // Note that if two nodes try to mount to the same mount point: 1208 // Nodes A and B locally lock the mount point (vn_vfswlock()). 1209 // Node A gets here, tries to lock node B's mount point, gets EBUSY, 1210 // releases client_list_lock. 1211 // Node B gets here, tries to lock node A's mount point, gets EBUSY. 1212 // Both node's locally unlock their mount point and return EBUSY 1213 // from the mount system call. 1214 // 1215 error = lock_mntpnt(client_p, ma.dir, ma.flags, _environment); 1216 if (error != 0) { 1217 MOUNT_DBPRINTF( 1218 MOUNT_TRACE_SERVER, 1219 MOUNT_RED, 1220 ("server:mount can't lock mntpnt %s error %d\n", 1221 (const char *)ma.dir, error)); 1222 1223 fs_list_lock.unlock(); 1224 client_list_lock.unlock(); 1225 1226 // 1227 // Propagate the exception back to our caller. 
1228 // 1229 pxfslib::throw_exception(_environment, error); 1230 fs_obj = pxfs_v1::filesystem::_nil(); 1231 mntoptions = (char *)NULL; 1232 return; 1233 } 1234 1235 FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_S_B, FaultFunctions::generic); 1236 1237 // 1238 // Instantiate the file system on the nodes which have direct 1239 // connections to the block device. 1240 // 1241 if (dev_is_ha) { 1242 bool started = false; 1243 for (uint32_t i = 0; i < dev_nids.length(); i++) { 1244 // Find the mount client pointer for device node i. 1245 cep = find_client(dev_nids[i]); 1246 if (cep == NULL) { 1247 // 1248 // The device node isn't up 1249 // at the moment. 1250 // 1251 continue; 1252 } 1253 cep->clientptr->instantiate_ha_v1(ma, 1254 cep->clientptr->_equiv(client_p) ? mvp : NULL, 1255 credobj, dev_name, e); 1256 if (e.exception() == NULL) { 1257 started = true; 1258 continue; 1259 } 1260 if (CORBA::COMM_FAILURE::_exnarrow(e.exception())) { 1261 // 1262 // The node crashed after we locked the 1263 // mount point OK. Just skip it and 1264 // try to start replicas on other nodes. 1265 // 1266 e.clear(); 1267 continue; 1268 } 1269 // XXX What to do? 1270 MOUNT_DBPRINTF( 1271 MOUNT_TRACE_SERVER, 1272 MOUNT_RED, 1273 ("server:mount instantiate_ha " 1274 "returned exception, can't start HA service %s\n", 1275 (const char *)ma.spec)); 1276 e.clear(); 1277 } 1278 if (!started) { 1279 MOUNT_DBPRINTF( 1280 MOUNT_TRACE_SERVER, 1281 MOUNT_GREEN, 1282 ("server:mount can't start service %s\n", 1283 (const char *)ma.spec)); 1284 error = ENXIO; 1285 goto err; 1286 } 1287 // 1288 // Get the root HA file system object. 
1289 // 1290 replica::service_admin_var sa = 1291 pxfslib::get_service_admin_ref("mount_server_impl::mount", 1292 (const char *)ma.spec, e); 1293 if (e.exception()) { 1294 error = EIO; 1295 e.clear(); 1296 MOUNT_DBPRINTF( 1297 MOUNT_TRACE_SERVER, 1298 MOUNT_RED, 1299 ("server:mount get_service_admin_ref(%s) except\n", 1300 (const char *)ma.spec)); 1301 // 1302 // XXX Need to shut down this service but 1303 // can't get the service_admin object to do it. 1304 // 1305 goto err; 1306 } 1307 CORBA::Object_var obj = sa->get_root_obj(e); 1308 if (e.exception()) { 1309 error = pxfslib::get_err(e); 1310 e.clear(); 1311 MOUNT_DBPRINTF( 1312 MOUNT_TRACE_SERVER, 1313 MOUNT_RED, 1314 ("server:mount get_root_obj(%s) error %d\n", 1315 (const char *)ma.spec, error)); 1316 1317 // Need to shut down this service. 1318 sa->shutdown_service(false, e); 1319 e.clear(); 1320 goto err; 1321 } 1322 pxfs_v1::filesystem_ptr fsobj = 1323 pxfs_v1::filesystem::_narrow(obj); 1324 ASSERT(!CORBA::is_nil(fsobj)); 1325 1326 // 1327 // Get the file system info. 1328 // XXX Note that the mount_server has a temporary dependency 1329 // on the file system service for this IDL invocation. 1330 // 1331 fsobj->get_mntinfo(fsinfo, mntoptions, e); 1332 if (e.exception()) { 1333 error = pxfslib::get_err(e); 1334 e.clear(); 1335 CORBA::release(fsobj); 1336 MOUNT_DBPRINTF( 1337 MOUNT_TRACE_SERVER, 1338 MOUNT_RED, 1339 ("server:mount get_mntinfo(%s) error %d\n", 1340 (const char *)ma.spec, error)); 1341 1342 // Need to shut down this service. 1343 sa->shutdown_service(false, e); 1344 e.clear(); 1345 goto err; 1346 } 1347 1348 // 1349 // Note that we transfer the CORBA reference 1350 // to the return value (i.e., don't release fsobj). 1351 // 1352 fs_obj = fsobj; 1353 } else { 1354 // 1355 // Find the mount client pointer for the node which has 1356 // the device. 
1357 // 1358 cep = find_client(dev_nids[0]); 1359 if (cep == NULL) { 1360 error = ENXIO; 1361 goto err; 1362 } 1363 cep->clientptr->instantiate_v1(ma, 1364 cep->clientptr->_equiv(client_p) ? mvp : NULL, 1365 credobj, fs_obj, fsinfo, mntoptions, e); 1366 if (e.exception() != NULL) { 1367 error = pxfslib::get_err(e); 1368 e.clear(); 1369 MOUNT_DBPRINTF( 1370 MOUNT_TRACE_SERVER, 1371 MOUNT_RED, 1372 ("server:mount instantiate(%s) err %d\n", 1373 (const char *)ma.spec, error)); 1374 err: 1375 fs_list_lock.unlock(); 1376 1377 // Checkpoint the error before unlocking mount points. 1378 get_checkpoint()->ckpt_mount_err(error, _environment); 1379 _environment.clear(); 1380 1381 // 1382 // Unlock the mount points we have already locked. 1383 // 1384 unlock_mntpnt(client_p, NULL, ma.dir, _environment); 1385 1386 client_list_lock.unlock(); 1387 pxfslib::throw_exception(_environment, error); 1388 fs_obj = pxfs_v1::filesystem::_nil(); 1389 mntoptions = (char *)NULL; 1390 return; 1391 } 1392 } 1393 1394 // 1395 // Add a new file system element to the list and checkpoint it. 1396 // 1397 fep = new fs_elem((pxfs_v1::filesystem_ptr)fs_obj, fsinfo, ma, 1398 mntoptions, dev_is_ha, dev_name, dev_nids); 1399 MOUNT_DBPRINTF( 1400 MOUNT_TRACE_SERVER, 1401 MOUNT_GREEN, 1402 ("server:mount add %s fep %p\n", 1403 (const char *)ma.spec, fep)); 1404 1405 ASSERT(find_fs(fs_obj) == NULL); 1406 1407 // 1408 // We should only append to the end of the list to maintain 1409 // mount ordering. 1410 // 1411 fs_list.append(fep); 1412 1413 // 1414 // Checkpoint the successful instantiation of the file system. 1415 // 1416 get_checkpoint()->ckpt_mount_middle_v1(fs_obj, fsinfo, mntoptions, 1417 dev_is_ha, dev_name, dev_nids, _environment); 1418 if (_environment.exception()) { 1419 // 1420 // The only possible exception for checkpoints is 1421 // a VERSION exception, which represents a programming 1422 // error. This is the first use of a new version checkpoint. 
		// So we will check for a programming mistake.
		//
		MOUNT_DBPRINTF(
		    MOUNT_TRACE_SERVER,
		    MOUNT_RED,
		    ("server:mount fep %p ckpt_mount_middle_v1 except %d\n",
		    fep, _environment.exception()->exception_enum()));
		ASSERT(0);
		_environment.clear();
	}

	// The new fs_elem is on fs_list and has been checkpointed.
	fs_list_lock.unlock();

	FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_S_A, FaultFunctions::generic);

	//
	// At this point we are committed to completing the mount without
	// errors. Notify each client of the addition.
	//
	mount_end_v1(client_p, ma, mntoptions, fs_obj, fsinfo, false,
	    _environment);

	client_list_lock.unlock();

	FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_S_E, FaultFunctions::generic);
}

//
// Checkpoint the start of a mount or remount.
// Helper function for checkpointing state on a secondary.
//
// A VERSION_1 mount_state is constructed from 'ma' and 'client_p' and
// registered through 'env'. Whatever exception register_state() leaves
// in 'env' is cleared here, so this routine never reports a failure to
// its caller.
//
//	ma		mount arguments of the operation being started
//	client_p	mount client that issued the mount or remount
//	env		environment used to register the transaction state
//
void
mount_server_impl::ckpt_mount_start_v1(const sol::mounta &ma,
    fs::mount_client_ptr client_p, Environment &env)
{
	transaction_state *statep =
	    new mount_state(ma, client_p, *this, mount_state::VERSION_1);
	statep->register_state(env);
	env.clear();
	MOUNT_DBPRINTF(
	    MOUNT_TRACE_SERVER,
	    MOUNT_GREEN,
	    ("server:ckpt_mount_start_v1 %s\n",
	    (const char *)ma.dir));
}

//
// Checkpoint a mount or remount error.
// Helper function for checkpointing state on a secondary.
//
// Stores 'error' in the saved mount_state found through 'env'.
// mount_orphaned() later reads that field to decide whether the mount
// points still have to be unlocked and whether to raise the error.
//
//	error	error value of the failed mount or remount
//	env	environment carrying the secondary context
//
void
mount_server_impl::ckpt_mount_err(sol::error_t error, Environment &env)
{
	//
	// We must have saved state because this cannot be the first checkpoint
	//
	secondary_ctx *ctxp = secondary_ctx::extract_from(env);
	ASSERT(ctxp != NULL);
	mount_state *statep = (mount_state *)ctxp->get_saved_state();
	ASSERT(statep != NULL);
	ASSERT(statep->mount_ver == mount_state::VERSION_1);

	//
	// Save the error value.
	// This also marks the start of unlocking the mount points.
	//
	// No file system object may have been recorded yet: an error
	// checkpoint and a "middle" checkpoint are mutually exclusive.
	ASSERT(CORBA::is_nil(statep->fs_v1_ptr));
	statep->error = error;
	MOUNT_DBPRINTF(
	    MOUNT_TRACE_SERVER,
	    MOUNT_RED,
	    ("server:ckpt_mount_err %d\n", error));
}

//
// Add a fs_elem to the list of globally mounted file systems.
// This is used to dump state to a newly joining secondary.
//
//	fs_obj		file system object; must not be nil
//	fsinfo		file system info stored in the new fs_elem
//	ma		mount arguments stored in the new fs_elem
//	mntoptions	mount option string stored in the new fs_elem
//	dev_is_ha	HA-device flag stored in the new fs_elem
//	dev_name	device name stored in the new fs_elem
//	dev_nids	node id sequence stored in the new fs_elem
//
void
mount_server_impl::ckpt_mount_v1(pxfs_v1::filesystem_ptr fs_obj,
    const pxfs_v1::fs_info &fsinfo, const sol::mounta &ma,
    const char *mntoptions,
    bool dev_is_ha, const char *dev_name, const sol::nodeid_seq_t &dev_nids)
{
	//
	// Add fs_elem if it isn't already there.
	// This can happen if the primary failed after sending this
	// checkpoint and the operation was retried on the new primary.
	//
	ASSERT(!CORBA::is_nil(fs_obj));
	if (find_fs(fs_obj) == NULL) {
		fs_elem *fep = new fs_elem(fs_obj, fsinfo, ma,
		    mntoptions, dev_is_ha, dev_name, dev_nids);
		// Append only, so the list keeps the order of arrival.
		fs_list.append(fep);
		MOUNT_DBPRINTF(
		    MOUNT_TRACE_SERVER,
		    MOUNT_GREEN,
		    ("server:ckpt_mount_v1 add %s fep %p\n",
		    (const char *)ma.spec, fep));
	}
}

//
// Checkpoint the creation of a new file system object.
// Helper function for checkpointing state on a secondary.
1528 // 1529 void 1530 mount_server_impl::ckpt_mount_middle_v1(pxfs_v1::filesystem_ptr fs_obj, 1531 const pxfs_v1::fs_info &fsinfo, const char *mntoptions, bool dev_is_ha, 1532 const char *dev_name, const sol::nodeid_seq_t &dev_nids, Environment &env) 1533 { 1534 // 1535 // We must have saved state because this cannot be the first checkpoint 1536 // 1537 secondary_ctx *ctxp = secondary_ctx::extract_from(env); 1538 ASSERT(ctxp != NULL); 1539 mount_state *statep = (mount_state *)ctxp->get_saved_state(); 1540 ASSERT(statep != NULL); 1541 1542 statep->fs_v1_ptr = pxfs_v1::filesystem::_duplicate(fs_obj); 1543 statep->fs_v1_info = fsinfo; 1544 statep->mntoptions = mntoptions; 1545 1546 // 1547 // Add fs_elem if it isn't already there. 1548 // This can happen if the primary failed after sending this checkpoint 1549 // and the operation was retried on the new primary. 1550 // 1551 ASSERT(!CORBA::is_nil(fs_obj)); 1552 fs_elem *fep = find_fs(fs_obj); 1553 if (fep == NULL) { 1554 fep = new fs_elem(fs_obj, fsinfo, statep->ma, mntoptions, 1555 dev_is_ha, dev_name, dev_nids); 1556 fs_list.append(fep); 1557 MOUNT_DBPRINTF( 1558 MOUNT_TRACE_SERVER, 1559 MOUNT_AMBER, 1560 ("server:ckpt_mount_middle_v1 add %s fep %p\n", 1561 (const char *)statep->ma.spec, fep)); 1562 } 1563 } 1564 1565 // 1566 // Helper routine for mount_state::orphaned() to clean up mount() or remount(). 1567 // Return true if the operation is completed. 1568 // 1569 bool 1570 mount_server_impl::mount_orphaned(mount_state *statep, bool orph, 1571 Environment &env) 1572 { 1573 if (CORBA::is_nil(statep->mountpoint_lock_c)) { 1574 // 1575 // We have seen the add_commit() so just return an error 1576 // if needed. 
1577 // 1578 if (statep->error != 0 && !orph) { 1579 pxfslib::throw_exception(env, statep->error); 1580 } 1581 MOUNT_DBPRINTF( 1582 MOUNT_TRACE_SERVER, 1583 MOUNT_RED, 1584 ("server:mount_orphaned committed %s\n", 1585 (const char *)statep->ma.dir)); 1586 return (true); 1587 } 1588 1589 ASSERT(statep->mount_ver == mount_state::VERSION_1); 1590 if (!CORBA::is_nil(statep->fs_v1_ptr)) { 1591 // 1592 // We have seen ckpt_mount_middle() or 1593 // ckpt_remount_middle(), but not the add_commit(). 1594 // Make sure all mount clients have unlocked the 1595 // mount point and updated /etc/mnttab. 1596 // 1597 ASSERT(statep->error == 0); 1598 client_list_lock.wrlock(); 1599 MOUNT_DBPRINTF( 1600 MOUNT_TRACE_SERVER, 1601 MOUNT_RED, 1602 ("server:mount_orphaned v1 do end %s\n", 1603 (const char *)statep->ma.dir)); 1604 mount_end_v1(statep->mountpoint_lock_c, statep->ma, 1605 statep->mntoptions, statep->fs_v1_ptr, 1606 statep->fs_v1_info, statep->is_remount, env); 1607 client_list_lock.unlock(); 1608 return (true); 1609 } 1610 if (statep->error != 0 || orph) { 1611 // 1612 // We have seen ckpt_mount_err() or mount_state::orphaned() 1613 // but not the add_commit(). 1614 // Make sure mount points are unlocked. 1615 // 1616 client_list_lock.wrlock(); 1617 MOUNT_DBPRINTF( 1618 MOUNT_TRACE_SERVER, 1619 MOUNT_GREEN, 1620 ("server:mount_orphaned error %d\n", 1621 statep->error)); 1622 1623 // 1624 // Make sure file service is shut down. 
1625 // 1626 fs_list_lock.wrlock(); 1627 Environment e; 1628 replica::service_admin_var sa = 1629 pxfslib::get_service_admin_ref("mount_server_impl::mount", 1630 (const char *)statep->ma.spec, e); 1631 if (e.exception() == NULL) { 1632 sa->shutdown_service(false, e); 1633 } 1634 e.clear(); 1635 fs_list_lock.unlock(); 1636 1637 unlock_mntpnt(statep->mountpoint_lock_c, NULL, statep->ma.dir, 1638 env); 1639 client_list_lock.unlock(); 1640 1641 if (statep->error != 0 && !orph) { 1642 pxfslib::throw_exception(env, statep->error); 1643 } 1644 return (true); 1645 } 1646 1647 // 1648 // Only the start checkpoint has been seen 1649 // 1650 return (false); 1651 } 1652 1653 // 1654 // Notify all mount clients of a new mount. 1655 // 1656 void 1657 mount_server_impl::mount_end_v1(fs::mount_client_ptr skip, 1658 const sol::mounta &ma, const char *mntoptions, 1659 pxfs_v1::filesystem_ptr fs_obj, const pxfs_v1::fs_info &fsinfo, 1660 bool is_remount, Environment &env) 1661 { 1662 mount_client_elem *cep; 1663 Environment e; 1664 1665 ASSERT(client_list_lock.write_held()); 1666 1667 client_list.atfirst(); 1668 while ((cep = client_list.get_current()) != NULL) { 1669 client_list.advance(); 1670 1671 // Is this the requesting node? 1672 if (cep->clientptr->_equiv(skip)) { 1673 continue; 1674 } 1675 1676 if (is_remount) { 1677 // 1678 // Set the proxy vfs_t flags and release the 1679 // mount point lock on this node. 1680 // 1681 cep->clientptr->set_flags_v1(ma, mntoptions, fs_obj, 1682 fsinfo.fsflag, e); 1683 } else { 1684 // 1685 // Create a new proxy vfs_t, link it into the name 1686 // space, and release the mount point lock on this 1687 // node. 
1688 // 1689 cep->clientptr->add_notify_locked_v1(ma, mntoptions, 1690 fs_obj, fsinfo, e); 1691 } 1692 1693 if (e.exception() == NULL) { 1694 continue; 1695 } 1696 if (CORBA::COMM_FAILURE::_exnarrow(e.exception())) { 1697 e.clear(); 1698 continue; 1699 } 1700 ASSERT(0); // XXX 1701 e.clear(); 1702 } 1703 1704 add_commit(env); 1705 } 1706 1707 // 1708 // Perform a global remount. 1709 // This operation should be idempotent since it can be retried on a new primary. 1710 // 1711 void 1712 mount_server_impl::remount(fs::filesystem_ptr fs_obj, fs::fobj_ptr mntpnt, 1713 const sol::mounta &ma, solobj::cred_ptr credobj, 1714 fs::mount_client_ptr client_p, 1715 uint32_t &vfsflags, CORBA::String_out mntoptions, Environment &_environment) 1716 { 1717 CL_PANIC(0); 1718 } 1719 1720 // 1721 // Perform a global remount. 1722 // This operation should be idempotent since it can be retried on a new primary. 1723 // 1724 void 1725 mount_server_impl::remount_v1(pxfs_v1::filesystem_ptr fs_obj, 1726 pxfs_v1::fobj_ptr mntpnt, 1727 const sol::mounta &ma, solobj::cred_ptr credobj, 1728 fs::mount_client_ptr client_p, 1729 uint32_t &vfsflags, CORBA::String_out mntoptions, Environment &_environment) 1730 { 1731 ASSERT(ma.flags & MS_SYSSPACE); 1732 1733 MOUNT_DBPRINTF( 1734 MOUNT_TRACE_SERVER, 1735 MOUNT_GREEN, 1736 ("server:remount %s\n", (const char *)ma.dir)); 1737 1738 // Check to see if we have saved state. 1739 primary_ctx *ctxp = primary_ctx::extract_from(_environment); 1740 mount_state *statep; 1741 if (ctxp != NULL && 1742 (statep = (mount_state *)ctxp->get_saved_state()) != NULL) { 1743 // 1744 // This is a retry on a new primary after a failover. 1745 // If the previous remount() was committed, we are done. 1746 // 1747 ASSERT(statep->mount_ver == mount_state::VERSION_1); 1748 if (mount_orphaned(statep, false, _environment)) { 1749 if (_environment.exception() == NULL) { 1750 // Return values from the saved state. 
1751 vfsflags = statep->fs_v1_info.fsflag; 1752 mntoptions = os::strdup(statep->mntoptions); 1753 } else { 1754 mntoptions = (char *)NULL; 1755 } 1756 return; 1757 } 1758 1759 // 1760 // Since we have saved state, we know the original 1761 // primary sent the start checkpoint 1762 // so we don't need to do it again here. 1763 // 1764 ASSERT(statep->mountpoint_lock_c->_equiv(client_p)); 1765 1766 check_multiple_remounts(ma.dir, _environment); 1767 if (_environment.exception()) { 1768 mntoptions = (char *)NULL; 1769 return; 1770 } 1771 1772 client_list_lock.wrlock(); 1773 } else { 1774 check_multiple_remounts(ma.dir, _environment); 1775 if (_environment.exception()) { 1776 mntoptions = (char *)NULL; 1777 return; 1778 } 1779 1780 // 1781 // This is the start of a new remount, not a retry. 1782 // Checkpoint the start of locking the mount points so 1783 // we can clean up if both the client and primary fail. 1784 // 1785 client_list_lock.wrlock(); 1786 get_checkpoint()->ckpt_mount_start_v1(ma, client_p, 1787 _environment); 1788 if (_environment.exception()) { 1789 // 1790 // We are no longer a member of the cluster, 1791 // don't proceed. 1792 // 1793 client_list_lock.unlock(); 1794 _environment.clear(); 1795 mntoptions = (char *)NULL; 1796 1797 current_mount_lock.lock(); 1798 1799 delete [] currentmnt; 1800 currentmnt = (char *)NULL; 1801 currentmnt_cv.broadcast(); 1802 1803 current_mount_lock.unlock(); 1804 return; 1805 } 1806 } 1807 1808 // 1809 // Lock the mount point on all client nodes except the 1810 // requesting node (since it already has the mount point locked). 1811 // 1812 sol::error_t error = lock_mntpnt(client_p, ma.dir, ma.flags, 1813 _environment); 1814 if (error != 0) { 1815 client_list_lock.unlock(); 1816 1817 // 1818 // Propagate the exception back to our caller. 
1819 // 1820 pxfslib::throw_exception(_environment, error); 1821 mntoptions = (char *)NULL; 1822 1823 current_mount_lock.lock(); 1824 1825 delete [] currentmnt; 1826 currentmnt = (char *)NULL; 1827 currentmnt_cv.broadcast(); 1828 1829 current_mount_lock.unlock(); 1830 1831 return; 1832 } 1833 1834 FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_S_B, FaultFunctions::generic); 1835 1836 // 1837 // Get the file system info. 1838 // XXX Note that the mount_server has a temporary dependency 1839 // on the file system service for this IDL invocation. 1840 // 1841 Environment e; 1842 fs_obj->remount(mntpnt, ma, credobj, vfsflags, mntoptions, e); 1843 if (e.exception()) { 1844 // Checkpoint the error before unlocking mount points. 1845 error = pxfslib::get_err(e); 1846 e.clear(); 1847 MOUNT_DBPRINTF( 1848 MOUNT_TRACE_SERVER, 1849 MOUNT_GREEN, 1850 ("server:remount error %d\n", error)); 1851 get_checkpoint()->ckpt_mount_err(error, _environment); 1852 _environment.clear(); 1853 1854 // 1855 // Unlock the mount points we have already locked. 1856 // 1857 unlock_mntpnt(client_p, NULL, ma.dir, _environment); 1858 1859 client_list_lock.unlock(); 1860 pxfslib::throw_exception(_environment, error); 1861 mntoptions = (char *)NULL; 1862 1863 current_mount_lock.lock(); 1864 1865 delete [] currentmnt; 1866 currentmnt = (char *)NULL; 1867 currentmnt_cv.broadcast(); 1868 1869 current_mount_lock.unlock(); 1870 return; 1871 } 1872 1873 // 1874 // Update the option string stored in fs_elem. 1875 // XXX Note that we attempt to construct the mounta data that 1876 // is the combined result of "mount -o ro" merged with 1877 // "mount -o remount" but its possible the remount changed or set 1878 // other options. We may need to save args for both mount and remount, 1879 // and then in add_client() replay the mount and the remount rather 1880 // than try to do just one mount with all the right args. 
1881 // 1882 fs_list_lock.wrlock(); 1883 1884 fs_elem *fep = find_fs(fs_obj); 1885 ASSERT(fep != NULL); 1886 fep->ma.flags = ma.flags & ~MS_REMOUNT; 1887 fep->ma.data = ma.data; 1888 fep->mntoptions = os::strdup(mntoptions); 1889 fep->fs_v1_info.fsflag = vfsflags; 1890 1891 // 1892 // Checkpoint this operation so that the secondary can 1893 // update its state. 1894 // 1895 get_checkpoint()->ckpt_remount_middle_v1(fs_obj, vfsflags, mntoptions, 1896 _environment); 1897 _environment.clear(); 1898 1899 fs_list_lock.unlock(); 1900 1901 FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_S_A, FaultFunctions::generic); 1902 1903 // 1904 // Notify each client with the new vfsflags and mntoptions. 1905 // Note that at this point we are committed to updating everything. 1906 // 1907 mount_end_v1(client_p, ma, mntoptions, fs_obj, fep->fs_v1_info, true, 1908 _environment); 1909 1910 current_mount_lock.lock(); 1911 1912 delete [] currentmnt; 1913 currentmnt = (char *)NULL; 1914 currentmnt_cv.broadcast(); 1915 1916 current_mount_lock.unlock(); 1917 1918 client_list_lock.unlock(); 1919 1920 FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_S_E, FaultFunctions::generic); 1921 } 1922 1923 // 1924 // Helper function for checkpointing state on a secondary. 
1925 // 1926 void 1927 mount_server_impl::ckpt_remount_middle_v1(pxfs_v1::filesystem_ptr fs_obj, 1928 uint32_t vfsflags, const char *mntoptions, Environment &env) 1929 { 1930 // 1931 // We must have saved state because this cannot be the first checkpoint 1932 // 1933 secondary_ctx *ctxp = secondary_ctx::extract_from(env); 1934 ASSERT(ctxp != NULL); 1935 mount_state *statep = (mount_state *)ctxp->get_saved_state(); 1936 ASSERT(statep != NULL); 1937 1938 MOUNT_DBPRINTF( 1939 MOUNT_TRACE_SERVER, 1940 MOUNT_GREEN, 1941 ("server:ckpt_remount_middle_v1 vfsflags %x options %s\n", 1942 vfsflags, mntoptions)); 1943 statep->is_remount = true; 1944 statep->fs_v1_ptr = pxfs_v1::filesystem::_duplicate(fs_obj); 1945 statep->fs_v1_info.fsflag = vfsflags; 1946 statep->mntoptions = mntoptions; 1947 1948 fs_elem *fep = find_fs(fs_obj); 1949 ASSERT(fep != NULL); 1950 fep->ma.flags = statep->ma.flags & ~MS_REMOUNT; 1951 fep->ma.data = statep->ma.data; 1952 fep->mntoptions = mntoptions; 1953 fep->fs_v1_info.fsflag = vfsflags; 1954 } 1955 1956 // 1957 // Helper routine to lock the mount point on all client nodes except 'skip'. 1958 // Return an exception if the lock can't be acquired on all nodes. 1959 // This operation should be idempotent since it can be retried on a new primary. 1960 // 1961 sol::error_t 1962 mount_server_impl::lock_mntpnt(fs::mount_client_ptr skip, 1963 const char *mountpoint, int32_t mntflags, Environment &env) 1964 { 1965 mount_client_elem *cep; 1966 sol::error_t error; 1967 Environment e; 1968 1969 ASSERT(client_list_lock.write_held()); 1970 1971 client_list.atfirst(); 1972 while ((cep = client_list.get_current()) != NULL) { 1973 client_list.advance(); 1974 1975 // The node requesting the mount has already done the locking 1976 if (cep->clientptr->_equiv(skip)) { 1977 continue; 1978 } 1979 1980 // Try to get the lock on this node. 
1981 cep->clientptr->lock_mountpoint(mountpoint, mntflags, e); 1982 if (e.exception() == NULL) 1983 continue; 1984 if (CORBA::COMM_FAILURE::_exnarrow(e.exception())) { 1985 e.clear(); 1986 continue; 1987 } 1988 1989 error = pxfslib::get_err(e); 1990 e.clear(); 1991 get_checkpoint()->ckpt_mount_err(error, env); 1992 ASSERT(env.exception() == NULL); 1993 1994 // Unlock the mount points we have already locked. 1995 unlock_mntpnt(skip, cep, mountpoint, env); 1996 return (error); 1997 } 1998 1999 return (0); 2000 } 2001 2002 // 2003 // Helper routined to unlock clients that have already been locked successfully. 2004 // The client_list_lock should be held before calling this. 2005 // 2006 void 2007 mount_server_impl::unlock_mntpnt(fs::mount_client_ptr skip, 2008 mount_client_elem *endp, const char *mountpoint, Environment &env) 2009 { 2010 mount_client_elem *cep; 2011 Environment e; 2012 2013 ASSERT(client_list_lock.write_held()); 2014 2015 client_list.atfirst(); 2016 while ((cep = client_list.get_current()) != endp) { 2017 client_list.advance(); 2018 2019 // Should we skip this client? 2020 if (cep->clientptr->_equiv(skip)) { 2021 continue; 2022 } 2023 2024 cep->clientptr->unlock_mountpoint(mountpoint, e); 2025 2026 if (e.exception() == NULL) { 2027 continue; 2028 } 2029 if (CORBA::COMM_FAILURE::_exnarrow(e.exception())) { 2030 e.clear(); 2031 continue; 2032 } 2033 2034 // XXX something better we could do? 2035 ASSERT(0); 2036 e.clear(); 2037 } 2038 2039 add_commit(env); 2040 } 2041 2042 // 2043 // Unmount a global file system by locking all mount points on all 2044 // client nodes except the requesting node, flushing the DNLC, etc. 2045 // Return an exception if the lock can't be acquired on all nodes or 2046 // if there are still active proxy vnodes. 2047 // This operation should be idempotent since it can be retried on a new primary. 
2048 // 2049 void 2050 mount_server_impl::unmount(fs::filesystem_ptr fs_obj, solobj::cred_ptr credobj, 2051 fs::mount_client_ptr client_p, sol::nodeid_t nodeid, bool is_shutdown, 2052 Environment &_environment) 2053 { 2054 CL_PANIC(0); 2055 } 2056 2057 // 2058 // Forced Unmount support version 2059 // 2060 // Unmount a global file system by locking all mount points on all 2061 // client nodes except the requesting node, flushing the DNLC, etc. 2062 // Return an exception if the lock can't be acquired on all nodes or 2063 // if there are still active proxy vnodes. 2064 // This operation should be idempotent since it can be retried on a new primary. 2065 // 2066 void 2067 mount_server_impl::unmount_1(fs::filesystem_ptr fs_obj, int32_t flags, 2068 solobj::cred_ptr credobj, fs::mount_client_ptr c, sol::nodeid_t nodeid, 2069 bool is_shutdown, Environment &_environment) 2070 { 2071 CL_PANIC(0); 2072 } 2073 2074 // 2075 // Unmount a global file system by locking all mount points on all 2076 // client nodes except the requesting node, flushing the DNLC, etc. 2077 // Return an exception if the lock can't be acquired on all nodes or 2078 // if there are still active proxy vnodes. 2079 // This operation should be idempotent since it can be retried on a new primary. 2080 // 2081 void 2082 mount_server_impl::unmount_v1(pxfs_v1::filesystem_ptr fs_obj, int32_t flags, 2083 solobj::cred_ptr credobj, fs::mount_client_ptr client_p, 2084 sol::nodeid_t nodeid, bool is_shutdown, Environment &_environment) 2085 { 2086 ASSERT(!CORBA::is_nil(fs_obj)); 2087 ASSERT(!CORBA::is_nil(client_p)); 2088 2089 // Check to see if we have saved state. 2090 primary_ctx *ctxp = primary_ctx::extract_from(_environment); 2091 unmount_state *statep; 2092 if (ctxp != NULL && 2093 (statep = (unmount_state *)ctxp->get_saved_state()) != NULL) { 2094 // 2095 // Since we have saved state, we know the original 2096 // primary sent the ckpt_unmount_start() checkpoint. 
2097 // We finish the unmount process from where we left off. 2098 // 2099 ASSERT(statep->unmount_ver == unmount_state::VERSION_1); 2100 unmount_orphaned(statep, true, _environment); 2101 return; 2102 } 2103 2104 fs_elem *fep; 2105 Environment e; 2106 sol::error_t error; 2107 2108 client_list_lock.wrlock(); 2109 fs_list_lock.wrlock(); 2110 2111 // 2112 // Since this is not a retry, we should find the file system entry 2113 // unless: 2114 // Node A starts unmounting file system F, gets list locks above. 2115 // Node B starts unmounting file system F, blocks waiting above. 2116 // Node B crashes. 2117 // Node A's unmount completes OK (since it either got the lock on B 2118 // or it got ECOMM and skipped B). 2119 // Node B's unmount unblocks above and we don't find the file system. 2120 // 2121 fep = find_fs(fs_obj); 2122 if (fep == NULL) { 2123 MOUNT_DBPRINTF( 2124 MOUNT_TRACE_SERVER, 2125 MOUNT_RED, 2126 ("server:unmount_v1 nid %d shutdown %d can't find FS\n", 2127 nodeid, is_shutdown)); 2128 fs_list_lock.unlock(); 2129 client_list_lock.unlock(); 2130 pxfslib::throw_exception(_environment, EINVAL); 2131 return; 2132 } 2133 MOUNT_DBPRINTF( 2134 MOUNT_TRACE_SERVER, 2135 MOUNT_GREEN, 2136 ("server:unmount_v1 %s nid %d shutdown %d fep %p\n", 2137 (const char *)fep->ma.dir, nodeid, is_shutdown, fep)); 2138 2139 // 2140 // Ok, start an unmount. 2141 // 2142 get_checkpoint()->ckpt_unmount_start_v1(fs_obj, flags, credobj, 2143 client_p, is_shutdown, _environment); 2144 ASSERT(_environment.exception() == NULL); 2145 2146 error = unmount_common_1(client_p, fep, flags, credobj, 2147 unmount_state::START, 0, NULL, _environment); 2148 2149 MOUNT_DBPRINTF( 2150 MOUNT_TRACE_SERVER, 2151 (error ? 
MOUNT_RED : MOUNT_GREEN), 2152 ("server:unmount_v1 fep %p err %d\n", 2153 fep, error)); 2154 2155 fs_list_lock.unlock(); 2156 client_list_lock.unlock(); 2157 2158 if (error != 0) { 2159 pxfslib::throw_exception(_environment, error); 2160 } 2161 } 2162 2163 // 2164 // Checkpoint that a node is being shut down. 2165 // Helper function for checkpointing state on a secondary. 2166 // 2167 void 2168 mount_server_impl::ckpt_unmount_shutdown(fs::mount_client_ptr client) 2169 { 2170 mount_client_elem *cep = find_client(client); 2171 if (cep != NULL) { 2172 cep->shutdown = true; 2173 } 2174 } 2175 2176 // 2177 // Checkpoint the start of an unmount. 2178 // Add_commit() should be called to finish the unmount process. 2179 // Helper function for checkpointing state on a secondary. 2180 // 2181 void 2182 mount_server_impl::ckpt_unmount_start_v1(pxfs_v1::filesystem_ptr fs_obj, 2183 int32_t flags, solobj::cred_ptr credobj, fs::mount_client_ptr client, 2184 bool is_shutdown, Environment &env) 2185 { 2186 // 2187 // We might have saved state from a previous ckpt_unmount_start(). 2188 // if so, reuse the same transaction state; otherwise, create one. 
2189 // 2190 secondary_ctx *ctxp = secondary_ctx::extract_from(env); 2191 unmount_state *statep = NULL; 2192 if (ctxp == NULL || 2193 (statep = (unmount_state *)ctxp->get_saved_state()) == NULL) { 2194 statep = new unmount_state(fs_obj, flags, credobj, *this, 2195 client); 2196 statep->register_state(env); 2197 env.clear(); 2198 } else { 2199 ASSERT(!CORBA::is_nil(fs_obj)); 2200 statep->skip = fs::mount_client::_duplicate(client); 2201 statep->fs_v1_obj = pxfs_v1::filesystem::_duplicate(fs_obj); 2202 statep->credobj = solobj::cred::_duplicate(credobj); 2203 statep->state = unmount_state::START; 2204 statep->error = 0; 2205 statep->service_name = (char *)NULL; 2206 } 2207 2208 if (is_shutdown) { 2209 mount_client_elem *cep = find_client(client); 2210 if (cep != NULL && !cep->shutdown) { 2211 cep->shutdown = true; 2212 } 2213 } 2214 #ifdef DEBUG 2215 fs_elem *fep = find_fs(fs_obj); 2216 MOUNT_DBPRINTF( 2217 MOUNT_TRACE_SERVER, 2218 MOUNT_GREEN, 2219 ("server:ckpt_unmount_start_v1 %s\n", 2220 (const char *)fep->ma.dir)); 2221 #endif 2222 } 2223 2224 // 2225 // Helper routine for unmount_state::orphaned() to unlock mount points. 2226 // This is also called on the primary when recovering from a failover. 2227 // 2228 void 2229 mount_server_impl::unmount_orphaned(unmount_state *statep, bool ret_err, 2230 Environment &env) 2231 { 2232 MOUNT_DBPRINTF( 2233 MOUNT_TRACE_SERVER, 2234 MOUNT_AMBER, 2235 ("server:unmount_orphaned %d\n", ret_err)); 2236 if (statep->state == unmount_state::COMMITTED) { 2237 // 2238 // We saw the last checkpoint so we are done. 2239 // Return the error if requested and there is one. 2240 // 2241 if (ret_err && statep->error != 0) { 2242 pxfslib::throw_exception(env, statep->error); 2243 } 2244 return; 2245 } 2246 2247 // 2248 // The file system is still mounted so unmount it. 
2249 // 2250 client_list_lock.wrlock(); 2251 fs_list_lock.wrlock(); 2252 fs_elem *fep; 2253 2254 ASSERT(statep->unmount_ver == unmount_state::VERSION_1); 2255 fep = find_fs(statep->fs_v1_obj); 2256 statep->error = unmount_common_1(statep->skip, fep, 2257 statep->flags, statep->credobj, statep->state, 2258 statep->error, statep->service_name, env); 2259 fs_list_lock.unlock(); 2260 client_list_lock.unlock(); 2261 2262 if (ret_err && statep->error != 0) { 2263 pxfslib::throw_exception(env, statep->error); 2264 } 2265 } 2266 2267 // 2268 // Version for forced unmount support 2269 // 2270 // Common code for add_client(), remove_client(), unmount(), and 2271 // unmount_orphaned(). 2272 // The job is to unmount the file system 'fep' from all nodes, possibly 2273 // skipping steps that have already been performed. 2274 // Both client_list_lock and fs_list_lock should be held. 2275 // If there is no error, 'fep' is removed from the list of all file systems. 2276 // 2277 sol::error_t 2278 mount_server_impl::unmount_common_1(fs::mount_client_ptr skip, fs_elem *fep, 2279 int32_t flags, solobj::cred_ptr credobj, unmount_state::state_t state, 2280 sol::error_t error, const char *service_name, Environment &env) 2281 { 2282 mount_client_elem *cep; 2283 mount_client_elem *end_cep = NULL; 2284 fs_elem *delete_fep = NULL; 2285 bool skip_purge; 2286 Environment e; 2287 2288 ASSERT(client_list_lock.write_held()); 2289 ASSERT(fs_list_lock.write_held()); 2290 ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1); 2291 2292 switch (state) { 2293 case unmount_state::START: 2294 // 2295 // Make sure the mount point is locked and there 2296 // are no active vnodes on each client. 
2297 // 2298 client_list.atfirst(); 2299 while ((cep = client_list.get_current()) != NULL) { 2300 client_list.advance(); 2301 2302 // 2303 // Note that for the node which called the umount 2304 // system call we must skip most of the unmount 2305 // preparation (filesystem locking and vnode cache 2306 // purging) since that has already been done there. 2307 // 2308 skip_purge = false; 2309 if (cep->clientptr->_equiv(skip)) { 2310 skip_purge = true; 2311 } 2312 2313 // Try to get the lock on this node. 2314 MOUNT_DBPRINTF( 2315 MOUNT_TRACE_SERVER, 2316 MOUNT_GREEN, 2317 ("server:prepare_unmount_1 %s on %d flags = %x\n", 2318 (const char *)fep->ma.dir, cep->nodeid, flags)); 2319 cep->clientptr->prepare_unmount_v1( 2320 fep->fs_v1_ptr, flags, credobj, skip_purge, e); 2321 if (e.exception() == NULL) { 2322 continue; 2323 } 2324 if (CORBA::COMM_FAILURE::_exnarrow(e.exception())) { 2325 MOUNT_DBPRINTF( 2326 MOUNT_TRACE_SERVER, 2327 MOUNT_RED, 2328 ("server:prepare_unmount_1 " 2329 "%s on %d COMM_FAILURE\n", 2330 (const char *)fep->ma.dir, cep->nodeid)); 2331 e.clear(); 2332 continue; 2333 } 2334 2335 error = pxfslib::get_err(e); 2336 e.clear(); 2337 // 2338 // Unlock clients that have already been 2339 // locked successfully. 2340 // 2341 end_cep = cep; 2342 get_checkpoint()->ckpt_unmount_middle(error, env); 2343 ASSERT(env.exception() == NULL); 2344 goto notify; 2345 } 2346 2347 FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_B, 2348 FaultFunctions::generic); 2349 2350 // 2351 // Try to unmount the file system. 2352 // XXX Temporary dependency on the file system here. 
2353 // 2354 error = 0; 2355 fep->fs_v1_ptr->unmount(flags, credobj, e); 2356 2357 if (e.exception()) { 2358 if (CORBA::VERSION::_exnarrow(e.exception())) { 2359 MOUNT_DBPRINTF( 2360 MOUNT_TRACE_SERVER, 2361 MOUNT_RED, 2362 ("server:unmount_common_1 " 2363 "fs version exception %d\n")); 2364 error = EBUSY; 2365 } else if (CORBA::COMM_FAILURE::_exnarrow( 2366 e.exception())) { 2367 // 2368 // If the server for a file system is dead, 2369 // and no clients have active vnodes, we 2370 // allow the unmount to happen. 2371 // 2372 error = 0; 2373 } else { 2374 error = pxfslib::get_err(e); 2375 // 2376 // For a forced unmount we always set error 2377 // to zero (except for ENOTSUP) so the global 2378 // unmount will always succeed. ENOTSUP is the 2379 // legitimate return from underlying file 2380 // systems not supporting forced unmount. 2381 // 2382 if ((flags & MS_FORCE) && (error != ENOTSUP)) { 2383 MOUNT_DBPRINTF( 2384 MOUNT_TRACE_SERVER, 2385 MOUNT_RED, 2386 ("server:unmount_common_1" 2387 "forced unmount error %d\n", 2388 error)); 2389 error = 0; 2390 } 2391 } 2392 MOUNT_DBPRINTF( 2393 MOUNT_TRACE_SERVER, 2394 MOUNT_RED, 2395 ("server:unmount_common_1 fs exception %d\n", 2396 error)); 2397 e.clear(); 2398 } 2399 2400 get_checkpoint()->ckpt_unmount_middle(error, env); 2401 ASSERT(env.exception() == NULL); 2402 // FALL THROUGH 2403 2404 case unmount_state::UNMOUNTED: 2405 notify: 2406 // 2407 // Notify mount clients of the unmount result. 2408 // 2409 client_list.atfirst(); 2410 while ((cep = client_list.get_current()) != end_cep) { 2411 client_list.advance(); 2412 2413 // 2414 // Notify clients of unmount success or failure so 2415 // they can proceed appropriately. 2416 // In order to handle forced unmounts correctly, 2417 // we need to notify all clients that the unmount 2418 // succeeded. 
2419 // 2420 // Note that the node which called the 2421 // umount() system call will unlink the vfs_t from the 2422 // file system name space so that node is treated 2423 // differently from the others. 2424 // 2425 if (error == 0) { 2426 cep->clientptr->remove_notify_1(fep->ma.dir, 2427 !cep->clientptr->_equiv(skip), e); 2428 } else { 2429 if (!cep->clientptr->_equiv(skip)) { 2430 cep->clientptr-> 2431 unmount_failed_1(false, e); 2432 } else { 2433 cep->clientptr-> 2434 unmount_failed_1(true, e); 2435 } 2436 } 2437 e.clear(); 2438 } 2439 2440 // 2441 // Tell the secondary that all clients have been notified 2442 // before shutting down the file system service so that 2443 // the HA object isn't marshalled after the shutdown. 2444 // 2445 if (error == 0) { 2446 if (fep->dev_is_ha) { 2447 // Note: service_name shares string with "fep". 2448 service_name = (const char *)fep->ma.spec; 2449 } 2450 2451 MOUNT_DBPRINTF( 2452 MOUNT_TRACE_SERVER, 2453 MOUNT_GREEN, 2454 ("server:unmount_common_1 remove %p\n", 2455 fep)); 2456 // 2457 // Note that erase() does an advance() if 2458 // fep == get_current(). 2459 // 2460 (void) fs_list.erase(fep); 2461 delete_fep = fep; 2462 } 2463 2464 FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_A, 2465 FaultFunctions::generic); 2466 2467 get_checkpoint()->ckpt_unmount_notified(env); 2468 ASSERT(env.exception() == NULL); 2469 // FALLTHROUGH 2470 2471 case unmount_state::NOTIFIED: 2472 // 2473 // Shut down the FS service. 2474 // 2475 if (service_name != NULL) { 2476 replica::service_admin_var sa = 2477 pxfslib::get_service_admin_ref( 2478 "mount_server_impl::unmount_common_1", 2479 service_name, e); 2480 if (e.exception()) { 2481 // 2482 // XXX Need to shut down this service but 2483 // can't get the service_admin object to do it. 
2484 // 2485 MOUNT_DBPRINTF( 2486 MOUNT_TRACE_SERVER, 2487 MOUNT_RED, 2488 ("server:unmount_common_1 " 2489 "get_service_admin_ref exception\n")); 2490 e.clear(); 2491 } else { 2492 if ((flags & MS_FORCE) == 0) { 2493 sa->shutdown_service(false, e); 2494 } else { 2495 sa->shutdown_service(true, e); 2496 } 2497 if (e.exception()) { 2498 MOUNT_DBPRINTF( 2499 MOUNT_TRACE_SERVER, 2500 MOUNT_RED, 2501 ("server:unmount_com_1 " 2502 "shutdown_service() exception\n")); 2503 e.clear(); 2504 } 2505 } 2506 FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_E, 2507 FaultFunctions::generic); 2508 get_checkpoint()->ckpt_unmount_end(env); 2509 ASSERT(env.exception() == NULL); 2510 } 2511 break; 2512 2513 default: 2514 ASSERT(0); 2515 } 2516 2517 if (delete_fep != NULL) { 2518 // 2519 // We delay deleting fep until now since service_name could 2520 // be pointing to the string that would be freed. 2521 // 2522 delete delete_fep; 2523 } 2524 2525 return (error); 2526 } 2527 2528 // 2529 // Checkpoint that either the mount points have been locked 2530 // and the file system has been unmounted or there was an error. 2531 // We have yet to notify the client nodes of the result. 2532 // Helper function for checkpointing state on a secondary. 2533 // 2534 void 2535 mount_server_impl::ckpt_unmount_middle(sol::error_t error, Environment &env) 2536 { 2537 // 2538 // We should have saved state since ckpt_unmount_start() is 2539 // supposed to be called first. 2540 // 2541 secondary_ctx *ctxp = secondary_ctx::extract_from(env); 2542 ASSERT(ctxp != NULL); 2543 unmount_state *statep = (unmount_state *)ctxp->get_saved_state(); 2544 ASSERT(statep != NULL); 2545 2546 // 2547 // state == UNMOUNTED means that 'error' is valid and 2548 // don't retry to lock mount points or unmount the file system again. 
2549 // 2550 statep->state = unmount_state::UNMOUNTED; 2551 statep->error = error; 2552 MOUNT_DBPRINTF( 2553 MOUNT_TRACE_SERVER, 2554 MOUNT_GREEN, 2555 ("server:ckpt_unmount_middle error %d\n", error)); 2556 } 2557 2558 // 2559 // Checkpoint that all the client nodes have been notified (if there was 2560 // no unmount error) before shutting down the file system service. 2561 // Helper function for checkpointing state on a secondary. 2562 // 2563 void 2564 mount_server_impl::ckpt_unmount_notified(Environment &env) 2565 { 2566 // 2567 // We should have saved state since ckpt_unmount_start() is 2568 // supposed to be called first. 2569 // 2570 secondary_ctx *ctxp = secondary_ctx::extract_from(env); 2571 ASSERT(ctxp != NULL); 2572 unmount_state *statep = (unmount_state *)ctxp->get_saved_state(); 2573 ASSERT(statep != NULL); 2574 ASSERT(statep->unmount_ver == unmount_state::VERSION_1); 2575 2576 if (CORBA::is_nil(statep->fs_v1_obj)) { 2577 // This is a duplicate checkpoint after a failover. 2578 return; 2579 } 2580 if (statep->error == 0) { 2581 fs_elem *fep; 2582 fep = find_fs(statep->fs_v1_obj); 2583 ASSERT(fep != NULL); 2584 // 2585 // state == NOTIFIED means that the mount points have 2586 // been unlocked and that the only remaining task is 2587 // to shut down the file system service. 2588 // 2589 if (fep->dev_is_ha) { 2590 statep->state = unmount_state::NOTIFIED; 2591 statep->service_name = os::strdup(fep->ma.spec); 2592 } else { 2593 statep->state = unmount_state::COMMITTED; 2594 } 2595 2596 MOUNT_DBPRINTF( 2597 MOUNT_TRACE_SERVER, 2598 MOUNT_GREEN, 2599 ("server:ckpt_unmount_notified remove %p\n", 2600 fep)); 2601 (void) fs_list.erase(fep); 2602 delete fep; 2603 } else { 2604 MOUNT_DBPRINTF( 2605 MOUNT_TRACE_SERVER, 2606 MOUNT_GREEN, 2607 ("server:ckpt_unmount_notified\n")); 2608 statep->state = unmount_state::COMMITTED; 2609 } 2610 statep->fs_v1_obj = pxfs_v1::filesystem::_nil(); 2611 } 2612 2613 // 2614 // Checkpoint that unmount is complete. 
We can't use a commit() call since 2615 // this may be called more than once when unmounting multiple file systems. 2616 // 2617 void 2618 mount_server_impl::ckpt_unmount_end(Environment &env) 2619 { 2620 // 2621 // We should have saved state since ckpt_unmount_start() is 2622 // supposed to be called first. 2623 // 2624 secondary_ctx *ctxp = secondary_ctx::extract_from(env); 2625 ASSERT(ctxp != NULL); 2626 unmount_state *statep = (unmount_state *)ctxp->get_saved_state(); 2627 ASSERT(statep != NULL); 2628 2629 statep->state = unmount_state::COMMITTED; 2630 MOUNT_DBPRINTF( 2631 MOUNT_TRACE_SERVER, 2632 MOUNT_GREEN, 2633 ("server:ckpt_unmount_end\n")); 2634 } 2635 2636 // 2637 // Notify intent to mount a device. 2638 // The server attempts to get a "lock" on the device 2639 // and returns success if the file system is not mounted 2640 // and no other node has the lock. The device remains locked 2641 // until it is either mounted or the requesting node dies. 2642 // 2643 // mount_server_impl(fs::mount_server::devlock) 2644 void 2645 mount_server_impl::devlock(fs::mount_client_ptr c, sol::nodeid_t nodeid, 2646 const char *dev_name, Environment &_environment) 2647 { 2648 fs_list_lock.wrlock(); 2649 devlock_list_lock.wrlock(); 2650 2651 MOUNT_DBPRINTF( 2652 MOUNT_TRACE_SERVER, 2653 MOUNT_GREEN, 2654 ("server:devlock nid %d %s\n", 2655 nodeid, dev_name)); 2656 2657 // Check to see if the device is locked. 2658 devlock_elem *dep; 2659 while ((dep = find_devlock(dev_name)) != NULL) { 2660 if (dep->ownerid == nodeid) { 2661 // 2662 // The owner asked for the same lock again. 2663 // This could be a retry after a failover. 2664 // Either way, we allow it. 
2665 // 2666 MOUNT_DBPRINTF( 2667 MOUNT_TRACE_SERVER, 2668 MOUNT_AMBER, 2669 ("server:devlock repeat\n")); 2670 devlock_list_lock.unlock(); 2671 fs_list_lock.unlock(); 2672 return; 2673 } 2674 2675 // 2676 // Check to see if the service was frozen and make the 2677 // caller try again (so the IDL call completes and 2678 // the switchover can proceed). 2679 // 2680 if (frozen) { 2681 MOUNT_DBPRINTF( 2682 MOUNT_TRACE_SERVER, 2683 MOUNT_RED, 2684 ("server:devlock frozen\n")); 2685 devlock_list_lock.unlock(); 2686 fs_list_lock.unlock(); 2687 pxfslib::throw_exception(_environment, EAGAIN); 2688 return; 2689 } 2690 2691 // 2692 // Check to see if the owner of the lock is blocked waiting. 2693 // This is a simple deadlock check which doesn't support 2694 // A waiting for B which is waiting for C even though this 2695 // case isn't a deadlock. 2696 // 2697 if (find_devlock_waiter(dep->ownerid)) { 2698 MOUNT_DBPRINTF( 2699 MOUNT_TRACE_SERVER, 2700 MOUNT_RED, 2701 ("server:devlock deadlock\n")); 2702 devlock_list_lock.unlock(); 2703 fs_list_lock.unlock(); 2704 pxfslib::throw_exception(_environment, EDEADLOCK); 2705 return; 2706 } 2707 2708 // 2709 // Wait until the lock is released, then return 2710 // either lock granted/not granted if the FS is 2711 // not mounted/mounted. This is so we don't have 2712 // the situation where node A gets the lock, node B 2713 // doesn't get the lock (silently not attempting to mount FS), 2714 // then node A crashes without completing the mount. 
2715 // 2716 os::condvar_t::wait_result res = os::condvar_t::NORMAL; 2717 dep->waiter_lock.lock(); 2718 dep->nwaiters++; 2719 dep->waiters.set(nodeid - 1); 2720 devlock_list_lock.unlock(); 2721 fs_list_lock.unlock(); 2722 while (res == os::condvar_t::NORMAL && 2723 !dep->unlocked && !frozen) { 2724 MOUNT_DBPRINTF( 2725 MOUNT_TRACE_SERVER, 2726 MOUNT_GREEN, 2727 ("server:devlock waiting %s\n", 2728 dev_name)); 2729 2730 // Wait for pxfs_devlock_timeout seconds 2731 os::systime timeout; 2732 timeout.setreltime(pxfs_devlock_timeout * 1000000); 2733 res = dep->waiter_cv.timedwait_sig(&dep->waiter_lock, 2734 &timeout); 2735 2736 MOUNT_DBPRINTF( 2737 MOUNT_TRACE_SERVER, 2738 MOUNT_GREEN, 2739 ("server:devlock wakeup %d u %d frozen %d\n", 2740 res, dep->unlocked, frozen)); 2741 } 2742 dep->waiters.clear(nodeid - 1); 2743 bool last_waiter = (--dep->nwaiters == 0 && dep->unlocked); 2744 dep->waiter_lock.unlock(); 2745 2746 if (last_waiter) { 2747 delete dep; 2748 } 2749 2750 if (res == os::condvar_t::TIMEDOUT) { 2751 // 2752 // The wait timed out. We return ETIMEDOUT. 2753 // The client prints a syslog message the first time it 2754 // sees ETIMEDOUT. It retries until it sees something 2755 // other than ETIMEDOUT and EAGAIN. 2756 // 2757 pxfslib::throw_exception(_environment, ETIMEDOUT); 2758 return; 2759 } else if (res == os::condvar_t::SIGNALED) { 2760 pxfslib::throw_exception(_environment, EINTR); 2761 return; 2762 } 2763 fs_list_lock.wrlock(); 2764 devlock_list_lock.wrlock(); 2765 } 2766 2767 // Check to see if the device is mounted. 2768 if (find_fs(dev_name) != NULL) { 2769 MOUNT_DBPRINTF( 2770 MOUNT_TRACE_SERVER, 2771 MOUNT_RED, 2772 ("server:devlock busy\n")); 2773 devlock_list_lock.unlock(); 2774 fs_list_lock.unlock(); 2775 pxfslib::throw_exception(_environment, EBUSY); 2776 return; 2777 } 2778 2779 // 2780 // Add a new device lock element to the list and checkpoint it. 
2781 // 2782 dep = new devlock_elem(c, nodeid, dev_name); 2783 devlock_list.prepend(dep); 2784 get_checkpoint()->ckpt_devlock(c, nodeid, dev_name, _environment); 2785 ASSERT(_environment.exception() == NULL); 2786 MOUNT_DBPRINTF( 2787 MOUNT_TRACE_SERVER, 2788 MOUNT_GREEN, 2789 ("server:devlock added %p\n", dep)); 2790 2791 FAULTPT_PXFS(FAULTNUM_PXFS_DEVLOCK, FaultFunctions::generic); 2792 2793 devlock_list_lock.unlock(); 2794 fs_list_lock.unlock(); 2795 } 2796 2797 // 2798 // Checkpoint a new device lock. 2799 // Helper function for checkpointing state on a secondary. 2800 // 2801 void 2802 mount_server_impl::ckpt_devlock(fs::mount_client_ptr client, 2803 sol::nodeid_t nodeid, const char *dev_name) 2804 { 2805 if (find_devlock(dev_name) == NULL) { 2806 // 2807 // Add a new device lock element to the list. 2808 // 2809 devlock_elem *dep = new devlock_elem(client, nodeid, dev_name); 2810 devlock_list.prepend(dep); 2811 MOUNT_DBPRINTF( 2812 MOUNT_TRACE_SERVER, 2813 MOUNT_GREEN, 2814 ("server:ckpt_devlock added nid %d %s %p\n", 2815 nodeid, dev_name, dep)); 2816 } 2817 } 2818 2819 // 2820 // Unlock a device. 2821 // Note that if there are any nodes waiting for a device lock, it will 2822 // prevent switchover of the mount service since there will be an uncompleted 2823 // IDL call (which won't return until the lock is released). 2824 // 2825 void 2826 mount_server_impl::devunlock(const char *dev_name, Environment &_environment) 2827 { 2828 devlock_list_lock.wrlock(); 2829 2830 // Check to see if the device is locked. 2831 devlock_elem *dep = find_devlock(dev_name); 2832 if (dep != NULL) { 2833 // 2834 // Remove lock element from the list and checkpoint it. 
2835 // 2836 MOUNT_DBPRINTF( 2837 MOUNT_TRACE_SERVER, 2838 MOUNT_GREEN, 2839 ("server:devunlock %s unlock %p waiters %d\n", 2840 dev_name, dep, dep->nwaiters)); 2841 (void) devlock_list.erase(dep); 2842 get_checkpoint()->ckpt_devunlock(dev_name, _environment); 2843 ASSERT(_environment.exception() == NULL); 2844 2845 FAULTPT_PXFS(FAULTNUM_PXFS_DEVUNLOCK, FaultFunctions::generic); 2846 2847 dep->waiter_lock.lock(); 2848 if (dep->nwaiters != 0) { 2849 // This wakes up all waiting threads. 2850 dep->unlocked = true; 2851 dep->waiter_cv.broadcast(); 2852 dep->waiter_lock.unlock(); 2853 2854 // The last waiter will do the delete. 2855 } else { 2856 dep->waiter_lock.unlock(); 2857 delete dep; 2858 } 2859 } 2860 2861 devlock_list_lock.unlock(); 2862 } 2863 2864 // 2865 // Checkpoint a device unlock. 2866 // Helper function for checkpointing state on a secondary. 2867 // 2868 void 2869 mount_server_impl::ckpt_devunlock(const char *dev_name) 2870 { 2871 devlock_elem *dep = find_devlock(dev_name); 2872 if (dep != NULL) { 2873 // 2874 // Remove device lock element from the list. 
2875 // 2876 MOUNT_DBPRINTF( 2877 MOUNT_TRACE_SERVER, 2878 MOUNT_GREEN, 2879 ("server:ckpt_devunlock %s unlock %p\n", 2880 dev_name, dep)); 2881 (void) devlock_list.erase(dep); 2882 delete dep; 2883 } 2884 } 2885 2886 // 2887 // Get the nodeid of the node holding the lock on a device 2888 // 2889 void 2890 mount_server_impl::get_devlock_owner(const char *dev_name, 2891 sol::nodeid_t &lock_owner, Environment &_environment) 2892 { 2893 devlock_elem *dep; 2894 2895 devlock_list_lock.wrlock(); 2896 2897 // Check to see if the device is locked and has a owner 2898 dep = find_devlock(dev_name); 2899 if (dep == NULL) { 2900 lock_owner = 0; 2901 } else { 2902 lock_owner = dep->ownerid; 2903 } 2904 2905 devlock_list_lock.unlock(); 2906 2907 MOUNT_DBPRINTF( 2908 MOUNT_TRACE_SERVER, 2909 MOUNT_GREEN, 2910 ("server:devlock owner (%d) %s\n", 2911 lock_owner, dev_name)); 2912 } 2913 2914 // 2915 // Wake up any threads waiting for a device lock so a swithover can happen. 2916 // See comment in devlock() above. 2917 // 2918 void 2919 mount_server_impl::freeze_primary() 2920 { 2921 devlock_elem *dep; 2922 devlock_list_lock.wrlock(); 2923 frozen = true; 2924 for (devlock_list.atfirst(); 2925 (dep = devlock_list.get_current()) != NULL; 2926 devlock_list.advance()) { 2927 dep->waiter_lock.lock(); 2928 if (dep->nwaiters != 0) { 2929 MOUNT_DBPRINTF( 2930 MOUNT_TRACE_SERVER, 2931 MOUNT_GREEN, 2932 ("server:freeze wakeup %p waiters %d\n", 2933 dep, dep->nwaiters)); 2934 // This wakes up all waiting threads. 2935 dep->waiter_cv.broadcast(); 2936 } 2937 dep->waiter_lock.unlock(); 2938 } 2939 devlock_list_lock.unlock(); 2940 } 2941 2942 // 2943 // Clear frozen state. 2944 // 2945 void 2946 mount_server_impl::unfreeze_primary() 2947 { 2948 devlock_list_lock.wrlock(); 2949 frozen = false; 2950 devlock_list_lock.unlock(); 2951 } 2952 2953 // 2954 // Return a handle to the DCS configuration callback object. 
2955 // The callback object is called by DCS to notify us when the 2956 // device configuration changes. 2957 // 2958 fs::dc_callback_ptr 2959 mount_server_impl::get_dc_callback(Environment &_environment) 2960 { 2961 fs_list_lock.wrlock(); 2962 if (CORBA::is_nil(dc_callback_obj)) { 2963 dc_callback_obj = 2964 (new dc_callback_impl(*this, repl_serverp))->get_objref(); 2965 get_checkpoint()->ckpt_get_dc_callback(dc_callback_obj, 2966 _environment); 2967 ASSERT(_environment.exception() == NULL); 2968 } 2969 fs_list_lock.unlock(); 2970 2971 return (fs::dc_callback::_duplicate(dc_callback_obj)); 2972 } 2973 2974 // 2975 // Helper function for checkpointing state on a secondary. 2976 // 2977 void 2978 mount_server_impl::ckpt_get_dc_callback(fs::dc_callback_ptr cb) 2979 { 2980 if (CORBA::is_nil(dc_callback_obj)) { 2981 // Note that the "delete dcp;" is done by _unreferenced(). 2982 (void) new dc_callback_impl(*this, cb); 2983 dc_callback_obj = fs::dc_callback::_duplicate(cb); 2984 } 2985 } 2986 2987 // 2988 // Helper routine to update device configuration changes. 2989 // 2990 void 2991 mount_server_impl::notify_change(sol::dev_t gdev, 2992 const sol::nodeid_seq_t &nodes, Environment &env) 2993 { 2994 mount_client_elem *cep; 2995 fs_elem *fep; 2996 Environment e; 2997 2998 // 2999 // Search the list of globally mounted file systems for the device. 3000 // 3001 client_list_lock.wrlock(); 3002 fs_list_lock.wrlock(); 3003 for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL; 3004 fs_list.advance()) { 3005 3006 ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1); 3007 if (fep->fs_v1_info.fsdev != gdev) { 3008 continue; 3009 } 3010 // 3011 // We found the file system, start replicas on the new 3012 // nodes. Note that we don't try to shut down replicas 3013 // if the connection is removed. 
3014 // 3015 for (uint32_t i = 0; i < nodes.length(); i++) { 3016 bool fnd = false; 3017 for (uint32_t j = 0; j < fep->dev_nids.length(); j++) { 3018 if (nodes[i] == fep->dev_nids[j]) { 3019 fnd = true; 3020 break; 3021 } 3022 } 3023 if (fnd) { 3024 continue; 3025 } 3026 3027 // Find the mount client pointer for device node i. 3028 cep = find_client(nodes[i]); 3029 if (cep == NULL || cep->shutdown) { 3030 // 3031 // The device node isn't up 3032 // at the moment. 3033 // 3034 continue; 3035 } 3036 // XXX kcred use. 3037 solobj::cred_var credobj = 3038 solobj_impl::conv(kcred); 3039 cep->clientptr->reinstantiate_ha_v1(fep->ma, 3040 fep->fs_v1_ptr, credobj, fep->dev_name, e); 3041 // 3042 // We do the best we can to start a replica for 3043 // the new device connection but if there are errors, 3044 // we can't really do anything about it here. There 3045 // is a syslog message that is printed when a 3046 // new replica is started but if there was an error, 3047 // there may be no report of it. 3048 // XXX We could print a syslog error message here. 3049 // 3050 e.clear(); 3051 } 3052 fep->dev_nids = nodes; 3053 FAULTPT_PXFS(FAULTNUM_PXFS_DEVICE_CHANGED, 3054 FaultFunctions::generic); 3055 get_checkpoint()->ckpt_notify_change(gdev, nodes, env); 3056 ASSERT(env.exception() == NULL); 3057 } 3058 fs_list_lock.unlock(); 3059 client_list_lock.unlock(); 3060 } 3061 3062 // 3063 // Checkpoint a change in the disk connection list. 3064 // 3065 void 3066 mount_server_impl::ckpt_notify_change(sol::dev_t gdev, 3067 const sol::nodeid_seq_t &dev_nids) 3068 { 3069 fs_elem *fep; 3070 3071 // 3072 // Search the list of globally mounted file systems for the device. 3073 // 3074 for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL; 3075 fs_list.advance()) { 3076 if (fep->fs_v1_info.fsdev != gdev) { 3077 continue; 3078 } 3079 fep->dev_nids = dev_nids; 3080 } 3081 } 3082 3083 // 3084 // Helper routine to check if device is in use. 
3085 // 3086 bool 3087 mount_server_impl::still_active(sol::dev_t gdev) 3088 { 3089 fs_elem *fep; 3090 bool inuse = false; 3091 3092 // 3093 // Search the list of globally mounted file systems for the device. 3094 // 3095 fs_list_lock.wrlock(); 3096 for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL; 3097 fs_list.advance()) { 3098 if (fep->fs_v1_info.fsdev == gdev) { 3099 inuse = true; 3100 break; 3101 } 3102 } 3103 fs_list_lock.unlock(); 3104 3105 return (inuse); 3106 } 3107 3108 // 3109 // Helper function for dumping state to a new secondary. 3110 // 3111 void 3112 mount_server_impl::dump_state(repl_pxfs::mount_replica_ptr ckptp, 3113 Environment &env) 3114 { 3115 MOUNT_DBPRINTF( 3116 MOUNT_TRACE_SERVER, 3117 MOUNT_GREEN, 3118 ("server:dump_state\n")); 3119 3120 // Create a new mount server on the secondary. 3121 CORBA::type_info_t *typ = fs::mount_server::_get_type_info( 3122 mount_vp_to_idl[repl_serverp->current_version.major_num] 3123 [repl_serverp->current_version.minor_num].ms); 3124 fs::mount_server_var fsls = get_objref(typ); 3125 ckptp->ckpt_new_server(fsls, env); 3126 if (env.exception()) { 3127 #ifdef DEBUG 3128 env.exception()->print_exception 3129 ("mount_server_impl:dump_state " 3130 "ckpt_new_server"); 3131 #endif 3132 MOUNT_DBPRINTF( 3133 MOUNT_TRACE_SERVER, 3134 MOUNT_RED, 3135 ("server:dump_state(%p): " 3136 "exception '%s' while calling ckpt_new_server().\n", 3137 this, env.exception()->_name())); 3138 ASSERT(CORBA::COMM_FAILURE::_exnarrow(env.exception())); 3139 return; 3140 } 3141 // 3142 // Dump the current list of mount clients. 3143 // 3144 mount_client_elem *cep; 3145 3146 for (client_list.atfirst(); (cep = client_list.get_current()) != NULL; 3147 client_list.advance()) { 3148 // 3149 // Note that the mount_client_elem can get a second 3150 // call to _unreferenced() if 'unref' is set since we 3151 // get a new reference here and then release it. 
3152 // We need to be sure to not delete the object until 3153 // _unreferenced() is called again. 3154 // 3155 fs::mount_client_died_var clobj = cep->get_objref(); 3156 ckptp->ckpt_add_client(clobj, cep->clientptr, cep->nodeid, 3157 cep->shutdown, env); 3158 if (env.exception()) { 3159 #ifdef DEBUG 3160 env.exception()->print_exception 3161 ("mount_server_impl:dump_state " 3162 "ckpt_add_client"); 3163 #endif 3164 MOUNT_DBPRINTF( 3165 MOUNT_TRACE_SERVER, 3166 MOUNT_RED, 3167 ("server:dump_state(%p): " 3168 "exception '%s' while calling ckpt_add_client().\n", 3169 this, env.exception()->_name())); 3170 ASSERT(CORBA::COMM_FAILURE::_exnarrow(env.exception())); 3171 return; 3172 } 3173 } 3174 3175 // 3176 // Dump the current list of mounted file systems. 3177 // 3178 fs_elem *fep; 3179 for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL; 3180 fs_list.advance()) { 3181 ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1); 3182 ckptp->ckpt_mount_v1(fep->fs_v1_ptr, fep->fs_v1_info, 3183 fep->ma, 3184 fep->mntoptions, fep->dev_is_ha, fep->dev_name, 3185 fep->dev_nids, env); 3186 if (env.exception()) { 3187 #ifdef DEBUG 3188 env.exception()->print_exception 3189 ("mount_server_impl:dump_state " 3190 "ckpt_mount"); 3191 #endif 3192 MOUNT_DBPRINTF( 3193 MOUNT_TRACE_SERVER, 3194 MOUNT_RED, 3195 ("server:dump_state(%p): " 3196 "exception '%s' while calling ckpt_mount().\n", 3197 this, env.exception()->_name())); 3198 ASSERT(CORBA::COMM_FAILURE::_exnarrow(env.exception())); 3199 return; 3200 } 3201 } 3202 3203 // 3204 // Dump the list of device locks. 
3205 // 3206 devlock_elem *dep; 3207 for (devlock_list.atfirst(); 3208 (dep = devlock_list.get_current()) != NULL; 3209 devlock_list.advance()) { 3210 ckptp->ckpt_devlock(dep->owner, dep->ownerid, dep->spec, env); 3211 if (env.exception()) { 3212 #ifdef DEBUG 3213 env.exception()->print_exception 3214 ("mount_server_impl:dump_state " 3215 "ckpt_dev_lock"); 3216 #endif 3217 MOUNT_DBPRINTF( 3218 MOUNT_TRACE_SERVER, 3219 MOUNT_RED, 3220 ("server:dump_state(%p): " 3221 "exception '%s' while calling ckpt_devlock().\n", 3222 this, env.exception()->_name())); 3223 ASSERT(CORBA::COMM_FAILURE::_exnarrow(env.exception())); 3224 return; 3225 } 3226 } 3227 3228 // 3229 // Create a dc_callback object if needed. 3230 // 3231 if (!CORBA::is_nil(dc_callback_obj)) { 3232 ckptp->ckpt_get_dc_callback(dc_callback_obj, env); 3233 } 3234 } 3235 3236 // 3237 // Helper routine to find a mount_client_elem given a node number. 3238 // The client_list_lock should be held before calling this on the primary. 3239 // 3240 mount_client_elem * 3241 mount_server_impl::find_client(nodeid_t nid) 3242 { 3243 mount_client_elem *cep; 3244 3245 ASSERT(!primary || client_list_lock.write_held()); 3246 3247 client_list.atfirst(); 3248 while ((cep = client_list.get_current()) != NULL) { 3249 client_list.advance(); 3250 if (cep->nodeid == nid) { 3251 return (cep); 3252 } 3253 } 3254 return (NULL); 3255 } 3256 3257 // 3258 // Helper routine to find a mount_client_elem given a mount client. 3259 // The client_list_lock should be held before calling this on the primary. 
3260 // 3261 mount_client_elem * 3262 mount_server_impl::find_client(fs::mount_client_ptr c) 3263 { 3264 mount_client_elem *cep; 3265 3266 ASSERT(!primary || client_list_lock.write_held()); 3267 3268 client_list.atfirst(); 3269 while ((cep = client_list.get_current()) != NULL) { 3270 client_list.advance(); 3271 if (cep->clientptr->_equiv(c)) { 3272 return (cep); 3273 } 3274 } 3275 return (NULL); 3276 } 3277 3278 // 3279 // Return the fs_elem for fs or NULL if not in fs_list. 3280 // Must be called with the fs_list_lock held if called on the primary. 3281 // 3282 fs_elem * 3283 mount_server_impl::find_fs(pxfs_v1::filesystem_ptr fsptr) 3284 { 3285 fs_elem *fep; 3286 3287 ASSERT(!primary || fs_list_lock.write_held()); 3288 3289 for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL; 3290 fs_list.advance()) { 3291 if (!CORBA::is_nil(fep->fs_v1_ptr) && 3292 fep->fs_v1_ptr->_equiv(fsptr)) { 3293 return (fep); 3294 } 3295 } 3296 3297 return (NULL); 3298 } 3299 3300 // 3301 // Return the fs_elem for fs or NULL if not in fs_list. 3302 // Must be called with the fs_list_lock held if called on the primary. 3303 // 3304 fs_elem * 3305 mount_server_impl::find_fs(const char *spec) 3306 { 3307 fs_elem *fep; 3308 3309 ASSERT(!primary || fs_list_lock.write_held()); 3310 3311 for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL; 3312 fs_list.advance()) { 3313 if (strcmp(fep->ma.spec, spec) == 0) { 3314 return (fep); 3315 } 3316 } 3317 3318 return (NULL); 3319 } 3320 3321 // 3322 // Return the devlock_elem or NULL if not in devlock_list. 3323 // Must be called with the devlock_list_lock held if called on the primary. 
3324 // 3325 devlock_elem * 3326 mount_server_impl::find_devlock(const char *spec) 3327 { 3328 devlock_elem *dep; 3329 3330 ASSERT(!primary || devlock_list_lock.write_held()); 3331 3332 for (devlock_list.atfirst(); 3333 (dep = devlock_list.get_current()) != NULL; 3334 devlock_list.advance()) { 3335 if (strcmp(dep->spec, spec) == 0) { 3336 return (dep); 3337 } 3338 } 3339 3340 return (NULL); 3341 } 3342 3343 // 3344 // Return true if the given node is blocked waiting for any lock. 3345 // Must be called with the devlock_list_lock held if called on the primary. 3346 // 3347 bool 3348 mount_server_impl::find_devlock_waiter(sol::nodeid_t nodeid) 3349 { 3350 devlock_elem *dep; 3351 3352 ASSERT(!primary || devlock_list_lock.write_held()); 3353 3354 for (devlock_list.atfirst(); 3355 (dep = devlock_list.get_current()) != NULL; 3356 devlock_list.advance()) { 3357 if (dep->waiters.test(nodeid - 1)) { 3358 return (true); 3359 } 3360 } 3361 3362 return (false); 3363 } 3364 3365 // 3366 // Helper function to catch the case where a user issues 3367 // multiple remount commands at the same time. 
3368 // 3369 void 3370 mount_server_impl::check_multiple_remounts(const char *pathp, 3371 Environment &_environment) 3372 { 3373 char *pathmax = NULL, *pathmin = NULL; 3374 size_t minlen = 0; 3375 3376 current_mount_lock.lock(); 3377 3378 while (currentmnt) { 3379 if (strlen(pathp) > strlen(currentmnt)) { 3380 pathmax = (char *)pathp; // const string 3381 pathmin = currentmnt; 3382 } else { 3383 pathmax = currentmnt; 3384 pathmin = (char *)pathp; // const string 3385 } 3386 3387 minlen = strlen(pathmin); 3388 3389 // Use strncmp() so that nested mounts can be compared 3390 if (strncmp(pathmin, pathmax, minlen) == 0 && 3391 (pathmax[minlen] == '/' || pathmax[minlen] == '\0')) { 3392 sol::error_t error = EBUSY; 3393 3394 pxfslib::throw_exception(_environment, error); 3395 3396 current_mount_lock.unlock(); 3397 3398 return; 3399 } 3400 currentmnt_cv.wait(¤t_mount_lock); 3401 3402 } 3403 currentmnt = os::strdup(pathp); 3404 3405 current_mount_lock.unlock(); 3406 } 3407 3408 // 3409 // An HA filesystem is considered available if there is at least one existing 3410 // primary or secondary replica. This function return availability status. 3411 // 3412 //lint -e1038 3413 mount_server_impl::fs_status_t 3414 mount_server_impl::get_fs_status(fs_elem *fep) 3415 { 3416 //lint +e1038 3417 Environment e; 3418 ASSERT(fep->dev_is_ha); 3419 3420 // Get a reference to the service admin. 3421 replica::service_admin_var sa = 3422 pxfslib::get_service_admin_ref("mount_server_impl::mount", 3423 (const char *)fep->ma.spec, e); 3424 if (e.exception()) { 3425 e.exception()->print_exception 3426 ("mount_server_impl:get_fs_status " 3427 "get_service_admin_ref"); 3428 MOUNT_DBPRINTF( 3429 MOUNT_TRACE_SERVER, 3430 MOUNT_RED, 3431 ("server:get_fs_status " 3432 "get_service_admin_ref(%s) exception\n", 3433 (const char *)fep->ma.spec)); 3434 return (ERROR); 3435 } 3436 e.clear(); 3437 3438 // 3439 // Get information about the filesystem replicas 3440 // from the service admin. 
3441 // 3442 replica::repl_prov_seq_var repl_provs; 3443 sa->get_repl_provs(repl_provs, e); 3444 if (e.exception()) { 3445 e.exception()->print_exception 3446 ("mount_sever_impl:get_fs_status " 3447 "get_repl_provs"); 3448 MOUNT_DBPRINTF( 3449 MOUNT_TRACE_SERVER, 3450 MOUNT_RED, 3451 ("sever:get_fs_status " 3452 "get_repl_provs returned exception\n")); 3453 return (ERROR); 3454 } 3455 3456 // 3457 // Check if primary or secondary replica exists, 3458 // 3459 uint_t len = repl_provs.length(); 3460 for (uint_t i = 0; i < len; i++) { 3461 if ((repl_provs[i].curr_state 3462 == replica::AS_PRIMARY) || 3463 (repl_provs[i].curr_state 3464 == replica::AS_SECONDARY)) { 3465 // 3466 // From the service admin we found out that, 3467 // there is a potential replica that could 3468 // serve out the filesystem. 3469 // 3470 // 3471 MOUNT_DBPRINTF( 3472 MOUNT_TRACE_SERVER, 3473 MOUNT_GREEN, 3474 ("server:get_fs_status " 3475 "%s can host %s\n", 3476 (const char*)repl_provs[i] 3477 .repl_prov_desc, 3478 (const char *)fep->ma.spec)); 3479 return (AVAILABLE); 3480 } 3481 } 3482 return (NOT_AVAILABLE); 3483 } 3484 3485 // 3486 // Checkpoint the upgrade of a mount_client in client list. 3487 // 3488 void 3489 mount_server_impl::ckpt_upgrade_client_list(fs::mount_client_ptr 3490 client_p, sol::nodeid_t nodeid) 3491 { 3492 mount_client_elem *cep; 3493 3494 if ((cep = find_client(nodeid)) != NULL) { 3495 // 3496 // We found the guy we're looking for. 3497 // 3498 cep->clientptr = fs::mount_client::_duplicate(client_p); 3499 } else { 3500 // 3501 // Notify that we are not able to find the guy. 3502 // 3503 MOUNT_DBPRINTF( 3504 MOUNT_TRACE_SERVER, 3505 MOUNT_RED, 3506 ("server:ckpt_upgrade_client_list" 3507 "Failed to find required mount_client_elem" 3508 "with nodeid %d\n", nodeid)); 3509 } 3510 } 3511 3512 // 3513 // Checkpoint the upgrade of a mount_client in devlock list. 
3514 // 3515 void 3516 mount_server_impl::ckpt_upgrade_devlock_list(const char *dev_name, 3517 fs::mount_client_ptr client_p) 3518 { 3519 devlock_elem *dep; 3520 3521 if ((dep = find_devlock(dev_name)) != NULL) { 3522 // 3523 // We found the guy we're looking for. 3524 // 3525 dep->owner = fs::mount_client::_duplicate(client_p); 3526 } else { 3527 // 3528 // Notify that we are not able to find the guy. 3529 // 3530 MOUNT_DBPRINTF( 3531 MOUNT_TRACE_SERVER, 3532 MOUNT_RED, 3533 ("server:ckpt_upgrade_devlock_list" 3534 "Failed to find required devlock_elem" 3535 "with devname %s\n", dev_name)); 3536 } 3537 } 3538