1 // 2 // CDDL HEADER START 3 // 4 // The contents of this file are subject to the terms of the 5 // Common Development and Distribution License (the License). 6 // You may not use this file except in compliance with the License. 7 // 8 // You can obtain a copy of the license at usr/src/CDDL.txt 9 // or http://www.opensolaris.org/os/licensing. 10 // See the License for the specific language governing permissions 11 // and limitations under the License. 12 // 13 // When distributing Covered Code, include this CDDL HEADER in each 14 // file and include the License file at usr/src/CDDL.txt. 15 // If applicable, add the following below this CDDL HEADER, with the 16 // fields enclosed by brackets [] replaced with your own identifying 17 // information: Portions Copyright [yyyy] [name of copyright owner] 18 // 19 // CDDL HEADER END 20 // 21 22 // 23 // Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 // Use is subject to license terms. 25 // 26 27 #pragma ident "@(#)repl_pxfs_server.cc 1.21 08/05/20 SMI" 28 29 #include <sys/pathname.h> 30 #include <sys/fcntl.h> 31 #include <sys/mount.h> 32 #include <sys/dnlc.h> 33 34 #include <sys/sol_conv.h> 35 #include <nslib/ns.h> 36 #include <solobj/solobj_impl.h> 37 #include <h/repl_pxfs.h> 38 39 #include "../version.h" 40 #include <pxfs/mount/mount_debug.h> 41 #include <pxfs/lib/pxfs_debug.h> 42 #include <pxfs/server/repl_pxfs_server.h> 43 #include <pxfs/server/fs_impl.h> 44 #include <pxfs/server/fsmgr_server_impl.h> 45 #include <pxfs/server/unixdir_impl.h> 46 #include <pxfs/server/file_impl.h> 47 #include <pxfs/server/symlink_impl.h> 48 #include <pxfs/server/io_impl.h> 49 #include <pxfs/server/unixdir_ckpt.h> 50 #include <pxfs/server/fobj_trans_states.h> 51 52 #ifndef VXFS_DISABLED 53 #include <pxfs/server/vxfs_dependent_impl.h> 54 #endif 55 56 //lint -e1512 57 // 58 // Warning(1512) destructor for base class is not virtual -- In a 59 // final pass through all the classes, we have found a class that is 60 // the base class of a derivation and has a destructor but the 61 // destructor is not virtual. It is conventional for inherited classes 62 // to have virtual destructors so that is it safe to 'delete' a 63 // pointer to a base class. 64 // 65 66 // 67 // This struct is used for building a table for supporting the mapping 68 // from version protocol spec file version number to the various IDL 69 // interface versions it represents. The table will be a two dimensional 70 // array indexed by major/minor vp version. 71 // 72 typedef struct { // idl interfaces 73 int fs; // pxfs_v1.idl 74 int fs_ckpt; // repl_pxfs_v1.idl 75 } pxfs_ver_map_t; 76 77 // 78 // These are the current maximum indexes used for accessing the vp to idl 79 // version table. 80 // 81 const int PXFS_VP_MAX_MAJOR = 2; 82 const int PXFS_VP_MAX_MINOR = 2; 83 84 // 85 // pxfs_ver_map_t is a struct type which has has an entries for each IDL 86 // interface which is being versioned. For a given VP major and minor version, 87 // we get the IDL version of those interfaces. 88 // 89 // Note: this corresponds to the versions supported by the old 90 // version of pxfs in pxfs_v1.idl and repl_pxfs_v1.idl 91 // 92 pxfs_ver_map_t pxfs_vp_to_idl[PXFS_VP_MAX_MAJOR + 1][PXFS_VP_MAX_MINOR +1] = { 93 { { 0, 0 }, // VP Version 0.0 defined for indexing 94 { 0, 0 }, // VP Version 0.1 " " " 95 { 0, 0 } }, // VP Version 0.2 " " " 96 { { 0, 0 }, // VP Version 1.0 97 { 0, 0 }, // VP Version 1.1 98 { 0, 0 } }, // VP Version 1.2 99 { { 0, 0 }, // VP Version 2.0 Object Consolidation 100 { 3, 3 }, // VP Version 2.1 RU support for 6496894/6493901 101 { 3, 3 } } // VP Version 2.2 currently unused 102 }; 103 104 // Initialize the static member variable unique_integer 105 int repl_pxfs_server::unique_integer = 0; 106 107 // 108 // Class repl_pxfs_server. 109 // 110 111 // 112 // Create a repl_pxfs_server object. 113 // 114 repl_pxfs_server::repl_pxfs_server(vnode_t *mvp, const sol::mounta &ma, 115 cred_t *cr, const char *id) : 116 mountdata(ma), 117 repl_server<REPL_PXFS_VER::fs_replica>(ma.spec, id), 118 _ckpt_proxy(NULL), 119 replica_state(NOT_PRIMARY), 120 active_invo_count(0) 121 { 122 ASSERT(mountdata.flags & MS_SYSSPACE); 123 ASSERT((mountdata.flags & MS_REMOUNT) == 0); 124 125 // 126 // We set MS_NOSPLICE so that the underlying file system isn't 127 // linked into the file system name space. 128 // XXX We also force the MS_OVERLAY flag on to suppress the 129 // mvp->v_count == 1 EBUSY check in the file system code. 130 // 131 mountdata.flags |= MS_NOSPLICE | MS_OVERLAY; 132 133 mnt_vp = mvp; 134 VN_HOLD(mnt_vp); 135 136 crp = cr; 137 crhold(crp); 138 139 fsp = NULL; 140 mnt_error = 0; 141 fs_is_unmounted = false; 142 143 // 144 // Initialize with invalid versions. Correct values will 145 // be obtained by a query of the version manager. 146 // 147 current_version.major_num = 0; 148 current_version.minor_num = 0; 149 pending_version.major_num = 0; 150 pending_version.minor_num = 0; 151 } 152 153 repl_pxfs_server::~repl_pxfs_server() 154 { 155 if (mnt_vp != NULL) { 156 VN_RELE(mnt_vp); 157 } 158 crfree(crp); 159 160 CORBA::release(_ckpt_proxy); 161 _ckpt_proxy = nil; 162 } //lint !e1540 !e1740 pointers are neither freed nor zero'ed by destructor 163 164 REPL_PXFS_VER::fs_replica_ptr 165 repl_pxfs_server::get_checkpoint_fs_replica() 166 { 167 ASSERT(!CORBA::is_nil(_ckpt_proxy)); 168 return (_ckpt_proxy); 169 } 170 171 // 172 // Helper function to get the mount error (if any). 173 // 174 int 175 repl_pxfs_server::get_mount_error() const 176 { 177 return (mnt_error); 178 } 179 180 // 181 // Become the primary. 182 // Note that previously we might have been newly created or 183 // a secondary that is switching to primary. 184 // 185 void 186 repl_pxfs_server::become_primary(const replica::repl_name_seq &, 187 Environment &_environment) 188 { 189 // Show that we are the primary 190 active_invo_lock.lock(); 191 replica_state = PRIMARY; 192 active_invo_lock.unlock(); 193 194 ASSERT(mountdata.flags & MS_SYSSPACE); 195 ASSERT((mountdata.flags & MS_REMOUNT) == 0); 196 ASSERT(mountdata.flags & MS_OVERLAY); 197 ASSERT(mountdata.flags & MS_NOSPLICE); 198 199 // First, initialize the checkpoint proxy. 200 version_lock.wrlock(); 201 202 // Callback may have occured when this replica was a secondary 203 if (pending_version.major_num != 0) { 204 current_version = pending_version; 205 } 206 207 // Save the current _ckpt_proxy and release it after we get a new one 208 REPL_PXFS_VER::fs_replica_ptr old_ckpt_p = _ckpt_proxy; 209 210 CORBA::type_info_t *typ = REPL_PXFS_VER::fs_replica::_get_type_info( 211 pxfs_vp_to_idl[current_version.major_num] 212 [current_version.minor_num].fs_ckpt); 213 replica::checkpoint_var tmp_ckpt_v = set_checkpoint(typ); 214 _ckpt_proxy = REPL_PXFS_VER::fs_replica::_narrow(tmp_ckpt_v); 215 216 ASSERT(!CORBA::is_nil(_ckpt_proxy)); 217 218 // Release the reference to the old ckpt_proxy. 219 CORBA::release(old_ckpt_p); 220 old_ckpt_p = REPL_PXFS_VER::fs_replica::_nil(); 221 222 // 223 // If we were a secondary, there may be an uprocessed upgrade 224 // callback pending. 225 // 226 if (fsp != NULL) { 227 if (pending_version.major_num != 0) { 228 pending_version.major_num = 0; 229 230 // Update the server reference. 231 typ = PXFS_VER::filesystem::_get_type_info( 232 pxfs_vp_to_idl[current_version.major_num] 233 [current_version.minor_num].fs); 234 fs_v = fsp->get_objref(typ); 235 236 // Checkpoint current version. 237 _ckpt_proxy->ckpt_service_version( 238 current_version.major_num, 239 current_version.minor_num, _environment); 240 } 241 } 242 243 // 244 // If a switchover and unmount happen simultaneously, the two 245 // threads can race each other. The unmount thread asks the clients 246 // (including the secondary) to unmount and clear their respective 247 // vfs pointers. At this point the file system is not dead. The 248 // switchover can proceed and try to make the existing secondary a 249 // primary. This secondary's vfs has been cleared by the unmount 250 // and we will panic. 251 // 252 // We resolve this race using the 'fs_is_unmounted' flag. If it is 253 // set, we have commenced unmounting or has partly unmounted the 254 // filesystem. In that case we return after logging an error. The 255 // switchover will fail. The swithover will have to be retried. 256 // 257 if (fs_is_unmounted) { 258 MOUNT_DBPRINTF( 259 MOUNT_TRACE_REPLICA, 260 MOUNT_RED, 261 ("repl_pxfs_server::become_primary %s already unmounted\n", 262 (const char *)mountdata.dir)); //lint !e1776 263 _environment.clear(); 264 265 char nodenum[32]; 266 267 (void) sprintf(nodenum, "Node (%u)", orb_conf::node_number()); 268 os::sc_syslog_msg msg(SC_SYSLOG_GLOBAL_MOUNT_TAG, 269 nodenum, NULL); 270 // 271 // SCMSGS 272 // @explanation 273 // This is an error due to a simultaneous switchover and 274 // unmount. The switchover fails and unmount succeeds. 275 // @user_action 276 // The switchover has to be retried. 277 // 278 msg.log(SC_SYSLOG_WARNING, MESSAGE, 279 "switchover failed since the file system at %s is " 280 "being unmounted.", (const char *)mountdata.dir); 281 version_lock.unlock(); 282 return; 283 } 284 285 if (mnt_error != 0) { 286 ASSERT(fsp != NULL); 287 ASSERT(fsp->get_vfsp() == NULL); 288 MOUNT_DBPRINTF( 289 MOUNT_TRACE_REPLICA, 290 MOUNT_RED, 291 ("repl_pxfs_server::become_primary %s err %d\n", 292 (const char *)mountdata.dir, mnt_error)); 293 // 294 // We have already tried to mount the file system and 295 // got an error or some other unrecoverable situation. 296 // Wait until we are unmounted and mounted again. 297 // 298 version_lock.unlock(); 299 return; 300 } 301 302 // 303 // Set the MS_NOCHECK flag if this is a failover or switchover 304 // (i.e., the PXFS file system is already mounted). This flag is used 305 // to tell the underlying file system to suppress checking for other 306 // mounted file systems with the same device since otherwise it would 307 // see the PXFS proxy and think the device is already mounted. 308 // 309 if (fsp != NULL) { 310 // 311 // Check if we are being called after a failure of a call to 312 // become_secondary(). 313 // 314 if (fsp->get_vfsp() != NULL) { 315 mnt_error = fsp->convert_to_primary(false); 316 if (mnt_error != 0) { 317 // 318 // SCMSGS 319 // @explanation 320 // The file system specified in the message 321 // could not be hosted on the node the message 322 // came from. 323 // @user_action 324 // Check /var/adm/messages to make sure there 325 // were no device errors. If not, contact your 326 // authorized Sun service provider to 327 // determine whether a workaround or patch is 328 // available. 329 // 330 (void) fsp->msg().log(SC_SYSLOG_WARNING, 331 MESSAGE, "Switchover (%s) error (%d) after " 332 "failure to become secondary", 333 (const char *)(mountdata.dir), mnt_error); 334 fsp->get_checkpoint()->ckpt_mnt_error(mnt_error, 335 _environment); 336 ASSERT(_environment.exception() == NULL); 337 MOUNT_DBPRINTF( 338 MOUNT_TRACE_REPLICA, 339 MOUNT_RED, 340 ("repl_pxfs_server::become_primary %s " 341 "become_secondary err %d\n", 342 (const char *)mountdata.dir, mnt_error)); 343 } 344 version_lock.unlock(); 345 return; 346 } 347 348 mountdata.flags |= MS_NOCHECK; 349 350 // 351 // Run the user-level commands needed to prepare the device 352 // we are going to mount (does the fsck if necessary). 353 // 354 char name[20]; 355 Environment e; 356 naming::naming_context_var ctxp = ns::root_nameserver(); 357 os::sprintf(name, "ha_mounter.%d", orb_conf::node_number()); 358 CORBA::Object_var obj = ctxp->resolve(name, e); 359 if (e.exception()) { 360 e.clear(); 361 // 362 // SCMSGS 363 // @explanation 364 // The file system specified in the message could not 365 // be hosted on the node the message came from. Check 366 // to see if the user program "clexecd" is running on 367 // that node. 368 // @user_action 369 // Contact your authorized Sun service provider to 370 // determine whether a workaround or patch is 371 // available. 372 // 373 (void) fsp->msg().log(SC_SYSLOG_WARNING, MESSAGE, 374 "Switchover error (%s): cannot " 375 "find clexecd", (const char *)(mountdata.dir)); 376 mnt_error = EINVAL; 377 fsp->get_checkpoint()->ckpt_mnt_error(mnt_error, 378 _environment); 379 ASSERT(_environment.exception() == NULL); 380 version_lock.unlock(); 381 return; 382 } 383 repl_pxfs::ha_mounter_var mounter = 384 repl_pxfs::ha_mounter::_narrow(obj); 385 ASSERT(!CORBA::is_nil(mounter)); 386 mounter->mount(mountdata.spec, mountdata.fstype, 387 fsp->get_options(), e); 388 if (e.exception()) { 389 e.clear(); 390 // 391 // SCMSGS 392 // @explanation 393 // The file system specified in the message could not 394 // be hosted on the node the message came from because 395 // an fsck on the file system revealed errors. 396 // @user_action 397 // Unmount the PXFS file system (if mounted), fsck the 398 // device, and then mount the PXFS file system again. 399 // 400 (void) fsp->msg().log(SC_SYSLOG_WARNING, MESSAGE, 401 "Switchover error (%s): failed to fsck disk", 402 (const char *)(mountdata.dir)); 403 mnt_error = EINVAL; 404 fsp->get_checkpoint()->ckpt_mnt_error(mnt_error, 405 _environment); 406 ASSERT(_environment.exception() == NULL); 407 version_lock.unlock(); 408 return; 409 } 410 } else { 411 mountdata.flags &= ~MS_NOCHECK; 412 413 // 414 // We turn off MS_GLOBAL, as we are mounting the underlying 415 // filesystem locally. With Solaris 9 build 58, Solaris 416 // disables mount in progress checks if MS_GLOBAL is specified. 417 // We have to make sure that MS_GLOBAL is turned off here, as we 418 // want these checks to be made here. These checks make sure 419 // that if a global mount and a local mount happen 420 // concurrently, and are trying to mount the same device, 421 // on different mount-points, only one of them succeeds. 422 // 423 mountdata.flags &= ~MS_GLOBAL; 424 } 425 426 // 427 // We could end up with mnt_vp == NULL in cases where this 428 // repl_pxfs_server was just being added to the repl_prov_list 429 // and a reconfig triggered by a failure could come in and 430 // shut this down (refer to rm_state_machine::cleanup_providers()). 431 // 432 // Hence, we re-initialize it if necessary. 433 // 434 if (mnt_vp == NULL) { 435 vnode_t *vp = NULL; 436 int error = lookupname(((sol::mounta &)mountdata).dir, 437 UIO_SYSSPACE, FOLLOW, NULL, &vp); 438 439 if (error != 0) { 440 MOUNT_DBPRINTF( 441 MOUNT_TRACE_REPLICA, 442 MOUNT_RED, 443 ("repl_pxfs_server::become_primary %s err %d" 444 "lookupname() failed\n", 445 (const char *)mountdata.dir, error)); 446 pxfslib::throw_exception(_environment, error); 447 return; 448 } 449 mnt_vp = vp; 450 VN_HOLD(mnt_vp); 451 } 452 453 dnlc_purge_vp(mnt_vp); 454 455 int datalen; 456 457 #ifndef VXFS_DISABLED 458 if (strcmp(mountdata.fstype, "vxfs") == 0) { 459 datalen = vxfs_dependent_impl::vxfs_fixup_args(mountdata, 460 (fsp == NULL) ? vxfs_dependent_impl::VX_MOUNT : 461 vxfs_dependent_impl::VX_FAILOVER); 462 if (datalen == -1) { 463 pxfslib::throw_exception(_environment, ENOENT); 464 return; 465 } 466 } else { 467 datalen = (int)mountdata.data.length(); 468 } 469 #else 470 datalen = (int)mountdata.data.length(); 471 #endif 472 473 // 474 // Mount the underlying file system but don't link it into the 475 // name space. 476 // 477 vfs_t *vfsp = NULL; 478 struct mounta mnta; 479 char *options; 480 mnta.spec = mountdata.spec; 481 mnta.dir = mountdata.dir; 482 mnta.flags = mountdata.flags; 483 mnta.fstype = mountdata.fstype; 484 mnta.dataptr = (char *)mountdata.data.buffer(); 485 mnta.datalen = datalen; 486 int len; 487 if (mnta.flags & MS_OPTIONSTR) { 488 len = (int)mountdata.options.length(); 489 //lint -e571 This is ok to loose the sign in this cast. 490 options = new char [(size_t)len]; 491 //lint +e571 492 mnta.optptr = os::strcpy(options, 493 (const char *)mountdata.options.buffer()); 494 mnta.optlen = len; 495 // 496 // Strip "global" from the options list, 497 // if it happens to be specified. 498 // This is because the underlying mount 499 // is a local mount. 500 // 501 (void) pxfslib::exists_mntopt(options, "global", true); 502 } else { 503 len = MAX_MNTOPT_STR; 504 //lint -e571 This is ok to loose the sign in this cast. 505 options = new char [(size_t)len]; 506 //lint +e571 507 mnta.optptr = NULL; 508 mnta.optlen = 0; 509 } 510 #ifdef _FAULT_INJECTION 511 void *f_argp; 512 uint32_t f_argsize; 513 if (fault_triggered(FAULTNUM_PXFS_DOMOUNT, &f_argp, &f_argsize)) { 514 ASSERT(f_argsize == sizeof (int)); 515 mnt_error = *((int *)f_argp); 516 } else 517 #endif 518 mnt_error = domount(mnta.fstype, &mnta, mnt_vp, crp, &vfsp); 519 if (mnt_error == 0 && (mnta.flags & MS_OPTIONSTR) == 0) { 520 mnt_error = vfs_buildoptionstr(&vfsp->vfs_mntopts, options, 521 len); 522 } 523 if (!mnt_error) { 524 (void) pxfslib::exists_mntopt(options, "noglobal", true); 525 (void) strcat(options, ",global"); //lint !e668 526 } 527 if (mnt_error != 0) { 528 delete [] options; 529 MOUNT_DBPRINTF( 530 MOUNT_TRACE_REPLICA, 531 MOUNT_RED, 532 ("repl_pxfs_server::become_primary %s domount err %d\n", 533 (const char *)mountdata.dir, mnt_error)); 534 // 535 // Create a "dead" file system object just to return 536 // the error from domount(). 537 // 538 if (fsp == NULL) { 539 fsp = new fs_repl_impl(NULL, mountdata.fstype, 540 mountdata.spec, NULL, this); 541 typ = PXFS_VER::filesystem::_get_type_info( 542 pxfs_vp_to_idl[current_version.major_num] 543 [current_version.minor_num].fs); 544 fs_v = fsp->get_objref(typ); 545 fsp->get_checkpoint()->ckpt_new_fsobj(fs_v, NULL, 546 _environment); 547 ASSERT(_environment.exception() == NULL); 548 } else { 549 // 550 // We failed while re-mounting the FS after a 551 // switchover/ failover. Syslog this fact. 552 // 553 554 // 555 // SCMSGS 556 // @explanation 557 // The file system specified in the message could not 558 // be hosted on the node the message came from. 559 // @user_action 560 // Check /var/adm/messages to make sure there were no 561 // device errors. If not, contact your authorized Sun 562 // service provider to determine whether a workaround 563 // or patch is available. 564 // 565 (void) fsp->msg().log(SC_SYSLOG_WARNING, MESSAGE, 566 "Switchover error (%s): failed to mount FS (%d)", 567 (const char *)(mountdata.dir), mnt_error); 568 } 569 fsp->get_checkpoint()->ckpt_mnt_error(mnt_error, _environment); 570 ASSERT(_environment.exception() == NULL); 571 version_lock.unlock(); 572 return; 573 } 574 575 // 576 // If this is the first time, we need to create the root fs object. 577 // Locking is handled by the replica manager framework. 578 // 579 bool firsttime; 580 if (fsp == NULL) { 581 fsp = new fs_repl_impl(vfsp, mountdata.fstype, 582 mountdata.spec, options, this); 583 typ = PXFS_VER::filesystem::_get_type_info( 584 pxfs_vp_to_idl[current_version.major_num] 585 [current_version.minor_num].fs); 586 fs_v = fsp->get_objref(typ); 587 fsp->get_checkpoint()->ckpt_new_fsobj(fs_v, options, 588 _environment); 589 ASSERT(_environment.exception() == NULL); 590 firsttime = true; 591 } else { 592 fsp->set_vfsp(vfsp); 593 firsttime = false; 594 } 595 delete [] options; 596 597 mnt_error = fsp->convert_to_primary(firsttime); 598 if (mnt_error != 0) { 599 if (!firsttime) { 600 // 601 // We failed while re-mounting the FS after a 602 // switchover/ failover. Syslog this fact. 603 // 604 605 // 606 // SCMSGS 607 // @explanation 608 // The file system specified in the message could not 609 // be hosted on the node the message came from. 610 // @user_action 611 // Check /var/adm/messages to make sure there were no 612 // device errors. If not, contact your authorized Sun 613 // service provider to determine whether a workaround 614 // or patch is available. 615 // 616 (void) fsp->msg().log(SC_SYSLOG_WARNING, MESSAGE, 617 "Switchover (%s) error (%d) converting to primary", 618 (const char *)(mountdata.dir), mnt_error); 619 } 620 MOUNT_DBPRINTF( 621 MOUNT_TRACE_REPLICA, 622 MOUNT_RED, 623 ("repl_pxfs_server::become_primary %s convert_to_primary " 624 "err %d\n", (const char *)mountdata.dir, mnt_error)); 625 fsp->get_checkpoint()->ckpt_mnt_error(mnt_error, _environment); 626 ASSERT(_environment.exception() == NULL); 627 } 628 version_lock.unlock(); 629 } 630 631 // 632 // Become the secondary. 633 // This is called on the primary in order to do a switchover. 634 // 635 void 636 repl_pxfs_server::become_secondary(Environment &_environment) 637 { 638 ASSERT(fsp != NULL); 639 640 MOUNT_DBPRINTF( 641 MOUNT_TRACE_REPLICA, 642 MOUNT_GREEN, 643 ("repl_pxfs_server::become_secondary %s\n", 644 (const char *)mountdata.dir)); 645 646 // Show that we are not the primary 647 active_invo_lock.lock(); 648 replica_state = NOT_PRIMARY; 649 active_invo_lock.unlock(); 650 651 if (fsp->convert_to_secondary() != 0) { 652 // 653 // If there is an error converting from primary to secondary, 654 // we raise an exception to the HA framework notifying this 655 // replica can't become a secondary. The HA framework will 656 // call become_primary() on this replica and we will mark 657 // the file system as "dead". We handle errors this way 658 // because the HA framework doesn't support checkpoints 659 // in become_secondary() and Solaris 2.7 doesn't support 660 // forced unmount. With forced unmount, we could be sure 661 // the file system wasn't accessing the disk on this node 662 // and allow the switchover to proceed. 663 // 664 _environment.exception(new replica::become_secondary_failed); 665 } 666 667 ASSERT(!CORBA::is_nil(_ckpt_proxy)); 668 669 // Release the reference to the multi_ckpt_handler. 670 version_lock.wrlock(); 671 CORBA::release(_ckpt_proxy); 672 _ckpt_proxy = nil; 673 version_lock.unlock(); 674 } 675 676 void 677 repl_pxfs_server::add_secondary(replica::checkpoint_ptr sec_chkpt, 678 const char *, Environment &_environment) 679 { 680 REPL_PXFS_VER::fs_replica_var ckpt = 681 REPL_PXFS_VER::fs_replica::_narrow(sec_chkpt); 682 ASSERT(!CORBA::is_nil(ckpt)); 683 684 MOUNT_DBPRINTF( 685 MOUNT_TRACE_REPLICA, 686 MOUNT_GREEN, 687 ("repl_pxfs_server::add_secondary %s\n", 688 (const char *)mountdata.dir)); 689 if (fsp != NULL) { 690 // Create a new fs on the secondary before we dump our state. 691 ckpt->ckpt_new_fsobj(fs_v, fsp->get_options(), _environment); 692 ASSERT(_environment.exception() == NULL); 693 694 // Dump current state to the new secondary. 695 if (mnt_error != 0) { 696 ckpt->ckpt_mnt_error(mnt_error, _environment); 697 ASSERT(_environment.exception() == NULL); 698 } 699 700 fsp->dump_state(ckpt, _environment); 701 _environment.clear(); 702 } 703 } 704 705 void 706 repl_pxfs_server::remove_secondary(const char *, Environment &) 707 { 708 } 709 710 // 711 // This routine is called by server methods that issues invocations to 712 // clients which result in the client(s) issuing invocations back to 713 // the server. 714 // 715 // We count the invocations that we allow to proceed. 716 // 717 // When freezing the primary, we hold these invocations here. 718 // 719 // When frozen, we return the invocation with an exception. 720 // 721 bool 722 repl_pxfs_server::check_freeze(Environment &env) 723 { 724 active_invo_lock.lock(); 725 726 switch (replica_state) { 727 728 case NOT_PRIMARY: // This should not be possible 729 ASSERT(0); 730 // Fall through and pretend that we are the primary. 731 732 case PRIMARY: // We are not preparing for a freeze. 733 // Allow the invocation to proceed 734 735 ASSERT(active_invo_count >= 0); 736 ASSERT(active_invo_count != INT_MAX); 737 738 active_invo_count++; 739 active_invo_lock.unlock(); 740 741 return (false); 742 // 743 case FREEZING: // Block any invocation that could result 744 // in invocations back to this primary. 745 // That would result in deadlock. 746 // The clients currently allow invocations to 747 // proceed to the server. So hold on to the 748 // invocation until the client is ready to 749 // block the invocation. 750 // 751 PXFS_DBPRINTF( 752 PXFS_TRACE_FS, 753 PXFS_AMBER, 754 ("repl_pxfs_server:(%p) freezing invo from node %d\n", 755 this, env.get_src_node().ndid)); 756 while (replica_state == FREEZING) { 757 active_invo_cv.wait(&active_invo_lock); 758 } 759 ASSERT(replica_state == FROZEN); 760 // Fall through 761 case FROZEN: 762 env.system_exception( 763 CORBA::PRIMARY_FROZEN(0, CORBA::COMPLETED_NO)); 764 active_invo_lock.unlock(); 765 return (true); 766 767 }; 768 } 769 770 // 771 // This method is called by the HA framework when the service is about to 772 // be frozen. The freeze will proceed only when this method returns. We 773 // take advantage of this serialization to bring our state to a stable one 774 // by waiting for outstanding invocation from the server to complete and 775 // preventing any new invocations from being launched. 776 // 777 void 778 repl_pxfs_server::freeze_primary_prepare(Environment &) 779 { 780 // 781 // Hold invocation count lock to prevent new invocations from 782 // coming in. 783 // 784 active_invo_lock.lock(); 785 replica_state = FREEZING; 786 787 PXFS_DBPRINTF( 788 PXFS_TRACE_FS, 789 PXFS_AMBER, 790 ("repl_pxfs_server(%p): %d outstanding nested server invocations\n", 791 this, active_invo_count)); 792 793 while (active_invo_count > 0) { 794 // Wait for active invocations to signal completion. 795 active_invo_cv.wait(&active_invo_lock); 796 } 797 798 active_invo_lock.unlock(); 799 } 800 801 // 802 // Note: either add_secondary() and unfreeze_primary() or 803 // become_secondary() will be called after this returns. 804 // Invocations and calls to _unreferenced() will be blocked by the HA 805 // framework after we return from here. 806 // 807 // repl_pxfs_server(replica::repl_prov::freeze_primary, _environment) 808 void 809 repl_pxfs_server::freeze_primary(Environment &) 810 { 811 MOUNT_DBPRINTF( 812 MOUNT_TRACE_REPLICA, 813 MOUNT_GREEN, 814 ("repl_pxfs_server::freeze_primary %s\n", 815 (const char *)mountdata.dir)); 816 817 // Wake up any invocations that were blocked during freezing 818 active_invo_lock.lock(); 819 replica_state = FROZEN; 820 active_invo_cv.broadcast(); 821 active_invo_lock.unlock(); 822 823 fsp->freeze_primary((const char *)mountdata.dir); 824 } 825 826 void 827 repl_pxfs_server::unfreeze_primary(Environment &) 828 { 829 MOUNT_DBPRINTF( 830 MOUNT_TRACE_REPLICA, 831 MOUNT_GREEN, 832 ("repl_pxfs_server::unfreeze_primary %s\n", 833 (const char *)mountdata.dir)); 834 835 // Show that invocations are allowed 836 active_invo_lock.lock(); 837 replica_state = PRIMARY; 838 active_invo_lock.unlock(); 839 } 840 841 void 842 repl_pxfs_server::become_spare(Environment &) 843 { 844 MOUNT_DBPRINTF( 845 MOUNT_TRACE_REPLICA, 846 MOUNT_GREEN, 847 ("repl_pxfs_server::become_spare %s\n", 848 (const char *)mountdata.dir)); 849 if (fsp != NULL) { 850 fs_v = PXFS_VER::filesystem::_nil(); 851 fsp->convert_to_spare(); 852 fsp = NULL; 853 } 854 mnt_error = 0; 855 } 856 857 // 858 // This is called on the primary when the service is requested to shutdown. 859 // 860 void 861 repl_pxfs_server::shutdown(Environment &_environment) 862 { 863 // Unregister the upgrade callback. 864 upgrade_callback_unregister(); 865 866 // Return busy if there was no unmount. 867 if (mnt_error == 0 && fsp != NULL && fsp->get_vfsp() != NULL) { 868 _environment.exception(new replica::service_busy); 869 return; 870 } 871 fs_v = PXFS_VER::filesystem::_nil(); 872 if (mnt_vp != NULL) { 873 VN_RELE(mnt_vp); 874 mnt_vp = NULL; 875 } 876 } 877 878 // 879 // This is called on the primary when the service is forced to shutdown. 880 // 881 // Virtual function forced_shutdown in base class generic_repl_server 882 // returns a Solaris error code (if forced shutdown not supported by 883 // specific HA service), else returns 0(Success) 884 // 885 uint32_t 886 repl_pxfs_server::forced_shutdown(Environment &) 887 { 888 // Unregister the upgrade callback. 889 upgrade_callback_unregister(); 890 891 fs_v = PXFS_VER::filesystem::_nil(); 892 if (mnt_vp != NULL) { 893 VN_RELE(mnt_vp); 894 mnt_vp = NULL; 895 } 896 return (0); 897 } 898 899 // 900 // This is called on a spare when the service is requested to shutdown. 901 // 902 void 903 repl_pxfs_server::shutdown_spare(replica::repl_prov_shutdown_type, 904 Environment &) 905 { 906 fs_v = PXFS_VER::filesystem::_nil(); 907 if (mnt_vp != NULL) { 908 VN_RELE(mnt_vp); 909 mnt_vp = NULL; 910 } 911 } 912 913 CORBA::Object_ptr 914 repl_pxfs_server::get_root_obj(Environment &) 915 { 916 ASSERT(fsp != NULL); 917 version_lock.wrlock(); 918 CORBA::type_info_t *typ = PXFS_VER::filesystem::_get_type_info( 919 pxfs_vp_to_idl[current_version.major_num] 920 [current_version.minor_num].fs); 921 version_lock.unlock(); 922 return (fsp->get_objref(typ)); 923 } 924 925 // 926 // Set the initial version number. 927 // 928 void 929 repl_pxfs_server::set_version(const version_manager::vp_version_t &v) 930 { 931 // 932 // We could have had a callback between the time that we called 933 // register_upgrade_callbacks() and the time that it returned and 934 // set tmp_version. The callback may have set a newer version than 935 // v, so don't clobber it. 936 // 937 version_lock.wrlock(); 938 if (current_version.major_num < v.major_num || 939 (current_version.major_num == v.major_num && 940 current_version.minor_num < v.minor_num)) { 941 current_version = v; 942 } 943 version_lock.unlock(); 944 } 945 946 void 947 repl_pxfs_server::upgrade_callback_register(const sol::mounta &ma) 948 { 949 char *service_name = os::strdup(ma.spec); 950 char unique_callback_name[1024]; 951 char unique_str[32]; 952 char *vpname = "pxfs"; 953 version_manager::vp_version_t callback_limit; 954 version_manager::vp_version_t cur_version; 955 Environment e; 956 957 MOUNT_DBPRINTF( 958 MOUNT_TRACE_REPLICA, 959 MOUNT_GREEN, 960 ("repl_pxfs_server::upgrade_callback_register: this = %p " 961 "mountpnt = %s\n", this, (const char *)ma.dir)); 962 963 // 964 // Have the filesystem replica register with the Version Manager 965 // for upgrade callbacks. Pass a string which can be use to 966 // build a unique callback name. This will eliminate races when 967 // unmounting and then mounting the same filesystem. 968 // 969 os::atomic_add_32((uint32_t *)&unique_integer, 1); 970 (void) os::itoa(unique_integer, unique_str, 10); 971 972 // Generate for callback registration 973 os::sprintf(unique_callback_name, "%s", service_name); 974 os::sprintf(unique_callback_name + os::strlen(service_name), "%s", 975 ":"); 976 os::sprintf(unique_callback_name + os::strlen(service_name) +1, "%s", 977 unique_str); 978 979 // Get a pointer to the local version manager 980 version_manager::vm_admin_var vmgr_v = vm_util::get_vm(NODEID_UNKNOWN); 981 982 // Build a UCC for support of version upgrade callbacks 983 version_manager::ucc_seq_t ucc_seq(1, 1); 984 ucc_seq[0].ucc_name = os::strdup(unique_callback_name); 985 ucc_seq[0].vp_name = os::strdup(vpname); 986 version_manager::string_seq_t fseq(1, 1); 987 fseq[0] = os::strdup(service_name); 988 ucc_seq[0].freeze = fseq; 989 990 // 991 // Create a version upgrade callback object for the fs replica 992 // 993 replica::repl_prov_var repl_srvr_v = generic_repl_server::_narrow( 994 this->generic_repl_server::get_objref()); 995 repl_srvr_v->_handler()->set_cookie((void *)this); 996 callback_object_v = (new fs_version_callback_impl( 997 replica::repl_prov::_duplicate( 998 repl_srvr_v)))->get_objref(); 999 1000 MOUNT_DBPRINTF( 1001 MOUNT_TRACE_REPLICA, 1002 MOUNT_GREEN, 1003 ("repl_pxfs_server::upgrade_callback_reg: this = %p\n", this)); 1004 1005 // 1006 // Register the callback object with the Version Manager. The 1007 // tmp_version will be returned. The version lock is not held 1008 // since the replica is not yet registered with the HA framework 1009 // so there cannot be a call to become_primary. 1010 // 1011 // If the running version is less than the callback_limit 1012 // a callback will be registerd, otherwise a callback is not 1013 // registered (currently no way to tell). The current running 1014 // version is returned regardless. 1015 // 1016 callback_limit.major_num = 2; 1017 callback_limit.minor_num = 1; 1018 vmgr_v->register_upgrade_callbacks(ucc_seq, callback_object_v, 1019 callback_limit, cur_version, e); 1020 if (e.exception()) { 1021 callback_object_v = version_manager::upgrade_callback::_nil(); 1022 e.exception()->print_exception("register_upgrade_callbacks:"); 1023 e.clear(); 1024 return; 1025 } 1026 MOUNT_DBPRINTF( 1027 MOUNT_TRACE_REPLICA, 1028 MOUNT_GREEN, 1029 ("repl_pxfs_server::upgrade_callback_reg after reg: this = %p\n", 1030 this)); 1031 1032 // establish the current version in the replica 1033 set_version(cur_version); 1034 } 1035 1036 void 1037 repl_pxfs_server::upgrade_callback_unregister() 1038 { 1039 Environment e; 1040 1041 MOUNT_DBPRINTF( 1042 MOUNT_TRACE_REPLICA, 1043 MOUNT_GREEN, 1044 ("repl_pxfs_server::upgrade_callback_unregister: this = %p\n", 1045 this)); 1046 1047 version_lock.wrlock(); 1048 if (!CORBA::is_nil(callback_object_v)) { 1049 version_manager::vm_admin_var vmgr_v = 1050 vm_util::get_vm(NODEID_UNKNOWN); 1051 vmgr_v->unregister_upgrade_callbacks( 1052 callback_object_v, e); 1053 if (e.exception()) { 1054 ASSERT(0); 1055 e.clear(); 1056 } 1057 callback_object_v = version_manager::upgrade_callback::_nil(); 1058 } 1059 version_lock.unlock(); 1060 } 1061 1062 // 1063 // Process an upgrade callback from the version manager. 1064 // This call can happen before the HA replica is registered with the 1065 // replica manager (i.e., the HA sevice is not started yet or is a spare), 1066 // or as primary or secondary. The callback is not synchronized with 1067 // calls to become_primary(), become_secondary(), etc. and failovers can 1068 // happen while the service is frozen so we need to make sure that the 1069 // "upgrade work" is done if there are node failures. 1070 // If this replica is the primary, we do the work and send a checkpoint 1071 // to indicate the work is complete. If this replica is a secondary and 1072 // the callback happens before we get the checkpoint from the primary, 1073 // we record that the callback happened so that become_primary() can 1074 // do the "upgrade work" and send the checkpoint. If this replica is a 1075 // secondary and the callback happens after the checkpoint is received, 1076 // we ignore the callback. The checkpoint routine on the secondary clears 1077 // the "flag" that the callback sets so no extra work is done in 1078 // become_primary() if the old primary fails after completing the upgrade 1079 // callback work. 1080 // 1081 void 1082 repl_pxfs_server::upgrade_callback(const version_manager::vp_version_t &v, 1083 Environment &e) 1084 { 1085 CORBA::type_info_t *typ; 1086 1087 MOUNT_DBPRINTF( 1088 MOUNT_TRACE_REPLICA, 1089 MOUNT_GREEN, 1090 ("repl_pxfs_server::upgrade_callback: this = %p " 1091 "fsp = %p\n", this, fsp)); 1092 1093 // 1094 // A nil callback object indicates that a callback unregister was 1095 // done. 1096 // 1097 if (CORBA::is_nil(callback_object_v)) { 1098 MOUNT_DBPRINTF( 1099 MOUNT_TRACE_REPLICA, 1100 MOUNT_GREEN, 1101 ("repl_pxfs_server::upgrade_callback: nil cb object\n")); 1102 return; 1103 } 1104 1105 1106 // 1107 // Note that upgrade callbacks are not synchronized with 1108 // calls to become_primary(), add_secondary(), etc. 1109 // Getting this lock makes sure the replica state doesn't change. 1110 // 1111 version_lock.wrlock(); 1112 if (current_version.major_num > v.major_num || 1113 (current_version.major_num == v.major_num && 1114 current_version.minor_num >= v.minor_num)) { 1115 // Version isn't changing so just return. 1116 version_lock.unlock(); 1117 callback_object_v = version_manager::upgrade_callback::_nil(); 1118 return; 1119 } 1120 1121 // 1122 // If this replica is not the primary, set the pending version 1123 // to be used if there is a failover before the primary checkpoints 1124 // a new version. 1125 // 1126 if (CORBA::is_nil(_ckpt_proxy)) { 1127 // If this replica is a secondary, set the pending version. 1128 if (fsp != NULL) { 1129 pending_version = v; 1130 } else { 1131 // Replica must be a spare. 1132 current_version = v; 1133 } 1134 version_lock.unlock(); 1135 callback_object_v = version_manager::upgrade_callback::_nil(); 1136 MOUNT_DBPRINTF( 1137 MOUNT_TRACE_REPLICA, 1138 MOUNT_GREEN, 1139 ("repl_pxfs_server::upgrade_callback: not primary\n")); 1140 return; 1141 } 1142 1143 // 1144 // Check for primary with a "dead" filesystem 1145 // 1146 if (fsp == NULL) { 1147 version_lock.unlock(); 1148 callback_object_v = version_manager::upgrade_callback::_nil(); 1149 MOUNT_DBPRINTF( 1150 MOUNT_TRACE_REPLICA, 1151 MOUNT_GREEN, 1152 ("repl_pxfs_server::upgrade_callback: NULL fsp\n")); 1153 return; 1154 } 1155 1156 // We are the primary and the version has changed. 1157 current_version = v; 1158 MOUNT_DBPRINTF( 1159 MOUNT_TRACE_REPLICA, 1160 MOUNT_GREEN, 1161 ("repl_pxfs_server::upgrade_callback: this = %p " 1162 "new version -> major = %d minor = %d\n", this, 1163 current_version.major_num, current_version.minor_num)); 1164 1165 // 1166 // Switch the checkpoint interface to the new protocol. Save 1167 // the current _ckpt_proxy and release it after we get a new one 1168 // 1169 REPL_PXFS_VER::fs_replica_ptr old_ckpt_p = _ckpt_proxy; 1170 typ = REPL_PXFS_VER::fs_replica::_get_type_info( 1171 pxfs_vp_to_idl[current_version.major_num] 1172 [current_version.minor_num].fs_ckpt); 1173 1174 replica::checkpoint_var tmp_ckpt_v = set_checkpoint(typ); 1175 _ckpt_proxy = REPL_PXFS_VER::fs_replica::_narrow(tmp_ckpt_v); 1176 1177 ASSERT(!CORBA::is_nil(_ckpt_proxy)); 1178 1179 // Release the reference to the old ckpt_proxy. 1180 CORBA::release(old_ckpt_p); 1181 old_ckpt_p = REPL_PXFS_VER::fs_replica::_nil(); 1182 1183 // Update the server reference. 1184 typ = PXFS_VER::filesystem::_get_type_info( 1185 pxfs_vp_to_idl[current_version.major_num] 1186 [current_version.minor_num].fs); 1187 fs_v = fsp->get_objref(typ); 1188 1189 // 1190 // Create and add a primary context so the provider can send 1191 // checkpoints while the service is frozen. 1192 // XXX change the primary_ctx::invoke_env type. 1193 // 1194 primary_ctx ctx(NULL, primary_ctx::ADD_SECONDARY_CKPT, e); 1195 1196 // Checkpoint the current version number. 1197 _ckpt_proxy->ckpt_service_version( 1198 current_version.major_num, current_version.minor_num, e); 1199 1200 e.trans_ctxp = NULL; 1201 1202 // 1203 // This reference isn't needed any more since we won't need 1204 // to do an unregister of the callback object. 1205 // 1206 callback_object_v = version_manager::upgrade_callback::_nil(); 1207 1208 version_lock.unlock(); 1209 } 1210 1211 // 1212 // Checkpoint the creation of a new filesystem (fs_obj). 1213 // 1214 void 1215 repl_pxfs_server::ckpt_new_fsobj(PXFS_VER::filesystem_ptr fs_obj, 1216 const char *mntoptions, Environment &) 1217 { 1218 // Create the shadow fs object. 1219 if (fsp == NULL) { 1220 fsp = new fs_repl_impl(mountdata.fstype, mountdata.spec, 1221 mntoptions, this, fs_obj); 1222 fs_v = PXFS_VER::filesystem::_duplicate(fs_obj); 1223 } 1224 } 1225 1226 // 1227 // Checkpoint a new service version 1228 // 1229 void 1230 repl_pxfs_server::ckpt_service_version(unsigned short new_major, 1231 unsigned short new_minor, Environment &) 1232 { 1233 version_lock.wrlock(); 1234 current_version.major_num = new_major; 1235 current_version.minor_num = new_minor; 1236 pending_version.major_num = 0; 1237 version_lock.unlock(); 1238 } 1239 1240 // 1241 // Checkpoint a failure in become_primary() or become_secondary(). 1242 // 1243 void 1244 repl_pxfs_server::ckpt_mnt_error(sol::error_t error, Environment &) 1245 { 1246 ASSERT(fsp != NULL); 1247 mnt_error = error; 1248 } 1249 1250 // 1251 // Checkpoint the creation of a new fobj which has the specified 1252 // fid (used for switchover for primary) and type. 1253 // 1254 void 1255 repl_pxfs_server::ckpt_new_fobj(PXFS_VER::fobj_ptr obj, 1256 const PXFS_VER::fobjid_t &fobjid, 1257 PXFS_VER::fobj_type_t type, 1258 Environment &) 1259 { 1260 ASSERT(fsp != NULL); 1261 fsp->ckpt_new_fobj(obj, fobjid, type); 1262 } 1263 1264 // 1265 // Checkpoint the creation of a new fsmgr_server object for 1266 // detecting client crashes. 1267 // 1268 void 1269 repl_pxfs_server::ckpt_new_fsmgr(PXFS_VER::fsmgr_server_ptr servermgr, 1270 PXFS_VER::fsmgr_client_ptr clientmgr, sol::nodeid_t nodeid, Environment &) 1271 { 1272 ASSERT(fsp != NULL); 1273 fsp->ckpt_new_fsmgr(servermgr, clientmgr, nodeid); 1274 } 1275 1276 // 1277 // Update the mount arguments after a remount (see fs_ii::remount()). 1278 // We assume that the VFS_MOUNT() (with MS_REMOUNT set) contains the 1279 // complete mount information and doesn't depend on the history of previous 1280 // of previous calls to VFS_MOUNT(). If this isn't true, we would need to 1281 // save a list of all parameters and replay all the VFS_MOUNT() calls 1282 // on failover. 1283 // 1284 void 1285 repl_pxfs_server::set_mountargs(const sol::mounta &ma) 1286 { 1287 mountdata = ma; 1288 // 1289 // We set MS_NOSPLICE so that the underlying file system isn't 1290 // linked into the file system name space. 1291 // XXX We also force the MS_OVERLAY flag on to suppress the 1292 // mvp->v_count == 1 EBUSY check in the file system code. 1293 // We clear MS_REMOUNT since we will be doing a mount instead 1294 // of a remount after a failover. 1295 // 1296 mountdata.flags = MS_NOSPLICE | MS_OVERLAY | (ma.flags & ~MS_REMOUNT); 1297 } 1298 1299 // 1300 // Checkpoint blocks allocated by the server. 1301 // 1302 void 1303 repl_pxfs_server::ckpt_blocks_allocated( 1304 const repl_pxfs_v1::blocks_allocated_t ¤t_allocations, 1305 PXFS_VER::blkcnt_t blocks_free_cnt, Environment &) 1306 { 1307 ASSERT(fsp != NULL); 1308 fsp->ckpt_blocks_allocated(current_allocations, blocks_free_cnt); 1309 } 1310 1311 // 1312 // Checkpoint the server status to secondary. 1313 // 1314 void 1315 repl_pxfs_server::ckpt_server_status(PXFS_VER::server_status_t status, 1316 Environment &) 1317 { 1318 ASSERT(fsp != NULL); 1319 fsp->ckpt_server_status(status); 1320 } 1321 1322 // 1323 // Checkpoint the changes to mount arguments and options due to a remount. 1324 // 1325 void 1326 repl_pxfs_server::ckpt_remount(const sol::mounta &ma, const char *mntoptions, 1327 Environment &) 1328 { 1329 set_mountargs(ma); 1330 fsp->set_options(mntoptions); 1331 } 1332 1333 // 1334 // Checkpoint a change in the active locks of a file. 1335 // 1336 void 1337 repl_pxfs_server::ckpt_locks(PXFS_VER::fobj_ptr obj, 1338 const REPL_PXFS_VER::lock_info_seq_t &locks, Environment &) 1339 { 1340 fobj_ii *fobj_iip = fobj_ii::get_fobj_ii(obj); 1341 fobj_iip->ckpt_locks(locks); 1342 } 1343 1344 // 1345 // Checkpoint the existence of a file entry under the current mini-transaction. 1346 // This method is invoked on the secondary by the primary. 1347 // This state can be used later to determine if the primary completed an 1348 // operation or not. 1349 // 1350 void 1351 repl_pxfs_server::ckpt_entry_state(bool exists, Environment &_environment) 1352 { 1353 transaction_state *state = new unixdir_state(exists); 1354 state->register_state(_environment); 1355 if (_environment.exception()) { 1356 MOUNT_DBPRINTF( 1357 MOUNT_TRACE_REPLICA, 1358 MOUNT_RED, 1359 ("repl_pxfs_server::ckpt_entry_state: client died\n")); 1360 _environment.clear(); 1361 } 1362 } 1363 1364 // 1365 // Checkpoint the existence of a file to be locked 1366 // prior to operating on it. This used with mini-transactions 1367 // to handle failure mid-way through an operation. 1368 // 1369 void 1370 repl_pxfs_server::ckpt_target(PXFS_VER::fobj_ptr obj, 1371 const PXFS_VER::fobj_info &fobjinfo, Environment &_environment) 1372 { 1373 transaction_state *state = new unixdir_state(obj, fobjinfo); 1374 state->register_state(_environment); 1375 1376 // If the client dies, there is nothing to clean up. 1377 _environment.clear(); 1378 } 1379 1380 // 1381 // Checkpoint the existence of a file to be locked 1382 // prior to removing it. This used with mini-transactions 1383 // to handle failure mid-way through an operation. 1384 // 1385 void 1386 repl_pxfs_server::ckpt_target_remove(PXFS_VER::fobj_ptr obj, 1387 const PXFS_VER::fobj_info &fobjinfo, uint64_t delete_id, 1388 Environment &_environment) 1389 { 1390 transaction_state *state = new unixdir_state(obj, 1391 fobjinfo, delete_id); 1392 1393 state->register_state(_environment); 1394 1395 // If the client dies, there is nothing to clean up. 1396 _environment.clear(); 1397 } 1398 1399 // 1400 // Checkpoint an object return under the current mini-transaction. 1401 // This return value is used to return the results of a stale operation. 1402 // 1403 void 1404 repl_pxfs_server::ckpt_fobj_return(PXFS_VER::fobj_ptr ret_obj, 1405 const PXFS_VER::fobj_info &ret_info, Environment &_environment) 1406 { 1407 secondary_ctx *ctxp = secondary_ctx::extract_from(_environment); 1408 unixdir_state *saved_state = 1409 (unixdir_state *)ctxp->get_saved_state(); 1410 // 1411 // ckpt_entry_state() or ckpt_target() should have been called first 1412 // so there should always be a transaction state object for this 1413 // checkpoint operation. 1414 // 1415 ASSERT(saved_state != NULL); 1416 saved_state->ckpt_fobj_return(ret_obj, ret_info); 1417 } 1418 1419 // 1420 // Checkpoint an error return under the current mini-transaction. 1421 // This error return is used to return the results of a stale operation. 1422 // 1423 void 1424 repl_pxfs_server::ckpt_error_return(sol::error_t error, 1425 Environment &_environment) 1426 { 1427 secondary_ctx *ctxp = secondary_ctx::extract_from(_environment); 1428 unixdir_state *saved_state = 1429 (unixdir_state *)ctxp->get_saved_state(); 1430 // 1431 // If ckpt_entry_state() nor ckpt_target() has not been called first, 1432 // this is both the start and commit for this operation. 1433 // 1434 if (saved_state == NULL) { 1435 transaction_state *state = new unixdir_state(error); 1436 state->register_state(_environment); 1437 // If the client dies, there is nothing to clean up. 1438 _environment.clear(); 1439 } else { 1440 saved_state->ckpt_error_return(error); 1441 } 1442 } 1443 1444 // 1445 // Checkpoint the deletion of an fobj. 1446 // 1447 void 1448 repl_pxfs_server::ckpt_delete_fobj(uint64_t delete_id, 1449 Environment &_environment) 1450 { 1451 secondary_ctx *ctxp = secondary_ctx::extract_from(_environment); 1452 unixdir_state *saved_state = 1453 (unixdir_state *)ctxp->get_saved_state(); 1454 // 1455 // ckpt_entry_state() or ckpt_target() should have been called first 1456 // so there should always be a transaction state object for this 1457 // checkpoint operation. 1458 // 1459 ASSERT(saved_state != NULL); 1460 saved_state->ckpt_delete_fobj(delete_id); 1461 } 1462 1463 // 1464 // ckpt_deletecnt - updates the secondary file system with the number 1465 // used by the primary for renames of deleted files. 1466 // 1467 void 1468 repl_pxfs_server::ckpt_deletecnt(uint64_t delete_id, Environment &) 1469 { 1470 fsp->set_deletecnt(delete_id); 1471 } 1472 1473 // 1474 // This checkpoint is used by a primary to bring a secondary up to date on 1475 // filesystem locking state. 1476 // 1477 void 1478 repl_pxfs_server::ckpt_lockfs_info(uint64_t lf_lock, uint64_t lf_flags, 1479 uint64_t lf_key, const char *lf_comment, Environment &) 1480 { 1481 ASSERT(fsp != NULL); 1482 fsp->get_fs_dep_implp()->ckpt_lockfs_state(lf_lock, lf_flags, lf_key, 1483 lf_comment); 1484 } 1485 1486 // 1487 // Checkpoint the beginning of a lockfs call 1488 // 1489 void 1490 repl_pxfs_server::ckpt_lockfs_start(uint64_t lf_lock, uint64_t lf_flags, 1491 uint64_t lf_key, const char *lf_comment, Environment &_environment) 1492 { 1493 ASSERT(fsp != NULL); 1494 fobj_lockfs_state::register_new_state(fsp, lf_lock, lf_flags, lf_key, 1495 lf_comment, _environment); 1496 } 1497 1498 // 1499 // Checkpoint failure of a lockfs call. 1500 // 1501 void 1502 repl_pxfs_server::ckpt_lockfs_failure(sol::error_t err, 1503 Environment &_environment) 1504 { 1505 ASSERT(fsp != NULL); 1506 fobj_lockfs_state::report_failure(err, _environment); 1507 } 1508 1509 void 1510 repl_pxfs_server::ckpt_cachedata_flag(PXFS_VER::file_ptr file_p, bool flag, 1511 Environment &) 1512 { 1513 file_ii *file_iip = (file_ii *)fobj_ii::get_fobj_ii(file_p); 1514 file_iip->ckpt_cachedata_flag(flag); 1515 } 1516 1517 // 1518 // Checkpoint the state of a file 1519 // (this is used to dump state to new secondaries). 1520 // 1521 void 1522 repl_pxfs_server::ckpt_fobj_state(PXFS_VER::fobj_ptr obj, uint64_t delete_id, 1523 Environment &) 1524 { 1525 fobj_ii *fobj_iip = fobj_ii::get_fobj_ii(obj); 1526 fobj_iip->ckpt_fobj_state(delete_id); 1527 } 1528 1529 // 1530 // Checkpoint tunefs parameters for VxFS. 1531 // 1532 void 1533 repl_pxfs_server::ckpt_vx_tunefs(const REPL_PXFS_VER::vx_tunefs_t &tunefs, 1534 Environment &) 1535 { 1536 ASSERT(fsp != NULL); 1537 fsp->get_fs_dep_implp()->ckpt_vx_tunefs(tunefs); 1538 } 1539 1540 void 1541 repl_pxfs_server::ckpt_remove_file_locks_by_sysid(int32_t sysid, 1542 Environment &) 1543 { 1544 ASSERT(fsp != NULL); 1545 fsp->ckpt_remove_file_locks_by_sysid(sysid); 1546 } 1547 1548 void 1549 repl_pxfs_server::ckpt_remove_file_locks_by_nlmid(int32_t nlmid, 1550 Environment &) 1551 { 1552 ASSERT(fsp != NULL); 1553 fsp->ckpt_remove_file_locks_by_nlmid(nlmid); 1554 } 1555 1556 void 1557 repl_pxfs_server::ckpt_fs_is_unmounted(Environment &) 1558 { 1559 fs_is_unmounted = true; 1560 } 1561 1562 fs_version_callback_impl::fs_version_callback_impl( 1563 replica::repl_prov_ptr replica_p) : 1564 prov_v(replica_p) 1565 { 1566 } 1567 1568 fs_version_callback_impl::~fs_version_callback_impl() 1569 { 1570 } 1571 1572 void 1573 fs_version_callback_impl::_unreferenced(unref_t arg) 1574 { 1575 if (!_last_unref(arg)) { 1576 // _last_unref() should always be true since we don't use 0->1. 1577 ASSERT(0); 1578 return; 1579 } 1580 delete this; 1581 } 1582 1583 1584 // Call the provider to update the version and checkpoint it. 1585 void 1586 fs_version_callback_impl::do_callback(const char *, 1587 const version_manager::vp_version_t ¤t_version, Environment &e) 1588 { 1589 MOUNT_DBPRINTF( 1590 MOUNT_TRACE_REPLICA, 1591 MOUNT_GREEN, 1592 ("fs_version_callback_impl::do_callback: this = %p\n", this)); 1593 1594 // Call the provider to update the version and checkpoint it. 1595 void *p = prov_v->_handler()->get_cookie(); 1596 ((repl_pxfs_server *)p)->upgrade_callback(current_version, e); 1597 } 1598