Home | History | Annotate | Download | only in server
      1 //
      2 // CDDL HEADER START
      3 //
      4 // The contents of this file are subject to the terms of the
      5 // Common Development and Distribution License (the License).
      6 // You may not use this file except in compliance with the License.
      7 //
      8 // You can obtain a copy of the license at usr/src/CDDL.txt
      9 // or http://www.opensolaris.org/os/licensing.
     10 // See the License for the specific language governing permissions
     11 // and limitations under the License.
     12 //
     13 // When distributing Covered Code, include this CDDL HEADER in each
     14 // file and include the License file at usr/src/CDDL.txt.
     15 // If applicable, add the following below this CDDL HEADER, with the
     16 // fields enclosed by brackets [] replaced with your own identifying
     17 // information: Portions Copyright [yyyy] [name of copyright owner]
     18 //
     19 // CDDL HEADER END
     20 //
     21 
     22 //
     23 // Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24 // Use is subject to license terms.
     25 //
     26 
     27 #pragma ident	"@(#)repl_pxfs_server.cc	1.21	08/05/20 SMI"
     28 
     29 #include <sys/pathname.h>
     30 #include <sys/fcntl.h>
     31 #include <sys/mount.h>
     32 #include <sys/dnlc.h>
     33 
     34 #include <sys/sol_conv.h>
     35 #include <nslib/ns.h>
     36 #include <solobj/solobj_impl.h>
     37 #include <h/repl_pxfs.h>
     38 
     39 #include "../version.h"
     40 #include <pxfs/mount/mount_debug.h>
     41 #include <pxfs/lib/pxfs_debug.h>
     42 #include <pxfs/server/repl_pxfs_server.h>
     43 #include <pxfs/server/fs_impl.h>
     44 #include <pxfs/server/fsmgr_server_impl.h>
     45 #include <pxfs/server/unixdir_impl.h>
     46 #include <pxfs/server/file_impl.h>
     47 #include <pxfs/server/symlink_impl.h>
     48 #include <pxfs/server/io_impl.h>
     49 #include <pxfs/server/unixdir_ckpt.h>
     50 #include <pxfs/server/fobj_trans_states.h>
     51 
     52 #ifndef VXFS_DISABLED
     53 #include <pxfs/server/vxfs_dependent_impl.h>
     54 #endif
     55 
     56 //lint -e1512
     57 //
     58 // Warning(1512) destructor for base class is not virtual -- In a
     59 // final pass through all the classes, we have found a class that is
     60 // the base class of a derivation and has a destructor but the
     61 // destructor is not virtual. It is conventional for inherited classes
// to have virtual destructors so that it is safe to 'delete' a
     63 // pointer to a base class.
     64 //
     65 
     66 //
     67 // This struct is used for building a table for supporting the mapping
     68 // from version protocol spec file version number to the various IDL
     69 // interface versions it represents.  The table will be a two dimensional
     70 // array indexed by major/minor vp version.
     71 //
     72 typedef struct {		// idl interfaces
     73 	int	fs;		// pxfs_v1.idl
     74 	int	fs_ckpt;	// repl_pxfs_v1.idl
     75 } pxfs_ver_map_t;
     76 
     77 //
     78 // These are the current maximum indexes used for accessing the vp to idl
     79 // version table.
     80 //
     81 const int	PXFS_VP_MAX_MAJOR = 2;
     82 const int	PXFS_VP_MAX_MINOR = 2;
     83 
     84 //
     85 // pxfs_ver_map_t is a struct type which has has an entries for each IDL
     86 // interface which is being versioned. For a given VP major and minor version,
     87 // we get the IDL version of those interfaces.
     88 //
     89 // Note: this corresponds to the versions supported by the old
     90 // version of pxfs in pxfs_v1.idl and repl_pxfs_v1.idl
     91 //
     92 pxfs_ver_map_t pxfs_vp_to_idl[PXFS_VP_MAX_MAJOR + 1][PXFS_VP_MAX_MINOR +1] = {
     93 	{   { 0, 0 },		// VP Version 0.0 defined for indexing
     94 	    { 0, 0 },		// VP Version 0.1   "      "     "
     95 	    { 0, 0 }  },	// VP Version 0.2   "      "     "
     96 	{   { 0, 0 },		// VP Version 1.0
     97 	    { 0, 0 },		// VP Version 1.1
     98 	    { 0, 0 }  },	// VP Version 1.2
     99 	{   { 0, 0 },		// VP Version 2.0 Object Consolidation
    100 	    { 3, 3 },		// VP Version 2.1 RU support for 6496894/6493901
    101 	    { 3, 3 }  }		// VP Version 2.2 currently unused
    102 };
    103 
// Definition of the static member repl_pxfs_server::unique_integer; it
// starts out at zero.
int repl_pxfs_server::unique_integer = 0;
    106 
    107 //
    108 // Class repl_pxfs_server.
    109 //
    110 
    111 //
    112 // Create a repl_pxfs_server object.
    113 //
    114 repl_pxfs_server::repl_pxfs_server(vnode_t *mvp, const sol::mounta &ma,
    115     cred_t *cr, const char *id) :
    116 	mountdata(ma),
    117 	repl_server<REPL_PXFS_VER::fs_replica>(ma.spec, id),
    118 	_ckpt_proxy(NULL),
    119 	replica_state(NOT_PRIMARY),
    120 	active_invo_count(0)
    121 {
    122 	ASSERT(mountdata.flags & MS_SYSSPACE);
    123 	ASSERT((mountdata.flags & MS_REMOUNT) == 0);
    124 
    125 	//
    126 	// We set MS_NOSPLICE so that the underlying file system isn't
    127 	// linked into the file system name space.
    128 	// XXX We also force the MS_OVERLAY flag on to suppress the
    129 	// mvp->v_count == 1 EBUSY check in the file system code.
    130 	//
    131 	mountdata.flags |= MS_NOSPLICE | MS_OVERLAY;
    132 
    133 	mnt_vp = mvp;
    134 	VN_HOLD(mnt_vp);
    135 
    136 	crp = cr;
    137 	crhold(crp);
    138 
    139 	fsp = NULL;
    140 	mnt_error = 0;
    141 	fs_is_unmounted = false;
    142 
    143 	//
    144 	// Initialize with invalid versions. Correct values will
    145 	// be obtained by a query of the version manager.
    146 	//
    147 	current_version.major_num = 0;
    148 	current_version.minor_num = 0;
    149 	pending_version.major_num = 0;
    150 	pending_version.minor_num = 0;
    151 }
    152 
    153 repl_pxfs_server::~repl_pxfs_server()
    154 {
    155 	if (mnt_vp != NULL) {
    156 		VN_RELE(mnt_vp);
    157 	}
    158 	crfree(crp);
    159 
    160 	CORBA::release(_ckpt_proxy);
    161 	_ckpt_proxy = nil;
    162 } //lint !e1540 !e1740 pointers are neither freed nor zero'ed by destructor
    163 
    164 REPL_PXFS_VER::fs_replica_ptr
    165 repl_pxfs_server::get_checkpoint_fs_replica()
    166 {
    167 	ASSERT(!CORBA::is_nil(_ckpt_proxy));
    168 	return (_ckpt_proxy);
    169 }
    170 
    171 //
    172 // Helper function to get the mount error (if any).
    173 //
    174 int
    175 repl_pxfs_server::get_mount_error() const
    176 {
    177 	return (mnt_error);
    178 }
    179 
    180 //
    181 // Become the primary.
    182 // Note that previously we might have been newly created or
    183 // a secondary that is switching to primary.
    184 //
    185 void
    186 repl_pxfs_server::become_primary(const replica::repl_name_seq &,
    187     Environment &_environment)
    188 {
    189 	// Show that we are the primary
    190 	active_invo_lock.lock();
    191 	replica_state = PRIMARY;
    192 	active_invo_lock.unlock();
    193 
    194 	ASSERT(mountdata.flags & MS_SYSSPACE);
    195 	ASSERT((mountdata.flags & MS_REMOUNT) == 0);
    196 	ASSERT(mountdata.flags & MS_OVERLAY);
    197 	ASSERT(mountdata.flags & MS_NOSPLICE);
    198 
    199 	// First, initialize the checkpoint proxy.
    200 	version_lock.wrlock();
    201 
    202 	// Callback may have occured when this replica was a secondary
    203 	if (pending_version.major_num != 0) {
    204 		current_version = pending_version;
    205 	}
    206 
    207 	// Save the current _ckpt_proxy and release it after we get a new one
    208 	REPL_PXFS_VER::fs_replica_ptr old_ckpt_p = _ckpt_proxy;
    209 
    210 	CORBA::type_info_t *typ = REPL_PXFS_VER::fs_replica::_get_type_info(
    211 	    pxfs_vp_to_idl[current_version.major_num]
    212 		[current_version.minor_num].fs_ckpt);
    213 	replica::checkpoint_var tmp_ckpt_v = set_checkpoint(typ);
    214 	_ckpt_proxy = REPL_PXFS_VER::fs_replica::_narrow(tmp_ckpt_v);
    215 
    216 	ASSERT(!CORBA::is_nil(_ckpt_proxy));
    217 
    218 	// Release the reference to the old ckpt_proxy.
    219 	CORBA::release(old_ckpt_p);
    220 	old_ckpt_p = REPL_PXFS_VER::fs_replica::_nil();
    221 
    222 	//
    223 	// If we were a secondary, there may be an uprocessed upgrade
    224 	// callback pending.
    225 	//
    226 	if (fsp != NULL) {
    227 		if (pending_version.major_num != 0) {
    228 			pending_version.major_num = 0;
    229 
    230 			// Update the server reference.
    231 			typ = PXFS_VER::filesystem::_get_type_info(
    232 			    pxfs_vp_to_idl[current_version.major_num]
    233 				[current_version.minor_num].fs);
    234 			fs_v = fsp->get_objref(typ);
    235 
    236 			// Checkpoint current version.
    237 			_ckpt_proxy->ckpt_service_version(
    238 			    current_version.major_num,
    239 			    current_version.minor_num, _environment);
    240 		}
    241 	}
    242 
    243 	//
    244 	// If a switchover and unmount happen simultaneously, the two
    245 	// threads can race each other. The unmount thread asks the clients
    246 	// (including the secondary) to unmount and clear their respective
    247 	// vfs pointers. At this point the file system is not dead. The
    248 	// switchover can proceed and try to make the existing secondary a
    249 	// primary. This secondary's vfs has been cleared by the unmount
    250 	// and we will panic.
    251 	//
    252 	// We resolve this race using the 'fs_is_unmounted' flag. If it is
    253 	// set, we have commenced unmounting or has partly unmounted the
    254 	// filesystem. In that case we return after logging an error. The
    255 	// switchover will fail. The swithover will have to be retried.
    256 	//
    257 	if (fs_is_unmounted) {
    258 		MOUNT_DBPRINTF(
    259 		    MOUNT_TRACE_REPLICA,
    260 		    MOUNT_RED,
    261 		    ("repl_pxfs_server::become_primary %s already unmounted\n",
    262 		    (const char *)mountdata.dir)); //lint !e1776
    263 		_environment.clear();
    264 
    265 		char		nodenum[32];
    266 
    267 		(void) sprintf(nodenum, "Node (%u)", orb_conf::node_number());
    268 		os::sc_syslog_msg msg(SC_SYSLOG_GLOBAL_MOUNT_TAG,
    269 		    nodenum, NULL);
    270 		//
    271 		// SCMSGS
    272 		// @explanation
    273 		// This is an error due to a simultaneous switchover and
    274 		// unmount. The switchover fails and unmount succeeds.
    275 		// @user_action
    276 		// The switchover has to be retried.
    277 		//
    278 		msg.log(SC_SYSLOG_WARNING, MESSAGE,
    279 		    "switchover failed since the file system at %s is "
    280 		    "being unmounted.", (const char *)mountdata.dir);
    281 		version_lock.unlock();
    282 		return;
    283 	}
    284 
    285 	if (mnt_error != 0) {
    286 		ASSERT(fsp != NULL);
    287 		ASSERT(fsp->get_vfsp() == NULL);
    288 		MOUNT_DBPRINTF(
    289 		    MOUNT_TRACE_REPLICA,
    290 		    MOUNT_RED,
    291 		    ("repl_pxfs_server::become_primary %s err %d\n",
    292 		    (const char *)mountdata.dir, mnt_error));
    293 		//
    294 		// We have already tried to mount the file system and
    295 		// got an error or some other unrecoverable situation.
    296 		// Wait until we are unmounted and mounted again.
    297 		//
    298 		version_lock.unlock();
    299 		return;
    300 	}
    301 
    302 	//
    303 	// Set the MS_NOCHECK flag if this is a failover or switchover
    304 	// (i.e., the PXFS file system is already mounted). This flag is used
    305 	// to tell the underlying file system to suppress checking for other
    306 	// mounted file systems with the same device since otherwise it would
    307 	// see the PXFS proxy and think the device is already mounted.
    308 	//
    309 	if (fsp != NULL) {
    310 		//
    311 		// Check if we are being called after a failure of a call to
    312 		// become_secondary().
    313 		//
    314 		if (fsp->get_vfsp() != NULL) {
    315 			mnt_error = fsp->convert_to_primary(false);
    316 			if (mnt_error != 0) {
    317 				//
    318 				// SCMSGS
    319 				// @explanation
    320 				// The file system specified in the message
    321 				// could not be hosted on the node the message
    322 				// came from.
    323 				// @user_action
    324 				// Check /var/adm/messages to make sure there
    325 				// were no device errors. If not, contact your
    326 				// authorized Sun service provider to
    327 				// determine whether a workaround or patch is
    328 				// available.
    329 				//
    330 				(void) fsp->msg().log(SC_SYSLOG_WARNING,
    331 				    MESSAGE, "Switchover (%s) error (%d) after "
    332 				    "failure to become secondary",
    333 				    (const char *)(mountdata.dir), mnt_error);
    334 				fsp->get_checkpoint()->ckpt_mnt_error(mnt_error,
    335 				    _environment);
    336 				ASSERT(_environment.exception() == NULL);
    337 				MOUNT_DBPRINTF(
    338 				    MOUNT_TRACE_REPLICA,
    339 				    MOUNT_RED,
    340 				    ("repl_pxfs_server::become_primary %s "
    341 				    "become_secondary err %d\n",
    342 				    (const char *)mountdata.dir, mnt_error));
    343 			}
    344 			version_lock.unlock();
    345 			return;
    346 		}
    347 
    348 		mountdata.flags |= MS_NOCHECK;
    349 
    350 		//
    351 		// Run the user-level commands needed to prepare the device
    352 		// we are going to mount (does the fsck if necessary).
    353 		//
    354 		char name[20];
    355 		Environment e;
    356 		naming::naming_context_var ctxp = ns::root_nameserver();
    357 		os::sprintf(name, "ha_mounter.%d", orb_conf::node_number());
    358 		CORBA::Object_var obj = ctxp->resolve(name, e);
    359 		if (e.exception()) {
    360 			e.clear();
    361 			//
    362 			// SCMSGS
    363 			// @explanation
    364 			// The file system specified in the message could not
    365 			// be hosted on the node the message came from. Check
    366 			// to see if the user program "clexecd" is running on
    367 			// that node.
    368 			// @user_action
    369 			// Contact your authorized Sun service provider to
    370 			// determine whether a workaround or patch is
    371 			// available.
    372 			//
    373 			(void) fsp->msg().log(SC_SYSLOG_WARNING, MESSAGE,
    374 			    "Switchover error (%s): cannot "
    375 			    "find clexecd", (const char *)(mountdata.dir));
    376 			mnt_error = EINVAL;
    377 			fsp->get_checkpoint()->ckpt_mnt_error(mnt_error,
    378 			    _environment);
    379 			ASSERT(_environment.exception() == NULL);
    380 			version_lock.unlock();
    381 			return;
    382 		}
    383 		repl_pxfs::ha_mounter_var	mounter =
    384 		    repl_pxfs::ha_mounter::_narrow(obj);
    385 		ASSERT(!CORBA::is_nil(mounter));
    386 		mounter->mount(mountdata.spec, mountdata.fstype,
    387 		    fsp->get_options(), e);
    388 		if (e.exception()) {
    389 			e.clear();
    390 			//
    391 			// SCMSGS
    392 			// @explanation
    393 			// The file system specified in the message could not
    394 			// be hosted on the node the message came from because
    395 			// an fsck on the file system revealed errors.
    396 			// @user_action
    397 			// Unmount the PXFS file system (if mounted), fsck the
    398 			// device, and then mount the PXFS file system again.
    399 			//
    400 			(void) fsp->msg().log(SC_SYSLOG_WARNING, MESSAGE,
    401 			    "Switchover error (%s): failed to fsck disk",
    402 			    (const char *)(mountdata.dir));
    403 			mnt_error = EINVAL;
    404 			fsp->get_checkpoint()->ckpt_mnt_error(mnt_error,
    405 			    _environment);
    406 			ASSERT(_environment.exception() == NULL);
    407 			version_lock.unlock();
    408 			return;
    409 		}
    410 	} else {
    411 		mountdata.flags &= ~MS_NOCHECK;
    412 
    413 		//
    414 		// We turn off MS_GLOBAL, as we are mounting the underlying
    415 		// filesystem locally. With Solaris 9 build 58, Solaris
    416 		// disables mount in progress checks if MS_GLOBAL is specified.
    417 		// We have to make sure that MS_GLOBAL is turned off here, as we
    418 		// want these checks to be made here. These checks make sure
    419 		// that if a global mount and a local mount happen
    420 		// concurrently, and are trying to mount the same device,
    421 		// on different mount-points, only one of them succeeds.
    422 		//
    423 		mountdata.flags &= ~MS_GLOBAL;
    424 	}
    425 
    426 	//
    427 	// We could end up with mnt_vp == NULL in cases where this
    428 	// repl_pxfs_server was just being added to the repl_prov_list
    429 	// and a reconfig triggered by a failure could come in and
    430 	// shut this down (refer to rm_state_machine::cleanup_providers()).
    431 	//
    432 	// Hence, we re-initialize it if necessary.
    433 	//
    434 	if (mnt_vp == NULL) {
    435 		vnode_t *vp = NULL;
    436 		int	error = lookupname(((sol::mounta &)mountdata).dir,
    437 		    UIO_SYSSPACE, FOLLOW, NULL, &vp);
    438 
    439 		if (error != 0) {
    440 			MOUNT_DBPRINTF(
    441 			    MOUNT_TRACE_REPLICA,
    442 			    MOUNT_RED,
    443 			    ("repl_pxfs_server::become_primary %s err %d"
    444 			    "lookupname() failed\n",
    445 			    (const char *)mountdata.dir, error));
    446 			pxfslib::throw_exception(_environment, error);
    447 			return;
    448 		}
    449 		mnt_vp = vp;
    450 		VN_HOLD(mnt_vp);
    451 	}
    452 
    453 	dnlc_purge_vp(mnt_vp);
    454 
    455 	int datalen;
    456 
    457 #ifndef VXFS_DISABLED
    458 	if (strcmp(mountdata.fstype, "vxfs") == 0) {
    459 		datalen = vxfs_dependent_impl::vxfs_fixup_args(mountdata,
    460 		    (fsp == NULL) ? vxfs_dependent_impl::VX_MOUNT :
    461 				    vxfs_dependent_impl::VX_FAILOVER);
    462 		if (datalen == -1) {
    463 			pxfslib::throw_exception(_environment, ENOENT);
    464 			return;
    465 		}
    466 	} else {
    467 		datalen = (int)mountdata.data.length();
    468 	}
    469 #else
    470 	datalen = (int)mountdata.data.length();
    471 #endif
    472 
    473 	//
    474 	// Mount the underlying file system but don't link it into the
    475 	// name space.
    476 	//
    477 	vfs_t *vfsp = NULL;
    478 	struct mounta mnta;
    479 	char *options;
    480 	mnta.spec = mountdata.spec;
    481 	mnta.dir = mountdata.dir;
    482 	mnta.flags = mountdata.flags;
    483 	mnta.fstype = mountdata.fstype;
    484 	mnta.dataptr = (char *)mountdata.data.buffer();
    485 	mnta.datalen = datalen;
    486 	int len;
    487 	if (mnta.flags & MS_OPTIONSTR) {
    488 		len = (int)mountdata.options.length();
    489 		//lint -e571 This is ok to loose the sign in this cast.
    490 		options = new char [(size_t)len];
    491 		//lint +e571
    492 		mnta.optptr = os::strcpy(options,
    493 		    (const char *)mountdata.options.buffer());
    494 		mnta.optlen = len;
    495 		//
    496 		// Strip "global" from the options list,
    497 		// if it happens to be specified.
    498 		// This is because the underlying mount
    499 		// is a local mount.
    500 		//
    501 		(void) pxfslib::exists_mntopt(options, "global", true);
    502 	} else {
    503 		len = MAX_MNTOPT_STR;
    504 		//lint -e571 This is ok to loose the sign in this cast.
    505 		options = new char [(size_t)len];
    506 		//lint +e571
    507 		mnta.optptr = NULL;
    508 		mnta.optlen = 0;
    509 	}
    510 #ifdef _FAULT_INJECTION
    511 	void *f_argp;
    512 	uint32_t f_argsize;
    513 	if (fault_triggered(FAULTNUM_PXFS_DOMOUNT, &f_argp, &f_argsize)) {
    514 		ASSERT(f_argsize == sizeof (int));
    515 		mnt_error = *((int *)f_argp);
    516 	} else
    517 #endif
    518 		mnt_error = domount(mnta.fstype, &mnta, mnt_vp, crp, &vfsp);
    519 	if (mnt_error == 0 && (mnta.flags & MS_OPTIONSTR) == 0) {
    520 		mnt_error = vfs_buildoptionstr(&vfsp->vfs_mntopts, options,
    521 		    len);
    522 	}
    523 	if (!mnt_error) {
    524 		(void) pxfslib::exists_mntopt(options, "noglobal", true);
    525 		(void) strcat(options, ",global");	//lint !e668
    526 	}
    527 	if (mnt_error != 0) {
    528 		delete [] options;
    529 		MOUNT_DBPRINTF(
    530 		    MOUNT_TRACE_REPLICA,
    531 		    MOUNT_RED,
    532 		    ("repl_pxfs_server::become_primary %s domount err %d\n",
    533 		    (const char *)mountdata.dir, mnt_error));
    534 		//
    535 		// Create a "dead" file system object just to return
    536 		// the error from domount().
    537 		//
    538 		if (fsp == NULL) {
    539 			fsp = new fs_repl_impl(NULL, mountdata.fstype,
    540 			    mountdata.spec, NULL, this);
    541 			typ = PXFS_VER::filesystem::_get_type_info(
    542 			    pxfs_vp_to_idl[current_version.major_num]
    543 				[current_version.minor_num].fs);
    544 			fs_v = fsp->get_objref(typ);
    545 			fsp->get_checkpoint()->ckpt_new_fsobj(fs_v, NULL,
    546 			    _environment);
    547 			ASSERT(_environment.exception() == NULL);
    548 		} else {
    549 			//
    550 			// We failed while re-mounting the FS after a
    551 			// switchover/ failover.  Syslog this fact.
    552 			//
    553 
    554 			//
    555 			// SCMSGS
    556 			// @explanation
    557 			// The file system specified in the message could not
    558 			// be hosted on the node the message came from.
    559 			// @user_action
    560 			// Check /var/adm/messages to make sure there were no
    561 			// device errors. If not, contact your authorized Sun
    562 			// service provider to determine whether a workaround
    563 			// or patch is available.
    564 			//
    565 			(void) fsp->msg().log(SC_SYSLOG_WARNING, MESSAGE,
    566 			    "Switchover error (%s): failed to mount FS (%d)",
    567 			    (const char *)(mountdata.dir), mnt_error);
    568 		}
    569 		fsp->get_checkpoint()->ckpt_mnt_error(mnt_error, _environment);
    570 		ASSERT(_environment.exception() == NULL);
    571 		version_lock.unlock();
    572 		return;
    573 	}
    574 
    575 	//
    576 	// If this is the first time, we need to create the root fs object.
    577 	// Locking is handled by the replica manager framework.
    578 	//
    579 	bool firsttime;
    580 	if (fsp == NULL) {
    581 		fsp = new fs_repl_impl(vfsp, mountdata.fstype,
    582 		    mountdata.spec, options, this);
    583 		typ = PXFS_VER::filesystem::_get_type_info(
    584 		    pxfs_vp_to_idl[current_version.major_num]
    585 			[current_version.minor_num].fs);
    586 		fs_v = fsp->get_objref(typ);
    587 		fsp->get_checkpoint()->ckpt_new_fsobj(fs_v, options,
    588 		    _environment);
    589 		ASSERT(_environment.exception() == NULL);
    590 		firsttime = true;
    591 	} else {
    592 		fsp->set_vfsp(vfsp);
    593 		firsttime = false;
    594 	}
    595 	delete [] options;
    596 
    597 	mnt_error = fsp->convert_to_primary(firsttime);
    598 	if (mnt_error != 0) {
    599 		if (!firsttime) {
    600 			//
    601 			// We failed while re-mounting the FS after a
    602 			// switchover/ failover.  Syslog this fact.
    603 			//
    604 
    605 			//
    606 			// SCMSGS
    607 			// @explanation
    608 			// The file system specified in the message could not
    609 			// be hosted on the node the message came from.
    610 			// @user_action
    611 			// Check /var/adm/messages to make sure there were no
    612 			// device errors. If not, contact your authorized Sun
    613 			// service provider to determine whether a workaround
    614 			// or patch is available.
    615 			//
    616 			(void) fsp->msg().log(SC_SYSLOG_WARNING, MESSAGE,
    617 			    "Switchover (%s) error (%d) converting to primary",
    618 			    (const char *)(mountdata.dir), mnt_error);
    619 		}
    620 		MOUNT_DBPRINTF(
    621 		    MOUNT_TRACE_REPLICA,
    622 		    MOUNT_RED,
    623 		    ("repl_pxfs_server::become_primary %s convert_to_primary "
    624 		    "err %d\n", (const char *)mountdata.dir, mnt_error));
    625 		fsp->get_checkpoint()->ckpt_mnt_error(mnt_error, _environment);
    626 		ASSERT(_environment.exception() == NULL);
    627 	}
    628 	version_lock.unlock();
    629 }
    630 
    631 //
    632 // Become the secondary.
    633 // This is called on the primary in order to do a switchover.
    634 //
    635 void
    636 repl_pxfs_server::become_secondary(Environment &_environment)
    637 {
    638 	ASSERT(fsp != NULL);
    639 
    640 	MOUNT_DBPRINTF(
    641 	    MOUNT_TRACE_REPLICA,
    642 	    MOUNT_GREEN,
    643 	    ("repl_pxfs_server::become_secondary %s\n",
    644 	    (const char *)mountdata.dir));
    645 
    646 	// Show that we are not the primary
    647 	active_invo_lock.lock();
    648 	replica_state = NOT_PRIMARY;
    649 	active_invo_lock.unlock();
    650 
    651 	if (fsp->convert_to_secondary() != 0) {
    652 		//
    653 		// If there is an error converting from primary to secondary,
    654 		// we raise an exception to the HA framework notifying this
    655 		// replica can't become a secondary. The HA framework will
    656 		// call become_primary() on this replica and we will mark
    657 		// the file system as "dead". We handle errors this way
    658 		// because the HA framework doesn't support checkpoints
    659 		// in become_secondary() and Solaris 2.7 doesn't support
    660 		// forced unmount. With forced unmount, we could be sure
    661 		// the file system wasn't accessing the disk on this node
    662 		// and allow the switchover to proceed.
    663 		//
    664 		_environment.exception(new replica::become_secondary_failed);
    665 	}
    666 
    667 	ASSERT(!CORBA::is_nil(_ckpt_proxy));
    668 
    669 	// Release the reference to the multi_ckpt_handler.
    670 	version_lock.wrlock();
    671 	CORBA::release(_ckpt_proxy);
    672 	_ckpt_proxy = nil;
    673 	version_lock.unlock();
    674 }
    675 
    676 void
    677 repl_pxfs_server::add_secondary(replica::checkpoint_ptr sec_chkpt,
    678     const char *, Environment &_environment)
    679 {
    680 	REPL_PXFS_VER::fs_replica_var ckpt =
    681 		REPL_PXFS_VER::fs_replica::_narrow(sec_chkpt);
    682 	ASSERT(!CORBA::is_nil(ckpt));
    683 
    684 	MOUNT_DBPRINTF(
    685 	    MOUNT_TRACE_REPLICA,
    686 	    MOUNT_GREEN,
    687 	    ("repl_pxfs_server::add_secondary %s\n",
    688 	    (const char *)mountdata.dir));
    689 	if (fsp != NULL) {
    690 		// Create a new fs on the secondary before we dump our state.
    691 		ckpt->ckpt_new_fsobj(fs_v, fsp->get_options(), _environment);
    692 		ASSERT(_environment.exception() == NULL);
    693 
    694 		// Dump current state to the new secondary.
    695 		if (mnt_error != 0) {
    696 			ckpt->ckpt_mnt_error(mnt_error, _environment);
    697 			ASSERT(_environment.exception() == NULL);
    698 		}
    699 
    700 		fsp->dump_state(ckpt, _environment);
    701 		_environment.clear();
    702 	}
    703 }
    704 
    705 void
    706 repl_pxfs_server::remove_secondary(const char *, Environment &)
    707 {
    708 }
    709 
    710 //
    711 // This routine is called by server methods that issues invocations to
    712 // clients which result in the client(s) issuing invocations back to
    713 // the server.
    714 //
    715 // We count the invocations that we allow to proceed.
    716 //
    717 // When freezing the primary, we hold these invocations here.
    718 //
    719 // When frozen, we return the invocation with an exception.
    720 //
    721 bool
    722 repl_pxfs_server::check_freeze(Environment &env)
    723 {
    724 	active_invo_lock.lock();
    725 
    726 	switch (replica_state) {
    727 
    728 	case NOT_PRIMARY:	// This should not be possible
    729 		ASSERT(0);
    730 		// Fall through and pretend that we are the primary.
    731 
    732 	case PRIMARY:	// We are not preparing for a freeze.
    733 			// Allow the invocation to proceed
    734 
    735 		ASSERT(active_invo_count >= 0);
    736 		ASSERT(active_invo_count != INT_MAX);
    737 
    738 		active_invo_count++;
    739 		active_invo_lock.unlock();
    740 
    741 		return (false);
    742 			//
    743 	case FREEZING:	// Block any invocation that could result
    744 			// in invocations back to this primary.
    745 			// That would result in deadlock.
    746 			// The clients currently allow invocations to
    747 			// proceed to the server. So hold on to the
    748 			// invocation until the client is ready to
    749 			// block the invocation.
    750 			//
    751 		PXFS_DBPRINTF(
    752 		    PXFS_TRACE_FS,
    753 		    PXFS_AMBER,
    754 		    ("repl_pxfs_server:(%p) freezing invo from node %d\n",
    755 		    this, env.get_src_node().ndid));
    756 		while (replica_state == FREEZING) {
    757 			active_invo_cv.wait(&active_invo_lock);
    758 		}
    759 		ASSERT(replica_state == FROZEN);
    760 		// Fall through
    761 	case FROZEN:
    762 		env.system_exception(
    763 		    CORBA::PRIMARY_FROZEN(0, CORBA::COMPLETED_NO));
    764 		active_invo_lock.unlock();
    765 		return (true);
    766 
    767 	};
    768 }
    769 
    770 //
    771 // This method is called by the HA framework when the service is about to
    772 // be frozen. The freeze will proceed only when this method returns. We
    773 // take advantage of this serialization to bring our state to a stable one
    774 // by waiting for outstanding invocation from the server to complete and
    775 // preventing any new invocations from being launched.
    776 //
    777 void
    778 repl_pxfs_server::freeze_primary_prepare(Environment &)
    779 {
    780 	//
    781 	// Hold invocation count lock to prevent new invocations from
    782 	// coming in.
    783 	//
    784 	active_invo_lock.lock();
    785 	replica_state = FREEZING;
    786 
    787 	PXFS_DBPRINTF(
    788 	    PXFS_TRACE_FS,
    789 	    PXFS_AMBER,
    790 	    ("repl_pxfs_server(%p): %d outstanding nested server invocations\n",
    791 	    this, active_invo_count));
    792 
    793 	while (active_invo_count > 0) {
    794 		// Wait for active invocations to signal completion.
    795 		active_invo_cv.wait(&active_invo_lock);
    796 	}
    797 
    798 	active_invo_lock.unlock();
    799 }
    800 
    801 //
    802 // Note: either add_secondary() and unfreeze_primary() or
    803 // become_secondary() will be called after this returns.
    804 // Invocations and calls to _unreferenced() will be blocked by the HA
    805 // framework after we return from here.
    806 //
    807 // repl_pxfs_server(replica::repl_prov::freeze_primary, _environment)
    808 void
    809 repl_pxfs_server::freeze_primary(Environment &)
    810 {
    811 	MOUNT_DBPRINTF(
    812 	    MOUNT_TRACE_REPLICA,
    813 	    MOUNT_GREEN,
    814 	    ("repl_pxfs_server::freeze_primary %s\n",
    815 	    (const char *)mountdata.dir));
    816 
    817 	// Wake up any invocations that were blocked during freezing
    818 	active_invo_lock.lock();
    819 	replica_state = FROZEN;
    820 	active_invo_cv.broadcast();
    821 	active_invo_lock.unlock();
    822 
    823 	fsp->freeze_primary((const char *)mountdata.dir);
    824 }
    825 
    826 void
    827 repl_pxfs_server::unfreeze_primary(Environment &)
    828 {
    829 	MOUNT_DBPRINTF(
    830 	    MOUNT_TRACE_REPLICA,
    831 	    MOUNT_GREEN,
    832 	    ("repl_pxfs_server::unfreeze_primary %s\n",
    833 	    (const char *)mountdata.dir));
    834 
    835 	// Show that invocations are allowed
    836 	active_invo_lock.lock();
    837 	replica_state = PRIMARY;
    838 	active_invo_lock.unlock();
    839 }
    840 
    841 void
    842 repl_pxfs_server::become_spare(Environment &)
    843 {
    844 	MOUNT_DBPRINTF(
    845 	    MOUNT_TRACE_REPLICA,
    846 	    MOUNT_GREEN,
    847 	    ("repl_pxfs_server::become_spare %s\n",
    848 	    (const char *)mountdata.dir));
    849 	if (fsp != NULL) {
    850 		fs_v = PXFS_VER::filesystem::_nil();
    851 		fsp->convert_to_spare();
    852 		fsp = NULL;
    853 	}
    854 	mnt_error = 0;
    855 }
    856 
    857 //
    858 // This is called on the primary when the service is requested to shutdown.
    859 //
    860 void
    861 repl_pxfs_server::shutdown(Environment &_environment)
    862 {
    863 	// Unregister the upgrade callback.
    864 	upgrade_callback_unregister();
    865 
    866 	// Return busy if there was no unmount.
    867 	if (mnt_error == 0 && fsp != NULL && fsp->get_vfsp() != NULL) {
    868 		_environment.exception(new replica::service_busy);
    869 		return;
    870 	}
    871 	fs_v = PXFS_VER::filesystem::_nil();
    872 	if (mnt_vp != NULL) {
    873 		VN_RELE(mnt_vp);
    874 		mnt_vp = NULL;
    875 	}
    876 }
    877 
    878 //
    879 // This is called on the primary when the service is forced to shutdown.
    880 //
    881 // Virtual function forced_shutdown in base class generic_repl_server
    882 // returns a Solaris error code (if forced shutdown not supported by
    883 // specific HA service), else returns 0(Success)
    884 //
    885 uint32_t
    886 repl_pxfs_server::forced_shutdown(Environment &)
    887 {
    888 	// Unregister the upgrade callback.
    889 	upgrade_callback_unregister();
    890 
    891 	fs_v = PXFS_VER::filesystem::_nil();
    892 	if (mnt_vp != NULL) {
    893 		VN_RELE(mnt_vp);
    894 		mnt_vp = NULL;
    895 	}
    896 	return (0);
    897 }
    898 
    899 //
    900 // This is called on a spare when the service is requested to shutdown.
    901 //
    902 void
    903 repl_pxfs_server::shutdown_spare(replica::repl_prov_shutdown_type,
    904     Environment &)
    905 {
    906 	fs_v = PXFS_VER::filesystem::_nil();
    907 	if (mnt_vp != NULL) {
    908 		VN_RELE(mnt_vp);
    909 		mnt_vp = NULL;
    910 	}
    911 }
    912 
    913 CORBA::Object_ptr
    914 repl_pxfs_server::get_root_obj(Environment &)
    915 {
    916 	ASSERT(fsp != NULL);
    917 	version_lock.wrlock();
    918 	CORBA::type_info_t *typ = PXFS_VER::filesystem::_get_type_info(
    919 	    pxfs_vp_to_idl[current_version.major_num]
    920 		[current_version.minor_num].fs);
    921 	version_lock.unlock();
    922 	return (fsp->get_objref(typ));
    923 }
    924 
    925 //
    926 // Set the initial version number.
    927 //
    928 void
    929 repl_pxfs_server::set_version(const version_manager::vp_version_t &v)
    930 {
    931 	//
    932 	// We could have had a callback between the time that we called
    933 	// register_upgrade_callbacks() and the time that it returned and
    934 	// set tmp_version. The callback may have set a newer version than
    935 	// v, so don't clobber it.
    936 	//
    937 	version_lock.wrlock();
    938 	if (current_version.major_num < v.major_num ||
    939 	    (current_version.major_num == v.major_num &&
    940 	    current_version.minor_num < v.minor_num)) {
    941 		current_version = v;
    942 	}
    943 	version_lock.unlock();
    944 }
    945 
    946 void
    947 repl_pxfs_server::upgrade_callback_register(const sol::mounta &ma)
    948 {
    949 	char *service_name = os::strdup(ma.spec);
    950 	char unique_callback_name[1024];
    951 	char unique_str[32];
    952 	char *vpname = "pxfs";
    953 	version_manager::vp_version_t callback_limit;
    954 	version_manager::vp_version_t cur_version;
    955 	Environment e;
    956 
    957 	MOUNT_DBPRINTF(
    958 	    MOUNT_TRACE_REPLICA,
    959 	    MOUNT_GREEN,
    960 	    ("repl_pxfs_server::upgrade_callback_register: this = %p "
    961 	    "mountpnt = %s\n", this, (const char *)ma.dir));
    962 
    963 	//
    964 	// Have the filesystem replica register with the Version Manager
    965 	// for upgrade callbacks.  Pass a string which can be use to
    966 	// build a unique callback name.  This will eliminate races when
    967 	// unmounting and then mounting the same filesystem.
    968 	//
    969 	os::atomic_add_32((uint32_t *)&unique_integer, 1);
    970 	(void) os::itoa(unique_integer, unique_str, 10);
    971 
    972 	// Generate for callback registration
    973 	os::sprintf(unique_callback_name, "%s", service_name);
    974 	os::sprintf(unique_callback_name + os::strlen(service_name), "%s",
    975 	    ":");
    976 	os::sprintf(unique_callback_name + os::strlen(service_name) +1, "%s",
    977 	    unique_str);
    978 
    979 	// Get a pointer to the local version manager
    980 	version_manager::vm_admin_var vmgr_v = vm_util::get_vm(NODEID_UNKNOWN);
    981 
    982 	// Build a UCC for support of version upgrade callbacks
    983 	version_manager::ucc_seq_t ucc_seq(1, 1);
    984 	ucc_seq[0].ucc_name = os::strdup(unique_callback_name);
    985 	ucc_seq[0].vp_name = os::strdup(vpname);
    986 	version_manager::string_seq_t fseq(1, 1);
    987 	fseq[0] = os::strdup(service_name);
    988 	ucc_seq[0].freeze = fseq;
    989 
    990 	//
    991 	// Create a version upgrade callback object for the fs replica
    992 	//
    993 	replica::repl_prov_var repl_srvr_v = generic_repl_server::_narrow(
    994 	    this->generic_repl_server::get_objref());
    995 	repl_srvr_v->_handler()->set_cookie((void *)this);
    996 	callback_object_v = (new fs_version_callback_impl(
    997 	    replica::repl_prov::_duplicate(
    998 		repl_srvr_v)))->get_objref();
    999 
   1000 	MOUNT_DBPRINTF(
   1001 	    MOUNT_TRACE_REPLICA,
   1002 	    MOUNT_GREEN,
   1003 	    ("repl_pxfs_server::upgrade_callback_reg: this = %p\n", this));
   1004 
   1005 	//
   1006 	// Register the callback object with the Version Manager. The
   1007 	// tmp_version will be returned.  The version lock is not held
   1008 	// since the replica is not yet registered with the HA framework
   1009 	// so there cannot be a call to become_primary.
   1010 	//
   1011 	// If the running version is less than the callback_limit
   1012 	// a callback will be registerd, otherwise a callback is not
   1013 	// registered (currently no way to tell).  The current running
   1014 	// version is returned regardless.
   1015 	//
   1016 	callback_limit.major_num = 2;
   1017 	callback_limit.minor_num = 1;
   1018 	vmgr_v->register_upgrade_callbacks(ucc_seq, callback_object_v,
   1019 	    callback_limit, cur_version, e);
   1020 	if (e.exception()) {
   1021 		callback_object_v = version_manager::upgrade_callback::_nil();
   1022 		e.exception()->print_exception("register_upgrade_callbacks:");
   1023 		e.clear();
   1024 		return;
   1025 	}
   1026 	MOUNT_DBPRINTF(
   1027 	    MOUNT_TRACE_REPLICA,
   1028 	    MOUNT_GREEN,
   1029 	    ("repl_pxfs_server::upgrade_callback_reg after reg: this = %p\n",
   1030 	    this));
   1031 
   1032 	// establish the current version in the replica
   1033 	set_version(cur_version);
   1034 }
   1035 
   1036 void
   1037 repl_pxfs_server::upgrade_callback_unregister()
   1038 {
   1039 	Environment e;
   1040 
   1041 	MOUNT_DBPRINTF(
   1042 	    MOUNT_TRACE_REPLICA,
   1043 	    MOUNT_GREEN,
   1044 	    ("repl_pxfs_server::upgrade_callback_unregister: this = %p\n",
   1045 	    this));
   1046 
   1047 	version_lock.wrlock();
   1048 	if (!CORBA::is_nil(callback_object_v)) {
   1049 		version_manager::vm_admin_var vmgr_v =
   1050 		    vm_util::get_vm(NODEID_UNKNOWN);
   1051 		vmgr_v->unregister_upgrade_callbacks(
   1052 		    callback_object_v, e);
   1053 		if (e.exception()) {
   1054 			ASSERT(0);
   1055 			e.clear();
   1056 		}
   1057 		callback_object_v = version_manager::upgrade_callback::_nil();
   1058 	}
   1059 	version_lock.unlock();
   1060 }
   1061 
   1062 //
   1063 // Process an upgrade callback from the version manager.
   1064 // This call can happen before the HA replica is registered with the
   1065 // replica manager (i.e., the HA sevice is not started yet or is a spare),
   1066 // or as primary or secondary. The callback is not synchronized with
   1067 // calls to become_primary(), become_secondary(), etc. and failovers can
   1068 // happen while the service is frozen so we need to make sure that the
   1069 // "upgrade work" is done if there are node failures.
   1070 // If this replica is the primary, we do the work and send a checkpoint
   1071 // to indicate the work is complete. If this replica is a secondary and
   1072 // the callback happens before we get the checkpoint from the primary,
   1073 // we record that the callback happened so that become_primary() can
   1074 // do the "upgrade work" and send the checkpoint. If this replica is a
   1075 // secondary and the callback happens after the checkpoint is received,
   1076 // we ignore the callback. The checkpoint routine on the secondary clears
   1077 // the "flag" that the callback sets so no extra work is done in
   1078 // become_primary() if the old primary fails after completing the upgrade
   1079 // callback work.
   1080 //
   1081 void
   1082 repl_pxfs_server::upgrade_callback(const version_manager::vp_version_t &v,
   1083     Environment &e)
   1084 {
   1085 	CORBA::type_info_t *typ;
   1086 
   1087 	MOUNT_DBPRINTF(
   1088 	    MOUNT_TRACE_REPLICA,
   1089 	    MOUNT_GREEN,
   1090 	    ("repl_pxfs_server::upgrade_callback: this = %p "
   1091 	    "fsp = %p\n", this, fsp));
   1092 
   1093 	//
   1094 	// A nil callback object indicates that a callback unregister was
   1095 	// done.
   1096 	//
   1097 	if (CORBA::is_nil(callback_object_v)) {
   1098 		MOUNT_DBPRINTF(
   1099 		    MOUNT_TRACE_REPLICA,
   1100 		    MOUNT_GREEN,
   1101 		    ("repl_pxfs_server::upgrade_callback: nil cb object\n"));
   1102 		return;
   1103 	}
   1104 
   1105 
   1106 	//
   1107 	// Note that upgrade callbacks are not synchronized with
   1108 	// calls to become_primary(), add_secondary(), etc.
   1109 	// Getting this lock makes sure the replica state doesn't change.
   1110 	//
   1111 	version_lock.wrlock();
   1112 	if (current_version.major_num > v.major_num ||
   1113 	    (current_version.major_num == v.major_num &&
   1114 	    current_version.minor_num >= v.minor_num)) {
   1115 		// Version isn't changing so just return.
   1116 		version_lock.unlock();
   1117 		callback_object_v = version_manager::upgrade_callback::_nil();
   1118 		return;
   1119 	}
   1120 
   1121 	//
   1122 	// If this replica is not the primary, set the pending version
   1123 	// to be used if there is a failover before the primary checkpoints
   1124 	// a new version.
   1125 	//
   1126 	if (CORBA::is_nil(_ckpt_proxy)) {
   1127 		// If this replica is a secondary, set the pending version.
   1128 		if (fsp != NULL) {
   1129 			pending_version = v;
   1130 		} else {
   1131 			// Replica must be a spare.
   1132 			current_version = v;
   1133 		}
   1134 		version_lock.unlock();
   1135 		callback_object_v = version_manager::upgrade_callback::_nil();
   1136 		MOUNT_DBPRINTF(
   1137 		    MOUNT_TRACE_REPLICA,
   1138 		    MOUNT_GREEN,
   1139 		    ("repl_pxfs_server::upgrade_callback: not primary\n"));
   1140 		return;
   1141 	}
   1142 
   1143 	//
   1144 	// Check for primary with a "dead" filesystem
   1145 	//
   1146 	if (fsp == NULL) {
   1147 		version_lock.unlock();
   1148 		callback_object_v = version_manager::upgrade_callback::_nil();
   1149 		MOUNT_DBPRINTF(
   1150 		    MOUNT_TRACE_REPLICA,
   1151 		    MOUNT_GREEN,
   1152 		    ("repl_pxfs_server::upgrade_callback: NULL fsp\n"));
   1153 		return;
   1154 	}
   1155 
   1156 	// We are the primary and the version has changed.
   1157 	current_version = v;
   1158 	MOUNT_DBPRINTF(
   1159 	    MOUNT_TRACE_REPLICA,
   1160 	    MOUNT_GREEN,
   1161 	    ("repl_pxfs_server::upgrade_callback: this = %p "
   1162 	    "new version ->  major =  %d minor = %d\n", this,
   1163 	    current_version.major_num, current_version.minor_num));
   1164 
   1165 	//
   1166 	// Switch the checkpoint interface to the new protocol. Save
   1167 	// the current _ckpt_proxy and release it after we get a new one
   1168 	//
   1169 	REPL_PXFS_VER::fs_replica_ptr old_ckpt_p = _ckpt_proxy;
   1170 	typ = REPL_PXFS_VER::fs_replica::_get_type_info(
   1171 	    pxfs_vp_to_idl[current_version.major_num]
   1172 		[current_version.minor_num].fs_ckpt);
   1173 
   1174 	replica::checkpoint_var tmp_ckpt_v = set_checkpoint(typ);
   1175 	_ckpt_proxy = REPL_PXFS_VER::fs_replica::_narrow(tmp_ckpt_v);
   1176 
   1177 	ASSERT(!CORBA::is_nil(_ckpt_proxy));
   1178 
   1179 	// Release the reference to the old ckpt_proxy.
   1180 	CORBA::release(old_ckpt_p);
   1181 	old_ckpt_p = REPL_PXFS_VER::fs_replica::_nil();
   1182 
   1183 	// Update the server reference.
   1184 	typ = PXFS_VER::filesystem::_get_type_info(
   1185 	    pxfs_vp_to_idl[current_version.major_num]
   1186 		[current_version.minor_num].fs);
   1187 	fs_v = fsp->get_objref(typ);
   1188 
   1189 	//
   1190 	// Create and add a primary context so the provider can send
   1191 	// checkpoints while the service is frozen.
   1192 	// XXX change the primary_ctx::invoke_env type.
   1193 	//
   1194 	primary_ctx ctx(NULL, primary_ctx::ADD_SECONDARY_CKPT, e);
   1195 
   1196 	// Checkpoint the current version number.
   1197 	_ckpt_proxy->ckpt_service_version(
   1198 	    current_version.major_num, current_version.minor_num, e);
   1199 
   1200 	e.trans_ctxp = NULL;
   1201 
   1202 	//
   1203 	// This reference isn't needed any more since we won't need
   1204 	// to do an unregister of the callback object.
   1205 	//
   1206 	callback_object_v = version_manager::upgrade_callback::_nil();
   1207 
   1208 	version_lock.unlock();
   1209 }
   1210 
   1211 //
   1212 // Checkpoint the creation of a new filesystem (fs_obj).
   1213 //
   1214 void
   1215 repl_pxfs_server::ckpt_new_fsobj(PXFS_VER::filesystem_ptr fs_obj,
   1216     const char *mntoptions, Environment &)
   1217 {
   1218 	// Create the shadow fs object.
   1219 	if (fsp == NULL) {
   1220 		fsp = new fs_repl_impl(mountdata.fstype, mountdata.spec,
   1221 		    mntoptions, this, fs_obj);
   1222 		fs_v = PXFS_VER::filesystem::_duplicate(fs_obj);
   1223 	}
   1224 }
   1225 
   1226 //
   1227 // Checkpoint a new service version
   1228 //
   1229 void
   1230 repl_pxfs_server::ckpt_service_version(unsigned short new_major,
   1231     unsigned short new_minor, Environment &)
   1232 {
   1233 	version_lock.wrlock();
   1234 	current_version.major_num = new_major;
   1235 	current_version.minor_num = new_minor;
   1236 	pending_version.major_num = 0;
   1237 	version_lock.unlock();
   1238 }
   1239 
   1240 //
   1241 // Checkpoint a failure in become_primary() or become_secondary().
   1242 //
   1243 void
   1244 repl_pxfs_server::ckpt_mnt_error(sol::error_t error, Environment &)
   1245 {
   1246 	ASSERT(fsp != NULL);
   1247 	mnt_error = error;
   1248 }
   1249 
   1250 //
   1251 // Checkpoint the creation of a new fobj which has the specified
   1252 // fid (used for switchover for primary) and type.
   1253 //
   1254 void
   1255 repl_pxfs_server::ckpt_new_fobj(PXFS_VER::fobj_ptr obj,
   1256     const PXFS_VER::fobjid_t &fobjid,
   1257     PXFS_VER::fobj_type_t type,
   1258     Environment &)
   1259 {
   1260 	ASSERT(fsp != NULL);
   1261 	fsp->ckpt_new_fobj(obj, fobjid, type);
   1262 }
   1263 
   1264 //
   1265 // Checkpoint the creation of a new fsmgr_server object for
   1266 // detecting client crashes.
   1267 //
   1268 void
   1269 repl_pxfs_server::ckpt_new_fsmgr(PXFS_VER::fsmgr_server_ptr servermgr,
   1270     PXFS_VER::fsmgr_client_ptr clientmgr, sol::nodeid_t nodeid, Environment &)
   1271 {
   1272 	ASSERT(fsp != NULL);
   1273 	fsp->ckpt_new_fsmgr(servermgr, clientmgr, nodeid);
   1274 }
   1275 
   1276 //
   1277 // Update the mount arguments after a remount (see fs_ii::remount()).
   1278 // We assume that the VFS_MOUNT() (with MS_REMOUNT set) contains the
   1279 // complete mount information and doesn't depend on the history of previous
   1280 // of previous calls to VFS_MOUNT(). If this isn't true, we would need to
   1281 // save a list of all parameters and replay all the VFS_MOUNT() calls
   1282 // on failover.
   1283 //
   1284 void
   1285 repl_pxfs_server::set_mountargs(const sol::mounta &ma)
   1286 {
   1287 	mountdata = ma;
   1288 	//
   1289 	// We set MS_NOSPLICE so that the underlying file system isn't
   1290 	// linked into the file system name space.
   1291 	// XXX We also force the MS_OVERLAY flag on to suppress the
   1292 	// mvp->v_count == 1 EBUSY check in the file system code.
   1293 	// We clear MS_REMOUNT since we will be doing a mount instead
   1294 	// of a remount after a failover.
   1295 	//
   1296 	mountdata.flags = MS_NOSPLICE | MS_OVERLAY | (ma.flags & ~MS_REMOUNT);
   1297 }
   1298 
   1299 //
   1300 // Checkpoint blocks allocated by the server.
   1301 //
   1302 void
   1303 repl_pxfs_server::ckpt_blocks_allocated(
   1304     const repl_pxfs_v1::blocks_allocated_t &current_allocations,
   1305     PXFS_VER::blkcnt_t blocks_free_cnt, Environment &)
   1306 {
   1307 	ASSERT(fsp != NULL);
   1308 	fsp->ckpt_blocks_allocated(current_allocations, blocks_free_cnt);
   1309 }
   1310 
   1311 //
   1312 // Checkpoint the server status to secondary.
   1313 //
   1314 void
   1315 repl_pxfs_server::ckpt_server_status(PXFS_VER::server_status_t status,
   1316     Environment &)
   1317 {
   1318 	ASSERT(fsp != NULL);
   1319 	fsp->ckpt_server_status(status);
   1320 }
   1321 
   1322 //
   1323 // Checkpoint the changes to mount arguments and options due to a remount.
   1324 //
   1325 void
   1326 repl_pxfs_server::ckpt_remount(const sol::mounta &ma, const char *mntoptions,
   1327     Environment &)
   1328 {
   1329 	set_mountargs(ma);
   1330 	fsp->set_options(mntoptions);
   1331 }
   1332 
   1333 //
   1334 // Checkpoint a change in the active locks of a file.
   1335 //
   1336 void
   1337 repl_pxfs_server::ckpt_locks(PXFS_VER::fobj_ptr obj,
   1338     const REPL_PXFS_VER::lock_info_seq_t &locks, Environment &)
   1339 {
   1340 	fobj_ii *fobj_iip = fobj_ii::get_fobj_ii(obj);
   1341 	fobj_iip->ckpt_locks(locks);
   1342 }
   1343 
   1344 //
   1345 // Checkpoint the existence of a file entry under the current mini-transaction.
   1346 // This method is invoked on the secondary by the primary.
   1347 // This state can be used later to determine if the primary completed an
   1348 // operation or not.
   1349 //
   1350 void
   1351 repl_pxfs_server::ckpt_entry_state(bool exists, Environment &_environment)
   1352 {
   1353 	transaction_state *state = new unixdir_state(exists);
   1354 	state->register_state(_environment);
   1355 	if (_environment.exception()) {
   1356 		MOUNT_DBPRINTF(
   1357 		    MOUNT_TRACE_REPLICA,
   1358 		    MOUNT_RED,
   1359 		    ("repl_pxfs_server::ckpt_entry_state: client died\n"));
   1360 		_environment.clear();
   1361 	}
   1362 }
   1363 
   1364 //
   1365 // Checkpoint the existence of a file to be locked
   1366 // prior to operating on it.  This used with mini-transactions
   1367 // to handle failure mid-way through an operation.
   1368 //
   1369 void
   1370 repl_pxfs_server::ckpt_target(PXFS_VER::fobj_ptr obj,
   1371     const PXFS_VER::fobj_info &fobjinfo, Environment &_environment)
   1372 {
   1373 	transaction_state	*state = new unixdir_state(obj, fobjinfo);
   1374 	state->register_state(_environment);
   1375 
   1376 	// If the client dies, there is nothing to clean up.
   1377 	_environment.clear();
   1378 }
   1379 
   1380 //
   1381 // Checkpoint the existence of a file to be locked
   1382 // prior to removing it.  This used with mini-transactions
   1383 // to handle failure mid-way through an operation.
   1384 //
   1385 void
   1386 repl_pxfs_server::ckpt_target_remove(PXFS_VER::fobj_ptr obj,
   1387     const PXFS_VER::fobj_info &fobjinfo, uint64_t delete_id,
   1388     Environment &_environment)
   1389 {
   1390 	transaction_state	*state = new unixdir_state(obj,
   1391 				    fobjinfo, delete_id);
   1392 
   1393 	state->register_state(_environment);
   1394 
   1395 	// If the client dies, there is nothing to clean up.
   1396 	_environment.clear();
   1397 }
   1398 
   1399 //
   1400 // Checkpoint an object return under the current mini-transaction.
   1401 // This return value is used to return the results of a stale operation.
   1402 //
   1403 void
   1404 repl_pxfs_server::ckpt_fobj_return(PXFS_VER::fobj_ptr ret_obj,
   1405     const PXFS_VER::fobj_info &ret_info, Environment &_environment)
   1406 {
   1407 	secondary_ctx *ctxp = secondary_ctx::extract_from(_environment);
   1408 	unixdir_state *saved_state =
   1409 	    (unixdir_state *)ctxp->get_saved_state();
   1410 	//
   1411 	// ckpt_entry_state() or ckpt_target() should have been called first
   1412 	// so there should always be a transaction state object for this
   1413 	// checkpoint operation.
   1414 	//
   1415 	ASSERT(saved_state != NULL);
   1416 	saved_state->ckpt_fobj_return(ret_obj, ret_info);
   1417 }
   1418 
   1419 //
   1420 // Checkpoint an error return under the current mini-transaction.
   1421 // This error return is used to return the results of a stale operation.
   1422 //
   1423 void
   1424 repl_pxfs_server::ckpt_error_return(sol::error_t error,
   1425     Environment &_environment)
   1426 {
   1427 	secondary_ctx *ctxp = secondary_ctx::extract_from(_environment);
   1428 	unixdir_state *saved_state =
   1429 	    (unixdir_state *)ctxp->get_saved_state();
   1430 	//
   1431 	// If ckpt_entry_state() nor ckpt_target() has not been called first,
   1432 	// this is both the start and commit for this operation.
   1433 	//
   1434 	if (saved_state == NULL) {
   1435 		transaction_state *state = new unixdir_state(error);
   1436 		state->register_state(_environment);
   1437 		// If the client dies, there is nothing to clean up.
   1438 		_environment.clear();
   1439 	} else {
   1440 		saved_state->ckpt_error_return(error);
   1441 	}
   1442 }
   1443 
   1444 //
   1445 // Checkpoint the deletion of an fobj.
   1446 //
   1447 void
   1448 repl_pxfs_server::ckpt_delete_fobj(uint64_t delete_id,
   1449     Environment &_environment)
   1450 {
   1451 	secondary_ctx *ctxp = secondary_ctx::extract_from(_environment);
   1452 	unixdir_state *saved_state =
   1453 	    (unixdir_state *)ctxp->get_saved_state();
   1454 	//
   1455 	// ckpt_entry_state() or ckpt_target() should have been called first
   1456 	// so there should always be a transaction state object for this
   1457 	// checkpoint operation.
   1458 	//
   1459 	ASSERT(saved_state != NULL);
   1460 	saved_state->ckpt_delete_fobj(delete_id);
   1461 }
   1462 
   1463 //
   1464 // ckpt_deletecnt - updates the secondary file system with the number
   1465 // used by the primary for renames of deleted files.
   1466 //
   1467 void
   1468 repl_pxfs_server::ckpt_deletecnt(uint64_t delete_id, Environment &)
   1469 {
   1470 	fsp->set_deletecnt(delete_id);
   1471 }
   1472 
   1473 //
   1474 // This checkpoint is used by a primary to bring a secondary up to date on
   1475 // filesystem locking state.
   1476 //
   1477 void
   1478 repl_pxfs_server::ckpt_lockfs_info(uint64_t lf_lock, uint64_t lf_flags,
   1479     uint64_t lf_key, const char *lf_comment, Environment &)
   1480 {
   1481 	ASSERT(fsp != NULL);
   1482 	fsp->get_fs_dep_implp()->ckpt_lockfs_state(lf_lock, lf_flags, lf_key,
   1483 	    lf_comment);
   1484 }
   1485 
   1486 //
   1487 // Checkpoint the beginning of a lockfs call
   1488 //
   1489 void
   1490 repl_pxfs_server::ckpt_lockfs_start(uint64_t lf_lock, uint64_t lf_flags,
   1491     uint64_t lf_key, const char *lf_comment, Environment &_environment)
   1492 {
   1493 	ASSERT(fsp != NULL);
   1494 	fobj_lockfs_state::register_new_state(fsp, lf_lock, lf_flags, lf_key,
   1495 	    lf_comment, _environment);
   1496 }
   1497 
   1498 //
   1499 // Checkpoint failure of a lockfs call.
   1500 //
   1501 void
   1502 repl_pxfs_server::ckpt_lockfs_failure(sol::error_t err,
   1503     Environment &_environment)
   1504 {
   1505 	ASSERT(fsp != NULL);
   1506 	fobj_lockfs_state::report_failure(err, _environment);
   1507 }
   1508 
   1509 void
   1510 repl_pxfs_server::ckpt_cachedata_flag(PXFS_VER::file_ptr file_p, bool flag,
   1511     Environment &)
   1512 {
   1513 	file_ii	*file_iip = (file_ii *)fobj_ii::get_fobj_ii(file_p);
   1514 	file_iip->ckpt_cachedata_flag(flag);
   1515 }
   1516 
   1517 //
   1518 // Checkpoint the state of a file
   1519 // (this is used to dump state to new secondaries).
   1520 //
   1521 void
   1522 repl_pxfs_server::ckpt_fobj_state(PXFS_VER::fobj_ptr obj, uint64_t delete_id,
   1523     Environment &)
   1524 {
   1525 	fobj_ii	*fobj_iip = fobj_ii::get_fobj_ii(obj);
   1526 	fobj_iip->ckpt_fobj_state(delete_id);
   1527 }
   1528 
   1529 //
   1530 // Checkpoint tunefs parameters for VxFS.
   1531 //
   1532 void
   1533 repl_pxfs_server::ckpt_vx_tunefs(const REPL_PXFS_VER::vx_tunefs_t &tunefs,
   1534     Environment &)
   1535 {
   1536 	ASSERT(fsp != NULL);
   1537 	fsp->get_fs_dep_implp()->ckpt_vx_tunefs(tunefs);
   1538 }
   1539 
   1540 void
   1541 repl_pxfs_server::ckpt_remove_file_locks_by_sysid(int32_t sysid,
   1542     Environment &)
   1543 {
   1544 	ASSERT(fsp != NULL);
   1545 	fsp->ckpt_remove_file_locks_by_sysid(sysid);
   1546 }
   1547 
   1548 void
   1549 repl_pxfs_server::ckpt_remove_file_locks_by_nlmid(int32_t nlmid,
   1550     Environment &)
   1551 {
   1552 	ASSERT(fsp != NULL);
   1553 	fsp->ckpt_remove_file_locks_by_nlmid(nlmid);
   1554 }
   1555 
   1556 void
   1557 repl_pxfs_server::ckpt_fs_is_unmounted(Environment &)
   1558 {
   1559 	fs_is_unmounted = true;
   1560 }
   1561 
   1562 fs_version_callback_impl::fs_version_callback_impl(
   1563     replica::repl_prov_ptr replica_p) :
   1564 	prov_v(replica_p)
   1565 {
   1566 }
   1567 
   1568 fs_version_callback_impl::~fs_version_callback_impl()
   1569 {
   1570 }
   1571 
   1572 void
   1573 fs_version_callback_impl::_unreferenced(unref_t arg)
   1574 {
   1575 	if (!_last_unref(arg)) {
   1576 		// _last_unref() should always be true since we don't use 0->1.
   1577 		ASSERT(0);
   1578 		return;
   1579 	}
   1580 	delete this;
   1581 }
   1582 
   1583 
   1584 // Call the provider to update the version and checkpoint it.
   1585 void
   1586 fs_version_callback_impl::do_callback(const char *,
   1587     const version_manager::vp_version_t &current_version, Environment &e)
   1588 {
   1589 	MOUNT_DBPRINTF(
   1590 	    MOUNT_TRACE_REPLICA,
   1591 	    MOUNT_GREEN,
   1592 	    ("fs_version_callback_impl::do_callback: this = %p\n", this));
   1593 
   1594 	// Call the provider to update the version and checkpoint it.
   1595 	void *p = prov_v->_handler()->get_cookie();
   1596 	((repl_pxfs_server *)p)->upgrade_callback(current_version, e);
   1597 }
   1598