Home | History | Annotate | Download | only in mount
      1 //
      2 // CDDL HEADER START
      3 //
      4 // The contents of this file are subject to the terms of the
      5 // Common Development and Distribution License (the License).
      6 // You may not use this file except in compliance with the License.
      7 //
      8 // You can obtain a copy of the license at usr/src/CDDL.txt
      9 // or http://www.opensolaris.org/os/licensing.
     10 // See the License for the specific language governing permissions
     11 // and limitations under the License.
     12 //
     13 // When distributing Covered Code, include this CDDL HEADER in each
     14 // file and include the License file at usr/src/CDDL.txt.
     15 // If applicable, add the following below this CDDL HEADER, with the
     16 // fields enclosed by brackets [] replaced with your own identifying
     17 // information: Portions Copyright [yyyy] [name of copyright owner]
     18 //
     19 // CDDL HEADER END
     20 //
     21 
     22 //
     23 // Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24 // Use is subject to license terms.
     25 //
     26 
     27 #pragma ident	"@(#)mount_server_impl.cc	1.83	08/05/20 SMI"
     28 
     29 #include <sys/errno.h>
     30 
     31 #include <sys/types.h>
     32 #include <sys/thread.h>
     33 #include <sys/file.h>
     34 #include <sys/mount.h>
     35 #include <sys/pathname.h>
     36 #include <sys/sysmacros.h>
     37 
     38 #include <sys/os.h>
     39 #include <sys/sol_conv.h>
     40 #include <solobj/solobj_impl.h>
     41 
     42 #include <pxfs/mount/mount_server_impl.h>
     43 #include <pxfs/mount/mount_replica_impl.h>
     44 #include <pxfs/mount/mount_debug.h>
     45 #include <pxfs/server/fobj_impl.h>
     46 
//
// Number of seconds a devlock request waits before it times out.
// This is an undocumented tunable.
//
int pxfs_devlock_timeout = 5;
     52 
//
// VP to IDL interface version mapping for the Mount subsystem.
// Only declared (extern) here; the definition is not in this part of
// the file.
//
extern mount_ver_map_t
    mount_vp_to_idl[MOUNT_VP_MAX_MAJOR +1][MOUNT_VP_MAX_MINOR +1];
     56 
     57 //lint -e1512
     58 //
     59 // Warning(1512) destructor for base class is not virtual -- In a
     60 // final pass through all the classes, we have found a class that is
     61 // the base class of a derivation and has a destructor but the
     62 // destructor is not virtual. It is conventional for inherited classes
// to have virtual destructors so that it is safe to 'delete' a
     64 // pointer to a base class.
     65 //
// The classes prov_common_iter and prov_common_set in the file
     67 // prov_common.h have to be changed to have virtual destructors.
     68 //
     69 
     70 //
     71 // fs_elem methods
     72 //
     73 
     74 fs_elem::fs_elem(pxfs_v1::filesystem_ptr fs_p, const pxfs_v1::fs_info &finfo,
     75     const sol::mounta &md, const char *options, bool is_ha, const char *name,
     76     const sol::nodeid_seq_t &nids) :
     77 	_DList::ListElem(this),
     78 	fs_v1_info(finfo),
     79 	ma(md),
     80 	dev_is_ha(is_ha),
     81 	dev_nids(nids),
     82 	fs_elem_ver(VERSION_1)
     83 {
     84 	ASSERT(!CORBA::is_nil(fs_p));
     85 	fs_v1_ptr = pxfs_v1::filesystem::_duplicate(fs_p);
     86 
     87 	// Make a copy, don't use String_var constructor.
     88 	mntoptions = options;
     89 	dev_name = name;
     90 }
     91 
     92 fs_elem::~fs_elem()
     93 {
     94 }
     95 
     96 //
     97 // devlock_elem methods
     98 //
     99 
    100 //
    101 // Constructor for "device lock".
    102 //
    103 devlock_elem::devlock_elem(fs::mount_client_ptr c, sol::nodeid_t nodeid,
    104     const char *name) :
    105 	_SList::ListElem(this)
    106 {
    107 	owner = fs::mount_client::_duplicate(c);
    108 	spec = os::strdup(name);
    109 	nwaiters = 0;
    110 	ownerid = nodeid;
    111 	unlocked = false;
    112 }
    113 
    114 devlock_elem::~devlock_elem()
    115 {
    116 	delete [] spec;
    117 }
    118 
    119 //
    120 // mount_client_elem methods
    121 //
    122 
    123 //
    124 // Create a list element to save the mount_client reference and nodeid.
    125 // Primary constructor.
    126 //
    127 mount_client_elem::mount_client_elem(mount_server_impl *server,
    128     fs::mount_client_ptr client, sol::nodeid_t nid,
    129     mount_replica_impl *replp) :
    130 	mc_replica_of<fs::mount_client_died>(replp),
    131 	_SList::ListElem(this)
    132 {
    133 	serverp = server;
    134 	clientptr = fs::mount_client::_duplicate(client);
    135 	nodeid = nid;
    136 	shutdown = false;
    137 	MOUNT_DBPRINTF(
    138 	    MOUNT_TRACE_SERVER,
    139 	    MOUNT_GREEN,
    140 	    ("mount_client_elem newP %p\n", this));
    141 }
    142 
    143 //
    144 // Create a list element to save the mount_client reference and nodeid.
    145 // Secondary constructor.
    146 //
    147 mount_client_elem::mount_client_elem(mount_server_impl *server,
    148     fs::mount_client_ptr client, sol::nodeid_t nid, bool is_shutdown,
    149     fs::mount_client_died_ptr obj) :
    150 	mc_replica_of<fs::mount_client_died>(obj),
    151 	_SList::ListElem(this)
    152 {
    153 	serverp = server;
    154 	clientptr = fs::mount_client::_duplicate(client);
    155 	nodeid = nid;
    156 	shutdown = is_shutdown;
    157 	MOUNT_DBPRINTF(
    158 	    MOUNT_TRACE_SERVER,
    159 	    MOUNT_GREEN,
    160 	    ("mount_client_elem newS %p s %d\n",
    161 	    this, shutdown));
    162 }
    163 
    164 mount_client_elem::~mount_client_elem()
    165 {
    166 	MOUNT_DBPRINTF(
    167 	    MOUNT_TRACE_SERVER,
    168 	    MOUNT_GREEN,
    169 	    ("~mount_client_elem %p\n", this));
    170 	serverp = NULL; // for lint
    171 }
    172 
    173 //
    174 // This is called if the mount client crashes or is halted.
    175 //
    176 void
    177 mount_client_elem::_unreferenced(unref_t arg)
    178 {
    179 	if (!_last_unref(arg)) {
    180 		// _last_unref() should always be true since we don't use 0->1.
    181 		ASSERT(0);
    182 		return;
    183 	}
    184 	if (!CORBA::is_nil(clientptr)) {
    185 		//
    186 		// Note: we don't delete ourself here
    187 		// (see mount_server_impl::client_died()).
    188 		//
    189 		serverp->client_died(this);
    190 	} else {
    191 		//
    192 		// mount_server::remove_client() has already removed
    193 		// this from the list so just delete ourself.
    194 		//
    195 		delete this;
    196 	}
    197 }
    198 
    199 //
    200 // mount_state methods
    201 //
    202 
    203 //
    204 // Constructor for mount_state.
    205 //
    206 mount_state::mount_state(const sol::mounta &md, fs::mount_client_ptr client_p,
    207     mount_server_impl &srvr, mount_ver_t ver) :
    208 	ma(md),
    209 	error(0),
    210 	is_remount(false),
    211 	mount_ver(ver),
    212 	server(srvr)
    213 {
    214 	ASSERT(!CORBA::is_nil(client_p));
    215 	mountpoint_lock_c = fs::mount_client::_duplicate(client_p);
    216 	fs_v1_info.fsflag = 0; // for lint
    217 }
    218 
    219 mount_state::~mount_state()
    220 {
    221 }
    222 
    223 //
    224 // The client has crashed so there will be no retry of the mount().
    225 // Unlock the mount points we have already locked.
    226 //
    227 void
    228 mount_state::orphaned(Environment &env)
    229 {
    230 	(void) server.mount_orphaned(this, true, env);
    231 }
    232 
    233 //
    234 // This is called on the secondary to complete the mount transaction.
    235 //
    236 void
    237 mount_state::committed()
    238 {
    239 	mountpoint_lock_c = fs::mount_client::_nil();
    240 }
    241 
    242 //
    243 // unmount_state methods
    244 //
    245 
    246 //
    247 // Constructor for unmount_state for recording unmounts during remove_client().
    248 // Note that client can be nil.
    249 //
    250 unmount_state::unmount_state(pxfs_v1::filesystem_ptr fs_p,
    251     int32_t umflags,
    252     solobj::cred_ptr cred_p,
    253     mount_server_impl &srvr,
    254     fs::mount_client_ptr client_p) :
    255 	flags(umflags),
    256 	state(START),
    257 	error(0),
    258 	unmount_ver(VERSION_1),
    259 	server(srvr)
    260 {
    261 	ASSERT(!CORBA::is_nil(fs_p));
    262 	fs_v1_obj = pxfs_v1::filesystem::_duplicate(fs_p);
    263 
    264 	skip = fs::mount_client::_duplicate(client_p);
    265 	credobj = solobj::cred::_duplicate(cred_p);
    266 }
    267 
    268 unmount_state::~unmount_state()
    269 {
    270 }
    271 
    272 //
    273 // This is called on the (new) primary after a failover of the mount_server
    274 // and a crash of the mount_client node.
    275 //
    276 void
    277 unmount_state::orphaned(Environment &e)
    278 {
    279 	server.unmount_orphaned(this, false, e);
    280 }
    281 
    282 //
    283 // This is called on the secondary when add_commit() is called on the primary.
    284 //
    285 void
    286 unmount_state::committed()
    287 {
    288 	ASSERT(CORBA::is_nil(fs_v1_obj));
    289 }
    290 
    291 //
    292 // dc_callback_impl methods
    293 //
    294 
    295 //
    296 // Primary constructor.
    297 //
    298 dc_callback_impl::dc_callback_impl(mount_server_impl &srvr,
    299     mount_replica_impl *serverp) :
    300 	mc_replica_of<fs::dc_callback>(serverp),
    301 	server(srvr)
    302 {
    303 	// LINTED: Call to virtual function within a contructor or destructor.
    304 	_handler()->set_cookie((void *)this);
    305 }
    306 
    307 //
    308 // Secondary constructor.
    309 //
    310 dc_callback_impl::dc_callback_impl(mount_server_impl &srvr,
    311     fs::dc_callback_ptr obj) :
    312 	mc_replica_of<fs::dc_callback>(obj),
    313 	server(srvr)
    314 {
    315 	// LINTED: Call to virtual function within a contructor or destructor.
    316 	_handler()->set_cookie((void *)this);
    317 }
    318 
    319 dc_callback_impl::~dc_callback_impl()
    320 {
    321 }
    322 
    323 void
    324 dc_callback_impl::_unreferenced(unref_t arg)
    325 {
    326 	if (!_last_unref(arg)) {
    327 		// _last_unref() should always be true since we don't use 0->1.
    328 		ASSERT(0);
    329 		return;
    330 	}
    331 	delete this;
    332 }
    333 
    334 // dc_callback_impl(fs::dc_callback::notify_change)
    335 void
    336 dc_callback_impl::notify_change(sol::dev_t gdev,
    337     const sol::nodeid_seq_t &nodes, Environment &_environment)
    338 {
    339 	server.notify_change(gdev, nodes, _environment);
    340 }
    341 
    342 // dc_callback_impl(fs::dc_callback::still_active, _environment)
    343 bool
    344 dc_callback_impl::still_active(sol::dev_t gdev, Environment &)
    345 {
    346 	return (server.still_active(gdev));
    347 }
    348 
    349 //
    350 // mount_server_impl methods
    351 //
    352 
    353 //
    354 // Return a CORBA pointer (no CORBA::release() required)
    355 // to the checkpoint interface.
    356 //
    357 repl_pxfs::mount_replica_ptr
    358 mount_server_impl::get_checkpoint()
    359 {
    360 	return ((mount_replica_impl*)(get_provider()))->
    361 	    get_checkpoint_mount_replica();
    362 }
    363 
    364 //
    365 // Primary constructor.
    366 //
    367 mount_server_impl::mount_server_impl(mount_replica_impl *serverp) :
    368 	mc_replica_of<fs::mount_server>(serverp)
    369 {
    370 	repl_serverp = serverp;
    371 	primary = true;
    372 	frozen = false;
    373 	currentmnt = (char *)NULL;
    374 }
    375 
    376 //
    377 // Secondary constructor.
    378 //
    379 mount_server_impl::mount_server_impl(mount_replica_impl *serverp,
    380     fs::mount_server_ptr obj) :
    381 	mc_replica_of<fs::mount_server>(obj)
    382 {
    383 	repl_serverp = serverp;
    384 	primary = false;
    385 	frozen = false;
    386 	currentmnt = (char *)NULL;
    387 }
    388 
    389 //
    390 // Destructor.
    391 //
    392 mount_server_impl::~mount_server_impl()
    393 {
    394 	ASSERT(client_list.empty());
    395 	ASSERT(fs_list.empty());
    396 }
    397 
    398 void
    399 mount_server_impl::_unreferenced(unref_t arg)
    400 {
    401 	if (!_last_unref(arg)) {
    402 		// _last_unref() should always be true since we don't use 0->1.
    403 		ASSERT(0);
    404 		return;
    405 	}
    406 	//
    407 	// XXX Should wait for all _unreferenced() from
    408 	// mount_client_elem and dc_callback_impl but since this service
    409 	// is never shut down, _unreferenced() should never get called.
    410 	//
    411 	MOUNT_DBPRINTF(
    412 	    MOUNT_TRACE_SERVER,
    413 	    MOUNT_GREEN,
    414 	    ("server:_unreferenced\n"));
    415 	delete this;
    416 }
    417 
    418 //
    419 // This is called  to get a new reference.  Doing get_objref() here would
    420 // get the highest reference version that was compiled.  We want the
    421 // highest reference which is currently committed so we use this indirect
    422 // way.
    423 //
    424 void
    425 mount_server_impl::_generic_method(CORBA::octet_seq_t &,
    426     CORBA::object_seq_t &objs, Environment &e)
    427 {
    428 	objs[0] = repl_serverp->get_root_obj(e);
    429 }
    430 
    431 
    432 //
    433 // Called from mount_replica_impl when switching to primary.
    434 //
    435 void
    436 mount_server_impl::convert_to_primary()
    437 {
    438 	MOUNT_DBPRINTF(
    439 	    MOUNT_TRACE_SERVER,
    440 	    MOUNT_GREEN,
    441 	    ("server: primary\n"));
    442 
    443 	primary = true;
    444 
    445 #ifdef DEBUG
    446 	//
    447 	// There should be no threads waiting for a device lock
    448 	// (see comment for devunlock()).
    449 	//
    450 	devlock_elem	*dep;
    451 	for (devlock_list.atfirst();
    452 	    (dep = devlock_list.get_current()) != NULL;
    453 	    devlock_list.advance()) {
    454 		ASSERT(dep->nwaiters == 0);
    455 	}
    456 #endif
    457 }
    458 
    459 //
    460 // Called from mount_replica_impl when switching to secondary.
    461 //
    462 void
    463 mount_server_impl::convert_to_secondary()
    464 {
    465 	MOUNT_DBPRINTF(
    466 	    MOUNT_TRACE_SERVER,
    467 	    MOUNT_GREEN,
    468 	    ("server: secondary\n"));
    469 
    470 	primary = false;
    471 
    472 #ifdef DEBUG
    473 	//
    474 	// There should be no threads waiting for a device lock
    475 	// (see comment for devunlock()).
    476 	//
    477 	devlock_elem	*dep;
    478 	for (devlock_list.atfirst();
    479 	    (dep = devlock_list.get_current()) != NULL;
    480 	    devlock_list.advance()) {
    481 		ASSERT(dep->nwaiters == 0);
    482 	}
    483 #endif
    484 }
    485 
    486 //
    487 // Called from mount_replica_impl when switching to spare.
    488 //
    489 void
    490 mount_server_impl::convert_to_spare()
    491 {
    492 	MOUNT_DBPRINTF(
    493 	    MOUNT_TRACE_SERVER,
    494 	    MOUNT_GREEN,
    495 	    ("server: spare\n"));
    496 
    497 	if (!CORBA::is_nil(dc_callback_obj)) {
    498 		dc_callback_impl	*dc_callbackp = (dc_callback_impl *)
    499 		    dc_callback_obj->_handler()->get_cookie();
    500 		dc_callback_obj = fs::dc_callback::_nil();
    501 		delete dc_callbackp;
    502 	}
    503 
    504 	client_list.dispose();
    505 	fs_list.dispose();
    506 	devlock_list.dispose();
    507 	delete this;
    508 }
    509 
    510 //
    511 // This is called if a mount_client dies
    512 // (called from mount_client_elem::_unreferenced()).
    513 //
    514 void
    515 mount_server_impl::client_died(mount_client_elem *cep)
    516 {
    517 	devlock_elem	*dep;
    518 
    519 	MOUNT_DBPRINTF(
    520 	    MOUNT_TRACE_SERVER,
    521 	    MOUNT_GREEN,
    522 	    ("server:client_died: %p nid %d\n",
    523 	    (void *)cep, cep->nodeid));
    524 
    525 	if (!primary) {
    526 		//
    527 		// _unreferenced() and checkpoints are synchronized on
    528 		// the secondary so we don't need to lock the list.
    529 		//
    530 		// Remove any locks the client held.
    531 		//
    532 		devlock_list.atfirst();
    533 		while ((dep = devlock_list.get_current()) != NULL) {
    534 			devlock_list.advance();
    535 			if (cep->clientptr->_equiv(dep->owner)) {
    536 				MOUNT_DBPRINTF(
    537 				    MOUNT_TRACE_SERVER,
    538 				    MOUNT_AMBER,
    539 				    ("server:client_died "
    540 				    "unlock %p waiters %d\n",
    541 				    dep, dep->nwaiters));
    542 				(void) devlock_list.erase(dep);
    543 				delete dep;
    544 			}
    545 		}
    546 		(void) client_list.erase(cep);
    547 		delete cep;
    548 		return;
    549 	}
    550 
    551 	FAULTPT_PXFS(FAULTNUM_PXFS_CLIENT_DIED, FaultFunctions::generic);
    552 
    553 	//
    554 	// Remove any locks the client held.
    555 	//
    556 	devlock_list_lock.wrlock();
    557 	devlock_list.atfirst();
    558 	while ((dep = devlock_list.get_current()) != NULL) {
    559 		devlock_list.advance();
    560 		if (cep->clientptr->_equiv(dep->owner)) {
    561 			//
    562 			// No need to checkpoint this since the
    563 			// secondary will get _unreferenced() too.
    564 			//
    565 			MOUNT_DBPRINTF(
    566 			    MOUNT_TRACE_SERVER,
    567 			    MOUNT_AMBER,
    568 			    ("server:client_died unlock %p waiters %d\n",
    569 			    dep, dep->nwaiters));
    570 			(void) devlock_list.erase(dep);
    571 
    572 			dep->waiter_lock.lock();
    573 			if (dep->nwaiters != 0) {
    574 				// This wakes up all waiting threads.
    575 				dep->unlocked = true;
    576 				dep->waiter_cv.broadcast();
    577 				dep->waiter_lock.unlock();
    578 
    579 				// The last waiter will do the delete.
    580 			} else {
    581 				dep->waiter_lock.unlock();
    582 				delete dep;
    583 			}
    584 		}
    585 	}
    586 	devlock_list_lock.unlock();
    587 
    588 	//
    589 	// Note: there is no checkpoint since the secondary gets
    590 	// _unreferenced() when we delete the object.
    591 	//
    592 	client_list_lock.wrlock();
    593 	(void) client_list.erase(cep);
    594 	delete cep;
    595 	client_list_lock.unlock();
    596 }
    597 
    598 //
    599 // Upgrade mount_client references during Rolling Upgrade commit.
    600 //
    601 void
    602 mount_server_impl::upgrade_client_reference(Environment &_environment)
    603 {
    604 	CORBA::octet_seq_t	data;
    605 	CORBA::object_seq_t	objs(1, 1);
    606 	Environment		e;
    607 	mount_client_elem	*cep;
    608 	devlock_elem		*dep;
    609 
    610 	//
    611 	// Create a primary context so the provider can send
    612 	// checkpoints while the service is frozen.
    613 	// XXX change the primary_ctx::invoke_env type.
    614 	//
    615 	primary_ctx ctx(NULL, primary_ctx::ADD_SECONDARY_CKPT,
    616 	    _environment);
    617 
    618 	// Update the client reference in client list.
    619 	client_list_lock.wrlock();
    620 
    621 	client_list.atfirst();
    622 	while ((cep = client_list.get_current()) != NULL) {
    623 		client_list.advance();
    624 		cep->clientptr->_generic_method(data, objs, e);
    625 		if (e.exception()) {
    626 			MOUNT_DBPRINTF(
    627 			    MOUNT_TRACE_SERVER,
    628 			    MOUNT_RED,
    629 			    ("server:upgrade_client_reference"
    630 			    "exception when upgrading client reference"
    631 			    "%p in client list on %d errno %d\n",
    632 			    (void *)cep, cep->nodeid,
    633 			    e.exception()->exception_enum()));
    634 			e.clear();
    635 		} else {
    636 			cep->clientptr = fs::mount_client::_narrow(objs[0]);
    637 
    638 			//
    639 			// Send a checkpoint to the secondaries telling
    640 			// them to use the new version mount_client object
    641 			// reference.
    642 			//
    643 			get_checkpoint()->
    644 			    ckpt_upgrade_client_list(cep->clientptr,
    645 			    cep->nodeid, _environment);
    646 			ASSERT(_environment.exception() == NULL);
    647 
    648 			//
    649 			// Upgrade the mount_client to use the new version
    650 			// of the mount_server.
    651 			//
    652 			cep->clientptr->upgrade_mount_client(get_objref(), e);
    653 
    654 			if (e.exception()) {
    655 				MOUNT_DBPRINTF(
    656 				    MOUNT_TRACE_SERVER,
    657 				    MOUNT_RED,
    658 				    ("server:upgrade_client_reference"
    659 				    "exception when upgrading server reference"
    660 				    "for %p on %d error %d\n",
    661 				    (void *)cep, cep->nodeid,
    662 				    e.exception()->exception_enum()));
    663 				e.clear();
    664 			}
    665 		}
    666 	}
    667 
    668 	client_list_lock.unlock();
    669 
    670 	// Update the client reference in device lock list.
    671 	devlock_list_lock.wrlock();
    672 
    673 	devlock_list.atfirst();
    674 	while ((dep = devlock_list.get_current()) != NULL) {
    675 		devlock_list.advance();
    676 		dep->owner->_generic_method(data, objs, e);
    677 		if (e.exception()) {
    678 			MOUNT_DBPRINTF(
    679 			    MOUNT_TRACE_SERVER,
    680 			    MOUNT_RED,
    681 			    ("server:upgrade_client_reference"
    682 			    "exception when upgrading client reference"
    683 			    "%p in devlock list on %d errno %d\n",
    684 			    (void *)dep, dep->ownerid,
    685 			    e.exception()->exception_enum()));
    686 			e.clear();
    687 		} else {
    688 			dep->owner = fs::mount_client::_narrow(objs[0]);
    689 
    690 			//
    691 			// Send a checkpoint to the secondaries telling
    692 			// them to use the new version mount_client object
    693 			// reference.
    694 			//
    695 			get_checkpoint()->
    696 			    ckpt_upgrade_devlock_list(dep->spec, dep->owner,
    697 			    _environment);
    698 			ASSERT(_environment.exception() == NULL);
    699 
    700 			//
    701 			// Upgrade the mount_client to use the new version
    702 			// of the mount_server. The mount_client may have
    703 			// already been upgraded. This invocation is
    704 			// idempotent. So that is not a problem.
    705 			//
    706 			dep->owner->upgrade_mount_client(get_objref(), e);
    707 
    708 			if (e.exception()) {
    709 				MOUNT_DBPRINTF(
    710 				    MOUNT_TRACE_SERVER,
    711 				    MOUNT_RED,
    712 				    ("server:upgrade_client_reference"
    713 				    "exception when upgrading server reference"
    714 				    "for %p on %d error %d\n",
    715 				    (void *)dep, dep->ownerid,
    716 				    e.exception()->exception_enum()));
    717 				e.clear();
    718 			}
    719 		}
    720 	}
    721 
    722 	devlock_list_lock.unlock();
    723 
    724 	_environment.trans_ctxp = NULL;
    725 }
    726 
//
// Add a client to the list of mount_server clients, replaying the extant set
// of global mounts to bring it into consistency with the rest of the cluster.
// This operation should be idempotent since it can be retried on a new primary.
//
// Outline:
//   1. If the invocation carries saved unmount state, finish that unmount.
//   2. If the client is already in the list (a retry), hand back the
//	existing reference and stop.
//   3. Walk the file system list from the tail and try to unmount the
//	file systems the joining node is expected to mount locally.
//   4. Replay every remaining mount on the client in list order.  If one
//	replay fails, undo the earlier ones in reverse order and raise
//	a mount_err exception.
//   5. Put the new element on the head of the client list and checkpoint it.
//
// Parameters:
//   client_p	  - reference to the joining mount client
//   nodeid	  - node id of the joining client
//   clobj	  - out: reference returned so a crash of the client can be
//		    detected; set to nil when a replayed mount fails
//   _environment - carries any saved transaction state in, and the
//		    mount_err exception out when a replayed mount fails
//
// Locking: client_list_lock is write-held from the list search until the
// 'done' label; fs_list_lock is write-held during steps 3 and 4.
//
void
mount_server_impl::add_client(fs::mount_client_ptr client_p,
    sol::nodeid_t nodeid,
    fs::mount_client_died_out clobj, Environment &_environment)
{
	//
	// The new element is allocated up front.  It is deleted at 'done'
	// unless it has been handed over to the client list (ncep == NULL).
	//
	mount_client_elem		*ncep = new mount_client_elem(this,
					    client_p, nodeid, repl_serverp);
	mount_client_elem		*cep;
	fs::mount_client_died_ptr	clobjp;
	fs_elem				*fep;
	fs_elem				*ofep;
	Environment			e;
	sol::error_t			err;
	bool				need_fs_status;
	bool				attempt_unmount;
	uint32_t			i;
	solobj::cred_var		credobj = solobj_impl::conv(kcred);

	// Check to see if we have saved state.
	primary_ctx	*ctxp = primary_ctx::extract_from(_environment);
	unmount_state	*statep;
	if (ctxp != NULL &&
	    (statep = (unmount_state *)ctxp->get_saved_state()) != NULL) {
		//
		// Since we have saved state, we know the original
		// primary sent the ckpt_unmount_start() checkpoint.
		// We finish the unmount process from where we left off.
		//
		unmount_orphaned(statep, false, _environment);
	}

	client_list_lock.wrlock();

	//
	// Check to see if the mount client we are adding is already in
	// the list. If it is there, it means this is a retry after
	// a mount server failover. Since we got the checkpoint, we
	// know that the mounts have been replayed on the client.
	// Note that we may have an old entry for the same node
	// in the list until mount_client_elem::_unreferenced()
	// is processed (which is why we search the list by object
	// reference rather than nodeid).
	//
	if ((cep = find_client(client_p)) != NULL) {
		MOUNT_DBPRINTF(
		    MOUNT_TRACE_SERVER,
		    MOUNT_GREEN,
		    ("server:add_client found %p\n", cep));
		clobj = cep->get_objref();
		goto done;
	}

	fs_list_lock.wrlock();

	//
	// If the joining node has a direct connection to the device for
	// a filesystem which is currently globally mounted, but that
	// filesystem is not currently mounted locally (NOT_AVAILABLE),
	// then we attempt to unmount that filesystem (anticipating that
	// it will be mounted locally by the joining node).
	//
	// An HA filesystem is considered AVAILABLE if there is a
	// node with a primary or secondary filesystem replica.
	//
	// A non-HA filesystem is considered AVAILABLE if the node with
	// the connection to the device is already running in the cluster.
	// So if the joining node has a direct connection we attempt an
	// unmount.
	//
	for (fs_list.atlast(); (fep = fs_list.get_current()) != NULL; ) {
		//
		// Move the 'current' pointer in the list away from 'fep',
		// so that the fs_list.erase() in 'unmount_common' does not
		// move the pointer.
		//
		fs_list.retreat();

		// Check for joining node being connected to the device.
		need_fs_status = false;
		attempt_unmount = false;
		if (fep->dev_is_ha) {
			for (i = 0; i < fep->dev_nids.length(); i++) {
				if (nodeid == fep->dev_nids[i]) {
					need_fs_status = true;
					break;
				}
			}
		} else {
			// Only the first node id is compared in the non-HA case.
			if (nodeid == fep->dev_nids[0])
				attempt_unmount = true;
		}

		if (attempt_unmount || (need_fs_status &&
		    (get_fs_status(fep) == NOT_AVAILABLE))) {
			MOUNT_DBPRINTF(
			    MOUNT_TRACE_SERVER,
			    MOUNT_AMBER,
			    ("server:add_client unmount %s\n",
			    (const char *)fep->ma.dir));

			//
			// We try to unmount as many file systems as possible
			// but don't return an error at the end so the client
			// isn't removed from the global name space.
			//
			ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1);
			//
			// At this point we will not do a forced
			// unmount. So the flags are empty.
			//
			get_checkpoint()->ckpt_unmount_start_v1(
			    fep->fs_v1_ptr, 0, credobj,
			    fs::mount_client::_nil(), false,
			    _environment);
			ASSERT(_environment.exception() == NULL);
			err = unmount_common_1(fs::mount_client::_nil(),
			    fep, 0, credobj, unmount_state::START, 0,
			    NULL, _environment);
			if (err != 0) {
				// A failed unmount is only traced, not returned.
				MOUNT_DBPRINTF(
				    MOUNT_TRACE_SERVER,
				    MOUNT_RED,
				    ("server:add_client unmount %s err %d\n",
				    (const char *)fep->ma.dir, err));
			}
		}
	}

	//
	// Replay the extant set of mounts on the new client.
	// If any replayed mount fails, return an exception.
	// Note that we count on the list being properly ordered, so that
	// mount dependencies are respected.
	//
	for (fs_list.atfirst();
	    (fep = fs_list.get_current()) != NULL;
	    fs_list.advance()) {
		//
		// Compute whether an HA replica needs to be started.
		//
		bool	is_ha_repl = false;
		if (fep->dev_is_ha) {
			for (i = 0; i < fep->dev_nids.length(); i++) {
				if (nodeid == fep->dev_nids[i]) {
					is_ha_repl = true;
				}
			}
		}

		MOUNT_DBPRINTF(
		    MOUNT_TRACE_SERVER,
		    MOUNT_GREEN,
		    ("server:add_client add_notify %s ha %d\n",
		    (const char *)fep->ma.dir, is_ha_repl));

		ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1);
		client_p->add_notify_v1(fep->ma, fep->mntoptions,
		    is_ha_repl, fep->dev_name,
		    fep->fs_v1_ptr, fep->fs_v1_info, e);
		if (e.exception() == NULL) {
			continue;
		}

		//
		// We have to undo the work done so far.
		// The list of filesystems is traversed in reverse order.
		//
		sol::error_t	error = pxfslib::get_err(e);
		e.clear();
		MOUNT_DBPRINTF(
		    MOUNT_TRACE_SERVER,
		    MOUNT_RED,
		    ("server:add_client add_notify %s error %d\n",
		    (const char *)fep->ma.dir, error));
		// Remember the element that failed for the exception below.
		ofep = fep;
		while (fs_list.retreat(),
		    (fep = fs_list.get_current()) != NULL) {
			//
			// Recompute whether an HA replica was started.
			//
			is_ha_repl = false;
			if (fep->dev_is_ha) {
				for (i = 0; i < fep->dev_nids.length(); i++) {
					if (nodeid == fep->dev_nids[i]) {
						is_ha_repl = true;
					}
				}
			}

			MOUNT_DBPRINTF(
			    MOUNT_TRACE_SERVER,
			    MOUNT_GREEN,
			    ("server:add_client remove_client %s ha %d\n",
			    (const char *)fep->ma.dir, is_ha_repl));
			ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1);
			client_p->remove_client_v1(fep->ma.dir,
			    fep->ma.spec, is_ha_repl, fep->dev_name,
			    fep->fs_v1_ptr, credobj, e);
			if (e.exception()) {
				//
				// We added it but can't remove it. The
				// exception should be node died
				// or EBUSY for remove_client problems.  The
				// latter will be resolved with the
				// implementation of forced unmount.
				//
				if (CORBA::COMM_FAILURE::_exnarrow(
				    e.exception())) {
					MOUNT_DBPRINTF(
					    MOUNT_TRACE_SERVER,
					    MOUNT_RED,
					    ("server:add_client "
					    "exception when removing "
					    "%s on %d COMM\n",
					    (const char *)fep->ma.dir,
					    ncep->nodeid));
				} else {
					err = pxfslib::get_err(e);
					MOUNT_DBPRINTF(
					    MOUNT_TRACE_SERVER,
					    MOUNT_RED,
					    ("server:add_client "
					    "exception when removing "
					    "%s on %d errno %d\n",
					    (const char *)fep->ma.dir,
					    ncep->nodeid, err));
				}
				e.clear();
			}
		}

		//
		// Propagate the exception back to our caller.
		// The client list lock is released at 'done'.
		//
		fs_list_lock.unlock();
		_environment.exception(new mount_err(error, ofep->ma.dir));
		clobj = fs::mount_client_died::_nil();
		goto done;
	}

	fs_list_lock.unlock();

	//
	// Checkpoint this operation so that the secondary can
	// create a mount_client_elem too (see ckpt_add_client() below).
	// Everything else is OK to repeat.
	// Note: we insert onto the head of the list so the most recent
	// entry is the valid entry (a stale entry could exist until
	// mount_client_elem::_unreferenced() is processed).
	//
	MOUNT_DBPRINTF(
	    MOUNT_TRACE_SERVER,
	    MOUNT_GREEN,
	    ("server:add_client %p nid %d\n",
	    ncep, ncep->nodeid));
	client_list.prepend(ncep);
	// The list now holds the element; clear ncep so 'done' keeps it.
	cep = ncep;
	ncep = NULL;
	clobjp = cep->get_objref();
	get_checkpoint()->ckpt_add_client(clobjp, client_p, nodeid, false,
	    _environment);
	ASSERT(_environment.exception() == NULL);

	//
	// Return a reference to the client so we can detect if it crashes.
	//
	clobj = clobjp;

done:
	client_list_lock.unlock();

	// Still non-NULL only when the element never went onto the list.
	if (ncep != NULL) {
		delete ncep;
	}
}
   1007 
   1008 //
   1009 // Add a client to the list of mount_server clients.
   1010 // Helper function for checkpointing state on a secondary.
   1011 //
   1012 void
   1013 mount_server_impl::ckpt_add_client(fs::mount_client_died_ptr clobj,
   1014     fs::mount_client_ptr client_p, sol::nodeid_t nodeid, bool shutdown)
   1015 {
   1016 	//
   1017 	// Check to see if the mount client we are adding is already in
   1018 	// the list.
   1019 	//
   1020 	if (find_client(client_p) != NULL) {
   1021 		return;
   1022 	}
   1023 
   1024 	mount_client_elem	*cep = new mount_client_elem(this, client_p,
   1025 	    nodeid, shutdown, clobj);
   1026 	client_list.prepend(cep);
   1027 	MOUNT_DBPRINTF(
   1028 	    MOUNT_TRACE_SERVER,
   1029 	    MOUNT_GREEN,
   1030 	    ("server:ckpt_add_client %p nid %d shut %d\n",
   1031 	    cep, cep->nodeid, cep->shutdown));
   1032 }
   1033 
   1034 //
   1035 // This method is never called.
   1036 //
   1037 void
   1038 mount_server_impl::remove_client(fs::mount_client_ptr,
   1039     solobj::cred_ptr, Environment &)
   1040 {
   1041 }
   1042 
   1043 //
   1044 // Remove a client from the list of mount_server clients.
   1045 // Helper function for checkpointing state on a secondary.
   1046 //
   1047 void
   1048 mount_server_impl::ckpt_remove_client(fs::mount_client_ptr client_p)
   1049 {
   1050 	mount_client_elem	*cep;
   1051 
   1052 	if ((cep = find_client(client_p)) != NULL) {
   1053 		//
   1054 		// We found the guy we're looking for.
   1055 		//
   1056 		(void) client_list.erase(cep);
   1057 		cep->clientptr = fs::mount_client::_nil();
   1058 	}
   1059 }
   1060 
   1061 //
   1062 // Create a proxy file system, link it into the global name space,
   1063 // and unlock the mount point on all other nodes.
   1064 // This operation should be idempotent since it can be retried on a new primary.
   1065 //
   1066 void
   1067 mount_server_impl::mount(const sol::mounta &ma, sol::uintptr_t mvp,
   1068     solobj::cred_ptr credobj, fs::mount_client_ptr client_p, bool dev_is_ha,
   1069     const char *dev_name, const sol::nodeid_seq_t &dev_nids,
   1070     fs::filesystem_out fs_obj, fs::fs_info &fsinfo,
   1071     CORBA::String_out mntoptions, Environment &_environment)
   1072 {
   1073 	CL_PANIC(0);
   1074 }
   1075 
   1076 //
   1077 // Create a proxy file system, link it into the global name space,
   1078 // and unlock the mount point on all other nodes.
   1079 // This operation should be idempotent since it can be retried on a new primary.
   1080 //
   1081 void
   1082 mount_server_impl::mount_v1(const sol::mounta &ma, sol::uintptr_t mvp,
   1083     solobj::cred_ptr credobj, fs::mount_client_ptr client_p, bool dev_is_ha,
   1084     const char *dev_name, const sol::nodeid_seq_t &dev_nids,
   1085     pxfs_v1::filesystem_out fs_obj, pxfs_v1::fs_info &fsinfo,
   1086     CORBA::String_out mntoptions, Environment &_environment)
   1087 {
   1088 	ASSERT(ma.flags & MS_SYSSPACE);
   1089 
   1090 	// Check to see if we have saved state.
   1091 	primary_ctx	*ctxp = primary_ctx::extract_from(_environment);
   1092 	mount_state	*statep;
   1093 	if (ctxp != NULL &&
   1094 	    (statep = (mount_state *)ctxp->get_saved_state()) != NULL) {
   1095 		//
   1096 		// This is a retry on a new primary after a failover.
   1097 		// If the previous mount() was committed, we are done.
   1098 		//
   1099 		ASSERT(statep->mount_ver == mount_state::VERSION_1);
   1100 		if (mount_orphaned(statep, false, _environment)) {
   1101 			if (_environment.exception() == NULL) {
   1102 				// Return values from the saved state.
   1103 				ASSERT(!CORBA::is_nil(statep->fs_v1_ptr));
   1104 				fs_obj = pxfs_v1::filesystem::_duplicate(
   1105 				    statep->fs_v1_ptr);
   1106 				fsinfo = statep->fs_v1_info;
   1107 				mntoptions = os::strdup(statep->mntoptions);
   1108 			} else {
   1109 				fs_obj = pxfs_v1::filesystem::_nil();
   1110 				mntoptions = (char *)NULL;
   1111 			}
   1112 			return;
   1113 		}
   1114 
   1115 		//
   1116 		// Since we have saved state, we know the original
   1117 		// primary sent the start checkpoint
   1118 		// so we don't need to do it again here.
   1119 		//
   1120 		ASSERT(statep->mountpoint_lock_c->_equiv(client_p));
   1121 
   1122 		client_list_lock.wrlock();
   1123 	} else {
   1124 		//
   1125 		// This is the start of a new mount, not a retry.
   1126 		// Checkpoint the start of locking the mount points so
   1127 		// we can clean up if both the client and primary fail.
   1128 		//
   1129 		client_list_lock.wrlock();
   1130 		get_checkpoint()->ckpt_mount_start_v1(ma, client_p,
   1131 		    _environment);
   1132 		if (_environment.exception()) {
   1133 			client_list_lock.unlock();
   1134 			fs_obj = pxfs_v1::filesystem::_nil();
   1135 			mntoptions = (char *)NULL;
   1136 			MOUNT_DBPRINTF(
   1137 			    MOUNT_TRACE_SERVER,
   1138 			    MOUNT_RED,
   1139 			    ("server:mount ckpt failed\n"));
   1140 			_environment.clear();
   1141 			pxfslib::throw_exception(_environment, EIO);
   1142 			return;
   1143 		}
   1144 	}
   1145 
   1146 	MOUNT_DBPRINTF(
   1147 	    MOUNT_TRACE_SERVER,
   1148 	    MOUNT_GREEN,
   1149 	    ("server:mount %s %s c %p\n",
   1150 	    (const char *)ma.spec, (const char *)ma.dir, (void *)client_p));
   1151 
   1152 	//
   1153 	// Check to see if the special device is already mounted.
   1154 	//
   1155 	fs_elem			*fep;
   1156 	sol::error_t		error;
   1157 	mount_client_elem	*cep;
   1158 	Environment		e;
   1159 
   1160 	fs_list_lock.wrlock();
   1161 	if ((const char *)ma.spec != NULL) {
   1162 		//
   1163 		// Remove the device lock entry.
   1164 		// Note that a failed mount unlocks the device too.
   1165 		//
   1166 		devlock_list_lock.wrlock();
   1167 		devlock_elem	*dep = find_devlock(ma.spec);
   1168 		if (dep != NULL) {
   1169 			MOUNT_DBPRINTF(
   1170 			    MOUNT_TRACE_SERVER,
   1171 			    MOUNT_AMBER,
   1172 			    ("server:mount unlock %p waiters %d\n",
   1173 			    dep, dep->nwaiters));
   1174 			(void) devlock_list.erase(dep);
   1175 			get_checkpoint()->ckpt_devunlock(ma.spec, _environment);
   1176 			_environment.clear();
   1177 
   1178 			dep->waiter_lock.lock();
   1179 			if (dep->nwaiters != 0) {
   1180 				// This wakes up all waiting threads.
   1181 				dep->unlocked = true;
   1182 				dep->waiter_cv.broadcast();
   1183 				dep->waiter_lock.unlock();
   1184 
   1185 				// The last waiter will do the delete.
   1186 			} else {
   1187 				dep->waiter_lock.unlock();
   1188 				delete dep;
   1189 			}
   1190 		}
   1191 		devlock_list_lock.unlock();
   1192 
   1193 		if ((fep = find_fs(ma.spec)) != NULL) {
   1194 			MOUNT_DBPRINTF(
   1195 			    MOUNT_TRACE_SERVER,
   1196 			    MOUNT_RED,
   1197 			    ("server:mount found %s\n",
   1198 			    (const char *)ma.spec));
   1199 			error = EBUSY;
   1200 			goto err;
   1201 		}
   1202 	}
   1203 
   1204 	//
   1205 	// Lock the mount point on all client nodes except the
   1206 	// requesting node (since it already has the mount point locked).
   1207 	// Note that if two nodes try to mount to the same mount point:
   1208 	// Nodes A and B locally lock the mount point (vn_vfswlock()).
   1209 	// Node A gets here, tries to lock node B's mount point, gets EBUSY,
   1210 	//   releases client_list_lock.
   1211 	// Node B gets here, tries to lock node A's mount point, gets EBUSY.
   1212 	// Both node's locally unlock their mount point and return EBUSY
   1213 	// from the mount system call.
   1214 	//
   1215 	error = lock_mntpnt(client_p, ma.dir, ma.flags, _environment);
   1216 	if (error != 0) {
   1217 		MOUNT_DBPRINTF(
   1218 		    MOUNT_TRACE_SERVER,
   1219 		    MOUNT_RED,
   1220 		    ("server:mount can't lock mntpnt %s error %d\n",
   1221 		    (const char *)ma.dir, error));
   1222 
   1223 		fs_list_lock.unlock();
   1224 		client_list_lock.unlock();
   1225 
   1226 		//
   1227 		// Propagate the exception back to our caller.
   1228 		//
   1229 		pxfslib::throw_exception(_environment, error);
   1230 		fs_obj = pxfs_v1::filesystem::_nil();
   1231 		mntoptions = (char *)NULL;
   1232 		return;
   1233 	}
   1234 
   1235 	FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_S_B, FaultFunctions::generic);
   1236 
   1237 	//
   1238 	// Instantiate the file system on the nodes which have direct
   1239 	// connections to the block device.
   1240 	//
   1241 	if (dev_is_ha) {
   1242 		bool	started = false;
   1243 		for (uint32_t i = 0; i < dev_nids.length(); i++) {
   1244 			// Find the mount client pointer for device node i.
   1245 			cep = find_client(dev_nids[i]);
   1246 			if (cep == NULL) {
   1247 				//
   1248 				// The device node isn't up
   1249 				// at the moment.
   1250 				//
   1251 				continue;
   1252 			}
   1253 			cep->clientptr->instantiate_ha_v1(ma,
   1254 			    cep->clientptr->_equiv(client_p) ? mvp : NULL,
   1255 			    credobj, dev_name, e);
   1256 			if (e.exception() == NULL) {
   1257 				started = true;
   1258 				continue;
   1259 			}
   1260 			if (CORBA::COMM_FAILURE::_exnarrow(e.exception())) {
   1261 				//
   1262 				// The node crashed after we locked the
   1263 				// mount point OK. Just skip it and
   1264 				// try to start replicas on other nodes.
   1265 				//
   1266 				e.clear();
   1267 				continue;
   1268 			}
   1269 			// XXX What to do?
   1270 			MOUNT_DBPRINTF(
   1271 			    MOUNT_TRACE_SERVER,
   1272 			    MOUNT_RED,
   1273 			    ("server:mount instantiate_ha "
   1274 			    "returned exception, can't start HA service %s\n",
   1275 			    (const char *)ma.spec));
   1276 			e.clear();
   1277 		}
   1278 		if (!started) {
   1279 			MOUNT_DBPRINTF(
   1280 			    MOUNT_TRACE_SERVER,
   1281 			    MOUNT_GREEN,
   1282 			    ("server:mount can't start service %s\n",
   1283 			    (const char *)ma.spec));
   1284 			error = ENXIO;
   1285 			goto err;
   1286 		}
   1287 		//
   1288 		// Get the root HA file system object.
   1289 		//
   1290 		replica::service_admin_var	sa =
   1291 		    pxfslib::get_service_admin_ref("mount_server_impl::mount",
   1292 			(const char *)ma.spec, e);
   1293 		if (e.exception()) {
   1294 			error = EIO;
   1295 			e.clear();
   1296 			MOUNT_DBPRINTF(
   1297 			    MOUNT_TRACE_SERVER,
   1298 			    MOUNT_RED,
   1299 			    ("server:mount get_service_admin_ref(%s) except\n",
   1300 			    (const char *)ma.spec));
   1301 			//
   1302 			// XXX Need to shut down this service but
   1303 			// can't get the service_admin object to do it.
   1304 			//
   1305 			goto err;
   1306 		}
   1307 		CORBA::Object_var obj = sa->get_root_obj(e);
   1308 		if (e.exception()) {
   1309 			error = pxfslib::get_err(e);
   1310 			e.clear();
   1311 			MOUNT_DBPRINTF(
   1312 			    MOUNT_TRACE_SERVER,
   1313 			    MOUNT_RED,
   1314 			    ("server:mount get_root_obj(%s) error %d\n",
   1315 			    (const char *)ma.spec, error));
   1316 
   1317 			// Need to shut down this service.
   1318 			sa->shutdown_service(false, e);
   1319 			e.clear();
   1320 			goto err;
   1321 		}
   1322 		pxfs_v1::filesystem_ptr fsobj =
   1323 		    pxfs_v1::filesystem::_narrow(obj);
   1324 		ASSERT(!CORBA::is_nil(fsobj));
   1325 
   1326 		//
   1327 		// Get the file system info.
   1328 		// XXX Note that the mount_server has a temporary dependency
   1329 		// on the file system service for this IDL invocation.
   1330 		//
   1331 		fsobj->get_mntinfo(fsinfo, mntoptions, e);
   1332 		if (e.exception()) {
   1333 			error = pxfslib::get_err(e);
   1334 			e.clear();
   1335 			CORBA::release(fsobj);
   1336 			MOUNT_DBPRINTF(
   1337 			    MOUNT_TRACE_SERVER,
   1338 			    MOUNT_RED,
   1339 			    ("server:mount get_mntinfo(%s) error %d\n",
   1340 			    (const char *)ma.spec, error));
   1341 
   1342 			// Need to shut down this service.
   1343 			sa->shutdown_service(false, e);
   1344 			e.clear();
   1345 			goto err;
   1346 		}
   1347 
   1348 		//
   1349 		// Note that we transfer the CORBA reference
   1350 		// to the return value (i.e., don't release fsobj).
   1351 		//
   1352 		fs_obj = fsobj;
   1353 	} else {
   1354 		//
   1355 		// Find the mount client pointer for the node which has
   1356 		// the device.
   1357 		//
   1358 		cep = find_client(dev_nids[0]);
   1359 		if (cep == NULL) {
   1360 			error = ENXIO;
   1361 			goto err;
   1362 		}
   1363 		cep->clientptr->instantiate_v1(ma,
   1364 		    cep->clientptr->_equiv(client_p) ? mvp : NULL,
   1365 		    credobj, fs_obj, fsinfo, mntoptions, e);
   1366 		if (e.exception() != NULL) {
   1367 			error = pxfslib::get_err(e);
   1368 			e.clear();
   1369 			MOUNT_DBPRINTF(
   1370 			    MOUNT_TRACE_SERVER,
   1371 			    MOUNT_RED,
   1372 			    ("server:mount instantiate(%s) err %d\n",
   1373 			    (const char *)ma.spec, error));
   1374 		err:
   1375 			fs_list_lock.unlock();
   1376 
   1377 			// Checkpoint the error before unlocking mount points.
   1378 			get_checkpoint()->ckpt_mount_err(error, _environment);
   1379 			_environment.clear();
   1380 
   1381 			//
   1382 			// Unlock the mount points we have already locked.
   1383 			//
   1384 			unlock_mntpnt(client_p, NULL, ma.dir, _environment);
   1385 
   1386 			client_list_lock.unlock();
   1387 			pxfslib::throw_exception(_environment, error);
   1388 			fs_obj = pxfs_v1::filesystem::_nil();
   1389 			mntoptions = (char *)NULL;
   1390 			return;
   1391 		}
   1392 	}
   1393 
   1394 	//
   1395 	// Add a new file system element to the list and checkpoint it.
   1396 	//
   1397 	fep = new fs_elem((pxfs_v1::filesystem_ptr)fs_obj, fsinfo, ma,
   1398 	    mntoptions, dev_is_ha, dev_name, dev_nids);
   1399 	MOUNT_DBPRINTF(
   1400 	    MOUNT_TRACE_SERVER,
   1401 	    MOUNT_GREEN,
   1402 	    ("server:mount add %s fep %p\n",
   1403 	    (const char *)ma.spec, fep));
   1404 
   1405 	ASSERT(find_fs(fs_obj) == NULL);
   1406 
   1407 	//
   1408 	// We should only append to the end of the list to maintain
   1409 	// mount ordering.
   1410 	//
   1411 	fs_list.append(fep);
   1412 
   1413 	//
   1414 	// Checkpoint the successful instantiation of the file system.
   1415 	//
   1416 	get_checkpoint()->ckpt_mount_middle_v1(fs_obj, fsinfo, mntoptions,
   1417 	    dev_is_ha, dev_name, dev_nids, _environment);
   1418 	if (_environment.exception()) {
   1419 		//
   1420 		// The only possible exception for checkpoints is
   1421 		// a VERSION exception, which represents a programming
   1422 		// error. This is the first use of a new version checkpoint.
   1423 		// So we will check for a programming mistake.
   1424 		//
   1425 		MOUNT_DBPRINTF(
   1426 		    MOUNT_TRACE_SERVER,
   1427 		    MOUNT_RED,
   1428 		    ("server:mount fep %p ckpt_mount_middle_v1 except %d\n",
   1429 		    fep, _environment.exception()->exception_enum()));
   1430 		ASSERT(0);
   1431 		_environment.clear();
   1432 	}
   1433 
   1434 	fs_list_lock.unlock();
   1435 
   1436 	FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_S_A, FaultFunctions::generic);
   1437 
   1438 	//
   1439 	// At this point we are committed to completing the mount without
   1440 	// errors.  Notify each client of the addition.
   1441 	//
   1442 	mount_end_v1(client_p, ma, mntoptions, fs_obj, fsinfo, false,
   1443 	    _environment);
   1444 
   1445 	client_list_lock.unlock();
   1446 
   1447 	FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_S_E, FaultFunctions::generic);
   1448 }
   1449 
   1450 //
   1451 // Checkpoint the start of a mount or remount.
   1452 // Helper function for checkpointing state on a secondary.
   1453 //
   1454 void
   1455 mount_server_impl::ckpt_mount_start_v1(const sol::mounta &ma,
   1456     fs::mount_client_ptr client_p, Environment &env)
   1457 {
   1458 	transaction_state	*statep =
   1459 	    new mount_state(ma, client_p, *this, mount_state::VERSION_1);
   1460 	statep->register_state(env);
   1461 	env.clear();
   1462 	MOUNT_DBPRINTF(
   1463 	    MOUNT_TRACE_SERVER,
   1464 	    MOUNT_GREEN,
   1465 	    ("server:ckpt_mount_start_v1 %s\n",
   1466 	    (const char *)ma.dir));
   1467 }
   1468 
   1469 //
   1470 // Checkpoint a mount or remount error.
   1471 // Helper function for checkpointing state on a secondary.
   1472 //
   1473 void
   1474 mount_server_impl::ckpt_mount_err(sol::error_t error, Environment &env)
   1475 {
   1476 	//
   1477 	// We must have saved state because this cannot be the first checkpoint
   1478 	//
   1479 	secondary_ctx	*ctxp = secondary_ctx::extract_from(env);
   1480 	ASSERT(ctxp != NULL);
   1481 	mount_state	*statep = (mount_state *)ctxp->get_saved_state();
   1482 	ASSERT(statep != NULL);
   1483 	ASSERT(statep->mount_ver == mount_state::VERSION_1);
   1484 
   1485 	//
   1486 	// Save the error value.
   1487 	// This also marks the start of unlocking the mount points.
   1488 	//
   1489 	ASSERT(CORBA::is_nil(statep->fs_v1_ptr));
   1490 	statep->error = error;
   1491 	MOUNT_DBPRINTF(
   1492 	    MOUNT_TRACE_SERVER,
   1493 	    MOUNT_RED,
   1494 	    ("server:ckpt_mount_err %d\n", error));
   1495 }
   1496 
   1497 //
   1498 // Add a fs_elem to the list of globally mounted file systems.
   1499 // This is used to dump state to a newly joining secondary.
   1500 //
   1501 void
   1502 mount_server_impl::ckpt_mount_v1(pxfs_v1::filesystem_ptr fs_obj,
   1503     const pxfs_v1::fs_info &fsinfo, const sol::mounta &ma,
   1504     const char *mntoptions,
   1505     bool dev_is_ha, const char *dev_name, const sol::nodeid_seq_t &dev_nids)
   1506 {
   1507 	//
   1508 	// Add fs_elem if it isn't already there.
   1509 	// This can happen if the primary failed after sending this
   1510 	// checkpoint and the operation was retried on the new primary.
   1511 	//
   1512 	ASSERT(!CORBA::is_nil(fs_obj));
   1513 	if (find_fs(fs_obj) == NULL) {
   1514 		fs_elem		*fep = new fs_elem(fs_obj, fsinfo, ma,
   1515 		    mntoptions, dev_is_ha, dev_name, dev_nids);
   1516 		fs_list.append(fep);
   1517 		MOUNT_DBPRINTF(
   1518 		    MOUNT_TRACE_SERVER,
   1519 		    MOUNT_GREEN,
   1520 		    ("server:ckpt_mount_v1 add %s fep %p\n",
   1521 		    (const char *)ma.spec, fep));
   1522 	}
   1523 }
   1524 
   1525 //
   1526 // Checkpoint the creation of a new file system object.
   1527 // Helper function for checkpointing state on a secondary.
   1528 //
   1529 void
   1530 mount_server_impl::ckpt_mount_middle_v1(pxfs_v1::filesystem_ptr fs_obj,
   1531     const pxfs_v1::fs_info &fsinfo, const char *mntoptions, bool dev_is_ha,
   1532     const char *dev_name, const sol::nodeid_seq_t &dev_nids, Environment &env)
   1533 {
   1534 	//
   1535 	// We must have saved state because this cannot be the first checkpoint
   1536 	//
   1537 	secondary_ctx	*ctxp = secondary_ctx::extract_from(env);
   1538 	ASSERT(ctxp != NULL);
   1539 	mount_state	*statep = (mount_state *)ctxp->get_saved_state();
   1540 	ASSERT(statep != NULL);
   1541 
   1542 	statep->fs_v1_ptr = pxfs_v1::filesystem::_duplicate(fs_obj);
   1543 	statep->fs_v1_info = fsinfo;
   1544 	statep->mntoptions = mntoptions;
   1545 
   1546 	//
   1547 	// Add fs_elem if it isn't already there.
   1548 	// This can happen if the primary failed after sending this checkpoint
   1549 	// and the operation was retried on the new primary.
   1550 	//
   1551 	ASSERT(!CORBA::is_nil(fs_obj));
   1552 	fs_elem		*fep = find_fs(fs_obj);
   1553 	if (fep == NULL) {
   1554 		fep = new fs_elem(fs_obj, fsinfo, statep->ma, mntoptions,
   1555 		    dev_is_ha, dev_name, dev_nids);
   1556 		fs_list.append(fep);
   1557 		MOUNT_DBPRINTF(
   1558 		    MOUNT_TRACE_SERVER,
   1559 		    MOUNT_AMBER,
   1560 		    ("server:ckpt_mount_middle_v1 add %s fep %p\n",
   1561 		    (const char *)statep->ma.spec, fep));
   1562 	}
   1563 }
   1564 
   1565 //
   1566 // Helper routine for mount_state::orphaned() to clean up mount() or remount().
   1567 // Return true if the operation is completed.
   1568 //
   1569 bool
   1570 mount_server_impl::mount_orphaned(mount_state *statep, bool orph,
   1571     Environment &env)
   1572 {
   1573 	if (CORBA::is_nil(statep->mountpoint_lock_c)) {
   1574 		//
   1575 		// We have seen the add_commit() so just return an error
   1576 		// if needed.
   1577 		//
   1578 		if (statep->error != 0 && !orph) {
   1579 			pxfslib::throw_exception(env, statep->error);
   1580 		}
   1581 		MOUNT_DBPRINTF(
   1582 		    MOUNT_TRACE_SERVER,
   1583 		    MOUNT_RED,
   1584 		    ("server:mount_orphaned committed %s\n",
   1585 		    (const char *)statep->ma.dir));
   1586 		return (true);
   1587 	}
   1588 
   1589 	ASSERT(statep->mount_ver == mount_state::VERSION_1);
   1590 	if (!CORBA::is_nil(statep->fs_v1_ptr)) {
   1591 		//
   1592 		// We have seen ckpt_mount_middle() or
   1593 		// ckpt_remount_middle(), but not the add_commit().
   1594 		// Make sure all mount clients have unlocked the
   1595 		// mount point and updated /etc/mnttab.
   1596 		//
   1597 		ASSERT(statep->error == 0);
   1598 		client_list_lock.wrlock();
   1599 		MOUNT_DBPRINTF(
   1600 		    MOUNT_TRACE_SERVER,
   1601 		    MOUNT_RED,
   1602 		    ("server:mount_orphaned v1 do end %s\n",
   1603 		    (const char *)statep->ma.dir));
   1604 		mount_end_v1(statep->mountpoint_lock_c, statep->ma,
   1605 		    statep->mntoptions, statep->fs_v1_ptr,
   1606 		    statep->fs_v1_info, statep->is_remount, env);
   1607 		client_list_lock.unlock();
   1608 		return (true);
   1609 	}
   1610 	if (statep->error != 0 || orph) {
   1611 		//
   1612 		// We have seen ckpt_mount_err() or mount_state::orphaned()
   1613 		// but not the add_commit().
   1614 		// Make sure mount points are unlocked.
   1615 		//
   1616 		client_list_lock.wrlock();
   1617 		MOUNT_DBPRINTF(
   1618 		    MOUNT_TRACE_SERVER,
   1619 		    MOUNT_GREEN,
   1620 		    ("server:mount_orphaned error %d\n",
   1621 		    statep->error));
   1622 
   1623 		//
   1624 		// Make sure file service is shut down.
   1625 		//
   1626 		fs_list_lock.wrlock();
   1627 		Environment	e;
   1628 		replica::service_admin_var	sa =
   1629 		    pxfslib::get_service_admin_ref("mount_server_impl::mount",
   1630 			(const char *)statep->ma.spec, e);
   1631 		if (e.exception() == NULL) {
   1632 			sa->shutdown_service(false, e);
   1633 		}
   1634 		e.clear();
   1635 		fs_list_lock.unlock();
   1636 
   1637 		unlock_mntpnt(statep->mountpoint_lock_c, NULL, statep->ma.dir,
   1638 		    env);
   1639 		client_list_lock.unlock();
   1640 
   1641 		if (statep->error != 0 && !orph) {
   1642 			pxfslib::throw_exception(env, statep->error);
   1643 		}
   1644 		return (true);
   1645 	}
   1646 
   1647 	//
   1648 	// Only the start checkpoint has been seen
   1649 	//
   1650 	return (false);
   1651 }
   1652 
   1653 //
   1654 // Notify all mount clients of a new mount.
   1655 //
   1656 void
   1657 mount_server_impl::mount_end_v1(fs::mount_client_ptr skip,
   1658     const sol::mounta &ma, const char *mntoptions,
   1659     pxfs_v1::filesystem_ptr fs_obj, const pxfs_v1::fs_info &fsinfo,
   1660     bool is_remount, Environment &env)
   1661 {
   1662 	mount_client_elem	*cep;
   1663 	Environment		e;
   1664 
   1665 	ASSERT(client_list_lock.write_held());
   1666 
   1667 	client_list.atfirst();
   1668 	while ((cep = client_list.get_current()) != NULL) {
   1669 		client_list.advance();
   1670 
   1671 		// Is this the requesting node?
   1672 		if (cep->clientptr->_equiv(skip)) {
   1673 			continue;
   1674 		}
   1675 
   1676 		if (is_remount) {
   1677 			//
   1678 			// Set the proxy vfs_t flags and release the
   1679 			// mount point lock on this node.
   1680 			//
   1681 			cep->clientptr->set_flags_v1(ma, mntoptions, fs_obj,
   1682 			    fsinfo.fsflag, e);
   1683 		} else {
   1684 			//
   1685 			// Create a new proxy vfs_t, link it into the name
   1686 			// space, and release the mount point lock on this
   1687 			// node.
   1688 			//
   1689 			cep->clientptr->add_notify_locked_v1(ma, mntoptions,
   1690 			    fs_obj, fsinfo, e);
   1691 		}
   1692 
   1693 		if (e.exception() == NULL) {
   1694 			continue;
   1695 		}
   1696 		if (CORBA::COMM_FAILURE::_exnarrow(e.exception())) {
   1697 			e.clear();
   1698 			continue;
   1699 		}
   1700 		ASSERT(0); // XXX
   1701 		e.clear();
   1702 	}
   1703 
   1704 	add_commit(env);
   1705 }
   1706 
   1707 //
   1708 // Perform a global remount.
   1709 // This operation should be idempotent since it can be retried on a new primary.
   1710 //
   1711 void
   1712 mount_server_impl::remount(fs::filesystem_ptr fs_obj, fs::fobj_ptr mntpnt,
   1713     const sol::mounta &ma, solobj::cred_ptr credobj,
   1714     fs::mount_client_ptr client_p,
   1715     uint32_t &vfsflags, CORBA::String_out mntoptions, Environment &_environment)
   1716 {
   1717 	CL_PANIC(0);
   1718 }
   1719 
   1720 //
   1721 // Perform a global remount.
   1722 // This operation should be idempotent since it can be retried on a new primary.
   1723 //
   1724 void
   1725 mount_server_impl::remount_v1(pxfs_v1::filesystem_ptr fs_obj,
   1726     pxfs_v1::fobj_ptr mntpnt,
   1727     const sol::mounta &ma, solobj::cred_ptr credobj,
   1728     fs::mount_client_ptr client_p,
   1729     uint32_t &vfsflags, CORBA::String_out mntoptions, Environment &_environment)
   1730 {
   1731 	ASSERT(ma.flags & MS_SYSSPACE);
   1732 
   1733 	MOUNT_DBPRINTF(
   1734 	    MOUNT_TRACE_SERVER,
   1735 	    MOUNT_GREEN,
   1736 	    ("server:remount %s\n", (const char *)ma.dir));
   1737 
   1738 	// Check to see if we have saved state.
   1739 	primary_ctx	*ctxp = primary_ctx::extract_from(_environment);
   1740 	mount_state	*statep;
   1741 	if (ctxp != NULL &&
   1742 	    (statep = (mount_state *)ctxp->get_saved_state()) != NULL) {
   1743 		//
   1744 		// This is a retry on a new primary after a failover.
   1745 		// If the previous remount() was committed, we are done.
   1746 		//
   1747 		ASSERT(statep->mount_ver == mount_state::VERSION_1);
   1748 		if (mount_orphaned(statep, false, _environment)) {
   1749 			if (_environment.exception() == NULL) {
   1750 				// Return values from the saved state.
   1751 				vfsflags = statep->fs_v1_info.fsflag;
   1752 				mntoptions = os::strdup(statep->mntoptions);
   1753 			} else {
   1754 				mntoptions = (char *)NULL;
   1755 			}
   1756 			return;
   1757 		}
   1758 
   1759 		//
   1760 		// Since we have saved state, we know the original
   1761 		// primary sent the start checkpoint
   1762 		// so we don't need to do it again here.
   1763 		//
   1764 		ASSERT(statep->mountpoint_lock_c->_equiv(client_p));
   1765 
   1766 		check_multiple_remounts(ma.dir, _environment);
   1767 		if (_environment.exception()) {
   1768 			mntoptions = (char *)NULL;
   1769 			return;
   1770 		}
   1771 
   1772 		client_list_lock.wrlock();
   1773 	} else {
   1774 		check_multiple_remounts(ma.dir, _environment);
   1775 		if (_environment.exception()) {
   1776 			mntoptions = (char *)NULL;
   1777 			return;
   1778 		}
   1779 
   1780 		//
   1781 		// This is the start of a new remount, not a retry.
   1782 		// Checkpoint the start of locking the mount points so
   1783 		// we can clean up if both the client and primary fail.
   1784 		//
   1785 		client_list_lock.wrlock();
   1786 		get_checkpoint()->ckpt_mount_start_v1(ma, client_p,
   1787 		    _environment);
   1788 		if (_environment.exception()) {
   1789 			//
   1790 			// We are no longer a member of the cluster,
   1791 			// don't proceed.
   1792 			//
   1793 			client_list_lock.unlock();
   1794 			_environment.clear();
   1795 			mntoptions = (char *)NULL;
   1796 
   1797 			current_mount_lock.lock();
   1798 
   1799 			delete [] currentmnt;
   1800 			currentmnt = (char *)NULL;
   1801 			currentmnt_cv.broadcast();
   1802 
   1803 			current_mount_lock.unlock();
   1804 			return;
   1805 		}
   1806 	}
   1807 
   1808 	//
   1809 	// Lock the mount point on all client nodes except the
   1810 	// requesting node (since it already has the mount point locked).
   1811 	//
   1812 	sol::error_t error = lock_mntpnt(client_p, ma.dir, ma.flags,
   1813 	    _environment);
   1814 	if (error != 0) {
   1815 		client_list_lock.unlock();
   1816 
   1817 		//
   1818 		// Propagate the exception back to our caller.
   1819 		//
   1820 		pxfslib::throw_exception(_environment, error);
   1821 		mntoptions = (char *)NULL;
   1822 
   1823 		current_mount_lock.lock();
   1824 
   1825 		delete [] currentmnt;
   1826 		currentmnt = (char *)NULL;
   1827 		currentmnt_cv.broadcast();
   1828 
   1829 		current_mount_lock.unlock();
   1830 
   1831 		return;
   1832 	}
   1833 
   1834 	FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_S_B, FaultFunctions::generic);
   1835 
   1836 	//
   1837 	// Get the file system info.
   1838 	// XXX Note that the mount_server has a temporary dependency
   1839 	// on the file system service for this IDL invocation.
   1840 	//
   1841 	Environment e;
   1842 	fs_obj->remount(mntpnt, ma, credobj, vfsflags, mntoptions, e);
   1843 	if (e.exception()) {
   1844 		// Checkpoint the error before unlocking mount points.
   1845 		error = pxfslib::get_err(e);
   1846 		e.clear();
   1847 		MOUNT_DBPRINTF(
   1848 		    MOUNT_TRACE_SERVER,
   1849 		    MOUNT_GREEN,
   1850 		    ("server:remount error %d\n", error));
   1851 		get_checkpoint()->ckpt_mount_err(error, _environment);
   1852 		_environment.clear();
   1853 
   1854 		//
   1855 		// Unlock the mount points we have already locked.
   1856 		//
   1857 		unlock_mntpnt(client_p, NULL, ma.dir, _environment);
   1858 
   1859 		client_list_lock.unlock();
   1860 		pxfslib::throw_exception(_environment, error);
   1861 		mntoptions = (char *)NULL;
   1862 
   1863 		current_mount_lock.lock();
   1864 
   1865 		delete [] currentmnt;
   1866 		currentmnt = (char *)NULL;
   1867 		currentmnt_cv.broadcast();
   1868 
   1869 		current_mount_lock.unlock();
   1870 		return;
   1871 	}
   1872 
   1873 	//
   1874 	// Update the option string stored in fs_elem.
   1875 	// XXX Note that we attempt to construct the mounta data that
   1876 	// is the combined result of "mount -o ro" merged with
   1877 	// "mount -o remount" but its possible the remount changed or set
   1878 	// other options. We may need to save args for both mount and remount,
   1879 	// and then in add_client() replay the mount and the remount rather
   1880 	// than try to do just one mount with all the right args.
   1881 	//
   1882 	fs_list_lock.wrlock();
   1883 
   1884 	fs_elem		*fep = find_fs(fs_obj);
   1885 	ASSERT(fep != NULL);
   1886 	fep->ma.flags = ma.flags & ~MS_REMOUNT;
   1887 	fep->ma.data = ma.data;
   1888 	fep->mntoptions = os::strdup(mntoptions);
   1889 	fep->fs_v1_info.fsflag = vfsflags;
   1890 
   1891 	//
   1892 	// Checkpoint this operation so that the secondary can
   1893 	// update its state.
   1894 	//
   1895 	get_checkpoint()->ckpt_remount_middle_v1(fs_obj, vfsflags, mntoptions,
   1896 	    _environment);
   1897 	_environment.clear();
   1898 
   1899 	fs_list_lock.unlock();
   1900 
   1901 	FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_S_A, FaultFunctions::generic);
   1902 
   1903 	//
   1904 	// Notify each client with the new vfsflags and mntoptions.
   1905 	// Note that at this point we are committed to updating everything.
   1906 	//
   1907 	mount_end_v1(client_p, ma, mntoptions, fs_obj, fep->fs_v1_info, true,
   1908 	    _environment);
   1909 
   1910 	current_mount_lock.lock();
   1911 
   1912 	delete [] currentmnt;
   1913 	currentmnt = (char *)NULL;
   1914 	currentmnt_cv.broadcast();
   1915 
   1916 	current_mount_lock.unlock();
   1917 
   1918 	client_list_lock.unlock();
   1919 
   1920 	FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_S_E, FaultFunctions::generic);
   1921 }
   1922 
   1923 //
   1924 // Helper function for checkpointing state on a secondary.
   1925 //
   1926 void
   1927 mount_server_impl::ckpt_remount_middle_v1(pxfs_v1::filesystem_ptr fs_obj,
   1928     uint32_t vfsflags, const char *mntoptions, Environment &env)
   1929 {
   1930 	//
   1931 	// We must have saved state because this cannot be the first checkpoint
   1932 	//
   1933 	secondary_ctx	*ctxp = secondary_ctx::extract_from(env);
   1934 	ASSERT(ctxp != NULL);
   1935 	mount_state	*statep = (mount_state *)ctxp->get_saved_state();
   1936 	ASSERT(statep != NULL);
   1937 
   1938 	MOUNT_DBPRINTF(
   1939 	    MOUNT_TRACE_SERVER,
   1940 	    MOUNT_GREEN,
   1941 	    ("server:ckpt_remount_middle_v1 vfsflags %x options %s\n",
   1942 	    vfsflags, mntoptions));
   1943 	statep->is_remount = true;
   1944 	statep->fs_v1_ptr = pxfs_v1::filesystem::_duplicate(fs_obj);
   1945 	statep->fs_v1_info.fsflag = vfsflags;
   1946 	statep->mntoptions = mntoptions;
   1947 
   1948 	fs_elem		*fep = find_fs(fs_obj);
   1949 	ASSERT(fep != NULL);
   1950 	fep->ma.flags = statep->ma.flags & ~MS_REMOUNT;
   1951 	fep->ma.data = statep->ma.data;
   1952 	fep->mntoptions = mntoptions;
   1953 	fep->fs_v1_info.fsflag = vfsflags;
   1954 }
   1955 
   1956 //
   1957 // Helper routine to lock the mount point on all client nodes except 'skip'.
   1958 // Return an exception if the lock can't be acquired on all nodes.
   1959 // This operation should be idempotent since it can be retried on a new primary.
   1960 //
   1961 sol::error_t
   1962 mount_server_impl::lock_mntpnt(fs::mount_client_ptr skip,
   1963     const char *mountpoint, int32_t mntflags, Environment &env)
   1964 {
   1965 	mount_client_elem *cep;
   1966 	sol::error_t error;
   1967 	Environment e;
   1968 
   1969 	ASSERT(client_list_lock.write_held());
   1970 
   1971 	client_list.atfirst();
   1972 	while ((cep = client_list.get_current()) != NULL) {
   1973 		client_list.advance();
   1974 
   1975 		// The node requesting the mount has already done the locking
   1976 		if (cep->clientptr->_equiv(skip)) {
   1977 			continue;
   1978 		}
   1979 
   1980 		// Try to get the lock on this node.
   1981 		cep->clientptr->lock_mountpoint(mountpoint, mntflags, e);
   1982 		if (e.exception() == NULL)
   1983 			continue;
   1984 		if (CORBA::COMM_FAILURE::_exnarrow(e.exception())) {
   1985 			e.clear();
   1986 			continue;
   1987 		}
   1988 
   1989 		error = pxfslib::get_err(e);
   1990 		e.clear();
   1991 		get_checkpoint()->ckpt_mount_err(error, env);
   1992 		ASSERT(env.exception() == NULL);
   1993 
   1994 		// Unlock the mount points we have already locked.
   1995 		unlock_mntpnt(skip, cep, mountpoint, env);
   1996 		return (error);
   1997 	}
   1998 
   1999 	return (0);
   2000 }
   2001 
   2002 //
   2003 // Helper routined to unlock clients that have already been locked successfully.
   2004 // The client_list_lock should be held before calling this.
   2005 //
   2006 void
   2007 mount_server_impl::unlock_mntpnt(fs::mount_client_ptr skip,
   2008     mount_client_elem *endp, const char *mountpoint, Environment &env)
   2009 {
   2010 	mount_client_elem	*cep;
   2011 	Environment		e;
   2012 
   2013 	ASSERT(client_list_lock.write_held());
   2014 
   2015 	client_list.atfirst();
   2016 	while ((cep = client_list.get_current()) != endp) {
   2017 		client_list.advance();
   2018 
   2019 		// Should we skip this client?
   2020 		if (cep->clientptr->_equiv(skip)) {
   2021 			continue;
   2022 		}
   2023 
   2024 		cep->clientptr->unlock_mountpoint(mountpoint, e);
   2025 
   2026 		if (e.exception() == NULL) {
   2027 			continue;
   2028 		}
   2029 		if (CORBA::COMM_FAILURE::_exnarrow(e.exception())) {
   2030 			e.clear();
   2031 			continue;
   2032 		}
   2033 
   2034 		// XXX something better we could do?
   2035 		ASSERT(0);
   2036 		e.clear();
   2037 	}
   2038 
   2039 	add_commit(env);
   2040 }
   2041 
   2042 //
   2043 // Unmount a global file system by locking all mount points on all
   2044 // client nodes except the requesting node, flushing the DNLC, etc.
   2045 // Return an exception if the lock can't be acquired on all nodes or
   2046 // if there are still active proxy vnodes.
   2047 // This operation should be idempotent since it can be retried on a new primary.
   2048 //
   2049 void
   2050 mount_server_impl::unmount(fs::filesystem_ptr fs_obj, solobj::cred_ptr credobj,
   2051     fs::mount_client_ptr client_p, sol::nodeid_t nodeid, bool is_shutdown,
   2052     Environment &_environment)
   2053 {
   2054 	CL_PANIC(0);
   2055 }
   2056 
   2057 //
   2058 // Forced Unmount support version
   2059 //
   2060 // Unmount a global file system by locking all mount points on all
   2061 // client nodes except the requesting node, flushing the DNLC, etc.
   2062 // Return an exception if the lock can't be acquired on all nodes or
   2063 // if there are still active proxy vnodes.
   2064 // This operation should be idempotent since it can be retried on a new primary.
   2065 //
   2066 void
   2067 mount_server_impl::unmount_1(fs::filesystem_ptr fs_obj, int32_t flags,
   2068     solobj::cred_ptr credobj, fs::mount_client_ptr c, sol::nodeid_t nodeid,
   2069     bool is_shutdown, Environment &_environment)
   2070 {
   2071 	CL_PANIC(0);
   2072 }
   2073 
   2074 //
   2075 // Unmount a global file system by locking all mount points on all
   2076 // client nodes except the requesting node, flushing the DNLC, etc.
   2077 // Return an exception if the lock can't be acquired on all nodes or
   2078 // if there are still active proxy vnodes.
   2079 // This operation should be idempotent since it can be retried on a new primary.
   2080 //
   2081 void
   2082 mount_server_impl::unmount_v1(pxfs_v1::filesystem_ptr fs_obj, int32_t flags,
   2083     solobj::cred_ptr credobj, fs::mount_client_ptr client_p,
   2084     sol::nodeid_t nodeid, bool is_shutdown, Environment &_environment)
   2085 {
   2086 	ASSERT(!CORBA::is_nil(fs_obj));
   2087 	ASSERT(!CORBA::is_nil(client_p));
   2088 
   2089 	// Check to see if we have saved state.
   2090 	primary_ctx	*ctxp = primary_ctx::extract_from(_environment);
   2091 	unmount_state	*statep;
   2092 	if (ctxp != NULL &&
   2093 	    (statep = (unmount_state *)ctxp->get_saved_state()) != NULL) {
   2094 		//
   2095 		// Since we have saved state, we know the original
   2096 		// primary sent the ckpt_unmount_start() checkpoint.
   2097 		// We finish the unmount process from where we left off.
   2098 		//
   2099 		ASSERT(statep->unmount_ver == unmount_state::VERSION_1);
   2100 		unmount_orphaned(statep, true, _environment);
   2101 		return;
   2102 	}
   2103 
   2104 	fs_elem			*fep;
   2105 	Environment		e;
   2106 	sol::error_t		error;
   2107 
   2108 	client_list_lock.wrlock();
   2109 	fs_list_lock.wrlock();
   2110 
   2111 	//
   2112 	// Since this is not a retry, we should find the file system entry
   2113 	// unless:
   2114 	// Node A starts unmounting file system F, gets list locks above.
   2115 	// Node B starts unmounting file system F, blocks waiting above.
   2116 	// Node B crashes.
   2117 	// Node A's unmount completes OK (since it either got the lock on B
   2118 	// or it got ECOMM and skipped B).
   2119 	// Node B's unmount unblocks above and we don't find the file system.
   2120 	//
   2121 	fep = find_fs(fs_obj);
   2122 	if (fep == NULL) {
   2123 		MOUNT_DBPRINTF(
   2124 		    MOUNT_TRACE_SERVER,
   2125 		    MOUNT_RED,
   2126 		    ("server:unmount_v1 nid %d shutdown %d can't find FS\n",
   2127 		    nodeid, is_shutdown));
   2128 		fs_list_lock.unlock();
   2129 		client_list_lock.unlock();
   2130 		pxfslib::throw_exception(_environment, EINVAL);
   2131 		return;
   2132 	}
   2133 	MOUNT_DBPRINTF(
   2134 	    MOUNT_TRACE_SERVER,
   2135 	    MOUNT_GREEN,
   2136 	    ("server:unmount_v1 %s nid %d shutdown %d fep %p\n",
   2137 	    (const char *)fep->ma.dir, nodeid, is_shutdown, fep));
   2138 
   2139 	//
   2140 	// Ok, start an unmount.
   2141 	//
   2142 	get_checkpoint()->ckpt_unmount_start_v1(fs_obj, flags, credobj,
   2143 	    client_p, is_shutdown, _environment);
   2144 	ASSERT(_environment.exception() == NULL);
   2145 
   2146 	error = unmount_common_1(client_p, fep, flags, credobj,
   2147 	    unmount_state::START, 0, NULL, _environment);
   2148 
   2149 	MOUNT_DBPRINTF(
   2150 	    MOUNT_TRACE_SERVER,
   2151 	    (error ? MOUNT_RED : MOUNT_GREEN),
   2152 	    ("server:unmount_v1 fep %p err %d\n",
   2153 	    fep, error));
   2154 
   2155 	fs_list_lock.unlock();
   2156 	client_list_lock.unlock();
   2157 
   2158 	if (error != 0) {
   2159 		pxfslib::throw_exception(_environment, error);
   2160 	}
   2161 }
   2162 
   2163 //
   2164 // Checkpoint that a node is being shut down.
   2165 // Helper function for checkpointing state on a secondary.
   2166 //
   2167 void
   2168 mount_server_impl::ckpt_unmount_shutdown(fs::mount_client_ptr client)
   2169 {
   2170 	mount_client_elem *cep = find_client(client);
   2171 	if (cep != NULL) {
   2172 		cep->shutdown = true;
   2173 	}
   2174 }
   2175 
   2176 //
   2177 // Checkpoint the start of an unmount.
   2178 // Add_commit() should be called to finish the unmount process.
   2179 // Helper function for checkpointing state on a secondary.
   2180 //
   2181 void
   2182 mount_server_impl::ckpt_unmount_start_v1(pxfs_v1::filesystem_ptr fs_obj,
   2183     int32_t flags, solobj::cred_ptr credobj, fs::mount_client_ptr client,
   2184     bool is_shutdown, Environment &env)
   2185 {
   2186 	//
   2187 	// We might have saved state from a previous ckpt_unmount_start().
   2188 	// if so, reuse the same transaction state; otherwise, create one.
   2189 	//
   2190 	secondary_ctx	*ctxp = secondary_ctx::extract_from(env);
   2191 	unmount_state	*statep = NULL;
   2192 	if (ctxp == NULL ||
   2193 	    (statep = (unmount_state *)ctxp->get_saved_state()) == NULL) {
   2194 		statep = new unmount_state(fs_obj, flags, credobj, *this,
   2195 		    client);
   2196 		statep->register_state(env);
   2197 		env.clear();
   2198 	} else {
   2199 		ASSERT(!CORBA::is_nil(fs_obj));
   2200 		statep->skip = fs::mount_client::_duplicate(client);
   2201 		statep->fs_v1_obj = pxfs_v1::filesystem::_duplicate(fs_obj);
   2202 		statep->credobj = solobj::cred::_duplicate(credobj);
   2203 		statep->state = unmount_state::START;
   2204 		statep->error = 0;
   2205 		statep->service_name = (char *)NULL;
   2206 	}
   2207 
   2208 	if (is_shutdown) {
   2209 		mount_client_elem	*cep = find_client(client);
   2210 		if (cep != NULL && !cep->shutdown) {
   2211 			cep->shutdown = true;
   2212 		}
   2213 	}
   2214 #ifdef DEBUG
   2215 	fs_elem		*fep = find_fs(fs_obj);
   2216 	MOUNT_DBPRINTF(
   2217 	    MOUNT_TRACE_SERVER,
   2218 	    MOUNT_GREEN,
   2219 	    ("server:ckpt_unmount_start_v1 %s\n",
   2220 	    (const char *)fep->ma.dir));
   2221 #endif
   2222 }
   2223 
   2224 //
   2225 // Helper routine for unmount_state::orphaned() to unlock mount points.
   2226 // This is also called on the primary when recovering from a failover.
   2227 //
   2228 void
   2229 mount_server_impl::unmount_orphaned(unmount_state *statep, bool ret_err,
   2230     Environment &env)
   2231 {
   2232 	MOUNT_DBPRINTF(
   2233 	    MOUNT_TRACE_SERVER,
   2234 	    MOUNT_AMBER,
   2235 	    ("server:unmount_orphaned %d\n", ret_err));
   2236 	if (statep->state == unmount_state::COMMITTED) {
   2237 		//
   2238 		// We saw the last checkpoint so we are done.
   2239 		// Return the error if requested and there is one.
   2240 		//
   2241 		if (ret_err && statep->error != 0) {
   2242 			pxfslib::throw_exception(env, statep->error);
   2243 		}
   2244 		return;
   2245 	}
   2246 
   2247 	//
   2248 	// The file system is still mounted so unmount it.
   2249 	//
   2250 	client_list_lock.wrlock();
   2251 	fs_list_lock.wrlock();
   2252 	fs_elem		*fep;
   2253 
   2254 	ASSERT(statep->unmount_ver == unmount_state::VERSION_1);
   2255 	fep = find_fs(statep->fs_v1_obj);
   2256 	statep->error = unmount_common_1(statep->skip, fep,
   2257 	    statep->flags, statep->credobj, statep->state,
   2258 	    statep->error, statep->service_name, env);
   2259 	fs_list_lock.unlock();
   2260 	client_list_lock.unlock();
   2261 
   2262 	if (ret_err && statep->error != 0) {
   2263 		pxfslib::throw_exception(env, statep->error);
   2264 	}
   2265 }
   2266 
   2267 //
   2268 // Version for forced unmount support
   2269 //
   2270 // Common code for add_client(), remove_client(), unmount(), and
   2271 // unmount_orphaned().
   2272 // The job is to unmount the file system 'fep' from all nodes, possibly
   2273 // skipping steps that have already been performed.
   2274 // Both client_list_lock and fs_list_lock should be held.
   2275 // If there is no error, 'fep' is removed from the list of all file systems.
   2276 //
   2277 sol::error_t
   2278 mount_server_impl::unmount_common_1(fs::mount_client_ptr skip, fs_elem *fep,
   2279     int32_t flags, solobj::cred_ptr credobj, unmount_state::state_t state,
   2280     sol::error_t error, const char *service_name, Environment &env)
   2281 {
   2282 	mount_client_elem	*cep;
   2283 	mount_client_elem	*end_cep = NULL;
   2284 	fs_elem			*delete_fep = NULL;
   2285 	bool			skip_purge;
   2286 	Environment		e;
   2287 
   2288 	ASSERT(client_list_lock.write_held());
   2289 	ASSERT(fs_list_lock.write_held());
   2290 	ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1);
   2291 
   2292 	switch (state) {
   2293 	case unmount_state::START:
   2294 		//
   2295 		// Make sure the mount point is locked and there
   2296 		// are no active vnodes on each client.
   2297 		//
   2298 		client_list.atfirst();
   2299 		while ((cep = client_list.get_current()) != NULL) {
   2300 			client_list.advance();
   2301 
   2302 			//
   2303 			// Note that for the node which called the umount
   2304 			// system call we must skip most of the unmount
   2305 			// preparation (filesystem locking and vnode cache
   2306 			// purging) since that has already been done there.
   2307 			//
   2308 			skip_purge = false;
   2309 			if (cep->clientptr->_equiv(skip)) {
   2310 				skip_purge = true;
   2311 			}
   2312 
   2313 			// Try to get the lock on this node.
   2314 			MOUNT_DBPRINTF(
   2315 			    MOUNT_TRACE_SERVER,
   2316 			    MOUNT_GREEN,
   2317 			    ("server:prepare_unmount_1 %s on %d flags = %x\n",
   2318 			    (const char *)fep->ma.dir, cep->nodeid, flags));
   2319 			cep->clientptr->prepare_unmount_v1(
   2320 			    fep->fs_v1_ptr, flags, credobj, skip_purge, e);
   2321 			if (e.exception() == NULL) {
   2322 				continue;
   2323 			}
   2324 			if (CORBA::COMM_FAILURE::_exnarrow(e.exception())) {
   2325 				MOUNT_DBPRINTF(
   2326 				    MOUNT_TRACE_SERVER,
   2327 				    MOUNT_RED,
   2328 				    ("server:prepare_unmount_1 "
   2329 				    "%s on %d COMM_FAILURE\n",
   2330 				    (const char *)fep->ma.dir, cep->nodeid));
   2331 				e.clear();
   2332 				continue;
   2333 			}
   2334 
   2335 			error = pxfslib::get_err(e);
   2336 			e.clear();
   2337 			//
   2338 			// Unlock clients that have already been
   2339 			// locked successfully.
   2340 			//
   2341 			end_cep = cep;
   2342 			get_checkpoint()->ckpt_unmount_middle(error, env);
   2343 			ASSERT(env.exception() == NULL);
   2344 			goto notify;
   2345 		}
   2346 
   2347 		FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_B,
   2348 		    FaultFunctions::generic);
   2349 
   2350 		//
   2351 		// Try to unmount the file system.
   2352 		// XXX Temporary dependency on the file system here.
   2353 		//
   2354 		error = 0;
   2355 		fep->fs_v1_ptr->unmount(flags, credobj, e);
   2356 
   2357 		if (e.exception()) {
   2358 			if (CORBA::VERSION::_exnarrow(e.exception())) {
   2359 				MOUNT_DBPRINTF(
   2360 				    MOUNT_TRACE_SERVER,
   2361 				    MOUNT_RED,
   2362 				    ("server:unmount_common_1 "
   2363 				    "fs version exception %d\n"));
   2364 				error = EBUSY;
   2365 			} else if (CORBA::COMM_FAILURE::_exnarrow(
   2366 			    e.exception())) {
   2367 				//
   2368 				// If the server for a file system is dead,
   2369 				// and no clients have active vnodes, we
   2370 				// allow the unmount to happen.
   2371 				//
   2372 				error = 0;
   2373 			} else {
   2374 				error = pxfslib::get_err(e);
   2375 				//
   2376 				// For a forced unmount we always set error
   2377 				// to zero (except for ENOTSUP) so the global
   2378 				// unmount will always succeed. ENOTSUP is the
   2379 				// legitimate return from underlying file
   2380 				// systems not supporting forced unmount.
   2381 				//
   2382 				if ((flags & MS_FORCE) && (error != ENOTSUP)) {
   2383 					MOUNT_DBPRINTF(
   2384 					    MOUNT_TRACE_SERVER,
   2385 					    MOUNT_RED,
   2386 					    ("server:unmount_common_1"
   2387 					    "forced unmount error %d\n",
   2388 					    error));
   2389 					error = 0;
   2390 				}
   2391 			}
   2392 			MOUNT_DBPRINTF(
   2393 			    MOUNT_TRACE_SERVER,
   2394 			    MOUNT_RED,
   2395 			    ("server:unmount_common_1 fs exception %d\n",
   2396 			    error));
   2397 			e.clear();
   2398 		}
   2399 
   2400 		get_checkpoint()->ckpt_unmount_middle(error, env);
   2401 		ASSERT(env.exception() == NULL);
   2402 		// FALL THROUGH
   2403 
   2404 	case unmount_state::UNMOUNTED:
   2405 	notify:
   2406 		//
   2407 		// Notify mount clients of the unmount result.
   2408 		//
   2409 		client_list.atfirst();
   2410 		while ((cep = client_list.get_current()) != end_cep) {
   2411 			client_list.advance();
   2412 
   2413 			//
   2414 			// Notify clients of unmount success or failure so
   2415 			// they can proceed appropriately.
   2416 			// In order to handle forced unmounts correctly,
   2417 			// we need to notify all clients that the unmount
   2418 			// succeeded.
   2419 			//
   2420 			// Note that the node which called the
   2421 			// umount() system call will unlink the vfs_t from the
   2422 			// file system name space so that node is treated
   2423 			// differently from the others.
   2424 			//
   2425 			if (error == 0) {
   2426 				cep->clientptr->remove_notify_1(fep->ma.dir,
   2427 				    !cep->clientptr->_equiv(skip), e);
   2428 			} else {
   2429 				if (!cep->clientptr->_equiv(skip)) {
   2430 					cep->clientptr->
   2431 					    unmount_failed_1(false, e);
   2432 				} else {
   2433 					cep->clientptr->
   2434 					    unmount_failed_1(true, e);
   2435 				}
   2436 			}
   2437 			e.clear();
   2438 		}
   2439 
   2440 		//
   2441 		// Tell the secondary that all clients have been notified
   2442 		// before shutting down the file system service so that
   2443 		// the HA object isn't marshalled after the shutdown.
   2444 		//
   2445 		if (error == 0) {
   2446 			if (fep->dev_is_ha) {
   2447 				// Note: service_name shares string with "fep".
   2448 				service_name = (const char *)fep->ma.spec;
   2449 			}
   2450 
   2451 			MOUNT_DBPRINTF(
   2452 			    MOUNT_TRACE_SERVER,
   2453 			    MOUNT_GREEN,
   2454 			    ("server:unmount_common_1 remove %p\n",
   2455 			    fep));
   2456 			//
   2457 			// Note that erase() does an advance() if
   2458 			// fep == get_current().
   2459 			//
   2460 			(void) fs_list.erase(fep);
   2461 			delete_fep = fep;
   2462 		}
   2463 
   2464 		FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_A,
   2465 		    FaultFunctions::generic);
   2466 
   2467 		get_checkpoint()->ckpt_unmount_notified(env);
   2468 		ASSERT(env.exception() == NULL);
   2469 		// FALLTHROUGH
   2470 
   2471 	case unmount_state::NOTIFIED:
   2472 		//
   2473 		// Shut down the FS service.
   2474 		//
   2475 		if (service_name != NULL) {
   2476 			replica::service_admin_var sa =
   2477 			    pxfslib::get_service_admin_ref(
   2478 				"mount_server_impl::unmount_common_1",
   2479 				service_name, e);
   2480 			if (e.exception()) {
   2481 				//
   2482 				// XXX Need to shut down this service but
   2483 				// can't get the service_admin object to do it.
   2484 				//
   2485 				MOUNT_DBPRINTF(
   2486 				    MOUNT_TRACE_SERVER,
   2487 				    MOUNT_RED,
   2488 				    ("server:unmount_common_1 "
   2489 				    "get_service_admin_ref exception\n"));
   2490 				e.clear();
   2491 			} else {
   2492 				if ((flags & MS_FORCE) == 0) {
   2493 					sa->shutdown_service(false, e);
   2494 				} else {
   2495 					sa->shutdown_service(true, e);
   2496 				}
   2497 				if (e.exception()) {
   2498 					MOUNT_DBPRINTF(
   2499 					    MOUNT_TRACE_SERVER,
   2500 					    MOUNT_RED,
   2501 					    ("server:unmount_com_1 "
   2502 					    "shutdown_service() exception\n"));
   2503 					e.clear();
   2504 				}
   2505 			}
   2506 			FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_E,
   2507 			    FaultFunctions::generic);
   2508 			get_checkpoint()->ckpt_unmount_end(env);
   2509 			ASSERT(env.exception() == NULL);
   2510 		}
   2511 		break;
   2512 
   2513 	default:
   2514 		ASSERT(0);
   2515 	}
   2516 
   2517 	if (delete_fep != NULL) {
   2518 		//
   2519 		// We delay deleting fep until now since service_name could
   2520 		// be pointing to the string that would be freed.
   2521 		//
   2522 		delete delete_fep;
   2523 	}
   2524 
   2525 	return (error);
   2526 }
   2527 
   2528 //
   2529 // Checkpoint that either the mount points have been locked
   2530 // and the file system has been unmounted or there was an error.
   2531 // We have yet to notify the client nodes of the result.
   2532 // Helper function for checkpointing state on a secondary.
   2533 //
   2534 void
   2535 mount_server_impl::ckpt_unmount_middle(sol::error_t error, Environment &env)
   2536 {
   2537 	//
   2538 	// We should have saved state since ckpt_unmount_start() is
   2539 	// supposed to be called first.
   2540 	//
   2541 	secondary_ctx	*ctxp = secondary_ctx::extract_from(env);
   2542 	ASSERT(ctxp != NULL);
   2543 	unmount_state	*statep = (unmount_state *)ctxp->get_saved_state();
   2544 	ASSERT(statep != NULL);
   2545 
   2546 	//
   2547 	// state == UNMOUNTED means that 'error' is valid and
   2548 	// don't retry to lock mount points or unmount the file system again.
   2549 	//
   2550 	statep->state = unmount_state::UNMOUNTED;
   2551 	statep->error = error;
   2552 	MOUNT_DBPRINTF(
   2553 	    MOUNT_TRACE_SERVER,
   2554 	    MOUNT_GREEN,
   2555 	    ("server:ckpt_unmount_middle error %d\n", error));
   2556 }
   2557 
   2558 //
   2559 // Checkpoint that all the client nodes have been notified (if there was
   2560 // no unmount error) before shutting down the file system service.
   2561 // Helper function for checkpointing state on a secondary.
   2562 //
   2563 void
   2564 mount_server_impl::ckpt_unmount_notified(Environment &env)
   2565 {
   2566 	//
   2567 	// We should have saved state since ckpt_unmount_start() is
   2568 	// supposed to be called first.
   2569 	//
   2570 	secondary_ctx	*ctxp = secondary_ctx::extract_from(env);
   2571 	ASSERT(ctxp != NULL);
   2572 	unmount_state	*statep = (unmount_state *)ctxp->get_saved_state();
   2573 	ASSERT(statep != NULL);
   2574 	ASSERT(statep->unmount_ver == unmount_state::VERSION_1);
   2575 
   2576 	if (CORBA::is_nil(statep->fs_v1_obj)) {
   2577 		// This is a duplicate checkpoint after a failover.
   2578 		return;
   2579 	}
   2580 	if (statep->error == 0) {
   2581 		fs_elem		*fep;
   2582 		fep = find_fs(statep->fs_v1_obj);
   2583 		ASSERT(fep != NULL);
   2584 		//
   2585 		// state == NOTIFIED means that the mount points have
   2586 		// been unlocked and that the only remaining task is
   2587 		// to shut down the file system service.
   2588 		//
   2589 		if (fep->dev_is_ha) {
   2590 			statep->state = unmount_state::NOTIFIED;
   2591 			statep->service_name = os::strdup(fep->ma.spec);
   2592 		} else {
   2593 			statep->state = unmount_state::COMMITTED;
   2594 		}
   2595 
   2596 		MOUNT_DBPRINTF(
   2597 		    MOUNT_TRACE_SERVER,
   2598 		    MOUNT_GREEN,
   2599 		    ("server:ckpt_unmount_notified remove %p\n",
   2600 		    fep));
   2601 		(void) fs_list.erase(fep);
   2602 		delete fep;
   2603 	} else {
   2604 		MOUNT_DBPRINTF(
   2605 		    MOUNT_TRACE_SERVER,
   2606 		    MOUNT_GREEN,
   2607 		    ("server:ckpt_unmount_notified\n"));
   2608 		statep->state = unmount_state::COMMITTED;
   2609 	}
   2610 	statep->fs_v1_obj = pxfs_v1::filesystem::_nil();
   2611 }
   2612 
   2613 //
   2614 // Checkpoint that unmount is complete. We can't use a commit() call since
   2615 // this may be called more than once when unmounting multiple file systems.
   2616 //
   2617 void
   2618 mount_server_impl::ckpt_unmount_end(Environment &env)
   2619 {
   2620 	//
   2621 	// We should have saved state since ckpt_unmount_start() is
   2622 	// supposed to be called first.
   2623 	//
   2624 	secondary_ctx	*ctxp = secondary_ctx::extract_from(env);
   2625 	ASSERT(ctxp != NULL);
   2626 	unmount_state	*statep = (unmount_state *)ctxp->get_saved_state();
   2627 	ASSERT(statep != NULL);
   2628 
   2629 	statep->state = unmount_state::COMMITTED;
   2630 	MOUNT_DBPRINTF(
   2631 	    MOUNT_TRACE_SERVER,
   2632 	    MOUNT_GREEN,
   2633 	    ("server:ckpt_unmount_end\n"));
   2634 }
   2635 
//
// Notify intent to mount a device.
// The server attempts to get a "lock" on the device
// and returns success if the file system is not mounted
// and no other node has the lock. The device remains locked
// until it is either mounted or the requesting node dies.
//
// Errors are reported as exceptions in '_environment':
//	EAGAIN		- the lock is held by another node and the
//			  service is frozen; the caller should retry
//	EDEADLOCK	- the current lock owner is itself waiting
//	ETIMEDOUT	- the wait for the lock timed out
//	EINTR		- the wait was interrupted by a signal
//	EBUSY		- the device is already mounted
//
// mount_server_impl(fs::mount_server::devlock)
void
mount_server_impl::devlock(fs::mount_client_ptr c, sol::nodeid_t nodeid,
    const char *dev_name, Environment &_environment)
{
	// Lock order used throughout: fs_list_lock, then devlock_list_lock.
	fs_list_lock.wrlock();
	devlock_list_lock.wrlock();

	MOUNT_DBPRINTF(
	    MOUNT_TRACE_SERVER,
	    MOUNT_GREEN,
	    ("server:devlock nid %d %s\n",
	    nodeid, dev_name));

	//
	// Check to see if the device is locked.
	// Each pass of this loop runs with both list locks held; the
	// lookup is repeated after every wait because the lists may have
	// changed while the locks were dropped.
	//
	devlock_elem	*dep;
	while ((dep = find_devlock(dev_name)) != NULL) {
		if (dep->ownerid == nodeid) {
			//
			// The owner asked for the same lock again.
			// This could be a retry after a failover.
			// Either way, we allow it.
			//
			MOUNT_DBPRINTF(
			    MOUNT_TRACE_SERVER,
			    MOUNT_AMBER,
			    ("server:devlock repeat\n"));
			devlock_list_lock.unlock();
			fs_list_lock.unlock();
			return;
		}

		//
		// Check to see if the service was frozen and make the
		// caller try again (so the IDL call completes and
		// the switchover can proceed).
		//
		if (frozen) {
			MOUNT_DBPRINTF(
			    MOUNT_TRACE_SERVER,
			    MOUNT_RED,
			    ("server:devlock frozen\n"));
			devlock_list_lock.unlock();
			fs_list_lock.unlock();
			pxfslib::throw_exception(_environment, EAGAIN);
			return;
		}

		//
		// Check to see if the owner of the lock is blocked waiting.
		// This is a simple deadlock check which doesn't support
		// A waiting for B which is waiting for C even though this
		// case isn't a deadlock.
		//
		if (find_devlock_waiter(dep->ownerid)) {
			MOUNT_DBPRINTF(
			    MOUNT_TRACE_SERVER,
			    MOUNT_RED,
			    ("server:devlock deadlock\n"));
			devlock_list_lock.unlock();
			fs_list_lock.unlock();
			pxfslib::throw_exception(_environment, EDEADLOCK);
			return;
		}

		//
		// Wait until the lock is released, then return
		// either lock granted/not granted if the FS is
		// not mounted/mounted. This is so we don't have
		// the situation where node A gets the lock, node B
		// doesn't get the lock (silently not attempting to mount FS),
		// then node A crashes without completing the mount.
		//
		// The waiter is registered under dep->waiter_lock before
		// the list locks are dropped, and the list locks are not
		// held while sleeping.
		//
		os::condvar_t::wait_result	res = os::condvar_t::NORMAL;
		dep->waiter_lock.lock();
		dep->nwaiters++;
		dep->waiters.set(nodeid - 1);
		devlock_list_lock.unlock();
		fs_list_lock.unlock();
		// Sleep until unlocked, frozen, timed out or signalled.
		while (res == os::condvar_t::NORMAL &&
		    !dep->unlocked && !frozen) {
			MOUNT_DBPRINTF(
			    MOUNT_TRACE_SERVER,
			    MOUNT_GREEN,
			    ("server:devlock waiting %s\n",
			    dev_name));

			// Wait for pxfs_devlock_timeout seconds
			os::systime timeout;
			timeout.setreltime(pxfs_devlock_timeout * 1000000);
			res = dep->waiter_cv.timedwait_sig(&dep->waiter_lock,
			    &timeout);

			MOUNT_DBPRINTF(
			    MOUNT_TRACE_SERVER,
			    MOUNT_GREEN,
			    ("server:devlock wakeup %d u %d frozen %d\n",
			    res, dep->unlocked, frozen));
		}
		// Deregister this waiter while still holding waiter_lock.
		dep->waiters.clear(nodeid - 1);
		bool	last_waiter = (--dep->nwaiters == 0 && dep->unlocked);
		dep->waiter_lock.unlock();

		//
		// The last waiter to leave an element that has been
		// unlocked is the one that frees it.
		//
		if (last_waiter) {
			delete dep;
		}

		// No list lock is held here, so these paths return directly.
		if (res == os::condvar_t::TIMEDOUT) {
			//
			// The wait timed out. We return ETIMEDOUT.
			// The client prints a syslog message the first time it
			// sees ETIMEDOUT. It retries until it sees something
			// other than ETIMEDOUT and EAGAIN.
			//
			pxfslib::throw_exception(_environment, ETIMEDOUT);
			return;
		} else if (res == os::condvar_t::SIGNALED) {
			pxfslib::throw_exception(_environment, EINTR);
			return;
		}
		// Retake both list locks before looking the device up again.
		fs_list_lock.wrlock();
		devlock_list_lock.wrlock();
	}

	// Check to see if the device is mounted.
	if (find_fs(dev_name) != NULL) {
		MOUNT_DBPRINTF(
		    MOUNT_TRACE_SERVER,
		    MOUNT_RED,
		    ("server:devlock busy\n"));
		devlock_list_lock.unlock();
		fs_list_lock.unlock();
		pxfslib::throw_exception(_environment, EBUSY);
		return;
	}

	//
	// Add a new device lock element to the list and checkpoint it.
	//
	dep = new devlock_elem(c, nodeid, dev_name);
	devlock_list.prepend(dep);
	get_checkpoint()->ckpt_devlock(c, nodeid, dev_name, _environment);
	ASSERT(_environment.exception() == NULL);
	MOUNT_DBPRINTF(
	    MOUNT_TRACE_SERVER,
	    MOUNT_GREEN,
	    ("server:devlock added %p\n", dep));

	FAULTPT_PXFS(FAULTNUM_PXFS_DEVLOCK, FaultFunctions::generic);

	devlock_list_lock.unlock();
	fs_list_lock.unlock();
}
   2796 
   2797 //
   2798 // Checkpoint a new device lock.
   2799 // Helper function for checkpointing state on a secondary.
   2800 //
   2801 void
   2802 mount_server_impl::ckpt_devlock(fs::mount_client_ptr client,
   2803     sol::nodeid_t nodeid, const char *dev_name)
   2804 {
   2805 	if (find_devlock(dev_name) == NULL) {
   2806 		//
   2807 		// Add a new device lock element to the list.
   2808 		//
   2809 		devlock_elem *dep = new devlock_elem(client, nodeid, dev_name);
   2810 		devlock_list.prepend(dep);
   2811 		MOUNT_DBPRINTF(
   2812 		    MOUNT_TRACE_SERVER,
   2813 		    MOUNT_GREEN,
   2814 		    ("server:ckpt_devlock added nid %d %s %p\n",
   2815 		    nodeid, dev_name, dep));
   2816 	}
   2817 }
   2818 
   2819 //
   2820 // Unlock a device.
   2821 // Note that if there are any nodes waiting for a device lock, it will
   2822 // prevent switchover of the mount service since there will be an uncompleted
   2823 // IDL call (which won't return until the lock is released).
   2824 //
   2825 void
   2826 mount_server_impl::devunlock(const char *dev_name, Environment &_environment)
   2827 {
   2828 	devlock_list_lock.wrlock();
   2829 
   2830 	// Check to see if the device is locked.
   2831 	devlock_elem	*dep = find_devlock(dev_name);
   2832 	if (dep != NULL) {
   2833 		//
   2834 		// Remove lock element from the list and checkpoint it.
   2835 		//
   2836 		MOUNT_DBPRINTF(
   2837 		    MOUNT_TRACE_SERVER,
   2838 		    MOUNT_GREEN,
   2839 		    ("server:devunlock %s unlock %p waiters %d\n",
   2840 		    dev_name, dep, dep->nwaiters));
   2841 		(void) devlock_list.erase(dep);
   2842 		get_checkpoint()->ckpt_devunlock(dev_name, _environment);
   2843 		ASSERT(_environment.exception() == NULL);
   2844 
   2845 		FAULTPT_PXFS(FAULTNUM_PXFS_DEVUNLOCK, FaultFunctions::generic);
   2846 
   2847 		dep->waiter_lock.lock();
   2848 		if (dep->nwaiters != 0) {
   2849 			// This wakes up all waiting threads.
   2850 			dep->unlocked = true;
   2851 			dep->waiter_cv.broadcast();
   2852 			dep->waiter_lock.unlock();
   2853 
   2854 			// The last waiter will do the delete.
   2855 		} else {
   2856 			dep->waiter_lock.unlock();
   2857 			delete dep;
   2858 		}
   2859 	}
   2860 
   2861 	devlock_list_lock.unlock();
   2862 }
   2863 
   2864 //
   2865 // Checkpoint a device unlock.
   2866 // Helper function for checkpointing state on a secondary.
   2867 //
   2868 void
   2869 mount_server_impl::ckpt_devunlock(const char *dev_name)
   2870 {
   2871 	devlock_elem	*dep = find_devlock(dev_name);
   2872 	if (dep != NULL) {
   2873 		//
   2874 		// Remove device lock element from the list.
   2875 		//
   2876 		MOUNT_DBPRINTF(
   2877 		    MOUNT_TRACE_SERVER,
   2878 		    MOUNT_GREEN,
   2879 		    ("server:ckpt_devunlock %s unlock %p\n",
   2880 		    dev_name, dep));
   2881 		(void) devlock_list.erase(dep);
   2882 		delete dep;
   2883 	}
   2884 }
   2885 
   2886 //
   2887 // Get the nodeid of the node holding the lock on a device
   2888 //
   2889 void
   2890 mount_server_impl::get_devlock_owner(const char *dev_name,
   2891     sol::nodeid_t &lock_owner, Environment &_environment)
   2892 {
   2893 	devlock_elem	*dep;
   2894 
   2895 	devlock_list_lock.wrlock();
   2896 
   2897 	// Check to see if the device is locked and has a owner
   2898 	dep = find_devlock(dev_name);
   2899 	if (dep == NULL) {
   2900 		lock_owner = 0;
   2901 	} else {
   2902 		lock_owner = dep->ownerid;
   2903 	}
   2904 
   2905 	devlock_list_lock.unlock();
   2906 
   2907 	MOUNT_DBPRINTF(
   2908 	    MOUNT_TRACE_SERVER,
   2909 	    MOUNT_GREEN,
   2910 	    ("server:devlock owner (%d) %s\n",
   2911 	    lock_owner, dev_name));
   2912 }
   2913 
   2914 //
   2915 // Wake up any threads waiting for a device lock so a swithover can happen.
   2916 // See comment in devlock() above.
   2917 //
   2918 void
   2919 mount_server_impl::freeze_primary()
   2920 {
   2921 	devlock_elem	*dep;
   2922 	devlock_list_lock.wrlock();
   2923 	frozen = true;
   2924 	for (devlock_list.atfirst();
   2925 	    (dep = devlock_list.get_current()) != NULL;
   2926 	    devlock_list.advance()) {
   2927 		dep->waiter_lock.lock();
   2928 		if (dep->nwaiters != 0) {
   2929 			MOUNT_DBPRINTF(
   2930 			    MOUNT_TRACE_SERVER,
   2931 			    MOUNT_GREEN,
   2932 			    ("server:freeze wakeup %p waiters %d\n",
   2933 			    dep, dep->nwaiters));
   2934 			// This wakes up all waiting threads.
   2935 			dep->waiter_cv.broadcast();
   2936 		}
   2937 		dep->waiter_lock.unlock();
   2938 	}
   2939 	devlock_list_lock.unlock();
   2940 }
   2941 
   2942 //
   2943 // Clear frozen state.
   2944 //
   2945 void
   2946 mount_server_impl::unfreeze_primary()
   2947 {
   2948 	devlock_list_lock.wrlock();
   2949 	frozen = false;
   2950 	devlock_list_lock.unlock();
   2951 }
   2952 
   2953 //
   2954 // Return a handle to the DCS configuration callback object.
   2955 // The callback object is called by DCS to notify us when the
   2956 // device configuration changes.
   2957 //
   2958 fs::dc_callback_ptr
   2959 mount_server_impl::get_dc_callback(Environment &_environment)
   2960 {
   2961 	fs_list_lock.wrlock();
   2962 	if (CORBA::is_nil(dc_callback_obj)) {
   2963 		dc_callback_obj =
   2964 		    (new dc_callback_impl(*this, repl_serverp))->get_objref();
   2965 		get_checkpoint()->ckpt_get_dc_callback(dc_callback_obj,
   2966 		    _environment);
   2967 		ASSERT(_environment.exception() == NULL);
   2968 	}
   2969 	fs_list_lock.unlock();
   2970 
   2971 	return (fs::dc_callback::_duplicate(dc_callback_obj));
   2972 }
   2973 
   2974 //
   2975 // Helper function for checkpointing state on a secondary.
   2976 //
   2977 void
   2978 mount_server_impl::ckpt_get_dc_callback(fs::dc_callback_ptr cb)
   2979 {
   2980 	if (CORBA::is_nil(dc_callback_obj)) {
   2981 		// Note that the "delete dcp;" is done by _unreferenced().
   2982 		(void) new dc_callback_impl(*this, cb);
   2983 		dc_callback_obj = fs::dc_callback::_duplicate(cb);
   2984 	}
   2985 }
   2986 
   2987 //
   2988 // Helper routine to update device configuration changes.
   2989 //
   2990 void
   2991 mount_server_impl::notify_change(sol::dev_t gdev,
   2992     const sol::nodeid_seq_t &nodes, Environment &env)
   2993 {
   2994 	mount_client_elem	*cep;
   2995 	fs_elem			*fep;
   2996 	Environment		e;
   2997 
   2998 	//
   2999 	// Search the list of globally mounted file systems for the device.
   3000 	//
   3001 	client_list_lock.wrlock();
   3002 	fs_list_lock.wrlock();
   3003 	for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL;
   3004 	    fs_list.advance()) {
   3005 
   3006 		ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1);
   3007 		if (fep->fs_v1_info.fsdev != gdev) {
   3008 			continue;
   3009 		}
   3010 		//
   3011 		// We found the file system, start replicas on the new
   3012 		// nodes. Note that we don't try to shut down replicas
   3013 		// if the connection is removed.
   3014 		//
   3015 		for (uint32_t i = 0; i < nodes.length(); i++) {
   3016 			bool	fnd = false;
   3017 			for (uint32_t j = 0; j < fep->dev_nids.length(); j++) {
   3018 				if (nodes[i] == fep->dev_nids[j]) {
   3019 					fnd = true;
   3020 					break;
   3021 				}
   3022 			}
   3023 			if (fnd) {
   3024 				continue;
   3025 			}
   3026 
   3027 			// Find the mount client pointer for device node i.
   3028 			cep = find_client(nodes[i]);
   3029 			if (cep == NULL || cep->shutdown) {
   3030 				//
   3031 				// The device node isn't up
   3032 				// at the moment.
   3033 				//
   3034 				continue;
   3035 			}
   3036 			// XXX kcred use.
   3037 			solobj::cred_var	credobj =
   3038 			    solobj_impl::conv(kcred);
   3039 			cep->clientptr->reinstantiate_ha_v1(fep->ma,
   3040 			    fep->fs_v1_ptr, credobj, fep->dev_name, e);
   3041 			//
   3042 			// We do the best we can to start a replica for
   3043 			// the new device connection but if there are errors,
   3044 			// we can't really do anything about it here. There
   3045 			// is a syslog message that is printed when a
   3046 			// new replica is started but if there was an error,
   3047 			// there may be no report of it.
   3048 			// XXX We could print a syslog error message here.
   3049 			//
   3050 			e.clear();
   3051 		}
   3052 		fep->dev_nids = nodes;
   3053 		FAULTPT_PXFS(FAULTNUM_PXFS_DEVICE_CHANGED,
   3054 		    FaultFunctions::generic);
   3055 		get_checkpoint()->ckpt_notify_change(gdev, nodes, env);
   3056 		ASSERT(env.exception() == NULL);
   3057 	}
   3058 	fs_list_lock.unlock();
   3059 	client_list_lock.unlock();
   3060 }
   3061 
   3062 //
   3063 // Checkpoint a change in the disk connection list.
   3064 //
   3065 void
   3066 mount_server_impl::ckpt_notify_change(sol::dev_t gdev,
   3067     const sol::nodeid_seq_t &dev_nids)
   3068 {
   3069 	fs_elem		*fep;
   3070 
   3071 	//
   3072 	// Search the list of globally mounted file systems for the device.
   3073 	//
   3074 	for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL;
   3075 	    fs_list.advance()) {
   3076 		if (fep->fs_v1_info.fsdev != gdev) {
   3077 			continue;
   3078 		}
   3079 		fep->dev_nids = dev_nids;
   3080 	}
   3081 }
   3082 
   3083 //
   3084 // Helper routine to check if device is in use.
   3085 //
   3086 bool
   3087 mount_server_impl::still_active(sol::dev_t gdev)
   3088 {
   3089 	fs_elem	*fep;
   3090 	bool	inuse = false;
   3091 
   3092 	//
   3093 	// Search the list of globally mounted file systems for the device.
   3094 	//
   3095 	fs_list_lock.wrlock();
   3096 	for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL;
   3097 	    fs_list.advance()) {
   3098 		if (fep->fs_v1_info.fsdev == gdev) {
   3099 			inuse = true;
   3100 			break;
   3101 		}
   3102 	}
   3103 	fs_list_lock.unlock();
   3104 
   3105 	return (inuse);
   3106 }
   3107 
   3108 //
   3109 // Helper function for dumping state to a new secondary.
   3110 //
   3111 void
   3112 mount_server_impl::dump_state(repl_pxfs::mount_replica_ptr ckptp,
   3113     Environment &env)
   3114 {
   3115 	MOUNT_DBPRINTF(
   3116 	    MOUNT_TRACE_SERVER,
   3117 	    MOUNT_GREEN,
   3118 	    ("server:dump_state\n"));
   3119 
   3120 	// Create a new mount server on the secondary.
   3121 	CORBA::type_info_t *typ = fs::mount_server::_get_type_info(
   3122 	    mount_vp_to_idl[repl_serverp->current_version.major_num]
   3123 		[repl_serverp->current_version.minor_num].ms);
   3124 	fs::mount_server_var fsls = get_objref(typ);
   3125 	ckptp->ckpt_new_server(fsls, env);
   3126 	if (env.exception()) {
   3127 #ifdef DEBUG
   3128 		env.exception()->print_exception
   3129 		    ("mount_server_impl:dump_state "
   3130 		    "ckpt_new_server");
   3131 #endif
   3132 		MOUNT_DBPRINTF(
   3133 		    MOUNT_TRACE_SERVER,
   3134 		    MOUNT_RED,
   3135 		    ("server:dump_state(%p): "
   3136 		    "exception '%s' while calling ckpt_new_server().\n",
   3137 		    this, env.exception()->_name()));
   3138 		ASSERT(CORBA::COMM_FAILURE::_exnarrow(env.exception()));
   3139 		return;
   3140 	}
   3141 	//
   3142 	// Dump the current list of mount clients.
   3143 	//
   3144 	mount_client_elem	*cep;
   3145 
   3146 	for (client_list.atfirst(); (cep = client_list.get_current()) != NULL;
   3147 	    client_list.advance()) {
   3148 		//
   3149 		// Note that the mount_client_elem can get a second
   3150 		// call to _unreferenced() if 'unref' is set since we
   3151 		// get a new reference here and then release it.
   3152 		// We need to be sure to not delete the object until
   3153 		// _unreferenced() is called again.
   3154 		//
   3155 		fs::mount_client_died_var	clobj = cep->get_objref();
   3156 		ckptp->ckpt_add_client(clobj, cep->clientptr, cep->nodeid,
   3157 		    cep->shutdown, env);
   3158 		if (env.exception()) {
   3159 #ifdef DEBUG
   3160 			env.exception()->print_exception
   3161 			    ("mount_server_impl:dump_state "
   3162 			    "ckpt_add_client");
   3163 #endif
   3164 			MOUNT_DBPRINTF(
   3165 			    MOUNT_TRACE_SERVER,
   3166 			    MOUNT_RED,
   3167 			    ("server:dump_state(%p): "
   3168 			    "exception '%s' while calling ckpt_add_client().\n",
   3169 			    this, env.exception()->_name()));
   3170 			ASSERT(CORBA::COMM_FAILURE::_exnarrow(env.exception()));
   3171 			return;
   3172 		}
   3173 	}
   3174 
   3175 	//
   3176 	// Dump the current list of mounted file systems.
   3177 	//
   3178 	fs_elem		*fep;
   3179 	for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL;
   3180 	    fs_list.advance()) {
   3181 		ASSERT(fep->fs_elem_ver == fs_elem::VERSION_1);
   3182 		ckptp->ckpt_mount_v1(fep->fs_v1_ptr, fep->fs_v1_info,
   3183 		    fep->ma,
   3184 		    fep->mntoptions, fep->dev_is_ha, fep->dev_name,
   3185 		    fep->dev_nids, env);
   3186 		if (env.exception()) {
   3187 #ifdef DEBUG
   3188 			env.exception()->print_exception
   3189 			    ("mount_server_impl:dump_state "
   3190 			    "ckpt_mount");
   3191 #endif
   3192 			MOUNT_DBPRINTF(
   3193 			    MOUNT_TRACE_SERVER,
   3194 			    MOUNT_RED,
   3195 			    ("server:dump_state(%p): "
   3196 			    "exception '%s' while calling ckpt_mount().\n",
   3197 			    this, env.exception()->_name()));
   3198 			ASSERT(CORBA::COMM_FAILURE::_exnarrow(env.exception()));
   3199 			return;
   3200 		}
   3201 	}
   3202 
   3203 	//
   3204 	// Dump the list of device locks.
   3205 	//
   3206 	devlock_elem	*dep;
   3207 	for (devlock_list.atfirst();
   3208 	    (dep = devlock_list.get_current()) != NULL;
   3209 	    devlock_list.advance()) {
   3210 		ckptp->ckpt_devlock(dep->owner, dep->ownerid, dep->spec, env);
   3211 		if (env.exception()) {
   3212 #ifdef DEBUG
   3213 			env.exception()->print_exception
   3214 			    ("mount_server_impl:dump_state "
   3215 			    "ckpt_dev_lock");
   3216 #endif
   3217 			MOUNT_DBPRINTF(
   3218 			    MOUNT_TRACE_SERVER,
   3219 			    MOUNT_RED,
   3220 			    ("server:dump_state(%p): "
   3221 			    "exception '%s' while calling ckpt_devlock().\n",
   3222 			    this, env.exception()->_name()));
   3223 			ASSERT(CORBA::COMM_FAILURE::_exnarrow(env.exception()));
   3224 			return;
   3225 		}
   3226 	}
   3227 
   3228 	//
   3229 	// Create a dc_callback object if needed.
   3230 	//
   3231 	if (!CORBA::is_nil(dc_callback_obj)) {
   3232 		ckptp->ckpt_get_dc_callback(dc_callback_obj, env);
   3233 	}
   3234 }
   3235 
   3236 //
   3237 // Helper routine to find a mount_client_elem given a node number.
   3238 // The client_list_lock should be held before calling this on the primary.
   3239 //
   3240 mount_client_elem *
   3241 mount_server_impl::find_client(nodeid_t nid)
   3242 {
   3243 	mount_client_elem	*cep;
   3244 
   3245 	ASSERT(!primary || client_list_lock.write_held());
   3246 
   3247 	client_list.atfirst();
   3248 	while ((cep = client_list.get_current()) != NULL) {
   3249 		client_list.advance();
   3250 		if (cep->nodeid == nid) {
   3251 			return (cep);
   3252 		}
   3253 	}
   3254 	return (NULL);
   3255 }
   3256 
   3257 //
   3258 // Helper routine to find a mount_client_elem given a mount client.
   3259 // The client_list_lock should be held before calling this on the primary.
   3260 //
   3261 mount_client_elem *
   3262 mount_server_impl::find_client(fs::mount_client_ptr c)
   3263 {
   3264 	mount_client_elem	*cep;
   3265 
   3266 	ASSERT(!primary || client_list_lock.write_held());
   3267 
   3268 	client_list.atfirst();
   3269 	while ((cep = client_list.get_current()) != NULL) {
   3270 		client_list.advance();
   3271 		if (cep->clientptr->_equiv(c)) {
   3272 			return (cep);
   3273 		}
   3274 	}
   3275 	return (NULL);
   3276 }
   3277 
   3278 //
   3279 // Return the fs_elem for fs or NULL if not in fs_list.
   3280 // Must be called with the fs_list_lock held if called on the primary.
   3281 //
   3282 fs_elem *
   3283 mount_server_impl::find_fs(pxfs_v1::filesystem_ptr fsptr)
   3284 {
   3285 	fs_elem		*fep;
   3286 
   3287 	ASSERT(!primary || fs_list_lock.write_held());
   3288 
   3289 	for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL;
   3290 	    fs_list.advance()) {
   3291 		if (!CORBA::is_nil(fep->fs_v1_ptr) &&
   3292 		    fep->fs_v1_ptr->_equiv(fsptr)) {
   3293 			return (fep);
   3294 		}
   3295 	}
   3296 
   3297 	return (NULL);
   3298 }
   3299 
   3300 //
   3301 // Return the fs_elem for fs or NULL if not in fs_list.
   3302 // Must be called with the fs_list_lock held if called on the primary.
   3303 //
   3304 fs_elem *
   3305 mount_server_impl::find_fs(const char *spec)
   3306 {
   3307 	fs_elem *fep;
   3308 
   3309 	ASSERT(!primary || fs_list_lock.write_held());
   3310 
   3311 	for (fs_list.atfirst(); (fep = fs_list.get_current()) != NULL;
   3312 	    fs_list.advance()) {
   3313 		if (strcmp(fep->ma.spec, spec) == 0) {
   3314 			return (fep);
   3315 		}
   3316 	}
   3317 
   3318 	return (NULL);
   3319 }
   3320 
   3321 //
   3322 // Return the devlock_elem or NULL if not in devlock_list.
   3323 // Must be called with the devlock_list_lock held if called on the primary.
   3324 //
   3325 devlock_elem *
   3326 mount_server_impl::find_devlock(const char *spec)
   3327 {
   3328 	devlock_elem *dep;
   3329 
   3330 	ASSERT(!primary || devlock_list_lock.write_held());
   3331 
   3332 	for (devlock_list.atfirst();
   3333 	    (dep = devlock_list.get_current()) != NULL;
   3334 	    devlock_list.advance()) {
   3335 		if (strcmp(dep->spec, spec) == 0) {
   3336 			return (dep);
   3337 		}
   3338 	}
   3339 
   3340 	return (NULL);
   3341 }
   3342 
   3343 //
   3344 // Return true if the given node is blocked waiting for any lock.
   3345 // Must be called with the devlock_list_lock held if called on the primary.
   3346 //
   3347 bool
   3348 mount_server_impl::find_devlock_waiter(sol::nodeid_t nodeid)
   3349 {
   3350 	devlock_elem *dep;
   3351 
   3352 	ASSERT(!primary || devlock_list_lock.write_held());
   3353 
   3354 	for (devlock_list.atfirst();
   3355 	    (dep = devlock_list.get_current()) != NULL;
   3356 	    devlock_list.advance()) {
   3357 		if (dep->waiters.test(nodeid - 1)) {
   3358 			return (true);
   3359 		}
   3360 	}
   3361 
   3362 	return (false);
   3363 }
   3364 
   3365 //
   3366 // Helper function to catch the case where a user issues
   3367 // multiple remount commands at the same time.
   3368 //
   3369 void
   3370 mount_server_impl::check_multiple_remounts(const char *pathp,
   3371     Environment &_environment)
   3372 {
   3373 	char 	*pathmax = NULL, *pathmin = NULL;
   3374 	size_t	minlen = 0;
   3375 
   3376 	current_mount_lock.lock();
   3377 
   3378 	while (currentmnt) {
   3379 		if (strlen(pathp) > strlen(currentmnt)) {
   3380 			pathmax = (char *)pathp; // const string
   3381 			pathmin = currentmnt;
   3382 		} else {
   3383 			pathmax = currentmnt;
   3384 			pathmin = (char *)pathp; // const string
   3385 		}
   3386 
   3387 		minlen = strlen(pathmin);
   3388 
   3389 		// Use strncmp() so that nested mounts can be compared
   3390 		if (strncmp(pathmin, pathmax, minlen) == 0 &&
   3391 		    (pathmax[minlen] == '/' || pathmax[minlen] == '\0')) {
   3392 			sol::error_t error = EBUSY;
   3393 
   3394 			pxfslib::throw_exception(_environment, error);
   3395 
   3396 			current_mount_lock.unlock();
   3397 
   3398 			return;
   3399 		}
   3400 		currentmnt_cv.wait(&current_mount_lock);
   3401 
   3402 	}
   3403 	currentmnt = os::strdup(pathp);
   3404 
   3405 	current_mount_lock.unlock();
   3406 }
   3407 
   3408 //
   3409 // An HA filesystem is considered available if there is at least one existing
   3410 // primary or secondary replica. This function return availability status.
   3411 //
   3412 //lint -e1038
   3413 mount_server_impl::fs_status_t
   3414 mount_server_impl::get_fs_status(fs_elem *fep)
   3415 {
   3416 //lint +e1038
   3417 	Environment e;
   3418 	ASSERT(fep->dev_is_ha);
   3419 
   3420 	// Get a reference to the service admin.
   3421 	replica::service_admin_var sa =
   3422 	    pxfslib::get_service_admin_ref("mount_server_impl::mount",
   3423 		(const char *)fep->ma.spec, e);
   3424 	if (e.exception()) {
   3425 		e.exception()->print_exception
   3426 		    ("mount_server_impl:get_fs_status "
   3427 		    "get_service_admin_ref");
   3428 		MOUNT_DBPRINTF(
   3429 		    MOUNT_TRACE_SERVER,
   3430 		    MOUNT_RED,
   3431 		    ("server:get_fs_status "
   3432 		    "get_service_admin_ref(%s) exception\n",
   3433 		    (const char *)fep->ma.spec));
   3434 		return (ERROR);
   3435 	}
   3436 	e.clear();
   3437 
   3438 	//
   3439 	// Get information about the filesystem replicas
   3440 	// from the service admin.
   3441 	//
   3442 	replica::repl_prov_seq_var repl_provs;
   3443 	sa->get_repl_provs(repl_provs, e);
   3444 	if (e.exception()) {
   3445 		e.exception()->print_exception
   3446 		    ("mount_sever_impl:get_fs_status "
   3447 		    "get_repl_provs");
   3448 		MOUNT_DBPRINTF(
   3449 		    MOUNT_TRACE_SERVER,
   3450 		    MOUNT_RED,
   3451 		    ("sever:get_fs_status "
   3452 		    "get_repl_provs returned exception\n"));
   3453 		return (ERROR);
   3454 	}
   3455 
   3456 	//
   3457 	// Check if primary or secondary replica exists,
   3458 	//
   3459 	uint_t len = repl_provs.length();
   3460 	for (uint_t i = 0; i < len; i++) {
   3461 		if ((repl_provs[i].curr_state
   3462 		    == replica::AS_PRIMARY) ||
   3463 		    (repl_provs[i].curr_state
   3464 		    == replica::AS_SECONDARY)) {
   3465 			//
   3466 			// From the service admin we found out that,
   3467 			// there is a potential replica that could
   3468 			// serve out the filesystem.
   3469 			//
   3470 			//
   3471 			MOUNT_DBPRINTF(
   3472 			    MOUNT_TRACE_SERVER,
   3473 			    MOUNT_GREEN,
   3474 			    ("server:get_fs_status "
   3475 			    "%s can host %s\n",
   3476 			    (const char*)repl_provs[i]
   3477 			    .repl_prov_desc,
   3478 			    (const char *)fep->ma.spec));
   3479 			return (AVAILABLE);
   3480 			}
   3481 		}
   3482 	return (NOT_AVAILABLE);
   3483 }
   3484 
   3485 //
   3486 // Checkpoint the upgrade of a mount_client in client list.
   3487 //
   3488 void
   3489 mount_server_impl::ckpt_upgrade_client_list(fs::mount_client_ptr
   3490     client_p, sol::nodeid_t nodeid)
   3491 {
   3492 	mount_client_elem	*cep;
   3493 
   3494 	if ((cep = find_client(nodeid)) != NULL) {
   3495 		//
   3496 		// We found the guy we're looking for.
   3497 		//
   3498 		cep->clientptr = fs::mount_client::_duplicate(client_p);
   3499 	} else {
   3500 		//
   3501 		// Notify that we are not able to find the guy.
   3502 		//
   3503 		MOUNT_DBPRINTF(
   3504 		    MOUNT_TRACE_SERVER,
   3505 		    MOUNT_RED,
   3506 		    ("server:ckpt_upgrade_client_list"
   3507 		    "Failed to find required mount_client_elem"
   3508 		    "with nodeid %d\n", nodeid));
   3509 	}
   3510 }
   3511 
   3512 //
   3513 // Checkpoint the upgrade of a mount_client in devlock list.
   3514 //
   3515 void
   3516 mount_server_impl::ckpt_upgrade_devlock_list(const char *dev_name,
   3517     fs::mount_client_ptr client_p)
   3518 {
   3519 	devlock_elem		*dep;
   3520 
   3521 	if ((dep = find_devlock(dev_name)) != NULL) {
   3522 		//
   3523 		// We found the guy we're looking for.
   3524 		//
   3525 		dep->owner = fs::mount_client::_duplicate(client_p);
   3526 	} else {
   3527 		//
   3528 		// Notify that we are not able to find the guy.
   3529 		//
   3530 		MOUNT_DBPRINTF(
   3531 		    MOUNT_TRACE_SERVER,
   3532 		    MOUNT_RED,
   3533 		    ("server:ckpt_upgrade_devlock_list"
   3534 		    "Failed to find required devlock_elem"
   3535 		    "with devname %s\n", dev_name));
   3536 	}
   3537 }
   3538