Home | History | Annotate | Download | only in lock
      1 //
      2 // CDDL HEADER START
      3 //
      4 // The contents of this file are subject to the terms of the
      5 // Common Development and Distribution License (the License).
      6 // You may not use this file except in compliance with the License.
      7 //
      8 // You can obtain a copy of the license at usr/src/CDDL.txt
      9 // or http://www.opensolaris.org/os/licensing.
     10 // See the License for the specific language governing permissions
     11 // and limitations under the License.
     12 //
     13 // When distributing Covered Code, include this CDDL HEADER in each
     14 // file and include the License file at usr/src/CDDL.txt.
     15 // If applicable, add the following below this CDDL HEADER, with the
     16 // fields enclosed by brackets [] replaced with your own identifying
     17 // information: Portions Copyright [yyyy] [name of copyright owner]
     18 //
     19 // CDDL HEADER END
     20 //
     21 
     22 //
     23 // Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24 // Use is subject to license terms.
     25 //
     26 
     27 #pragma ident	"@(#)nlm_pxfs.cc	1.42	08/05/20 SMI"
     28 
     29 #include <sys/flock.h>
     30 #include <sys/fcntl.h>
     31 
     32 #include <h/naming.h>
     33 #include <nslib/ns.h>
     34 #include <orb/infrastructure/orbthreads.h>
     35 
     36 #include <pxfs/lock/fs_collection_impl.h>
     37 #include <pxfs/lock/lock_debug.h>
     38 #include <pxfs/client/pxvfs.h>
     39 
     40 enum {
     41 	MAX_ITEMS = 1000
     42 };
     43 
     44 const char		fscollection_name[] = "fscollection";
     45 const unsigned long	fscollection_name_size = 12;
     46 
     47 //
     48 // This new class is used to create a deferred task in the ORB that
     49 // will clean up the NLM's lm_sysid cache for sysids (identifies remote
     50 // clients) that aren't holding any locks.  The "execute" method will
     51 // call back into the NLM base Solaris code to execute lm_free_sysids(),
     52 // which does the actual work.
     53 //
// In the old implementation, the constructor for this object was
// called every time the Solaris hook lm_free_nlm_sysid_table()
// was invoked by the cache reaper thread. If memory was low, then
// the constructor would block, waiting for memory to free up, but this
// also blocked the single-threaded reaper thread from proceeding.
     59 //
     60 // There are two solutions:
     61 // 1.  Qualify the new statement with (os::NO_SLEEP), so that the
     62 //	allocation won't block if there's no memory.  This is the simplest
     63 //	fix because it won't block the reaper thread.  A deferred
     64 //	task object will be created for every invocation.
     65 //
     66 // 2.  In this solution, we observe that
     67 //	if a deferred task object has already been created to free sysid
     68 //	storage, there's no need to create another one if the first still
     69 //	exists (which means it hasn't yet been executed).  The first will do
     70 //	the same thing as all the others.  This has two benefits: saving
     71 //	on storage at a critical time and saving the processing time of
     72 //	allocating storage.  The approach is to allocate exactly one
     73 //	nlm_sysid_cleanup object at the time the PXFS server kernel module
     74 //	is loaded and exists for all time until the module is unloaded.
//	The object synchronizes concurrent access between the reaper
     76 //	thread and the deferred task's thread executing method "execute."
     77 //	If the object is "in use" it means it's already on the queue, so
     78 //	there's nothing to do but return.  If not "in use" then flag
     79 //	it as "in use" and enqueue on the deferred task queue.
     80 //
     81 //	We chose solution 2 to implement.
     82 //
     83 class nlm_sysid_cleanup : public defer_task {
     84 public:
     85 	nlm_sysid_cleanup() { obj_in_use = false; }
     86 	~nlm_sysid_cleanup() {}
     87 	//
     88 	// Need this method to override base method's habit of deleting
     89 	// object.
     90 	//
     91 	void task_done() {}
     92 
     93 	void execute();
     94 
     95 	void set_lock() { cleanup_lock.lock(); }
     96 	void unlock() { cleanup_lock.unlock(); }
     97 
     98 	void signal() { nlm_cv.signal(); }
     99 	void wait_for_signal() { nlm_cv.wait(&cleanup_lock); }
    100 
    101 	void set_inuse(bool in_use) { obj_in_use = in_use; }
    102 	bool is_inuse() { return (obj_in_use); }
    103 
    104 private:
    105 	//
    106 	// The reason to define a mutex lock here is to synchronize access
    107 	// from a thread putting the clean up object on the deferred
    108 	// task queue and a thread executing the deferred task.
    109 	//
    110 	os::mutex_t	cleanup_lock;
    111 	bool		obj_in_use;
    112 
    113 	// Condition variable used by 'execute' and 'nlm_fini'
    114 	os::condvar_t	nlm_cv;
    115 };
    116 
    117 //
    118 // Allocate deferred task object.  Exists for all time.  The reason
    119 // we allocate from the heap instead of statically is to avoid a
    120 // possible race condition.  Sun Cluster does
    121 // not control the time when a statically constructed object
    122 // actually gets created. It is theoretically possible that
    123 // immediately after the module gets loaded and before the statically
    124 // constructed object gets created, the system might try to
    125 // exercise this code.
    126 //
    127 static nlm_sysid_cleanup *nlm_cleanup_task = NULL;
    128 
    129 extern "C" {
    130 	extern void (*lm_set_nlm_status)(int, flk_nlm_status_t);
    131 	extern void (*lm_remove_file_locks)(int);
    132 	extern int  (*lm_has_file_locks)(int, int);
    133 	extern void (*lm_free_nlm_sysid_table)(void);
    134 	extern void lm_free_sysids(void *);
    135 }
    136 
    137 //
    138 // Effects:  This method is the workhorse of the deferred task.  It calls
    139 // back into the NLM base Solaris code itself and executes a routine that
    140 // does the actual work.
    141 //
    142 void
    143 nlm_sysid_cleanup::execute()
    144 {
    145 	//
    146 	// The following invocation is a call directly into the bowels
    147 	// of the NLM itself.  The routine iterates over the list of
    148 	// sysids, checks to see whether a sysid holds locks anywhere,
    149 	// and if not, frees the sysid storage; if it does, skips.
    150 	//
    151 	// Note that the actual argument to this base Solaris NLM routine
    152 	// doesn't matter because that argument is never used in the
    153 	// body of the routine.
    154 	//
    155 
    156 	lm_free_sysids(NULL);
    157 
    158 	//
    159 	// Change the "in use" flag to false under protection of the
    160 	// mutex.  False means the task object is no longer in use.
    161 	// The nlm_free_sysid_table() method is free to test the "in use"
    162 	// flag.
    163 	//
    164 	nlm_cleanup_task->set_lock();
    165 	nlm_cleanup_task->set_inuse(false);
    166 
    167 	//
    168 	// We must tell the thread calling nlm_fini() to unload the
    169 	// PXFS server module that it's okay to destroy this object.
    170 	//
    171 	nlm_cleanup_task->signal();
    172 
    173 	nlm_cleanup_task->unlock();
    174 
    175 	//
    176 	// Note that the deferred task object is never deleted. It exists
    177 	// for all eternity.
    178 	//
    179 }
    180 
    181 //
    182 // There is one Network Lock Manager (NLM) per cluster node.
    183 // It is implemented by the lockd daemon.  Clients talking to
    184 // an NLM on some node can acquire locks only for those file
    185 // systems that were NFS-exported from that node. Since the
    186 // primary servers for NFS-exported PXFS file systems could
    187 // reside on any node in the cluster, a client could have
    188 // file locks anywhere. The locks acquired through this node's
// NLM are uniquely identified by the node id. Note that the NLM
    190 // is a PXFS client.
    191 //
    192 // There is one Local Lock Manager (LLM) per cluster node.  The
    193 // primary server of each PXFS file system uses the LLM at its
    194 // node to store all locks acquired, whether those locks were
    195 // acquired via an NLM for NFS clients or for local clients
    196 // on the cluster. Thus, there may be two kinds of PXFS file
    197 // locks at this LLM--for NFS clients (NLM) and local cluster
    198 // clients.
    199 //
    200 // The Local Lock Manager (LLM) records locks acquired for local file
    201 // systems like UFS and VxFS that are NFS-exported. Permitting local
    202 // file system NLM locks to coexist with PXFS NLM locks was NOT
    203 // part of the original design of the NLM for Sun Cluster, and
    204 // is therefore, a new requirement. It was envisioned that clients
    205 // of the cluster would only use PXFS. And this design decision
    206 // can be seen in the Solaris NLM code, which assumes that when
    207 // a cluster is booted, PXFS is the only file system being used.
    208 //
    209 // It turns out that we can accommodate this new requirement fairly
    210 // easily in the PXFS code (in this file), which is plugged into the
    211 // Solaris NLM hooks at the time the PXFS server kernel module is
    212 // loaded (loaded at every node).  These routines typically go
    213 // out to all PXFS primary servers in the cluster to do the work
// at the LLMs. In addition, we need to consider the NLM locks for
    215 // the local file system at the PXFS client node (NLM is a PXFS
    216 // client), and so we fiddle with either the sysid or the nlmid
    217 // (setting it to 0) and call the LLM routines.
    218 //
    219 
    220 //
    221 // Effects: Returns 1 (true) if file locks are held by
    222 //   a remote client identified by "sysid" at any
    223 //   fs_collection object. Returns 0 (false) otherwise.
    224 //   Note that this routine is callable from C.
    225 // Parameters:
    226 //   sysid   (IN):  uniquely identifies lock owner
    227 //   chklck  (IN):
    228 //
    229 static int
    230 nlm_has_file_locks(int sysid, int chklck)
    231 {
    232 	// 1. Get the name server
    233 	// 2. For each node in the cluster, fetch the fs collection object
    234 	//    a. invoke has_file_locks() method on fs collection object.
    235 	//	- if any exception is raised, then clear it and continue
    236 	//	- if a result is true, then return true immediately.
    237 	//	- if all results are false, then return false
    238 
    239 	Environment			e;
    240 	naming::naming_context_var	ctxp;
    241 
    242 	ctxp = ns::root_nameserver();	// get root name server
    243 
    244 	naming::binding_list_var bl;
    245 	naming::binding_iterator_var binditer;
    246 
    247 	ctxp->list(MAX_ITEMS, bl, binditer, e);
    248 	//
    249 	// The name server should never fail, as long as one node of the
    250 	// cluster is up.
    251 	//
    252 	if (e.exception()) {
    253 		e.clear();
    254 		return (0);
    255 	}
    256 
    257 	uint_t len = bl->length();
    258 
    259 	bool result = false;
    260 	int retres = 0;
    261 	for (uint_t index = 0; index < len; index++) {
    262 		char *bindname = bl[index].binding_name;
    263 
    264 		//
    265 		// Compare just the first 12 characters of the name, those
    266 		// characters being "fscollection".
    267 		//
    268 		if (os::strncmp(bindname, fscollection_name,
    269 			fscollection_name_size) == 0) {
    270 			//
    271 			// If the name server cannot resolve the name, then
    272 			// this node must have died.  We continue with next
    273 			// iteration as if the node is dead, and no locks
    274 			// are held.
    275 			//
    276 			CORBA::Object_var obj = ctxp->resolve(bindname, e);
    277 
    278 			if (e.exception()) {
    279 				e.clear();
    280 				LOCK_DBPRINTF(
    281 				    LOCK_TRACE_NLM,
    282 				    LOCK_RED,
    283 				    ("nlm_has_file_locks: can't resolve "
    284 				    "name\n"));
    285 				continue;
    286 			}
    287 			ASSERT(is_not_nil(obj));
    288 
    289 			// coerce corba object to fs_collection object
    290 			fs::fs_collection_var fscoll =
    291 				fs::fs_collection::_narrow(obj);
    292 			ASSERT(is_not_nil(fscoll));
    293 
    294 			result = fscoll->has_file_locks(sysid, chklck, e);
    295 
    296 			//
    297 			// If any exceptions were raised, then the method
    298 			// invocation has failed.  We aossume that the remote
    299 			// node failed, and therefore the locks are gone.
    300 			// We just continue with the next iteration.
    301 			//
    302 			if (e.exception()) {
    303 				e.clear();
    304 				continue;
    305 			}
    306 
    307 			//
    308 			// If any method invocation returns true, then
    309 			// the client still holds locks somewhere; we
    310 			// don't care where. Return immediately.
    311 			//
    312 			if (result) {
    313 				retres = 1;
    314 				break;
    315 				//				return (1);
    316 			}
    317 		} // end if
    318 	} // end for
    319 
    320 	//
    321 	// Now that we've checked for clients of PXFS files, we must check
    322 	// for clients of local file systems and ask whether those
    323 	// clients have locks anywhere.
    324 	//
    325 	// Since the Solaris NLM code set the upper 2 bytes to indicate
    326 	// we're in a cluster using PXFS, we need to reset those bytes
    327 	// to 0 here so that we can ask about locks for the local file
    328 	// system.  Calling this macro extracts just the sysid (lower
    329 	// two bytes) and sets it to a new variable; this effectively
    330 	// sets the upper two bytes to 0.
    331 	//
    332 	int local_sysid = GETSYSID(sysid);
    333 
    334 	//
    335 	// Check to see if the client holds NLM locks on the local
    336 	// file system. Ask the LLM in a direct call to the Solaris code.
    337 	//
    338 	if (flk_sysid_has_locks(local_sysid, chklck) == 1) {
    339 		retres = 1;
    340 	} // end if
    341 
    342 	return (retres);
    343 }
    344 // nlm_has_file_locks
    345 
    346 
    347 //
    348 // Effects:  This routine sets the status of the local lock manager
    349 //   to the argument "status" for the given NLM identified by argument
    350 //   "nlmid." It also sets the status for NLM locks obtained for the
    351 //   local file system at the NLM's node. Returns nothing.
    352 //   Note that this routine is callable from C.
    353 // Parameters:
    354 //	nlmid	(IN): node id of NLM server
    355 //	status	(IN): state of NLM server
    356 //			legal values are NLM_UP, NLM_DOWN,
    357 //			and NLM_SHUTTING_DOWN.
    358 //
    359 static void
    360 nlm_set_nlm_status(int32_t nlmid, flk_nlm_status_t status)
    361 {
    362 	fs::nlm_status new_status = fs::nlm_up;
    363 	flk_nlm_status_t new_state = FLK_NLM_UP;
    364 
    365 	switch (status) {
    366 	case FLK_NLM_UP:
    367 		new_status = fs::nlm_up;
    368 		new_state = FLK_NLM_UP;
    369 		break;
    370 	case FLK_NLM_DOWN:
    371 		new_status = fs::nlm_down;
    372 		new_state = FLK_NLM_DOWN;
    373 		break;
    374 	case FLK_NLM_SHUTTING_DOWN:
    375 		new_status = fs::nlm_shutting_down;
    376 		new_state = FLK_NLM_SHUTTING_DOWN;
    377 		break;
    378 	default:
    379 		ASSERT(0);
    380 		// NOTREACHED
    381 	} // end switch
    382 
    383 	//
    384 	// When we change the state of the NLM "nlmid" we must consider
    385 	// the local file system's NLM locks. The following call talks
    386 	// directly to the LLM on node "nlmid." To identify the local
    387 	// file system's NLM locks, we set nlmid to 0.
    388 	//
    389 	// The reason we can merely set the nlmid to 0 is that we
    390 	// are taking advantage of a loophole in the called routine.
    391 	// Note that in the Solaris routine "cl_flk_set_nlm_status(nlmid,
    392 	// state)" in usr/src/uts/common/os/flock.c, there's a "Requires"
    393 	// clause in the comment, which says the following:
    394 	//    "nlmid" must be >= 1 and <= clconf_maximum_nodeid()"
    395 	// It says "Requires" because it cannot enforce it. The routine
    396 	// returns void, that is, nothing, so we cannot report an error.
    397 	// We leave it to the caller to ensure the constraint. Since
    398 	// the routine doesn't enforce, that leaves a loophole.
    399 	// This solution exploits that loophole. We can consider
    400 	// UFS locks in this routine even though we're booted as
    401 	// a cluster, by setting the first argument to 0. Someday
    402 	// we should update that comment in Solaris.
    403 	//
    404 	cl_flk_set_nlm_status(0, new_state);
    405 
    406 	//
    407 	// The following call considers NLM locks acquired for
    408 	// PXFS files only. It sets the status for the NLM locks
    409 	// acquired via the NLM residing at node "nlmid." This may have
    410 	// the effect of talking to all PXFS file systems in the cluster
    411 	// and setting the status at the LLMs serving those file systems.
    412 	//
    413 	// Note that the old and new enum's for nlm_status are identical.
    414 	// Different enum definitions are used to prevent a circular dependency
    415 	// due to rolling upgrade.
    416 	//
    417 	pxvfs::set_nlm_status(nlmid, (pxfs_v1::nlm_status)new_status);
    418 }
    419 
    420 //
    421 // Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
    422 //	quantity, the real sysid generated by the NLM server; the upper half
    423 //	identifies the node of the cluster where the NLM server ran.
    424 // Effects: Delete all the advisory file locks held by a remote client
    425 //    at all fs_collection objects.  The client is identified by "sysid."
    426 //    It also remove NLM locks for all local file systems.
    427 //   Note that this routine is callable from C.
    428 // Parameters:
    429 //   sysid   (IN):  uniquely identifies lock owner
    430 //
    431 static void
    432 nlm_remove_file_locks(int32_t sysid)
    433 {
    434 	//
    435 	// Remove the NLM locks on a local file system.  Set the upper
    436 	// bytes of the sysid to 0, which indicates local locks.
    437 	// The following macro extracts the low-order two bytes and
    438 	// assigns them to a new variable; this effectively makes the
    439 	// high-order two bytes to be 0.  The reason we do this instead
    440 	// of something more straightforward like the in-line routine
    441 	//   pxfslib::set_nodeid(sysid, 0)
    442 	// is that that that routine does NOT work correctly when the
    443 	// nodeid is set to 0; in fact, it's a no-op.
    444 	//
    445 	int new_sysid = GETSYSID(sysid);
    446 	cl_flk_remove_locks_by_sysid(new_sysid);
    447 
    448 	//
    449 	// Remove the NLM file locks on PXFS files only. The
    450 	// lock owner is identified by the sysid [nlmid, sysid]
    451 	// pair on all PXFS file systems.
    452 	//
    453 	pxvfs::remove_file_locks(sysid);
    454 }
    455 
    456 //
    457 // Effects:  This routine creates a deferred task from the common
    458 // threadpool.  The task will call back into the NLM base Solaris code
    459 // to the actual work of freeing any sysids not holding locks.
    460 //
    461 // Note:  This routine will be plugged into the lm_free_nlm_sysid_table
    462 // hook in the base Solaris code by "nlm_init" when the PXFS server kernel
    463 // module is loaded.
    464 //
    465 static void
    466 nlm_free_sysid_table()
    467 {
    468 	//
    469 	// If this callback is called before the kernel module is completely
    470 	// loaded, then the deferred task object could be NULL.  We bail
    471 	// out if this is the case.
    472 	//
    473 	if (nlm_cleanup_task == NULL) {
    474 		return;
    475 	}
    476 
    477 	nlm_cleanup_task->set_lock();
    478 
    479 	//
    480 	// If the object is not in use, then flag it as "in use" and
    481 	// then queue it on the deferred task queue for eventual
    482 	// execution.
    483 	//
    484 	if (!nlm_cleanup_task->is_inuse()) {
    485 		nlm_cleanup_task->set_inuse(true);
    486 		nlm_cleanup_task->unlock();
    487 
    488 		//
    489 		// Enqueue nlm_cleanup_task object on the deferred task
    490 		// queue.
    491 		//
    492 		common_threadpool::the().defer_processing(nlm_cleanup_task);
    493 
    494 		return;
    495 	} // end if
    496 
    497 	nlm_cleanup_task->unlock();
    498 }
    499 
    500 void
    501 nlm_init()
    502 {
    503 	//
    504 	// Allocate the deferred task here when the PXFS server module
    505 	// loaded.
    506 	//
    507 	if (nlm_cleanup_task == NULL) {
    508 		nlm_cleanup_task = new nlm_sysid_cleanup();
    509 	}
    510 
    511 	lm_set_nlm_status = nlm_set_nlm_status;
    512 	lm_remove_file_locks = nlm_remove_file_locks;
    513 	lm_has_file_locks = nlm_has_file_locks;
    514 	lm_free_nlm_sysid_table = nlm_free_sysid_table;
    515 }
    516 
    517 void
    518 nlm_fini()
    519 {
    520 	if (nlm_cleanup_task != NULL) {
    521 		//
    522 		// Before destroying the deferred task object, wait
    523 		// until it's no longer in use.  We'll be signalled by
    524 		// the thread executing 'execute' method.
    525 		//
    526 		nlm_cleanup_task->set_lock();
    527 		while (true) {
    528 			nlm_cleanup_task->wait_for_signal();
    529 
    530 			if (!nlm_cleanup_task->is_inuse()) {
    531 				nlm_cleanup_task->unlock();
    532 
    533 				// Delete the task object.
    534 				delete nlm_cleanup_task;
    535 			} // end if
    536 		} // end while
    537 	} // end if
    538 
    539 	lm_set_nlm_status = NULL;
    540 	lm_remove_file_locks = NULL;
    541 	lm_has_file_locks = NULL;
    542 	lm_free_nlm_sysid_table = NULL;
    543 }
    544