1 // 2 // CDDL HEADER START 3 // 4 // The contents of this file are subject to the terms of the 5 // Common Development and Distribution License (the License). 6 // You may not use this file except in compliance with the License. 7 // 8 // You can obtain a copy of the license at usr/src/CDDL.txt 9 // or http://www.opensolaris.org/os/licensing. 10 // See the License for the specific language governing permissions 11 // and limitations under the License. 12 // 13 // When distributing Covered Code, include this CDDL HEADER in each 14 // file and include the License file at usr/src/CDDL.txt. 15 // If applicable, add the following below this CDDL HEADER, with the 16 // fields enclosed by brackets [] replaced with your own identifying 17 // information: Portions Copyright [yyyy] [name of copyright owner] 18 // 19 // CDDL HEADER END 20 // 21 22 // 23 // Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 // Use is subject to license terms. 25 // 26 27 #pragma ident "@(#)nlm_pxfs.cc 1.42 08/05/20 SMI" 28 29 #include <sys/flock.h> 30 #include <sys/fcntl.h> 31 32 #include <h/naming.h> 33 #include <nslib/ns.h> 34 #include <orb/infrastructure/orbthreads.h> 35 36 #include <pxfs/lock/fs_collection_impl.h> 37 #include <pxfs/lock/lock_debug.h> 38 #include <pxfs/client/pxvfs.h> 39 40 enum { 41 MAX_ITEMS = 1000 42 }; 43 44 const char fscollection_name[] = "fscollection"; 45 const unsigned long fscollection_name_size = 12; 46 47 // 48 // This new class is used to create a deferred task in the ORB that 49 // will clean up the NLM's lm_sysid cache for sysids (identifies remote 50 // clients) that aren't holding any locks. The "execute" method will 51 // call back into the NLM base Solaris code to execute lm_free_sysids(), 52 // which does the actual work. 53 // 54 // In the old implementation, the constructor for this object was 55 // called everytime the Solaris hook lm_free_nlm_sysid_table() 56 // was invoked by the cache reaper thread. If memory was low, then the 57 // the constructor would block, waiting for memory to free up, but this 58 // also blocked the single-threaded reaper thread from proceeding. 59 // 60 // There are two solutions: 61 // 1. Qualify the new statement with (os::NO_SLEEP), so that the 62 // allocation won't block if there's no memory. This is the simplest 63 // fix because it won't block the reaper thread. A deferred 64 // task object will be created for every invocation. 65 // 66 // 2. In this solution, we observe that 67 // if a deferred task object has already been created to free sysid 68 // storage, there's no need to create another one if the first still 69 // exists (which means it hasn't yet been executed). The first will do 70 // the same thing as all the others. This has two benefits: saving 71 // on storage at a critical time and saving the processing time of 72 // allocating storage. The approach is to allocate exactly one 73 // nlm_sysid_cleanup object at the time the PXFS server kernel module 74 // is loaded and exists for all time until the module is unloaded. 75 // The object synchronizes concurrrent access between the reaper 76 // thread and the deferred task's thread executing method "execute." 77 // If the object is "in use" it means it's already on the queue, so 78 // there's nothing to do but return. If not "in use" then flag 79 // it as "in use" and enqueue on the deferred task queue. 80 // 81 // We chose solution 2 to implement. 82 // 83 class nlm_sysid_cleanup : public defer_task { 84 public: 85 nlm_sysid_cleanup() { obj_in_use = false; } 86 ~nlm_sysid_cleanup() {} 87 // 88 // Need this method to override base method's habit of deleting 89 // object. 90 // 91 void task_done() {} 92 93 void execute(); 94 95 void set_lock() { cleanup_lock.lock(); } 96 void unlock() { cleanup_lock.unlock(); } 97 98 void signal() { nlm_cv.signal(); } 99 void wait_for_signal() { nlm_cv.wait(&cleanup_lock); } 100 101 void set_inuse(bool in_use) { obj_in_use = in_use; } 102 bool is_inuse() { return (obj_in_use); } 103 104 private: 105 // 106 // The reason to define a mutex lock here is to synchronize access 107 // from a thread putting the clean up object on the deferred 108 // task queue and a thread executing the deferred task. 109 // 110 os::mutex_t cleanup_lock; 111 bool obj_in_use; 112 113 // Condition variable used by 'execute' and 'nlm_fini' 114 os::condvar_t nlm_cv; 115 }; 116 117 // 118 // Allocate deferred task object. Exists for all time. The reason 119 // we allocate from the heap instead of statically is to avoid a 120 // possible race condition. Sun Cluster does 121 // not control the time when a statically constructed object 122 // actually gets created. It is theoretically possible that 123 // immediately after the module gets loaded and before the statically 124 // constructed object gets created, the system might try to 125 // exercise this code. 126 // 127 static nlm_sysid_cleanup *nlm_cleanup_task = NULL; 128 129 extern "C" { 130 extern void (*lm_set_nlm_status)(int, flk_nlm_status_t); 131 extern void (*lm_remove_file_locks)(int); 132 extern int (*lm_has_file_locks)(int, int); 133 extern void (*lm_free_nlm_sysid_table)(void); 134 extern void lm_free_sysids(void *); 135 } 136 137 // 138 // Effects: This method is the workhorse of the deferred task. It calls 139 // back into the NLM base Solaris code itself and executes a routine that 140 // does the actual work. 141 // 142 void 143 nlm_sysid_cleanup::execute() 144 { 145 // 146 // The following invocation is a call directly into the bowels 147 // of the NLM itself. The routine iterates over the list of 148 // sysids, checks to see whether a sysid holds locks anywhere, 149 // and if not, frees the sysid storage; if it does, skips. 150 // 151 // Note that the actual argument to this base Solaris NLM routine 152 // doesn't matter because that argument is never used in the 153 // body of the routine. 154 // 155 156 lm_free_sysids(NULL); 157 158 // 159 // Change the "in use" flag to false under protection of the 160 // mutex. False means the task object is no longer in use. 161 // The nlm_free_sysid_table() method is free to test the "in use" 162 // flag. 163 // 164 nlm_cleanup_task->set_lock(); 165 nlm_cleanup_task->set_inuse(false); 166 167 // 168 // We must tell the thread calling nlm_fini() to unload the 169 // PXFS server module that it's okay to destroy this object. 170 // 171 nlm_cleanup_task->signal(); 172 173 nlm_cleanup_task->unlock(); 174 175 // 176 // Note that the deferred task object is never deleted. It exists 177 // for all eternity. 178 // 179 } 180 181 // 182 // There is one Network Lock Manager (NLM) per cluster node. 183 // It is implemented by the lockd daemon. Clients talking to 184 // an NLM on some node can acquire locks only for those file 185 // systems that were NFS-exported from that node. Since the 186 // primary servers for NFS-exported PXFS file systems could 187 // reside on any node in the cluster, a client could have 188 // file locks anywhere. The locks acquired through this node's 189 // NLM are uniquely identifed by the node id. Note that the NLM 190 // is a PXFS client. 191 // 192 // There is one Local Lock Manager (LLM) per cluster node. The 193 // primary server of each PXFS file system uses the LLM at its 194 // node to store all locks acquired, whether those locks were 195 // acquired via an NLM for NFS clients or for local clients 196 // on the cluster. Thus, there may be two kinds of PXFS file 197 // locks at this LLM--for NFS clients (NLM) and local cluster 198 // clients. 199 // 200 // The Local Lock Manager (LLM) records locks acquired for local file 201 // systems like UFS and VxFS that are NFS-exported. Permitting local 202 // file system NLM locks to coexist with PXFS NLM locks was NOT 203 // part of the original design of the NLM for Sun Cluster, and 204 // is therefore, a new requirement. It was envisioned that clients 205 // of the cluster would only use PXFS. And this design decision 206 // can be seen in the Solaris NLM code, which assumes that when 207 // a cluster is booted, PXFS is the only file system being used. 208 // 209 // It turns out that we can accommodate this new requirement fairly 210 // easily in the PXFS code (in this file), which is plugged into the 211 // Solaris NLM hooks at the time the PXFS server kernel module is 212 // loaded (loaded at every node). These routines typically go 213 // out to all PXFS primary servers in the cluster to do the work 214 // at the LLMs. In additon, we need to consider the NLM locks for 215 // the local file system at the PXFS client node (NLM is a PXFS 216 // client), and so we fiddle with either the sysid or the nlmid 217 // (setting it to 0) and call the LLM routines. 218 // 219 220 // 221 // Effects: Returns 1 (true) if file locks are held by 222 // a remote client identified by "sysid" at any 223 // fs_collection object. Returns 0 (false) otherwise. 224 // Note that this routine is callable from C. 225 // Parameters: 226 // sysid (IN): uniquely identifies lock owner 227 // chklck (IN): 228 // 229 static int 230 nlm_has_file_locks(int sysid, int chklck) 231 { 232 // 1. Get the name server 233 // 2. For each node in the cluster, fetch the fs collection object 234 // a. invoke has_file_locks() method on fs collection object. 235 // - if any exception is raised, then clear it and continue 236 // - if a result is true, then return true immediately. 237 // - if all results are false, then return false 238 239 Environment e; 240 naming::naming_context_var ctxp; 241 242 ctxp = ns::root_nameserver(); // get root name server 243 244 naming::binding_list_var bl; 245 naming::binding_iterator_var binditer; 246 247 ctxp->list(MAX_ITEMS, bl, binditer, e); 248 // 249 // The name server should never fail, as long as one node of the 250 // cluster is up. 251 // 252 if (e.exception()) { 253 e.clear(); 254 return (0); 255 } 256 257 uint_t len = bl->length(); 258 259 bool result = false; 260 int retres = 0; 261 for (uint_t index = 0; index < len; index++) { 262 char *bindname = bl[index].binding_name; 263 264 // 265 // Compare just the first 12 characters of the name, those 266 // characters being "fscollection". 267 // 268 if (os::strncmp(bindname, fscollection_name, 269 fscollection_name_size) == 0) { 270 // 271 // If the name server cannot resolve the name, then 272 // this node must have died. We continue with next 273 // iteration as if the node is dead, and no locks 274 // are held. 275 // 276 CORBA::Object_var obj = ctxp->resolve(bindname, e); 277 278 if (e.exception()) { 279 e.clear(); 280 LOCK_DBPRINTF( 281 LOCK_TRACE_NLM, 282 LOCK_RED, 283 ("nlm_has_file_locks: can't resolve " 284 "name\n")); 285 continue; 286 } 287 ASSERT(is_not_nil(obj)); 288 289 // coerce corba object to fs_collection object 290 fs::fs_collection_var fscoll = 291 fs::fs_collection::_narrow(obj); 292 ASSERT(is_not_nil(fscoll)); 293 294 result = fscoll->has_file_locks(sysid, chklck, e); 295 296 // 297 // If any exceptions were raised, then the method 298 // invocation has failed. We aossume that the remote 299 // node failed, and therefore the locks are gone. 300 // We just continue with the next iteration. 301 // 302 if (e.exception()) { 303 e.clear(); 304 continue; 305 } 306 307 // 308 // If any method invocation returns true, then 309 // the client still holds locks somewhere; we 310 // don't care where. Return immediately. 311 // 312 if (result) { 313 retres = 1; 314 break; 315 // return (1); 316 } 317 } // end if 318 } // end for 319 320 // 321 // Now that we've checked for clients of PXFS files, we must check 322 // for clients of local file systems and ask whether those 323 // clients have locks anywhere. 324 // 325 // Since the Solaris NLM code set the upper 2 bytes to indicate 326 // we're in a cluster using PXFS, we need to reset those bytes 327 // to 0 here so that we can ask about locks for the local file 328 // system. Calling this macro extracts just the sysid (lower 329 // two bytes) and sets it to a new variable; this effectively 330 // sets the upper two bytes to 0. 331 // 332 int local_sysid = GETSYSID(sysid); 333 334 // 335 // Check to see if the client holds NLM locks on the local 336 // file system. Ask the LLM in a direct call to the Solaris code. 337 // 338 if (flk_sysid_has_locks(local_sysid, chklck) == 1) { 339 retres = 1; 340 } // end if 341 342 return (retres); 343 } 344 // nlm_has_file_locks 345 346 347 // 348 // Effects: This routine sets the status of the local lock manager 349 // to the argument "status" for the given NLM identified by argument 350 // "nlmid." It also sets the status for NLM locks obtained for the 351 // local file system at the NLM's node. Returns nothing. 352 // Note that this routine is callable from C. 353 // Parameters: 354 // nlmid (IN): node id of NLM server 355 // status (IN): state of NLM server 356 // legal values are NLM_UP, NLM_DOWN, 357 // and NLM_SHUTTING_DOWN. 358 // 359 static void 360 nlm_set_nlm_status(int32_t nlmid, flk_nlm_status_t status) 361 { 362 fs::nlm_status new_status = fs::nlm_up; 363 flk_nlm_status_t new_state = FLK_NLM_UP; 364 365 switch (status) { 366 case FLK_NLM_UP: 367 new_status = fs::nlm_up; 368 new_state = FLK_NLM_UP; 369 break; 370 case FLK_NLM_DOWN: 371 new_status = fs::nlm_down; 372 new_state = FLK_NLM_DOWN; 373 break; 374 case FLK_NLM_SHUTTING_DOWN: 375 new_status = fs::nlm_shutting_down; 376 new_state = FLK_NLM_SHUTTING_DOWN; 377 break; 378 default: 379 ASSERT(0); 380 // NOTREACHED 381 } // end switch 382 383 // 384 // When we change the state of the NLM "nlmid" we must consider 385 // the local file system's NLM locks. The following call talks 386 // directly to the LLM on node "nlmid." To identify the local 387 // file system's NLM locks, we set nlmid to 0. 388 // 389 // The reason we can merely set the nlmid to 0 is that we 390 // are taking advantage of a loophole in the called routine. 391 // Note that in the Solaris routine "cl_flk_set_nlm_status(nlmid, 392 // state)" in usr/src/uts/common/os/flock.c, there's a "Requires" 393 // clause in the comment, which says the following: 394 // "nlmid" must be >= 1 and <= clconf_maximum_nodeid()" 395 // It says "Requires" because it cannot enforce it. The routine 396 // returns void, that is, nothing, so we cannot report an error. 397 // We leave it to the caller to ensure the constraint. Since 398 // the routine doesn't enforce, that leaves a loophole. 399 // This solution exploits that loophole. We can consider 400 // UFS locks in this routine even though we're booted as 401 // a cluster, by setting the first argument to 0. Someday 402 // we should update that comment in Solaris. 403 // 404 cl_flk_set_nlm_status(0, new_state); 405 406 // 407 // The following call considers NLM locks acquired for 408 // PXFS files only. It sets the status for the NLM locks 409 // acquired via the NLM residing at node "nlmid." This may have 410 // the effect of talking to all PXFS file systems in the cluster 411 // and setting the status at the LLMs serving those file systems. 412 // 413 // Note that the old and new enum's for nlm_status are identical. 414 // Different enum definitions are used to prevent a circular dependency 415 // due to rolling upgrade. 416 // 417 pxvfs::set_nlm_status(nlmid, (pxfs_v1::nlm_status)new_status); 418 } 419 420 // 421 // Requires: "sysid" is a pair [nlmid, sysid]. The lower half is 16-bit 422 // quantity, the real sysid generated by the NLM server; the upper half 423 // identifies the node of the cluster where the NLM server ran. 424 // Effects: Delete all the advisory file locks held by a remote client 425 // at all fs_collection objects. The client is identified by "sysid." 426 // It also remove NLM locks for all local file systems. 427 // Note that this routine is callable from C. 428 // Parameters: 429 // sysid (IN): uniquely identifies lock owner 430 // 431 static void 432 nlm_remove_file_locks(int32_t sysid) 433 { 434 // 435 // Remove the NLM locks on a local file system. Set the upper 436 // bytes of the sysid to 0, which indicates local locks. 437 // The following macro extracts the low-order two bytes and 438 // assigns them to a new variable; this effectively makes the 439 // high-order two bytes to be 0. The reason we do this instead 440 // of something more straightforward like the in-line routine 441 // pxfslib::set_nodeid(sysid, 0) 442 // is that that that routine does NOT work correctly when the 443 // nodeid is set to 0; in fact, it's a no-op. 444 // 445 int new_sysid = GETSYSID(sysid); 446 cl_flk_remove_locks_by_sysid(new_sysid); 447 448 // 449 // Remove the NLM file locks on PXFS files only. The 450 // lock owner is identified by the sysid [nlmid, sysid] 451 // pair on all PXFS file systems. 452 // 453 pxvfs::remove_file_locks(sysid); 454 } 455 456 // 457 // Effects: This routine creates a deferred task from the common 458 // threadpool. The task will call back into the NLM base Solaris code 459 // to the actual work of freeing any sysids not holding locks. 460 // 461 // Note: This routine will be plugged into the lm_free_nlm_sysid_table 462 // hook in the base Solaris code by "nlm_init" when the PXFS server kernel 463 // module is loaded. 464 // 465 static void 466 nlm_free_sysid_table() 467 { 468 // 469 // If this callback is called before the kernel module is completely 470 // loaded, then the deferred task object could be NULL. We bail 471 // out if this is the case. 472 // 473 if (nlm_cleanup_task == NULL) { 474 return; 475 } 476 477 nlm_cleanup_task->set_lock(); 478 479 // 480 // If the object is not in use, then flag it as "in use" and 481 // then queue it on the deferred task queue for eventual 482 // execution. 483 // 484 if (!nlm_cleanup_task->is_inuse()) { 485 nlm_cleanup_task->set_inuse(true); 486 nlm_cleanup_task->unlock(); 487 488 // 489 // Enqueue nlm_cleanup_task object on the deferred task 490 // queue. 491 // 492 common_threadpool::the().defer_processing(nlm_cleanup_task); 493 494 return; 495 } // end if 496 497 nlm_cleanup_task->unlock(); 498 } 499 500 void 501 nlm_init() 502 { 503 // 504 // Allocate the deferred task here when the PXFS server module 505 // loaded. 506 // 507 if (nlm_cleanup_task == NULL) { 508 nlm_cleanup_task = new nlm_sysid_cleanup(); 509 } 510 511 lm_set_nlm_status = nlm_set_nlm_status; 512 lm_remove_file_locks = nlm_remove_file_locks; 513 lm_has_file_locks = nlm_has_file_locks; 514 lm_free_nlm_sysid_table = nlm_free_sysid_table; 515 } 516 517 void 518 nlm_fini() 519 { 520 if (nlm_cleanup_task != NULL) { 521 // 522 // Before destroying the deferred task object, wait 523 // until it's no longer in use. We'll be signalled by 524 // the thread executing 'execute' method. 525 // 526 nlm_cleanup_task->set_lock(); 527 while (true) { 528 nlm_cleanup_task->wait_for_signal(); 529 530 if (!nlm_cleanup_task->is_inuse()) { 531 nlm_cleanup_task->unlock(); 532 533 // Delete the task object. 534 delete nlm_cleanup_task; 535 } // end if 536 } // end while 537 } // end if 538 539 lm_set_nlm_status = NULL; 540 lm_remove_file_locks = NULL; 541 lm_has_file_locks = NULL; 542 lm_free_nlm_sysid_table = NULL; 543 } 544