Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /* ONC_PLUS EXTRACT START */
     22 
     23 /*
     24  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     25  * Use is subject to license terms.
     26  */
     27 
     28 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
     29 /*	All Rights Reserved */
     30 
     31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     32 
     33 #include <sys/flock_impl.h>
     34 #include <sys/vfs.h>
     35 #include <sys/t_lock.h>		/* for <sys/callb.h> */
     36 #include <sys/callb.h>
     37 #include <sys/clconf.h>
     38 #include <sys/cladm.h>
     39 #include <sys/nbmlock.h>
     40 #include <sys/cred.h>
     41 #include <sys/policy.h>
     42 
     43 /*
     44  * The following four variables are for statistics purposes and they are
     45  * not protected by locks. They may not be accurate but will at least be
     46  * close to the actual value.
     47  */
     48 
     49 int	flk_lock_allocs;
     50 int	flk_lock_frees;
     51 int 	edge_allocs;
     52 int	edge_frees;
     53 int 	flk_proc_vertex_allocs;
     54 int 	flk_proc_edge_allocs;
     55 int	flk_proc_vertex_frees;
     56 int	flk_proc_edge_frees;
     57 
     58 static kmutex_t flock_lock;
     59 
     60 #ifdef DEBUG
     61 int check_debug = 0;
     62 #define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug) \
     63 					check_active_locks(gp);
     64 #define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug) \
     65 						check_sleeping_locks(gp);
     66 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp) 	\
     67 		if (check_debug)	\
     68 			check_owner_locks(gp, pid, sysid, vp);
     69 #define	CHECK_LOCK_TRANSITION(old_state, new_state) \
     70 	{ \
     71 		if (check_lock_transition(old_state, new_state)) { \
     72 			cmn_err(CE_PANIC, "Illegal lock transition \
     73 			    from %d to %d", old_state, new_state); \
     74 		} \
     75 	}
     76 #else
     77 
     78 #define	CHECK_ACTIVE_LOCKS(gp)
     79 #define	CHECK_SLEEPING_LOCKS(gp)
     80 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
     81 #define	CHECK_LOCK_TRANSITION(old_state, new_state)
     82 
     83 #endif /* DEBUG */
     84 
     85 struct kmem_cache	*flk_edge_cache;
     86 
     87 graph_t		*lock_graph[HASH_SIZE];
     88 proc_graph_t	pgraph;
     89 
     90 /*
     91  * Clustering.
     92  *
     93  * NLM REGISTRY TYPE IMPLEMENTATION
     94  *
     95  * Assumptions:
     96  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
     97  *	integers; maximum node id is returned by clconf_maximum_nodeid().
     98  *  2.  We use this node id to identify the node an NLM server runs on.
     99  */
    100 
    101 /*
    102  * NLM registry object keeps track of NLM servers via their
    103  * nlmids (which are the node ids of the node in the cluster they run on)
    104  * that have requested locks at this LLM with which this registry is
    105  * associated.
    106  *
    107  * Representation of abstraction:
    108  *    rep = record[	states: array[nlm_state],
    109  *			lock: mutex]
    110  *
    111  *    Representation invariants:
    112  *	1. index i of rep.states is between 0 and n - 1 where n is number
    113  *	   of elements in the array, which happen to be the maximum number
    114  *	   of nodes in the cluster configuration + 1.
    115  *	2. map nlmid to index i of rep.states
    116  *		0   -> 0
    117  *		1   -> 1
    118  *		2   -> 2
    119  *		n-1 -> clconf_maximum_nodeid()+1
    120  *	3.  This 1-1 mapping is quite convenient and it avoids errors resulting
    121  *	    from forgetting to subtract 1 from the index.
    122  *	4.  The reason we keep the 0th index is the following.  A legitimate
    123  *	    cluster configuration includes making a UFS file system NFS
    124  *	    exportable.  The code is structured so that if you're in a cluster
    125  *	    you do one thing; otherwise, you do something else.  The problem
    126  *	    is what to do if you think you're in a cluster with PXFS loaded,
    127  *	    but you're using UFS not PXFS?  The upper two bytes of the sysid
    128  *	    encode the node id of the node where NLM server runs; these bytes
    129  *	    are zero for UFS.  Since the nodeid is used to index into the
    130  *	    registry, we can record the NLM server state information at index
    131  *	    0 using the same mechanism used for PXFS file locks!
    132  */
    133 static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
    134 static kmutex_t nlm_reg_lock;			/* lock to protect arrary */
    135 static uint_t nlm_status_size;			/* size of state array */
    136 
    137 /*
    138  * Although we need a global lock dependency graph (and associated data
    139  * structures), we also need a per-zone notion of whether the lock manager is
    140  * running, and so whether to allow lock manager requests or not.
    141  *
    142  * Thus, on a per-zone basis we maintain a ``global'' variable
    143  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
    144  * manager is determined to be changing state (starting or stopping).
    145  *
    146  * Each graph/zone pair also has a copy of this variable, which is protected by
    147  * the graph's mutex.
    148  *
    149  * The per-graph copies are used to synchronize lock requests with shutdown
    150  * requests.  The global copy is used to initialize the per-graph field when a
    151  * new graph is created.
    152  */
    153 struct flock_globals {
    154 	flk_lockmgr_status_t flk_lockmgr_status;
    155 	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
    156 };
    157 
    158 zone_key_t flock_zone_key;
    159 
    160 static void create_flock(lock_descriptor_t *, flock64_t *);
    161 static lock_descriptor_t	*flk_get_lock(void);
    162 static void	flk_free_lock(lock_descriptor_t	*lock);
    163 static void	flk_get_first_blocking_lock(lock_descriptor_t *request);
    164 static int flk_process_request(lock_descriptor_t *);
    165 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
    166 static edge_t *flk_get_edge(void);
    167 static int flk_wait_execute_request(lock_descriptor_t *);
    168 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
    169 static void flk_insert_active_lock(lock_descriptor_t *);
    170 static void flk_delete_active_lock(lock_descriptor_t *, int);
    171 static void flk_insert_sleeping_lock(lock_descriptor_t *);
    172 static void flk_graph_uncolor(graph_t *);
    173 static void flk_wakeup(lock_descriptor_t *, int);
    174 static void flk_free_edge(edge_t *);
    175 static void flk_recompute_dependencies(lock_descriptor_t *,
    176 			lock_descriptor_t **,  int, int);
    177 static int flk_find_barriers(lock_descriptor_t *);
    178 static void flk_update_barriers(lock_descriptor_t *);
    179 static int flk_color_reachables(lock_descriptor_t *);
    180 static int flk_canceled(lock_descriptor_t *);
    181 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
    182 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
    183 static void wait_for_lock(lock_descriptor_t *);
    184 static void unlock_lockmgr_granted(struct flock_globals *);
    185 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
    186 
    187 /* Clustering hooks */
    188 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
    189 static void cl_flk_wakeup_sleeping_nlm_locks(int);
    190 static void cl_flk_unlock_nlm_granted(int);
    191 
    192 #ifdef DEBUG
    193 static int check_lock_transition(int, int);
    194 static void check_sleeping_locks(graph_t *);
    195 static void check_active_locks(graph_t *);
    196 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
    197 static void path(lock_descriptor_t *, lock_descriptor_t *);
    198 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
    199 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
    200 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
    201 #endif
    202 
    203 /*	proc_graph function definitions */
    204 static int flk_check_deadlock(lock_descriptor_t *);
    205 static void flk_proc_graph_uncolor(void);
    206 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
    207 static proc_edge_t *flk_get_proc_edge(void);
    208 static void flk_proc_release(proc_vertex_t *);
    209 static void flk_free_proc_edge(proc_edge_t *);
    210 static void flk_update_proc_graph(edge_t *, int);
    211 
    212 /* Non-blocking mandatory locking */
    213 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
    214 			u_offset_t);
    215 
    216 static struct flock_globals *
    217 flk_get_globals(void)
    218 {
    219 	/*
    220 	 * The KLM module had better be loaded if we're attempting to handle
    221 	 * lockmgr requests.
    222 	 */
    223 	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
    224 	return (zone_getspecific(flock_zone_key, curproc->p_zone));
    225 }
    226 
    227 static flk_lockmgr_status_t
    228 flk_get_lockmgr_status(void)
    229 {
    230 	struct flock_globals *fg;
    231 
    232 	ASSERT(MUTEX_HELD(&flock_lock));
    233 
    234 	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
    235 		/*
    236 		 * KLM module not loaded; lock manager definitely not running.
    237 		 */
    238 		return (FLK_LOCKMGR_DOWN);
    239 	}
    240 	fg = flk_get_globals();
    241 	return (fg->flk_lockmgr_status);
    242 }
    243 
    244 /*
    245  * Routine called from fs_frlock in fs/fs_subr.c
    246  */
    247 
    248 int
    249 reclock(vnode_t		*vp,
    250 	flock64_t	*lckdat,
    251 	int		cmd,
    252 	int		flag,
    253 	u_offset_t	offset,
    254 	flk_callback_t	*flk_cbp)
    255 {
    256 	lock_descriptor_t	stack_lock_request;
    257 	lock_descriptor_t	*lock_request;
    258 	int error = 0;
    259 	graph_t	*gp;
    260 	int			nlmid;
    261 
    262 	/*
    263 	 * Check access permissions
    264 	 */
    265 	if ((cmd & SETFLCK) &&
    266 		((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
    267 		(lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
    268 			return (EBADF);
    269 
    270 	/*
    271 	 * for query and unlock we use the stack_lock_request
    272 	 */
    273 
    274 	if ((lckdat->l_type == F_UNLCK) ||
    275 			!((cmd & INOFLCK) || (cmd & SETFLCK))) {
    276 		lock_request = &stack_lock_request;
    277 		(void) bzero((caddr_t)lock_request,
    278 				sizeof (lock_descriptor_t));
    279 
    280 		/*
    281 		 * following is added to make the assertions in
    282 		 * flk_execute_request() to pass through
    283 		 */
    284 
    285 		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
    286 		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
    287 		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
    288 		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
    289 		lock_request->l_status = FLK_INITIAL_STATE;
    290 	} else {
    291 		lock_request = flk_get_lock();
    292 	}
    293 	lock_request->l_state = 0;
    294 	lock_request->l_vnode = vp;
    295 	lock_request->l_zoneid = getzoneid();
    296 
    297 	/*
    298 	 * Convert the request range into the canonical start and end
    299 	 * values.  The NLM protocol supports locking over the entire
    300 	 * 32-bit range, so there's no range checking for remote requests,
    301 	 * but we still need to verify that local requests obey the rules.
    302 	 */
    303 	/* Clustering */
    304 	if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
    305 		ASSERT(lckdat->l_whence == 0);
    306 		lock_request->l_start = lckdat->l_start;
    307 		lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
    308 			lckdat->l_start + (lckdat->l_len - 1);
    309 	} else {
    310 		/* check the validity of the lock range */
    311 		error = flk_convert_lock_data(vp, lckdat,
    312 			&lock_request->l_start, &lock_request->l_end,
    313 			offset);
    314 		if (error) {
    315 			goto done;
    316 		}
    317 		error = flk_check_lock_data(lock_request->l_start,
    318 					    lock_request->l_end, MAXEND);
    319 		if (error) {
    320 			goto done;
    321 		}
    322 	}
    323 
    324 	ASSERT(lock_request->l_end >= lock_request->l_start);
    325 
    326 	lock_request->l_type = lckdat->l_type;
    327 	if (cmd & INOFLCK)
    328 		lock_request->l_state |= IO_LOCK;
    329 	if (cmd & SLPFLCK)
    330 		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
    331 	if (cmd & RCMDLCK)
    332 		lock_request->l_state |= LOCKMGR_LOCK;
    333 	if (cmd & NBMLCK)
    334 		lock_request->l_state |= NBMAND_LOCK;
    335 	/*
    336 	 * Clustering: set flag for PXFS locks
    337 	 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
    338 	 * also be of type 'RCMDLCK'.
    339 	 * We do not _only_ check the GETPXFSID() macro because local PXFS
    340 	 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
    341 	 */
    342 
    343 	if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
    344 		lock_request->l_state |= PXFS_LOCK;
    345 	}
    346 	if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
    347 		if (lock_request->l_type == F_RDLCK ||
    348 			lock_request->l_type == F_WRLCK)
    349 			lock_request->l_state |= QUERY_LOCK;
    350 	}
    351 	lock_request->l_flock = (*lckdat);
    352 	lock_request->l_callbacks = flk_cbp;
    353 
    354 	/*
    355 	 * We are ready for processing the request
    356 	 */
    357 	if (IS_LOCKMGR(lock_request)) {
    358 		/*
    359 		 * If the lock request is an NLM server request ....
    360 		 */
    361 		if (nlm_status_size == 0) { /* not booted as cluster */
    362 			mutex_enter(&flock_lock);
    363 			/*
    364 			 * Bail out if this is a lock manager request and the
    365 			 * lock manager is not supposed to be running.
    366 			 */
    367 			if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
    368 				mutex_exit(&flock_lock);
    369 				error = ENOLCK;
    370 				goto done;
    371 			}
    372 			mutex_exit(&flock_lock);
    373 		} else {			/* booted as a cluster */
    374 			nlmid = GETNLMID(lock_request->l_flock.l_sysid);
    375 			ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
    376 
    377 			mutex_enter(&nlm_reg_lock);
    378 			/*
    379 			 * If the NLM registry does not know about this
    380 			 * NLM server making the request, add its nlmid
    381 			 * to the registry.
    382 			 */
    383 			if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
    384 				nlmid)) {
    385 				FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
    386 			} else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
    387 				nlmid)) {
    388 				/*
    389 				 * If the NLM server is already known (has made
    390 				 * previous lock requests) and its state is
    391 				 * not NLM_UP (means that NLM server is
    392 				 * shutting down), then bail out with an
    393 				 * error to deny the lock request.
    394 				 */
    395 				mutex_exit(&nlm_reg_lock);
    396 				error = ENOLCK;
    397 				goto done;
    398 			}
    399 			mutex_exit(&nlm_reg_lock);
    400 		}
    401 	}
    402 
    403 	/* Now get the lock graph for a particular vnode */
    404 	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
    405 
    406 	/*
    407 	 * We drop rwlock here otherwise this might end up causing a
    408 	 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
    409 	 */
    410 
    411 	if (IS_IO_LOCK(lock_request)) {
    412 		VOP_RWUNLOCK(vp,
    413 			(lock_request->l_type == F_RDLCK) ?
    414 				V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
    415 	}
    416 	mutex_enter(&gp->gp_mutex);
    417 
    418 	lock_request->l_state |= REFERENCED_LOCK;
    419 	lock_request->l_graph = gp;
    420 
    421 	switch (lock_request->l_type) {
    422 	case F_RDLCK:
    423 	case F_WRLCK:
    424 		if (IS_QUERY_LOCK(lock_request)) {
    425 			flk_get_first_blocking_lock(lock_request);
    426 			(*lckdat) = lock_request->l_flock;
    427 			break;
    428 		}
    429 
    430 		/* process the request now */
    431 
    432 		error = flk_process_request(lock_request);
    433 		break;
    434 
    435 	case F_UNLCK:
    436 		/* unlock request will not block so execute it immediately */
    437 
    438 		if (IS_LOCKMGR(lock_request) &&
    439 		    flk_canceled(lock_request)) {
    440 			error = 0;
    441 		} else {
    442 			error = flk_execute_request(lock_request);
    443 		}
    444 		break;
    445 
    446 	case F_UNLKSYS:
    447 		/*
    448 		 * Recovery mechanism to release lock manager locks when
    449 		 * NFS client crashes and restart. NFS server will clear
    450 		 * old locks and grant new locks.
    451 		 */
    452 
    453 		if (lock_request->l_flock.l_sysid == 0) {
    454 			mutex_exit(&gp->gp_mutex);
    455 			return (EINVAL);
    456 		}
    457 		if (secpolicy_nfs(CRED()) != 0) {
    458 			mutex_exit(&gp->gp_mutex);
    459 			return (EPERM);
    460 		}
    461 		flk_delete_locks_by_sysid(lock_request);
    462 		lock_request->l_state &= ~REFERENCED_LOCK;
    463 		flk_set_state(lock_request, FLK_DEAD_STATE);
    464 		flk_free_lock(lock_request);
    465 		mutex_exit(&gp->gp_mutex);
    466 		return (0);
    467 
    468 	default:
    469 		error = EINVAL;
    470 		break;
    471 	}
    472 
    473 	/* Clustering: For blocked PXFS locks, return */
    474 	if (error == PXFS_LOCK_BLOCKED) {
    475 		lock_request->l_state &= ~REFERENCED_LOCK;
    476 		mutex_exit(&gp->gp_mutex);
    477 		return (error);
    478 	}
    479 
    480 	/*
    481 	 * Now that we have seen the status of locks in the system for
    482 	 * this vnode we acquire the rwlock if it is an IO_LOCK.
    483 	 */
    484 
    485 	if (IS_IO_LOCK(lock_request)) {
    486 		(void) VOP_RWLOCK(vp,
    487 			(lock_request->l_type == F_RDLCK) ?
    488 				V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
    489 		if (!error) {
    490 			lckdat->l_type = F_UNLCK;
    491 
    492 			/*
    493 			 * This wake up is needed otherwise
    494 			 * if IO_LOCK has slept the dependents on this
    495 			 * will not be woken up at all. (bugid # 1185482).
    496 			 */
    497 
    498 			flk_wakeup(lock_request, 1);
    499 			flk_set_state(lock_request, FLK_DEAD_STATE);
    500 			flk_free_lock(lock_request);
    501 		}
    502 		/*
    503 		 * else if error had occurred either flk_process_request()
    504 		 * has returned EDEADLK in which case there will be no
    505 		 * dependents for this lock or EINTR from flk_wait_execute_
    506 		 * request() in which case flk_cancel_sleeping_lock()
    507 		 * would have been done. same is true with EBADF.
    508 		 */
    509 	}
    510 
    511 	if (lock_request == &stack_lock_request) {
    512 		flk_set_state(lock_request, FLK_DEAD_STATE);
    513 	} else {
    514 		lock_request->l_state &= ~REFERENCED_LOCK;
    515 		if ((error != 0) || IS_DELETED(lock_request)) {
    516 			flk_set_state(lock_request, FLK_DEAD_STATE);
    517 			flk_free_lock(lock_request);
    518 		}
    519 	}
    520 
    521 	mutex_exit(&gp->gp_mutex);
    522 	return (error);
    523 
    524 done:
    525 	flk_set_state(lock_request, FLK_DEAD_STATE);
    526 	if (lock_request != &stack_lock_request)
    527 		flk_free_lock(lock_request);
    528 	return (error);
    529 }
    530 
    531 /*
    532  * Invoke the callbacks in the given list.  If before sleeping, invoke in
    533  * list order.  If after sleeping, invoke in reverse order.
    534  *
    535  * CPR (suspend/resume) support: if one of the callbacks returns a
    536  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
    537  * while it is sleeping.  There should be at most one callb_cpr_t for the
    538  * thread.
    539  * XXX This is unnecessarily complicated.  The CPR information should just
    540  * get passed in directly through VOP_FRLOCK and reclock, rather than
    541  * sneaking it in via a callback.
    542  */
    543 
    544 callb_cpr_t *
    545 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
    546 {
    547 	callb_cpr_t *cpr_callbackp = NULL;
    548 	callb_cpr_t *one_result;
    549 	flk_callback_t *cb;
    550 
    551 	if (cblist == NULL)
    552 		return (NULL);
    553 
    554 	if (when == FLK_BEFORE_SLEEP) {
    555 		cb = cblist;
    556 		do {
    557 			one_result = (*cb->cb_callback)(when, cb->cb_data);
    558 			if (one_result != NULL) {
    559 				ASSERT(cpr_callbackp == NULL);
    560 				cpr_callbackp = one_result;
    561 			}
    562 			cb = cb->cb_next;
    563 		} while (cb != cblist);
    564 	} else {
    565 		cb = cblist->cb_prev;
    566 		do {
    567 			one_result = (*cb->cb_callback)(when, cb->cb_data);
    568 			if (one_result != NULL) {
    569 				cpr_callbackp = one_result;
    570 			}
    571 			cb = cb->cb_prev;
    572 		} while (cb != cblist->cb_prev);
    573 	}
    574 
    575 	return (cpr_callbackp);
    576 }
    577 
    578 /*
    579  * Initialize a flk_callback_t to hold the given callback.
    580  */
    581 
    582 void
    583 flk_init_callback(flk_callback_t *flk_cb,
    584 	callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
    585 {
    586 	flk_cb->cb_next = flk_cb;
    587 	flk_cb->cb_prev = flk_cb;
    588 	flk_cb->cb_callback = cb_fcn;
    589 	flk_cb->cb_data = cbdata;
    590 }
    591 
    592 /*
    593  * Initialize an flk_callback_t and then link it into the head of an
    594  * existing list (which may be NULL).
    595  */
    596 
    597 void
    598 flk_add_callback(flk_callback_t *newcb,
    599 		callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
    600 		void *cbdata, flk_callback_t *cblist)
    601 {
    602 	flk_init_callback(newcb, cb_fcn, cbdata);
    603 
    604 	if (cblist == NULL)
    605 		return;
    606 
    607 	newcb->cb_prev = cblist->cb_prev;
    608 	newcb->cb_next = cblist;
    609 	cblist->cb_prev->cb_next = newcb;
    610 	cblist->cb_prev = newcb;
    611 }
    612 /* ONC_PLUS EXTRACT END */
    613 
    614 /*
    615  * Initialize the flk_edge_cache data structure and create the
    616  * nlm_reg_status array.
    617  */
    618 
    619 void
    620 flk_init(void)
    621 {
    622 	uint_t	i;
    623 
    624 	flk_edge_cache = kmem_cache_create("flk_edges",
    625 		sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
    626 	if (flk_edge_cache == NULL) {
    627 		cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
    628 	}
    629 	/*
    630 	 * Create the NLM registry object.
    631 	 */
    632 
    633 	if (cluster_bootflags & CLUSTER_BOOTED) {
    634 		/*
    635 		 * This routine tells you the maximum node id that will be used
    636 		 * in the cluster.  This number will be the size of the nlm
    637 		 * registry status array.  We add 1 because we will be using
    638 		 * all entries indexed from 0 to maxnodeid; e.g., from 0
    639 		 * to 64, for a total of 65 entries.
    640 		 */
    641 		nlm_status_size = clconf_maximum_nodeid() + 1;
    642 	} else {
    643 		nlm_status_size = 0;
    644 	}
    645 
    646 	if (nlm_status_size != 0) {	/* booted as a cluster */
    647 		nlm_reg_status = (flk_nlm_status_t *)
    648 			kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
    649 				KM_SLEEP);
    650 
    651 		/* initialize all NLM states in array to NLM_UNKNOWN */
    652 		for (i = 0; i < nlm_status_size; i++) {
    653 			nlm_reg_status[i] = FLK_NLM_UNKNOWN;
    654 		}
    655 	}
    656 }
    657 
    658 /*
    659  * Zone constructor/destructor callbacks to be executed when a zone is
    660  * created/destroyed.
    661  */
    662 /* ARGSUSED */
    663 void *
    664 flk_zone_init(zoneid_t zoneid)
    665 {
    666 	struct flock_globals *fg;
    667 	uint_t i;
    668 
    669 	fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
    670 	fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
    671 	for (i = 0; i < HASH_SIZE; i++)
    672 		fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
    673 	return (fg);
    674 }
    675 
    676 /* ARGSUSED */
    677 void
    678 flk_zone_fini(zoneid_t zoneid, void *data)
    679 {
    680 	struct flock_globals *fg = data;
    681 
    682 	kmem_free(fg, sizeof (*fg));
    683 }
    684 
    685 /*
    686  * Get a lock_descriptor structure with initialization of edge lists.
    687  */
    688 
    689 static lock_descriptor_t *
    690 flk_get_lock(void)
    691 {
    692 	lock_descriptor_t	*l;
    693 
    694 	l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
    695 
    696 	cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
    697 	l->l_edge.edge_in_next = &l->l_edge;
    698 	l->l_edge.edge_in_prev = &l->l_edge;
    699 	l->l_edge.edge_adj_next = &l->l_edge;
    700 	l->l_edge.edge_adj_prev = &l->l_edge;
    701 	l->pvertex = -1;
    702 	l->l_status = FLK_INITIAL_STATE;
    703 	flk_lock_allocs++;
    704 	return (l);
    705 }
    706 
    707 /*
    708  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
    709  * when some thread has a reference to it as in reclock().
    710  */
    711 
    712 void
    713 flk_free_lock(lock_descriptor_t	*lock)
    714 {
    715 	ASSERT(IS_DEAD(lock));
    716 	if (IS_REFERENCED(lock)) {
    717 		lock->l_state |= DELETED_LOCK;
    718 		return;
    719 	}
    720 	flk_lock_frees++;
    721 	kmem_free((void *)lock, sizeof (lock_descriptor_t));
    722 }
    723 
    724 void
    725 flk_set_state(lock_descriptor_t *lock, int new_state)
    726 {
    727 	/*
    728 	 * Locks in the sleeping list may be woken up in a number of ways,
    729 	 * and more than once.  If a sleeping lock is signaled awake more
    730 	 * than once, then it may or may not change state depending on its
    731 	 * current state.
    732 	 * Also note that NLM locks that are sleeping could be moved to an
    733 	 * interrupted state more than once if the unlock request is
    734 	 * retransmitted by the NLM client - the second time around, this is
    735 	 * just a nop.
    736 	 * The ordering of being signaled awake is:
    737 	 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
    738 	 * The checks below implement this ordering.
    739 	 */
    740 	if (IS_INTERRUPTED(lock)) {
    741 		if ((new_state == FLK_CANCELLED_STATE) ||
    742 		    (new_state == FLK_GRANTED_STATE) ||
    743 		    (new_state == FLK_INTERRUPTED_STATE)) {
    744 			return;
    745 		}
    746 	}
    747 	if (IS_CANCELLED(lock)) {
    748 		if ((new_state == FLK_GRANTED_STATE) ||
    749 		    (new_state == FLK_CANCELLED_STATE)) {
    750 			return;
    751 		}
    752 	}
    753 	CHECK_LOCK_TRANSITION(lock->l_status, new_state);
    754 	if (IS_PXFS(lock)) {
    755 		cl_flk_state_transition_notify(lock, lock->l_status, new_state);
    756 	}
    757 	lock->l_status = new_state;
    758 }
    759 
    760 /*
    761  * Routine that checks whether there are any blocking locks in the system.
    762  *
    763  * The policy followed is if a write lock is sleeping we don't allow read
    764  * locks before this write lock even though there may not be any active
    765  * locks corresponding to the read locks' region.
    766  *
    767  * flk_add_edge() function adds an edge between l1 and l2 iff there
    768  * is no path between l1 and l2. This is done to have a "minimum
    769  * storage representation" of the dependency graph.
    770  *
    771  * Another property of the graph is since only the new request throws
    772  * edges to the existing locks in the graph, the graph is always topologically
    773  * ordered.
    774  */
    775 
    776 static int
    777 flk_process_request(lock_descriptor_t *request)
    778 {
    779 	graph_t	*gp = request->l_graph;
    780 	lock_descriptor_t *lock;
    781 	int request_blocked_by_active = 0;
    782 	int request_blocked_by_granted = 0;
    783 	int request_blocked_by_sleeping = 0;
    784 	vnode_t	*vp = request->l_vnode;
    785 	int	error = 0;
    786 	int request_will_wait = 0;
    787 	int found_covering_lock = 0;
    788 	lock_descriptor_t *covered_by = NULL;
    789 
    790 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
    791 	request_will_wait = IS_WILLING_TO_SLEEP(request);
    792 
    793 	/*
    794 	 * check active locks
    795 	 */
    796 
    797 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
    798 
    799 
    800 	if (lock) {
    801 		do {
    802 			if (BLOCKS(lock, request)) {
    803 				if (!request_will_wait)
    804 					return (EAGAIN);
    805 				request_blocked_by_active = 1;
    806 				break;
    807 			}
    808 			/*
    809 			 * Grant lock if it is for the same owner holding active
    810 			 * lock that covers the request.
    811 			 */
    812 
    813 			if (SAME_OWNER(lock, request) &&
    814 					COVERS(lock, request) &&
    815 						(request->l_type == F_RDLCK))
    816 				return (flk_execute_request(request));
    817 			lock = lock->l_next;
    818 		} while (lock->l_vnode == vp);
    819 	}
    820 
    821 	if (!request_blocked_by_active) {
    822 			lock_descriptor_t *lk[1];
    823 			lock_descriptor_t *first_glock = NULL;
    824 		/*
    825 		 * Shall we grant this?! NO!!
    826 		 * What about those locks that were just granted and still
    827 		 * in sleep queue. Those threads are woken up and so locks
    828 		 * are almost active.
    829 		 */
    830 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
    831 		if (lock) {
    832 			do {
    833 				if (BLOCKS(lock, request)) {
    834 					if (IS_GRANTED(lock)) {
    835 						request_blocked_by_granted = 1;
    836 					} else {
    837 						request_blocked_by_sleeping = 1;
    838 					}
    839 				}
    840 
    841 				lock = lock->l_next;
    842 			} while ((lock->l_vnode == vp));
    843 			first_glock = lock->l_prev;
    844 			ASSERT(first_glock->l_vnode == vp);
    845 		}
    846 
    847 		if (request_blocked_by_granted)
    848 			goto block;
    849 
    850 		if (!request_blocked_by_sleeping) {
    851 			/*
    852 			 * If the request isn't going to be blocked by a
    853 			 * sleeping request, we know that it isn't going to
    854 			 * be blocked; we can just execute the request --
    855 			 * without performing costly deadlock detection.
    856 			 */
    857 			ASSERT(!request_blocked_by_active);
    858 			return (flk_execute_request(request));
    859 		} else if (request->l_type == F_RDLCK) {
    860 			/*
    861 			 * If we have a sleeping writer in the requested
    862 			 * lock's range, block.
    863 			 */
    864 			goto block;
    865 		}
    866 
    867 		lk[0] = request;
    868 		request->l_state |= RECOMPUTE_LOCK;
    869 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
    870 		if (lock) {
    871 			do {
    872 				flk_recompute_dependencies(lock, lk, 1, 0);
    873 				lock = lock->l_next;
    874 			} while (lock->l_vnode == vp);
    875 		}
    876 		lock = first_glock;
    877 		if (lock) {
    878 			do {
    879 				if (IS_GRANTED(lock)) {
    880 				flk_recompute_dependencies(lock, lk, 1, 0);
    881 				}
    882 				lock = lock->l_prev;
    883 			} while ((lock->l_vnode == vp));
    884 		}
    885 		request->l_state &= ~RECOMPUTE_LOCK;
    886 		if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
    887 			return (EDEADLK);
    888 		return (flk_execute_request(request));
    889 	}
    890 
    891 block:
    892 	if (request_will_wait)
    893 		flk_graph_uncolor(gp);
    894 
    895 	/* check sleeping locks */
    896 
    897 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
    898 
    899 	/*
    900 	 * If we find a sleeping write lock that is a superset of the
    901 	 * region wanted by request we can be assured that by adding an
    902 	 * edge to this write lock we have paths to all locks in the
    903 	 * graph that blocks the request except in one case and that is why
    904 	 * another check for SAME_OWNER in the loop below. The exception
    905 	 * case is when this process that owns the sleeping write lock 'l1'
    906 	 * has other locks l2, l3, l4 that are in the system and arrived
    907 	 * before l1. l1 does not have path to these locks as they are from
    908 	 * same process. We break when we find a second covering sleeping
    909 	 * lock l5 owned by a process different from that owning l1, because
    910 	 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
    911 	 * it has l1 would have produced a deadlock already.
    912 	 */
    913 
    914 	if (lock) {
    915 		do {
    916 			if (BLOCKS(lock, request)) {
    917 				if (!request_will_wait)
    918 					return (EAGAIN);
    919 				if (COVERS(lock, request) &&
    920 						lock->l_type == F_WRLCK) {
    921 					if (found_covering_lock &&
    922 					    !SAME_OWNER(lock, covered_by)) {
    923 						found_covering_lock++;
    924 						break;
    925 					}
    926 					found_covering_lock = 1;
    927 					covered_by = lock;
    928 				}
    929 				if (found_covering_lock &&
    930 					!SAME_OWNER(lock, covered_by)) {
    931 					lock = lock->l_next;
    932 					continue;
    933 				}
    934 				if ((error = flk_add_edge(request, lock,
    935 						!found_covering_lock, 0)))
    936 					return (error);
    937 			}
    938 			lock = lock->l_next;
    939 		} while (lock->l_vnode == vp);
    940 	}
    941 
    942 /*
    943  * found_covering_lock == 2 iff at this point 'request' has paths
    944  * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
    945  * point 'request' has paths to all locks that blocks 'request' whose owners
    946  * are not same as the one that covers 'request' (covered_by above) and
    947  * we can have locks whose owner is same as covered_by in the active list.
    948  */
    949 
    950 	if (request_blocked_by_active && found_covering_lock != 2) {
    951 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
    952 		ASSERT(lock != NULL);
    953 		do {
    954 			if (BLOCKS(lock, request)) {
    955 				if (found_covering_lock &&
    956 					!SAME_OWNER(lock, covered_by)) {
    957 					lock = lock->l_next;
    958 					continue;
    959 				}
    960 				if ((error = flk_add_edge(request, lock,
    961 							CHECK_CYCLE, 0)))
    962 					return (error);
    963 			}
    964 			lock = lock->l_next;
    965 		} while (lock->l_vnode == vp);
    966 	}
    967 
    968 	if (NOT_BLOCKED(request)) {
    969 		/*
    970 		 * request not dependent on any other locks
    971 		 * so execute this request
    972 		 */
    973 		return (flk_execute_request(request));
    974 	} else {
    975 		/*
    976 		 * check for deadlock
    977 		 */
    978 		if (flk_check_deadlock(request))
    979 			return (EDEADLK);
    980 		/*
    981 		 * this thread has to sleep
    982 		 */
    983 		return (flk_wait_execute_request(request));
    984 	}
    985 }
    986 
    987 /* ONC_PLUS EXTRACT START */
    988 /*
    989  * The actual execution of the request in the simple case is only to
    990  * insert the 'request' in the list of active locks if it is not an
    991  * UNLOCK.
    992  * We have to consider the existing active locks' relation to
    993  * this 'request' if they are owned by same process. flk_relation() does
    994  * this job and sees to that the dependency graph information is maintained
    995  * properly.
    996  */
    997 
    998 int
    999 flk_execute_request(lock_descriptor_t *request)
   1000 {
   1001 	graph_t	*gp = request->l_graph;
   1002 	vnode_t	*vp = request->l_vnode;
   1003 	lock_descriptor_t	*lock, *lock1;
   1004 	int done_searching = 0;
   1005 
   1006 	CHECK_SLEEPING_LOCKS(gp);
   1007 	CHECK_ACTIVE_LOCKS(gp);
   1008 
   1009 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   1010 
   1011 	flk_set_state(request, FLK_START_STATE);
   1012 
   1013 	ASSERT(NOT_BLOCKED(request));
   1014 
   1015 	/* IO_LOCK requests are only to check status */
   1016 
   1017 	if (IS_IO_LOCK(request))
   1018 		return (0);
   1019 
   1020 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
   1021 
   1022 	if (lock == NULL && request->l_type == F_UNLCK)
   1023 		return (0);
   1024 	if (lock == NULL) {
   1025 		flk_insert_active_lock(request);
   1026 		return (0);
   1027 	}
   1028 
   1029 	do {
   1030 		lock1 = lock->l_next;
   1031 		if (SAME_OWNER(request, lock)) {
   1032 			done_searching = flk_relation(lock, request);
   1033 		}
   1034 		lock = lock1;
   1035 	} while (lock->l_vnode == vp && !done_searching);
   1036 
   1037 	/*
   1038 	 * insert in active queue
   1039 	 */
   1040 
   1041 	if (request->l_type != F_UNLCK)
   1042 		flk_insert_active_lock(request);
   1043 
   1044 	return (0);
   1045 }
   1046 /* ONC_PLUS EXTRACT END */
   1047 
   1048 /*
   1049  * 'request' is blocked by some one therefore we put it into sleep queue.
   1050  */
   1051 static int
   1052 flk_wait_execute_request(lock_descriptor_t *request)
   1053 {
   1054 	graph_t	*gp = request->l_graph;
   1055 	callb_cpr_t 	*cprp;		/* CPR info from callback */
   1056 	struct flock_globals *fg;
   1057 	int index;
   1058 
   1059 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   1060 	ASSERT(IS_WILLING_TO_SLEEP(request));
   1061 
   1062 	flk_insert_sleeping_lock(request);
   1063 
   1064 	if (IS_LOCKMGR(request)) {
   1065 		index = HASH_INDEX(request->l_vnode);
   1066 		fg = flk_get_globals();
   1067 
   1068 		if (nlm_status_size == 0) {	/* not booted as a cluster */
   1069 			if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
   1070 				flk_cancel_sleeping_lock(request, 1);
   1071 				return (ENOLCK);
   1072 			}
   1073 		} else {			/* booted as a cluster */
   1074 			/*
   1075 			 * If the request is an NLM server lock request,
   1076 			 * and the NLM state of the lock request is not
   1077 			 * NLM_UP (because the NLM server is shutting
   1078 			 * down), then cancel the sleeping lock and
   1079 			 * return error ENOLCK that will encourage the
   1080 			 * client to retransmit.
   1081 			 */
   1082 			if (!IS_NLM_UP(request)) {
   1083 				flk_cancel_sleeping_lock(request, 1);
   1084 				return (ENOLCK);
   1085 			}
   1086 		}
   1087 	}
   1088 
   1089 	/* Clustering: For blocking PXFS locks, return */
   1090 	if (IS_PXFS(request)) {
   1091 		/*
   1092 		 * PXFS locks sleep on the client side.
   1093 		 * The callback argument is used to wake up the sleeper
   1094 		 * when the lock is granted.
   1095 		 * We return -1 (rather than an errno value) to indicate
   1096 		 * the client side should sleep
   1097 		 */
   1098 		return (PXFS_LOCK_BLOCKED);
   1099 	}
   1100 
   1101 	if (request->l_callbacks != NULL) {
   1102 		/*
   1103 		 * To make sure the shutdown code works correctly, either
   1104 		 * the callback must happen after putting the lock on the
   1105 		 * sleep list, or we must check the shutdown status after
   1106 		 * returning from the callback (and before sleeping).  At
   1107 		 * least for now, we'll use the first option.  If a
   1108 		 * shutdown or signal or whatever happened while the graph
   1109 		 * mutex was dropped, that will be detected by
   1110 		 * wait_for_lock().
   1111 		 */
   1112 		mutex_exit(&gp->gp_mutex);
   1113 
   1114 		cprp = flk_invoke_callbacks(request->l_callbacks,
   1115 					    FLK_BEFORE_SLEEP);
   1116 
   1117 		mutex_enter(&gp->gp_mutex);
   1118 
   1119 		if (cprp == NULL) {
   1120 			wait_for_lock(request);
   1121 		} else {
   1122 			mutex_enter(cprp->cc_lockp);
   1123 			CALLB_CPR_SAFE_BEGIN(cprp);
   1124 			mutex_exit(cprp->cc_lockp);
   1125 			wait_for_lock(request);
   1126 			mutex_enter(cprp->cc_lockp);
   1127 			CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
   1128 			mutex_exit(cprp->cc_lockp);
   1129 		}
   1130 
   1131 		mutex_exit(&gp->gp_mutex);
   1132 		(void) flk_invoke_callbacks(request->l_callbacks,
   1133 					    FLK_AFTER_SLEEP);
   1134 		mutex_enter(&gp->gp_mutex);
   1135 	} else {
   1136 		wait_for_lock(request);
   1137 	}
   1138 
   1139 	if (IS_LOCKMGR(request)) {
   1140 		/*
   1141 		 * If the lock manager is shutting down, return an
   1142 		 * error that will encourage the client to retransmit.
   1143 		 */
   1144 		if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
   1145 			!IS_GRANTED(request)) {
   1146 			flk_cancel_sleeping_lock(request, 1);
   1147 			return (ENOLCK);
   1148 		}
   1149 	}
   1150 
   1151 	if (IS_INTERRUPTED(request)) {
   1152 		/* we got a signal, or act like we did */
   1153 		flk_cancel_sleeping_lock(request, 1);
   1154 		return (EINTR);
   1155 	}
   1156 
   1157 	/* Cancelled if some other thread has closed the file */
   1158 
   1159 	if (IS_CANCELLED(request)) {
   1160 		flk_cancel_sleeping_lock(request, 1);
   1161 		return (EBADF);
   1162 	}
   1163 
   1164 	request->l_state &= ~GRANTED_LOCK;
   1165 	REMOVE_SLEEP_QUEUE(request);
   1166 	return (flk_execute_request(request));
   1167 }
   1168 
   1169 /*
   1170  * This routine adds an edge between from and to because from depends
   1171  * to. If asked to check for deadlock it checks whether there are any
   1172  * reachable locks from "from_lock" that is owned by the same process
   1173  * as "from_lock".
   1174  * NOTE: It is the caller's responsibility to make sure that the color
   1175  * of the graph is consistent between the calls to flk_add_edge as done
   1176  * in flk_process_request. This routine does not color and check for
   1177  * deadlock explicitly.
   1178  */
   1179 
   1180 static int
   1181 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
   1182 			int check_cycle, int update_graph)
   1183 {
   1184 	edge_t	*edge;
   1185 	edge_t	*ep;
   1186 	lock_descriptor_t	*vertex;
   1187 	lock_descriptor_t *vertex_stack;
   1188 
   1189 	STACK_INIT(vertex_stack);
   1190 
   1191 	/*
   1192 	 * if to vertex already has mark_color just return
   1193 	 * don't add an edge as it is reachable from from vertex
   1194 	 * before itself.
   1195 	 */
   1196 
   1197 	if (COLORED(to_lock))
   1198 		return (0);
   1199 
   1200 	edge = flk_get_edge();
   1201 
   1202 	/*
   1203 	 * set the from and to vertex
   1204 	 */
   1205 
   1206 	edge->from_vertex = from_lock;
   1207 	edge->to_vertex = to_lock;
   1208 
   1209 	/*
   1210 	 * put in adjacency list of from vertex
   1211 	 */
   1212 
   1213 	from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
   1214 	edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
   1215 	edge->edge_adj_prev = &from_lock->l_edge;
   1216 	from_lock->l_edge.edge_adj_next = edge;
   1217 
   1218 	/*
   1219 	 * put in in list of to vertex
   1220 	 */
   1221 
   1222 	to_lock->l_edge.edge_in_next->edge_in_prev = edge;
   1223 	edge->edge_in_next = to_lock->l_edge.edge_in_next;
   1224 	to_lock->l_edge.edge_in_next = edge;
   1225 	edge->edge_in_prev = &to_lock->l_edge;
   1226 
   1227 
   1228 	if (update_graph) {
   1229 		flk_update_proc_graph(edge, 0);
   1230 		return (0);
   1231 	}
   1232 	if (!check_cycle) {
   1233 		return (0);
   1234 	}
   1235 
   1236 	STACK_PUSH(vertex_stack, from_lock, l_stack);
   1237 
   1238 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
   1239 
   1240 		STACK_POP(vertex_stack, l_stack);
   1241 
   1242 		for (ep = FIRST_ADJ(vertex);
   1243 			ep != HEAD(vertex);
   1244 				ep = NEXT_ADJ(ep)) {
   1245 			if (COLORED(ep->to_vertex))
   1246 				continue;
   1247 			COLOR(ep->to_vertex);
   1248 			if (SAME_OWNER(ep->to_vertex, from_lock))
   1249 				goto dead_lock;
   1250 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
   1251 		}
   1252 	}
   1253 	return (0);
   1254 
   1255 dead_lock:
   1256 
   1257 	/*
   1258 	 * remove all edges
   1259 	 */
   1260 
   1261 	ep = FIRST_ADJ(from_lock);
   1262 
   1263 	while (ep != HEAD(from_lock)) {
   1264 		IN_LIST_REMOVE(ep);
   1265 		from_lock->l_sedge = NEXT_ADJ(ep);
   1266 		ADJ_LIST_REMOVE(ep);
   1267 		flk_free_edge(ep);
   1268 		ep = from_lock->l_sedge;
   1269 	}
   1270 	return (EDEADLK);
   1271 }
   1272 
   1273 /*
   1274  * Get an edge structure for representing the dependency between two locks.
   1275  */
   1276 
   1277 static edge_t *
   1278 flk_get_edge()
   1279 {
   1280 	edge_t	*ep;
   1281 
   1282 	ASSERT(flk_edge_cache != NULL);
   1283 
   1284 	ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
   1285 	edge_allocs++;
   1286 	return (ep);
   1287 }
   1288 
   1289 /*
   1290  * Free the edge structure.
   1291  */
   1292 
   1293 static void
   1294 flk_free_edge(edge_t *ep)
   1295 {
   1296 	edge_frees++;
   1297 	kmem_cache_free(flk_edge_cache, (void *)ep);
   1298 }
   1299 
   1300 /*
   1301  * Check the relationship of request with lock and perform the
   1302  * recomputation of dependencies, break lock if required, and return
   1303  * 1 if request cannot have any more relationship with the next
   1304  * active locks.
   1305  * The 'lock' and 'request' are compared and in case of overlap we
   1306  * delete the 'lock' and form new locks to represent the non-overlapped
   1307  * portion of original 'lock'. This function has side effects such as
   1308  * 'lock' will be freed, new locks will be added to the active list.
   1309  */
   1310 
   1311 static int
   1312 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
   1313 {
   1314 	int lock_effect;
   1315 	lock_descriptor_t *lock1, *lock2;
   1316 	lock_descriptor_t *topology[3];
   1317 	int nvertex = 0;
   1318 	int i;
   1319 	edge_t	*ep;
   1320 	graph_t	*gp = (lock->l_graph);
   1321 
   1322 
   1323 	CHECK_SLEEPING_LOCKS(gp);
   1324 	CHECK_ACTIVE_LOCKS(gp);
   1325 
   1326 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   1327 
   1328 	topology[0] = topology[1] = topology[2] = NULL;
   1329 
   1330 	if (request->l_type == F_UNLCK)
   1331 		lock_effect = FLK_UNLOCK;
   1332 	else if (request->l_type == F_RDLCK &&
   1333 			lock->l_type == F_WRLCK)
   1334 		lock_effect = FLK_DOWNGRADE;
   1335 	else if (request->l_type == F_WRLCK &&
   1336 			lock->l_type == F_RDLCK)
   1337 		lock_effect = FLK_UPGRADE;
   1338 	else
   1339 		lock_effect = FLK_STAY_SAME;
   1340 
   1341 	if (lock->l_end < request->l_start) {
   1342 		if (lock->l_end == request->l_start - 1 &&
   1343 				lock_effect == FLK_STAY_SAME) {
   1344 			topology[0] = request;
   1345 			request->l_start = lock->l_start;
   1346 			nvertex = 1;
   1347 			goto recompute;
   1348 		} else {
   1349 			return (0);
   1350 		}
   1351 	}
   1352 
   1353 	if (lock->l_start > request->l_end) {
   1354 		if (request->l_end == lock->l_start - 1 &&
   1355 					lock_effect == FLK_STAY_SAME) {
   1356 			topology[0] = request;
   1357 			request->l_end = lock->l_end;
   1358 			nvertex = 1;
   1359 			goto recompute;
   1360 		} else {
   1361 			return (1);
   1362 		}
   1363 	}
   1364 
   1365 	if (request->l_end < lock->l_end) {
   1366 		if (request->l_start > lock->l_start) {
   1367 			if (lock_effect == FLK_STAY_SAME) {
   1368 				request->l_start = lock->l_start;
   1369 				request->l_end = lock->l_end;
   1370 				topology[0] = request;
   1371 				nvertex = 1;
   1372 			} else {
   1373 				lock1 = flk_get_lock();
   1374 				lock2 = flk_get_lock();
   1375 				COPY(lock1, lock);
   1376 				COPY(lock2, lock);
   1377 				lock1->l_start = lock->l_start;
   1378 				lock1->l_end = request->l_start - 1;
   1379 				lock2->l_start = request->l_end + 1;
   1380 				lock2->l_end = lock->l_end;
   1381 				topology[0] = lock1;
   1382 				topology[1] = lock2;
   1383 				topology[2] = request;
   1384 				nvertex = 3;
   1385 			}
   1386 		} else if (request->l_start < lock->l_start) {
   1387 			if (lock_effect == FLK_STAY_SAME) {
   1388 				request->l_end = lock->l_end;
   1389 				topology[0] = request;
   1390 				nvertex = 1;
   1391 			} else {
   1392 				lock1 = flk_get_lock();
   1393 				COPY(lock1, lock);
   1394 				lock1->l_start = request->l_end + 1;
   1395 				topology[0] = lock1;
   1396 				topology[1] = request;
   1397 				nvertex = 2;
   1398 			}
   1399 		} else  {
   1400 			if (lock_effect == FLK_STAY_SAME) {
   1401 				request->l_start = lock->l_start;
   1402 				request->l_end = lock->l_end;
   1403 				topology[0] = request;
   1404 				nvertex = 1;
   1405 			} else {
   1406 				lock1 = flk_get_lock();
   1407 				COPY(lock1, lock);
   1408 				lock1->l_start = request->l_end + 1;
   1409 				topology[0] = lock1;
   1410 				topology[1] = request;
   1411 				nvertex = 2;
   1412 			}
   1413 		}
   1414 	} else if (request->l_end > lock->l_end) {
   1415 		if (request->l_start > lock->l_start)  {
   1416 			if (lock_effect == FLK_STAY_SAME) {
   1417 				request->l_start = lock->l_start;
   1418 				topology[0] = request;
   1419 				nvertex = 1;
   1420 			} else {
   1421 				lock1 = flk_get_lock();
   1422 				COPY(lock1, lock);
   1423 				lock1->l_end = request->l_start - 1;
   1424 				topology[0] = lock1;
   1425 				topology[1] = request;
   1426 				nvertex = 2;
   1427 			}
   1428 		} else if (request->l_start < lock->l_start)  {
   1429 			topology[0] = request;
   1430 			nvertex = 1;
   1431 		} else {
   1432 			topology[0] = request;
   1433 			nvertex = 1;
   1434 		}
   1435 	} else {
   1436 		if (request->l_start > lock->l_start) {
   1437 			if (lock_effect == FLK_STAY_SAME) {
   1438 				request->l_start = lock->l_start;
   1439 				topology[0] = request;
   1440 				nvertex = 1;
   1441 			} else {
   1442 				lock1 = flk_get_lock();
   1443 				COPY(lock1, lock);
   1444 				lock1->l_end = request->l_start - 1;
   1445 				topology[0] = lock1;
   1446 				topology[1] = request;
   1447 				nvertex = 2;
   1448 			}
   1449 		} else if (request->l_start < lock->l_start) {
   1450 			topology[0] = request;
   1451 			nvertex = 1;
   1452 		} else {
   1453 			if (lock_effect !=  FLK_UNLOCK) {
   1454 				topology[0] = request;
   1455 				nvertex = 1;
   1456 			} else {
   1457 				flk_delete_active_lock(lock, 0);
   1458 				flk_wakeup(lock, 1);
   1459 				flk_free_lock(lock);
   1460 				CHECK_SLEEPING_LOCKS(gp);
   1461 				CHECK_ACTIVE_LOCKS(gp);
   1462 				return (1);
   1463 			}
   1464 		}
   1465 	}
   1466 
   1467 recompute:
   1468 
   1469 	/*
   1470 	 * For unlock we don't send the 'request' to for recomputing
   1471 	 * dependencies because no lock will add an edge to this.
   1472 	 */
   1473 
   1474 	if (lock_effect == FLK_UNLOCK) {
   1475 		topology[nvertex-1] = NULL;
   1476 		nvertex--;
   1477 	}
   1478 	for (i = 0; i < nvertex; i++) {
   1479 		topology[i]->l_state |= RECOMPUTE_LOCK;
   1480 		topology[i]->l_color = NO_COLOR;
   1481 	}
   1482 
   1483 	ASSERT(FIRST_ADJ(lock) == HEAD(lock));
   1484 
   1485 	/*
   1486 	 * we remove the adjacent edges for all vertices' to this vertex
   1487 	 * 'lock'.
   1488 	 */
   1489 
   1490 	ep = FIRST_IN(lock);
   1491 	while (ep != HEAD(lock)) {
   1492 		ADJ_LIST_REMOVE(ep);
   1493 		ep = NEXT_IN(ep);
   1494 	}
   1495 
   1496 	flk_delete_active_lock(lock, 0);
   1497 
   1498 	/* We are ready for recomputing the dependencies now */
   1499 
   1500 	flk_recompute_dependencies(lock, topology, nvertex, 1);
   1501 
   1502 	for (i = 0; i < nvertex; i++) {
   1503 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
   1504 		topology[i]->l_color = NO_COLOR;
   1505 	}
   1506 
   1507 
   1508 	if (lock_effect == FLK_UNLOCK) {
   1509 		nvertex++;
   1510 	}
   1511 	for (i = 0; i < nvertex - 1; i++) {
   1512 		flk_insert_active_lock(topology[i]);
   1513 	}
   1514 
   1515 
   1516 	if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
   1517 		flk_wakeup(lock, 0);
   1518 	} else {
   1519 		ep = FIRST_IN(lock);
   1520 		while (ep != HEAD(lock)) {
   1521 			lock->l_sedge = NEXT_IN(ep);
   1522 			IN_LIST_REMOVE(ep);
   1523 			flk_update_proc_graph(ep, 1);
   1524 			flk_free_edge(ep);
   1525 			ep = lock->l_sedge;
   1526 		}
   1527 	}
   1528 	flk_free_lock(lock);
   1529 
   1530 	CHECK_SLEEPING_LOCKS(gp);
   1531 	CHECK_ACTIVE_LOCKS(gp);
   1532 	return (0);
   1533 }
   1534 
   1535 /*
   1536  * Insert a lock into the active queue.
   1537  */
   1538 
   1539 static void
   1540 flk_insert_active_lock(lock_descriptor_t *new_lock)
   1541 {
   1542 	graph_t	*gp = new_lock->l_graph;
   1543 	vnode_t	*vp = new_lock->l_vnode;
   1544 	lock_descriptor_t *first_lock, *lock;
   1545 
   1546 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   1547 
   1548 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
   1549 	first_lock = lock;
   1550 
   1551 	if (first_lock != NULL) {
   1552 		for (; (lock->l_vnode == vp &&
   1553 			lock->l_start < new_lock->l_start); lock = lock->l_next)
   1554 			;
   1555 	} else {
   1556 		lock = ACTIVE_HEAD(gp);
   1557 	}
   1558 
   1559 	lock->l_prev->l_next = new_lock;
   1560 	new_lock->l_next = lock;
   1561 	new_lock->l_prev = lock->l_prev;
   1562 	lock->l_prev = new_lock;
   1563 
   1564 	if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
   1565 		vp->v_filocks = (struct filock *)new_lock;
   1566 	}
   1567 	flk_set_state(new_lock, FLK_ACTIVE_STATE);
   1568 	new_lock->l_state |= ACTIVE_LOCK;
   1569 
   1570 	CHECK_ACTIVE_LOCKS(gp);
   1571 	CHECK_SLEEPING_LOCKS(gp);
   1572 }
   1573 
   1574 /*
   1575  * Delete the active lock : Performs two functions depending on the
   1576  * value of second parameter. One is to remove from the active lists
   1577  * only and other is to both remove and free the lock.
   1578  */
   1579 
   1580 static void
   1581 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
   1582 {
   1583 	vnode_t *vp = lock->l_vnode;
   1584 	graph_t	*gp = lock->l_graph;
   1585 
   1586 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   1587 	if (free_lock)
   1588 		ASSERT(NO_DEPENDENTS(lock));
   1589 	ASSERT(NOT_BLOCKED(lock));
   1590 	ASSERT(IS_ACTIVE(lock));
   1591 
   1592 	ASSERT((vp->v_filocks != NULL));
   1593 
   1594 	if (vp->v_filocks == (struct filock *)lock) {
   1595 		vp->v_filocks = (struct filock *)
   1596 				((lock->l_next->l_vnode == vp) ? lock->l_next :
   1597 								NULL);
   1598 	}
   1599 	lock->l_next->l_prev = lock->l_prev;
   1600 	lock->l_prev->l_next = lock->l_next;
   1601 	lock->l_next = lock->l_prev = NULL;
   1602 	flk_set_state(lock, FLK_DEAD_STATE);
   1603 	lock->l_state &= ~ACTIVE_LOCK;
   1604 
   1605 	if (free_lock)
   1606 		flk_free_lock(lock);
   1607 	CHECK_ACTIVE_LOCKS(gp);
   1608 	CHECK_SLEEPING_LOCKS(gp);
   1609 }
   1610 
   1611 /*
   1612  * Insert into the sleep queue.
   1613  */
   1614 
   1615 static void
   1616 flk_insert_sleeping_lock(lock_descriptor_t *request)
   1617 {
   1618 	graph_t *gp = request->l_graph;
   1619 	vnode_t	*vp = request->l_vnode;
   1620 	lock_descriptor_t	*lock;
   1621 
   1622 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   1623 	ASSERT(IS_INITIAL(request));
   1624 
   1625 	for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
   1626 		lock->l_vnode < vp); lock = lock->l_next)
   1627 		;
   1628 
   1629 	lock->l_prev->l_next = request;
   1630 	request->l_prev = lock->l_prev;
   1631 	lock->l_prev = request;
   1632 	request->l_next = lock;
   1633 	flk_set_state(request, FLK_SLEEPING_STATE);
   1634 	request->l_state |= SLEEPING_LOCK;
   1635 }
   1636 
   1637 /*
   1638  * Cancelling a sleeping lock implies removing a vertex from the
   1639  * dependency graph and therefore we should recompute the dependencies
   1640  * of all vertices that have a path  to this vertex, w.r.t. all
   1641  * vertices reachable from this vertex.
   1642  */
   1643 
   1644 void
   1645 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
   1646 {
   1647 	graph_t	*gp = request->l_graph;
   1648 	vnode_t *vp = request->l_vnode;
   1649 	lock_descriptor_t **topology = NULL;
   1650 	edge_t	*ep;
   1651 	lock_descriptor_t *vertex, *lock;
   1652 	int nvertex = 0;
   1653 	int i;
   1654 	lock_descriptor_t *vertex_stack;
   1655 
   1656 	STACK_INIT(vertex_stack);
   1657 
   1658 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   1659 	/*
   1660 	 * count number of vertex pointers that has to be allocated
   1661 	 * All vertices that are reachable from request.
   1662 	 */
   1663 
   1664 	STACK_PUSH(vertex_stack, request, l_stack);
   1665 
   1666 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
   1667 		STACK_POP(vertex_stack, l_stack);
   1668 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
   1669 					ep = NEXT_ADJ(ep)) {
   1670 			if (IS_RECOMPUTE(ep->to_vertex))
   1671 				continue;
   1672 			ep->to_vertex->l_state |= RECOMPUTE_LOCK;
   1673 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
   1674 			nvertex++;
   1675 		}
   1676 	}
   1677 
   1678 	/*
   1679 	 * allocate memory for holding the vertex pointers
   1680 	 */
   1681 
   1682 	if (nvertex) {
   1683 		topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
   1684 						KM_SLEEP);
   1685 	}
   1686 
   1687 	/*
   1688 	 * one more pass to actually store the vertices in the
   1689 	 * allocated array.
   1690 	 * We first check sleeping locks and then active locks
   1691 	 * so that topology array will be in a topological
   1692 	 * order.
   1693 	 */
   1694 
   1695 	nvertex = 0;
   1696 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
   1697 
   1698 	if (lock) {
   1699 		do {
   1700 			if (IS_RECOMPUTE(lock)) {
   1701 				lock->l_index = nvertex;
   1702 				topology[nvertex++] = lock;
   1703 			}
   1704 			lock->l_color = NO_COLOR;
   1705 			lock = lock->l_next;
   1706 		} while (lock->l_vnode == vp);
   1707 	}
   1708 
   1709 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
   1710 
   1711 	if (lock) {
   1712 		do {
   1713 			if (IS_RECOMPUTE(lock)) {
   1714 				lock->l_index = nvertex;
   1715 				topology[nvertex++] = lock;
   1716 			}
   1717 			lock->l_color = NO_COLOR;
   1718 			lock = lock->l_next;
   1719 		} while (lock->l_vnode == vp);
   1720 	}
   1721 
   1722 	/*
   1723 	 * remove in and out edges of request
   1724 	 * They are freed after updating proc_graph below.
   1725 	 */
   1726 
   1727 	for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
   1728 		ADJ_LIST_REMOVE(ep);
   1729 	}
   1730 
   1731 
   1732 	if (remove_from_queue)
   1733 		REMOVE_SLEEP_QUEUE(request);
   1734 
   1735 	/* we are ready to recompute */
   1736 
   1737 	flk_recompute_dependencies(request, topology, nvertex, 1);
   1738 
   1739 	ep = FIRST_ADJ(request);
   1740 	while (ep != HEAD(request)) {
   1741 		IN_LIST_REMOVE(ep);
   1742 		request->l_sedge = NEXT_ADJ(ep);
   1743 		ADJ_LIST_REMOVE(ep);
   1744 		flk_update_proc_graph(ep, 1);
   1745 		flk_free_edge(ep);
   1746 		ep = request->l_sedge;
   1747 	}
   1748 
   1749 
   1750 	/*
   1751 	 * unset the RECOMPUTE flag in those vertices
   1752 	 */
   1753 
   1754 	for (i = 0; i < nvertex; i++) {
   1755 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
   1756 	}
   1757 
   1758 	/*
   1759 	 * free the topology
   1760 	 */
   1761 	if (nvertex)
   1762 		kmem_free((void *)topology,
   1763 			(nvertex * sizeof (lock_descriptor_t *)));
   1764 	/*
   1765 	 * Possibility of some locks unblocked now
   1766 	 */
   1767 
   1768 	flk_wakeup(request, 0);
   1769 
   1770 	/*
   1771 	 * we expect to have a correctly recomputed graph  now.
   1772 	 */
   1773 	flk_set_state(request, FLK_DEAD_STATE);
   1774 	flk_free_lock(request);
   1775 	CHECK_SLEEPING_LOCKS(gp);
   1776 	CHECK_ACTIVE_LOCKS(gp);
   1777 
   1778 }
   1779 
   1780 /*
   1781  * Uncoloring the graph is simply to increment the mark value of the graph
   1782  * And only when wrap round takes place will we color all vertices in
   1783  * the graph explicitly.
   1784  */
   1785 
   1786 static void
   1787 flk_graph_uncolor(graph_t *gp)
   1788 {
   1789 	lock_descriptor_t *lock;
   1790 
   1791 	if (gp->mark == UINT_MAX) {
   1792 		gp->mark = 1;
   1793 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
   1794 					lock = lock->l_next)
   1795 			lock->l_color  = 0;
   1796 
   1797 	for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
   1798 					lock = lock->l_next)
   1799 			lock->l_color  = 0;
   1800 	} else {
   1801 		gp->mark++;
   1802 	}
   1803 }
   1804 
   1805 /*
   1806  * Wake up locks that are blocked on the given lock.
   1807  */
   1808 
   1809 static void
   1810 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
   1811 {
   1812 	edge_t	*ep;
   1813 	graph_t	*gp = lock->l_graph;
   1814 	lock_descriptor_t	*lck;
   1815 
   1816 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   1817 	if (NO_DEPENDENTS(lock))
   1818 		return;
   1819 	ep = FIRST_IN(lock);
   1820 	do {
   1821 		/*
   1822 		 * delete the edge from the adjacency list
   1823 		 * of from vertex. if no more adjacent edges
   1824 		 * for this vertex wake this process.
   1825 		 */
   1826 		lck = ep->from_vertex;
   1827 		if (adj_list_remove)
   1828 			ADJ_LIST_REMOVE(ep);
   1829 		flk_update_proc_graph(ep, 1);
   1830 		if (NOT_BLOCKED(lck)) {
   1831 			GRANT_WAKEUP(lck);
   1832 		}
   1833 		lock->l_sedge = NEXT_IN(ep);
   1834 		IN_LIST_REMOVE(ep);
   1835 		flk_free_edge(ep);
   1836 		ep = lock->l_sedge;
   1837 	} while (ep != HEAD(lock));
   1838 	ASSERT(NO_DEPENDENTS(lock));
   1839 }
   1840 
   1841 /*
   1842  * The dependents of request, is checked for its dependency against the
   1843  * locks in topology (called topology because the array is and should be in
   1844  * topological order for this algorithm, if not in topological order the
   1845  * inner loop below might add more edges than necessary. Topological ordering
   1846  * of vertices satisfies the property that all edges will be from left to
   1847  * right i.e., topology[i] can have an edge to  topology[j], iff i<j)
   1848  * If lock l1 in the dependent set of request is dependent (blocked by)
   1849  * on lock l2 in topology but does not have a path to it, we add an edge
   1850  * in the inner loop below.
   1851  *
   1852  * We don't want to add an edge between l1 and l2 if there exists
   1853  * already a path from l1 to l2, so care has to be taken for those vertices
   1854  * that  have two paths to 'request'. These vertices are referred to here
   1855  * as barrier locks.
   1856  *
   1857  * The barriers has to be found (those vertex that originally had two paths
   1858  * to request) because otherwise we may end up adding edges unnecessarily
   1859  * to vertices in topology, and thus barrier vertices can have an edge
   1860  * to a vertex in topology as well a path to it.
   1861  */
   1862 
   1863 static void
   1864 flk_recompute_dependencies(lock_descriptor_t *request,
   1865 		lock_descriptor_t **topology,
   1866 			int nvertex, int update_graph)
   1867 {
   1868 	lock_descriptor_t *vertex, *lock;
   1869 	graph_t	*gp = request->l_graph;
   1870 	int i, count;
   1871 	int barrier_found = 0;
   1872 	edge_t	*ep;
   1873 	lock_descriptor_t *vertex_stack;
   1874 
   1875 	STACK_INIT(vertex_stack);
   1876 
   1877 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   1878 	if (nvertex == 0)
   1879 		return;
   1880 	flk_graph_uncolor(request->l_graph);
   1881 	barrier_found = flk_find_barriers(request);
   1882 	request->l_state |= RECOMPUTE_DONE;
   1883 
   1884 	STACK_PUSH(vertex_stack, request, l_stack);
   1885 	request->l_sedge = FIRST_IN(request);
   1886 
   1887 
   1888 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
   1889 		if (vertex->l_state & RECOMPUTE_DONE) {
   1890 			count = 0;
   1891 			goto next_in_edge;
   1892 		}
   1893 		if (IS_BARRIER(vertex)) {
   1894 			/* decrement the barrier count */
   1895 			if (vertex->l_index) {
   1896 				vertex->l_index--;
   1897 				/* this guy will be pushed again anyway ? */
   1898 				STACK_POP(vertex_stack, l_stack);
   1899 				if (vertex->l_index == 0)  {
   1900 				/*
   1901 				 * barrier is over we can recompute
   1902 				 * dependencies for this lock in the
   1903 				 * next stack pop
   1904 				 */
   1905 					vertex->l_state &= ~BARRIER_LOCK;
   1906 				}
   1907 				continue;
   1908 			}
   1909 		}
   1910 		vertex->l_state |= RECOMPUTE_DONE;
   1911 		flk_graph_uncolor(gp);
   1912 		count = flk_color_reachables(vertex);
   1913 		for (i = 0; i < nvertex; i++) {
   1914 			lock = topology[i];
   1915 			if (COLORED(lock))
   1916 				continue;
   1917 			if (BLOCKS(lock, vertex)) {
   1918 				(void) flk_add_edge(vertex, lock,
   1919 				    NO_CHECK_CYCLE, update_graph);
   1920 				COLOR(lock);
   1921 				count++;
   1922 				count += flk_color_reachables(lock);
   1923 			}
   1924 
   1925 		}
   1926 
   1927 next_in_edge:
   1928 		if (count == nvertex ||
   1929 				vertex->l_sedge == HEAD(vertex)) {
   1930 			/* prune the tree below this */
   1931 			STACK_POP(vertex_stack, l_stack);
   1932 			vertex->l_state &= ~RECOMPUTE_DONE;
   1933 			/* update the barrier locks below this! */
   1934 			if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
   1935 				flk_graph_uncolor(gp);
   1936 				flk_update_barriers(vertex);
   1937 			}
   1938 			continue;
   1939 		}
   1940 
   1941 		ep = vertex->l_sedge;
   1942 		lock = ep->from_vertex;
   1943 		STACK_PUSH(vertex_stack, lock, l_stack);
   1944 		lock->l_sedge = FIRST_IN(lock);
   1945 		vertex->l_sedge = NEXT_IN(ep);
   1946 	}
   1947 
   1948 }
   1949 
   1950 /*
   1951  * Color all reachable vertices from vertex that belongs to topology (here
   1952  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
   1953  *
   1954  * Note: we need to use a different stack_link l_stack1 because this is
   1955  * called from flk_recompute_dependencies() that already uses a stack with
   1956  * l_stack as stack_link.
   1957  */
   1958 
   1959 static int
   1960 flk_color_reachables(lock_descriptor_t *vertex)
   1961 {
   1962 	lock_descriptor_t *ver, *lock;
   1963 	int count;
   1964 	edge_t	*ep;
   1965 	lock_descriptor_t *vertex_stack;
   1966 
   1967 	STACK_INIT(vertex_stack);
   1968 
   1969 	STACK_PUSH(vertex_stack, vertex, l_stack1);
   1970 	count = 0;
   1971 	while ((ver = STACK_TOP(vertex_stack)) != NULL) {
   1972 
   1973 		STACK_POP(vertex_stack, l_stack1);
   1974 		for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
   1975 					ep = NEXT_ADJ(ep)) {
   1976 			lock = ep->to_vertex;
   1977 			if (COLORED(lock))
   1978 				continue;
   1979 			COLOR(lock);
   1980 			if (IS_RECOMPUTE(lock))
   1981 				count++;
   1982 			STACK_PUSH(vertex_stack, lock, l_stack1);
   1983 		}
   1984 
   1985 	}
   1986 	return (count);
   1987 }
   1988 
   1989 /*
   1990  * Called from flk_recompute_dependencies() this routine decrements
   1991  * the barrier count of barrier vertices that are reachable from lock.
   1992  */
   1993 
   1994 static void
   1995 flk_update_barriers(lock_descriptor_t *lock)
   1996 {
   1997 	lock_descriptor_t *vertex, *lck;
   1998 	edge_t	*ep;
   1999 	lock_descriptor_t *vertex_stack;
   2000 
   2001 	STACK_INIT(vertex_stack);
   2002 
   2003 	STACK_PUSH(vertex_stack, lock, l_stack1);
   2004 
   2005 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
   2006 		STACK_POP(vertex_stack, l_stack1);
   2007 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
   2008 						ep = NEXT_IN(ep)) {
   2009 			lck = ep->from_vertex;
   2010 			if (COLORED(lck)) {
   2011 				if (IS_BARRIER(lck)) {
   2012 					ASSERT(lck->l_index > 0);
   2013 					lck->l_index--;
   2014 					if (lck->l_index == 0)
   2015 						lck->l_state &= ~BARRIER_LOCK;
   2016 				}
   2017 				continue;
   2018 			}
   2019 			COLOR(lck);
   2020 			if (IS_BARRIER(lck)) {
   2021 				ASSERT(lck->l_index > 0);
   2022 				lck->l_index--;
   2023 				if (lck->l_index == 0)
   2024 					lck->l_state &= ~BARRIER_LOCK;
   2025 			}
   2026 			STACK_PUSH(vertex_stack, lck, l_stack1);
   2027 		}
   2028 	}
   2029 }
   2030 
   2031 /*
   2032  * Finds all vertices that are reachable from 'lock' more than once and
   2033  * mark them as barrier vertices and increment their barrier count.
   2034  * The barrier count is one minus the total number of paths from lock
   2035  * to that vertex.
   2036  */
   2037 
   2038 static int
   2039 flk_find_barriers(lock_descriptor_t *lock)
   2040 {
   2041 	lock_descriptor_t *vertex, *lck;
   2042 	int found = 0;
   2043 	edge_t	*ep;
   2044 	lock_descriptor_t *vertex_stack;
   2045 
   2046 	STACK_INIT(vertex_stack);
   2047 
   2048 	STACK_PUSH(vertex_stack, lock, l_stack1);
   2049 
   2050 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
   2051 		STACK_POP(vertex_stack, l_stack1);
   2052 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
   2053 						ep = NEXT_IN(ep)) {
   2054 			lck = ep->from_vertex;
   2055 			if (COLORED(lck)) {
   2056 				/* this is a barrier */
   2057 				lck->l_state |= BARRIER_LOCK;
   2058 				/* index will have barrier count */
   2059 				lck->l_index++;
   2060 				if (!found)
   2061 					found = 1;
   2062 				continue;
   2063 			}
   2064 			COLOR(lck);
   2065 			lck->l_index = 0;
   2066 			STACK_PUSH(vertex_stack, lck, l_stack1);
   2067 		}
   2068 	}
   2069 	return (found);
   2070 }
   2071 
   2072 /*
   2073  * Finds the first lock that is mainly responsible for blocking this
   2074  * request.  If there is no such lock, request->l_flock.l_type is set to
   2075  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
   2076  * of the blocking lock.
   2077  *
   2078  * Note: It is possible a request is blocked by a sleeping lock because
   2079  * of the fairness policy used in flk_process_request() to construct the
   2080  * dependencies. (see comments before flk_process_request()).
   2081  */
   2082 
   2083 static void
   2084 flk_get_first_blocking_lock(lock_descriptor_t *request)
   2085 {
   2086 	graph_t	*gp = request->l_graph;
   2087 	vnode_t *vp = request->l_vnode;
   2088 	lock_descriptor_t *lock, *blocker;
   2089 
   2090 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   2091 	blocker = NULL;
   2092 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
   2093 
   2094 	if (lock) {
   2095 		do {
   2096 			if (BLOCKS(lock, request)) {
   2097 				blocker = lock;
   2098 				break;
   2099 			}
   2100 			lock = lock->l_next;
   2101 		} while (lock->l_vnode == vp);
   2102 	}
   2103 
   2104 	if (blocker) {
   2105 		report_blocker(blocker, request);
   2106 	} else
   2107 		request->l_flock.l_type = F_UNLCK;
   2108 }
   2109 
   2110 /*
   2111  * Get the graph_t structure associated with a vnode.
   2112  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
   2113  * not yet been initialized, then a new element is allocated and returned.
   2114  */
   2115 graph_t *
   2116 flk_get_lock_graph(vnode_t *vp, int initialize)
   2117 {
   2118 	graph_t *gp;
   2119 	graph_t *gp_alloc = NULL;
   2120 	int index = HASH_INDEX(vp);
   2121 
   2122 	if (initialize == FLK_USE_GRAPH) {
   2123 		mutex_enter(&flock_lock);
   2124 		gp = lock_graph[index];
   2125 		mutex_exit(&flock_lock);
   2126 		return (gp);
   2127 	}
   2128 
   2129 	ASSERT(initialize == FLK_INIT_GRAPH);
   2130 
   2131 	if (lock_graph[index] == NULL) {
   2132 
   2133 		gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
   2134 
   2135 		/* Initialize the graph */
   2136 
   2137 		gp_alloc->active_locks.l_next =
   2138 		    gp_alloc->active_locks.l_prev =
   2139 		    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
   2140 		gp_alloc->sleeping_locks.l_next =
   2141 		    gp_alloc->sleeping_locks.l_prev =
   2142 		    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
   2143 		gp_alloc->index = index;
   2144 		mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
   2145 	}
   2146 
   2147 	mutex_enter(&flock_lock);
   2148 
   2149 	gp = lock_graph[index];
   2150 
   2151 	/* Recheck the value within flock_lock */
   2152 	if (gp == NULL) {
   2153 		struct flock_globals *fg;
   2154 
   2155 		/* We must have previously allocated the graph_t structure */
   2156 		ASSERT(gp_alloc != NULL);
   2157 		lock_graph[index] = gp = gp_alloc;
   2158 		/*
   2159 		 * The lockmgr status is only needed if KLM is loaded.
   2160 		 */
   2161 		if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
   2162 			fg = flk_get_globals();
   2163 			fg->lockmgr_status[index] = fg->flk_lockmgr_status;
   2164 		}
   2165 	}
   2166 
   2167 	mutex_exit(&flock_lock);
   2168 
   2169 	if ((gp_alloc != NULL) && (gp != gp_alloc)) {
   2170 		/* There was a race to allocate the graph_t and we lost */
   2171 		mutex_destroy(&gp_alloc->gp_mutex);
   2172 		kmem_free(gp_alloc, sizeof (graph_t));
   2173 	}
   2174 
   2175 	return (gp);
   2176 }
   2177 
   2178 /*
   2179  * PSARC case 1997/292
   2180  */
   2181 int
   2182 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
   2183 {
   2184 	lock_descriptor_t *lock;
   2185 	int result = 0;
   2186 	graph_t *gp;
   2187 	int			lock_nlmid;
   2188 
   2189 	/*
   2190 	 * Check to see if node is booted as a cluster. If not, return.
   2191 	 */
   2192 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
   2193 		return (0);
   2194 	}
   2195 
   2196 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
   2197 	if (gp == NULL) {
   2198 		return (0);
   2199 	}
   2200 
   2201 	mutex_enter(&gp->gp_mutex);
   2202 
   2203 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
   2204 
   2205 	if (lock) {
   2206 		while (lock->l_vnode == vp) {
   2207 			/* get NLM id from sysid */
   2208 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
   2209 
   2210 			/*
   2211 			 * If NLM server request _and_ nlmid of lock matches
   2212 			 * nlmid of argument, then we've found a remote lock.
   2213 			 */
   2214 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
   2215 				result = 1;
   2216 				goto done;
   2217 			}
   2218 			lock = lock->l_next;
   2219 		}
   2220 	}
   2221 
   2222 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
   2223 
   2224 	if (lock) {
   2225 		while (lock->l_vnode == vp) {
   2226 			/* get NLM id from sysid */
   2227 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
   2228 
   2229 			/*
   2230 			 * If NLM server request _and_ nlmid of lock matches
   2231 			 * nlmid of argument, then we've found a remote lock.
   2232 			 */
   2233 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
   2234 				result = 1;
   2235 				goto done;
   2236 			}
   2237 			lock = lock->l_next;
   2238 		}
   2239 	}
   2240 
   2241 done:
   2242 	mutex_exit(&gp->gp_mutex);
   2243 	return (result);
   2244 }
   2245 
   2246 /* ONC_PLUS EXTRACT START */
   2247 /*
   2248  * Determine whether there are any locks for the given vnode with a remote
   2249  * sysid.  Returns zero if not, non-zero if there are.
   2250  *
   2251  * Note that the return value from this function is potentially invalid
   2252  * once it has been returned.  The caller is responsible for providing its
   2253  * own synchronization mechanism to ensure that the return value is useful
   2254  * (e.g., see nfs_lockcompletion()).
   2255  */
   2256 int
   2257 flk_has_remote_locks(vnode_t *vp)
   2258 {
   2259 	lock_descriptor_t *lock;
   2260 	int result = 0;
   2261 	graph_t *gp;
   2262 
   2263 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
   2264 	if (gp == NULL) {
   2265 		return (0);
   2266 	}
   2267 
   2268 	mutex_enter(&gp->gp_mutex);
   2269 
   2270 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
   2271 
   2272 	if (lock) {
   2273 		while (lock->l_vnode == vp) {
   2274 			if (IS_REMOTE(lock)) {
   2275 				result = 1;
   2276 				goto done;
   2277 			}
   2278 			lock = lock->l_next;
   2279 		}
   2280 	}
   2281 
   2282 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
   2283 
   2284 	if (lock) {
   2285 		while (lock->l_vnode == vp) {
   2286 			if (IS_REMOTE(lock)) {
   2287 				result = 1;
   2288 				goto done;
   2289 			}
   2290 			lock = lock->l_next;
   2291 		}
   2292 	}
   2293 
   2294 done:
   2295 	mutex_exit(&gp->gp_mutex);
   2296 	return (result);
   2297 }
   2298 
   2299 /*
   2300  * Determine if there are any locks owned by the given sysid.
   2301  * Returns zero if not, non-zero if there are.  Note that this return code
   2302  * could be derived from flk_get_{sleeping,active}_locks, but this routine
   2303  * avoids all the memory allocations of those routines.
   2304  *
   2305  * This routine has the same synchronization issues as
   2306  * flk_has_remote_locks.
   2307  */
   2308 
   2309 int
   2310 flk_sysid_has_locks(int sysid, int lck_type)
   2311 {
   2312 	int		has_locks = 0;
   2313 	lock_descriptor_t	*lock;
   2314 	graph_t 	*gp;
   2315 	int		i;
   2316 
   2317 	for (i = 0; i < HASH_SIZE && !has_locks; i++) {
   2318 		mutex_enter(&flock_lock);
   2319 		gp = lock_graph[i];
   2320 		mutex_exit(&flock_lock);
   2321 		if (gp == NULL) {
   2322 			continue;
   2323 		}
   2324 
   2325 		mutex_enter(&gp->gp_mutex);
   2326 
   2327 		if (lck_type & FLK_QUERY_ACTIVE) {
   2328 			for (lock = ACTIVE_HEAD(gp)->l_next;
   2329 			    lock != ACTIVE_HEAD(gp) && !has_locks;
   2330 			    lock = lock->l_next) {
   2331 				if (lock->l_flock.l_sysid == sysid)
   2332 					has_locks = 1;
   2333 			}
   2334 		}
   2335 
   2336 		if (lck_type & FLK_QUERY_SLEEPING) {
   2337 			for (lock = SLEEPING_HEAD(gp)->l_next;
   2338 				lock != SLEEPING_HEAD(gp) && !has_locks;
   2339 				lock = lock->l_next) {
   2340 				if (lock->l_flock.l_sysid == sysid)
   2341 					has_locks = 1;
   2342 			}
   2343 		}
   2344 		mutex_exit(&gp->gp_mutex);
   2345 	}
   2346 
   2347 	return (has_locks);
   2348 }
   2349 
   2350 
   2351 /*
   2352  * PSARC case 1997/292
   2353  *
   2354  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
   2355  *  quantity, the real sysid generated by the NLM server; the upper half
   2356  *  identifies the node of the cluster where the NLM server ran.
   2357  *  This routine is only called by an NLM server running in a cluster.
   2358  * Effects: Remove all locks held on behalf of the client identified
   2359  *  by "sysid."
   2360  */
   2361 void
   2362 cl_flk_remove_locks_by_sysid(int sysid)
   2363 {
   2364 	graph_t	*gp;
   2365 	int i;
   2366 	lock_descriptor_t *lock, *nlock;
   2367 
   2368 	/*
   2369 	 * Check to see if node is booted as a cluster. If not, return.
   2370 	 */
   2371 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
   2372 		return;
   2373 	}
   2374 
   2375 	ASSERT(sysid != 0);
   2376 	for (i = 0; i < HASH_SIZE; i++) {
   2377 		mutex_enter(&flock_lock);
   2378 		gp = lock_graph[i];
   2379 		mutex_exit(&flock_lock);
   2380 
   2381 		if (gp == NULL)
   2382 			continue;
   2383 
   2384 		mutex_enter(&gp->gp_mutex);	/*  get mutex on lock graph */
   2385 
   2386 		/* signal sleeping requests so that they bail out */
   2387 		lock = SLEEPING_HEAD(gp)->l_next;
   2388 		while (lock != SLEEPING_HEAD(gp)) {
   2389 			nlock = lock->l_next;
   2390 			if (lock->l_flock.l_sysid == sysid) {
   2391 				INTERRUPT_WAKEUP(lock);
   2392 			}
   2393 			lock = nlock;
   2394 		}
   2395 
   2396 		/* delete active locks */
   2397 		lock = ACTIVE_HEAD(gp)->l_next;
   2398 		while (lock != ACTIVE_HEAD(gp)) {
   2399 			nlock = lock->l_next;
   2400 			if (lock->l_flock.l_sysid == sysid) {
   2401 				flk_delete_active_lock(lock, 0);
   2402 				flk_wakeup(lock, 1);
   2403 				flk_free_lock(lock);
   2404 			}
   2405 			lock = nlock;
   2406 		}
   2407 		mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
   2408 	}
   2409 }
   2410 
   2411 /*
   2412  * Delete all locks in the system that belongs to the sysid of the request.
   2413  */
   2414 
   2415 static void
   2416 flk_delete_locks_by_sysid(lock_descriptor_t *request)
   2417 {
   2418 	int	sysid  = request->l_flock.l_sysid;
   2419 	lock_descriptor_t *lock, *nlock;
   2420 	graph_t	*gp;
   2421 	int i;
   2422 
   2423 	ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
   2424 	ASSERT(sysid != 0);
   2425 
   2426 	mutex_exit(&request->l_graph->gp_mutex);
   2427 
   2428 	for (i = 0; i < HASH_SIZE; i++) {
   2429 		mutex_enter(&flock_lock);
   2430 		gp = lock_graph[i];
   2431 		mutex_exit(&flock_lock);
   2432 
   2433 		if (gp == NULL)
   2434 			continue;
   2435 
   2436 		mutex_enter(&gp->gp_mutex);
   2437 
   2438 		/* signal sleeping requests so that they bail out */
   2439 		lock = SLEEPING_HEAD(gp)->l_next;
   2440 		while (lock != SLEEPING_HEAD(gp)) {
   2441 			nlock = lock->l_next;
   2442 			if (lock->l_flock.l_sysid == sysid) {
   2443 				INTERRUPT_WAKEUP(lock);
   2444 			}
   2445 			lock = nlock;
   2446 		}
   2447 
   2448 		/* delete active locks */
   2449 		lock = ACTIVE_HEAD(gp)->l_next;
   2450 		while (lock != ACTIVE_HEAD(gp)) {
   2451 			nlock = lock->l_next;
   2452 			if (lock->l_flock.l_sysid == sysid) {
   2453 				flk_delete_active_lock(lock, 0);
   2454 				flk_wakeup(lock, 1);
   2455 				flk_free_lock(lock);
   2456 			}
   2457 			lock = nlock;
   2458 		}
   2459 		mutex_exit(&gp->gp_mutex);
   2460 	}
   2461 
   2462 	mutex_enter(&request->l_graph->gp_mutex);
   2463 }
   2464 
   2465 /*
   2466  * Clustering: Deletes PXFS locks
   2467  * Effects: Delete all locks on files in the given file system and with the
   2468  *  given PXFS id.
   2469  */
   2470 void
   2471 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
   2472 {
   2473 	lock_descriptor_t *lock, *nlock;
   2474 	graph_t	*gp;
   2475 	int i;
   2476 
   2477 	for (i = 0; i < HASH_SIZE; i++) {
   2478 		mutex_enter(&flock_lock);
   2479 		gp = lock_graph[i];
   2480 		mutex_exit(&flock_lock);
   2481 
   2482 		if (gp == NULL)
   2483 			continue;
   2484 
   2485 		mutex_enter(&gp->gp_mutex);
   2486 
   2487 		/* signal sleeping requests so that they bail out */
   2488 		lock = SLEEPING_HEAD(gp)->l_next;
   2489 		while (lock != SLEEPING_HEAD(gp)) {
   2490 			nlock = lock->l_next;
   2491 			if (lock->l_vnode->v_vfsp == vfsp) {
   2492 				ASSERT(IS_PXFS(lock));
   2493 				if (GETPXFSID(lock->l_flock.l_sysid) ==
   2494 				    pxfsid) {
   2495 					flk_set_state(lock,
   2496 					    FLK_CANCELLED_STATE);
   2497 					flk_cancel_sleeping_lock(lock, 1);
   2498 				}
   2499 			}
   2500 			lock = nlock;
   2501 		}
   2502 
   2503 		/* delete active locks */
   2504 		lock = ACTIVE_HEAD(gp)->l_next;
   2505 		while (lock != ACTIVE_HEAD(gp)) {
   2506 			nlock = lock->l_next;
   2507 			if (lock->l_vnode->v_vfsp == vfsp) {
   2508 				ASSERT(IS_PXFS(lock));
   2509 				if (GETPXFSID(lock->l_flock.l_sysid) ==
   2510 				    pxfsid) {
   2511 					flk_delete_active_lock(lock, 0);
   2512 					flk_wakeup(lock, 1);
   2513 					flk_free_lock(lock);
   2514 				}
   2515 			}
   2516 			lock = nlock;
   2517 		}
   2518 		mutex_exit(&gp->gp_mutex);
   2519 	}
   2520 }
   2521 
   2522 /*
   2523  * Search for a sleeping lock manager lock which matches exactly this lock
   2524  * request; if one is found, fake a signal to cancel it.
   2525  *
   2526  * Return 1 if a matching lock was found, 0 otherwise.
   2527  */
   2528 
   2529 static int
   2530 flk_canceled(lock_descriptor_t *request)
   2531 {
   2532 	lock_descriptor_t *lock, *nlock;
   2533 	graph_t *gp = request->l_graph;
   2534 	vnode_t *vp = request->l_vnode;
   2535 
   2536 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   2537 	ASSERT(IS_LOCKMGR(request));
   2538 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
   2539 
   2540 	if (lock) {
   2541 		while (lock->l_vnode == vp) {
   2542 			nlock = lock->l_next;
   2543 			if (SAME_OWNER(lock, request) &&
   2544 				lock->l_start == request->l_start &&
   2545 					lock->l_end == request->l_end) {
   2546 				INTERRUPT_WAKEUP(lock);
   2547 				return (1);
   2548 			}
   2549 			lock = nlock;
   2550 		}
   2551 	}
   2552 	return (0);
   2553 }
   2554 
   2555 /*
   2556  * Remove all the locks for the vnode belonging to the given pid and sysid.
   2557  */
   2558 
   2559 void
   2560 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
   2561 {
   2562 	graph_t	*gp;
   2563 	lock_descriptor_t *lock, *nlock;
   2564 	lock_descriptor_t *link_stack;
   2565 
   2566 	STACK_INIT(link_stack);
   2567 
   2568 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
   2569 
   2570 	if (gp == NULL)
   2571 		return;
   2572 	mutex_enter(&gp->gp_mutex);
   2573 
   2574 	CHECK_SLEEPING_LOCKS(gp);
   2575 	CHECK_ACTIVE_LOCKS(gp);
   2576 
   2577 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
   2578 
   2579 	if (lock) {
   2580 		do {
   2581 			nlock = lock->l_next;
   2582 			if ((lock->l_flock.l_pid == pid ||
   2583 					pid == IGN_PID) &&
   2584 				lock->l_flock.l_sysid == sysid) {
   2585 				CANCEL_WAKEUP(lock);
   2586 			}
   2587 			lock = nlock;
   2588 		} while (lock->l_vnode == vp);
   2589 	}
   2590 
   2591 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
   2592 
   2593 	if (lock) {
   2594 		do {
   2595 			nlock = lock->l_next;
   2596 			if ((lock->l_flock.l_pid == pid ||
   2597 					pid == IGN_PID) &&
   2598 				lock->l_flock.l_sysid == sysid) {
   2599 				flk_delete_active_lock(lock, 0);
   2600 				STACK_PUSH(link_stack, lock, l_stack);
   2601 			}
   2602 			lock = nlock;
   2603 		} while (lock->l_vnode == vp);
   2604 	}
   2605 
   2606 	while ((lock = STACK_TOP(link_stack)) != NULL) {
   2607 		STACK_POP(link_stack, l_stack);
   2608 		flk_wakeup(lock, 1);
   2609 		flk_free_lock(lock);
   2610 	}
   2611 
   2612 	CHECK_SLEEPING_LOCKS(gp);
   2613 	CHECK_ACTIVE_LOCKS(gp);
   2614 	CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
   2615 	mutex_exit(&gp->gp_mutex);
   2616 }
   2617 /* ONC_PLUS EXTRACT END */
   2618 
   2619 
   2620 /*
   2621  * Called from 'fs' read and write routines for files that have mandatory
   2622  * locking enabled.
   2623  */
   2624 
   2625 int
   2626 chklock(
   2627 	struct vnode	*vp,
   2628 	int 		iomode,
   2629 	u_offset_t	offset,
   2630 	ssize_t		len,
   2631 	int 		fmode,
   2632 	caller_context_t *ct)
   2633 {
   2634 	register int	i;
   2635 	struct flock64 	bf;
   2636 	int 		error = 0;
   2637 
   2638 	bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
   2639 	bf.l_whence = 0;
   2640 	bf.l_start = offset;
   2641 	bf.l_len = len;
   2642 	if (ct == NULL) {
   2643 		bf.l_pid = curproc->p_pid;
   2644 		bf.l_sysid = 0;
   2645 	} else {
   2646 		bf.l_pid = ct->cc_pid;
   2647 		bf.l_sysid = ct->cc_sysid;
   2648 	}
   2649 	i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
   2650 	if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
   2651 	    bf.l_type != F_UNLCK)
   2652 		error = i ? i : EAGAIN;
   2653 	return (error);
   2654 }
   2655 
   2656 /* ONC_PLUS EXTRACT START */
   2657 /*
   2658  * convoff - converts the given data (start, whence) to the
   2659  * given whence.
   2660  */
   2661 int
   2662 convoff(vp, lckdat, whence, offset)
   2663 	struct vnode 	*vp;
   2664 	struct flock64 	*lckdat;
   2665 	int 		whence;
   2666 	offset_t	offset;
   2667 {
   2668 	int 		error;
   2669 	struct vattr 	vattr;
   2670 
   2671 	if ((lckdat->l_whence == 2) || (whence == 2)) {
   2672 		vattr.va_mask = AT_SIZE;
   2673 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
   2674 			return (error);
   2675 	}
   2676 
   2677 	switch (lckdat->l_whence) {
   2678 	case 1:
   2679 		lckdat->l_start += offset;
   2680 		break;
   2681 	case 2:
   2682 		lckdat->l_start += vattr.va_size;
   2683 		/* FALLTHRU */
   2684 	case 0:
   2685 		break;
   2686 	default:
   2687 		return (EINVAL);
   2688 	}
   2689 
   2690 	if (lckdat->l_start < 0)
   2691 		return (EINVAL);
   2692 
   2693 	switch (whence) {
   2694 	case 1:
   2695 		lckdat->l_start -= offset;
   2696 		break;
   2697 	case 2:
   2698 		lckdat->l_start -= vattr.va_size;
   2699 		/* FALLTHRU */
   2700 	case 0:
   2701 		break;
   2702 	default:
   2703 		return (EINVAL);
   2704 	}
   2705 
   2706 	lckdat->l_whence = (short)whence;
   2707 	return (0);
   2708 }
   2709 /* ONC_PLUS EXTRACT END */
   2710 
   2711 
   2712 /* 	proc_graph function definitions */
   2713 
   2714 /*
   2715  * Function checks for deadlock due to the new 'lock'. If deadlock found
   2716  * edges of this lock are freed and returned.
   2717  */
   2718 
   2719 static int
   2720 flk_check_deadlock(lock_descriptor_t *lock)
   2721 {
   2722 	proc_vertex_t	*start_vertex, *pvertex;
   2723 	proc_vertex_t *dvertex;
   2724 	proc_edge_t *pep, *ppep;
   2725 	edge_t	*ep, *nep;
   2726 	proc_vertex_t *process_stack;
   2727 
   2728 	STACK_INIT(process_stack);
   2729 
   2730 	mutex_enter(&flock_lock);
   2731 	start_vertex = flk_get_proc_vertex(lock);
   2732 	ASSERT(start_vertex != NULL);
   2733 
   2734 	/* construct the edges from this process to other processes */
   2735 
   2736 	ep = FIRST_ADJ(lock);
   2737 	while (ep != HEAD(lock)) {
   2738 		proc_vertex_t *adj_proc;
   2739 
   2740 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
   2741 		for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
   2742 			if (pep->to_proc == adj_proc) {
   2743 				ASSERT(pep->refcount);
   2744 				pep->refcount++;
   2745 				break;
   2746 			}
   2747 		}
   2748 		if (pep == NULL) {
   2749 			pep = flk_get_proc_edge();
   2750 			pep->to_proc = adj_proc;
   2751 			pep->refcount = 1;
   2752 			adj_proc->incount++;
   2753 			pep->next = start_vertex->edge;
   2754 			start_vertex->edge = pep;
   2755 		}
   2756 		ep = NEXT_ADJ(ep);
   2757 	}
   2758 
   2759 	ep = FIRST_IN(lock);
   2760 
   2761 	while (ep != HEAD(lock)) {
   2762 		proc_vertex_t *in_proc;
   2763 
   2764 		in_proc = flk_get_proc_vertex(ep->from_vertex);
   2765 
   2766 		for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
   2767 			if (pep->to_proc == start_vertex) {
   2768 				ASSERT(pep->refcount);
   2769 				pep->refcount++;
   2770 				break;
   2771 			}
   2772 		}
   2773 		if (pep == NULL) {
   2774 			pep = flk_get_proc_edge();
   2775 			pep->to_proc = start_vertex;
   2776 			pep->refcount = 1;
   2777 			start_vertex->incount++;
   2778 			pep->next = in_proc->edge;
   2779 			in_proc->edge = pep;
   2780 		}
   2781 		ep = NEXT_IN(ep);
   2782 	}
   2783 
   2784 	if (start_vertex->incount == 0) {
   2785 		mutex_exit(&flock_lock);
   2786 		return (0);
   2787 	}
   2788 
   2789 	flk_proc_graph_uncolor();
   2790 
   2791 	start_vertex->p_sedge = start_vertex->edge;
   2792 
   2793 	STACK_PUSH(process_stack, start_vertex, p_stack);
   2794 
   2795 	while ((pvertex = STACK_TOP(process_stack)) != NULL) {
   2796 		for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
   2797 			dvertex = pep->to_proc;
   2798 			if (!PROC_ARRIVED(dvertex)) {
   2799 				STACK_PUSH(process_stack, dvertex, p_stack);
   2800 				dvertex->p_sedge = dvertex->edge;
   2801 				PROC_ARRIVE(pvertex);
   2802 				pvertex->p_sedge = pep->next;
   2803 				break;
   2804 			}
   2805 			if (!PROC_DEPARTED(dvertex))
   2806 				goto deadlock;
   2807 		}
   2808 		if (pep == NULL) {
   2809 			PROC_DEPART(pvertex);
   2810 			STACK_POP(process_stack, p_stack);
   2811 		}
   2812 	}
   2813 	mutex_exit(&flock_lock);
   2814 	return (0);
   2815 
   2816 deadlock:
   2817 
   2818 	/* we remove all lock edges and proc edges */
   2819 
   2820 	ep = FIRST_ADJ(lock);
   2821 	while (ep != HEAD(lock)) {
   2822 		proc_vertex_t *adj_proc;
   2823 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
   2824 		nep = NEXT_ADJ(ep);
   2825 		IN_LIST_REMOVE(ep);
   2826 		ADJ_LIST_REMOVE(ep);
   2827 		flk_free_edge(ep);
   2828 		ppep = start_vertex->edge;
   2829 		for (pep = start_vertex->edge; pep != NULL; ppep = pep,
   2830 						pep = ppep->next) {
   2831 			if (pep->to_proc == adj_proc) {
   2832 				pep->refcount--;
   2833 				if (pep->refcount == 0) {
   2834 					if (pep == ppep) {
   2835 						start_vertex->edge = pep->next;
   2836 					} else {
   2837 						ppep->next = pep->next;
   2838 					}
   2839 					adj_proc->incount--;
   2840 					flk_proc_release(adj_proc);
   2841 					flk_free_proc_edge(pep);
   2842 				}
   2843 				break;
   2844 			}
   2845 		}
   2846 		ep = nep;
   2847 	}
   2848 	ep = FIRST_IN(lock);
   2849 	while (ep != HEAD(lock)) {
   2850 		proc_vertex_t *in_proc;
   2851 		in_proc = flk_get_proc_vertex(ep->from_vertex);
   2852 		nep = NEXT_IN(ep);
   2853 		IN_LIST_REMOVE(ep);
   2854 		ADJ_LIST_REMOVE(ep);
   2855 		flk_free_edge(ep);
   2856 		ppep = in_proc->edge;
   2857 		for (pep = in_proc->edge; pep != NULL; ppep = pep,
   2858 						pep = ppep->next) {
   2859 			if (pep->to_proc == start_vertex) {
   2860 				pep->refcount--;
   2861 				if (pep->refcount == 0) {
   2862 					if (pep == ppep) {
   2863 						in_proc->edge = pep->next;
   2864 					} else {
   2865 						ppep->next = pep->next;
   2866 					}
   2867 					start_vertex->incount--;
   2868 					flk_proc_release(in_proc);
   2869 					flk_free_proc_edge(pep);
   2870 				}
   2871 				break;
   2872 			}
   2873 		}
   2874 		ep = nep;
   2875 	}
   2876 	flk_proc_release(start_vertex);
   2877 	mutex_exit(&flock_lock);
   2878 	return (1);
   2879 }
   2880 
   2881 /*
   2882  * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
   2883  * from the list we return that, otherwise we allocate one. If necessary,
   2884  * we grow the list of vertices also.
   2885  */
   2886 
   2887 static proc_vertex_t *
   2888 flk_get_proc_vertex(lock_descriptor_t *lock)
   2889 {
   2890 	int i;
   2891 	proc_vertex_t	*pv;
   2892 	proc_vertex_t	**palloc;
   2893 
   2894 	ASSERT(MUTEX_HELD(&flock_lock));
   2895 	if (lock->pvertex != -1) {
   2896 		ASSERT(lock->pvertex >= 0);
   2897 		pv = pgraph.proc[lock->pvertex];
   2898 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
   2899 			return (pv);
   2900 		}
   2901 	}
   2902 	for (i = 0; i < pgraph.gcount; i++) {
   2903 		pv = pgraph.proc[i];
   2904 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
   2905 			lock->pvertex = pv->index = i;
   2906 			return (pv);
   2907 		}
   2908 	}
   2909 	pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
   2910 	pv->pid = lock->l_flock.l_pid;
   2911 	pv->sysid = lock->l_flock.l_sysid;
   2912 	flk_proc_vertex_allocs++;
   2913 	if (pgraph.free != 0) {
   2914 		for (i = 0; i < pgraph.gcount; i++) {
   2915 			if (pgraph.proc[i] == NULL) {
   2916 				pgraph.proc[i] = pv;
   2917 				lock->pvertex = pv->index = i;
   2918 				pgraph.free--;
   2919 				return (pv);
   2920 			}
   2921 		}
   2922 	}
   2923 	palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
   2924 				sizeof (proc_vertex_t *), KM_SLEEP);
   2925 
   2926 	if (pgraph.proc) {
   2927 		bcopy(pgraph.proc, palloc,
   2928 			pgraph.gcount * sizeof (proc_vertex_t *));
   2929 
   2930 		kmem_free(pgraph.proc,
   2931 			pgraph.gcount * sizeof (proc_vertex_t *));
   2932 	}
   2933 	pgraph.proc = palloc;
   2934 	pgraph.free += (PROC_CHUNK - 1);
   2935 	pv->index = lock->pvertex = pgraph.gcount;
   2936 	pgraph.gcount += PROC_CHUNK;
   2937 	pgraph.proc[pv->index] = pv;
   2938 	return (pv);
   2939 }
   2940 
   2941 /*
   2942  * Allocate a proc edge.
   2943  */
   2944 
   2945 static proc_edge_t *
   2946 flk_get_proc_edge()
   2947 {
   2948 	proc_edge_t *pep;
   2949 
   2950 	pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
   2951 	flk_proc_edge_allocs++;
   2952 	return (pep);
   2953 }
   2954 
   2955 /*
   2956  * Free the proc edge. Called whenever its reference count goes to zero.
   2957  */
   2958 
   2959 static void
   2960 flk_free_proc_edge(proc_edge_t *pep)
   2961 {
   2962 	ASSERT(pep->refcount == 0);
   2963 	kmem_free((void *)pep, sizeof (proc_edge_t));
   2964 	flk_proc_edge_frees++;
   2965 }
   2966 
   2967 /*
   2968  * Color the graph explicitly done only when the mark value hits max value.
   2969  */
   2970 
   2971 static void
   2972 flk_proc_graph_uncolor()
   2973 {
   2974 	int i;
   2975 
   2976 	if (pgraph.mark == UINT_MAX) {
   2977 		for (i = 0; i < pgraph.gcount; i++)
   2978 			if (pgraph.proc[i] != NULL) {
   2979 				pgraph.proc[i]->atime = 0;
   2980 				pgraph.proc[i]->dtime = 0;
   2981 			}
   2982 		pgraph.mark = 1;
   2983 	} else {
   2984 		pgraph.mark++;
   2985 	}
   2986 }
   2987 
   2988 /*
   2989  * Release the proc vertex iff both there are no in edges and out edges
   2990  */
   2991 
   2992 static void
   2993 flk_proc_release(proc_vertex_t *proc)
   2994 {
   2995 	ASSERT(MUTEX_HELD(&flock_lock));
   2996 	if (proc->edge == NULL && proc->incount == 0) {
   2997 		pgraph.proc[proc->index] = NULL;
   2998 		pgraph.free++;
   2999 		kmem_free(proc, sizeof (proc_vertex_t));
   3000 		flk_proc_vertex_frees++;
   3001 	}
   3002 }
   3003 
   3004 /*
   3005  * Updates process graph to reflect change in a lock_graph.
   3006  * Note: We should call this function only after we have a correctly
   3007  * recomputed lock graph. Otherwise we might miss a deadlock detection.
   3008  * eg: in function flk_relation() we call this function after flk_recompute_
   3009  * dependencies() otherwise if a process tries to lock a vnode hashed
   3010  * into another graph it might sleep for ever.
   3011  */
   3012 
   3013 static void
   3014 flk_update_proc_graph(edge_t *ep, int delete)
   3015 {
   3016 	proc_vertex_t *toproc, *fromproc;
   3017 	proc_edge_t *pep, *prevpep;
   3018 
   3019 	mutex_enter(&flock_lock);
   3020 	toproc = flk_get_proc_vertex(ep->to_vertex);
   3021 	fromproc = flk_get_proc_vertex(ep->from_vertex);
   3022 
   3023 	if (!delete)
   3024 		goto add;
   3025 	pep = prevpep = fromproc->edge;
   3026 
   3027 	ASSERT(pep != NULL);
   3028 	while (pep != NULL) {
   3029 		if (pep->to_proc == toproc) {
   3030 			ASSERT(pep->refcount > 0);
   3031 			pep->refcount--;
   3032 			if (pep->refcount == 0) {
   3033 				if (pep == prevpep) {
   3034 					fromproc->edge = pep->next;
   3035 				} else {
   3036 					prevpep->next = pep->next;
   3037 				}
   3038 				toproc->incount--;
   3039 				flk_proc_release(toproc);
   3040 				flk_free_proc_edge(pep);
   3041 			}
   3042 			break;
   3043 		}
   3044 		prevpep = pep;
   3045 		pep = pep->next;
   3046 	}
   3047 	flk_proc_release(fromproc);
   3048 	mutex_exit(&flock_lock);
   3049 	return;
   3050 add:
   3051 
   3052 	pep = fromproc->edge;
   3053 
   3054 	while (pep != NULL) {
   3055 		if (pep->to_proc == toproc) {
   3056 			ASSERT(pep->refcount > 0);
   3057 			pep->refcount++;
   3058 			break;
   3059 		}
   3060 		pep = pep->next;
   3061 	}
   3062 	if (pep == NULL) {
   3063 		pep = flk_get_proc_edge();
   3064 		pep->to_proc = toproc;
   3065 		pep->refcount = 1;
   3066 		toproc->incount++;
   3067 		pep->next = fromproc->edge;
   3068 		fromproc->edge = pep;
   3069 	}
   3070 	mutex_exit(&flock_lock);
   3071 }
   3072 
   3073 /* ONC_PLUS EXTRACT START */
   3074 /*
   3075  * Set the control status for lock manager requests.
   3076  *
   3077  */
   3078 
   3079 /*
   3080  * PSARC case 1997/292
   3081  *
   3082  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
   3083  * Effects: Set the state of the NLM server identified by "nlmid"
   3084  *   in the NLM registry to state "nlm_state."
   3085  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
   3086  *   NLM server to this LLM.
   3087  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
   3088  *   may be locks requests that have gotten started but not finished.  In
   3089  *   particular, there may be blocking requests that are in the callback code
   3090  *   before sleeping (so they're not holding the lock for the graph).  If
   3091  *   such a thread reacquires the graph's lock (to go to sleep) after
   3092  *   NLM state in the NLM registry  is set to a non-up value,
   3093  *   it will notice the status and bail out.  If the request gets
   3094  *   granted before the thread can check the NLM registry, let it
   3095  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
   3096  *
   3097  * Modifies: nlm_reg_obj (global)
   3098  * Arguments:
   3099  *    nlmid	(IN):    id uniquely identifying an NLM server
   3100  *    nlm_state (IN):    NLM server state to change "nlmid" to
   3101  */
   3102 void
   3103 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
   3104 {
   3105 	/*
   3106 	 * Check to see if node is booted as a cluster. If not, return.
   3107 	 */
   3108 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
   3109 		return;
   3110 	}
   3111 
   3112 	/*
   3113 	 * Check for development/debugging.  It is possible to boot a node
   3114 	 * in non-cluster mode, and then run a special script, currently
   3115 	 * available only to developers, to bring up the node as part of a
   3116 	 * cluster.  The problem is that running such a script does not
   3117 	 * result in the routine flk_init() being called and hence global array
   3118 	 * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
   3119 	 * but the LLM needs to do an additional check to see if the global
   3120 	 * array has been created or not. If nlm_reg_status is NULL, then
   3121 	 * return, else continue.
   3122 	 */
   3123 	if (nlm_reg_status == NULL) {
   3124 		return;
   3125 	}
   3126 
   3127 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
   3128 	mutex_enter(&nlm_reg_lock);
   3129 
   3130 	if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
   3131 		/*
   3132 		 * If the NLM server "nlmid" is unknown in the NLM registry,
   3133 		 * add it to the registry in the nlm shutting down state.
   3134 		 */
   3135 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
   3136 			FLK_NLM_SHUTTING_DOWN);
   3137 	} else {
   3138 		/*
   3139 		 * Change the state of the NLM server identified by "nlmid"
   3140 		 * in the NLM registry to the argument "nlm_state."
   3141 		 */
   3142 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
   3143 			nlm_state);
   3144 	}
   3145 
   3146 	/*
   3147 	 *  The reason we must register the NLM server that is shutting down
   3148 	 *  with an LLM that doesn't already know about it (never sent a lock
   3149 	 *  request) is to handle correctly a race between shutdown and a new
   3150 	 *  lock request.  Suppose that a shutdown request from the NLM server
   3151 	 *  invokes this routine at the LLM, and a thread is spawned to
   3152 	 *  service the request. Now suppose a new lock request is in
   3153 	 *  progress and has already passed the first line of defense in
   3154 	 *  reclock(), which denies new locks requests from NLM servers
   3155 	 *  that are not in the NLM_UP state.  After the current routine
   3156 	 *  is invoked for both phases of shutdown, the routine will return,
   3157 	 *  having done nothing, and the lock request will proceed and
   3158 	 *  probably be granted.  The problem is that the shutdown was ignored
   3159 	 *  by the lock request because there was no record of that NLM server
   3160 	 *  shutting down.   We will be in the peculiar position of thinking
   3161 	 *  that we've shutdown the NLM server and all locks at all LLMs have
   3162 	 *  been discarded, but in fact there's still one lock held.
   3163 	 *  The solution is to record the existence of NLM server and change
   3164 	 *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
   3165 	 *  progress may proceed because the next phase NLM_DOWN will catch
   3166 	 *  this lock and discard it.
   3167 	 */
   3168 	mutex_exit(&nlm_reg_lock);
   3169 
   3170 	switch (nlm_state) {
   3171 	case FLK_NLM_UP:
   3172 		/*
   3173 		 * Change the NLM state of all locks still held on behalf of
   3174 		 * the NLM server identified by "nlmid" to NLM_UP.
   3175 		 */
   3176 		cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
   3177 		break;
   3178 
   3179 	case FLK_NLM_SHUTTING_DOWN:
   3180 		/*
   3181 		 * Wake up all sleeping locks for the NLM server identified
   3182 		 * by "nlmid." Note that eventually all woken threads will
   3183 		 * have their lock requests cancelled and descriptors
   3184 		 * removed from the sleeping lock list.  Note that the NLM
   3185 		 * server state associated with each lock descriptor is
   3186 		 * changed to FLK_NLM_SHUTTING_DOWN.
   3187 		 */
   3188 		cl_flk_wakeup_sleeping_nlm_locks(nlmid);
   3189 		break;
   3190 
   3191 	case FLK_NLM_DOWN:
   3192 		/*
   3193 		 * Discard all active, granted locks for this NLM server
   3194 		 * identified by "nlmid."
   3195 		 */
   3196 		cl_flk_unlock_nlm_granted(nlmid);
   3197 		break;
   3198 
   3199 	default:
   3200 		panic("cl_set_nlm_status: bad status (%d)", nlm_state);
   3201 	}
   3202 }
   3203 
   3204 /*
   3205  * Set the control status for lock manager requests.
   3206  *
   3207  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
   3208  * may be locks requests that have gotten started but not finished.  In
   3209  * particular, there may be blocking requests that are in the callback code
   3210  * before sleeping (so they're not holding the lock for the graph).  If
   3211  * such a thread reacquires the graph's lock (to go to sleep) after
   3212  * flk_lockmgr_status is set to a non-up value, it will notice the status
   3213  * and bail out.  If the request gets granted before the thread can check
   3214  * flk_lockmgr_status, let it continue normally.  It will get flushed when
   3215  * we are called with FLK_LOCKMGR_DOWN.
   3216  */
   3217 
   3218 void
   3219 flk_set_lockmgr_status(flk_lockmgr_status_t status)
   3220 {
   3221 	int i;
   3222 	graph_t *gp;
   3223 	struct flock_globals *fg;
   3224 
   3225 	fg = flk_get_globals();
   3226 	ASSERT(fg != NULL);
   3227 
   3228 	mutex_enter(&flock_lock);
   3229 	fg->flk_lockmgr_status = status;
   3230 	mutex_exit(&flock_lock);
   3231 
   3232 	/*
   3233 	 * If the lock manager is coming back up, all that's needed is to
   3234 	 * propagate this information to the graphs.  If the lock manager
   3235 	 * is going down, additional action is required, and each graph's
   3236 	 * copy of the state is updated atomically with this other action.
   3237 	 */
   3238 	switch (status) {
   3239 	case FLK_LOCKMGR_UP:
   3240 		for (i = 0; i < HASH_SIZE; i++) {
   3241 			mutex_enter(&flock_lock);
   3242 			gp = lock_graph[i];
   3243 			mutex_exit(&flock_lock);
   3244 			if (gp == NULL)
   3245 				continue;
   3246 			mutex_enter(&gp->gp_mutex);
   3247 			fg->lockmgr_status[i] = status;
   3248 			mutex_exit(&gp->gp_mutex);
   3249 		}
   3250 		break;
   3251 	case FLK_WAKEUP_SLEEPERS:
   3252 		wakeup_sleeping_lockmgr_locks(fg);
   3253 		break;
   3254 	case FLK_LOCKMGR_DOWN:
   3255 		unlock_lockmgr_granted(fg);
   3256 		break;
   3257 	default:
   3258 		panic("flk_set_lockmgr_status: bad status (%d)", status);
   3259 		break;
   3260 	}
   3261 }
   3262 
   3263 /*
   3264  * This routine returns all the locks that are active or sleeping and are
   3265  * associated with a particular set of identifiers.  If lock_state != 0, then
   3266  * only locks that match the lock_state are returned. If lock_state == 0, then
   3267  * all locks are returned. If pid == NOPID, the pid is ignored.  If
   3268  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
   3269  * vnode pointer is ignored.
   3270  *
   3271  * A list containing the vnode pointer and an flock structure
   3272  * describing the lock is returned.  Each element in the list is
   3273  * dynamically allocated and must be freed by the caller.  The
   3274  * last item in the list is denoted by a NULL value in the ll_next
   3275  * field.
   3276  *
   3277  * The vnode pointers returned are held.  The caller is responsible
   3278  * for releasing these.  Note that the returned list is only a snapshot of
   3279  * the current lock information, and that it is a snapshot of a moving
   3280  * target (only one graph is locked at a time).
   3281  */
   3282 
   3283 locklist_t *
   3284 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
   3285 		pid_t pid, const vnode_t *vp, zoneid_t zoneid)
   3286 {
   3287 	lock_descriptor_t	*lock;
   3288 	lock_descriptor_t	*graph_head;
   3289 	locklist_t		listhead;
   3290 	locklist_t		*llheadp;
   3291 	locklist_t		*llp;
   3292 	locklist_t		*lltp;
   3293 	graph_t			*gp;
   3294 	int			i;
   3295 	int			first_index; /* graph index */
   3296 	int			num_indexes; /* graph index */
   3297 
   3298 	ASSERT((list_type == FLK_ACTIVE_STATE) ||
   3299 	    (list_type == FLK_SLEEPING_STATE));
   3300 
   3301 	/*
   3302 	 * Get a pointer to something to use as a list head while building
   3303 	 * the rest of the list.
   3304 	 */
   3305 	llheadp = &listhead;
   3306 	lltp = llheadp;
   3307 	llheadp->ll_next = (locklist_t *)NULL;
   3308 
   3309 	/* Figure out which graphs we want to look at. */
   3310 	if (vp == NULL) {
   3311 		first_index = 0;
   3312 		num_indexes = HASH_SIZE;
   3313 	} else {
   3314 		first_index = HASH_INDEX(vp);
   3315 		num_indexes = 1;
   3316 	}
   3317 
   3318 	for (i = first_index; i < first_index + num_indexes; i++) {
   3319 		mutex_enter(&flock_lock);
   3320 		gp = lock_graph[i];
   3321 		mutex_exit(&flock_lock);
   3322 		if (gp == NULL) {
   3323 			continue;
   3324 		}
   3325 
   3326 		mutex_enter(&gp->gp_mutex);
   3327 		graph_head = (list_type == FLK_ACTIVE_STATE) ?
   3328 			ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
   3329 		for (lock = graph_head->l_next;
   3330 		    lock != graph_head;
   3331 		    lock = lock->l_next) {
   3332 			if (use_sysid && lock->l_flock.l_sysid != sysid)
   3333 				continue;
   3334 			if (pid != NOPID && lock->l_flock.l_pid != pid)
   3335 				continue;
   3336 			if (vp != NULL && lock->l_vnode != vp)
   3337 				continue;
   3338 			if (lock_state && !(lock_state & lock->l_state))
   3339 				continue;
   3340 			if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
   3341 				continue;
   3342 			/*
   3343 			 * A matching lock was found.  Allocate
   3344 			 * space for a new locklist entry and fill
   3345 			 * it in.
   3346 			 */
   3347 			llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
   3348 			lltp->ll_next = llp;
   3349 			VN_HOLD(lock->l_vnode);
   3350 			llp->ll_vp = lock->l_vnode;
   3351 			create_flock(lock, &(llp->ll_flock));
   3352 			llp->ll_next = (locklist_t *)NULL;
   3353 			lltp = llp;
   3354 		}
   3355 		mutex_exit(&gp->gp_mutex);
   3356 	}
   3357 
   3358 	llp = llheadp->ll_next;
   3359 	return (llp);
   3360 }
   3361 
   3362 /*
   3363  * These two functions are simply interfaces to get_lock_list.  They return
   3364  * a list of sleeping or active locks for the given sysid and pid.  See
   3365  * get_lock_list for details.
   3366  *
   3367  * In either case we don't particularly care to specify the zone of interest;
   3368  * the sysid-space is global across zones, so the sysid will map to exactly one
   3369  * zone, and we'll return information for that zone.
   3370  */
   3371 
   3372 locklist_t *
   3373 flk_get_sleeping_locks(int sysid, pid_t pid)
   3374 {
   3375 	return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
   3376 		    ALL_ZONES));
   3377 }
   3378 
   3379 locklist_t *
   3380 flk_get_active_locks(int sysid, pid_t pid)
   3381 {
   3382 	return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
   3383 		    ALL_ZONES));
   3384 }
   3385 
   3386 /*
   3387  * Another interface to get_lock_list.  This one returns all the active
   3388  * locks for a given vnode.  Again, see get_lock_list for details.
   3389  *
   3390  * We don't need to specify which zone's locks we're interested in.  The matter
   3391  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
   3392  * be used by multiple zones, so the list of locks will all be from the right
   3393  * zone.
   3394  */
   3395 
   3396 locklist_t *
   3397 flk_active_locks_for_vp(const vnode_t *vp)
   3398 {
   3399 	return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
   3400 		    ALL_ZONES));
   3401 }
   3402 
   3403 /*
   3404  * Another interface to get_lock_list.  This one returns all the active
   3405  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
   3406  *
   3407  * See the comment for flk_active_locks_for_vp() for why we don't care to
   3408  * specify the particular zone of interest.
   3409  */
   3410 locklist_t *
   3411 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
   3412 {
   3413 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
   3414 				NOPID, vp, ALL_ZONES));
   3415 }
   3416 
   3417 /*
   3418  * Another interface to get_lock_list.  This one returns all the active
   3419  * nbmand locks for a given pid.  Again, see get_lock_list for details.
   3420  *
   3421  * The zone doesn't need to be specified here; the locks held by a
   3422  * particular process will either be local (ie, non-NFS) or from the zone
   3423  * the process is executing in.  This is because other parts of the system
   3424  * ensure that an NFS vnode can't be used in a zone other than that in
   3425  * which it was opened.
   3426  */
   3427 locklist_t *
   3428 flk_active_nbmand_locks(pid_t pid)
   3429 {
   3430 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
   3431 				pid, NULL, ALL_ZONES));
   3432 }
   3433 
   3434 /*
   3435  * Free up all entries in the locklist.
   3436  */
   3437 void
   3438 flk_free_locklist(locklist_t *llp)
   3439 {
   3440 	locklist_t *next_llp;
   3441 
   3442 	while (llp) {
   3443 		next_llp = llp->ll_next;
   3444 		VN_RELE(llp->ll_vp);
   3445 		kmem_free(llp, sizeof (*llp));
   3446 		llp = next_llp;
   3447 	}
   3448 }
   3449 
   3450 static void
   3451 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
   3452 {
   3453 	/*
   3454 	 * For each graph "lg" in the hash table lock_graph do
   3455 	 * a.  Get the list of sleeping locks
   3456 	 * b.  For each lock descriptor in the list do
   3457 	 *	i.   If the requested lock is an NLM server request AND
   3458 	 *		the nlmid is the same as the routine argument then
   3459 	 *		change the lock descriptor's state field to
   3460 	 *		"nlm_state."
   3461 	 * c.  Get the list of active locks
   3462 	 * d.  For each lock descriptor in the list do
   3463 	 *	i.   If the requested lock is an NLM server request AND
   3464 	 *		the nlmid is the same as the routine argument then
   3465 	 *		change the lock descriptor's state field to
   3466 	 *		"nlm_state."
   3467 	 */
   3468 
   3469 	int			i;
   3470 	graph_t			*gp;			/* lock graph */
   3471 	lock_descriptor_t	*lock;			/* lock */
   3472 	lock_descriptor_t	*nlock = NULL;		/* next lock */
   3473 	int			lock_nlmid;
   3474 
   3475 	for (i = 0; i < HASH_SIZE; i++) {
   3476 		mutex_enter(&flock_lock);
   3477 		gp = lock_graph[i];
   3478 		mutex_exit(&flock_lock);
   3479 		if (gp == NULL) {
   3480 			continue;
   3481 		}
   3482 
   3483 		/* Get list of sleeping locks in current lock graph. */
   3484 		mutex_enter(&gp->gp_mutex);
   3485 		for (lock = SLEEPING_HEAD(gp)->l_next;
   3486 		    lock != SLEEPING_HEAD(gp);
   3487 		    lock = nlock) {
   3488 			nlock = lock->l_next;
   3489 			/* get NLM id */
   3490 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
   3491 
   3492 			/*
   3493 			 * If NLM server request AND nlmid of lock matches
   3494 			 * nlmid of argument, then set the NLM state of the
   3495 			 * lock to "nlm_state."
   3496 			 */
   3497 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
   3498 				SET_NLM_STATE(lock, nlm_state);
   3499 			}
   3500 		}
   3501 
   3502 		/* Get list of active locks in current lock graph. */
   3503 		for (lock = ACTIVE_HEAD(gp)->l_next;
   3504 		    lock != ACTIVE_HEAD(gp);
   3505 		    lock = nlock) {
   3506 			nlock = lock->l_next;
   3507 			/* get NLM id */
   3508 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
   3509 
   3510 			/*
   3511 			 * If NLM server request AND nlmid of lock matches
   3512 			 * nlmid of argument, then set the NLM state of the
   3513 			 * lock to "nlm_state."
   3514 			 */
   3515 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
   3516 				ASSERT(IS_ACTIVE(lock));
   3517 				SET_NLM_STATE(lock, nlm_state);
   3518 			}
   3519 		}
   3520 		mutex_exit(&gp->gp_mutex);
   3521 	}
   3522 }
   3523 
   3524 /*
   3525  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
   3526  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
   3527  *   identified by "nlmid." Poke those lock requests.
   3528  */
   3529 static void
   3530 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
   3531 {
   3532 	lock_descriptor_t *lock;
   3533 	lock_descriptor_t *nlock = NULL; /* next lock */
   3534 	int i;
   3535 	graph_t *gp;
   3536 	int	lock_nlmid;
   3537 
   3538 	for (i = 0; i < HASH_SIZE; i++) {
   3539 		mutex_enter(&flock_lock);
   3540 		gp = lock_graph[i];
   3541 		mutex_exit(&flock_lock);
   3542 		if (gp == NULL) {
   3543 			continue;
   3544 		}
   3545 
   3546 		mutex_enter(&gp->gp_mutex);
   3547 		for (lock = SLEEPING_HEAD(gp)->l_next;
   3548 		    lock != SLEEPING_HEAD(gp);
   3549 		    lock = nlock) {
   3550 			nlock = lock->l_next;
   3551 			/*
   3552 			 * If NLM server request _and_ nlmid of lock matches
   3553 			 * nlmid of argument, then set the NLM state of the
   3554 			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
   3555 			 * request.
   3556 			 */
   3557 			if (IS_LOCKMGR(lock)) {
   3558 				/* get NLM id */
   3559 				lock_nlmid =
   3560 					GETNLMID(lock->l_flock.l_sysid);
   3561 				if (nlmid == lock_nlmid) {
   3562 					SET_NLM_STATE(lock,
   3563 						FLK_NLM_SHUTTING_DOWN);
   3564 					INTERRUPT_WAKEUP(lock);
   3565 				}
   3566 			}
   3567 		}
   3568 		mutex_exit(&gp->gp_mutex);
   3569 	}
   3570 }
   3571 
   3572 /*
   3573  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
   3574  * Effects:  Find all active (granted) lock manager locks _only_ for the
   3575  *   NLM server identified by "nlmid" and release them.
   3576  */
   3577 static void
   3578 cl_flk_unlock_nlm_granted(int nlmid)
   3579 {
   3580 	lock_descriptor_t *lock;
   3581 	lock_descriptor_t *nlock = NULL; /* next lock */
   3582 	int i;
   3583 	graph_t *gp;
   3584 	int	lock_nlmid;
   3585 
   3586 	for (i = 0; i < HASH_SIZE; i++) {
   3587 		mutex_enter(&flock_lock);
   3588 		gp = lock_graph[i];
   3589 		mutex_exit(&flock_lock);
   3590 		if (gp == NULL) {
   3591 			continue;
   3592 		}
   3593 
   3594 		mutex_enter(&gp->gp_mutex);
   3595 		for (lock = ACTIVE_HEAD(gp)->l_next;
   3596 		    lock != ACTIVE_HEAD(gp);
   3597 		    lock = nlock) {
   3598 			nlock = lock->l_next;
   3599 			ASSERT(IS_ACTIVE(lock));
   3600 
   3601 			/*
   3602 			 * If it's an  NLM server request _and_ nlmid of
   3603 			 * the lock matches nlmid of argument, then
   3604 			 * remove the active lock the list, wakup blocked
   3605 			 * threads, and free the storage for the lock.
   3606 			 * Note that there's no need to mark the NLM state
   3607 			 * of this lock to NLM_DOWN because the lock will
   3608 			 * be deleted anyway and its storage freed.
   3609 			 */
   3610 			if (IS_LOCKMGR(lock)) {
   3611 				/* get NLM id */
   3612 				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
   3613 				if (nlmid == lock_nlmid) {
   3614 					flk_delete_active_lock(lock, 0);
   3615 					flk_wakeup(lock, 1);
   3616 					flk_free_lock(lock);
   3617 				}
   3618 			}
   3619 		}
   3620 		mutex_exit(&gp->gp_mutex);
   3621 	}
   3622 }
   3623 
   3624 /*
   3625  * Find all sleeping lock manager requests and poke them.
   3626  */
   3627 static void
   3628 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
   3629 {
   3630 	lock_descriptor_t *lock;
   3631 	lock_descriptor_t *nlock = NULL; /* next lock */
   3632 	int i;
   3633 	graph_t *gp;
   3634 	zoneid_t zoneid = getzoneid();
   3635 
   3636 	for (i = 0; i < HASH_SIZE; i++) {
   3637 		mutex_enter(&flock_lock);
   3638 		gp = lock_graph[i];
   3639 		mutex_exit(&flock_lock);
   3640 		if (gp == NULL) {
   3641 			continue;
   3642 		}
   3643 
   3644 		mutex_enter(&gp->gp_mutex);
   3645 		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
   3646 		for (lock = SLEEPING_HEAD(gp)->l_next;
   3647 		    lock != SLEEPING_HEAD(gp);
   3648 		    lock = nlock) {
   3649 			nlock = lock->l_next;
   3650 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
   3651 				INTERRUPT_WAKEUP(lock);
   3652 			}
   3653 		}
   3654 		mutex_exit(&gp->gp_mutex);
   3655 	}
   3656 }
   3657 
   3658 
   3659 /*
   3660  * Find all active (granted) lock manager locks and release them.
   3661  */
   3662 static void
   3663 unlock_lockmgr_granted(struct flock_globals *fg)
   3664 {
   3665 	lock_descriptor_t *lock;
   3666 	lock_descriptor_t *nlock = NULL; /* next lock */
   3667 	int i;
   3668 	graph_t *gp;
   3669 	zoneid_t zoneid = getzoneid();
   3670 
   3671 	for (i = 0; i < HASH_SIZE; i++) {
   3672 		mutex_enter(&flock_lock);
   3673 		gp = lock_graph[i];
   3674 		mutex_exit(&flock_lock);
   3675 		if (gp == NULL) {
   3676 			continue;
   3677 		}
   3678 
   3679 		mutex_enter(&gp->gp_mutex);
   3680 		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
   3681 		for (lock = ACTIVE_HEAD(gp)->l_next;
   3682 		    lock != ACTIVE_HEAD(gp);
   3683 		    lock = nlock) {
   3684 			nlock = lock->l_next;
   3685 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
   3686 				ASSERT(IS_ACTIVE(lock));
   3687 				flk_delete_active_lock(lock, 0);
   3688 				flk_wakeup(lock, 1);
   3689 				flk_free_lock(lock);
   3690 			}
   3691 		}
   3692 		mutex_exit(&gp->gp_mutex);
   3693 	}
   3694 }
   3695 /* ONC_PLUS EXTRACT END */
   3696 
   3697 
   3698 /*
   3699  * Wait until a lock is granted, cancelled, or interrupted.
   3700  */
   3701 
   3702 static void
   3703 wait_for_lock(lock_descriptor_t *request)
   3704 {
   3705 	graph_t *gp = request->l_graph;
   3706 
   3707 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
   3708 
   3709 	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
   3710 	    !(IS_INTERRUPTED(request))) {
   3711 		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
   3712 			flk_set_state(request, FLK_INTERRUPTED_STATE);
   3713 			request->l_state |= INTERRUPTED_LOCK;
   3714 		}
   3715 	}
   3716 }
   3717 
   3718 /* ONC_PLUS EXTRACT START */
   3719 /*
   3720  * Create an flock structure from the existing lock information
   3721  *
   3722  * This routine is used to create flock structures for the lock manager
   3723  * to use in a reclaim request.  Since the lock was originated on this
   3724  * host, it must be conforming to UNIX semantics, so no checking is
   3725  * done to make sure it falls within the lower half of the 32-bit range.
   3726  */
   3727 
   3728 static void
   3729 create_flock(lock_descriptor_t *lp, flock64_t *flp)
   3730 {
   3731 	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
   3732 	ASSERT(lp->l_end >= lp->l_start);
   3733 
   3734 	flp->l_type = lp->l_type;
   3735 	flp->l_whence = 0;
   3736 	flp->l_start = lp->l_start;
   3737 	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
   3738 		(lp->l_end - lp->l_start + 1);
   3739 	flp->l_sysid = lp->l_flock.l_sysid;
   3740 	flp->l_pid = lp->l_flock.l_pid;
   3741 }
   3742 
   3743 /*
   3744  * Convert flock_t data describing a lock range into unsigned long starting
   3745  * and ending points, which are put into lock_request.  Returns 0 or an
   3746  * errno value.
   3747  * Large Files: max is passed by the caller and we return EOVERFLOW
   3748  * as defined by LFS API.
   3749  */
   3750 
   3751 int
   3752 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
   3753     u_offset_t *start, u_offset_t *end, offset_t offset)
   3754 {
   3755 	struct vattr	vattr;
   3756 	int	error;
   3757 
   3758 	/*
   3759 	 * Determine the starting point of the request
   3760 	 */
   3761 	switch (flp->l_whence) {
   3762 	case 0:		/* SEEK_SET */
   3763 		*start = (u_offset_t)flp->l_start;
   3764 		break;
   3765 	case 1:		/* SEEK_CUR */
   3766 		*start = (u_offset_t)(flp->l_start + offset);
   3767 		break;
   3768 	case 2:		/* SEEK_END */
   3769 		vattr.va_mask = AT_SIZE;
   3770 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
   3771 			return (error);
   3772 		*start = (u_offset_t)(flp->l_start + vattr.va_size);
   3773 		break;
   3774 	default:
   3775 		return (EINVAL);
   3776 	}
   3777 
   3778 	/*
   3779 	 * Determine the range covered by the request.
   3780 	 */
   3781 	if (flp->l_len == 0)
   3782 		*end = MAX_U_OFFSET_T;
   3783 	else if ((offset_t)flp->l_len > 0) {
   3784 		*end = (u_offset_t)(*start + (flp->l_len - 1));
   3785 	} else {
   3786 		/*
   3787 		 * Negative length; why do we even allow this ?
   3788 		 * Because this allows easy specification of
   3789 		 * the last n bytes of the file.
   3790 		 */
   3791 		*end = *start;
   3792 		*start += (u_offset_t)flp->l_len;
   3793 		(*start)++;
   3794 	}
   3795 	return (0);
   3796 }
   3797 
   3798 /*
   3799  * Check the validity of lock data.  This can used by the NFS
   3800  * frlock routines to check data before contacting the server.  The
   3801  * server must support semantics that aren't as restrictive as
   3802  * the UNIX API, so the NFS client is required to check.
   3803  * The maximum is now passed in by the caller.
   3804  */
   3805 
   3806 int
   3807 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
   3808 {
   3809 	/*
   3810 	 * The end (length) for local locking should never be greater
   3811 	 * than MAXEND. However, the representation for
   3812 	 * the entire file is MAX_U_OFFSET_T.
   3813 	 */
   3814 	if ((start > max) ||
   3815 	    ((end > max) && (end != MAX_U_OFFSET_T))) {
   3816 		return (EINVAL);
   3817 	}
   3818 	if (start > end) {
   3819 	    return (EINVAL);
   3820 	}
   3821 	return (0);
   3822 }
   3823 
   3824 /*
   3825  * Fill in request->l_flock with information about the lock blocking the
   3826  * request.  The complexity here is that lock manager requests are allowed
   3827  * to see into the upper part of the 32-bit address range, whereas local
   3828  * requests are only allowed to see signed values.
   3829  *
   3830  * What should be done when "blocker" is a lock manager lock that uses the
   3831  * upper portion of the 32-bit range, but "request" is local?  Since the
   3832  * request has already been determined to have been blocked by the blocker,
   3833  * at least some portion of "blocker" must be in the range of the request,
   3834  * or the request extends to the end of file.  For the first case, the
   3835  * portion in the lower range is returned with the indication that it goes
   3836  * "to EOF."  For the second case, the last byte of the lower range is
   3837  * returned with the indication that it goes "to EOF."
   3838  */
   3839 
   3840 static void
   3841 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
   3842 {
   3843 	flock64_t *flrp;			/* l_flock portion of request */
   3844 
   3845 	ASSERT(blocker != NULL);
   3846 
   3847 	flrp = &request->l_flock;
   3848 	flrp->l_whence = 0;
   3849 	flrp->l_type = blocker->l_type;
   3850 	flrp->l_pid = blocker->l_flock.l_pid;
   3851 	flrp->l_sysid = blocker->l_flock.l_sysid;
   3852 
   3853 	if (IS_LOCKMGR(request)) {
   3854 		flrp->l_start = blocker->l_start;
   3855 		if (blocker->l_end == MAX_U_OFFSET_T)
   3856 			flrp->l_len = 0;
   3857 		else
   3858 			flrp->l_len = blocker->l_end - blocker->l_start + 1;
   3859 	} else {
   3860 		if (blocker->l_start > MAXEND) {
   3861 			flrp->l_start = MAXEND;
   3862 			flrp->l_len = 0;
   3863 		} else {
   3864 			flrp->l_start = blocker->l_start;
   3865 			if (blocker->l_end == MAX_U_OFFSET_T)
   3866 				flrp->l_len = 0;
   3867 			else
   3868 				flrp->l_len = blocker->l_end -
   3869 					blocker->l_start + 1;
   3870 		}
   3871 	}
   3872 }
   3873 /* ONC_PLUS EXTRACT END */
   3874 
   3875 /*
   3876  * PSARC case 1997/292
   3877  */
   3878 /*
   3879  * This is the public routine exported by flock.h.
   3880  */
   3881 void
   3882 cl_flk_change_nlm_state_to_unknown(int nlmid)
   3883 {
   3884 	/*
   3885 	 * Check to see if node is booted as a cluster. If not, return.
   3886 	 */
   3887 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
   3888 		return;
   3889 	}
   3890 
   3891 	/*
   3892 	 * See comment in cl_flk_set_nlm_status().
   3893 	 */
   3894 	if (nlm_reg_status == NULL) {
   3895 		return;
   3896 	}
   3897 
   3898 	/*
   3899 	 * protect NLM registry state with a mutex.
   3900 	 */
   3901 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
   3902 	mutex_enter(&nlm_reg_lock);
   3903 	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
   3904 	mutex_exit(&nlm_reg_lock);
   3905 }
   3906 
   3907 /*
   3908  * Return non-zero if the given I/O request conflicts with an active NBMAND
   3909  * lock.
   3910  * If svmand is non-zero, it means look at all active locks, not just NBMAND
   3911  * locks.
   3912  */
   3913 
   3914 int
   3915 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
   3916 		ssize_t length, int svmand, caller_context_t *ct)
   3917 {
   3918 	int conflict = 0;
   3919 	graph_t			*gp;
   3920 	lock_descriptor_t	*lock;
   3921 	pid_t pid;
   3922 	int sysid;
   3923 
   3924 	if (ct == NULL) {
   3925 		pid = curproc->p_pid;
   3926 		sysid = 0;
   3927 	} else {
   3928 		pid = ct->cc_pid;
   3929 		sysid = ct->cc_sysid;
   3930 	}
   3931 
   3932 	mutex_enter(&flock_lock);
   3933 	gp = lock_graph[HASH_INDEX(vp)];
   3934 	mutex_exit(&flock_lock);
   3935 	if (gp == NULL)
   3936 		return (0);
   3937 
   3938 	mutex_enter(&gp->gp_mutex);
   3939 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
   3940 
   3941 	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
   3942 		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
   3943 		    (lock->l_flock.l_sysid != sysid ||
   3944 		    lock->l_flock.l_pid != pid) &&
   3945 		    lock_blocks_io(op, offset, length,
   3946 				lock->l_type, lock->l_start, lock->l_end)) {
   3947 			conflict = 1;
   3948 			break;
   3949 		}
   3950 	}
   3951 	mutex_exit(&gp->gp_mutex);
   3952 
   3953 	return (conflict);
   3954 }
   3955 
   3956 /*
   3957  * Return non-zero if the given I/O request conflicts with the given lock.
   3958  */
   3959 
   3960 static int
   3961 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
   3962 	    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
   3963 {
   3964 	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
   3965 	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
   3966 
   3967 	if (op == NBL_READ && lock_type == F_RDLCK)
   3968 		return (0);
   3969 
   3970 	if (offset <= lock_start && lock_start < offset + length)
   3971 		return (1);
   3972 	if (lock_start <= offset && offset <= lock_end)
   3973 		return (1);
   3974 
   3975 	return (0);
   3976 }
   3977 
   3978 #ifdef DEBUG
   3979 static void
   3980 check_active_locks(graph_t *gp)
   3981 {
   3982 	lock_descriptor_t *lock, *lock1;
   3983 	edge_t	*ep;
   3984 
   3985 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
   3986 						lock = lock->l_next) {
   3987 		ASSERT(IS_ACTIVE(lock));
   3988 		ASSERT(NOT_BLOCKED(lock));
   3989 		ASSERT(!IS_BARRIER(lock));
   3990 
   3991 		ep = FIRST_IN(lock);
   3992 
   3993 		while (ep != HEAD(lock)) {
   3994 			ASSERT(IS_SLEEPING(ep->from_vertex));
   3995 			ASSERT(!NOT_BLOCKED(ep->from_vertex));
   3996 			ep = NEXT_IN(ep);
   3997 		}
   3998 
   3999 		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
   4000 					lock1 = lock1->l_next) {
   4001 			if (lock1->l_vnode == lock->l_vnode) {
   4002 			if (BLOCKS(lock1, lock)) {
   4003 				cmn_err(CE_PANIC,
   4004 				    "active lock %p blocks %p",
   4005 				    (void *)lock1, (void *)lock);
   4006 			} else if (BLOCKS(lock, lock1)) {
   4007 				cmn_err(CE_PANIC,
   4008 				    "active lock %p blocks %p",
   4009 				    (void *)lock, (void *)lock1);
   4010 			}
   4011 			}
   4012 		}
   4013 	}
   4014 }
   4015 
   4016 /*
   4017  * Effect: This functions checks to see if the transition from 'old_state' to
   4018  *	'new_state' is a valid one.  It returns 0 if the transition is valid
   4019  *	and 1 if it is not.
   4020  *	For a map of valid transitions, see sys/flock_impl.h
   4021  */
   4022 static int
   4023 check_lock_transition(int old_state, int new_state)
   4024 {
   4025 	switch (old_state) {
   4026 	case FLK_INITIAL_STATE:
   4027 		if ((new_state == FLK_START_STATE) ||
   4028 		    (new_state == FLK_SLEEPING_STATE) ||
   4029 		    (new_state == FLK_ACTIVE_STATE) ||
   4030 		    (new_state == FLK_DEAD_STATE)) {
   4031 			return (0);
   4032 		} else {
   4033 			return (1);
   4034 		}
   4035 	case FLK_START_STATE:
   4036 		if ((new_state == FLK_ACTIVE_STATE) ||
   4037 		    (new_state == FLK_DEAD_STATE)) {
   4038 			return (0);
   4039 		} else {
   4040 			return (1);
   4041 		}
   4042 	case FLK_ACTIVE_STATE:
   4043 		if (new_state == FLK_DEAD_STATE) {
   4044 			return (0);
   4045 		} else {
   4046 			return (1);
   4047 		}
   4048 	case FLK_SLEEPING_STATE:
   4049 		if ((new_state == FLK_GRANTED_STATE) ||
   4050 		    (new_state == FLK_INTERRUPTED_STATE) ||
   4051 		    (new_state == FLK_CANCELLED_STATE)) {
   4052 			return (0);
   4053 		} else {
   4054 			return (1);
   4055 		}
   4056 	case FLK_GRANTED_STATE:
   4057 		if ((new_state == FLK_START_STATE) ||
   4058 		    (new_state == FLK_INTERRUPTED_STATE) ||
   4059 		    (new_state == FLK_CANCELLED_STATE)) {
   4060 			return (0);
   4061 		} else {
   4062 			return (1);
   4063 		}
   4064 	case FLK_CANCELLED_STATE:
   4065 		if ((new_state == FLK_INTERRUPTED_STATE) ||
   4066 		    (new_state == FLK_DEAD_STATE)) {
   4067 			return (0);
   4068 		} else {
   4069 			return (1);
   4070 		}
   4071 	case FLK_INTERRUPTED_STATE:
   4072 		if (new_state == FLK_DEAD_STATE) {
   4073 			return (0);
   4074 		} else {
   4075 			return (1);
   4076 		}
   4077 	case FLK_DEAD_STATE:
   4078 		/* May be set more than once */
   4079 		if (new_state == FLK_DEAD_STATE) {
   4080 			return (0);
   4081 		} else {
   4082 			return (1);
   4083 		}
   4084 	default:
   4085 		return (1);
   4086 	}
   4087 }
   4088 
   4089 static void
   4090 check_sleeping_locks(graph_t *gp)
   4091 {
   4092 	lock_descriptor_t *lock1, *lock2;
   4093 	edge_t *ep;
   4094 	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
   4095 				lock1 = lock1->l_next) {
   4096 				ASSERT(!IS_BARRIER(lock1));
   4097 	for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
   4098 				lock2 = lock2->l_next) {
   4099 		if (lock1->l_vnode == lock2->l_vnode) {
   4100 			if (BLOCKS(lock2, lock1)) {
   4101 				ASSERT(!IS_GRANTED(lock1));
   4102 				ASSERT(!NOT_BLOCKED(lock1));
   4103 				path(lock1, lock2);
   4104 			}
   4105 		}
   4106 	}
   4107 
   4108 	for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
   4109 					lock2 = lock2->l_next) {
   4110 				ASSERT(!IS_BARRIER(lock1));
   4111 		if (lock1->l_vnode == lock2->l_vnode) {
   4112 			if (BLOCKS(lock2, lock1)) {
   4113 				ASSERT(!IS_GRANTED(lock1));
   4114 				ASSERT(!NOT_BLOCKED(lock1));
   4115 				path(lock1, lock2);
   4116 			}
   4117 		}
   4118 	}
   4119 	ep = FIRST_ADJ(lock1);
   4120 	while (ep != HEAD(lock1)) {
   4121 		ASSERT(BLOCKS(ep->to_vertex, lock1));
   4122 		ep = NEXT_ADJ(ep);
   4123 	}
   4124 	}
   4125 }
   4126 
   4127 static int
   4128 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
   4129 {
   4130 	edge_t	*ep;
   4131 	lock_descriptor_t	*vertex;
   4132 	lock_descriptor_t *vertex_stack;
   4133 
   4134 	STACK_INIT(vertex_stack);
   4135 
   4136 	flk_graph_uncolor(lock1->l_graph);
   4137 	ep = FIRST_ADJ(lock1);
   4138 	ASSERT(ep != HEAD(lock1));
   4139 	while (ep != HEAD(lock1)) {
   4140 		if (no_path)
   4141 			ASSERT(ep->to_vertex != lock2);
   4142 		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
   4143 		COLOR(ep->to_vertex);
   4144 		ep = NEXT_ADJ(ep);
   4145 	}
   4146 
   4147 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
   4148 		STACK_POP(vertex_stack, l_dstack);
   4149 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
   4150 						ep = NEXT_ADJ(ep)) {
   4151 			if (COLORED(ep->to_vertex))
   4152 				continue;
   4153 			COLOR(ep->to_vertex);
   4154 			if (ep->to_vertex == lock2)
   4155 				return (1);
   4156 
   4157 			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
   4158 		}
   4159 	}
   4160 	return (0);
   4161 }
   4162 
   4163 static void
   4164 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
   4165 {
   4166 	lock_descriptor_t *lock;
   4167 
   4168 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
   4169 
   4170 	if (lock) {
   4171 		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
   4172 			if (lock->l_flock.l_pid == pid &&
   4173 			    lock->l_flock.l_sysid == sysid)
   4174 				cmn_err(CE_PANIC,
   4175 				    "owner pid %d's lock %p in active queue",
   4176 				    pid, (void *)lock);
   4177 			lock = lock->l_next;
   4178 		}
   4179 	}
   4180 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
   4181 
   4182 	if (lock) {
   4183 		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
   4184 			if (lock->l_flock.l_pid == pid &&
   4185 			    lock->l_flock.l_sysid == sysid)
   4186 				cmn_err(CE_PANIC,
   4187 				    "owner pid %d's lock %p in sleep queue",
   4188 				    pid, (void *)lock);
   4189 			lock = lock->l_next;
   4190 		}
   4191 	}
   4192 }
   4193 
   4194 static int
   4195 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
   4196 {
   4197 	edge_t *ep = FIRST_ADJ(lock1);
   4198 
   4199 	while (ep != HEAD(lock1)) {
   4200 		if (ep->to_vertex == lock2)
   4201 			return (1);
   4202 		else
   4203 			ep = NEXT_ADJ(ep);
   4204 	}
   4205 	return (0);
   4206 }
   4207 
   4208 static int
   4209 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
   4210 {
   4211 	return (!level_two_path(lock1, lock2, 1));
   4212 }
   4213 
   4214 static void
   4215 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
   4216 {
   4217 	if (level_one_path(lock1, lock2)) {
   4218 		if (level_two_path(lock1, lock2, 0) != 0) {
   4219 			cmn_err(CE_WARN,
   4220 			    "one edge one path from lock1 %p lock2 %p",
   4221 			    (void *)lock1, (void *)lock2);
   4222 		}
   4223 	} else if (no_path(lock1, lock2)) {
   4224 		cmn_err(CE_PANIC,
   4225 		    "No path from  lock1 %p to lock2 %p",
   4226 		    (void *)lock1, (void *)lock2);
   4227 	}
   4228 }
   4229 #endif /* DEBUG */
   4230