Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/systm.h>
     27 #include <rpc/auth.h>
     28 #include <rpc/clnt.h>
     29 #include <nfs/nfs4_kprot.h>
     30 #include <nfs/nfs4.h>
     31 #include <nfs/lm.h>
     32 #include <sys/cmn_err.h>
     33 #include <sys/disp.h>
     34 #include <sys/sdt.h>
     35 
     36 #include <sys/pathname.h>
     37 
     38 #include <sys/strsubr.h>
     39 #include <sys/ddi.h>
     40 
     41 #include <sys/vnode.h>
     42 #include <sys/sdt.h>
     43 #include <inet/common.h>
     44 #include <inet/ip.h>
     45 #include <inet/ip6.h>
     46 #include <sys/sdt.h>
     47 
/* NOTE(review): not referenced in the visible part of this file. */
#define	MAX_READ_DELEGATIONS 5

/*
 * Number of attempts rfs4_cbinfo_hold() makes at setting up a new
 * callback path before it gives up and returns NULL.
 */
static int rfs4_max_setup_cb_tries = 5;

#ifdef DEBUG
/* NOTE(review): not referenced in the visible part of this file. */
static int rfs4_test_cbgetattr_fail = 0;
/* Bumped by rfs4_do_cb_null() each time a CB_NULL call succeeds. */
int rfs4_cb_null;
int rfs4_cb_debug;
int rfs4_deleg_debug;
#endif

/* NOTE(review): not referenced in the visible part of this file. */
int mds_cbrecall_no_session = 0;

/* Forward declarations for routines used before they are defined. */
static void rfs4_recall_file(rfs4_file_t *, bool_t, rfs4_client_t *);
static	void		rfs4_revoke_deleg(rfs4_deleg_state_t *);
static	void		rfs41_revoke_deleg(rfs4_deleg_state_t *);
static	void		rfs4_revoke_file(rfs4_file_t *);
static	void		rfs4_cb_chflush(rfs4_cbinfo_t *);
static	CLIENT		*rfs4_cb_getch(rfs4_cbinfo_t *);
static	void		rfs4_cb_freech(rfs4_cbinfo_t *, CLIENT *, bool_t);
static rfs4_deleg_state_t *rfs4_deleg_state(struct compound_state *,
    rfs4_state_t *, open_delegation_type4, int *);
     70 
     71 /*
     72  * Convert a universal address to an transport specific
     73  * address using inet_pton.
     74  */
     75 int
     76 uaddr2sockaddr(int af, char *ua, void *ap, in_port_t *pp)
     77 {
     78 	int dots = 0, i, j, len, k;
     79 	unsigned char c;
     80 	in_port_t port = 0;
     81 
     82 	len = strlen(ua);
     83 
     84 	for (i = len-1; i >= 0; i--) {
     85 
     86 		if (ua[i] == '.')
     87 			dots++;
     88 
     89 		if (dots == 2) {
     90 
     91 			ua[i] = '\0';
     92 			/*
     93 			 * We use k to remember were to stick '.' back, since
     94 			 * ua was kmem_allocateded from the pool len+1.
     95 			 */
     96 			k = i;
     97 			if (inet_pton(af, ua, ap) == 1) {
     98 
     99 				c = 0;
    100 
    101 				for (j = i+1; j < len; j++) {
    102 					if (ua[j] == '.') {
    103 						port = c << 8;
    104 						c = 0;
    105 					} else if (ua[j] >= '0' &&
    106 					    ua[j] <= '9') {
    107 						c *= 10;
    108 						c += ua[j] - '0';
    109 					} else {
    110 						ua[k] = '.';
    111 						return (EINVAL);
    112 					}
    113 				}
    114 				port += c;
    115 
    116 
    117 				/* reset to network order */
    118 				if (af == AF_INET) {
    119 					*(uint32_t *)ap =
    120 					    htonl(*(uint32_t *)ap);
    121 					*pp = htons(port);
    122 				} else {
    123 					int ix;
    124 					uint16_t *sap;
    125 
    126 					for (sap = ap, ix = 0; ix <
    127 					    sizeof (struct in6_addr) /
    128 					    sizeof (uint16_t); ix++)
    129 						sap[ix] = htons(sap[ix]);
    130 
    131 					*pp = htons(port);
    132 				}
    133 
    134 				ua[k] = '.';
    135 				return (0);
    136 			} else {
    137 				ua[k] = '.';
    138 				return (EINVAL);
    139 			}
    140 		}
    141 	}
    142 
    143 	return (EINVAL);
    144 }
    145 
    146 /*
    147  * Update the delegation policy with the
    148  * value of "new_policy"
    149  */
    150 void
    151 rfs4_set_deleg_policy(nfs_server_instance_t *instp,
    152 		srv_deleg_policy_t new_policy)
    153 {
    154 	rw_enter(&instp->deleg_policy_lock, RW_WRITER);
    155 	instp->deleg_policy = new_policy;
    156 	rw_exit(&instp->deleg_policy_lock);
    157 }
    158 
    159 void
    160 rfs4_hold_deleg_policy(nfs_server_instance_t *instp)
    161 {
    162 	rw_enter(&instp->deleg_policy_lock, RW_READER);
    163 }
    164 
    165 void
    166 rfs4_rele_deleg_policy(nfs_server_instance_t *instp)
    167 {
    168 	rw_exit(&instp->deleg_policy_lock);
    169 }
    170 
    171 
    172 /*
    173  * This free function is to be used when the client struct is being
    174  * released and nothing at all is needed of the callback info any
    175  * longer.
    176  */
    177 void
    178 rfs4_cbinfo_free(rfs4_cbinfo_t *cbp)
    179 {
    180 	char *addr = cbp->cb_callback.cb_location.r_addr;
    181 	char *netid = cbp->cb_callback.cb_location.r_netid;
    182 
    183 	/* Free old address if any */
    184 
    185 	if (addr)
    186 		kmem_free(addr, strlen(addr) + 1);
    187 	if (netid)
    188 		kmem_free(netid, strlen(netid) + 1);
    189 
    190 	addr = cbp->cb_newer.cb_callback.cb_location.r_addr;
    191 	netid = cbp->cb_newer.cb_callback.cb_location.r_netid;
    192 
    193 	if (addr)
    194 		kmem_free(addr, strlen(addr) + 1);
    195 	if (netid)
    196 		kmem_free(netid, strlen(netid) + 1);
    197 
    198 	if (cbp->cb_chc_free) {
    199 		rfs4_cb_chflush(cbp);
    200 	}
    201 }
    202 
    203 /*
    204  * The server uses this to check the callback path supplied by the
    205  * client.  The callback connection is marked "in progress" while this
    206  * work is going on and then eventually marked either OK or FAILED.
    207  * This work can be done as part of a separate thread and at the end
    208  * of this the thread will exit or it may be done such that the caller
    209  * will continue with other work.
    210  */
    211 static void
    212 rfs4_do_cb_null(rfs4_client_t *cp)
    213 {
    214 	struct timeval tv;
    215 	CLIENT *ch;
    216 	rfs4_cbstate_t newstate;
    217 	rfs4_cbinfo_t *cbp = &cp->rc_cbinfo;
    218 
    219 	mutex_enter(cbp->cb_lock);
    220 	/* If another thread is doing CB_NULL RPC then return */
    221 	if (cbp->cb_nullcaller == TRUE) {
    222 		mutex_exit(cbp->cb_lock);
    223 		rfs4_client_rele(cp);
    224 		return;
    225 	}
    226 
    227 	/* Mark the cbinfo as having a thread in the NULL callback */
    228 	cbp->cb_nullcaller = TRUE;
    229 
    230 	/*
    231 	 * Are there other threads still using the cbinfo client
    232 	 * handles?  If so, this thread must wait before going and
    233 	 * mucking aroiund with the callback information
    234 	 */
    235 	while (cbp->cb_refcnt != 0)
    236 		cv_wait(cbp->cb_cv_nullcaller, cbp->cb_lock);
    237 
    238 	/*
    239 	 * This thread itself may find that new callback info has
    240 	 * arrived and is set up to handle this case and redrive the
    241 	 * call to the client's callback server.
    242 	 */
    243 retry:
    244 	if (cbp->cb_newer.cb_new == TRUE &&
    245 	    cbp->cb_newer.cb_confirmed == TRUE) {
    246 		char *addr = cbp->cb_callback.cb_location.r_addr;
    247 		char *netid = cbp->cb_callback.cb_location.r_netid;
    248 
    249 		/*
    250 		 * Free the old stuff if it exists; may be the first
    251 		 * time through this path
    252 		 */
    253 		if (addr)
    254 			kmem_free(addr, strlen(addr) + 1);
    255 		if (netid)
    256 			kmem_free(netid, strlen(netid) + 1);
    257 
    258 		/* Move over the addr/netid */
    259 		cbp->cb_callback.cb_location.r_addr =
    260 		    cbp->cb_newer.cb_callback.cb_location.r_addr;
    261 		cbp->cb_newer.cb_callback.cb_location.r_addr = NULL;
    262 		cbp->cb_callback.cb_location.r_netid =
    263 		    cbp->cb_newer.cb_callback.cb_location.r_netid;
    264 		cbp->cb_newer.cb_callback.cb_location.r_netid = NULL;
    265 
    266 		/* Get the program number */
    267 		cbp->cb_callback.cb_program =
    268 		    cbp->cb_newer.cb_callback.cb_program;
    269 		cbp->cb_newer.cb_callback.cb_program = 0;
    270 
    271 		/* Don't forget the protocol's "cb_ident" field */
    272 		cbp->cb_ident = cbp->cb_newer.cb_ident;
    273 		cbp->cb_newer.cb_ident = 0;
    274 
    275 		/* no longer new */
    276 		cbp->cb_newer.cb_new = FALSE;
    277 		cbp->cb_newer.cb_confirmed = FALSE;
    278 
    279 		/* get rid of the old client handles that may exist */
    280 		rfs4_cb_chflush(cbp);
    281 
    282 		cbp->cb_state = CB_NONE;
    283 		cbp->cb_timefailed = 0; /* reset the clock */
    284 		cbp->cb_notified_of_cb_path_down = TRUE;
    285 	}
    286 
    287 	if (cbp->cb_state != CB_NONE) {
    288 		cv_broadcast(cbp->cb_cv);	/* let the others know */
    289 		cbp->cb_nullcaller = FALSE;
    290 		mutex_exit(cbp->cb_lock);
    291 		rfs4_client_rele(cp);
    292 		return;
    293 	}
    294 
    295 	/* mark rfs4_client_t as CALLBACK NULL in progress */
    296 	cbp->cb_state = CB_INPROG;
    297 	mutex_exit(cbp->cb_lock);
    298 
    299 	/* get/generate a client handle */
    300 	if ((ch = rfs4_cb_getch(cbp)) == NULL) {
    301 		mutex_enter(cbp->cb_lock);
    302 		cbp->cb_state = CB_BAD;
    303 		cbp->cb_timefailed = gethrestime_sec(); /* observability */
    304 		goto retry;
    305 	}
    306 
    307 
    308 	tv.tv_sec = 30;
    309 	tv.tv_usec = 0;
    310 	if (clnt_call(ch, CB_NULL, xdr_void, NULL, xdr_void, NULL, tv) != 0) {
    311 		newstate = CB_BAD;
    312 	} else {
    313 		newstate = CB_OK;
    314 #ifdef	DEBUG
    315 		rfs4_cb_null++;
    316 #endif
    317 	}
    318 
    319 	/* Check to see if the client has specified new callback info */
    320 	mutex_enter(cbp->cb_lock);
    321 	rfs4_cb_freech(cbp, ch, TRUE);
    322 	if (cbp->cb_newer.cb_new == TRUE &&
    323 	    cbp->cb_newer.cb_confirmed == TRUE) {
    324 		goto retry;	/* give the CB_NULL another chance */
    325 	}
    326 
    327 	cbp->cb_state = newstate;
    328 	if (cbp->cb_state == CB_BAD)
    329 		cbp->cb_timefailed = gethrestime_sec(); /* observability */
    330 
    331 	cv_broadcast(cbp->cb_cv);	/* start up the other threads */
    332 	cbp->cb_nullcaller = FALSE;
    333 	mutex_exit(cbp->cb_lock);
    334 
    335 	rfs4_client_rele(cp);
    336 }
    337 
    338 /*
    339  * Given a client struct, inspect the callback info to see if the
    340  * callback path is up and available.
    341  *
    342  * If new callback path is available and no one has set it up then
    343  * try to set it up. If setup is not successful after 5 tries (5 secs)
    344  * then gives up and returns NULL.
    345  *
    346  * If callback path is being initialized, then wait for the CB_NULL RPC
    347  * call to occur.
    348  */
    349 static rfs4_cbinfo_t *
    350 rfs4_cbinfo_hold(rfs4_client_t *cp)
    351 {
    352 	rfs4_cbinfo_t *cbp = &cp->rc_cbinfo;
    353 	int retries = 0;
    354 
    355 	mutex_enter(cbp->cb_lock);
    356 
    357 	while (cbp->cb_newer.cb_new == TRUE && cbp->cb_nullcaller == FALSE) {
    358 		/*
    359 		 * Looks like a new callback path may be available and
    360 		 * noone has set it up.
    361 		 */
    362 		mutex_exit(cbp->cb_lock);
    363 		rfs4_dbe_hold(cp->rc_dbe);
    364 		rfs4_do_cb_null(cp); /* caller will release client hold */
    365 
    366 		mutex_enter(cbp->cb_lock);
    367 		/*
    368 		 * If callback path is no longer new, or it's being setup
    369 		 * then stop and wait for it to be done.
    370 		 */
    371 		if (cbp->cb_newer.cb_new == FALSE || cbp->cb_nullcaller == TRUE)
    372 			break;
    373 		mutex_exit(cbp->cb_lock);
    374 
    375 		if (++retries >= rfs4_max_setup_cb_tries)
    376 			return (NULL);
    377 		delay(hz);
    378 		mutex_enter(cbp->cb_lock);
    379 	}
    380 
    381 	/* Is there a thread working on doing the CB_NULL RPC? */
    382 	if (cbp->cb_nullcaller == TRUE)
    383 		cv_wait(cbp->cb_cv, cbp->cb_lock);  /* if so, wait on it */
    384 
    385 	/* If the callback path is not okay (up and running), just quit */
    386 	if (cbp->cb_state != CB_OK) {
    387 		mutex_exit(cbp->cb_lock);
    388 		return (NULL);
    389 	}
    390 
    391 	/* Let someone know we are using the current callback info */
    392 	cbp->cb_refcnt++;
    393 	mutex_exit(cbp->cb_lock);
    394 	return (cbp);
    395 }
    396 
    397 /*
    398  * The caller is done with the callback info.  It may be that the
    399  * caller's RPC failed and the NFSv4 client has actually provided new
    400  * callback information.  If so, let the caller know so they can
    401  * advantage of this and maybe retry the RPC that originally failed.
    402  */
    403 static int
    404 rfs4_cbinfo_rele(rfs4_cbinfo_t *cbp, rfs4_cbstate_t newstate)
    405 {
    406 	int cb_new = FALSE;
    407 
    408 	mutex_enter(cbp->cb_lock);
    409 
    410 	/* The caller gets a chance to mark the callback info as bad */
    411 	if (newstate != CB_NOCHANGE)
    412 		cbp->cb_state = newstate;
    413 	if (newstate == CB_FAILED) {
    414 		cbp->cb_timefailed = gethrestime_sec(); /* observability */
    415 		cbp->cb_notified_of_cb_path_down = FALSE;
    416 	}
    417 
    418 	cbp->cb_refcnt--;	/* no longer using the information */
    419 
    420 	/*
    421 	 * A thread may be waiting on this one to finish and if so,
    422 	 * let it know that it is okay to do the CB_NULL to the
    423 	 * client's callback server.
    424 	 */
    425 	if (cbp->cb_refcnt == 0 && cbp->cb_nullcaller)
    426 		cv_broadcast(cbp->cb_cv_nullcaller);
    427 
    428 	/*
    429 	 * If this is the last thread to use the callback info and
    430 	 * there is new callback information to try and no thread is
    431 	 * there ready to do the CB_NULL, then return true to teh
    432 	 * caller so they can do the CB_NULL
    433 	 */
    434 	if (cbp->cb_refcnt == 0 &&
    435 	    cbp->cb_nullcaller == FALSE &&
    436 	    cbp->cb_newer.cb_new == TRUE &&
    437 	    cbp->cb_newer.cb_confirmed == TRUE)
    438 		cb_new = TRUE;
    439 
    440 	mutex_exit(cbp->cb_lock);
    441 
    442 	return (cb_new);
    443 }
    444 
    445 /*
    446  * Common v4 routine to init a callback client handle
    447  */
    448 
    449 static CLIENT *
    450 cbch_init(struct netbuf *nb, uint32_t cb_program)
    451 {
    452 	struct knetconfig knc;
    453 	vnode_t *vp;
    454 	char *devnam;
    455 	int err = 0;
    456 	CLIENT *ch = NULL;
    457 	struct sockaddr *sa;
    458 
    459 	sa = (struct sockaddr *)nb->buf;
    460 
    461 	if (sa->sa_family == AF_INET) {
    462 		knc.knc_semantics = NC_TPI_COTS;
    463 		knc.knc_protofmly = "inet";
    464 		knc.knc_proto = "tcp";
    465 		devnam = "/dev/tcp";
    466 	} else if (sa->sa_family == AF_INET6) {
    467 		knc.knc_semantics = NC_TPI_COTS;
    468 		knc.knc_protofmly = "inet6";
    469 		knc.knc_proto = "tcp";
    470 		devnam = "/dev/tcp6";
    471 	} else {
    472 		DTRACE_PROBE2(nfss__cb__debug, char *,
    473 		    "cbch_init: unknown transport family", int, sa->sa_family);
    474 
    475 		goto cb_init_out;
    476 	}
    477 
    478 	if (lookupname(devnam, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp) != 0) {
    479 
    480 		DTRACE_PROBE2(nfss__cb__debug, char *,
    481 		    "cbch_init: lookupname failed", int, err);
    482 
    483 		goto cb_init_out;
    484 	}
    485 
    486 	if (vp->v_type != VCHR) {
    487 
    488 		DTRACE_PROBE2(nfss__cb__debug, char *,
    489 		    "cbch_init: v_type not of type VCHR", char *, devnam);
    490 		VN_RELE(vp);
    491 		goto cb_init_out;
    492 	}
    493 
    494 	knc.knc_rdev = vp->v_rdev;
    495 
    496 	VN_RELE(vp);
    497 
    498 	if (err = clnt_tli_kcreate(&knc, nb, cb_program,
    499 	    NFS_CB, 0, 0, curthread->t_cred, &ch)) {
    500 
    501 		DTRACE_PROBE2(nfss__cb__debug, char *,
    502 		    "cbch_init: clnt_tli_kcreate failed", int, err);
    503 		ch = NULL;
    504 	}
    505 
    506 cb_init_out:
    507 	return (ch);
    508 }
    509 
    510 /*
    511  * Given the information in the callback info struct, create a client
    512  * handle that can be used by the server for its callback path.
    513  */
    514 static CLIENT *
    515 rfs4_cbch_init(rfs4_cbinfo_t *cbp)
    516 {
    517 	int useresvport = 0;
    518 	int af;
    519 	int size;
    520 	netaddr4 *naddr;
    521 	void *addr;
    522 	void *taddr;
    523 	in_port_t *pp;
    524 	struct sockaddr_in addr4;
    525 	struct sockaddr_in6 addr6;
    526 	struct netbuf nb;
    527 	CLIENT *ch = NULL;
    528 
    529 	DTRACE_PROBE2(nfss__cb__debug, char *,
    530 	    "rfs4_cbch_init: entry cbp:", rfs4_cbinfo_t *, cbp);
    531 
    532 	mutex_enter(cbp->cb_lock);
    533 
    534 	naddr = (netaddr4 *)&cbp->cb_callback.cb_location;
    535 
    536 	if (naddr->na_r_netid == NULL || naddr->na_r_addr == NULL) {
    537 		goto ch_out;
    538 	}
    539 
    540 	if (strcmp(naddr->na_r_netid, "tcp") == 0) {
    541 		af = AF_INET;
    542 		size = sizeof (addr4);
    543 		bzero(&addr4, size);
    544 		addr4.sin_family = (sa_family_t)af;
    545 		addr = &addr4.sin_addr;
    546 		pp = &addr4.sin_port;
    547 		taddr = &addr4;
    548 	} else if (strcmp(naddr->na_r_netid, "tcp6") == 0) {
    549 		af = AF_INET6;
    550 		size = sizeof (addr6);
    551 		bzero(&addr6, size);
    552 		addr6.sin6_family = (sa_family_t)af;
    553 		addr = &addr6.sin6_addr;
    554 		pp = &addr6.sin6_port;
    555 		taddr = &addr6;
    556 	} else {
    557 		DTRACE_PROBE2(nfss__cb__debug, char *,
    558 		    "rfs4_cbch_init: bad transport", char *,
    559 		    cbp->cb_callback.cb_location.r_netid);
    560 		goto ch_out;
    561 	}
    562 
    563 	if (uaddr2sockaddr(af, naddr->na_r_addr, addr, pp)) {
    564 
    565 		DTRACE_PROBE2(nfss__cb__debug, char *,
    566 		    "rfs4_cbch_init: malformed universal addr: ",
    567 		    void *, naddr->na_r_addr);
    568 
    569 		goto ch_out;
    570 	}
    571 
    572 
    573 	nb.maxlen = nb.len = size;
    574 	nb.buf = (char *)taddr;
    575 
    576 	ch = cbch_init(&nb, cbp->cb_callback.cb_program);
    577 
    578 	/* turn off reserved port usage */
    579 	if (ch != NULL)
    580 		(void) CLNT_CONTROL(ch, CLSET_BINDRESVPORT,
    581 		    (char *)&useresvport);
    582 ch_out:
    583 	mutex_exit(cbp->cb_lock);
    584 	return (ch);
    585 }
    586 
    587 /*
    588  * Iterate over the client handle cache and
    589  * destroy it.
    590  */
    591 static void
    592 rfs4_cb_chflush(rfs4_cbinfo_t *cbp)
    593 {
    594 	CLIENT *ch;
    595 
    596 	while (cbp->cb_chc_free) {
    597 		cbp->cb_chc_free--;
    598 		ch = cbp->cb_chc[cbp->cb_chc_free];
    599 		cbp->cb_chc[cbp->cb_chc_free] = NULL;
    600 		if (ch) {
    601 			if (ch->cl_auth)
    602 				auth_destroy(ch->cl_auth);
    603 			clnt_destroy(ch);
    604 		}
    605 	}
    606 }
    607 
    608 /*
    609  * Return a client handle, either from a the small
    610  * rfs4_client_t cache or one that we just created.
    611  */
    612 static CLIENT *
    613 rfs4_cb_getch(rfs4_cbinfo_t *cbp)
    614 {
    615 	CLIENT *cbch = NULL;
    616 	uint32_t zilch = 0;
    617 
    618 	mutex_enter(cbp->cb_lock);
    619 
    620 	if (cbp->cb_chc_free) {
    621 		cbp->cb_chc_free--;
    622 		cbch = cbp->cb_chc[ cbp->cb_chc_free ];
    623 		mutex_exit(cbp->cb_lock);
    624 		(void) CLNT_CONTROL(cbch, CLSET_XID, (char *)&zilch);
    625 		return (cbch);
    626 	}
    627 
    628 	mutex_exit(cbp->cb_lock);
    629 
    630 	/* none free so make it now */
    631 	cbch = rfs4_cbch_init(cbp);
    632 
    633 	return (cbch);
    634 }
    635 
    636 /*
    637  * Return the client handle to the small cache or
    638  * destroy it.
    639  */
    640 static void
    641 rfs4_cb_freech(rfs4_cbinfo_t *cbp, CLIENT *ch, bool_t lockheld)
    642 {
    643 	if (lockheld == FALSE)
    644 		mutex_enter(cbp->cb_lock);
    645 
    646 	if (cbp->cb_chc_free < RFS4_CBCH_MAX) {
    647 		cbp->cb_chc[ cbp->cb_chc_free++ ] = ch;
    648 		if (lockheld == FALSE)
    649 			mutex_exit(cbp->cb_lock);
    650 		return;
    651 	}
    652 	if (lockheld == FALSE)
    653 		mutex_exit(cbp->cb_lock);
    654 
    655 	/*
    656 	 * cache maxed out of free entries, obliterate
    657 	 * this client handle, destroy it, throw it away.
    658 	 */
    659 	if (ch->cl_auth)
    660 		auth_destroy(ch->cl_auth);
    661 	clnt_destroy(ch);
    662 }
    663 
    664 static CLIENT *
    665 rfs41_cb_chinit(uint32_t cbprog)
    666 {
    667 	CLIENT *ch;
    668 	struct knetconfig knc;
    669 	int err;
    670 
    671 	/*
    672 	 * The dest addr and parts of knc fields passed into
    673 	 * clnt_tli_kcreate() are dummy. The connection is
    674 	 * picked up later and RPC does not really use it to
    675 	 * create connections for 4.1 callbacks.
    676 	 */
    677 
    678 	bzero(&knc, sizeof (struct knetconfig));
    679 
    680 	/*
    681 	 * knc_semantics is important to choose the
    682 	 * right transport type.
    683 	 */
    684 	knc.knc_semantics = NC_TPI_COTS;
    685 	knc.knc_protofmly = "inet";
    686 	knc.knc_proto = "tcp";
    687 
    688 	if (err = clnt_tli_kcreate(&knc, 0, cbprog, NFS_CB, 0, 0,
    689 	    curthread->t_cred, &ch)) {
    690 		DTRACE_PROBE2(nfss__cb__debug, char *,
    691 		    "rfs41_cbch_init: clnt_tli_kcreate failed", int, err);
    692 		ch = NULL;
    693 	}
    694 	if (ch != NULL)
    695 		CLNT_CONTROL(ch, CLSET_CBCLIENT, NULL);
    696 	return (ch);
    697 }
    698 
    699 CLIENT *
    700 rfs41_cb_getch(mds_session_t *sp)
    701 {
    702 	CLIENT *cbch = NULL;
    703 	sess_channel_t *bcp;
    704 	sess_bcsd_t *bsdp;
    705 
    706 	rfs4_dbe_lock(sp->sn_dbe);
    707 	bcp = SNTOBC(sp);
    708 	rfs4_dbe_unlock(sp->sn_dbe);
    709 
    710 	rw_enter(&bcp->cn_lock, RW_READER);
    711 	bsdp = CTOBSD(bcp);
    712 
    713 	rw_enter(&bsdp->bsd_rwlock, RW_WRITER);
    714 	if (bsdp->bsd_ch_free) {
    715 		bsdp->bsd_ch_free--;
    716 		cbch = bsdp->bsd_clnt[bsdp->bsd_ch_free];
    717 	} else {
    718 		cbch = rfs41_cb_chinit(sp->sn_bc.progno);
    719 		CLNT_CONTROL(cbch, CLSET_TAG, (void *)sp->sn_sessid);
    720 	}
    721 
    722 	rw_exit(&bsdp->bsd_rwlock);
    723 	rw_exit(&bcp->cn_lock);
    724 	return (cbch);
    725 }
    726 
    727 void
    728 rfs41_cb_freech(mds_session_t *sp, CLIENT *ch)
    729 {
    730 	sess_channel_t *bcp;
    731 	sess_bcsd_t *bsdp;
    732 
    733 	rfs4_dbe_lock(sp->sn_dbe);
    734 	bcp = SNTOBC(sp);
    735 	rfs4_dbe_unlock(sp->sn_dbe);
    736 
    737 	rw_enter(&bcp->cn_lock, RW_READER);
    738 	bsdp = CTOBSD(bcp);
    739 
    740 	rw_enter(&bsdp->bsd_rwlock, RW_WRITER);
    741 	if (bsdp->bsd_ch_free < RFS4_CBCH_MAX) {
    742 		bsdp->bsd_clnt[bsdp->bsd_ch_free++] = ch;
    743 		rw_exit(&bsdp->bsd_rwlock);
    744 		rw_exit(&bcp->cn_lock);
    745 		return;
    746 	}
    747 
    748 	rw_exit(&bsdp->bsd_rwlock);
    749 	rw_exit(&bcp->cn_lock);
    750 
    751 	/*
    752 	 * cache maxed out of free entries, obliterate
    753 	 * this client handle, destroy it, throw it away.
    754 	 */
    755 	if (ch->cl_auth)
    756 		auth_destroy(ch->cl_auth);
    757 	clnt_destroy(ch);
    758 }
    759 
    760 /*
    761  * Iterate over the session's client handle cache and
    762  * destroy it.
    763  */
    764 void
    765 rfs41_cb_chflush(mds_session_t *sp)
    766 {
    767 	CLIENT *ch;
    768 	sess_channel_t *bcp;
    769 	sess_bcsd_t *bsdp;
    770 
    771 	rfs4_dbe_lock(sp->sn_dbe);
    772 	bcp = SNTOBC(sp);
    773 	rfs4_dbe_unlock(sp->sn_dbe);
    774 
    775 	rw_enter(&bcp->cn_lock, RW_READER);
    776 	bsdp = CTOBSD(bcp);
    777 
    778 	rw_enter(&bsdp->bsd_rwlock, RW_WRITER);
    779 
    780 	while (bsdp->bsd_ch_free) {
    781 		bsdp->bsd_ch_free--;
    782 		ch = bsdp->bsd_clnt[bsdp->bsd_ch_free];
    783 		bsdp->bsd_clnt[bsdp->bsd_ch_free] = NULL;
    784 		if (ch) {
    785 			if (ch->cl_auth)
    786 				auth_destroy(ch->cl_auth);
    787 			clnt_destroy(ch);
    788 		}
    789 	}
    790 
    791 	rw_exit(&bsdp->bsd_rwlock);
    792 }
    793 
    794 /*
    795  * With the supplied callback information - initialize the client
    796  * callback data.  If there is a callback in progress, save the
    797  * callback info so that a thread can pick it up in the future.
    798  */
    799 void
    800 rfs4_client_setcb(rfs4_client_t *cp, cb_client4 *cb, uint32_t cb_ident)
    801 {
    802 	char *addr = NULL;
    803 	char *netid = NULL;
    804 	rfs4_cbinfo_t *cbp = &cp->rc_cbinfo;
    805 	size_t len;
    806 
    807 	/* Set the call back for the client */
    808 	if (cb->cb_location.r_addr && cb->cb_location.r_addr[0] != '\0' &&
    809 	    cb->cb_location.r_netid && cb->cb_location.r_netid[0] != '\0') {
    810 		len = strlen(cb->cb_location.r_addr) + 1;
    811 		addr = kmem_alloc(len, KM_SLEEP);
    812 		bcopy(cb->cb_location.r_addr, addr, len);
    813 		len = strlen(cb->cb_location.r_netid) + 1;
    814 		netid = kmem_alloc(len, KM_SLEEP);
    815 		bcopy(cb->cb_location.r_netid, netid, len);
    816 	}
    817 	/* ready to save the new information but first free old, if exists */
    818 	mutex_enter(cbp->cb_lock);
    819 
    820 	cbp->cb_newer.cb_callback.cb_program = cb->cb_program;
    821 
    822 	if (cbp->cb_newer.cb_callback.cb_location.r_addr != NULL)
    823 		kmem_free(cbp->cb_newer.cb_callback.cb_location.r_addr,
    824 		    strlen(cbp->cb_newer.cb_callback.cb_location.r_addr) + 1);
    825 	cbp->cb_newer.cb_callback.cb_location.r_addr = addr;
    826 
    827 	if (cbp->cb_newer.cb_callback.cb_location.r_netid != NULL)
    828 		kmem_free(cbp->cb_newer.cb_callback.cb_location.r_netid,
    829 		    strlen(cbp->cb_newer.cb_callback.cb_location.r_netid) + 1);
    830 	cbp->cb_newer.cb_callback.cb_location.r_netid = netid;
    831 
    832 	cbp->cb_newer.cb_ident = cb_ident;
    833 
    834 	if (addr && *addr && netid && *netid) {
    835 		cbp->cb_newer.cb_new = TRUE;
    836 		cbp->cb_newer.cb_confirmed = FALSE;
    837 	} else {
    838 		cbp->cb_newer.cb_new = FALSE;
    839 		cbp->cb_newer.cb_confirmed = FALSE;
    840 	}
    841 
    842 	mutex_exit(cbp->cb_lock);
    843 }
    844 
    845 /*
    846  * The server uses this when processing SETCLIENTID_CONFIRM.  Callback
    847  * information may have been provided on SETCLIENTID and this call
    848  * marks that information as confirmed and then starts a thread to
    849  * test the callback path.
    850  */
    851 void
    852 rfs4_deleg_cb_check(rfs4_client_t *cp)
    853 {
    854 	if (cp->rc_cbinfo.cb_newer.cb_new == FALSE)
    855 		return;
    856 
    857 	cp->rc_cbinfo.cb_newer.cb_confirmed = TRUE;
    858 
    859 	rfs4_dbe_hold(cp->rc_dbe); /* hold the client struct for thread */
    860 
    861 	(void) thread_create(NULL, 0, rfs4_do_cb_null, cp, 0, &p0, TS_RUN,
    862 	    minclsyspri);
    863 }
    864 
    865 static void
    866 rfs4args_cb_recall_free(nfs_cb_argop4 *argop)
    867 {
    868 	CB_RECALL4args	*rec_argp;
    869 
    870 	rec_argp = &argop->nfs_cb_argop4_u.opcbrecall;
    871 	if (rec_argp->fh.nfs_fh4_val)
    872 		kmem_free(rec_argp->fh.nfs_fh4_val, rec_argp->fh.nfs_fh4_len);
    873 }
    874 
    875 /* XXX - this only works for one entry in the referring_call_list4 - rick */
    876 void
    877 rfs41args_cb_sequence_free(nfs_cb_argop4 *argop)
    878 {
    879 	CB_SEQUENCE4args	*ap;
    880 	referring_call_list4	*rp;
    881 	uint_t			 len;
    882 
    883 	ap = &argop->nfs_cb_argop4_u.opcbsequence;
    884 	if ((rp = ap->csa_rcall_lval) != NULL) {
    885 		if (rp->rcl_val != NULL) {
    886 			len = rp->rcl_len;
    887 			kmem_free(rp->rcl_val, len * sizeof (referring_call4));
    888 			rp->rcl_val = NULL;
    889 		}
    890 		len = ap->csa_rcall_llen;
    891 		kmem_free(rp, len * sizeof (referring_call_list4));
    892 		ap->csa_rcall_lval = NULL;
    893 	}
    894 }
    895 
    896 /* ARGSUSED */
    897 static void
    898 rfs4args_cb_getattr_free(nfs_cb_argop4 *argop)
    899 {
    900 	CB_GETATTR4args *argp;
    901 
    902 	argp = &argop->nfs_cb_argop4_u.opcbgetattr;
    903 	if (argp->fh.nfs_fh4_val)
    904 		kmem_free(argp->fh.nfs_fh4_val, argp->fh.nfs_fh4_len);
    905 }
    906 
    907 void
    908 rfs4freeargres(CB_COMPOUND4args *args, CB_COMPOUND4res *resp)
    909 {
    910 	int i, arglen;
    911 	nfs_cb_argop4 *argop;
    912 
    913 	/*
    914 	 * First free any special args alloc'd for specific ops.
    915 	 */
    916 	arglen = args->array_len;
    917 	argop = args->array;
    918 	for (i = 0; i < arglen; i++, argop++) {
    919 
    920 		switch (argop->argop) {
    921 		case OP_CB_SEQUENCE:
    922 			rfs41args_cb_sequence_free(argop);
    923 			break;
    924 
    925 		case OP_CB_RECALL:
    926 			rfs4args_cb_recall_free(argop);
    927 			break;
    928 
    929 		case OP_CB_GETATTR:
    930 			rfs4args_cb_getattr_free(argop);
    931 			break;
    932 
    933 		case OP_CB_LAYOUTRECALL:
    934 			break;
    935 
    936 		default:
    937 			return;
    938 		}
    939 	}
    940 
    941 	if (args->tag.utf8string_len > 0)
    942 		UTF8STRING_FREE(args->tag)
    943 
    944 	kmem_free(args->array, arglen * sizeof (nfs_cb_argop4));
    945 	if (resp)
    946 		(void) xdr_free(xdr_CB_COMPOUND4res, (caddr_t)resp);
    947 }
    948 
    949 slotid4
    950 svc_slot_maxslot(mds_session_t *sp)
    951 {
    952 	slotid4		 ms;
    953 	sess_channel_t	*bcp;
    954 	sess_bcsd_t	*bsdp;
    955 
    956 	rfs4_dbe_lock(sp->sn_dbe);
    957 	bcp = SNTOBC(sp);
    958 	rfs4_dbe_unlock(sp->sn_dbe);
    959 
    960 	rw_enter(&bcp->cn_lock, RW_READER);
    961 	if ((bsdp = CTOBSD(bcp)) == NULL)
    962 		cmn_err(CE_PANIC, "svc_slot_maxslot: BC Specific Data Not Set");
    963 
    964 	rw_enter(&bsdp->bsd_rwlock, RW_READER);
    965 	slot_table_query(bsdp->bsd_stok, SLT_MAXSLOT, &ms);
    966 	rw_exit(&bsdp->bsd_rwlock);
    967 
    968 	rw_exit(&bcp->cn_lock);
    969 	return (ms);
    970 }
    971 
    972 /*
    973  * Server-side slot allocations from BC's slot table.
    974  */
    975 slot_ent_t *
    976 svc_slot_alloc(mds_session_t *sp)
    977 {
    978 	slot_ent_t	*p;
    979 	sess_channel_t	*bcp;
    980 	sess_bcsd_t	*bsdp;
    981 
    982 	rfs4_dbe_lock(sp->sn_dbe);
    983 	bcp = SNTOBC(sp);
    984 	rfs4_dbe_unlock(sp->sn_dbe);
    985 
    986 	rw_enter(&bcp->cn_lock, RW_READER);
    987 	if ((bsdp = CTOBSD(bcp)) == NULL)
    988 		cmn_err(CE_PANIC, "svc_slot_alloc: BC Specific Data Not Set");
    989 
    990 	rw_enter(&bsdp->bsd_rwlock, RW_READER);
    991 	(void) slot_alloc(bsdp->bsd_stok, SLT_SLEEP, &p);
    992 	rw_exit(&bsdp->bsd_rwlock);
    993 
    994 	rw_exit(&bcp->cn_lock);
    995 	return (p);
    996 }
    997 
    998 /*
    999  * Server-side slot allocations from BC's slot table.
   1000  */
   1001 void
   1002 svc_slot_free(mds_session_t *sp, slot_ent_t *p)
   1003 {
   1004 	sess_channel_t	*bcp;
   1005 	sess_bcsd_t	*bsdp;
   1006 
   1007 	ASSERT(sp != NULL);
   1008 	ASSERT(p != NULL);
   1009 	rfs4_dbe_lock(sp->sn_dbe);
   1010 	bcp = SNTOBC(sp);
   1011 	rfs4_dbe_unlock(sp->sn_dbe);
   1012 
   1013 	rw_enter(&bcp->cn_lock, RW_READER);
   1014 	if ((bsdp = CTOBSD(bcp)) == NULL)
   1015 		cmn_err(CE_PANIC, "svc_slot_free: BC Specific Data Not Set");
   1016 
   1017 	rw_enter(&bsdp->bsd_rwlock, RW_READER);
   1018 	slot_free(bsdp->bsd_stok, p);
   1019 	rw_exit(&bsdp->bsd_rwlock);
   1020 
   1021 	rw_exit(&bcp->cn_lock);
   1022 }
   1023 
   1024 void
   1025 svc_slot_cb_seqid(CB_COMPOUND4res *resp, slot_ent_t *p)
   1026 {
   1027 	CB_SEQUENCE4res	*rp;
   1028 
   1029 	if (resp == NULL || resp->array == NULL)
   1030 		return;
   1031 
   1032 	ASSERT(resp->array->resop == OP_CB_SEQUENCE);
   1033 	rp = &resp->array->nfs_cb_resop4_u.opcbsequence;
   1034 	if (rp->csr_status == NFS4_OK) {
   1035 		slot_incr_seq(p, 1);
   1036 	}
   1037 }
   1038 
   1039 /*
   1040  * General callback routine for the server to the client.
   1041  */
   1042 static enum clnt_stat
   1043 rfs4_do_callback(rfs4_client_t *cp, CB_COMPOUND4args *args,
   1044     CB_COMPOUND4res *res, struct timeval timeout)
   1045 {
   1046 	rfs4_cbinfo_t *cbp;
   1047 	CLIENT *ch;
   1048 	/* start with this in case cb_getch() fails */
   1049 	enum clnt_stat	stat = RPC_FAILED;
   1050 
   1051 	res->tag.utf8string_val = NULL;
   1052 	res->array = NULL;
   1053 
   1054 retry:
   1055 	cbp = rfs4_cbinfo_hold(cp);
   1056 	if (cbp == NULL)
   1057 		return (stat);
   1058 
   1059 	/* get a client handle */
   1060 	if ((ch = rfs4_cb_getch(cbp)) != NULL) {
   1061 		/*
   1062 		 * reset the cb_ident since it may have changed in
   1063 		 * rfs4_cbinfo_hold()
   1064 		 */
   1065 		args->callback_ident = cbp->cb_ident;
   1066 
   1067 		stat = clnt_call(ch, CB_COMPOUND, xdr_CB_COMPOUND4args_srv,
   1068 		    (caddr_t)args, xdr_CB_COMPOUND4res,
   1069 		    (caddr_t)res, timeout);
   1070 
   1071 		/* free client handle */
   1072 		rfs4_cb_freech(cbp, ch, FALSE);
   1073 	}
   1074 
   1075 	/*
   1076 	 * If the rele says that there may be new callback info then
   1077 	 * retry this sequence and it may succeed as a result of the
   1078 	 * new callback path
   1079 	 */
   1080 	if (rfs4_cbinfo_rele(cbp,
   1081 	    (stat == RPC_SUCCESS ? CB_NOCHANGE : CB_FAILED)) == TRUE)
   1082 		goto retry;
   1083 
   1084 	return (stat);
   1085 }
   1086 
   1087 /*
   1088  * Used by the NFSv4 server to get attributes for a file while
   1089  * handling the case where a file has been write delegated.  For the
   1090  * time being, VOP_GETATTR() is called and CB_GETATTR processing is
   1091  * not undertaken.  This call site is maintained in case the server is
   1092  * updated in the future to handle write delegation space guarantees.
   1093  */
   1094 nfsstat4
   1095 rfs4_vop_getattr(vnode_t *vp, vattr_t *vap, int flag, cred_t *cr)
   1096 {
   1097 
   1098 	int error;
   1099 
   1100 	error = VOP_GETATTR(vp, vap, flag, cr, NULL);
   1101 	return (puterrno4(error));
   1102 }
   1103 
   1104 /*
   1105  * This is used everywhere in the v2/v3 server to allow the
   1106  * integration of all NFS versions and the support of delegation.  For
   1107  * now, just call the VOP_GETATTR().  If the NFSv4 server is enhanced
   1108  * in the future to provide space guarantees for write delegations
   1109  * then this call site should be expanded to interact with the client.
   1110  */
   1111 int
   1112 rfs4_delegated_getattr(vnode_t *vp, vattr_t *vap, int flag, cred_t *cr)
   1113 {
   1114 	return (VOP_GETATTR(vp, vap, flag, cr, NULL));
   1115 }
   1116 
   1117 /*
   1118  * Place the actual cb_recall otw call to client.
   1119  */
   1120 void
   1121 rfs4_do_cb_recall(rfs4_deleg_state_t *dsp, bool_t trunc)
   1122 {
   1123 	CB_COMPOUND4args	cb4_args;
   1124 	CB_COMPOUND4res		cb4_res;
   1125 	CB_RECALL4args		*rec_argp;
   1126 	CB_RECALL4res		*rec_resp;
   1127 	nfs_cb_argop4		*argop;
   1128 	int			numops;
   1129 	int			argoplist_size;
   1130 	struct timeval		timeout;
   1131 	nfs_fh4			*fhp;
   1132 	enum clnt_stat		call_stat;
   1133 
   1134 	/*
   1135 	 * set up the compound args
   1136 	 */
   1137 	numops = 1;	/* CB_RECALL only */
   1138 
   1139 	argoplist_size = numops * sizeof (nfs_cb_argop4);
   1140 	argop = kmem_zalloc(argoplist_size, KM_SLEEP);
   1141 	argop->argop = OP_CB_RECALL;
   1142 	rec_argp = &argop->nfs_cb_argop4_u.opcbrecall;
   1143 
   1144 	(void) str_to_utf8("cb_recall", &cb4_args.tag);
   1145 	cb4_args.minorversion = CB4_MINOR_v0;
   1146 	/* cb4_args.callback_ident is set in rfs4_do_callback() */
   1147 	cb4_args.array_len = numops;
   1148 	cb4_args.array = argop;
   1149 
   1150 	/*
   1151 	 * fill in the args struct
   1152 	 */
   1153 	bcopy(&dsp->rds_delegid.stateid, &rec_argp->stateid, sizeof (stateid4));
   1154 	rec_argp->truncate = trunc;
   1155 
   1156 	fhp = &dsp->rds_finfo->rf_filehandle;
   1157 	rec_argp->fh.nfs_fh4_val = kmem_alloc(sizeof (char) *
   1158 	    fhp->nfs_fh4_len, KM_SLEEP);
   1159 	nfs_fh4_copy(fhp, &rec_argp->fh);
   1160 
   1161 	/* Keep track of when we did this for observability */
   1162 	dsp->rds_time_recalled = gethrestime_sec();
   1163 
   1164 	/*
   1165 	 * Set up the timeout for the callback and make the actual call.
   1166 	 * Timeout will be 80% of the lease period for this server.
   1167 	 */
   1168 	timeout.tv_sec = (dbe_to_instp(dsp->rds_dbe)->lease_period * 80) / 100;
   1169 	timeout.tv_usec = 0;
   1170 
   1171 	DTRACE_NFSV4_3(cb__recall__start, rfs4_client_t *, dsp->rds_client,
   1172 	    rfs4_deleg_state_t *, dsp, CB_RECALL4args *, rec_argp);
   1173 
   1174 	call_stat = rfs4_do_callback(dsp->rds_client, &cb4_args, &cb4_res,
   1175 	    timeout);
   1176 
   1177 	rec_resp = (cb4_res.array_len == 0) ? NULL :
   1178 	    &cb4_res.array[0].nfs_cb_resop4_u.opcbrecall;
   1179 
   1180 	DTRACE_NFSV4_3(cb__recall__done, rfs4_client_t *, dsp->rds_client,
   1181 	    rfs4_deleg_state_t *, dsp, CB_RECALL4res *, rec_resp);
   1182 
   1183 	if (call_stat != RPC_SUCCESS || cb4_res.status != NFS4_OK) {
   1184 		rfs4_return_deleg(dsp, TRUE);
   1185 	}
   1186 
   1187 	rfs4freeargres(&cb4_args, &cb4_res);
   1188 }
   1189 
   1190 bool_t
   1191 rfs41_file_still_delegated(rfs4_deleg_state_t *dsp)
   1192 {
   1193 	rfs4_file_t	*fp;
   1194 
   1195 	ASSERT(dsp != NULL);
   1196 	ASSERT(dsp->rds_finfo != NULL);
   1197 	fp = dsp->rds_finfo;
   1198 
   1199 	/* do we have a delegation on this file? */
   1200 	rfs4_dbe_lock(fp->rf_dbe);
   1201 	if (fp->rf_dinfo->rd_dtype == OPEN_DELEGATE_NONE) {	/* check type */
   1202 		rfs4_dbe_unlock(fp->rf_dbe);
   1203 		return (FALSE);
   1204 	}
   1205 
   1206 	/* check deleg cnt */
   1207 	if (list_next(&fp->rf_delegstatelist, dsp) == NULL) {
   1208 		rfs4_dbe_unlock(fp->rf_dbe);
   1209 		return (FALSE);
   1210 	}
   1211 	rfs4_dbe_unlock(fp->rf_dbe);
   1212 	return (TRUE);
   1213 }
   1214 
   1215 void
   1216 rfs41_cb_seq_rcl_args(CB_SEQUENCE4args *ap, rfs4_deleg_state_t *dsp)
   1217 {
   1218 	referring_call_list4	*rp;
   1219 	referring_call4		*rcp;
   1220 
   1221 	ASSERT(ap != NULL);
   1222 
   1223 	/* construct one entry in referring_call_list4 */
   1224 	ap->csa_rcall_llen = 1;
   1225 	rp = (referring_call_list4 *)kmem_zalloc(sizeof (referring_call_list4),
   1226 	    KM_SLEEP);
   1227 	ap->csa_rcall_lval = rp;
   1228 
   1229 	/* construct one referring_call4 entry in list above */
   1230 	rp->rcl_len = 1;
   1231 	rcp = (referring_call4 *)kmem_zalloc(sizeof (referring_call4),
   1232 	    KM_SLEEP);
   1233 	rp->rcl_val = rcp;
   1234 
   1235 	/* set the necessary arg fields */
   1236 	bcopy(&dsp->rds_rs.sessid, &rp->rcl_sessionid, sizeof (sessionid4));
   1237 	rcp->rc_sequenceid = dsp->rds_rs.seqid;
   1238 	rcp->rc_slotid = dsp->rds_rs.slotno;
   1239 }
   1240 
   1241 void
   1242 rfs41_cb_path_down(mds_session_t *sp, uint32_t sonly)
   1243 {
   1244 	uint32_t	cp_flag = SEQ4_STATUS_CB_PATH_DOWN;
   1245 	uint32_t	sn_flag = SEQ4_STATUS_CB_PATH_DOWN_SESSION;
   1246 	uint32_t	idx = log2(sn_flag);
   1247 
   1248 	ASSERT(sp != NULL);
   1249 	ASSERT(sp->sn_clnt != NULL);
   1250 
   1251 	/* NB - refcnt for both these bits == active cb connections */
   1252 
   1253 	/* session */
   1254 	sp->sn_seq4[idx].ba_sonly = sonly;
   1255 	rfs41_seq4_rele(&sp->sn_seq4, sn_flag);
   1256 
   1257 	/* clientid */
   1258 	rfs41_seq4_rele(&sp->sn_clnt->rc_seq4, cp_flag);
   1259 }
   1260 
   1261 /*
   1262  * Place the actual cb_recall otw call to client. (using slot_XXX api)
   1263  */
   1264 /*ARGSUSED*/
   1265 void
   1266 mds_do_cb_recall(rfs4_deleg_state_t *dsp, bool_t trunc)
   1267 {
   1268 	CB_COMPOUND4args	cb4_args;
   1269 	CB_COMPOUND4res		cb4_res;
   1270 	CB_SEQUENCE4args	*cbsap;
   1271 	CB_RECALL4args		*cbrap;
   1272 	mds_session_t		*sp;
   1273 	slot_ent_t		*p;
   1274 	nfs_cb_argop4		*argops;
   1275 	int			numops;
   1276 	int			argoplist_size;
   1277 	struct timeval		timeout;
   1278 	nfs_fh4			*fhp;
   1279 	enum clnt_stat		call_stat = RPC_FAILED;
   1280 	int			zilch = 0;
   1281 	CLIENT			*ch;
   1282 	int			rcl = 0;	/* referring call list */
   1283 	int			retried = 0;
   1284 	uint32_t		sc = 0;		/* session ctxt */
   1285 
   1286 	/*
   1287 	 * get the session id
   1288 	 */
   1289 	sp = mds_findsession_by_clid(dbe_to_instp(dsp->rds_dbe),
   1290 	    dsp->rds_client->rc_clientid);
   1291 	if (sp == NULL) {
   1292 		/*
   1293 		 * this shouldn't ever happen.  if it does, just
   1294 		 * increment a counter for now and return.
   1295 		 */
   1296 		mds_cbrecall_no_session++;
   1297 		return;
   1298 	}
   1299 
   1300 	/*
   1301 	 * set up the compound args
   1302 	 */
   1303 	numops = 2;	/* CB_SEQUENCE + CB_RECALL */
   1304 	argoplist_size = numops * sizeof (nfs_cb_argop4);
   1305 	argops = kmem_zalloc(argoplist_size, KM_SLEEP);
   1306 
   1307 	argops[0].argop = OP_CB_SEQUENCE;
   1308 	cbsap = &argops[0].nfs_cb_argop4_u.opcbsequence;
   1309 
   1310 	argops[1].argop = OP_CB_RECALL;
   1311 	cbrap = &argops[1].nfs_cb_argop4_u.opcbrecall;
   1312 
   1313 	(void) str_to_utf8("mds_cb_recall", &cb4_args.tag);
   1314 	cb4_args.minorversion = CB4_MINOR_v1;
   1315 
   1316 	cb4_args.callback_ident = sp->sn_bc.progno;
   1317 	cb4_args.array_len = numops;
   1318 	cb4_args.array = argops;
   1319 
   1320 	cb4_res.tag.utf8string_val = NULL;
   1321 	cb4_res.array = NULL;
   1322 
   1323 	/*
   1324 	 * CB_SEQUENCE
   1325 	 */
   1326 	bcopy(sp->sn_sessid, cbsap->csa_sessionid, sizeof (sessionid4));
   1327 	p = svc_slot_alloc(sp);
   1328 	mutex_enter(&p->se_lock);
   1329 	cbsap->csa_slotid = p->se_sltno;
   1330 	cbsap->csa_sequenceid = p->se_seqid;
   1331 	cbsap->csa_highest_slotid = svc_slot_maxslot(sp);
   1332 	cbsap->csa_cachethis = FALSE;
   1333 
   1334 	/*
   1335 	 * Section 2.10.5.3 (draft 23)
   1336 	 *
   1337 	 *		case description		refcnt
   1338 	 *	----------------------------------	------
   1339 	 * 1) rs state gets created (deleg granted) 	1
   1340 	 *    slot is reused				0
   1341 	 *
   1342 	 * 2) rs state gets created (deleg granted)	1
   1343 	 *    cb_seq, cb_recall				2
   1344 	 *    <-- client replies to cb_recall		1
   1345 	 *    eventually, slot is reused		0
   1346 	 *
   1347 	 * 3) rs state gets created (deleg granted)	1
   1348 	 *    cb_seq, cb_recall				2
   1349 	 *    eventually, slot is reused		1
   1350 	 *    <-- client replies to cb_recall		0
   1351 	 *
   1352 	 * Cases 2 & 3 are covered here; case 1 covered as
   1353 	 * part of a new request to op_sequence.
   1354 	 */
   1355 	if (dsp->rds_rs.refcnt == 0) {
   1356 		cbsap->csa_rcall_llen = 0;
   1357 		cbsap->csa_rcall_lval = NULL;
   1358 	} else {
   1359 		rfs41_deleg_rs_hold(dsp);
   1360 		rcl = 1;
   1361 		rfs41_cb_seq_rcl_args(cbsap, dsp);
   1362 	}
   1363 	mutex_exit(&p->se_lock);
   1364 
   1365 	/*
   1366 	 * CB_RECALL
   1367 	 */
   1368 	bcopy(&dsp->rds_delegid.stateid, &cbrap->stateid, sizeof (stateid4));
   1369 	cbrap->truncate = trunc;
   1370 	fhp = &dsp->rds_finfo->rf_filehandle;
   1371 	cbrap->fh.nfs_fh4_val = kmem_alloc(sizeof (char) *
   1372 	    fhp->nfs_fh4_len, KM_SLEEP);
   1373 	nfs_fh4_copy(fhp, &cbrap->fh);
   1374 
   1375 	/*
   1376 	 * Set up the timeout for the callback and make the actual call.
   1377 	 * Timeout will be 80% of the lease period for this server.
   1378 	 */
   1379 	dsp->rds_time_recalled = gethrestime_sec();	/* observability */
   1380 	timeout.tv_sec = (rfs4_lease_time * 80) / 100;
   1381 	timeout.tv_usec = 0;
   1382 
   1383 retry:
   1384 	ch = rfs41_cb_getch(sp);
   1385 	(void) CLNT_CONTROL(ch, CLSET_XID, (char *)&zilch);
   1386 	call_stat = clnt_call(ch, CB_COMPOUND,
   1387 	    xdr_CB_COMPOUND4args_srv, (caddr_t)&cb4_args,
   1388 	    xdr_CB_COMPOUND4res, (caddr_t)&cb4_res, timeout);
   1389 	rfs41_cb_freech(sp, ch);
   1390 
   1391 	/*
   1392 	 * If the back channel is down, then mark session(s) appropriately
   1393 	 * (SEQ4_STATUS_CB_PATH_DOWN). On NFS4ERR_DELAY, retry the callback
   1394 	 * after a lease period; if that _still_ results in an error, revoke
   1395 	 * the delegation and assert SEQ4_STATUS_RECALLABLE_STATE_REVOKED
   1396 	 * section 10.4.5 (draft-23). As per Section 8.3 (d23), it's up to
   1397 	 * the client to figure out 'which' stateid got revoked.
   1398 	 */
   1399 	if (call_stat != RPC_SUCCESS) {
   1400 		if (!retried)
   1401 			delay(SEC_TO_TICK(rfs4_lease_time));
   1402 
   1403 		if (rfs41_file_still_delegated(dsp)) {
   1404 			if (!retried) {
   1405 				retried = 1;
   1406 				goto retry;
   1407 			}
   1408 
   1409 			/*
   1410 			 * We want to make sure that the delegation is
   1411 			 * still valid lest we assert a SEQ4 flag that
   1412 			 * will never be turned off.
   1413 			 */
   1414 			rfs41_revoke_deleg(dsp);
   1415 		}
   1416 		sc = (call_stat == RPC_CANTSEND || call_stat == RPC_CANTRECV);
   1417 		rfs41_cb_path_down(sp, sc);
   1418 		goto done;
   1419 
   1420 	} else if (cb4_res.status != NFS4_OK) {
   1421 		switch (cb4_res.status) {
   1422 		case NFS4ERR_BADHANDLE:
   1423 		case NFS4ERR_BADXDR:
   1424 		case NFS4ERR_OP_NOT_IN_SESSION:
   1425 		case NFS4ERR_REQ_TOO_BIG:
   1426 		case NFS4ERR_TOO_MANY_OPS:
   1427 			/* What do we do when it's our own fault ? */
   1428 			break;
   1429 
   1430 		/* XXX - rick: NFS4ERR_BAD_STATEID should also retry */
   1431 		/* case NFS4ERR_BAD_STATEID: */
   1432 		case NFS4ERR_DELAY:
   1433 			if (!retried)
   1434 				delay(SEC_TO_TICK(rfs4_lease_time));
   1435 
   1436 			if (!rfs41_file_still_delegated(dsp))
   1437 				break;
   1438 
   1439 			if (!retried) {
   1440 				retried = 1;
   1441 				goto retry;
   1442 			}
   1443 			/* FALLTHROUGH */
   1444 
   1445 		case NFS4ERR_BAD_STATEID:	/* XXX see above */
   1446 		default:
   1447 			if (rfs41_file_still_delegated(dsp))
   1448 				rfs41_revoke_deleg(dsp);
   1449 			break;
   1450 		}
   1451 	}
   1452 	svc_slot_cb_seqid(&cb4_res, p);
   1453 done:
   1454 	if (rcl)
   1455 		rfs41_deleg_rs_rele(dsp);
   1456 	svc_slot_free(sp, p);
   1457 
   1458 	rfs4freeargres(&cb4_args, &cb4_res);
   1459 	rfs41_session_rele(sp);
   1460 }
   1461 
   1462 struct recall_arg {
   1463 	rfs4_deleg_state_t *dsp;
   1464 	void (*recall)(rfs4_deleg_state_t *, bool_t trunc);
   1465 	bool_t trunc;
   1466 };
   1467 
   1468 static void
   1469 do_recall(struct recall_arg *arg)
   1470 {
   1471 	rfs4_deleg_state_t *dsp = arg->dsp;
   1472 	rfs4_file_t *fp = dsp->rds_finfo;
   1473 	callb_cpr_t cpr_info;
   1474 	kmutex_t cpr_lock;
   1475 
   1476 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
   1477 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recall");
   1478 
   1479 	/*
   1480 	 * It is possible that before this thread starts
   1481 	 * the client has send us a return_delegation, and
   1482 	 * if that is the case we do not need to send the
   1483 	 * recall callback.
   1484 	 */
   1485 	if (dsp->rds_dtype != OPEN_DELEGATE_NONE) {
   1486 		DTRACE_PROBE3(nfss__i__recall,
   1487 		    struct recall_arg *, arg,
   1488 		    struct rfs4_deleg_state_t *, dsp,
   1489 		    struct rfs4_file_t *, fp);
   1490 
   1491 		if (arg->recall)
   1492 			(void) (*arg->recall)(dsp, arg->trunc);
   1493 	}
   1494 
   1495 	mutex_enter(fp->rf_dinfo->rd_recall_lock);
   1496 	/*
   1497 	 * Recall count may go negative if the parent thread that is
   1498 	 * creating the individual callback threads does not modify
   1499 	 * the recall_count field before the callback thread actually
   1500 	 * gets a response from the CB_RECALL
   1501 	 */
   1502 	fp->rf_dinfo->rd_recall_count--;
   1503 	if (fp->rf_dinfo->rd_recall_count == 0)
   1504 		cv_signal(fp->rf_dinfo->rd_recall_cv);
   1505 	mutex_exit(fp->rf_dinfo->rd_recall_lock);
   1506 
   1507 	mutex_enter(&cpr_lock);
   1508 	CALLB_CPR_EXIT(&cpr_info);
   1509 	mutex_destroy(&cpr_lock);
   1510 
   1511 	rfs4_deleg_state_rele(dsp); /* release the hold for this thread */
   1512 
   1513 	kmem_free(arg, sizeof (struct recall_arg));
   1514 }
   1515 
   1516 struct master_recall_args {
   1517     rfs4_file_t *fp;
   1518     void (*recall)(rfs4_deleg_state_t *, bool_t);
   1519     bool_t trunc;
   1520 };
   1521 
   1522 static void
   1523 do_recall_file(struct master_recall_args *map)
   1524 {
   1525 	rfs4_file_t *fp = map->fp;
   1526 	rfs4_deleg_state_t *dsp;
   1527 	struct recall_arg *arg;
   1528 	callb_cpr_t cpr_info;
   1529 	kmutex_t cpr_lock;
   1530 	int32_t recall_count;
   1531 	nfs_server_instance_t *instp;
   1532 
   1533 	rfs4_dbe_lock(fp->rf_dbe);
   1534 
   1535 	/* Recall already in progress ? */
   1536 	mutex_enter(fp->rf_dinfo->rd_recall_lock);
   1537 	if (fp->rf_dinfo->rd_recall_count != 0) {
   1538 		mutex_exit(fp->rf_dinfo->rd_recall_lock);
   1539 		rfs4_dbe_rele_nolock(fp->rf_dbe);
   1540 		rfs4_dbe_unlock(fp->rf_dbe);
   1541 		kmem_free(map, sizeof (struct master_recall_args));
   1542 		return;
   1543 	}
   1544 
   1545 	mutex_exit(fp->rf_dinfo->rd_recall_lock);
   1546 
   1547 	instp = dbe_to_instp(fp->rf_dbe);
   1548 
   1549 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
   1550 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,	"v4RecallFile");
   1551 
   1552 	/*
   1553 	 * iterate over the file delegation list and
   1554 	 * recall..
   1555 	 */
   1556 	recall_count = 0;
   1557 	for (dsp = list_head(&fp->rf_delegstatelist); dsp != NULL;
   1558 	    dsp = list_next(&fp->rf_delegstatelist, dsp)) {
   1559 
   1560 		rfs4_dbe_lock(dsp->rds_dbe);
   1561 		/*
   1562 		 * if this delegation state
   1563 		 * is being reaped skip it
   1564 		 */
   1565 		if (rfs4_dbe_is_invalid(dsp->rds_dbe)) {
   1566 			rfs4_dbe_unlock(dsp->rds_dbe);
   1567 			continue;
   1568 		}
   1569 
   1570 		/* hold for receiving thread */
   1571 		rfs4_dbe_hold(dsp->rds_dbe);
   1572 		rfs4_dbe_unlock(dsp->rds_dbe);
   1573 
   1574 		arg = kmem_alloc(sizeof (struct recall_arg), KM_SLEEP);
   1575 		arg->recall = instp->deleg_cbrecall;
   1576 		arg->trunc = map->trunc;
   1577 		arg->dsp = dsp;
   1578 
   1579 		recall_count++;
   1580 
   1581 		(void) thread_create(NULL, 0, do_recall, arg, 0, &p0, TS_RUN,
   1582 		    minclsyspri);
   1583 	}
   1584 
   1585 	rfs4_dbe_unlock(fp->rf_dbe);
   1586 
   1587 	mutex_enter(fp->rf_dinfo->rd_recall_lock);
   1588 	/*
   1589 	 * Recall count may go negative if the parent thread that is
   1590 	 * creating the individual callback threads does not modify
   1591 	 * the recall_count field before the callback thread actually
   1592 	 * gets a response from the CB_RECALL
   1593 	 */
   1594 	fp->rf_dinfo->rd_recall_count += recall_count;
   1595 	while (fp->rf_dinfo->rd_recall_count)
   1596 		cv_wait(fp->rf_dinfo->rd_recall_cv,
   1597 		    fp->rf_dinfo->rd_recall_lock);
   1598 
   1599 	mutex_exit(fp->rf_dinfo->rd_recall_lock);
   1600 
   1601 	DTRACE_PROBE1(nfss__i__recall_done, rfs4_file_t *, fp);
   1602 	rfs4_file_rele(fp);
   1603 	kmem_free(map, sizeof (struct master_recall_args));
   1604 	mutex_enter(&cpr_lock);
   1605 	CALLB_CPR_EXIT(&cpr_info);
   1606 	mutex_destroy(&cpr_lock);
   1607 }
   1608 
   1609 static void
   1610 rfs4_recall_file(rfs4_file_t *fp, bool_t trunc, rfs4_client_t *cp)
   1611 {
   1612 	struct master_recall_args *args;
   1613 
   1614 	rfs4_dbe_lock(fp->rf_dbe);
   1615 	if (fp->rf_dinfo->rd_dtype == OPEN_DELEGATE_NONE) {
   1616 		rfs4_dbe_unlock(fp->rf_dbe);
   1617 		return;
   1618 	}
   1619 	rfs4_dbe_hold(fp->rf_dbe);	/* hold for new thread */
   1620 
   1621 	/*
   1622 	 * Mark the time we started the recall processing.
   1623 	 * If it has been previously recalled, do not reset the
   1624 	 * timer since this is used for the revocation decision.
   1625 	 */
   1626 	if (fp->rf_dinfo->rd_time_recalled == 0)
   1627 		fp->rf_dinfo->rd_time_recalled = gethrestime_sec();
   1628 	fp->rf_dinfo->rd_ever_recalled = TRUE; /* used for policy decision */
   1629 	/* Client causing recall not always available */
   1630 	if (cp)
   1631 		fp->rf_dinfo->rd_conflicted_client = cp->rc_clientid;
   1632 
   1633 	rfs4_dbe_unlock(fp->rf_dbe);
   1634 
   1635 	args = kmem_alloc(sizeof (struct master_recall_args), KM_SLEEP);
   1636 	args->fp = fp;
   1637 	args->recall = NULL;
   1638 	args->trunc = trunc;
   1639 
   1640 	(void) thread_create(NULL, 0, do_recall_file, args, 0, &p0, TS_RUN,
   1641 	    minclsyspri);
   1642 }
   1643 
   1644 void
   1645 rfs4_recall_deleg(rfs4_file_t *fp, bool_t trunc, rfs4_client_t *cp)
   1646 {
   1647 	time_t elapsed1, elapsed2;
   1648 	time_t lease;
   1649 
   1650 	lease = dbe_to_instp(fp->rf_dbe)->lease_period;
   1651 
   1652 	if (fp->rf_dinfo->rd_time_recalled != 0) {
   1653 		elapsed1 = gethrestime_sec() - fp->rf_dinfo->rd_time_recalled;
   1654 		elapsed2 = gethrestime_sec() - fp->rf_dinfo->rd_time_lastwrite;
   1655 
   1656 		/* First check to see if a revocation should occur */
   1657 		if (elapsed1 > lease && elapsed2 > lease) {
   1658 			rfs4_revoke_file(fp);
   1659 			return;
   1660 		}
   1661 		/*
   1662 		 * Next check to see if a recall should be done again
   1663 		 * so quickly.
   1664 		 */
   1665 		if (elapsed1 <= ((lease * 20) / 100))
   1666 			return;
   1667 	}
   1668 	rfs4_recall_file(fp, trunc, cp);
   1669 }
   1670 
   1671 /*
   1672  * rfs4_check_recall is called from rfs4_do_open to determine if the current
   1673  * open conflicts with the delegation.
   1674  * Return true if we need recall otherwise false.
   1675  * Assumes entry locks for sp and sp->rs_finfo are held.
   1676  */
   1677 bool_t
   1678 rfs4_check_recall(rfs4_state_t *sp, uint32_t access)
   1679 {
   1680 	open_delegation_type4 dtype = sp->rs_finfo->rf_dinfo->rd_dtype;
   1681 
   1682 	switch (dtype) {
   1683 	case OPEN_DELEGATE_NONE:
   1684 		/* Not currently delegated so there is nothing to do */
   1685 		return (FALSE);
   1686 	case OPEN_DELEGATE_READ:
   1687 		/*
   1688 		 * If the access is only asking for READ then there is
   1689 		 * no conflict and nothing to do.  If it is asking
   1690 		 * for write, then there will be conflict and the read
   1691 		 * delegation should be recalled.
   1692 		 */
   1693 		if (access == OPEN4_SHARE_ACCESS_READ)
   1694 			return (FALSE);
   1695 		else
   1696 			return (TRUE);
   1697 	case OPEN_DELEGATE_WRITE:
   1698 		/* Check to see if this client has the delegation */
   1699 		return (rfs4_is_deleg(sp));
   1700 	}
   1701 
   1702 	return (FALSE);
   1703 }
   1704 
   1705 /*
   1706  * Return the "best" allowable delegation available given the current
   1707  * delegation type and the desired access and deny modes on the file.
   1708  * At the point that this routine is called we know that the access and
   1709  * deny modes are consistent with the file modes.
   1710  */
   1711 static open_delegation_type4
   1712 rfs4_check_delegation(rfs4_state_t *sp, rfs4_file_t *fp)
   1713 {
   1714 	open_delegation_type4 dtype = fp->rf_dinfo->rd_dtype;
   1715 	uint32_t access = sp->rs_share_access;
   1716 	uint32_t deny = sp->rs_share_deny;
   1717 	int readcnt = 0;
   1718 	int writecnt = 0;
   1719 
   1720 	switch (dtype) {
   1721 	case OPEN_DELEGATE_NONE:
   1722 		/*
   1723 		 * Determine if more than just this OPEN have the file
   1724 		 * open and if so, no delegation may be provided to
   1725 		 * the client.
   1726 		 */
   1727 		if (access & OPEN4_SHARE_ACCESS_WRITE)
   1728 			writecnt++;
   1729 		if (access & OPEN4_SHARE_ACCESS_READ)
   1730 			readcnt++;
   1731 
   1732 		if (fp->rf_access_read > readcnt ||
   1733 		    fp->rf_access_write > writecnt)
   1734 			return (OPEN_DELEGATE_NONE);
   1735 
   1736 		/*
   1737 		 * If the client is going to write, or if the client
   1738 		 * has exclusive access, return a write delegation.
   1739 		 */
   1740 		if ((access & OPEN4_SHARE_ACCESS_WRITE) ||
   1741 		    (deny & (OPEN4_SHARE_DENY_READ | OPEN4_SHARE_DENY_WRITE)))
   1742 			return (OPEN_DELEGATE_WRITE);
   1743 		/*
   1744 		 * If we don't want to write or we've haven't denied read
   1745 		 * access to others, return a read delegation.
   1746 		 */
   1747 		if ((access & ~OPEN4_SHARE_ACCESS_WRITE) ||
   1748 		    (deny & ~OPEN4_SHARE_DENY_READ))
   1749 			return (OPEN_DELEGATE_READ);
   1750 
   1751 		/* Shouldn't get here */
   1752 		return (OPEN_DELEGATE_NONE);
   1753 
   1754 	case OPEN_DELEGATE_READ:
   1755 		/*
   1756 		 * If the file is delegated for read but we wan't to
   1757 		 * write or deny others to read then we can't delegate
   1758 		 * the file. We shouldn't get here since the delegation should
   1759 		 * have been recalled already.
   1760 		 */
   1761 		if ((access & OPEN4_SHARE_ACCESS_WRITE) ||
   1762 		    (deny & OPEN4_SHARE_DENY_READ))
   1763 			return (OPEN_DELEGATE_NONE);
   1764 		return (OPEN_DELEGATE_READ);
   1765 
   1766 	case OPEN_DELEGATE_WRITE:
   1767 		return (OPEN_DELEGATE_WRITE);
   1768 	}
   1769 
   1770 	/* Shouldn't get here */
   1771 	return (OPEN_DELEGATE_NONE);
   1772 }
   1773 
   1774 /*
   1775  * Given the desired delegation type and the "history" of the file
   1776  * determine the actual delegation type to return.
   1777  */
   1778 static open_delegation_type4
   1779 rfs4_delegation_policy(nfs_server_instance_t *instp,
   1780 	open_delegation_type4 dtype,
   1781 	rfs4_dinfo_t *dinfo, clientid4 cid)
   1782 {
   1783 	time_t elapsed;
   1784 
   1785 	if (instp->deleg_policy != SRV_NORMAL_DELEGATE)
   1786 		return (OPEN_DELEGATE_NONE);
   1787 
   1788 	/*
   1789 	 * Has this file/delegation ever been recalled?  If not then
   1790 	 * no further checks for a delegation race need to be done.
   1791 	 * However if a recall has occurred, then check to see if a
   1792 	 * client has caused its own delegation recall to occur.  If
   1793 	 * not, then has a delegation for this file been returned
   1794 	 * recently?  If so, then do not assign a new delegation to
   1795 	 * avoid a "delegation race" between the original client and
   1796 	 * the new/conflicting client.
   1797 	 */
   1798 	if (dinfo->rd_ever_recalled == TRUE) {
   1799 		if (dinfo->rd_conflicted_client != cid) {
   1800 			elapsed = gethrestime_sec() - dinfo->rd_time_returned;
   1801 			if (elapsed < instp->lease_period)
   1802 				return (OPEN_DELEGATE_NONE);
   1803 		}
   1804 	}
   1805 
   1806 	/* Limit the number of read grants */
   1807 	if (dtype == OPEN_DELEGATE_READ &&
   1808 	    dinfo->rd_rdgrants > MAX_READ_DELEGATIONS)
   1809 		return (OPEN_DELEGATE_NONE);
   1810 
   1811 	/*
   1812 	 * Should consider limiting total number of read/write
   1813 	 * delegations the server will permit.
   1814 	 */
   1815 
   1816 	return (dtype);
   1817 }
   1818 
   1819 /*
   1820  * Try and grant a delegation for an open give the state. The routine
   1821  * returns the delegation type granted. This could be OPEN_DELEGATE_NONE.
   1822  *
   1823  * The state and associate file entry must be locked
   1824  */
   1825 rfs4_deleg_state_t *
   1826 rfs4_grant_delegation(struct compound_state *cs,
   1827 		delegreq_t dreq, rfs4_state_t *sp, int *recall)
   1828 {
   1829 	rfs4_file_t *fp = sp->rs_finfo;
   1830 	open_delegation_type4 dtype;
   1831 	int no_delegation;
   1832 
   1833 	ASSERT(rfs4_dbe_islocked(sp->rs_dbe));
   1834 	ASSERT(rfs4_dbe_islocked(fp->rf_dbe));
   1835 
   1836 	/* Is the server even providing delegations? */
   1837 	if (cs->instp->deleg_policy == SRV_NEVER_DELEGATE || dreq == DELEG_NONE)
   1838 		return (NULL);
   1839 
   1840 	/* Check to see if delegations have been temporarily disabled */
   1841 	mutex_enter(&cs->instp->deleg_lock);
   1842 	no_delegation = cs->instp->deleg_disabled;
   1843 	mutex_exit(&cs->instp->deleg_lock);
   1844 
   1845 	if (no_delegation)
   1846 		return (NULL);
   1847 
   1848 	/* Don't grant a delegation if a deletion is impending. */
   1849 	if (fp->rf_dinfo->rd_hold_grant > 0) {
   1850 		return (NULL);
   1851 	}
   1852 
   1853 	/*
   1854 	 * Don't grant a delegation if there are any lock manager
   1855 	 * (NFSv2/v3) locks for the file.  This is a bit of a hack (e.g.,
   1856 	 * if there are only read locks we should be able to grant a
   1857 	 * read-only delegation), but it's good enough for now.
   1858 	 *
   1859 	 * MT safety: the lock manager checks for conflicting delegations
   1860 	 * before processing a lock request.  That check will block until
   1861 	 * we are done here.  So if the lock manager acquires a lock after
   1862 	 * we decide to grant the delegation, the delegation will get
   1863 	 * immediately recalled (if there's a conflict), so we're safe.
   1864 	 */
   1865 	if (lm_vp_active(fp->rf_vp)) {
   1866 		return (NULL);
   1867 	}
   1868 
   1869 	/*
   1870 	 * Based on the type of delegation request passed in, take the
   1871 	 * appropriate action (DELEG_NONE is handled above)
   1872 	 */
   1873 	switch (dreq) {
   1874 
   1875 	case DELEG_READ:
   1876 	case DELEG_WRITE:
   1877 		/*
   1878 		 * The server "must" grant the delegation in this case.
   1879 		 * Client is using open previous
   1880 		 */
   1881 		dtype = (open_delegation_type4)dreq;
   1882 		*recall = 1;
   1883 		break;
   1884 	case DELEG_ANY:
   1885 		/*
   1886 		 * If a valid callback path does not exist, no delegation may
   1887 		 * be granted.
   1888 		 */
   1889 		if ((*cs->instp->deleg_cbcheck)(sp) != CB_OK)
   1890 			return (NULL);
   1891 
   1892 		/*
   1893 		 * If the original operation which caused time_rm_delayed
   1894 		 * to be set hasn't been retried and completed for one
   1895 		 * full lease period, clear it and allow delegations to
   1896 		 * get granted again.
   1897 		 */
   1898 		if (fp->rf_dinfo->rd_time_rm_delayed > 0 &&
   1899 		    gethrestime_sec() >
   1900 		    fp->rf_dinfo->rd_time_rm_delayed + cs->instp->lease_period)
   1901 			fp->rf_dinfo->rd_time_rm_delayed = 0;
   1902 
   1903 		/*
   1904 		 * If we are waiting for a delegation to be returned then
   1905 		 * don't delegate this file. We do this for correctness as
   1906 		 * well as if the file is being recalled we would likely
   1907 		 * recall this file again.
   1908 		 */
   1909 
   1910 		if (fp->rf_dinfo->rd_time_recalled != 0 ||
   1911 		    fp->rf_dinfo->rd_time_rm_delayed != 0)
   1912 			return (NULL);
   1913 
   1914 		/* Get the "best" delegation candidate */
   1915 		dtype = rfs4_check_delegation(sp, fp);
   1916 
   1917 		if (dtype == OPEN_DELEGATE_NONE)
   1918 			return (NULL);
   1919 
   1920 		/*
   1921 		 * Based on policy and the history of the file get the
   1922 		 * actual delegation.
   1923 		 */
   1924 		dtype = rfs4_delegation_policy(cs->instp, dtype, fp->rf_dinfo,
   1925 		    sp->rs_owner->ro_client->rc_clientid);
   1926 
   1927 		if (dtype == OPEN_DELEGATE_NONE)
   1928 			return (NULL);
   1929 		break;
   1930 	default:
   1931 		return (NULL);
   1932 	}
   1933 
   1934 	/* set the delegation for the state */
   1935 	return (rfs4_deleg_state(cs, sp, dtype, recall));
   1936 }
   1937 
   1938 void
   1939 rfs4_set_deleg_response(rfs4_deleg_state_t *dsp, open_delegation4 *dp,
   1940     nfsace4 *ace,  int recall)
   1941 {
   1942 	open_write_delegation4 *wp;
   1943 	open_read_delegation4 *rp;
   1944 	nfs_space_limit4 *spl;
   1945 	nfsace4 nace;
   1946 
   1947 	/*
   1948 	 * We need to allocate a new copy of the who string.
   1949 	 * this string will be freed by the rfs4_op_open dis_resfree
   1950 	 * routine. We need to do this allocation since replays will
   1951 	 * be allocated and rfs4_compound can't tell the difference from
   1952 	 * a replay and an inital open. N.B. if an ace is passed in, it
   1953 	 * the caller's responsibility to free it.
   1954 	 */
   1955 
   1956 	if (ace == NULL) {
   1957 		/*
   1958 		 * Default is to deny all access, the client will have
   1959 		 * to contact the server.  XXX Do we want to actually
   1960 		 * set a deny for every one, or do we simply want to
   1961 		 * construct an entity that will match no one?
   1962 		 */
   1963 		nace.type = ACE4_ACCESS_DENIED_ACE_TYPE;
   1964 		nace.flag = 0;
   1965 		nace.access_mask = ACE4_VALID_MASK_BITS;
   1966 		(void) str_to_utf8(ACE4_WHO_EVERYONE, &nace.who);
   1967 	} else {
   1968 		nace.type = ace->type;
   1969 		nace.flag = ace->flag;
   1970 		nace.access_mask = ace->access_mask;
   1971 		(void) utf8_copy(&ace->who, &nace.who);
   1972 	}
   1973 
   1974 	dp->delegation_type = dsp->rds_dtype;
   1975 
   1976 	switch (dsp->rds_dtype) {
   1977 	case OPEN_DELEGATE_NONE:
   1978 		break;
   1979 	case OPEN_DELEGATE_READ:
   1980 		rp = &dp->open_delegation4_u.read;
   1981 		rp->stateid = dsp->rds_delegid.stateid;
   1982 		rp->recall = (bool_t)recall;
   1983 		rp->permissions = nace;
   1984 		break;
   1985 	case OPEN_DELEGATE_WRITE:
   1986 		wp = &dp->open_delegation4_u.write;
   1987 		wp->stateid = dsp->rds_delegid.stateid;
   1988 		wp->recall = (bool_t)recall;
   1989 		spl = &wp->space_limit;
   1990 		spl->limitby = NFS_LIMIT_SIZE;
   1991 		spl->nfs_space_limit4_u.filesize = 0;
   1992 		wp->permissions = nace;
   1993 		break;
   1994 	}
   1995 }
   1996 
   1997 /*
   1998  * Check if the file is delegated via the provided file struct.
   1999  * Return TRUE if it is delegated.  This is intended for use by
   2000  * the v4 server.  The v2/v3 server code should use rfs4_check_delegated().
   2001  *
   2002  * Note that if the file is found to have a delegation, it is
   2003  * recalled, unless the clientid of the caller matches the clientid of the
   2004  * delegation. If the caller has specified, there is a slight delay
   2005  * inserted in the hopes that the delegation will be returned quickly.
   2006  */
   2007 bool_t
   2008 rfs4_check_delegated_byfp(nfs_server_instance_t *instp,
   2009     int mode, rfs4_file_t *fp, bool_t trunc, bool_t do_delay,
   2010     bool_t is_rm, clientid4 *cp)
   2011 {
   2012 	rfs4_deleg_state_t *dsp;
   2013 
   2014 	/* Is delegation enabled? */
   2015 	if (instp->deleg_policy == SRV_NEVER_DELEGATE)
   2016 		return (FALSE);
   2017 
   2018 	/* do we have a delegation on this file? */
   2019 	rfs4_dbe_lock(fp->rf_dbe);
   2020 	if (fp->rf_dinfo->rd_dtype == OPEN_DELEGATE_NONE) {
   2021 		if (is_rm)
   2022 			fp->rf_dinfo->rd_hold_grant++;
   2023 		rfs4_dbe_unlock(fp->rf_dbe);
   2024 		return (FALSE);
   2025 	}
   2026 	/*
   2027 	 * do we have a write delegation on this file or are we
   2028 	 * requesting write access to a file with any type of existing
   2029 	 * delegation?
   2030 	 */
   2031 	if (mode == FWRITE || fp->rf_dinfo->rd_dtype == OPEN_DELEGATE_WRITE) {
   2032 		if (cp != NULL) {
   2033 			dsp = list_head(&fp->rf_delegstatelist);
   2034 			if (dsp == NULL) {
   2035 				rfs4_dbe_unlock(fp->rf_dbe);
   2036 				return (FALSE);
   2037 			}
   2038 			/*
   2039 			 * Does the requestor already own the delegation?
   2040 			 */
   2041 			if (dsp->rds_client->rc_clientid == *(cp)) {
   2042 				rfs4_dbe_unlock(fp->rf_dbe);
   2043 				return (FALSE);
   2044 			}
   2045 		}
   2046 
   2047 		rfs4_dbe_unlock(fp->rf_dbe);
   2048 		rfs4_recall_deleg(fp, trunc, NULL);
   2049 
   2050 		if (!do_delay) {
   2051 			rfs4_dbe_lock(fp->rf_dbe);
   2052 			fp->rf_dinfo->rd_time_rm_delayed = gethrestime_sec();
   2053 			rfs4_dbe_unlock(fp->rf_dbe);
   2054 			return (TRUE);
   2055 		}
   2056 
   2057 		delay(NFS4_DELEGATION_CONFLICT_DELAY);
   2058 
   2059 		rfs4_dbe_lock(fp->rf_dbe);
   2060 		if (fp->rf_dinfo->rd_dtype != OPEN_DELEGATE_NONE) {
   2061 			fp->rf_dinfo->rd_time_rm_delayed = gethrestime_sec();
   2062 			rfs4_dbe_unlock(fp->rf_dbe);
   2063 			return (TRUE);
   2064 		}
   2065 	}
   2066 	if (is_rm)
   2067 		fp->rf_dinfo->rd_hold_grant++;
   2068 	rfs4_dbe_unlock(fp->rf_dbe);
   2069 	return (FALSE);
   2070 }
   2071 
   2072 
   2073 /*
   2074  * Check if the file is delegated in the case of a v2 or v3 access.
   2075  * Return TRUE if it is delegated which in turn means that v2 should
   2076  * drop the request and in the case of v3 JUKEBOX should be returned.
   2077  */
   2078 bool_t
   2079 rfs4_check_delegated(int mode, vnode_t *vp, bool_t trunc, bool_t do_delay,
   2080     bool_t is_rm, void *vcp)
   2081 {
   2082 	int    delegd = 0;
   2083 	bool_t create = FALSE;
   2084 	rfs4_file_t *fp;
   2085 	clientid4 *cp = (clientid4 *)vcp;
   2086 	nfs_server_instance_t *nsip;
   2087 
   2088 	/* iterrate through the instances */
   2089 	rw_enter(&nsi_lock, RW_READER);
   2090 	for (nsip = list_head(&nsi_head);
   2091 	    nsip != NULL && !delegd;
   2092 	    nsip = list_next(&nsi_head, &nsip->nsi_list)) {
   2093 
   2094 		mutex_enter(&nsip->state_lock);
   2095 
   2096 		if ((nsip->inst_flags & NFS_INST_STORE_INIT) &&
   2097 		    (nsip->deleg_policy != SRV_NEVER_DELEGATE)) {
   2098 
   2099 			fp = rfs4_findfile(nsip, vp, NULL, &create);
   2100 			if (fp != NULL) {
   2101 				if (rfs4_check_delegated_byfp(nsip, mode, fp,
   2102 				    trunc, do_delay, is_rm, cp))
   2103 					delegd++;
   2104 				rfs4_file_rele(fp);
   2105 			}
   2106 		}
   2107 		mutex_exit(&nsip->state_lock);
   2108 	}
   2109 	rw_exit(&nsi_lock);
   2110 	return (delegd ? TRUE : FALSE);
   2111 }
   2112 
   2113 /*
   2114  * Release a hold on the hold_grant counter which
   2115  * prevents delegation from being granted while a remove
   2116  * or a rename is in progress.
   2117  */
   2118 void
   2119 rfs4_clear_dont_grant(nfs_server_instance_t *instp,
   2120 		rfs4_file_t *fp)
   2121 {
   2122 	if (instp->deleg_policy == SRV_NEVER_DELEGATE)
   2123 		return;
   2124 	rfs4_dbe_lock(fp->rf_dbe);
   2125 	ASSERT(fp->rf_dinfo->rd_hold_grant > 0);
   2126 	fp->rf_dinfo->rd_hold_grant--;
   2127 	fp->rf_dinfo->rd_time_rm_delayed = 0;
   2128 	rfs4_dbe_unlock(fp->rf_dbe);
   2129 }
   2130 
   2131 /*
   2132  * State support for delegation.
   2133  * Set the state delegation type for this state;
   2134  * This routine is called from open via rfs4_grant_delegation and the entry
   2135  * locks on sp and sp->rs_finfo are assumed.
   2136  */
   2137 static rfs4_deleg_state_t *
   2138 rfs4_deleg_state(struct compound_state *cs,
   2139 		rfs4_state_t *sp, open_delegation_type4 dtype, int *recall)
   2140 {
   2141 	rfs4_file_t *fp = sp->rs_finfo;
   2142 	bool_t create = TRUE;
   2143 	rfs4_deleg_state_t *dsp;
   2144 	vnode_t *vp;
   2145 	int open_prev = *recall;
   2146 	int ret;
   2147 	int fflags = 0;
   2148 
   2149 	ASSERT(rfs4_dbe_islocked(sp->rs_dbe));
   2150 	ASSERT(rfs4_dbe_islocked(fp->rf_dbe));
   2151 
   2152 	/* Shouldn't happen */
   2153 	if (fp->rf_dinfo->rd_recall_count != 0 ||
   2154 	    (fp->rf_dinfo->rd_dtype == OPEN_DELEGATE_READ &&
   2155 	    dtype != OPEN_DELEGATE_READ)) {
   2156 		return (NULL);
   2157 	}
   2158 
   2159 	/* Unlock to avoid deadlock */
   2160 	rfs4_dbe_unlock(fp->rf_dbe);
   2161 	rfs4_dbe_unlock(sp->rs_dbe);
   2162 
   2163 	dsp = rfs4_finddeleg(cs, sp, &create);
   2164 
   2165 	rfs4_dbe_lock(sp->rs_dbe);
   2166 	rfs4_dbe_lock(fp->rf_dbe);
   2167 
   2168 	if (dsp == NULL)
   2169 		return (NULL);
   2170 
   2171 	/*
   2172 	 * It is possible that since we dropped the lock
   2173 	 * in order to call finddeleg, the rfs4_file_t
   2174 	 * was marked such that we should not grant a
   2175 	 * delegation, if so bail out.
   2176 	 */
   2177 	if (fp->rf_dinfo->rd_hold_grant > 0) {
   2178 		rfs4_deleg_state_rele(dsp);
   2179 		return (NULL);
   2180 	}
   2181 
   2182 	if (create == FALSE) {
   2183 		if (sp->rs_owner->ro_client == dsp->rds_client &&
   2184 		    dsp->rds_dtype == dtype) {
   2185 			return (dsp);
   2186 		} else {
   2187 			rfs4_deleg_state_rele(dsp);
   2188 			return (NULL);
   2189 		}
   2190 	}
   2191 
   2192 	/*
   2193 	 * Check that this file has not been delegated to another
   2194 	 * client
   2195 	 */
   2196 	if (fp->rf_dinfo->rd_recall_count != 0 ||
   2197 	    fp->rf_dinfo->rd_dtype == OPEN_DELEGATE_WRITE ||
   2198 	    (fp->rf_dinfo->rd_dtype == OPEN_DELEGATE_READ &&
   2199 	    dtype != OPEN_DELEGATE_READ)) {
   2200 		rfs4_deleg_state_rele(dsp);
   2201 		return (NULL);
   2202 	}
   2203 
   2204 	vp = fp->rf_vp;
   2205 	/* vnevent_support returns 0 if file system supports vnevents */
   2206 	if (vnevent_support(vp, NULL)) {
   2207 		rfs4_deleg_state_rele(dsp);
   2208 		return (NULL);
   2209 	}
   2210 
   2211 	/* Calculate the fflags for this OPEN. */
   2212 	if (sp->rs_share_access & OPEN4_SHARE_ACCESS_READ)
   2213 		fflags |= FREAD;
   2214 	if (sp->rs_share_access & OPEN4_SHARE_ACCESS_WRITE)
   2215 		fflags |= FWRITE;
   2216 
   2217 	*recall = 0;
   2218 	/*
   2219 	 * Before granting a delegation we need to know if anyone else has
   2220 	 * opened the file in a conflicting mode.  However, first we need to
   2221 	 * know how we opened the file to check the counts properly.
   2222 	 */
   2223 	if (dtype == OPEN_DELEGATE_READ) {
   2224 		if (((fflags & FWRITE) && vn_has_other_opens(vp, V_WRITE)) ||
   2225 		    (((fflags & FWRITE) == 0) && vn_is_opened(vp, V_WRITE)) ||
   2226 		    vn_is_mapped(vp, V_WRITE)) {
   2227 			if (open_prev) {
   2228 				*recall = 1;
   2229 			} else {
   2230 				rfs4_deleg_state_rele(dsp);
   2231 				return (NULL);
   2232 			}
   2233 		}
   2234 		ret = fem_install(vp, cs->instp->deleg_rdops, (void *)fp,
   2235 		    OPARGUNIQ, rfs4_mon_hold, rfs4_mon_rele);
   2236 		if (((fflags & FWRITE) && vn_has_other_opens(vp, V_WRITE)) ||
   2237 		    (((fflags & FWRITE) == 0) && vn_is_opened(vp, V_WRITE)) ||
   2238 		    vn_is_mapped(vp, V_WRITE)) {
   2239 			if (open_prev) {
   2240 				*recall = 1;
   2241 			} else {
   2242 				(void) fem_uninstall(vp, cs->instp->deleg_rdops,
   2243 				    (void *)fp);
   2244 				rfs4_deleg_state_rele(dsp);
   2245 				return (NULL);
   2246 			}
   2247 		}
   2248 		/*
   2249 		 * Because a client can hold onto a delegation after the
   2250 		 * file has been closed, we need to keep track of the
   2251 		 * access to this file.  Otherwise the CIFS server would
   2252 		 * not know about the client accessing the file and could
   2253 		 * inappropriately grant an OPLOCK.
   2254 		 * fem_install() returns EBUSY when asked to install a
   2255 		 * OPARGUNIQ monitor more than once.  Therefore, check the
   2256 		 * return code because we only want this done once.
   2257 		 */
   2258 		if (ret == 0)
   2259 			vn_open_upgrade(vp, FREAD);
   2260 	} else { /* WRITE */
   2261 		if (((fflags & FWRITE) && vn_has_other_opens(vp, V_WRITE)) ||
   2262 		    (((fflags & FWRITE) == 0) && vn_is_opened(vp, V_WRITE)) ||
   2263 		    ((fflags & FREAD) && vn_has_other_opens(vp, V_READ)) ||
   2264 		    (((fflags & FREAD) == 0) && vn_is_opened(vp, V_READ)) ||
   2265 		    vn_is_mapped(vp, V_RDORWR)) {
   2266 			if (open_prev) {
   2267 				*recall = 1;
   2268 			} else {
   2269 				rfs4_deleg_state_rele(dsp);
   2270 				return (NULL);
   2271 			}
   2272 		}
   2273 		ret = fem_install(vp, cs->instp->deleg_wrops, (void *)fp,
   2274 		    OPARGUNIQ, rfs4_mon_hold, rfs4_mon_rele);
   2275 		if (((fflags & FWRITE) && vn_has_other_opens(vp, V_WRITE)) ||
   2276 		    (((fflags & FWRITE) == 0) && vn_is_opened(vp, V_WRITE)) ||
   2277 		    ((fflags & FREAD) && vn_has_other_opens(vp, V_READ)) ||
   2278 		    (((fflags & FREAD) == 0) && vn_is_opened(vp, V_READ)) ||
   2279 		    vn_is_mapped(vp, V_RDORWR)) {
   2280 			if (open_prev) {
   2281 				*recall = 1;
   2282 			} else {
   2283 				(void) fem_uninstall(vp, cs->instp->deleg_wrops,
   2284 				    (void *)fp);
   2285 				rfs4_deleg_state_rele(dsp);
   2286 				return (NULL);
   2287 			}
   2288 		}
   2289 		/*
   2290 		 * Because a client can hold onto a delegation after the
   2291 		 * file has been closed, we need to keep track of the
   2292 		 * access to this file.  Otherwise the CIFS server would
   2293 		 * not know about the client accessing the file and could
   2294 		 * inappropriately grant an OPLOCK.
   2295 		 * fem_install() returns EBUSY when asked to install a
   2296 		 * OPUNIQ monitor more than once.  Therefore, check the
   2297 		 * return code because we only want this done once.
   2298 		 */
   2299 		if (ret == 0)
   2300 			vn_open_upgrade(vp, FREAD|FWRITE);
   2301 	}
   2302 
   2303 	/*
   2304 	 * Place on delegation list for file
   2305 	 */
   2306 	ASSERT(!list_link_active(&dsp->rds_node));
   2307 	list_insert_tail(&fp->rf_delegstatelist, dsp);
   2308 
   2309 	dsp->rds_dtype = fp->rf_dinfo->rd_dtype = dtype;
   2310 
   2311 	/* Update delegation stats for this file */
   2312 	fp->rf_dinfo->rd_time_lastgrant = gethrestime_sec();
   2313 
   2314 	/* reset since this is a new delegation */
   2315 	fp->rf_dinfo->rd_conflicted_client = 0;
   2316 	fp->rf_dinfo->rd_ever_recalled = FALSE;
   2317 
   2318 	if (dtype == OPEN_DELEGATE_READ)
   2319 		fp->rf_dinfo->rd_rdgrants++;
   2320 	else
   2321 		fp->rf_dinfo->rd_wrgrants++;
   2322 
   2323 	return (dsp);
   2324 }
   2325 
   2326 /*
   2327  * State routine for the server when a delegation is returned.
   2328  */
   2329 void
   2330 rfs4_return_deleg(rfs4_deleg_state_t *dsp, bool_t revoked)
   2331 {
   2332 	rfs4_file_t *fp = dsp->rds_finfo;
   2333 	open_delegation_type4 dtypewas;
   2334 	nfs_server_instance_t *instp;
   2335 
   2336 	rfs4_dbe_lock(fp->rf_dbe);
   2337 
   2338 	/* nothing to do if no longer on list */
   2339 	if (!list_link_active(&dsp->rds_node)) {
   2340 		rfs4_dbe_unlock(fp->rf_dbe);
   2341 		return;
   2342 	}
   2343 
   2344 	/* Remove state from recall list */
   2345 	list_remove(&fp->rf_delegstatelist, dsp);
   2346 
   2347 	instp = dbe_to_instp(fp->rf_dbe);
   2348 	if (instp->inst_flags & NFS_INST_v41) {
   2349 		mds_session_t	*sp;
   2350 		slotid4		 slot;
   2351 		slot_ent_t	*slp;
   2352 		extern void rfs41_rs_erase(void *);
   2353 
   2354 		if (dsp->rds_rs.refcnt > 0) {
   2355 			/*
   2356 			 * refcnt > 0, so this means we still have an active
   2357 			 * hold on deleg_state. If (for some reason) we don't
   2358 			 * find the sp, the worse that'll happen is that we'll
   2359 			 * leak some state (ie. won't be able to clean up the
   2360 			 * hold). But nothing to get too excited about.
   2361 			 */
   2362 			slot = dsp->rds_rs.slotno;
   2363 			sp = mds_findsession_by_id(instp, dsp->rds_rs.sessid);
   2364 			if (sp != NULL) {
   2365 				rfs4_dbe_lock(sp->sn_dbe);
   2366 				ASSERT(sp->sn_replay != NULL);
   2367 				slp = slrc_slot_get(sp->sn_replay, slot);
   2368 				if (slp->se_p == dsp) {
   2369 					rfs41_rs_erase(dsp);
   2370 					slp->se_p = NULL;
   2371 				}
   2372 				rfs4_dbe_unlock(sp->sn_dbe);
   2373 				rfs41_session_rele(sp);
   2374 			}
   2375 		}
   2376 	}
   2377 
   2378 	/*
   2379 	 * If no more delegations then remove the FEM
   2380 	 * monitors
   2381 	 */
   2382 	if (list_is_empty(&fp->rf_delegstatelist)) {
   2383 		dtypewas = fp->rf_dinfo->rd_dtype;
   2384 		fp->rf_dinfo->rd_dtype = OPEN_DELEGATE_NONE;
   2385 		rfs4_dbe_cv_broadcast(fp->rf_dbe);
   2386 
   2387 		/* if file system was unshared, the vp will be NULL */
   2388 		if (fp->rf_vp != NULL) {
   2389 			/*
   2390 			 * Once a delegation is no longer held by any client,
   2391 			 * the monitor is uninstalled.  At this point, the
   2392 			 * client must send OPEN otw, so we don't need the
   2393 			 * reference on the vnode anymore.  The open
   2394 			 * downgrade removes the reference put on earlier.
   2395 			 */
   2396 			if (dtypewas == OPEN_DELEGATE_READ) {
   2397 				(void) fem_uninstall(fp->rf_vp,
   2398 				    instp->deleg_rdops, (void *)fp);
   2399 				vn_open_downgrade(fp->rf_vp, FREAD);
   2400 			} else if (dtypewas == OPEN_DELEGATE_WRITE) {
   2401 				(void) fem_uninstall(fp->rf_vp,
   2402 				    instp->deleg_wrops, (void *)fp);
   2403 				vn_open_downgrade(fp->rf_vp, FREAD|FWRITE);
   2404 			}
   2405 		}
   2406 	}
   2407 
   2408 	switch (dsp->rds_dtype) {
   2409 	case OPEN_DELEGATE_READ:
   2410 		fp->rf_dinfo->rd_rdgrants--;
   2411 		break;
   2412 	case OPEN_DELEGATE_WRITE:
   2413 		fp->rf_dinfo->rd_wrgrants--;
   2414 		break;
   2415 	default:
   2416 		break;
   2417 	}
   2418 
   2419 	/* used in the policy decision */
   2420 	fp->rf_dinfo->rd_time_returned = gethrestime_sec();
   2421 
   2422 	/*
   2423 	 * reset the time_recalled field so future delegations are not
   2424 	 * accidentally revoked
   2425 	 */
   2426 	if ((fp->rf_dinfo->rd_rdgrants + fp->rf_dinfo->rd_wrgrants) == 0)
   2427 		fp->rf_dinfo->rd_time_recalled = 0;
   2428 
   2429 	rfs4_dbe_unlock(fp->rf_dbe);
   2430 
   2431 	rfs4_dbe_lock(dsp->rds_dbe);
   2432 
   2433 	dsp->rds_dtype = OPEN_DELEGATE_NONE;
   2434 
   2435 	if (revoked == TRUE)
   2436 		dsp->rds_time_revoked = gethrestime_sec();
   2437 
   2438 	rfs4_dbe_invalidate(dsp->rds_dbe);
   2439 
   2440 	rfs4_dbe_unlock(dsp->rds_dbe);
   2441 
   2442 	if (revoked == TRUE) {
   2443 		rfs4_dbe_lock(dsp->rds_client->rc_dbe);
   2444 		dsp->rds_client->rc_deleg_revoked++;	/* observability */
   2445 		rfs4_dbe_unlock(dsp->rds_client->rc_dbe);
   2446 	}
   2447 }
   2448 
   2449 static void
   2450 rfs4_revoke_deleg(rfs4_deleg_state_t *dsp)
   2451 {
   2452 	rfs4_return_deleg(dsp, TRUE);
   2453 }
   2454 
   2455 static void
   2456 rfs41_revoke_deleg(rfs4_deleg_state_t *dsp)
   2457 {
   2458 	cmn_err(CE_NOTE, "rfs41_revoke_deleg: delegation revoked");
   2459 	rfs41_seq4_hold(&dsp->rds_client->rc_seq4,
   2460 	    SEQ4_STATUS_RECALLABLE_STATE_REVOKED);
   2461 	rfs4_revoke_deleg(dsp);
   2462 }
   2463 
   2464 static void
   2465 rfs4_revoke_file(rfs4_file_t *fp)
   2466 {
   2467 	rfs4_deleg_state_t *dsp;
   2468 
   2469 	/*
   2470 	 * The lock for rfs4_file_t must be held when traversing the
   2471 	 * delegation list but that lock needs to be released to call
   2472 	 * rfs4_revoke_deleg().
   2473 	 *
   2474 	 * The called function rfs4_revoke_deleg removes the entry
   2475 	 * from the fp delegation list, so the while loop will keep
   2476 	 * looping until the list is empty.
   2477 	 */
   2478 	rfs4_dbe_lock(fp->rf_dbe);
   2479 	while (dsp = list_head(&fp->rf_delegstatelist)) {
   2480 		rfs4_dbe_hold(dsp->rds_dbe);
   2481 		rfs4_dbe_unlock(fp->rf_dbe);
   2482 		rfs4_revoke_deleg(dsp);
   2483 		rfs4_deleg_state_rele(dsp);
   2484 		rfs4_dbe_lock(fp->rf_dbe);
   2485 	}
   2486 	rfs4_dbe_unlock(fp->rf_dbe);
   2487 }
   2488 
   2489 /*
   2490  * A delegation is assumed to be present on the file associated with
   2491  * "sp".  Check to see if the delegation matches is associated with
   2492  * the same client as referenced by "sp".  If it is not, TRUE is
   2493  * returned.  If the delegation DOES match the client (or no
   2494  * delegation is present), return FALSE.
   2495  * Assume the state entry and file entry are locked.
   2496  *
   2497  * This routine only checks the delegations of the calling server instance.
   2498  * Since this is only called from rfs4_check_recall(), which is only called
   2499  * by rfs4_do_open() and mds_do_open(), they only need to check if they own
   2500  * this delegation.  All other conflict detection will be done by the monitor
   2501  * on OPEN.
   2502  */
   2503 bool_t
   2504 rfs4_is_deleg(rfs4_state_t *sp)
   2505 {
   2506 	rfs4_deleg_state_t *dsp;
   2507 	rfs4_file_t *fp = sp->rs_finfo;
   2508 	rfs4_client_t *cp = sp->rs_owner->ro_client;
   2509 
   2510 	ASSERT(rfs4_dbe_islocked(fp->rf_dbe));
   2511 	for (dsp = list_head(&fp->rf_delegstatelist); dsp != NULL;
   2512 	    dsp = list_next(&fp->rf_delegstatelist, dsp)) {
   2513 		if (cp != dsp->rds_client)
   2514 			return (TRUE);
   2515 	}
   2516 
   2517 	return (FALSE);
   2518 }
   2519 
   2520 void
   2521 rfs4_disable_delegation(nfs_server_instance_t *instp)
   2522 {
   2523 	mutex_enter(&instp->deleg_lock);
   2524 	instp->deleg_disabled++;
   2525 	mutex_exit(&instp->deleg_lock);
   2526 }
   2527 
   2528 void
   2529 rfs4_enable_delegation(nfs_server_instance_t *instp)
   2530 {
   2531 	mutex_enter(&instp->deleg_lock);
   2532 	ASSERT(instp->deleg_disabled > 0);
   2533 	instp->deleg_disabled--;
   2534 	mutex_exit(&instp->deleg_lock);
   2535 }
   2536 
   2537 void
   2538 rfs4_mon_hold(void *arg)
   2539 {
   2540 	rfs4_file_t *fp = arg;
   2541 
   2542 	rfs4_dbe_hold(fp->rf_dbe);
   2543 }
   2544 
   2545 void
   2546 rfs4_mon_rele(void *arg)
   2547 {
   2548 	rfs4_file_t *fp = arg;
   2549 
   2550 	rfs4_dbe_rele_nolock(fp->rf_dbe);
   2551 }
   2552