Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
     28  *	All Rights Reserved
     29  */
     30 
     31 #include <sys/param.h>
     32 #include <sys/types.h>
     33 #include <sys/systm.h>
     34 #include <sys/thread.h>
     35 #include <sys/t_lock.h>
     36 #include <sys/time.h>
     37 #include <sys/vnode.h>
     38 #include <sys/vfs.h>
     39 #include <sys/errno.h>
     40 #include <sys/buf.h>
     41 #include <sys/stat.h>
     42 #include <sys/cred.h>
     43 #include <sys/kmem.h>
     44 #include <sys/debug.h>
     45 #include <sys/dnlc.h>
     46 #include <sys/vmsystm.h>
     47 #include <sys/flock.h>
     48 #include <sys/share.h>
     49 #include <sys/cmn_err.h>
     50 #include <sys/tiuser.h>
     51 #include <sys/sysmacros.h>
     52 #include <sys/callb.h>
     53 #include <sys/acl.h>
     54 #include <sys/kstat.h>
     55 #include <sys/signal.h>
     56 #include <sys/disp.h>
     57 #include <sys/atomic.h>
     58 #include <sys/list.h>
     59 #include <sys/sdt.h>
     60 
     61 #include <rpc/types.h>
     62 #include <rpc/xdr.h>
     63 #include <rpc/auth.h>
     64 #include <rpc/clnt.h>
     65 
     66 #include <nfs/nfs.h>
     67 #include <nfs/nfs_clnt.h>
     68 #include <nfs/nfs_acl.h>
     69 
     70 #include <nfs/nfs4.h>
     71 #include <nfs/rnode4.h>
     72 #include <nfs/nfs4_clnt.h>
     73 
     74 #include <vm/hat.h>
     75 #include <vm/as.h>
     76 #include <vm/page.h>
     77 #include <vm/pvn.h>
     78 #include <vm/seg.h>
     79 #include <vm/seg_map.h>
     80 #include <vm/seg_vn.h>
     81 
     82 #include <sys/ddi.h>
     83 
/*
 * Arguments to page-flush thread.
 *
 * The structure is allocated by nfs4_purge_caches(), which takes a hold
 * on both the vnode and the credentials before starting the thread.
 * nfs4_pgflush_thread() releases both holds and frees the structure once
 * the flush is finished.
 */
typedef struct {
	vnode_t *vp;	/* vnode whose pages are flushed (held) */
	cred_t *cr;	/* credentials handed to the flush (held) */
} pgflush_t;
     91 
#ifdef DEBUG
/*
 * Globals that exist only in DEBUG kernels.
 *
 * NOTE(review): the *_debug integers below are not referenced in this
 * part of the file; presumably they are tunables that switch on extra
 * checking or diagnostics elsewhere in the NFSv4 client -- confirm.
 */
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

/*
 * Thread-specific-data key.  nfs4_purge_stale_fh() asserts that the
 * calling thread has no value stored under this key.
 */
uint_t nfs4_tsd_key;
#endif
    102 
/*
 * NOTE(review): neither of these variables is used in this part of the
 * file.  Presumably nfs4_client_resumed records when the client was last
 * resumed and cid identifies the registered nfs4_client_cpr_callb()
 * callback -- confirm against the rest of the file.
 */
static time_t	nfs4_client_resumed = 0;
static	callb_id_t cid = 0;

/* Forward declarations of file-local functions. */
static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);
    111 
/*
 * Bookkeeping for the NFS v4 mounts of one zone.
 *
 * NOTE(review): nothing in this part of the file touches the structure;
 * the member descriptions come from the original comments and names.
 */
struct mi4_globals {
	kmutex_t	mig_lock;  /* lock protecting mig_list */
	list_t		mig_list;  /* list of NFS v4 mounts in zone */
	/*
	 * NOTE(review): presumably set once the zone's destructor callback
	 * has run -- confirm where it is written.
	 */
	boolean_t	mig_destructor_called;
};

/*
 * NOTE(review): presumably the zone key under which each zone's
 * struct mi4_globals is registered -- confirm.
 */
static zone_key_t mi4_list_key;
    119 
/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_time_attr_inval)
 * which tells whether the attributes are valid. When new attributes are
 * cached, that time is set to the current time plus the amount of time
 * that has passed since a change to the file was last detected. This
 * allows the attributes for files that have changed recently to be timed
 * out sooner than for files that have not changed for a long time. There
 * are minimum and maximum timeout values that can be set per mount point.
 */
    132 
    133 /*
    134  * If a cache purge is in progress, wait for it to finish.
    135  *
    136  * The current thread must not be in the middle of an
    137  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
    138  * between this thread, a recovery thread, and the page flush thread.
    139  */
    140 int
    141 nfs4_waitfor_purge_complete(vnode_t *vp)
    142 {
    143 	rnode4_t *rp;
    144 	k_sigset_t smask;
    145 
    146 	rp = VTOR4(vp);
    147 	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
    148 	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
    149 		mutex_enter(&rp->r_statelock);
    150 		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
    151 		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
    152 		    ((rp->r_flags & R4PGFLUSH) &&
    153 		    rp->r_pgflush != curthread)) {
    154 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
    155 				sigunintr(&smask);
    156 				mutex_exit(&rp->r_statelock);
    157 				return (EINTR);
    158 			}
    159 		}
    160 		sigunintr(&smask);
    161 		mutex_exit(&rp->r_statelock);
    162 	}
    163 	return (0);
    164 }
    165 
    166 /*
    167  * Validate caches by checking cached attributes. If they have timed out,
    168  * then get new attributes from the server.  As a side effect, cache
    169  * invalidation is done if the attributes have changed.
    170  *
    171  * If the attributes have not timed out and if there is a cache
    172  * invalidation being done by some other thread, then wait until that
    173  * thread has completed the cache invalidation.
    174  */
    175 int
    176 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
    177 {
    178 	int error;
    179 	nfs4_ga_res_t gar;
    180 
    181 	if (ATTRCACHE4_VALID(vp)) {
    182 		error = nfs4_waitfor_purge_complete(vp);
    183 		if (error)
    184 			return (error);
    185 		return (0);
    186 	}
    187 
    188 	gar.n4g_va.va_mask = AT_ALL;
    189 	return (nfs4_getattr_otw(vp, &gar, cr, 0));
    190 }
    191 
    192 /*
    193  * Fill in attribute from the cache.
    194  * If valid, then return 0 to indicate that no error occurred,
    195  * otherwise return 1 to indicate that an error occurred.
    196  */
    197 static int
    198 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
    199 {
    200 	rnode4_t *rp;
    201 
    202 	rp = VTOR4(vp);
    203 	mutex_enter(&rp->r_statelock);
    204 	mutex_enter(&rp->r_statev4_lock);
    205 	if (ATTRCACHE4_VALID(vp)) {
    206 		mutex_exit(&rp->r_statev4_lock);
    207 		/*
    208 		 * Cached attributes are valid
    209 		 */
    210 		*vap = rp->r_attr;
    211 		mutex_exit(&rp->r_statelock);
    212 		return (0);
    213 	}
    214 	mutex_exit(&rp->r_statev4_lock);
    215 	mutex_exit(&rp->r_statelock);
    216 	return (1);
    217 }
    218 
    219 
    220 /*
    221  * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
    222  * call is synchronous because all the pages were invalidated by the
    223  * nfs4_invalidate_pages() call.
    224  */
    225 void
    226 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
    227 {
    228 	struct rnode4 *rp = VTOR4(vp);
    229 
    230 	/* Ensure that the ..._end_op() call has been done */
    231 	ASSERT(tsd_get(nfs4_tsd_key) == NULL);
    232 
    233 	if (errno != ESTALE)
    234 		return;
    235 
    236 	mutex_enter(&rp->r_statelock);
    237 	rp->r_flags |= R4STALE;
    238 	if (!rp->r_error)
    239 		rp->r_error = errno;
    240 	mutex_exit(&rp->r_statelock);
    241 	if (nfs4_has_pages(vp))
    242 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
    243 	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
    244 }
    245 
    246 /*
    247  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
    248  * page purge is done asynchronously.
    249  */
    250 void
    251 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
    252 {
    253 	rnode4_t *rp;
    254 	char *contents;
    255 	vnode_t *xattr;
    256 	int size;
    257 	int pgflush;			/* are we the page flush thread? */
    258 
    259 	/*
    260 	 * Purge the DNLC for any entries which refer to this file.
    261 	 */
    262 	if (vp->v_count > 1 &&
    263 	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
    264 		dnlc_purge_vp(vp);
    265 
    266 	/*
    267 	 * Clear any readdir state bits and purge the readlink response cache.
    268 	 */
    269 	rp = VTOR4(vp);
    270 	mutex_enter(&rp->r_statelock);
    271 	rp->r_flags &= ~R4LOOKUP;
    272 	contents = rp->r_symlink.contents;
    273 	size = rp->r_symlink.size;
    274 	rp->r_symlink.contents = NULL;
    275 
    276 	xattr = rp->r_xattr_dir;
    277 	rp->r_xattr_dir = NULL;
    278 
    279 	/*
    280 	 * Purge pathconf cache too.
    281 	 */
    282 	rp->r_pathconf.pc4_xattr_valid = 0;
    283 	rp->r_pathconf.pc4_cache_valid = 0;
    284 
    285 	pgflush = (curthread == rp->r_pgflush);
    286 	mutex_exit(&rp->r_statelock);
    287 
    288 	if (contents != NULL) {
    289 
    290 		kmem_free((void *)contents, size);
    291 	}
    292 
    293 	if (xattr != NULL)
    294 		VN_RELE(xattr);
    295 
    296 	/*
    297 	 * Flush the page cache.  If the current thread is the page flush
    298 	 * thread, don't initiate a new page flush.  There's no need for
    299 	 * it, and doing it correctly is hard.
    300 	 */
    301 	if (nfs4_has_pages(vp) && !pgflush) {
    302 		if (!asyncpg) {
    303 			(void) nfs4_waitfor_purge_complete(vp);
    304 			nfs4_flush_pages(vp, cr);
    305 		} else {
    306 			pgflush_t *args;
    307 
    308 			/*
    309 			 * We don't hold r_statelock while creating the
    310 			 * thread, in case the call blocks.  So we use a
    311 			 * flag to indicate that a page flush thread is
    312 			 * active.
    313 			 */
    314 			mutex_enter(&rp->r_statelock);
    315 			if (rp->r_flags & R4PGFLUSH) {
    316 				mutex_exit(&rp->r_statelock);
    317 			} else {
    318 				rp->r_flags |= R4PGFLUSH;
    319 				mutex_exit(&rp->r_statelock);
    320 
    321 				args = kmem_alloc(sizeof (pgflush_t),
    322 				    KM_SLEEP);
    323 				args->vp = vp;
    324 				VN_HOLD(args->vp);
    325 				args->cr = cr;
    326 				crhold(args->cr);
    327 				(void) zthread_create(NULL, 0,
    328 				    nfs4_pgflush_thread, args, 0,
    329 				    minclsyspri);
    330 			}
    331 		}
    332 	}
    333 
    334 	/*
    335 	 * Flush the readdir response cache.
    336 	 */
    337 	nfs4_purge_rddir_cache(vp);
    338 }
    339 
    340 /*
    341  * Invalidate all pages for the given file, after writing back the dirty
    342  * ones.
    343  */
    344 
    345 void
    346 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
    347 {
    348 	int error;
    349 	rnode4_t *rp = VTOR4(vp);
    350 
    351 	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
    352 	if (error == ENOSPC || error == EDQUOT) {
    353 		mutex_enter(&rp->r_statelock);
    354 		if (!rp->r_error)
    355 			rp->r_error = error;
    356 		mutex_exit(&rp->r_statelock);
    357 	}
    358 }
    359 
    360 /*
    361  * Page flush thread.
    362  */
    363 
    364 static void
    365 nfs4_pgflush_thread(pgflush_t *args)
    366 {
    367 	rnode4_t *rp = VTOR4(args->vp);
    368 
    369 	/* remember which thread we are, so we don't deadlock ourselves */
    370 	mutex_enter(&rp->r_statelock);
    371 	ASSERT(rp->r_pgflush == NULL);
    372 	rp->r_pgflush = curthread;
    373 	mutex_exit(&rp->r_statelock);
    374 
    375 	nfs4_flush_pages(args->vp, args->cr);
    376 
    377 	mutex_enter(&rp->r_statelock);
    378 	rp->r_pgflush = NULL;
    379 	rp->r_flags &= ~R4PGFLUSH;
    380 	cv_broadcast(&rp->r_cv);
    381 	mutex_exit(&rp->r_statelock);
    382 
    383 	VN_RELE(args->vp);
    384 	crfree(args->cr);
    385 	kmem_free(args, sizeof (pgflush_t));
    386 	zthread_exit();
    387 }
    388 
    389 /*
    390  * Purge the readdir cache of all entries which are not currently
    391  * being filled.
    392  */
    393 void
    394 nfs4_purge_rddir_cache(vnode_t *vp)
    395 {
    396 	rnode4_t *rp;
    397 
    398 	rp = VTOR4(vp);
    399 
    400 	mutex_enter(&rp->r_statelock);
    401 	rp->r_direof = NULL;
    402 	rp->r_flags &= ~R4LOOKUP;
    403 	rp->r_flags |= R4READDIRWATTR;
    404 	rddir4_cache_purge(rp);
    405 	mutex_exit(&rp->r_statelock);
    406 }
    407 
    408 /*
    409  * Set attributes cache for given vnode using virtual attributes.  There is
    410  * no cache validation, but if the attributes are deemed to be stale, they
    411  * are ignored.  This corresponds to nfs3_attrcache().
    412  *
    413  * Set the timeout value on the attribute cache and fill it
    414  * with the passed in attributes.
    415  */
    416 void
    417 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
    418 {
    419 	rnode4_t *rp = VTOR4(vp);
    420 
    421 	mutex_enter(&rp->r_statelock);
    422 	if (rp->r_time_attr_saved <= t)
    423 		nfs4_attrcache_va(vp, garp, FALSE);
    424 	mutex_exit(&rp->r_statelock);
    425 }
    426 
/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 *
 * Arguments:
 *	vp	vnode the attributes belong to
 *	garp	getattr results; garp->n4g_va holds the new attributes
 *	t	time stamp associated with the attributes; it is compared
 *		with r_time_attr_saved to detect attributes that are older
 *		than what is already cached
 *	cr	credentials passed on to nfs4_purge_caches()
 *	async	passed on to nfs4_purge_caches() as its "asyncpg" argument
 *		(forced to 1 when the caller is the recovery thread)
 *	cinfo	change information of a directory modifying operation, or
 *		NULL for all other callers
 *
 * The function returns early, without caching the attributes, when:
 *	- another thread owns r_serial and the caller is the recovery
 *	  thread (the attribute cache is purged instead),
 *	- the wait for r_serial is interrupted by a signal,
 *	- a page flush thread other than the caller is active (the
 *	  attribute cache is purged instead),
 *	- newer attributes have been cached since time t.
 */

void
nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
    hrtime_t t, cred_t *cr, int async,
    change_info4 *cinfo)
{
	rnode4_t *rp;
	int mtime_changed = 0;	/* set when the data caches must be purged */
	int ctime_changed = 0;	/* set when access/ACL caches must be purged */
	vsecattr_t *vsp;
	int was_serial, set_time_cache_inval, recov;
	vattr_t *vap = &garp->n4g_va;
	mntinfo4_t *mi = VTOMI4(vp);
	len_t preattr_rsize;	/* r_size before the new attributes went in */
	boolean_t writemodify_set = B_FALSE;
	boolean_t cachepurge_set = B_FALSE; /* we set R4INCACHEPURGE */

	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (VTOMI4(vp)->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	/*
	 * was_serial is set when this thread already owns the r_serial
	 * pseudo lock; in that case it is not waited for here and not
	 * released at the end of this function.
	 */
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial && !was_serial) {
		klwp_t *lwp = ttolwp(curthread);

		/*
		 * If we're the recovery thread, then purge current attrs
		 * and bail out to avoid potential deadlock between another
		 * thread caching attrs (r_serial thread), recov thread,
		 * and an async writer thread.
		 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		/*
		 * Wait for the current owner to release r_serial.  If the
		 * wait is interrupted by a signal, give up: the new
		 * attributes are not cached.
		 *
		 * NOTE(review): lwp_nostop is raised for the duration of
		 * the wait, presumably to keep the lwp from being stopped
		 * while it sleeps here -- confirm.
		 */
		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread.  Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Attributes have been cached since these attributes were
		 * probably made. If there is an inconsistency in what is
		 * cached, mark them invalid. If not, don't act on them.
		 */
		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
			PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and dir didn't change on server before dirmod op
		 * and dir didn't change after dirmod op but before getattr
		 * then there's a chance that the client's cached data for
		 * this object is current (not stale).  No immediate cache
		 * flush is required.
		 *
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted.  For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout.  When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do.  flush all caches for dir.  remove the
			 * attr timeout.
			 *
			 * a) timeout expired.  flush all caches.
			 * b) r_change != cinfo.before.  flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change).  flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		/*
		 * Write thread after writing data to file on remote server,
		 * will always set R4WRITEMODIFIED to indicate that file on
		 * remote server was modified with a WRITE operation and would
		 * have marked attribute cache as timed out. If R4WRITEMODIFIED
		 * is set, then do not check for mtime and ctime change.
		 */
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;

			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;
		} else {
			writemodify_set = B_TRUE;
		}
	}

	/* Remember the old size; nfs4_attrcache_va() may update r_size. */
	preattr_rsize = rp->r_size;

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	/*
	 * If we have updated filesize in nfs4_attrcache_va, as soon as we
	 * drop statelock we will be in transition of purging all
	 * our caches and updating them. It is possible for another
	 * thread to pick this new file size and read in zeroed data.
	 * stall other threads till cache purge is complete.
	 */
	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
		/*
		 * If R4WRITEMODIFIED was set and we have updated the file
		 * size, Server's returned file size need not necessarily
		 * be because of this Client's WRITE. We need to purge
		 * all caches.
		 */
		if (writemodify_set)
			mtime_changed = 1;

		/*
		 * Only the thread that sets R4INCACHEPURGE (recorded in
		 * cachepurge_set) clears it again after the purge below.
		 */
		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
			rp->r_flags |= R4INCACHEPURGE;
			cachepurge_set = B_TRUE;
		}
	}

	/* Nothing changed: the new attributes are cached, we are done. */
	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/*
	 * Take the r_serial pseudo lock so that the cache invalidation
	 * below, which runs without r_statelock, is serialized against
	 * other threads caching attributes.
	 */
	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	/*
	 * If we're the recov thread, then force async nfs4_purge_caches
	 * to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	/* The purge is over; release the threads stalled on the flag. */
	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~R4INCACHEPURGE;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		cachepurge_set = B_FALSE;
	}

	/* A ctime change invalidates the access cache and the cached ACL. */
	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			/*
			 * r_secattr was tested without the lock; fetch it
			 * again under r_statelock and free it after the
			 * lock has been dropped.
			 */
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	/*
	 * Release r_serial and wake up the waiters, unless this thread
	 * already owned r_serial when the function was entered.
	 */
	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}
    661 
    662 /*
    663  * Set attributes cache for given vnode using virtual attributes.
    664  *
    665  * Set the timeout value on the attribute cache and fill it
    666  * with the passed in attributes.
    667  *
    668  * The caller must be holding r_statelock.
    669  */
    670 static void
    671 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
    672 {
    673 	rnode4_t *rp;
    674 	mntinfo4_t *mi;
    675 	hrtime_t delta;
    676 	hrtime_t now;
    677 	vattr_t *vap = &garp->n4g_va;
    678 
    679 	rp = VTOR4(vp);
    680 
    681 	ASSERT(MUTEX_HELD(&rp->r_statelock));
    682 	ASSERT(vap->va_mask == AT_ALL);
    683 
    684 	/* Switch to master before checking v_flag */
    685 	if (IS_SHADOW(vp, rp))
    686 		vp = RTOV4(rp);
    687 
    688 	now = gethrtime();
    689 
    690 	mi = VTOMI4(vp);
    691 
    692 	/*
    693 	 * Only establish a new cache timeout (if requested).  Never
    694 	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
    695 	 * is done by nfs4_update_dircaches (ancestor in our call chain)
    696 	 */
    697 	if (set_cache_timeout && ! rp->r_time_cache_inval)
    698 		rp->r_time_cache_inval = now + mi->mi_acdirmax;
    699 
    700 	/*
    701 	 * Delta is the number of nanoseconds that we will
    702 	 * cache the attributes of the file.  It is based on
    703 	 * the number of nanoseconds since the last time that
    704 	 * we detected a change.  The assumption is that files
    705 	 * that changed recently are likely to change again.
    706 	 * There is a minimum and a maximum for regular files
    707 	 * and for directories which is enforced though.
    708 	 *
    709 	 * Using the time since last change was detected
    710 	 * eliminates direct comparison or calculation
    711 	 * using mixed client and server times.  NFS does
    712 	 * not make any assumptions regarding the client
    713 	 * and server clocks being synchronized.
    714 	 */
    715 	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
    716 	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
    717 	    vap->va_size != rp->r_attr.va_size) {
    718 		rp->r_time_attr_saved = now;
    719 	}
    720 
    721 	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
    722 		delta = 0;
    723 	else {
    724 		delta = now - rp->r_time_attr_saved;
    725 		if (vp->v_type == VDIR) {
    726 			if (delta < mi->mi_acdirmin)
    727 				delta = mi->mi_acdirmin;
    728 			else if (delta > mi->mi_acdirmax)
    729 				delta = mi->mi_acdirmax;
    730 		} else {
    731 			if (delta < mi->mi_acregmin)
    732 				delta = mi->mi_acregmin;
    733 			else if (delta > mi->mi_acregmax)
    734 				delta = mi->mi_acregmax;
    735 		}
    736 	}
    737 	rp->r_time_attr_inval = now + delta;
    738 
    739 	rp->r_attr = *vap;
    740 	if (garp->n4g_change_valid)
    741 		rp->r_change = garp->n4g_change;
    742 
    743 	/*
    744 	 * The attributes that were returned may be valid and can
    745 	 * be used, but they may not be allowed to be cached.
    746 	 * Reset the timers to cause immediate invalidation and
    747 	 * clear r_change so no VERIFY operations will suceed
    748 	 */
    749 	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
    750 		rp->r_time_attr_inval = now;
    751 		rp->r_time_attr_saved = now;
    752 		rp->r_change = 0;
    753 	}
    754 
    755 	/*
    756 	 * If mounted_on_fileid returned AND the object is a stub,
    757 	 * then set object's va_nodeid to the mounted over fid
    758 	 * returned by server.
    759 	 *
    760 	 * If mounted_on_fileid not provided/supported, then
    761 	 * just set it to 0 for now.  Eventually it would be
    762 	 * better to set it to a hashed version of FH.  This
    763 	 * would probably be good enough to provide a unique
    764 	 * fid/d_ino within a dir.
    765 	 *
    766 	 * We don't need to carry mounted_on_fileid in the
    767 	 * rnode as long as the client never requests fileid
    768 	 * without also requesting mounted_on_fileid.  For
    769 	 * now, it stays.
    770 	 */
    771 	if (garp->n4g_mon_fid_valid) {
    772 		rp->r_mntd_fid = garp->n4g_mon_fid;
    773 
    774 		if (RP_ISSTUB(rp))
    775 			rp->r_attr.va_nodeid = rp->r_mntd_fid;
    776 	}
    777 
    778 	/*
    779 	 * Check to see if there are valid pathconf bits to
    780 	 * cache in the rnode.
    781 	 */
    782 	if (garp->n4g_ext_res) {
    783 		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
    784 			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
    785 		} else {
    786 			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
    787 				rp->r_pathconf.pc4_xattr_valid = TRUE;
    788 				rp->r_pathconf.pc4_xattr_exists =
    789 				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
    790 			}
    791 		}
    792 	}
    793 	/*
    794 	 * Update the size of the file if there is no cached data or if
    795 	 * the cached data is clean and there is no data being written
    796 	 * out.
    797 	 */
    798 	if (rp->r_size != vap->va_size &&
    799 	    (!vn_has_cached_data(vp) ||
    800 	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
    801 		rp->r_size = vap->va_size;
    802 	}
    803 	nfs_setswaplike(vp, vap);
    804 	rp->r_flags &= ~R4WRITEMODIFIED;
    805 }
    806 
    807 /*
    808  * Get attributes over-the-wire and update attributes cache
    809  * if no error occurred in the over-the-wire operation.
    810  * Return 0 if successful, otherwise error.
    811  */
    812 int
    813 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
    814 {
    815 	mntinfo4_t *mi = VTOMI4(vp);
    816 	hrtime_t t;
    817 	nfs4_recov_state_t recov_state;
    818 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
    819 
    820 	recov_state.rs_flags = 0;
    821 	recov_state.rs_num_retry_despite_err = 0;
    822 
    823 	/* Save the original mount point security flavor */
    824 	(void) save_mnt_secinfo(mi->mi_curr_serv);
    825 
    826 recov_retry:
    827 
    828 	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
    829 	    &recov_state, NULL))) {
    830 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
    831 		return (e.error);
    832 	}
    833 
    834 	t = gethrtime();
    835 
    836 	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
    837 
    838 	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
    839 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
    840 		    NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE)  {
    841 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
    842 			    &recov_state, 1);
    843 			goto recov_retry;
    844 		}
    845 	}
    846 
    847 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
    848 
    849 	if (!e.error) {
    850 		if (e.stat == NFS4_OK) {
    851 			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
    852 		} else {
    853 			e.error = geterrno4(e.stat);
    854 
    855 			nfs4_purge_stale_fh(e.error, vp, cr);
    856 		}
    857 	}
    858 
    859 	/*
    860 	 * If getattr a node that is a stub for a crossed
    861 	 * mount point, keep the original secinfo flavor for
    862 	 * the current file system, not the crossed one.
    863 	 */
    864 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
    865 
    866 	return (e.error);
    867 }
    868 
    869 /*
    870  * Generate a compound to get attributes over-the-wire.
    871  */
    872 void
    873 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
    874     nfs4_error_t *ep, cred_t *cr, int get_acl)
    875 {
    876 	COMPOUND4args_clnt args;
    877 	COMPOUND4res_clnt res;
    878 	int doqueue;
    879 	rnode4_t *rp = VTOR4(vp);
    880 	nfs_argop4 argop[2];
    881 
    882 	args.ctag = TAG_GETATTR;
    883 
    884 	args.array_len = 2;
    885 	args.array = argop;
    886 
    887 	/* putfh */
    888 	argop[0].argop = OP_CPUTFH;
    889 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
    890 
    891 	/* getattr */
    892 	/*
    893 	 * Unlike nfs version 2 and 3, where getattr returns all the
    894 	 * attributes, nfs version 4 returns only the ones explicitly
    895 	 * asked for. This creates problems, as some system functions
    896 	 * (e.g. cache check) require certain attributes and if the
    897 	 * cached node lacks some attributes such as uid/gid, it can
    898 	 * affect system utilities (e.g. "ls") that rely on the information
    899 	 * to be there. This can lead to anything from system crashes to
    900 	 * corrupted information processed by user apps.
    901 	 * So to ensure that all bases are covered, request at least
    902 	 * the AT_ALL attribute mask.
    903 	 */
    904 	argop[1].argop = OP_GETATTR;
    905 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
    906 	if (get_acl)
    907 		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
    908 	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
    909 
    910 	doqueue = 1;
    911 
    912 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
    913 
    914 	if (ep->error)
    915 		return;
    916 
    917 	if (res.status != NFS4_OK) {
    918 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
    919 		return;
    920 	}
    921 
    922 	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
    923 
    924 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
    925 }
    926 
    927 /*
    928  * Return either cached or remote attributes. If get remote attr
    929  * use them to check and invalidate caches, then cache the new attributes.
    930  */
    931 int
    932 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
    933 {
    934 	int error;
    935 	rnode4_t *rp;
    936 	nfs4_ga_res_t gar;
    937 
    938 	ASSERT(nfs4_consistent_type(vp));
    939 
    940 	/*
    941 	 * If we've got cached attributes, we're done, otherwise go
    942 	 * to the server to get attributes, which will update the cache
    943 	 * in the process. Either way, use the cached attributes for
    944 	 * the caller's vattr_t.
    945 	 *
    946 	 * Note that we ignore the gar set by the OTW call: the attr caching
    947 	 * code may make adjustments when storing to the rnode, and we want
    948 	 * to see those changes here.
    949 	 */
    950 	rp = VTOR4(vp);
    951 	error = 0;
    952 	mutex_enter(&rp->r_statelock);
    953 	if (!ATTRCACHE4_VALID(vp)) {
    954 		mutex_exit(&rp->r_statelock);
    955 		error = nfs4_getattr_otw(vp, &gar, cr, 0);
    956 		mutex_enter(&rp->r_statelock);
    957 	}
    958 
    959 	if (!error)
    960 		*vap = rp->r_attr;
    961 
    962 	/* Return the client's view of file size */
    963 	vap->va_size = rp->r_size;
    964 
    965 	mutex_exit(&rp->r_statelock);
    966 
    967 	ASSERT(nfs4_consistent_type(vp));
    968 
    969 	return (error);
    970 }
    971 
    972 int
    973 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
    974     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
    975 {
    976 	COMPOUND4args_clnt args;
    977 	COMPOUND4res_clnt res;
    978 	int doqueue;
    979 	nfs_argop4 argop[2];
    980 	mntinfo4_t *mi = VTOMI4(vp);
    981 	bool_t needrecov = FALSE;
    982 	nfs4_recov_state_t recov_state;
    983 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
    984 	nfs4_ga_ext_res_t *gerp;
    985 
    986 	recov_state.rs_flags = 0;
    987 	recov_state.rs_num_retry_despite_err = 0;
    988 
    989 recov_retry:
    990 	args.ctag = tag_type;
    991 
    992 	args.array_len = 2;
    993 	args.array = argop;
    994 
    995 	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
    996 	if (e.error)
    997 		return (e.error);
    998 
    999 	/* putfh */
   1000 	argop[0].argop = OP_CPUTFH;
   1001 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
   1002 
   1003 	/* getattr */
   1004 	argop[1].argop = OP_GETATTR;
   1005 	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
   1006 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
   1007 
   1008 	doqueue = 1;
   1009 
   1010 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
   1011 	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
   1012 	    rnode4info(VTOR4(vp))));
   1013 
   1014 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
   1015 
   1016 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
   1017 	if (!needrecov && e.error) {
   1018 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
   1019 		    needrecov);
   1020 		return (e.error);
   1021 	}
   1022 
   1023 	if (needrecov) {
   1024 		bool_t abort;
   1025 
   1026 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1027 		    "nfs4_attr_otw: initiating recovery\n"));
   1028 
   1029 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
   1030 		    NULL, OP_GETATTR, NULL, NULL, NULL);
   1031 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
   1032 		    needrecov);
   1033 		if (!e.error) {
   1034 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1035 			e.error = geterrno4(res.status);
   1036 		}
   1037 		if (abort == FALSE)
   1038 			goto recov_retry;
   1039 		return (e.error);
   1040 	}
   1041 
   1042 	if (res.status) {
   1043 		e.error = geterrno4(res.status);
   1044 	} else {
   1045 		gerp = garp->n4g_ext_res;
   1046 		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
   1047 		    garp, sizeof (nfs4_ga_res_t));
   1048 		garp->n4g_ext_res = gerp;
   1049 		if (garp->n4g_ext_res &&
   1050 		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
   1051 			bcopy(res.array[1].nfs_resop4_u.opgetattr.
   1052 			    ga_res.n4g_ext_res,
   1053 			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
   1054 	}
   1055 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1056 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
   1057 	    needrecov);
   1058 	return (e.error);
   1059 }
   1060 
   1061 /*
   1062  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
   1063  * for the demand-based allocation of async threads per-mount.  The
   1064  * nfs_async_timeout is the amount of time a thread will live after it
   1065  * becomes idle, unless new I/O requests are received before the thread
   1066  * dies.  See nfs4_async_putpage and nfs4_async_start.
   1067  */
   1068 
        /* Worker-thread entry points and their shared body; defined later in this file. */
   1069 static void	nfs4_async_start(struct vfs *);
   1070 static void	nfs4_async_pgops_start(struct vfs *);
   1071 static void	nfs4_async_common_start(struct vfs *, int);
   1072 
   1073 static void
   1074 free_async_args4(struct nfs4_async_reqs *args)
   1075 {
   1076 	rnode4_t *rp;
   1077 
   1078 	if (args->a_io != NFS4_INACTIVE) {
   1079 		rp = VTOR4(args->a_vp);
   1080 		mutex_enter(&rp->r_statelock);
   1081 		rp->r_count--;
   1082 		if (args->a_io == NFS4_PUTAPAGE ||
   1083 		    args->a_io == NFS4_PAGEIO)
   1084 			rp->r_awcount--;
   1085 		cv_broadcast(&rp->r_cv);
   1086 		mutex_exit(&rp->r_statelock);
   1087 		VN_RELE(args->a_vp);
   1088 	}
   1089 	crfree(args->a_cred);
   1090 	kmem_free(args, sizeof (*args));
   1091 }
   1092 
   1093 /*
   1094  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
   1095  * pageout(), running in the global zone, have legitimate reasons to do
   1096  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
   1097  * use of a per-mount "asynchronous requests manager thread" which is
   1098  * signaled by the various asynchronous work routines when there is
   1099  * asynchronous work to be done.  It is responsible for creating new
   1100  * worker threads if necessary, and notifying existing worker threads
   1101  * that there is work to be done.
   1102  *
   1103  * In other words, it will "take the specifications from the customers and
   1104  * give them to the engineers."
   1105  *
   1106  * Worker threads die off of their own accord if they are no longer
   1107  * needed.
   1108  *
   1109  * This thread is killed when the zone is going away or the filesystem
   1110  * is being unmounted.
   1111  */
   1112 void
   1113 nfs4_async_manager(vfs_t *vfsp)
   1114 {
   1115 	callb_cpr_t cprinfo;
   1116 	mntinfo4_t *mi;
   1117 	uint_t max_threads;
   1118 
   1119 	mi = VFTOMI4(vfsp);
   1120 
   1121 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
   1122 	    "nfs4_async_manager");
   1123 
   1124 	mutex_enter(&mi->mi_async_lock);
   1125 	/*
   1126 	 * We want to stash the max number of threads that this mount was
   1127 	 * allowed so we can use it later when the variable is set to zero as
   1128 	 * part of the zone/mount going away.
   1129 	 *
   1130 	 * We want to be able to create at least one thread to handle
   1131 	 * asynchronous inactive calls.
   1132 	 */
   1133 	max_threads = MAX(mi->mi_max_threads, 1);
   1134 	/*
   1135 	 * We don't want to wait for mi_max_threads to go to zero, since that
   1136 	 * happens as part of a failed unmount, but this thread should only
   1137 	 * exit when the mount is really going away.
   1138 	 *
   1139 	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
   1140 	 * attempted: the various _async_*() functions know to do things
   1141 	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
   1142 	 * outstanding requests.
   1143 	 *
   1144 	 * Note that we still create zthreads even if we notice the zone is
   1145 	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
   1146 	 * shutdown sequence to take slightly longer in some cases, but
   1147 	 * doesn't violate the protocol, as all threads will exit as soon as
   1148 	 * they're done processing the remaining requests.
   1149 	 */
   1150 	for (;;) {
   1151 		while (mi->mi_async_req_count > 0) {
   1152 			/*
   1153 			 * Paranoia: If the mount started out having
   1154 			 * (mi->mi_max_threads == 0), and the value was
   1155 			 * later changed (via a debugger or somesuch),
   1156 			 * we could be confused since we will think we
   1157 			 * can't create any threads, and the calling
   1158 			 * code (which looks at the current value of
   1159 			 * mi->mi_max_threads, now non-zero) thinks we
   1160 			 * can.
   1161 			 *
   1162 			 * So, because we're paranoid, we create threads
   1163 			 * up to the maximum of the original and the
   1164 			 * current value. This means that future
   1165 			 * (debugger-induced) alterations of
   1166 			 * mi->mi_max_threads are ignored for our
   1167 			 * purposes, but who told them they could change
   1168 			 * random values on a live kernel anyhow?
   1169 			 */
   1170 			if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
   1171 			    MAX(mi->mi_max_threads, max_threads)) {
   1172 				mi->mi_threads[NFS4_ASYNC_QUEUE]++;
   1173 				mutex_exit(&mi->mi_async_lock);
   1174 				MI4_HOLD(mi);
   1175 				VFS_HOLD(vfsp);	/* hold for new thread */
   1176 				(void) zthread_create(NULL, 0, nfs4_async_start,
   1177 				    vfsp, 0, minclsyspri);
   1178 				mutex_enter(&mi->mi_async_lock);
   1179 			} else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
   1180 			    NUM_ASYNC_PGOPS_THREADS) {
   1181 				mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
   1182 				mutex_exit(&mi->mi_async_lock);
   1183 				MI4_HOLD(mi);
   1184 				VFS_HOLD(vfsp); /* hold for new thread */
   1185 				(void) zthread_create(NULL, 0,
   1186 				    nfs4_async_pgops_start, vfsp, 0,
   1187 				    minclsyspri);
   1188 				mutex_enter(&mi->mi_async_lock);
   1189 			}
   1190 			NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
   1191 			ASSERT(mi->mi_async_req_count != 0);
   1192 			mi->mi_async_req_count--;
   1193 		}
   1194 
   1195 		mutex_enter(&mi->mi_lock);
   1196 		if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
   1197 			mutex_exit(&mi->mi_lock);
   1198 			break;
   1199 		}
   1200 		mutex_exit(&mi->mi_lock);
   1201 
   1202 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   1203 		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
   1204 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
   1205 	}
   1206 
   1207 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
   1208 	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
   1209 	/*
   1210 	 * Let everyone know we're done.
   1211 	 */
   1212 	mi->mi_manager_thread = NULL;
   1213 	/*
   1214 	 * Wake up the inactive thread.
   1215 	 */
   1216 	cv_broadcast(&mi->mi_inact_req_cv);
   1217 	/*
   1218 	 * Wake up anyone sitting in nfs4_async_manager_stop()
   1219 	 */
   1220 	cv_broadcast(&mi->mi_async_cv);
   1221 	/*
   1222 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
   1223 	 * since CALLB_CPR_EXIT is actually responsible for releasing
   1224 	 * 'mi_async_lock'.
   1225 	 */
   1226 	CALLB_CPR_EXIT(&cprinfo);
   1227 	VFS_RELE(vfsp);	/* release thread's hold */
   1228 	MI4_RELE(mi);
   1229 	zthread_exit();
   1230 }
   1231 
   1232 /*
   1233  * Signal (and wait for) the async manager thread to clean up and go away.
   1234  */
   1235 void
   1236 nfs4_async_manager_stop(vfs_t *vfsp)
   1237 {
   1238 	mntinfo4_t *mi = VFTOMI4(vfsp);
   1239 
   1240 	mutex_enter(&mi->mi_async_lock);
   1241 	mutex_enter(&mi->mi_lock);
   1242 	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
   1243 	mutex_exit(&mi->mi_lock);
   1244 	cv_broadcast(&mi->mi_async_reqs_cv);
   1245 	/*
   1246 	 * Wait for the async manager thread to die.
   1247 	 */
   1248 	while (mi->mi_manager_thread != NULL)
   1249 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
   1250 	mutex_exit(&mi->mi_async_lock);
   1251 }
   1252 
   1253 int
   1254 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
   1255     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
   1256     u_offset_t, caddr_t, struct seg *, cred_t *))
   1257 {
   1258 	rnode4_t *rp;
   1259 	mntinfo4_t *mi;
   1260 	struct nfs4_async_reqs *args;
   1261 
   1262 	rp = VTOR4(vp);
   1263 	ASSERT(rp->r_freef == NULL);
   1264 
   1265 	mi = VTOMI4(vp);
   1266 
   1267 	/*
   1268 	 * If addr falls in a different segment, don't bother doing readahead.
   1269 	 */
   1270 	if (addr >= seg->s_base + seg->s_size)
   1271 		return (-1);
   1272 
   1273 	/*
   1274 	 * If we can't allocate a request structure, punt on the readahead.
   1275 	 */
   1276 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
   1277 		return (-1);
   1278 
   1279 	/*
   1280 	 * If a lock operation is pending, don't initiate any new
   1281 	 * readaheads.  Otherwise, bump r_count to indicate the new
   1282 	 * asynchronous I/O.
   1283 	 */
   1284 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
   1285 		kmem_free(args, sizeof (*args));
   1286 		return (-1);
   1287 	}
   1288 	mutex_enter(&rp->r_statelock);
   1289 	rp->r_count++;
   1290 	mutex_exit(&rp->r_statelock);
   1291 	nfs_rw_exit(&rp->r_lkserlock);
   1292 
   1293 	args->a_next = NULL;
   1294 #ifdef DEBUG
   1295 	args->a_queuer = curthread;
   1296 #endif
   1297 	VN_HOLD(vp);
   1298 	args->a_vp = vp;
   1299 	ASSERT(cr != NULL);
   1300 	crhold(cr);
   1301 	args->a_cred = cr;
   1302 	args->a_io = NFS4_READ_AHEAD;
   1303 	args->a_nfs4_readahead = readahead;
   1304 	args->a_nfs4_blkoff = blkoff;
   1305 	args->a_nfs4_seg = seg;
   1306 	args->a_nfs4_addr = addr;
   1307 
   1308 	mutex_enter(&mi->mi_async_lock);
   1309 
   1310 	/*
   1311 	 * If asyncio has been disabled, don't bother readahead.
   1312 	 */
   1313 	if (mi->mi_max_threads == 0) {
   1314 		mutex_exit(&mi->mi_async_lock);
   1315 		goto noasync;
   1316 	}
   1317 
   1318 	/*
   1319 	 * Link request structure into the async list and
   1320 	 * wakeup async thread to do the i/o.
   1321 	 */
   1322 	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
   1323 		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
   1324 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
   1325 	} else {
   1326 		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
   1327 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
   1328 	}
   1329 
   1330 	if (mi->mi_io_kstats) {
   1331 		mutex_enter(&mi->mi_lock);
   1332 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   1333 		mutex_exit(&mi->mi_lock);
   1334 	}
   1335 
   1336 	mi->mi_async_req_count++;
   1337 	ASSERT(mi->mi_async_req_count != 0);
   1338 	cv_signal(&mi->mi_async_reqs_cv);
   1339 	mutex_exit(&mi->mi_async_lock);
   1340 	return (0);
   1341 
   1342 noasync:
   1343 	mutex_enter(&rp->r_statelock);
   1344 	rp->r_count--;
   1345 	cv_broadcast(&rp->r_cv);
   1346 	mutex_exit(&rp->r_statelock);
   1347 	VN_RELE(vp);
   1348 	crfree(cr);
   1349 	kmem_free(args, sizeof (*args));
   1350 	return (-1);
   1351 }
   1352 
   1353 static void
   1354 nfs4_async_start(struct vfs *vfsp)
   1355 {
   1356 	nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
   1357 }
   1358 
   1359 static void
   1360 nfs4_async_pgops_start(struct vfs *vfsp)
   1361 {
   1362 	nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
   1363 }
   1364 
   1365 /*
   1366  * The async queues for each mounted file system are arranged as a
   1367  * set of queues, one for each async i/o type.  Requests are taken
   1368  * from the queues in a round-robin fashion.  A number of consecutive
   1369  * requests are taken from each queue before moving on to the next
   1370  * queue.  This functionality may allow the NFS Version 2 server to do
   1371  * write clustering, even if the client is mixing writes and reads
   1372  * because it will take multiple write requests from the queue
   1373  * before processing any of the other async i/o types.
   1374  *
   1375  * XXX The nfs4_async_common_start thread is unsafe in the light of the present
   1376  * model defined by cpr to suspend the system. Specifically over the
   1377  * wire calls are cpr-unsafe. The thread should be reevaluated in
   1378  * case of future updates to the cpr model.
   1379  */
   1380 static void
   1381 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
   1382 {
   1383 	struct nfs4_async_reqs *args;
   1384 	mntinfo4_t *mi = VFTOMI4(vfsp);
   1385 	clock_t time_left = 1;
   1386 	callb_cpr_t cprinfo;
   1387 	int i;
   1388 	extern int nfs_async_timeout;
   1389 	int async_types;
   1390 	kcondvar_t *async_work_cv;
   1391 
   1392 	if (async_queue == NFS4_ASYNC_QUEUE) {
   1393 		async_types = NFS4_ASYNC_TYPES;
   1394 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
   1395 	} else {
   1396 		async_types = NFS4_ASYNC_PGOPS_TYPES;
   1397 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
   1398 	}
   1399 
   1400 	/*
   1401 	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
   1402 	 * built in an implementation independent manner.
   1403 	 */
   1404 	if (nfs_async_timeout == -1)
   1405 		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
   1406 
   1407 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
   1408 
   1409 	mutex_enter(&mi->mi_async_lock);
   1410 	for (;;) {
   1411 		/*
   1412 		 * Find the next queue containing an entry.  We start
   1413 		 * at the current queue pointer and then round robin
   1414 		 * through all of them until we either find a non-empty
   1415 		 * queue or have looked through all of them.
   1416 		 */
   1417 		for (i = 0; i < async_types; i++) {
   1418 			args = *mi->mi_async_curr[async_queue];
   1419 			if (args != NULL)
   1420 				break;
   1421 			mi->mi_async_curr[async_queue]++;
   1422 			if (mi->mi_async_curr[async_queue] ==
   1423 			    &mi->mi_async_reqs[async_types]) {
   1424 				mi->mi_async_curr[async_queue] =
   1425 				    &mi->mi_async_reqs[0];
   1426 			}
   1427 		}
   1428 		/*
   1429 		 * If we didn't find a entry, then block until woken up
   1430 		 * again and then look through the queues again.
   1431 		 */
   1432 		if (args == NULL) {
   1433 			/*
   1434 			 * Exiting is considered to be safe for CPR as well
   1435 			 */
   1436 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
   1437 
   1438 			/*
   1439 			 * Wakeup thread waiting to unmount the file
   1440 			 * system only if all async threads are inactive.
   1441 			 *
   1442 			 * If we've timed-out and there's nothing to do,
   1443 			 * then get rid of this thread.
   1444 			 */
   1445 			if (mi->mi_max_threads == 0 || time_left <= 0) {
   1446 				--mi->mi_threads[async_queue];
   1447 
   1448 				if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
   1449 				    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
   1450 					cv_signal(&mi->mi_async_cv);
   1451 				CALLB_CPR_EXIT(&cprinfo);
   1452 				VFS_RELE(vfsp);	/* release thread's hold */
   1453 				MI4_RELE(mi);
   1454 				zthread_exit();
   1455 				/* NOTREACHED */
   1456 			}
   1457 			time_left = cv_reltimedwait(async_work_cv,
   1458 			    &mi->mi_async_lock, nfs_async_timeout,
   1459 			    TR_CLOCK_TICK);
   1460 
   1461 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
   1462 
   1463 			continue;
   1464 		} else {
   1465 			time_left = 1;
   1466 		}
   1467 
   1468 		/*
   1469 		 * Remove the request from the async queue and then
   1470 		 * update the current async request queue pointer.  If
   1471 		 * the current queue is empty or we have removed enough
   1472 		 * consecutive entries from it, then reset the counter
   1473 		 * for this queue and then move the current pointer to
   1474 		 * the next queue.
   1475 		 */
   1476 		*mi->mi_async_curr[async_queue] = args->a_next;
   1477 		if (*mi->mi_async_curr[async_queue] == NULL ||
   1478 		    --mi->mi_async_clusters[args->a_io] == 0) {
   1479 			mi->mi_async_clusters[args->a_io] =
   1480 			    mi->mi_async_init_clusters;
   1481 			mi->mi_async_curr[async_queue]++;
   1482 			if (mi->mi_async_curr[async_queue] ==
   1483 			    &mi->mi_async_reqs[async_types]) {
   1484 				mi->mi_async_curr[async_queue] =
   1485 				    &mi->mi_async_reqs[0];
   1486 			}
   1487 		}
   1488 
   1489 		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
   1490 			mutex_enter(&mi->mi_lock);
   1491 			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
   1492 			mutex_exit(&mi->mi_lock);
   1493 		}
   1494 
   1495 		mutex_exit(&mi->mi_async_lock);
   1496 
   1497 		/*
   1498 		 * Obtain arguments from the async request structure.
   1499 		 */
   1500 		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
   1501 			(*args->a_nfs4_readahead)(args->a_vp,
   1502 			    args->a_nfs4_blkoff, args->a_nfs4_addr,
   1503 			    args->a_nfs4_seg, args->a_cred);
   1504 		} else if (args->a_io == NFS4_PUTAPAGE) {
   1505 			(void) (*args->a_nfs4_putapage)(args->a_vp,
   1506 			    args->a_nfs4_pp, args->a_nfs4_off,
   1507 			    args->a_nfs4_len, args->a_nfs4_flags,
   1508 			    args->a_cred);
   1509 		} else if (args->a_io == NFS4_PAGEIO) {
   1510 			(void) (*args->a_nfs4_pageio)(args->a_vp,
   1511 			    args->a_nfs4_pp, args->a_nfs4_off,
   1512 			    args->a_nfs4_len, args->a_nfs4_flags,
   1513 			    args->a_cred);
   1514 		} else if (args->a_io == NFS4_READDIR) {
   1515 			(void) ((*args->a_nfs4_readdir)(args->a_vp,
   1516 			    args->a_nfs4_rdc, args->a_cred));
   1517 		} else if (args->a_io == NFS4_COMMIT) {
   1518 			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
   1519 			    args->a_nfs4_offset, args->a_nfs4_count,
   1520 			    args->a_cred);
   1521 		} else if (args->a_io == NFS4_INACTIVE) {
   1522 			nfs4_inactive_otw(args->a_vp, args->a_cred);
   1523 		}
   1524 
   1525 		/*
   1526 		 * Now, release the vnode and free the credentials
   1527 		 * structure.
   1528 		 */
   1529 		free_async_args4(args);
   1530 		/*
   1531 		 * Reacquire the mutex because it will be needed above.
   1532 		 */
   1533 		mutex_enter(&mi->mi_async_lock);
   1534 	}
   1535 }
   1536 
   1537 /*
   1538  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
   1539  * part of VOP_INACTIVE.
   1540  */
   1541 
   1542 void
   1543 nfs4_inactive_thread(mntinfo4_t *mi)
   1544 {
   1545 	struct nfs4_async_reqs *args;
   1546 	callb_cpr_t cprinfo;
   1547 	vfs_t *vfsp = mi->mi_vfsp;
   1548 
   1549 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
   1550 	    "nfs4_inactive_thread");
   1551 
   1552 	for (;;) {
   1553 		mutex_enter(&mi->mi_async_lock);
   1554 		args = mi->mi_async_reqs[NFS4_INACTIVE];
   1555 		if (args == NULL) {
   1556 			mutex_enter(&mi->mi_lock);
   1557 			/*
   1558 			 * We don't want to exit until the async manager is done
   1559 			 * with its work; hence the check for mi_manager_thread
   1560 			 * being NULL.
   1561 			 *
   1562 			 * The async manager thread will cv_broadcast() on
   1563 			 * mi_inact_req_cv when it's done, at which point we'll
   1564 			 * wake up and exit.
   1565 			 */
   1566 			if (mi->mi_manager_thread == NULL)
   1567 				goto die;
   1568 			mi->mi_flags |= MI4_INACTIVE_IDLE;
   1569 			mutex_exit(&mi->mi_lock);
   1570 			cv_signal(&mi->mi_async_cv);
   1571 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
   1572 			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
   1573 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
   1574 			mutex_exit(&mi->mi_async_lock);
   1575 		} else {
   1576 			mutex_enter(&mi->mi_lock);
   1577 			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
   1578 			mutex_exit(&mi->mi_lock);
   1579 			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
   1580 			mutex_exit(&mi->mi_async_lock);
   1581 			nfs4_inactive_otw(args->a_vp, args->a_cred);
   1582 			crfree(args->a_cred);
   1583 			kmem_free(args, sizeof (*args));
   1584 		}
   1585 	}
   1586 die:
   1587 	mutex_exit(&mi->mi_lock);
   1588 	mi->mi_inactive_thread = NULL;
   1589 	cv_signal(&mi->mi_async_cv);
   1590 
   1591 	/*
   1592 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
   1593 	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
   1594 	 */
   1595 	CALLB_CPR_EXIT(&cprinfo);
   1596 
   1597 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
   1598 	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
   1599 
   1600 	MI4_RELE(mi);
   1601 	zthread_exit();
   1602 	/* NOTREACHED */
   1603 }
   1604 
   1605 /*
   1606  * nfs4_async_stop:
   1607  * Wait for all outstanding putpage operations and the inactive thread to
   1608  * complete; nfs4_async_stop_sig() without interruptibility.
   1609  */
   1610 void
   1611 nfs4_async_stop(struct vfs *vfsp)
   1612 {
   1613 	mntinfo4_t *mi = VFTOMI4(vfsp);
   1614 
   1615 	/*
   1616 	 * Wait for all outstanding async operations to complete and for
   1617 	 * worker threads to exit.
   1618 	 */
   1619 	mutex_enter(&mi->mi_async_lock);
   1620 	mi->mi_max_threads = 0;
   1621 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
   1622 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
   1623 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
   1624 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
   1625 
   1626 	/*
   1627 	 * Wait for the inactive thread to finish doing what it's doing.  It
   1628 	 * won't exit until the last reference to the vfs_t goes away.
   1629 	 */
   1630 	if (mi->mi_inactive_thread != NULL) {
   1631 		mutex_enter(&mi->mi_lock);
   1632 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
   1633 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
   1634 			mutex_exit(&mi->mi_lock);
   1635 			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
   1636 			mutex_enter(&mi->mi_lock);
   1637 		}
   1638 		mutex_exit(&mi->mi_lock);
   1639 	}
   1640 	mutex_exit(&mi->mi_async_lock);
   1641 }
   1642 
   1643 /*
   1644  * nfs4_async_stop_sig:
   1645  * Wait for all outstanding putpage operations and the inactive thread to
   1646  * complete. If a signal is delivered we will abort and return non-zero;
   1647  * otherwise return 0. Since this routine is called from nfs4_unmount, we
   1648  * need to make it interruptible.
   1649  */
   1650 int
   1651 nfs4_async_stop_sig(struct vfs *vfsp)
   1652 {
   1653 	mntinfo4_t *mi = VFTOMI4(vfsp);
   1654 	ushort_t omax;
   1655 	bool_t intr = FALSE;
   1656 
   1657 	/*
   1658 	 * Wait for all outstanding putpage operations to complete and for
   1659 	 * worker threads to exit.
   1660 	 */
   1661 	mutex_enter(&mi->mi_async_lock);
   1662 	omax = mi->mi_max_threads;
   1663 	mi->mi_max_threads = 0;
   1664 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
   1665 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
   1666 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
   1667 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
   1668 			intr = TRUE;
   1669 			goto interrupted;
   1670 		}
   1671 	}
   1672 
   1673 	/*
   1674 	 * Wait for the inactive thread to finish doing what it's doing.  It
   1675 	 * won't exit until the a last reference to the vfs_t goes away.
   1676 	 */
   1677 	if (mi->mi_inactive_thread != NULL) {
   1678 		mutex_enter(&mi->mi_lock);
   1679 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
   1680 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
   1681 			mutex_exit(&mi->mi_lock);
   1682 			if (!cv_wait_sig(&mi->mi_async_cv,
   1683 			    &mi->mi_async_lock)) {
   1684 				intr = TRUE;
   1685 				goto interrupted;
   1686 			}
   1687 			mutex_enter(&mi->mi_lock);
   1688 		}
   1689 		mutex_exit(&mi->mi_lock);
   1690 	}
   1691 interrupted:
   1692 	if (intr)
   1693 		mi->mi_max_threads = omax;
   1694 	mutex_exit(&mi->mi_async_lock);
   1695 
   1696 	return (intr);
   1697 }
   1698 
   1699 int
   1700 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
   1701     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
   1702     u_offset_t, size_t, int, cred_t *))
   1703 {
   1704 	rnode4_t *rp;
   1705 	mntinfo4_t *mi;
   1706 	struct nfs4_async_reqs *args;
   1707 
   1708 	ASSERT(flags & B_ASYNC);
   1709 	ASSERT(vp->v_vfsp != NULL);
   1710 
   1711 	rp = VTOR4(vp);
   1712 	ASSERT(rp->r_count > 0);
   1713 
   1714 	mi = VTOMI4(vp);
   1715 
   1716 	/*
   1717 	 * If we can't allocate a request structure, do the putpage
   1718 	 * operation synchronously in this thread's context.
   1719 	 */
   1720 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
   1721 		goto noasync;
   1722 
   1723 	args->a_next = NULL;
   1724 #ifdef DEBUG
   1725 	args->a_queuer = curthread;
   1726 #endif
   1727 	VN_HOLD(vp);
   1728 	args->a_vp = vp;
   1729 	ASSERT(cr != NULL);
   1730 	crhold(cr);
   1731 	args->a_cred = cr;
   1732 	args->a_io = NFS4_PUTAPAGE;
   1733 	args->a_nfs4_putapage = putapage;
   1734 	args->a_nfs4_pp = pp;
   1735 	args->a_nfs4_off = off;
   1736 	args->a_nfs4_len = (uint_t)len;
   1737 	args->a_nfs4_flags = flags;
   1738 
   1739 	mutex_enter(&mi->mi_async_lock);
   1740 
   1741 	/*
   1742 	 * If asyncio has been disabled, then make a synchronous request.
   1743 	 * This check is done a second time in case async io was diabled
   1744 	 * while this thread was blocked waiting for memory pressure to
   1745 	 * reduce or for the queue to drain.
   1746 	 */
   1747 	if (mi->mi_max_threads == 0) {
   1748 		mutex_exit(&mi->mi_async_lock);
   1749 
   1750 		VN_RELE(vp);
   1751 		crfree(cr);
   1752 		kmem_free(args, sizeof (*args));
   1753 		goto noasync;
   1754 	}
   1755 
   1756 	/*
   1757 	 * Link request structure into the async list and
   1758 	 * wakeup async thread to do the i/o.
   1759 	 */
   1760 	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
   1761 		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
   1762 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
   1763 	} else {
   1764 		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
   1765 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
   1766 	}
   1767 
   1768 	mutex_enter(&rp->r_statelock);
   1769 	rp->r_count++;
   1770 	rp->r_awcount++;
   1771 	mutex_exit(&rp->r_statelock);
   1772 
   1773 	if (mi->mi_io_kstats) {
   1774 		mutex_enter(&mi->mi_lock);
   1775 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   1776 		mutex_exit(&mi->mi_lock);
   1777 	}
   1778 
   1779 	mi->mi_async_req_count++;
   1780 	ASSERT(mi->mi_async_req_count != 0);
   1781 	cv_signal(&mi->mi_async_reqs_cv);
   1782 	mutex_exit(&mi->mi_async_lock);
   1783 	return (0);
   1784 
   1785 noasync:
   1786 
   1787 	if (curproc == proc_pageout || curproc == proc_fsflush ||
   1788 	    nfs_zone() == mi->mi_zone) {
   1789 		/*
   1790 		 * If we get here in the context of the pageout/fsflush,
   1791 		 * or we have run out of memory or we're attempting to
   1792 		 * unmount we refuse to do a sync write, because this may
   1793 		 * hang pageout/fsflush and the machine. In this case,
   1794 		 * we just re-mark the page as dirty and punt on the page.
   1795 		 *
   1796 		 * Make sure B_FORCE isn't set.  We can re-mark the
   1797 		 * pages as dirty and unlock the pages in one swoop by
   1798 		 * passing in B_ERROR to pvn_write_done().  However,
   1799 		 * we should make sure B_FORCE isn't set - we don't
   1800 		 * want the page tossed before it gets written out.
   1801 		 */
   1802 		if (flags & B_FORCE)
   1803 			flags &= ~(B_INVAL | B_FORCE);
   1804 		pvn_write_done(pp, flags | B_ERROR);
   1805 		return (0);
   1806 	}
   1807 
   1808 	/*
   1809 	 * We'll get here only if (nfs_zone() != mi->mi_zone)
   1810 	 * which means that this was a cross-zone sync putpage.
   1811 	 *
   1812 	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
   1813 	 * as dirty and unlock them.
   1814 	 *
   1815 	 * We don't want to clear B_FORCE here as the caller presumably
   1816 	 * knows what they're doing if they set it.
   1817 	 */
   1818 	pvn_write_done(pp, flags | B_ERROR);
   1819 	return (EPERM);
   1820 }
   1821 
   1822 int
   1823 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
   1824     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
   1825     size_t, int, cred_t *))
   1826 {
   1827 	rnode4_t *rp;
   1828 	mntinfo4_t *mi;
   1829 	struct nfs4_async_reqs *args;
   1830 
   1831 	ASSERT(flags & B_ASYNC);
   1832 	ASSERT(vp->v_vfsp != NULL);
   1833 
   1834 	rp = VTOR4(vp);
   1835 	ASSERT(rp->r_count > 0);
   1836 
   1837 	mi = VTOMI4(vp);
   1838 
   1839 	/*
   1840 	 * If we can't allocate a request structure, do the pageio
   1841 	 * request synchronously in this thread's context.
   1842 	 */
   1843 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
   1844 		goto noasync;
   1845 
   1846 	args->a_next = NULL;
   1847 #ifdef DEBUG
   1848 	args->a_queuer = curthread;
   1849 #endif
   1850 	VN_HOLD(vp);
   1851 	args->a_vp = vp;
   1852 	ASSERT(cr != NULL);
   1853 	crhold(cr);
   1854 	args->a_cred = cr;
   1855 	args->a_io = NFS4_PAGEIO;
   1856 	args->a_nfs4_pageio = pageio;
   1857 	args->a_nfs4_pp = pp;
   1858 	args->a_nfs4_off = io_off;
   1859 	args->a_nfs4_len = (uint_t)io_len;
   1860 	args->a_nfs4_flags = flags;
   1861 
   1862 	mutex_enter(&mi->mi_async_lock);
   1863 
   1864 	/*
   1865 	 * If asyncio has been disabled, then make a synchronous request.
   1866 	 * This check is done a second time in case async io was diabled
   1867 	 * while this thread was blocked waiting for memory pressure to
   1868 	 * reduce or for the queue to drain.
   1869 	 */
   1870 	if (mi->mi_max_threads == 0) {
   1871 		mutex_exit(&mi->mi_async_lock);
   1872 
   1873 		VN_RELE(vp);
   1874 		crfree(cr);
   1875 		kmem_free(args, sizeof (*args));
   1876 		goto noasync;
   1877 	}
   1878 
   1879 	/*
   1880 	 * Link request structure into the async list and
   1881 	 * wakeup async thread to do the i/o.
   1882 	 */
   1883 	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
   1884 		mi->mi_async_reqs[NFS4_PAGEIO] = args;
   1885 		mi->mi_async_tail[NFS4_PAGEIO] = args;
   1886 	} else {
   1887 		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
   1888 		mi->mi_async_tail[NFS4_PAGEIO] = args;
   1889 	}
   1890 
   1891 	mutex_enter(&rp->r_statelock);
   1892 	rp->r_count++;
   1893 	rp->r_awcount++;
   1894 	mutex_exit(&rp->r_statelock);
   1895 
   1896 	if (mi->mi_io_kstats) {
   1897 		mutex_enter(&mi->mi_lock);
   1898 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   1899 		mutex_exit(&mi->mi_lock);
   1900 	}
   1901 
   1902 	mi->mi_async_req_count++;
   1903 	ASSERT(mi->mi_async_req_count != 0);
   1904 	cv_signal(&mi->mi_async_reqs_cv);
   1905 	mutex_exit(&mi->mi_async_lock);
   1906 	return (0);
   1907 
   1908 noasync:
   1909 	/*
   1910 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
   1911 	 * the page list), for writes we do it synchronously, except for
   1912 	 * proc_pageout/proc_fsflush as described below.
   1913 	 */
   1914 	if (flags & B_READ) {
   1915 		pvn_read_done(pp, flags | B_ERROR);
   1916 		return (0);
   1917 	}
   1918 
   1919 	if (curproc == proc_pageout || curproc == proc_fsflush) {
   1920 		/*
   1921 		 * If we get here in the context of the pageout/fsflush,
   1922 		 * we refuse to do a sync write, because this may hang
   1923 		 * pageout/fsflush (and the machine). In this case, we just
   1924 		 * re-mark the page as dirty and punt on the page.
   1925 		 *
   1926 		 * Make sure B_FORCE isn't set.  We can re-mark the
   1927 		 * pages as dirty and unlock the pages in one swoop by
   1928 		 * passing in B_ERROR to pvn_write_done().  However,
   1929 		 * we should make sure B_FORCE isn't set - we don't
   1930 		 * want the page tossed before it gets written out.
   1931 		 */
   1932 		if (flags & B_FORCE)
   1933 			flags &= ~(B_INVAL | B_FORCE);
   1934 		pvn_write_done(pp, flags | B_ERROR);
   1935 		return (0);
   1936 	}
   1937 
   1938 	if (nfs_zone() != mi->mi_zone) {
   1939 		/*
   1940 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
   1941 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
   1942 		 * them.
   1943 		 *
   1944 		 * We don't want to clear B_FORCE here as the caller presumably
   1945 		 * knows what they're doing if they set it.
   1946 		 */
   1947 		pvn_write_done(pp, flags | B_ERROR);
   1948 		return (EPERM);
   1949 	}
   1950 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
   1951 }
   1952 
   1953 void
   1954 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
   1955     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
   1956 {
   1957 	rnode4_t *rp;
   1958 	mntinfo4_t *mi;
   1959 	struct nfs4_async_reqs *args;
   1960 
   1961 	rp = VTOR4(vp);
   1962 	ASSERT(rp->r_freef == NULL);
   1963 
   1964 	mi = VTOMI4(vp);
   1965 
   1966 	/*
   1967 	 * If we can't allocate a request structure, skip the readdir.
   1968 	 */
   1969 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
   1970 		goto noasync;
   1971 
   1972 	args->a_next = NULL;
   1973 #ifdef DEBUG
   1974 	args->a_queuer = curthread;
   1975 #endif
   1976 	VN_HOLD(vp);
   1977 	args->a_vp = vp;
   1978 	ASSERT(cr != NULL);
   1979 	crhold(cr);
   1980 	args->a_cred = cr;
   1981 	args->a_io = NFS4_READDIR;
   1982 	args->a_nfs4_readdir = readdir;
   1983 	args->a_nfs4_rdc = rdc;
   1984 
   1985 	mutex_enter(&mi->mi_async_lock);
   1986 
   1987 	/*
   1988 	 * If asyncio has been disabled, then skip this request
   1989 	 */
   1990 	if (mi->mi_max_threads == 0) {
   1991 		mutex_exit(&mi->mi_async_lock);
   1992 
   1993 		VN_RELE(vp);
   1994 		crfree(cr);
   1995 		kmem_free(args, sizeof (*args));
   1996 		goto noasync;
   1997 	}
   1998 
   1999 	/*
   2000 	 * Link request structure into the async list and
   2001 	 * wakeup async thread to do the i/o.
   2002 	 */
   2003 	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
   2004 		mi->mi_async_reqs[NFS4_READDIR] = args;
   2005 		mi->mi_async_tail[NFS4_READDIR] = args;
   2006 	} else {
   2007 		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
   2008 		mi->mi_async_tail[NFS4_READDIR] = args;
   2009 	}
   2010 
   2011 	mutex_enter(&rp->r_statelock);
   2012 	rp->r_count++;
   2013 	mutex_exit(&rp->r_statelock);
   2014 
   2015 	if (mi->mi_io_kstats) {
   2016 		mutex_enter(&mi->mi_lock);
   2017 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   2018 		mutex_exit(&mi->mi_lock);
   2019 	}
   2020 
   2021 	mi->mi_async_req_count++;
   2022 	ASSERT(mi->mi_async_req_count != 0);
   2023 	cv_signal(&mi->mi_async_reqs_cv);
   2024 	mutex_exit(&mi->mi_async_lock);
   2025 	return;
   2026 
   2027 noasync:
   2028 	mutex_enter(&rp->r_statelock);
   2029 	rdc->entries = NULL;
   2030 	/*
   2031 	 * Indicate that no one is trying to fill this entry and
   2032 	 * it still needs to be filled.
   2033 	 */
   2034 	rdc->flags &= ~RDDIR;
   2035 	rdc->flags |= RDDIRREQ;
   2036 	rddir4_cache_rele(rp, rdc);
   2037 	mutex_exit(&rp->r_statelock);
   2038 }
   2039 
   2040 void
   2041 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
   2042     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
   2043     cred_t *))
   2044 {
   2045 	rnode4_t *rp;
   2046 	mntinfo4_t *mi;
   2047 	struct nfs4_async_reqs *args;
   2048 	page_t *pp;
   2049 
   2050 	rp = VTOR4(vp);
   2051 	mi = VTOMI4(vp);
   2052 
   2053 	/*
   2054 	 * If we can't allocate a request structure, do the commit
   2055 	 * operation synchronously in this thread's context.
   2056 	 */
   2057 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
   2058 		goto noasync;
   2059 
   2060 	args->a_next = NULL;
   2061 #ifdef DEBUG
   2062 	args->a_queuer = curthread;
   2063 #endif
   2064 	VN_HOLD(vp);
   2065 	args->a_vp = vp;
   2066 	ASSERT(cr != NULL);
   2067 	crhold(cr);
   2068 	args->a_cred = cr;
   2069 	args->a_io = NFS4_COMMIT;
   2070 	args->a_nfs4_commit = commit;
   2071 	args->a_nfs4_plist = plist;
   2072 	args->a_nfs4_offset = offset;
   2073 	args->a_nfs4_count = count;
   2074 
   2075 	mutex_enter(&mi->mi_async_lock);
   2076 
   2077 	/*
   2078 	 * If asyncio has been disabled, then make a synchronous request.
   2079 	 * This check is done a second time in case async io was diabled
   2080 	 * while this thread was blocked waiting for memory pressure to
   2081 	 * reduce or for the queue to drain.
   2082 	 */
   2083 	if (mi->mi_max_threads == 0) {
   2084 		mutex_exit(&mi->mi_async_lock);
   2085 
   2086 		VN_RELE(vp);
   2087 		crfree(cr);
   2088 		kmem_free(args, sizeof (*args));
   2089 		goto noasync;
   2090 	}
   2091 
   2092 	/*
   2093 	 * Link request structure into the async list and
   2094 	 * wakeup async thread to do the i/o.
   2095 	 */
   2096 	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
   2097 		mi->mi_async_reqs[NFS4_COMMIT] = args;
   2098 		mi->mi_async_tail[NFS4_COMMIT] = args;
   2099 	} else {
   2100 		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
   2101 		mi->mi_async_tail[NFS4_COMMIT] = args;
   2102 	}
   2103 
   2104 	mutex_enter(&rp->r_statelock);
   2105 	rp->r_count++;
   2106 	mutex_exit(&rp->r_statelock);
   2107 
   2108 	if (mi->mi_io_kstats) {
   2109 		mutex_enter(&mi->mi_lock);
   2110 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   2111 		mutex_exit(&mi->mi_lock);
   2112 	}
   2113 
   2114 	mi->mi_async_req_count++;
   2115 	ASSERT(mi->mi_async_req_count != 0);
   2116 	cv_signal(&mi->mi_async_reqs_cv);
   2117 	mutex_exit(&mi->mi_async_lock);
   2118 	return;
   2119 
   2120 noasync:
   2121 	if (curproc == proc_pageout || curproc == proc_fsflush ||
   2122 	    nfs_zone() != mi->mi_zone) {
   2123 		while (plist != NULL) {
   2124 			pp = plist;
   2125 			page_sub(&plist, pp);
   2126 			pp->p_fsdata = C_COMMIT;
   2127 			page_unlock(pp);
   2128 		}
   2129 		return;
   2130 	}
   2131 	(*commit)(vp, plist, offset, count, cr);
   2132 }
   2133 
   2134 /*
   2135  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
   2136  * reference to the vnode is handed over to the thread; the caller should
   2137  * no longer refer to the vnode.
   2138  *
   2139  * Unlike most of the async routines, this handoff is needed for
   2140  * correctness reasons, not just performance.  So doing operations in the
   2141  * context of the current thread is not an option.
   2142  */
   2143 void
   2144 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
   2145 {
   2146 	mntinfo4_t *mi;
   2147 	struct nfs4_async_reqs *args;
   2148 	boolean_t signal_inactive_thread = B_FALSE;
   2149 
   2150 	mi = VTOMI4(vp);
   2151 
   2152 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
   2153 	args->a_next = NULL;
   2154 #ifdef DEBUG
   2155 	args->a_queuer = curthread;
   2156 #endif
   2157 	args->a_vp = vp;
   2158 	ASSERT(cr != NULL);
   2159 	crhold(cr);
   2160 	args->a_cred = cr;
   2161 	args->a_io = NFS4_INACTIVE;
   2162 
   2163 	/*
   2164 	 * Note that we don't check mi->mi_max_threads here, since we
   2165 	 * *need* to get rid of this vnode regardless of whether someone
   2166 	 * set nfs4_max_threads to zero in /etc/system.
   2167 	 *
   2168 	 * The manager thread knows about this and is willing to create
   2169 	 * at least one thread to accommodate us.
   2170 	 */
   2171 	mutex_enter(&mi->mi_async_lock);
   2172 	if (mi->mi_inactive_thread == NULL) {
   2173 		rnode4_t *rp;
   2174 		vnode_t *unldvp = NULL;
   2175 		char *unlname;
   2176 		cred_t *unlcred;
   2177 
   2178 		mutex_exit(&mi->mi_async_lock);
   2179 		/*
   2180 		 * We just need to free up the memory associated with the
   2181 		 * vnode, which can be safely done from within the current
   2182 		 * context.
   2183 		 */
   2184 		crfree(cr);	/* drop our reference */
   2185 		kmem_free(args, sizeof (*args));
   2186 		rp = VTOR4(vp);
   2187 		mutex_enter(&rp->r_statelock);
   2188 		if (rp->r_unldvp != NULL) {
   2189 			unldvp = rp->r_unldvp;
   2190 			rp->r_unldvp = NULL;
   2191 			unlname = rp->r_unlname;
   2192 			rp->r_unlname = NULL;
   2193 			unlcred = rp->r_unlcred;
   2194 			rp->r_unlcred = NULL;
   2195 		}
   2196 		mutex_exit(&rp->r_statelock);
   2197 		/*
   2198 		 * No need to explicitly throw away any cached pages.  The
   2199 		 * eventual r4inactive() will attempt a synchronous
   2200 		 * VOP_PUTPAGE() which will immediately fail since the request
   2201 		 * is coming from the wrong zone, and then will proceed to call
   2202 		 * nfs4_invalidate_pages() which will clean things up for us.
   2203 		 *
   2204 		 * Throw away the delegation here so rp4_addfree()'s attempt to
   2205 		 * return any existing delegations becomes a no-op.
   2206 		 */
   2207 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
   2208 			(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
   2209 			    FALSE);
   2210 			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
   2211 			nfs_rw_exit(&mi->mi_recovlock);
   2212 		}
   2213 		nfs4_clear_open_streams(rp);
   2214 
   2215 		rp4_addfree(rp, cr);
   2216 		if (unldvp != NULL) {
   2217 			kmem_free(unlname, MAXNAMELEN);
   2218 			VN_RELE(unldvp);
   2219 			crfree(unlcred);
   2220 		}
   2221 		return;
   2222 	}
   2223 
   2224 	if (mi->mi_manager_thread == NULL) {
   2225 		/*
   2226 		 * We want to talk to the inactive thread.
   2227 		 */
   2228 		signal_inactive_thread = B_TRUE;
   2229 	}
   2230 
   2231 	/*
   2232 	 * Enqueue the vnode and wake up either the special thread (empty
   2233 	 * list) or an async thread.
   2234 	 */
   2235 	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
   2236 		mi->mi_async_reqs[NFS4_INACTIVE] = args;
   2237 		mi->mi_async_tail[NFS4_INACTIVE] = args;
   2238 		signal_inactive_thread = B_TRUE;
   2239 	} else {
   2240 		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
   2241 		mi->mi_async_tail[NFS4_INACTIVE] = args;
   2242 	}
   2243 	if (signal_inactive_thread) {
   2244 		cv_signal(&mi->mi_inact_req_cv);
   2245 	} else  {
   2246 		mi->mi_async_req_count++;
   2247 		ASSERT(mi->mi_async_req_count != 0);
   2248 		cv_signal(&mi->mi_async_reqs_cv);
   2249 	}
   2250 
   2251 	mutex_exit(&mi->mi_async_lock);
   2252 }
   2253 
   2254 int
   2255 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
   2256 {
   2257 	int pagecreate;
   2258 	int n;
   2259 	int saved_n;
   2260 	caddr_t saved_base;
   2261 	u_offset_t offset;
   2262 	int error;
   2263 	int sm_error;
   2264 	vnode_t *vp = RTOV(rp);
   2265 
   2266 	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
   2267 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
   2268 	if (!vpm_enable) {
   2269 		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
   2270 	}
   2271 
   2272 	/*
   2273 	 * Move bytes in at most PAGESIZE chunks. We must avoid
   2274 	 * spanning pages in uiomove() because page faults may cause
   2275 	 * the cache to be invalidated out from under us. The r_size is not
   2276 	 * updated until after the uiomove. If we push the last page of a
   2277 	 * file before r_size is correct, we will lose the data written past
   2278 	 * the current (and invalid) r_size.
   2279 	 */
   2280 	do {
   2281 		offset = uio->uio_loffset;
   2282 		pagecreate = 0;
   2283 
   2284 		/*
   2285 		 * n is the number of bytes required to satisfy the request
   2286 		 *   or the number of bytes to fill out the page.
   2287 		 */
   2288 		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
   2289 
   2290 		/*
   2291 		 * Check to see if we can skip reading in the page
   2292 		 * and just allocate the memory.  We can do this
   2293 		 * if we are going to rewrite the entire mapping
   2294 		 * or if we are going to write to or beyond the current
   2295 		 * end of file from the beginning of the mapping.
   2296 		 *
   2297 		 * The read of r_size is now protected by r_statelock.
   2298 		 */
   2299 		mutex_enter(&rp->r_statelock);
   2300 		/*
   2301 		 * When pgcreated is nonzero the caller has already done
   2302 		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
   2303 		 * segkpm this means we already have at least one page
   2304 		 * created and mapped at base.
   2305 		 */
   2306 		pagecreate = pgcreated ||
   2307 		    ((offset & PAGEOFFSET) == 0 &&
   2308 		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
   2309 
   2310 		mutex_exit(&rp->r_statelock);
   2311 
   2312 		if (!vpm_enable && pagecreate) {
   2313 			/*
   2314 			 * The last argument tells segmap_pagecreate() to
   2315 			 * always lock the page, as opposed to sometimes
   2316 			 * returning with the page locked. This way we avoid a
   2317 			 * fault on the ensuing uiomove(), but also
   2318 			 * more importantly (to fix bug 1094402) we can
   2319 			 * call segmap_fault() to unlock the page in all
   2320 			 * cases. An alternative would be to modify
   2321 			 * segmap_pagecreate() to tell us when it is
   2322 			 * locking a page, but that's a fairly major
   2323 			 * interface change.
   2324 			 */
   2325 			if (pgcreated == 0)
   2326 				(void) segmap_pagecreate(segkmap, base,
   2327 				    (uint_t)n, 1);
   2328 			saved_base = base;
   2329 			saved_n = n;
   2330 		}
   2331 
   2332 		/*
   2333 		 * The number of bytes of data in the last page can not
   2334 		 * be accurately be determined while page is being
   2335 		 * uiomove'd to and the size of the file being updated.
   2336 		 * Thus, inform threads which need to know accurately
   2337 		 * how much data is in the last page of the file.  They
   2338 		 * will not do the i/o immediately, but will arrange for
   2339 		 * the i/o to happen later when this modify operation
   2340 		 * will have finished.
   2341 		 */
   2342 		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
   2343 		mutex_enter(&rp->r_statelock);
   2344 		rp->r_flags |= R4MODINPROGRESS;
   2345 		rp->r_modaddr = (offset & MAXBMASK);
   2346 		mutex_exit(&rp->r_statelock);
   2347 
   2348 		if (vpm_enable) {
   2349 			/*
   2350 			 * Copy data. If new pages are created, part of
   2351 			 * the page that is not written will be initizliazed
   2352 			 * with zeros.
   2353 			 */
   2354 			error = vpm_data_copy(vp, offset, n, uio,
   2355 			    !pagecreate, NULL, 0, S_WRITE);
   2356 		} else {
   2357 			error = uiomove(base, n, UIO_WRITE, uio);
   2358 		}
   2359 
   2360 		/*
   2361 		 * r_size is the maximum number of
   2362 		 * bytes known to be in the file.
   2363 		 * Make sure it is at least as high as the
   2364 		 * first unwritten byte pointed to by uio_loffset.
   2365 		 */
   2366 		mutex_enter(&rp->r_statelock);
   2367 		if (rp->r_size < uio->uio_loffset)
   2368 			rp->r_size = uio->uio_loffset;
   2369 		rp->r_flags &= ~R4MODINPROGRESS;
   2370 		rp->r_flags |= R4DIRTY;
   2371 		mutex_exit(&rp->r_statelock);
   2372 
   2373 		/* n = # of bytes written */
   2374 		n = (int)(uio->uio_loffset - offset);
   2375 
   2376 		if (!vpm_enable) {
   2377 			base += n;
   2378 		}
   2379 
   2380 		tcount -= n;
   2381 		/*
   2382 		 * If we created pages w/o initializing them completely,
   2383 		 * we need to zero the part that wasn't set up.
   2384 		 * This happens on a most EOF write cases and if
   2385 		 * we had some sort of error during the uiomove.
   2386 		 */
   2387 		if (!vpm_enable && pagecreate) {
   2388 			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
   2389 				(void) kzero(base, PAGESIZE - n);
   2390 
   2391 			if (pgcreated) {
   2392 				/*
   2393 				 * Caller is responsible for this page,
   2394 				 * it was not created in this loop.
   2395 				 */
   2396 				pgcreated = 0;
   2397 			} else {
   2398 				/*
   2399 				 * For bug 1094402: segmap_pagecreate locks
   2400 				 * page. Unlock it. This also unlocks the
   2401 				 * pages allocated by page_create_va() in
   2402 				 * segmap_pagecreate().
   2403 				 */
   2404 				sm_error = segmap_fault(kas.a_hat, segkmap,
   2405 				    saved_base, saved_n,
   2406 				    F_SOFTUNLOCK, S_WRITE);
   2407 				if (error == 0)
   2408 					error = sm_error;
   2409 			}
   2410 		}
   2411 	} while (tcount > 0 && error == 0);
   2412 
   2413 	return (error);
   2414 }
   2415 
   2416 int
   2417 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
   2418 {
   2419 	rnode4_t *rp;
   2420 	page_t *pp;
   2421 	u_offset_t eoff;
   2422 	u_offset_t io_off;
   2423 	size_t io_len;
   2424 	int error;
   2425 	int rdirty;
   2426 	int err;
   2427 
   2428 	rp = VTOR4(vp);
   2429 	ASSERT(rp->r_count > 0);
   2430 
   2431 	if (!nfs4_has_pages(vp))
   2432 		return (0);
   2433 
   2434 	ASSERT(vp->v_type != VCHR);
   2435 
   2436 	/*
   2437 	 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
   2438 	 * writes.  B_FORCE is set to force the VM system to actually
   2439 	 * invalidate the pages, even if the i/o failed.  The pages
   2440 	 * need to get invalidated because they can't be written out
   2441 	 * because there isn't any space left on either the server's
   2442 	 * file system or in the user's disk quota.  The B_FREE bit
   2443 	 * is cleared to avoid confusion as to whether this is a
   2444 	 * request to place the page on the freelist or to destroy
   2445 	 * it.
   2446 	 */
   2447 	if ((rp->r_flags & R4OUTOFSPACE) ||
   2448 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
   2449 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
   2450 
   2451 	if (len == 0) {
   2452 		/*
   2453 		 * If doing a full file synchronous operation, then clear
   2454 		 * the R4DIRTY bit.  If a page gets dirtied while the flush
   2455 		 * is happening, then R4DIRTY will get set again.  The
   2456 		 * R4DIRTY bit must get cleared before the flush so that
   2457 		 * we don't lose this information.
   2458 		 *
   2459 		 * If there are no full file async write operations
   2460 		 * pending and RDIRTY bit is set, clear it.
   2461 		 */
   2462 		if (off == (u_offset_t)0 &&
   2463 		    !(flags & B_ASYNC) &&
   2464 		    (rp->r_flags & R4DIRTY)) {
   2465 			mutex_enter(&rp->r_statelock);
   2466 			rdirty = (rp->r_flags & R4DIRTY);
   2467 			rp->r_flags &= ~R4DIRTY;
   2468 			mutex_exit(&rp->r_statelock);
   2469 		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
   2470 			mutex_enter(&rp->r_statelock);
   2471 			if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
   2472 				rdirty = (rp->r_flags & R4DIRTY);
   2473 				rp->r_flags &= ~R4DIRTY;
   2474 			}
   2475 			mutex_exit(&rp->r_statelock);
   2476 		} else
   2477 			rdirty = 0;
   2478 
   2479 		/*
   2480 		 * Search the entire vp list for pages >= off, and flush
   2481 		 * the dirty pages.
   2482 		 */
   2483 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
   2484 		    flags, cr);
   2485 
   2486 		/*
   2487 		 * If an error occurred and the file was marked as dirty
   2488 		 * before and we aren't forcibly invalidating pages, then
   2489 		 * reset the R4DIRTY flag.
   2490 		 */
   2491 		if (error && rdirty &&
   2492 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
   2493 			mutex_enter(&rp->r_statelock);
   2494 			rp->r_flags |= R4DIRTY;
   2495 			mutex_exit(&rp->r_statelock);
   2496 		}
   2497 	} else {
   2498 		/*
   2499 		 * Do a range from [off...off + len) looking for pages
   2500 		 * to deal with.
   2501 		 */
   2502 		error = 0;
   2503 		io_len = 0;
   2504 		eoff = off + len;
   2505 		mutex_enter(&rp->r_statelock);
   2506 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
   2507 		    io_off += io_len) {
   2508 			mutex_exit(&rp->r_statelock);
   2509 			/*
   2510 			 * If we are not invalidating, synchronously
   2511 			 * freeing or writing pages use the routine
   2512 			 * page_lookup_nowait() to prevent reclaiming
   2513 			 * them from the free list.
   2514 			 */
   2515 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
   2516 				pp = page_lookup(vp, io_off,
   2517 				    (flags & (B_INVAL | B_FREE)) ?
   2518 				    SE_EXCL : SE_SHARED);
   2519 			} else {
   2520 				pp = page_lookup_nowait(vp, io_off,
   2521 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
   2522 			}
   2523 
   2524 			if (pp == NULL || !pvn_getdirty(pp, flags))
   2525 				io_len = PAGESIZE;
   2526 			else {
   2527 				err = (*rp->r_putapage)(vp, pp, &io_off,
   2528 				    &io_len, flags, cr);
   2529 				if (!error)
   2530 					error = err;
   2531 				/*
   2532 				 * "io_off" and "io_len" are returned as
   2533 				 * the range of pages we actually wrote.
   2534 				 * This allows us to skip ahead more quickly
   2535 				 * since several pages may've been dealt
   2536 				 * with by this iteration of the loop.
   2537 				 */
   2538 			}
   2539 			mutex_enter(&rp->r_statelock);
   2540 		}
   2541 		mutex_exit(&rp->r_statelock);
   2542 	}
   2543 
   2544 	return (error);
   2545 }
   2546 
   2547 void
   2548 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
   2549 {
   2550 	rnode4_t *rp;
   2551 
   2552 	rp = VTOR4(vp);
   2553 	if (IS_SHADOW(vp, rp))
   2554 		vp = RTOV4(rp);
   2555 	mutex_enter(&rp->r_statelock);
   2556 	while (rp->r_flags & R4TRUNCATE)
   2557 		cv_wait(&rp->r_cv, &rp->r_statelock);
   2558 	rp->r_flags |= R4TRUNCATE;
   2559 	if (off == (u_offset_t)0) {
   2560 		rp->r_flags &= ~R4DIRTY;
   2561 		if (!(rp->r_flags & R4STALE))
   2562 			rp->r_error = 0;
   2563 	}
   2564 	rp->r_truncaddr = off;
   2565 	mutex_exit(&rp->r_statelock);
   2566 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
   2567 	    B_INVAL | B_TRUNC, cr);
   2568 	mutex_enter(&rp->r_statelock);
   2569 	rp->r_flags &= ~R4TRUNCATE;
   2570 	cv_broadcast(&rp->r_cv);
   2571 	mutex_exit(&rp->r_statelock);
   2572 }
   2573 
   2574 static int
   2575 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
   2576 {
   2577 	mntinfo4_t *mi;
   2578 	struct mntinfo_kstat *mik;
   2579 	vfs_t *vfsp;
   2580 
   2581 	/* this is a read-only kstat. Bail out on a write */
   2582 	if (rw == KSTAT_WRITE)
   2583 		return (EACCES);
   2584 
   2585 
   2586 	/*
   2587 	 * We don't want to wait here as kstat_chain_lock could be held by
   2588 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
   2589 	 * and thus could lead to a deadlock.
   2590 	 */
   2591 	vfsp = (struct vfs *)ksp->ks_private;
   2592 
   2593 	mi = VFTOMI4(vfsp);
   2594 	mik = (struct mntinfo_kstat *)ksp->ks_data;
   2595 
   2596 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
   2597 
   2598 	mik->mik_vers = (uint32_t)mi->mi_vers;
   2599 	mik->mik_flags = mi->mi_flags;
   2600 	/*
   2601 	 * The sv_secdata holds the flavor the client specifies.
   2602 	 * If the client uses default and a security negotiation
   2603 	 * occurs, sv_currsec will point to the current flavor
   2604 	 * selected from the server flavor list.
   2605 	 * sv_currsec is NULL if no security negotiation takes place.
   2606 	 */
   2607 	mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
   2608 	    mi->mi_curr_serv->sv_currsec->secmod :
   2609 	    mi->mi_curr_serv->sv_secdata->secmod;
   2610 	mik->mik_curread = (uint32_t)mi->mi_curread;
   2611 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
   2612 	mik->mik_retrans = mi->mi_retrans;
   2613 	mik->mik_timeo = mi->mi_timeo;
   2614 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
   2615 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
   2616 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
   2617 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
   2618 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
   2619 	mik->mik_failover = (uint32_t)mi->mi_failover;
   2620 	mik->mik_remap = (uint32_t)mi->mi_remap;
   2621 
   2622 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
   2623 
   2624 	return (0);
   2625 }
   2626 
   2627 void
   2628 nfs4_mnt_kstat_init(struct vfs *vfsp)
   2629 {
   2630 	mntinfo4_t *mi = VFTOMI4(vfsp);
   2631 
   2632 	/*
   2633 	 * PSARC 2001/697 Contract Private Interface
   2634 	 * All nfs kstats are under SunMC contract
   2635 	 * Please refer to the PSARC listed above and contact
   2636 	 * SunMC before making any changes!
   2637 	 *
   2638 	 * Changes must be reviewed by Solaris File Sharing
   2639 	 * Changes must be communicated to contract-2001-697 (at) sun.com
   2640 	 *
   2641 	 */
   2642 
   2643 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
   2644 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
   2645 	if (mi->mi_io_kstats) {
   2646 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
   2647 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
   2648 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
   2649 		kstat_install(mi->mi_io_kstats);
   2650 	}
   2651 
   2652 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
   2653 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
   2654 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
   2655 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
   2656 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
   2657 		mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
   2658 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
   2659 		kstat_install(mi->mi_ro_kstats);
   2660 	}
   2661 
   2662 	nfs4_mnt_recov_kstat_init(vfsp);
   2663 }
   2664 
   2665 void
   2666 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
   2667 {
   2668 	mntinfo4_t *mi;
   2669 	clock_t now = ddi_get_lbolt();
   2670 
   2671 	mi = VTOMI4(vp);
   2672 	/*
   2673 	 * In case of forced unmount, do not print any messages
   2674 	 * since it can flood the console with error messages.
   2675 	 */
   2676 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
   2677 		return;
   2678 
   2679 	/*
   2680 	 * If the mount point is dead, not recoverable, do not
   2681 	 * print error messages that can flood the console.
   2682 	 */
   2683 	if (mi->mi_flags & MI4_RECOV_FAIL)
   2684 		return;
   2685 
   2686 	/*
   2687 	 * No use in flooding the console with ENOSPC
   2688 	 * messages from the same file system.
   2689 	 */
   2690 	if ((error != ENOSPC && error != EDQUOT) ||
   2691 	    now - mi->mi_printftime > 0) {
   2692 		zoneid_t zoneid = mi->mi_zone->zone_id;
   2693 
   2694 #ifdef DEBUG
   2695 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
   2696 		    mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
   2697 #else
   2698 		nfs_perror(error, "NFS write error on host %s: %m.\n",
   2699 		    VTOR4(vp)->r_server->sv_hostname, NULL);
   2700 #endif
   2701 		if (error == ENOSPC || error == EDQUOT) {
   2702 			zcmn_err(zoneid, CE_CONT,
   2703 			    "^File: userid=%d, groupid=%d\n",
   2704 			    crgetuid(cr), crgetgid(cr));
   2705 			if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
   2706 			    crgetgid(curthread->t_cred) != crgetgid(cr)) {
   2707 				zcmn_err(zoneid, CE_CONT,
   2708 				    "^User: userid=%d, groupid=%d\n",
   2709 				    crgetuid(curthread->t_cred),
   2710 				    crgetgid(curthread->t_cred));
   2711 			}
   2712 			mi->mi_printftime = now +
   2713 			    nfs_write_error_interval * hz;
   2714 		}
   2715 		sfh4_printfhandle(VTOR4(vp)->r_fh);
   2716 #ifdef DEBUG
   2717 		if (error == EACCES) {
   2718 			zcmn_err(zoneid, CE_CONT,
   2719 			    "nfs_bio: cred is%s kcred\n",
   2720 			    cr == kcred ? "" : " not");
   2721 		}
   2722 #endif
   2723 	}
   2724 }
   2725 
   2726 /*
   2727  * Return non-zero if the given file can be safely memory mapped.  Locks
   2728  * are safe if whole-file (length and offset are both zero).
   2729  */
   2730 
   2731 #define	SAFE_LOCK(flk)	((flk).l_start == 0 && (flk).l_len == 0)
   2732 
   2733 static int
   2734 nfs4_safemap(const vnode_t *vp)
   2735 {
   2736 	locklist_t	*llp, *next_llp;
   2737 	int		safe = 1;
   2738 	rnode4_t	*rp = VTOR4(vp);
   2739 
   2740 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
   2741 
   2742 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
   2743 	    "vp = %p", (void *)vp));
   2744 
   2745 	/*
   2746 	 * Review all the locks for the vnode, both ones that have been
   2747 	 * acquired and ones that are pending.  We assume that
   2748 	 * flk_active_locks_for_vp() has merged any locks that can be
   2749 	 * merged (so that if a process has the entire file locked, it is
   2750 	 * represented as a single lock).
   2751 	 *
   2752 	 * Note that we can't bail out of the loop if we find a non-safe
   2753 	 * lock, because we have to free all the elements in the llp list.
   2754 	 * We might be able to speed up this code slightly by not looking
   2755 	 * at each lock's l_start and l_len fields once we've found a
   2756 	 * non-safe lock.
   2757 	 */
   2758 
   2759 	llp = flk_active_locks_for_vp(vp);
   2760 	while (llp) {
   2761 		NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
   2762 		    "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
   2763 		    llp->ll_flock.l_start, llp->ll_flock.l_len));
   2764 		if (!SAFE_LOCK(llp->ll_flock)) {
   2765 			safe = 0;
   2766 			NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
   2767 			    "nfs4_safemap: unsafe active lock (%" PRId64
   2768 			    ", %" PRId64 ")", llp->ll_flock.l_start,
   2769 			    llp->ll_flock.l_len));
   2770 		}
   2771 		next_llp = llp->ll_next;
   2772 		VN_RELE(llp->ll_vp);
   2773 		kmem_free(llp, sizeof (*llp));
   2774 		llp = next_llp;
   2775 	}
   2776 
   2777 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
   2778 	    safe ? "safe" : "unsafe"));
   2779 	return (safe);
   2780 }
   2781 
   2782 /*
   2783  * Return whether there is a lost LOCK or LOCKU queued up for the given
   2784  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
   2785  */
   2786 
   2787 bool_t
   2788 nfs4_map_lost_lock_conflict(vnode_t *vp)
   2789 {
   2790 	bool_t conflict = FALSE;
   2791 	nfs4_lost_rqst_t *lrp;
   2792 	mntinfo4_t *mi = VTOMI4(vp);
   2793 
   2794 	mutex_enter(&mi->mi_lock);
   2795 	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
   2796 	    lrp = list_next(&mi->mi_lost_state, lrp)) {
   2797 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
   2798 			continue;
   2799 		ASSERT(lrp->lr_vp != NULL);
   2800 		if (!VOP_CMP(lrp->lr_vp, vp, NULL))
   2801 			continue;	/* different file */
   2802 		if (!SAFE_LOCK(*lrp->lr_flk)) {
   2803 			conflict = TRUE;
   2804 			break;
   2805 		}
   2806 	}
   2807 
   2808 	mutex_exit(&mi->mi_lock);
   2809 	return (conflict);
   2810 }
   2811 
   2812 /*
   2813  * nfs_lockcompletion:
   2814  *
   2815  * If the vnode has a lock that makes it unsafe to cache the file, mark it
   2816  * as non cachable (set VNOCACHE bit).
   2817  */
   2818 
   2819 void
   2820 nfs4_lockcompletion(vnode_t *vp, int cmd)
   2821 {
   2822 	rnode4_t *rp = VTOR4(vp);
   2823 
   2824 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
   2825 	ASSERT(!IS_SHADOW(vp, rp));
   2826 
   2827 	if (cmd == F_SETLK || cmd == F_SETLKW) {
   2828 
   2829 		if (!nfs4_safemap(vp)) {
   2830 			mutex_enter(&vp->v_lock);
   2831 			vp->v_flag |= VNOCACHE;
   2832 			mutex_exit(&vp->v_lock);
   2833 		} else {
   2834 			mutex_enter(&vp->v_lock);
   2835 			vp->v_flag &= ~VNOCACHE;
   2836 			mutex_exit(&vp->v_lock);
   2837 		}
   2838 	}
   2839 	/*
   2840 	 * The cached attributes of the file are stale after acquiring
   2841 	 * the lock on the file. They were updated when the file was
   2842 	 * opened, but not updated when the lock was acquired. Therefore the
   2843 	 * cached attributes are invalidated after the lock is obtained.
   2844 	 */
   2845 	PURGE_ATTRCACHE4(vp);
   2846 }
   2847 
   2848 /* ARGSUSED */
   2849 static void *
   2850 nfs4_mi_init(zoneid_t zoneid)
   2851 {
   2852 	struct mi4_globals *mig;
   2853 
   2854 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
   2855 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
   2856 	list_create(&mig->mig_list, sizeof (mntinfo4_t),
   2857 	    offsetof(mntinfo4_t, mi_zone_node));
   2858 	mig->mig_destructor_called = B_FALSE;
   2859 	return (mig);
   2860 }
   2861 
   2862 /*
   2863  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
   2864  * state and killing off threads.
   2865  */
   2866 /* ARGSUSED */
   2867 static void
   2868 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
   2869 {
   2870 	struct mi4_globals *mig = data;
   2871 	mntinfo4_t *mi;
   2872 	nfs4_server_t *np;
   2873 
   2874 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
   2875 	    "nfs4_mi_shutdown zone %d\n", zoneid));
   2876 	ASSERT(mig != NULL);
   2877 	for (;;) {
   2878 		mutex_enter(&mig->mig_lock);
   2879 		mi = list_head(&mig->mig_list);
   2880 		if (mi == NULL) {
   2881 			mutex_exit(&mig->mig_lock);
   2882 			break;
   2883 		}
   2884 
   2885 		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
   2886 		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
   2887 		/*
   2888 		 * purge the DNLC for this filesystem
   2889 		 */
   2890 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
   2891 		/*
   2892 		 * Tell existing async worker threads to exit.
   2893 		 */
   2894 		mutex_enter(&mi->mi_async_lock);
   2895 		mi->mi_max_threads = 0;
   2896 		NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
   2897 		/*
   2898 		 * Set the appropriate flags, signal and wait for both the
   2899 		 * async manager and the inactive thread to exit when they're
   2900 		 * done with their current work.
   2901 		 */
   2902 		mutex_enter(&mi->mi_lock);
   2903 		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
   2904 		mutex_exit(&mi->mi_lock);
   2905 		mutex_exit(&mi->mi_async_lock);
   2906 		if (mi->mi_manager_thread) {
   2907 			nfs4_async_manager_stop(mi->mi_vfsp);
   2908 		}
   2909 		if (mi->mi_inactive_thread) {
   2910 			mutex_enter(&mi->mi_async_lock);
   2911 			cv_signal(&mi->mi_inact_req_cv);
   2912 			/*
   2913 			 * Wait for the inactive thread to exit.
   2914 			 */
   2915 			while (mi->mi_inactive_thread != NULL) {
   2916 				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
   2917 			}
   2918 			mutex_exit(&mi->mi_async_lock);
   2919 		}
   2920 		/*
   2921 		 * Wait for the recovery thread to complete, that is, it will
   2922 		 * signal when it is done using the "mi" structure and about
   2923 		 * to exit
   2924 		 */
   2925 		mutex_enter(&mi->mi_lock);
   2926 		while (mi->mi_in_recovery > 0)
   2927 			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
   2928 		mutex_exit(&mi->mi_lock);
   2929 		/*
   2930 		 * We're done when every mi has been done or the list is empty.
   2931 		 * This one is done, remove it from the list.
   2932 		 */
   2933 		list_remove(&mig->mig_list, mi);
   2934 		mutex_exit(&mig->mig_lock);
   2935 		zone_rele(mi->mi_zone);
   2936 		/*
   2937 		 * Release hold on vfs and mi done to prevent race with zone
   2938 		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
   2939 		 */
   2940 		VFS_RELE(mi->mi_vfsp);
   2941 		MI4_RELE(mi);
   2942 	}
   2943 	/*
   2944 	 * Tell each renew thread in the zone to exit
   2945 	 */
   2946 	mutex_enter(&nfs4_server_lst_lock);
   2947 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
   2948 		mutex_enter(&np->s_lock);
   2949 		if (np->zoneid == zoneid) {
   2950 			/*
   2951 			 * We add another hold onto the nfs4_server_t
   2952 			 * because this will make sure tha the nfs4_server_t
   2953 			 * stays around until nfs4_callback_fini_zone destroys
   2954 			 * the zone. This way, the renew thread can
   2955 			 * unconditionally release its holds on the
   2956 			 * nfs4_server_t.
   2957 			 */
   2958 			np->s_refcnt++;
   2959 			nfs4_mark_srv_dead(np);
   2960 		}
   2961 		mutex_exit(&np->s_lock);
   2962 	}
   2963 	mutex_exit(&nfs4_server_lst_lock);
   2964 }
   2965 
   2966 static void
   2967 nfs4_mi_free_globals(struct mi4_globals *mig)
   2968 {
   2969 	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
   2970 	mutex_destroy(&mig->mig_lock);
   2971 	kmem_free(mig, sizeof (*mig));
   2972 }
   2973 
   2974 /* ARGSUSED */
   2975 static void
   2976 nfs4_mi_destroy(zoneid_t zoneid, void *data)
   2977 {
   2978 	struct mi4_globals *mig = data;
   2979 
   2980 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
   2981 	    "nfs4_mi_destroy zone %d\n", zoneid));
   2982 	ASSERT(mig != NULL);
   2983 	mutex_enter(&mig->mig_lock);
   2984 	if (list_head(&mig->mig_list) != NULL) {
   2985 		/* Still waiting for VFS_FREEVFS() */
   2986 		mig->mig_destructor_called = B_TRUE;
   2987 		mutex_exit(&mig->mig_lock);
   2988 		return;
   2989 	}
   2990 	nfs4_mi_free_globals(mig);
   2991 }
   2992 
   2993 /*
   2994  * Add an NFS mount to the per-zone list of NFS mounts.
   2995  */
   2996 void
   2997 nfs4_mi_zonelist_add(mntinfo4_t *mi)
   2998 {
   2999 	struct mi4_globals *mig;
   3000 
   3001 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
   3002 	mutex_enter(&mig->mig_lock);
   3003 	list_insert_head(&mig->mig_list, mi);
   3004 	/*
   3005 	 * hold added to eliminate race with zone shutdown -this will be
   3006 	 * released in mi_shutdown
   3007 	 */
   3008 	MI4_HOLD(mi);
   3009 	VFS_HOLD(mi->mi_vfsp);
   3010 	mutex_exit(&mig->mig_lock);
   3011 }
   3012 
   3013 /*
   3014  * Remove an NFS mount from the per-zone list of NFS mounts.
   3015  */
   3016 int
   3017 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
   3018 {
   3019 	struct mi4_globals *mig;
   3020 	int ret = 0;
   3021 
   3022 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
   3023 	mutex_enter(&mig->mig_lock);
   3024 	mutex_enter(&mi->mi_lock);
   3025 	/* if this mi is marked dead, then the zone already released it */
   3026 	if (!(mi->mi_flags & MI4_DEAD)) {
   3027 		list_remove(&mig->mig_list, mi);
   3028 		mutex_exit(&mi->mi_lock);
   3029 
   3030 		/* release the holds put on in zonelist_add(). */
   3031 		VFS_RELE(mi->mi_vfsp);
   3032 		MI4_RELE(mi);
   3033 		ret = 1;
   3034 	} else {
   3035 		mutex_exit(&mi->mi_lock);
   3036 	}
   3037 
   3038 	/*
   3039 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
   3040 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
   3041 	 * mi globals.
   3042 	 */
   3043 	if (list_head(&mig->mig_list) == NULL &&
   3044 	    mig->mig_destructor_called == B_TRUE) {
   3045 		nfs4_mi_free_globals(mig);
   3046 		return (ret);
   3047 	}
   3048 	mutex_exit(&mig->mig_lock);
   3049 	return (ret);
   3050 }
   3051 
   3052 void
   3053 nfs_free_mi4(mntinfo4_t *mi)
   3054 {
   3055 	nfs4_open_owner_t	*foop;
   3056 	nfs4_oo_hash_bucket_t   *bucketp;
   3057 	nfs4_debug_msg_t	*msgp;
   3058 	int i;
   3059 	servinfo4_t 		*svp;
   3060 
   3061 	/*
   3062 	 * Code introduced here should be carefully evaluated to make
   3063 	 * sure none of the freed resources are accessed either directly
   3064 	 * or indirectly after freeing them. For eg: Introducing calls to
   3065 	 * NFS4_DEBUG that use mntinfo4_t structure member after freeing
   3066 	 * the structure members or other routines calling back into NFS
   3067 	 * accessing freed mntinfo4_t structure member.
   3068 	 */
   3069 	mutex_enter(&mi->mi_lock);
   3070 	ASSERT(mi->mi_recovthread == NULL);
   3071 	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
   3072 	mutex_exit(&mi->mi_lock);
   3073 	mutex_enter(&mi->mi_async_lock);
   3074 	ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
   3075 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
   3076 	ASSERT(mi->mi_manager_thread == NULL);
   3077 	mutex_exit(&mi->mi_async_lock);
   3078 	if (mi->mi_io_kstats) {
   3079 		kstat_delete(mi->mi_io_kstats);
   3080 		mi->mi_io_kstats = NULL;
   3081 	}
   3082 	if (mi->mi_ro_kstats) {
   3083 		kstat_delete(mi->mi_ro_kstats);
   3084 		mi->mi_ro_kstats = NULL;
   3085 	}
   3086 	if (mi->mi_recov_ksp) {
   3087 		kstat_delete(mi->mi_recov_ksp);
   3088 		mi->mi_recov_ksp = NULL;
   3089 	}
   3090 	mutex_enter(&mi->mi_msg_list_lock);
   3091 	while (msgp = list_head(&mi->mi_msg_list)) {
   3092 		list_remove(&mi->mi_msg_list, msgp);
   3093 		nfs4_free_msg(msgp);
   3094 	}
   3095 	mutex_exit(&mi->mi_msg_list_lock);
   3096 	list_destroy(&mi->mi_msg_list);
   3097 	if (mi->mi_fname != NULL)
   3098 		fn_rele(&mi->mi_fname);
   3099 	if (mi->mi_rootfh != NULL)
   3100 		sfh4_rele(&mi->mi_rootfh);
   3101 	if (mi->mi_srvparentfh != NULL)
   3102 		sfh4_rele(&mi->mi_srvparentfh);
   3103 	svp = mi->mi_servers;
   3104 	sv4_free(svp);
   3105 	mutex_destroy(&mi->mi_lock);
   3106 	mutex_destroy(&mi->mi_async_lock);
   3107 	mutex_destroy(&mi->mi_msg_list_lock);
   3108 	nfs_rw_destroy(&mi->mi_recovlock);
   3109 	nfs_rw_destroy(&mi->mi_rename_lock);
   3110 	nfs_rw_destroy(&mi->mi_fh_lock);
   3111 	cv_destroy(&mi->mi_failover_cv);
   3112 	cv_destroy(&mi->mi_async_reqs_cv);
   3113 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
   3114 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
   3115 	cv_destroy(&mi->mi_async_cv);
   3116 	cv_destroy(&mi->mi_inact_req_cv);
   3117 	/*
   3118 	 * Destroy the oo hash lists and mutexes for the cred hash table.
   3119 	 */
   3120 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
   3121 		bucketp = &(mi->mi_oo_list[i]);
   3122 		/* Destroy any remaining open owners on the list */
   3123 		foop = list_head(&bucketp->b_oo_hash_list);
   3124 		while (foop != NULL) {
   3125 			list_remove(&bucketp->b_oo_hash_list, foop);
   3126 			nfs4_destroy_open_owner(foop);
   3127 			foop = list_head(&bucketp->b_oo_hash_list);
   3128 		}
   3129 		list_destroy(&bucketp->b_oo_hash_list);
   3130 		mutex_destroy(&bucketp->b_lock);
   3131 	}
   3132 	/*
   3133 	 * Empty and destroy the freed open owner list.
   3134 	 */
   3135 	foop = list_head(&mi->mi_foo_list);
   3136 	while (foop != NULL) {
   3137 		list_remove(&mi->mi_foo_list, foop);
   3138 		nfs4_destroy_open_owner(foop);
   3139 		foop = list_head(&mi->mi_foo_list);
   3140 	}
   3141 	list_destroy(&mi->mi_foo_list);
   3142 	list_destroy(&mi->mi_bseqid_list);
   3143 	list_destroy(&mi->mi_lost_state);
   3144 	avl_destroy(&mi->mi_filehandles);
   3145 	kmem_free(mi, sizeof (*mi));
   3146 }
   3147 void
   3148 mi_hold(mntinfo4_t *mi)
   3149 {
   3150 	atomic_add_32(&mi->mi_count, 1);
   3151 	ASSERT(mi->mi_count != 0);
   3152 }
   3153 
   3154 void
   3155 mi_rele(mntinfo4_t *mi)
   3156 {
   3157 	ASSERT(mi->mi_count != 0);
   3158 	if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
   3159 		nfs_free_mi4(mi);
   3160 	}
   3161 }
   3162 
   3163 vnode_t    nfs4_xattr_notsupp_vnode;
   3164 
   3165 void
   3166 nfs4_clnt_init(void)
   3167 {
   3168 	nfs4_vnops_init();
   3169 	(void) nfs4_rnode_init();
   3170 	(void) nfs4_shadow_init();
   3171 	(void) nfs4_acache_init();
   3172 	(void) nfs4_subr_init();
   3173 	nfs4_acl_init();
   3174 	nfs_idmap_init();
   3175 	nfs4_callback_init();
   3176 	nfs4_secinfo_init();
   3177 #ifdef	DEBUG
   3178 	tsd_create(&nfs4_tsd_key, NULL);
   3179 #endif
   3180 
   3181 	/*
   3182 	 * Add a CPR callback so that we can update client
   3183 	 * lease after a suspend and resume.
   3184 	 */
   3185 	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
   3186 
   3187 	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
   3188 	    nfs4_mi_destroy);
   3189 
   3190 	/*
   3191 	 * Initialise the reference count of the notsupp xattr cache vnode to 1
   3192 	 * so that it never goes away (VOP_INACTIVE isn't called on it).
   3193 	 */
   3194 	nfs4_xattr_notsupp_vnode.v_count = 1;
   3195 }
   3196 
   3197 void
   3198 nfs4_clnt_fini(void)
   3199 {
   3200 	(void) zone_key_delete(mi4_list_key);
   3201 	nfs4_vnops_fini();
   3202 	(void) nfs4_rnode_fini();
   3203 	(void) nfs4_shadow_fini();
   3204 	(void) nfs4_acache_fini();
   3205 	(void) nfs4_subr_fini();
   3206 	nfs_idmap_fini();
   3207 	nfs4_callback_fini();
   3208 	nfs4_secinfo_fini();
   3209 #ifdef	DEBUG
   3210 	tsd_destroy(&nfs4_tsd_key);
   3211 #endif
   3212 	if (cid)
   3213 		(void) callb_delete(cid);
   3214 }
   3215 
   3216 /*ARGSUSED*/
   3217 static boolean_t
   3218 nfs4_client_cpr_callb(void *arg, int code)
   3219 {
   3220 	/*
   3221 	 * We get called for Suspend and Resume events.
   3222 	 * For the suspend case we simply don't care!
   3223 	 */
   3224 	if (code == CB_CODE_CPR_CHKPT) {
   3225 		return (B_TRUE);
   3226 	}
   3227 
   3228 	/*
   3229 	 * When we get to here we are in the process of
   3230 	 * resuming the system from a previous suspend.
   3231 	 */
   3232 	nfs4_client_resumed = gethrestime_sec();
   3233 	return (B_TRUE);
   3234 }
   3235 
   3236 void
   3237 nfs4_renew_lease_thread(nfs4_server_t *sp)
   3238 {
   3239 	int	error = 0;
   3240 	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
   3241 	clock_t	tick_delay = 0;
   3242 	clock_t time_left = 0;
   3243 	callb_cpr_t cpr_info;
   3244 	kmutex_t cpr_lock;
   3245 
   3246 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3247 	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
   3248 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
   3249 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
   3250 
   3251 	mutex_enter(&sp->s_lock);
   3252 	/* sp->s_lease_time is set via a GETATTR */
   3253 	sp->last_renewal_time = gethrestime_sec();
   3254 	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
   3255 	ASSERT(sp->s_refcnt >= 1);
   3256 
   3257 	for (;;) {
   3258 		if (!sp->state_ref_count ||
   3259 		    sp->lease_valid != NFS4_LEASE_VALID) {
   3260 
   3261 			kip_secs = MAX((sp->s_lease_time >> 1) -
   3262 			    (3 * sp->propagation_delay.tv_sec), 1);
   3263 
   3264 			tick_delay = SEC_TO_TICK(kip_secs);
   3265 
   3266 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3267 			    "nfs4_renew_lease_thread: no renew : thread "
   3268 			    "wait %ld secs", kip_secs));
   3269 
   3270 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3271 			    "nfs4_renew_lease_thread: no renew : "
   3272 			    "state_ref_count %d, lease_valid %d",
   3273 			    sp->state_ref_count, sp->lease_valid));
   3274 
   3275 			mutex_enter(&cpr_lock);
   3276 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
   3277 			mutex_exit(&cpr_lock);
   3278 			time_left = cv_reltimedwait(&sp->cv_thread_exit,
   3279 			    &sp->s_lock, tick_delay, TR_CLOCK_TICK);
   3280 			mutex_enter(&cpr_lock);
   3281 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
   3282 			mutex_exit(&cpr_lock);
   3283 
   3284 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3285 			    "nfs4_renew_lease_thread: no renew: "
   3286 			    "time left %ld", time_left));
   3287 
   3288 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
   3289 				goto die;
   3290 			continue;
   3291 		}
   3292 
   3293 		tmp_last_renewal_time = sp->last_renewal_time;
   3294 
   3295 		tmp_time = gethrestime_sec() - sp->last_renewal_time +
   3296 		    (3 * sp->propagation_delay.tv_sec);
   3297 
   3298 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3299 		    "nfs4_renew_lease_thread: tmp_time %ld, "
   3300 		    "sp->last_renewal_time %ld", tmp_time,
   3301 		    sp->last_renewal_time));
   3302 
   3303 		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
   3304 
   3305 		tick_delay = SEC_TO_TICK(kip_secs);
   3306 
   3307 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3308 		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
   3309 		    "secs", kip_secs));
   3310 
   3311 		mutex_enter(&cpr_lock);
   3312 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
   3313 		mutex_exit(&cpr_lock);
   3314 		time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
   3315 		    tick_delay, TR_CLOCK_TICK);
   3316 		mutex_enter(&cpr_lock);
   3317 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
   3318 		mutex_exit(&cpr_lock);
   3319 
   3320 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3321 		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
   3322 		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
   3323 		    "tmp_last_renewal_time %ld", time_left,
   3324 		    sp->last_renewal_time, nfs4_client_resumed,
   3325 		    tmp_last_renewal_time));
   3326 
   3327 		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
   3328 			goto die;
   3329 
   3330 		if (tmp_last_renewal_time == sp->last_renewal_time ||
   3331 		    (nfs4_client_resumed != 0 &&
   3332 		    nfs4_client_resumed > sp->last_renewal_time)) {
   3333 			/*
   3334 			 * Issue RENEW op since we haven't renewed the lease
   3335 			 * since we slept.
   3336 			 */
   3337 			tmp_now_time = gethrestime_sec();
   3338 			error = nfs4renew(sp);
   3339 			/*
   3340 			 * Need to re-acquire sp's lock, nfs4renew()
   3341 			 * relinqueshes it.
   3342 			 */
   3343 			mutex_enter(&sp->s_lock);
   3344 
   3345 			/*
   3346 			 * See if someone changed s_thread_exit while we gave
   3347 			 * up s_lock.
   3348 			 */
   3349 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
   3350 				goto die;
   3351 
   3352 			if (!error) {
   3353 				/*
   3354 				 * check to see if we implicitly renewed while
   3355 				 * we waited for a reply for our RENEW call.
   3356 				 */
   3357 				if (tmp_last_renewal_time ==
   3358 				    sp->last_renewal_time) {
   3359 					/* no implicit renew came */
   3360 					sp->last_renewal_time = tmp_now_time;
   3361 				} else {
   3362 					NFS4_DEBUG(nfs4_client_lease_debug,
   3363 					    (CE_NOTE, "renew_thread: did "
   3364 					    "implicit renewal before reply "
   3365 					    "from server for RENEW"));
   3366 				}
   3367 			} else {
   3368 				/* figure out error */
   3369 				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3370 				    "renew_thread: nfs4renew returned error"
   3371 				    " %d", error));
   3372 			}
   3373 
   3374 		}
   3375 	}
   3376 
   3377 die:
   3378 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3379 	    "nfs4_renew_lease_thread: thread exiting"));
   3380 
   3381 	while (sp->s_otw_call_count != 0) {
   3382 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3383 		    "nfs4_renew_lease_thread: waiting for outstanding "
   3384 		    "otw calls to finish for sp 0x%p, current "
   3385 		    "s_otw_call_count %d", (void *)sp,
   3386 		    sp->s_otw_call_count));
   3387 		mutex_enter(&cpr_lock);
   3388 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
   3389 		mutex_exit(&cpr_lock);
   3390 		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
   3391 		mutex_enter(&cpr_lock);
   3392 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
   3393 		mutex_exit(&cpr_lock);
   3394 	}
   3395 	mutex_exit(&sp->s_lock);
   3396 
   3397 	nfs4_server_rele(sp);		/* free the thread's reference */
   3398 	nfs4_server_rele(sp);		/* free the list's reference */
   3399 	sp = NULL;
   3400 
   3401 done:
   3402 	mutex_enter(&cpr_lock);
   3403 	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
   3404 	mutex_destroy(&cpr_lock);
   3405 
   3406 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3407 	    "nfs4_renew_lease_thread: renew thread exit officially"));
   3408 
   3409 	zthread_exit();
   3410 	/* NOT REACHED */
   3411 }
   3412 
   3413 /*
   3414  * Send out a RENEW op to the server.
   3415  * Assumes sp is locked down.
   3416  */
   3417 static int
   3418 nfs4renew(nfs4_server_t *sp)
   3419 {
   3420 	COMPOUND4args_clnt args;
   3421 	COMPOUND4res_clnt res;
   3422 	nfs_argop4 argop[1];
   3423 	int doqueue = 1;
   3424 	int rpc_error;
   3425 	cred_t *cr;
   3426 	mntinfo4_t *mi;
   3427 	timespec_t prop_time, after_time;
   3428 	int needrecov = FALSE;
   3429 	nfs4_recov_state_t recov_state;
   3430 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   3431 
   3432 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
   3433 
   3434 	recov_state.rs_flags = 0;
   3435 	recov_state.rs_num_retry_despite_err = 0;
   3436 
   3437 recov_retry:
   3438 	mi = sp->mntinfo4_list;
   3439 	VFS_HOLD(mi->mi_vfsp);
   3440 	mutex_exit(&sp->s_lock);
   3441 	ASSERT(mi != NULL);
   3442 
   3443 	e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
   3444 	if (e.error) {
   3445 		VFS_RELE(mi->mi_vfsp);
   3446 		return (e.error);
   3447 	}
   3448 
   3449 	/* Check to see if we're dealing with a marked-dead sp */
   3450 	mutex_enter(&sp->s_lock);
   3451 	if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
   3452 		mutex_exit(&sp->s_lock);
   3453 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
   3454 		VFS_RELE(mi->mi_vfsp);
   3455 		return (0);
   3456 	}
   3457 
   3458 	/* Make sure mi hasn't changed on us */
   3459 	if (mi != sp->mntinfo4_list) {
   3460 		/* Must drop sp's lock to avoid a recursive mutex enter */
   3461 		mutex_exit(&sp->s_lock);
   3462 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
   3463 		VFS_RELE(mi->mi_vfsp);
   3464 		mutex_enter(&sp->s_lock);
   3465 		goto recov_retry;
   3466 	}
   3467 	mutex_exit(&sp->s_lock);
   3468 
   3469 	args.ctag = TAG_RENEW;
   3470 
   3471 	args.array_len = 1;
   3472 	args.array = argop;
   3473 
   3474 	argop[0].argop = OP_RENEW;
   3475 
   3476 	mutex_enter(&sp->s_lock);
   3477 	argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
   3478 	cr = sp->s_cred;
   3479 	crhold(cr);
   3480 	mutex_exit(&sp->s_lock);
   3481 
   3482 	ASSERT(cr != NULL);
   3483 
   3484 	/* used to figure out RTT for sp */
   3485 	gethrestime(&prop_time);
   3486 
   3487 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
   3488 	    "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
   3489 	    (void*)sp));
   3490 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
   3491 	    prop_time.tv_sec, prop_time.tv_nsec));
   3492 
   3493 	DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
   3494 	    mntinfo4_t *, mi);
   3495 
   3496 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
   3497 	crfree(cr);
   3498 
   3499 	DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
   3500 	    mntinfo4_t *, mi);
   3501 
   3502 	gethrestime(&after_time);
   3503 
   3504 	mutex_enter(&sp->s_lock);
   3505 	sp->propagation_delay.tv_sec =
   3506 	    MAX(1, after_time.tv_sec - prop_time.tv_sec);
   3507 	mutex_exit(&sp->s_lock);
   3508 
   3509 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
   3510 	    after_time.tv_sec, after_time.tv_nsec));
   3511 
   3512 	if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
   3513 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3514 		nfs4_delegreturn_all(sp);
   3515 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
   3516 		VFS_RELE(mi->mi_vfsp);
   3517 		/*
   3518 		 * If the server returns CB_PATH_DOWN, it has renewed
   3519 		 * the lease and informed us that the callback path is
   3520 		 * down.  Since the lease is renewed, just return 0 and
   3521 		 * let the renew thread proceed as normal.
   3522 		 */
   3523 		return (0);
   3524 	}
   3525 
   3526 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
   3527 	if (!needrecov && e.error) {
   3528 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
   3529 		VFS_RELE(mi->mi_vfsp);
   3530 		return (e.error);
   3531 	}
   3532 
   3533 	rpc_error = e.error;
   3534 
   3535 	if (needrecov) {
   3536 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   3537 		    "nfs4renew: initiating recovery\n"));
   3538 
   3539 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
   3540 		    OP_RENEW, NULL, NULL, NULL) == FALSE) {
   3541 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
   3542 			VFS_RELE(mi->mi_vfsp);
   3543 			if (!e.error)
   3544 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   3545 				    (caddr_t)&res);
   3546 			mutex_enter(&sp->s_lock);
   3547 			goto recov_retry;
   3548 		}
   3549 		/* fall through for res.status case */
   3550 	}
   3551 
   3552 	if (res.status) {
   3553 		if (res.status == NFS4ERR_LEASE_MOVED) {
   3554 			/*EMPTY*/
   3555 			/*
   3556 			 * XXX need to try every mntinfo4 in sp->mntinfo4_list
   3557 			 * to renew the lease on that server
   3558 			 */
   3559 		}
   3560 		e.error = geterrno4(res.status);
   3561 	}
   3562 
   3563 	if (!rpc_error)
   3564 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3565 
   3566 	nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
   3567 
   3568 	VFS_RELE(mi->mi_vfsp);
   3569 
   3570 	return (e.error);
   3571 }
   3572 
   3573 void
   3574 nfs4_inc_state_ref_count(mntinfo4_t *mi)
   3575 {
   3576 	nfs4_server_t	*sp;
   3577 
   3578 	/* this locks down sp if it is found */
   3579 	sp = find_nfs4_server(mi);
   3580 
   3581 	if (sp != NULL) {
   3582 		nfs4_inc_state_ref_count_nolock(sp, mi);
   3583 		mutex_exit(&sp->s_lock);
   3584 		nfs4_server_rele(sp);
   3585 	}
   3586 }
   3587 
   3588 /*
   3589  * Bump the number of OPEN files (ie: those with state) so we know if this
   3590  * nfs4_server has any state to maintain a lease for or not.
   3591  *
   3592  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
   3593  */
   3594 void
   3595 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
   3596 {
   3597 	ASSERT(mutex_owned(&sp->s_lock));
   3598 
   3599 	sp->state_ref_count++;
   3600 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3601 	    "nfs4_inc_state_ref_count: state_ref_count now %d",
   3602 	    sp->state_ref_count));
   3603 
   3604 	if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
   3605 		sp->lease_valid = NFS4_LEASE_VALID;
   3606 
   3607 	/*
   3608 	 * If this call caused the lease to be marked valid and/or
   3609 	 * took the state_ref_count from 0 to 1, then start the time
   3610 	 * on lease renewal.
   3611 	 */
   3612 	if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
   3613 		sp->last_renewal_time = gethrestime_sec();
   3614 
   3615 	/* update the number of open files for mi */
   3616 	mi->mi_open_files++;
   3617 }
   3618 
   3619 void
   3620 nfs4_dec_state_ref_count(mntinfo4_t *mi)
   3621 {
   3622 	nfs4_server_t	*sp;
   3623 
   3624 	/* this locks down sp if it is found */
   3625 	sp = find_nfs4_server_all(mi, 1);
   3626 
   3627 	if (sp != NULL) {
   3628 		nfs4_dec_state_ref_count_nolock(sp, mi);
   3629 		mutex_exit(&sp->s_lock);
   3630 		nfs4_server_rele(sp);
   3631 	}
   3632 }
   3633 
   3634 /*
   3635  * Decrement the number of OPEN files (ie: those with state) so we know if
   3636  * this nfs4_server has any state to maintain a lease for or not.
   3637  */
   3638 void
   3639 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
   3640 {
   3641 	ASSERT(mutex_owned(&sp->s_lock));
   3642 	ASSERT(sp->state_ref_count != 0);
   3643 	sp->state_ref_count--;
   3644 
   3645 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3646 	    "nfs4_dec_state_ref_count: state ref count now %d",
   3647 	    sp->state_ref_count));
   3648 
   3649 	mi->mi_open_files--;
   3650 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3651 	    "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
   3652 	    mi->mi_open_files, mi->mi_flags));
   3653 
   3654 	/* We don't have to hold the mi_lock to test mi_flags */
   3655 	if (mi->mi_open_files == 0 &&
   3656 	    (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
   3657 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3658 		    "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
   3659 		    "we have closed the last open file", (void*)mi));
   3660 		nfs4_remove_mi_from_server(mi, sp);
   3661 	}
   3662 }
   3663 
   3664 bool_t
   3665 inlease(nfs4_server_t *sp)
   3666 {
   3667 	bool_t result;
   3668 
   3669 	ASSERT(mutex_owned(&sp->s_lock));
   3670 
   3671 	if (sp->lease_valid == NFS4_LEASE_VALID &&
   3672 	    gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
   3673 		result = TRUE;
   3674 	else
   3675 		result = FALSE;
   3676 
   3677 	return (result);
   3678 }
   3679 
   3680 
   3681 /*
   3682  * Return non-zero if the given nfs4_server_t is going through recovery.
   3683  */
   3684 
   3685 int
   3686 nfs4_server_in_recovery(nfs4_server_t *sp)
   3687 {
   3688 	return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
   3689 }
   3690 
   3691 /*
   3692  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
   3693  * first is less than, equal to, or greater than the second.
   3694  */
   3695 
   3696 int
   3697 sfh4cmp(const void *p1, const void *p2)
   3698 {
   3699 	const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
   3700 	const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
   3701 
   3702 	return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
   3703 }
   3704 
   3705 /*
   3706  * Create a table for shared filehandle objects.
   3707  */
   3708 
   3709 void
   3710 sfh4_createtab(avl_tree_t *tab)
   3711 {
   3712 	avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
   3713 	    offsetof(nfs4_sharedfh_t, sfh_tree));
   3714 }
   3715 
   3716 /*
   3717  * Return a shared filehandle object for the given filehandle.  The caller
   3718  * is responsible for eventually calling sfh4_rele().
   3719  */
   3720 
   3721 nfs4_sharedfh_t *
   3722 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
   3723 {
   3724 	nfs4_sharedfh_t *sfh, *nsfh;
   3725 	avl_index_t where;
   3726 	nfs4_sharedfh_t skey;
   3727 
   3728 	if (!key) {
   3729 		skey.sfh_fh = *fh;
   3730 		key = &skey;
   3731 	}
   3732 
   3733 	nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
   3734 	nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
   3735 	/*
   3736 	 * We allocate the largest possible filehandle size because it's
   3737 	 * not that big, and it saves us from possibly having to resize the
   3738 	 * buffer later.
   3739 	 */
   3740 	nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
   3741 	bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
   3742 	mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
   3743 	nsfh->sfh_refcnt = 1;
   3744 	nsfh->sfh_flags = SFH4_IN_TREE;
   3745 	nsfh->sfh_mi = mi;
   3746 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
   3747 	    (void *)nsfh));
   3748 
   3749 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
   3750 	sfh = avl_find(&mi->mi_filehandles, key, &where);
   3751 	if (sfh != NULL) {
   3752 		mutex_enter(&sfh->sfh_lock);
   3753 		sfh->sfh_refcnt++;
   3754 		mutex_exit(&sfh->sfh_lock);
   3755 		nfs_rw_exit(&mi->mi_fh_lock);
   3756 		/* free our speculative allocs */
   3757 		kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
   3758 		kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
   3759 		return (sfh);
   3760 	}
   3761 
   3762 	avl_insert(&mi->mi_filehandles, nsfh, where);
   3763 	nfs_rw_exit(&mi->mi_fh_lock);
   3764 
   3765 	return (nsfh);
   3766 }
   3767 
   3768 /*
   3769  * Return a shared filehandle object for the given filehandle.  The caller
   3770  * is responsible for eventually calling sfh4_rele().
   3771  */
   3772 
   3773 nfs4_sharedfh_t *
   3774 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
   3775 {
   3776 	nfs4_sharedfh_t *sfh;
   3777 	nfs4_sharedfh_t key;
   3778 
   3779 	ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
   3780 
   3781 #ifdef DEBUG
   3782 	if (nfs4_sharedfh_debug) {
   3783 		nfs4_fhandle_t fhandle;
   3784 
   3785 		fhandle.fh_len = fh->nfs_fh4_len;
   3786 		bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
   3787 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
   3788 		nfs4_printfhandle(&fhandle);
   3789 	}
   3790 #endif
   3791 
   3792 	/*
   3793 	 * If there's already an object for the given filehandle, bump the
   3794 	 * reference count and return it.  Otherwise, create a new object
   3795 	 * and add it to the AVL tree.
   3796 	 */
   3797 
   3798 	key.sfh_fh = *fh;
   3799 
   3800 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
   3801 	sfh = avl_find(&mi->mi_filehandles, &key, NULL);
   3802 	if (sfh != NULL) {
   3803 		mutex_enter(&sfh->sfh_lock);
   3804 		sfh->sfh_refcnt++;
   3805 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
   3806 		    "sfh4_get: found existing %p, new refcnt=%d",
   3807 		    (void *)sfh, sfh->sfh_refcnt));
   3808 		mutex_exit(&sfh->sfh_lock);
   3809 		nfs_rw_exit(&mi->mi_fh_lock);
   3810 		return (sfh);
   3811 	}
   3812 	nfs_rw_exit(&mi->mi_fh_lock);
   3813 
   3814 	return (sfh4_put(fh, mi, &key));
   3815 }
   3816 
   3817 /*
   3818  * Get a reference to the given shared filehandle object.
   3819  */
   3820 
   3821 void
   3822 sfh4_hold(nfs4_sharedfh_t *sfh)
   3823 {
   3824 	ASSERT(sfh->sfh_refcnt > 0);
   3825 
   3826 	mutex_enter(&sfh->sfh_lock);
   3827 	sfh->sfh_refcnt++;
   3828 	NFS4_DEBUG(nfs4_sharedfh_debug,
   3829 	    (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
   3830 	    (void *)sfh, sfh->sfh_refcnt));
   3831 	mutex_exit(&sfh->sfh_lock);
   3832 }
   3833 
   3834 /*
   3835  * Release a reference to the given shared filehandle object and null out
   3836  * the given pointer.
   3837  */
   3838 
   3839 void
   3840 sfh4_rele(nfs4_sharedfh_t **sfhpp)
   3841 {
   3842 	mntinfo4_t *mi;
   3843 	nfs4_sharedfh_t *sfh = *sfhpp;
   3844 
   3845 	ASSERT(sfh->sfh_refcnt > 0);
   3846 
   3847 	mutex_enter(&sfh->sfh_lock);
   3848 	if (sfh->sfh_refcnt > 1) {
   3849 		sfh->sfh_refcnt--;
   3850 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
   3851 		    "sfh4_rele %p, new refcnt=%d",
   3852 		    (void *)sfh, sfh->sfh_refcnt));
   3853 		mutex_exit(&sfh->sfh_lock);
   3854 		goto finish;
   3855 	}
   3856 	mutex_exit(&sfh->sfh_lock);
   3857 
   3858 	/*
   3859 	 * Possibly the last reference, so get the lock for the table in
   3860 	 * case it's time to remove the object from the table.
   3861 	 */
   3862 	mi = sfh->sfh_mi;
   3863 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
   3864 	mutex_enter(&sfh->sfh_lock);
   3865 	sfh->sfh_refcnt--;
   3866 	if (sfh->sfh_refcnt > 0) {
   3867 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
   3868 		    "sfh4_rele %p, new refcnt=%d",
   3869 		    (void *)sfh, sfh->sfh_refcnt));
   3870 		mutex_exit(&sfh->sfh_lock);
   3871 		nfs_rw_exit(&mi->mi_fh_lock);
   3872 		goto finish;
   3873 	}
   3874 
   3875 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
   3876 	    "sfh4_rele %p, last ref", (void *)sfh));
   3877 	if (sfh->sfh_flags & SFH4_IN_TREE) {
   3878 		avl_remove(&mi->mi_filehandles, sfh);
   3879 		sfh->sfh_flags &= ~SFH4_IN_TREE;
   3880 	}
   3881 	mutex_exit(&sfh->sfh_lock);
   3882 	nfs_rw_exit(&mi->mi_fh_lock);
   3883 	mutex_destroy(&sfh->sfh_lock);
   3884 	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
   3885 	kmem_free(sfh, sizeof (nfs4_sharedfh_t));
   3886 
   3887 finish:
   3888 	*sfhpp = NULL;
   3889 }
   3890 
   3891 /*
   3892  * Update the filehandle for the given shared filehandle object.
   3893  */
   3894 
   3895 int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */
   3896 
   3897 void
   3898 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
   3899 {
   3900 	mntinfo4_t *mi = sfh->sfh_mi;
   3901 	nfs4_sharedfh_t *dupsfh;
   3902 	avl_index_t where;
   3903 	nfs4_sharedfh_t key;
   3904 
   3905 #ifdef DEBUG
   3906 	mutex_enter(&sfh->sfh_lock);
   3907 	ASSERT(sfh->sfh_refcnt > 0);
   3908 	mutex_exit(&sfh->sfh_lock);
   3909 #endif
   3910 	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
   3911 
   3912 	/*
   3913 	 * The basic plan is to remove the shared filehandle object from
   3914 	 * the table, update it to have the new filehandle, then reinsert
   3915 	 * it.
   3916 	 */
   3917 
   3918 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
   3919 	mutex_enter(&sfh->sfh_lock);
   3920 	if (sfh->sfh_flags & SFH4_IN_TREE) {
   3921 		avl_remove(&mi->mi_filehandles, sfh);
   3922 		sfh->sfh_flags &= ~SFH4_IN_TREE;
   3923 	}
   3924 	mutex_exit(&sfh->sfh_lock);
   3925 	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
   3926 	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
   3927 	    sfh->sfh_fh.nfs_fh4_len);
   3928 
   3929 	/*
   3930 	 * XXX If there is already a shared filehandle object with the new
   3931 	 * filehandle, we're in trouble, because the rnode code assumes
   3932 	 * that there is only one shared filehandle object for a given
   3933 	 * filehandle.  So issue a warning (for read-write mounts only)
   3934 	 * and don't try to re-insert the given object into the table.
   3935 	 * Hopefully the given object will quickly go away and everyone
   3936 	 * will use the new object.
   3937 	 */
   3938 	key.sfh_fh = *newfh;
   3939 	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
   3940 	if (dupsfh != NULL) {
   3941 		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
   3942 			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
   3943 			    "duplicate filehandle detected");
   3944 			sfh4_printfhandle(dupsfh);
   3945 		}
   3946 	} else {
   3947 		avl_insert(&mi->mi_filehandles, sfh, where);
   3948 		mutex_enter(&sfh->sfh_lock);
   3949 		sfh->sfh_flags |= SFH4_IN_TREE;
   3950 		mutex_exit(&sfh->sfh_lock);
   3951 	}
   3952 	nfs_rw_exit(&mi->mi_fh_lock);
   3953 }
   3954 
   3955 /*
   3956  * Copy out the current filehandle for the given shared filehandle object.
   3957  */
   3958 
   3959 void
   3960 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
   3961 {
   3962 	mntinfo4_t *mi = sfh->sfh_mi;
   3963 
   3964 	ASSERT(sfh->sfh_refcnt > 0);
   3965 
   3966 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
   3967 	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
   3968 	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
   3969 	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
   3970 	nfs_rw_exit(&mi->mi_fh_lock);
   3971 }
   3972 
   3973 /*
   3974  * Print out the filehandle for the given shared filehandle object.
   3975  */
   3976 
   3977 void
   3978 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
   3979 {
   3980 	nfs4_fhandle_t fhandle;
   3981 
   3982 	sfh4_copyval(sfh, &fhandle);
   3983 	nfs4_printfhandle(&fhandle);
   3984 }
   3985 
   3986 /*
   3987  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
   3988  * if they're the same, +1 if the first is "greater" than the second.  The
   3989  * caller (or whoever's calling the AVL package) is responsible for
   3990  * handling locking issues.
   3991  */
   3992 
   3993 static int
   3994 fncmp(const void *p1, const void *p2)
   3995 {
   3996 	const nfs4_fname_t *f1 = p1;
   3997 	const nfs4_fname_t *f2 = p2;
   3998 	int res;
   3999 
   4000 	res = strcmp(f1->fn_name, f2->fn_name);
   4001 	/*
   4002 	 * The AVL package wants +/-1, not arbitrary positive or negative
   4003 	 * integers.
   4004 	 */
   4005 	if (res > 0)
   4006 		res = 1;
   4007 	else if (res < 0)
   4008 		res = -1;
   4009 	return (res);
   4010 }
   4011 
/*
 * Get or create an fname with the given name, as a child of the given
 * fname.  The caller is responsible for eventually releasing the reference
 * (fn_rele()).  parent may be NULL.
 *
 *	parent	fname whose fn_children tree is searched and, for a new
 *		entry, inserted into.  When NULL, no lookup or insertion
 *		is done and a new parentless fname is always created.
 *	name	NUL-terminated component name; a new fname gets its own
 *		copy of it.
 *	sfh	shared filehandle the returned fname must be associated
 *		with; a new fname takes its own hold on it (sfh4_hold()).
 *
 * Returns the fname with a reference held for the caller.
 */

nfs4_fname_t *
fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
{
	nfs4_fname_t key;
	nfs4_fname_t *fnp;
	avl_index_t where;	/* only set and used when parent != NULL */

	/* fncmp() compares fn_name only, so that is all the key needs */
	key.fn_name = name;

	/*
	 * If there's already an fname registered with the given name, bump
	 * its reference count and return it.  Otherwise, create a new one
	 * and add it to the parent's AVL tree.
	 *
	 * fname entries we are looking for should match both name
	 * and sfh stored in the fname.
	 */
again:
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		fnp = avl_find(&parent->fn_children, &key, &where);
		if (fnp != NULL) {
			/*
			 * This hold on fnp is released below later,
			 * in case this is not the fnp we want.
			 */
			fn_hold(fnp);

			if (fnp->fn_sfh == sfh) {
				/*
				 * We have found our entry.  The hold taken
				 * above becomes the caller's reference.
				 */
				mutex_exit(&parent->fn_lock);
				return (fnp);
			}

			/*
			 * We have found an entry that has a mismatching
			 * fn_sfh. This could be a stale entry due to
			 * server side rename. We will remove this entry
			 * and make sure no such entries exist.
			 *
			 * The parent's lock is dropped before the child's
			 * lock is taken, so the child's fn_parent has to be
			 * re-checked once the child is locked.
			 */
			mutex_exit(&parent->fn_lock);
			mutex_enter(&fnp->fn_lock);
			if (fnp->fn_parent == parent) {
				/*
				 * Remove ourselves from parent's
				 * fn_children tree.  fn_rele() then drops
				 * the stale entry's reference on parent and
				 * NULLs its fn_parent pointer.
				 */
				mutex_enter(&parent->fn_lock);
				avl_remove(&parent->fn_children, fnp);
				mutex_exit(&parent->fn_lock);
				fn_rele(&fnp->fn_parent);
			}
			mutex_exit(&fnp->fn_lock);
			/* drop the hold taken above, then search again */
			fn_rele(&fnp);
			goto again;
		}
	}

	/*
	 * No entry with this name exists.  If parent is non-NULL, its
	 * fn_lock is still held from the lookup above and stays held until
	 * the avl_insert() at the bottom, which uses the "where" index
	 * obtained from that lookup.
	 */
	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
	fnp->fn_parent = parent;
	if (parent != NULL)
		fn_hold(parent);	/* the child's reference on its parent */
	fnp->fn_len = strlen(name);
	ASSERT(fnp->fn_len < MAXNAMELEN);
	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
	(void) strcpy(fnp->fn_name, name);
	fnp->fn_refcnt = 1;	/* the caller's reference */

	/*
	 * This hold on sfh is later released
	 * when we do the final fn_rele() on this fname.
	 */
	sfh4_hold(sfh);
	fnp->fn_sfh = sfh;

	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
	    offsetof(nfs4_fname_t, fn_tree));
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_get %p:%s, a new nfs4_fname_t!",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_insert(&parent->fn_children, fnp, where);
		mutex_exit(&parent->fn_lock);
	}

	return (fnp);
}
   4109 
   4110 void
   4111 fn_hold(nfs4_fname_t *fnp)
   4112 {
   4113 	atomic_add_32(&fnp->fn_refcnt, 1);
   4114 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
   4115 	    "fn_hold %p:%s, new refcnt=%d",
   4116 	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
   4117 }
   4118 
/*
 * Decrement the reference count of the given fname, and destroy it if its
 * reference count goes to zero.  Nulls out the given pointer.
 *
 * Destroying an fname also gives up the reference it held on its parent,
 * which may in turn destroy the parent, and so on up the tree.  That is
 * done iteratively (see the bottom of the function).
 *
 * Lock order is child fn_lock first, then parent fn_lock.
 */

void
fn_rele(nfs4_fname_t **fnpp)
{
	nfs4_fname_t *parent;
	uint32_t newref;
	nfs4_fname_t *fnp;

recur:
	/* the caller's pointer is cleared before anything else is done */
	fnp = *fnpp;
	*fnpp = NULL;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		mutex_enter(&parent->fn_lock);	/* prevent new references */
	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
	if (newref > 0) {
		/* other references remain; just unlock and return */
		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
		    "fn_rele %p:%s, new refcnt=%d",
		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
		if (parent != NULL)
			mutex_exit(&parent->fn_lock);
		mutex_exit(&fnp->fn_lock);
		return;
	}

	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_rele %p:%s, last reference, deleting...",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
	}
	kmem_free(fnp->fn_name, fnp->fn_len + 1);
	sfh4_rele(&fnp->fn_sfh);
	/*
	 * NOTE(review): fnp->fn_lock is still held by this thread here;
	 * there is no mutex_exit() on the last-reference path.  This
	 * presumably relies on mutex_destroy() accepting a mutex held by
	 * the caller -- confirm.
	 */
	mutex_destroy(&fnp->fn_lock);
	avl_destroy(&fnp->fn_children);
	kmem_free(fnp, sizeof (nfs4_fname_t));
	/*
	 * Recursively fn_rele the parent.
	 * Use goto instead of a recursive call to avoid stack overflow.
	 * The local "parent" stands in for the pointer being released, so
	 * it is the local that gets nulled on the next pass.
	 */
	if (parent != NULL) {
		fnpp = &parent;
		goto recur;
	}
}
   4171 
   4172 /*
   4173  * Returns the single component name of the given fname, in a MAXNAMELEN
   4174  * string buffer, which the caller is responsible for freeing.  Note that
   4175  * the name may become invalid as a result of fn_move().
   4176  */
   4177 
   4178 char *
   4179 fn_name(nfs4_fname_t *fnp)
   4180 {
   4181 	char *name;
   4182 
   4183 	ASSERT(fnp->fn_len < MAXNAMELEN);
   4184 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
   4185 	mutex_enter(&fnp->fn_lock);
   4186 	(void) strcpy(name, fnp->fn_name);
   4187 	mutex_exit(&fnp->fn_lock);
   4188 
   4189 	return (name);
   4190 }
   4191 
   4192 
   4193 /*
   4194  * fn_path_realloc
   4195  *
   4196  * This function, used only by fn_path, constructs
   4197  * a new string which looks like "prepend" + "/" + "current".
   4198  * by allocating a new string and freeing the old one.
   4199  */
   4200 static void
   4201 fn_path_realloc(char **curses, char *prepend)
   4202 {
   4203 	int len, curlen = 0;
   4204 	char *news;
   4205 
   4206 	if (*curses == NULL) {
   4207 		/*
   4208 		 * Prime the pump, allocate just the
   4209 		 * space for prepend and return that.
   4210 		 */
   4211 		len = strlen(prepend) + 1;
   4212 		news = kmem_alloc(len, KM_SLEEP);
   4213 		(void) strncpy(news, prepend, len);
   4214 	} else {
   4215 		/*
   4216 		 * Allocate the space  for a new string
   4217 		 * +1 +1 is for the "/" and the NULL
   4218 		 * byte at the end of it all.
   4219 		 */
   4220 		curlen = strlen(*curses);
   4221 		len = curlen + strlen(prepend) + 1 + 1;
   4222 		news = kmem_alloc(len, KM_SLEEP);
   4223 		(void) strncpy(news, prepend, len);
   4224 		(void) strcat(news, "/");
   4225 		(void) strcat(news, *curses);
   4226 		kmem_free(*curses, curlen + 1);
   4227 	}
   4228 	*curses = news;
   4229 }
   4230 
   4231 /*
   4232  * Returns the path name (starting from the fs root) for the given fname.
   4233  * The caller is responsible for freeing.  Note that the path may be or
   4234  * become invalid as a result of fn_move().
   4235  */
   4236 
   4237 char *
   4238 fn_path(nfs4_fname_t *fnp)
   4239 {
   4240 	char *path;
   4241 	nfs4_fname_t *nextfnp;
   4242 
   4243 	if (fnp == NULL)
   4244 		return (NULL);
   4245 
   4246 	path = NULL;
   4247 
   4248 	/* walk up the tree constructing the pathname.  */
   4249 
   4250 	fn_hold(fnp);			/* adjust for later rele */
   4251 	do {
   4252 		mutex_enter(&fnp->fn_lock);
   4253 		/*
   4254 		 * Add fn_name in front of the current path
   4255 		 */
   4256 		fn_path_realloc(&path, fnp->fn_name);
   4257 		nextfnp = fnp->fn_parent;
   4258 		if (nextfnp != NULL)
   4259 			fn_hold(nextfnp);
   4260 		mutex_exit(&fnp->fn_lock);
   4261 		fn_rele(&fnp);
   4262 		fnp = nextfnp;
   4263 	} while (fnp != NULL);
   4264 
   4265 	return (path);
   4266 }
   4267 
   4268 /*
   4269  * Return a reference to the parent of the given fname, which the caller is
   4270  * responsible for eventually releasing.
   4271  */
   4272 
   4273 nfs4_fname_t *
   4274 fn_parent(nfs4_fname_t *fnp)
   4275 {
   4276 	nfs4_fname_t *parent;
   4277 
   4278 	mutex_enter(&fnp->fn_lock);
   4279 	parent = fnp->fn_parent;
   4280 	if (parent != NULL)
   4281 		fn_hold(parent);
   4282 	mutex_exit(&fnp->fn_lock);
   4283 
   4284 	return (parent);
   4285 }
   4286 
/*
 * Update fnp so that its parent is newparent and its name is newname.
 *
 *	fnp		fname to move; its fn_lock is held for the whole
 *			operation.
 *	newparent	fname that becomes the new parent.  It is
 *			dereferenced unconditionally and must differ
 *			from fnp.
 *	newname		NUL-terminated new component name; it is copied.
 *
 * Any other fname already registered under newparent with the same name
 * is removed from newparent's tree first.
 */

void
fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
{
	nfs4_fname_t *parent, *tmpfnp;
	ssize_t newlen;
	nfs4_fname_t key;
	avl_index_t where;

	/*
	 * This assert exists to catch the client trying to rename
	 * a dir to be a child of itself.  This happened at a recent
	 * bakeoff against a 3rd party (broken) server which allowed
	 * the rename to succeed.  If it trips it means that:
	 *	a) the code in nfs4rename that detects this case is broken
	 *	b) the server is broken (since it allowed the bogus rename)
	 *
	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
	 * panic below from:  mutex_enter(&newparent->fn_lock);
	 */
	ASSERT(fnp != newparent);

	/*
	 * Remove fnp from its current parent, change its name, then add it
	 * to newparent. It might happen that fnp was replaced by another
	 * nfs4_fname_t with the same fn_name in parent->fn_children.
	 * In such case, fnp->fn_parent is NULL and we skip the removal
	 * of fnp from its current parent.
	 */
	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
		/* drops fnp's reference on the old parent, NULLs fn_parent */
		fn_rele(&fnp->fn_parent);
	}

	/* the name buffer is only reallocated when the length changes */
	newlen = strlen(newname);
	if (newlen != fnp->fn_len) {
		ASSERT(newlen < MAXNAMELEN);
		kmem_free(fnp->fn_name, fnp->fn_len + 1);
		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
		fnp->fn_len = newlen;
	}
	(void) strcpy(fnp->fn_name, newname);

again:
	mutex_enter(&newparent->fn_lock);
	key.fn_name = fnp->fn_name;
	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
	if (tmpfnp != NULL) {
		/*
		 * This could be due to a file that was unlinked while
		 * open, or perhaps the rnode is in the free list.  Remove
		 * it from newparent and let it go away on its own.  The
		 * contorted code is to deal with lock order issues and
		 * race conditions.
		 *
		 * newparent's lock is dropped before tmpfnp's lock is
		 * taken, so tmpfnp's fn_parent is re-checked once it is
		 * locked, and the lookup is repeated from "again"
		 * afterwards.
		 */
		fn_hold(tmpfnp);
		mutex_exit(&newparent->fn_lock);
		mutex_enter(&tmpfnp->fn_lock);
		if (tmpfnp->fn_parent == newparent) {
			mutex_enter(&newparent->fn_lock);
			avl_remove(&newparent->fn_children, tmpfnp);
			mutex_exit(&newparent->fn_lock);
			fn_rele(&tmpfnp->fn_parent);
		}
		mutex_exit(&tmpfnp->fn_lock);
		/* drop the hold taken above */
		fn_rele(&tmpfnp);
		goto again;
	}
	/*
	 * No clash: newparent->fn_lock is still held from the lookup, so
	 * "where" is still valid for the insert.
	 */
	fnp->fn_parent = newparent;
	fn_hold(newparent);	/* fnp's reference on its new parent */
	avl_insert(&newparent->fn_children, fnp, where);
	mutex_exit(&newparent->fn_lock);
	mutex_exit(&fnp->fn_lock);
}
   4368 
   4369 #ifdef DEBUG
   4370 /*
   4371  * Return non-zero if the type information makes sense for the given vnode.
   4372  * Otherwise panic.
   4373  */
   4374 int
   4375 nfs4_consistent_type(vnode_t *vp)
   4376 {
   4377 	rnode4_t *rp = VTOR4(vp);
   4378 
   4379 	if (nfs4_vtype_debug && vp->v_type != VNON &&
   4380 	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
   4381 		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
   4382 		    "rnode attr type=%d", (void *)vp, vp->v_type,
   4383 		    rp->r_attr.va_type);
   4384 	}
   4385 
   4386 	return (1);
   4387 }
   4388 #endif /* DEBUG */
   4389