OpenGrok

Cross Reference: nfs_client.c
xref: /onnv/onnv-gate/usr/src/uts/common/fs/nfs/nfs_client.c
Home | History | Annotate | Line # | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
     23  *
     24  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
     25  *	All rights reserved.
     26  */
     27 
     28 #include <sys/param.h>
     29 #include <sys/types.h>
     30 #include <sys/systm.h>
     31 #include <sys/thread.h>
     32 #include <sys/t_lock.h>
     33 #include <sys/time.h>
     34 #include <sys/vnode.h>
     35 #include <sys/vfs.h>
     36 #include <sys/errno.h>
     37 #include <sys/buf.h>
     38 #include <sys/stat.h>
     39 #include <sys/cred.h>
     40 #include <sys/kmem.h>
     41 #include <sys/debug.h>
     42 #include <sys/dnlc.h>
     43 #include <sys/vmsystm.h>
     44 #include <sys/flock.h>
     45 #include <sys/share.h>
     46 #include <sys/cmn_err.h>
     47 #include <sys/tiuser.h>
     48 #include <sys/sysmacros.h>
     49 #include <sys/callb.h>
     50 #include <sys/acl.h>
     51 #include <sys/kstat.h>
     52 #include <sys/signal.h>
     53 #include <sys/list.h>
     54 #include <sys/zone.h>
     55 
     56 #include <rpc/types.h>
     57 #include <rpc/xdr.h>
     58 #include <rpc/auth.h>
     59 #include <rpc/clnt.h>
     60 
     61 #include <nfs/nfs.h>
     62 #include <nfs/nfs_clnt.h>
     63 
     64 #include <nfs/rnode.h>
     65 #include <nfs/nfs_acl.h>
     66 #include <nfs/lm.h>
     67 
     68 #include <vm/hat.h>
     69 #include <vm/as.h>
     70 #include <vm/page.h>
     71 #include <vm/pvn.h>
     72 #include <vm/seg.h>
     73 #include <vm/seg_map.h>
     74 #include <vm/seg_vn.h>
     75 
     /* Forward declarations of the file-local (static) helper routines. */
     76 static void	nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
     77 			cred_t *);
     78 static int	nfs_getattr_cache(vnode_t *, struct vattr *);
     79 static int	nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
     80 
     /*
      * Bookkeeping for a list of NFS v2/v3 mounts; mig_lock guards mig_list.
      * NOTE(review): presumably one instance exists per zone and is looked
      * up through mi_list_key -- confirm against the zone key setup code.
      */
     81 struct mi_globals {
     82 	kmutex_t	mig_lock;  /* lock protecting mig_list */
     83 	list_t		mig_list;  /* list of NFS v2 or v3 mounts in zone */
        	/*
        	 * NOTE(review): presumably set once the zone destructor
        	 * callback has run -- confirm against the code that sets it.
        	 */
     84 	boolean_t	mig_destructor_called;
     85 };
     86 
     /*
      * NOTE(review): presumably the zone key under which each zone's
      * struct mi_globals is registered -- confirm against the code that
      * creates the key.
      */
     87 static zone_key_t mi_list_key;
     88 
     89 /* Debugging flag for PC file shares. */
     90 extern int	share_debug;
     91 
     92 /*
     93  * Attributes caching:
     94  *
     95  * Attributes are cached in the rnode in struct vattr form.
     96  * There is a time associated with the cached attributes (r_attrtime)
     97  * which tells whether the attributes are valid. The time is initialized
     98  * to the difference between current time and the modify time of the vnode
     99  * when new attributes are cached. This allows the attributes for
    100  * files that have changed recently to be timed out sooner than for files
    101  * that have not changed for a long time. There are minimum and maximum
    102  * timeout values that can be set per mount point.
    103  */
    104 
    105 int
    106 nfs_waitfor_purge_complete(vnode_t *vp)
    107 {
    108 	rnode_t *rp;
    109 	k_sigset_t smask;
    110 
    111 	rp = VTOR(vp);
    112 	if (rp->r_serial != NULL && rp->r_serial != curthread) {
    113 		mutex_enter(&rp->r_statelock);
    114 		sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
    115 		while (rp->r_serial != NULL) {
    116 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
    117 				sigunintr(&smask);
    118 				mutex_exit(&rp->r_statelock);
    119 				return (EINTR);
    120 			}
    121 		}
    122 		sigunintr(&smask);
    123 		mutex_exit(&rp->r_statelock);
    124 	}
    125 	return (0);
    126 }
    127 
    128 /*
    129  * Validate caches by checking cached attributes. If the cached
    130  * attributes have timed out, then get new attributes from the server.
    131  * As a side affect, this will do cache invalidation if the attributes
    132  * have changed.
    133  *
    134  * If the attributes have not timed out and if there is a cache
    135  * invalidation being done by some other thread, then wait until that
    136  * thread has completed the cache invalidation.
    137  */
    138 int
    139 nfs_validate_caches(vnode_t *vp, cred_t *cr)
    140 {
    141 	int error;
    142 	struct vattr va;
    143 
    144 	if (ATTRCACHE_VALID(vp)) {
    145 		error = nfs_waitfor_purge_complete(vp);
    146 		if (error)
    147 			return (error);
    148 		return (0);
    149 	}
    150 
    151 	va.va_mask = AT_ALL;
    152 	return (nfs_getattr_otw(vp, &va, cr));
    153 }
    154 
    155 /*
    156  * Validate caches by checking cached attributes. If the cached
    157  * attributes have timed out, then get new attributes from the server.
    158  * As a side affect, this will do cache invalidation if the attributes
    159  * have changed.
    160  *
    161  * If the attributes have not timed out and if there is a cache
    162  * invalidation being done by some other thread, then wait until that
    163  * thread has completed the cache invalidation.
    164  */
    165 int
    166 nfs3_validate_caches(vnode_t *vp, cred_t *cr)
    167 {
    168 	int error;
    169 	struct vattr va;
    170 
    171 	if (ATTRCACHE_VALID(vp)) {
    172 		error = nfs_waitfor_purge_complete(vp);
    173 		if (error)
    174 			return (error);
    175 		return (0);
    176 	}
    177 
    178 	va.va_mask = AT_ALL;
    179 	return (nfs3_getattr_otw(vp, &va, cr));
    180 }
    181 
    182 /*
    183  * Purge all of the various NFS `data' caches.
    184  */
    185 void
    186 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
    187 {
    188 	rnode_t *rp;
    189 	char *contents;
    190 	int size;
    191 	int error;
    192 
    193 	/*
    194 	 * Purge the DNLC for any entries which refer to this file.
    195 	 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
    196 	 */
    197 	rp = VTOR(vp);
    198 	mutex_enter(&rp->r_statelock);
    199 	if (vp->v_count > 1 &&
    200 	    (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
    201 	    !(rp->r_flags & RINDNLCPURGE)) {
    202 		/*
    203 		 * Set the RINDNLCPURGE flag to prevent recursive entry
    204 		 * into dnlc_purge_vp()
    205 		 */
    206 		if (vp->v_type == VDIR)
    207 			rp->r_flags |= RINDNLCPURGE;
    208 		mutex_exit(&rp->r_statelock);
    209 		dnlc_purge_vp(vp);
    210 		mutex_enter(&rp->r_statelock);
    211 		if (rp->r_flags & RINDNLCPURGE)
    212 			rp->r_flags &= ~RINDNLCPURGE;
    213 	}
    214 
    215 	/*
    216 	 * Clear any readdir state bits and purge the readlink response cache.
    217 	 */
    218 	contents = rp->r_symlink.contents;
    219 	size = rp->r_symlink.size;
    220 	rp->r_symlink.contents = NULL;
    221 	mutex_exit(&rp->r_statelock);
    222 
    223 	if (contents != NULL) {
    224 
    225 		kmem_free((void *)contents, size);
    226 	}
    227 
    228 	/*
    229 	 * Flush the page cache.
    230 	 */
    231 	if (vn_has_cached_data(vp)) {
    232 		error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
    233 		if (error && (error == ENOSPC || error == EDQUOT)) {
    234 			mutex_enter(&rp->r_statelock);
    235 			if (!rp->r_error)
    236 				rp->r_error = error;
    237 			mutex_exit(&rp->r_statelock);
    238 		}
    239 	}
    240 
    241 	/*
    242 	 * Flush the readdir response cache.
    243 	 */
    244 	if (HAVE_RDDIR_CACHE(rp))
    245 		nfs_purge_rddir_cache(vp);
    246 }
    247 
    248 /*
    249  * Purge the readdir cache of all entries
    250  */
    251 void
    252 nfs_purge_rddir_cache(vnode_t *vp)
    253 {
    254 	rnode_t *rp;
    255 	rddir_cache *rdc;
    256 	rddir_cache *nrdc;
    257 
    258 	rp = VTOR(vp);
    259 top:
    260 	mutex_enter(&rp->r_statelock);
    261 	rp->r_direof = NULL;
    262 	rp->r_flags &= ~RLOOKUP;
    263 	rp->r_flags |= RREADDIRPLUS;
    264 	rdc = avl_first(&rp->r_dir);
    265 	while (rdc != NULL) {
    266 		nrdc = AVL_NEXT(&rp->r_dir, rdc);
    267 		avl_remove(&rp->r_dir, rdc);
    268 		rddir_cache_rele(rdc);
    269 		rdc = nrdc;
    270 	}
    271 	mutex_exit(&rp->r_statelock);
    272 }
    273 
    274 /*
    275  * Do a cache check based on the post-operation attributes.
    276  * Then make them the new cached attributes.  If no attributes
    277  * were returned, then mark the attributes as timed out.
    278  */
    279 void
    280 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
    281 {
    282 	vattr_t attr;
    283 
    284 	if (!poap->attributes) {
    285 		PURGE_ATTRCACHE(vp);
    286 		return;
    287 	}
    288 	(void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
    289 }
    290 
    291 /*
    292  * Same as above, but using a vattr
    293  */
    294 void
    295 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
    296     cred_t *cr)
    297 {
    298 	if (!poap->attributes) {
    299 		PURGE_ATTRCACHE(vp);
    300 		return;
    301 	}
    302 	nfs_attr_cache(vp, poap->fres.vap, t, cr);
    303 }
    304 
    305 /*
    306  * Do a cache check based on the weak cache consistency attributes.
    307  * These consist of a small set of pre-operation attributes and the
    308  * full set of post-operation attributes.
    309  *
    310  * If we are given the pre-operation attributes, then use them to
    311  * check the validity of the various caches.  Then, if we got the
    312  * post-operation attributes, make them the new cached attributes.
    313  * If we didn't get the post-operation attributes, then mark the
    314  * attribute cache as timed out so that the next reference will
    315  * cause a GETATTR to the server to refresh with the current
    316  * attributes.
    317  *
    318  * Otherwise, if we didn't get the pre-operation attributes, but
    319  * we did get the post-operation attributes, then use these
    320  * attributes to check the validity of the various caches.  This
    321  * will probably cause a flush of the caches because if the
    322  * operation succeeded, the attributes of the object were changed
    323  * in some way from the old post-operation attributes.  This
    324  * should be okay because it is the safe thing to do.  After
    325  * checking the data caches, then we make these the new cached
    326  * attributes.
    327  *
    328  * Otherwise, we didn't get either the pre- or post-operation
    329  * attributes.  Simply mark the attribute cache as timed out so
    330  * the next reference will cause a GETATTR to the server to
    331  * refresh with the current attributes.
    332  *
    333  * If an error occurred trying to convert the over the wire
    334  * attributes to a vattr, then simply mark the attribute cache as
    335  * timed out.
    336  */
    337 void
    338 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
    339 {
    340 	vattr_t bva;
    341 	vattr_t ava;
    342 
    343 	if (wccp->after.attributes) {
    344 		if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
    345 			PURGE_ATTRCACHE(vp);
    346 			return;
    347 		}
    348 		if (wccp->before.attributes) {
    349 			bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
    350 			bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
    351 			bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
    352 			bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
    353 			bva.va_size = wccp->before.attr.size;
    354 			nfs3_attr_cache(vp, &bva, &ava, t, cr);
    355 		} else
    356 			nfs_attr_cache(vp, &ava, t, cr);
    357 	} else {
    358 		PURGE_ATTRCACHE(vp);
    359 	}
    360 }
    361 
    362 /*
    363  * Set attributes cache for given vnode using nfsattr.
    364  *
    365  * This routine does not do cache validation with the attributes.
    366  *
    367  * If an error occurred trying to convert the over the wire
    368  * attributes to a vattr, then simply mark the attribute cache as
    369  * timed out.
    370  */
    371 void
    372 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
    373 {
    374 	rnode_t *rp;
    375 	struct vattr va;
    376 
    377 	if (!nattr_to_vattr(vp, na, &va)) {
    378 		rp = VTOR(vp);
    379 		mutex_enter(&rp->r_statelock);
    380 		if (rp->r_mtime <= t)
    381 			nfs_attrcache_va(vp, &va);
    382 		mutex_exit(&rp->r_statelock);
    383 	} else {
    384 		PURGE_ATTRCACHE(vp);
    385 	}
    386 }
    387 
    388 /*
    389  * Set attributes cache for given vnode using fattr3.
    390  *
    391  * This routine does not do cache validation with the attributes.
    392  *
    393  * If an error occurred trying to convert the over the wire
    394  * attributes to a vattr, then simply mark the attribute cache as
    395  * timed out.
    396  */
    397 void
    398 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
    399 {
    400 	rnode_t *rp;
    401 	struct vattr va;
    402 
    403 	if (!fattr3_to_vattr(vp, na, &va)) {
    404 		rp = VTOR(vp);
    405 		mutex_enter(&rp->r_statelock);
    406 		if (rp->r_mtime <= t)
    407 			nfs_attrcache_va(vp, &va);
    408 		mutex_exit(&rp->r_statelock);
    409 	} else {
    410 		PURGE_ATTRCACHE(vp);
    411 	}
    412 }
    413 
    414 /*
    415  * Do a cache check based on attributes returned over the wire.  The
    416  * new attributes are cached.
    417  *
    418  * If an error occurred trying to convert the over the wire attributes
    419  * to a vattr, then just return that error.
    420  *
    421  * As a side affect, the vattr argument is filled in with the converted
    422  * attributes.
    423  */
    424 int
    425 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
    426     cred_t *cr)
    427 {
    428 	int error;
    429 
    430 	error = nattr_to_vattr(vp, na, vap);
    431 	if (error)
    432 		return (error);
    433 	nfs_attr_cache(vp, vap, t, cr);
    434 	return (0);
    435 }
    436 
    437 /*
    438  * Do a cache check based on attributes returned over the wire.  The
    439  * new attributes are cached.
    440  *
    441  * If an error occurred trying to convert the over the wire attributes
    442  * to a vattr, then just return that error.
    443  *
    444  * As a side affect, the vattr argument is filled in with the converted
    445  * attributes.
    446  */
    447 int
    448 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
    449 {
    450 	int error;
    451 
    452 	error = fattr3_to_vattr(vp, na, vap);
    453 	if (error)
    454 		return (error);
    455 	nfs_attr_cache(vp, vap, t, cr);
    456 	return (0);
    457 }
    458 
    459 /*
    460  * Use the passed in virtual attributes to check to see whether the
    461  * data and metadata caches are valid, cache the new attributes, and
    462  * then do the cache invalidation if required.
    463  *
    464  * The cache validation and caching of the new attributes is done
    465  * atomically via the use of the mutex, r_statelock.  If required,
    466  * the cache invalidation is done atomically w.r.t. the cache
    467  * validation and caching of the attributes via the pseudo lock,
    468  * r_serial.
    469  *
    470  * This routine is used to do cache validation and attributes caching
    471  * for operations with a single set of post operation attributes.
         *
         * vp  - vnode whose rnode caches are checked and updated
         * vap - new attributes; used both for the validity checks and as
         *       the attributes to cache
         * t   - hrtime compared against r_mtime; the over-the-wire getattr
         *       routines in this file sample it with gethrtime() just
         *       before issuing the request
         * cr  - credentials passed on to nfs_purge_caches()
         *
         * Nothing is returned; if the wait for r_serial is interrupted the
         * routine returns without checking or caching anything.
    472  */
    473 void
    474 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
    475 {
    476 	rnode_t *rp;
    477 	int mtime_changed = 0;
    478 	int ctime_changed = 0;
    479 	vsecattr_t *vsp;
    480 	int was_serial;
    481 	len_t preattr_rsize;
    482 	boolean_t writeattr_set = B_FALSE;
    483 	boolean_t cachepurge_set = B_FALSE;
    484 
    485 	rp = VTOR(vp);
    486 
    487 	mutex_enter(&rp->r_statelock);
    488 
        	/*
        	 * If some other thread owns r_serial, wait on r_cv until it is
        	 * released.  If curthread already owns it, remember that in
        	 * was_serial so that r_serial is not cleared on the way out.
        	 * NOTE(review): lwp_nostop is raised around the wait,
        	 * presumably to keep the lwp from being stopped while it
        	 * sleeps here -- confirm.
        	 */
    489 	if (rp->r_serial != curthread) {
    490 		klwp_t *lwp = ttolwp(curthread);
    491 
    492 		was_serial = 0;
    493 		if (lwp != NULL)
    494 			lwp->lwp_nostop++;
    495 		while (rp->r_serial != NULL) {
    496 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
        				/* Interrupted: give up without caching. */
    497 				mutex_exit(&rp->r_statelock);
    498 				if (lwp != NULL)
    499 					lwp->lwp_nostop--;
    500 				return;
    501 			}
    502 		}
    503 		if (lwp != NULL)
    504 			lwp->lwp_nostop--;
    505 	} else
    506 		was_serial = 1;
    507 
        	/*
        	 * r_mtime is later than t: do not cache these attributes.
        	 * Only time out the attribute cache if CACHE_VALID() rejects
        	 * their mtime/size.
        	 */
    508 	if (rp->r_mtime > t) {
    509 		if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
    510 			PURGE_ATTRCACHE_LOCKED(rp);
    511 		mutex_exit(&rp->r_statelock);
    512 		return;
    513 	}
    514 
    515 	/*
    516 	 * Write thread after writing data to file on remote server,
    517 	 * will always set RWRITEATTR to indicate that file on remote
    518 	 * server was modified with a WRITE operation and would have
    519 	 * marked attribute cache as timed out. If RWRITEATTR
    520 	 * is set, then do not check for mtime and ctime change.
    521 	 */
    522 	if (!(rp->r_flags & RWRITEATTR)) {
    523 		if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
    524 			mtime_changed = 1;
    525 
    526 		if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
    527 		    rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
    528 			ctime_changed = 1;
    529 	} else {
    530 		writeattr_set = B_TRUE;
    531 	}
    532 
        	/* Remember r_size so a size change made by the call below shows. */
    533 	preattr_rsize = rp->r_size;
    534 
    535 	nfs_attrcache_va(vp, vap);
    536 
    537 	/*
    538 	 * If we have updated filesize in nfs_attrcache_va, as soon as we
    539 	 * drop statelock we will be in transition of purging all
    540 	 * our caches and updating them. It is possible for another
    541 	 * thread to pick this new file size and read in zeroed data.
    542 	 * stall other threads till cache purge is complete.
    543 	 */
    544 	if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
    545 		/*
    546 		 * If RWRITEATTR was set and we have updated the file
    547 		 * size, Server's returned file size need not necessarily
    548 		 * be because of this Client's WRITE. We need to purge
    549 		 * all caches.
    550 		 */
    551 		if (writeattr_set)
    552 			mtime_changed = 1;
    553 
        		/*
        		 * cachepurge_set records that this call set
        		 * RINCACHEPURGE, so only this call clears it below.
        		 */
    554 		if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
    555 			rp->r_flags |= RINCACHEPURGE;
    556 			cachepurge_set = B_TRUE;
    557 		}
    558 	}
    559 
        	/* Nothing changed: the new attributes are cached, we are done. */
    560 	if (!mtime_changed && !ctime_changed) {
    561 		mutex_exit(&rp->r_statelock);
    562 		return;
    563 	}
    564 
        	/*
        	 * Take the r_serial pseudo lock before dropping r_statelock, so
        	 * that threads waiting for r_serial to become NULL stay blocked
        	 * while the caches are purged below.
        	 */
    565 	rp->r_serial = curthread;
    566 
    567 	mutex_exit(&rp->r_statelock);
    568 
    569 	if (mtime_changed)
    570 		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
    571 
    572 	if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
    573 		mutex_enter(&rp->r_statelock);
    574 		rp->r_flags &= ~RINCACHEPURGE;
    575 		cv_broadcast(&rp->r_cv);
    576 		mutex_exit(&rp->r_statelock);
    577 		cachepurge_set = B_FALSE;
    578 	}
    579 
        	/*
        	 * ctime changed: call nfs_access_purge_rp() and free any cached
        	 * r_secattr.  r_secattr is tested without the lock first, then
        	 * re-read and detached under r_statelock, hence the second
        	 * NULL check before freeing.
        	 */
    580 	if (ctime_changed) {
    581 		(void) nfs_access_purge_rp(rp);
    582 		if (rp->r_secattr != NULL) {
    583 			mutex_enter(&rp->r_statelock);
    584 			vsp = rp->r_secattr;
    585 			rp->r_secattr = NULL;
    586 			mutex_exit(&rp->r_statelock);
    587 			if (vsp != NULL)
    588 				nfs_acl_free(vsp);
    589 		}
    590 	}
    591 
        	/*
        	 * Release r_serial and wake the waiters, unless this thread
        	 * already owned r_serial when it entered this routine.
        	 */
    592 	if (!was_serial) {
    593 		mutex_enter(&rp->r_statelock);
    594 		rp->r_serial = NULL;
    595 		cv_broadcast(&rp->r_cv);
    596 		mutex_exit(&rp->r_statelock);
    597 	}
    598 }
    599 
    600 /*
    601  * Use the passed in "before" virtual attributes to check to see
    602  * whether the data and metadata caches are valid, cache the "after"
    603  * new attributes, and then do the cache invalidation if required.
    604  *
    605  * The cache validation and caching of the new attributes is done
    606  * atomically via the use of the mutex, r_statelock.  If required,
    607  * the cache invalidation is done atomically w.r.t. the cache
    608  * validation and caching of the attributes via the pseudo lock,
    609  * r_serial.
    610  *
    611  * This routine is used to do cache validation and attributes caching
    612  * for operations with both pre operation attributes and post operation
    613  * attributes.
         *
         * vp   - vnode whose rnode caches are checked and updated
         * bvap - "before" attributes; only va_mtime, va_size and va_ctime
         *        are read here
         * avap - "after" attributes; these are the ones that get cached
         * t    - hrtime compared against r_mtime
         * cr   - credentials passed on to nfs_purge_caches()
         *
         * Nothing is returned; if the wait for r_serial is interrupted the
         * routine returns without checking or caching anything.
    614  */
    615 static void
    616 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
    617     cred_t *cr)
    618 {
    619 	rnode_t *rp;
    620 	int mtime_changed = 0;
    621 	int ctime_changed = 0;
    622 	vsecattr_t *vsp;
    623 	int was_serial;
    624 	len_t preattr_rsize;
    625 	boolean_t writeattr_set = B_FALSE;
    626 	boolean_t cachepurge_set = B_FALSE;
    627 
    628 	rp = VTOR(vp);
    629 
    630 	mutex_enter(&rp->r_statelock);
    631 
        	/*
        	 * If some other thread owns r_serial, wait on r_cv until it is
        	 * released.  If curthread already owns it, remember that in
        	 * was_serial so that r_serial is not cleared on the way out.
        	 * NOTE(review): lwp_nostop is raised around the wait,
        	 * presumably to keep the lwp from being stopped while it
        	 * sleeps here -- confirm.
        	 */
    632 	if (rp->r_serial != curthread) {
    633 		klwp_t *lwp = ttolwp(curthread);
    634 
    635 		was_serial = 0;
    636 		if (lwp != NULL)
    637 			lwp->lwp_nostop++;
    638 		while (rp->r_serial != NULL) {
    639 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
        				/* Interrupted: give up without caching. */
    640 				mutex_exit(&rp->r_statelock);
    641 				if (lwp != NULL)
    642 					lwp->lwp_nostop--;
    643 				return;
    644 			}
    645 		}
    646 		if (lwp != NULL)
    647 			lwp->lwp_nostop--;
    648 	} else
    649 		was_serial = 1;
    650 
        	/*
        	 * r_mtime is later than t: do not cache these attributes.
        	 * Only time out the attribute cache if CACHE_VALID() rejects
        	 * the "after" mtime/size.
        	 */
    651 	if (rp->r_mtime > t) {
    652 		if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size))
    653 			PURGE_ATTRCACHE_LOCKED(rp);
    654 		mutex_exit(&rp->r_statelock);
    655 		return;
    656 	}
    657 
    658 	/*
    659 	 * Write thread after writing data to file on remote server,
    660 	 * will always set RWRITEATTR to indicate that file on remote
    661 	 * server was modified with a WRITE operation and would have
    662 	 * marked attribute cache as timed out. If RWRITEATTR
    663 	 * is set, then do not check for mtime and ctime change.
    664 	 */
    665 	if (!(rp->r_flags & RWRITEATTR)) {
        		/* The validity checks use the "before" attributes. */
    666 		if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
    667 			mtime_changed = 1;
    668 
    669 		if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
    670 		    rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
    671 			ctime_changed = 1;
    672 	} else {
    673 		writeattr_set = B_TRUE;
    674 	}
    675 
        	/* Remember r_size so a size change made by the call below shows. */
    676 	preattr_rsize = rp->r_size;
    677 
        	/* The "after" attributes are the ones that get cached. */
    678 	nfs_attrcache_va(vp, avap);
    679 
    680 	/*
    681 	 * If we have updated filesize in nfs_attrcache_va, as soon as we
    682 	 * drop statelock we will be in transition of purging all
    683 	 * our caches and updating them. It is possible for another
    684 	 * thread to pick this new file size and read in zeroed data.
    685 	 * stall other threads till cache purge is complete.
    686 	 */
    687 	if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
    688 		/*
    689 		 * If RWRITEATTR was set and we have updated the file
    690 		 * size, Server's returned file size need not necessarily
    691 		 * be because of this Client's WRITE. We need to purge
    692 		 * all caches.
    693 		 */
    694 		if (writeattr_set)
    695 			mtime_changed = 1;
    696 
        		/*
        		 * cachepurge_set records that this call set
        		 * RINCACHEPURGE, so only this call clears it below.
        		 */
    697 		if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
    698 			rp->r_flags |= RINCACHEPURGE;
    699 			cachepurge_set = B_TRUE;
    700 		}
    701 	}
    702 
        	/* Nothing changed: the new attributes are cached, we are done. */
    703 	if (!mtime_changed && !ctime_changed) {
    704 		mutex_exit(&rp->r_statelock);
    705 		return;
    706 	}
    707 
        	/*
        	 * Take the r_serial pseudo lock before dropping r_statelock, so
        	 * that threads waiting for r_serial to become NULL stay blocked
        	 * while the caches are purged below.
        	 */
    708 	rp->r_serial = curthread;
    709 
    710 	mutex_exit(&rp->r_statelock);
    711 
    712 	if (mtime_changed)
    713 		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
    714 
    715 	if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
    716 		mutex_enter(&rp->r_statelock);
    717 		rp->r_flags &= ~RINCACHEPURGE;
    718 		cv_broadcast(&rp->r_cv);
    719 		mutex_exit(&rp->r_statelock);
    720 		cachepurge_set = B_FALSE;
    721 	}
    722 
        	/*
        	 * ctime changed: call nfs_access_purge_rp() and free any cached
        	 * r_secattr.  r_secattr is tested without the lock first, then
        	 * re-read and detached under r_statelock, hence the second
        	 * NULL check before freeing.
        	 */
    723 	if (ctime_changed) {
    724 		(void) nfs_access_purge_rp(rp);
    725 		if (rp->r_secattr != NULL) {
    726 			mutex_enter(&rp->r_statelock);
    727 			vsp = rp->r_secattr;
    728 			rp->r_secattr = NULL;
    729 			mutex_exit(&rp->r_statelock);
    730 			if (vsp != NULL)
    731 				nfs_acl_free(vsp);
    732 		}
    733 	}
    734 
        	/*
        	 * Release r_serial and wake the waiters, unless this thread
        	 * already owned r_serial when it entered this routine.
        	 */
    735 	if (!was_serial) {
    736 		mutex_enter(&rp->r_statelock);
    737 		rp->r_serial = NULL;
    738 		cv_broadcast(&rp->r_cv);
    739 		mutex_exit(&rp->r_statelock);
    740 	}
    741 }
    742 
    743 /*
    744  * Set attributes cache for given vnode using virtual attributes.
    745  *
    746  * Set the timeout value on the attribute cache and fill it
    747  * with the passed in attributes.
    748  *
    749  * The caller must be holding r_statelock.
    750  */
    751 void
    752 nfs_attrcache_va(vnode_t *vp, struct vattr *va)
    753 {
    754 	rnode_t *rp;
    755 	mntinfo_t *mi;
    756 	hrtime_t delta;
    757 	hrtime_t now;
    758 
    759 	rp = VTOR(vp);
    760 
    761 	ASSERT(MUTEX_HELD(&rp->r_statelock));
    762 
    763 	now = gethrtime();
    764 
    765 	mi = VTOMI(vp);
    766 
    767 	/*
    768 	 * Delta is the number of nanoseconds that we will
    769 	 * cache the attributes of the file.  It is based on
    770 	 * the number of nanoseconds since the last time that
    771 	 * we detected a change.  The assumption is that files
    772 	 * that changed recently are likely to change again.
    773 	 * There is a minimum and a maximum for regular files
    774 	 * and for directories which is enforced though.
    775 	 *
    776 	 * Using the time since last change was detected
    777 	 * eliminates direct comparison or calculation
    778 	 * using mixed client and server times.  NFS does
    779 	 * not make any assumptions regarding the client
    780 	 * and server clocks being synchronized.
    781 	 */
    782 	if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
    783 	    va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
    784 	    va->va_size != rp->r_attr.va_size)
    785 		rp->r_mtime = now;
    786 
    787 	if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
    788 		delta = 0;
    789 	else {
    790 		delta = now - rp->r_mtime;
    791 		if (vp->v_type == VDIR) {
    792 			if (delta < mi->mi_acdirmin)
    793 				delta = mi->mi_acdirmin;
    794 			else if (delta > mi->mi_acdirmax)
    795 				delta = mi->mi_acdirmax;
    796 		} else {
    797 			if (delta < mi->mi_acregmin)
    798 				delta = mi->mi_acregmin;
    799 			else if (delta > mi->mi_acregmax)
    800 				delta = mi->mi_acregmax;
    801 		}
    802 	}
    803 	rp->r_attrtime = now + delta;
    804 	rp->r_attr = *va;
    805 	/*
    806 	 * Update the size of the file if there is no cached data or if
    807 	 * the cached data is clean and there is no data being written
    808 	 * out.
    809 	 */
    810 	if (rp->r_size != va->va_size &&
    811 	    (!vn_has_cached_data(vp) ||
    812 	    (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
    813 		rp->r_size = va->va_size;
    814 	nfs_setswaplike(vp, va);
    815 	rp->r_flags &= ~RWRITEATTR;
    816 }
    817 
    818 /*
    819  * Fill in attribute from the cache.
    820  * If valid, then return 0 to indicate that no error occurred,
    821  * otherwise return 1 to indicate that an error occurred.
    822  */
    823 static int
    824 nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
    825 {
    826 	rnode_t *rp;
    827 	uint_t mask = vap->va_mask;
    828 
    829 	rp = VTOR(vp);
    830 	mutex_enter(&rp->r_statelock);
    831 	if (ATTRCACHE_VALID(vp)) {
    832 		/*
    833 		 * Cached attributes are valid
    834 		 */
    835 		*vap = rp->r_attr;
    836 		/*
    837 		 * Set the caller's va_mask to the set of attributes
    838 		 * that were requested ANDed with the attributes that
    839 		 * are available.  If attributes were requested that
    840 		 * are not available, those bits must be turned off
    841 		 * in the callers va_mask.
    842 		 */
    843 		vap->va_mask &= mask;
    844 		mutex_exit(&rp->r_statelock);
    845 		return (0);
    846 	}
    847 	mutex_exit(&rp->r_statelock);
    848 	return (1);
    849 }
    850 
    851 /*
    852  * Get attributes over-the-wire and update attributes cache
    853  * if no error occurred in the over-the-wire operation.
    854  * Return 0 if successful, otherwise error.
    855  */
    856 int
    857 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
    858 {
    859 	int error;
    860 	struct nfsattrstat ns;
    861 	int douprintf;
    862 	mntinfo_t *mi;
    863 	failinfo_t fi;
    864 	hrtime_t t;
    865 
    866 	mi = VTOMI(vp);
    867 	fi.vp = vp;
    868 	fi.fhp = NULL;		/* no need to update, filehandle not copied */
    869 	fi.copyproc = nfscopyfh;
    870 	fi.lookupproc = nfslookup;
    871 	fi.xattrdirproc = acl_getxattrdir2;
    872 
    873 	if (mi->mi_flags & MI_ACL) {
    874 		error = acl_getattr2_otw(vp, vap, cr);
    875 		if (mi->mi_flags & MI_ACL)
    876 			return (error);
    877 	}
    878 
    879 	douprintf = 1;
    880 
    881 	t = gethrtime();
    882 
    883 	error = rfs2call(mi, RFS_GETATTR,
    884 	    xdr_fhandle, (caddr_t)VTOFH(vp),
    885 	    xdr_attrstat, (caddr_t)&ns, cr,
    886 	    &douprintf, &ns.ns_status, 0, &fi);
    887 
    888 	if (!error) {
    889 		error = geterrno(ns.ns_status);
    890 		if (!error)
    891 			error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
    892 		else {
    893 			PURGE_STALE_FH(error, vp, cr);
    894 		}
    895 	}
    896 
    897 	return (error);
    898 }
    899 
    900 /*
    901  * Return either cached ot remote attributes. If get remote attr
    902  * use them to check and invalidate caches, then cache the new attributes.
    903  */
    904 int
    905 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
    906 {
    907 	int error;
    908 	rnode_t *rp;
    909 
    910 	/*
    911 	 * If we've got cached attributes, we're done, otherwise go
    912 	 * to the server to get attributes, which will update the cache
    913 	 * in the process.
    914 	 */
    915 	error = nfs_getattr_cache(vp, vap);
    916 	if (error)
    917 		error = nfs_getattr_otw(vp, vap, cr);
    918 
    919 	/* Return the client's view of file size */
    920 	rp = VTOR(vp);
    921 	mutex_enter(&rp->r_statelock);
    922 	vap->va_size = rp->r_size;
    923 	mutex_exit(&rp->r_statelock);
    924 
    925 	return (error);
    926 }
    927 
    928 /*
    929  * Get attributes over-the-wire and update attributes cache
    930  * if no error occurred in the over-the-wire operation.
    931  * Return 0 if successful, otherwise error.
    932  */
    933 int
    934 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
    935 {
    936 	int error;
    937 	GETATTR3args args;
    938 	GETATTR3vres res;
    939 	int douprintf;
    940 	failinfo_t fi;
    941 	hrtime_t t;
    942 
    943 	args.object = *VTOFH3(vp);
    944 	fi.vp = vp;
    945 	fi.fhp = (caddr_t)&args.object;
    946 	fi.copyproc = nfs3copyfh;
    947 	fi.lookupproc = nfs3lookup;
    948 	fi.xattrdirproc = acl_getxattrdir3;
    949 	res.fres.vp = vp;
    950 	res.fres.vap = vap;
    951 
    952 	douprintf = 1;
    953 
    954 	t = gethrtime();
    955 
    956 	error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
    957 	    xdr_nfs_fh3, (caddr_t)&args,
    958 	    xdr_GETATTR3vres, (caddr_t)&res, cr,
    959 	    &douprintf, &res.status, 0, &fi);
    960 
    961 	if (error)
    962 		return (error);
    963 
    964 	error = geterrno3(res.status);
    965 	if (error) {
    966 		PURGE_STALE_FH(error, vp, cr);
    967 		return (error);
    968 	}
    969 
    970 	/*
    971 	 * Catch status codes that indicate fattr3 to vattr translation failure
    972 	 */
    973 	if (res.fres.status)
    974 		return (res.fres.status);
    975 
    976 	nfs_attr_cache(vp, vap, t, cr);
    977 	return (0);
    978 }
    979 
    980 /*
    981  * Return either cached or remote attributes. If get remote attr
    982  * use them to check and invalidate caches, then cache the new attributes.
    983  */
    984 int
    985 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
    986 {
    987 	int error;
    988 	rnode_t *rp;
    989 
    990 	/*
    991 	 * If we've got cached attributes, we're done, otherwise go
    992 	 * to the server to get attributes, which will update the cache
    993 	 * in the process.
    994 	 */
    995 	error = nfs_getattr_cache(vp, vap);
    996 	if (error)
    997 		error = nfs3_getattr_otw(vp, vap, cr);
    998 
    999 	/* Return the client's view of file size */
   1000 	rp = VTOR(vp);
   1001 	mutex_enter(&rp->r_statelock);
   1002 	vap->va_size = rp->r_size;
   1003 	mutex_exit(&rp->r_statelock);
   1004 
   1005 	return (error);
   1006 }
   1007 
   1008 vtype_t nf_to_vt[] = {
   1009 	VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
   1010 };
   1011 /*
   1012  * Convert NFS Version 2 over the network attributes to the local
   1013  * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
   1014  * network representation and the local representation is done here.
   1015  * Returns 0 for success, error if failed due to overflow.
   1016  */
   1017 int
   1018 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
   1019 {
   1020 	/* overflow in time attributes? */
   1021 #ifndef _LP64
   1022 	if (!NFS2_FATTR_TIME_OK(na))
   1023 		return (EOVERFLOW);
   1024 #endif
   1025 
   1026 	vap->va_mask = AT_ALL;
   1027 
   1028 	if (na->na_type < NFNON || na->na_type > NFSOC)
   1029 		vap->va_type = VBAD;
   1030 	else
   1031 		vap->va_type = nf_to_vt[na->na_type];
   1032 	vap->va_mode = na->na_mode;
   1033 	vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
   1034 	vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
   1035 	vap->va_fsid = vp->v_vfsp->vfs_dev;
   1036 	vap->va_nodeid = na->na_nodeid;
   1037 	vap->va_nlink = na->na_nlink;
   1038 	vap->va_size = na->na_size;	/* keep for cache validation */
   1039 	/*
   1040 	 * nfs protocol defines times as unsigned so don't extend sign,
   1041 	 * unless sysadmin set nfs_allow_preepoch_time.
   1042 	 */
   1043 	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
   1044 	vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
   1045 	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
   1046 	vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
   1047 	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
   1048 	vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
   1049 	/*
   1050 	 * Shannon's law - uncompress the received dev_t
   1051 	 * if the top half of is zero indicating a response
   1052 	 * from an `older style' OS. Except for when it is a
   1053 	 * `new style' OS sending the maj device of zero,
   1054 	 * in which case the algorithm still works because the
   1055 	 * fact that it is a new style server
   1056 	 * is hidden by the minor device not being greater
   1057 	 * than 255 (a requirement in this case).
   1058 	 */
   1059 	if ((na->na_rdev & 0xffff0000) == 0)
   1060 		vap->va_rdev = nfsv2_expdev(na->na_rdev);
   1061 	else
   1062 		vap->va_rdev = expldev(na->na_rdev);
   1063 
   1064 	vap->va_nblocks = na->na_blocks;
   1065 	switch (na->na_type) {
   1066 	case NFBLK:
   1067 		vap->va_blksize = DEV_BSIZE;
   1068 		break;
   1069 
   1070 	case NFCHR:
   1071 		vap->va_blksize = MAXBSIZE;
   1072 		break;
   1073 
   1074 	case NFSOC:
   1075 	default:
   1076 		vap->va_blksize = na->na_blocksize;
   1077 		break;
   1078 	}
   1079 	/*
   1080 	 * This bit of ugliness is a hack to preserve the
   1081 	 * over-the-wire protocols for named-pipe vnodes.
   1082 	 * It remaps the special over-the-wire type to the
   1083 	 * VFIFO type. (see note in nfs.h)
   1084 	 */
   1085 	if (NA_ISFIFO(na)) {
   1086 		vap->va_type = VFIFO;
   1087 		vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
   1088 		vap->va_rdev = 0;
   1089 		vap->va_blksize = na->na_blocksize;
   1090 	}
   1091 	vap->va_seq = 0;
   1092 	return (0);
   1093 }
   1094 
   1095 /*
   1096  * Convert NFS Version 3 over the network attributes to the local
   1097  * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
   1098  * network representation and the local representation is done here.
   1099  */
   1100 vtype_t nf3_to_vt[] = {
   1101 	VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
   1102 };
   1103 
   1104 int
   1105 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
   1106 {
   1107 
   1108 #ifndef _LP64
   1109 	/* overflow in time attributes? */
   1110 	if (!NFS3_FATTR_TIME_OK(na))
   1111 		return (EOVERFLOW);
   1112 #endif
   1113 	if (!NFS3_SIZE_OK(na->size))
   1114 		/* file too big */
   1115 		return (EFBIG);
   1116 
   1117 	vap->va_mask = AT_ALL;
   1118 
   1119 	if (na->type < NF3REG || na->type > NF3FIFO)
   1120 		vap->va_type = VBAD;
   1121 	else
   1122 		vap->va_type = nf3_to_vt[na->type];
   1123 	vap->va_mode = na->mode;
   1124 	vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
   1125 	vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
   1126 	vap->va_fsid = vp->v_vfsp->vfs_dev;
   1127 	vap->va_nodeid = na->fileid;
   1128 	vap->va_nlink = na->nlink;
   1129 	vap->va_size = na->size;
   1130 
   1131 	/*
   1132 	 * nfs protocol defines times as unsigned so don't extend sign,
   1133 	 * unless sysadmin set nfs_allow_preepoch_time.
   1134 	 */
   1135 	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
   1136 	vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
   1137 	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
   1138 	vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
   1139 	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
   1140 	vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;
   1141 
   1142 	switch (na->type) {
   1143 	case NF3BLK:
   1144 		vap->va_rdev = makedevice(na->rdev.specdata1,
   1145 		    na->rdev.specdata2);
   1146 		vap->va_blksize = DEV_BSIZE;
   1147 		vap->va_nblocks = 0;
   1148 		break;
   1149 	case NF3CHR:
   1150 		vap->va_rdev = makedevice(na->rdev.specdata1,
   1151 		    na->rdev.specdata2);
   1152 		vap->va_blksize = MAXBSIZE;
   1153 		vap->va_nblocks = 0;
   1154 		break;
   1155 	case NF3REG:
   1156 	case NF3DIR:
   1157 	case NF3LNK:
   1158 		vap->va_rdev = 0;
   1159 		vap->va_blksize = MAXBSIZE;
   1160 		vap->va_nblocks = (u_longlong_t)
   1161 		    ((na->used + (size3)DEV_BSIZE - (size3)1) /
   1162 		    (size3)DEV_BSIZE);
   1163 		break;
   1164 	case NF3SOCK:
   1165 	case NF3FIFO:
   1166 	default:
   1167 		vap->va_rdev = 0;
   1168 		vap->va_blksize = MAXBSIZE;
   1169 		vap->va_nblocks = 0;
   1170 		break;
   1171 	}
   1172 	vap->va_seq = 0;
   1173 	return (0);
   1174 }
   1175 
   1176 /*
   1177  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
   1178  * for the demand-based allocation of async threads per-mount.  The
   1179  * nfs_async_timeout is the amount of time a thread will live after it
   1180  * becomes idle, unless new I/O requests are received before the thread
   1181  * dies.  See nfs_async_putpage and nfs_async_start.
   1182  */
   1183 
/* Idle lifetime of an async worker thread; -1 marks it as not yet set. */
int nfs_async_timeout = -1;	/* uninitialized */

/*
 * Forward declarations.  nfs_async_start() and nfs_async_pgops_start() are
 * the thread entry points that nfs_async_manager() hands to
 * zthread_create().
 */
static void	nfs_async_start(struct vfs *);
static void	nfs_async_pgops_start(struct vfs *);
static void	nfs_async_common_start(struct vfs *, int);
   1189 
   1190 static void
   1191 free_async_args(struct nfs_async_reqs *args)
   1192 {
   1193 	rnode_t *rp;
   1194 
   1195 	if (args->a_io != NFS_INACTIVE) {
   1196 		rp = VTOR(args->a_vp);
   1197 		mutex_enter(&rp->r_statelock);
   1198 		rp->r_count--;
   1199 		if (args->a_io == NFS_PUTAPAGE ||
   1200 		    args->a_io == NFS_PAGEIO)
   1201 			rp->r_awcount--;
   1202 		cv_broadcast(&rp->r_cv);
   1203 		mutex_exit(&rp->r_statelock);
   1204 		VN_RELE(args->a_vp);
   1205 	}
   1206 	crfree(args->a_cred);
   1207 	kmem_free(args, sizeof (*args));
   1208 }
   1209 
   1210 /*
   1211  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
   1212  * pageout(), running in the global zone, have legitimate reasons to do
   1213  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
   1214  * use of a a per-mount "asynchronous requests manager thread" which is
   1215  * signaled by the various asynchronous work routines when there is
   1216  * asynchronous work to be done.  It is responsible for creating new
   1217  * worker threads if necessary, and notifying existing worker threads
   1218  * that there is work to be done.
   1219  *
   1220  * In other words, it will "take the specifications from the customers and
   1221  * give them to the engineers."
   1222  *
   1223  * Worker threads die off of their own accord if they are no longer
   1224  * needed.
   1225  *
   1226  * This thread is killed when the zone is going away or the filesystem
   1227  * is being unmounted.
   1228  */
   1229 void
   1230 nfs_async_manager(vfs_t *vfsp)
   1231 {
   1232 	callb_cpr_t cprinfo;
   1233 	mntinfo_t *mi;
   1234 	uint_t max_threads;
   1235 
   1236 	mi = VFTOMI(vfsp);
   1237 
   1238 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
   1239 	    "nfs_async_manager");
   1240 
   1241 	mutex_enter(&mi->mi_async_lock);
   1242 	/*
   1243 	 * We want to stash the max number of threads that this mount was
   1244 	 * allowed so we can use it later when the variable is set to zero as
   1245 	 * part of the zone/mount going away.
   1246 	 *
   1247 	 * We want to be able to create at least one thread to handle
   1248 	 * asynchronous inactive calls.
   1249 	 */
   1250 	max_threads = MAX(mi->mi_max_threads, 1);
   1251 	/*
   1252 	 * We don't want to wait for mi_max_threads to go to zero, since that
   1253 	 * happens as part of a failed unmount, but this thread should only
   1254 	 * exit when the mount/zone is really going away.
   1255 	 *
   1256 	 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
   1257 	 * attempted: the various _async_*() functions know to do things
   1258 	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
   1259 	 * outstanding requests.
   1260 	 *
   1261 	 * Note that we still create zthreads even if we notice the zone is
   1262 	 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
   1263 	 * shutdown sequence to take slightly longer in some cases, but
   1264 	 * doesn't violate the protocol, as all threads will exit as soon as
   1265 	 * they're done processing the remaining requests.
   1266 	 */
   1267 	for (;;) {
   1268 		while (mi->mi_async_req_count > 0) {
   1269 			/*
   1270 			 * Paranoia: If the mount started out having
   1271 			 * (mi->mi_max_threads == 0), and the value was
   1272 			 * later changed (via a debugger or somesuch),
   1273 			 * we could be confused since we will think we
   1274 			 * can't create any threads, and the calling
   1275 			 * code (which looks at the current value of
   1276 			 * mi->mi_max_threads, now non-zero) thinks we
   1277 			 * can.
   1278 			 *
   1279 			 * So, because we're paranoid, we create threads
   1280 			 * up to the maximum of the original and the
   1281 			 * current value. This means that future
   1282 			 * (debugger-induced) lowerings of
   1283 			 * mi->mi_max_threads are ignored for our
   1284 			 * purposes, but who told them they could change
   1285 			 * random values on a live kernel anyhow?
   1286 			 */
   1287 			if (mi->mi_threads[NFS_ASYNC_QUEUE] <
   1288 			    MAX(mi->mi_max_threads, max_threads)) {
   1289 				mi->mi_threads[NFS_ASYNC_QUEUE]++;
   1290 				mutex_exit(&mi->mi_async_lock);
   1291 				VFS_HOLD(vfsp);	/* hold for new thread */
   1292 				(void) zthread_create(NULL, 0, nfs_async_start,
   1293 				    vfsp, 0, minclsyspri);
   1294 				mutex_enter(&mi->mi_async_lock);
   1295 			} else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] <
   1296 			    NUM_ASYNC_PGOPS_THREADS) {
   1297 				mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++;
   1298 				mutex_exit(&mi->mi_async_lock);
   1299 				VFS_HOLD(vfsp); /* hold for new thread */
   1300 				(void) zthread_create(NULL, 0,
   1301 				    nfs_async_pgops_start, vfsp, 0,
   1302 				    minclsyspri);
   1303 				mutex_enter(&mi->mi_async_lock);
   1304 			}
   1305 			NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
   1306 			ASSERT(mi->mi_async_req_count != 0);
   1307 			mi->mi_async_req_count--;
   1308 		}
   1309 
   1310 		mutex_enter(&mi->mi_lock);
   1311 		if (mi->mi_flags & MI_ASYNC_MGR_STOP) {
   1312 			mutex_exit(&mi->mi_lock);
   1313 			break;
   1314 		}
   1315 		mutex_exit(&mi->mi_lock);
   1316 
   1317 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   1318 		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
   1319 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
   1320 	}
   1321 	/*
   1322 	 * Let everyone know we're done.
   1323 	 */
   1324 	mi->mi_manager_thread = NULL;
   1325 	cv_broadcast(&mi->mi_async_cv);
   1326 
   1327 	/*
   1328 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
   1329 	 * since CALLB_CPR_EXIT is actually responsible for releasing
   1330 	 * 'mi_async_lock'.
   1331 	 */
   1332 	CALLB_CPR_EXIT(&cprinfo);
   1333 	VFS_RELE(vfsp);	/* release thread's hold */
   1334 	zthread_exit();
   1335 }
   1336 
   1337 /*
   1338  * Signal (and wait for) the async manager thread to clean up and go away.
   1339  */
   1340 void
   1341 nfs_async_manager_stop(vfs_t *vfsp)
   1342 {
   1343 	mntinfo_t *mi = VFTOMI(vfsp);
   1344 
   1345 	mutex_enter(&mi->mi_async_lock);
   1346 	mutex_enter(&mi->mi_lock);
   1347 	mi->mi_flags |= MI_ASYNC_MGR_STOP;
   1348 	mutex_exit(&mi->mi_lock);
   1349 	cv_broadcast(&mi->mi_async_reqs_cv);
   1350 	while (mi->mi_manager_thread != NULL)
   1351 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
   1352 	mutex_exit(&mi->mi_async_lock);
   1353 }
   1354 
   1355 int
   1356 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
   1357 	struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
   1358 	u_offset_t, caddr_t, struct seg *, cred_t *))
   1359 {
   1360 	rnode_t *rp;
   1361 	mntinfo_t *mi;
   1362 	struct nfs_async_reqs *args;
   1363 
   1364 	rp = VTOR(vp);
   1365 	ASSERT(rp->r_freef == NULL);
   1366 
   1367 	mi = VTOMI(vp);
   1368 
   1369 	/*
   1370 	 * If addr falls in a different segment, don't bother doing readahead.
   1371 	 */
   1372 	if (addr >= seg->s_base + seg->s_size)
   1373 		return (-1);
   1374 
   1375 	/*
   1376 	 * If we can't allocate a request structure, punt on the readahead.
   1377 	 */
   1378 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
   1379 		return (-1);
   1380 
   1381 	/*
   1382 	 * If a lock operation is pending, don't initiate any new
   1383 	 * readaheads.  Otherwise, bump r_count to indicate the new
   1384 	 * asynchronous I/O.
   1385 	 */
   1386 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
   1387 		kmem_free(args, sizeof (*args));
   1388 		return (-1);
   1389 	}
   1390 	mutex_enter(&rp->r_statelock);
   1391 	rp->r_count++;
   1392 	mutex_exit(&rp->r_statelock);
   1393 	nfs_rw_exit(&rp->r_lkserlock);
   1394 
   1395 	args->a_next = NULL;
   1396 #ifdef DEBUG
   1397 	args->a_queuer = curthread;
   1398 #endif
   1399 	VN_HOLD(vp);
   1400 	args->a_vp = vp;
   1401 	ASSERT(cr != NULL);
   1402 	crhold(cr);
   1403 	args->a_cred = cr;
   1404 	args->a_io = NFS_READ_AHEAD;
   1405 	args->a_nfs_readahead = readahead;
   1406 	args->a_nfs_blkoff = blkoff;
   1407 	args->a_nfs_seg = seg;
   1408 	args->a_nfs_addr = addr;
   1409 
   1410 	mutex_enter(&mi->mi_async_lock);
   1411 
   1412 	/*
   1413 	 * If asyncio has been disabled, don't bother readahead.
   1414 	 */
   1415 	if (mi->mi_max_threads == 0) {
   1416 		mutex_exit(&mi->mi_async_lock);
   1417 		goto noasync;
   1418 	}
   1419 
   1420 	/*
   1421 	 * Link request structure into the async list and
   1422 	 * wakeup async thread to do the i/o.
   1423 	 */
   1424 	if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
   1425 		mi->mi_async_reqs[NFS_READ_AHEAD] = args;
   1426 		mi->mi_async_tail[NFS_READ_AHEAD] = args;
   1427 	} else {
   1428 		mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
   1429 		mi->mi_async_tail[NFS_READ_AHEAD] = args;
   1430 	}
   1431 
   1432 	if (mi->mi_io_kstats) {
   1433 		mutex_enter(&mi->mi_lock);
   1434 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   1435 		mutex_exit(&mi->mi_lock);
   1436 	}
   1437 
   1438 	mi->mi_async_req_count++;
   1439 	ASSERT(mi->mi_async_req_count != 0);
   1440 	cv_signal(&mi->mi_async_reqs_cv);
   1441 	mutex_exit(&mi->mi_async_lock);
   1442 	return (0);
   1443 
   1444 noasync:
   1445 	mutex_enter(&rp->r_statelock);
   1446 	rp->r_count--;
   1447 	cv_broadcast(&rp->r_cv);
   1448 	mutex_exit(&rp->r_statelock);
   1449 	VN_RELE(vp);
   1450 	crfree(cr);
   1451 	kmem_free(args, sizeof (*args));
   1452 	return (-1);
   1453 }
   1454 
   1455 int
   1456 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
   1457 	int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
   1458 	u_offset_t, size_t, int, cred_t *))
   1459 {
   1460 	rnode_t *rp;
   1461 	mntinfo_t *mi;
   1462 	struct nfs_async_reqs *args;
   1463 
   1464 	ASSERT(flags & B_ASYNC);
   1465 	ASSERT(vp->v_vfsp != NULL);
   1466 
   1467 	rp = VTOR(vp);
   1468 	ASSERT(rp->r_count > 0);
   1469 
   1470 	mi = VTOMI(vp);
   1471 
   1472 	/*
   1473 	 * If we can't allocate a request structure, do the putpage
   1474 	 * operation synchronously in this thread's context.
   1475 	 */
   1476 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
   1477 		goto noasync;
   1478 
   1479 	args->a_next = NULL;
   1480 #ifdef DEBUG
   1481 	args->a_queuer = curthread;
   1482 #endif
   1483 	VN_HOLD(vp);
   1484 	args->a_vp = vp;
   1485 	ASSERT(cr != NULL);
   1486 	crhold(cr);
   1487 	args->a_cred = cr;
   1488 	args->a_io = NFS_PUTAPAGE;
   1489 	args->a_nfs_putapage = putapage;
   1490 	args->a_nfs_pp = pp;
   1491 	args->a_nfs_off = off;
   1492 	args->a_nfs_len = (uint_t)len;
   1493 	args->a_nfs_flags = flags;
   1494 
   1495 	mutex_enter(&mi->mi_async_lock);
   1496 
   1497 	/*
   1498 	 * If asyncio has been disabled, then make a synchronous request.
   1499 	 * This check is done a second time in case async io was diabled
   1500 	 * while this thread was blocked waiting for memory pressure to
   1501 	 * reduce or for the queue to drain.
   1502 	 */
   1503 	if (mi->mi_max_threads == 0) {
   1504 		mutex_exit(&mi->mi_async_lock);
   1505 		goto noasync;
   1506 	}
   1507 
   1508 	/*
   1509 	 * Link request structure into the async list and
   1510 	 * wakeup async thread to do the i/o.
   1511 	 */
   1512 	if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
   1513 		mi->mi_async_reqs[NFS_PUTAPAGE] = args;
   1514 		mi->mi_async_tail[NFS_PUTAPAGE] = args;
   1515 	} else {
   1516 		mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
   1517 		mi->mi_async_tail[NFS_PUTAPAGE] = args;
   1518 	}
   1519 
   1520 	mutex_enter(&rp->r_statelock);
   1521 	rp->r_count++;
   1522 	rp->r_awcount++;
   1523 	mutex_exit(&rp->r_statelock);
   1524 
   1525 	if (mi->mi_io_kstats) {
   1526 		mutex_enter(&mi->mi_lock);
   1527 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   1528 		mutex_exit(&mi->mi_lock);
   1529 	}
   1530 
   1531 	mi->mi_async_req_count++;
   1532 	ASSERT(mi->mi_async_req_count != 0);
   1533 	cv_signal(&mi->mi_async_reqs_cv);
   1534 	mutex_exit(&mi->mi_async_lock);
   1535 	return (0);
   1536 
   1537 noasync:
   1538 	if (args != NULL) {
   1539 		VN_RELE(vp);
   1540 		crfree(cr);
   1541 		kmem_free(args, sizeof (*args));
   1542 	}
   1543 
   1544 	if (curproc == proc_pageout || curproc == proc_fsflush) {
   1545 		/*
   1546 		 * If we get here in the context of the pageout/fsflush,
   1547 		 * we refuse to do a sync write, because this may hang
   1548 		 * pageout (and the machine). In this case, we just
   1549 		 * re-mark the page as dirty and punt on the page.
   1550 		 *
   1551 		 * Make sure B_FORCE isn't set.  We can re-mark the
   1552 		 * pages as dirty and unlock the pages in one swoop by
   1553 		 * passing in B_ERROR to pvn_write_done().  However,
   1554 		 * we should make sure B_FORCE isn't set - we don't
   1555 		 * want the page tossed before it gets written out.
   1556 		 */
   1557 		if (flags & B_FORCE)
   1558 			flags &= ~(B_INVAL | B_FORCE);
   1559 		pvn_write_done(pp, flags | B_ERROR);
   1560 		return (0);
   1561 	}
   1562 	if (nfs_zone() != mi->mi_zone) {
   1563 		/*
   1564 		 * So this was a cross-zone sync putpage.  We pass in B_ERROR
   1565 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
   1566 		 * them.
   1567 		 *
   1568 		 * We don't want to clear B_FORCE here as the caller presumably
   1569 		 * knows what they're doing if they set it.
   1570 		 */
   1571 		pvn_write_done(pp, flags | B_ERROR);
   1572 		return (EPERM);
   1573 	}
   1574 	return ((*putapage)(vp, pp, off, len, flags, cr));
   1575 }
   1576 
   1577 int
   1578 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
   1579 	int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
   1580 	size_t, int, cred_t *))
   1581 {
   1582 	rnode_t *rp;
   1583 	mntinfo_t *mi;
   1584 	struct nfs_async_reqs *args;
   1585 
   1586 	ASSERT(flags & B_ASYNC);
   1587 	ASSERT(vp->v_vfsp != NULL);
   1588 
   1589 	rp = VTOR(vp);
   1590 	ASSERT(rp->r_count > 0);
   1591 
   1592 	mi = VTOMI(vp);
   1593 
   1594 	/*
   1595 	 * If we can't allocate a request structure, do the pageio
   1596 	 * request synchronously in this thread's context.
   1597 	 */
   1598 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
   1599 		goto noasync;
   1600 
   1601 	args->a_next = NULL;
   1602 #ifdef DEBUG
   1603 	args->a_queuer = curthread;
   1604 #endif
   1605 	VN_HOLD(vp);
   1606 	args->a_vp = vp;
   1607 	ASSERT(cr != NULL);
   1608 	crhold(cr);
   1609 	args->a_cred = cr;
   1610 	args->a_io = NFS_PAGEIO;
   1611 	args->a_nfs_pageio = pageio;
   1612 	args->a_nfs_pp = pp;
   1613 	args->a_nfs_off = io_off;
   1614 	args->a_nfs_len = (uint_t)io_len;
   1615 	args->a_nfs_flags = flags;
   1616 
   1617 	mutex_enter(&mi->mi_async_lock);
   1618 
   1619 	/*
   1620 	 * If asyncio has been disabled, then make a synchronous request.
   1621 	 * This check is done a second time in case async io was diabled
   1622 	 * while this thread was blocked waiting for memory pressure to
   1623 	 * reduce or for the queue to drain.
   1624 	 */
   1625 	if (mi->mi_max_threads == 0) {
   1626 		mutex_exit(&mi->mi_async_lock);
   1627 		goto noasync;
   1628 	}
   1629 
   1630 	/*
   1631 	 * Link request structure into the async list and
   1632 	 * wakeup async thread to do the i/o.
   1633 	 */
   1634 	if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
   1635 		mi->mi_async_reqs[NFS_PAGEIO] = args;
   1636 		mi->mi_async_tail[NFS_PAGEIO] = args;
   1637 	} else {
   1638 		mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
   1639 		mi->mi_async_tail[NFS_PAGEIO] = args;
   1640 	}
   1641 
   1642 	mutex_enter(&rp->r_statelock);
   1643 	rp->r_count++;
   1644 	rp->r_awcount++;
   1645 	mutex_exit(&rp->r_statelock);
   1646 
   1647 	if (mi->mi_io_kstats) {
   1648 		mutex_enter(&mi->mi_lock);
   1649 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   1650 		mutex_exit(&mi->mi_lock);
   1651 	}
   1652 
   1653 	mi->mi_async_req_count++;
   1654 	ASSERT(mi->mi_async_req_count != 0);
   1655 	cv_signal(&mi->mi_async_reqs_cv);
   1656 	mutex_exit(&mi->mi_async_lock);
   1657 	return (0);
   1658 
   1659 noasync:
   1660 	if (args != NULL) {
   1661 		VN_RELE(vp);
   1662 		crfree(cr);
   1663 		kmem_free(args, sizeof (*args));
   1664 	}
   1665 
   1666 	/*
   1667 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
   1668 	 * the page list), for writes we do it synchronously, except for
   1669 	 * proc_pageout/proc_fsflush as described below.
   1670 	 */
   1671 	if (flags & B_READ) {
   1672 		pvn_read_done(pp, flags | B_ERROR);
   1673 		return (0);
   1674 	}
   1675 
   1676 	if (curproc == proc_pageout || curproc == proc_fsflush) {
   1677 		/*
   1678 		 * If we get here in the context of the pageout/fsflush,
   1679 		 * we refuse to do a sync write, because this may hang
   1680 		 * pageout/fsflush (and the machine). In this case, we just
   1681 		 * re-mark the page as dirty and punt on the page.
   1682 		 *
   1683 		 * Make sure B_FORCE isn't set.  We can re-mark the
   1684 		 * pages as dirty and unlock the pages in one swoop by
   1685 		 * passing in B_ERROR to pvn_write_done().  However,
   1686 		 * we should make sure B_FORCE isn't set - we don't
   1687 		 * want the page tossed before it gets written out.
   1688 		 */
   1689 		if (flags & B_FORCE)
   1690 			flags &= ~(B_INVAL | B_FORCE);
   1691 		pvn_write_done(pp, flags | B_ERROR);
   1692 		return (0);
   1693 	}
   1694 
   1695 	if (nfs_zone() != mi->mi_zone) {
   1696 		/*
   1697 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
   1698 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
   1699 		 * them.
   1700 		 *
   1701 		 * We don't want to clear B_FORCE here as the caller presumably
   1702 		 * knows what they're doing if they set it.
   1703 		 */
   1704 		pvn_write_done(pp, flags | B_ERROR);
   1705 		return (EPERM);
   1706 	}
   1707 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
   1708 }
   1709 
   1710 void
   1711 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
   1712 	int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
   1713 {
   1714 	rnode_t *rp;
   1715 	mntinfo_t *mi;
   1716 	struct nfs_async_reqs *args;
   1717 
   1718 	rp = VTOR(vp);
   1719 	ASSERT(rp->r_freef == NULL);
   1720 
   1721 	mi = VTOMI(vp);
   1722 
   1723 	/*
   1724 	 * If we can't allocate a request structure, do the readdir
   1725 	 * operation synchronously in this thread's context.
   1726 	 */
   1727 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
   1728 		goto noasync;
   1729 
   1730 	args->a_next = NULL;
   1731 #ifdef DEBUG
   1732 	args->a_queuer = curthread;
   1733 #endif
   1734 	VN_HOLD(vp);
   1735 	args->a_vp = vp;
   1736 	ASSERT(cr != NULL);
   1737 	crhold(cr);
   1738 	args->a_cred = cr;
   1739 	args->a_io = NFS_READDIR;
   1740 	args->a_nfs_readdir = readdir;
   1741 	args->a_nfs_rdc = rdc;
   1742 
   1743 	mutex_enter(&mi->mi_async_lock);
   1744 
   1745 	/*
   1746 	 * If asyncio has been disabled, then make a synchronous request.
   1747 	 */
   1748 	if (mi->mi_max_threads == 0) {
   1749 		mutex_exit(&mi->mi_async_lock);
   1750 		goto noasync;
   1751 	}
   1752 
   1753 	/*
   1754 	 * Link request structure into the async list and
   1755 	 * wakeup async thread to do the i/o.
   1756 	 */
   1757 	if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
   1758 		mi->mi_async_reqs[NFS_READDIR] = args;
   1759 		mi->mi_async_tail[NFS_READDIR] = args;
   1760 	} else {
   1761 		mi->mi_async_tail[NFS_READDIR]->a_next = args;
   1762 		mi->mi_async_tail[NFS_READDIR] = args;
   1763 	}
   1764 
   1765 	mutex_enter(&rp->r_statelock);
   1766 	rp->r_count++;
   1767 	mutex_exit(&rp->r_statelock);
   1768 
   1769 	if (mi->mi_io_kstats) {
   1770 		mutex_enter(&mi->mi_lock);
   1771 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   1772 		mutex_exit(&mi->mi_lock);
   1773 	}
   1774 
   1775 	mi->mi_async_req_count++;
   1776 	ASSERT(mi->mi_async_req_count != 0);
   1777 	cv_signal(&mi->mi_async_reqs_cv);
   1778 	mutex_exit(&mi->mi_async_lock);
   1779 	return;
   1780 
   1781 noasync:
   1782 	if (args != NULL) {
   1783 		VN_RELE(vp);
   1784 		crfree(cr);
   1785 		kmem_free(args, sizeof (*args));
   1786 	}
   1787 
   1788 	rdc->entries = NULL;
   1789 	mutex_enter(&rp->r_statelock);
   1790 	ASSERT(rdc->flags & RDDIR);
   1791 	rdc->flags &= ~RDDIR;
   1792 	rdc->flags |= RDDIRREQ;
   1793 	/*
   1794 	 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
   1795 	 * is set, wakeup the thread sleeping in cv_wait_sig().
   1796 	 * The woken up thread will reset the flag to RDDIR and will
   1797 	 * continue with the readdir opeartion.
   1798 	 */
   1799 	if (rdc->flags & RDDIRWAIT) {
   1800 		rdc->flags &= ~RDDIRWAIT;
   1801 		cv_broadcast(&rdc->cv);
   1802 	}
   1803 	mutex_exit(&rp->r_statelock);
   1804 	rddir_cache_rele(rdc);
   1805 }
   1806 
   1807 void
   1808 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
   1809 	cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
   1810 	cred_t *))
   1811 {
   1812 	rnode_t *rp;
   1813 	mntinfo_t *mi;
   1814 	struct nfs_async_reqs *args;
   1815 	page_t *pp;
   1816 
   1817 	rp = VTOR(vp);
   1818 	mi = VTOMI(vp);
   1819 
   1820 	/*
   1821 	 * If we can't allocate a request structure, do the commit
   1822 	 * operation synchronously in this thread's context.
   1823 	 */
   1824 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
   1825 		goto noasync;
   1826 
   1827 	args->a_next = NULL;
   1828 #ifdef DEBUG
   1829 	args->a_queuer = curthread;
   1830 #endif
   1831 	VN_HOLD(vp);
   1832 	args->a_vp = vp;
   1833 	ASSERT(cr != NULL);
   1834 	crhold(cr);
   1835 	args->a_cred = cr;
   1836 	args->a_io = NFS_COMMIT;
   1837 	args->a_nfs_commit = commit;
   1838 	args->a_nfs_plist = plist;
   1839 	args->a_nfs_offset = offset;
   1840 	args->a_nfs_count = count;
   1841 
   1842 	mutex_enter(&mi->mi_async_lock);
   1843 
   1844 	/*
   1845 	 * If asyncio has been disabled, then make a synchronous request.
   1846 	 * This check is done a second time in case async io was diabled
   1847 	 * while this thread was blocked waiting for memory pressure to
   1848 	 * reduce or for the queue to drain.
   1849 	 */
   1850 	if (mi->mi_max_threads == 0) {
   1851 		mutex_exit(&mi->mi_async_lock);
   1852 		goto noasync;
   1853 	}
   1854 
   1855 	/*
   1856 	 * Link request structure into the async list and
   1857 	 * wakeup async thread to do the i/o.
   1858 	 */
   1859 	if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
   1860 		mi->mi_async_reqs[NFS_COMMIT] = args;
   1861 		mi->mi_async_tail[NFS_COMMIT] = args;
   1862 	} else {
   1863 		mi->mi_async_tail[NFS_COMMIT]->a_next = args;
   1864 		mi->mi_async_tail[NFS_COMMIT] = args;
   1865 	}
   1866 
   1867 	mutex_enter(&rp->r_statelock);
   1868 	rp->r_count++;
   1869 	mutex_exit(&rp->r_statelock);
   1870 
   1871 	if (mi->mi_io_kstats) {
   1872 		mutex_enter(&mi->mi_lock);
   1873 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   1874 		mutex_exit(&mi->mi_lock);
   1875 	}
   1876 
   1877 	mi->mi_async_req_count++;
   1878 	ASSERT(mi->mi_async_req_count != 0);
   1879 	cv_signal(&mi->mi_async_reqs_cv);
   1880 	mutex_exit(&mi->mi_async_lock);
   1881 	return;
   1882 
   1883 noasync:
   1884 	if (args != NULL) {
   1885 		VN_RELE(vp);
   1886 		crfree(cr);
   1887 		kmem_free(args, sizeof (*args));
   1888 	}
   1889 
   1890 	if (curproc == proc_pageout || curproc == proc_fsflush ||
   1891 	    nfs_zone() != mi->mi_zone) {
   1892 		while (plist != NULL) {
   1893 			pp = plist;
   1894 			page_sub(&plist, pp);
   1895 			pp->p_fsdata = C_COMMIT;
   1896 			page_unlock(pp);
   1897 		}
   1898 		return;
   1899 	}
   1900 	(*commit)(vp, plist, offset, count, cr);
   1901 }
   1902 
   1903 void
   1904 nfs_async_inactive(vnode_t *vp, cred_t *cr,
   1905     void (*inactive)(vnode_t *, cred_t *, caller_context_t *))
   1906 {
   1907 	mntinfo_t *mi;
   1908 	struct nfs_async_reqs *args;
   1909 
   1910 	mi = VTOMI(vp);
   1911 
   1912 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
   1913 	args->a_next = NULL;
   1914 #ifdef DEBUG
   1915 	args->a_queuer = curthread;
   1916 #endif
   1917 	args->a_vp = vp;
   1918 	ASSERT(cr != NULL);
   1919 	crhold(cr);
   1920 	args->a_cred = cr;
   1921 	args->a_io = NFS_INACTIVE;
   1922 	args->a_nfs_inactive = inactive;
   1923 
   1924 	/*
   1925 	 * Note that we don't check mi->mi_max_threads here, since we
   1926 	 * *need* to get rid of this vnode regardless of whether someone
   1927 	 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
   1928 	 *
   1929 	 * The manager thread knows about this and is willing to create
   1930 	 * at least one thread to accommodate us.
   1931 	 */
   1932 	mutex_enter(&mi->mi_async_lock);
   1933 	if (mi->mi_manager_thread == NULL) {
   1934 		rnode_t *rp = VTOR(vp);
   1935 
   1936 		mutex_exit(&mi->mi_async_lock);
   1937 		crfree(cr);	/* drop our reference */
   1938 		kmem_free(args, sizeof (*args));
   1939 		/*
   1940 		 * We can't do an over-the-wire call since we're in the wrong
   1941 		 * zone, so we need to clean up state as best we can and then
   1942 		 * throw away the vnode.
   1943 		 */
   1944 		mutex_enter(&rp->r_statelock);
   1945 		if (rp->r_unldvp != NULL) {
   1946 			vnode_t *unldvp;
   1947 			char *unlname;
   1948 			cred_t *unlcred;
   1949 
   1950 			unldvp = rp->r_unldvp;
   1951 			rp->r_unldvp = NULL;
   1952 			unlname = rp->r_unlname;
   1953 			rp->r_unlname = NULL;
   1954 			unlcred = rp->r_unlcred;
   1955 			rp->r_unlcred = NULL;
   1956 			mutex_exit(&rp->r_statelock);
   1957 
   1958 			VN_RELE(unldvp);
   1959 			kmem_free(unlname, MAXNAMELEN);
   1960 			crfree(unlcred);
   1961 		} else {
   1962 			mutex_exit(&rp->r_statelock);
   1963 		}
   1964 		/*
   1965 		 * No need to explicitly throw away any cached pages.  The
   1966 		 * eventual rinactive() will attempt a synchronous
   1967 		 * VOP_PUTPAGE() which will immediately fail since the request
   1968 		 * is coming from the wrong zone, and then will proceed to call
   1969 		 * nfs_invalidate_pages() which will clean things up for us.
   1970 		 */
   1971 		rp_addfree(VTOR(vp), cr);
   1972 		return;
   1973 	}
   1974 
   1975 	if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
   1976 		mi->mi_async_reqs[NFS_INACTIVE] = args;
   1977 	} else {
   1978 		mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
   1979 	}
   1980 	mi->mi_async_tail[NFS_INACTIVE] = args;
   1981 	/*
   1982 	 * Don't increment r_count, since we're trying to get rid of the vnode.
   1983 	 */
   1984 
   1985 	mi->mi_async_req_count++;
   1986 	ASSERT(mi->mi_async_req_count != 0);
   1987 	cv_signal(&mi->mi_async_reqs_cv);
   1988 	mutex_exit(&mi->mi_async_lock);
   1989 }
   1990 
/*
 * Run the common async worker loop for vfsp against the NFS_ASYNC_QUEUE
 * queue.
 */
static void
nfs_async_start(struct vfs *vfsp)
{
	nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE);
}
   1996 
/*
 * Run the common async worker loop for vfsp against the
 * NFS_ASYNC_PGOPS_QUEUE queue.
 */
static void
nfs_async_pgops_start(struct vfs *vfsp)
{
	nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE);
}
   2002 
/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type.  Requests are taken
 * from the queues in a round-robin fashion.  A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue.  This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * async_queue selects which worker pool this thread belongs to
 * (NFS_ASYNC_QUEUE or NFS_ASYNC_PGOPS_QUEUE); it determines how many
 * request types are scanned, which condition variable the thread sleeps
 * on, and which mi_threads[] counter is decremented on exit.
 *
 * This function does not return: the thread leaves through
 * zthread_exit() once it is idle and either mi_max_threads is zero or
 * the idle wait has timed out.
 *
 * XXX The nfs_async_common_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system. Specifically over the
 * wire calls are cpr-unsafe. The thread should be reevaluated in
 * case of future updates to the cpr model.
 */
static void
nfs_async_common_start(struct vfs *vfsp, int async_queue)
{
	struct nfs_async_reqs *args;
	mntinfo_t *mi = VFTOMI(vfsp);
	/* Start positive so the first idle pass waits rather than exits. */
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;
	int async_types;
	kcondvar_t *async_work_cv;

	/* Pick the number of request types and the cv for this pool. */
	if (async_queue == NFS_ASYNC_QUEUE) {
		async_types = NFS_ASYNC_TYPES;
		async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE];
	} else {
		async_types = NFS_ASYNC_PGOPS_TYPES;
		async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE];
	}

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	/* mi_async_lock is held at the top of every loop iteration. */
	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < async_types; i++) {
			args = *mi->mi_async_curr[async_queue];
			if (args != NULL)
				break;
			mi->mi_async_curr[async_queue]++;
			/* Wrap back to the first queue past the last type. */
			if (mi->mi_async_curr[async_queue] ==
			    &mi->mi_async_reqs[async_types]) {
				mi->mi_async_curr[async_queue] =
				    &mi->mi_async_reqs[0];
			}
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				--mi->mi_threads[async_queue];

				if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
				    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				zthread_exit();
				/* NOTREACHED */
			}
			/*
			 * Sleep for up to nfs_async_timeout.  The result is
			 * examined by the exit test above if the next scan
			 * again finds no work.
			 */
			time_left = cv_reltimedwait(async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout,
			    TR_CLOCK_TICK);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		}
		/* Work was found; re-arm the idle timeout. */
		time_left = 1;

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr[async_queue] = args->a_next;
		if (*mi->mi_async_curr[async_queue] == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr[async_queue]++;
			if (mi->mi_async_curr[async_queue] ==
			    &mi->mi_async_reqs[async_types]) {
				mi->mi_async_curr[async_queue] =
				    &mi->mi_async_reqs[0];
			}
		}

		/*
		 * NFS_INACTIVE requests are skipped here because
		 * nfs_async_inactive() does not call kstat_waitq_enter()
		 * when it queues them.
		 */
		if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		/* The request handler runs without mi_async_lock held. */
		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
		 *
		 * A read-ahead request is only issued while mi_max_threads
		 * is nonzero; otherwise no handler is called for it and it
		 * is simply freed below.
		 */
		if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
			    args->a_nfs_addr, args->a_nfs_seg,
			    args->a_cred);
		} else if (args->a_io == NFS_PUTAPAGE) {
			(void) (*args->a_nfs_putapage)(args->a_vp,
			    args->a_nfs_pp, args->a_nfs_off,
			    args->a_nfs_len, args->a_nfs_flags,
			    args->a_cred);
		} else if (args->a_io == NFS_PAGEIO) {
			(void) (*args->a_nfs_pageio)(args->a_vp,
			    args->a_nfs_pp, args->a_nfs_off,
			    args->a_nfs_len, args->a_nfs_flags,
			    args->a_cred);
		} else if (args->a_io == NFS_READDIR) {
			(void) ((*args->a_nfs_readdir)(args->a_vp,
			    args->a_nfs_rdc, args->a_cred));
		} else if (args->a_io == NFS_COMMIT) {
			(*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
			    args->a_nfs_offset, args->a_nfs_count,
			    args->a_cred);
		} else if (args->a_io == NFS_INACTIVE) {
			(*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}
   2171 
   2172 void
   2173 nfs_async_stop(struct vfs *vfsp)
   2174 {
   2175 	mntinfo_t *mi = VFTOMI(vfsp);
   2176 
   2177 	/*
   2178 	 * Wait for all outstanding async operations to complete and for the
   2179 	 * worker threads to exit.
   2180 	 */
   2181 	mutex_enter(&mi->mi_async_lock);
   2182 	mi->mi_max_threads = 0;
   2183 	NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
   2184 	while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
   2185 	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0)
   2186 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
   2187 	mutex_exit(&mi->mi_async_lock);
   2188 }
   2189 
   2190 /*
   2191  * nfs_async_stop_sig:
   2192  * Wait for all outstanding putpage operation to complete. If a signal
   2193  * is deliver we will abort and return non-zero. If we can put all the
   2194  * pages we will return 0. This routine is called from nfs_unmount and
   2195  * nfs3_unmount to make these operations interruptible.
   2196  */
   2197 int
   2198 nfs_async_stop_sig(struct vfs *vfsp)
   2199 {
   2200 	mntinfo_t *mi = VFTOMI(vfsp);
   2201 	ushort_t omax;
   2202 	int rval;
   2203 
   2204 	/*
   2205 	 * Wait for all outstanding async operations to complete and for the
   2206 	 * worker threads to exit.
   2207 	 */
   2208 	mutex_enter(&mi->mi_async_lock);
   2209 	omax = mi->mi_max_threads;
   2210 	mi->mi_max_threads = 0;
   2211 	/*
   2212 	 * Tell all the worker threads to exit.
   2213 	 */
   2214 	NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
   2215 	while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
   2216 	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) {
   2217 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
   2218 			break;
   2219 	}
   2220 	rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
   2221 	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]  != 0); /* Interrupted */
   2222 	if (rval)
   2223 		mi->mi_max_threads = omax;
   2224 	mutex_exit(&mi->mi_async_lock);
   2225 
   2226 	return (rval);
   2227 }
   2228 
/*
 * Copy tcount bytes from uio into the cached file data of rp, in chunks
 * that never cross a page boundary, growing r_size and setting RDIRTY as
 * data is moved.
 *
 *	rp		rnode being written; the caller must hold r_rwlock
 *			as writer (asserted).
 *	base		destination address used when vpm_enable is not set.
 *	tcount		number of bytes to move; at most MAXBSIZE and at
 *			most uio->uio_resid (asserted).
 *	uio		source of the data; uio_loffset gives the file offset.
 *	pgcreated	nonzero if the caller has already created the first
 *			page (see the comment in the loop).
 *
 * Returns 0 on success, otherwise the error from the data copy or, if
 * the copy succeeded, from segmap_fault().
 */
int
writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
{
	int pagecreate;
	int n;
	int saved_n;
	caddr_t saved_base;
	u_offset_t offset;
	int error;
	int sm_error;
	vnode_t *vp = RTOV(rp);

	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
	if (!vpm_enable) {
		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
	}

	/*
	 * Move bytes in at most PAGESIZE chunks. We must avoid
	 * spanning pages in uiomove() because page faults may cause
	 * the cache to be invalidated out from under us. The r_size is not
	 * updated until after the uiomove. If we push the last page of a
	 * file before r_size is correct, we will lose the data written past
	 * the current (and invalid) r_size.
	 */
	do {
		offset = uio->uio_loffset;
		pagecreate = 0;

		/*
		 * n is the number of bytes required to satisfy the request
		 *   or the number of bytes to fill out the page.
		 */
		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);

		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory.  We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to or beyond the current
		 * end of file from the beginning of the mapping.
		 *
		 * The read of r_size is now protected by r_statelock.
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * When pgcreated is nonzero the caller has already done
		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
		 * segkpm this means we already have at least one page
		 * created and mapped at base.
		 */
		pagecreate = pgcreated ||
		    ((offset & PAGEOFFSET) == 0 &&
		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));

		mutex_exit(&rp->r_statelock);
		if (!vpm_enable && pagecreate) {
			/*
			 * The last argument tells segmap_pagecreate() to
			 * always lock the page, as opposed to sometimes
			 * returning with the page locked. This way we avoid a
			 * fault on the ensuing uiomove(), but also
			 * more importantly (to fix bug 1094402) we can
			 * call segmap_fault() to unlock the page in all
			 * cases. An alternative would be to modify
			 * segmap_pagecreate() to tell us when it is
			 * locking a page, but that's a fairly major
			 * interface change.
			 */
			if (pgcreated == 0)
				(void) segmap_pagecreate(segkmap, base,
				    (uint_t)n, 1);
			/*
			 * Remember the range for the F_SOFTUNLOCK below;
			 * base and n are both modified after the copy.
			 */
			saved_base = base;
			saved_n = n;
		}

		/*
		 * The number of bytes of data in the last page cannot
		 * be accurately determined while the page is being
		 * uiomove'd to and the size of the file being updated.
		 * Thus, inform threads which need to know accurately
		 * how much data is in the last page of the file.  They
		 * will not do the i/o immediately, but will arrange for
		 * the i/o to happen later when this modify operation
		 * will have finished.
		 */
		ASSERT(!(rp->r_flags & RMODINPROGRESS));
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= RMODINPROGRESS;
		rp->r_modaddr = (offset & MAXBMASK);
		mutex_exit(&rp->r_statelock);

		if (vpm_enable) {
			/*
			 * Copy data. If new pages are created, part of
			 * the page that is not written will be initialized
			 * with zeros.
			 */
			error = vpm_data_copy(vp, offset, n, uio,
			    !pagecreate, NULL, 0, S_WRITE);
		} else {
			error = uiomove(base, n, UIO_WRITE, uio);
		}

		/*
		 * r_size is the maximum number of
		 * bytes known to be in the file.
		 * Make sure it is at least as high as the
		 * first unwritten byte pointed to by uio_loffset.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_size < uio->uio_loffset)
			rp->r_size = uio->uio_loffset;
		rp->r_flags &= ~RMODINPROGRESS;
		rp->r_flags |= RDIRTY;
		mutex_exit(&rp->r_statelock);

		/* n = # of bytes written */
		n = (int)(uio->uio_loffset - offset);

		if (!vpm_enable) {
			base += n;
		}
		tcount -= n;
		/*
		 * If we created pages w/o initializing them completely,
		 * we need to zero the part that wasn't set up.
		 * This happens in most EOF write cases and if
		 * we had some sort of error during the uiomove.
		 */
		if (!vpm_enable && pagecreate) {
			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
				(void) kzero(base, PAGESIZE - n);

			if (pgcreated) {
				/*
				 * Caller is responsible for this page,
				 * it was not created in this loop.
				 */
				pgcreated = 0;
			} else {
				/*
				 * For bug 1094402: segmap_pagecreate locks
				 * page. Unlock it. This also unlocks the
				 * pages allocated by page_create_va() in
				 * segmap_pagecreate().
				 */
				sm_error = segmap_fault(kas.a_hat, segkmap,
				    saved_base, saved_n,
				    F_SOFTUNLOCK, S_WRITE);
				/* Do not mask an earlier copy error. */
				if (error == 0)
					error = sm_error;
			}
		}
	} while (tcount > 0 && error == 0);

	return (error);
}
   2388 
   2389 int
   2390 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
   2391 {
   2392 	rnode_t *rp;
   2393 	page_t *pp;
   2394 	u_offset_t eoff;
   2395 	u_offset_t io_off;
   2396 	size_t io_len;
   2397 	int error;
   2398 	int rdirty;
   2399 	int err;
   2400 
   2401 	rp = VTOR(vp);
   2402 	ASSERT(rp->r_count > 0);
   2403 
   2404 	if (!vn_has_cached_data(vp))
   2405 		return (0);
   2406 
   2407 	ASSERT(vp->v_type != VCHR);
   2408 
   2409 	/*
   2410 	 * If ROUTOFSPACE is set, then all writes turn into B_INVAL
   2411 	 * writes.  B_FORCE is set to force the VM system to actually
   2412 	 * invalidate the pages, even if the i/o failed.  The pages
   2413 	 * need to get invalidated because they can't be written out
   2414 	 * because there isn't any space left on either the server's
   2415 	 * file system or in the user's disk quota.  The B_FREE bit
   2416 	 * is cleared to avoid confusion as to whether this is a
   2417 	 * request to place the page on the freelist or to destroy
   2418 	 * it.
   2419 	 */
   2420 	if ((rp->r_flags & ROUTOFSPACE) ||
   2421 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
   2422 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
   2423 
   2424 	if (len == 0) {
   2425 		/*
   2426 		 * If doing a full file synchronous operation, then clear
   2427 		 * the RDIRTY bit.  If a page gets dirtied while the flush
   2428 		 * is happening, then RDIRTY will get set again.  The
   2429 		 * RDIRTY bit must get cleared before the flush so that
   2430 		 * we don't lose this information.
   2431 		 *
   2432 		 * If there are no full file async write operations
   2433 		 * pending and RDIRTY bit is set, clear it.
   2434 		 */
   2435 		if (off == (u_offset_t)0 &&
   2436 		    !(flags & B_ASYNC) &&
   2437 		    (rp->r_flags & RDIRTY)) {
   2438 			mutex_enter(&rp->r_statelock);
   2439 			rdirty = (rp->r_flags & RDIRTY);
   2440 			rp->r_flags &= ~RDIRTY;
   2441 			mutex_exit(&rp->r_statelock);
   2442 		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
   2443 			mutex_enter(&rp->r_statelock);
   2444 			if (rp->r_flags & RDIRTY && rp->r_awcount == 0) {
   2445 				rdirty = (rp->r_flags & RDIRTY);
   2446 				rp->r_flags &= ~RDIRTY;
   2447 			}
   2448 			mutex_exit(&rp->r_statelock);
   2449 		} else
   2450 			rdirty = 0;
   2451 
   2452 		/*
   2453 		 * Search the entire vp list for pages >= off, and flush
   2454 		 * the dirty pages.
   2455 		 */
   2456 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
   2457 		    flags, cr);
   2458 
   2459 		/*
   2460 		 * If an error occurred and the file was marked as dirty
   2461 		 * before and we aren't forcibly invalidating pages, then
   2462 		 * reset the RDIRTY flag.
   2463 		 */
   2464 		if (error && rdirty &&
   2465 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
   2466 			mutex_enter(&rp->r_statelock);
   2467 			rp->r_flags |= RDIRTY;
   2468 			mutex_exit(&rp->r_statelock);
   2469 		}
   2470 	} else {
   2471 		/*
   2472 		 * Do a range from [off...off + len) looking for pages
   2473 		 * to deal with.
   2474 		 */
   2475 		error = 0;
   2476 #ifdef lint
   2477 		io_len = 0;
   2478 #endif
   2479 		eoff = off + len;
   2480 		mutex_enter(&rp->r_statelock);
   2481 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
   2482 		    io_off += io_len) {
   2483 			mutex_exit(&rp->r_statelock);
   2484 			/*
   2485 			 * If we are not invalidating, synchronously
   2486 			 * freeing or writing pages use the routine
   2487 			 * page_lookup_nowait() to prevent reclaiming
   2488 			 * them from the free list.
   2489 			 */
   2490 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
   2491 				pp = page_lookup(vp, io_off,
   2492 				    (flags & (B_INVAL | B_FREE)) ?
   2493 				    SE_EXCL : SE_SHARED);
   2494 			} else {
   2495 				pp = page_lookup_nowait(vp, io_off,
   2496 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
   2497 			}
   2498 
   2499 			if (pp == NULL || !pvn_getdirty(pp, flags))
   2500 				io_len = PAGESIZE;
   2501 			else {
   2502 				err = (*rp->r_putapage)(vp, pp, &io_off,
   2503 				    &io_len, flags, cr);
   2504 				if (!error)
   2505 					error = err;
   2506 				/*
   2507 				 * "io_off" and "io_len" are returned as
   2508 				 * the range of pages we actually wrote.
   2509 				 * This allows us to skip ahead more quickly
   2510 				 * since several pages may've been dealt
   2511 				 * with by this iteration of the loop.
   2512 				 */
   2513 			}
   2514 			mutex_enter(&rp->r_statelock);
   2515 		}
   2516 		mutex_exit(&rp->r_statelock);
   2517 	}
   2518 
   2519 	return (error);
   2520 }
   2521 
   2522 void
   2523 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
   2524 {
   2525 	rnode_t *rp;
   2526 
   2527 	rp = VTOR(vp);
   2528 	mutex_enter(&rp->r_statelock);
   2529 	while (rp->r_flags & RTRUNCATE)
   2530 		cv_wait(&rp->r_cv, &rp->r_statelock);
   2531 	rp->r_flags |= RTRUNCATE;
   2532 	if (off == (u_offset_t)0) {
   2533 		rp->r_flags &= ~RDIRTY;
   2534 		if (!(rp->r_flags & RSTALE))
   2535 			rp->r_error = 0;
   2536 	}
   2537 	rp->r_truncaddr = off;
   2538 	mutex_exit(&rp->r_statelock);
   2539 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
   2540 	    B_INVAL | B_TRUNC, cr);
   2541 	mutex_enter(&rp->r_statelock);
   2542 	rp->r_flags &= ~RTRUNCATE;
   2543 	cv_broadcast(&rp->r_cv);
   2544 	mutex_exit(&rp->r_statelock);
   2545 }
   2546 
/*
 * MSG() selects how a format string is handed to zcmn_err(): when
 * nfs_write_error_to_cons_only is nonzero the string is passed unchanged,
 * otherwise (the default) its first character is skipped.  The uses
 * visible in this part of the file all pass a format that starts with '^'.
 *
 * NOTE(review): per cmn_err(9F) a leading '^' presumably restricts the
 * message to the console -- confirm.
 */
static int nfs_write_error_to_cons_only = 0;
#define	MSG(x)	(nfs_write_error_to_cons_only ? (x) : (x) + 1)
   2549 
   2550 /*
   2551  * Print a file handle
   2552  */
   2553 void
   2554 nfs_printfhandle(nfs_fhandle *fhp)
   2555 {
   2556 	int *ip;
   2557 	char *buf;
   2558 	size_t bufsize;
   2559 	char *cp;
   2560 
   2561 	/*
   2562 	 * 13 == "(file handle:"
   2563 	 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
   2564 	 *	1 == ' '
   2565 	 *	8 == maximum strlen of "%x"
   2566 	 * 3 == ")\n\0"
   2567 	 */
   2568 	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
   2569 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
   2570 	if (buf == NULL)
   2571 		return;
   2572 
   2573 	cp = buf;
   2574 	(void) strcpy(cp, "(file handle:");
   2575 	while (*cp != '\0')
   2576 		cp++;
   2577 	for (ip = (int *)fhp->fh_buf;
   2578 	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
   2579 	    ip++) {
   2580 		(void) sprintf(cp, " %x", *ip);
   2581 		while (*cp != '\0')
   2582 			cp++;
   2583 	}
   2584 	(void) strcpy(cp, ")\n");
   2585 
   2586 	zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
   2587 
   2588 	kmem_free(buf, bufsize);
   2589 }
   2590 
   2591 /*
   2592  * Notify the system administrator that an NFS write error has
   2593  * occurred.
   2594  */
   2595 
   2596 /* seconds between ENOSPC/EDQUOT messages */
   2597 clock_t nfs_write_error_interval = 5;
   2598 
   2599 void
   2600 nfs_write_error(vnode_t *vp, int error, cred_t *cr)
   2601 {
   2602 	mntinfo_t *mi;
   2603 	clock_t now;
   2604 
   2605 	mi = VTOMI(vp);
   2606 	/*
   2607 	 * In case of forced unmount or zone shutdown, do not print any
   2608 	 * messages since it can flood the console with error messages.
   2609 	 */
   2610 	if (FS_OR_ZONE_GONE(mi->mi_vfsp))
   2611 		return;
   2612 
   2613 	/*
   2614 	 * No use in flooding the console with ENOSPC
   2615 	 * messages from the same file system.
   2616 	 */
   2617 	now = ddi_get_lbolt();
   2618 	if ((error != ENOSPC && error != EDQUOT) ||
   2619 	    now - mi->mi_printftime > 0) {
   2620 		zoneid_t zoneid = mi->mi_zone->zone_id;
   2621 
   2622 #ifdef DEBUG
   2623 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
   2624 		    mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
   2625 #else
   2626 		nfs_perror(error, "NFS write error on host %s: %m.\n",
   2627 		    VTOR(vp)->r_server->sv_hostname, NULL);
   2628 #endif
   2629 		if (error == ENOSPC || error == EDQUOT) {
   2630 			zcmn_err(zoneid, CE_CONT,
   2631 			    MSG("^File: userid=%d, groupid=%d\n"),
   2632 			    crgetuid(cr), crgetgid(cr));
   2633 			if (crgetuid(CRED()) != crgetuid(cr) ||
   2634 			    crgetgid(CRED()) != crgetgid(cr)) {
   2635 				zcmn_err(zoneid, CE_CONT,
   2636 				    MSG("^User: userid=%d, groupid=%d\n"),
   2637 				    crgetuid(CRED()), crgetgid(CRED()));
   2638 			}
   2639 			mi->mi_printftime = now +
   2640 			    nfs_write_error_interval * hz;
   2641 		}
   2642 		nfs_printfhandle(&VTOR(vp)->r_fh);
   2643 #ifdef DEBUG
   2644 		if (error == EACCES) {
   2645 			zcmn_err(zoneid, CE_CONT,
   2646 			    MSG("^nfs_bio: cred is%s kcred\n"),
   2647 			    cr == kcred ? "" : " not");
   2648 		}
   2649 #endif
   2650 	}
   2651 }
   2652 
   2653 /* ARGSUSED */
   2654 static void *
   2655 nfs_mi_init(zoneid_t zoneid)
   2656 {
   2657 	struct mi_globals *mig;
   2658 
   2659 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
   2660 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
   2661 	list_create(&mig->mig_list, sizeof (mntinfo_t),
   2662 	    offsetof(mntinfo_t, mi_zone_node));
   2663 	mig->mig_destructor_called = B_FALSE;
   2664 	return (mig);
   2665 }
   2666 
   2667 /*
   2668  * Callback routine to tell all NFS mounts in the zone to stop creating new
   2669  * threads.  Existing threads should exit.
   2670  */
   2671 /* ARGSUSED */
   2672 static void
   2673 nfs_mi_shutdown(zoneid_t zoneid, void *data)
   2674 {
   2675 	struct mi_globals *mig = data;
   2676 	mntinfo_t *mi;
   2677 
   2678 	ASSERT(mig != NULL);
   2679 again:
   2680 	mutex_enter(&mig->mig_lock);
   2681 	for (mi = list_head(&mig->mig_list); mi != NULL;
   2682 	    mi = list_next(&mig->mig_list, mi)) {
   2683 
   2684 		/*
   2685 		 * If we've done the shutdown work for this FS, skip.
   2686 		 * Once we go off the end of the list, we're done.
   2687 		 */
   2688 		if (mi->mi_flags & MI_DEAD)
   2689 			continue;
   2690 
   2691 		/*
   2692 		 * We will do work, so not done.  Get a hold on the FS.
   2693 		 */
   2694 		VFS_HOLD(mi->mi_vfsp);
   2695 
   2696 		/*
   2697 		 * purge the DNLC for this filesystem
   2698 		 */
   2699 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
   2700 
   2701 		mutex_enter(&mi->mi_async_lock);
   2702 		/*
   2703 		 * Tell existing async worker threads to exit.
   2704 		 */
   2705 		mi->mi_max_threads = 0;
   2706 		NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
   2707 		/*
   2708 		 * Set MI_ASYNC_MGR_STOP so the async manager thread starts
   2709 		 * getting ready to exit when it's done with its current work.
   2710 		 * Also set MI_DEAD to note we've acted on this FS.
   2711 		 */
   2712 		mutex_enter(&mi->mi_lock);
   2713 		mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
   2714 		mutex_exit(&mi->mi_lock);
   2715 		/*
   2716 		 * Wake up the async manager thread.
   2717 		 */
   2718 		cv_broadcast(&mi->mi_async_reqs_cv);
   2719 		mutex_exit(&mi->mi_async_lock);
   2720 
   2721 		/*
   2722 		 * Drop lock and release FS, which may change list, then repeat.
   2723 		 * We're done when every mi has been done or the list is empty.
   2724 		 */
   2725 		mutex_exit(&mig->mig_lock);
   2726 		VFS_RELE(mi->mi_vfsp);
   2727 		goto again;
   2728 	}
   2729 	mutex_exit(&mig->mig_lock);
   2730 }
   2731 
/*
 * Tear down and free a zone's mi_globals structure: destroy its list and
 * its lock, then release the memory.  Both callers in this file invoke it
 * while still holding mig_lock, and only after finding the list empty.
 */
static void
nfs_mi_free_globals(struct mi_globals *mig)
{
	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
	mutex_destroy(&mig->mig_lock);
	kmem_free(mig, sizeof (*mig));

}
   2740 
   2741 /* ARGSUSED */
   2742 static void
   2743 nfs_mi_destroy(zoneid_t zoneid, void *data)
   2744 {
   2745 	struct mi_globals *mig = data;
   2746 
   2747 	ASSERT(mig != NULL);
   2748 	mutex_enter(&mig->mig_lock);
   2749 	if (list_head(&mig->mig_list) != NULL) {
   2750 		/* Still waiting for VFS_FREEVFS() */
   2751 		mig->mig_destructor_called = B_TRUE;
   2752 		mutex_exit(&mig->mig_lock);
   2753 		return;
   2754 	}
   2755 	nfs_mi_free_globals(mig);
   2756 }
   2757 
   2758 /*
   2759  * Add an NFS mount to the per-zone list of NFS mounts.
   2760  */
   2761 void
   2762 nfs_mi_zonelist_add(mntinfo_t *mi)
   2763 {
   2764 	struct mi_globals *mig;
   2765 
   2766 	mig = zone_getspecific(mi_list_key, mi->mi_zone);
   2767 	mutex_enter(&mig->mig_lock);
   2768 	list_insert_head(&mig->mig_list, mi);
   2769 	mutex_exit(&mig->mig_lock);
   2770 }
   2771 
   2772 /*
   2773  * Remove an NFS mount from the per-zone list of NFS mounts.
   2774  */
   2775 static void
   2776 nfs_mi_zonelist_remove(mntinfo_t *mi)
   2777 {
   2778 	struct mi_globals *mig;
   2779 
   2780 	mig = zone_getspecific(mi_list_key, mi->mi_zone);
   2781 	mutex_enter(&mig->mig_lock);
   2782 	list_remove(&mig->mig_list, mi);
   2783 	/*
   2784 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
   2785 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
   2786 	 * mi globals.
   2787 	 */
   2788 	if (list_head(&mig->mig_list) == NULL &&
   2789 	    mig->mig_destructor_called == B_TRUE) {
   2790 		nfs_mi_free_globals(mig);
   2791 		return;
   2792 	}
   2793 	mutex_exit(&mig->mig_lock);
   2794 }
   2795 
   2796 /*
   2797  * NFS Client initialization routine.  This routine should only be called
   2798  * once.  It performs the following tasks:
   2799  *	- Initalize all global locks
   2800  * 	- Call sub-initialization routines (localize access to variables)
   2801  */
   2802 int
   2803 nfs_clntinit(void)
   2804 {
   2805 #ifdef DEBUG
   2806 	static boolean_t nfs_clntup = B_FALSE;
   2807 #endif
   2808 	int error;
   2809 
   2810 #ifdef DEBUG
   2811 	ASSERT(nfs_clntup == B_FALSE);
   2812 #endif
   2813 
   2814 	error = nfs_subrinit();
   2815 	if (error)
   2816 		return (error);
   2817 
   2818 	error = nfs_vfsinit();
   2819 	if (error) {
   2820 		/*
   2821 		 * Cleanup nfs_subrinit() work
   2822 		 */
   2823 		nfs_subrfini();
   2824 		return (error);
   2825 	}
   2826 	zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
   2827 	    nfs_mi_destroy);
   2828 
   2829 	nfs4_clnt_init();
   2830 
   2831 #ifdef DEBUG
   2832 	nfs_clntup = B_TRUE;
   2833 #endif
   2834 
   2835 	return (0);
   2836 }
   2837 
   2838 /*
   2839  * This routine is only called if the NFS Client has been initialized but
   2840  * the module failed to be installed. This routine will cleanup the previously
   2841  * allocated/initialized work.
   2842  */
   2843 void
   2844 nfs_clntfini(void)
   2845 {
   2846 	(void) zone_key_delete(mi_list_key);
   2847 	nfs_subrfini();
   2848 	nfs_vfsfini();
   2849 	nfs4_clnt_fini();
   2850 }
   2851 
   2852 /*
   2853  * nfs_lockrelease:
   2854  *
   2855  * Release any locks on the given vnode that are held by the current
   2856  * process.
   2857  */
   2858 void
   2859 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
   2860 {
   2861 	flock64_t ld;
   2862 	struct shrlock shr;
   2863 	char *buf;
   2864 	int remote_lock_possible;
   2865 	int ret;
   2866 
   2867 	ASSERT((uintptr_t)vp > KERNELBASE);
   2868 
   2869 	/*
   2870 	 * Generate an explicit unlock operation for the entire file.  As a
   2871 	 * partial optimization, only generate the unlock if there is a
   2872 	 * lock registered for the file.  We could check whether this
   2873 	 * particular process has any locks on the file, but that would
   2874 	 * require the local locking code to provide yet another query
   2875 	 * routine.  Note that no explicit synchronization is needed here.
   2876 	 * At worst, flk_has_remote_locks() will return a false positive,
   2877 	 * in which case the unlock call wastes time but doesn't harm
   2878 	 * correctness.
   2879 	 *
   2880 	 * In addition, an unlock request is generated if the process
   2881 	 * is listed as possibly having a lock on the file because the
   2882 	 * server and client lock managers may have gotten out of sync.
   2883 	 * N.B. It is important to make sure nfs_remove_locking_id() is
   2884 	 * called here even if flk_has_remote_locks(vp) reports true.
   2885 	 * If it is not called and there is an entry on the process id
   2886 	 * list, that entry will never get removed.
   2887 	 */
   2888 	remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
   2889 	    (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
   2890 	if (remote_lock_possible || flk_has_remote_locks(vp)) {
   2891 		ld.l_type = F_UNLCK;	/* set to unlock entire file */
   2892 		ld.l_whence = 0;	/* unlock from start of file */
   2893 		ld.l_start = 0;
   2894 		ld.l_len = 0;		/* do entire file */
   2895 		ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr,
   2896 		    NULL);
   2897 
   2898 		if (ret != 0) {
   2899 			/*
   2900 			 * If VOP_FRLOCK fails, make sure we unregister
   2901 			 * local locks before we continue.
   2902 			 */
   2903 			ld.l_pid = ttoproc(curthread)->p_pid;
   2904 			lm_register_lock_locally(vp, NULL, &ld, flag, offset);
   2905 #ifdef DEBUG
   2906 			nfs_perror(ret,
   2907 			    "NFS lock release error on vp %p: %m.\n",
   2908 			    (void *)vp, NULL);
   2909 #endif
   2910 		}
   2911 
   2912 		/*
   2913 		 * The call to VOP_FRLOCK may put the pid back on the
   2914 		 * list.  We need to remove it.
   2915 		 */
   2916 		(void) nfs_remove_locking_id(vp, RLMPL_PID,
   2917 		    (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
   2918 	}
   2919 
   2920 	/*
   2921 	 * As long as the vp has a share matching our pid,
   2922 	 * pluck it off and unshare it.  There are circumstances in
   2923 	 * which the call to nfs_remove_locking_id() may put the
   2924 	 * owner back on the list, in which case we simply do a
   2925 	 * redundant and harmless unshare.
   2926 	 */
   2927 	buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
   2928 	while (nfs_remove_locking_id(vp, RLMPL_OWNER,
   2929 	    (char *)NULL, buf, &shr.s_own_len)) {
   2930 		shr.s_owner = buf;
   2931 		shr.s_access = 0;
   2932 		shr.s_deny = 0;
   2933 		shr.s_sysid = 0;
   2934 		shr.s_pid = curproc->p_pid;
   2935 
   2936 		ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL);
   2937 #ifdef DEBUG
   2938 		if (ret != 0) {
   2939 			nfs_perror(ret,
   2940 			    "NFS share release error on vp %p: %m.\n",
   2941 			    (void *)vp, NULL);
   2942 		}
   2943 #endif
   2944 	}
   2945 	kmem_free(buf, MAX_SHR_OWNER_LEN);
   2946 }
   2947 
   2948 /*
   2949  * nfs_lockcompletion:
   2950  *
   2951  * If the vnode has a lock that makes it unsafe to cache the file, mark it
   2952  * as non cachable (set VNOCACHE bit).
   2953  */
   2954 
   2955 void
   2956 nfs_lockcompletion(vnode_t *vp, int cmd)
   2957 {
   2958 #ifdef DEBUG
   2959 	rnode_t *rp = VTOR(vp);
   2960 
   2961 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
   2962 #endif
   2963 
   2964 	if (cmd == F_SETLK || cmd == F_SETLKW) {
   2965 		if (!lm_safemap(vp)) {
   2966 			mutex_enter(&vp->v_lock);
   2967 			vp->v_flag |= VNOCACHE;
   2968 			mutex_exit(&vp->v_lock);
   2969 		} else {
   2970 			mutex_enter(&vp->v_lock);
   2971 			vp->v_flag &= ~VNOCACHE;
   2972 			mutex_exit(&vp->v_lock);
   2973 		}
   2974 	}
   2975 	/*
   2976 	 * The cached attributes of the file are stale after acquiring
   2977 	 * the lock on the file. They were updated when the file was
   2978 	 * opened, but not updated when the lock was acquired. Therefore the
   2979 	 * cached attributes are invalidated after the lock is obtained.
   2980 	 */
   2981 	PURGE_ATTRCACHE(vp);
   2982 }
   2983 
   2984 /*
   2985  * The lock manager holds state making it possible for the client
   2986  * and server to be out of sync.  For example, if the response from
   2987  * the server granting a lock request is lost, the server will think
   2988  * the lock is granted and the client will think the lock is lost.
   2989  * The client can tell when it is not positive if it is in sync with
   2990  * the server.
   2991  *
   2992  * To deal with this, a list of processes for which the client is
   2993  * not sure if the server holds a lock is attached to the rnode.
   2994  * When such a process closes the rnode, an unlock request is sent
   2995  * to the server to unlock the entire file.
   2996  *
   2997  * The list is kept as a singly linked, NULL-terminated list.
   2998  * Because it is only added to under extreme error conditions, the
   2999  * list shouldn't get very big.  DEBUG kernels print a message if
   3000  * the list gets bigger than nfs_lmpl_high_water.  This defaults
   3001  * to 128, but can be tuned at runtime.
   3002  */
   3003 #ifdef DEBUG
   3004 /* int nfs_lmpl_high_water = 8; */
   3005 int nfs_lmpl_high_water = 128;	/* warn when an r_lmpl list exceeds this */
   3006 int nfs_cnt_add_locking_id = 0;	/* calls made to nfs_add_locking_id() */
   3007 int nfs_len_add_locking_id = 0;	/* longest r_lmpl list length recorded */
   3008 #endif /* DEBUG */
   3009 
   3010 /*
   3011  * Record that the nfs lock manager server may be holding a lock on
   3012  * a vnode for a process.
   3013  *
   3014  * Because the nfs lock manager server holds state, it is possible
   3015  * for the server to get out of sync with the client.  This routine is called
   3016  * from the client when it is no longer sure if the server is in sync
   3017  * with the client.  nfs_lockrelease() will then notice this and send
   3018  * an unlock request when the file is closed
   3019  */
   3020 void
   3021 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
   3022 {
   3023 	rnode_t *rp;
   3024 	lmpl_t *new;
   3025 	lmpl_t *cur;
   3026 	lmpl_t **lmplp;
   3027 #ifdef DEBUG
   3028 	int list_len = 1;
   3029 #endif /* DEBUG */
   3030 
   3031 #ifdef DEBUG
   3032 	++nfs_cnt_add_locking_id;
   3033 #endif /* DEBUG */
   3034 	/*
   3035 	 * allocate new lmpl_t now so we don't sleep
   3036 	 * later after grabbing mutexes
   3037 	 */
   3038 	ASSERT(len < MAX_SHR_OWNER_LEN);
   3039 	new = kmem_alloc(sizeof (*new), KM_SLEEP);
   3040 	new->lmpl_type = type;
   3041 	new->lmpl_pid = pid;
   3042 	new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
   3043 	bcopy(id, new->lmpl_owner, len);
   3044 	new->lmpl_own_len = len;
   3045 	new->lmpl_next = (lmpl_t *)NULL;
   3046 #ifdef DEBUG
   3047 	if (type == RLMPL_PID) {
   3048 		ASSERT(len == sizeof (pid_t));
   3049 		ASSERT(pid == *(pid_t *)new->lmpl_owner);
   3050 	} else {
   3051 		ASSERT(type == RLMPL_OWNER);
   3052 	}
   3053 #endif
   3054 
   3055 	rp = VTOR(vp);
   3056 	mutex_enter(&rp->r_statelock);
   3057 
   3058 	/*
   3059 	 * Add this id to the list for this rnode only if the
   3060 	 * rnode is active and the id is not already there.
   3061 	 */
   3062 	ASSERT(rp->r_flags & RHASHED);
   3063 	lmplp = &(rp->r_lmpl);
   3064 	for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
   3065 		if (cur->lmpl_pid == pid &&
   3066 		    cur->lmpl_type == type &&
   3067 		    cur->lmpl_own_len == len &&
   3068 		    bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
   3069 			kmem_free(new->lmpl_owner, len);
   3070 			kmem_free(new, sizeof (*new));
   3071 			break;
   3072 		}
   3073 		lmplp = &cur->lmpl_next;
   3074 #ifdef DEBUG
   3075 		++list_len;
   3076 #endif /* DEBUG */
   3077 	}
   3078 	if (cur == (lmpl_t *)NULL) {
   3079 		*lmplp = new;
   3080 #ifdef DEBUG
   3081 		if (list_len > nfs_len_add_locking_id) {
   3082 			nfs_len_add_locking_id = list_len;
   3083 		}
   3084 		if (list_len > nfs_lmpl_high_water) {
   3085 			cmn_err(CE_WARN, "nfs_add_locking_id: long list "
   3086 			    "vp=%p is %d", (void *)vp, list_len);
   3087 		}
   3088 #endif /* DEBUG */
   3089 	}
   3090 
   3091 #ifdef DEBUG
   3092 	if (share_debug) {
   3093 		int nitems = 0;
   3094 		int npids = 0;
   3095 		int nowners = 0;
   3096 
   3097 		/*
   3098 		 * Count the number of things left on r_lmpl after the remove.
   3099 		 */
   3100 		for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
   3101 		    cur = cur->lmpl_next) {
   3102 			nitems++;
   3103 			if (cur->lmpl_type == RLMPL_PID) {
   3104 				npids++;
   3105 			} else if (cur->lmpl_type == RLMPL_OWNER) {
   3106 				nowners++;
   3107 			} else {
   3108 				cmn_err(CE_PANIC, "nfs_add_locking_id: "
   3109 				    "unrecognized lmpl_type %d",
   3110 				    cur->lmpl_type);
   3111 			}
   3112 		}
   3113 
   3114 		cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
   3115 		    "OWNs = %d items left on r_lmpl\n",
   3116 		    (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
   3117 	}
   3118 #endif
   3119 
   3120 	mutex_exit(&rp->r_statelock);
   3121 }
   3122 
   3123 /*
   3124  * Remove an id from the lock manager id list.
   3125  *
   3126  * If the id is not in the list return 0.  If it was found and
   3127  * removed, return 1.
   3128  */
   3129 static int
   3130 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
   3131 {
   3132 	lmpl_t *cur;
   3133 	lmpl_t **lmplp;
   3134 	rnode_t *rp;
   3135 	int rv = 0;
   3136 
   3137 	ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
   3138 
   3139 	rp = VTOR(vp);
   3140 
   3141 	mutex_enter(&rp->r_statelock);
   3142 	ASSERT(rp->r_flags & RHASHED);
   3143 	lmplp = &(rp->r_lmpl);
   3144 
   3145 	/*
   3146 	 * Search through the list and remove the entry for this id
   3147 	 * if it is there.  The special case id == NULL allows removal
   3148 	 * of the first share on the r_lmpl list belonging to the
   3149 	 * current process (if any), without regard to further details
   3150 	 * of its identity.
   3151 	 */
   3152 	for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
   3153 		if (cur->lmpl_type == type &&
   3154 		    cur->lmpl_pid == curproc->p_pid &&
   3155 		    (id == (char *)NULL ||
   3156 		    bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
   3157 			*lmplp = cur->lmpl_next;
   3158 			ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
   3159 			if (rid != NULL) {
   3160 				bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
   3161 				*rlen = cur->lmpl_own_len;
   3162 			}
   3163 			kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
   3164 			kmem_free(cur, sizeof (*cur));
   3165 			rv = 1;
   3166 			break;
   3167 		}
   3168 		lmplp = &cur->lmpl_next;
   3169 	}
   3170 
   3171 #ifdef DEBUG
   3172 	if (share_debug) {
   3173 		int nitems = 0;
   3174 		int npids = 0;
   3175 		int nowners = 0;
   3176 
   3177 		/*
   3178 		 * Count the number of things left on r_lmpl after the remove.
   3179 		 */
   3180 		for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
   3181 		    cur = cur->lmpl_next) {
   3182 			nitems++;
   3183 			if (cur->lmpl_type == RLMPL_PID) {
   3184 				npids++;
   3185 			} else if (cur->lmpl_type == RLMPL_OWNER) {
   3186 				nowners++;
   3187 			} else {
   3188 				cmn_err(CE_PANIC,
   3189 				    "nrli: unrecognized lmpl_type %d",
   3190 				    cur->lmpl_type);
   3191 			}
   3192 		}
   3193 
   3194 		cmn_err(CE_CONT,
   3195 		"nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
   3196 		    (type == RLMPL_PID) ? "P" : "O",
   3197 		    npids,
   3198 		    nowners,
   3199 		    nitems);
   3200 	}
   3201 #endif
   3202 
   3203 	mutex_exit(&rp->r_statelock);
   3204 	return (rv);
   3205 }
   3206 
   3207 void
   3208 nfs_free_mi(mntinfo_t *mi)
   3209 {
   3210 	ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
   3211 	ASSERT(mi->mi_manager_thread == NULL);
   3212 	ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
   3213 	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0);
   3214 
   3215 	/*
   3216 	 * Remove the node from the global list before we start tearing it down.
   3217 	 */
   3218 	nfs_mi_zonelist_remove(mi);
   3219 	if (mi->mi_klmconfig) {
   3220 		lm_free_config(mi->mi_klmconfig);
   3221 		kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
   3222 	}
   3223 	mutex_destroy(&mi->mi_lock);
   3224 	mutex_destroy(&mi->mi_remap_lock);
   3225 	mutex_destroy(&mi->mi_async_lock);
   3226 	cv_destroy(&mi->mi_failover_cv);
   3227 	cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]);
   3228 	cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]);
   3229 	cv_destroy(&mi->mi_async_reqs_cv);
   3230 	cv_destroy(&mi->mi_async_cv);
   3231 	zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS);
   3232 	kmem_free(mi, sizeof (*mi));
   3233 }
   3234 
   3235 static int
   3236 mnt_kstat_update(kstat_t *ksp, int rw)
   3237 {
   3238 	mntinfo_t *mi;
   3239 	struct mntinfo_kstat *mik;
   3240 	vfs_t *vfsp;
   3241 	int i;
   3242 
   3243 	/* this is a read-only kstat. Bail out on a write */
   3244 	if (rw == KSTAT_WRITE)
   3245 		return (EACCES);
   3246 
   3247 	/*
   3248 	 * We don't want to wait here as kstat_chain_lock could be held by
   3249 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
   3250 	 * and thus could lead to a deadlock.
   3251 	 */
   3252 	vfsp = (struct vfs *)ksp->ks_private;
   3253 
   3254 
   3255 	mi = VFTOMI(vfsp);
   3256 
   3257 	mik = (struct mntinfo_kstat *)ksp->ks_data;
   3258 
   3259 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
   3260 	mik->mik_vers = (uint32_t)mi->mi_vers;
   3261 	mik->mik_flags = mi->mi_flags;
   3262 	mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
   3263 	mik->mik_curread = (uint32_t)mi->mi_curread;
   3264 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
   3265 	mik->mik_retrans = mi->mi_retrans;
   3266 	mik->mik_timeo = mi->mi_timeo;
   3267 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
   3268 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
   3269 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
   3270 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
   3271 	for (i = 0; i < NFS_CALLTYPES + 1; i++) {
   3272 		mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
   3273 		mik->mik_timers[i].deviate =
   3274 		    (uint32_t)mi->mi_timers[i].rt_deviate;
   3275 		mik->mik_timers[i].rtxcur =
   3276 		    (uint32_t)mi->mi_timers[i].rt_rtxcur;
   3277 	}
   3278 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
   3279 	mik->mik_failover = (uint32_t)mi->mi_failover;
   3280 	mik->mik_remap = (uint32_t)mi->mi_remap;
   3281 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
   3282 
   3283 	return (0);
   3284 }
   3285 
   3286 void
   3287 nfs_mnt_kstat_init(struct vfs *vfsp)
   3288 {
   3289 	mntinfo_t *mi = VFTOMI(vfsp);
   3290 
   3291 	/*
   3292 	 * Create the version specific kstats.
   3293 	 *
   3294 	 * PSARC 2001/697 Contract Private Interface
   3295 	 * All nfs kstats are under SunMC contract
   3296 	 * Please refer to the PSARC listed above and contact
   3297 	 * SunMC before making any changes!
   3298 	 *
   3299 	 * Changes must be reviewed by Solaris File Sharing
   3300 	 * Changes must be communicated to contract-2001-697 (at) sun.com
   3301 	 *
   3302 	 */
   3303 
   3304 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
   3305 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
   3306 	if (mi->mi_io_kstats) {
   3307 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
   3308 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
   3309 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
   3310 		kstat_install(mi->mi_io_kstats);
   3311 	}
   3312 
   3313 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
   3314 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
   3315 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
   3316 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
   3317 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
   3318 		mi->mi_ro_kstats->ks_update = mnt_kstat_update;
   3319 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
   3320 		kstat_install(mi->mi_ro_kstats);
   3321 	}
   3322 }
   3323 
   3324 nfs_delmapcall_t *
   3325 nfs_init_delmapcall()
   3326 {
   3327 	nfs_delmapcall_t	*delmap_call;
   3328 
   3329 	delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
   3330 	delmap_call->call_id = curthread;
   3331 	delmap_call->error = 0;
   3332 
   3333 	return (delmap_call);
   3334 }
   3335 
   3336 void
   3337 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
   3338 {
   3339 	kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
   3340 }
   3341 
   3342 /*
   3343  * Searches for the current delmap caller (based on curthread) in the list of
   3344  * callers.  If it is found, we remove it and free the delmap caller.
   3345  * Returns:
   3346  *	0 if the caller wasn't found
   3347  *	1 if the caller was found, removed and freed.  *errp is set to what
   3348  * 	the result of the delmap was.
   3349  */
   3350 int
   3351 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
   3352 {
   3353 	nfs_delmapcall_t	*delmap_call;
   3354 
   3355 	/*
   3356 	 * If the list doesn't exist yet, we create it and return
   3357 	 * that the caller wasn't found.  No list = no callers.
   3358 	 */
   3359 	mutex_enter(&rp->r_statelock);
   3360 	if (!(rp->r_flags & RDELMAPLIST)) {
   3361 		/* The list does not exist */
   3362 		list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
   3363 		    offsetof(nfs_delmapcall_t, call_node));
   3364 		rp->r_flags |= RDELMAPLIST;
   3365 		mutex_exit(&rp->r_statelock);
   3366 		return (0);
   3367 	} else {
   3368 		/* The list exists so search it */
   3369 		for (delmap_call = list_head(&rp->r_indelmap);
   3370 		    delmap_call != NULL;
   3371 		    delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
   3372 			if (delmap_call->call_id == curthread) {
   3373 				/* current caller is in the list */
   3374 				*errp = delmap_call->error;
   3375 				list_remove(&rp->r_indelmap, delmap_call);
   3376 				mutex_exit(&rp->r_statelock);
   3377 				nfs_free_delmapcall(delmap_call);
   3378 				return (1);
   3379 			}
   3380 		}
   3381 	}
   3382 	mutex_exit(&rp->r_statelock);
   3383 	return (0);
   3384 }
   3385