Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/param.h>
     27 #include <sys/types.h>
     28 #include <sys/systm.h>
     29 #include <sys/cred.h>
     30 #include <sys/proc.h>
     31 #include <sys/user.h>
     32 #include <sys/time.h>
     33 #include <sys/buf.h>
     34 #include <sys/vfs.h>
     35 #include <sys/vnode.h>
     36 #include <sys/socket.h>
     37 #include <sys/uio.h>
     38 #include <sys/tiuser.h>
     39 #include <sys/swap.h>
     40 #include <sys/errno.h>
     41 #include <sys/debug.h>
     42 #include <sys/kmem.h>
     43 #include <sys/kstat.h>
     44 #include <sys/cmn_err.h>
     45 #include <sys/vtrace.h>
     46 #include <sys/session.h>
     47 #include <sys/dnlc.h>
     48 #include <sys/bitmap.h>
     49 #include <sys/acl.h>
     50 #include <sys/ddi.h>
     51 #include <sys/pathname.h>
     52 #include <sys/flock.h>
     53 #include <sys/dirent.h>
     54 #include <sys/flock.h>
     55 #include <sys/callb.h>
     56 #include <sys/atomic.h>
     57 #include <sys/list.h>
     58 #include <sys/tsol/tnet.h>
     59 #include <sys/priv.h>
     60 #include <sys/sdt.h>
     61 #include <sys/attr.h>
     62 
     63 #include <inet/ip6.h>
     64 
     65 #include <rpc/types.h>
     66 #include <rpc/xdr.h>
     67 #include <rpc/auth.h>
     68 #include <rpc/clnt.h>
     69 
     70 #include <nfs/nfs.h>
     71 #include <nfs/nfs4.h>
     72 #include <nfs/nfs_clnt.h>
     73 #include <nfs/rnode.h>
     74 #include <nfs/nfs_acl.h>
     75 
     76 #include <sys/tsol/label.h>
     77 
     78 /*
     79  * The hash queues for the access to active and cached rnodes
     80  * are organized as doubly linked lists.  A reader/writer lock
     81  * for each hash bucket is used to control access and to synchronize
     82  * lookups, additions, and deletions from the hash queue.
     83  *
     84  * The rnode freelist is organized as a doubly linked list with
     85  * a head pointer.  Additions and deletions are synchronized via
     86  * a single mutex.
     87  *
     88  * In order to add an rnode to the free list, it must be hashed into
     89  * a hash queue and the exclusive lock to the hash queue be held.
     90  * If an rnode is not hashed into a hash queue, then it is destroyed
     91  * because it represents no valuable information that can be reused
     92  * about the file.  The exclusive lock to the hash queue must be
     93  * held in order to prevent a lookup in the hash queue from finding
     94  * the rnode and using it and assuming that the rnode is not on the
     95  * freelist.  The lookup in the hash queue will have the hash queue
     96  * locked, either exclusive or shared.
     97  *
     98  * The vnode reference count for each rnode is not allowed to drop
     99  * below 1.  This prevents external entities, such as the VM
    100  * subsystem, from acquiring references to vnodes already on the
    101  * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
    103  * rnode is looked up in the hash queues, then either the rnode
    104  * is removed from the freelist and that reference is transferred to
    105  * the new reference or the vnode reference count must be incremented
    106  * accordingly.  The mutex for the freelist must be held in order to
    107  * accurately test to see if the rnode is on the freelist or not.
    108  * The hash queue lock might be held shared and it is possible that
    109  * two different threads may race to remove the rnode from the
    110  * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
    113  * placed on the freelist due to the requirement that the thread
    114  * putting the rnode on the freelist must hold the exclusive lock
    115  * to the hash queue and the thread doing the lookup in the hash
    116  * queue is holding either a shared or exclusive lock to the hash
    117  * queue.
    118  *
    119  * The lock ordering is:
    120  *
    121  *	hash bucket lock -> vnode lock
    122  *	hash bucket lock -> freelist lock
    123  */
/* rnode hash queues; see the locking description in the comment above. */
static rhashq_t *rtable;

/* Mutex and head pointer of the rnode freelist described above. */
static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
/* NOTE(review): presumably a count of rnodes currently allocated - confirm. */
static long rnew = 0;
/* NOTE(review): presumably the configured limit on rnodes - confirm. */
long nrnode = 0;

/*
 * NOTE(review): size and index mask for rtable, and hashlen looks like the
 * target average hash chain length used to derive them - confirm against
 * the initialization code, which is not visible here.
 */
static int rtablesize;
static int rtablemask;

static int hashlen = 4;

/* NOTE(review): presumably the kmem cache rnodes are allocated from. */
static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

/* NOTE(review): presumably size/mask of acache and its entry cache. */
static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;
    159 
    160 /*
    161  * Client side utilities
    162  */
    163 
    164 /*
    165  * client side statistics
    166  */
/*
 * Template of named kstats for the client statistics.  The entries after
 * "cltoomany" exist only on DEBUG kernels.
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system as
 * a whole and do not correspond to any one particular zone.
 */
#ifdef DEBUG
static struct clstat_debug {
	kstat_named_t	nrnode;			/* number of allocated rnodes */
	kstat_named_t	access;			/* size of access cache */
	kstat_named_t	dirent;			/* size of readdir cache */
	kstat_named_t	dirents;		/* size of readdir buf cache */
	kstat_named_t	reclaim;		/* number of reclaims */
	kstat_named_t	clreclaim;		/* number of cl reclaims */
	kstat_named_t	f_reclaim;		/* number of free reclaims */
	kstat_named_t	a_reclaim;		/* number of active reclaims */
	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
	kstat_named_t	rpath;			/* bytes used to store rpaths */
} clstat_debug = {
	/* initializers are positional and must stay in member order */
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },	/* exported name of rpath */
};
#endif	/* DEBUG */
    209 
    210 /*
    211  * We keep a global list of per-zone client data, so we can clean up all zones
    212  * if we get low on memory.
    213  */
static list_t nfs_clnt_list;
/* Held while walking nfs_clnt_list (see clreclaim()). */
static kmutex_t nfs_clnt_list_lock;
/* Zone key used with zone_getspecific() to find a zone's struct nfs_clnt. */
static zone_key_t nfsclnt_zone_key;

/* kmem cache from which struct chtab (client handle) entries are allocated. */
static struct kmem_cache *chtab_cache;

/*
 * Some servers do not properly update the attributes of the
 * directory when changes are made.  To allow interoperability
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter must be set in /etc/system
 */
int nfs_disable_rddir_cache = 0;
    227 
/*
 * Forward declarations.  clget() and clfree() have external linkage;
 * everything else declared static here is private to this file.
 */
int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **);
void		clfree(CLIENT *, struct chtab *);
static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static void	clreclaim(void *);
static int	nfs_feedback(int, int, mntinfo_t *);
static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
		    failinfo_t *);
static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, int, failinfo_t *);
static void	rinactive(rnode_t *, cred_t *);
static int	rtablehash(nfs_fhandle *);
static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
			cred_t *),
		    int (*)(const void *, const void *), int *, cred_t *,
		    char *, char *);
static void	rp_rmfree(rnode_t *);
static void	rp_addhash(rnode_t *);
static void	rp_rmhash_locked(rnode_t *);
static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void	destroy_rnode(rnode_t *);
static void	rddir_cache_free(rddir_cache *);
static int	nfs_free_data_reclaim(rnode_t *);
static int	nfs_active_data_reclaim(rnode_t *);
static int	nfs_free_reclaim(void);
static int	nfs_active_reclaim(void);
static int	nfs_rnode_reclaim(void);
static void	nfs_reclaim(void *);
static int	failover_safe(failinfo_t *);
static void	failover_newserver(mntinfo_t *mi);
static void	failover_thread(mntinfo_t *mi);
static int	failover_wait(mntinfo_t *);
static int	failover_remap(failinfo_t *);
static int	failover_lookup(char *, vnode_t *,
		    int (*)(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *, int),
		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
		    vnode_t **);
static void	nfs_free_r_path(rnode_t *);
static void	nfs_set_vroot(vnode_t *);
static char	*nfs_getsrvnames(mntinfo_t *, size_t *);

/*
 * from rpcsec module (common/rpcsec)
 *
 * In this file sec_clnt_geth() is used to obtain the AUTH handle for a
 * client handle and sec_clnt_freeh() to release it again.
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);

/*
 * used in mount policy
 */
extern ts_label_t *getflabel_cipso(vfs_t *);
    287 
    288 /*
    289  * EIO or EINTR are not recoverable errors.
    290  */
    291 #define	IS_RECOVERABLE_ERROR(error)	!((error == EINTR) || (error == EIO))
    292 
/*
 * Console message formats for an unresponsive server.  The DEBUG variants
 * take the NFS version number (%d) followed by the server name (%s); the
 * non-DEBUG variants take only the server name.
 */
#ifdef DEBUG
#define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
#define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
#else
#define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
#define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
#endif
    300 /*
    301  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
    302  */
    303 static int
    304 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    305     struct chtab **chp, struct nfs_clnt *nfscl)
    306 {
    307 	struct chhead *ch, *newch;
    308 	struct chhead **plistp;
    309 	struct chtab *cp;
    310 	int error;
    311 	k_sigset_t smask;
    312 
    313 	if (newcl == NULL || chp == NULL || ci == NULL)
    314 		return (EINVAL);
    315 
    316 	*newcl = NULL;
    317 	*chp = NULL;
    318 
    319 	/*
    320 	 * Find an unused handle or create one
    321 	 */
    322 	newch = NULL;
    323 	nfscl->nfscl_stat.clgets.value.ui64++;
    324 top:
    325 	/*
    326 	 * Find the correct entry in the cache to check for free
    327 	 * client handles.  The search is based on the RPC program
    328 	 * number, program version number, dev_t for the transport
    329 	 * device, and the protocol family.
    330 	 */
    331 	mutex_enter(&nfscl->nfscl_chtable_lock);
    332 	plistp = &nfscl->nfscl_chtable;
    333 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
    334 		if (ch->ch_prog == ci->cl_prog &&
    335 		    ch->ch_vers == ci->cl_vers &&
    336 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
    337 		    (strcmp(ch->ch_protofmly,
    338 		    svp->sv_knconf->knc_protofmly) == 0))
    339 			break;
    340 		plistp = &ch->ch_next;
    341 	}
    342 
    343 	/*
    344 	 * If we didn't find a cache entry for this quadruple, then
    345 	 * create one.  If we don't have one already preallocated,
    346 	 * then drop the cache lock, create one, and then start over.
    347 	 * If we did have a preallocated entry, then just add it to
    348 	 * the front of the list.
    349 	 */
    350 	if (ch == NULL) {
    351 		if (newch == NULL) {
    352 			mutex_exit(&nfscl->nfscl_chtable_lock);
    353 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
    354 			newch->ch_timesused = 0;
    355 			newch->ch_prog = ci->cl_prog;
    356 			newch->ch_vers = ci->cl_vers;
    357 			newch->ch_dev = svp->sv_knconf->knc_rdev;
    358 			newch->ch_protofmly = kmem_alloc(
    359 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
    360 			    KM_SLEEP);
    361 			(void) strcpy(newch->ch_protofmly,
    362 			    svp->sv_knconf->knc_protofmly);
    363 			newch->ch_list = NULL;
    364 			goto top;
    365 		}
    366 		ch = newch;
    367 		newch = NULL;
    368 		ch->ch_next = nfscl->nfscl_chtable;
    369 		nfscl->nfscl_chtable = ch;
    370 	/*
    371 	 * We found a cache entry, but if it isn't on the front of the
    372 	 * list, then move it to the front of the list to try to take
    373 	 * advantage of locality of operations.
    374 	 */
    375 	} else if (ch != nfscl->nfscl_chtable) {
    376 		*plistp = ch->ch_next;
    377 		ch->ch_next = nfscl->nfscl_chtable;
    378 		nfscl->nfscl_chtable = ch;
    379 	}
    380 
    381 	/*
    382 	 * If there was a free client handle cached, then remove it
    383 	 * from the list, init it, and use it.
    384 	 */
    385 	if (ch->ch_list != NULL) {
    386 		cp = ch->ch_list;
    387 		ch->ch_list = cp->ch_list;
    388 		mutex_exit(&nfscl->nfscl_chtable_lock);
    389 		if (newch != NULL) {
    390 			kmem_free(newch->ch_protofmly,
    391 			    strlen(newch->ch_protofmly) + 1);
    392 			kmem_free(newch, sizeof (*newch));
    393 		}
    394 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
    395 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
    396 		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
    397 		    &cp->ch_client->cl_auth);
    398 		if (error || cp->ch_client->cl_auth == NULL) {
    399 			CLNT_DESTROY(cp->ch_client);
    400 			kmem_cache_free(chtab_cache, cp);
    401 			return ((error != 0) ? error : EINTR);
    402 		}
    403 		ch->ch_timesused++;
    404 		*newcl = cp->ch_client;
    405 		*chp = cp;
    406 		return (0);
    407 	}
    408 
    409 	/*
    410 	 * There weren't any free client handles which fit, so allocate
    411 	 * a new one and use that.
    412 	 */
    413 #ifdef DEBUG
    414 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
    415 #endif
    416 	mutex_exit(&nfscl->nfscl_chtable_lock);
    417 
    418 	nfscl->nfscl_stat.cltoomany.value.ui64++;
    419 	if (newch != NULL) {
    420 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
    421 		kmem_free(newch, sizeof (*newch));
    422 	}
    423 
    424 	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
    425 	cp->ch_head = ch;
    426 
    427 	sigintr(&smask, (int)ci->cl_flags & MI_INT);
    428 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
    429 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
    430 	sigunintr(&smask);
    431 
    432 	if (error != 0) {
    433 		kmem_cache_free(chtab_cache, cp);
    434 #ifdef DEBUG
    435 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
    436 #endif
    437 		/*
    438 		 * Warning is unnecessary if error is EINTR.
    439 		 */
    440 		if (error != EINTR) {
    441 			nfs_cmn_err(error, CE_WARN,
    442 			    "clget: couldn't create handle: %m\n");
    443 		}
    444 		return (error);
    445 	}
    446 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
    447 	auth_destroy(cp->ch_client->cl_auth);
    448 	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
    449 	    &cp->ch_client->cl_auth);
    450 	if (error || cp->ch_client->cl_auth == NULL) {
    451 		CLNT_DESTROY(cp->ch_client);
    452 		kmem_cache_free(chtab_cache, cp);
    453 #ifdef DEBUG
    454 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
    455 #endif
    456 		return ((error != 0) ? error : EINTR);
    457 	}
    458 	ch->ch_timesused++;
    459 	*newcl = cp->ch_client;
    460 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
    461 	*chp = cp;
    462 	return (0);
    463 }
    464 
    465 int
    466 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    467     struct chtab **chp)
    468 {
    469 	struct nfs_clnt *nfscl;
    470 
    471 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
    472 	ASSERT(nfscl != NULL);
    473 
    474 	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
    475 }
    476 
    477 static int
    478 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    479     struct chtab **chp, struct nfs_clnt *nfscl)
    480 {
    481 	clinfo_t ci;
    482 	int error;
    483 
    484 	/*
    485 	 * Set read buffer size to rsize
    486 	 * and add room for RPC headers.
    487 	 */
    488 	ci.cl_readsize = mi->mi_tsize;
    489 	if (ci.cl_readsize != 0)
    490 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
    491 
    492 	/*
    493 	 * If soft mount and server is down just try once.
    494 	 * meaning: do not retransmit.
    495 	 */
    496 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
    497 		ci.cl_retrans = 0;
    498 	else
    499 		ci.cl_retrans = mi->mi_retrans;
    500 
    501 	ci.cl_prog = NFS_ACL_PROGRAM;
    502 	ci.cl_vers = mi->mi_vers;
    503 	ci.cl_flags = mi->mi_flags;
    504 
    505 	/*
    506 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
    507 	 * security flavor, the client tries to establish a security context
    508 	 * by contacting the server. If the connection is timed out or reset,
    509 	 * e.g. server reboot, we will try again.
    510 	 */
    511 	do {
    512 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
    513 
    514 		if (error == 0)
    515 			break;
    516 
    517 		/*
    518 		 * For forced unmount or zone shutdown, bail out, no retry.
    519 		 */
    520 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
    521 			error = EIO;
    522 			break;
    523 		}
    524 
    525 		/* do not retry for softmount */
    526 		if (!(mi->mi_flags & MI_HARD))
    527 			break;
    528 
    529 		/* let the caller deal with the failover case */
    530 		if (FAILOVER_MOUNT(mi))
    531 			break;
    532 
    533 	} while (error == ETIMEDOUT || error == ECONNRESET);
    534 
    535 	return (error);
    536 }
    537 
    538 static int
    539 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    540     struct chtab **chp, struct nfs_clnt *nfscl)
    541 {
    542 	clinfo_t ci;
    543 	int error;
    544 
    545 	/*
    546 	 * Set read buffer size to rsize
    547 	 * and add room for RPC headers.
    548 	 */
    549 	ci.cl_readsize = mi->mi_tsize;
    550 	if (ci.cl_readsize != 0)
    551 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
    552 
    553 	/*
    554 	 * If soft mount and server is down just try once.
    555 	 * meaning: do not retransmit.
    556 	 */
    557 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
    558 		ci.cl_retrans = 0;
    559 	else
    560 		ci.cl_retrans = mi->mi_retrans;
    561 
    562 	ci.cl_prog = mi->mi_prog;
    563 	ci.cl_vers = mi->mi_vers;
    564 	ci.cl_flags = mi->mi_flags;
    565 
    566 	/*
    567 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
    568 	 * security flavor, the client tries to establish a security context
    569 	 * by contacting the server. If the connection is timed out or reset,
    570 	 * e.g. server reboot, we will try again.
    571 	 */
    572 	do {
    573 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
    574 
    575 		if (error == 0)
    576 			break;
    577 
    578 		/*
    579 		 * For forced unmount or zone shutdown, bail out, no retry.
    580 		 */
    581 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
    582 			error = EIO;
    583 			break;
    584 		}
    585 
    586 		/* do not retry for softmount */
    587 		if (!(mi->mi_flags & MI_HARD))
    588 			break;
    589 
    590 		/* let the caller deal with the failover case */
    591 		if (FAILOVER_MOUNT(mi))
    592 			break;
    593 
    594 	} while (error == ETIMEDOUT || error == ECONNRESET);
    595 
    596 	return (error);
    597 }
    598 
    599 static void
    600 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
    601 {
    602 	if (cl->cl_auth != NULL) {
    603 		sec_clnt_freeh(cl->cl_auth);
    604 		cl->cl_auth = NULL;
    605 	}
    606 
    607 	/*
    608 	 * Timestamp this cache entry so that we know when it was last
    609 	 * used.
    610 	 */
    611 	cp->ch_freed = gethrestime_sec();
    612 
    613 	/*
    614 	 * Add the free client handle to the front of the list.
    615 	 * This way, the list will be sorted in youngest to oldest
    616 	 * order.
    617 	 */
    618 	mutex_enter(&nfscl->nfscl_chtable_lock);
    619 	cp->ch_list = cp->ch_head->ch_list;
    620 	cp->ch_head->ch_list = cp;
    621 	mutex_exit(&nfscl->nfscl_chtable_lock);
    622 }
    623 
    624 void
    625 clfree(CLIENT *cl, struct chtab *cp)
    626 {
    627 	struct nfs_clnt *nfscl;
    628 
    629 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
    630 	ASSERT(nfscl != NULL);
    631 
    632 	clfree_impl(cl, cp, nfscl);
    633 }
    634 
    635 #define	CL_HOLDTIME	60	/* time to hold client handles */
    636 
    637 static void
    638 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
    639 {
    640 	struct chhead *ch;
    641 	struct chtab *cp;	/* list of objects that can be reclaimed */
    642 	struct chtab *cpe;
    643 	struct chtab *cpl;
    644 	struct chtab **cpp;
    645 #ifdef DEBUG
    646 	int n = 0;
    647 #endif
    648 
    649 	/*
    650 	 * Need to reclaim some memory, so step through the cache
    651 	 * looking through the lists for entries which can be freed.
    652 	 */
    653 	cp = NULL;
    654 
    655 	mutex_enter(&nfscl->nfscl_chtable_lock);
    656 
    657 	/*
    658 	 * Here we step through each non-NULL quadruple and start to
    659 	 * construct the reclaim list pointed to by cp.  Note that
    660 	 * cp will contain all eligible chtab entries.  When this traversal
    661 	 * completes, chtab entries from the last quadruple will be at the
    662 	 * front of cp and entries from previously inspected quadruples have
    663 	 * been appended to the rear of cp.
    664 	 */
    665 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
    666 		if (ch->ch_list == NULL)
    667 			continue;
    668 		/*
    669 		 * Search each list for entries older then
    670 		 * cl_holdtime seconds.  The lists are maintained
    671 		 * in youngest to oldest order so that when the
    672 		 * first entry is found which is old enough, then
    673 		 * all of the rest of the entries on the list will
    674 		 * be old enough as well.
    675 		 */
    676 		cpl = ch->ch_list;
    677 		cpp = &ch->ch_list;
    678 		while (cpl != NULL &&
    679 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
    680 			cpp = &cpl->ch_list;
    681 			cpl = cpl->ch_list;
    682 		}
    683 		if (cpl != NULL) {
    684 			*cpp = NULL;
    685 			if (cp != NULL) {
    686 				cpe = cpl;
    687 				while (cpe->ch_list != NULL)
    688 					cpe = cpe->ch_list;
    689 				cpe->ch_list = cp;
    690 			}
    691 			cp = cpl;
    692 		}
    693 	}
    694 
    695 	mutex_exit(&nfscl->nfscl_chtable_lock);
    696 
    697 	/*
    698 	 * If cp is empty, then there is nothing to reclaim here.
    699 	 */
    700 	if (cp == NULL)
    701 		return;
    702 
    703 	/*
    704 	 * Step through the list of entries to free, destroying each client
    705 	 * handle and kmem_free'ing the memory for each entry.
    706 	 */
    707 	while (cp != NULL) {
    708 #ifdef DEBUG
    709 		n++;
    710 #endif
    711 		CLNT_DESTROY(cp->ch_client);
    712 		cpl = cp->ch_list;
    713 		kmem_cache_free(chtab_cache, cp);
    714 		cp = cpl;
    715 	}
    716 
    717 #ifdef DEBUG
    718 	/*
    719 	 * Update clalloc so that nfsstat shows the current number
    720 	 * of allocated client handles.
    721 	 */
    722 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
    723 #endif
    724 }
    725 
    726 /* ARGSUSED */
    727 static void
    728 clreclaim(void *all)
    729 {
    730 	struct nfs_clnt *nfscl;
    731 
    732 #ifdef DEBUG
    733 	clstat_debug.clreclaim.value.ui64++;
    734 #endif
    735 	/*
    736 	 * The system is low on memory; go through and try to reclaim some from
    737 	 * every zone on the system.
    738 	 */
    739 	mutex_enter(&nfs_clnt_list_lock);
    740 	nfscl = list_head(&nfs_clnt_list);
    741 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
    742 		clreclaim_zone(nfscl, CL_HOLDTIME);
    743 	mutex_exit(&nfs_clnt_list_lock);
    744 }
    745 
/*
 * Minimum time-out values indexed by call type
 * These units are in "eighths" of a second to avoid multiplies
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/*
 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
 *
 * backoff(tim) doubles tim, capping the result at MAXTIMO; a tim that is
 * already at or above MAXTIMO is returned unchanged.  Both macros evaluate
 * their argument more than once, so it must not have side effects.
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))

#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
    764 
    765 /*
    766  * Function called when rfscall notices that we have been
    767  * re-transmitting, or when we get a response without retransmissions.
    768  * Return 1 if the transfer size was adjusted down - 0 if no change.
    769  */
    770 static int
    771 nfs_feedback(int flag, int which, mntinfo_t *mi)
    772 {
    773 	int kind;
    774 	int r = 0;
    775 
    776 	mutex_enter(&mi->mi_lock);
    777 	if (flag == FEEDBACK_REXMIT1) {
    778 		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
    779 		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
    780 			goto done;
    781 		if (mi->mi_curread > MIN_NFS_TSIZE) {
    782 			mi->mi_curread /= 2;
    783 			if (mi->mi_curread < MIN_NFS_TSIZE)
    784 				mi->mi_curread = MIN_NFS_TSIZE;
    785 			r = 1;
    786 		}
    787 
    788 		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
    789 			mi->mi_curwrite /= 2;
    790 			if (mi->mi_curwrite < MIN_NFS_TSIZE)
    791 				mi->mi_curwrite = MIN_NFS_TSIZE;
    792 			r = 1;
    793 		}
    794 	} else if (flag == FEEDBACK_OK) {
    795 		kind = mi->mi_timer_type[which];
    796 		if (kind == 0 ||
    797 		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
    798 			goto done;
    799 		if (kind == 1) {
    800 			if (mi->mi_curread >= mi->mi_tsize)
    801 				goto done;
    802 			mi->mi_curread +=  MIN_NFS_TSIZE;
    803 			if (mi->mi_curread > mi->mi_tsize/2)
    804 				mi->mi_curread = mi->mi_tsize;
    805 		} else if (kind == 2) {
    806 			if (mi->mi_curwrite >= mi->mi_stsize)
    807 				goto done;
    808 			mi->mi_curwrite += MIN_NFS_TSIZE;
    809 			if (mi->mi_curwrite > mi->mi_stsize/2)
    810 				mi->mi_curwrite = mi->mi_stsize;
    811 		}
    812 	}
    813 done:
    814 	mutex_exit(&mi->mi_lock);
    815 	return (r);
    816 }
    817 
    818 #ifdef DEBUG
    819 static int rfs2call_hits = 0;
    820 static int rfs2call_misses = 0;
    821 #endif
    822 
    823 int
    824 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    825     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    826     enum nfsstat *statusp, int flags, failinfo_t *fi)
    827 {
    828 	int rpcerror;
    829 	enum clnt_stat rpc_status;
    830 
    831 	ASSERT(statusp != NULL);
    832 
    833 	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
    834 	    cr, douprintf, &rpc_status, flags, fi);
    835 	if (!rpcerror) {
    836 		/*
    837 		 * See crnetadjust() for comments.
    838 		 */
    839 		if (*statusp == NFSERR_ACCES &&
    840 		    (cr = crnetadjust(cr)) != NULL) {
    841 #ifdef DEBUG
    842 			rfs2call_hits++;
    843 #endif
    844 			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
    845 			    resp, cr, douprintf, NULL, flags, fi);
    846 			crfree(cr);
    847 #ifdef DEBUG
    848 			if (*statusp == NFSERR_ACCES)
    849 				rfs2call_misses++;
    850 #endif
    851 		}
    852 	} else if (rpc_status == RPC_PROCUNAVAIL) {
    853 		*statusp = NFSERR_OPNOTSUPP;
    854 		rpcerror = 0;
    855 	}
    856 
    857 	return (rpcerror);
    858 }
    859 
    860 #define	NFS3_JUKEBOX_DELAY	10 * hz
    861 
    862 static clock_t nfs3_jukebox_delay = 0;
    863 
    864 #ifdef DEBUG
    865 static int rfs3call_hits = 0;
    866 static int rfs3call_misses = 0;
    867 #endif
    868 
    869 int
    870 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    871     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    872     nfsstat3 *statusp, int flags, failinfo_t *fi)
    873 {
    874 	int rpcerror;
    875 	int user_informed;
    876 
    877 	user_informed = 0;
    878 	do {
    879 		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
    880 		    cr, douprintf, NULL, flags, fi);
    881 		if (!rpcerror) {
    882 			cred_t *crr;
    883 			if (*statusp == NFS3ERR_JUKEBOX) {
    884 				if (ttoproc(curthread) == &p0) {
    885 					rpcerror = EAGAIN;
    886 					break;
    887 				}
    888 				if (!user_informed) {
    889 					user_informed = 1;
    890 					uprintf(
    891 		"file temporarily unavailable on the server, retrying...\n");
    892 				}
    893 				delay(nfs3_jukebox_delay);
    894 			}
    895 			/*
    896 			 * See crnetadjust() for comments.
    897 			 */
    898 			else if (*statusp == NFS3ERR_ACCES &&
    899 			    (crr = crnetadjust(cr)) != NULL) {
    900 #ifdef DEBUG
    901 				rfs3call_hits++;
    902 #endif
    903 				rpcerror = rfscall(mi, which, xdrargs, argsp,
    904 				    xdrres, resp, crr, douprintf,
    905 				    NULL, flags, fi);
    906 
    907 				crfree(crr);
    908 #ifdef DEBUG
    909 				if (*statusp == NFS3ERR_ACCES)
    910 					rfs3call_misses++;
    911 #endif
    912 			}
    913 		}
    914 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
    915 
    916 	return (rpcerror);
    917 }
    918 
    919 #define	VALID_FH(fi)	(VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
    920 #define	INC_READERS(mi)		{ \
    921 	mi->mi_readers++; \
    922 }
    923 #define	DEC_READERS(mi)		{ \
    924 	mi->mi_readers--; \
    925 	if (mi->mi_readers == 0) \
    926 		cv_broadcast(&mi->mi_failover_cv); \
    927 }
    928 
    929 static int
    930 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    931     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
    932     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
    933 {
    934 	CLIENT *client;
    935 	struct chtab *ch;
    936 	cred_t *cr = icr;
    937 	enum clnt_stat status;
    938 	struct rpc_err rpcerr, rpcerr_tmp;
    939 	struct timeval wait;
    940 	int timeo;		/* in units of hz */
    941 	int my_rsize, my_wsize;
    942 	bool_t tryagain;
    943 	bool_t cred_cloned = FALSE;
    944 	k_sigset_t smask;
    945 	servinfo_t *svp;
    946 	struct nfs_clnt *nfscl;
    947 	zoneid_t zoneid = getzoneid();
    948 	char *msg;
    949 #ifdef DEBUG
    950 	char *bufp;
    951 #endif
    952 
    953 
    954 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
    955 	    "rfscall_start:which %d mi %p", which, mi);
    956 
    957 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
    958 	ASSERT(nfscl != NULL);
    959 
    960 	nfscl->nfscl_stat.calls.value.ui64++;
    961 	mi->mi_reqs[which].value.ui64++;
    962 
    963 	rpcerr.re_status = RPC_SUCCESS;
    964 
    965 	/*
    966 	 * In case of forced unmount or zone shutdown, return EIO.
    967 	 */
    968 
    969 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
    970 		rpcerr.re_status = RPC_FAILED;
    971 		rpcerr.re_errno = EIO;
    972 		return (rpcerr.re_errno);
    973 	}
    974 
    975 	/*
    976 	 * Remember the transfer sizes in case
    977 	 * nfs_feedback changes them underneath us.
    978 	 */
    979 	my_rsize = mi->mi_curread;
    980 	my_wsize = mi->mi_curwrite;
    981 
    982 	/*
    983 	 * NFS client failover support
    984 	 *
    985 	 * If this rnode is not in sync with the current server (VALID_FH),
    986 	 * we'd like to do a remap to get in sync.  We can be interrupted
    987 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
    988 	 * use the best info we have to try the RPC.  Part of that is
    989 	 * unconditionally updating the filehandle copy kept for V3.
    990 	 *
    991 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
    992 	 * rw_enter(); we're trying to keep the current server from being
    993 	 * changed on us until we're done with the remapping and have a
    994 	 * matching client handle.  We don't want to sending a filehandle
    995 	 * to the wrong host.
    996 	 */
    997 failoverretry:
    998 	if (FAILOVER_MOUNT(mi)) {
    999 		mutex_enter(&mi->mi_lock);
   1000 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
   1001 			if (failover_wait(mi)) {
   1002 				mutex_exit(&mi->mi_lock);
   1003 				return (EINTR);
   1004 			}
   1005 		}
   1006 		INC_READERS(mi);
   1007 		mutex_exit(&mi->mi_lock);
   1008 		if (fi) {
   1009 			if (!VALID_FH(fi) &&
   1010 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
   1011 				int remaperr;
   1012 
   1013 				svp = mi->mi_curr_serv;
   1014 				remaperr = failover_remap(fi);
   1015 				if (remaperr != 0) {
   1016 #ifdef DEBUG
   1017 					if (remaperr != EINTR)
   1018 						nfs_cmn_err(remaperr, CE_WARN,
   1019 					    "rfscall couldn't failover: %m");
   1020 #endif
   1021 					mutex_enter(&mi->mi_lock);
   1022 					DEC_READERS(mi);
   1023 					mutex_exit(&mi->mi_lock);
   1024 					/*
   1025 					 * If failover_remap returns ETIMEDOUT
   1026 					 * and the filesystem is hard mounted
   1027 					 * we have to retry the call with a new
   1028 					 * server.
   1029 					 */
   1030 					if ((mi->mi_flags & MI_HARD) &&
   1031 					    IS_RECOVERABLE_ERROR(remaperr)) {
   1032 						if (svp == mi->mi_curr_serv)
   1033 							failover_newserver(mi);
   1034 						rpcerr.re_status = RPC_SUCCESS;
   1035 						goto failoverretry;
   1036 					}
   1037 					rpcerr.re_errno = remaperr;
   1038 					return (remaperr);
   1039 				}
   1040 			}
   1041 			if (fi->fhp && fi->copyproc)
   1042 				(*fi->copyproc)(fi->fhp, fi->vp);
   1043 		}
   1044 	}
   1045 
   1046 	/* For TSOL, use a new cred which has net_mac_aware flag */
   1047 	if (!cred_cloned && is_system_labeled()) {
   1048 		cred_cloned = TRUE;
   1049 		cr = crdup(icr);
   1050 		(void) setpflags(NET_MAC_AWARE, 1, cr);
   1051 	}
   1052 
   1053 	/*
   1054 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
   1055 	 * are guaranteed to reprocess the retry as a new request.
   1056 	 */
   1057 	svp = mi->mi_curr_serv;
   1058 	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
   1059 
   1060 	if (FAILOVER_MOUNT(mi)) {
   1061 		mutex_enter(&mi->mi_lock);
   1062 		DEC_READERS(mi);
   1063 		mutex_exit(&mi->mi_lock);
   1064 
   1065 		if ((rpcerr.re_errno == ETIMEDOUT ||
   1066 		    rpcerr.re_errno == ECONNRESET) &&
   1067 		    failover_safe(fi)) {
   1068 			if (svp == mi->mi_curr_serv)
   1069 				failover_newserver(mi);
   1070 			goto failoverretry;
   1071 		}
   1072 	}
   1073 	if (rpcerr.re_errno != 0)
   1074 		return (rpcerr.re_errno);
   1075 
   1076 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
   1077 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
   1078 		timeo = (mi->mi_timeo * hz) / 10;
   1079 	} else {
   1080 		mutex_enter(&mi->mi_lock);
   1081 		timeo = CLNT_SETTIMERS(client,
   1082 		    &(mi->mi_timers[mi->mi_timer_type[which]]),
   1083 		    &(mi->mi_timers[NFS_CALLTYPES]),
   1084 		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
   1085 		    (void (*)())NULL, (caddr_t)mi, 0);
   1086 		mutex_exit(&mi->mi_lock);
   1087 	}
   1088 
   1089 	/*
   1090 	 * If hard mounted fs, retry call forever unless hard error occurs.
   1091 	 */
   1092 	do {
   1093 		tryagain = FALSE;
   1094 
   1095 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
   1096 			status = RPC_FAILED;
   1097 			rpcerr.re_status = RPC_FAILED;
   1098 			rpcerr.re_errno = EIO;
   1099 			break;
   1100 		}
   1101 
   1102 		TICK_TO_TIMEVAL(timeo, &wait);
   1103 
   1104 		/*
   1105 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
   1106 		 * and SIGTERM. (Preserving the existing masks).
   1107 		 * Mask out SIGINT if mount option nointr is specified.
   1108 		 */
   1109 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
   1110 		if (!(mi->mi_flags & MI_INT))
   1111 			client->cl_nosignal = TRUE;
   1112 
   1113 		/*
   1114 		 * If there is a current signal, then don't bother
   1115 		 * even trying to send out the request because we
   1116 		 * won't be able to block waiting for the response.
   1117 		 * Simply assume RPC_INTR and get on with it.
   1118 		 */
   1119 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
   1120 			status = RPC_INTR;
   1121 		else {
   1122 			status = CLNT_CALL(client, which, xdrargs, argsp,
   1123 			    xdrres, resp, wait);
   1124 		}
   1125 
   1126 		if (!(mi->mi_flags & MI_INT))
   1127 			client->cl_nosignal = FALSE;
   1128 		/*
   1129 		 * restore original signal mask
   1130 		 */
   1131 		sigunintr(&smask);
   1132 
   1133 		switch (status) {
   1134 		case RPC_SUCCESS:
   1135 			if ((mi->mi_flags & MI_DYNAMIC) &&
   1136 			    mi->mi_timer_type[which] != 0 &&
   1137 			    (mi->mi_curread != my_rsize ||
   1138 			    mi->mi_curwrite != my_wsize))
   1139 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
   1140 			break;
   1141 
   1142 		case RPC_INTR:
   1143 			/*
   1144 			 * There is no way to recover from this error,
   1145 			 * even if mount option nointr is specified.
   1146 			 * SIGKILL, for example, cannot be blocked.
   1147 			 */
   1148 			rpcerr.re_status = RPC_INTR;
   1149 			rpcerr.re_errno = EINTR;
   1150 			break;
   1151 
   1152 		case RPC_UDERROR:
   1153 			/*
   1154 			 * If the NFS server is local (vold) and
   1155 			 * it goes away then we get RPC_UDERROR.
   1156 			 * This is a retryable error, so we would
   1157 			 * loop, so check to see if the specific
   1158 			 * error was ECONNRESET, indicating that
   1159 			 * target did not exist at all.  If so,
   1160 			 * return with RPC_PROGUNAVAIL and
   1161 			 * ECONNRESET to indicate why.
   1162 			 */
   1163 			CLNT_GETERR(client, &rpcerr);
   1164 			if (rpcerr.re_errno == ECONNRESET) {
   1165 				rpcerr.re_status = RPC_PROGUNAVAIL;
   1166 				rpcerr.re_errno = ECONNRESET;
   1167 				break;
   1168 			}
   1169 			/*FALLTHROUGH*/
   1170 
   1171 		default:		/* probably RPC_TIMEDOUT */
   1172 			if (IS_UNRECOVERABLE_RPC(status))
   1173 				break;
   1174 
   1175 			/*
   1176 			 * increment server not responding count
   1177 			 */
   1178 			mutex_enter(&mi->mi_lock);
   1179 			mi->mi_noresponse++;
   1180 			mutex_exit(&mi->mi_lock);
   1181 #ifdef DEBUG
   1182 			nfscl->nfscl_stat.noresponse.value.ui64++;
   1183 #endif
   1184 
   1185 			if (!(mi->mi_flags & MI_HARD)) {
   1186 				if (!(mi->mi_flags & MI_SEMISOFT) ||
   1187 				    (mi->mi_ss_call_type[which] == 0))
   1188 					break;
   1189 			}
   1190 
   1191 			/*
   1192 			 * The call is in progress (over COTS).
   1193 			 * Try the CLNT_CALL again, but don't
   1194 			 * print a noisy error message.
   1195 			 */
   1196 			if (status == RPC_INPROGRESS) {
   1197 				tryagain = TRUE;
   1198 				break;
   1199 			}
   1200 
   1201 			if (flags & RFSCALL_SOFT)
   1202 				break;
   1203 
   1204 			/*
   1205 			 * On zone shutdown, just move on.
   1206 			 */
   1207 			if (zone_status_get(curproc->p_zone) >=
   1208 			    ZONE_IS_SHUTTING_DOWN) {
   1209 				rpcerr.re_status = RPC_FAILED;
   1210 				rpcerr.re_errno = EIO;
   1211 				break;
   1212 			}
   1213 
   1214 			/*
   1215 			 * NFS client failover support
   1216 			 *
   1217 			 * If the current server just failed us, we'll
   1218 			 * start the process of finding a new server.
   1219 			 * After that, we can just retry.
   1220 			 */
   1221 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
   1222 				if (svp == mi->mi_curr_serv)
   1223 					failover_newserver(mi);
   1224 				clfree_impl(client, ch, nfscl);
   1225 				goto failoverretry;
   1226 			}
   1227 
   1228 			tryagain = TRUE;
   1229 			timeo = backoff(timeo);
   1230 
   1231 			CLNT_GETERR(client, &rpcerr_tmp);
   1232 			if ((status == RPC_CANTSEND) &&
   1233 			    (rpcerr_tmp.re_errno == ENOBUFS))
   1234 				msg = SRV_QFULL_MSG;
   1235 			else
   1236 				msg = SRV_NOTRESP_MSG;
   1237 
   1238 			mutex_enter(&mi->mi_lock);
   1239 			if (!(mi->mi_flags & MI_PRINTED)) {
   1240 				mi->mi_flags |= MI_PRINTED;
   1241 				mutex_exit(&mi->mi_lock);
   1242 #ifdef DEBUG
   1243 				zprintf(zoneid, msg, mi->mi_vers,
   1244 				    svp->sv_hostname);
   1245 #else
   1246 				zprintf(zoneid, msg, svp->sv_hostname);
   1247 #endif
   1248 			} else
   1249 				mutex_exit(&mi->mi_lock);
   1250 			if (*douprintf && nfs_has_ctty()) {
   1251 				*douprintf = 0;
   1252 				if (!(mi->mi_flags & MI_NOPRINT))
   1253 #ifdef DEBUG
   1254 					uprintf(msg, mi->mi_vers,
   1255 					    svp->sv_hostname);
   1256 #else
   1257 					uprintf(msg, svp->sv_hostname);
   1258 #endif
   1259 			}
   1260 
   1261 			/*
   1262 			 * If doing dynamic adjustment of transfer
   1263 			 * size and if it's a read or write call
   1264 			 * and if the transfer size changed while
   1265 			 * retransmitting or if the feedback routine
   1266 			 * changed the transfer size,
   1267 			 * then exit rfscall so that the transfer
   1268 			 * size can be adjusted at the vnops level.
   1269 			 */
   1270 			if ((mi->mi_flags & MI_DYNAMIC) &&
   1271 			    mi->mi_timer_type[which] != 0 &&
   1272 			    (mi->mi_curread != my_rsize ||
   1273 			    mi->mi_curwrite != my_wsize ||
   1274 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
   1275 				/*
   1276 				 * On read or write calls, return
   1277 				 * back to the vnode ops level if
   1278 				 * the transfer size changed.
   1279 				 */
   1280 				clfree_impl(client, ch, nfscl);
   1281 				if (cred_cloned)
   1282 					crfree(cr);
   1283 				return (ENFS_TRYAGAIN);
   1284 			}
   1285 		}
   1286 	} while (tryagain);
   1287 
   1288 	if (status != RPC_SUCCESS) {
   1289 		/*
   1290 		 * Let soft mounts use the timed out message.
   1291 		 */
   1292 		if (status == RPC_INPROGRESS)
   1293 			status = RPC_TIMEDOUT;
   1294 		nfscl->nfscl_stat.badcalls.value.ui64++;
   1295 		if (status != RPC_INTR) {
   1296 			mutex_enter(&mi->mi_lock);
   1297 			mi->mi_flags |= MI_DOWN;
   1298 			mutex_exit(&mi->mi_lock);
   1299 			CLNT_GETERR(client, &rpcerr);
   1300 #ifdef DEBUG
   1301 			bufp = clnt_sperror(client, svp->sv_hostname);
   1302 			zprintf(zoneid, "NFS%d %s failed for %s\n",
   1303 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
   1304 			if (nfs_has_ctty()) {
   1305 				if (!(mi->mi_flags & MI_NOPRINT)) {
   1306 					uprintf("NFS%d %s failed for %s\n",
   1307 					    mi->mi_vers, mi->mi_rfsnames[which],
   1308 					    bufp);
   1309 				}
   1310 			}
   1311 			kmem_free(bufp, MAXPATHLEN);
   1312 #else
   1313 			zprintf(zoneid,
   1314 			    "NFS %s failed for server %s: error %d (%s)\n",
   1315 			    mi->mi_rfsnames[which], svp->sv_hostname,
   1316 			    status, clnt_sperrno(status));
   1317 			if (nfs_has_ctty()) {
   1318 				if (!(mi->mi_flags & MI_NOPRINT)) {
   1319 					uprintf(
   1320 				"NFS %s failed for server %s: error %d (%s)\n",
   1321 					    mi->mi_rfsnames[which],
   1322 					    svp->sv_hostname, status,
   1323 					    clnt_sperrno(status));
   1324 				}
   1325 			}
   1326 #endif
   1327 			/*
   1328 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
   1329 			 * re_errno is set appropriately depending on
   1330 			 * the authentication error
   1331 			 */
   1332 			if (status == RPC_VERSMISMATCH ||
   1333 			    status == RPC_PROGVERSMISMATCH)
   1334 				rpcerr.re_errno = EIO;
   1335 		}
   1336 	} else {
   1337 		/*
   1338 		 * Test the value of mi_down and mi_printed without
   1339 		 * holding the mi_lock mutex.  If they are both zero,
   1340 		 * then it is okay to skip the down and printed
   1341 		 * processing.  This saves on a mutex_enter and
   1342 		 * mutex_exit pair for a normal, successful RPC.
   1343 		 * This was just complete overhead.
   1344 		 */
   1345 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
   1346 			mutex_enter(&mi->mi_lock);
   1347 			mi->mi_flags &= ~MI_DOWN;
   1348 			if (mi->mi_flags & MI_PRINTED) {
   1349 				mi->mi_flags &= ~MI_PRINTED;
   1350 				mutex_exit(&mi->mi_lock);
   1351 #ifdef DEBUG
   1352 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
   1353 				zprintf(zoneid, "NFS%d server %s ok\n",
   1354 				    mi->mi_vers, svp->sv_hostname);
   1355 #else
   1356 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
   1357 				zprintf(zoneid, "NFS server %s ok\n",
   1358 				    svp->sv_hostname);
   1359 #endif
   1360 			} else
   1361 				mutex_exit(&mi->mi_lock);
   1362 		}
   1363 
   1364 		if (*douprintf == 0) {
   1365 			if (!(mi->mi_flags & MI_NOPRINT))
   1366 #ifdef DEBUG
   1367 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
   1368 					uprintf("NFS%d server %s ok\n",
   1369 					    mi->mi_vers, svp->sv_hostname);
   1370 #else
   1371 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
   1372 				uprintf("NFS server %s ok\n", svp->sv_hostname);
   1373 #endif
   1374 			*douprintf = 1;
   1375 		}
   1376 	}
   1377 
   1378 	clfree_impl(client, ch, nfscl);
   1379 	if (cred_cloned)
   1380 		crfree(cr);
   1381 
   1382 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
   1383 
   1384 	if (rpc_status != NULL)
   1385 		*rpc_status = rpcerr.re_status;
   1386 
   1387 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
   1388 	    rpcerr.re_errno);
   1389 
   1390 	return (rpcerr.re_errno);
   1391 }
   1392 
   1393 #ifdef DEBUG
   1394 static int acl2call_hits = 0;
   1395 static int acl2call_misses = 0;
   1396 #endif
   1397 
   1398 int
   1399 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
   1400     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
   1401     enum nfsstat *statusp, int flags, failinfo_t *fi)
   1402 {
   1403 	int rpcerror;
   1404 
   1405 	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
   1406 	    cr, douprintf, flags, fi);
   1407 	if (!rpcerror) {
   1408 		/*
   1409 		 * See comments with crnetadjust().
   1410 		 */
   1411 		if (*statusp == NFSERR_ACCES &&
   1412 		    (cr = crnetadjust(cr)) != NULL) {
   1413 #ifdef DEBUG
   1414 			acl2call_hits++;
   1415 #endif
   1416 			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
   1417 			    resp, cr, douprintf, flags, fi);
   1418 			crfree(cr);
   1419 #ifdef DEBUG
   1420 			if (*statusp == NFSERR_ACCES)
   1421 				acl2call_misses++;
   1422 #endif
   1423 		}
   1424 	}
   1425 
   1426 	return (rpcerror);
   1427 }
   1428 
   1429 #ifdef DEBUG
   1430 static int acl3call_hits = 0;
   1431 static int acl3call_misses = 0;
   1432 #endif
   1433 
   1434 int
   1435 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
   1436     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
   1437     nfsstat3 *statusp, int flags, failinfo_t *fi)
   1438 {
   1439 	int rpcerror;
   1440 	int user_informed;
   1441 
   1442 	user_informed = 0;
   1443 
   1444 	do {
   1445 		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
   1446 		    cr, douprintf, flags, fi);
   1447 		if (!rpcerror) {
   1448 			cred_t *crr;
   1449 			if (*statusp == NFS3ERR_JUKEBOX) {
   1450 				if (!user_informed) {
   1451 					user_informed = 1;
   1452 					uprintf(
   1453 		"file temporarily unavailable on the server, retrying...\n");
   1454 				}
   1455 				delay(nfs3_jukebox_delay);
   1456 			}
   1457 			/*
   1458 			 * See crnetadjust() for comments.
   1459 			 */
   1460 			else if (*statusp == NFS3ERR_ACCES &&
   1461 			    (crr = crnetadjust(cr)) != NULL) {
   1462 #ifdef DEBUG
   1463 				acl3call_hits++;
   1464 #endif
   1465 				rpcerror = aclcall(mi, which, xdrargs, argsp,
   1466 				    xdrres, resp, crr, douprintf, flags, fi);
   1467 
   1468 				crfree(crr);
   1469 #ifdef DEBUG
   1470 				if (*statusp == NFS3ERR_ACCES)
   1471 					acl3call_misses++;
   1472 #endif
   1473 			}
   1474 		}
   1475 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
   1476 
   1477 	return (rpcerror);
   1478 }
   1479 
   1480 static int
   1481 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
   1482     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
   1483     int flags, failinfo_t *fi)
   1484 {
   1485 	CLIENT *client;
   1486 	struct chtab *ch;
   1487 	cred_t *cr = icr;
   1488 	bool_t cred_cloned = FALSE;
   1489 	enum clnt_stat status;
   1490 	struct rpc_err rpcerr;
   1491 	struct timeval wait;
   1492 	int timeo;		/* in units of hz */
   1493 #if 0 /* notyet */
   1494 	int my_rsize, my_wsize;
   1495 #endif
   1496 	bool_t tryagain;
   1497 	k_sigset_t smask;
   1498 	servinfo_t *svp;
   1499 	struct nfs_clnt *nfscl;
   1500 	zoneid_t zoneid = getzoneid();
   1501 #ifdef DEBUG
   1502 	char *bufp;
   1503 #endif
   1504 
   1505 #if 0 /* notyet */
   1506 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
   1507 	    "rfscall_start:which %d mi %p", which, mi);
   1508 #endif
   1509 
   1510 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
   1511 	ASSERT(nfscl != NULL);
   1512 
   1513 	nfscl->nfscl_stat.calls.value.ui64++;
   1514 	mi->mi_aclreqs[which].value.ui64++;
   1515 
   1516 	rpcerr.re_status = RPC_SUCCESS;
   1517 
   1518 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
   1519 		rpcerr.re_status = RPC_FAILED;
   1520 		rpcerr.re_errno = EIO;
   1521 		return (rpcerr.re_errno);
   1522 	}
   1523 
   1524 #if 0 /* notyet */
   1525 	/*
   1526 	 * Remember the transfer sizes in case
   1527 	 * nfs_feedback changes them underneath us.
   1528 	 */
   1529 	my_rsize = mi->mi_curread;
   1530 	my_wsize = mi->mi_curwrite;
   1531 #endif
   1532 
   1533 	/*
   1534 	 * NFS client failover support
   1535 	 *
   1536 	 * If this rnode is not in sync with the current server (VALID_FH),
   1537 	 * we'd like to do a remap to get in sync.  We can be interrupted
   1538 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
   1539 	 * use the best info we have to try the RPC.  Part of that is
   1540 	 * unconditionally updating the filehandle copy kept for V3.
   1541 	 *
   1542 	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
   1543 	 * rw_enter(); we're trying to keep the current server from being
   1544 	 * changed on us until we're done with the remapping and have a
   1545 	 * matching client handle.  We don't want to send a filehandle
   1546 	 * to the wrong host.
   1547 	 */
   1548 failoverretry:
   1549 	if (FAILOVER_MOUNT(mi)) {
   1550 		mutex_enter(&mi->mi_lock);
   1551 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
   1552 			if (failover_wait(mi)) {
   1553 				mutex_exit(&mi->mi_lock);
   1554 				return (EINTR);
   1555 			}
   1556 		}
   1557 		INC_READERS(mi);
   1558 		mutex_exit(&mi->mi_lock);
   1559 		if (fi) {
   1560 			if (!VALID_FH(fi) &&
   1561 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
   1562 				int remaperr;
   1563 
   1564 				svp = mi->mi_curr_serv;
   1565 				remaperr = failover_remap(fi);
   1566 				if (remaperr != 0) {
   1567 #ifdef DEBUG
   1568 					if (remaperr != EINTR)
   1569 						nfs_cmn_err(remaperr, CE_WARN,
   1570 					    "aclcall couldn't failover: %m");
   1571 #endif
   1572 					mutex_enter(&mi->mi_lock);
   1573 					DEC_READERS(mi);
   1574 					mutex_exit(&mi->mi_lock);
   1575 
   1576 					/*
   1577 					 * If failover_remap returns ETIMEDOUT
   1578 					 * and the filesystem is hard mounted
   1579 					 * we have to retry the call with a new
   1580 					 * server.
   1581 					 */
   1582 					if ((mi->mi_flags & MI_HARD) &&
   1583 					    IS_RECOVERABLE_ERROR(remaperr)) {
   1584 						if (svp == mi->mi_curr_serv)
   1585 							failover_newserver(mi);
   1586 						rpcerr.re_status = RPC_SUCCESS;
   1587 						goto failoverretry;
   1588 					}
   1589 					return (remaperr);
   1590 				}
   1591 			}
   1592 			if (fi->fhp && fi->copyproc)
   1593 				(*fi->copyproc)(fi->fhp, fi->vp);
   1594 		}
   1595 	}
   1596 
   1597 	/* For TSOL, use a new cred which has net_mac_aware flag */
   1598 	if (!cred_cloned && is_system_labeled()) {
   1599 		cred_cloned = TRUE;
   1600 		cr = crdup(icr);
   1601 		(void) setpflags(NET_MAC_AWARE, 1, cr);
   1602 	}
   1603 
   1604 	/*
   1605 	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
   1606 	 * are guaranteed to reprocess the retry as a new request.
   1607 	 */
   1608 	svp = mi->mi_curr_serv;
   1609 	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
   1610 	if (FAILOVER_MOUNT(mi)) {
   1611 		mutex_enter(&mi->mi_lock);
   1612 		DEC_READERS(mi);
   1613 		mutex_exit(&mi->mi_lock);
   1614 
   1615 		if ((rpcerr.re_errno == ETIMEDOUT ||
   1616 		    rpcerr.re_errno == ECONNRESET) &&
   1617 		    failover_safe(fi)) {
   1618 			if (svp == mi->mi_curr_serv)
   1619 				failover_newserver(mi);
   1620 			goto failoverretry;
   1621 		}
   1622 	}
   1623 	if (rpcerr.re_errno != 0) {
   1624 		if (cred_cloned)
   1625 			crfree(cr);
   1626 		return (rpcerr.re_errno);
   1627 	}
   1628 
   1629 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
   1630 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
   1631 		timeo = (mi->mi_timeo * hz) / 10;
   1632 	} else {
   1633 		mutex_enter(&mi->mi_lock);
   1634 		timeo = CLNT_SETTIMERS(client,
   1635 		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
   1636 		    &(mi->mi_timers[NFS_CALLTYPES]),
   1637 		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
   1638 		    (void (*)()) 0, (caddr_t)mi, 0);
   1639 		mutex_exit(&mi->mi_lock);
   1640 	}
   1641 
   1642 	/*
   1643 	 * If hard mounted fs, retry call forever unless hard error occurs.
   1644 	 */
   1645 	do {
   1646 		tryagain = FALSE;
   1647 
   1648 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
   1649 			status = RPC_FAILED;
   1650 			rpcerr.re_status = RPC_FAILED;
   1651 			rpcerr.re_errno = EIO;
   1652 			break;
   1653 		}
   1654 
   1655 		TICK_TO_TIMEVAL(timeo, &wait);
   1656 
   1657 		/*
   1658 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
   1659 		 * and SIGTERM. (Preserving the existing masks).
   1660 		 * Mask out SIGINT if mount option nointr is specified.
   1661 		 */
   1662 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
   1663 		if (!(mi->mi_flags & MI_INT))
   1664 			client->cl_nosignal = TRUE;
   1665 
   1666 		/*
   1667 		 * If there is a current signal, then don't bother
   1668 		 * even trying to send out the request because we
   1669 		 * won't be able to block waiting for the response.
   1670 		 * Simply assume RPC_INTR and get on with it.
   1671 		 */
   1672 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
   1673 			status = RPC_INTR;
   1674 		else {
   1675 			status = CLNT_CALL(client, which, xdrargs, argsp,
   1676 			    xdrres, resp, wait);
   1677 		}
   1678 
   1679 		if (!(mi->mi_flags & MI_INT))
   1680 			client->cl_nosignal = FALSE;
   1681 		/*
   1682 		 * restore original signal mask
   1683 		 */
   1684 		sigunintr(&smask);
   1685 
   1686 		switch (status) {
   1687 		case RPC_SUCCESS:
   1688 #if 0 /* notyet */
   1689 			if ((mi->mi_flags & MI_DYNAMIC) &&
   1690 			    mi->mi_timer_type[which] != 0 &&
   1691 			    (mi->mi_curread != my_rsize ||
   1692 			    mi->mi_curwrite != my_wsize))
   1693 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
   1694 #endif
   1695 			break;
   1696 
   1697 		/*
   1698 		 * Unfortunately, there are servers in the world which
   1699 		 * are not coded correctly.  They are not prepared to
   1700 		 * handle RPC requests to the NFS port which are not
   1701 		 * NFS requests.  Thus, they may try to process the
   1702 		 * NFS_ACL request as if it were an NFS request.  This
   1703 		 * does not work.  Generally, an error will be generated
   1704 		 * on the client because it will not be able to decode
   1705 		 * the response from the server.  However, it seems
   1706 		 * possible that the server may not be able to decode
   1707 		 * the arguments.  Thus, the criteria for deciding
   1708 		 * whether the server supports NFS_ACL or not is whether
   1709 		 * the following RPC errors are returned from CLNT_CALL.
   1710 		 */
   1711 		case RPC_CANTDECODERES:
   1712 		case RPC_PROGUNAVAIL:
   1713 		case RPC_CANTDECODEARGS:
   1714 		case RPC_PROGVERSMISMATCH:
   1715 			mutex_enter(&mi->mi_lock);
   1716 			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
   1717 			mutex_exit(&mi->mi_lock);
   1718 			break;
   1719 
   1720 		/*
   1721 		 * If the server supports NFS_ACL but not the new ops
   1722 		 * for extended attributes, make sure we don't retry.
   1723 		 */
   1724 		case RPC_PROCUNAVAIL:
   1725 			mutex_enter(&mi->mi_lock);
   1726 			mi->mi_flags &= ~MI_EXTATTR;
   1727 			mutex_exit(&mi->mi_lock);
   1728 			break;
   1729 
   1730 		case RPC_INTR:
   1731 			/*
   1732 			 * There is no way to recover from this error,
   1733 			 * even if mount option nointr is specified.
   1734 			 * SIGKILL, for example, cannot be blocked.
   1735 			 */
   1736 			rpcerr.re_status = RPC_INTR;
   1737 			rpcerr.re_errno = EINTR;
   1738 			break;
   1739 
   1740 		case RPC_UDERROR:
   1741 			/*
   1742 			 * If the NFS server is local (vold) and
   1743 			 * it goes away then we get RPC_UDERROR.
   1744 			 * This is a retryable error, so we would
   1745 			 * loop, so check to see if the specific
   1746 			 * error was ECONNRESET, indicating that
   1747 			 * target did not exist at all.  If so,
   1748 			 * return with RPC_PROGUNAVAIL and
   1749 			 * ECONNRESET to indicate why.
   1750 			 */
   1751 			CLNT_GETERR(client, &rpcerr);
   1752 			if (rpcerr.re_errno == ECONNRESET) {
   1753 				rpcerr.re_status = RPC_PROGUNAVAIL;
   1754 				rpcerr.re_errno = ECONNRESET;
   1755 				break;
   1756 			}
   1757 			/*FALLTHROUGH*/
   1758 
   1759 		default:		/* probably RPC_TIMEDOUT */
   1760 			if (IS_UNRECOVERABLE_RPC(status))
   1761 				break;
   1762 
   1763 			/*
   1764 			 * increment server not responding count
   1765 			 */
   1766 			mutex_enter(&mi->mi_lock);
   1767 			mi->mi_noresponse++;
   1768 			mutex_exit(&mi->mi_lock);
   1769 #ifdef DEBUG
   1770 			nfscl->nfscl_stat.noresponse.value.ui64++;
   1771 #endif
   1772 
   1773 			if (!(mi->mi_flags & MI_HARD)) {
   1774 				if (!(mi->mi_flags & MI_SEMISOFT) ||
   1775 				    (mi->mi_acl_ss_call_type[which] == 0))
   1776 					break;
   1777 			}
   1778 
   1779 			/*
   1780 			 * The call is in progress (over COTS).
   1781 			 * Try the CLNT_CALL again, but don't
   1782 			 * print a noisy error message.
   1783 			 */
   1784 			if (status == RPC_INPROGRESS) {
   1785 				tryagain = TRUE;
   1786 				break;
   1787 			}
   1788 
   1789 			if (flags & RFSCALL_SOFT)
   1790 				break;
   1791 
   1792 			/*
   1793 			 * On zone shutdown, just move on.
   1794 			 */
   1795 			if (zone_status_get(curproc->p_zone) >=
   1796 			    ZONE_IS_SHUTTING_DOWN) {
   1797 				rpcerr.re_status = RPC_FAILED;
   1798 				rpcerr.re_errno = EIO;
   1799 				break;
   1800 			}
   1801 
   1802 			/*
   1803 			 * NFS client failover support
   1804 			 *
   1805 			 * If the current server just failed us, we'll
   1806 			 * start the process of finding a new server.
   1807 			 * After that, we can just retry.
   1808 			 */
   1809 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
   1810 				if (svp == mi->mi_curr_serv)
   1811 					failover_newserver(mi);
   1812 				clfree_impl(client, ch, nfscl);
   1813 				goto failoverretry;
   1814 			}
   1815 
   1816 			tryagain = TRUE;
   1817 			timeo = backoff(timeo);
   1818 			mutex_enter(&mi->mi_lock);
   1819 			if (!(mi->mi_flags & MI_PRINTED)) {
   1820 				mi->mi_flags |= MI_PRINTED;
   1821 				mutex_exit(&mi->mi_lock);
   1822 #ifdef DEBUG
   1823 				zprintf(zoneid,
   1824 			"NFS_ACL%d server %s not responding still trying\n",
   1825 				    mi->mi_vers, svp->sv_hostname);
   1826 #else
   1827 				zprintf(zoneid,
   1828 			    "NFS server %s not responding still trying\n",
   1829 				    svp->sv_hostname);
   1830 #endif
   1831 			} else
   1832 				mutex_exit(&mi->mi_lock);
   1833 			if (*douprintf && nfs_has_ctty()) {
   1834 				*douprintf = 0;
   1835 				if (!(mi->mi_flags & MI_NOPRINT))
   1836 #ifdef DEBUG
   1837 					uprintf(
   1838 			"NFS_ACL%d server %s not responding still trying\n",
   1839 					    mi->mi_vers, svp->sv_hostname);
   1840 #else
   1841 					uprintf(
   1842 			    "NFS server %s not responding still trying\n",
   1843 					    svp->sv_hostname);
   1844 #endif
   1845 			}
   1846 
   1847 #if 0 /* notyet */
   1848 			/*
   1849 			 * If doing dynamic adjustment of transfer
   1850 			 * size and if it's a read or write call
   1851 			 * and if the transfer size changed while
   1852 			 * retransmitting or if the feedback routine
   1853 			 * changed the transfer size,
   1854 			 * then exit rfscall so that the transfer
   1855 			 * size can be adjusted at the vnops level.
   1856 			 */
   1857 			if ((mi->mi_flags & MI_DYNAMIC) &&
   1858 			    mi->mi_acl_timer_type[which] != 0 &&
   1859 			    (mi->mi_curread != my_rsize ||
   1860 			    mi->mi_curwrite != my_wsize ||
   1861 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
   1862 				/*
   1863 				 * On read or write calls, return
   1864 				 * back to the vnode ops level if
   1865 				 * the transfer size changed.
   1866 				 */
   1867 				clfree_impl(client, ch, nfscl);
   1868 				if (cred_cloned)
   1869 					crfree(cr);
   1870 				return (ENFS_TRYAGAIN);
   1871 			}
   1872 #endif
   1873 		}
   1874 	} while (tryagain);
   1875 
   1876 	if (status != RPC_SUCCESS) {
   1877 		/*
   1878 		 * Let soft mounts use the timed out message.
   1879 		 */
   1880 		if (status == RPC_INPROGRESS)
   1881 			status = RPC_TIMEDOUT;
   1882 		nfscl->nfscl_stat.badcalls.value.ui64++;
   1883 		if (status == RPC_CANTDECODERES ||
   1884 		    status == RPC_PROGUNAVAIL ||
   1885 		    status == RPC_PROCUNAVAIL ||
   1886 		    status == RPC_CANTDECODEARGS ||
   1887 		    status == RPC_PROGVERSMISMATCH)
   1888 			CLNT_GETERR(client, &rpcerr);
   1889 		else if (status != RPC_INTR) {
   1890 			mutex_enter(&mi->mi_lock);
   1891 			mi->mi_flags |= MI_DOWN;
   1892 			mutex_exit(&mi->mi_lock);
   1893 			CLNT_GETERR(client, &rpcerr);
   1894 #ifdef DEBUG
   1895 			bufp = clnt_sperror(client, svp->sv_hostname);
   1896 			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
   1897 			    mi->mi_vers, mi->mi_aclnames[which], bufp);
   1898 			if (nfs_has_ctty()) {
   1899 				if (!(mi->mi_flags & MI_NOPRINT)) {
   1900 					uprintf("NFS_ACL%d %s failed for %s\n",
   1901 					    mi->mi_vers, mi->mi_aclnames[which],
   1902 					    bufp);
   1903 				}
   1904 			}
   1905 			kmem_free(bufp, MAXPATHLEN);
   1906 #else
   1907 			zprintf(zoneid,
   1908 			    "NFS %s failed for server %s: error %d (%s)\n",
   1909 			    mi->mi_aclnames[which], svp->sv_hostname,
   1910 			    status, clnt_sperrno(status));
   1911 			if (nfs_has_ctty()) {
   1912 				if (!(mi->mi_flags & MI_NOPRINT))
   1913 					uprintf(
   1914 				"NFS %s failed for server %s: error %d (%s)\n",
   1915 					    mi->mi_aclnames[which],
   1916 					    svp->sv_hostname, status,
   1917 					    clnt_sperrno(status));
   1918 			}
   1919 #endif
   1920 			/*
   1921 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
   1922 			 * re_errno is set appropriately depending on
   1923 			 * the authentication error
   1924 			 */
   1925 			if (status == RPC_VERSMISMATCH ||
   1926 			    status == RPC_PROGVERSMISMATCH)
   1927 				rpcerr.re_errno = EIO;
   1928 		}
   1929 	} else {
   1930 		/*
   1931 		 * Test the value of mi_down and mi_printed without
   1932 		 * holding the mi_lock mutex.  If they are both zero,
   1933 		 * then it is okay to skip the down and printed
   1934 		 * processing.  This saves on a mutex_enter and
   1935 		 * mutex_exit pair for a normal, successful RPC.
   1936 		 * This was just complete overhead.
   1937 		 */
   1938 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
   1939 			mutex_enter(&mi->mi_lock);
   1940 			mi->mi_flags &= ~MI_DOWN;
   1941 			if (mi->mi_flags & MI_PRINTED) {
   1942 				mi->mi_flags &= ~MI_PRINTED;
   1943 				mutex_exit(&mi->mi_lock);
   1944 #ifdef DEBUG
   1945 				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
   1946 				    mi->mi_vers, svp->sv_hostname);
   1947 #else
   1948 				zprintf(zoneid, "NFS server %s ok\n",
   1949 				    svp->sv_hostname);
   1950 #endif
   1951 			} else
   1952 				mutex_exit(&mi->mi_lock);
   1953 		}
   1954 
   1955 		if (*douprintf == 0) {
   1956 			if (!(mi->mi_flags & MI_NOPRINT))
   1957 #ifdef DEBUG
   1958 				uprintf("NFS_ACL%d server %s ok\n",
   1959 				    mi->mi_vers, svp->sv_hostname);
   1960 #else
   1961 				uprintf("NFS server %s ok\n", svp->sv_hostname);
   1962 #endif
   1963 			*douprintf = 1;
   1964 		}
   1965 	}
   1966 
   1967 	clfree_impl(client, ch, nfscl);
   1968 	if (cred_cloned)
   1969 		crfree(cr);
   1970 
   1971 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
   1972 
   1973 #if 0 /* notyet */
   1974 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
   1975 	    rpcerr.re_errno);
   1976 #endif
   1977 
   1978 	return (rpcerr.re_errno);
   1979 }
   1980 
   1981 int
   1982 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
   1983 {
   1984 	uint_t mask = vap->va_mask;
   1985 
   1986 	if (!(mask & AT_MODE))
   1987 		sa->sa_mode = (uint32_t)-1;
   1988 	else
   1989 		sa->sa_mode = vap->va_mode;
   1990 	if (!(mask & AT_UID))
   1991 		sa->sa_uid = (uint32_t)-1;
   1992 	else
   1993 		sa->sa_uid = (uint32_t)vap->va_uid;
   1994 	if (!(mask & AT_GID))
   1995 		sa->sa_gid = (uint32_t)-1;
   1996 	else
   1997 		sa->sa_gid = (uint32_t)vap->va_gid;
   1998 	if (!(mask & AT_SIZE))
   1999 		sa->sa_size = (uint32_t)-1;
   2000 	else
   2001 		sa->sa_size = (uint32_t)vap->va_size;
   2002 	if (!(mask & AT_ATIME))
   2003 		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
   2004 	else {
   2005 		/* check time validity */
   2006 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
   2007 			return (EOVERFLOW);
   2008 		}
   2009 		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
   2010 		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
   2011 	}
   2012 	if (!(mask & AT_MTIME))
   2013 		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
   2014 	else {
   2015 		/* check time validity */
   2016 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
   2017 			return (EOVERFLOW);
   2018 		}
   2019 		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
   2020 		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
   2021 	}
   2022 	return (0);
   2023 }
   2024 
   2025 int
   2026 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
   2027 {
   2028 	uint_t mask = vap->va_mask;
   2029 
   2030 	if (!(mask & AT_MODE))
   2031 		sa->mode.set_it = FALSE;
   2032 	else {
   2033 		sa->mode.set_it = TRUE;
   2034 		sa->mode.mode = (mode3)vap->va_mode;
   2035 	}
   2036 	if (!(mask & AT_UID))
   2037 		sa->uid.set_it = FALSE;
   2038 	else {
   2039 		sa->uid.set_it = TRUE;
   2040 		sa->uid.uid = (uid3)vap->va_uid;
   2041 	}
   2042 	if (!(mask & AT_GID))
   2043 		sa->gid.set_it = FALSE;
   2044 	else {
   2045 		sa->gid.set_it = TRUE;
   2046 		sa->gid.gid = (gid3)vap->va_gid;
   2047 	}
   2048 	if (!(mask & AT_SIZE))
   2049 		sa->size.set_it = FALSE;
   2050 	else {
   2051 		sa->size.set_it = TRUE;
   2052 		sa->size.size = (size3)vap->va_size;
   2053 	}
   2054 	if (!(mask & AT_ATIME))
   2055 		sa->atime.set_it = DONT_CHANGE;
   2056 	else {
   2057 		/* check time validity */
   2058 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
   2059 			return (EOVERFLOW);
   2060 		}
   2061 		sa->atime.set_it = SET_TO_CLIENT_TIME;
   2062 		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
   2063 		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
   2064 	}
   2065 	if (!(mask & AT_MTIME))
   2066 		sa->mtime.set_it = DONT_CHANGE;
   2067 	else {
   2068 		/* check time validity */
   2069 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
   2070 			return (EOVERFLOW);
   2071 		}
   2072 		sa->mtime.set_it = SET_TO_CLIENT_TIME;
   2073 		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
   2074 		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
   2075 	}
   2076 	return (0);
   2077 }
   2078 
   2079 void
   2080 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
   2081 {
   2082 
   2083 	da->da_fhandle = VTOFH(dvp);
   2084 	da->da_name = nm;
   2085 	da->da_flags = 0;
   2086 }
   2087 
   2088 void
   2089 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
   2090 {
   2091 
   2092 	da->dirp = VTOFH3(dvp);
   2093 	da->name = nm;
   2094 }
   2095 
   2096 int
   2097 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
   2098 {
   2099 	int error;
   2100 	rnode_t *rp;
   2101 	struct vattr va;
   2102 
   2103 	va.va_mask = AT_MODE | AT_GID;
   2104 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
   2105 	if (error)
   2106 		return (error);
   2107 
   2108 	/*
   2109 	 * To determine the expected group-id of the created file:
   2110 	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
   2111 	 *	GRPID option, and the directory's set-gid bit is clear,
   2112 	 *	then use the process's gid.
   2113 	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
   2114 	 */
   2115 	rp = VTOR(dvp);
   2116 	mutex_enter(&rp->r_statelock);
   2117 	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
   2118 		*gidp = crgetgid(cr);
   2119 	else
   2120 		*gidp = va.va_gid;
   2121 	mutex_exit(&rp->r_statelock);
   2122 	return (0);
   2123 }
   2124 
   2125 int
   2126 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
   2127 {
   2128 	int error;
   2129 	struct vattr va;
   2130 
   2131 	va.va_mask = AT_MODE;
   2132 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
   2133 	if (error)
   2134 		return (error);
   2135 
   2136 	/*
   2137 	 * Modify the expected mode (om) so that the set-gid bit matches
   2138 	 * that of the parent directory (dvp).
   2139 	 */
   2140 	if (va.va_mode & VSGID)
   2141 		*omp |= VSGID;
   2142 	else
   2143 		*omp &= ~VSGID;
   2144 	return (0);
   2145 }
   2146 
   2147 void
   2148 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
   2149 {
   2150 
   2151 	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
   2152 		if (!(vp->v_flag & VSWAPLIKE)) {
   2153 			mutex_enter(&vp->v_lock);
   2154 			vp->v_flag |= VSWAPLIKE;
   2155 			mutex_exit(&vp->v_lock);
   2156 		}
   2157 	} else {
   2158 		if (vp->v_flag & VSWAPLIKE) {
   2159 			mutex_enter(&vp->v_lock);
   2160 			vp->v_flag &= ~VSWAPLIKE;
   2161 			mutex_exit(&vp->v_lock);
   2162 		}
   2163 	}
   2164 }
   2165 
   2166 /*
   2167  * Free the resources associated with an rnode.
   2168  */
   2169 static void
   2170 rinactive(rnode_t *rp, cred_t *cr)
   2171 {
   2172 	vnode_t *vp;
   2173 	cred_t *cred;
   2174 	char *contents;
   2175 	int size;
   2176 	vsecattr_t *vsp;
   2177 	int error;
   2178 	nfs3_pathconf_info *info;
   2179 
   2180 	/*
   2181 	 * Before freeing anything, wait until all asynchronous
   2182 	 * activity is done on this rnode.  This will allow all
   2183 	 * asynchronous read ahead and write behind i/o's to
   2184 	 * finish.
   2185 	 */
   2186 	mutex_enter(&rp->r_statelock);
   2187 	while (rp->r_count > 0)
   2188 		cv_wait(&rp->r_cv, &rp->r_statelock);
   2189 	mutex_exit(&rp->r_statelock);
   2190 
   2191 	/*
   2192 	 * Flush and invalidate all pages associated with the vnode.
   2193 	 */
   2194 	vp = RTOV(rp);
   2195 	if (vn_has_cached_data(vp)) {
   2196 		ASSERT(vp->v_type != VCHR);
   2197 		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
   2198 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
   2199 			if (error && (error == ENOSPC || error == EDQUOT)) {
   2200 				mutex_enter(&rp->r_statelock);
   2201 				if (!rp->r_error)
   2202 					rp->r_error = error;
   2203 				mutex_exit(&rp->r_statelock);
   2204 			}
   2205 		}
   2206 		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
   2207 	}
   2208 
   2209 	/*
   2210 	 * Free any held credentials and caches which may be associated
   2211 	 * with this rnode.
   2212 	 */
   2213 	mutex_enter(&rp->r_statelock);
   2214 	cred = rp->r_cred;
   2215 	rp->r_cred = NULL;
   2216 	contents = rp->r_symlink.contents;
   2217 	size = rp->r_symlink.size;
   2218 	rp->r_symlink.contents = NULL;
   2219 	vsp = rp->r_secattr;
   2220 	rp->r_secattr = NULL;
   2221 	info = rp->r_pathconf;
   2222 	rp->r_pathconf = NULL;
   2223 	mutex_exit(&rp->r_statelock);
   2224 
   2225 	/*
   2226 	 * Free the held credential.
   2227 	 */
   2228 	if (cred != NULL)
   2229 		crfree(cred);
   2230 
   2231 	/*
   2232 	 * Free the access cache entries.
   2233 	 */
   2234 	(void) nfs_access_purge_rp(rp);
   2235 
   2236 	/*
   2237 	 * Free the readdir cache entries.
   2238 	 */
   2239 	if (HAVE_RDDIR_CACHE(rp))
   2240 		nfs_purge_rddir_cache(vp);
   2241 
   2242 	/*
   2243 	 * Free the symbolic link cache.
   2244 	 */
   2245 	if (contents != NULL) {
   2246 
   2247 		kmem_free((void *)contents, size);
   2248 	}
   2249 
   2250 	/*
   2251 	 * Free any cached ACL.
   2252 	 */
   2253 	if (vsp != NULL)
   2254 		nfs_acl_free(vsp);
   2255 
   2256 	/*
   2257 	 * Free any cached pathconf information.
   2258 	 */
   2259 	if (info != NULL)
   2260 		kmem_free(info, sizeof (*info));
   2261 }
   2262 
   2263 /*
   2264  * Return a vnode for the given NFS Version 2 file handle.
   2265  * If no rnode exists for this fhandle, create one and put it
   2266  * into the hash queues.  If the rnode for this fhandle
   2267  * already exists, return it.
   2268  *
   2269  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
   2270  */
   2271 vnode_t *
   2272 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
   2273     hrtime_t t, cred_t *cr, char *dnm, char *nm)
   2274 {
   2275 	int newnode;
   2276 	int index;
   2277 	vnode_t *vp;
   2278 	nfs_fhandle nfh;
   2279 	vattr_t va;
   2280 
   2281 	nfh.fh_len = NFS_FHSIZE;
   2282 	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
   2283 
   2284 	index = rtablehash(&nfh);
   2285 	rw_enter(&rtable[index].r_lock, RW_READER);
   2286 
   2287 	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
   2288 	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
   2289 
   2290 	if (attr != NULL) {
   2291 		if (!newnode) {
   2292 			rw_exit(&rtable[index].r_lock);
   2293 			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
   2294 		} else {
   2295 			if (attr->na_type < NFNON || attr->na_type > NFSOC)
   2296 				vp->v_type = VBAD;
   2297 			else
   2298 				vp->v_type = n2v_type(attr);
   2299 			/*
   2300 			 * A translation here seems to be necessary
   2301 			 * because this function can be called
   2302 			 * with `attr' that has come from the wire,
   2303 			 * and been operated on by vattr_to_nattr().
   2304 			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
   2305 			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
   2306 			 * ->makenfsnode().
   2307 			 */
   2308 			if ((attr->na_rdev & 0xffff0000) == 0)
   2309 				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
   2310 			else
   2311 				vp->v_rdev = expldev(n2v_rdev(attr));
   2312 			nfs_attrcache(vp, attr, t);
   2313 			rw_exit(&rtable[index].r_lock);
   2314 		}
   2315 	} else {
   2316 		if (newnode) {
   2317 			PURGE_ATTRCACHE(vp);
   2318 		}
   2319 		rw_exit(&rtable[index].r_lock);
   2320 	}
   2321 
   2322 	return (vp);
   2323 }
   2324 
   2325 /*
   2326  * Return a vnode for the given NFS Version 3 file handle.
   2327  * If no rnode exists for this fhandle, create one and put it
   2328  * into the hash queues.  If the rnode for this fhandle
   2329  * already exists, return it.
   2330  *
   2331  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
   2332  */
   2333 vnode_t *
   2334 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
   2335     cred_t *cr, char *dnm, char *nm)
   2336 {
   2337 	int newnode;
   2338 	int index;
   2339 	vnode_t *vp;
   2340 
   2341 	index = rtablehash((nfs_fhandle *)fh);
   2342 	rw_enter(&rtable[index].r_lock, RW_READER);
   2343 
   2344 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
   2345 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
   2346 	    dnm, nm);
   2347 
   2348 	if (vap == NULL) {
   2349 		if (newnode) {
   2350 			PURGE_ATTRCACHE(vp);
   2351 		}
   2352 		rw_exit(&rtable[index].r_lock);
   2353 		return (vp);
   2354 	}
   2355 
   2356 	if (!newnode) {
   2357 		rw_exit(&rtable[index].r_lock);
   2358 		nfs_attr_cache(vp, vap, t, cr);
   2359 	} else {
   2360 		rnode_t *rp = VTOR(vp);
   2361 
   2362 		vp->v_type = vap->va_type;
   2363 		vp->v_rdev = vap->va_rdev;
   2364 
   2365 		mutex_enter(&rp->r_statelock);
   2366 		if (rp->r_mtime <= t)
   2367 			nfs_attrcache_va(vp, vap);
   2368 		mutex_exit(&rp->r_statelock);
   2369 		rw_exit(&rtable[index].r_lock);
   2370 	}
   2371 
   2372 	return (vp);
   2373 }
   2374 
   2375 vnode_t *
   2376 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
   2377     cred_t *cr, char *dnm, char *nm)
   2378 {
   2379 	int newnode;
   2380 	int index;
   2381 	vnode_t *vp;
   2382 	vattr_t va;
   2383 
   2384 	index = rtablehash((nfs_fhandle *)fh);
   2385 	rw_enter(&rtable[index].r_lock, RW_READER);
   2386 
   2387 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
   2388 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
   2389 	    dnm, nm);
   2390 
   2391 	if (attr == NULL) {
   2392 		if (newnode) {
   2393 			PURGE_ATTRCACHE(vp);
   2394 		}
   2395 		rw_exit(&rtable[index].r_lock);
   2396 		return (vp);
   2397 	}
   2398 
   2399 	if (!newnode) {
   2400 		rw_exit(&rtable[index].r_lock);
   2401 		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
   2402 	} else {
   2403 		if (attr->type < NF3REG || attr->type > NF3FIFO)
   2404 			vp->v_type = VBAD;
   2405 		else
   2406 			vp->v_type = nf3_to_vt[attr->type];
   2407 		vp->v_rdev = makedevice(attr->rdev.specdata1,
   2408 		    attr->rdev.specdata2);
   2409 		nfs3_attrcache(vp, attr, t);
   2410 		rw_exit(&rtable[index].r_lock);
   2411 	}
   2412 
   2413 	return (vp);
   2414 }
   2415 
   2416 /*
   2417  * Read this comment before making changes to rtablehash()!
   2418  * This is a hash function in which seemingly obvious and harmless
   2419  * changes can cause escalations costing million dollars!
   2420  * Know what you are doing.
   2421  *
   2422  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
   2423  * algorithm is currently detailed here:
   2424  *
   2425  *   http://burtleburtle.net/bob/hash/doobs.html
   2426  *
   2427  * Of course, the above link may not be valid by the time you are reading
   2428  * this, but suffice it to say that the one-at-a-time algorithm works well in
   2429  * almost all cases.  If you are changing the algorithm be sure to verify that
   2430  * the hash algorithm still provides even distribution in all cases and with
   2431  * any server returning filehandles in whatever order (sequential or random).
   2432  */
   2433 static int
   2434 rtablehash(nfs_fhandle *fh)
   2435 {
   2436 	ulong_t hash, len, i;
   2437 	char *key;
   2438 
   2439 	key = fh->fh_buf;
   2440 	len = (ulong_t)fh->fh_len;
   2441 	for (hash = 0, i = 0; i < len; i++) {
   2442 		hash += key[i];
   2443 		hash += (hash << 10);
   2444 		hash ^= (hash >> 6);
   2445 	}
   2446 	hash += (hash << 3);
   2447 	hash ^= (hash >> 11);
   2448 	hash += (hash << 15);
   2449 	return (hash & rtablemask);
   2450 }
   2451 
        /*
         * Find or create the rnode for file handle `fh' in hash bucket `rhtp'
         * and return its vnode.
         *
         * The caller must hold rhtp->r_lock as reader (asserted below) and is
         * responsible for releasing it after the call.  *newnode reports what
         * happened:
         *
         *	0	An existing rnode was found by rfind().  rhtp->r_lock is
         *		held as reader on return.
         *	1	A new rnode was set up and hashed.  rhtp->r_lock is held
         *		as writer on return.
         *
         * The rnode to set up is taken from the head of rpfreelist when the
         * freelist is not empty and rnew >= nrnode; otherwise a fresh
         * rnode/vnode pair is allocated.  `vops', `putapage' and `compar' are
         * the version specific vnode operations, putapage routine and readdir
         * cache comparison routine to install.  `cr' is passed to rinactive()
         * and rp_addfree().  For failover mounts `dnm' and `nm' are saved in
         * r_path as "dnm/nm" (or "." if either is NULL).
         */
   2452 static vnode_t *
   2453 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
   2454     struct vnodeops *vops,
   2455     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
   2456     int (*compar)(const void *, const void *),
   2457     int *newnode, cred_t *cr, char *dnm, char *nm)
   2458 {
   2459 	rnode_t *rp;
   2460 	rnode_t *trp;
   2461 	vnode_t *vp;
   2462 	mntinfo_t *mi;
   2463 
   2464 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
   2465 
   2466 	mi = VFTOMI(vfsp);
   2467 start:
        	/*
        	 * Fast path: the rnode is already in the bucket.  rfind()
        	 * hands it back with a reference and the bucket lock stays
        	 * held as reader.
        	 */
   2468 	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
   2469 		vp = RTOV(rp);
   2470 		nfs_set_vroot(vp);
   2471 		*newnode = 0;
   2472 		return (vp);
   2473 	}
        	/*
        	 * No match.  The bucket lock is dropped while an rnode is
        	 * recycled or allocated; the lookup is repeated under the
        	 * writer lock further down.
        	 */
   2474 	rw_exit(&rhtp->r_lock);
   2475 
   2476 	mutex_enter(&rpfreelist_lock);
   2477 	if (rpfreelist != NULL && rnew >= nrnode) {
        		/* Recycle the rnode at the head of the freelist. */
   2478 		rp = rpfreelist;
   2479 		rp_rmfree(rp);
   2480 		mutex_exit(&rpfreelist_lock);
   2481 
   2482 		vp = RTOV(rp);
   2483 
   2484 		if (rp->r_flags & RHASHED) {
   2485 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
   2486 			mutex_enter(&vp->v_lock);
        			/*
        			 * Another reference exists, so this rnode
        			 * cannot be recycled.  Give up our reference,
        			 * retake the caller's bucket lock as reader
        			 * and start over.
        			 */
   2487 			if (vp->v_count > 1) {
   2488 				vp->v_count--;
   2489 				mutex_exit(&vp->v_lock);
   2490 				rw_exit(&rp->r_hashq->r_lock);
   2491 				rw_enter(&rhtp->r_lock, RW_READER);
   2492 				goto start;
   2493 			}
   2494 			mutex_exit(&vp->v_lock);
   2495 			rp_rmhash_locked(rp);
   2496 			rw_exit(&rp->r_hashq->r_lock);
   2497 		}
   2498 
   2499 		rinactive(rp, cr);
   2500 
        		/*
        		 * rinactive() ran without v_lock held, so check the
        		 * reference count again before tearing the rnode down.
        		 */
   2501 		mutex_enter(&vp->v_lock);
   2502 		if (vp->v_count > 1) {
   2503 			vp->v_count--;
   2504 			mutex_exit(&vp->v_lock);
   2505 			rw_enter(&rhtp->r_lock, RW_READER);
   2506 			goto start;
   2507 		}
   2508 		mutex_exit(&vp->v_lock);
   2509 		vn_invalid(vp);
   2510 		/*
   2511 		 * destroy old locks before bzero'ing and
   2512 		 * recreating the locks below.
   2513 		 */
   2514 		nfs_rw_destroy(&rp->r_rwlock);
   2515 		nfs_rw_destroy(&rp->r_lkserlock);
   2516 		mutex_destroy(&rp->r_statelock);
   2517 		cv_destroy(&rp->r_cv);
   2518 		cv_destroy(&rp->r_commit.c_cv);
   2519 		nfs_free_r_path(rp);
   2520 		avl_destroy(&rp->r_dir);
   2521 		/*
   2522 		 * Make sure that if rnode is recycled then
   2523 		 * VFS count is decremented properly before
   2524 		 * reuse.
   2525 		 */
   2526 		VFS_RELE(vp->v_vfsp);
   2527 		vn_reinit(vp);
   2528 	} else {
        		/* Allocate a brand new rnode and vnode. */
   2529 		vnode_t *new_vp;
   2530 
   2531 		mutex_exit(&rpfreelist_lock);
   2532 
   2533 		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
   2534 		new_vp = vn_alloc(KM_SLEEP);
   2535 
   2536 		atomic_add_long((ulong_t *)&rnew, 1);
   2537 #ifdef DEBUG
   2538 		clstat_debug.nrnode.value.ui64++;
   2539 #endif
   2540 		vp = new_vp;
   2541 	}
   2542 
        	/*
        	 * From here on recycled and freshly allocated rnodes are
        	 * treated alike: wipe the rnode and build it up for `fh'.
        	 */
   2543 	bzero(rp, sizeof (*rp));
   2544 	rp->r_vnode = vp;
   2545 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
   2546 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
   2547 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
   2548 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
   2549 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
   2550 	rp->r_fh.fh_len = fh->fh_len;
   2551 	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
   2552 	rp->r_server = mi->mi_curr_serv;
   2553 	if (FAILOVER_MOUNT(mi)) {
   2554 		/*
   2555 		 * If replicated servers, stash pathnames
   2556 		 */
   2557 		if (dnm != NULL && nm != NULL) {
   2558 			char *s, *p;
   2559 			uint_t len;
   2560 
        			/* +2: the '/' separator and the trailing NUL */
   2561 			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
   2562 			rp->r_path = kmem_alloc(len, KM_SLEEP);
   2563 #ifdef DEBUG
   2564 			clstat_debug.rpath.value.ui64 += len;
   2565 #endif
   2566 			s = rp->r_path;
   2567 			for (p = dnm; *p; p++)
   2568 				*s++ = *p;
   2569 			*s++ = '/';
   2570 			for (p = nm; *p; p++)
   2571 				*s++ = *p;
   2572 			*s = '\0';
   2573 		} else {
   2574 			/* special case for root */
   2575 			rp->r_path = kmem_alloc(2, KM_SLEEP);
   2576 #ifdef DEBUG
   2577 			clstat_debug.rpath.value.ui64 += 2;
   2578 #endif
   2579 			*rp->r_path = '.';
   2580 			*(rp->r_path + 1) = '\0';
   2581 		}
   2582 	}
        	/*
        	 * The rnode keeps a hold on its vfs; the recycle path above
        	 * releases it with VFS_RELE() before the rnode is reused.
        	 */
   2583 	VFS_HOLD(vfsp);
   2584 	rp->r_putapage = putapage;
   2585 	rp->r_hashq = rhtp;
   2586 	rp->r_flags = RREADDIRPLUS;
   2587 	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
   2588 	    offsetof(rddir_cache, tree));
   2589 	vn_setops(vp, vops);
   2590 	vp->v_data = (caddr_t)rp;
   2591 	vp->v_vfsp = vfsp;
   2592 	vp->v_type = VNON;
   2593 	nfs_set_vroot(vp);
   2594 
   2595 	/*
   2596 	 * There is a race condition if someone else
   2597 	 * alloc's the rnode while no locks are held, so we
   2598 	 * check again and recover if found.
   2599 	 */
   2600 	rw_enter(&rhtp->r_lock, RW_WRITER);
   2601 	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
        		/*
        		 * Lost the race.  Return the rnode that is already
        		 * hashed.  rp was never hashed, so rp_addfree() takes
        		 * its free-it-now path rather than putting rp on the
        		 * freelist.  The writer lock is dropped first and the
        		 * bucket lock is retaken as reader, leaving the caller
        		 * with the same lock state as the fast path.
        		 */
   2602 		vp = RTOV(trp);
   2603 		nfs_set_vroot(vp);
   2604 		*newnode = 0;
   2605 		rw_exit(&rhtp->r_lock);
   2606 		rp_addfree(rp, cr);
   2607 		rw_enter(&rhtp->r_lock, RW_READER);
   2608 		return (vp);
   2609 	}
        	/*
        	 * New rnode: hash it and return with the bucket lock still
        	 * held as writer.
        	 */
   2610 	rp_addhash(rp);
   2611 	*newnode = 1;
   2612 	return (vp);
   2613 }
   2614 
   2615 static void
   2616 nfs_set_vroot(vnode_t *vp)
   2617 {
   2618 	rnode_t *rp;
   2619 	nfs_fhandle *rootfh;
   2620 
   2621 	rp = VTOR(vp);
   2622 	rootfh = &rp->r_server->sv_fhandle;
   2623 	if (rootfh->fh_len == rp->r_fh.fh_len &&
   2624 	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
   2625 		if (!(vp->v_flag & VROOT)) {
   2626 			mutex_enter(&vp->v_lock);
   2627 			vp->v_flag |= VROOT;
   2628 			mutex_exit(&vp->v_lock);
   2629 		}
   2630 	}
   2631 }
   2632 
   2633 static void
   2634 nfs_free_r_path(rnode_t *rp)
   2635 {
   2636 	char *path;
   2637 	size_t len;
   2638 
   2639 	path = rp->r_path;
   2640 	if (path) {
   2641 		rp->r_path = NULL;
   2642 		len = strlen(path) + 1;
   2643 		kmem_free(path, len);
   2644 #ifdef DEBUG
   2645 		clstat_debug.rpath.value.ui64 -= len;
   2646 #endif
   2647 	}
   2648 }
   2649 
   2650 /*
   2651  * Put an rnode on the free list.
   2652  *
   2653  * Rnodes which were allocated above and beyond the normal limit
   2654  * are immediately freed.
         *
         * The caller passes in one reference on the rnode's vnode (v_count >= 1
         * is asserted) and the rnode must not be on the freelist already.  That
         * reference ends up in one of three places:
         *   - it is simply dropped, if another reference to the vnode is found
         *     at any of the rechecks below;
         *   - it goes away with the rnode, if the rnode is destroyed;
         *   - it stays with the rnode on the freelist; rfind() reuses it as the
         *     new reference when it takes the rnode off the freelist again.
         *
         * `cr' is the credential passed to rinactive() when the rnode is freed.
   2655  */
   2656 void
   2657 rp_addfree(rnode_t *rp, cred_t *cr)
   2658 {
   2659 	vnode_t *vp;
   2660 	struct vfs *vfsp;
   2661 
   2662 	vp = RTOV(rp);
   2663 	ASSERT(vp->v_count >= 1);
   2664 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
   2665 
   2666 	/*
   2667 	 * If we have too many rnodes allocated and there are no
   2668 	 * references to this rnode, or if the rnode is no longer
   2669 	 * accessible because it does not reside in the hash queues,
   2670 	 * or if an i/o error occurred while writing to the file,
   2671 	 * then just free it instead of putting it on the rnode
   2672 	 * freelist.
        	 *
        	 * The same applies when the file system has been unmounted
        	 * (VFS_UNMOUNTED).  In every case the rnode is only freed
        	 * here if r_count is zero.
   2673 	 */
   2674 	vfsp = vp->v_vfsp;
   2675 	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
   2676 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
   2677 		if (rp->r_flags & RHASHED) {
        			/*
        			 * Unhash the rnode under the bucket writer
        			 * lock, unless someone has taken another
        			 * reference in the meantime; in that case just
        			 * drop ours and leave the rnode alone.
        			 */
   2678 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
   2679 			mutex_enter(&vp->v_lock);
   2680 			if (vp->v_count > 1) {
   2681 				vp->v_count--;
   2682 				mutex_exit(&vp->v_lock);
   2683 				rw_exit(&rp->r_hashq->r_lock);
   2684 				return;
   2685 			}
   2686 			mutex_exit(&vp->v_lock);
   2687 			rp_rmhash_locked(rp);
   2688 			rw_exit(&rp->r_hashq->r_lock);
   2689 		}
   2690 
   2691 		rinactive(rp, cr);
   2692 
   2693 		/*
   2694 		 * Recheck the vnode reference count.  We need to
   2695 		 * make sure that another reference has not been
   2696 		 * acquired while we were not holding v_lock.  The
   2697 		 * rnode is not in the rnode hash queues, so the
   2698 		 * only way for a reference to have been acquired
   2699 		 * is for a VOP_PUTPAGE because the rnode was marked
   2700 		 * with RDIRTY or for a modified page.  This
   2701 		 * reference may have been acquired before our call
   2702 		 * to rinactive.  The i/o may have been completed,
   2703 		 * thus allowing rinactive to complete, but the
   2704 		 * reference to the vnode may not have been released
   2705 		 * yet.  In any case, the rnode can not be destroyed
   2706 		 * until the other references to this vnode have been
   2707 		 * released.  The other references will take care of
   2708 		 * either destroying the rnode or placing it on the
   2709 		 * rnode freelist.  If there are no other references,
   2710 		 * then the rnode may be safely destroyed.
   2711 		 */
   2712 		mutex_enter(&vp->v_lock);
   2713 		if (vp->v_count > 1) {
   2714 			vp->v_count--;
   2715 			mutex_exit(&vp->v_lock);
   2716 			return;
   2717 		}
   2718 		mutex_exit(&vp->v_lock);
   2719 
   2720 		destroy_rnode(rp);
   2721 		return;
   2722 	}
   2723 
   2724 	/*
   2725 	 * Lock the hash queue and then recheck the reference count
   2726 	 * to ensure that no other threads have acquired a reference
   2727 	 * to indicate that the rnode should not be placed on the
   2728 	 * freelist.  If another reference has been acquired, then
   2729 	 * just release this one and let the other thread complete
   2730 	 * the processing of adding this rnode to the freelist.
   2731 	 */
   2732 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
   2733 
   2734 	mutex_enter(&vp->v_lock);
   2735 	if (vp->v_count > 1) {
   2736 		vp->v_count--;
   2737 		mutex_exit(&vp->v_lock);
   2738 		rw_exit(&rp->r_hashq->r_lock);
   2739 		return;
   2740 	}
   2741 	mutex_exit(&vp->v_lock);
   2742 
   2743 	/*
   2744 	 * If there is no cached data or metadata for this file, then
   2745 	 * put the rnode on the front of the freelist so that it will
   2746 	 * be reused before other rnodes which may have cached data or
   2747 	 * metadata associated with them.
        	 *
        	 * The freelist is a circular doubly linked list and rpfreelist
        	 * points at its head.  The rnode is always linked in between
        	 * the current tail and the head; it becomes the new head only
        	 * when it holds nothing cached, otherwise it stays the tail.
   2748 	 */
   2749 	mutex_enter(&rpfreelist_lock);
   2750 	if (rpfreelist == NULL) {
        		/* empty list: rp becomes the only element */
   2751 		rp->r_freef = rp;
   2752 		rp->r_freeb = rp;
   2753 		rpfreelist = rp;
   2754 	} else {
   2755 		rp->r_freef = rpfreelist;
   2756 		rp->r_freeb = rpfreelist->r_freeb;
   2757 		rpfreelist->r_freeb->r_freef = rp;
   2758 		rpfreelist->r_freeb = rp;
   2759 		if (!vn_has_cached_data(vp) &&
   2760 		    !HAVE_RDDIR_CACHE(rp) &&
   2761 		    rp->r_symlink.contents == NULL &&
   2762 		    rp->r_secattr == NULL &&
   2763 		    rp->r_pathconf == NULL)
   2764 			rpfreelist = rp;
   2765 	}
   2766 	mutex_exit(&rpfreelist_lock);
   2767 
   2768 	rw_exit(&rp->r_hashq->r_lock);
   2769 }
   2770 
   2771 /*
   2772  * Remove an rnode from the free list.
   2773  *
   2774  * The caller must be holding rpfreelist_lock and the rnode
   2775  * must be on the freelist.
   2776  */
   2777 static void
   2778 rp_rmfree(rnode_t *rp)
   2779 {
   2780 
   2781 	ASSERT(MUTEX_HELD(&rpfreelist_lock));
   2782 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
   2783 
   2784 	if (rp == rpfreelist) {
   2785 		rpfreelist = rp->r_freef;
   2786 		if (rp == rpfreelist)
   2787 			rpfreelist = NULL;
   2788 	}
   2789 
   2790 	rp->r_freeb->r_freef = rp->r_freef;
   2791 	rp->r_freef->r_freeb = rp->r_freeb;
   2792 
   2793 	rp->r_freef = rp->r_freeb = NULL;
   2794 }
   2795 
   2796 /*
   2797  * Put a rnode in the hash table.
   2798  *
   2799  * The caller must be holding the exclusive hash queue lock.
   2800  */
   2801 static void
   2802 rp_addhash(rnode_t *rp)
   2803 {
   2804 
   2805 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
   2806 	ASSERT(!(rp->r_flags & RHASHED));
   2807 
   2808 	rp->r_hashf = rp->r_hashq->r_hashf;
   2809 	rp->r_hashq->r_hashf = rp;
   2810 	rp->r_hashb = (rnode_t *)rp->r_hashq;
   2811 	rp->r_hashf->r_hashb = rp;
   2812 
   2813 	mutex_enter(&rp->r_statelock);
   2814 	rp->r_flags |= RHASHED;
   2815 	mutex_exit(&rp->r_statelock);
   2816 }
   2817 
   2818 /*
   2819  * Remove a rnode from the hash table.
   2820  *
   2821  * The caller must be holding the hash queue lock.
   2822  */
   2823 static void
   2824 rp_rmhash_locked(rnode_t *rp)
   2825 {
   2826 
   2827 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
   2828 	ASSERT(rp->r_flags & RHASHED);
   2829 
   2830 	rp->r_hashb->r_hashf = rp->r_hashf;
   2831 	rp->r_hashf->r_hashb = rp->r_hashb;
   2832 
   2833 	mutex_enter(&rp->r_statelock);
   2834 	rp->r_flags &= ~RHASHED;
   2835 	mutex_exit(&rp->r_statelock);
   2836 }
   2837 
   2838 /*
   2839  * Remove a rnode from the hash table.
   2840  *
   2841  * The caller must not be holding the hash queue lock.
   2842  */
   2843 void
   2844 rp_rmhash(rnode_t *rp)
   2845 {
   2846 
   2847 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
   2848 	rp_rmhash_locked(rp);
   2849 	rw_exit(&rp->r_hashq->r_lock);
   2850 }
   2851 
   2852 /*
   2853  * Lookup a rnode by fhandle.
   2854  *
   2855  * The caller must be holding the hash queue lock, either shared or exclusive.
   2856  */
   2857 static rnode_t *
   2858 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
   2859 {
   2860 	rnode_t *rp;
   2861 	vnode_t *vp;
   2862 
   2863 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
   2864 
   2865 	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
   2866 		vp = RTOV(rp);
   2867 		if (vp->v_vfsp == vfsp &&
   2868 		    rp->r_fh.fh_len == fh->fh_len &&
   2869 		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
   2870 			/*
   2871 			 * remove rnode from free list, if necessary.
   2872 			 */
   2873 			if (rp->r_freef != NULL) {
   2874 				mutex_enter(&rpfreelist_lock);
   2875 				/*
   2876 				 * If the rnode is on the freelist,
   2877 				 * then remove it and use that reference
   2878 				 * as the new reference.  Otherwise,
   2879 				 * need to increment the reference count.
   2880 				 */
   2881 				if (rp->r_freef != NULL) {
   2882 					rp_rmfree(rp);
   2883 					mutex_exit(&rpfreelist_lock);
   2884 				} else {
   2885 					mutex_exit(&rpfreelist_lock);
   2886 					VN_HOLD(vp);
   2887 				}
   2888 			} else
   2889 				VN_HOLD(vp);
   2890 			return (rp);
   2891 		}
   2892 	}
   2893 	return (NULL);
   2894 }
   2895 
   2896 /*
   2897  * Return 1 if there is a active vnode belonging to this vfs in the
   2898  * rtable cache.
   2899  *
   2900  * Several of these checks are done without holding the usual
   2901  * locks.  This is safe because destroy_rtable(), rp_addfree(),
   2902  * etc. will redo the necessary checks before actually destroying
   2903  * any rnodes.
   2904  */
   2905 int
   2906 check_rtable(struct vfs *vfsp)
   2907 {
   2908 	int index;
   2909 	rnode_t *rp;
   2910 	vnode_t *vp;
   2911 
   2912 	for (index = 0; index < rtablesize; index++) {
   2913 		rw_enter(&rtable[index].r_lock, RW_READER);
   2914 		for (rp = rtable[index].r_hashf;
   2915 		    rp != (rnode_t *)(&rtable[index]);
   2916 		    rp = rp->r_hashf) {
   2917 			vp = RTOV(rp);
   2918 			if (vp->v_vfsp == vfsp) {
   2919 				if (rp->r_freef == NULL ||
   2920 				    (vn_has_cached_data(vp) &&
   2921 				    (rp->r_flags & RDIRTY)) ||
   2922 				    rp->r_count > 0) {
   2923 					rw_exit(&rtable[index].r_lock);
   2924 					return (1);
   2925 				}
   2926 			}
   2927 		}
   2928 		rw_exit(&rtable[index].r_lock);
   2929 	}
   2930 	return (0);
   2931 }
   2932 
   2933 /*
   2934  * Destroy inactive vnodes from the hash queues which belong to this
   2935  * vfs.  It is essential that we destroy all inactive vnodes during a
   2936  * forced unmount as well as during a normal unmount.
   2937  */
   2938 void
   2939 destroy_rtable(struct vfs *vfsp, cred_t *cr)
   2940 {
   2941 	int index;
   2942 	rnode_t *rp;
   2943 	rnode_t *rlist;
   2944 	rnode_t *r_hashf;
   2945 	vnode_t *vp;
   2946 
   2947 	rlist = NULL;
   2948 
   2949 	for (index = 0; index < rtablesize; index++) {
   2950 		rw_enter(&rtable[index].r_lock, RW_WRITER);
   2951 		for (rp = rtable[index].r_hashf;
   2952 		    rp != (rnode_t *)(&rtable[index]);
   2953 		    rp = r_hashf) {
   2954 			/* save the hash pointer before destroying */
   2955 			r_hashf = rp->r_hashf;
   2956 			vp = RTOV(rp);
   2957 			if (vp->v_vfsp == vfsp) {
   2958 				mutex_enter(&rpfreelist_lock);
   2959 				if (rp->r_freef != NULL) {
   2960 					rp_rmfree(rp);
   2961 					mutex_exit(&rpfreelist_lock);
   2962 					rp_rmhash_locked(rp);
   2963 					rp->r_hashf = rlist;
   2964 					rlist = rp;
   2965 				} else
   2966 					mutex_exit(&rpfreelist_lock);
   2967 			}
   2968 		}
   2969 		rw_exit(&rtable[index].r_lock);
   2970 	}
   2971 
   2972 	for (rp = rlist; rp != NULL; rp = rlist) {
   2973 		rlist = rp->r_hashf;
   2974 		/*
   2975 		 * This call to rp_addfree will end up destroying the
   2976 		 * rnode, but in a safe way with the appropriate set
   2977 		 * of checks done.
   2978 		 */
   2979 		rp_addfree(rp, cr);
   2980 	}
   2981 
   2982 }
   2983 
   2984 /*
   2985  * This routine destroys all the resources associated with the rnode
   2986  * and then the rnode itself.
   2987  */
   2988 static void
   2989 destroy_rnode(rnode_t *rp)
   2990 {
   2991 	vnode_t *vp;
   2992 	vfs_t *vfsp;
   2993 
   2994 	vp = RTOV(rp);
   2995 	vfsp = vp->v_vfsp;
   2996 
   2997 	ASSERT(vp->v_count == 1);
   2998 	ASSERT(rp->r_count == 0);
   2999 	ASSERT(rp->r_lmpl == NULL);
   3000 	ASSERT(rp->r_mapcnt == 0);
   3001 	ASSERT(!(rp->r_flags & RHASHED));
   3002 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
   3003 	atomic_add_long((ulong_t *)&rnew, -1);
   3004 #ifdef DEBUG
   3005 	clstat_debug.nrnode.value.ui64--;
   3006 #endif
   3007 	nfs_rw_destroy(&rp->r_rwlock);
   3008 	nfs_rw_destroy(&rp->r_lkserlock);
   3009 	mutex_destroy(&rp->r_statelock);
   3010 	cv_destroy(&rp->r_cv);
   3011 	cv_destroy(&rp->r_commit.c_cv);
   3012 	if (rp->r_flags & RDELMAPLIST)
   3013 		list_destroy(&rp->r_indelmap);
   3014 	nfs_free_r_path(rp);
   3015 	avl_destroy(&rp->r_dir);
   3016 	vn_invalid(vp);
   3017 	vn_free(vp);
   3018 	kmem_cache_free(rnode_cache, rp);
   3019 	VFS_RELE(vfsp);
   3020 }
   3021 
   3022 /*
   3023  * Flush all vnodes in this (or every) vfs.
   3024  * Used by nfs_sync and by nfs_unmount.
   3025  */
   3026 void
   3027 rflush(struct vfs *vfsp, cred_t *cr)
   3028 {
   3029 	int index;
   3030 	rnode_t *rp;
   3031 	vnode_t *vp, **vplist;
   3032 	long num, cnt;
   3033 
   3034 	/*
   3035 	 * Check to see whether there is anything to do.
   3036 	 */
   3037 	num = rnew;
   3038 	if (num == 0)
   3039 		return;
   3040 
   3041 	/*
   3042 	 * Allocate a slot for all currently active rnodes on the
   3043 	 * supposition that they all may need flushing.
   3044 	 */
   3045 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
   3046 	cnt = 0;
   3047 
   3048 	/*
   3049 	 * Walk the hash queues looking for rnodes with page
   3050 	 * lists associated with them.  Make a list of these
   3051 	 * files.
   3052 	 */
   3053 	for (index = 0; index < rtablesize; index++) {
   3054 		rw_enter(&rtable[index].r_lock, RW_READER);
   3055 		for (rp = rtable[index].r_hashf;
   3056 		    rp != (rnode_t *)(&rtable[index]);
   3057 		    rp = rp->r_hashf) {
   3058 			vp = RTOV(rp);
   3059 			/*
   3060 			 * Don't bother sync'ing a vp if it
   3061 			 * is part of virtual swap device or
   3062 			 * if VFS is read-only
   3063 			 */
   3064 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
   3065 				continue;
   3066 			/*
   3067 			 * If flushing all mounted file systems or
   3068 			 * the vnode belongs to this vfs, has pages
   3069 			 * and is marked as either dirty or mmap'd,
   3070 			 * hold and add this vnode to the list of
   3071 			 * vnodes to flush.
   3072 			 */
   3073 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
   3074 			    vn_has_cached_data(vp) &&
   3075 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
   3076 				VN_HOLD(vp);
   3077 				vplist[cnt++] = vp;
   3078 				if (cnt == num) {
   3079 					rw_exit(&rtable[index].r_lock);
   3080 					goto toomany;
   3081 				}
   3082 			}
   3083 		}
   3084 		rw_exit(&rtable[index].r_lock);
   3085 	}
   3086 toomany:
   3087 
   3088 	/*
   3089 	 * Flush and release all of the files on the list.
   3090 	 */
   3091 	while (cnt-- > 0) {
   3092 		vp = vplist[cnt];
   3093 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
   3094 		VN_RELE(vp);
   3095 	}
   3096 
   3097 	/*
   3098 	 * Free the space allocated to hold the list.
   3099 	 */
   3100 	kmem_free(vplist, num * sizeof (*vplist));
   3101 }
   3102 
   3103 /*
   3104  * This probably needs to be larger than or equal to
   3105  * log2(sizeof (struct rnode)) due to the way that rnodes are
   3106  * allocated.
   3107  */
   3108 #define	ACACHE_SHIFT_BITS	9
   3109 
   3110 static int
   3111 acachehash(rnode_t *rp, cred_t *cr)
   3112 {
   3113 
   3114 	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
   3115 	    acachemask);
   3116 }
   3117 
   3118 #ifdef DEBUG
   3119 static long nfs_access_cache_hits = 0;
   3120 static long nfs_access_cache_misses = 0;
   3121 #endif
   3122 
   3123 nfs_access_type_t
   3124 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
   3125 {
   3126 	vnode_t *vp;
   3127 	acache_t *ap;
   3128 	acache_hash_t *hp;
   3129 	nfs_access_type_t all;
   3130 
   3131 	vp = RTOV(rp);
   3132 	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
   3133 		return (NFS_ACCESS_UNKNOWN);
   3134 
   3135 	if (rp->r_acache != NULL) {
   3136 		hp = &acache[acachehash(rp, cr)];
   3137 		rw_enter(&hp->lock, RW_READER);
   3138 		ap = hp->next;
   3139 		while (ap != (acache_t *)hp) {
   3140 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
   3141 				if ((ap->known & acc) == acc) {
   3142 #ifdef DEBUG
   3143 					nfs_access_cache_hits++;
   3144 #endif
   3145 					if ((ap->allowed & acc) == acc)
   3146 						all = NFS_ACCESS_ALLOWED;
   3147 					else
   3148 						all = NFS_ACCESS_DENIED;
   3149 				} else {
   3150 #ifdef DEBUG
   3151 					nfs_access_cache_misses++;
   3152 #endif
   3153 					all = NFS_ACCESS_UNKNOWN;
   3154 				}
   3155 				rw_exit(&hp->lock);
   3156 				return (all);
   3157 			}
   3158 			ap = ap->next;
   3159 		}
   3160 		rw_exit(&hp->lock);
   3161 	}
   3162 
   3163 #ifdef DEBUG
   3164 	nfs_access_cache_misses++;
   3165 #endif
   3166 	return (NFS_ACCESS_UNKNOWN);
   3167 }
   3168 
   3169 void
   3170 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
   3171 {
   3172 	acache_t *ap;
   3173 	acache_t *nap;
   3174 	acache_hash_t *hp;
   3175 
   3176 	hp = &acache[acachehash(rp, cr)];
   3177 
   3178 	/*
   3179 	 * Allocate now assuming that mostly an allocation will be
   3180 	 * required.  This allows the allocation to happen without
   3181 	 * holding the hash bucket locked.
   3182 	 */
   3183 	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
   3184 	if (nap != NULL) {
   3185 		nap->known = acc;
   3186 		nap->allowed = resacc;
   3187 		nap->rnode = rp;
   3188 		crhold(cr);
   3189 		nap->cred = cr;
   3190 		nap->hashq = hp;
   3191 	}
   3192 
   3193 	rw_enter(&hp->lock, RW_WRITER);
   3194 
   3195 	if (rp->r_acache != NULL) {
   3196 		ap = hp->next;
   3197 		while (ap != (acache_t *)hp) {
   3198 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
   3199 				ap->known |= acc;
   3200 				ap->allowed &= ~acc;
   3201 				ap->allowed |= resacc;
   3202 				rw_exit(&hp->lock);
   3203 				if (nap != NULL) {
   3204 					crfree(nap->cred);
   3205 					kmem_cache_free(acache_cache, nap);
   3206 				}
   3207 				return;
   3208 			}
   3209 			ap = ap->next;
   3210 		}
   3211 	}
   3212 
   3213 	if (nap != NULL) {
   3214 #ifdef DEBUG
   3215 		clstat_debug.access.value.ui64++;
   3216 #endif
   3217 		nap->next = hp->next;
   3218 		hp->next = nap;
   3219 		nap->next->prev = nap;
   3220 		nap->prev = (acache_t *)hp;
   3221 
   3222 		mutex_enter(&rp->r_statelock);
   3223 		nap->list = rp->r_acache;
   3224 		rp->r_acache = nap;
   3225 		mutex_exit(&rp->r_statelock);
   3226 	}
   3227 
   3228 	rw_exit(&hp->lock);
   3229 }
   3230 
   3231 int
   3232 nfs_access_purge_rp(rnode_t *rp)
   3233 {
   3234 	acache_t *ap;
   3235 	acache_t *tmpap;
   3236 	acache_t *rplist;
   3237 
   3238 	/*
   3239 	 * If there aren't any cached entries, then there is nothing
   3240 	 * to free.
   3241 	 */
   3242 	if (rp->r_acache == NULL)
   3243 		return (0);
   3244 
   3245 	mutex_enter(&rp->r_statelock);
   3246 	rplist = rp->r_acache;
   3247 	rp->r_acache = NULL;
   3248 	mutex_exit(&rp->r_statelock);
   3249 
   3250 	/*
   3251 	 * Loop through each entry in the list pointed to in the
   3252 	 * rnode.  Remove each of these entries from the hash
   3253 	 * queue that it is on and remove it from the list in
   3254 	 * the rnode.
   3255 	 */
   3256 	for (ap = rplist; ap != NULL; ap = tmpap) {
   3257 		rw_enter(&ap->hashq->lock, RW_WRITER);
   3258 		ap->prev->next = ap->next;
   3259 		ap->next->prev = ap->prev;
   3260 		rw_exit(&ap->hashq->lock);
   3261 
   3262 		tmpap = ap->list;
   3263 		crfree(ap->cred);
   3264 		kmem_cache_free(acache_cache, ap);
   3265 #ifdef DEBUG
   3266 		clstat_debug.access.value.ui64--;
   3267 #endif
   3268 	}
   3269 
   3270 	return (1);
   3271 }
   3272 
   3273 static const char prefix[] = ".nfs";
   3274 
   3275 static kmutex_t newnum_lock;
   3276 
   3277 int
   3278 newnum(void)
   3279 {
   3280 	static uint_t newnum = 0;
   3281 	uint_t id;
   3282 
   3283 	mutex_enter(&newnum_lock);
   3284 	if (newnum == 0)
   3285 		newnum = gethrestime_sec() & 0xffff;
   3286 	id = newnum++;
   3287 	mutex_exit(&newnum_lock);
   3288 	return (id);
   3289 }
   3290 
   3291 char *
   3292 newname(void)
   3293 {
   3294 	char *news;
   3295 	char *s;
   3296 	const char *p;
   3297 	uint_t id;
   3298 
   3299 	id = newnum();
   3300 	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
   3301 	s = news;
   3302 	p = prefix;
   3303 	while (*p != '\0')
   3304 		*s++ = *p++;
   3305 	while (id != 0) {
   3306 		*s++ = "0123456789ABCDEF"[id & 0x0f];
   3307 		id >>= 4;
   3308 	}
   3309 	*s = '\0';
   3310 	return (news);
   3311 }
   3312 
   3313 /*
   3314  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
   3315  * framework.
   3316  */
   3317 static int
   3318 cl_snapshot(kstat_t *ksp, void *buf, int rw)
   3319 {
   3320 	ksp->ks_snaptime = gethrtime();
   3321 	if (rw == KSTAT_WRITE) {
   3322 		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
   3323 #ifdef DEBUG
   3324 		/*
   3325 		 * Currently only the global zone can write to kstats, but we
   3326 		 * add the check just for paranoia.
   3327 		 */
   3328 		if (INGLOBALZONE(curproc))
   3329 			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
   3330 			    sizeof (clstat_debug));
   3331 #endif
   3332 	} else {
   3333 		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
   3334 #ifdef DEBUG
   3335 		/*
   3336 		 * If we're displaying the "global" debug kstat values, we
   3337 		 * display them as-is to all zones since in fact they apply to
   3338 		 * the system as a whole.
   3339 		 */
   3340 		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
   3341 		    sizeof (clstat_debug));
   3342 #endif
   3343 	}
   3344 	return (0);
   3345 }
   3346 
   3347 static void *
   3348 clinit_zone(zoneid_t zoneid)
   3349 {
   3350 	kstat_t *nfs_client_kstat;
   3351 	struct nfs_clnt *nfscl;
   3352 	uint_t ndata;
   3353 
   3354 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
   3355 	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
   3356 	nfscl->nfscl_chtable = NULL;
   3357 	nfscl->nfscl_zoneid = zoneid;
   3358 
   3359 	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
   3360 	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
   3361 #ifdef DEBUG
   3362 	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
   3363 #endif
   3364 	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
   3365 	    "misc", KSTAT_TYPE_NAMED, ndata,
   3366 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
   3367 		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
   3368 		nfs_client_kstat->ks_snapshot = cl_snapshot;
   3369 		kstat_install(nfs_client_kstat);
   3370 	}
   3371 	mutex_enter(&nfs_clnt_list_lock);
   3372 	list_insert_head(&nfs_clnt_list, nfscl);
   3373 	mutex_exit(&nfs_clnt_list_lock);
   3374 	return (nfscl);
   3375 }
   3376 
   3377 /*ARGSUSED*/
   3378 static void
   3379 clfini_zone(zoneid_t zoneid, void *arg)
   3380 {
   3381 	struct nfs_clnt *nfscl = arg;
   3382 	chhead_t *chp, *next;
   3383 
   3384 	if (nfscl == NULL)
   3385 		return;
   3386 	mutex_enter(&nfs_clnt_list_lock);
   3387 	list_remove(&nfs_clnt_list, nfscl);
   3388 	mutex_exit(&nfs_clnt_list_lock);
   3389 	clreclaim_zone(nfscl, 0);
   3390 	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
   3391 		ASSERT(chp->ch_list == NULL);
   3392 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
   3393 		next = chp->ch_next;
   3394 		kmem_free(chp, sizeof (*chp));
   3395 	}
   3396 	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
   3397 	mutex_destroy(&nfscl->nfscl_chtable_lock);
   3398 	kmem_free(nfscl, sizeof (*nfscl));
   3399 }
   3400 
   3401 /*
   3402  * Called by endpnt_destructor to make sure the client handles are
   3403  * cleaned up before the RPC endpoints.  This becomes a no-op if
   3404  * clfini_zone (above) is called first.  This function is needed
   3405  * (rather than relying on clfini_zone to clean up) because the ZSD
   3406  * callbacks have no ordering mechanism, so we have no way to ensure
   3407  * that clfini_zone is called before endpnt_destructor.
   3408  */
   3409 void
   3410 clcleanup_zone(zoneid_t zoneid)
   3411 {
   3412 	struct nfs_clnt *nfscl;
   3413 
   3414 	mutex_enter(&nfs_clnt_list_lock);
   3415 	nfscl = list_head(&nfs_clnt_list);
   3416 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
   3417 		if (nfscl->nfscl_zoneid == zoneid) {
   3418 			clreclaim_zone(nfscl, 0);
   3419 			break;
   3420 		}
   3421 	}
   3422 	mutex_exit(&nfs_clnt_list_lock);
   3423 }
   3424 
   3425 int
   3426 nfs_subrinit(void)
   3427 {
   3428 	int i;
   3429 	ulong_t nrnode_max;
   3430 
   3431 	/*
   3432 	 * Allocate and initialize the rnode hash queues
   3433 	 */
   3434 	if (nrnode <= 0)
   3435 		nrnode = ncsize;
   3436 	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
   3437 	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
   3438 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
   3439 		    "setting nrnode to max value of %ld", nrnode_max);
   3440 		nrnode = nrnode_max;
   3441 	}
   3442 
   3443 	rtablesize = 1 << highbit(nrnode / hashlen);
   3444 	rtablemask = rtablesize - 1;
   3445 	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
   3446 	for (i = 0; i < rtablesize; i++) {
   3447 		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
   3448 		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
   3449 		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
   3450 	}
   3451 	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
   3452 	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
   3453 
   3454 	/*
   3455 	 * Allocate and initialize the access cache
   3456 	 */
   3457 
   3458 	/*
   3459 	 * Initial guess is one access cache entry per rnode unless
   3460 	 * nacache is set to a non-zero value and then it is used to
   3461 	 * indicate a guess at the number of access cache entries.
   3462 	 */
   3463 	if (nacache > 0)
   3464 		acachesize = 1 << highbit(nacache / hashlen);
   3465 	else
   3466 		acachesize = rtablesize;
   3467 	acachemask = acachesize - 1;
   3468 	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
   3469 	for (i = 0; i < acachesize; i++) {
   3470 		acache[i].next = (acache_t *)&acache[i];
   3471 		acache[i].prev = (acache_t *)&acache[i];
   3472 		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
   3473 	}
   3474 	acache_cache = kmem_cache_create("nfs_access_cache",
   3475 	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
   3476 	/*
   3477 	 * Allocate and initialize the client handle cache
   3478 	 */
   3479 	chtab_cache = kmem_cache_create("client_handle_cache",
   3480 	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
   3481 	/*
   3482 	 * Initialize the list of per-zone client handles (and associated data).
   3483 	 * This needs to be done before we call zone_key_create().
   3484 	 */
   3485 	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
   3486 	    offsetof(struct nfs_clnt, nfscl_node));
   3487 	/*
   3488 	 * Initialize the zone_key for per-zone client handle lists.
   3489 	 */
   3490 	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
   3491 	/*
   3492 	 * Initialize the various mutexes and reader/writer locks
   3493 	 */
   3494 	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
   3495 	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
   3496 	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
   3497 
   3498 	/*
   3499 	 * Assign unique major number for all nfs mounts
   3500 	 */
   3501 	if ((nfs_major = getudev()) == -1) {
   3502 		zcmn_err(GLOBAL_ZONEID, CE_WARN,
   3503 		    "nfs: init: can't get unique device number");
   3504 		nfs_major = 0;
   3505 	}
   3506 	nfs_minor = 0;
   3507 
   3508 	if (nfs3_jukebox_delay == 0)
   3509 		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
   3510 
   3511 	return (0);
   3512 }
   3513 
   3514 void
   3515 nfs_subrfini(void)
   3516 {
   3517 	int i;
   3518 
   3519 	/*
   3520 	 * Deallocate the rnode hash queues
   3521 	 */
   3522 	kmem_cache_destroy(rnode_cache);
   3523 
   3524 	for (i = 0; i < rtablesize; i++)
   3525 		rw_destroy(&rtable[i].r_lock);
   3526 	kmem_free(rtable, rtablesize * sizeof (*rtable));
   3527 
   3528 	/*
   3529 	 * Deallocated the access cache
   3530 	 */
   3531 	kmem_cache_destroy(acache_cache);
   3532 
   3533 	for (i = 0; i < acachesize; i++)
   3534 		rw_destroy(&acache[i].lock);
   3535 	kmem_free(acache, acachesize * sizeof (*acache));
   3536 
   3537 	/*
   3538 	 * Deallocate the client handle cache
   3539 	 */
   3540 	kmem_cache_destroy(chtab_cache);
   3541 
   3542 	/*
   3543 	 * Destroy the various mutexes and reader/writer locks
   3544 	 */
   3545 	mutex_destroy(&rpfreelist_lock);
   3546 	mutex_destroy(&newnum_lock);
   3547 	mutex_destroy(&nfs_minor_lock);
   3548 	(void) zone_key_delete(nfsclnt_zone_key);
   3549 }
   3550 
   3551 enum nfsstat
   3552 puterrno(int error)
   3553 {
   3554 
   3555 	switch (error) {
   3556 	case EOPNOTSUPP:
   3557 		return (NFSERR_OPNOTSUPP);
   3558 	case ENAMETOOLONG:
   3559 		return (NFSERR_NAMETOOLONG);
   3560 	case ENOTEMPTY:
   3561 		return (NFSERR_NOTEMPTY);
   3562 	case EDQUOT:
   3563 		return (NFSERR_DQUOT);
   3564 	case ESTALE:
   3565 		return (NFSERR_STALE);
   3566 	case EREMOTE:
   3567 		return (NFSERR_REMOTE);
   3568 	case ENOSYS:
   3569 		return (NFSERR_OPNOTSUPP);
   3570 	case EOVERFLOW:
   3571 		return (NFSERR_INVAL);
   3572 	default:
   3573 		return ((enum nfsstat)error);
   3574 	}
   3575 	/* NOTREACHED */
   3576 }
   3577 
   3578 int
   3579 geterrno(enum nfsstat status)
   3580 {
   3581 
   3582 	switch (status) {
   3583 	case NFSERR_OPNOTSUPP:
   3584 		return (EOPNOTSUPP);
   3585 	case NFSERR_NAMETOOLONG:
   3586 		return (ENAMETOOLONG);
   3587 	case NFSERR_NOTEMPTY:
   3588 		return (ENOTEMPTY);
   3589 	case NFSERR_DQUOT:
   3590 		return (EDQUOT);
   3591 	case NFSERR_STALE:
   3592 		return (ESTALE);
   3593 	case NFSERR_REMOTE:
   3594 		return (EREMOTE);
   3595 	case NFSERR_WFLUSH:
   3596 		return (EIO);
   3597 	default:
   3598 		return ((int)status);
   3599 	}
   3600 	/* NOTREACHED */
   3601 }
   3602 
   3603 enum nfsstat3
   3604 puterrno3(int error)
   3605 {
   3606 
   3607 #ifdef DEBUG
   3608 	switch (error) {
   3609 	case 0:
   3610 		return (NFS3_OK);
   3611 	case EPERM:
   3612 		return (NFS3ERR_PERM);
   3613 	case ENOENT:
   3614 		return (NFS3ERR_NOENT);
   3615 	case EIO:
   3616 		return (NFS3ERR_IO);
   3617 	case ENXIO:
   3618 		return (NFS3ERR_NXIO);
   3619 	case EACCES:
   3620 		return (NFS3ERR_ACCES);
   3621 	case EEXIST:
   3622 		return (NFS3ERR_EXIST);
   3623 	case EXDEV:
   3624 		return (NFS3ERR_XDEV);
   3625 	case ENODEV:
   3626 		return (NFS3ERR_NODEV);
   3627 	case ENOTDIR:
   3628 		return (NFS3ERR_NOTDIR);
   3629 	case EISDIR:
   3630 		return (NFS3ERR_ISDIR);
   3631 	case EINVAL:
   3632 		return (NFS3ERR_INVAL);
   3633 	case EFBIG:
   3634 		return (NFS3ERR_FBIG);
   3635 	case ENOSPC:
   3636 		return (NFS3ERR_NOSPC);
   3637 	case EROFS:
   3638 		return (NFS3ERR_ROFS);
   3639 	case EMLINK:
   3640 		return (NFS3ERR_MLINK);
   3641 	case ENAMETOOLONG:
   3642 		return (NFS3ERR_NAMETOOLONG);
   3643 	case ENOTEMPTY:
   3644 		return (NFS3ERR_NOTEMPTY);
   3645 	case EDQUOT:
   3646 		return (NFS3ERR_DQUOT);
   3647 	case ESTALE:
   3648 		return (NFS3ERR_STALE);
   3649 	case EREMOTE:
   3650 		return (NFS3ERR_REMOTE);
   3651 	case ENOSYS:
   3652 	case EOPNOTSUPP:
   3653 		return (NFS3ERR_NOTSUPP);
   3654 	case EOVERFLOW:
   3655 		return (NFS3ERR_INVAL);
   3656 	default:
   3657 		zcmn_err(getzoneid(), CE_WARN,
   3658 		    "puterrno3: got error %d", error);
   3659 		return ((enum nfsstat3)error);
   3660 	}
   3661 #else
   3662 	switch (error) {
   3663 	case ENAMETOOLONG:
   3664 		return (NFS3ERR_NAMETOOLONG);
   3665 	case ENOTEMPTY:
   3666 		return (NFS3ERR_NOTEMPTY);
   3667 	case EDQUOT:
   3668 		return (NFS3ERR_DQUOT);
   3669 	case ESTALE:
   3670 		return (NFS3ERR_STALE);
   3671 	case ENOSYS:
   3672 	case EOPNOTSUPP:
   3673 		return (NFS3ERR_NOTSUPP);
   3674 	case EREMOTE:
   3675 		return (NFS3ERR_REMOTE);
   3676 	case EOVERFLOW:
   3677 		return (NFS3ERR_INVAL);
   3678 	default:
   3679 		return ((enum nfsstat3)error);
   3680 	}
   3681 #endif
   3682 }
   3683 
   3684 int
   3685 geterrno3(enum nfsstat3 status)
   3686 {
   3687 
   3688 #ifdef DEBUG
   3689 	switch (status) {
   3690 	case NFS3_OK:
   3691 		return (0);
   3692 	case NFS3ERR_PERM:
   3693 		return (EPERM);
   3694 	case NFS3ERR_NOENT:
   3695 		return (ENOENT);
   3696 	case NFS3ERR_IO:
   3697 		return (EIO);
   3698 	case NFS3ERR_NXIO:
   3699 		return (ENXIO);
   3700 	case NFS3ERR_ACCES:
   3701 		return (EACCES);
   3702 	case NFS3ERR_EXIST:
   3703 		return (EEXIST);
   3704 	case NFS3ERR_XDEV:
   3705 		return (EXDEV);
   3706 	case NFS3ERR_NODEV:
   3707 		return (ENODEV);
   3708 	case NFS3ERR_NOTDIR:
   3709 		return (ENOTDIR);
   3710 	case NFS3ERR_ISDIR:
   3711 		return (EISDIR);
   3712 	case NFS3ERR_INVAL:
   3713 		return (EINVAL);
   3714 	case NFS3ERR_FBIG:
   3715 		return (EFBIG);
   3716 	case NFS3ERR_NOSPC:
   3717 		return (ENOSPC);
   3718 	case NFS3ERR_ROFS:
   3719 		return (EROFS);
   3720 	case NFS3ERR_MLINK:
   3721 		return (EMLINK);
   3722 	case NFS3ERR_NAMETOOLONG:
   3723 		return (ENAMETOOLONG);
   3724 	case NFS3ERR_NOTEMPTY:
   3725 		return (ENOTEMPTY);
   3726 	case NFS3ERR_DQUOT:
   3727 		return (EDQUOT);
   3728 	case NFS3ERR_STALE:
   3729 		return (ESTALE);
   3730 	case NFS3ERR_REMOTE:
   3731 		return (EREMOTE);
   3732 	case NFS3ERR_BADHANDLE:
   3733 		return (ESTALE);
   3734 	case NFS3ERR_NOT_SYNC:
   3735 		return (EINVAL);
   3736 	case NFS3ERR_BAD_COOKIE:
   3737 		return (ENOENT);
   3738 	case NFS3ERR_NOTSUPP:
   3739 		return (EOPNOTSUPP);
   3740 	case NFS3ERR_TOOSMALL:
   3741 		return (EINVAL);
   3742 	case NFS3ERR_SERVERFAULT:
   3743 		return (EIO);
   3744 	case NFS3ERR_BADTYPE:
   3745 		return (EINVAL);
   3746 	case NFS3ERR_JUKEBOX:
   3747 		return (ENXIO);
   3748 	default:
   3749 		zcmn_err(getzoneid(), CE_WARN,
   3750 		    "geterrno3: got status %d", status);
   3751 		return ((int)status);
   3752 	}
   3753 #else
   3754 	switch (status) {
   3755 	case NFS3ERR_NAMETOOLONG:
   3756 		return (ENAMETOOLONG);
   3757 	case NFS3ERR_NOTEMPTY:
   3758 		return (ENOTEMPTY);
   3759 	case NFS3ERR_DQUOT:
   3760 		return (EDQUOT);
   3761 	case NFS3ERR_STALE:
   3762 	case NFS3ERR_BADHANDLE:
   3763 		return (ESTALE);
   3764 	case NFS3ERR_NOTSUPP:
   3765 		return (EOPNOTSUPP);
   3766 	case NFS3ERR_REMOTE:
   3767 		return (EREMOTE);
   3768 	case NFS3ERR_NOT_SYNC:
   3769 	case NFS3ERR_TOOSMALL:
   3770 	case NFS3ERR_BADTYPE:
   3771 		return (EINVAL);
   3772 	case NFS3ERR_BAD_COOKIE:
   3773 		return (ENOENT);
   3774 	case NFS3ERR_SERVERFAULT:
   3775 		return (EIO);
   3776 	case NFS3ERR_JUKEBOX:
   3777 		return (ENXIO);
   3778 	default:
   3779 		return ((int)status);
   3780 	}
   3781 #endif
   3782 }
   3783 
   3784 rddir_cache *
   3785 rddir_cache_alloc(int flags)
   3786 {
   3787 	rddir_cache *rc;
   3788 
   3789 	rc = kmem_alloc(sizeof (*rc), flags);
   3790 	if (rc != NULL) {
   3791 		rc->entries = NULL;
   3792 		rc->flags = RDDIR;
   3793 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
   3794 		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
   3795 		rc->count = 1;
   3796 #ifdef DEBUG
   3797 		atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
   3798 #endif
   3799 	}
   3800 	return (rc);
   3801 }
   3802 
   3803 static void
   3804 rddir_cache_free(rddir_cache *rc)
   3805 {
   3806 
   3807 #ifdef DEBUG
   3808 	atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
   3809 #endif
   3810 	if (rc->entries != NULL) {
   3811 #ifdef DEBUG
   3812 		rddir_cache_buf_free(rc->entries, rc->buflen);
   3813 #else
   3814 		kmem_free(rc->entries, rc->buflen);
   3815 #endif
   3816 	}
   3817 	cv_destroy(&rc->cv);
   3818 	mutex_destroy(&rc->lock);
   3819 	kmem_free(rc, sizeof (*rc));
   3820 }
   3821 
   3822 void
   3823 rddir_cache_hold(rddir_cache *rc)
   3824 {
   3825 
   3826 	mutex_enter(&rc->lock);
   3827 	rc->count++;
   3828 	mutex_exit(&rc->lock);
   3829 }
   3830 
   3831 void
   3832 rddir_cache_rele(rddir_cache *rc)
   3833 {
   3834 
   3835 	mutex_enter(&rc->lock);
   3836 	ASSERT(rc->count > 0);
   3837 	if (--rc->count == 0) {
   3838 		mutex_exit(&rc->lock);
   3839 		rddir_cache_free(rc);
   3840 	} else
   3841 		mutex_exit(&rc->lock);
   3842 }
   3843 
   3844 #ifdef DEBUG
   3845 char *
   3846 rddir_cache_buf_alloc(size_t size, int flags)
   3847 {
   3848 	char *rc;
   3849 
   3850 	rc = kmem_alloc(size, flags);
   3851 	if (rc != NULL)
   3852 		atomic_add_64(&clstat_debug.dirents.value.ui64, size);
   3853 	return (rc);
   3854 }
   3855 
   3856 void
   3857 rddir_cache_buf_free(void *addr, size_t size)
   3858 {
   3859 
   3860 	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
   3861 	kmem_free(addr, size);
   3862 }
   3863 #endif
   3864 
   3865 static int
   3866 nfs_free_data_reclaim(rnode_t *rp)
   3867 {
   3868 	char *contents;
   3869 	int size;
   3870 	vsecattr_t *vsp;
   3871 	nfs3_pathconf_info *info;
   3872 	int freed;
   3873 	cred_t *cred;
   3874 
   3875 	/*
   3876 	 * Free any held credentials and caches which
   3877 	 * may be associated with this rnode.
   3878 	 */
   3879 	mutex_enter(&rp->r_statelock);
   3880 	cred = rp->r_cred;
   3881 	rp->r_cred = NULL;
   3882 	contents = rp->r_symlink.contents;
   3883 	size = rp->r_symlink.size;
   3884 	rp->r_symlink.contents = NULL;
   3885 	vsp = rp->r_secattr;
   3886 	rp->r_secattr = NULL;
   3887 	info = rp->r_pathconf;
   3888 	rp->r_pathconf = NULL;
   3889 	mutex_exit(&rp->r_statelock);
   3890 
   3891 	if (cred != NULL)
   3892 		crfree(cred);
   3893 
   3894 	/*
   3895 	 * Free the access cache entries.
   3896 	 */
   3897 	freed = nfs_access_purge_rp(rp);
   3898 
   3899 	if (!HAVE_RDDIR_CACHE(rp) &&
   3900 	    contents == NULL &&
   3901 	    vsp == NULL &&
   3902 	    info == NULL)
   3903 		return (freed);
   3904 
   3905 	/*
   3906 	 * Free the readdir cache entries
   3907 	 */
   3908 	if (HAVE_RDDIR_CACHE(rp))
   3909 		nfs_purge_rddir_cache(RTOV(rp));
   3910 
   3911 	/*
   3912 	 * Free the symbolic link cache.
   3913 	 */
   3914 	if (contents != NULL) {
   3915 
   3916 		kmem_free((void *)contents, size);
   3917 	}
   3918 
   3919 	/*
   3920 	 * Free any cached ACL.
   3921 	 */
   3922 	if (vsp != NULL)
   3923 		nfs_acl_free(vsp);
   3924 
   3925 	/*
   3926 	 * Free any cached pathconf information.
   3927 	 */
   3928 	if (info != NULL)
   3929 		kmem_free(info, sizeof (*info));
   3930 
   3931 	return (1);
   3932 }
   3933 
   3934 static int
   3935 nfs_active_data_reclaim(rnode_t *rp)
   3936 {
   3937 	char *contents;
   3938 	int size;
   3939 	vsecattr_t *vsp;
   3940 	nfs3_pathconf_info *info;
   3941 	int freed;
   3942 
   3943 	/*
   3944 	 * Free any held credentials and caches which
   3945 	 * may be associated with this rnode.
   3946 	 */
   3947 	if (!mutex_tryenter(&rp->r_statelock))
   3948 		return (0);
   3949 	contents = rp->r_symlink.contents;
   3950 	size = rp->r_symlink.size;
   3951 	rp->r_symlink.contents = NULL;
   3952 	vsp = rp->r_secattr;
   3953 	rp->r_secattr = NULL;
   3954 	info = rp->r_pathconf;
   3955 	rp->r_pathconf = NULL;
   3956 	mutex_exit(&rp->r_statelock);
   3957 
   3958 	/*
   3959 	 * Free the access cache entries.
   3960 	 */
   3961 	freed = nfs_access_purge_rp(rp);
   3962 
   3963 	if (!HAVE_RDDIR_CACHE(rp) &&
   3964 	    contents == NULL &&
   3965 	    vsp == NULL &&
   3966 	    info == NULL)
   3967 		return (freed);
   3968 
   3969 	/*
   3970 	 * Free the readdir cache entries
   3971 	 */
   3972 	if (HAVE_RDDIR_CACHE(rp))
   3973 		nfs_purge_rddir_cache(RTOV(rp));
   3974 
   3975 	/*
   3976 	 * Free the symbolic link cache.
   3977 	 */
   3978 	if (contents != NULL) {
   3979 
   3980 		kmem_free((void *)contents, size);
   3981 	}
   3982 
   3983 	/*
   3984 	 * Free any cached ACL.
   3985 	 */
   3986 	if (vsp != NULL)
   3987 		nfs_acl_free(vsp);
   3988 
   3989 	/*
   3990 	 * Free any cached pathconf information.
   3991 	 */
   3992 	if (info != NULL)
   3993 		kmem_free(info, sizeof (*info));
   3994 
   3995 	return (1);
   3996 }
   3997 
   3998 static int
   3999 nfs_free_reclaim(void)
   4000 {
   4001 	int freed;
   4002 	rnode_t *rp;
   4003 
   4004 #ifdef DEBUG
   4005 	clstat_debug.f_reclaim.value.ui64++;
   4006 #endif
   4007 	freed = 0;
   4008 	mutex_enter(&rpfreelist_lock);
   4009 	rp = rpfreelist;
   4010 	if (rp != NULL) {
   4011 		do {
   4012 			if (nfs_free_data_reclaim(rp))
   4013 				freed = 1;
   4014 		} while ((rp = rp->r_freef) != rpfreelist);
   4015 	}
   4016 	mutex_exit(&rpfreelist_lock);
   4017 	return (freed);
   4018 }
   4019 
   4020 static int
   4021 nfs_active_reclaim(void)
   4022 {
   4023 	int freed;
   4024 	int index;
   4025 	rnode_t *rp;
   4026 
   4027 #ifdef DEBUG
   4028 	clstat_debug.a_reclaim.value.ui64++;
   4029 #endif
   4030 	freed = 0;
   4031 	for (index = 0; index < rtablesize; index++) {
   4032 		rw_enter(&rtable[index].r_lock, RW_READER);
   4033 		for (rp = rtable[index].r_hashf;
   4034 		    rp != (rnode_t *)(&rtable[index]);
   4035 		    rp = rp->r_hashf) {
   4036 			if (nfs_active_data_reclaim(rp))
   4037 				freed = 1;
   4038 		}
   4039 		rw_exit(&rtable[index].r_lock);
   4040 	}
   4041 	return (freed);
   4042 }
   4043 
   4044 static int
   4045 nfs_rnode_reclaim(void)
   4046 {
   4047 	int freed;
   4048 	rnode_t *rp;
   4049 	vnode_t *vp;
   4050 
   4051 #ifdef DEBUG
   4052 	clstat_debug.r_reclaim.value.ui64++;
   4053 #endif
   4054 	freed = 0;
   4055 	mutex_enter(&rpfreelist_lock);
   4056 	while ((rp = rpfreelist) != NULL) {
   4057 		rp_rmfree(rp);
   4058 		mutex_exit(&rpfreelist_lock);
   4059 		if (rp->r_flags & RHASHED) {
   4060 			vp = RTOV(rp);
   4061 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
   4062 			mutex_enter(&vp->v_lock);
   4063 			if (vp->v_count > 1) {
   4064 				vp->v_count--;
   4065 				mutex_exit(&vp->v_lock);
   4066 				rw_exit(&rp->r_hashq->r_lock);
   4067 				mutex_enter(&rpfreelist_lock);
   4068 				continue;
   4069 			}
   4070 			mutex_exit(&vp->v_lock);
   4071 			rp_rmhash_locked(rp);
   4072 			rw_exit(&rp->r_hashq->r_lock);
   4073 		}
   4074 		/*
   4075 		 * This call to rp_addfree will end up destroying the
   4076 		 * rnode, but in a safe way with the appropriate set
   4077 		 * of checks done.
   4078 		 */
   4079 		rp_addfree(rp, CRED());
   4080 		mutex_enter(&rpfreelist_lock);
   4081 	}
   4082 	mutex_exit(&rpfreelist_lock);
   4083 	return (freed);
   4084 }
   4085 
   4086 /*ARGSUSED*/
   4087 static void
   4088 nfs_reclaim(void *cdrarg)
   4089 {
   4090 
   4091 #ifdef DEBUG
   4092 	clstat_debug.reclaim.value.ui64++;
   4093 #endif
   4094 	if (nfs_free_reclaim())
   4095 		return;
   4096 
   4097 	if (nfs_active_reclaim())
   4098 		return;
   4099 
   4100 	(void) nfs_rnode_reclaim();
   4101 }
   4102 
   4103 /*
   4104  * NFS client failover support
   4105  *
   4106  * Routines to copy filehandles
   4107  */
   4108 void
   4109 nfscopyfh(caddr_t fhp, vnode_t *vp)
   4110 {
   4111 	fhandle_t *dest = (fhandle_t *)fhp;
   4112 
   4113 	if (dest != NULL)
   4114 		*dest = *VTOFH(vp);
   4115 }
   4116 
   4117 void
   4118 nfs3copyfh(caddr_t fhp, vnode_t *vp)
   4119 {
   4120 	nfs_fh3 *dest = (nfs_fh3 *)fhp;
   4121 
   4122 	if (dest != NULL)
   4123 		*dest = *VTOFH3(vp);
   4124 }
   4125 
   4126 /*
   4127  * NFS client failover support
   4128  *
   4129  * failover_safe() will test various conditions to ensure that
   4130  * failover is permitted for this vnode.  It will be denied
   4131  * if:
   4132  *	1) the operation in progress does not support failover (NULL fi)
   4133  *	2) there are no available replicas (NULL mi_servers->sv_next)
   4134  *	3) any locks are outstanding on this file
   4135  */
   4136 static int
   4137 failover_safe(failinfo_t *fi)
   4138 {
   4139 
   4140 	/*
   4141 	 * Does this op permit failover?
   4142 	 */
   4143 	if (fi == NULL || fi->vp == NULL)
   4144 		return (0);
   4145 
   4146 	/*
   4147 	 * Are there any alternates to failover to?
   4148 	 */
   4149 	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
   4150 		return (0);
   4151 
   4152 	/*
   4153 	 * Disable check; we've forced local locking
   4154 	 *
   4155 	 * if (flk_has_remote_locks(fi->vp))
   4156 	 *	return (0);
   4157 	 */
   4158 
   4159 	/*
   4160 	 * If we have no partial path, we can't do anything
   4161 	 */
   4162 	if (VTOR(fi->vp)->r_path == NULL)
   4163 		return (0);
   4164 
   4165 	return (1);
   4166 }
   4167 
   4168 #include <sys/thread.h>
   4169 
   4170 /*
   4171  * NFS client failover support
   4172  *
   4173  * failover_newserver() will start a search for a new server,
   4174  * preferably by starting an async thread to do the work.  If
   4175  * someone is already doing this (recognizable by MI_BINDINPROG
   4176  * being set), it will simply return and the calling thread
   4177  * will queue on the mi_failover_cv condition variable.
   4178  */
   4179 static void
   4180 failover_newserver(mntinfo_t *mi)
   4181 {
   4182 	/*
   4183 	 * Check if someone else is doing this already
   4184 	 */
   4185 	mutex_enter(&mi->mi_lock);
   4186 	if (mi->mi_flags & MI_BINDINPROG) {
   4187 		mutex_exit(&mi->mi_lock);
   4188 		return;
   4189 	}
   4190 	mi->mi_flags |= MI_BINDINPROG;
   4191 
   4192 	/*
   4193 	 * Need to hold the vfs struct so that it can't be released
   4194 	 * while the failover thread is selecting a new server.
   4195 	 */
   4196 	VFS_HOLD(mi->mi_vfsp);
   4197 
   4198 	/*
   4199 	 * Start a thread to do the real searching.
   4200 	 */
   4201 	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
   4202 
   4203 	mutex_exit(&mi->mi_lock);
   4204 }
   4205 
/*
 * NFS client failover support
 *
 * failover_thread() will find a new server to replace the one
 * currently in use, wake up other threads waiting on this mount
 * point, and die.  It will start at the head of the server list
 * and poll servers until it finds one with an NFS server which is
 * registered and responds to a NULL procedure ping.
 *
 * Outline:
 *	- wait until mi_readers drops to zero;
 *	- ping each server in mi_servers until one answers, sleeping
 *	  between unsuccessful passes;
 *	- if the chosen server differs from mi_curr_serv, purge the
 *	  DNLC for this vfs and rehash the rnode found under the old
 *	  server's sv_fhandle using the new server's filehandle;
 *	- clear MI_BINDINPROG, record the new server, broadcast
 *	  mi_failover_cv, release the vfs hold and exit the thread.
 *
 * XXX failover_thread is unsafe within the scope of the
 * present model defined for cpr to suspend the system.
 * Specifically, over-the-wire calls made by the thread
 * are unsafe. The thread needs to be reevaluated in case of
 * future updates to the cpr suspend model.
 */
static void
failover_thread(mntinfo_t *mi)
{
	servinfo_t *svp = NULL;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	callb_cpr_t cprinfo;
	rnode_t *rp;
	int index;
	char *srvnames;
	size_t srvnames_len;
	struct nfs_clnt *nfscl = NULL;
	zoneid_t zoneid = getzoneid();

#ifdef DEBUG
	/*
	 * This is currently only needed to access counters which exist on
	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
	 * on non-DEBUG kernels.
	 */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif

	/*
	 * It's safe to piggyback on the mi_lock since failover_newserver()
	 * code guarantees that there will be only one failover thread
	 * per mountinfo at any instance.
	 */
	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
	    "failover_thread");

	/*
	 * Wait for mi_readers to reach zero before touching the
	 * server binding.
	 */
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	/* Timeout handed to each NULL procedure ping below. */
	tv.tv_sec = 2;
	tv.tv_usec = 0;

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
			/*
			 * The current server is skipped only until one
			 * complete pass has failed (oncethru set); after
			 * that it is a candidate like any other.
			 */
			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			/*
			 * If the file system was forcibly umounted
			 * while trying to do a failover, then just
			 * give up on the failover.  It won't matter
			 * what the server is.
			 */
			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
				svp = NULL;
				goto done;
			}

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
			if (error)
				continue;

			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
			"NFS%d: failing over: selecting original server %s",
					    mi->mi_vers, svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
			"NFS: failing over: selecting original server %s",
					    svp->sv_hostname);
#endif
				} else {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
				    "NFS%d: failing over from %s to %s",
					    mi->mi_vers,
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
				    "NFS: failing over from %s to %s",
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#endif
				}
				/* svp stays non-NULL, ending the outer loop. */
				break;
			}
		}

		/*
		 * The whole list was tried without success.  Report it
		 * the first time only, then sleep for hz ticks and retry.
		 */
		if (svp == NULL) {
			if (!oncethru) {
				srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
				zprintf(zoneid,
				    "NFS%d servers %s not responding "
				    "still trying\n", mi->mi_vers, srvnames);
#else
				zprintf(zoneid, "NFS servers %s not responding "
				    "still trying\n", srvnames);
#endif
				oncethru = 1;
			}
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			mutex_exit(&mi->mi_lock);
			delay(hz);
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
			mutex_exit(&mi->mi_lock);
		}
	}

	/* A "not responding" message was printed earlier; follow it up. */
	if (oncethru) {
#ifdef DEBUG
		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
	}

	/*
	 * Switching to a different server: purge the name cache for this
	 * vfs and move the rnode that is hashed under the old server's
	 * sv_fhandle (presumably the mount's root rnode -- confirm) over
	 * to the new server's filehandle and hash queue.
	 */
	if (svp != mi->mi_curr_serv) {
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
		rw_enter(&rtable[index].r_lock, RW_WRITER);
		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
		    mi->mi_vfsp);
		if (rp != NULL) {
			if (rp->r_flags & RHASHED)
				rp_rmhash_locked(rp);
			rw_exit(&rtable[index].r_lock);
			rp->r_server = svp;
			rp->r_fh = svp->sv_fhandle;
			(void) nfs_free_data_reclaim(rp);
			index = rtablehash(&rp->r_fh);
			rp->r_hashq = &rtable[index];
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			vn_exists(RTOV(rp));
			rp_addhash(rp);
			rw_exit(&rp->r_hashq->r_lock);
			VN_RELE(RTOV(rp));
		} else
			rw_exit(&rtable[index].r_lock);
	}

done:
	/* srvnames was only allocated if a full pass failed. */
	if (oncethru)
		kmem_free(srvnames, srvnames_len);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags &= ~MI_BINDINPROG;
	/* svp is NULL when the search was abandoned via FS_OR_ZONE_GONE. */
	if (svp != NULL) {
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
#ifdef DEBUG
	nfscl->nfscl_stat.failover.value.ui64++;
#endif
	}
	cv_broadcast(&mi->mi_failover_cv);
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(mi->mi_vfsp);
	zthread_exit();
	/* NOTREACHED */
}
   4404 
   4405 /*
   4406  * NFS client failover support
   4407  *
   4408  * failover_wait() will put the thread to sleep until MI_BINDINPROG
   4409  * is cleared, meaning that failover is complete.  Called with
   4410  * mi_lock mutex held.
   4411  */
   4412 static int
   4413 failover_wait(mntinfo_t *mi)
   4414 {
   4415 	k_sigset_t smask;
   4416 
   4417 	/*
   4418 	 * If someone else is hunting for a living server,
   4419 	 * sleep until it's done.  After our sleep, we may
   4420 	 * be bound to the right server and get off cheaply.
   4421 	 */
   4422 	while (mi->mi_flags & MI_BINDINPROG) {
   4423 		/*
   4424 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
   4425 		 * and SIGTERM. (Preserving the existing masks).
   4426 		 * Mask out SIGINT if mount option nointr is specified.
   4427 		 */
   4428 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
   4429 		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
   4430 			/*
   4431 			 * restore original signal mask
   4432 			 */
   4433 			sigunintr(&smask);
   4434 			return (EINTR);
   4435 		}
   4436 		/*
   4437 		 * restore original signal mask
   4438 		 */
   4439 		sigunintr(&smask);
   4440 	}
   4441 	return (0);
   4442 }
   4443 
/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *    pointed to by the fi->fhp pointer if it is non-NULL.
 *
 * Returns:
 *	0	on success, including the early exits taken when the
 *		lookup yields the original vnode or VALID_FH(fi) is
 *		already true; those early exits skip the mi_remap
 *		counter update and the fi->copyproc call
 *	EINVAL	if fi, fi->vp or fi->lookupproc is NULL, or if the size
 *		or type of the file found does not match the original
 *	other	the error returned by VFS_ROOT() or failover_lookup()
 */

static int
failover_remap(failinfo_t *fi)
{
	vnode_t *vp, *nvp, *rootvp;
	rnode_t *rp, *nrp;
	mntinfo_t *mi;
	int error;
#ifdef DEBUG
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif
	/*
	 * Sanity check
	 */
	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
		return (EINVAL);
	vp = fi->vp;
	rp = VTOR(vp);
	mi = VTOMI(vp);

	/* A vnode flagged VROOT needs no path lookup; skip to the counters. */
	if (!(vp->v_flag & VROOT)) {
		/*
		 * Given the root fh, use the path stored in
		 * the rnode to find the fh for the new server.
		 */
		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (error)
			return (error);

		error = failover_lookup(rp->r_path, rootvp,
		    fi->lookupproc, fi->xattrdirproc, &nvp);

		VN_RELE(rootvp);

		if (error)
			return (error);

		/*
		 * If we found the same rnode, we're done now
		 */
		if (nvp == vp) {
			/*
			 * The failed server and the new server may be
			 * physically the same machine OR may share the
			 * same disk subsystem. In this case the file
			 * handle for a particular file path is not going
			 * to change, and a lookup with the same filehandle
			 * will always locate the same rnode as the
			 * existing one.
			 * All we might need to do is to update the r_server
			 * with the current servinfo.
			 */
			if (!VALID_FH(fi)) {
				rp->r_server = mi->mi_curr_serv;
			}
			VN_RELE(nvp);
			return (0);
		}

		/*
		 * Try to make it so that no one else will find this
		 * vnode because it is just a temporary to hold the
		 * new file handle until that file handle can be
		 * copied to the original vnode/rnode.
		 */
		nrp = VTOR(nvp);
		mutex_enter(&mi->mi_remap_lock);
		/*
		 * Some other thread could have raced in here and could
		 * have done the remap for this particular rnode before
		 * this thread here. Check for rp->r_server and
		 * mi->mi_curr_serv and return if they are same.
		 */
		if (VALID_FH(fi)) {
			mutex_exit(&mi->mi_remap_lock);
			VN_RELE(nvp);
			return (0);
		}

		if (nrp->r_flags & RHASHED)
			rp_rmhash(nrp);

		/*
		 * As a heuristic check on the validity of the new
		 * file, check that the size and type match against
		 * that we remember from the old version.
		 */
		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
			mutex_exit(&mi->mi_remap_lock);
			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
			    "NFS replicas %s and %s: file %s not same.",
			    rp->r_server->sv_hostname,
			    nrp->r_server->sv_hostname, rp->r_path);
			VN_RELE(nvp);
			return (EINVAL);
		}

		/*
		 * snarf the filehandle from the new rnode
		 * then release it, again while updating the
		 * hash queues for the rnode.
		 */
		if (rp->r_flags & RHASHED)
			rp_rmhash(rp);
		rp->r_server = mi->mi_curr_serv;
		rp->r_fh = nrp->r_fh;
		rp->r_hashq = nrp->r_hashq;
		/*
		 * Copy the attributes from the new rnode to the old
		 * rnode.  This will help to reduce unnecessary page
		 * cache flushes.
		 */
		rp->r_attr = nrp->r_attr;
		rp->r_attrtime = nrp->r_attrtime;
		rp->r_mtime = nrp->r_mtime;
		/* Cached data belongs to the old filehandle; drop it. */
		(void) nfs_free_data_reclaim(rp);
		nfs_setswaplike(vp, &rp->r_attr);
		/* Re-insert the original rnode on the new rnode's queue. */
		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
		rp_addhash(rp);
		rw_exit(&rp->r_hashq->r_lock);
		mutex_exit(&mi->mi_remap_lock);
		VN_RELE(nvp);
	}

	/*
	 * Update successful failover remap count
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_remap++;
	mutex_exit(&mi->mi_lock);
#ifdef DEBUG
	nfscl->nfscl_stat.remap.value.ui64++;
#endif

	/*
	 * If we have a copied filehandle to update, do it now.
	 */
	if (fi->fhp != NULL && fi->copyproc != NULL)
		(*fi->copyproc)(fi->fhp, vp);

	return (0);
}
   4598 
   4599 /*
   4600  * NFS client failover support
   4601  *
   4602  * We want a simple pathname lookup routine to parse the pieces
   4603  * of path in rp->r_path.  We know that the path was a created
   4604  * as rnodes were made, so we know we have only to deal with
   4605  * paths that look like:
   4606  *	dir1/dir2/dir3/file
   4607  * Any evidence of anything like .., symlinks, and ENOTDIR
   4608  * are hard errors, because they mean something in this filesystem
   4609  * is different from the one we came from, or has changed under
   4610  * us in some way.  If this is true, we want the failure.
   4611  *
   4612  * Extended attributes: if the filesystem is mounted with extended
   4613  * attributes enabled (-o xattr), the attribute directory will be
   4614  * represented in the r_path as the magic name XATTR_RPATH. So if
   4615  * we see that name in the pathname, is must be because this node
   4616  * is an extended attribute.  Therefore, look it up that way.
   4617  */
   4618 static int
   4619 failover_lookup(char *path, vnode_t *root,
   4620     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
   4621 	vnode_t *, cred_t *, int),
   4622     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
   4623     vnode_t **new)
   4624 {
   4625 	vnode_t *dvp, *nvp;
   4626 	int error = EINVAL;
   4627 	char *s, *p, *tmppath;
   4628 	size_t len;
   4629 	mntinfo_t *mi;
   4630 	bool_t xattr;
   4631 
   4632 	/* Make local copy of path */
   4633 	len = strlen(path) + 1;
   4634 	tmppath = kmem_alloc(len, KM_SLEEP);
   4635 	(void) strcpy(tmppath, path);
   4636 	s = tmppath;
   4637 
   4638 	dvp = root;
   4639 	VN_HOLD(dvp);
   4640 	mi = VTOMI(root);
   4641 	xattr = mi->mi_flags & MI_EXTATTR;
   4642 
   4643 	do {
   4644 		p = strchr(s, '/');
   4645 		if (p != NULL)
   4646 			*p = '\0';
   4647 		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
   4648 			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
   4649 			    RFSCALL_SOFT);
   4650 		} else {
   4651 			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
   4652 			    CRED(), RFSCALL_SOFT);
   4653 		}
   4654 		if (p != NULL)
   4655 			*p++ = '/';
   4656 		if (error) {
   4657 			VN_RELE(dvp);
   4658 			kmem_free(tmppath, len);
   4659 			return (error);
   4660 		}
   4661 		s = p;
   4662 		VN_RELE(dvp);
   4663 		dvp = nvp;
   4664 	} while (p != NULL);
   4665 
   4666 	if (nvp != NULL && new != NULL)
   4667 		*new = nvp;
   4668 	kmem_free(tmppath, len);
   4669 	return (0);
   4670 }
   4671 
   4672 /*
   4673  * NFS client failover support
   4674  *
   4675  * sv_free() frees the malloc'd portion of a "servinfo_t".
   4676  */
   4677 void
   4678 sv_free(servinfo_t *svp)
   4679 {
   4680 	servinfo_t *next;
   4681 	struct knetconfig *knconf;
   4682 
   4683 	while (svp != NULL) {
   4684 		next = svp->sv_next;
   4685 		if (svp->sv_secdata)
   4686 			sec_clnt_freeinfo(svp->sv_secdata);
   4687 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
   4688 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
   4689 		knconf = svp->sv_knconf;
   4690 		if (knconf != NULL) {
   4691 			if (knconf->knc_protofmly != NULL)
   4692 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
   4693 			if (knconf->knc_proto != NULL)
   4694 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
   4695 			kmem_free(knconf, sizeof (*knconf));
   4696 		}
   4697 		knconf = svp->sv_origknconf;
   4698 		if (knconf != NULL) {
   4699 			if (knconf->knc_protofmly != NULL)
   4700 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
   4701 			if (knconf->knc_proto != NULL)
   4702 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
   4703 			kmem_free(knconf, sizeof (*knconf));
   4704 		}
   4705 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
   4706 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
   4707 		mutex_destroy(&svp->sv_lock);
   4708 		kmem_free(svp, sizeof (*svp));
   4709 		svp = next;
   4710 	}
   4711 }
   4712 
   4713 /*
   4714  * Only can return non-zero if intr != 0.
   4715  */
   4716 int
   4717 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
   4718 {
   4719 
   4720 	mutex_enter(&l->lock);
   4721 
   4722 	/*
   4723 	 * If this is a nested enter, then allow it.  There
   4724 	 * must be as many exits as enters through.
   4725 	 */
   4726 	if (l->owner == curthread) {
   4727 		/* lock is held for writing by current thread */
   4728 		ASSERT(rw == RW_READER || rw == RW_WRITER);
   4729 		l->count--;
   4730 	} else if (rw == RW_READER) {
   4731 		/*
   4732 		 * While there is a writer active or writers waiting,
   4733 		 * then wait for them to finish up and move on.  Then,
   4734 		 * increment the count to indicate that a reader is
   4735 		 * active.
   4736 		 */
   4737 		while (l->count < 0 || l->waiters > 0) {
   4738 			if (intr) {
   4739 				klwp_t *lwp = ttolwp(curthread);
   4740 
   4741 				if (lwp != NULL)
   4742 					lwp->lwp_nostop++;
   4743 				if (!cv_wait_sig(&l->cv, &l->lock)) {
   4744 					if (lwp != NULL)
   4745 						lwp->lwp_nostop--;
   4746 					mutex_exit(&l->lock);
   4747 					return (EINTR);
   4748 				}
   4749 				if (lwp != NULL)
   4750 					lwp->lwp_nostop--;
   4751 			} else
   4752 				cv_wait(&l->cv, &l->lock);
   4753 		}
   4754 		ASSERT(l->count < INT_MAX);
   4755 #ifdef	DEBUG
   4756 		if ((l->count % 10000) == 9999)
   4757 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
   4758 			    "rwlock @ %p\n", l->count, (void *)&l);
   4759 #endif
   4760 		l->count++;
   4761 	} else {
   4762 		ASSERT(rw == RW_WRITER);
   4763 		/*
   4764 		 * While there are readers active or a writer
   4765 		 * active, then wait for all of the readers
   4766 		 * to finish or for the writer to finish.
   4767 		 * Then, set the owner field to curthread and
   4768 		 * decrement count to indicate that a writer
   4769 		 * is active.
   4770 		 */
   4771 		while (l->count > 0 || l->owner != NULL) {
   4772 			l->waiters++;
   4773 			if (intr) {
   4774 				klwp_t *lwp = ttolwp(curthread);
   4775 
   4776 				if (lwp != NULL)
   4777 					lwp->lwp_nostop++;
   4778 				if (!cv_wait_sig(&l->cv, &l->lock)) {
   4779 					if (lwp != NULL)
   4780 						lwp->lwp_nostop--;
   4781 					l->waiters--;
   4782 					cv_broadcast(&l->cv);
   4783 					mutex_exit(&l->lock);
   4784 					return (EINTR);
   4785 				}
   4786 				if (lwp != NULL)
   4787 					lwp->lwp_nostop--;
   4788 			} else
   4789 				cv_wait(&l->cv, &l->lock);
   4790 			l->waiters--;
   4791 		}
   4792 		l->owner = curthread;
   4793 		l->count--;
   4794 	}
   4795 
   4796 	mutex_exit(&l->lock);
   4797 
   4798 	return (0);
   4799 }
   4800 
   4801 /*
   4802  * If the lock is available, obtain it and return non-zero.  If there is
   4803  * already a conflicting lock, return 0 immediately.
   4804  */
   4805 
   4806 int
   4807 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
   4808 {
   4809 	mutex_enter(&l->lock);
   4810 
   4811 	/*
   4812 	 * If this is a nested enter, then allow it.  There
   4813 	 * must be as many exits as enters through.
   4814 	 */
   4815 	if (l->owner == curthread) {
   4816 		/* lock is held for writing by current thread */
   4817 		ASSERT(rw == RW_READER || rw == RW_WRITER);
   4818 		l->count--;
   4819 	} else if (rw == RW_READER) {
   4820 		/*
   4821 		 * If there is a writer active or writers waiting, deny the
   4822 		 * lock.  Otherwise, bump the count of readers.
   4823 		 */
   4824 		if (l->count < 0 || l->waiters > 0) {
   4825 			mutex_exit(&l->lock);
   4826 			return (0);
   4827 		}
   4828 		l->count++;
   4829 	} else {
   4830 		ASSERT(rw == RW_WRITER);
   4831 		/*
   4832 		 * If there are readers active or a writer active, deny the
   4833 		 * lock.  Otherwise, set the owner field to curthread and
   4834 		 * decrement count to indicate that a writer is active.
   4835 		 */
   4836 		if (l->count > 0 || l->owner != NULL) {
   4837 			mutex_exit(&l->lock);
   4838 			return (0);
   4839 		}
   4840 		l->owner = curthread;
   4841 		l->count--;
   4842 	}
   4843 
   4844 	mutex_exit(&l->lock);
   4845 
   4846 	return (1);
   4847 }
   4848 
   4849 void
   4850 nfs_rw_exit(nfs_rwlock_t *l)
   4851 {
   4852 
   4853 	mutex_enter(&l->lock);
   4854 	/*
   4855 	 * If this is releasing a writer lock, then increment count to
   4856 	 * indicate that there is one less writer active.  If this was
   4857 	 * the last of possibly nested writer locks, then clear the owner
   4858 	 * field as well to indicate that there is no writer active
   4859 	 * and wakeup any possible waiting writers or readers.
   4860 	 *
   4861 	 * If releasing a reader lock, then just decrement count to
   4862 	 * indicate that there is one less reader active.  If this was
   4863 	 * the last active reader and there are writer(s) waiting,
   4864 	 * then wake up the first.
   4865 	 */
   4866 	if (l->owner != NULL) {
   4867 		ASSERT(l->owner == curthread);
   4868 		l->count++;
   4869 		if (l->count == 0) {
   4870 			l->owner = NULL;
   4871 			cv_broadcast(&l->cv);
   4872 		}
   4873 	} else {
   4874 		ASSERT(l->count > 0);
   4875 		l->count--;
   4876 		if (l->count == 0 && l->waiters > 0)
   4877 			cv_broadcast(&l->cv);
   4878 	}
   4879 	mutex_exit(&l->lock);
   4880 }
   4881 
   4882 int
   4883 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
   4884 {
   4885 
   4886 	if (rw == RW_READER)
   4887 		return (l->count > 0);
   4888 	ASSERT(rw == RW_WRITER);
   4889 	return (l->count < 0);
   4890 }
   4891 
   4892 /* ARGSUSED */
   4893 void
   4894 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
   4895 {
   4896 
   4897 	l->count = 0;
   4898 	l->waiters = 0;
   4899 	l->owner = NULL;
   4900 	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
   4901 	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
   4902 }
   4903 
   4904 void
   4905 nfs_rw_destroy(nfs_rwlock_t *l)
   4906 {
   4907 
   4908 	mutex_destroy(&l->lock);
   4909 	cv_destroy(&l->cv);
   4910 }
   4911 
   4912 int
   4913 nfs3_rddir_compar(const void *x, const void *y)
   4914 {
   4915 	rddir_cache *a = (rddir_cache *)x;
   4916 	rddir_cache *b = (rddir_cache *)y;
   4917 
   4918 	if (a->nfs3_cookie == b->nfs3_cookie) {
   4919 		if (a->buflen == b->buflen)
   4920 			return (0);
   4921 		if (a->buflen < b->buflen)
   4922 			return (-1);
   4923 		return (1);
   4924 	}
   4925 
   4926 	if (a->nfs3_cookie < b->nfs3_cookie)
   4927 		return (-1);
   4928 
   4929 	return (1);
   4930 }
   4931 
   4932 int
   4933 nfs_rddir_compar(const void *x, const void *y)
   4934 {
   4935 	rddir_cache *a = (rddir_cache *)x;
   4936 	rddir_cache *b = (rddir_cache *)y;
   4937 
   4938 	if (a->nfs_cookie == b->nfs_cookie) {
   4939 		if (a->buflen == b->buflen)
   4940 			return (0);
   4941 		if (a->buflen < b->buflen)
   4942 			return (-1);
   4943 		return (1);
   4944 	}
   4945 
   4946 	if (a->nfs_cookie < b->nfs_cookie)
   4947 		return (-1);
   4948 
   4949 	return (1);
   4950 }
   4951 
   4952 static char *
   4953 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
   4954 {
   4955 	servinfo_t *s;
   4956 	char *srvnames;
   4957 	char *namep;
   4958 	size_t length;
   4959 
   4960 	/*
   4961 	 * Calculate the length of the string required to hold all
   4962 	 * of the server names plus either a comma or a null
   4963 	 * character following each individual one.
   4964 	 */
   4965 	length = 0;
   4966 	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
   4967 		length += s->sv_hostnamelen;
   4968 
   4969 	srvnames = kmem_alloc(length, KM_SLEEP);
   4970 
   4971 	namep = srvnames;
   4972 	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
   4973 		(void) strcpy(namep, s->sv_hostname);
   4974 		namep += s->sv_hostnamelen - 1;
   4975 		*namep++ = ',';
   4976 	}
   4977 	*--namep = '\0';
   4978 
   4979 	*len = length;
   4980 
   4981 	return (srvnames);
   4982 }
   4983 
   4984 /*
   4985  * These two functions are temporary and designed for the upgrade-workaround
   4986  * only.  They cannot be used for general zone-crossing NFS client support, and
   4987  * will be removed shortly.
   4988  *
   4989  * When the workaround is enabled, all NFS traffic is forced into the global
   4990  * zone.  These functions are called when the code needs to refer to the state
   4991  * of the underlying network connection.  They're not called when the function
   4992  * needs to refer to the state of the process that invoked the system call.
   4993  * (E.g., when checking whether the zone is shutting down during the mount()
   4994  * call.)
   4995  */
   4996 
   4997 struct zone *
   4998 nfs_zone(void)
   4999 {
   5000 	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
   5001 }
   5002 
   5003 zoneid_t
   5004 nfs_zoneid(void)
   5005 {
   5006 	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
   5007 }
   5008 
   5009 /*
   5010  * nfs_mount_label_policy:
   5011  *	Determine whether the mount is allowed according to MAC check,
   5012  *	by comparing (where appropriate) label of the remote server
   5013  *	against the label of the zone being mounted into.
   5014  *
   5015  *	Returns:
   5016  *		 0 :	access allowed
   5017  *		-1 :	read-only access allowed (i.e., read-down)
   5018  *		>0 :	error code, such as EACCES
   5019  */
   5020 int
   5021 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
   5022     struct knetconfig *knconf, cred_t *cr)
   5023 {
   5024 	int		addr_type;
   5025 	void		*ipaddr;
   5026 	bslabel_t	*server_sl, *mntlabel;
   5027 	zone_t		*mntzone = NULL;
   5028 	ts_label_t	*zlabel;
   5029 	tsol_tpc_t	*tp;
   5030 	ts_label_t	*tsl = NULL;
   5031 	int		retv;
   5032 
   5033 	/*
   5034 	 * Get the zone's label.  Each zone on a labeled system has a label.
   5035 	 */
   5036 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
   5037 	zlabel = mntzone->zone_slabel;
   5038 	ASSERT(zlabel != NULL);
   5039 	label_hold(zlabel);
   5040 
   5041 	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
   5042 		addr_type = IPV4_VERSION;
   5043 		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
   5044 	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
   5045 		addr_type = IPV6_VERSION;
   5046 		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
   5047 	} else {
   5048 		retv = 0;
   5049 		goto out;
   5050 	}
   5051 
   5052 	retv = EACCES;				/* assume the worst */
   5053 
   5054 	/*
   5055 	 * Next, get the assigned label of the remote server.
   5056 	 */
   5057 	tp = find_tpc(ipaddr, addr_type, B_FALSE);
   5058 	if (tp == NULL)
   5059 		goto out;			/* error getting host entry */
   5060 
   5061 	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
   5062 		goto rel_tpc;			/* invalid domain */
   5063 	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
   5064 	    (tp->tpc_tp.host_type != UNLABELED))
   5065 		goto rel_tpc;			/* invalid hosttype */
   5066 
   5067 	if (tp->tpc_tp.host_type == SUN_CIPSO) {
   5068 		tsl = getflabel_cipso(vfsp);
   5069 		if (tsl == NULL)
   5070 			goto rel_tpc;		/* error getting server lbl */
   5071 
   5072 		server_sl = label2bslabel(tsl);
   5073 	} else {	/* UNLABELED */
   5074 		server_sl = &tp->tpc_tp.tp_def_label;
   5075 	}
   5076 
   5077 	mntlabel = label2bslabel(zlabel);
   5078 
   5079 	/*
   5080 	 * Now compare labels to complete the MAC check.  If the labels
   5081 	 * are equal or if the requestor is in the global zone and has
   5082 	 * NET_MAC_AWARE, then allow read-write access.   (Except for
   5083 	 * mounts into the global zone itself; restrict these to
   5084 	 * read-only.)
   5085 	 *
   5086 	 * If the requestor is in some other zone, but his label
   5087 	 * dominates the server, then allow read-down.
   5088 	 *
   5089 	 * Otherwise, access is denied.
   5090 	 */
   5091 	if (blequal(mntlabel, server_sl) ||
   5092 	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
   5093 	    getpflags(NET_MAC_AWARE, cr) != 0)) {
   5094 		if ((mntzone == global_zone) ||
   5095 		    !blequal(mntlabel, server_sl))
   5096 			retv = -1;		/* read-only */
   5097 		else
   5098 			retv = 0;		/* access OK */
   5099 	} else if (bldominates(mntlabel, server_sl)) {
   5100 		retv = -1;			/* read-only */
   5101 	} else {
   5102 		retv = EACCES;
   5103 	}
   5104 
   5105 	if (tsl != NULL)
   5106 		label_rele(tsl);
   5107 
   5108 rel_tpc:
   5109 	TPC_RELE(tp);
   5110 out:
   5111 	if (mntzone)
   5112 		zone_rele(mntzone);
   5113 	label_rele(zlabel);
   5114 	return (retv);
   5115 }
   5116 
   5117 boolean_t
   5118 nfs_has_ctty(void)
   5119 {
   5120 	boolean_t rv;
   5121 	mutex_enter(&curproc->p_splock);
   5122 	rv = (curproc->p_sessp->s_vp != NULL);
   5123 	mutex_exit(&curproc->p_splock);
   5124 	return (rv);
   5125 }
   5126 
   5127 /*
   5128  * See if xattr directory to see if it has any generic user attributes
   5129  */
   5130 int
   5131 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
   5132 {
   5133 	struct uio uio;
   5134 	struct iovec iov;
   5135 	char *dbuf;
   5136 	struct dirent64 *dp;
   5137 	size_t dlen = 8 * 1024;
   5138 	size_t dbuflen;
   5139 	int eof = 0;
   5140 	int error;
   5141 
   5142 	*valp = 0;
   5143 	dbuf = kmem_alloc(dlen, KM_SLEEP);
   5144 	uio.uio_iov = &iov;
   5145 	uio.uio_iovcnt = 1;
   5146 	uio.uio_segflg = UIO_SYSSPACE;
   5147 	uio.uio_fmode = 0;
   5148 	uio.uio_extflg = UIO_COPY_CACHED;
   5149 	uio.uio_loffset = 0;
   5150 	uio.uio_resid = dlen;
   5151 	iov.iov_base = dbuf;
   5152 	iov.iov_len = dlen;
   5153 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
   5154 	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
   5155 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
   5156 
   5157 	dbuflen = dlen - uio.uio_resid;
   5158 
   5159 	if (error || dbuflen == 0) {
   5160 		kmem_free(dbuf, dlen);
   5161 		return (error);
   5162 	}
   5163 
   5164 	dp = (dirent64_t *)dbuf;
   5165 
   5166 	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
   5167 		if (strcmp(dp->d_name, ".") == 0 ||
   5168 		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
   5169 		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
   5170 		    VIEW_READONLY) == 0) {
   5171 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
   5172 			continue;
   5173 		}
   5174 
   5175 		*valp = 1;
   5176 		break;
   5177 	}
   5178 	kmem_free(dbuf, dlen);
   5179 	return (0);
   5180 }
   5181