Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
     28  *	All Rights Reserved
     29  */
     30 
     31 #include <sys/param.h>
     32 #include <sys/types.h>
     33 #include <sys/systm.h>
     34 #include <sys/cmn_err.h>
     35 #include <sys/vtrace.h>
     36 #include <sys/session.h>
     37 #include <sys/thread.h>
     38 #include <sys/dnlc.h>
     39 #include <sys/cred.h>
     40 #include <sys/priv.h>
     41 #include <sys/list.h>
     42 #include <sys/sdt.h>
     43 #include <sys/policy.h>
     44 
     45 #include <rpc/types.h>
     46 #include <rpc/xdr.h>
     47 
     48 #include <nfs/nfs.h>
     49 
     50 #include <nfs/nfs_clnt.h>
     51 
     52 #include <nfs/nfs4.h>
     53 #include <nfs/rnode4.h>
     54 #include <nfs/nfs4_clnt.h>
     55 
/*
 * Client side statistics.
 *
 * clstat4_tmpl names the kstat counters kept for the NFSv4 client; every
 * entry is a 64-bit unsigned value.  "calls", "badcalls", "clgets" and
 * "cltoomany" are always present, while "clalloc", "noresponse",
 * "failover" and "remap" are compiled in only when DEBUG is defined.
 *
 * NOTE(review): judging by the "_tmpl" suffix this is a template that is
 * copied into each zone's statistics -- confirm against the zone setup
 * code, which is not in this part of the file.
 */
static const struct clstat4 clstat4_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};
     71 
#ifdef DEBUG
/*
 * DEBUG-only named kstat counters (all 64-bit unsigned).  This object is
 * a single writable global rather than a const template; for example
 * clreclaim4_zone() increments clstat4_debug.clreclaim directly.
 */
struct clstat4_debug clstat4_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif
     86 
/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 *
 * NOTE(review): nfs4_clnt_list_lock presumably protects nfs4_clnt_list and
 * nfs4clnt_zone_key is presumably the zone-specific-data key for the
 * per-zone client data; the code using them is not visible here -- confirm.
 */
static list_t nfs4_clnt_list;
static kmutex_t nfs4_clnt_list_lock;
static zone_key_t nfs4clnt_zone_key;

/*
 * kmem cache for struct chtab entries; clget4() allocates from it and
 * frees back to it.
 */
static struct kmem_cache *chtab4_cache;

#ifdef DEBUG
/*
 * DEBUG-only tunables.  nfs4_utf8_debug, when non-zero, makes utf8_to_fn()
 * log the names it rejects.  The other two are not referenced in this part
 * of the file.
 */
static int nfs4_rfscall_debug;
static int nfs4_try_failover_any;
int nfs4_utf8_debug = 0;
#endif
    102 
/*
 * NFSv4 readdir cache implementation
 *
 * rddir4_cache_impl wraps the rddir4_cache element with the bookkeeping
 * this file needs: a lock-protected reference count and the AVL linkage.
 * NOTE(review): 'rc' being the first member suggests code converts between
 * rddir4_cache * and rddir4_cache_impl * by casting -- confirm before
 * reordering members.
 */
typedef struct rddir4_cache_impl {
	rddir4_cache	rc;		/* readdir cache element */
	kmutex_t	lock;		/* lock protects count */
	uint_t		count;		/* reference count */
	avl_node_t	tree;		/* AVL tree link */
} rddir4_cache_impl;
    112 
/*
 * Forward declarations of file-local helpers.  Their definitions are not
 * in this part of the file.
 */
static int rddir4_cache_compar(const void *, const void *);
static void rddir4_cache_free(rddir4_cache_impl *);
static rddir4_cache *rddir4_cache_alloc(int);
static void rddir4_cache_hold(rddir4_cache *);
static int try_failover(enum clnt_stat);

/*
 * Readdir cache event counters, all starting at zero.
 * NOTE(review): presumably incremented on a cache hit, a wait and a miss
 * respectively by the readdir cache lookup code -- not visible here.
 */
static int nfs4_readdir_cache_hits = 0;
static int nfs4_readdir_cache_waits = 0;
static int nfs4_readdir_cache_misses = 0;
    122 
    123 /*
    124  * Shared nfs4 functions
    125  */
    126 
    127 /*
    128  * Copy an nfs_fh4.  The destination storage (to->nfs_fh4_val) must already
    129  * be allocated.
    130  */
    131 
    132 void
    133 nfs_fh4_copy(nfs_fh4 *from, nfs_fh4 *to)
    134 {
    135 	to->nfs_fh4_len = from->nfs_fh4_len;
    136 	bcopy(from->nfs_fh4_val, to->nfs_fh4_val, to->nfs_fh4_len);
    137 }
    138 
    139 /*
    140  * nfs4cmpfh - compare 2 filehandles.
    141  * Returns 0 if the two nfsv4 filehandles are the same, -1 if the first is
    142  * "less" than the second, +1 if the first is "greater" than the second.
    143  */
    144 
    145 int
    146 nfs4cmpfh(const nfs_fh4 *fh4p1, const nfs_fh4 *fh4p2)
    147 {
    148 	const char *c1, *c2;
    149 
    150 	if (fh4p1->nfs_fh4_len < fh4p2->nfs_fh4_len)
    151 		return (-1);
    152 	if (fh4p1->nfs_fh4_len > fh4p2->nfs_fh4_len)
    153 		return (1);
    154 	for (c1 = fh4p1->nfs_fh4_val, c2 = fh4p2->nfs_fh4_val;
    155 	    c1 < fh4p1->nfs_fh4_val + fh4p1->nfs_fh4_len;
    156 	    c1++, c2++) {
    157 		if (*c1 < *c2)
    158 			return (-1);
    159 		if (*c1 > *c2)
    160 			return (1);
    161 	}
    162 
    163 	return (0);
    164 }
    165 
    166 /*
    167  * Compare two v4 filehandles.  Return zero if they're the same, non-zero
    168  * if they're not.  Like nfs4cmpfh(), but different filehandle
    169  * representation, and doesn't provide information about greater than or
    170  * less than.
    171  */
    172 
    173 int
    174 nfs4cmpfhandle(nfs4_fhandle_t *fh1, nfs4_fhandle_t *fh2)
    175 {
    176 	if (fh1->fh_len == fh2->fh_len)
    177 		return (bcmp(fh1->fh_buf, fh2->fh_buf, fh1->fh_len));
    178 
    179 	return (1);
    180 }
    181 
    182 int
    183 stateid4_cmp(stateid4 *s1, stateid4 *s2)
    184 {
    185 	if (bcmp(s1, s2, sizeof (stateid4)) == 0)
    186 		return (1);
    187 	else
    188 		return (0);
    189 }
    190 
    191 nfsstat4
    192 puterrno4(int error)
    193 {
    194 	switch (error) {
    195 	case 0:
    196 		return (NFS4_OK);
    197 	case EPERM:
    198 		return (NFS4ERR_PERM);
    199 	case ENOENT:
    200 		return (NFS4ERR_NOENT);
    201 	case EINTR:
    202 		return (NFS4ERR_IO);
    203 	case EIO:
    204 		return (NFS4ERR_IO);
    205 	case ENXIO:
    206 		return (NFS4ERR_NXIO);
    207 	case ENOMEM:
    208 		return (NFS4ERR_RESOURCE);
    209 	case EACCES:
    210 		return (NFS4ERR_ACCESS);
    211 	case EBUSY:
    212 		return (NFS4ERR_IO);
    213 	case EEXIST:
    214 		return (NFS4ERR_EXIST);
    215 	case EXDEV:
    216 		return (NFS4ERR_XDEV);
    217 	case ENODEV:
    218 		return (NFS4ERR_IO);
    219 	case ENOTDIR:
    220 		return (NFS4ERR_NOTDIR);
    221 	case EISDIR:
    222 		return (NFS4ERR_ISDIR);
    223 	case EINVAL:
    224 		return (NFS4ERR_INVAL);
    225 	case EMFILE:
    226 		return (NFS4ERR_RESOURCE);
    227 	case EFBIG:
    228 		return (NFS4ERR_FBIG);
    229 	case ENOSPC:
    230 		return (NFS4ERR_NOSPC);
    231 	case EROFS:
    232 		return (NFS4ERR_ROFS);
    233 	case EMLINK:
    234 		return (NFS4ERR_MLINK);
    235 	case EDEADLK:
    236 		return (NFS4ERR_DEADLOCK);
    237 	case ENOLCK:
    238 		return (NFS4ERR_DENIED);
    239 	case EREMOTE:
    240 		return (NFS4ERR_SERVERFAULT);
    241 	case ENOTSUP:
    242 		return (NFS4ERR_NOTSUPP);
    243 	case EDQUOT:
    244 		return (NFS4ERR_DQUOT);
    245 	case ENAMETOOLONG:
    246 		return (NFS4ERR_NAMETOOLONG);
    247 	case EOVERFLOW:
    248 		return (NFS4ERR_INVAL);
    249 	case ENOSYS:
    250 		return (NFS4ERR_NOTSUPP);
    251 	case ENOTEMPTY:
    252 		return (NFS4ERR_NOTEMPTY);
    253 	case EOPNOTSUPP:
    254 		return (NFS4ERR_NOTSUPP);
    255 	case ESTALE:
    256 		return (NFS4ERR_STALE);
    257 	case EAGAIN:
    258 		if (curthread->t_flag & T_WOULDBLOCK) {
    259 			curthread->t_flag &= ~T_WOULDBLOCK;
    260 			return (NFS4ERR_DELAY);
    261 		}
    262 		return (NFS4ERR_LOCKED);
    263 	default:
    264 		return ((enum nfsstat4)error);
    265 	}
    266 }
    267 
    268 int
    269 geterrno4(enum nfsstat4 status)
    270 {
    271 	switch (status) {
    272 	case NFS4_OK:
    273 		return (0);
    274 	case NFS4ERR_PERM:
    275 		return (EPERM);
    276 	case NFS4ERR_NOENT:
    277 		return (ENOENT);
    278 	case NFS4ERR_IO:
    279 		return (EIO);
    280 	case NFS4ERR_NXIO:
    281 		return (ENXIO);
    282 	case NFS4ERR_ACCESS:
    283 		return (EACCES);
    284 	case NFS4ERR_EXIST:
    285 		return (EEXIST);
    286 	case NFS4ERR_XDEV:
    287 		return (EXDEV);
    288 	case NFS4ERR_NOTDIR:
    289 		return (ENOTDIR);
    290 	case NFS4ERR_ISDIR:
    291 		return (EISDIR);
    292 	case NFS4ERR_INVAL:
    293 		return (EINVAL);
    294 	case NFS4ERR_FBIG:
    295 		return (EFBIG);
    296 	case NFS4ERR_NOSPC:
    297 		return (ENOSPC);
    298 	case NFS4ERR_ROFS:
    299 		return (EROFS);
    300 	case NFS4ERR_MLINK:
    301 		return (EMLINK);
    302 	case NFS4ERR_NAMETOOLONG:
    303 		return (ENAMETOOLONG);
    304 	case NFS4ERR_NOTEMPTY:
    305 		return (ENOTEMPTY);
    306 	case NFS4ERR_DQUOT:
    307 		return (EDQUOT);
    308 	case NFS4ERR_STALE:
    309 		return (ESTALE);
    310 	case NFS4ERR_BADHANDLE:
    311 		return (ESTALE);
    312 	case NFS4ERR_BAD_COOKIE:
    313 		return (EINVAL);
    314 	case NFS4ERR_NOTSUPP:
    315 		return (EOPNOTSUPP);
    316 	case NFS4ERR_TOOSMALL:
    317 		return (EINVAL);
    318 	case NFS4ERR_SERVERFAULT:
    319 		return (EIO);
    320 	case NFS4ERR_BADTYPE:
    321 		return (EINVAL);
    322 	case NFS4ERR_DELAY:
    323 		return (ENXIO);
    324 	case NFS4ERR_SAME:
    325 		return (EPROTO);
    326 	case NFS4ERR_DENIED:
    327 		return (ENOLCK);
    328 	case NFS4ERR_EXPIRED:
    329 		return (EPROTO);
    330 	case NFS4ERR_LOCKED:
    331 		return (EACCES);
    332 	case NFS4ERR_GRACE:
    333 		return (EAGAIN);
    334 	case NFS4ERR_FHEXPIRED:	/* if got here, failed to get a new fh */
    335 		return (ESTALE);
    336 	case NFS4ERR_SHARE_DENIED:
    337 		return (EACCES);
    338 	case NFS4ERR_WRONGSEC:
    339 		return (EPERM);
    340 	case NFS4ERR_CLID_INUSE:
    341 		return (EAGAIN);
    342 	case NFS4ERR_RESOURCE:
    343 		return (EAGAIN);
    344 	case NFS4ERR_MOVED:
    345 		return (EPROTO);
    346 	case NFS4ERR_NOFILEHANDLE:
    347 		return (EIO);
    348 	case NFS4ERR_MINOR_VERS_MISMATCH:
    349 		return (ENOTSUP);
    350 	case NFS4ERR_STALE_CLIENTID:
    351 		return (EIO);
    352 	case NFS4ERR_STALE_STATEID:
    353 		return (EIO);
    354 	case NFS4ERR_OLD_STATEID:
    355 		return (EIO);
    356 	case NFS4ERR_BAD_STATEID:
    357 		return (EIO);
    358 	case NFS4ERR_BAD_SEQID:
    359 		return (EIO);
    360 	case NFS4ERR_NOT_SAME:
    361 		return (EPROTO);
    362 	case NFS4ERR_LOCK_RANGE:
    363 		return (EPROTO);
    364 	case NFS4ERR_SYMLINK:
    365 		return (EPROTO);
    366 	case NFS4ERR_RESTOREFH:
    367 		return (EPROTO);
    368 	case NFS4ERR_LEASE_MOVED:
    369 		return (EPROTO);
    370 	case NFS4ERR_ATTRNOTSUPP:
    371 		return (ENOTSUP);
    372 	case NFS4ERR_NO_GRACE:
    373 		return (EPROTO);
    374 	case NFS4ERR_RECLAIM_BAD:
    375 		return (EPROTO);
    376 	case NFS4ERR_RECLAIM_CONFLICT:
    377 		return (EPROTO);
    378 	case NFS4ERR_BADXDR:
    379 		return (EINVAL);
    380 	case NFS4ERR_LOCKS_HELD:
    381 		return (EIO);
    382 	case NFS4ERR_OPENMODE:
    383 		return (EACCES);
    384 	case NFS4ERR_BADOWNER:
    385 		/*
    386 		 * Client and server are in different DNS domains
    387 		 * and the NFSMAPID_DOMAIN in /etc/default/nfs
    388 		 * doesn't match.  No good answer here.  Return
    389 		 * EACCESS, which translates to "permission denied".
    390 		 */
    391 		return (EACCES);
    392 	case NFS4ERR_BADCHAR:
    393 		return (EINVAL);
    394 	case NFS4ERR_BADNAME:
    395 		return (EINVAL);
    396 	case NFS4ERR_BAD_RANGE:
    397 		return (EIO);
    398 	case NFS4ERR_LOCK_NOTSUPP:
    399 		return (ENOTSUP);
    400 	case NFS4ERR_OP_ILLEGAL:
    401 		return (EINVAL);
    402 	case NFS4ERR_DEADLOCK:
    403 		return (EDEADLK);
    404 	case NFS4ERR_FILE_OPEN:
    405 		return (EACCES);
    406 	case NFS4ERR_ADMIN_REVOKED:
    407 		return (EPROTO);
    408 	case NFS4ERR_CB_PATH_DOWN:
    409 		return (EPROTO);
    410 	default:
    411 #ifdef DEBUG
    412 		zcmn_err(getzoneid(), CE_WARN, "geterrno4: got status %d",
    413 		    status);
    414 #endif
    415 		return ((int)status);
    416 	}
    417 }
    418 
/*
 * nfs4_log_badowner - record that an operation on this mount failed with
 * NFS4ERR_BADOWNER.
 *
 * Two things are done, each at most once:
 *   - per nfs4_server_t: a console warning about the NFSMAPID_DOMAIN
 *     mismatch (guarded by N4S_BADOWNER_DEBUG in s_flags);
 *   - per mntinfo4_t: an RF_BADOWNER fact is queued for the mount
 *     (guarded by MI4_BADOWNER_DEBUG in mi_flags).
 *
 * mi - mount the failing operation was issued against
 * op - NFSv4 operation number, passed through to nfs4_queue_fact()
 *
 * Nothing is done if the mount is already flagged, if mi_recovlock cannot
 * be entered (nfs_rw_enter_sig() returns non-zero), or if no nfs4_server_t
 * is found for the mount.
 */
void
nfs4_log_badowner(mntinfo4_t *mi, nfs_opnum4 op)
{
	nfs4_server_t *server;

	/*
	 * Return if already printed/queued a msg
	 * for this mount point.  This first check is made without
	 * mi_lock; the flag is tested again under the lock below.
	 */
	if (mi->mi_flags & MI4_BADOWNER_DEBUG)
		return;
	/*
	 * Happens once per client <-> server pair.
	 * The wait for mi_recovlock is interruptible only when the
	 * mount has MI4_INT set.
	 */
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT))
		return;

	/*
	 * On success find_nfs4_server() evidently returns with
	 * server->s_lock held and a reference on the server: both are
	 * dropped below.
	 */
	server = find_nfs4_server(mi);
	if (server == NULL) {
		nfs_rw_exit(&mi->mi_recovlock);
		return;
	}

	if (!(server->s_flags & N4S_BADOWNER_DEBUG)) {
		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
		    "!NFSMAPID_DOMAIN does not match"
		    " the server: %s domain.\n"
		    "Please check configuration",
		    mi->mi_curr_serv->sv_hostname);
		server->s_flags |= N4S_BADOWNER_DEBUG;
	}
	mutex_exit(&server->s_lock);
	nfs4_server_rele(server);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Happens once per mntinfo4_t.
	 * This error is deemed one of the recovery facts, "RF_BADOWNER";
	 * queue it in the message queue for this mount_info.  The message
	 * is not printed, meaning it's absent from id_to_dump_solo_fact(),
	 * but it's there for inspection if the queue is ever dumped.
	 */
	mutex_enter(&mi->mi_lock);
	if (!(mi->mi_flags & MI4_BADOWNER_DEBUG)) {
		nfs4_queue_fact(RF_BADOWNER, mi, NFS4ERR_BADOWNER, 0, op,
		    FALSE, NULL, 0, NULL);
		mi->mi_flags |= MI4_BADOWNER_DEBUG;
	}
	mutex_exit(&mi->mi_lock);
}
    470 
    471 int
    472 nfs4_time_ntov(nfstime4 *ntime, timestruc_t *vatime)
    473 {
    474 	int64_t sec;
    475 	int32_t nsec;
    476 
    477 	/*
    478 	 * Here check that the nfsv4 time is valid for the system.
    479 	 * nfsv4 time value is a signed 64-bit, and the system time
    480 	 * may be either int64_t or int32_t (depends on the kernel),
    481 	 * so if the kernel is 32-bit, the nfsv4 time value may not fit.
    482 	 */
    483 #ifndef _LP64
    484 	if (! NFS4_TIME_OK(ntime->seconds)) {
    485 		return (EOVERFLOW);
    486 	}
    487 #endif
    488 
    489 	/* Invalid to specify 1 billion (or more) nsecs */
    490 	if (ntime->nseconds >= 1000000000)
    491 		return (EINVAL);
    492 
    493 	if (ntime->seconds < 0) {
    494 		sec = ntime->seconds + 1;
    495 		nsec = -1000000000 + ntime->nseconds;
    496 	} else {
    497 		sec = ntime->seconds;
    498 		nsec = ntime->nseconds;
    499 	}
    500 
    501 	vatime->tv_sec = sec;
    502 	vatime->tv_nsec = nsec;
    503 
    504 	return (0);
    505 }
    506 
    507 int
    508 nfs4_time_vton(timestruc_t *vatime, nfstime4 *ntime)
    509 {
    510 	int64_t sec;
    511 	uint32_t nsec;
    512 
    513 	/*
    514 	 * nfsv4 time value is a signed 64-bit, and the system time
    515 	 * may be either int64_t or int32_t (depends on the kernel),
    516 	 * so all system time values will fit.
    517 	 */
    518 	if (vatime->tv_nsec >= 0) {
    519 		sec = vatime->tv_sec;
    520 		nsec = vatime->tv_nsec;
    521 	} else {
    522 		sec = vatime->tv_sec - 1;
    523 		nsec = 1000000000 + vatime->tv_nsec;
    524 	}
    525 	ntime->seconds = sec;
    526 	ntime->nseconds = nsec;
    527 
    528 	return (0);
    529 }
    530 
    531 /*
    532  * Converts a utf8 string to a valid null terminated filename string.
    533  *
    534  * XXX - Not actually translating the UTF-8 string as per RFC 2279.
    535  *	 For now, just validate that the UTF-8 string off the wire
    536  *	 does not have characters that will freak out UFS, and leave
    537  *	 it at that.
    538  */
    539 char *
    540 utf8_to_fn(utf8string *u8s, uint_t *lenp, char *s)
    541 {
    542 	ASSERT(lenp != NULL);
    543 
    544 	if (u8s == NULL || u8s->utf8string_len <= 0 ||
    545 	    u8s->utf8string_val == NULL)
    546 		return (NULL);
    547 
    548 	/*
    549 	 * Check for obvious illegal filename chars
    550 	 */
    551 	if (utf8_strchr(u8s, '/') != NULL) {
    552 #ifdef DEBUG
    553 		if (nfs4_utf8_debug) {
    554 			char *path;
    555 			int len = u8s->utf8string_len;
    556 
    557 			path = kmem_alloc(len + 1, KM_SLEEP);
    558 			bcopy(u8s->utf8string_val, path, len);
    559 			path[len] = '\0';
    560 
    561 			zcmn_err(getzoneid(), CE_WARN,
    562 			    "Invalid UTF-8 filename: %s", path);
    563 
    564 			kmem_free(path, len + 1);
    565 		}
    566 #endif
    567 		return (NULL);
    568 	}
    569 
    570 	return (utf8_to_str(u8s, lenp, s));
    571 }
    572 
    573 /*
    574  * Converts a utf8 string to a C string.
    575  * kmem_allocs a new string if not supplied
    576  */
    577 char *
    578 utf8_to_str(utf8string *str, uint_t *lenp, char *s)
    579 {
    580 	char	*sp;
    581 	char	*u8p;
    582 	int	len;
    583 	int	 i;
    584 
    585 	ASSERT(lenp != NULL);
    586 
    587 	if (str == NULL)
    588 		return (NULL);
    589 
    590 	u8p = str->utf8string_val;
    591 	len = str->utf8string_len;
    592 	if (len <= 0 || u8p == NULL) {
    593 		if (s)
    594 			*s = '\0';
    595 		return (NULL);
    596 	}
    597 
    598 	sp = s;
    599 	if (sp == NULL)
    600 		sp = kmem_alloc(len + 1, KM_SLEEP);
    601 
    602 	/*
    603 	 * At least check for embedded nulls
    604 	 */
    605 	for (i = 0; i < len; i++) {
    606 		sp[i] = u8p[i];
    607 		if (u8p[i] == '\0') {
    608 #ifdef	DEBUG
    609 			zcmn_err(getzoneid(), CE_WARN,
    610 			    "Embedded NULL in UTF-8 string");
    611 #endif
    612 			if (s == NULL)
    613 				kmem_free(sp, len + 1);
    614 			return (NULL);
    615 		}
    616 	}
    617 	sp[len] = '\0';
    618 	*lenp = len + 1;
    619 
    620 	return (sp);
    621 }
    622 
    623 /*
    624  * str_to_utf8 - converts a null-terminated C string to a utf8 string
    625  */
    626 utf8string *
    627 str_to_utf8(char *nm, utf8string *str)
    628 {
    629 	int len;
    630 
    631 	if (str == NULL)
    632 		return (NULL);
    633 
    634 	if (nm == NULL || *nm == '\0') {
    635 		str->utf8string_len = 0;
    636 		str->utf8string_val = NULL;
    637 	}
    638 
    639 	len = strlen(nm);
    640 
    641 	str->utf8string_val = kmem_alloc(len, KM_SLEEP);
    642 	str->utf8string_len = len;
    643 	bcopy(nm, str->utf8string_val, len);
    644 
    645 	return (str);
    646 }
    647 
    648 utf8string *
    649 utf8_copy(utf8string *src, utf8string *dest)
    650 {
    651 	if (src == NULL)
    652 		return (NULL);
    653 	if (dest == NULL)
    654 		return (NULL);
    655 
    656 	if (src->utf8string_len > 0) {
    657 		dest->utf8string_val = kmem_alloc(src->utf8string_len,
    658 		    KM_SLEEP);
    659 		bcopy(src->utf8string_val, dest->utf8string_val,
    660 		    src->utf8string_len);
    661 		dest->utf8string_len = src->utf8string_len;
    662 	} else {
    663 		dest->utf8string_val = NULL;
    664 		dest->utf8string_len = 0;
    665 	}
    666 
    667 	return (dest);
    668 }
    669 
    670 int
    671 utf8_compare(const utf8string *a, const utf8string *b)
    672 {
    673 	int mlen, cmp;
    674 	int alen, blen;
    675 	char *aval, *bval;
    676 
    677 	if ((a == NULL) && (b == NULL))
    678 		return (0);
    679 	else if (a == NULL)
    680 		return (-1);
    681 	else if (b == NULL)
    682 		return (1);
    683 
    684 	alen = a->utf8string_len;
    685 	blen = b->utf8string_len;
    686 	aval = a->utf8string_val;
    687 	bval = b->utf8string_val;
    688 
    689 	if (((alen == 0) || (aval == NULL)) &&
    690 	    ((blen == 0) || (bval == NULL)))
    691 		return (0);
    692 	else if ((alen == 0) || (aval == NULL))
    693 		return (-1);
    694 	else if ((blen == 0) || (bval == NULL))
    695 		return (1);
    696 
    697 	mlen = MIN(alen, blen);
    698 	cmp = strncmp(aval, bval, mlen);
    699 
    700 	if ((cmp == 0) && (alen == blen))
    701 		return (0);
    702 	else if ((cmp == 0) && (alen < blen))
    703 		return (-1);
    704 	else if (cmp == 0)
    705 		return (1);
    706 	else if (cmp < 0)
    707 		return (-1);
    708 	return (1);
    709 }
    710 
    711 /*
    712  * utf8_dir_verify - checks that the utf8 string is valid
    713  */
    714 int
    715 utf8_dir_verify(utf8string *str)
    716 {
    717 	char *nm;
    718 	int len;
    719 
    720 	if (str == NULL)
    721 		return (0);
    722 
    723 	nm = str->utf8string_val;
    724 	len = str->utf8string_len;
    725 	if (nm == NULL || len == 0) {
    726 		return (0);
    727 	}
    728 
    729 	if (len == 1 && nm[0] == '.')
    730 		return (0);
    731 	if (len == 2 && nm[0] == '.' && nm[1] == '.')
    732 		return (0);
    733 
    734 	if (utf8_strchr(str, '/') != NULL)
    735 		return (0);
    736 
    737 	if (utf8_strchr(str, '\0') != NULL)
    738 		return (0);
    739 
    740 	return (1);
    741 }
    742 
/*
 * from rpcsec module (common/rpcsec)
 *
 * sec_clnt_geth() is used by authget() to obtain an AUTH handle for a
 * CLIENT from a struct sec_data and a credential; it returns 0 on success
 * and an errno value otherwise.  sec_clnt_freeh() is used by clfree4() to
 * release such a handle.  sec_clnt_freeinfo() is not called in this part
 * of the file.
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);
    749 
    750 /*
    751  * authget() gets an auth handle based on the security
    752  * information from the servinfo in mountinfo.
    753  * The auth handle is stored in ch_client->cl_auth.
    754  *
    755  * First security flavor of choice is to use sv_secdata
    756  * which is initiated by the client. If that fails, get
    757  * secinfo from the server and then select one from the
    758  * server secinfo list .
    759  *
    760  * For RPCSEC_GSS flavor, upon success, a secure context is
    761  * established between client and server.
    762  */
    763 int
    764 authget(servinfo4_t *svp, CLIENT *ch_client, cred_t *cr)
    765 {
    766 	int error, i;
    767 
    768 	/*
    769 	 * SV4_TRYSECINFO indicates to try the secinfo list from
    770 	 * sv_secinfo until a successful one is reached. Point
    771 	 * sv_currsec to the selected security mechanism for
    772 	 * later sessions.
    773 	 */
    774 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
    775 	if ((svp->sv_flags & SV4_TRYSECINFO) && svp->sv_secinfo) {
    776 		for (i = svp->sv_secinfo->index; i < svp->sv_secinfo->count;
    777 		    i++) {
    778 			if (!(error = sec_clnt_geth(ch_client,
    779 			    &svp->sv_secinfo->sdata[i],
    780 			    cr, &ch_client->cl_auth))) {
    781 
    782 				svp->sv_currsec = &svp->sv_secinfo->sdata[i];
    783 				svp->sv_secinfo->index = i;
    784 				/* done */
    785 				svp->sv_flags &= ~SV4_TRYSECINFO;
    786 				break;
    787 			}
    788 
    789 			/*
    790 			 * Allow the caller retry with the security flavor
    791 			 * pointed by svp->sv_secinfo->index when
    792 			 * ETIMEDOUT/ECONNRESET occurs.
    793 			 */
    794 			if (error == ETIMEDOUT || error == ECONNRESET) {
    795 				svp->sv_secinfo->index = i;
    796 				break;
    797 			}
    798 		}
    799 	} else {
    800 		/* sv_currsec points to one of the entries in sv_secinfo */
    801 		if (svp->sv_currsec) {
    802 			error = sec_clnt_geth(ch_client, svp->sv_currsec, cr,
    803 			    &ch_client->cl_auth);
    804 		} else {
    805 			/* If it's null, use sv_secdata. */
    806 			error = sec_clnt_geth(ch_client, svp->sv_secdata, cr,
    807 			    &ch_client->cl_auth);
    808 		}
    809 	}
    810 	nfs_rw_exit(&svp->sv_lock);
    811 
    812 	return (error);
    813 }
    814 
/*
 * clget4 - obtain an RPC client handle with an auth handle attached.
 *
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 *
 * ci    - RPC program/version, read size, retransmit count and flags
 * svp   - server to talk to (knetconfig and address are taken from it)
 * cr    - credential used for the handle and for authget()
 * newcl - out: the client handle
 * chp   - out: the chtab entry that owns *newcl (hand back via clfree4())
 * nfscl - per-zone client data holding the handle cache and statistics
 *
 * The per-zone cache (nfscl_chtable4) is a list of chhead entries, one per
 * (program, version, transport device, protocol family), each with a list
 * of free chtab handles.  A free handle is re-initialized and reused when
 * one exists; otherwise a new handle is created.
 *
 * Returns 0 on success.  Returns EINVAL if newcl, chp or ci is NULL (svp
 * and nfscl are dereferenced without a check).  Otherwise returns the
 * error from clnt_tli_kcreate() or authget(), or EINTR if authget()
 * reported success but left cl_auth NULL.  On failure *newcl and *chp are
 * left NULL.
 */
int
clget4(clinfo_t *ci, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs4_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable4_lock);
	/* plistp tracks the link that points at 'ch', for unlinking below */
	plistp = &nfscl->nfscl_chtable4;
	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			/*
			 * The KM_SLEEP allocations are done with the cache
			 * lock dropped, so the list is searched again from
			 * 'top' afterwards.
			 */
			mutex_exit(&nfscl->nfscl_chtable4_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			goto top;
		}
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable4) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable4_lock);
		/*
		 * A preallocated chhead is still around if a matching
		 * entry appeared in the list while the lock was dropped;
		 * it is not needed, so release it.
		 */
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);

		/*
		 * Get an auth handle.
		 *
		 * NOTE(review): unlike the new-handle failure path below,
		 * this path does not decrement the DEBUG "clalloc"
		 * counter when it destroys the handle -- confirm whether
		 * that is intended.
		 */
		error = authget(svp, cp->ch_client, cr);
		if (error || cp->ch_client->cl_auth == NULL) {
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab4_cache, cp);
			return ((error != 0) ? error : EINTR);
		}
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
#endif
	mutex_exit(&nfscl->nfscl_chtable4_lock);

	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab4_cache, KM_SLEEP);
	cp->ch_head = ch;

	/*
	 * Create the handle between sigintr()/sigunintr(), with the
	 * mount's MI4_INT flag passed as the sigintr() argument.
	 */
	sigintr(&smask, (int)ci->cl_flags & MI4_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}
	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	/*
	 * Discard the auth handle that came with the new client handle;
	 * authget() below installs the one to use.
	 */
	auth_destroy(cp->ch_client->cl_auth);

	/*
	 * Get an auth handle.
	 */
	error = authget(svp, cp->ch_client, cr);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		return ((error != 0) ? error : EINTR);
	}
	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}
    985 
    986 static int
    987 nfs_clget4(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    988     struct chtab **chp, struct nfs4_clnt *nfscl)
    989 {
    990 	clinfo_t ci;
    991 	bool_t is_recov;
    992 	int firstcall, error = 0;
    993 
    994 	/*
    995 	 * Set read buffer size to rsize
    996 	 * and add room for RPC headers.
    997 	 */
    998 	ci.cl_readsize = mi->mi_tsize;
    999 	if (ci.cl_readsize != 0)
   1000 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
   1001 
   1002 	/*
   1003 	 * If soft mount and server is down just try once.
   1004 	 * meaning: do not retransmit.
   1005 	 */
   1006 	if (!(mi->mi_flags & MI4_HARD) && (mi->mi_flags & MI4_DOWN))
   1007 		ci.cl_retrans = 0;
   1008 	else
   1009 		ci.cl_retrans = mi->mi_retrans;
   1010 
   1011 	ci.cl_prog = mi->mi_prog;
   1012 	ci.cl_vers = mi->mi_vers;
   1013 	ci.cl_flags = mi->mi_flags;
   1014 
   1015 	/*
   1016 	 * clget4 calls authget() to get an auth handle. For RPCSEC_GSS
   1017 	 * security flavor, the client tries to establish a security context
   1018 	 * by contacting the server. If the connection is timed out or reset,
   1019 	 * e.g. server reboot, we will try again.
   1020 	 */
   1021 	is_recov = (curthread == mi->mi_recovthread);
   1022 	firstcall = 1;
   1023 
   1024 	do {
   1025 		error = clget4(&ci, svp, cr, newcl, chp, nfscl);
   1026 
   1027 		if (error == 0)
   1028 			break;
   1029 
   1030 		/*
   1031 		 * For forced unmount and zone shutdown, bail out but
   1032 		 * let the recovery thread do one more transmission.
   1033 		 */
   1034 		if ((FS_OR_ZONE_GONE4(mi->mi_vfsp)) &&
   1035 		    (!is_recov || !firstcall)) {
   1036 			error = EIO;
   1037 			break;
   1038 		}
   1039 
   1040 		/* do not retry for soft mount */
   1041 		if (!(mi->mi_flags & MI4_HARD))
   1042 			break;
   1043 
   1044 		/* let the caller deal with the failover case */
   1045 		if (FAILOVER_MOUNT4(mi))
   1046 			break;
   1047 
   1048 		firstcall = 0;
   1049 
   1050 	} while (error == ETIMEDOUT || error == ECONNRESET);
   1051 
   1052 	return (error);
   1053 }
   1054 
   1055 void
   1056 clfree4(CLIENT *cl, struct chtab *cp, struct nfs4_clnt *nfscl)
   1057 {
   1058 	if (cl->cl_auth != NULL) {
   1059 		sec_clnt_freeh(cl->cl_auth);
   1060 		cl->cl_auth = NULL;
   1061 	}
   1062 
   1063 	/*
   1064 	 * Timestamp this cache entry so that we know when it was last
   1065 	 * used.
   1066 	 */
   1067 	cp->ch_freed = gethrestime_sec();
   1068 
   1069 	/*
   1070 	 * Add the free client handle to the front of the list.
   1071 	 * This way, the list will be sorted in youngest to oldest
   1072 	 * order.
   1073 	 */
   1074 	mutex_enter(&nfscl->nfscl_chtable4_lock);
   1075 	cp->ch_list = cp->ch_head->ch_list;
   1076 	cp->ch_head->ch_list = cp;
   1077 	mutex_exit(&nfscl->nfscl_chtable4_lock);
   1078 }
   1079 
   1080 #define	CL_HOLDTIME	60	/* time to hold client handles */
   1081 
   1082 static void
   1083 clreclaim4_zone(struct nfs4_clnt *nfscl, uint_t cl_holdtime)
   1084 {
   1085 	struct chhead *ch;
   1086 	struct chtab *cp;	/* list of objects that can be reclaimed */
   1087 	struct chtab *cpe;
   1088 	struct chtab *cpl;
   1089 	struct chtab **cpp;
   1090 #ifdef DEBUG
   1091 	int n = 0;
   1092 	clstat4_debug.clreclaim.value.ui64++;
   1093 #endif
   1094 
   1095 	/*
   1096 	 * Need to reclaim some memory, so step through the cache
   1097 	 * looking through the lists for entries which can be freed.
   1098 	 */
   1099 	cp = NULL;
   1100 
   1101 	mutex_enter(&nfscl->nfscl_chtable4_lock);
   1102 
   1103 	/*
   1104 	 * Here we step through each non-NULL quadruple and start to
   1105 	 * construct the reclaim list pointed to by cp.  Note that
   1106 	 * cp will contain all eligible chtab entries.  When this traversal
   1107 	 * completes, chtab entries from the last quadruple will be at the
   1108 	 * front of cp and entries from previously inspected quadruples have
   1109 	 * been appended to the rear of cp.
   1110 	 */
   1111 	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
   1112 		if (ch->ch_list == NULL)
   1113 			continue;
   1114 		/*
   1115 		 * Search each list for entries older then
   1116 		 * cl_holdtime seconds.  The lists are maintained
   1117 		 * in youngest to oldest order so that when the
   1118 		 * first entry is found which is old enough, then
   1119 		 * all of the rest of the entries on the list will
   1120 		 * be old enough as well.
   1121 		 */
   1122 		cpl = ch->ch_list;
   1123 		cpp = &ch->ch_list;
   1124 		while (cpl != NULL &&
   1125 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
   1126 			cpp = &cpl->ch_list;
   1127 			cpl = cpl->ch_list;
   1128 		}
   1129 		if (cpl != NULL) {
   1130 			*cpp = NULL;
   1131 			if (cp != NULL) {
   1132 				cpe = cpl;
   1133 				while (cpe->ch_list != NULL)
   1134 					cpe = cpe->ch_list;
   1135 				cpe->ch_list = cp;
   1136 			}
   1137 			cp = cpl;
   1138 		}
   1139 	}
   1140 
   1141 	mutex_exit(&nfscl->nfscl_chtable4_lock);
   1142 
   1143 	/*
   1144 	 * If cp is empty, then there is nothing to reclaim here.
   1145 	 */
   1146 	if (cp == NULL)
   1147 		return;
   1148 
   1149 	/*
   1150 	 * Step through the list of entries to free, destroying each client
   1151 	 * handle and kmem_free'ing the memory for each entry.
   1152 	 */
   1153 	while (cp != NULL) {
   1154 #ifdef DEBUG
   1155 		n++;
   1156 #endif
   1157 		CLNT_DESTROY(cp->ch_client);
   1158 		cpl = cp->ch_list;
   1159 		kmem_cache_free(chtab4_cache, cp);
   1160 		cp = cpl;
   1161 	}
   1162 
   1163 #ifdef DEBUG
   1164 	/*
   1165 	 * Update clalloc so that nfsstat shows the current number
   1166 	 * of allocated client handles.
   1167 	 */
   1168 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
   1169 #endif
   1170 }
   1171 
   1172 /* ARGSUSED */
   1173 static void
   1174 clreclaim4(void *all)
   1175 {
   1176 	struct nfs4_clnt *nfscl;
   1177 
   1178 	/*
   1179 	 * The system is low on memory; go through and try to reclaim some from
   1180 	 * every zone on the system.
   1181 	 */
   1182 	mutex_enter(&nfs4_clnt_list_lock);
   1183 	nfscl = list_head(&nfs4_clnt_list);
   1184 	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl))
   1185 		clreclaim4_zone(nfscl, CL_HOLDTIME);
   1186 	mutex_exit(&nfs4_clnt_list_lock);
   1187 }
   1188 
/*
 * Minimum time-out values indexed by call type.
 * These units are in "eighths" of a second to avoid multiplies.
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/*
 * One tenth of NFS_COTS_TIMEO.  nfs4_rfscall() uses it as an upper bound
 * on mi_timeo while the zone is shutting down.
 */
#define	SHORTWAIT	(NFS_COTS_TIMEO / 10)

/*
 * Back off for retransmission timeout.  MAXTIMO is expressed in ticks
 * (20 * hz).
 *
 * backoff(tim) leaves tim alone once it has reached MAXTIMO; otherwise
 * dobackoff(tim) doubles it and clamps the result to MAXTIMO.  Both
 * macros evaluate their argument more than once, so pass an expression
 * without side effects.
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
   1205 
   1206 static int
   1207 nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
   1208     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *doqueue,
   1209     enum clnt_stat *rpc_statusp, int flags, struct nfs4_clnt *nfscl)
   1210 {
   1211 	CLIENT *client;
   1212 	struct chtab *ch;
   1213 	cred_t *cr = icr;
   1214 	struct rpc_err rpcerr, rpcerr_tmp;
   1215 	enum clnt_stat status;
   1216 	int error;
   1217 	struct timeval wait;
   1218 	int timeo;		/* in units of hz */
   1219 	bool_t tryagain, is_recov;
   1220 	bool_t cred_cloned = FALSE;
   1221 	k_sigset_t smask;
   1222 	servinfo4_t *svp;
   1223 #ifdef DEBUG
   1224 	char *bufp;
   1225 #endif
   1226 	int firstcall;
   1227 
   1228 	rpcerr.re_status = RPC_SUCCESS;
   1229 
   1230 	/*
   1231 	 * If we know that we are rebooting then let's
   1232 	 * not bother with doing any over the wireness.
   1233 	 */
   1234 	mutex_enter(&mi->mi_lock);
   1235 	if (mi->mi_flags & MI4_SHUTDOWN) {
   1236 		mutex_exit(&mi->mi_lock);
   1237 		return (EIO);
   1238 	}
   1239 	mutex_exit(&mi->mi_lock);
   1240 
   1241 	/* For TSOL, use a new cred which has net_mac_aware flag */
   1242 	if (!cred_cloned && is_system_labeled()) {
   1243 		cred_cloned = TRUE;
   1244 		cr = crdup(icr);
   1245 		(void) setpflags(NET_MAC_AWARE, 1, cr);
   1246 	}
   1247 
   1248 	/*
   1249 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
   1250 	 * are guaranteed to reprocess the retry as a new request.
   1251 	 */
   1252 	svp = mi->mi_curr_serv;
   1253 	rpcerr.re_errno = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
   1254 	if (rpcerr.re_errno != 0)
   1255 		return (rpcerr.re_errno);
   1256 
   1257 	timeo = (mi->mi_timeo * hz) / 10;
   1258 
   1259 	/*
   1260 	 * If hard mounted fs, retry call forever unless hard error
   1261 	 * occurs.
   1262 	 *
   1263 	 * For forced unmount, let the recovery thread through but return
   1264 	 * an error for all others.  This is so that user processes can
   1265 	 * exit quickly.  The recovery thread bails out after one
   1266 	 * transmission so that it can tell if it needs to continue.
   1267 	 *
   1268 	 * For zone shutdown, behave as above to encourage quick
   1269 	 * process exit, but also fail quickly when servers have
   1270 	 * timed out before and reduce the timeouts.
   1271 	 */
   1272 	is_recov = (curthread == mi->mi_recovthread);
   1273 	firstcall = 1;
   1274 	do {
   1275 		tryagain = FALSE;
   1276 
   1277 		NFS4_DEBUG(nfs4_rfscall_debug, (CE_NOTE,
   1278 		    "nfs4_rfscall: vfs_flag=0x%x, %s",
   1279 		    mi->mi_vfsp->vfs_flag,
   1280 		    is_recov ? "recov thread" : "not recov thread"));
   1281 
   1282 		/*
   1283 		 * It's possible while we're retrying the admin
   1284 		 * decided to reboot.
   1285 		 */
   1286 		mutex_enter(&mi->mi_lock);
   1287 		if (mi->mi_flags & MI4_SHUTDOWN) {
   1288 			mutex_exit(&mi->mi_lock);
   1289 			clfree4(client, ch, nfscl);
   1290 			if (cred_cloned)
   1291 				crfree(cr);
   1292 			return (EIO);
   1293 		}
   1294 		mutex_exit(&mi->mi_lock);
   1295 
   1296 		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
   1297 		    (!is_recov || !firstcall)) {
   1298 			clfree4(client, ch, nfscl);
   1299 			if (cred_cloned)
   1300 				crfree(cr);
   1301 			return (EIO);
   1302 		}
   1303 
   1304 		if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) {
   1305 			mutex_enter(&mi->mi_lock);
   1306 			if ((mi->mi_flags & MI4_TIMEDOUT) ||
   1307 			    !is_recov || !firstcall) {
   1308 				mutex_exit(&mi->mi_lock);
   1309 				clfree4(client, ch, nfscl);
   1310 				if (cred_cloned)
   1311 					crfree(cr);
   1312 				return (EIO);
   1313 			}
   1314 			mutex_exit(&mi->mi_lock);
   1315 			timeo = (MIN(mi->mi_timeo, SHORTWAIT) * hz) / 10;
   1316 		}
   1317 
   1318 		firstcall = 0;
   1319 		TICK_TO_TIMEVAL(timeo, &wait);
   1320 
   1321 		/*
   1322 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
   1323 		 * and SIGTERM. (Preserving the existing masks).
   1324 		 * Mask out SIGINT if mount option nointr is specified.
   1325 		 */
   1326 		sigintr(&smask, (int)mi->mi_flags & MI4_INT);
   1327 		if (!(mi->mi_flags & MI4_INT))
   1328 			client->cl_nosignal = TRUE;
   1329 
   1330 		/*
   1331 		 * If there is a current signal, then don't bother
   1332 		 * even trying to send out the request because we
   1333 		 * won't be able to block waiting for the response.
   1334 		 * Simply assume RPC_INTR and get on with it.
   1335 		 */
   1336 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
   1337 			status = RPC_INTR;
   1338 		else {
   1339 			status = CLNT_CALL(client, which, xdrargs, argsp,
   1340 			    xdrres, resp, wait);
   1341 		}
   1342 
   1343 		if (!(mi->mi_flags & MI4_INT))
   1344 			client->cl_nosignal = FALSE;
   1345 		/*
   1346 		 * restore original signal mask
   1347 		 */
   1348 		sigunintr(&smask);
   1349 
   1350 		switch (status) {
   1351 		case RPC_SUCCESS:
   1352 			break;
   1353 
   1354 		case RPC_INTR:
   1355 			/*
   1356 			 * There is no way to recover from this error,
   1357 			 * even if mount option nointr is specified.
   1358 			 * SIGKILL, for example, cannot be blocked.
   1359 			 */
   1360 			rpcerr.re_status = RPC_INTR;
   1361 			rpcerr.re_errno = EINTR;
   1362 			break;
   1363 
   1364 		case RPC_UDERROR:
   1365 			/*
   1366 			 * If the NFS server is local (vold) and
   1367 			 * it goes away then we get RPC_UDERROR.
   1368 			 * This is a retryable error, so we would
   1369 			 * loop, so check to see if the specific
   1370 			 * error was ECONNRESET, indicating that
   1371 			 * target did not exist at all.  If so,
   1372 			 * return with RPC_PROGUNAVAIL and
   1373 			 * ECONNRESET to indicate why.
   1374 			 */
   1375 			CLNT_GETERR(client, &rpcerr);
   1376 			if (rpcerr.re_errno == ECONNRESET) {
   1377 				rpcerr.re_status = RPC_PROGUNAVAIL;
   1378 				rpcerr.re_errno = ECONNRESET;
   1379 				break;
   1380 			}
   1381 			/*FALLTHROUGH*/
   1382 
   1383 		default:		/* probably RPC_TIMEDOUT */
   1384 
   1385 			if (IS_UNRECOVERABLE_RPC(status))
   1386 				break;
   1387 
   1388 			/*
   1389 			 * increment server not responding count
   1390 			 */
   1391 			mutex_enter(&mi->mi_lock);
   1392 			mi->mi_noresponse++;
   1393 			mutex_exit(&mi->mi_lock);
   1394 #ifdef DEBUG
   1395 			nfscl->nfscl_stat.noresponse.value.ui64++;
   1396 #endif
   1397 			/*
   1398 			 * On zone shutdown, mark server dead and move on.
   1399 			 */
   1400 			if (zone_status_get(curproc->p_zone) >=
   1401 			    ZONE_IS_SHUTTING_DOWN) {
   1402 				mutex_enter(&mi->mi_lock);
   1403 				mi->mi_flags |= MI4_TIMEDOUT;
   1404 				mutex_exit(&mi->mi_lock);
   1405 				clfree4(client, ch, nfscl);
   1406 				if (cred_cloned)
   1407 					crfree(cr);
   1408 				return (EIO);
   1409 			}
   1410 
   1411 			/*
   1412 			 * NFS client failover support:
   1413 			 * return and let the caller take care of
   1414 			 * failover.  We only return for failover mounts
   1415 			 * because otherwise we want the "not responding"
   1416 			 * message, the timer updates, etc.
   1417 			 */
   1418 			if (mi->mi_vers == 4 && FAILOVER_MOUNT4(mi) &&
   1419 			    (error = try_failover(status)) != 0) {
   1420 				clfree4(client, ch, nfscl);
   1421 				if (cred_cloned)
   1422 					crfree(cr);
   1423 				*rpc_statusp = status;
   1424 				return (error);
   1425 			}
   1426 
   1427 			if (flags & RFSCALL_SOFT)
   1428 				break;
   1429 
   1430 			tryagain = TRUE;
   1431 
   1432 			/*
   1433 			 * The call is in progress (over COTS).
   1434 			 * Try the CLNT_CALL again, but don't
   1435 			 * print a noisy error message.
   1436 			 */
   1437 			if (status == RPC_INPROGRESS)
   1438 				break;
   1439 
   1440 			timeo = backoff(timeo);
   1441 			CLNT_GETERR(client, &rpcerr_tmp);
   1442 
   1443 			mutex_enter(&mi->mi_lock);
   1444 			if (!(mi->mi_flags & MI4_PRINTED)) {
   1445 				mi->mi_flags |= MI4_PRINTED;
   1446 				mutex_exit(&mi->mi_lock);
   1447 				if ((status == RPC_CANTSEND) &&
   1448 				    (rpcerr_tmp.re_errno == ENOBUFS))
   1449 					nfs4_queue_fact(RF_SENDQ_FULL, mi, 0,
   1450 					    0, 0, FALSE, NULL, 0, NULL);
   1451 				else
   1452 					nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi,
   1453 					    0, 0, 0, FALSE, NULL, 0, NULL);
   1454 			} else
   1455 				mutex_exit(&mi->mi_lock);
   1456 
   1457 			if (*doqueue && nfs_has_ctty()) {
   1458 				*doqueue = 0;
   1459 				if (!(mi->mi_flags & MI4_NOPRINT)) {
   1460 					if ((status == RPC_CANTSEND) &&
   1461 					    (rpcerr_tmp.re_errno == ENOBUFS))
   1462 						nfs4_queue_fact(RF_SENDQ_FULL,
   1463 						    mi, 0, 0, 0, FALSE, NULL,
   1464 						    0, NULL);
   1465 					else
   1466 						nfs4_queue_fact(
   1467 						    RF_SRV_NOT_RESPOND, mi, 0,
   1468 						    0, 0, FALSE, NULL, 0, NULL);
   1469 				}
   1470 			}
   1471 		}
   1472 	} while (tryagain);
   1473 
   1474 	DTRACE_PROBE2(nfs4__rfscall_debug, enum clnt_stat, status,
   1475 	    int, rpcerr.re_errno);
   1476 
   1477 	if (status != RPC_SUCCESS) {
   1478 		zoneid_t zoneid = mi->mi_zone->zone_id;
   1479 
   1480 		/*
   1481 		 * Let soft mounts use the timed out message.
   1482 		 */
   1483 		if (status == RPC_INPROGRESS)
   1484 			status = RPC_TIMEDOUT;
   1485 		nfscl->nfscl_stat.badcalls.value.ui64++;
   1486 		if (status != RPC_INTR) {
   1487 			mutex_enter(&mi->mi_lock);
   1488 			mi->mi_flags |= MI4_DOWN;
   1489 			mutex_exit(&mi->mi_lock);
   1490 			CLNT_GETERR(client, &rpcerr);
   1491 #ifdef DEBUG
   1492 			bufp = clnt_sperror(client, svp->sv_hostname);
   1493 			zprintf(zoneid, "NFS%d %s failed for %s\n",
   1494 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
   1495 			if (nfs_has_ctty()) {
   1496 				if (!(mi->mi_flags & MI4_NOPRINT)) {
   1497 					uprintf("NFS%d %s failed for %s\n",
   1498 					    mi->mi_vers, mi->mi_rfsnames[which],
   1499 					    bufp);
   1500 				}
   1501 			}
   1502 			kmem_free(bufp, MAXPATHLEN);
   1503 #else
   1504 			zprintf(zoneid,
   1505 			    "NFS %s failed for server %s: error %d (%s)\n",
   1506 			    mi->mi_rfsnames[which], svp->sv_hostname,
   1507 			    status, clnt_sperrno(status));
   1508 			if (nfs_has_ctty()) {
   1509 				if (!(mi->mi_flags & MI4_NOPRINT)) {
   1510 					uprintf(
   1511 				"NFS %s failed for server %s: error %d (%s)\n",
   1512 					    mi->mi_rfsnames[which],
   1513 					    svp->sv_hostname, status,
   1514 					    clnt_sperrno(status));
   1515 				}
   1516 			}
   1517 #endif
   1518 			/*
   1519 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
   1520 			 * re_errno is set appropriately depending on
   1521 			 * the authentication error
   1522 			 */
   1523 			if (status == RPC_VERSMISMATCH ||
   1524 			    status == RPC_PROGVERSMISMATCH)
   1525 				rpcerr.re_errno = EIO;
   1526 		}
   1527 	} else {
   1528 		/*
   1529 		 * Test the value of mi_down and mi_printed without
   1530 		 * holding the mi_lock mutex.  If they are both zero,
   1531 		 * then it is okay to skip the down and printed
   1532 		 * processing.  This saves on a mutex_enter and
   1533 		 * mutex_exit pair for a normal, successful RPC.
   1534 		 * This was just complete overhead.
   1535 		 */
   1536 		if (mi->mi_flags & (MI4_DOWN | MI4_PRINTED)) {
   1537 			mutex_enter(&mi->mi_lock);
   1538 			mi->mi_flags &= ~MI4_DOWN;
   1539 			if (mi->mi_flags & MI4_PRINTED) {
   1540 				mi->mi_flags &= ~MI4_PRINTED;
   1541 				mutex_exit(&mi->mi_lock);
   1542 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
   1543 					nfs4_queue_fact(RF_SRV_OK, mi, 0, 0,
   1544 					    0, FALSE, NULL, 0, NULL);
   1545 			} else
   1546 				mutex_exit(&mi->mi_lock);
   1547 		}
   1548 
   1549 		if (*doqueue == 0) {
   1550 			if (!(mi->mi_flags & MI4_NOPRINT) &&
   1551 			    !(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
   1552 				nfs4_queue_fact(RF_SRV_OK, mi, 0, 0, 0,
   1553 				    FALSE, NULL, 0, NULL);
   1554 
   1555 			*doqueue = 1;
   1556 		}
   1557 	}
   1558 
   1559 	clfree4(client, ch, nfscl);
   1560 	if (cred_cloned)
   1561 		crfree(cr);
   1562 
   1563 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
   1564 
   1565 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "nfs4_rfscall_end:errno %d",
   1566 	    rpcerr.re_errno);
   1567 
   1568 	*rpc_statusp = status;
   1569 	return (rpcerr.re_errno);
   1570 }
   1571 
   1572 /*
   1573  * rfs4call - general wrapper for RPC calls initiated by the client
   1574  */
   1575 void
   1576 rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp,
   1577     cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep)
   1578 {
   1579 	int i, error;
   1580 	enum clnt_stat rpc_status = NFS4_OK;
   1581 	int num_resops;
   1582 	struct nfs4_clnt *nfscl;
   1583 
   1584 	ASSERT(nfs_zone() == mi->mi_zone);
   1585 	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
   1586 	ASSERT(nfscl != NULL);
   1587 
   1588 	nfscl->nfscl_stat.calls.value.ui64++;
   1589 	mi->mi_reqs[NFSPROC4_COMPOUND].value.ui64++;
   1590 
   1591 	/* Set up the results struct for XDR usage */
   1592 	resp->argsp = argsp;
   1593 	resp->array = NULL;
   1594 	resp->status = 0;
   1595 	resp->decode_len = 0;
   1596 
   1597 	error = nfs4_rfscall(mi, NFSPROC4_COMPOUND,
   1598 	    xdr_COMPOUND4args_clnt, (caddr_t)argsp,
   1599 	    xdr_COMPOUND4res_clnt, (caddr_t)resp, cr,
   1600 	    doqueue, &rpc_status, flags, nfscl);
   1601 
   1602 	/* Return now if it was an RPC error */
   1603 	if (error) {
   1604 		ep->error = error;
   1605 		ep->stat = resp->status;
   1606 		ep->rpc_status = rpc_status;
   1607 		return;
   1608 	}
   1609 
   1610 	/* else we'll count the processed operations */
   1611 	num_resops = resp->decode_len;
   1612 	for (i = 0; i < num_resops; i++) {
   1613 		/*
   1614 		 * Count the individual operations
   1615 		 * processed by the server.
   1616 		 */
   1617 		if (resp->array[i].resop >= NFSPROC4_NULL &&
   1618 		    resp->array[i].resop <= OP_WRITE)
   1619 			mi->mi_reqs[resp->array[i].resop].value.ui64++;
   1620 	}
   1621 
   1622 	ep->error = 0;
   1623 	ep->stat = resp->status;
   1624 	ep->rpc_status = rpc_status;
   1625 }
   1626 
   1627 /*
   1628  * nfs4rename_update - updates stored state after a rename.  Currently this
   1629  * is the path of the object and anything under it, and the filehandle of
   1630  * the renamed object.
   1631  */
   1632 void
   1633 nfs4rename_update(vnode_t *renvp, vnode_t *ndvp, nfs_fh4 *nfh4p, char *nnm)
   1634 {
   1635 	sfh4_update(VTOR4(renvp)->r_fh, nfh4p);
   1636 	fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, nnm);
   1637 }
   1638 
   1639 /*
   1640  * Routine to look up the filehandle for the given path and rootvp.
   1641  *
   1642  * Return values:
   1643  * - success: returns zero and *statp is set to NFS4_OK, and *fhp is
   1644  *   updated.
   1645  * - error: return value (errno value) and/or *statp is set appropriately.
   1646  */
   1647 #define	RML_ORDINARY	1
   1648 #define	RML_NAMED_ATTR	2
   1649 #define	RML_ATTRDIR	3
   1650 
   1651 static void
   1652 remap_lookup(nfs4_fname_t *fname, vnode_t *rootvp,
   1653     int filetype, cred_t *cr,
   1654     nfs_fh4 *fhp, nfs4_ga_res_t *garp,		/* fh, attrs for object */
   1655     nfs_fh4 *pfhp, nfs4_ga_res_t *pgarp,	/* fh, attrs for parent */
   1656     nfs4_error_t *ep)
   1657 {
   1658 	COMPOUND4args_clnt args;
   1659 	COMPOUND4res_clnt res;
   1660 	nfs_argop4 *argop;
   1661 	nfs_resop4 *resop;
   1662 	int num_argops;
   1663 	lookup4_param_t lookuparg;
   1664 	nfs_fh4 *tmpfhp;
   1665 	int doqueue = 1;
   1666 	char *path;
   1667 	mntinfo4_t *mi;
   1668 
   1669 	ASSERT(fname != NULL);
   1670 	ASSERT(rootvp->v_type == VDIR);
   1671 
   1672 	mi = VTOMI4(rootvp);
   1673 	path = fn_path(fname);
   1674 	switch (filetype) {
   1675 	case RML_NAMED_ATTR:
   1676 		lookuparg.l4_getattrs = LKP4_LAST_NAMED_ATTR;
   1677 		args.ctag = TAG_REMAP_LOOKUP_NA;
   1678 		break;
   1679 	case RML_ATTRDIR:
   1680 		lookuparg.l4_getattrs = LKP4_LAST_ATTRDIR;
   1681 		args.ctag = TAG_REMAP_LOOKUP_AD;
   1682 		break;
   1683 	case RML_ORDINARY:
   1684 		lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
   1685 		args.ctag = TAG_REMAP_LOOKUP;
   1686 		break;
   1687 	default:
   1688 		ep->error = EINVAL;
   1689 		return;
   1690 	}
   1691 	lookuparg.argsp = &args;
   1692 	lookuparg.resp = &res;
   1693 	lookuparg.header_len = 1;	/* Putfh */
   1694 	lookuparg.trailer_len = 0;
   1695 	lookuparg.ga_bits = NFS4_VATTR_MASK;
   1696 	lookuparg.mi = VTOMI4(rootvp);
   1697 
   1698 	(void) nfs4lookup_setup(path, &lookuparg, 1);
   1699 
   1700 	/* 0: putfh directory */
   1701 	argop = args.array;
   1702 	argop[0].argop = OP_CPUTFH;
   1703 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(rootvp)->r_fh;
   1704 
   1705 	num_argops = args.array_len;
   1706 
   1707 	rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
   1708 
   1709 	if (ep->error || res.status != NFS4_OK)
   1710 		goto exit;
   1711 
   1712 	/* get the object filehandle */
   1713 	resop = &res.array[res.array_len - 2];
   1714 	if (resop->resop != OP_GETFH) {
   1715 		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
   1716 		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1717 		ep->stat = NFS4ERR_SERVERFAULT;
   1718 		goto exit;
   1719 	}
   1720 	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
   1721 	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
   1722 		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
   1723 		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
   1724 		    TAG_NONE, 0, 0);
   1725 		ep->stat = NFS4ERR_SERVERFAULT;
   1726 		goto exit;
   1727 	}
   1728 	fhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
   1729 	nfs_fh4_copy(tmpfhp, fhp);
   1730 
   1731 	/* get the object attributes */
   1732 	resop = &res.array[res.array_len - 1];
   1733 	if (garp && resop->resop == OP_GETATTR)
   1734 		*garp = resop->nfs_resop4_u.opgetattr.ga_res;
   1735 
   1736 	/* See if there are enough fields in the response for parent info */
   1737 	if ((int)res.array_len - 5 <= 0)
   1738 		goto exit;
   1739 
   1740 	/* get the parent filehandle */
   1741 	resop = &res.array[res.array_len - 5];
   1742 	if (resop->resop != OP_GETFH) {
   1743 		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
   1744 		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1745 		ep->stat = NFS4ERR_SERVERFAULT;
   1746 		goto exit;
   1747 	}
   1748 	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
   1749 	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
   1750 		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
   1751 		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
   1752 		    TAG_NONE, 0, 0);
   1753 		ep->stat = NFS4ERR_SERVERFAULT;
   1754 		goto exit;
   1755 	}
   1756 	pfhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
   1757 	nfs_fh4_copy(tmpfhp, pfhp);
   1758 
   1759 	/* get the parent attributes */
   1760 	resop = &res.array[res.array_len - 4];
   1761 	if (pgarp && resop->resop == OP_GETATTR)
   1762 		*pgarp = resop->nfs_resop4_u.opgetattr.ga_res;
   1763 
   1764 exit:
   1765 	/*
   1766 	 * It is too hard to remember where all the OP_LOOKUPs are
   1767 	 */
   1768 	nfs4args_lookup_free(argop, num_argops);
   1769 	kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
   1770 
   1771 	if (!ep->error)
   1772 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1773 	kmem_free(path, strlen(path)+1);
   1774 }
   1775 
   1776 /*
   1777  * NFS client failover / volatile filehandle support
   1778  *
   1779  * Recover the filehandle for the given rnode.
   1780  *
   1781  * Errors are returned via the nfs4_error_t parameter.
   1782  */
   1783 
   1784 void
   1785 nfs4_remap_file(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
   1786 {
   1787 	int is_stub;
   1788 	rnode4_t *rp = VTOR4(vp);
   1789 	vnode_t *rootvp = NULL;
   1790 	vnode_t *dvp = NULL;
   1791 	cred_t *cr, *cred_otw;
   1792 	nfs4_ga_res_t gar, pgar;
   1793 	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
   1794 	int filetype = RML_ORDINARY;
   1795 	nfs4_recov_state_t recov = {NULL, 0, 0};
   1796 	int badfhcount = 0;
   1797 	nfs4_open_stream_t *osp = NULL;
   1798 	bool_t first_time = TRUE;	/* first time getting OTW cred */
   1799 	bool_t last_time = FALSE;	/* last time getting OTW cred */
   1800 
   1801 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1802 	    "nfs4_remap_file: remapping %s", rnode4info(rp)));
   1803 	ASSERT(nfs4_consistent_type(vp));
   1804 
   1805 	if (vp->v_flag & VROOT) {
   1806 		nfs4_remap_root(mi, ep, flags);
   1807 		return;
   1808 	}
   1809 
   1810 	/*
   1811 	 * Given the root fh, use the path stored in
   1812 	 * the rnode to find the fh for the new server.
   1813 	 */
   1814 	ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
   1815 	if (ep->error != 0)
   1816 		return;
   1817 
   1818 	cr = curthread->t_cred;
   1819 	ASSERT(cr != NULL);
   1820 get_remap_cred:
   1821 	/*
   1822 	 * Releases the osp, if it is provided.
   1823 	 * Puts a hold on the cred_otw and the new osp (if found).
   1824 	 */
   1825 	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
   1826 	    &first_time, &last_time);
   1827 	ASSERT(cred_otw != NULL);
   1828 
   1829 	if (rp->r_flags & R4ISXATTR) {
   1830 		filetype = RML_NAMED_ATTR;
   1831 		(void) vtodv(vp, &dvp, cred_otw, FALSE);
   1832 	}
   1833 
   1834 	if (vp->v_flag & V_XATTRDIR) {
   1835 		filetype = RML_ATTRDIR;
   1836 	}
   1837 
   1838 	if (filetype == RML_ORDINARY && rootvp->v_type == VREG) {
   1839 		/* file mount, doesn't need a remap */
   1840 		goto done;
   1841 	}
   1842 
   1843 again:
   1844 	remap_lookup(rp->r_svnode.sv_name, rootvp, filetype, cred_otw,
   1845 	    &newfh, &gar, &newpfh, &pgar, ep);
   1846 
   1847 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1848 	    "nfs4_remap_file: remap_lookup returned %d/%d",
   1849 	    ep->error, ep->stat));
   1850 
   1851 	if (last_time == FALSE && ep->error == EACCES) {
   1852 		crfree(cred_otw);
   1853 		if (dvp != NULL)
   1854 			VN_RELE(dvp);
   1855 		goto get_remap_cred;
   1856 	}
   1857 	if (ep->error != 0)
   1858 		goto done;
   1859 
   1860 	switch (ep->stat) {
   1861 	case NFS4_OK:
   1862 		badfhcount = 0;
   1863 		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
   1864 			mutex_enter(&rp->r_statelock);
   1865 			rp->r_delay_interval = 0;
   1866 			mutex_exit(&rp->r_statelock);
   1867 			uprintf("NFS File Available..\n");
   1868 		}
   1869 		break;
   1870 	case NFS4ERR_FHEXPIRED:
   1871 	case NFS4ERR_BADHANDLE:
   1872 		/*
   1873 		 * If we ran into filehandle problems, we should try to
   1874 		 * remap the root vnode first and hope life gets better.
   1875 		 * But we need to avoid loops.
   1876 		 */
   1877 		if (badfhcount++ > 0)
   1878 			goto done;
   1879 		if (newfh.nfs_fh4_len != 0) {
   1880 			kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
   1881 			newfh.nfs_fh4_len = 0;
   1882 		}
   1883 		if (newpfh.nfs_fh4_len != 0) {
   1884 			kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
   1885 			newpfh.nfs_fh4_len = 0;
   1886 		}
   1887 		/* relative path - remap rootvp then retry */
   1888 		VN_RELE(rootvp);
   1889 		rootvp = NULL;
   1890 		nfs4_remap_root(mi, ep, flags);
   1891 		if (ep->error != 0 || ep->stat != NFS4_OK)
   1892 			goto done;
   1893 		ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
   1894 		if (ep->error != 0)
   1895 			goto done;
   1896 		goto again;
   1897 	case NFS4ERR_DELAY:
   1898 		badfhcount = 0;
   1899 		nfs4_set_delay_wait(vp);
   1900 		ep->error = nfs4_wait_for_delay(vp, &recov);
   1901 		if (ep->error != 0)
   1902 			goto done;
   1903 		goto again;
   1904 	case NFS4ERR_ACCESS:
   1905 		/* get new cred, try again */
   1906 		if (last_time == TRUE)
   1907 			goto done;
   1908 		if (dvp != NULL)
   1909 			VN_RELE(dvp);
   1910 		crfree(cred_otw);
   1911 		goto get_remap_cred;
   1912 	default:
   1913 		goto done;
   1914 	}
   1915 
   1916 	/*
   1917 	 * Check on the new and old rnodes before updating;
   1918 	 * if the vnode type or size changes, issue a warning
   1919 	 * and mark the file dead.
   1920 	 */
   1921 	mutex_enter(&rp->r_statelock);
   1922 	if (flags & NFS4_REMAP_CKATTRS) {
   1923 		if (vp->v_type != gar.n4g_va.va_type ||
   1924 		    (vp->v_type != VDIR &&
   1925 		    rp->r_size != gar.n4g_va.va_size)) {
   1926 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1927 			    "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
   1928 			    (int)rp->r_size, (int)gar.n4g_va.va_size,
   1929 			    vp->v_type, gar.n4g_va.va_type));
   1930 			mutex_exit(&rp->r_statelock);
   1931 			nfs4_queue_event(RE_FILE_DIFF, mi,
   1932 			    rp->r_server->sv_hostname, 0, vp, NULL, 0, NULL, 0,
   1933 			    TAG_NONE, TAG_NONE, 0, 0);
   1934 			nfs4_fail_recov(vp, NULL, 0, NFS4_OK);
   1935 			goto done;
   1936 		}
   1937 	}
   1938 	ASSERT(gar.n4g_va.va_type != VNON);
   1939 	rp->r_server = mi->mi_curr_serv;
   1940 
   1941 	/*
   1942 	 * Turn this object into a "stub" object if we
   1943 	 * crossed an underlying server fs boundary.
   1944 	 *
   1945 	 * This stub will be for a mirror-mount.
   1946 	 *
   1947 	 * See comment in r4_do_attrcache() for more details.
   1948 	 */
   1949 	is_stub = 0;
   1950 	if (gar.n4g_fsid_valid) {
   1951 		(void) nfs_rw_enter_sig(&rp->r_server->sv_lock, RW_READER, 0);
   1952 		rp->r_srv_fsid = gar.n4g_fsid;
   1953 		if (!FATTR4_FSID_EQ(&gar.n4g_fsid, &rp->r_server->sv_fsid))
   1954 			is_stub = 1;
   1955 		nfs_rw_exit(&rp->r_server->sv_lock);
   1956 #ifdef DEBUG
   1957 	} else {
   1958 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1959 		    "remap_file: fsid attr not provided by server.  rp=%p",
   1960 		    (void *)rp));
   1961 #endif
   1962 	}
   1963 	if (is_stub)
   1964 		r4_stub_mirrormount(rp);
   1965 	else
   1966 		r4_stub_none(rp);
   1967 	mutex_exit(&rp->r_statelock);
   1968 	nfs4_attrcache_noinval(vp, &gar, gethrtime()); /* force update */
   1969 	sfh4_update(rp->r_fh, &newfh);
   1970 	ASSERT(nfs4_consistent_type(vp));
   1971 
   1972 	/*
   1973 	 * If we got parent info, use it to update the parent
   1974 	 */
   1975 	if (newpfh.nfs_fh4_len != 0) {
   1976 		if (rp->r_svnode.sv_dfh != NULL)
   1977 			sfh4_update(rp->r_svnode.sv_dfh, &newpfh);
   1978 		if (dvp != NULL) {
   1979 			/* force update of attrs */
   1980 			nfs4_attrcache_noinval(dvp, &pgar, gethrtime());
   1981 		}
   1982 	}
   1983 done:
   1984 	if (newfh.nfs_fh4_len != 0)
   1985 		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
   1986 	if (newpfh.nfs_fh4_len != 0)
   1987 		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
   1988 	if (cred_otw != NULL)
   1989 		crfree(cred_otw);
   1990 	if (rootvp != NULL)
   1991 		VN_RELE(rootvp);
   1992 	if (dvp != NULL)
   1993 		VN_RELE(dvp);
   1994 	if (osp != NULL)
   1995 		open_stream_rele(osp, rp);
   1996 }
   1997 
   1998 /*
   1999  * Client-side failover support: remap the filehandle for vp if it appears
   2000  * necessary.  errors are returned via the nfs4_error_t parameter; though,
   2001  * if there is a problem, we will just try again later.
   2002  */
   2003 
   2004 void
   2005 nfs4_check_remap(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
   2006 {
   2007 	if (vp == NULL)
   2008 		return;
   2009 
   2010 	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY))
   2011 		return;
   2012 
   2013 	if (VTOR4(vp)->r_server == mi->mi_curr_serv)
   2014 		return;
   2015 
   2016 	nfs4_remap_file(mi, vp, flags, ep);
   2017 }
   2018 
   2019 /*
   2020  * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
   2021  *
   2022  * Our caller has a filehandle for ".." relative to a particular
   2023  * directory object.  We want to find or create a parent vnode
   2024  * with that filehandle and return it.  We can of course create
   2025  * a vnode from this filehandle, but we need to also make sure
   2026  * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
   2027  * that we have a parent FH for future reopens as well.  If
   2028  * we have a remap failure, we won't be able to reopen this
   2029  * file, but we won't treat that as fatal because a reopen
   2030  * is at least unlikely.  Someday nfs4_reopen() should look
   2031  * for a missing parent FH and try a remap to recover from it.
   2032  *
   2033  * need_start_op argument indicates whether this function should
   2034  * do a start_op before calling remap_lookup().  This should
   2035  * be FALSE, if you are the recovery thread or in an op; otherwise,
   2036  * set it to TRUE.
   2037  */
   2038 int
   2039 nfs4_make_dotdot(nfs4_sharedfh_t *fhp, hrtime_t t, vnode_t *dvp,
   2040     cred_t *cr, vnode_t **vpp, int need_start_op)
   2041 {
   2042 	mntinfo4_t *mi = VTOMI4(dvp);
   2043 	nfs4_fname_t *np = NULL, *pnp = NULL;
   2044 	vnode_t *vp = NULL, *rootvp = NULL;
   2045 	rnode4_t *rp;
   2046 	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
   2047 	nfs4_ga_res_t gar, pgar;
   2048 	vattr_t va, pva;
   2049 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   2050 	nfs4_sharedfh_t *sfh = NULL, *psfh = NULL;
   2051 	nfs4_recov_state_t recov_state;
   2052 
   2053 #ifdef DEBUG
   2054 	/*
   2055 	 * ensure need_start_op is correct
   2056 	 */
   2057 	{
   2058 		int no_need_start_op = (tsd_get(nfs4_tsd_key) ||
   2059 		    (curthread == mi->mi_recovthread));
   2060 		/* C needs a ^^ operator! */
   2061 		ASSERT(((need_start_op) && (!no_need_start_op)) ||
   2062 		    ((! need_start_op) && (no_need_start_op)));
   2063 	}
   2064 #endif
   2065 	ASSERT(VTOMI4(dvp)->mi_zone == nfs_zone());
   2066 
   2067 	NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE,
   2068 	    "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp,
   2069 	    rnode4info(VTOR4(dvp))));
   2070 
   2071 	/*
   2072 	 * rootvp might be needed eventually. Holding it now will
   2073 	 * ensure that r4find_unlocked() will find it, if ".." is the root.
   2074 	 */
   2075 	e.error = VFS_ROOT(mi->mi_vfsp, &rootvp);
   2076 	if (e.error != 0)
   2077 		goto out;
   2078 	rp = r4find_unlocked(fhp, mi->mi_vfsp);
   2079 	if (rp != NULL) {
   2080 		*vpp = RTOV4(rp);
   2081 		VN_RELE(rootvp);
   2082 		return (0);
   2083 	}
   2084 
   2085 	/*
   2086 	 * Since we don't have the rnode, we have to go over the wire.
   2087 	 * remap_lookup() can get all of the filehandles and attributes
   2088 	 * we need in one operation.
   2089 	 */
   2090 	np = fn_parent(VTOSV(dvp)->sv_name);
   2091 	ASSERT(np != NULL);
   2092 
   2093 	recov_state.rs_flags = 0;
   2094 	recov_state.rs_num_retry_despite_err = 0;
   2095 recov_retry:
   2096 	if (need_start_op) {
   2097 		e.error = nfs4_start_fop(mi, rootvp, NULL, OH_LOOKUP,
   2098 		    &recov_state, NULL);
   2099 		if (e.error != 0) {
   2100 			goto out;
   2101 		}
   2102 	}
   2103 	va.va_type = VNON;
   2104 	pva.va_type = VNON;
   2105 	remap_lookup(np, rootvp, RML_ORDINARY, cr,
   2106 	    &newfh, &gar, &newpfh, &pgar, &e);
   2107 	if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
   2108 		if (need_start_op) {
   2109 			bool_t abort;
   2110 
   2111 			abort = nfs4_start_recovery(&e, mi,
   2112 			    rootvp, NULL, NULL, NULL, OP_LOOKUP, NULL);
   2113 			if (abort) {
   2114 				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
   2115 				    &recov_state, FALSE);
   2116 				if (e.error == 0)
   2117 					e.error = EIO;
   2118 				goto out;
   2119 			}
   2120 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
   2121 			    &recov_state, TRUE);
   2122 			goto recov_retry;
   2123 		}
   2124 		if (e.error == 0)
   2125 			e.error = EIO;
   2126 		goto out;
   2127 	}
   2128 
   2129 	if (!e.error) {
   2130 		va = gar.n4g_va;
   2131 		pva = pgar.n4g_va;
   2132 	}
   2133 
   2134 	if ((e.error != 0) ||
   2135 	    (va.va_type != VDIR)) {
   2136 		if (need_start_op)
   2137 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
   2138 			    &recov_state, FALSE);
   2139 		if (e.error == 0)
   2140 			e.error = EIO;
   2141 		goto out;
   2142 	}
   2143 
   2144 	if (e.stat != NFS4_OK) {
   2145 		if (need_start_op)
   2146 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
   2147 			    &recov_state, FALSE);
   2148 		e.error = EIO;
   2149 		goto out;
   2150 	}
   2151 
   2152 	/*
   2153 	 * It is possible for remap_lookup() to return with no error,
   2154 	 * but without providing the parent filehandle and attrs.
   2155 	 */
   2156 	if (pva.va_type != VDIR) {
   2157 		/*
   2158 		 * Call remap_lookup() again, this time with the
   2159 		 * newpfh and pgar args in the first position.
   2160 		 */
   2161 		pnp = fn_parent(np);
   2162 		if (pnp != NULL) {
   2163 			remap_lookup(pnp, rootvp, RML_ORDINARY, cr,
   2164 			    &newpfh, &pgar, NULL, NULL, &e);
   2165 			if (nfs4_needs_recovery(&e, FALSE,
   2166 			    mi->mi_vfsp)) {
   2167 				if (need_start_op) {
   2168 					bool_t abort;
   2169 
   2170 					abort = nfs4_start_recovery(&e, mi,
   2171 					    rootvp, NULL, NULL, NULL,
   2172 					    OP_LOOKUP, NULL);
   2173 					if (abort) {
   2174 						nfs4_end_fop(mi, rootvp, NULL,
   2175 						    OH_LOOKUP, &recov_state,
   2176 						    FALSE);
   2177 						if (e.error == 0)
   2178 							e.error = EIO;
   2179 						goto out;
   2180 					}
   2181 					nfs4_end_fop(mi, rootvp, NULL,
   2182 					    OH_LOOKUP, &recov_state, TRUE);
   2183 					goto recov_retry;
   2184 				}
   2185 				if (e.error == 0)
   2186 					e.error = EIO;
   2187 				goto out;
   2188 			}
   2189 
   2190 			if (e.stat != NFS4_OK) {
   2191 				if (need_start_op)
   2192 					nfs4_end_fop(mi, rootvp, NULL,
   2193 					    OH_LOOKUP, &recov_state, FALSE);
   2194 				e.error = EIO;
   2195 				goto out;
   2196 			}
   2197 		}
   2198 		if ((pnp == NULL) ||
   2199 		    (e.error != 0) ||
   2200 		    (pva.va_type == VNON)) {
   2201 			if (need_start_op)
   2202 				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
   2203 				    &recov_state, FALSE);
   2204 			if (e.error == 0)
   2205 				e.error = EIO;
   2206 			goto out;
   2207 		}
   2208 	}
   2209 	ASSERT(newpfh.nfs_fh4_len != 0);
   2210 	if (need_start_op)
   2211 		nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP, &recov_state, FALSE);
   2212 	psfh = sfh4_get(&newpfh, mi);
   2213 
   2214 	sfh = sfh4_get(&newfh, mi);
   2215 	vp = makenfs4node_by_fh(sfh, psfh, &np, &gar, mi, cr, t);
   2216 
   2217 out:
   2218 	if (np != NULL)
   2219 		fn_rele(&np);
   2220 	if (pnp != NULL)
   2221 		fn_rele(&pnp);
   2222 	if (newfh.nfs_fh4_len != 0)
   2223 		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
   2224 	if (newpfh.nfs_fh4_len != 0)
   2225 		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
   2226 	if (sfh != NULL)
   2227 		sfh4_rele(&sfh);
   2228 	if (psfh != NULL)
   2229 		sfh4_rele(&psfh);
   2230 	if (rootvp != NULL)
   2231 		VN_RELE(rootvp);
   2232 	*vpp = vp;
   2233 	return (e.error);
   2234 }
   2235 
#ifdef DEBUG
/*
 * Debug-only counter, initialised to zero.  It is not referenced in the
 * nearby code.  NOTE(review): presumably it tracks memory used for rnode
 * path strings -- confirm against its users elsewhere in the file.
 */
size_t r_path_memuse = 0;
#endif
   2239 
   2240 /*
   2241  * NFS client failover support
   2242  *
   2243  * sv4_free() frees the malloc'd portion of a "servinfo_t".
   2244  */
   2245 void
   2246 sv4_free(servinfo4_t *svp)
   2247 {
   2248 	servinfo4_t *next;
   2249 	struct knetconfig *knconf;
   2250 
   2251 	while (svp != NULL) {
   2252 		next = svp->sv_next;
   2253 		if (svp->sv_dhsec)
   2254 			sec_clnt_freeinfo(svp->sv_dhsec);
   2255 		if (svp->sv_secdata)
   2256 			sec_clnt_freeinfo(svp->sv_secdata);
   2257 		if (svp->sv_save_secinfo &&
   2258 		    svp->sv_save_secinfo != svp->sv_secinfo)
   2259 			secinfo_free(svp->sv_save_secinfo);
   2260 		if (svp->sv_secinfo)
   2261 			secinfo_free(svp->sv_secinfo);
   2262 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
   2263 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
   2264 		knconf = svp->sv_knconf;
   2265 		if (knconf != NULL) {
   2266 			if (knconf->knc_protofmly != NULL)
   2267 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
   2268 			if (knconf->knc_proto != NULL)
   2269 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
   2270 			kmem_free(knconf, sizeof (*knconf));
   2271 		}
   2272 		knconf = svp->sv_origknconf;
   2273 		if (knconf != NULL) {
   2274 			if (knconf->knc_protofmly != NULL)
   2275 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
   2276 			if (knconf->knc_proto != NULL)
   2277 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
   2278 			kmem_free(knconf, sizeof (*knconf));
   2279 		}
   2280 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
   2281 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
   2282 		if (svp->sv_path != NULL) {
   2283 			kmem_free(svp->sv_path, svp->sv_pathlen);
   2284 		}
   2285 		nfs_rw_destroy(&svp->sv_lock);
   2286 		kmem_free(svp, sizeof (*svp));
   2287 		svp = next;
   2288 	}
   2289 }
   2290 
   2291 void
   2292 nfs4_printfhandle(nfs4_fhandle_t *fhp)
   2293 {
   2294 	int *ip;
   2295 	char *buf;
   2296 	size_t bufsize;
   2297 	char *cp;
   2298 
   2299 	/*
   2300 	 * 13 == "(file handle:"
   2301 	 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
   2302 	 *	1 == ' '
   2303 	 *	8 == maximum strlen of "%x"
   2304 	 * 3 == ")\n\0"
   2305 	 */
   2306 	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
   2307 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
   2308 	if (buf == NULL)
   2309 		return;
   2310 
   2311 	cp = buf;
   2312 	(void) strcpy(cp, "(file handle:");
   2313 	while (*cp != '\0')
   2314 		cp++;
   2315 	for (ip = (int *)fhp->fh_buf;
   2316 	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
   2317 	    ip++) {
   2318 		(void) sprintf(cp, " %x", *ip);
   2319 		while (*cp != '\0')
   2320 			cp++;
   2321 	}
   2322 	(void) strcpy(cp, ")\n");
   2323 
   2324 	zcmn_err(getzoneid(), CE_CONT, "%s", buf);
   2325 
   2326 	kmem_free(buf, bufsize);
   2327 }
   2328 
   2329 /*
   2330  * The NFSv4 readdir cache subsystem.
   2331  *
   2332  * We provide a set of interfaces to allow the rest of the system to utilize
   2333  * a caching mechanism while encapsulating the details of the actual
   2334  * implementation.  This should allow for better maintainability and
   2335  * extensibility by consolidating the implementation details in one location.
   2336  */
   2337 
   2338 /*
   2339  * Comparator used by AVL routines.
   2340  */
   2341 static int
   2342 rddir4_cache_compar(const void *x, const void *y)
   2343 {
   2344 	rddir4_cache_impl *ai = (rddir4_cache_impl *)x;
   2345 	rddir4_cache_impl *bi = (rddir4_cache_impl *)y;
   2346 	rddir4_cache *a = &ai->rc;
   2347 	rddir4_cache *b = &bi->rc;
   2348 
   2349 	if (a->nfs4_cookie == b->nfs4_cookie) {
   2350 		if (a->buflen == b->buflen)
   2351 			return (0);
   2352 		if (a->buflen < b->buflen)
   2353 			return (-1);
   2354 		return (1);
   2355 	}
   2356 
   2357 	if (a->nfs4_cookie < b->nfs4_cookie)
   2358 			return (-1);
   2359 
   2360 	return (1);
   2361 }
   2362 
   2363 /*
   2364  * Allocate an opaque handle for the readdir cache.
   2365  */
   2366 void
   2367 rddir4_cache_create(rnode4_t *rp)
   2368 {
   2369 	ASSERT(rp->r_dir == NULL);
   2370 
   2371 	rp->r_dir = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
   2372 
   2373 	avl_create(rp->r_dir, rddir4_cache_compar, sizeof (rddir4_cache_impl),
   2374 	    offsetof(rddir4_cache_impl, tree));
   2375 }
   2376 
   2377 /*
   2378  *  Purge the cache of all cached readdir responses.
   2379  */
   2380 void
   2381 rddir4_cache_purge(rnode4_t *rp)
   2382 {
   2383 	rddir4_cache_impl	*rdip;
   2384 	rddir4_cache_impl	*nrdip;
   2385 
   2386 	ASSERT(MUTEX_HELD(&rp->r_statelock));
   2387 
   2388 	if (rp->r_dir == NULL)
   2389 		return;
   2390 
   2391 	rdip = avl_first(rp->r_dir);
   2392 
   2393 	while (rdip != NULL) {
   2394 		nrdip = AVL_NEXT(rp->r_dir, rdip);
   2395 		avl_remove(rp->r_dir, rdip);
   2396 		rdip->rc.flags &= ~RDDIRCACHED;
   2397 		rddir4_cache_rele(rp, &rdip->rc);
   2398 		rdip = nrdip;
   2399 	}
   2400 	ASSERT(avl_numnodes(rp->r_dir) == 0);
   2401 }
   2402 
   2403 /*
   2404  * Destroy the readdir cache.
   2405  */
   2406 void
   2407 rddir4_cache_destroy(rnode4_t *rp)
   2408 {
   2409 	ASSERT(MUTEX_HELD(&rp->r_statelock));
   2410 	if (rp->r_dir == NULL)
   2411 		return;
   2412 
   2413 	rddir4_cache_purge(rp);
   2414 	avl_destroy(rp->r_dir);
   2415 	kmem_free(rp->r_dir, sizeof (avl_tree_t));
   2416 	rp->r_dir = NULL;
   2417 }
   2418 
   2419 /*
   2420  * Locate a readdir response from the readdir cache.
   2421  *
   2422  * Return values:
   2423  *
   2424  * NULL - If there is an unrecoverable situation like the operation may have
   2425  *	  been interrupted.
   2426  *
   2427  * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
   2428  *		    The flags are set approprately, such that the caller knows
   2429  *		    what state the entry is in.
   2430  */
   2431 rddir4_cache *
   2432 rddir4_cache_lookup(rnode4_t *rp, offset_t cookie, int count)
   2433 {
   2434 	rddir4_cache_impl	*rdip = NULL;
   2435 	rddir4_cache_impl	srdip;
   2436 	rddir4_cache		*srdc;
   2437 	rddir4_cache		*rdc = NULL;
   2438 	rddir4_cache		*nrdc = NULL;
   2439 	avl_index_t		where;
   2440 
   2441 top:
   2442 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
   2443 	ASSERT(MUTEX_HELD(&rp->r_statelock));
   2444 	/*
   2445 	 * Check to see if the readdir cache has been disabled.  If so, then
   2446 	 * simply allocate an rddir4_cache entry and return it, since caching
   2447 	 * operations do not apply.
   2448 	 */
   2449 	if (rp->r_dir == NULL) {
   2450 		if (nrdc == NULL) {
   2451 			/*
   2452 			 * Drop the lock because we are doing a sleeping
   2453 			 * allocation.
   2454 			 */
   2455 			mutex_exit(&rp->r_statelock);
   2456 			rdc = rddir4_cache_alloc(KM_SLEEP);
   2457 			rdc->nfs4_cookie = cookie;
   2458 			rdc->buflen = count;
   2459 			mutex_enter(&rp->r_statelock);
   2460 			return (rdc);
   2461 		}
   2462 		return (nrdc);
   2463 	}
   2464 
   2465 	srdc = &srdip.rc;
   2466 	srdc->nfs4_cookie = cookie;
   2467 	srdc->buflen = count;
   2468 
   2469 	rdip = avl_find(rp->r_dir, &srdip, &where);
   2470 
   2471 	/*
   2472 	 * If we didn't find an entry then create one and insert it
   2473 	 * into the cache.
   2474 	 */
   2475 	if (rdip == NULL) {
   2476 		/*
   2477 		 * Check for the case where we have made a second pass through
   2478 		 * the cache due to a lockless allocation.  If we find that no
   2479 		 * thread has already inserted this entry, do the insert now
   2480 		 * and return.
   2481 		 */
   2482 		if (nrdc != NULL) {
   2483 			avl_insert(rp->r_dir, nrdc->data, where);
   2484 			nrdc->flags |= RDDIRCACHED;
   2485 			rddir4_cache_hold(nrdc);
   2486 			return (nrdc);
   2487 		}
   2488 
   2489 #ifdef DEBUG
   2490 		nfs4_readdir_cache_misses++;
   2491 #endif
   2492 		/*
   2493 		 * First, try to allocate an entry without sleeping.  If that
   2494 		 * fails then drop the lock and do a sleeping allocation.
   2495 		 */
   2496 		nrdc = rddir4_cache_alloc(KM_NOSLEEP);
   2497 		if (nrdc != NULL) {
   2498 			nrdc->nfs4_cookie = cookie;
   2499 			nrdc->buflen = count;
   2500 			avl_insert(rp->r_dir, nrdc->data, where);
   2501 			nrdc->flags |= RDDIRCACHED;
   2502 			rddir4_cache_hold(nrdc);
   2503 			return (nrdc);
   2504 		}
   2505 
   2506 		/*
   2507 		 * Drop the lock and do a sleeping allocation.	We incur
   2508 		 * additional overhead by having to search the cache again,
   2509 		 * but this case should be rare.
   2510 		 */
   2511 		mutex_exit(&rp->r_statelock);
   2512 		nrdc = rddir4_cache_alloc(KM_SLEEP);
   2513 		nrdc->nfs4_cookie = cookie;
   2514 		nrdc->buflen = count;
   2515 		mutex_enter(&rp->r_statelock);
   2516 		/*
   2517 		 * We need to take another pass through the cache
   2518 		 * since we dropped our lock to perform the alloc.
   2519 		 * Another thread may have come by and inserted the
   2520 		 * entry we are interested in.
   2521 		 */
   2522 		goto top;
   2523 	}
   2524 
   2525 	/*
   2526 	 * Check to see if we need to free our entry.  This can happen if
   2527 	 * another thread came along beat us to the insert.  We can
   2528 	 * safely call rddir4_cache_free directly because no other thread
   2529 	 * would have a reference to this entry.
   2530 	 */
   2531 	if (nrdc != NULL)
   2532 		rddir4_cache_free((rddir4_cache_impl *)nrdc->data);
   2533 
   2534 #ifdef DEBUG
   2535 	nfs4_readdir_cache_hits++;
   2536 #endif
   2537 	/*
   2538 	 * Found something.  Make sure it's ready to return.
   2539 	 */
   2540 	rdc = &rdip->rc;
   2541 	rddir4_cache_hold(rdc);
   2542 	/*
   2543 	 * If the cache entry is in the process of being filled in, wait
   2544 	 * until this completes.  The RDDIRWAIT bit is set to indicate that
   2545 	 * someone is waiting and when the thread currently filling the entry
   2546 	 * is done, it should do a cv_broadcast to wakeup all of the threads
   2547 	 * waiting for it to finish. If the thread wakes up to find that
   2548 	 * someone new is now trying to complete the the entry, go back
   2549 	 * to sleep.
   2550 	 */
   2551 	while (rdc->flags & RDDIR) {
   2552 		/*
   2553 		 * The entry is not complete.
   2554 		 */
   2555 		nfs_rw_exit(&rp->r_rwlock);
   2556 		rdc->flags |= RDDIRWAIT;
   2557 #ifdef DEBUG
   2558 		nfs4_readdir_cache_waits++;
   2559 #endif
   2560 		while (rdc->flags & RDDIRWAIT) {
   2561 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
   2562 				/*
   2563 				 * We got interrupted, probably the user
   2564 				 * typed ^C or an alarm fired.  We free the
   2565 				 * new entry if we allocated one.
   2566 				 */
   2567 				rddir4_cache_rele(rp, rdc);
   2568 				mutex_exit(&rp->r_statelock);
   2569 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
   2570 				    RW_READER, FALSE);
   2571 				mutex_enter(&rp->r_statelock);
   2572 				return (NULL);
   2573 			}
   2574 		}
   2575 		mutex_exit(&rp->r_statelock);
   2576 		(void) nfs_rw_enter_sig(&rp->r_rwlock,
   2577 		    RW_READER, FALSE);
   2578 		mutex_enter(&rp->r_statelock);
   2579 	}
   2580 
   2581 	/*
   2582 	 * The entry we were waiting on may have been purged from
   2583 	 * the cache and should no longer be used, release it and
   2584 	 * start over.
   2585 	 */
   2586 	if (!(rdc->flags & RDDIRCACHED)) {
   2587 		rddir4_cache_rele(rp, rdc);
   2588 		goto top;
   2589 	}
   2590 
   2591 	/*
   2592 	 * The entry is completed.  Return it.
   2593 	 */
   2594 	return (rdc);
   2595 }
   2596 
   2597 /*
   2598  * Allocate a cache element and return it.  Can return NULL if memory is
   2599  * low.
   2600  */
   2601 static rddir4_cache *
   2602 rddir4_cache_alloc(int flags)
   2603 {
   2604 	rddir4_cache_impl	*rdip = NULL;
   2605 	rddir4_cache		*rc = NULL;
   2606 
   2607 	rdip = kmem_alloc(sizeof (rddir4_cache_impl), flags);
   2608 
   2609 	if (rdip != NULL) {
   2610 		rc = &rdip->rc;
   2611 		rc->data = (void *)rdip;
   2612 		rc->nfs4_cookie = 0;
   2613 		rc->nfs4_ncookie = 0;
   2614 		rc->entries = NULL;
   2615 		rc->eof = 0;
   2616 		rc->entlen = 0;
   2617 		rc->buflen = 0;
   2618 		rc->actlen = 0;
   2619 		/*
   2620 		 * A readdir is required so set the flag.
   2621 		 */
   2622 		rc->flags = RDDIRREQ;
   2623 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
   2624 		rc->error = 0;
   2625 		mutex_init(&rdip->lock, NULL, MUTEX_DEFAULT, NULL);
   2626 		rdip->count = 1;
   2627 #ifdef DEBUG
   2628 		atomic_add_64(&clstat4_debug.dirent.value.ui64, 1);
   2629 #endif
   2630 	}
   2631 	return (rc);
   2632 }
   2633 
   2634 /*
   2635  * Increment the reference count to this cache element.
   2636  */
   2637 static void
   2638 rddir4_cache_hold(rddir4_cache *rc)
   2639 {
   2640 	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rc->data;
   2641 
   2642 	mutex_enter(&rdip->lock);
   2643 	rdip->count++;
   2644 	mutex_exit(&rdip->lock);
   2645 }
   2646 
   2647 /*
   2648  * Release a reference to this cache element.  If the count is zero then
   2649  * free the element.
   2650  */
   2651 void
   2652 rddir4_cache_rele(rnode4_t *rp, rddir4_cache *rdc)
   2653 {
   2654 	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rdc->data;
   2655 
   2656 	ASSERT(MUTEX_HELD(&rp->r_statelock));
   2657 
   2658 	/*
   2659 	 * Check to see if we have any waiters.  If so, we can wake them
   2660 	 * so that they can proceed.
   2661 	 */
   2662 	if (rdc->flags & RDDIRWAIT) {
   2663 		rdc->flags &= ~RDDIRWAIT;
   2664 		cv_broadcast(&rdc->cv);
   2665 	}
   2666 
   2667 	mutex_enter(&rdip->lock);
   2668 	ASSERT(rdip->count > 0);
   2669 	if (--rdip->count == 0) {
   2670 		mutex_exit(&rdip->lock);
   2671 		rddir4_cache_free(rdip);
   2672 	} else
   2673 		mutex_exit(&rdip->lock);
   2674 }
   2675 
   2676 /*
   2677  * Free a cache element.
   2678  */
   2679 static void
   2680 rddir4_cache_free(rddir4_cache_impl *rdip)
   2681 {
   2682 	rddir4_cache *rc = &rdip->rc;
   2683 
   2684 #ifdef DEBUG
   2685 	atomic_add_64(&clstat4_debug.dirent.value.ui64, -1);
   2686 #endif
   2687 	if (rc->entries != NULL)
   2688 		kmem_free(rc->entries, rc->buflen);
   2689 	cv_destroy(&rc->cv);
   2690 	mutex_destroy(&rdip->lock);
   2691 	kmem_free(rdip, sizeof (*rdip));
   2692 }
   2693 
   2694 /*
   2695  * Snapshot callback for nfs:0:nfs4_client as registered with the kstat
   2696  * framework.
   2697  */
   2698 static int
   2699 cl4_snapshot(kstat_t *ksp, void *buf, int rw)
   2700 {
   2701 	ksp->ks_snaptime = gethrtime();
   2702 	if (rw == KSTAT_WRITE) {
   2703 		bcopy(buf, ksp->ks_private, sizeof (clstat4_tmpl));
   2704 #ifdef DEBUG
   2705 		/*
   2706 		 * Currently only the global zone can write to kstats, but we
   2707 		 * add the check just for paranoia.
   2708 		 */
   2709 		if (INGLOBALZONE(curproc))
   2710 			bcopy((char *)buf + sizeof (clstat4_tmpl),
   2711 			    &clstat4_debug, sizeof (clstat4_debug));
   2712 #endif
   2713 	} else {
   2714 		bcopy(ksp->ks_private, buf, sizeof (clstat4_tmpl));
   2715 #ifdef DEBUG
   2716 		/*
   2717 		 * If we're displaying the "global" debug kstat values, we
   2718 		 * display them as-is to all zones since in fact they apply to
   2719 		 * the system as a whole.
   2720 		 */
   2721 		bcopy(&clstat4_debug, (char *)buf + sizeof (clstat4_tmpl),
   2722 		    sizeof (clstat4_debug));
   2723 #endif
   2724 	}
   2725 	return (0);
   2726 }
   2727 
   2728 
   2729 
   2730 /*
   2731  * Zone support
   2732  */
   2733 static void *
   2734 clinit4_zone(zoneid_t zoneid)
   2735 {
   2736 	kstat_t *nfs4_client_kstat;
   2737 	struct nfs4_clnt *nfscl;
   2738 	uint_t ndata;
   2739 
   2740 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
   2741 	mutex_init(&nfscl->nfscl_chtable4_lock, NULL, MUTEX_DEFAULT, NULL);
   2742 	nfscl->nfscl_chtable4 = NULL;
   2743 	nfscl->nfscl_zoneid = zoneid;
   2744 
   2745 	bcopy(&clstat4_tmpl, &nfscl->nfscl_stat, sizeof (clstat4_tmpl));
   2746 	ndata = sizeof (clstat4_tmpl) / sizeof (kstat_named_t);
   2747 #ifdef DEBUG
   2748 	ndata += sizeof (clstat4_debug) / sizeof (kstat_named_t);
   2749 #endif
   2750 	if ((nfs4_client_kstat = kstat_create_zone("nfs", 0, "nfs4_client",
   2751 	    "misc", KSTAT_TYPE_NAMED, ndata,
   2752 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
   2753 		nfs4_client_kstat->ks_private = &nfscl->nfscl_stat;
   2754 		nfs4_client_kstat->ks_snapshot = cl4_snapshot;
   2755 		kstat_install(nfs4_client_kstat);
   2756 	}
   2757 	mutex_enter(&nfs4_clnt_list_lock);
   2758 	list_insert_head(&nfs4_clnt_list, nfscl);
   2759 	mutex_exit(&nfs4_clnt_list_lock);
   2760 	return (nfscl);
   2761 }
   2762 
   2763 /*ARGSUSED*/
   2764 static void
   2765 clfini4_zone(zoneid_t zoneid, void *arg)
   2766 {
   2767 	struct nfs4_clnt *nfscl = arg;
   2768 	chhead_t *chp, *next;
   2769 
   2770 	if (nfscl == NULL)
   2771 		return;
   2772 	mutex_enter(&nfs4_clnt_list_lock);
   2773 	list_remove(&nfs4_clnt_list, nfscl);
   2774 	mutex_exit(&nfs4_clnt_list_lock);
   2775 	clreclaim4_zone(nfscl, 0);
   2776 	for (chp = nfscl->nfscl_chtable4; chp != NULL; chp = next) {
   2777 		ASSERT(chp->ch_list == NULL);
   2778 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
   2779 		next = chp->ch_next;
   2780 		kmem_free(chp, sizeof (*chp));
   2781 	}
   2782 	kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid);
   2783 	mutex_destroy(&nfscl->nfscl_chtable4_lock);
   2784 	kmem_free(nfscl, sizeof (*nfscl));
   2785 }
   2786 
   2787 /*
   2788  * Called by endpnt_destructor to make sure the client handles are
   2789  * cleaned up before the RPC endpoints.  This becomes a no-op if
   2790  * clfini_zone (above) is called first.  This function is needed
   2791  * (rather than relying on clfini_zone to clean up) because the ZSD
   2792  * callbacks have no ordering mechanism, so we have no way to ensure
   2793  * that clfini_zone is called before endpnt_destructor.
   2794  */
   2795 void
   2796 clcleanup4_zone(zoneid_t zoneid)
   2797 {
   2798 	struct nfs4_clnt *nfscl;
   2799 
   2800 	mutex_enter(&nfs4_clnt_list_lock);
   2801 	nfscl = list_head(&nfs4_clnt_list);
   2802 	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl)) {
   2803 		if (nfscl->nfscl_zoneid == zoneid) {
   2804 			clreclaim4_zone(nfscl, 0);
   2805 			break;
   2806 		}
   2807 	}
   2808 	mutex_exit(&nfs4_clnt_list_lock);
   2809 }
   2810 
   2811 int
   2812 nfs4_subr_init(void)
   2813 {
   2814 	/*
   2815 	 * Allocate and initialize the client handle cache
   2816 	 */
   2817 	chtab4_cache = kmem_cache_create("client_handle4_cache",
   2818 	    sizeof (struct chtab), 0, NULL, NULL, clreclaim4, NULL,
   2819 	    NULL, 0);
   2820 
   2821 	/*
   2822 	 * Initialize the list of per-zone client handles (and associated data).
   2823 	 * This needs to be done before we call zone_key_create().
   2824 	 */
   2825 	list_create(&nfs4_clnt_list, sizeof (struct nfs4_clnt),
   2826 	    offsetof(struct nfs4_clnt, nfscl_node));
   2827 
   2828 	/*
   2829 	 * Initialize the zone_key for per-zone client handle lists.
   2830 	 */
   2831 	zone_key_create(&nfs4clnt_zone_key, clinit4_zone, NULL, clfini4_zone);
   2832 
   2833 	if (nfs4err_delay_time == 0)
   2834 		nfs4err_delay_time = NFS4ERR_DELAY_TIME;
   2835 
   2836 	return (0);
   2837 }
   2838 
   2839 int
   2840 nfs4_subr_fini(void)
   2841 {
   2842 	/*
   2843 	 * Deallocate the client handle cache
   2844 	 */
   2845 	kmem_cache_destroy(chtab4_cache);
   2846 
   2847 	/*
   2848 	 * Destroy the zone_key
   2849 	 */
   2850 	(void) zone_key_delete(nfs4clnt_zone_key);
   2851 
   2852 	return (0);
   2853 }
   2854 /*
   2855  * Set or Clear direct I/O flag
   2856  * VOP_RWLOCK() is held for write access to prevent a race condition
   2857  * which would occur if a process is in the middle of a write when
   2858  * directio flag gets set. It is possible that all pages may not get flushed.
   2859  *
   2860  * This is a copy of nfs_directio, changes here may need to be made
   2861  * there and vice versa.
   2862  */
   2863 
   2864 int
   2865 nfs4_directio(vnode_t *vp, int cmd, cred_t *cr)
   2866 {
   2867 	int	error = 0;
   2868 	rnode4_t *rp;
   2869 
   2870 	rp = VTOR4(vp);
   2871 
   2872 	if (cmd == DIRECTIO_ON) {
   2873 
   2874 		if (rp->r_flags & R4DIRECTIO)
   2875 			return (0);
   2876 
   2877 		/*
   2878 		 * Flush the page cache.
   2879 		 */
   2880 
   2881 		(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
   2882 
   2883 		if (rp->r_flags & R4DIRECTIO) {
   2884 			VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
   2885 			return (0);
   2886 		}
   2887 
   2888 		if (nfs4_has_pages(vp) &&
   2889 		    ((rp->r_flags & R4DIRTY) || rp->r_awcount > 0)) {
   2890 			error = VOP_PUTPAGE(vp, (offset_t)0, (uint_t)0,
   2891 			    B_INVAL, cr, NULL);
   2892 			if (error) {
   2893 				if (error == ENOSPC || error == EDQUOT) {
   2894 					mutex_enter(&rp->r_statelock);
   2895 					if (!rp->r_error)
   2896 						rp->r_error = error;
   2897 					mutex_exit(&rp->r_statelock);
   2898 				}
   2899 				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
   2900 				return (error);
   2901 			}
   2902 		}
   2903 
   2904 		mutex_enter(&rp->r_statelock);
   2905 		rp->r_flags |= R4DIRECTIO;
   2906 		mutex_exit(&rp->r_statelock);
   2907 		VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
   2908 		return (0);
   2909 	}
   2910 
   2911 	if (cmd == DIRECTIO_OFF) {
   2912 		mutex_enter(&rp->r_statelock);
   2913 		rp->r_flags &= ~R4DIRECTIO;	/* disable direct mode */
   2914 		mutex_exit(&rp->r_statelock);
   2915 		return (0);
   2916 	}
   2917 
   2918 	return (EINVAL);
   2919 }
   2920 
   2921 /*
   2922  * Return TRUE if the file has any pages.  Always go back to
   2923  * the master vnode to check v_pages since none of the shadows
   2924  * can have pages.
   2925  */
   2926 
   2927 bool_t
   2928 nfs4_has_pages(vnode_t *vp)
   2929 {
   2930 	rnode4_t *rp;
   2931 
   2932 	rp = VTOR4(vp);
   2933 	if (IS_SHADOW(vp, rp))
   2934 		vp = RTOV4(rp);	/* RTOV4 always gives the master */
   2935 
   2936 	return (vn_has_cached_data(vp));
   2937 }
   2938 
   2939 /*
   2940  * This table is used to determine whether the client should attempt
   2941  * failover based on the clnt_stat value returned by CLNT_CALL.  The
   2942  * clnt_stat is used as an index into the table.  If
   2943  * the error value that corresponds to the clnt_stat value in the
   2944  * table is non-zero, then that is the error to be returned AND
   2945  * that signals that failover should be attempted.
   2946  *
   2947  * Special note: If the RPC_ values change, then direct indexing of the
   2948  * table is no longer valid, but having the RPC_ values in the table
   2949  * allow the functions to detect the change and issue a warning.
   2950  * In this case, the code will always attempt failover as a defensive
   2951  * measure.
   2952  */
   2953 
   2954 static struct try_failover_tab {
   2955 	enum clnt_stat	cstat;
   2956 	int		error;
   2957 } try_failover_table [] = {
   2958 
   2959 	RPC_SUCCESS,		0,
   2960 	RPC_CANTENCODEARGS,	0,
   2961 	RPC_CANTDECODERES,	0,
   2962 	RPC_CANTSEND,		ECOMM,
   2963 	RPC_CANTRECV,		ECOMM,
   2964 	RPC_TIMEDOUT,		ETIMEDOUT,
   2965 	RPC_VERSMISMATCH,	0,
   2966 	RPC_AUTHERROR,		0,
   2967 	RPC_PROGUNAVAIL,	0,
   2968 	RPC_PROGVERSMISMATCH,	0,
   2969 	RPC_PROCUNAVAIL,	0,
   2970 	RPC_CANTDECODEARGS,	0,
   2971 	RPC_SYSTEMERROR,	ENOSR,
   2972 	RPC_UNKNOWNHOST,	EHOSTUNREACH,
   2973 	RPC_RPCBFAILURE,	ENETUNREACH,
   2974 	RPC_PROGNOTREGISTERED,	ECONNREFUSED,
   2975 	RPC_FAILED,		ETIMEDOUT,
   2976 	RPC_UNKNOWNPROTO,	EHOSTUNREACH,
   2977 	RPC_INTR,		0,
   2978 	RPC_UNKNOWNADDR,	EHOSTUNREACH,
   2979 	RPC_TLIERROR,		0,
   2980 	RPC_NOBROADCAST,	EHOSTUNREACH,
   2981 	RPC_N2AXLATEFAILURE,	ECONNREFUSED,
   2982 	RPC_UDERROR,		0,
   2983 	RPC_INPROGRESS,		0,
   2984 	RPC_STALERACHANDLE,	EINVAL,
   2985 	RPC_CANTCONNECT,	ECONNREFUSED,
   2986 	RPC_XPRTFAILED,		ECONNABORTED,
   2987 	RPC_CANTCREATESTREAM,	ECONNREFUSED,
   2988 	RPC_CANTSTORE,		ENOBUFS
   2989 };
   2990 
   2991 /*
   2992  * nfs4_try_failover - determine whether the client should
   2993  * attempt failover based on the values stored in the nfs4_error_t.
   2994  */
   2995 int
   2996 nfs4_try_failover(nfs4_error_t *ep)
   2997 {
   2998 	if (ep->error == ETIMEDOUT || ep->stat == NFS4ERR_RESOURCE)
   2999 		return (TRUE);
   3000 
   3001 	if (ep->error && ep->rpc_status != RPC_SUCCESS)
   3002 		return (try_failover(ep->rpc_status) != 0 ? TRUE : FALSE);
   3003 
   3004 	return (FALSE);
   3005 }
   3006 
   3007 /*
   3008  * try_failover - internal version of nfs4_try_failover, called
   3009  * only by rfscall and aclcall.  Determine if failover is warranted
   3010  * based on the clnt_stat and return the error number if it is.
   3011  */
   3012 static int
   3013 try_failover(enum clnt_stat rpc_status)
   3014 {
   3015 	int err = 0;
   3016 
   3017 	if (rpc_status == RPC_SUCCESS)
   3018 		return (0);
   3019 
   3020 #ifdef	DEBUG
   3021 	if (rpc_status != 0 && nfs4_try_failover_any) {
   3022 		err = ETIMEDOUT;
   3023 		goto done;
   3024 	}
   3025 #endif
   3026 	/*
   3027 	 * The rpc status is used as an index into the table.
   3028 	 * If the rpc status is outside of the range of the
   3029 	 * table or if the rpc error numbers have been changed
   3030 	 * since the table was constructed, then print a warning
   3031 	 * (DEBUG only) and try failover anyway.  Otherwise, just
   3032 	 * grab the resulting error number out of the table.
   3033 	 */
   3034 	if (rpc_status < RPC_SUCCESS || rpc_status >=
   3035 	    sizeof (try_failover_table)/sizeof (try_failover_table[0]) ||
   3036 	    try_failover_table[rpc_status].cstat != rpc_status) {
   3037 
   3038 		err = ETIMEDOUT;
   3039 #ifdef	DEBUG
   3040 		cmn_err(CE_NOTE, "try_failover: unexpected rpc error %d",
   3041 		    rpc_status);
   3042 #endif
   3043 	} else
   3044 		err = try_failover_table[rpc_status].error;
   3045 
   3046 done:
   3047 	if (rpc_status)
   3048 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   3049 		    "nfs4_try_failover: %strying failover on error %d",
   3050 		    err ? "" : "NOT ", rpc_status));
   3051 
   3052 	return (err);
   3053 }
   3054 
   3055 void
   3056 nfs4_error_zinit(nfs4_error_t *ep)
   3057 {
   3058 	ep->error = 0;
   3059 	ep->stat = NFS4_OK;
   3060 	ep->rpc_status = RPC_SUCCESS;
   3061 }
   3062 
   3063 void
   3064 nfs4_error_init(nfs4_error_t *ep, int error)
   3065 {
   3066 	ep->error = error;
   3067 	ep->stat = NFS4_OK;
   3068 	ep->rpc_status = RPC_SUCCESS;
   3069 }
   3070 
   3071 
   3072 #ifdef DEBUG
   3073 
   3074 /*
   3075  * Return a 16-bit hash for filehandle, stateid, clientid, owner.
   3076  * use the same algorithm as for NFS v3.
   3077  *
   3078  */
   3079 int
   3080 hash16(void *p, int len)
   3081 {
   3082 	int i, rem;
   3083 	uint_t *wp;
   3084 	uint_t key = 0;
   3085 
   3086 	/* protect against non word aligned */
   3087 	if ((rem = len & 3) != 0)
   3088 		len &= ~3;
   3089 
   3090 	for (i = 0, wp = (uint_t *)p; i < len; i += 4, wp++) {
   3091 		key ^= (*wp >> 16) ^ *wp;
   3092 	}
   3093 
   3094 	/* hash left-over bytes */
   3095 	for (i = 0; i < rem; i++)
   3096 		key ^= *((uchar_t *)p + i);
   3097 
   3098 	return (key & 0xffff);
   3099 }
   3100 
   3101 /*
   3102  * rnode4info - return filehandle and path information for an rnode.
   3103  * XXX MT issues: uses a single static buffer, no locking of path.
   3104  */
   3105 char *
   3106 rnode4info(rnode4_t *rp)
   3107 {
   3108 	static char buf[80];
   3109 	nfs4_fhandle_t fhandle;
   3110 	char *path;
   3111 	char *type;
   3112 
   3113 	if (rp == NULL)
   3114 		return ("null");
   3115 	if (rp->r_flags & R4ISXATTR)
   3116 		type = "attr";
   3117 	else if (RTOV4(rp)->v_flag & V_XATTRDIR)
   3118 		type = "attrdir";
   3119 	else if (RTOV4(rp)->v_flag & VROOT)
   3120 		type = "root";
   3121 	else if (RTOV4(rp)->v_type == VDIR)
   3122 		type = "dir";
   3123 	else if (RTOV4(rp)->v_type == VREG)
   3124 		type = "file";
   3125 	else
   3126 		type = "other";
   3127 	sfh4_copyval(rp->r_fh, &fhandle);
   3128 	path = fn_path(rp->r_svnode.sv_name);
   3129 	(void) snprintf(buf, 80, "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
   3130 	    (void *)rp, path, type, rp->r_flags,
   3131 	    hash16((void *)&fhandle.fh_buf, fhandle.fh_len));
   3132 	kmem_free(path, strlen(path)+1);
   3133 	return (buf);
   3134 }
   3135 #endif
   3136