Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
     28  *	All Rights Reserved
     29  */
     30 
     31 #include <sys/param.h>
     32 #include <sys/types.h>
     33 #include <sys/systm.h>
     34 #include <sys/cmn_err.h>
     35 #include <sys/vtrace.h>
     36 #include <sys/session.h>
     37 #include <sys/thread.h>
     38 #include <sys/dnlc.h>
     39 #include <sys/cred.h>
     40 #include <sys/priv.h>
     41 #include <sys/list.h>
     42 #include <sys/sdt.h>
     43 #include <sys/policy.h>
     44 
     45 #include <rpc/types.h>
     46 #include <rpc/xdr.h>
     47 
     48 #include <nfs/nfs.h>
     49 
     50 #include <nfs/nfs_clnt.h>
     51 
     52 #include <nfs/nfs4.h>
     53 #include <nfs/rnode4.h>
     54 #include <nfs/nfs4_clnt.h>
     55 
     56 /*
     57  * client side statistics
     58  */
static const struct clstat4 clstat4_tmpl = {
	/* Each entry pairs a kstat name with its data type (64-bit unsigned). */
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "referrals",	KSTAT_DATA_UINT64 },
	{ "referlinks",	KSTAT_DATA_UINT64 },
	/* presumably the counter clget4() bumps on every call -- confirm */
	{ "clgets",	KSTAT_DATA_UINT64 },
	/* presumably bumped by clget4() when it must create a new handle */
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	/* The remaining counters are compiled in only for DEBUG builds. */
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};
     73 
#ifdef DEBUG
/*
 * Additional named 64-bit counters that exist only in DEBUG builds.
 * Unlike clstat4_tmpl this object is neither static nor const; the
 * "clreclaim" member is incremented directly by clreclaim4_zone().
 */
struct clstat4_debug clstat4_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif
     88 
     89 /*
     90  * We keep a global list of per-zone client data, so we can clean up all zones
     91  * if we get low on memory.
     92  */
static list_t nfs4_clnt_list;		/* list of per-zone client data */
/* presumably guards nfs4_clnt_list -- confirm against its users */
static kmutex_t nfs4_clnt_list_lock;
/* NOTE(review): looks like the zone key for per-zone client data -- confirm */
zone_key_t nfs4clnt_zone_key;

/* kmem cache that clget4() allocates its struct chtab entries from */
static struct kmem_cache *chtab4_cache;

#ifdef DEBUG
/* Debug-only tunables; their consumers are outside the code visible here. */
static int nfs4_rfscall_debug;
static int nfs4_try_failover_any;
/* When non-zero, utf8_to_fn() logs the filenames it rejects. */
int nfs4_utf8_debug = 0;
#endif
    104 
    105 /*
    106  * NFSv4 readdir cache implementation
    107  */
/*
 * Private wrapper around a readdir cache element: the element itself
 * followed by the bookkeeping (lock, reference count, AVL linkage).
 */
typedef struct rddir4_cache_impl {
	rddir4_cache	rc;		/* readdir cache element */
	kmutex_t	lock;		/* lock protects count */
	uint_t		count;		/* reference count */
	avl_node_t	tree;		/* AVL tree link */
} rddir4_cache_impl;
    114 
/*
 * Forward declarations of file-local helpers; their definitions are not
 * part of the code visible here.
 */
static int rddir4_cache_compar(const void *, const void *);
static void rddir4_cache_free(rddir4_cache_impl *);
static rddir4_cache *rddir4_cache_alloc(int);
static void rddir4_cache_hold(rddir4_cache *);
static int try_failover(enum clnt_stat);

/* Readdir cache counters (hits / waits / misses), all starting at zero. */
static int nfs4_readdir_cache_hits = 0;
static int nfs4_readdir_cache_waits = 0;
static int nfs4_readdir_cache_misses = 0;
    124 
    125 /*
    126  * Shared nfs4 functions
    127  */
    128 
    129 /*
    130  * Copy an nfs_fh4.  The destination storage (to->nfs_fh4_val) must already
    131  * be allocated.
    132  */
    133 
    134 void
    135 nfs_fh4_copy(nfs_fh4 *from, nfs_fh4 *to)
    136 {
    137 	to->nfs_fh4_len = from->nfs_fh4_len;
    138 	bcopy(from->nfs_fh4_val, to->nfs_fh4_val, to->nfs_fh4_len);
    139 }
    140 
    141 /*
    142  * nfs4cmpfh - compare 2 filehandles.
    143  * Returns 0 if the two nfsv4 filehandles are the same, -1 if the first is
    144  * "less" than the second, +1 if the first is "greater" than the second.
    145  */
    146 
    147 int
    148 nfs4cmpfh(const nfs_fh4 *fh4p1, const nfs_fh4 *fh4p2)
    149 {
    150 	const char *c1, *c2;
    151 
    152 	if (fh4p1->nfs_fh4_len < fh4p2->nfs_fh4_len)
    153 		return (-1);
    154 	if (fh4p1->nfs_fh4_len > fh4p2->nfs_fh4_len)
    155 		return (1);
    156 	for (c1 = fh4p1->nfs_fh4_val, c2 = fh4p2->nfs_fh4_val;
    157 	    c1 < fh4p1->nfs_fh4_val + fh4p1->nfs_fh4_len;
    158 	    c1++, c2++) {
    159 		if (*c1 < *c2)
    160 			return (-1);
    161 		if (*c1 > *c2)
    162 			return (1);
    163 	}
    164 
    165 	return (0);
    166 }
    167 
    168 /*
    169  * Compare two v4 filehandles.  Return zero if they're the same, non-zero
    170  * if they're not.  Like nfs4cmpfh(), but different filehandle
    171  * representation, and doesn't provide information about greater than or
    172  * less than.
    173  */
    174 
    175 int
    176 nfs4cmpfhandle(nfs4_fhandle_t *fh1, nfs4_fhandle_t *fh2)
    177 {
    178 	if (fh1->fh_len == fh2->fh_len)
    179 		return (bcmp(fh1->fh_buf, fh2->fh_buf, fh1->fh_len));
    180 
    181 	return (1);
    182 }
    183 
    184 int
    185 stateid4_cmp(stateid4 *s1, stateid4 *s2)
    186 {
    187 	if (bcmp(s1, s2, sizeof (stateid4)) == 0)
    188 		return (1);
    189 	else
    190 		return (0);
    191 }
    192 
    193 nfsstat4
    194 puterrno4(int error)
    195 {
    196 	switch (error) {
    197 	case 0:
    198 		return (NFS4_OK);
    199 	case EPERM:
    200 		return (NFS4ERR_PERM);
    201 	case ENOENT:
    202 		return (NFS4ERR_NOENT);
    203 	case EINTR:
    204 		return (NFS4ERR_IO);
    205 	case EIO:
    206 		return (NFS4ERR_IO);
    207 	case ENXIO:
    208 		return (NFS4ERR_NXIO);
    209 	case ENOMEM:
    210 		return (NFS4ERR_RESOURCE);
    211 	case EACCES:
    212 		return (NFS4ERR_ACCESS);
    213 	case EBUSY:
    214 		return (NFS4ERR_IO);
    215 	case EEXIST:
    216 		return (NFS4ERR_EXIST);
    217 	case EXDEV:
    218 		return (NFS4ERR_XDEV);
    219 	case ENODEV:
    220 		return (NFS4ERR_IO);
    221 	case ENOTDIR:
    222 		return (NFS4ERR_NOTDIR);
    223 	case EISDIR:
    224 		return (NFS4ERR_ISDIR);
    225 	case EINVAL:
    226 		return (NFS4ERR_INVAL);
    227 	case EMFILE:
    228 		return (NFS4ERR_RESOURCE);
    229 	case EFBIG:
    230 		return (NFS4ERR_FBIG);
    231 	case ENOSPC:
    232 		return (NFS4ERR_NOSPC);
    233 	case EROFS:
    234 		return (NFS4ERR_ROFS);
    235 	case EMLINK:
    236 		return (NFS4ERR_MLINK);
    237 	case EDEADLK:
    238 		return (NFS4ERR_DEADLOCK);
    239 	case ENOLCK:
    240 		return (NFS4ERR_DENIED);
    241 	case EREMOTE:
    242 		return (NFS4ERR_SERVERFAULT);
    243 	case ENOTSUP:
    244 		return (NFS4ERR_NOTSUPP);
    245 	case EDQUOT:
    246 		return (NFS4ERR_DQUOT);
    247 	case ENAMETOOLONG:
    248 		return (NFS4ERR_NAMETOOLONG);
    249 	case EOVERFLOW:
    250 		return (NFS4ERR_INVAL);
    251 	case ENOSYS:
    252 		return (NFS4ERR_NOTSUPP);
    253 	case ENOTEMPTY:
    254 		return (NFS4ERR_NOTEMPTY);
    255 	case EOPNOTSUPP:
    256 		return (NFS4ERR_NOTSUPP);
    257 	case ESTALE:
    258 		return (NFS4ERR_STALE);
    259 	case EAGAIN:
    260 		if (curthread->t_flag & T_WOULDBLOCK) {
    261 			curthread->t_flag &= ~T_WOULDBLOCK;
    262 			return (NFS4ERR_DELAY);
    263 		}
    264 		return (NFS4ERR_LOCKED);
    265 	default:
    266 		return ((enum nfsstat4)error);
    267 	}
    268 }
    269 
    270 int
    271 geterrno4(enum nfsstat4 status)
    272 {
    273 	switch (status) {
    274 	case NFS4_OK:
    275 		return (0);
    276 	case NFS4ERR_PERM:
    277 		return (EPERM);
    278 	case NFS4ERR_NOENT:
    279 		return (ENOENT);
    280 	case NFS4ERR_IO:
    281 		return (EIO);
    282 	case NFS4ERR_NXIO:
    283 		return (ENXIO);
    284 	case NFS4ERR_ACCESS:
    285 		return (EACCES);
    286 	case NFS4ERR_EXIST:
    287 		return (EEXIST);
    288 	case NFS4ERR_XDEV:
    289 		return (EXDEV);
    290 	case NFS4ERR_NOTDIR:
    291 		return (ENOTDIR);
    292 	case NFS4ERR_ISDIR:
    293 		return (EISDIR);
    294 	case NFS4ERR_INVAL:
    295 		return (EINVAL);
    296 	case NFS4ERR_FBIG:
    297 		return (EFBIG);
    298 	case NFS4ERR_NOSPC:
    299 		return (ENOSPC);
    300 	case NFS4ERR_ROFS:
    301 		return (EROFS);
    302 	case NFS4ERR_MLINK:
    303 		return (EMLINK);
    304 	case NFS4ERR_NAMETOOLONG:
    305 		return (ENAMETOOLONG);
    306 	case NFS4ERR_NOTEMPTY:
    307 		return (ENOTEMPTY);
    308 	case NFS4ERR_DQUOT:
    309 		return (EDQUOT);
    310 	case NFS4ERR_STALE:
    311 		return (ESTALE);
    312 	case NFS4ERR_BADHANDLE:
    313 		return (ESTALE);
    314 	case NFS4ERR_BAD_COOKIE:
    315 		return (EINVAL);
    316 	case NFS4ERR_NOTSUPP:
    317 		return (EOPNOTSUPP);
    318 	case NFS4ERR_TOOSMALL:
    319 		return (EINVAL);
    320 	case NFS4ERR_SERVERFAULT:
    321 		return (EIO);
    322 	case NFS4ERR_BADTYPE:
    323 		return (EINVAL);
    324 	case NFS4ERR_DELAY:
    325 		return (ENXIO);
    326 	case NFS4ERR_SAME:
    327 		return (EPROTO);
    328 	case NFS4ERR_DENIED:
    329 		return (ENOLCK);
    330 	case NFS4ERR_EXPIRED:
    331 		return (EPROTO);
    332 	case NFS4ERR_LOCKED:
    333 		return (EACCES);
    334 	case NFS4ERR_GRACE:
    335 		return (EAGAIN);
    336 	case NFS4ERR_FHEXPIRED:	/* if got here, failed to get a new fh */
    337 		return (ESTALE);
    338 	case NFS4ERR_SHARE_DENIED:
    339 		return (EACCES);
    340 	case NFS4ERR_WRONGSEC:
    341 		return (EPERM);
    342 	case NFS4ERR_CLID_INUSE:
    343 		return (EAGAIN);
    344 	case NFS4ERR_RESOURCE:
    345 		return (EAGAIN);
    346 	case NFS4ERR_MOVED:
    347 		return (EPROTO);
    348 	case NFS4ERR_NOFILEHANDLE:
    349 		return (EIO);
    350 	case NFS4ERR_MINOR_VERS_MISMATCH:
    351 		return (ENOTSUP);
    352 	case NFS4ERR_STALE_CLIENTID:
    353 		return (EIO);
    354 	case NFS4ERR_STALE_STATEID:
    355 		return (EIO);
    356 	case NFS4ERR_OLD_STATEID:
    357 		return (EIO);
    358 	case NFS4ERR_BAD_STATEID:
    359 		return (EIO);
    360 	case NFS4ERR_BAD_SEQID:
    361 		return (EIO);
    362 	case NFS4ERR_NOT_SAME:
    363 		return (EPROTO);
    364 	case NFS4ERR_LOCK_RANGE:
    365 		return (EPROTO);
    366 	case NFS4ERR_SYMLINK:
    367 		return (EPROTO);
    368 	case NFS4ERR_RESTOREFH:
    369 		return (EPROTO);
    370 	case NFS4ERR_LEASE_MOVED:
    371 		return (EPROTO);
    372 	case NFS4ERR_ATTRNOTSUPP:
    373 		return (ENOTSUP);
    374 	case NFS4ERR_NO_GRACE:
    375 		return (EPROTO);
    376 	case NFS4ERR_RECLAIM_BAD:
    377 		return (EPROTO);
    378 	case NFS4ERR_RECLAIM_CONFLICT:
    379 		return (EPROTO);
    380 	case NFS4ERR_BADXDR:
    381 		return (EINVAL);
    382 	case NFS4ERR_LOCKS_HELD:
    383 		return (EIO);
    384 	case NFS4ERR_OPENMODE:
    385 		return (EACCES);
    386 	case NFS4ERR_BADOWNER:
    387 		/*
    388 		 * Client and server are in different DNS domains
    389 		 * and the NFSMAPID_DOMAIN in /etc/default/nfs
    390 		 * doesn't match.  No good answer here.  Return
    391 		 * EACCESS, which translates to "permission denied".
    392 		 */
    393 		return (EACCES);
    394 	case NFS4ERR_BADCHAR:
    395 		return (EINVAL);
    396 	case NFS4ERR_BADNAME:
    397 		return (EINVAL);
    398 	case NFS4ERR_BAD_RANGE:
    399 		return (EIO);
    400 	case NFS4ERR_LOCK_NOTSUPP:
    401 		return (ENOTSUP);
    402 	case NFS4ERR_OP_ILLEGAL:
    403 		return (EINVAL);
    404 	case NFS4ERR_DEADLOCK:
    405 		return (EDEADLK);
    406 	case NFS4ERR_FILE_OPEN:
    407 		return (EACCES);
    408 	case NFS4ERR_ADMIN_REVOKED:
    409 		return (EPROTO);
    410 	case NFS4ERR_CB_PATH_DOWN:
    411 		return (EPROTO);
    412 	default:
    413 #ifdef DEBUG
    414 		zcmn_err(getzoneid(), CE_WARN, "geterrno4: got status %d",
    415 		    status);
    416 #endif
    417 		return ((int)status);
    418 	}
    419 }
    420 
    421 void
    422 nfs4_log_badowner(mntinfo4_t *mi, nfs_opnum4 op)
    423 {
    424 	nfs4_server_t *server;
    425 
    426 	/*
    427 	 * Return if already printed/queued a msg
    428 	 * for this mount point.
    429 	 */
    430 	if (mi->mi_flags & MI4_BADOWNER_DEBUG)
    431 		return;
    432 	/*
    433 	 * Happens once per client <-> server pair.
    434 	 */
    435 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
    436 	    mi->mi_flags & MI4_INT))
    437 		return;
    438 
    439 	server = find_nfs4_server(mi);
    440 	if (server == NULL) {
    441 		nfs_rw_exit(&mi->mi_recovlock);
    442 		return;
    443 	}
    444 
    445 	if (!(server->s_flags & N4S_BADOWNER_DEBUG)) {
    446 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
    447 		    "!NFSMAPID_DOMAIN does not match"
    448 		    " the server: %s domain.\n"
    449 		    "Please check configuration",
    450 		    mi->mi_curr_serv->sv_hostname);
    451 		server->s_flags |= N4S_BADOWNER_DEBUG;
    452 	}
    453 	mutex_exit(&server->s_lock);
    454 	nfs4_server_rele(server);
    455 	nfs_rw_exit(&mi->mi_recovlock);
    456 
    457 	/*
    458 	 * Happens once per mntinfo4_t.
    459 	 * This error is deemed as one of the recovery facts "RF_BADOWNER",
    460 	 * queue this in the mesg queue for this mount_info. This message
    461 	 * is not printed, meaning its absent from id_to_dump_solo_fact()
    462 	 * but its there for inspection if the queue is ever dumped/inspected.
    463 	 */
    464 	mutex_enter(&mi->mi_lock);
    465 	if (!(mi->mi_flags & MI4_BADOWNER_DEBUG)) {
    466 		nfs4_queue_fact(RF_BADOWNER, mi, NFS4ERR_BADOWNER, 0, op,
    467 		    FALSE, NULL, 0, NULL);
    468 		mi->mi_flags |= MI4_BADOWNER_DEBUG;
    469 	}
    470 	mutex_exit(&mi->mi_lock);
    471 }
    472 
    473 int
    474 nfs4_time_ntov(nfstime4 *ntime, timestruc_t *vatime)
    475 {
    476 	int64_t sec;
    477 	int32_t nsec;
    478 
    479 	/*
    480 	 * Here check that the nfsv4 time is valid for the system.
    481 	 * nfsv4 time value is a signed 64-bit, and the system time
    482 	 * may be either int64_t or int32_t (depends on the kernel),
    483 	 * so if the kernel is 32-bit, the nfsv4 time value may not fit.
    484 	 */
    485 #ifndef _LP64
    486 	if (! NFS4_TIME_OK(ntime->seconds)) {
    487 		return (EOVERFLOW);
    488 	}
    489 #endif
    490 
    491 	/* Invalid to specify 1 billion (or more) nsecs */
    492 	if (ntime->nseconds >= 1000000000)
    493 		return (EINVAL);
    494 
    495 	if (ntime->seconds < 0) {
    496 		sec = ntime->seconds + 1;
    497 		nsec = -1000000000 + ntime->nseconds;
    498 	} else {
    499 		sec = ntime->seconds;
    500 		nsec = ntime->nseconds;
    501 	}
    502 
    503 	vatime->tv_sec = sec;
    504 	vatime->tv_nsec = nsec;
    505 
    506 	return (0);
    507 }
    508 
    509 int
    510 nfs4_time_vton(timestruc_t *vatime, nfstime4 *ntime)
    511 {
    512 	int64_t sec;
    513 	uint32_t nsec;
    514 
    515 	/*
    516 	 * nfsv4 time value is a signed 64-bit, and the system time
    517 	 * may be either int64_t or int32_t (depends on the kernel),
    518 	 * so all system time values will fit.
    519 	 */
    520 	if (vatime->tv_nsec >= 0) {
    521 		sec = vatime->tv_sec;
    522 		nsec = vatime->tv_nsec;
    523 	} else {
    524 		sec = vatime->tv_sec - 1;
    525 		nsec = 1000000000 + vatime->tv_nsec;
    526 	}
    527 	ntime->seconds = sec;
    528 	ntime->nseconds = nsec;
    529 
    530 	return (0);
    531 }
    532 
    533 /*
    534  * Converts a utf8 string to a valid null terminated filename string.
    535  *
    536  * XXX - Not actually translating the UTF-8 string as per RFC 2279.
    537  *	 For now, just validate that the UTF-8 string off the wire
    538  *	 does not have characters that will freak out UFS, and leave
    539  *	 it at that.
    540  */
    541 char *
    542 utf8_to_fn(utf8string *u8s, uint_t *lenp, char *s)
    543 {
    544 	ASSERT(lenp != NULL);
    545 
    546 	if (u8s == NULL || u8s->utf8string_len <= 0 ||
    547 	    u8s->utf8string_val == NULL)
    548 		return (NULL);
    549 
    550 	/*
    551 	 * Check for obvious illegal filename chars
    552 	 */
    553 	if (utf8_strchr(u8s, '/') != NULL) {
    554 #ifdef DEBUG
    555 		if (nfs4_utf8_debug) {
    556 			char *path;
    557 			int len = u8s->utf8string_len;
    558 
    559 			path = kmem_alloc(len + 1, KM_SLEEP);
    560 			bcopy(u8s->utf8string_val, path, len);
    561 			path[len] = '\0';
    562 
    563 			zcmn_err(getzoneid(), CE_WARN,
    564 			    "Invalid UTF-8 filename: %s", path);
    565 
    566 			kmem_free(path, len + 1);
    567 		}
    568 #endif
    569 		return (NULL);
    570 	}
    571 
    572 	return (utf8_to_str(u8s, lenp, s));
    573 }
    574 
    575 /*
    576  * Converts a utf8 string to a C string.
    577  * kmem_allocs a new string if not supplied
    578  */
    579 char *
    580 utf8_to_str(utf8string *str, uint_t *lenp, char *s)
    581 {
    582 	char	*sp;
    583 	char	*u8p;
    584 	int	len;
    585 	int	 i;
    586 
    587 	ASSERT(lenp != NULL);
    588 
    589 	if (str == NULL)
    590 		return (NULL);
    591 
    592 	u8p = str->utf8string_val;
    593 	len = str->utf8string_len;
    594 	if (len <= 0 || u8p == NULL) {
    595 		if (s)
    596 			*s = '\0';
    597 		return (NULL);
    598 	}
    599 
    600 	sp = s;
    601 	if (sp == NULL)
    602 		sp = kmem_alloc(len + 1, KM_SLEEP);
    603 
    604 	/*
    605 	 * At least check for embedded nulls
    606 	 */
    607 	for (i = 0; i < len; i++) {
    608 		sp[i] = u8p[i];
    609 		if (u8p[i] == '\0') {
    610 #ifdef	DEBUG
    611 			zcmn_err(getzoneid(), CE_WARN,
    612 			    "Embedded NULL in UTF-8 string");
    613 #endif
    614 			if (s == NULL)
    615 				kmem_free(sp, len + 1);
    616 			return (NULL);
    617 		}
    618 	}
    619 	sp[len] = '\0';
    620 	*lenp = len + 1;
    621 
    622 	return (sp);
    623 }
    624 
    625 /*
    626  * str_to_utf8 - converts a null-terminated C string to a utf8 string
    627  */
    628 utf8string *
    629 str_to_utf8(char *nm, utf8string *str)
    630 {
    631 	int len;
    632 
    633 	if (str == NULL)
    634 		return (NULL);
    635 
    636 	if (nm == NULL || *nm == '\0') {
    637 		str->utf8string_len = 0;
    638 		str->utf8string_val = NULL;
    639 	}
    640 
    641 	len = strlen(nm);
    642 
    643 	str->utf8string_val = kmem_alloc(len, KM_SLEEP);
    644 	str->utf8string_len = len;
    645 	bcopy(nm, str->utf8string_val, len);
    646 
    647 	return (str);
    648 }
    649 
    650 utf8string *
    651 utf8_copy(utf8string *src, utf8string *dest)
    652 {
    653 	if (src == NULL)
    654 		return (NULL);
    655 	if (dest == NULL)
    656 		return (NULL);
    657 
    658 	if (src->utf8string_len > 0) {
    659 		dest->utf8string_val = kmem_alloc(src->utf8string_len,
    660 		    KM_SLEEP);
    661 		bcopy(src->utf8string_val, dest->utf8string_val,
    662 		    src->utf8string_len);
    663 		dest->utf8string_len = src->utf8string_len;
    664 	} else {
    665 		dest->utf8string_val = NULL;
    666 		dest->utf8string_len = 0;
    667 	}
    668 
    669 	return (dest);
    670 }
    671 
    672 int
    673 utf8_compare(const utf8string *a, const utf8string *b)
    674 {
    675 	int mlen, cmp;
    676 	int alen, blen;
    677 	char *aval, *bval;
    678 
    679 	if ((a == NULL) && (b == NULL))
    680 		return (0);
    681 	else if (a == NULL)
    682 		return (-1);
    683 	else if (b == NULL)
    684 		return (1);
    685 
    686 	alen = a->utf8string_len;
    687 	blen = b->utf8string_len;
    688 	aval = a->utf8string_val;
    689 	bval = b->utf8string_val;
    690 
    691 	if (((alen == 0) || (aval == NULL)) &&
    692 	    ((blen == 0) || (bval == NULL)))
    693 		return (0);
    694 	else if ((alen == 0) || (aval == NULL))
    695 		return (-1);
    696 	else if ((blen == 0) || (bval == NULL))
    697 		return (1);
    698 
    699 	mlen = MIN(alen, blen);
    700 	cmp = strncmp(aval, bval, mlen);
    701 
    702 	if ((cmp == 0) && (alen == blen))
    703 		return (0);
    704 	else if ((cmp == 0) && (alen < blen))
    705 		return (-1);
    706 	else if (cmp == 0)
    707 		return (1);
    708 	else if (cmp < 0)
    709 		return (-1);
    710 	return (1);
    711 }
    712 
    713 /*
    714  * utf8_dir_verify - checks that the utf8 string is valid
    715  */
    716 int
    717 utf8_dir_verify(utf8string *str)
    718 {
    719 	char *nm;
    720 	int len;
    721 
    722 	if (str == NULL)
    723 		return (0);
    724 
    725 	nm = str->utf8string_val;
    726 	len = str->utf8string_len;
    727 	if (nm == NULL || len == 0) {
    728 		return (0);
    729 	}
    730 
    731 	if (len == 1 && nm[0] == '.')
    732 		return (0);
    733 	if (len == 2 && nm[0] == '.' && nm[1] == '.')
    734 		return (0);
    735 
    736 	if (utf8_strchr(str, '/') != NULL)
    737 		return (0);
    738 
    739 	if (utf8_strchr(str, '\0') != NULL)
    740 		return (0);
    741 
    742 	return (1);
    743 }
    744 
    745 /*
    746  * from rpcsec module (common/rpcsec)
    747  */
/* Used by authget() to obtain an auth handle; returns an errno (0 = ok). */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
/* Used by clfree4() to release the auth handle of a client handle. */
extern void sec_clnt_freeh(AUTH *);
/* Not called in the part of the file visible here. */
extern void sec_clnt_freeinfo(struct sec_data *);
    751 
    752 /*
    753  * authget() gets an auth handle based on the security
    754  * information from the servinfo in mountinfo.
    755  * The auth handle is stored in ch_client->cl_auth.
    756  *
    757  * First security flavor of choice is to use sv_secdata
    758  * which is initiated by the client. If that fails, get
    759  * secinfo from the server and then select one from the
    760  * server secinfo list .
    761  *
    762  * For RPCSEC_GSS flavor, upon success, a secure context is
    763  * established between client and server.
    764  */
    765 int
    766 authget(servinfo4_t *svp, CLIENT *ch_client, cred_t *cr)
    767 {
    768 	int error, i;
    769 
    770 	/*
    771 	 * SV4_TRYSECINFO indicates to try the secinfo list from
    772 	 * sv_secinfo until a successful one is reached. Point
    773 	 * sv_currsec to the selected security mechanism for
    774 	 * later sessions.
    775 	 */
    776 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
    777 	if ((svp->sv_flags & SV4_TRYSECINFO) && svp->sv_secinfo) {
    778 		for (i = svp->sv_secinfo->index; i < svp->sv_secinfo->count;
    779 		    i++) {
    780 			if (!(error = sec_clnt_geth(ch_client,
    781 			    &svp->sv_secinfo->sdata[i],
    782 			    cr, &ch_client->cl_auth))) {
    783 
    784 				svp->sv_currsec = &svp->sv_secinfo->sdata[i];
    785 				svp->sv_secinfo->index = i;
    786 				/* done */
    787 				svp->sv_flags &= ~SV4_TRYSECINFO;
    788 				break;
    789 			}
    790 
    791 			/*
    792 			 * Allow the caller retry with the security flavor
    793 			 * pointed by svp->sv_secinfo->index when
    794 			 * ETIMEDOUT/ECONNRESET occurs.
    795 			 */
    796 			if (error == ETIMEDOUT || error == ECONNRESET) {
    797 				svp->sv_secinfo->index = i;
    798 				break;
    799 			}
    800 		}
    801 	} else {
    802 		/* sv_currsec points to one of the entries in sv_secinfo */
    803 		if (svp->sv_currsec) {
    804 			error = sec_clnt_geth(ch_client, svp->sv_currsec, cr,
    805 			    &ch_client->cl_auth);
    806 		} else {
    807 			/* If it's null, use sv_secdata. */
    808 			error = sec_clnt_geth(ch_client, svp->sv_secdata, cr,
    809 			    &ch_client->cl_auth);
    810 		}
    811 	}
    812 	nfs_rw_exit(&svp->sv_lock);
    813 
    814 	return (error);
    815 }
    816 
/*
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 *
 * Hands back an RPC client handle for the server "svp" in *newcl and
 * the cache entry that owns it in *chp.  Handles are cached per zone in
 * nfscl->nfscl_chtable4, keyed by RPC program, version, transport device
 * and protocol family.  A cached free handle is reinitialized and reused
 * when available; otherwise a new one is created.  Either way an auth
 * handle is obtained through authget() before returning.
 *
 * Returns 0 on success.  Returns EINVAL if newcl, chp or ci is NULL,
 * the error from clnt_tli_kcreate() or authget() on failure, or EINTR
 * when authget() reports success but left cl_auth NULL.  On failure
 * *newcl and *chp are left NULL.
 */
int
clget4(clinfo_t *ci, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs4_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable4_lock);
	/* plistp tracks the link that points at ch, for unlinking below */
	plistp = &nfscl->nfscl_chtable4;
	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			/* allocate with the lock dropped, then search again */
			mutex_exit(&nfscl->nfscl_chtable4_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			goto top;
		}
		/* the preallocated head is now owned by the table */
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable4) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable4_lock);
		/* a head preallocated on an earlier pass was not needed */
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);

		/*
		 * Get an auth handle.
		 */
		error = authget(svp, cp->ch_client, cr);
		if (error || cp->ch_client->cl_auth == NULL) {
			/* the handle is destroyed, not put back on the list */
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab4_cache, cp);
			return ((error != 0) ? error : EINTR);
		}
		/* NOTE(review): updated after the table lock was dropped */
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
#endif
	mutex_exit(&nfscl->nfscl_chtable4_lock);

	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab4_cache, KM_SLEEP);
	cp->ch_head = ch;

	/* handle creation is bracketed by sigintr()/sigunintr() */
	sigintr(&smask, (int)ci->cl_flags & MI4_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		/* undo the clalloc increment made above */
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}
	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	/* drop the auth handle the new client came with; authget() sets ours */
	auth_destroy(cp->ch_client->cl_auth);

	/*
	 * Get an auth handle.
	 */
	error = authget(svp, cp->ch_client, cr);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		return ((error != 0) ? error : EINTR);
	}
	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}
    987 
    988 static int
    989 nfs_clget4(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    990     struct chtab **chp, struct nfs4_clnt *nfscl)
    991 {
    992 	clinfo_t ci;
    993 	bool_t is_recov;
    994 	int firstcall, error = 0;
    995 
    996 	/*
    997 	 * Set read buffer size to rsize
    998 	 * and add room for RPC headers.
    999 	 */
   1000 	ci.cl_readsize = mi->mi_tsize;
   1001 	if (ci.cl_readsize != 0)
   1002 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
   1003 
   1004 	/*
   1005 	 * If soft mount and server is down just try once.
   1006 	 * meaning: do not retransmit.
   1007 	 */
   1008 	if (!(mi->mi_flags & MI4_HARD) && (mi->mi_flags & MI4_DOWN))
   1009 		ci.cl_retrans = 0;
   1010 	else
   1011 		ci.cl_retrans = mi->mi_retrans;
   1012 
   1013 	ci.cl_prog = mi->mi_prog;
   1014 	ci.cl_vers = mi->mi_vers;
   1015 	ci.cl_flags = mi->mi_flags;
   1016 
   1017 	/*
   1018 	 * clget4 calls authget() to get an auth handle. For RPCSEC_GSS
   1019 	 * security flavor, the client tries to establish a security context
   1020 	 * by contacting the server. If the connection is timed out or reset,
   1021 	 * e.g. server reboot, we will try again.
   1022 	 */
   1023 	is_recov = (curthread == mi->mi_recovthread);
   1024 	firstcall = 1;
   1025 
   1026 	do {
   1027 		error = clget4(&ci, svp, cr, newcl, chp, nfscl);
   1028 
   1029 		if (error == 0)
   1030 			break;
   1031 
   1032 		/*
   1033 		 * For forced unmount and zone shutdown, bail out but
   1034 		 * let the recovery thread do one more transmission.
   1035 		 */
   1036 		if ((FS_OR_ZONE_GONE4(mi->mi_vfsp)) &&
   1037 		    (!is_recov || !firstcall)) {
   1038 			error = EIO;
   1039 			break;
   1040 		}
   1041 
   1042 		/* do not retry for soft mount */
   1043 		if (!(mi->mi_flags & MI4_HARD))
   1044 			break;
   1045 
   1046 		/* let the caller deal with the failover case */
   1047 		if (FAILOVER_MOUNT4(mi))
   1048 			break;
   1049 
   1050 		firstcall = 0;
   1051 
   1052 	} while (error == ETIMEDOUT || error == ECONNRESET);
   1053 
   1054 	return (error);
   1055 }
   1056 
   1057 void
   1058 clfree4(CLIENT *cl, struct chtab *cp, struct nfs4_clnt *nfscl)
   1059 {
   1060 	if (cl->cl_auth != NULL) {
   1061 		sec_clnt_freeh(cl->cl_auth);
   1062 		cl->cl_auth = NULL;
   1063 	}
   1064 
   1065 	/*
   1066 	 * Timestamp this cache entry so that we know when it was last
   1067 	 * used.
   1068 	 */
   1069 	cp->ch_freed = gethrestime_sec();
   1070 
   1071 	/*
   1072 	 * Add the free client handle to the front of the list.
   1073 	 * This way, the list will be sorted in youngest to oldest
   1074 	 * order.
   1075 	 */
   1076 	mutex_enter(&nfscl->nfscl_chtable4_lock);
   1077 	cp->ch_list = cp->ch_head->ch_list;
   1078 	cp->ch_head->ch_list = cp;
   1079 	mutex_exit(&nfscl->nfscl_chtable4_lock);
   1080 }
   1081 
   1082 #define	CL_HOLDTIME	60	/* time to hold client handles */
   1083 
   1084 static void
   1085 clreclaim4_zone(struct nfs4_clnt *nfscl, uint_t cl_holdtime)
   1086 {
   1087 	struct chhead *ch;
   1088 	struct chtab *cp;	/* list of objects that can be reclaimed */
   1089 	struct chtab *cpe;
   1090 	struct chtab *cpl;
   1091 	struct chtab **cpp;
   1092 #ifdef DEBUG
   1093 	int n = 0;
   1094 	clstat4_debug.clreclaim.value.ui64++;
   1095 #endif
   1096 
   1097 	/*
   1098 	 * Need to reclaim some memory, so step through the cache
   1099 	 * looking through the lists for entries which can be freed.
   1100 	 */
   1101 	cp = NULL;
   1102 
   1103 	mutex_enter(&nfscl->nfscl_chtable4_lock);
   1104 
   1105 	/*
   1106 	 * Here we step through each non-NULL quadruple and start to
   1107 	 * construct the reclaim list pointed to by cp.  Note that
   1108 	 * cp will contain all eligible chtab entries.  When this traversal
   1109 	 * completes, chtab entries from the last quadruple will be at the
   1110 	 * front of cp and entries from previously inspected quadruples have
   1111 	 * been appended to the rear of cp.
   1112 	 */
   1113 	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
   1114 		if (ch->ch_list == NULL)
   1115 			continue;
   1116 		/*
   1117 		 * Search each list for entries older then
   1118 		 * cl_holdtime seconds.  The lists are maintained
   1119 		 * in youngest to oldest order so that when the
   1120 		 * first entry is found which is old enough, then
   1121 		 * all of the rest of the entries on the list will
   1122 		 * be old enough as well.
   1123 		 */
   1124 		cpl = ch->ch_list;
   1125 		cpp = &ch->ch_list;
   1126 		while (cpl != NULL &&
   1127 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
   1128 			cpp = &cpl->ch_list;
   1129 			cpl = cpl->ch_list;
   1130 		}
   1131 		if (cpl != NULL) {
   1132 			*cpp = NULL;
   1133 			if (cp != NULL) {
   1134 				cpe = cpl;
   1135 				while (cpe->ch_list != NULL)
   1136 					cpe = cpe->ch_list;
   1137 				cpe->ch_list = cp;
   1138 			}
   1139 			cp = cpl;
   1140 		}
   1141 	}
   1142 
   1143 	mutex_exit(&nfscl->nfscl_chtable4_lock);
   1144 
   1145 	/*
   1146 	 * If cp is empty, then there is nothing to reclaim here.
   1147 	 */
   1148 	if (cp == NULL)
   1149 		return;
   1150 
   1151 	/*
   1152 	 * Step through the list of entries to free, destroying each client
   1153 	 * handle and kmem_free'ing the memory for each entry.
   1154 	 */
   1155 	while (cp != NULL) {
   1156 #ifdef DEBUG
   1157 		n++;
   1158 #endif
   1159 		CLNT_DESTROY(cp->ch_client);
   1160 		cpl = cp->ch_list;
   1161 		kmem_cache_free(chtab4_cache, cp);
   1162 		cp = cpl;
   1163 	}
   1164 
   1165 #ifdef DEBUG
   1166 	/*
   1167 	 * Update clalloc so that nfsstat shows the current number
   1168 	 * of allocated client handles.
   1169 	 */
   1170 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
   1171 #endif
   1172 }
   1173 
   1174 /* ARGSUSED */
   1175 static void
   1176 clreclaim4(void *all)
   1177 {
   1178 	struct nfs4_clnt *nfscl;
   1179 
   1180 	/*
   1181 	 * The system is low on memory; go through and try to reclaim some from
   1182 	 * every zone on the system.
   1183 	 */
   1184 	mutex_enter(&nfs4_clnt_list_lock);
   1185 	nfscl = list_head(&nfs4_clnt_list);
   1186 	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl))
   1187 		clreclaim4_zone(nfscl, CL_HOLDTIME);
   1188 	mutex_exit(&nfs4_clnt_list_lock);
   1189 }
   1190 
/*
 * Minimum time-out values indexed by call type.
 * These units are in "eighths" of a second to avoid multiplies.
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/*
 * One tenth of NFS_COTS_TIMEO.  nfs4_rfscall() uses it as an upper bound
 * on mi_timeo while the zone is shutting down.
 */
#define	SHORTWAIT	(NFS_COTS_TIMEO / 10)

/*
 * Back off for retransmission timeout.  MAXTIMO is in clock ticks
 * (20 seconds' worth, hz ticks per second).
 *
 * backoff(tim) leaves tim alone once it has reached MAXTIMO; otherwise
 * dobackoff(tim) doubles it, clamping the result to MAXTIMO.  Both
 * macros evaluate their argument more than once, so pass an expression
 * without side effects.
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
   1207 
   1208 static int
   1209 nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
   1210     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *doqueue,
   1211     enum clnt_stat *rpc_statusp, int flags, struct nfs4_clnt *nfscl)
   1212 {
   1213 	CLIENT *client;
   1214 	struct chtab *ch;
   1215 	cred_t *cr = icr;
   1216 	struct rpc_err rpcerr, rpcerr_tmp;
   1217 	enum clnt_stat status;
   1218 	int error;
   1219 	struct timeval wait;
   1220 	int timeo;		/* in units of hz */
   1221 	bool_t tryagain, is_recov;
   1222 	bool_t cred_cloned = FALSE;
   1223 	k_sigset_t smask;
   1224 	servinfo4_t *svp;
   1225 #ifdef DEBUG
   1226 	char *bufp;
   1227 #endif
   1228 	int firstcall;
   1229 
   1230 	rpcerr.re_status = RPC_SUCCESS;
   1231 
   1232 	/*
   1233 	 * If we know that we are rebooting then let's
   1234 	 * not bother with doing any over the wireness.
   1235 	 */
   1236 	mutex_enter(&mi->mi_lock);
   1237 	if (mi->mi_flags & MI4_SHUTDOWN) {
   1238 		mutex_exit(&mi->mi_lock);
   1239 		return (EIO);
   1240 	}
   1241 	mutex_exit(&mi->mi_lock);
   1242 
   1243 	/* For TSOL, use a new cred which has net_mac_aware flag */
   1244 	if (!cred_cloned && is_system_labeled()) {
   1245 		cred_cloned = TRUE;
   1246 		cr = crdup(icr);
   1247 		(void) setpflags(NET_MAC_AWARE, 1, cr);
   1248 	}
   1249 
   1250 	/*
   1251 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
   1252 	 * are guaranteed to reprocess the retry as a new request.
   1253 	 */
   1254 	svp = mi->mi_curr_serv;
   1255 	rpcerr.re_errno = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
   1256 	if (rpcerr.re_errno != 0)
   1257 		return (rpcerr.re_errno);
   1258 
   1259 	timeo = (mi->mi_timeo * hz) / 10;
   1260 
   1261 	/*
   1262 	 * If hard mounted fs, retry call forever unless hard error
   1263 	 * occurs.
   1264 	 *
   1265 	 * For forced unmount, let the recovery thread through but return
   1266 	 * an error for all others.  This is so that user processes can
   1267 	 * exit quickly.  The recovery thread bails out after one
   1268 	 * transmission so that it can tell if it needs to continue.
   1269 	 *
   1270 	 * For zone shutdown, behave as above to encourage quick
   1271 	 * process exit, but also fail quickly when servers have
   1272 	 * timed out before and reduce the timeouts.
   1273 	 */
   1274 	is_recov = (curthread == mi->mi_recovthread);
   1275 	firstcall = 1;
   1276 	do {
   1277 		tryagain = FALSE;
   1278 
   1279 		NFS4_DEBUG(nfs4_rfscall_debug, (CE_NOTE,
   1280 		    "nfs4_rfscall: vfs_flag=0x%x, %s",
   1281 		    mi->mi_vfsp->vfs_flag,
   1282 		    is_recov ? "recov thread" : "not recov thread"));
   1283 
   1284 		/*
   1285 		 * It's possible while we're retrying the admin
   1286 		 * decided to reboot.
   1287 		 */
   1288 		mutex_enter(&mi->mi_lock);
   1289 		if (mi->mi_flags & MI4_SHUTDOWN) {
   1290 			mutex_exit(&mi->mi_lock);
   1291 			clfree4(client, ch, nfscl);
   1292 			if (cred_cloned)
   1293 				crfree(cr);
   1294 			return (EIO);
   1295 		}
   1296 		mutex_exit(&mi->mi_lock);
   1297 
   1298 		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
   1299 		    (!is_recov || !firstcall)) {
   1300 			clfree4(client, ch, nfscl);
   1301 			if (cred_cloned)
   1302 				crfree(cr);
   1303 			return (EIO);
   1304 		}
   1305 
   1306 		if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) {
   1307 			mutex_enter(&mi->mi_lock);
   1308 			if ((mi->mi_flags & MI4_TIMEDOUT) ||
   1309 			    !is_recov || !firstcall) {
   1310 				mutex_exit(&mi->mi_lock);
   1311 				clfree4(client, ch, nfscl);
   1312 				if (cred_cloned)
   1313 					crfree(cr);
   1314 				return (EIO);
   1315 			}
   1316 			mutex_exit(&mi->mi_lock);
   1317 			timeo = (MIN(mi->mi_timeo, SHORTWAIT) * hz) / 10;
   1318 		}
   1319 
   1320 		firstcall = 0;
   1321 		TICK_TO_TIMEVAL(timeo, &wait);
   1322 
   1323 		/*
   1324 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
   1325 		 * and SIGTERM. (Preserving the existing masks).
   1326 		 * Mask out SIGINT if mount option nointr is specified.
   1327 		 */
   1328 		sigintr(&smask, (int)mi->mi_flags & MI4_INT);
   1329 		if (!(mi->mi_flags & MI4_INT))
   1330 			client->cl_nosignal = TRUE;
   1331 
   1332 		/*
   1333 		 * If there is a current signal, then don't bother
   1334 		 * even trying to send out the request because we
   1335 		 * won't be able to block waiting for the response.
   1336 		 * Simply assume RPC_INTR and get on with it.
   1337 		 */
   1338 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
   1339 			status = RPC_INTR;
   1340 		else {
   1341 			status = CLNT_CALL(client, which, xdrargs, argsp,
   1342 			    xdrres, resp, wait);
   1343 		}
   1344 
   1345 		if (!(mi->mi_flags & MI4_INT))
   1346 			client->cl_nosignal = FALSE;
   1347 		/*
   1348 		 * restore original signal mask
   1349 		 */
   1350 		sigunintr(&smask);
   1351 
   1352 		switch (status) {
   1353 		case RPC_SUCCESS:
   1354 			break;
   1355 
   1356 		case RPC_INTR:
   1357 			/*
   1358 			 * There is no way to recover from this error,
   1359 			 * even if mount option nointr is specified.
   1360 			 * SIGKILL, for example, cannot be blocked.
   1361 			 */
   1362 			rpcerr.re_status = RPC_INTR;
   1363 			rpcerr.re_errno = EINTR;
   1364 			break;
   1365 
   1366 		case RPC_UDERROR:
   1367 			/*
   1368 			 * If the NFS server is local (vold) and
   1369 			 * it goes away then we get RPC_UDERROR.
   1370 			 * This is a retryable error, so we would
   1371 			 * loop, so check to see if the specific
   1372 			 * error was ECONNRESET, indicating that
   1373 			 * target did not exist at all.  If so,
   1374 			 * return with RPC_PROGUNAVAIL and
   1375 			 * ECONNRESET to indicate why.
   1376 			 */
   1377 			CLNT_GETERR(client, &rpcerr);
   1378 			if (rpcerr.re_errno == ECONNRESET) {
   1379 				rpcerr.re_status = RPC_PROGUNAVAIL;
   1380 				rpcerr.re_errno = ECONNRESET;
   1381 				break;
   1382 			}
   1383 			/*FALLTHROUGH*/
   1384 
   1385 		default:		/* probably RPC_TIMEDOUT */
   1386 
   1387 			if (IS_UNRECOVERABLE_RPC(status))
   1388 				break;
   1389 
   1390 			/*
   1391 			 * increment server not responding count
   1392 			 */
   1393 			mutex_enter(&mi->mi_lock);
   1394 			mi->mi_noresponse++;
   1395 			mutex_exit(&mi->mi_lock);
   1396 #ifdef DEBUG
   1397 			nfscl->nfscl_stat.noresponse.value.ui64++;
   1398 #endif
   1399 			/*
   1400 			 * On zone shutdown, mark server dead and move on.
   1401 			 */
   1402 			if (zone_status_get(curproc->p_zone) >=
   1403 			    ZONE_IS_SHUTTING_DOWN) {
   1404 				mutex_enter(&mi->mi_lock);
   1405 				mi->mi_flags |= MI4_TIMEDOUT;
   1406 				mutex_exit(&mi->mi_lock);
   1407 				clfree4(client, ch, nfscl);
   1408 				if (cred_cloned)
   1409 					crfree(cr);
   1410 				return (EIO);
   1411 			}
   1412 
   1413 			/*
   1414 			 * NFS client failover support:
   1415 			 * return and let the caller take care of
   1416 			 * failover.  We only return for failover mounts
   1417 			 * because otherwise we want the "not responding"
   1418 			 * message, the timer updates, etc.
   1419 			 */
   1420 			if (mi->mi_vers == 4 && FAILOVER_MOUNT4(mi) &&
   1421 			    (error = try_failover(status)) != 0) {
   1422 				clfree4(client, ch, nfscl);
   1423 				if (cred_cloned)
   1424 					crfree(cr);
   1425 				*rpc_statusp = status;
   1426 				return (error);
   1427 			}
   1428 
   1429 			if (flags & RFSCALL_SOFT)
   1430 				break;
   1431 
   1432 			tryagain = TRUE;
   1433 
   1434 			/*
   1435 			 * The call is in progress (over COTS).
   1436 			 * Try the CLNT_CALL again, but don't
   1437 			 * print a noisy error message.
   1438 			 */
   1439 			if (status == RPC_INPROGRESS)
   1440 				break;
   1441 
   1442 			timeo = backoff(timeo);
   1443 			CLNT_GETERR(client, &rpcerr_tmp);
   1444 
   1445 			mutex_enter(&mi->mi_lock);
   1446 			if (!(mi->mi_flags & MI4_PRINTED)) {
   1447 				mi->mi_flags |= MI4_PRINTED;
   1448 				mutex_exit(&mi->mi_lock);
   1449 				if ((status == RPC_CANTSEND) &&
   1450 				    (rpcerr_tmp.re_errno == ENOBUFS))
   1451 					nfs4_queue_fact(RF_SENDQ_FULL, mi, 0,
   1452 					    0, 0, FALSE, NULL, 0, NULL);
   1453 				else
   1454 					nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi,
   1455 					    0, 0, 0, FALSE, NULL, 0, NULL);
   1456 			} else
   1457 				mutex_exit(&mi->mi_lock);
   1458 
   1459 			if (*doqueue && nfs_has_ctty()) {
   1460 				*doqueue = 0;
   1461 				if (!(mi->mi_flags & MI4_NOPRINT)) {
   1462 					if ((status == RPC_CANTSEND) &&
   1463 					    (rpcerr_tmp.re_errno == ENOBUFS))
   1464 						nfs4_queue_fact(RF_SENDQ_FULL,
   1465 						    mi, 0, 0, 0, FALSE, NULL,
   1466 						    0, NULL);
   1467 					else
   1468 						nfs4_queue_fact(
   1469 						    RF_SRV_NOT_RESPOND, mi, 0,
   1470 						    0, 0, FALSE, NULL, 0, NULL);
   1471 				}
   1472 			}
   1473 		}
   1474 	} while (tryagain);
   1475 
   1476 	DTRACE_PROBE2(nfs4__rfscall_debug, enum clnt_stat, status,
   1477 	    int, rpcerr.re_errno);
   1478 
   1479 	if (status != RPC_SUCCESS) {
   1480 		zoneid_t zoneid = mi->mi_zone->zone_id;
   1481 
   1482 		/*
   1483 		 * Let soft mounts use the timed out message.
   1484 		 */
   1485 		if (status == RPC_INPROGRESS)
   1486 			status = RPC_TIMEDOUT;
   1487 		nfscl->nfscl_stat.badcalls.value.ui64++;
   1488 		if (status != RPC_INTR) {
   1489 			mutex_enter(&mi->mi_lock);
   1490 			mi->mi_flags |= MI4_DOWN;
   1491 			mutex_exit(&mi->mi_lock);
   1492 			CLNT_GETERR(client, &rpcerr);
   1493 #ifdef DEBUG
   1494 			bufp = clnt_sperror(client, svp->sv_hostname);
   1495 			zprintf(zoneid, "NFS%d %s failed for %s\n",
   1496 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
   1497 			if (nfs_has_ctty()) {
   1498 				if (!(mi->mi_flags & MI4_NOPRINT)) {
   1499 					uprintf("NFS%d %s failed for %s\n",
   1500 					    mi->mi_vers, mi->mi_rfsnames[which],
   1501 					    bufp);
   1502 				}
   1503 			}
   1504 			kmem_free(bufp, MAXPATHLEN);
   1505 #else
   1506 			zprintf(zoneid,
   1507 			    "NFS %s failed for server %s: error %d (%s)\n",
   1508 			    mi->mi_rfsnames[which], svp->sv_hostname,
   1509 			    status, clnt_sperrno(status));
   1510 			if (nfs_has_ctty()) {
   1511 				if (!(mi->mi_flags & MI4_NOPRINT)) {
   1512 					uprintf(
   1513 				"NFS %s failed for server %s: error %d (%s)\n",
   1514 					    mi->mi_rfsnames[which],
   1515 					    svp->sv_hostname, status,
   1516 					    clnt_sperrno(status));
   1517 				}
   1518 			}
   1519 #endif
   1520 			/*
   1521 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
   1522 			 * re_errno is set appropriately depending on
   1523 			 * the authentication error
   1524 			 */
   1525 			if (status == RPC_VERSMISMATCH ||
   1526 			    status == RPC_PROGVERSMISMATCH)
   1527 				rpcerr.re_errno = EIO;
   1528 		}
   1529 	} else {
   1530 		/*
   1531 		 * Test the value of mi_down and mi_printed without
   1532 		 * holding the mi_lock mutex.  If they are both zero,
   1533 		 * then it is okay to skip the down and printed
   1534 		 * processing.  This saves on a mutex_enter and
   1535 		 * mutex_exit pair for a normal, successful RPC.
   1536 		 * This was just complete overhead.
   1537 		 */
   1538 		if (mi->mi_flags & (MI4_DOWN | MI4_PRINTED)) {
   1539 			mutex_enter(&mi->mi_lock);
   1540 			mi->mi_flags &= ~MI4_DOWN;
   1541 			if (mi->mi_flags & MI4_PRINTED) {
   1542 				mi->mi_flags &= ~MI4_PRINTED;
   1543 				mutex_exit(&mi->mi_lock);
   1544 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
   1545 					nfs4_queue_fact(RF_SRV_OK, mi, 0, 0,
   1546 					    0, FALSE, NULL, 0, NULL);
   1547 			} else
   1548 				mutex_exit(&mi->mi_lock);
   1549 		}
   1550 
   1551 		if (*doqueue == 0) {
   1552 			if (!(mi->mi_flags & MI4_NOPRINT) &&
   1553 			    !(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
   1554 				nfs4_queue_fact(RF_SRV_OK, mi, 0, 0, 0,
   1555 				    FALSE, NULL, 0, NULL);
   1556 
   1557 			*doqueue = 1;
   1558 		}
   1559 	}
   1560 
   1561 	clfree4(client, ch, nfscl);
   1562 	if (cred_cloned)
   1563 		crfree(cr);
   1564 
   1565 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
   1566 
   1567 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "nfs4_rfscall_end:errno %d",
   1568 	    rpcerr.re_errno);
   1569 
   1570 	*rpc_statusp = status;
   1571 	return (rpcerr.re_errno);
   1572 }
   1573 
   1574 /*
   1575  * rfs4call - general wrapper for RPC calls initiated by the client
   1576  */
   1577 void
   1578 rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp,
   1579     cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep)
   1580 {
   1581 	int i, error;
   1582 	enum clnt_stat rpc_status = NFS4_OK;
   1583 	int num_resops;
   1584 	struct nfs4_clnt *nfscl;
   1585 
   1586 	ASSERT(nfs_zone() == mi->mi_zone);
   1587 	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
   1588 	ASSERT(nfscl != NULL);
   1589 
   1590 	nfscl->nfscl_stat.calls.value.ui64++;
   1591 	mi->mi_reqs[NFSPROC4_COMPOUND].value.ui64++;
   1592 
   1593 	/* Set up the results struct for XDR usage */
   1594 	resp->argsp = argsp;
   1595 	resp->array = NULL;
   1596 	resp->status = 0;
   1597 	resp->decode_len = 0;
   1598 
   1599 	error = nfs4_rfscall(mi, NFSPROC4_COMPOUND,
   1600 	    xdr_COMPOUND4args_clnt, (caddr_t)argsp,
   1601 	    xdr_COMPOUND4res_clnt, (caddr_t)resp, cr,
   1602 	    doqueue, &rpc_status, flags, nfscl);
   1603 
   1604 	/* Return now if it was an RPC error */
   1605 	if (error) {
   1606 		ep->error = error;
   1607 		ep->stat = resp->status;
   1608 		ep->rpc_status = rpc_status;
   1609 		return;
   1610 	}
   1611 
   1612 	/* else we'll count the processed operations */
   1613 	num_resops = resp->decode_len;
   1614 	for (i = 0; i < num_resops; i++) {
   1615 		/*
   1616 		 * Count the individual operations
   1617 		 * processed by the server.
   1618 		 */
   1619 		if (resp->array[i].resop >= NFSPROC4_NULL &&
   1620 		    resp->array[i].resop <= OP_WRITE)
   1621 			mi->mi_reqs[resp->array[i].resop].value.ui64++;
   1622 	}
   1623 
   1624 	ep->error = 0;
   1625 	ep->stat = resp->status;
   1626 	ep->rpc_status = rpc_status;
   1627 }
   1628 
   1629 /*
   1630  * nfs4rename_update - updates stored state after a rename.  Currently this
   1631  * is the path of the object and anything under it, and the filehandle of
   1632  * the renamed object.
   1633  */
   1634 void
   1635 nfs4rename_update(vnode_t *renvp, vnode_t *ndvp, nfs_fh4 *nfh4p, char *nnm)
   1636 {
   1637 	sfh4_update(VTOR4(renvp)->r_fh, nfh4p);
   1638 	fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, nnm);
   1639 }
   1640 
   1641 /*
   1642  * Routine to look up the filehandle for the given path and rootvp.
   1643  *
   1644  * Return values:
   1645  * - success: returns zero and *statp is set to NFS4_OK, and *fhp is
   1646  *   updated.
   1647  * - error: return value (errno value) and/or *statp is set appropriately.
   1648  */
   1649 #define	RML_ORDINARY	1
   1650 #define	RML_NAMED_ATTR	2
   1651 #define	RML_ATTRDIR	3
   1652 
   1653 static void
   1654 remap_lookup(nfs4_fname_t *fname, vnode_t *rootvp,
   1655     int filetype, cred_t *cr,
   1656     nfs_fh4 *fhp, nfs4_ga_res_t *garp,		/* fh, attrs for object */
   1657     nfs_fh4 *pfhp, nfs4_ga_res_t *pgarp,	/* fh, attrs for parent */
   1658     nfs4_error_t *ep)
   1659 {
   1660 	COMPOUND4args_clnt args;
   1661 	COMPOUND4res_clnt res;
   1662 	nfs_argop4 *argop;
   1663 	nfs_resop4 *resop;
   1664 	int num_argops;
   1665 	lookup4_param_t lookuparg;
   1666 	nfs_fh4 *tmpfhp;
   1667 	int doqueue = 1;
   1668 	char *path;
   1669 	mntinfo4_t *mi;
   1670 
   1671 	ASSERT(fname != NULL);
   1672 	ASSERT(rootvp->v_type == VDIR);
   1673 
   1674 	mi = VTOMI4(rootvp);
   1675 	path = fn_path(fname);
   1676 	switch (filetype) {
   1677 	case RML_NAMED_ATTR:
   1678 		lookuparg.l4_getattrs = LKP4_LAST_NAMED_ATTR;
   1679 		args.ctag = TAG_REMAP_LOOKUP_NA;
   1680 		break;
   1681 	case RML_ATTRDIR:
   1682 		lookuparg.l4_getattrs = LKP4_LAST_ATTRDIR;
   1683 		args.ctag = TAG_REMAP_LOOKUP_AD;
   1684 		break;
   1685 	case RML_ORDINARY:
   1686 		lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
   1687 		args.ctag = TAG_REMAP_LOOKUP;
   1688 		break;
   1689 	default:
   1690 		ep->error = EINVAL;
   1691 		return;
   1692 	}
   1693 	lookuparg.argsp = &args;
   1694 	lookuparg.resp = &res;
   1695 	lookuparg.header_len = 1;	/* Putfh */
   1696 	lookuparg.trailer_len = 0;
   1697 	lookuparg.ga_bits = NFS4_VATTR_MASK;
   1698 	lookuparg.mi = VTOMI4(rootvp);
   1699 
   1700 	(void) nfs4lookup_setup(path, &lookuparg, 1);
   1701 
   1702 	/* 0: putfh directory */
   1703 	argop = args.array;
   1704 	argop[0].argop = OP_CPUTFH;
   1705 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(rootvp)->r_fh;
   1706 
   1707 	num_argops = args.array_len;
   1708 
   1709 	rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
   1710 
   1711 	if (ep->error || res.status != NFS4_OK)
   1712 		goto exit;
   1713 
   1714 	/* get the object filehandle */
   1715 	resop = &res.array[res.array_len - 2];
   1716 	if (resop->resop != OP_GETFH) {
   1717 		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
   1718 		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1719 		ep->stat = NFS4ERR_SERVERFAULT;
   1720 		goto exit;
   1721 	}
   1722 	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
   1723 	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
   1724 		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
   1725 		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
   1726 		    TAG_NONE, 0, 0);
   1727 		ep->stat = NFS4ERR_SERVERFAULT;
   1728 		goto exit;
   1729 	}
   1730 	fhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
   1731 	nfs_fh4_copy(tmpfhp, fhp);
   1732 
   1733 	/* get the object attributes */
   1734 	resop = &res.array[res.array_len - 1];
   1735 	if (garp && resop->resop == OP_GETATTR)
   1736 		*garp = resop->nfs_resop4_u.opgetattr.ga_res;
   1737 
   1738 	/* See if there are enough fields in the response for parent info */
   1739 	if ((int)res.array_len - 5 <= 0)
   1740 		goto exit;
   1741 
   1742 	/* get the parent filehandle */
   1743 	resop = &res.array[res.array_len - 5];
   1744 	if (resop->resop != OP_GETFH) {
   1745 		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
   1746 		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1747 		ep->stat = NFS4ERR_SERVERFAULT;
   1748 		goto exit;
   1749 	}
   1750 	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
   1751 	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
   1752 		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
   1753 		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
   1754 		    TAG_NONE, 0, 0);
   1755 		ep->stat = NFS4ERR_SERVERFAULT;
   1756 		goto exit;
   1757 	}
   1758 	pfhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
   1759 	nfs_fh4_copy(tmpfhp, pfhp);
   1760 
   1761 	/* get the parent attributes */
   1762 	resop = &res.array[res.array_len - 4];
   1763 	if (pgarp && resop->resop == OP_GETATTR)
   1764 		*pgarp = resop->nfs_resop4_u.opgetattr.ga_res;
   1765 
   1766 exit:
   1767 	/*
   1768 	 * It is too hard to remember where all the OP_LOOKUPs are
   1769 	 */
   1770 	nfs4args_lookup_free(argop, num_argops);
   1771 	kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
   1772 
   1773 	if (!ep->error)
   1774 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1775 	kmem_free(path, strlen(path)+1);
   1776 }
   1777 
   1778 /*
   1779  * NFS client failover / volatile filehandle support
   1780  *
   1781  * Recover the filehandle for the given rnode.
   1782  *
   1783  * Errors are returned via the nfs4_error_t parameter.
   1784  */
   1785 
   1786 void
   1787 nfs4_remap_file(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
   1788 {
   1789 	int is_stub;
   1790 	rnode4_t *rp = VTOR4(vp);
   1791 	vnode_t *rootvp = NULL;
   1792 	vnode_t *dvp = NULL;
   1793 	cred_t *cr, *cred_otw;
   1794 	nfs4_ga_res_t gar, pgar;
   1795 	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
   1796 	int filetype = RML_ORDINARY;
   1797 	nfs4_recov_state_t recov = {NULL, 0, 0};
   1798 	int badfhcount = 0;
   1799 	nfs4_open_stream_t *osp = NULL;
   1800 	bool_t first_time = TRUE;	/* first time getting OTW cred */
   1801 	bool_t last_time = FALSE;	/* last time getting OTW cred */
   1802 
   1803 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1804 	    "nfs4_remap_file: remapping %s", rnode4info(rp)));
   1805 	ASSERT(nfs4_consistent_type(vp));
   1806 
   1807 	if (vp->v_flag & VROOT) {
   1808 		nfs4_remap_root(mi, ep, flags);
   1809 		return;
   1810 	}
   1811 
   1812 	/*
   1813 	 * Given the root fh, use the path stored in
   1814 	 * the rnode to find the fh for the new server.
   1815 	 */
   1816 	ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
   1817 	if (ep->error != 0)
   1818 		return;
   1819 
   1820 	cr = curthread->t_cred;
   1821 	ASSERT(cr != NULL);
   1822 get_remap_cred:
   1823 	/*
   1824 	 * Releases the osp, if it is provided.
   1825 	 * Puts a hold on the cred_otw and the new osp (if found).
   1826 	 */
   1827 	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
   1828 	    &first_time, &last_time);
   1829 	ASSERT(cred_otw != NULL);
   1830 
   1831 	if (rp->r_flags & R4ISXATTR) {
   1832 		filetype = RML_NAMED_ATTR;
   1833 		(void) vtodv(vp, &dvp, cred_otw, FALSE);
   1834 	}
   1835 
   1836 	if (vp->v_flag & V_XATTRDIR) {
   1837 		filetype = RML_ATTRDIR;
   1838 	}
   1839 
   1840 	if (filetype == RML_ORDINARY && rootvp->v_type == VREG) {
   1841 		/* file mount, doesn't need a remap */
   1842 		goto done;
   1843 	}
   1844 
   1845 again:
   1846 	remap_lookup(rp->r_svnode.sv_name, rootvp, filetype, cred_otw,
   1847 	    &newfh, &gar, &newpfh, &pgar, ep);
   1848 
   1849 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1850 	    "nfs4_remap_file: remap_lookup returned %d/%d",
   1851 	    ep->error, ep->stat));
   1852 
   1853 	if (last_time == FALSE && ep->error == EACCES) {
   1854 		crfree(cred_otw);
   1855 		if (dvp != NULL)
   1856 			VN_RELE(dvp);
   1857 		goto get_remap_cred;
   1858 	}
   1859 	if (ep->error != 0)
   1860 		goto done;
   1861 
   1862 	switch (ep->stat) {
   1863 	case NFS4_OK:
   1864 		badfhcount = 0;
   1865 		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
   1866 			mutex_enter(&rp->r_statelock);
   1867 			rp->r_delay_interval = 0;
   1868 			mutex_exit(&rp->r_statelock);
   1869 			uprintf("NFS File Available..\n");
   1870 		}
   1871 		break;
   1872 	case NFS4ERR_FHEXPIRED:
   1873 	case NFS4ERR_BADHANDLE:
   1874 		/*
   1875 		 * If we ran into filehandle problems, we should try to
   1876 		 * remap the root vnode first and hope life gets better.
   1877 		 * But we need to avoid loops.
   1878 		 */
   1879 		if (badfhcount++ > 0)
   1880 			goto done;
   1881 		if (newfh.nfs_fh4_len != 0) {
   1882 			kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
   1883 			newfh.nfs_fh4_len = 0;
   1884 		}
   1885 		if (newpfh.nfs_fh4_len != 0) {
   1886 			kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
   1887 			newpfh.nfs_fh4_len = 0;
   1888 		}
   1889 		/* relative path - remap rootvp then retry */
   1890 		VN_RELE(rootvp);
   1891 		rootvp = NULL;
   1892 		nfs4_remap_root(mi, ep, flags);
   1893 		if (ep->error != 0 || ep->stat != NFS4_OK)
   1894 			goto done;
   1895 		ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
   1896 		if (ep->error != 0)
   1897 			goto done;
   1898 		goto again;
   1899 	case NFS4ERR_DELAY:
   1900 		badfhcount = 0;
   1901 		nfs4_set_delay_wait(vp);
   1902 		ep->error = nfs4_wait_for_delay(vp, &recov);
   1903 		if (ep->error != 0)
   1904 			goto done;
   1905 		goto again;
   1906 	case NFS4ERR_ACCESS:
   1907 		/* get new cred, try again */
   1908 		if (last_time == TRUE)
   1909 			goto done;
   1910 		if (dvp != NULL)
   1911 			VN_RELE(dvp);
   1912 		crfree(cred_otw);
   1913 		goto get_remap_cred;
   1914 	default:
   1915 		goto done;
   1916 	}
   1917 
   1918 	/*
   1919 	 * Check on the new and old rnodes before updating;
   1920 	 * if the vnode type or size changes, issue a warning
   1921 	 * and mark the file dead.
   1922 	 */
   1923 	mutex_enter(&rp->r_statelock);
   1924 	if (flags & NFS4_REMAP_CKATTRS) {
   1925 		if (vp->v_type != gar.n4g_va.va_type ||
   1926 		    (vp->v_type != VDIR &&
   1927 		    rp->r_size != gar.n4g_va.va_size)) {
   1928 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1929 			    "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
   1930 			    (int)rp->r_size, (int)gar.n4g_va.va_size,
   1931 			    vp->v_type, gar.n4g_va.va_type));
   1932 			mutex_exit(&rp->r_statelock);
   1933 			nfs4_queue_event(RE_FILE_DIFF, mi,
   1934 			    rp->r_server->sv_hostname, 0, vp, NULL, 0, NULL, 0,
   1935 			    TAG_NONE, TAG_NONE, 0, 0);
   1936 			nfs4_fail_recov(vp, NULL, 0, NFS4_OK);
   1937 			goto done;
   1938 		}
   1939 	}
   1940 	ASSERT(gar.n4g_va.va_type != VNON);
   1941 	rp->r_server = mi->mi_curr_serv;
   1942 
   1943 	/*
   1944 	 * Turn this object into a "stub" object if we
   1945 	 * crossed an underlying server fs boundary.
   1946 	 *
   1947 	 * This stub will be for a mirror-mount.
   1948 	 * A referral would look like a boundary crossing
   1949 	 * as well, but would not be the same type of object,
   1950 	 * so we would expect to mark the object dead.
   1951 	 *
   1952 	 * See comment in r4_do_attrcache() for more details.
   1953 	 */
   1954 	is_stub = 0;
   1955 	if (gar.n4g_fsid_valid) {
   1956 		(void) nfs_rw_enter_sig(&rp->r_server->sv_lock, RW_READER, 0);
   1957 		rp->r_srv_fsid = gar.n4g_fsid;
   1958 		if (!FATTR4_FSID_EQ(&gar.n4g_fsid, &rp->r_server->sv_fsid))
   1959 			is_stub = 1;
   1960 		nfs_rw_exit(&rp->r_server->sv_lock);
   1961 #ifdef DEBUG
   1962 	} else {
   1963 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1964 		    "remap_file: fsid attr not provided by server.  rp=%p",
   1965 		    (void *)rp));
   1966 #endif
   1967 	}
   1968 	if (is_stub)
   1969 		r4_stub_mirrormount(rp);
   1970 	else
   1971 		r4_stub_none(rp);
   1972 	mutex_exit(&rp->r_statelock);
   1973 	nfs4_attrcache_noinval(vp, &gar, gethrtime()); /* force update */
   1974 	sfh4_update(rp->r_fh, &newfh);
   1975 	ASSERT(nfs4_consistent_type(vp));
   1976 
   1977 	/*
   1978 	 * If we got parent info, use it to update the parent
   1979 	 */
   1980 	if (newpfh.nfs_fh4_len != 0) {
   1981 		if (rp->r_svnode.sv_dfh != NULL)
   1982 			sfh4_update(rp->r_svnode.sv_dfh, &newpfh);
   1983 		if (dvp != NULL) {
   1984 			/* force update of attrs */
   1985 			nfs4_attrcache_noinval(dvp, &pgar, gethrtime());
   1986 		}
   1987 	}
   1988 done:
   1989 	if (newfh.nfs_fh4_len != 0)
   1990 		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
   1991 	if (newpfh.nfs_fh4_len != 0)
   1992 		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
   1993 	if (cred_otw != NULL)
   1994 		crfree(cred_otw);
   1995 	if (rootvp != NULL)
   1996 		VN_RELE(rootvp);
   1997 	if (dvp != NULL)
   1998 		VN_RELE(dvp);
   1999 	if (osp != NULL)
   2000 		open_stream_rele(osp, rp);
   2001 }
   2002 
   2003 /*
   2004  * Client-side failover support: remap the filehandle for vp if it appears
   2005  * necessary.  errors are returned via the nfs4_error_t parameter; though,
   2006  * if there is a problem, we will just try again later.
   2007  */
   2008 
   2009 void
   2010 nfs4_check_remap(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
   2011 {
   2012 	if (vp == NULL)
   2013 		return;
   2014 
   2015 	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY))
   2016 		return;
   2017 
   2018 	if (VTOR4(vp)->r_server == mi->mi_curr_serv)
   2019 		return;
   2020 
   2021 	nfs4_remap_file(mi, vp, flags, ep);
   2022 }
   2023 
   2024 /*
   2025  * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
   2026  *
   2027  * Our caller has a filehandle for ".." relative to a particular
   2028  * directory object.  We want to find or create a parent vnode
   2029  * with that filehandle and return it.  We can of course create
   2030  * a vnode from this filehandle, but we need to also make sure
   2031  * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
   2032  * that we have a parent FH for future reopens as well.  If
   2033  * we have a remap failure, we won't be able to reopen this
   2034  * file, but we won't treat that as fatal because a reopen
   2035  * is at least unlikely.  Someday nfs4_reopen() should look
   2036  * for a missing parent FH and try a remap to recover from it.
   2037  *
   2038  * need_start_op argument indicates whether this function should
   2039  * do a start_op before calling remap_lookup().  This should
   2040  * be FALSE, if you are the recovery thread or in an op; otherwise,
   2041  * set it to TRUE.
   2042  */
   2043 int
   2044 nfs4_make_dotdot(nfs4_sharedfh_t *fhp, hrtime_t t, vnode_t *dvp,
   2045     cred_t *cr, vnode_t **vpp, int need_start_op)
   2046 {
   2047 	mntinfo4_t *mi = VTOMI4(dvp);
   2048 	nfs4_fname_t *np = NULL, *pnp = NULL;
   2049 	vnode_t *vp = NULL, *rootvp = NULL;
   2050 	rnode4_t *rp;
   2051 	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
   2052 	nfs4_ga_res_t gar, pgar;
   2053 	vattr_t va, pva;
   2054 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   2055 	nfs4_sharedfh_t *sfh = NULL, *psfh = NULL;
   2056 	nfs4_recov_state_t recov_state;
   2057 
   2058 #ifdef DEBUG
   2059 	/*
   2060 	 * ensure need_start_op is correct
   2061 	 */
   2062 	{
   2063 		int no_need_start_op = (tsd_get(nfs4_tsd_key) ||
   2064 		    (curthread == mi->mi_recovthread));
   2065 		/* C needs a ^^ operator! */
   2066 		ASSERT(((need_start_op) && (!no_need_start_op)) ||
   2067 		    ((! need_start_op) && (no_need_start_op)));
   2068 	}
   2069 #endif
   2070 	ASSERT(VTOMI4(dvp)->mi_zone == nfs_zone());
   2071 
   2072 	NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE,
   2073 	    "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp,
   2074 	    rnode4info(VTOR4(dvp))));
   2075 
   2076 	/*
   2077 	 * rootvp might be needed eventually. Holding it now will
   2078 	 * ensure that r4find_unlocked() will find it, if ".." is the root.
   2079 	 */
   2080 	e.error = VFS_ROOT(mi->mi_vfsp, &rootvp);
   2081 	if (e.error != 0)
   2082 		goto out;
   2083 	rp = r4find_unlocked(fhp, mi->mi_vfsp);
   2084 	if (rp != NULL) {
   2085 		*vpp = RTOV4(rp);
   2086 		VN_RELE(rootvp);
   2087 		return (0);
   2088 	}
   2089 
   2090 	/*
   2091 	 * Since we don't have the rnode, we have to go over the wire.
   2092 	 * remap_lookup() can get all of the filehandles and attributes
   2093 	 * we need in one operation.
   2094 	 */
   2095 	np = fn_parent(VTOSV(dvp)->sv_name);
   2096 	/* if a parent was not found return an error */
   2097 	if (np == NULL) {
   2098 		e.error = ENOENT;
   2099 		goto out;
   2100 	}
   2101 
   2102 	recov_state.rs_flags = 0;
   2103 	recov_state.rs_num_retry_despite_err = 0;
   2104 recov_retry:
   2105 	if (need_start_op) {
   2106 		e.error = nfs4_start_fop(mi, rootvp, NULL, OH_LOOKUP,
   2107 		    &recov_state, NULL);
   2108 		if (e.error != 0) {
   2109 			goto out;
   2110 		}
   2111 	}
   2112 	va.va_type = VNON;
   2113 	pva.va_type = VNON;
   2114 	remap_lookup(np, rootvp, RML_ORDINARY, cr,
   2115 	    &newfh, &gar, &newpfh, &pgar, &e);
   2116 	if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
   2117 		if (need_start_op) {
   2118 			bool_t abort;
   2119 
   2120 			abort = nfs4_start_recovery(&e, mi,
   2121 			    rootvp, NULL, NULL, NULL, OP_LOOKUP, NULL, NULL,
   2122 			    NULL);
   2123 			if (abort) {
   2124 				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
   2125 				    &recov_state, FALSE);
   2126 				if (e.error == 0)
   2127 					e.error = EIO;
   2128 				goto out;
   2129 			}
   2130 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
   2131 			    &recov_state, TRUE);
   2132 			goto recov_retry;
   2133 		}
   2134 		if (e.error == 0)
   2135 			e.error = EIO;
   2136 		goto out;
   2137 	}
   2138 
   2139 	if (!e.error) {
   2140 		va = gar.n4g_va;
   2141 		pva = pgar.n4g_va;
   2142 	}
   2143 
   2144 	if ((e.error != 0) ||
   2145 	    (va.va_type != VDIR)) {
   2146 		if (need_start_op)
   2147 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
   2148 			    &recov_state, FALSE);
   2149 		if (e.error == 0)
   2150 			e.error = EIO;
   2151 		goto out;
   2152 	}
   2153 
   2154 	if (e.stat != NFS4_OK) {
   2155 		if (need_start_op)
   2156 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
   2157 			    &recov_state, FALSE);
   2158 		e.error = EIO;
   2159 		goto out;
   2160 	}
   2161 
   2162 	/*
   2163 	 * It is possible for remap_lookup() to return with no error,
   2164 	 * but without providing the parent filehandle and attrs.
   2165 	 */
   2166 	if (pva.va_type != VDIR) {
   2167 		/*
   2168 		 * Call remap_lookup() again, this time with the
   2169 		 * newpfh and pgar args in the first position.
   2170 		 */
   2171 		pnp = fn_parent(np);
   2172 		if (pnp != NULL) {
   2173 			remap_lookup(pnp, rootvp, RML_ORDINARY, cr,
   2174 			    &newpfh, &pgar, NULL, NULL, &e);
   2175 			if (nfs4_needs_recovery(&e, FALSE,
   2176 			    mi->mi_vfsp)) {
   2177 				if (need_start_op) {
   2178 					bool_t abort;
   2179 
   2180 					abort = nfs4_start_recovery(&e, mi,
   2181 					    rootvp, NULL, NULL, NULL,
   2182 					    OP_LOOKUP, NULL, NULL, NULL);
   2183 					if (abort) {
   2184 						nfs4_end_fop(mi, rootvp, NULL,
   2185 						    OH_LOOKUP, &recov_state,
   2186 						    FALSE);
   2187 						if (e.error == 0)
   2188 							e.error = EIO;
   2189 						goto out;
   2190 					}
   2191 					nfs4_end_fop(mi, rootvp, NULL,
   2192 					    OH_LOOKUP, &recov_state, TRUE);
   2193 					goto recov_retry;
   2194 				}
   2195 				if (e.error == 0)
   2196 					e.error = EIO;
   2197 				goto out;
   2198 			}
   2199 
   2200 			if (e.stat != NFS4_OK) {
   2201 				if (need_start_op)
   2202 					nfs4_end_fop(mi, rootvp, NULL,
   2203 					    OH_LOOKUP, &recov_state, FALSE);
   2204 				e.error = EIO;
   2205 				goto out;
   2206 			}
   2207 		}
   2208 		if ((pnp == NULL) ||
   2209 		    (e.error != 0) ||
   2210 		    (pva.va_type == VNON)) {
   2211 			if (need_start_op)
   2212 				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
   2213 				    &recov_state, FALSE);
   2214 			if (e.error == 0)
   2215 				e.error = EIO;
   2216 			goto out;
   2217 		}
   2218 	}
   2219 	ASSERT(newpfh.nfs_fh4_len != 0);
   2220 	if (need_start_op)
   2221 		nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP, &recov_state, FALSE);
   2222 	psfh = sfh4_get(&newpfh, mi);
   2223 
   2224 	sfh = sfh4_get(&newfh, mi);
   2225 	vp = makenfs4node_by_fh(sfh, psfh, &np, &gar, mi, cr, t);
   2226 
   2227 out:
   2228 	if (np != NULL)
   2229 		fn_rele(&np);
   2230 	if (pnp != NULL)
   2231 		fn_rele(&pnp);
   2232 	if (newfh.nfs_fh4_len != 0)
   2233 		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
   2234 	if (newpfh.nfs_fh4_len != 0)
   2235 		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
   2236 	if (sfh != NULL)
   2237 		sfh4_rele(&sfh);
   2238 	if (psfh != NULL)
   2239 		sfh4_rele(&psfh);
   2240 	if (rootvp != NULL)
   2241 		VN_RELE(rootvp);
   2242 	*vpp = vp;
   2243 	return (e.error);
   2244 }
   2245 
#ifdef DEBUG
/*
 * DEBUG-only counter, initially zero.  Nothing in this part of the file
 * reads or updates it; presumably it tracks memory consumed by rnode
 * path strings -- TODO confirm against its users.
 */
size_t r_path_memuse = 0;
#endif
   2249 
   2250 /*
   2251  * NFS client failover support
   2252  *
   2253  * sv4_free() frees the malloc'd portion of a "servinfo_t".
   2254  */
   2255 void
   2256 sv4_free(servinfo4_t *svp)
   2257 {
   2258 	servinfo4_t *next;
   2259 	struct knetconfig *knconf;
   2260 
   2261 	while (svp != NULL) {
   2262 		next = svp->sv_next;
   2263 		if (svp->sv_dhsec)
   2264 			sec_clnt_freeinfo(svp->sv_dhsec);
   2265 		if (svp->sv_secdata)
   2266 			sec_clnt_freeinfo(svp->sv_secdata);
   2267 		if (svp->sv_save_secinfo &&
   2268 		    svp->sv_save_secinfo != svp->sv_secinfo)
   2269 			secinfo_free(svp->sv_save_secinfo);
   2270 		if (svp->sv_secinfo)
   2271 			secinfo_free(svp->sv_secinfo);
   2272 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
   2273 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
   2274 		knconf = svp->sv_knconf;
   2275 		if (knconf != NULL) {
   2276 			if (knconf->knc_protofmly != NULL)
   2277 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
   2278 			if (knconf->knc_proto != NULL)
   2279 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
   2280 			kmem_free(knconf, sizeof (*knconf));
   2281 		}
   2282 		knconf = svp->sv_origknconf;
   2283 		if (knconf != NULL) {
   2284 			if (knconf->knc_protofmly != NULL)
   2285 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
   2286 			if (knconf->knc_proto != NULL)
   2287 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
   2288 			kmem_free(knconf, sizeof (*knconf));
   2289 		}
   2290 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
   2291 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
   2292 		if (svp->sv_path != NULL) {
   2293 			kmem_free(svp->sv_path, svp->sv_pathlen);
   2294 		}
   2295 		nfs_rw_destroy(&svp->sv_lock);
   2296 		kmem_free(svp, sizeof (*svp));
   2297 		svp = next;
   2298 	}
   2299 }
   2300 
   2301 void
   2302 nfs4_printfhandle(nfs4_fhandle_t *fhp)
   2303 {
   2304 	int *ip;
   2305 	char *buf;
   2306 	size_t bufsize;
   2307 	char *cp;
   2308 
   2309 	/*
   2310 	 * 13 == "(file handle:"
   2311 	 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
   2312 	 *	1 == ' '
   2313 	 *	8 == maximum strlen of "%x"
   2314 	 * 3 == ")\n\0"
   2315 	 */
   2316 	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
   2317 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
   2318 	if (buf == NULL)
   2319 		return;
   2320 
   2321 	cp = buf;
   2322 	(void) strcpy(cp, "(file handle:");
   2323 	while (*cp != '\0')
   2324 		cp++;
   2325 	for (ip = (int *)fhp->fh_buf;
   2326 	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
   2327 	    ip++) {
   2328 		(void) sprintf(cp, " %x", *ip);
   2329 		while (*cp != '\0')
   2330 			cp++;
   2331 	}
   2332 	(void) strcpy(cp, ")\n");
   2333 
   2334 	zcmn_err(getzoneid(), CE_CONT, "%s", buf);
   2335 
   2336 	kmem_free(buf, bufsize);
   2337 }
   2338 
   2339 /*
   2340  * The NFSv4 readdir cache subsystem.
   2341  *
   2342  * We provide a set of interfaces to allow the rest of the system to utilize
   2343  * a caching mechanism while encapsulating the details of the actual
   2344  * implementation.  This should allow for better maintainability and
   2345  * extensibility by consolidating the implementation details in one location.
   2346  */
   2347 
   2348 /*
   2349  * Comparator used by AVL routines.
   2350  */
   2351 static int
   2352 rddir4_cache_compar(const void *x, const void *y)
   2353 {
   2354 	rddir4_cache_impl *ai = (rddir4_cache_impl *)x;
   2355 	rddir4_cache_impl *bi = (rddir4_cache_impl *)y;
   2356 	rddir4_cache *a = &ai->rc;
   2357 	rddir4_cache *b = &bi->rc;
   2358 
   2359 	if (a->nfs4_cookie == b->nfs4_cookie) {
   2360 		if (a->buflen == b->buflen)
   2361 			return (0);
   2362 		if (a->buflen < b->buflen)
   2363 			return (-1);
   2364 		return (1);
   2365 	}
   2366 
   2367 	if (a->nfs4_cookie < b->nfs4_cookie)
   2368 			return (-1);
   2369 
   2370 	return (1);
   2371 }
   2372 
   2373 /*
   2374  * Allocate an opaque handle for the readdir cache.
   2375  */
   2376 void
   2377 rddir4_cache_create(rnode4_t *rp)
   2378 {
   2379 	ASSERT(rp->r_dir == NULL);
   2380 
   2381 	rp->r_dir = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
   2382 
   2383 	avl_create(rp->r_dir, rddir4_cache_compar, sizeof (rddir4_cache_impl),
   2384 	    offsetof(rddir4_cache_impl, tree));
   2385 }
   2386 
   2387 /*
   2388  *  Purge the cache of all cached readdir responses.
   2389  */
   2390 void
   2391 rddir4_cache_purge(rnode4_t *rp)
   2392 {
   2393 	rddir4_cache_impl	*rdip;
   2394 	rddir4_cache_impl	*nrdip;
   2395 
   2396 	ASSERT(MUTEX_HELD(&rp->r_statelock));
   2397 
   2398 	if (rp->r_dir == NULL)
   2399 		return;
   2400 
   2401 	rdip = avl_first(rp->r_dir);
   2402 
   2403 	while (rdip != NULL) {
   2404 		nrdip = AVL_NEXT(rp->r_dir, rdip);
   2405 		avl_remove(rp->r_dir, rdip);
   2406 		rdip->rc.flags &= ~RDDIRCACHED;
   2407 		rddir4_cache_rele(rp, &rdip->rc);
   2408 		rdip = nrdip;
   2409 	}
   2410 	ASSERT(avl_numnodes(rp->r_dir) == 0);
   2411 }
   2412 
   2413 /*
   2414  * Destroy the readdir cache.
   2415  */
   2416 void
   2417 rddir4_cache_destroy(rnode4_t *rp)
   2418 {
   2419 	ASSERT(MUTEX_HELD(&rp->r_statelock));
   2420 	if (rp->r_dir == NULL)
   2421 		return;
   2422 
   2423 	rddir4_cache_purge(rp);
   2424 	avl_destroy(rp->r_dir);
   2425 	kmem_free(rp->r_dir, sizeof (avl_tree_t));
   2426 	rp->r_dir = NULL;
   2427 }
   2428 
   2429 /*
   2430  * Locate a readdir response from the readdir cache.
   2431  *
   2432  * Return values:
   2433  *
   2434  * NULL - If there is an unrecoverable situation like the operation may have
   2435  *	  been interrupted.
   2436  *
   2437  * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
   2438  *		    The flags are set approprately, such that the caller knows
   2439  *		    what state the entry is in.
   2440  */
   2441 rddir4_cache *
   2442 rddir4_cache_lookup(rnode4_t *rp, offset_t cookie, int count)
   2443 {
   2444 	rddir4_cache_impl	*rdip = NULL;
   2445 	rddir4_cache_impl	srdip;
   2446 	rddir4_cache		*srdc;
   2447 	rddir4_cache		*rdc = NULL;
   2448 	rddir4_cache		*nrdc = NULL;
   2449 	avl_index_t		where;
   2450 
   2451 top:
   2452 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
   2453 	ASSERT(MUTEX_HELD(&rp->r_statelock));
   2454 	/*
   2455 	 * Check to see if the readdir cache has been disabled.  If so, then
   2456 	 * simply allocate an rddir4_cache entry and return it, since caching
   2457 	 * operations do not apply.
   2458 	 */
   2459 	if (rp->r_dir == NULL) {
   2460 		if (nrdc == NULL) {
   2461 			/*
   2462 			 * Drop the lock because we are doing a sleeping
   2463 			 * allocation.
   2464 			 */
   2465 			mutex_exit(&rp->r_statelock);
   2466 			rdc = rddir4_cache_alloc(KM_SLEEP);
   2467 			rdc->nfs4_cookie = cookie;
   2468 			rdc->buflen = count;
   2469 			mutex_enter(&rp->r_statelock);
   2470 			return (rdc);
   2471 		}
   2472 		return (nrdc);
   2473 	}
   2474 
   2475 	srdc = &srdip.rc;
   2476 	srdc->nfs4_cookie = cookie;
   2477 	srdc->buflen = count;
   2478 
   2479 	rdip = avl_find(rp->r_dir, &srdip, &where);
   2480 
   2481 	/*
   2482 	 * If we didn't find an entry then create one and insert it
   2483 	 * into the cache.
   2484 	 */
   2485 	if (rdip == NULL) {
   2486 		/*
   2487 		 * Check for the case where we have made a second pass through
   2488 		 * the cache due to a lockless allocation.  If we find that no
   2489 		 * thread has already inserted this entry, do the insert now
   2490 		 * and return.
   2491 		 */
   2492 		if (nrdc != NULL) {
   2493 			avl_insert(rp->r_dir, nrdc->data, where);
   2494 			nrdc->flags |= RDDIRCACHED;
   2495 			rddir4_cache_hold(nrdc);
   2496 			return (nrdc);
   2497 		}
   2498 
   2499 #ifdef DEBUG
   2500 		nfs4_readdir_cache_misses++;
   2501 #endif
   2502 		/*
   2503 		 * First, try to allocate an entry without sleeping.  If that
   2504 		 * fails then drop the lock and do a sleeping allocation.
   2505 		 */
   2506 		nrdc = rddir4_cache_alloc(KM_NOSLEEP);
   2507 		if (nrdc != NULL) {
   2508 			nrdc->nfs4_cookie = cookie;
   2509 			nrdc->buflen = count;
   2510 			avl_insert(rp->r_dir, nrdc->data, where);
   2511 			nrdc->flags |= RDDIRCACHED;
   2512 			rddir4_cache_hold(nrdc);
   2513 			return (nrdc);
   2514 		}
   2515 
   2516 		/*
   2517 		 * Drop the lock and do a sleeping allocation.	We incur
   2518 		 * additional overhead by having to search the cache again,
   2519 		 * but this case should be rare.
   2520 		 */
   2521 		mutex_exit(&rp->r_statelock);
   2522 		nrdc = rddir4_cache_alloc(KM_SLEEP);
   2523 		nrdc->nfs4_cookie = cookie;
   2524 		nrdc->buflen = count;
   2525 		mutex_enter(&rp->r_statelock);
   2526 		/*
   2527 		 * We need to take another pass through the cache
   2528 		 * since we dropped our lock to perform the alloc.
   2529 		 * Another thread may have come by and inserted the
   2530 		 * entry we are interested in.
   2531 		 */
   2532 		goto top;
   2533 	}
   2534 
   2535 	/*
   2536 	 * Check to see if we need to free our entry.  This can happen if
   2537 	 * another thread came along beat us to the insert.  We can
   2538 	 * safely call rddir4_cache_free directly because no other thread
   2539 	 * would have a reference to this entry.
   2540 	 */
   2541 	if (nrdc != NULL)
   2542 		rddir4_cache_free((rddir4_cache_impl *)nrdc->data);
   2543 
   2544 #ifdef DEBUG
   2545 	nfs4_readdir_cache_hits++;
   2546 #endif
   2547 	/*
   2548 	 * Found something.  Make sure it's ready to return.
   2549 	 */
   2550 	rdc = &rdip->rc;
   2551 	rddir4_cache_hold(rdc);
   2552 	/*
   2553 	 * If the cache entry is in the process of being filled in, wait
   2554 	 * until this completes.  The RDDIRWAIT bit is set to indicate that
   2555 	 * someone is waiting and when the thread currently filling the entry
   2556 	 * is done, it should do a cv_broadcast to wakeup all of the threads
   2557 	 * waiting for it to finish. If the thread wakes up to find that
   2558 	 * someone new is now trying to complete the the entry, go back
   2559 	 * to sleep.
   2560 	 */
   2561 	while (rdc->flags & RDDIR) {
   2562 		/*
   2563 		 * The entry is not complete.
   2564 		 */
   2565 		nfs_rw_exit(&rp->r_rwlock);
   2566 		rdc->flags |= RDDIRWAIT;
   2567 #ifdef DEBUG
   2568 		nfs4_readdir_cache_waits++;
   2569 #endif
   2570 		while (rdc->flags & RDDIRWAIT) {
   2571 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
   2572 				/*
   2573 				 * We got interrupted, probably the user
   2574 				 * typed ^C or an alarm fired.  We free the
   2575 				 * new entry if we allocated one.
   2576 				 */
   2577 				rddir4_cache_rele(rp, rdc);
   2578 				mutex_exit(&rp->r_statelock);
   2579 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
   2580 				    RW_READER, FALSE);
   2581 				mutex_enter(&rp->r_statelock);
   2582 				return (NULL);
   2583 			}
   2584 		}
   2585 		mutex_exit(&rp->r_statelock);
   2586 		(void) nfs_rw_enter_sig(&rp->r_rwlock,
   2587 		    RW_READER, FALSE);
   2588 		mutex_enter(&rp->r_statelock);
   2589 	}
   2590 
   2591 	/*
   2592 	 * The entry we were waiting on may have been purged from
   2593 	 * the cache and should no longer be used, release it and
   2594 	 * start over.
   2595 	 */
   2596 	if (!(rdc->flags & RDDIRCACHED)) {
   2597 		rddir4_cache_rele(rp, rdc);
   2598 		goto top;
   2599 	}
   2600 
   2601 	/*
   2602 	 * The entry is completed.  Return it.
   2603 	 */
   2604 	return (rdc);
   2605 }
   2606 
   2607 /*
   2608  * Allocate a cache element and return it.  Can return NULL if memory is
   2609  * low.
   2610  */
   2611 static rddir4_cache *
   2612 rddir4_cache_alloc(int flags)
   2613 {
   2614 	rddir4_cache_impl	*rdip = NULL;
   2615 	rddir4_cache		*rc = NULL;
   2616 
   2617 	rdip = kmem_alloc(sizeof (rddir4_cache_impl), flags);
   2618 
   2619 	if (rdip != NULL) {
   2620 		rc = &rdip->rc;
   2621 		rc->data = (void *)rdip;
   2622 		rc->nfs4_cookie = 0;
   2623 		rc->nfs4_ncookie = 0;
   2624 		rc->entries = NULL;
   2625 		rc->eof = 0;
   2626 		rc->entlen = 0;
   2627 		rc->buflen = 0;
   2628 		rc->actlen = 0;
   2629 		/*
   2630 		 * A readdir is required so set the flag.
   2631 		 */
   2632 		rc->flags = RDDIRREQ;
   2633 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
   2634 		rc->error = 0;
   2635 		mutex_init(&rdip->lock, NULL, MUTEX_DEFAULT, NULL);
   2636 		rdip->count = 1;
   2637 #ifdef DEBUG
   2638 		atomic_add_64(&clstat4_debug.dirent.value.ui64, 1);
   2639 #endif
   2640 	}
   2641 	return (rc);
   2642 }
   2643 
   2644 /*
   2645  * Increment the reference count to this cache element.
   2646  */
   2647 static void
   2648 rddir4_cache_hold(rddir4_cache *rc)
   2649 {
   2650 	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rc->data;
   2651 
   2652 	mutex_enter(&rdip->lock);
   2653 	rdip->count++;
   2654 	mutex_exit(&rdip->lock);
   2655 }
   2656 
   2657 /*
   2658  * Release a reference to this cache element.  If the count is zero then
   2659  * free the element.
   2660  */
   2661 void
   2662 rddir4_cache_rele(rnode4_t *rp, rddir4_cache *rdc)
   2663 {
   2664 	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rdc->data;
   2665 
   2666 	ASSERT(MUTEX_HELD(&rp->r_statelock));
   2667 
   2668 	/*
   2669 	 * Check to see if we have any waiters.  If so, we can wake them
   2670 	 * so that they can proceed.
   2671 	 */
   2672 	if (rdc->flags & RDDIRWAIT) {
   2673 		rdc->flags &= ~RDDIRWAIT;
   2674 		cv_broadcast(&rdc->cv);
   2675 	}
   2676 
   2677 	mutex_enter(&rdip->lock);
   2678 	ASSERT(rdip->count > 0);
   2679 	if (--rdip->count == 0) {
   2680 		mutex_exit(&rdip->lock);
   2681 		rddir4_cache_free(rdip);
   2682 	} else
   2683 		mutex_exit(&rdip->lock);
   2684 }
   2685 
   2686 /*
   2687  * Free a cache element.
   2688  */
   2689 static void
   2690 rddir4_cache_free(rddir4_cache_impl *rdip)
   2691 {
   2692 	rddir4_cache *rc = &rdip->rc;
   2693 
   2694 #ifdef DEBUG
   2695 	atomic_add_64(&clstat4_debug.dirent.value.ui64, -1);
   2696 #endif
   2697 	if (rc->entries != NULL)
   2698 		kmem_free(rc->entries, rc->buflen);
   2699 	cv_destroy(&rc->cv);
   2700 	mutex_destroy(&rdip->lock);
   2701 	kmem_free(rdip, sizeof (*rdip));
   2702 }
   2703 
   2704 /*
   2705  * Snapshot callback for nfs:0:nfs4_client as registered with the kstat
   2706  * framework.
   2707  */
   2708 static int
   2709 cl4_snapshot(kstat_t *ksp, void *buf, int rw)
   2710 {
   2711 	ksp->ks_snaptime = gethrtime();
   2712 	if (rw == KSTAT_WRITE) {
   2713 		bcopy(buf, ksp->ks_private, sizeof (clstat4_tmpl));
   2714 #ifdef DEBUG
   2715 		/*
   2716 		 * Currently only the global zone can write to kstats, but we
   2717 		 * add the check just for paranoia.
   2718 		 */
   2719 		if (INGLOBALZONE(curproc))
   2720 			bcopy((char *)buf + sizeof (clstat4_tmpl),
   2721 			    &clstat4_debug, sizeof (clstat4_debug));
   2722 #endif
   2723 	} else {
   2724 		bcopy(ksp->ks_private, buf, sizeof (clstat4_tmpl));
   2725 #ifdef DEBUG
   2726 		/*
   2727 		 * If we're displaying the "global" debug kstat values, we
   2728 		 * display them as-is to all zones since in fact they apply to
   2729 		 * the system as a whole.
   2730 		 */
   2731 		bcopy(&clstat4_debug, (char *)buf + sizeof (clstat4_tmpl),
   2732 		    sizeof (clstat4_debug));
   2733 #endif
   2734 	}
   2735 	return (0);
   2736 }
   2737 
   2738 
   2739 
   2740 /*
   2741  * Zone support
   2742  */
   2743 static void *
   2744 clinit4_zone(zoneid_t zoneid)
   2745 {
   2746 	kstat_t *nfs4_client_kstat;
   2747 	struct nfs4_clnt *nfscl;
   2748 	uint_t ndata;
   2749 
   2750 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
   2751 	mutex_init(&nfscl->nfscl_chtable4_lock, NULL, MUTEX_DEFAULT, NULL);
   2752 	nfscl->nfscl_chtable4 = NULL;
   2753 	nfscl->nfscl_zoneid = zoneid;
   2754 
   2755 	bcopy(&clstat4_tmpl, &nfscl->nfscl_stat, sizeof (clstat4_tmpl));
   2756 	ndata = sizeof (clstat4_tmpl) / sizeof (kstat_named_t);
   2757 #ifdef DEBUG
   2758 	ndata += sizeof (clstat4_debug) / sizeof (kstat_named_t);
   2759 #endif
   2760 	if ((nfs4_client_kstat = kstat_create_zone("nfs", 0, "nfs4_client",
   2761 	    "misc", KSTAT_TYPE_NAMED, ndata,
   2762 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
   2763 		nfs4_client_kstat->ks_private = &nfscl->nfscl_stat;
   2764 		nfs4_client_kstat->ks_snapshot = cl4_snapshot;
   2765 		kstat_install(nfs4_client_kstat);
   2766 	}
   2767 	mutex_enter(&nfs4_clnt_list_lock);
   2768 	list_insert_head(&nfs4_clnt_list, nfscl);
   2769 	mutex_exit(&nfs4_clnt_list_lock);
   2770 
   2771 	return (nfscl);
   2772 }
   2773 
   2774 /*ARGSUSED*/
   2775 static void
   2776 clfini4_zone(zoneid_t zoneid, void *arg)
   2777 {
   2778 	struct nfs4_clnt *nfscl = arg;
   2779 	chhead_t *chp, *next;
   2780 
   2781 	if (nfscl == NULL)
   2782 		return;
   2783 	mutex_enter(&nfs4_clnt_list_lock);
   2784 	list_remove(&nfs4_clnt_list, nfscl);
   2785 	mutex_exit(&nfs4_clnt_list_lock);
   2786 	clreclaim4_zone(nfscl, 0);
   2787 	for (chp = nfscl->nfscl_chtable4; chp != NULL; chp = next) {
   2788 		ASSERT(chp->ch_list == NULL);
   2789 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
   2790 		next = chp->ch_next;
   2791 		kmem_free(chp, sizeof (*chp));
   2792 	}
   2793 	kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid);
   2794 	mutex_destroy(&nfscl->nfscl_chtable4_lock);
   2795 	kmem_free(nfscl, sizeof (*nfscl));
   2796 }
   2797 
   2798 /*
   2799  * Called by endpnt_destructor to make sure the client handles are
   2800  * cleaned up before the RPC endpoints.  This becomes a no-op if
   2801  * clfini_zone (above) is called first.  This function is needed
   2802  * (rather than relying on clfini_zone to clean up) because the ZSD
   2803  * callbacks have no ordering mechanism, so we have no way to ensure
   2804  * that clfini_zone is called before endpnt_destructor.
   2805  */
   2806 void
   2807 clcleanup4_zone(zoneid_t zoneid)
   2808 {
   2809 	struct nfs4_clnt *nfscl;
   2810 
   2811 	mutex_enter(&nfs4_clnt_list_lock);
   2812 	nfscl = list_head(&nfs4_clnt_list);
   2813 	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl)) {
   2814 		if (nfscl->nfscl_zoneid == zoneid) {
   2815 			clreclaim4_zone(nfscl, 0);
   2816 			break;
   2817 		}
   2818 	}
   2819 	mutex_exit(&nfs4_clnt_list_lock);
   2820 }
   2821 
   2822 int
   2823 nfs4_subr_init(void)
   2824 {
   2825 	/*
   2826 	 * Allocate and initialize the client handle cache
   2827 	 */
   2828 	chtab4_cache = kmem_cache_create("client_handle4_cache",
   2829 	    sizeof (struct chtab), 0, NULL, NULL, clreclaim4, NULL,
   2830 	    NULL, 0);
   2831 
   2832 	/*
   2833 	 * Initialize the list of per-zone client handles (and associated data).
   2834 	 * This needs to be done before we call zone_key_create().
   2835 	 */
   2836 	list_create(&nfs4_clnt_list, sizeof (struct nfs4_clnt),
   2837 	    offsetof(struct nfs4_clnt, nfscl_node));
   2838 
   2839 	/*
   2840 	 * Initialize the zone_key for per-zone client handle lists.
   2841 	 */
   2842 	zone_key_create(&nfs4clnt_zone_key, clinit4_zone, NULL, clfini4_zone);
   2843 
   2844 	if (nfs4err_delay_time == 0)
   2845 		nfs4err_delay_time = NFS4ERR_DELAY_TIME;
   2846 
   2847 	return (0);
   2848 }
   2849 
   2850 int
   2851 nfs4_subr_fini(void)
   2852 {
   2853 	/*
   2854 	 * Deallocate the client handle cache
   2855 	 */
   2856 	kmem_cache_destroy(chtab4_cache);
   2857 
   2858 	/*
   2859 	 * Destroy the zone_key
   2860 	 */
   2861 	(void) zone_key_delete(nfs4clnt_zone_key);
   2862 
   2863 	return (0);
   2864 }
   2865 /*
   2866  * Set or Clear direct I/O flag
   2867  * VOP_RWLOCK() is held for write access to prevent a race condition
   2868  * which would occur if a process is in the middle of a write when
   2869  * directio flag gets set. It is possible that all pages may not get flushed.
   2870  *
   2871  * This is a copy of nfs_directio, changes here may need to be made
   2872  * there and vice versa.
   2873  */
   2874 
   2875 int
   2876 nfs4_directio(vnode_t *vp, int cmd, cred_t *cr)
   2877 {
   2878 	int	error = 0;
   2879 	rnode4_t *rp;
   2880 
   2881 	rp = VTOR4(vp);
   2882 
   2883 	if (cmd == DIRECTIO_ON) {
   2884 
   2885 		if (rp->r_flags & R4DIRECTIO)
   2886 			return (0);
   2887 
   2888 		/*
   2889 		 * Flush the page cache.
   2890 		 */
   2891 
   2892 		(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
   2893 
   2894 		if (rp->r_flags & R4DIRECTIO) {
   2895 			VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
   2896 			return (0);
   2897 		}
   2898 
   2899 		if (nfs4_has_pages(vp) &&
   2900 		    ((rp->r_flags & R4DIRTY) || rp->r_awcount > 0)) {
   2901 			error = VOP_PUTPAGE(vp, (offset_t)0, (uint_t)0,
   2902 			    B_INVAL, cr, NULL);
   2903 			if (error) {
   2904 				if (error == ENOSPC || error == EDQUOT) {
   2905 					mutex_enter(&rp->r_statelock);
   2906 					if (!rp->r_error)
   2907 						rp->r_error = error;
   2908 					mutex_exit(&rp->r_statelock);
   2909 				}
   2910 				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
   2911 				return (error);
   2912 			}
   2913 		}
   2914 
   2915 		mutex_enter(&rp->r_statelock);
   2916 		rp->r_flags |= R4DIRECTIO;
   2917 		mutex_exit(&rp->r_statelock);
   2918 		VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
   2919 		return (0);
   2920 	}
   2921 
   2922 	if (cmd == DIRECTIO_OFF) {
   2923 		mutex_enter(&rp->r_statelock);
   2924 		rp->r_flags &= ~R4DIRECTIO;	/* disable direct mode */
   2925 		mutex_exit(&rp->r_statelock);
   2926 		return (0);
   2927 	}
   2928 
   2929 	return (EINVAL);
   2930 }
   2931 
   2932 /*
   2933  * Return TRUE if the file has any pages.  Always go back to
   2934  * the master vnode to check v_pages since none of the shadows
   2935  * can have pages.
   2936  */
   2937 
   2938 bool_t
   2939 nfs4_has_pages(vnode_t *vp)
   2940 {
   2941 	rnode4_t *rp;
   2942 
   2943 	rp = VTOR4(vp);
   2944 	if (IS_SHADOW(vp, rp))
   2945 		vp = RTOV4(rp);	/* RTOV4 always gives the master */
   2946 
   2947 	return (vn_has_cached_data(vp));
   2948 }
   2949 
   2950 /*
   2951  * This table is used to determine whether the client should attempt
   2952  * failover based on the clnt_stat value returned by CLNT_CALL.  The
   2953  * clnt_stat is used as an index into the table.  If
   2954  * the error value that corresponds to the clnt_stat value in the
   2955  * table is non-zero, then that is the error to be returned AND
   2956  * that signals that failover should be attempted.
   2957  *
   2958  * Special note: If the RPC_ values change, then direct indexing of the
   2959  * table is no longer valid, but having the RPC_ values in the table
   2960  * allow the functions to detect the change and issue a warning.
   2961  * In this case, the code will always attempt failover as a defensive
   2962  * measure.
   2963  */
   2964 
   2965 static struct try_failover_tab {
   2966 	enum clnt_stat	cstat;
   2967 	int		error;
   2968 } try_failover_table [] = {
   2969 
   2970 	RPC_SUCCESS,		0,
   2971 	RPC_CANTENCODEARGS,	0,
   2972 	RPC_CANTDECODERES,	0,
   2973 	RPC_CANTSEND,		ECOMM,
   2974 	RPC_CANTRECV,		ECOMM,
   2975 	RPC_TIMEDOUT,		ETIMEDOUT,
   2976 	RPC_VERSMISMATCH,	0,
   2977 	RPC_AUTHERROR,		0,
   2978 	RPC_PROGUNAVAIL,	0,
   2979 	RPC_PROGVERSMISMATCH,	0,
   2980 	RPC_PROCUNAVAIL,	0,
   2981 	RPC_CANTDECODEARGS,	0,
   2982 	RPC_SYSTEMERROR,	ENOSR,
   2983 	RPC_UNKNOWNHOST,	EHOSTUNREACH,
   2984 	RPC_RPCBFAILURE,	ENETUNREACH,
   2985 	RPC_PROGNOTREGISTERED,	ECONNREFUSED,
   2986 	RPC_FAILED,		ETIMEDOUT,
   2987 	RPC_UNKNOWNPROTO,	EHOSTUNREACH,
   2988 	RPC_INTR,		0,
   2989 	RPC_UNKNOWNADDR,	EHOSTUNREACH,
   2990 	RPC_TLIERROR,		0,
   2991 	RPC_NOBROADCAST,	EHOSTUNREACH,
   2992 	RPC_N2AXLATEFAILURE,	ECONNREFUSED,
   2993 	RPC_UDERROR,		0,
   2994 	RPC_INPROGRESS,		0,
   2995 	RPC_STALERACHANDLE,	EINVAL,
   2996 	RPC_CANTCONNECT,	ECONNREFUSED,
   2997 	RPC_XPRTFAILED,		ECONNABORTED,
   2998 	RPC_CANTCREATESTREAM,	ECONNREFUSED,
   2999 	RPC_CANTSTORE,		ENOBUFS
   3000 };
   3001 
   3002 /*
   3003  * nfs4_try_failover - determine whether the client should
   3004  * attempt failover based on the values stored in the nfs4_error_t.
   3005  */
   3006 int
   3007 nfs4_try_failover(nfs4_error_t *ep)
   3008 {
   3009 	if (ep->error == ETIMEDOUT || ep->stat == NFS4ERR_RESOURCE)
   3010 		return (TRUE);
   3011 
   3012 	if (ep->error && ep->rpc_status != RPC_SUCCESS)
   3013 		return (try_failover(ep->rpc_status) != 0 ? TRUE : FALSE);
   3014 
   3015 	return (FALSE);
   3016 }
   3017 
   3018 /*
   3019  * try_failover - internal version of nfs4_try_failover, called
   3020  * only by rfscall and aclcall.  Determine if failover is warranted
   3021  * based on the clnt_stat and return the error number if it is.
   3022  */
   3023 static int
   3024 try_failover(enum clnt_stat rpc_status)
   3025 {
   3026 	int err = 0;
   3027 
   3028 	if (rpc_status == RPC_SUCCESS)
   3029 		return (0);
   3030 
   3031 #ifdef	DEBUG
   3032 	if (rpc_status != 0 && nfs4_try_failover_any) {
   3033 		err = ETIMEDOUT;
   3034 		goto done;
   3035 	}
   3036 #endif
   3037 	/*
   3038 	 * The rpc status is used as an index into the table.
   3039 	 * If the rpc status is outside of the range of the
   3040 	 * table or if the rpc error numbers have been changed
   3041 	 * since the table was constructed, then print a warning
   3042 	 * (DEBUG only) and try failover anyway.  Otherwise, just
   3043 	 * grab the resulting error number out of the table.
   3044 	 */
   3045 	if (rpc_status < RPC_SUCCESS || rpc_status >=
   3046 	    sizeof (try_failover_table)/sizeof (try_failover_table[0]) ||
   3047 	    try_failover_table[rpc_status].cstat != rpc_status) {
   3048 
   3049 		err = ETIMEDOUT;
   3050 #ifdef	DEBUG
   3051 		cmn_err(CE_NOTE, "try_failover: unexpected rpc error %d",
   3052 		    rpc_status);
   3053 #endif
   3054 	} else
   3055 		err = try_failover_table[rpc_status].error;
   3056 
   3057 done:
   3058 	if (rpc_status)
   3059 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   3060 		    "nfs4_try_failover: %strying failover on error %d",
   3061 		    err ? "" : "NOT ", rpc_status));
   3062 
   3063 	return (err);
   3064 }
   3065 
   3066 void
   3067 nfs4_error_zinit(nfs4_error_t *ep)
   3068 {
   3069 	ep->error = 0;
   3070 	ep->stat = NFS4_OK;
   3071 	ep->rpc_status = RPC_SUCCESS;
   3072 }
   3073 
   3074 void
   3075 nfs4_error_init(nfs4_error_t *ep, int error)
   3076 {
   3077 	ep->error = error;
   3078 	ep->stat = NFS4_OK;
   3079 	ep->rpc_status = RPC_SUCCESS;
   3080 }
   3081 
   3082 
   3083 #ifdef DEBUG
   3084 
   3085 /*
   3086  * Return a 16-bit hash for filehandle, stateid, clientid, owner.
   3087  * use the same algorithm as for NFS v3.
   3088  *
   3089  */
   3090 int
   3091 hash16(void *p, int len)
   3092 {
   3093 	int i, rem;
   3094 	uint_t *wp;
   3095 	uint_t key = 0;
   3096 
   3097 	/* protect against non word aligned */
   3098 	if ((rem = len & 3) != 0)
   3099 		len &= ~3;
   3100 
   3101 	for (i = 0, wp = (uint_t *)p; i < len; i += 4, wp++) {
   3102 		key ^= (*wp >> 16) ^ *wp;
   3103 	}
   3104 
   3105 	/* hash left-over bytes */
   3106 	for (i = 0; i < rem; i++)
   3107 		key ^= *((uchar_t *)p + i);
   3108 
   3109 	return (key & 0xffff);
   3110 }
   3111 
   3112 /*
   3113  * rnode4info - return filehandle and path information for an rnode.
   3114  * XXX MT issues: uses a single static buffer, no locking of path.
   3115  */
   3116 char *
   3117 rnode4info(rnode4_t *rp)
   3118 {
   3119 	static char buf[80];
   3120 	nfs4_fhandle_t fhandle;
   3121 	char *path;
   3122 	char *type;
   3123 
   3124 	if (rp == NULL)
   3125 		return ("null");
   3126 	if (rp->r_flags & R4ISXATTR)
   3127 		type = "attr";
   3128 	else if (RTOV4(rp)->v_flag & V_XATTRDIR)
   3129 		type = "attrdir";
   3130 	else if (RTOV4(rp)->v_flag & VROOT)
   3131 		type = "root";
   3132 	else if (RTOV4(rp)->v_type == VDIR)
   3133 		type = "dir";
   3134 	else if (RTOV4(rp)->v_type == VREG)
   3135 		type = "file";
   3136 	else
   3137 		type = "other";
   3138 	sfh4_copyval(rp->r_fh, &fhandle);
   3139 	path = fn_path(rp->r_svnode.sv_name);
   3140 	(void) snprintf(buf, 80, "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
   3141 	    (void *)rp, path, type, rp->r_flags,
   3142 	    hash16((void *)&fhandle.fh_buf, fhandle.fh_len));
   3143 	kmem_free(path, strlen(path)+1);
   3144 	return (buf);
   3145 }
   3146 #endif
   3147