Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
     28  *	All Rights Reserved
     29  */
     30 
     31 #include <sys/param.h>
     32 #include <sys/types.h>
     33 #include <sys/systm.h>
     34 #include <sys/cred.h>
     35 #include <sys/vfs.h>
     36 #include <sys/vfs_opreg.h>
     37 #include <sys/vnode.h>
     38 #include <sys/pathname.h>
     39 #include <sys/sysmacros.h>
     40 #include <sys/kmem.h>
     41 #include <sys/mkdev.h>
     42 #include <sys/mount.h>
     43 #include <sys/statvfs.h>
     44 #include <sys/errno.h>
     45 #include <sys/debug.h>
     46 #include <sys/cmn_err.h>
     47 #include <sys/utsname.h>
     48 #include <sys/bootconf.h>
     49 #include <sys/modctl.h>
     50 #include <sys/acl.h>
     51 #include <sys/flock.h>
     52 #include <sys/time.h>
     53 #include <sys/disp.h>
     54 #include <sys/policy.h>
     55 #include <sys/socket.h>
     56 #include <sys/netconfig.h>
     57 #include <sys/dnlc.h>
     58 #include <sys/list.h>
     59 #include <sys/mntent.h>
     60 #include <sys/tsol/label.h>
     61 
     62 #include <rpc/types.h>
     63 #include <rpc/auth.h>
     64 #include <rpc/rpcsec_gss.h>
     65 #include <rpc/clnt.h>
     66 
     67 #include <nfs/nfs.h>
     68 #include <nfs/nfs_clnt.h>
     69 #include <nfs/mount.h>
     70 #include <nfs/nfs_acl.h>
     71 
     72 #include <fs/fs_subr.h>
     73 
     74 #include <nfs/nfs4.h>
     75 #include <nfs/rnode4.h>
     76 #include <nfs/nfs4_clnt.h>
     77 #include <sys/fs/autofs.h>
     78 
     79 
     80 /*
     81  * Arguments passed to thread to free data structures from forced unmount.
     82  */
     83 
typedef struct {
	vfs_t	*fm_vfsp;	/* vfs handed to the free-mount thread */
	int	fm_flag;	/* flag word; presumably the unmount flags */
	cred_t	*fm_cr;		/* credentials handed to the thread */
} freemountargs_t;
     89 
/*
 * File-local helpers.  The first three take the same (vfs, flag, cred)
 * triple that freemountargs_t packages for nfs4_free_mount_thread().
 */
static void	async_free_mount(vfs_t *, int, cred_t *);
static void	nfs4_free_mount(vfs_t *, int, cred_t *);
static void	nfs4_free_mount_thread(freemountargs_t *);
static int nfs4_chkdup_servinfo4(servinfo4_t *, servinfo4_t *);
     94 
     95 /*
     96  * From rpcsec module (common/rpcsec).
     97  */
     98 extern int sec_clnt_loadinfo(struct sec_data *, struct sec_data **, model_t);
     99 extern void sec_clnt_freeinfo(struct sec_data *);
    100 
    101 /*
    102  * The order and contents of this structure must be kept in sync with that of
    103  * rfsreqcnt_v4_tmpl in nfs_stats.c
    104  */
    105 static char *rfsnames_v4[] = {
    106 	"null", "compound", "reserved",	"access", "close", "commit", "create",
    107 	"delegpurge", "delegreturn", "getattr",	"getfh", "link", "lock",
    108 	"lockt", "locku", "lookup", "lookupp", "nverify", "open", "openattr",
    109 	"open_confirm",	"open_downgrade", "putfh", "putpubfh", "putrootfh",
    110 	"read", "readdir", "readlink", "remove", "rename", "renew",
    111 	"restorefh", "savefh", "secinfo", "setattr", "setclientid",
    112 	"setclientid_confirm", "verify", "write"
    113 };
    114 
/*
 * nfs4_max_mount_retry is the number of times the client will redrive
 * a mount compound before giving up and returning failure.  The intent
 * is to redrive mount compounds which fail NFS4ERR_STALE so that
 * if a component of the server path being mounted goes stale, it can
 * "recover" by redriving the mount compound (LOOKUP ops).  This recovery
 * code is needed outside of the recovery framework because mount is a
 * special case.  The client doesn't create vnodes/rnodes for components
 * of the server path being mounted.  The recovery code recovers real
 * client objects, not STALE FHs which map to components of the server
 * path being mounted.
 *
 * We could just fail the mount on the first time, but that would
 * instantly trigger failover (from nfs4_mount), and the client should
 * try to re-lookup the STALE FH before doing failover.  The easiest
 * way to "re-lookup" is to simply redrive the mount compound.
 */
static int nfs4_max_mount_retry = 2;
    133 
    134 /*
    135  * nfs4 vfs operations.
    136  */
int		nfs4_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
static int	nfs4_unmount(vfs_t *, int, cred_t *);
static int	nfs4_root(vfs_t *, vnode_t **);
static int	nfs4_statvfs(vfs_t *, struct statvfs64 *);
static int	nfs4_sync(vfs_t *, short, cred_t *);
static int	nfs4_vget(vfs_t *, vnode_t **, fid_t *);
static int	nfs4_mountroot(vfs_t *, whymountroot_t);
static void	nfs4_freevfs(vfs_t *);

static int	nfs4rootvp(vnode_t **, vfs_t *, struct servinfo4 *,
		    int, cred_t *, zone_t *);

/*
 * vfs operations vector for NFSv4.  nfs4init() resets it to NULL and then
 * has vfs_setfsops() fill it in from the template listing the entry points
 * declared above.
 */
vfsops_t	*nfs4_vfsops;

/*
 * nfs4_vfsinit() is called from nfs4init() and nfs4_vfsfini() from
 * nfs4fini(); the remaining routines are file-local helpers.
 */
int nfs4_vfsinit(void);
void nfs4_vfsfini(void);
static void nfs4setclientid_init(void);
static void nfs4setclientid_fini(void);
static void nfs4setclientid_otw(mntinfo4_t *, servinfo4_t *,  cred_t *,
		struct nfs4_server *, nfs4_error_t *, int *);
static void	destroy_nfs4_server(nfs4_server_t *);
static void	remove_mi(nfs4_server_t *, mntinfo4_t *);

/* Ephemeral-mount setup and teardown, implemented outside this file. */
extern void nfs4_ephemeral_init(void);
extern void nfs4_ephemeral_fini(void);
    162 
    163 /*
    164  * Initialize the vfs structure
    165  */
    166 
/* File system type number handed to nfs4init(); saved there on success. */
static int nfs4fstyp;
    168 
    169 
    170 /*
    171  * Debug variable to check for rdma based
    172  * transport startup and cleanup. Controlled
    173  * through /etc/system. Off by default.
    174  */
    175 extern int rdma_debug;
    176 
    177 int
    178 nfs4init(int fstyp, char *name)
    179 {
    180 	static const fs_operation_def_t nfs4_vfsops_template[] = {
    181 		VFSNAME_MOUNT,		{ .vfs_mount = nfs4_mount },
    182 		VFSNAME_UNMOUNT,	{ .vfs_unmount = nfs4_unmount },
    183 		VFSNAME_ROOT,		{ .vfs_root = nfs4_root },
    184 		VFSNAME_STATVFS,	{ .vfs_statvfs = nfs4_statvfs },
    185 		VFSNAME_SYNC,		{ .vfs_sync = nfs4_sync },
    186 		VFSNAME_VGET,		{ .vfs_vget = nfs4_vget },
    187 		VFSNAME_MOUNTROOT,	{ .vfs_mountroot = nfs4_mountroot },
    188 		VFSNAME_FREEVFS,	{ .vfs_freevfs = nfs4_freevfs },
    189 		NULL,			NULL
    190 	};
    191 	int error;
    192 
    193 	nfs4_vfsops = NULL;
    194 	nfs4_vnodeops = NULL;
    195 	nfs4_trigger_vnodeops = NULL;
    196 
    197 	error = vfs_setfsops(fstyp, nfs4_vfsops_template, &nfs4_vfsops);
    198 	if (error != 0) {
    199 		zcmn_err(GLOBAL_ZONEID, CE_WARN,
    200 		    "nfs4init: bad vfs ops template");
    201 		goto out;
    202 	}
    203 
    204 	error = vn_make_ops(name, nfs4_vnodeops_template, &nfs4_vnodeops);
    205 	if (error != 0) {
    206 		zcmn_err(GLOBAL_ZONEID, CE_WARN,
    207 		    "nfs4init: bad vnode ops template");
    208 		goto out;
    209 	}
    210 
    211 	error = vn_make_ops("nfs4_trigger", nfs4_trigger_vnodeops_template,
    212 	    &nfs4_trigger_vnodeops);
    213 	if (error != 0) {
    214 		zcmn_err(GLOBAL_ZONEID, CE_WARN,
    215 		    "nfs4init: bad trigger vnode ops template");
    216 		goto out;
    217 	}
    218 
    219 	nfs4fstyp = fstyp;
    220 	(void) nfs4_vfsinit();
    221 	(void) nfs4_init_dot_entries();
    222 
    223 out:
    224 	if (error) {
    225 		if (nfs4_trigger_vnodeops != NULL)
    226 			vn_freevnodeops(nfs4_trigger_vnodeops);
    227 
    228 		if (nfs4_vnodeops != NULL)
    229 			vn_freevnodeops(nfs4_vnodeops);
    230 
    231 		(void) vfs_freevfsops_by_type(fstyp);
    232 	}
    233 
    234 	return (error);
    235 }
    236 
/*
 * Undo the one-time setup performed at the end of nfs4init(): destroy the
 * dot entries created by nfs4_init_dot_entries() and then run
 * nfs4_vfsfini().
 */
void
nfs4fini(void)
{
	(void) nfs4_destroy_dot_entries();
	nfs4_vfsfini();
}
    243 
    244 /*
    245  * Create a new sec_data structure to store AUTH_DH related data:
    246  * netname, syncaddr, knetconfig. There is no AUTH_F_RPCTIMESYNC
    247  * flag set for NFS V4 since we are avoiding to contact the rpcbind
    248  * daemon and is using the IP time service (IPPORT_TIMESERVER).
    249  *
    250  * sec_data can be freed by sec_clnt_freeinfo().
    251  */
    252 static struct sec_data *
    253 create_authdh_data(char *netname, int nlen, struct netbuf *syncaddr,
    254 		struct knetconfig *knconf) {
    255 	struct sec_data *secdata;
    256 	dh_k4_clntdata_t *data;
    257 	char *pf, *p;
    258 
    259 	if (syncaddr == NULL || syncaddr->buf == NULL || nlen == 0)
    260 		return (NULL);
    261 
    262 	secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
    263 	secdata->flags = 0;
    264 
    265 	data = kmem_alloc(sizeof (*data), KM_SLEEP);
    266 
    267 	data->syncaddr.maxlen = syncaddr->maxlen;
    268 	data->syncaddr.len = syncaddr->len;
    269 	data->syncaddr.buf = (char *)kmem_alloc(syncaddr->len, KM_SLEEP);
    270 	bcopy(syncaddr->buf, data->syncaddr.buf, syncaddr->len);
    271 
    272 	/*
    273 	 * duplicate the knconf information for the
    274 	 * new opaque data.
    275 	 */
    276 	data->knconf = kmem_alloc(sizeof (*knconf), KM_SLEEP);
    277 	*data->knconf = *knconf;
    278 	pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
    279 	p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
    280 	bcopy(knconf->knc_protofmly, pf, KNC_STRSIZE);
    281 	bcopy(knconf->knc_proto, p, KNC_STRSIZE);
    282 	data->knconf->knc_protofmly = pf;
    283 	data->knconf->knc_proto = p;
    284 
    285 	/* move server netname to the sec_data structure */
    286 	data->netname = kmem_alloc(nlen, KM_SLEEP);
    287 	bcopy(netname, data->netname, nlen);
    288 	data->netnamelen = (int)nlen;
    289 
    290 	secdata->secmod = AUTH_DH;
    291 	secdata->rpcflavor = AUTH_DH;
    292 	secdata->data = (caddr_t)data;
    293 
    294 	return (secdata);
    295 }
    296 
    297 /*
    298  * Returns (deep) copy of sec_data_t. Allocates all memory required; caller
    299  * is responsible for freeing.
    300  */
    301 sec_data_t *
    302 copy_sec_data(sec_data_t *fsecdata) {
    303 	sec_data_t *tsecdata;
    304 
    305 	if (fsecdata == NULL)
    306 		return (NULL);
    307 
    308 	if (fsecdata->rpcflavor == AUTH_DH) {
    309 		dh_k4_clntdata_t *fdata = (dh_k4_clntdata_t *)fsecdata->data;
    310 
    311 		if (fdata == NULL)
    312 			return (NULL);
    313 
    314 		tsecdata = (sec_data_t *)create_authdh_data(fdata->netname,
    315 		    fdata->netnamelen, &fdata->syncaddr, fdata->knconf);
    316 
    317 		return (tsecdata);
    318 	}
    319 
    320 	tsecdata = kmem_zalloc(sizeof (sec_data_t), KM_SLEEP);
    321 
    322 	tsecdata->secmod = fsecdata->secmod;
    323 	tsecdata->rpcflavor = fsecdata->rpcflavor;
    324 	tsecdata->flags = fsecdata->flags;
    325 	tsecdata->uid = fsecdata->uid;
    326 
    327 	if (fsecdata->rpcflavor == RPCSEC_GSS) {
    328 		gss_clntdata_t *gcd = (gss_clntdata_t *)fsecdata->data;
    329 
    330 		tsecdata->data = (caddr_t)copy_sec_data_gss(gcd);
    331 	} else {
    332 		tsecdata->data = NULL;
    333 	}
    334 
    335 	return (tsecdata);
    336 }
    337 
    338 gss_clntdata_t *
    339 copy_sec_data_gss(gss_clntdata_t *fdata)
    340 {
    341 	gss_clntdata_t *tdata;
    342 
    343 	if (fdata == NULL)
    344 		return (NULL);
    345 
    346 	tdata = kmem_zalloc(sizeof (gss_clntdata_t), KM_SLEEP);
    347 
    348 	tdata->mechanism.length = fdata->mechanism.length;
    349 	tdata->mechanism.elements = kmem_zalloc(fdata->mechanism.length,
    350 	    KM_SLEEP);
    351 	bcopy(fdata->mechanism.elements, tdata->mechanism.elements,
    352 	    fdata->mechanism.length);
    353 
    354 	tdata->service = fdata->service;
    355 
    356 	(void) strcpy(tdata->uname, fdata->uname);
    357 	(void) strcpy(tdata->inst, fdata->inst);
    358 	(void) strcpy(tdata->realm, fdata->realm);
    359 
    360 	tdata->qop = fdata->qop;
    361 
    362 	return (tdata);
    363 }
    364 
    365 static int
    366 nfs4_chkdup_servinfo4(servinfo4_t *svp_head, servinfo4_t *svp)
    367 {
    368 	servinfo4_t *si;
    369 
    370 	/*
    371 	 * Iterate over the servinfo4 list to make sure
    372 	 * we do not have a duplicate. Skip any servinfo4
    373 	 * that has been marked "NOT IN USE"
    374 	 */
    375 	for (si = svp_head; si; si = si->sv_next) {
    376 		(void) nfs_rw_enter_sig(&si->sv_lock, RW_READER, 0);
    377 		if (si->sv_flags & SV4_NOTINUSE) {
    378 			nfs_rw_exit(&si->sv_lock);
    379 			continue;
    380 		}
    381 		nfs_rw_exit(&si->sv_lock);
    382 		if (si == svp)
    383 			continue;
    384 		if (si->sv_addr.len == svp->sv_addr.len &&
    385 		    strcmp(si->sv_knconf->knc_protofmly,
    386 		    svp->sv_knconf->knc_protofmly) == 0 &&
    387 		    bcmp(si->sv_addr.buf, svp->sv_addr.buf,
    388 		    si->sv_addr.len) == 0) {
    389 			/* it's a duplicate */
    390 			return (1);
    391 		}
    392 	}
    393 	/* it's not a duplicate */
    394 	return (0);
    395 }
    396 
    397 void
    398 nfs4_free_args(struct nfs_args *nargs)
    399 {
    400 	if (nargs->knconf) {
    401 		if (nargs->knconf->knc_protofmly)
    402 			kmem_free(nargs->knconf->knc_protofmly,
    403 			    KNC_STRSIZE);
    404 		if (nargs->knconf->knc_proto)
    405 			kmem_free(nargs->knconf->knc_proto, KNC_STRSIZE);
    406 		kmem_free(nargs->knconf, sizeof (*nargs->knconf));
    407 		nargs->knconf = NULL;
    408 	}
    409 
    410 	if (nargs->fh) {
    411 		kmem_free(nargs->fh, strlen(nargs->fh) + 1);
    412 		nargs->fh = NULL;
    413 	}
    414 
    415 	if (nargs->hostname) {
    416 		kmem_free(nargs->hostname, strlen(nargs->hostname) + 1);
    417 		nargs->hostname = NULL;
    418 	}
    419 
    420 	if (nargs->addr) {
    421 		if (nargs->addr->buf) {
    422 			ASSERT(nargs->addr->len);
    423 			kmem_free(nargs->addr->buf, nargs->addr->len);
    424 		}
    425 		kmem_free(nargs->addr, sizeof (struct netbuf));
    426 		nargs->addr = NULL;
    427 	}
    428 
    429 	if (nargs->syncaddr) {
    430 		ASSERT(nargs->syncaddr->len);
    431 		if (nargs->syncaddr->buf) {
    432 			ASSERT(nargs->syncaddr->len);
    433 			kmem_free(nargs->syncaddr->buf, nargs->syncaddr->len);
    434 		}
    435 		kmem_free(nargs->syncaddr, sizeof (struct netbuf));
    436 		nargs->syncaddr = NULL;
    437 	}
    438 
    439 	if (nargs->netname) {
    440 		kmem_free(nargs->netname, strlen(nargs->netname) + 1);
    441 		nargs->netname = NULL;
    442 	}
    443 
    444 	if (nargs->nfs_ext_u.nfs_extA.secdata) {
    445 		sec_clnt_freeinfo(
    446 		    nargs->nfs_ext_u.nfs_extA.secdata);
    447 		nargs->nfs_ext_u.nfs_extA.secdata = NULL;
    448 	}
    449 }
    450 
    451 
    452 int
    453 nfs4_copyin(char *data, int datalen, struct nfs_args *nargs)
    454 {
    455 
    456 	int error;
    457 	size_t hlen;			/* length of hostname */
    458 	size_t nlen;			/* length of netname */
    459 	char netname[MAXNETNAMELEN+1];	/* server's netname */
    460 	struct netbuf addr;		/* server's address */
    461 	struct netbuf syncaddr;		/* AUTH_DES time sync addr */
    462 	struct knetconfig *knconf;		/* transport structure */
    463 	struct sec_data *secdata = NULL;	/* security data */
    464 	STRUCT_DECL(nfs_args, args);		/* nfs mount arguments */
    465 	STRUCT_DECL(knetconfig, knconf_tmp);
    466 	STRUCT_DECL(netbuf, addr_tmp);
    467 	int flags;
    468 	char *p, *pf;
    469 	struct pathname pn;
    470 	char *userbufptr;
    471 
    472 
    473 	bzero(nargs, sizeof (*nargs));
    474 
    475 	STRUCT_INIT(args, get_udatamodel());
    476 	bzero(STRUCT_BUF(args), SIZEOF_STRUCT(nfs_args, DATAMODEL_NATIVE));
    477 	if (copyin(data, STRUCT_BUF(args), MIN(datalen,
    478 	    STRUCT_SIZE(args))))
    479 		return (EFAULT);
    480 
    481 	nargs->wsize = STRUCT_FGET(args, wsize);
    482 	nargs->rsize = STRUCT_FGET(args, rsize);
    483 	nargs->timeo = STRUCT_FGET(args, timeo);
    484 	nargs->retrans = STRUCT_FGET(args, retrans);
    485 	nargs->acregmin = STRUCT_FGET(args, acregmin);
    486 	nargs->acregmax = STRUCT_FGET(args, acregmax);
    487 	nargs->acdirmin = STRUCT_FGET(args, acdirmin);
    488 	nargs->acdirmax = STRUCT_FGET(args, acdirmax);
    489 
    490 	flags = STRUCT_FGET(args, flags);
    491 	nargs->flags = flags;
    492 
    493 	addr.buf = NULL;
    494 	syncaddr.buf = NULL;
    495 
    496 
    497 	/*
    498 	 * Allocate space for a knetconfig structure and
    499 	 * its strings and copy in from user-land.
    500 	 */
    501 	knconf = kmem_zalloc(sizeof (*knconf), KM_SLEEP);
    502 	STRUCT_INIT(knconf_tmp, get_udatamodel());
    503 	if (copyin(STRUCT_FGETP(args, knconf), STRUCT_BUF(knconf_tmp),
    504 	    STRUCT_SIZE(knconf_tmp))) {
    505 		kmem_free(knconf, sizeof (*knconf));
    506 		return (EFAULT);
    507 	}
    508 
    509 	knconf->knc_semantics = STRUCT_FGET(knconf_tmp, knc_semantics);
    510 	knconf->knc_protofmly = STRUCT_FGETP(knconf_tmp, knc_protofmly);
    511 	knconf->knc_proto = STRUCT_FGETP(knconf_tmp, knc_proto);
    512 	if (get_udatamodel() != DATAMODEL_LP64) {
    513 		knconf->knc_rdev = expldev(STRUCT_FGET(knconf_tmp, knc_rdev));
    514 	} else {
    515 		knconf->knc_rdev = STRUCT_FGET(knconf_tmp, knc_rdev);
    516 	}
    517 
    518 	pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
    519 	p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
    520 	error = copyinstr(knconf->knc_protofmly, pf, KNC_STRSIZE, NULL);
    521 	if (error) {
    522 		kmem_free(pf, KNC_STRSIZE);
    523 		kmem_free(p, KNC_STRSIZE);
    524 		kmem_free(knconf, sizeof (*knconf));
    525 		return (error);
    526 	}
    527 
    528 	error = copyinstr(knconf->knc_proto, p, KNC_STRSIZE, NULL);
    529 	if (error) {
    530 		kmem_free(pf, KNC_STRSIZE);
    531 		kmem_free(p, KNC_STRSIZE);
    532 		kmem_free(knconf, sizeof (*knconf));
    533 		return (error);
    534 	}
    535 
    536 
    537 	knconf->knc_protofmly = pf;
    538 	knconf->knc_proto = p;
    539 
    540 	nargs->knconf = knconf;
    541 
    542 	/*
    543 	 * Get server address
    544 	 */
    545 	STRUCT_INIT(addr_tmp, get_udatamodel());
    546 	if (copyin(STRUCT_FGETP(args, addr), STRUCT_BUF(addr_tmp),
    547 	    STRUCT_SIZE(addr_tmp))) {
    548 		error = EFAULT;
    549 		goto errout;
    550 	}
    551 
    552 	nargs->addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
    553 	userbufptr = STRUCT_FGETP(addr_tmp, buf);
    554 	addr.len = STRUCT_FGET(addr_tmp, len);
    555 	addr.buf = kmem_alloc(addr.len, KM_SLEEP);
    556 	addr.maxlen = addr.len;
    557 	if (copyin(userbufptr, addr.buf, addr.len)) {
    558 		kmem_free(addr.buf, addr.len);
    559 		error = EFAULT;
    560 		goto errout;
    561 	}
    562 	bcopy(&addr, nargs->addr, sizeof (struct netbuf));
    563 
    564 	/*
    565 	 * Get the root fhandle
    566 	 */
    567 	error = pn_get(STRUCT_FGETP(args, fh), UIO_USERSPACE, &pn);
    568 	if (error)
    569 		goto errout;
    570 
    571 	/* Volatile fh: keep server paths, so use actual-size strings */
    572 	nargs->fh = kmem_alloc(pn.pn_pathlen + 1, KM_SLEEP);
    573 	bcopy(pn.pn_path, nargs->fh, pn.pn_pathlen);
    574 	nargs->fh[pn.pn_pathlen] = '\0';
    575 	pn_free(&pn);
    576 
    577 
    578 	/*
    579 	 * Get server's hostname
    580 	 */
    581 	if (flags & NFSMNT_HOSTNAME) {
    582 		error = copyinstr(STRUCT_FGETP(args, hostname),
    583 		    netname, sizeof (netname), &hlen);
    584 		if (error)
    585 			goto errout;
    586 		nargs->hostname = kmem_zalloc(hlen, KM_SLEEP);
    587 		(void) strcpy(nargs->hostname, netname);
    588 
    589 	} else {
    590 		nargs->hostname = NULL;
    591 	}
    592 
    593 
    594 	/*
    595 	 * If there are syncaddr and netname data, load them in. This is
    596 	 * to support data needed for NFSV4 when AUTH_DH is the negotiated
    597 	 * flavor via SECINFO. (instead of using MOUNT protocol in V3).
    598 	 */
    599 	netname[0] = '\0';
    600 	if (flags & NFSMNT_SECURE) {
    601 
    602 		/* get syncaddr */
    603 		STRUCT_INIT(addr_tmp, get_udatamodel());
    604 		if (copyin(STRUCT_FGETP(args, syncaddr), STRUCT_BUF(addr_tmp),
    605 		    STRUCT_SIZE(addr_tmp))) {
    606 			error = EINVAL;
    607 			goto errout;
    608 		}
    609 		userbufptr = STRUCT_FGETP(addr_tmp, buf);
    610 		syncaddr.len = STRUCT_FGET(addr_tmp, len);
    611 		syncaddr.buf = kmem_alloc(syncaddr.len, KM_SLEEP);
    612 		syncaddr.maxlen = syncaddr.len;
    613 		if (copyin(userbufptr, syncaddr.buf, syncaddr.len)) {
    614 			kmem_free(syncaddr.buf, syncaddr.len);
    615 			error = EFAULT;
    616 			goto errout;
    617 		}
    618 
    619 		nargs->syncaddr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
    620 		bcopy(&syncaddr, nargs->syncaddr, sizeof (struct netbuf));
    621 
    622 		/* get server's netname */
    623 		if (copyinstr(STRUCT_FGETP(args, netname), netname,
    624 		    sizeof (netname), &nlen)) {
    625 			error = EFAULT;
    626 			goto errout;
    627 		}
    628 
    629 		netname[nlen] = '\0';
    630 		nargs->netname = kmem_zalloc(nlen, KM_SLEEP);
    631 		(void) strcpy(nargs->netname, netname);
    632 	}
    633 
    634 	/*
    635 	 * Get the extention data which has the security data structure.
    636 	 * This includes data for AUTH_SYS as well.
    637 	 */
    638 	if (flags & NFSMNT_NEWARGS) {
    639 		nargs->nfs_args_ext = STRUCT_FGET(args, nfs_args_ext);
    640 		if (nargs->nfs_args_ext == NFS_ARGS_EXTA ||
    641 		    nargs->nfs_args_ext == NFS_ARGS_EXTB) {
    642 			/*
    643 			 * Indicating the application is using the new
    644 			 * sec_data structure to pass in the security
    645 			 * data.
    646 			 */
    647 			if (STRUCT_FGETP(args,
    648 			    nfs_ext_u.nfs_extA.secdata) != NULL) {
    649 				error = sec_clnt_loadinfo(
    650 				    (struct sec_data *)STRUCT_FGETP(args,
    651 				    nfs_ext_u.nfs_extA.secdata),
    652 				    &secdata, get_udatamodel());
    653 			}
    654 			nargs->nfs_ext_u.nfs_extA.secdata = secdata;
    655 		}
    656 	}
    657 
    658 	if (error)
    659 		goto errout;
    660 
    661 	/*
    662 	 * Failover support:
    663 	 *
    664 	 * We may have a linked list of nfs_args structures,
    665 	 * which means the user is looking for failover.  If
    666 	 * the mount is either not "read-only" or "soft",
    667 	 * we want to bail out with EINVAL.
    668 	 */
    669 	if (nargs->nfs_args_ext == NFS_ARGS_EXTB)
    670 		nargs->nfs_ext_u.nfs_extB.next =
    671 		    STRUCT_FGETP(args, nfs_ext_u.nfs_extB.next);
    672 
    673 errout:
    674 	if (error)
    675 		nfs4_free_args(nargs);
    676 
    677 	return (error);
    678 }
    679 
    680 
    681 /*
    682  * nfs mount vfsop
    683  * Set up mount info record and attach it to vfs struct.
    684  */
    685 int
    686 nfs4_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
    687 {
    688 	char *data = uap->dataptr;
    689 	int error;
    690 	vnode_t *rtvp;			/* the server's root */
    691 	mntinfo4_t *mi;			/* mount info, pointed at by vfs */
    692 	struct knetconfig *rdma_knconf;	/* rdma transport structure */
    693 	rnode4_t *rp;
    694 	struct servinfo4 *svp;		/* nfs server info */
    695 	struct servinfo4 *svp_tail = NULL; /* previous nfs server info */
    696 	struct servinfo4 *svp_head;	/* first nfs server info */
    697 	struct servinfo4 *svp_2ndlast;	/* 2nd last in server info list */
    698 	struct sec_data *secdata;	/* security data */
    699 	struct nfs_args *args = NULL;
    700 	int flags, addr_type, removed;
    701 	zone_t *zone = nfs_zone();
    702 	nfs4_error_t n4e;
    703 	zone_t *mntzone = NULL;
    704 
    705 	if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
    706 		return (EPERM);
    707 	if (mvp->v_type != VDIR)
    708 		return (ENOTDIR);
    709 
    710 	/*
    711 	 * get arguments
    712 	 *
    713 	 * nfs_args is now versioned and is extensible, so
    714 	 * uap->datalen might be different from sizeof (args)
    715 	 * in a compatible situation.
    716 	 */
    717 more:
    718 	if (!(uap->flags & MS_SYSSPACE)) {
    719 		if (args == NULL)
    720 			args = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
    721 		else
    722 			nfs4_free_args(args);
    723 		error = nfs4_copyin(data, uap->datalen, args);
    724 		if (error) {
    725 			if (args) {
    726 				kmem_free(args, sizeof (*args));
    727 			}
    728 			return (error);
    729 		}
    730 	} else {
    731 		args = (struct nfs_args *)data;
    732 	}
    733 
    734 	flags = args->flags;
    735 
    736 	/*
    737 	 * If the request changes the locking type, disallow the remount,
    738 	 * because it's questionable whether we can transfer the
    739 	 * locking state correctly.
    740 	 */
    741 	if (uap->flags & MS_REMOUNT) {
    742 		if (!(uap->flags & MS_SYSSPACE)) {
    743 			nfs4_free_args(args);
    744 			kmem_free(args, sizeof (*args));
    745 		}
    746 		if ((mi = VFTOMI4(vfsp)) != NULL) {
    747 			uint_t new_mi_llock;
    748 			uint_t old_mi_llock;
    749 			new_mi_llock = (flags & NFSMNT_LLOCK) ? 1 : 0;
    750 			old_mi_llock = (mi->mi_flags & MI4_LLOCK) ? 1 : 0;
    751 			if (old_mi_llock != new_mi_llock)
    752 				return (EBUSY);
    753 		}
    754 		return (0);
    755 	}
    756 
    757 	/*
    758 	 * For ephemeral mount trigger stub vnodes, we have two problems
    759 	 * to solve: racing threads will likely fail the v_count check, and
    760 	 * we want only one to proceed with the mount.
    761 	 *
    762 	 * For stubs, if the mount has already occurred (via a racing thread),
    763 	 * just return success. If not, skip the v_count check and proceed.
    764 	 * Note that we are already serialised at this point.
    765 	 */
    766 	mutex_enter(&mvp->v_lock);
    767 	if (vn_matchops(mvp, nfs4_trigger_vnodeops)) {
    768 		/* mntpt is a v4 stub vnode */
    769 		ASSERT(RP_ISSTUB(VTOR4(mvp)));
    770 		ASSERT(!(uap->flags & MS_OVERLAY));
    771 		ASSERT(!(mvp->v_flag & VROOT));
    772 		if (vn_mountedvfs(mvp) != NULL) {
    773 			/* ephemeral mount has already occurred */
    774 			ASSERT(uap->flags & MS_SYSSPACE);
    775 			mutex_exit(&mvp->v_lock);
    776 			return (0);
    777 		}
    778 	} else {
    779 		/* mntpt is a non-v4 or v4 non-stub vnode */
    780 		if (!(uap->flags & MS_OVERLAY) &&
    781 		    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
    782 			mutex_exit(&mvp->v_lock);
    783 			if (!(uap->flags & MS_SYSSPACE)) {
    784 				nfs4_free_args(args);
    785 				kmem_free(args, sizeof (*args));
    786 			}
    787 			return (EBUSY);
    788 		}
    789 	}
    790 	mutex_exit(&mvp->v_lock);
    791 
    792 	/* make sure things are zeroed for errout: */
    793 	rtvp = NULL;
    794 	mi = NULL;
    795 	secdata = NULL;
    796 
    797 	/*
    798 	 * A valid knetconfig structure is required.
    799 	 */
    800 	if (!(flags & NFSMNT_KNCONF) ||
    801 	    args->knconf == NULL || args->knconf->knc_protofmly == NULL ||
    802 	    args->knconf->knc_proto == NULL ||
    803 	    (strcmp(args->knconf->knc_proto, NC_UDP) == 0)) {
    804 		if (!(uap->flags & MS_SYSSPACE)) {
    805 			nfs4_free_args(args);
    806 			kmem_free(args, sizeof (*args));
    807 		}
    808 		return (EINVAL);
    809 	}
    810 
    811 	if ((strlen(args->knconf->knc_protofmly) >= KNC_STRSIZE) ||
    812 	    (strlen(args->knconf->knc_proto) >= KNC_STRSIZE)) {
    813 		if (!(uap->flags & MS_SYSSPACE)) {
    814 			nfs4_free_args(args);
    815 			kmem_free(args, sizeof (*args));
    816 		}
    817 		return (EINVAL);
    818 	}
    819 
    820 	/*
    821 	 * Allocate a servinfo4 struct.
    822 	 */
    823 	svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
    824 	nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
    825 	if (svp_tail) {
    826 		svp_2ndlast = svp_tail;
    827 		svp_tail->sv_next = svp;
    828 	} else {
    829 		svp_head = svp;
    830 		svp_2ndlast = svp;
    831 	}
    832 
    833 	svp_tail = svp;
    834 	svp->sv_knconf = args->knconf;
    835 	args->knconf = NULL;
    836 
    837 	/*
    838 	 * Get server address
    839 	 */
    840 	if (args->addr == NULL || args->addr->buf == NULL) {
    841 		error = EINVAL;
    842 		goto errout;
    843 	}
    844 
    845 	svp->sv_addr.maxlen = args->addr->maxlen;
    846 	svp->sv_addr.len = args->addr->len;
    847 	svp->sv_addr.buf = args->addr->buf;
    848 	args->addr->buf = NULL;
    849 
    850 	/*
    851 	 * Get the root fhandle
    852 	 */
    853 	if (args->fh == NULL || (strlen(args->fh) >= MAXPATHLEN)) {
    854 		error = EINVAL;
    855 		goto errout;
    856 	}
    857 
    858 	svp->sv_path = args->fh;
    859 	svp->sv_pathlen = strlen(args->fh) + 1;
    860 	args->fh = NULL;
    861 
    862 	/*
    863 	 * Get server's hostname
    864 	 */
    865 	if (flags & NFSMNT_HOSTNAME) {
    866 		if (args->hostname == NULL || (strlen(args->hostname) >
    867 		    MAXNETNAMELEN)) {
    868 			error = EINVAL;
    869 			goto errout;
    870 		}
    871 		svp->sv_hostnamelen = strlen(args->hostname) + 1;
    872 		svp->sv_hostname = args->hostname;
    873 		args->hostname = NULL;
    874 	} else {
    875 		char *p = "unknown-host";
    876 		svp->sv_hostnamelen = strlen(p) + 1;
    877 		svp->sv_hostname = kmem_zalloc(svp->sv_hostnamelen, KM_SLEEP);
    878 		(void) strcpy(svp->sv_hostname, p);
    879 	}
    880 
    881 	/*
    882 	 * RDMA MOUNT SUPPORT FOR NFS v4.
    883 	 * Establish, is it possible to use RDMA, if so overload the
    884 	 * knconf with rdma specific knconf and free the orignal knconf.
    885 	 */
    886 	if ((flags & NFSMNT_TRYRDMA) || (flags & NFSMNT_DORDMA)) {
    887 		/*
    888 		 * Determine the addr type for RDMA, IPv4 or v6.
    889 		 */
    890 		if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET) == 0)
    891 			addr_type = AF_INET;
    892 		else if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET6) == 0)
    893 			addr_type = AF_INET6;
    894 
    895 		if (rdma_reachable(addr_type, &svp->sv_addr,
    896 		    &rdma_knconf) == 0) {
    897 			/*
    898 			 * If successful, hijack the orignal knconf and
    899 			 * replace with the new one, depending on the flags.
    900 			 */
    901 			svp->sv_origknconf = svp->sv_knconf;
    902 			svp->sv_knconf = rdma_knconf;
    903 		} else {
    904 			if (flags & NFSMNT_TRYRDMA) {
    905 #ifdef	DEBUG
    906 				if (rdma_debug)
    907 					zcmn_err(getzoneid(), CE_WARN,
    908 					    "no RDMA onboard, revert\n");
    909 #endif
    910 			}
    911 
    912 			if (flags & NFSMNT_DORDMA) {
    913 				/*
    914 				 * If proto=rdma is specified and no RDMA
    915 				 * path to this server is avialable then
    916 				 * ditch this server.
    917 				 * This is not included in the mountable
    918 				 * server list or the replica list.
    919 				 * Check if more servers are specified;
    920 				 * Failover case, otherwise bail out of mount.
    921 				 */
    922 				if (args->nfs_args_ext == NFS_ARGS_EXTB &&
    923 				    args->nfs_ext_u.nfs_extB.next != NULL) {
    924 					data = (char *)
    925 					    args->nfs_ext_u.nfs_extB.next;
    926 					if (uap->flags & MS_RDONLY &&
    927 					    !(flags & NFSMNT_SOFT)) {
    928 						if (svp_head->sv_next == NULL) {
    929 							svp_tail = NULL;
    930 							svp_2ndlast = NULL;
    931 							sv4_free(svp_head);
    932 							goto more;
    933 						} else {
    934 							svp_tail = svp_2ndlast;
    935 							svp_2ndlast->sv_next =
    936 							    NULL;
    937 							sv4_free(svp);
    938 							goto more;
    939 						}
    940 					}
    941 				} else {
    942 					/*
    943 					 * This is the last server specified
    944 					 * in the nfs_args list passed down
    945 					 * and its not rdma capable.
    946 					 */
    947 					if (svp_head->sv_next == NULL) {
    948 						/*
    949 						 * Is this the only one
    950 						 */
    951 						error = EINVAL;
    952 #ifdef	DEBUG
    953 						if (rdma_debug)
    954 							zcmn_err(getzoneid(),
    955 							    CE_WARN,
    956 							    "No RDMA srv");
    957 #endif
    958 						goto errout;
    959 					} else {
    960 						/*
    961 						 * There is list, since some
    962 						 * servers specified before
    963 						 * this passed all requirements
    964 						 */
    965 						svp_tail = svp_2ndlast;
    966 						svp_2ndlast->sv_next = NULL;
    967 						sv4_free(svp);
    968 						goto proceed;
    969 					}
    970 				}
    971 			}
    972 		}
    973 	}
    974 
    975 	/*
    976 	 * If there are syncaddr and netname data, load them in. This is
    977 	 * to support data needed for NFSV4 when AUTH_DH is the negotiated
    978 	 * flavor via SECINFO. (instead of using MOUNT protocol in V3).
    979 	 */
    980 	if (args->flags & NFSMNT_SECURE) {
    981 		svp->sv_dhsec = create_authdh_data(args->netname,
    982 		    strlen(args->netname),
    983 		    args->syncaddr, svp->sv_knconf);
    984 	}
    985 
    986 	/*
    987 	 * Get the extension data which has the security data structure.
    988 	 * This includes data for AUTH_SYS as well.
    989 	 */
    990 	if (flags & NFSMNT_NEWARGS) {
    991 		switch (args->nfs_args_ext) {
    992 		case NFS_ARGS_EXTA:
    993 		case NFS_ARGS_EXTB:
    994 			/*
    995 			 * Indicating the application is using the new
    996 			 * sec_data structure to pass in the security
    997 			 * data.
    998 			 */
    999 			secdata = args->nfs_ext_u.nfs_extA.secdata;
   1000 			if (secdata == NULL) {
   1001 				error = EINVAL;
   1002 			} else if (uap->flags & MS_SYSSPACE) {
   1003 				/*
   1004 				 * Need to validate the flavor here if
   1005 				 * sysspace, userspace was already
   1006 				 * validate from the nfs_copyin function.
   1007 				 */
   1008 				switch (secdata->rpcflavor) {
   1009 				case AUTH_NONE:
   1010 				case AUTH_UNIX:
   1011 				case AUTH_LOOPBACK:
   1012 				case AUTH_DES:
   1013 				case RPCSEC_GSS:
   1014 					break;
   1015 				default:
   1016 					error = EINVAL;
   1017 					goto errout;
   1018 				}
   1019 			}
   1020 			args->nfs_ext_u.nfs_extA.secdata = NULL;
   1021 			break;
   1022 
   1023 		default:
   1024 			error = EINVAL;
   1025 			break;
   1026 		}
   1027 
   1028 	} else if (flags & NFSMNT_SECURE) {
   1029 		/*
   1030 		 * NFSMNT_SECURE is deprecated but we keep it
   1031 		 * to support the rogue user-generated application
   1032 		 * that may use this undocumented interface to do
   1033 		 * AUTH_DH security, e.g. our own rexd.
   1034 		 *
   1035 		 * Also note that NFSMNT_SECURE is used for passing
   1036 		 * AUTH_DH info to be used in negotiation.
   1037 		 */
   1038 		secdata = create_authdh_data(args->netname,
   1039 		    strlen(args->netname), args->syncaddr, svp->sv_knconf);
   1040 
   1041 	} else {
   1042 		secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
   1043 		secdata->secmod = secdata->rpcflavor = AUTH_SYS;
   1044 		secdata->data = NULL;
   1045 	}
   1046 
   1047 	svp->sv_secdata = secdata;
   1048 
   1049 	/*
   1050 	 * User does not explicitly specify a flavor, and a user
   1051 	 * defined default flavor is passed down.
   1052 	 */
   1053 	if (flags & NFSMNT_SECDEFAULT) {
   1054 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
   1055 		svp->sv_flags |= SV4_TRYSECDEFAULT;
   1056 		nfs_rw_exit(&svp->sv_lock);
   1057 	}
   1058 
   1059 	/*
   1060 	 * Failover support:
   1061 	 *
   1062 	 * We may have a linked list of nfs_args structures,
   1063 	 * which means the user is looking for failover.  If
   1064 	 * the mount is either not "read-only" or "soft",
   1065 	 * we want to bail out with EINVAL.
   1066 	 */
   1067 	if (args->nfs_args_ext == NFS_ARGS_EXTB &&
   1068 	    args->nfs_ext_u.nfs_extB.next != NULL) {
   1069 		if (uap->flags & MS_RDONLY && !(flags & NFSMNT_SOFT)) {
   1070 			data = (char *)args->nfs_ext_u.nfs_extB.next;
   1071 			goto more;
   1072 		}
   1073 		error = EINVAL;
   1074 		goto errout;
   1075 	}
   1076 
   1077 	/*
   1078 	 * Determine the zone we're being mounted into.
   1079 	 */
   1080 	zone_hold(mntzone = zone);		/* start with this assumption */
   1081 	if (getzoneid() == GLOBAL_ZONEID) {
   1082 		zone_rele(mntzone);
   1083 		mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
   1084 		ASSERT(mntzone != NULL);
   1085 		if (mntzone != zone) {
   1086 			error = EBUSY;
   1087 			goto errout;
   1088 		}
   1089 	}
   1090 
   1091 	if (is_system_labeled()) {
   1092 		error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
   1093 		    svp->sv_knconf, cr);
   1094 
   1095 		if (error > 0)
   1096 			goto errout;
   1097 
   1098 		if (error == -1) {
   1099 			/* change mount to read-only to prevent write-down */
   1100 			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
   1101 		}
   1102 	}
   1103 
   1104 	/*
   1105 	 * Stop the mount from going any further if the zone is going away.
   1106 	 */
   1107 	if (zone_status_get(mntzone) >= ZONE_IS_SHUTTING_DOWN) {
   1108 		error = EBUSY;
   1109 		goto errout;
   1110 	}
   1111 
   1112 	/*
   1113 	 * Get root vnode.
   1114 	 */
   1115 proceed:
   1116 	error = nfs4rootvp(&rtvp, vfsp, svp_head, flags, cr, mntzone);
   1117 	if (error) {
   1118 		/* if nfs4rootvp failed, it will free svp_head */
   1119 		svp_head = NULL;
   1120 		goto errout;
   1121 	}
   1122 
   1123 	mi = VTOMI4(rtvp);
   1124 
   1125 	/*
   1126 	 * Send client id to the server, if necessary
   1127 	 */
   1128 	nfs4_error_zinit(&n4e);
   1129 	nfs4setclientid(mi, cr, FALSE, &n4e);
   1130 
   1131 	error = n4e.error;
   1132 
   1133 	if (error)
   1134 		goto errout;
   1135 
   1136 	/*
   1137 	 * Set option fields in the mount info record
   1138 	 */
   1139 
   1140 	if (svp_head->sv_next) {
   1141 		mutex_enter(&mi->mi_lock);
   1142 		mi->mi_flags |= MI4_LLOCK;
   1143 		mutex_exit(&mi->mi_lock);
   1144 	}
   1145 	error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, args);
   1146 	if (error)
   1147 		goto errout;
   1148 
   1149 	/*
   1150 	 * Time to tie in the mirror mount info at last!
   1151 	 */
   1152 	if (flags & NFSMNT_EPHEMERAL)
   1153 		error = nfs4_record_ephemeral_mount(mi, mvp);
   1154 
   1155 errout:
   1156 	if (error) {
   1157 		if (rtvp != NULL) {
   1158 			rp = VTOR4(rtvp);
   1159 			if (rp->r_flags & R4HASHED)
   1160 				rp4_rmhash(rp);
   1161 		}
   1162 		if (mi != NULL) {
   1163 			nfs4_async_stop(vfsp);
   1164 			nfs4_async_manager_stop(vfsp);
   1165 			nfs4_remove_mi_from_server(mi, NULL);
   1166 			if (rtvp != NULL)
   1167 				VN_RELE(rtvp);
   1168 			if (mntzone != NULL)
   1169 				zone_rele(mntzone);
   1170 			/* need to remove it from the zone */
   1171 			removed = nfs4_mi_zonelist_remove(mi);
   1172 			if (removed)
   1173 				zone_rele(mi->mi_zone);
   1174 			MI4_RELE(mi);
   1175 			if (!(uap->flags & MS_SYSSPACE) && args) {
   1176 				nfs4_free_args(args);
   1177 				kmem_free(args, sizeof (*args));
   1178 			}
   1179 			return (error);
   1180 		}
   1181 		if (svp_head)
   1182 			sv4_free(svp_head);
   1183 	}
   1184 
   1185 	if (!(uap->flags & MS_SYSSPACE) && args) {
   1186 		nfs4_free_args(args);
   1187 		kmem_free(args, sizeof (*args));
   1188 	}
   1189 	if (rtvp != NULL)
   1190 		VN_RELE(rtvp);
   1191 
   1192 	if (mntzone != NULL)
   1193 		zone_rele(mntzone);
   1194 
   1195 	return (error);
   1196 }
   1197 
   1198 #ifdef  DEBUG
   1199 #define	VERS_MSG	"NFS4 server "
   1200 #else
   1201 #define	VERS_MSG	"NFS server "
   1202 #endif
   1203 
   1204 #define	READ_MSG        \
   1205 	VERS_MSG "%s returned 0 for read transfer size"
   1206 #define	WRITE_MSG       \
   1207 	VERS_MSG "%s returned 0 for write transfer size"
   1208 #define	SIZE_MSG        \
   1209 	VERS_MSG "%s returned 0 for maximum file size"
   1210 
   1211 /*
   1212  * Get the symbolic link text from the server for a given filehandle
   1213  * of that symlink.
   1214  *
   1215  *      (get symlink text) PUTFH READLINK
         *
         * On success returns 0 and stores in *linktextp whatever utf8_to_str()
         * produced from the READLINK result; resolve_sympath() checks that
         * pointer for NULL and frees it.  On failure returns an errno value
         * (from nfs4_start_op(), from e.error after the call, or from
         * geterrno4() applied to the compound status) and leaves *linktextp
         * unwritten.
         *
         * If NFS4_GETFH_NEEDSOP is set in flags, the call is bracketed by
         * nfs4_start_op()/nfs4_end_op() and recovery may be initiated here, at
         * most nfs4_max_mount_retry times.  If it is clear, neither is done.
   1216  */
   1217 static int
   1218 getlinktext_otw(mntinfo4_t *mi, nfs_fh4 *fh, char **linktextp, cred_t *cr,
   1219     int flags)
   1220 {
   1221 	COMPOUND4args_clnt args;
   1222 	COMPOUND4res_clnt res;
   1223 	int doqueue;
   1224 	nfs_argop4 argop[2];
   1225 	nfs_resop4 *resop;
   1226 	READLINK4res *lr_res;
   1227 	uint_t len;
   1228 	bool_t needrecov = FALSE;
   1229 	nfs4_recov_state_t recov_state;
   1230 	nfs4_sharedfh_t *sfh;
   1231 	nfs4_error_t e;
   1232 	int num_retry = nfs4_max_mount_retry;
        	/* nonzero when the caller did NOT ask for start_op/end_op handling */
   1233 	int recovery = !(flags & NFS4_GETFH_NEEDSOP);
   1234 
        	/* every return path below drops this reference with sfh4_rele() */
   1235 	sfh = sfh4_get(fh, mi);
   1236 	recov_state.rs_flags = 0;
   1237 	recov_state.rs_num_retry_despite_err = 0;
   1238 
   1239 recov_retry:
   1240 	nfs4_error_zinit(&e);
   1241 
   1242 	args.array_len = 2;
   1243 	args.array = argop;
   1244 	args.ctag = TAG_GET_SYMLINK;
   1245 
   1246 	if (! recovery) {
   1247 		e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
   1248 		if (e.error) {
   1249 			sfh4_rele(&sfh);
   1250 			return (e.error);
   1251 		}
   1252 	}
   1253 
   1254 	/* 0. putfh symlink fh */
   1255 	argop[0].argop = OP_CPUTFH;
   1256 	argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
   1257 
   1258 	/* 1. readlink */
   1259 	argop[1].argop = OP_READLINK;
   1260 
   1261 	doqueue = 1;
   1262 
   1263 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
   1264 
   1265 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
   1266 
        	/*
        	 * Recovery is only started from here when this function did the
        	 * start_op itself, and only while the retry budget lasts.
        	 */
   1267 	if (needrecov && !recovery && num_retry-- > 0) {
   1268 
   1269 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1270 		    "getlinktext_otw: initiating recovery\n"));
   1271 
        		/*
        		 * A FALSE return means "try again": end the op, free the
        		 * result only when e.error is 0, and reissue the compound.
        		 * Otherwise fall through to the error handling below.
        		 */
   1272 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
   1273 		    OP_READLINK, NULL) == FALSE) {
   1274 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
   1275 			if (!e.error)
   1276 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   1277 				    (caddr_t)&res);
   1278 			goto recov_retry;
   1279 		}
   1280 	}
   1281 
   1282 	/*
   1283 	 * Non-NFS4-protocol error (e.error set) and/or we weren't able to
        	 * recover.  res is not freed on this path.
   1284 	 */
   1285 	if (e.error != 0) {
   1286 		if (! recovery)
   1287 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
   1288 		sfh4_rele(&sfh);
   1289 		return (e.error);
   1290 	}
   1291 
        	/* the compound failed with an NFSv4 status: map it to an errno */
   1292 	if (res.status) {
   1293 		e.error = geterrno4(res.status);
   1294 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1295 		if (! recovery)
   1296 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
   1297 		sfh4_rele(&sfh);
   1298 		return (e.error);
   1299 	}
   1300 
   1301 	/* res.status == NFS4_OK */
   1302 	ASSERT(res.status == NFS4_OK);
   1303 
   1304 	resop = &res.array[1];  /* readlink res */
   1305 	lr_res = &resop->nfs_resop4_u.opreadlink;
   1306 
   1307 	/* treat symlink name as data */
   1308 	*linktextp = utf8_to_str(&lr_res->link, &len, NULL);
   1309 
   1310 	if (! recovery)
   1311 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
   1312 	sfh4_rele(&sfh);
   1313 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1314 	return (0);
   1315 }
   1316 
   1317 /*
   1318  * Skip over consecutive slashes and "/./" in a pathname.
   1319  */
   1320 void
   1321 pathname_skipslashdot(struct pathname *pnp)
   1322 {
   1323 	char *c1, *c2;
   1324 
   1325 	while (pnp->pn_pathlen > 0 && *pnp->pn_path == '/') {
   1326 
   1327 		c1 = pnp->pn_path + 1;
   1328 		c2 = pnp->pn_path + 2;
   1329 
   1330 		if (*c1 == '.' && (*c2 == '/' || *c2 == '\0')) {
   1331 			pnp->pn_path = pnp->pn_path + 2; /* skip "/." */
   1332 			pnp->pn_pathlen = pnp->pn_pathlen - 2;
   1333 		} else {
   1334 			pnp->pn_path++;
   1335 			pnp->pn_pathlen--;
   1336 		}
   1337 	}
   1338 }
   1339 
   1340 /*
   1341  * Resolve a symbolic link path. The symlink is in the nth component of
   1342  * svp->sv_path and has an nfs4 file handle "fh".
   1343  * Upon return, the sv_path will point to the new path that has the nth
   1344  * component resolved to its symlink text.
   1345  */
   1346 int
   1347 resolve_sympath(mntinfo4_t *mi, servinfo4_t *svp, int nth, nfs_fh4 *fh,
   1348     cred_t *cr, int flags)
   1349 {
   1350 	char *oldpath;
   1351 	char *symlink, *newpath;
   1352 	struct pathname oldpn, newpn;
   1353 	char component[MAXNAMELEN];
   1354 	int i, addlen, error = 0;
   1355 	int oldpathlen;
   1356 
   1357 	/* Get the symbolic link text over the wire. */
   1358 	error = getlinktext_otw(mi, fh, &symlink, cr, flags);
   1359 
   1360 	if (error || symlink == NULL || strlen(symlink) == 0)
   1361 		return (error);
   1362 
   1363 	/*
   1364 	 * Compose the new pathname.
   1365 	 * Note:
   1366 	 *    - only the nth component is resolved for the pathname.
   1367 	 *    - pathname.pn_pathlen does not count the ending null byte.
   1368 	 */
   1369 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1370 	oldpath = svp->sv_path;
   1371 	oldpathlen = svp->sv_pathlen;
   1372 	if (error = pn_get(oldpath, UIO_SYSSPACE, &oldpn)) {
   1373 		nfs_rw_exit(&svp->sv_lock);
   1374 		kmem_free(symlink, strlen(symlink) + 1);
   1375 		return (error);
   1376 	}
   1377 	nfs_rw_exit(&svp->sv_lock);
   1378 	pn_alloc(&newpn);
   1379 
   1380 	/*
   1381 	 * Skip over previous components from the oldpath so that the
   1382 	 * oldpn.pn_path will point to the symlink component. Skip
   1383 	 * leading slashes and "/./" (no OP_LOOKUP on ".") so that
   1384 	 * pn_getcompnent can get the component.
   1385 	 */
   1386 	for (i = 1; i < nth; i++) {
   1387 		pathname_skipslashdot(&oldpn);
   1388 		error = pn_getcomponent(&oldpn, component);
   1389 		if (error)
   1390 			goto out;
   1391 	}
   1392 
   1393 	/*
   1394 	 * Copy the old path upto the component right before the symlink
   1395 	 * if the symlink is not an absolute path.
   1396 	 */
   1397 	if (symlink[0] != '/') {
   1398 		addlen = oldpn.pn_path - oldpn.pn_buf;
   1399 		bcopy(oldpn.pn_buf, newpn.pn_path, addlen);
   1400 		newpn.pn_pathlen += addlen;
   1401 		newpn.pn_path += addlen;
   1402 		newpn.pn_buf[newpn.pn_pathlen] = '/';
   1403 		newpn.pn_pathlen++;
   1404 		newpn.pn_path++;
   1405 	}
   1406 
   1407 	/* copy the resolved symbolic link text */
   1408 	addlen = strlen(symlink);
   1409 	if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
   1410 		error = ENAMETOOLONG;
   1411 		goto out;
   1412 	}
   1413 	bcopy(symlink, newpn.pn_path, addlen);
   1414 	newpn.pn_pathlen += addlen;
   1415 	newpn.pn_path += addlen;
   1416 
   1417 	/*
   1418 	 * Check if there is any remaining path after the symlink component.
   1419 	 * First, skip the symlink component.
   1420 	 */
   1421 	pathname_skipslashdot(&oldpn);
   1422 	if (error = pn_getcomponent(&oldpn, component))
   1423 		goto out;
   1424 
   1425 	addlen = pn_pathleft(&oldpn); /* includes counting the slash */
   1426 
   1427 	/*
   1428 	 * Copy the remaining path to the new pathname if there is any.
   1429 	 */
   1430 	if (addlen > 0) {
   1431 		if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
   1432 			error = ENAMETOOLONG;
   1433 			goto out;
   1434 		}
   1435 		bcopy(oldpn.pn_path, newpn.pn_path, addlen);
   1436 		newpn.pn_pathlen += addlen;
   1437 	}
   1438 	newpn.pn_buf[newpn.pn_pathlen] = '\0';
   1439 
   1440 	/* get the newpath and store it in the servinfo4_t */
   1441 	newpath = kmem_alloc(newpn.pn_pathlen + 1, KM_SLEEP);
   1442 	bcopy(newpn.pn_buf, newpath, newpn.pn_pathlen);
   1443 	newpath[newpn.pn_pathlen] = '\0';
   1444 
   1445 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
   1446 	svp->sv_path = newpath;
   1447 	svp->sv_pathlen = strlen(newpath) + 1;
   1448 	nfs_rw_exit(&svp->sv_lock);
   1449 
   1450 	kmem_free(oldpath, oldpathlen);
   1451 out:
   1452 	kmem_free(symlink, strlen(symlink) + 1);
   1453 	pn_free(&newpn);
   1454 	pn_free(&oldpn);
   1455 
   1456 	return (error);
   1457 }
   1458 
   1459 /*
   1460  * Get the root filehandle for the given filesystem and server, and update
   1461  * svp.
   1462  *
   1463  * If NFS4_GETFH_NEEDSOP is set, then use nfs4_start_fop and nfs4_end_fop
   1464  * to coordinate with recovery.  Otherwise, the caller is assumed to be
   1465  * the recovery thread or have already done a start_fop.
   1466  *
   1467  * Errors are returned by the nfs4_error_t parameter.
         *
         * Other inputs and outputs, as used below:
         *   - NFS4_GETFH_PUBLIC in flags selects OP_PUTPUBFH instead of
         *     OP_PUTROOTFH as the first op of the compound.
         *   - *vtp receives the vnode type from the last GETATTR result; it is
         *     not written on the error paths that return before that point.
         *   - On success svp->sv_fhandle, sv_pfhandle, sv_fsid and sv_supp_attrs
         *     are filled in, and mi's MI4_LINK/MI4_SYMLINK/MI4_ACL flags,
         *     mi_fh_expire_type, mi_tsize, mi_stsize and mi_maxfilesize are
         *     updated from the returned attributes.
         *   - If a component of svp->sv_path turns out to be a symlink,
         *     resolve_sympath() rewrites sv_path and the compound is reissued.
         *     NOTE(review): no limit on the number of such reissues is visible
         *     in this function -- confirm a symlink cycle cannot spin here.
   1468  */
   1469 
   1470 static void
   1471 nfs4getfh_otw(struct mntinfo4 *mi, servinfo4_t *svp, vtype_t *vtp,
   1472     int flags, cred_t *cr, nfs4_error_t *ep)
   1473 {
   1474 	COMPOUND4args_clnt args;
   1475 	COMPOUND4res_clnt res;
   1476 	int doqueue = 1;
   1477 	nfs_argop4 *argop;
   1478 	nfs_resop4 *resop;
   1479 	nfs4_ga_res_t *garp;
   1480 	int num_argops;
   1481 	lookup4_param_t lookuparg;
   1482 	nfs_fh4 *tmpfhp;
   1483 	nfs_fh4 *resfhp;
   1484 	bool_t needrecov = FALSE;
   1485 	nfs4_recov_state_t recov_state;
   1486 	int llndx;
   1487 	int nthcomp;
        	/*
        	 * Nonzero when NFS4_GETFH_NEEDSOP is clear: no start_fop/end_fop
        	 * bracketing is done and no recovery is initiated from here.
        	 */
   1488 	int recovery = !(flags & NFS4_GETFH_NEEDSOP);
   1489 
        	/* An empty sv_path is rejected with EINVAL before any call is made. */
   1490 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1491 	ASSERT(svp->sv_path != NULL);
   1492 	if (svp->sv_path[0] == '\0') {
   1493 		nfs_rw_exit(&svp->sv_lock);
   1494 		nfs4_error_init(ep, EINVAL);
   1495 		return;
   1496 	}
   1497 	nfs_rw_exit(&svp->sv_lock);
   1498 
   1499 	recov_state.rs_flags = 0;
   1500 	recov_state.rs_num_retry_despite_err = 0;
   1501 recov_retry:
   1502 	nfs4_error_zinit(ep);
   1503 
   1504 	if (!recovery) {
   1505 		ep->error = nfs4_start_fop(mi, NULL, NULL, OH_MOUNT,
   1506 		    &recov_state, NULL);
   1507 
   1508 		/*
   1509 		 * If recovery has been started and this request was
   1510 		 * initiated by a mount, then we must wait for recovery
   1511 		 * to finish before proceeding, otherwise, the error
   1512 		 * cleanup would remove data structures needed by the
   1513 		 * recovery thread.
   1514 		 */
   1515 		if (ep->error) {
   1516 			mutex_enter(&mi->mi_lock);
   1517 			if (mi->mi_flags & MI4_MOUNTING) {
   1518 				mi->mi_flags |= MI4_RECOV_FAIL;
   1519 				mi->mi_error = EIO;
   1520 
   1521 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1522 				    "nfs4getfh_otw: waiting 4 recovery\n"));
   1523 
   1524 				while (mi->mi_flags & MI4_RECOV_ACTIV)
   1525 					cv_wait(&mi->mi_failover_cv,
   1526 					    &mi->mi_lock);
   1527 			}
   1528 			mutex_exit(&mi->mi_lock);
   1529 			return;
   1530 		}
   1531 
   1532 		/*
   1533 		 * If the client does not specify a specific flavor to use
   1534 		 * and has not gotten a secinfo list from the server yet,
   1535 		 * retrieve the secinfo list from the server and use a
   1536 		 * flavor from the list to mount.
   1537 		 *
   1538 		 * If fail to get the secinfo list from the server, then
   1539 		 * try the default flavor.
   1540 		 */
   1541 		if ((svp->sv_flags & SV4_TRYSECDEFAULT) &&
   1542 		    svp->sv_secinfo == NULL) {
   1543 			(void) nfs4_secinfo_path(mi, cr, FALSE);
   1544 		}
   1545 	}
   1546 
   1547 	if (recovery)
   1548 		args.ctag = TAG_REMAP_MOUNT;
   1549 	else
   1550 		args.ctag = TAG_MOUNT;
   1551 
   1552 	lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
   1553 	lookuparg.argsp = &args;
   1554 	lookuparg.resp = &res;
   1555 	lookuparg.header_len = 2;	/* Putrootfh, getfh */
   1556 	lookuparg.trailer_len = 0;
   1557 	lookuparg.ga_bits = FATTR4_FSINFO_MASK;
   1558 	lookuparg.mi = mi;
   1559 
        	/*
        	 * nfs4lookup_setup() fills in args.array / args.array_len (read
        	 * just below); every exit path after this point frees that array
        	 * with nfs4args_lookup_free() + kmem_free().
        	 */
   1560 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1561 	ASSERT(svp->sv_path != NULL);
   1562 	llndx = nfs4lookup_setup(svp->sv_path, &lookuparg, 0);
   1563 	nfs_rw_exit(&svp->sv_lock);
   1564 
   1565 	argop = args.array;
   1566 	num_argops = args.array_len;
   1567 
   1568 	/* choose public or root filehandle */
   1569 	if (flags & NFS4_GETFH_PUBLIC)
   1570 		argop[0].argop = OP_PUTPUBFH;
   1571 	else
   1572 		argop[0].argop = OP_PUTROOTFH;
   1573 
   1574 	/* get fh */
   1575 	argop[1].argop = OP_GETFH;
   1576 
        	/*
        	 * needrecov is FALSE on the first pass and still holds the previous
        	 * pass's value on a retry, which is what the message keys off.
        	 */
   1577 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
   1578 	    "nfs4getfh_otw: %s call, mi 0x%p",
   1579 	    needrecov ? "recov" : "first", (void *)mi));
   1580 
   1581 	rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
   1582 
   1583 	needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
   1584 
   1585 	if (needrecov) {
   1586 		bool_t abort;
   1587 
        		/*
        		 * No start_fop was done here, so do not start recovery:
        		 * release the arguments (and the result when ep->error
        		 * is 0) and hand ep back as it is.
        		 */
   1588 		if (recovery) {
   1589 			nfs4args_lookup_free(argop, num_argops);
   1590 			kmem_free(argop,
   1591 			    lookuparg.arglen * sizeof (nfs_argop4));
   1592 			if (!ep->error)
   1593 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   1594 				    (caddr_t)&res);
   1595 			return;
   1596 		}
   1597 
   1598 		NFS4_DEBUG(nfs4_client_recov_debug,
   1599 		    (CE_NOTE, "nfs4getfh_otw: initiating recovery\n"));
   1600 
   1601 		abort = nfs4_start_recovery(ep, mi, NULL,
   1602 		    NULL, NULL, NULL, OP_GETFH, NULL);
   1603 		if (!ep->error) {
   1604 			ep->error = geterrno4(res.status);
   1605 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1606 		}
   1607 		nfs4args_lookup_free(argop, num_argops);
   1608 		kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
   1609 		nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
   1610 		/* have another go? */
   1611 		if (abort == FALSE)
   1612 			goto recov_retry;
   1613 		return;
   1614 	}
   1615 
   1616 	/*
   1617 	 * No recovery, but check if error is set.
   1618 	 */
   1619 	if (ep->error)  {
   1620 		nfs4args_lookup_free(argop, num_argops);
   1621 		kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
   1622 		if (!recovery)
   1623 			nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
   1624 			    needrecov);
   1625 		return;
   1626 	}
   1627 
        	/* NOTE(review): no goto in this function targets is_link_err. */
   1628 is_link_err:
   1629 
   1630 	/* for non-recovery errors */
   1631 	if (res.status && res.status != NFS4ERR_SYMLINK) {
   1632 		if (!recovery) {
   1633 			nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
   1634 			    needrecov);
   1635 		}
   1636 		nfs4args_lookup_free(argop, num_argops);
   1637 		kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
   1638 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1639 		return;
   1640 	}
   1641 
   1642 	/*
   1643 	 * If any intermediate component in the path is a symbolic link,
   1644 	 * resolve the symlink, then try mount again using the new path.
   1645 	 */
   1646 	if (res.status == NFS4ERR_SYMLINK) {
   1647 		int where;
   1648 
   1649 		/*
   1650 		 * This must be from OP_LOOKUP failure. The (cfh) for this
   1651 		 * OP_LOOKUP is a symlink node. Find out where the
   1652 		 * OP_GETFH is for the (cfh) that is a symlink node.
   1653 		 *
   1654 		 * Example:
   1655 		 * (mount) PUTROOTFH, GETFH, LOOKUP comp1, GETFH, GETATTR,
   1656 		 * LOOKUP comp2, GETFH, GETATTR, LOOKUP comp3, GETFH, GETATTR
   1657 		 *
   1658 		 * LOOKUP comp3 fails with SYMLINK because comp2 is a symlink.
   1659 		 * In this case, where = 7, nthcomp = 2.
   1660 		 */
   1661 		where = res.array_len - 2;
   1662 		ASSERT(where > 0);
   1663 
   1664 		resop = &res.array[where - 1];
   1665 		ASSERT(resop->resop == OP_GETFH);
   1666 		tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
   1667 		nthcomp = res.array_len/3 - 1;
   1668 
   1669 		/*
   1670 		 * Need to call nfs4_end_fop before resolve_sympath to avoid
   1671 		 * potential nfs4_start_op deadlock.
   1672 		 */
   1673 		if (!recovery)
   1674 			nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
   1675 			    needrecov);
   1676 
        		/* tmpfhp points into res, so res is freed only afterwards */
   1677 		ep->error = resolve_sympath(mi, svp, nthcomp, tmpfhp, cr,
   1678 		    flags);
   1679 
   1680 		nfs4args_lookup_free(argop, num_argops);
   1681 		kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
   1682 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1683 
   1684 		if (ep->error)
   1685 			return;
   1686 
   1687 		goto recov_retry;
   1688 	}
   1689 
   1690 	/* getfh */
   1691 	resop = &res.array[res.array_len - 2];
   1692 	ASSERT(resop->resop == OP_GETFH);
   1693 	resfhp = &resop->nfs_resop4_u.opgetfh.object;
   1694 
   1695 	/* getattr fsinfo res */
   1696 	resop++;
   1697 	garp = &resop->nfs_resop4_u.opgetattr.ga_res;
   1698 
   1699 	*vtp = garp->n4g_va.va_type;
   1700 
   1701 	mi->mi_fh_expire_type = garp->n4g_ext_res->n4g_fet;
   1702 
   1703 	mutex_enter(&mi->mi_lock);
   1704 	if (garp->n4g_ext_res->n4g_pc4.pc4_link_support)
   1705 		mi->mi_flags |= MI4_LINK;
   1706 	if (garp->n4g_ext_res->n4g_pc4.pc4_symlink_support)
   1707 		mi->mi_flags |= MI4_SYMLINK;
   1708 	if (garp->n4g_ext_res->n4g_suppattrs & FATTR4_ACL_MASK)
   1709 		mi->mi_flags |= MI4_ACL;
   1710 	mutex_exit(&mi->mi_lock);
   1711 
        	/*
        	 * Clamp the transfer sizes to what was returned; a returned
        	 * maxread/maxwrite of 0 is replaced by MAXBSIZE for the clamp.
        	 */
   1712 	if (garp->n4g_ext_res->n4g_maxread == 0)
   1713 		mi->mi_tsize =
   1714 		    MIN(MAXBSIZE, mi->mi_tsize);
   1715 	else
   1716 		mi->mi_tsize =
   1717 		    MIN(garp->n4g_ext_res->n4g_maxread,
   1718 		    mi->mi_tsize);
   1719 
   1720 	if (garp->n4g_ext_res->n4g_maxwrite == 0)
   1721 		mi->mi_stsize =
   1722 		    MIN(MAXBSIZE, mi->mi_stsize);
   1723 	else
   1724 		mi->mi_stsize =
   1725 		    MIN(garp->n4g_ext_res->n4g_maxwrite,
   1726 		    mi->mi_stsize);
   1727 
        	/* a returned maxfilesize of 0 leaves mi_maxfilesize unchanged */
   1728 	if (garp->n4g_ext_res->n4g_maxfilesize != 0)
   1729 		mi->mi_maxfilesize =
   1730 		    MIN(garp->n4g_ext_res->n4g_maxfilesize,
   1731 		    mi->mi_maxfilesize);
   1732 
   1733 	/*
   1734 	 * If the final component is a symbolic link, resolve the symlink,
   1735 	 * then try mount again using the new path.
   1736 	 *
   1737 	 * Assume no symbolic link for root filesystem "/".
   1738 	 */
   1739 	if (*vtp == VLNK) {
   1740 		/*
   1741 		 * nthcomp is the total result length minus
   1742 		 * the 1st 2 OPs (PUTROOTFH, GETFH),
   1743 		 * then divided by 3 (LOOKUP,GETFH,GETATTR)
   1744 		 *
   1745 		 * e.g. PUTROOTFH GETFH LOOKUP 1st-comp GETFH GETATTR
   1746 		 *	LOOKUP 2nd-comp GETFH GETATTR
   1747 		 *
   1748 		 *	(8 - 2)/3 = 2
   1749 		 */
   1750 		nthcomp = (res.array_len - 2)/3;
   1751 
   1752 		/*
   1753 		 * Need to call nfs4_end_fop before resolve_sympath to avoid
   1754 		 * potential nfs4_start_op deadlock. See RFE 4777612.
   1755 		 */
   1756 		if (!recovery)
   1757 			nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
   1758 			    needrecov);
   1759 
        		/* resfhp points into res, so res is freed only afterwards */
   1760 		ep->error = resolve_sympath(mi, svp, nthcomp, resfhp, cr,
   1761 		    flags);
   1762 
   1763 		nfs4args_lookup_free(argop, num_argops);
   1764 		kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
   1765 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1766 
   1767 		if (ep->error)
   1768 			return;
   1769 
   1770 		goto recov_retry;
   1771 	}
   1772 
   1773 	/*
   1774 	 * We need to figure out where in the compound the getfh
   1775 	 * for the parent directory is. If the object to be mounted is
   1776 	 * the root, then there is no lookup at all:
   1777 	 * PUTROOTFH, GETFH.
   1778 	 * If the object to be mounted is in the root, then the compound is:
   1779 	 * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR.
   1780 	 * In either of these cases, the index of the GETFH is 1.
   1781 	 * If it is not at the root, then it's something like:
   1782 	 * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR,
   1783 	 * LOOKUP, GETFH, GETATTR
   1784 	 * In this case, the index is llndx (last lookup index) - 2.
   1785 	 */
   1786 	if (llndx == -1 || llndx == 2)
   1787 		resop = &res.array[1];
   1788 	else {
   1789 		ASSERT(llndx > 2);
   1790 		resop = &res.array[llndx-2];
   1791 	}
   1792 
   1793 	ASSERT(resop->resop == OP_GETFH);
   1794 	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
   1795 
   1796 	/* save the filehandles for the replica */
   1797 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
   1798 	ASSERT(tmpfhp->nfs_fh4_len <= NFS4_FHSIZE);
   1799 	svp->sv_pfhandle.fh_len = tmpfhp->nfs_fh4_len;
   1800 	bcopy(tmpfhp->nfs_fh4_val, svp->sv_pfhandle.fh_buf,
   1801 	    tmpfhp->nfs_fh4_len);
   1802 	ASSERT(resfhp->nfs_fh4_len <= NFS4_FHSIZE);
   1803 	svp->sv_fhandle.fh_len = resfhp->nfs_fh4_len;
   1804 	bcopy(resfhp->nfs_fh4_val, svp->sv_fhandle.fh_buf, resfhp->nfs_fh4_len);
   1805 
   1806 	/* initialize fsid and supp_attrs for server fs */
   1807 	svp->sv_fsid = garp->n4g_fsid;
   1808 	svp->sv_supp_attrs =
   1809 	    garp->n4g_ext_res->n4g_suppattrs | FATTR4_MANDATTR_MASK;
   1810 
   1811 	nfs_rw_exit(&svp->sv_lock);
   1812 
   1813 	nfs4args_lookup_free(argop, num_argops);
   1814 	kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
   1815 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1816 	if (!recovery)
   1817 		nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
   1818 }
   1819 
        /*
         * File-scope default values, private to this file.
         */
   1820 static ushort_t nfs4_max_threads = 8;	/* max number of active async threads */
   1821 static uint_t nfs4_bsize = 32 * 1024;	/* client `block' size */
   1822 static uint_t nfs4_async_clusters = 1;	/* # of reqs from each async queue */
        /* NOTE(review): presumably the timeout for connection-oriented transports -- confirm */
   1823 static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO;
   1824 
   1825 /*
   1826  * Remap the root filehandle for the given filesystem.
   1827  *
   1828  * results returned via the nfs4_error_t parameter.
   1829  */
   1830 void
   1831 nfs4_remap_root(mntinfo4_t *mi, nfs4_error_t *ep, int flags)
   1832 {
   1833 	struct servinfo4 *svp;
   1834 	vtype_t vtype;
   1835 	nfs_fh4 rootfh;
   1836 	int getfh_flags;
   1837 	char *orig_sv_path;
   1838 	int orig_sv_pathlen, num_retry;
   1839 
   1840 	mutex_enter(&mi->mi_lock);
   1841 
   1842 remap_retry:
   1843 	svp = mi->mi_curr_serv;
   1844 	getfh_flags =
   1845 	    (flags & NFS4_REMAP_NEEDSOP) ? NFS4_GETFH_NEEDSOP : 0;
   1846 	getfh_flags |=
   1847 	    (mi->mi_flags & MI4_PUBLIC) ? NFS4_GETFH_PUBLIC : 0;
   1848 	mutex_exit(&mi->mi_lock);
   1849 
   1850 	/*
   1851 	 * Just in case server path being mounted contains
   1852 	 * symlinks and fails w/STALE, save the initial sv_path
   1853 	 * so we can redrive the initial mount compound with the
   1854 	 * initial sv_path -- not a symlink-expanded version.
   1855 	 *
   1856 	 * This could only happen if a symlink was expanded
   1857 	 * and the expanded mount compound failed stale.  Because
   1858 	 * it could be the case that the symlink was removed at
   1859 	 * the server (and replaced with another symlink/dir,
   1860 	 * the server (and replaced with another symlink/dir),
   1861 	 * to re-lookup everything and recover.
   1862 	 */
   1863 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1864 	orig_sv_pathlen = svp->sv_pathlen;
   1865 	orig_sv_path = kmem_alloc(orig_sv_pathlen, KM_SLEEP);
   1866 	bcopy(svp->sv_path, orig_sv_path, orig_sv_pathlen);
   1867 	nfs_rw_exit(&svp->sv_lock);
   1868 
   1869 	num_retry = nfs4_max_mount_retry;
   1870 
   1871 	do {
   1872 		/*
   1873 		 * Get the root fh from the server.  Retry nfs4_max_mount_retry
   1874 		 * (2) times if it fails with STALE since the recovery
   1875 		 * infrastructure doesn't do STALE recovery for components
   1876 		 * of the server path to the object being mounted.
   1877 		 */
   1878 		nfs4getfh_otw(mi, svp, &vtype, getfh_flags, CRED(), ep);
   1879 
   1880 		if (ep->error == 0 && ep->stat == NFS4_OK)
   1881 			break;
   1882 
   1883 		/*
   1884 		 * For some reason, the mount compound failed.  Before
   1885 		 * retrying, we need to restore the original sv_path
   1886 		 * because it might have contained symlinks that were
   1887 		 * expanded by nfsgetfh_otw before the failure occurred.
   1888 		 * replace current sv_path with orig sv_path -- just in case
   1889 		 * it changed due to embedded symlinks.
   1890 		 */
   1891 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1892 		if (orig_sv_pathlen != svp->sv_pathlen) {
   1893 			kmem_free(svp->sv_path, svp->sv_pathlen);
   1894 			svp->sv_path = kmem_alloc(orig_sv_pathlen, KM_SLEEP);
   1895 			svp->sv_pathlen = orig_sv_pathlen;
   1896 		}
   1897 		bcopy(orig_sv_path, svp->sv_path, orig_sv_pathlen);
   1898 		nfs_rw_exit(&svp->sv_lock);
   1899 
   1900 	} while (num_retry-- > 0);
   1901 
   1902 	kmem_free(orig_sv_path, orig_sv_pathlen);
   1903 
   1904 	if (ep->error != 0 || ep->stat != 0) {
   1905 		return;
   1906 	}
   1907 
   1908 	if (vtype != VNON && vtype != mi->mi_type) {
   1909 		/* shouldn't happen */
   1910 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
   1911 		    "nfs4_remap_root: server root vnode type (%d) doesn't "
   1912 		    "match mount info (%d)", vtype, mi->mi_type);
   1913 	}
   1914 
   1915 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1916 	rootfh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
   1917 	rootfh.nfs_fh4_len = svp->sv_fhandle.fh_len;
   1918 	nfs_rw_exit(&svp->sv_lock);
   1919 	sfh4_update(mi->mi_rootfh, &rootfh);
   1920 
   1921 	/*
   1922 	 * It's possible that recovery took place on the filesystem
   1923 	 * and the server has been updated between the time we did
   1924 	 * the nfs4getfh_otw and now. Re-drive the otw operation
   1925 	 * to make sure we have a good fh.
   1926 	 */
   1927 	mutex_enter(&mi->mi_lock);
   1928 	if (mi->mi_curr_serv != svp)
   1929 		goto remap_retry;
   1930 
   1931 	mutex_exit(&mi->mi_lock);
   1932 }
   1933 
   1934 static int
   1935 nfs4rootvp(vnode_t **rtvpp, vfs_t *vfsp, struct servinfo4 *svp_head,
   1936     int flags, cred_t *cr, zone_t *zone)
   1937 {
   1938 	vnode_t *rtvp = NULL;
   1939 	mntinfo4_t *mi;
   1940 	dev_t nfs_dev;
   1941 	int error = 0;
   1942 	rnode4_t *rp;
   1943 	int i;
   1944 	struct vattr va;
   1945 	vtype_t vtype = VNON;
   1946 	vtype_t tmp_vtype = VNON;
   1947 	struct servinfo4 *firstsvp = NULL, *svp = svp_head;
   1948 	nfs4_oo_hash_bucket_t *bucketp;
   1949 	nfs_fh4 fh;
   1950 	char *droptext = "";
   1951 	struct nfs_stats *nfsstatsp;
   1952 	nfs4_fname_t *mfname;
   1953 	nfs4_error_t e;
   1954 	char *orig_sv_path;
   1955 	int orig_sv_pathlen, num_retry, removed;
   1956 	cred_t *lcr = NULL, *tcr = cr;
   1957 
   1958 	nfsstatsp = zone_getspecific(nfsstat_zone_key, nfs_zone());
   1959 	ASSERT(nfsstatsp != NULL);
   1960 
   1961 	ASSERT(nfs_zone() == zone);
   1962 	ASSERT(crgetref(cr));
   1963 
   1964 	/*
   1965 	 * Create a mount record and link it to the vfs struct.
   1966 	 */
   1967 	mi = kmem_zalloc(sizeof (*mi), KM_SLEEP);
   1968 	mutex_init(&mi->mi_lock, NULL, MUTEX_DEFAULT, NULL);
   1969 	nfs_rw_init(&mi->mi_recovlock, NULL, RW_DEFAULT, NULL);
   1970 	nfs_rw_init(&mi->mi_rename_lock, NULL, RW_DEFAULT, NULL);
   1971 	nfs_rw_init(&mi->mi_fh_lock, NULL, RW_DEFAULT, NULL);
   1972 
   1973 	if (!(flags & NFSMNT_SOFT))
   1974 		mi->mi_flags |= MI4_HARD;
   1975 	if ((flags & NFSMNT_NOPRINT))
   1976 		mi->mi_flags |= MI4_NOPRINT;
   1977 	if (flags & NFSMNT_INT)
   1978 		mi->mi_flags |= MI4_INT;
   1979 	if (flags & NFSMNT_PUBLIC)
   1980 		mi->mi_flags |= MI4_PUBLIC;
   1981 	if (flags & NFSMNT_MIRRORMOUNT)
   1982 		mi->mi_flags |= MI4_MIRRORMOUNT;
   1983 	mi->mi_retrans = NFS_RETRIES;
   1984 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
   1985 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS)
   1986 		mi->mi_timeo = nfs4_cots_timeo;
   1987 	else
   1988 		mi->mi_timeo = NFS_TIMEO;
   1989 	mi->mi_prog = NFS_PROGRAM;
   1990 	mi->mi_vers = NFS_V4;
   1991 	mi->mi_rfsnames = rfsnames_v4;
   1992 	mi->mi_reqs = nfsstatsp->nfs_stats_v4.rfsreqcnt_ptr;
   1993 	cv_init(&mi->mi_failover_cv, NULL, CV_DEFAULT, NULL);
   1994 	mi->mi_servers = svp;
   1995 	mi->mi_curr_serv = svp;
   1996 	mi->mi_acregmin = SEC2HR(ACREGMIN);
   1997 	mi->mi_acregmax = SEC2HR(ACREGMAX);
   1998 	mi->mi_acdirmin = SEC2HR(ACDIRMIN);
   1999 	mi->mi_acdirmax = SEC2HR(ACDIRMAX);
   2000 	mi->mi_fh_expire_type = FH4_PERSISTENT;
   2001 	mi->mi_clientid_next = NULL;
   2002 	mi->mi_clientid_prev = NULL;
   2003 	mi->mi_srv = NULL;
   2004 	mi->mi_grace_wait = 0;
   2005 	mi->mi_error = 0;
   2006 	mi->mi_srvsettime = 0;
   2007 	mi->mi_srvset_cnt = 0;
   2008 
   2009 	mi->mi_count = 1;
   2010 
   2011 	mi->mi_tsize = nfs4_tsize(svp->sv_knconf);
   2012 	mi->mi_stsize = mi->mi_tsize;
   2013 
   2014 	if (flags & NFSMNT_DIRECTIO)
   2015 		mi->mi_flags |= MI4_DIRECTIO;
   2016 
   2017 	mi->mi_flags |= MI4_MOUNTING;
   2018 
   2019 	/*
   2020 	 * Make a vfs struct for nfs.  We do this here instead of below
   2021 	 * because rtvp needs a vfs before we can do a getattr on it.
   2022 	 *
   2023 	 * Assign a unique device id to the mount
   2024 	 */
   2025 	mutex_enter(&nfs_minor_lock);
   2026 	do {
   2027 		nfs_minor = (nfs_minor + 1) & MAXMIN32;
   2028 		nfs_dev = makedevice(nfs_major, nfs_minor);
   2029 	} while (vfs_devismounted(nfs_dev));
   2030 	mutex_exit(&nfs_minor_lock);
   2031 
   2032 	vfsp->vfs_dev = nfs_dev;
   2033 	vfs_make_fsid(&vfsp->vfs_fsid, nfs_dev, nfs4fstyp);
   2034 	vfsp->vfs_data = (caddr_t)mi;
   2035 	vfsp->vfs_fstype = nfsfstyp;
   2036 	vfsp->vfs_bsize = nfs4_bsize;
   2037 
   2038 	/*
   2039 	 * Initialize fields used to support async putpage operations.
   2040 	 */
   2041 	for (i = 0; i < NFS4_ASYNC_TYPES; i++)
   2042 		mi->mi_async_clusters[i] = nfs4_async_clusters;
   2043 	mi->mi_async_init_clusters = nfs4_async_clusters;
   2044 	mi->mi_async_curr = &mi->mi_async_reqs[0];
   2045 	mi->mi_max_threads = nfs4_max_threads;
   2046 	mutex_init(&mi->mi_async_lock, NULL, MUTEX_DEFAULT, NULL);
   2047 	cv_init(&mi->mi_async_reqs_cv, NULL, CV_DEFAULT, NULL);
   2048 	cv_init(&mi->mi_async_work_cv, NULL, CV_DEFAULT, NULL);
   2049 	cv_init(&mi->mi_async_cv, NULL, CV_DEFAULT, NULL);
   2050 	cv_init(&mi->mi_inact_req_cv, NULL, CV_DEFAULT, NULL);
   2051 
   2052 	mi->mi_vfsp = vfsp;
   2053 	zone_hold(mi->mi_zone = zone);
   2054 	nfs4_mi_zonelist_add(mi);
   2055 
   2056 	/*
   2057 	 * Initialize the <open owner/cred> hash table.
   2058 	 */
   2059 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
   2060 		bucketp = &(mi->mi_oo_list[i]);
   2061 		mutex_init(&bucketp->b_lock, NULL, MUTEX_DEFAULT, NULL);
   2062 		list_create(&bucketp->b_oo_hash_list,
   2063 		    sizeof (nfs4_open_owner_t),
   2064 		    offsetof(nfs4_open_owner_t, oo_hash_node));
   2065 	}
   2066 
   2067 	/*
   2068 	 * Initialize the freed open owner list.
   2069 	 */
   2070 	mi->mi_foo_num = 0;
   2071 	mi->mi_foo_max = NFS4_NUM_FREED_OPEN_OWNERS;
   2072 	list_create(&mi->mi_foo_list, sizeof (nfs4_open_owner_t),
   2073 	    offsetof(nfs4_open_owner_t, oo_foo_node));
   2074 
   2075 	list_create(&mi->mi_lost_state, sizeof (nfs4_lost_rqst_t),
   2076 	    offsetof(nfs4_lost_rqst_t, lr_node));
   2077 
   2078 	list_create(&mi->mi_bseqid_list, sizeof (nfs4_bseqid_entry_t),
   2079 	    offsetof(nfs4_bseqid_entry_t, bs_node));
   2080 
   2081 	/*
   2082 	 * Initialize the msg buffer.
   2083 	 */
   2084 	list_create(&mi->mi_msg_list, sizeof (nfs4_debug_msg_t),
   2085 	    offsetof(nfs4_debug_msg_t, msg_node));
   2086 	mi->mi_msg_count = 0;
   2087 	mutex_init(&mi->mi_msg_list_lock, NULL, MUTEX_DEFAULT, NULL);
   2088 
   2089 	/*
   2090 	 * Initialize kstats
   2091 	 */
   2092 	nfs4_mnt_kstat_init(vfsp);
   2093 
   2094 	/*
   2095 	 * Initialize the shared filehandle pool.
   2096 	 */
   2097 	sfh4_createtab(&mi->mi_filehandles);
   2098 
   2099 	/*
   2100 	 * Save server path we're attempting to mount.
   2101 	 */
   2102 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
   2103 	orig_sv_pathlen = svp_head->sv_pathlen;
   2104 	orig_sv_path = kmem_alloc(svp_head->sv_pathlen, KM_SLEEP);
   2105 	bcopy(svp_head->sv_path, orig_sv_path, svp_head->sv_pathlen);
   2106 	nfs_rw_exit(&svp->sv_lock);
   2107 
   2108 	/*
   2109 	 * Make the GETFH call to get root fh for each replica.
   2110 	 */
   2111 	if (svp_head->sv_next)
   2112 		droptext = ", dropping replica";
   2113 
   2114 	/*
   2115 	 * If the uid is set then set the creds for secure mounts
   2116 	 * by proxy processes such as automountd.
   2117 	 */
   2118 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   2119 	if (svp->sv_secdata->uid != 0 &&
   2120 	    svp->sv_secdata->rpcflavor == RPCSEC_GSS) {
   2121 		lcr = crdup(cr);
   2122 		(void) crsetugid(lcr, svp->sv_secdata->uid, crgetgid(cr));
   2123 		tcr = lcr;
   2124 	}
   2125 	nfs_rw_exit(&svp->sv_lock);
   2126 	for (svp = svp_head; svp; svp = svp->sv_next) {
   2127 		if (nfs4_chkdup_servinfo4(svp_head, svp)) {
   2128 			nfs_cmn_err(error, CE_WARN,
   2129 			    VERS_MSG "Host %s is a duplicate%s",
   2130 			    svp->sv_hostname, droptext);
   2131 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
   2132 			svp->sv_flags |= SV4_NOTINUSE;
   2133 			nfs_rw_exit(&svp->sv_lock);
   2134 			continue;
   2135 		}
   2136 		mi->mi_curr_serv = svp;
   2137 
   2138 		/*
   2139 		 * Just in case server path being mounted contains
   2140 		 * symlinks and fails w/STALE, save the initial sv_path
   2141 		 * so we can redrive the initial mount compound with the
   2142 		 * initial sv_path -- not a symlink-expanded version.
   2143 		 *
   2144 		 * This could only happen if a symlink was expanded
   2145 		 * and the expanded mount compound failed stale.  Because
   2146 		 * it could be the case that the symlink was removed at
   2147 		 * the server (and replaced with another symlink/dir,
   2148 		 * we need to use the initial sv_path when attempting
   2149 		 * to re-lookup everything and recover.
   2150 		 *
   2151 		 * Other mount errors should evenutally be handled here also
   2152 		 * (NFS4ERR_DELAY, NFS4ERR_RESOURCE).  For now, all mount
   2153 		 * failures will result in mount being redriven a few times.
   2154 		 */
   2155 		num_retry = nfs4_max_mount_retry;
   2156 		do {
   2157 			nfs4getfh_otw(mi, svp, &tmp_vtype,
   2158 			    ((flags & NFSMNT_PUBLIC) ? NFS4_GETFH_PUBLIC : 0) |
   2159 			    NFS4_GETFH_NEEDSOP, tcr, &e);
   2160 
   2161 			if (e.error == 0 && e.stat == NFS4_OK)
   2162 				break;
   2163 
   2164 			/*
   2165 			 * replace current sv_path with orig sv_path -- just in
   2166 			 * case it changed due to embedded symlinks.
   2167 			 */
   2168 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   2169 			if (orig_sv_pathlen != svp->sv_pathlen) {
   2170 				kmem_free(svp->sv_path, svp->sv_pathlen);
   2171 				svp->sv_path = kmem_alloc(orig_sv_pathlen,
   2172 				    KM_SLEEP);
   2173 				svp->sv_pathlen = orig_sv_pathlen;
   2174 			}
   2175 			bcopy(orig_sv_path, svp->sv_path, orig_sv_pathlen);
   2176 			nfs_rw_exit(&svp->sv_lock);
   2177 
   2178 		} while (num_retry-- > 0);
   2179 
   2180 		error = e.error ? e.error : geterrno4(e.stat);
   2181 		if (error) {
   2182 			nfs_cmn_err(error, CE_WARN,
   2183 			    VERS_MSG "initial call to %s failed%s: %m",
   2184 			    svp->sv_hostname, droptext);
   2185 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
   2186 			svp->sv_flags |= SV4_NOTINUSE;
   2187 			nfs_rw_exit(&svp->sv_lock);
   2188 			mi->mi_flags &= ~MI4_RECOV_FAIL;
   2189 			mi->mi_error = 0;
   2190 			continue;
   2191 		}
   2192 
   2193 		if (tmp_vtype == VBAD) {
   2194 			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
   2195 			    VERS_MSG "%s returned a bad file type for "
   2196 			    "root%s", svp->sv_hostname, droptext);
   2197 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
   2198 			svp->sv_flags |= SV4_NOTINUSE;
   2199 			nfs_rw_exit(&svp->sv_lock);
   2200 			continue;
   2201 		}
   2202 
   2203 		if (vtype == VNON) {
   2204 			vtype = tmp_vtype;
   2205 		} else if (vtype != tmp_vtype) {
   2206 			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
   2207 			    VERS_MSG "%s returned a different file type "
   2208 			    "for root%s", svp->sv_hostname, droptext);
   2209 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
   2210 			svp->sv_flags |= SV4_NOTINUSE;
   2211 			nfs_rw_exit(&svp->sv_lock);
   2212 			continue;
   2213 		}
   2214 		if (firstsvp == NULL)
   2215 			firstsvp = svp;
   2216 	}
   2217 
   2218 	kmem_free(orig_sv_path, orig_sv_pathlen);
   2219 
   2220 	if (firstsvp == NULL) {
   2221 		if (error == 0)
   2222 			error = ENOENT;
   2223 		goto bad;
   2224 	}
   2225 
   2226 	mi->mi_curr_serv = svp = firstsvp;
   2227 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   2228 	ASSERT((mi->mi_curr_serv->sv_flags & SV4_NOTINUSE) == 0);
   2229 	fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
   2230 	fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
   2231 	mi->mi_rootfh = sfh4_get(&fh, mi);
   2232 	fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
   2233 	fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
   2234 	mi->mi_srvparentfh = sfh4_get(&fh, mi);
   2235 	nfs_rw_exit(&svp->sv_lock);
   2236 
   2237 	/*
   2238 	 * Get the fname for filesystem root.
   2239 	 */
   2240 	mi->mi_fname = fn_get(NULL, ".", mi->mi_rootfh);
   2241 	mfname = mi->mi_fname;
   2242 	fn_hold(mfname);
   2243 
   2244 	/*
   2245 	 * Make the root vnode without attributes.
   2246 	 */
   2247 	rtvp = makenfs4node_by_fh(mi->mi_rootfh, NULL,
   2248 	    &mfname, NULL, mi, cr, gethrtime());
   2249 	rtvp->v_type = vtype;
   2250 
   2251 	mi->mi_curread = mi->mi_tsize;
   2252 	mi->mi_curwrite = mi->mi_stsize;
   2253 
   2254 	/*
   2255 	 * Start the manager thread responsible for handling async worker
   2256 	 * threads.
   2257 	 */
   2258 	MI4_HOLD(mi);
   2259 	VFS_HOLD(vfsp);	/* add reference for thread */
   2260 	mi->mi_manager_thread = zthread_create(NULL, 0, nfs4_async_manager,
   2261 	    vfsp, 0, minclsyspri);
   2262 	ASSERT(mi->mi_manager_thread != NULL);
   2263 
   2264 	/*
   2265 	 * Create the thread that handles over-the-wire calls for
   2266 	 * VOP_INACTIVE.
   2267 	 * This needs to happen after the manager thread is created.
   2268 	 */
   2269 	MI4_HOLD(mi);
   2270 	mi->mi_inactive_thread = zthread_create(NULL, 0, nfs4_inactive_thread,
   2271 	    mi, 0, minclsyspri);
   2272 	ASSERT(mi->mi_inactive_thread != NULL);
   2273 
   2274 	/* If we didn't get a type, get one now */
   2275 	if (rtvp->v_type == VNON) {
   2276 		va.va_mask = AT_TYPE;
   2277 		error = nfs4getattr(rtvp, &va, tcr);
   2278 		if (error)
   2279 			goto bad;
   2280 		rtvp->v_type = va.va_type;
   2281 	}
   2282 
   2283 	mi->mi_type = rtvp->v_type;
   2284 
   2285 	mutex_enter(&mi->mi_lock);
   2286 	mi->mi_flags &= ~MI4_MOUNTING;
   2287 	mutex_exit(&mi->mi_lock);
   2288 
   2289 	*rtvpp = rtvp;
   2290 	if (lcr != NULL)
   2291 		crfree(lcr);
   2292 
   2293 	return (0);
   2294 bad:
   2295 	/*
   2296 	 * An error occurred somewhere, need to clean up...
   2297 	 */
   2298 	if (lcr != NULL)
   2299 		crfree(lcr);
   2300 
   2301 	if (rtvp != NULL) {
   2302 		/*
   2303 		 * We need to release our reference to the root vnode and
   2304 		 * destroy the mntinfo4 struct that we just created.
   2305 		 */
   2306 		rp = VTOR4(rtvp);
   2307 		if (rp->r_flags & R4HASHED)
   2308 			rp4_rmhash(rp);
   2309 		VN_RELE(rtvp);
   2310 	}
   2311 	nfs4_async_stop(vfsp);
   2312 	nfs4_async_manager_stop(vfsp);
   2313 	removed = nfs4_mi_zonelist_remove(mi);
   2314 	if (removed)
   2315 		zone_rele(mi->mi_zone);
   2316 
   2317 	/*
   2318 	 * This releases the initial "hold" of the mi since it will never
   2319 	 * be referenced by the vfsp.  Also, when mount returns to vfs.c
   2320 	 * with an error, the vfsp will be destroyed, not rele'd.
   2321 	 */
   2322 	MI4_RELE(mi);
   2323 
   2324 	*rtvpp = NULL;
   2325 	return (error);
   2326 }
   2327 
   2328 /*
   2329  * vfs operations
   2330  */
   2331 static int
   2332 nfs4_unmount(vfs_t *vfsp, int flag, cred_t *cr)
   2333 {
   2334 	mntinfo4_t		*mi;
   2335 	ushort_t		omax;
   2336 	int			removed;
   2337 
   2338 	bool_t			must_unlock;
   2339 	bool_t			must_rele;
   2340 
   2341 	nfs4_ephemeral_tree_t	*eph_tree;
   2342 
   2343 	if (secpolicy_fs_unmount(cr, vfsp) != 0)
   2344 		return (EPERM);
   2345 
   2346 	mi = VFTOMI4(vfsp);
   2347 
   2348 	if (flag & MS_FORCE) {
   2349 		vfsp->vfs_flag |= VFS_UNMOUNTED;
   2350 		if (nfs_zone() != mi->mi_zone) {
   2351 			/*
   2352 			 * If the request is coming from the wrong zone,
   2353 			 * we don't want to create any new threads, and
   2354 			 * performance is not a concern.  Do everything
   2355 			 * inline.
   2356 			 */
   2357 			NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
   2358 			    "nfs4_unmount x-zone forced unmount of vfs %p\n",
   2359 			    (void *)vfsp));
   2360 			nfs4_free_mount(vfsp, flag, cr);
   2361 		} else {
   2362 			/*
   2363 			 * Free data structures asynchronously, to avoid
   2364 			 * blocking the current thread (for performance
   2365 			 * reasons only).
   2366 			 */
   2367 			async_free_mount(vfsp, flag, cr);
   2368 		}
   2369 
   2370 		return (0);
   2371 	}
   2372 
   2373 	/*
   2374 	 * Wait until all asynchronous putpage operations on
   2375 	 * this file system are complete before flushing rnodes
   2376 	 * from the cache.
   2377 	 */
   2378 	omax = mi->mi_max_threads;
   2379 	if (nfs4_async_stop_sig(vfsp))
   2380 		return (EINTR);
   2381 
   2382 	r4flush(vfsp, cr);
   2383 
   2384 	/*
   2385 	 * About the only reason that this would fail would be
   2386 	 * that the harvester is already busy tearing down this
   2387 	 * node. So we fail back to the caller and let them try
   2388 	 * again when needed.
   2389 	 */
   2390 	if (nfs4_ephemeral_umount(mi, flag, cr,
   2391 	    &must_unlock, &must_rele, &eph_tree)) {
   2392 		ASSERT(must_unlock == FALSE);
   2393 		mutex_enter(&mi->mi_async_lock);
   2394 		mi->mi_max_threads = omax;
   2395 		mutex_exit(&mi->mi_async_lock);
   2396 
   2397 		return (EBUSY);
   2398 	}
   2399 
   2400 	/*
   2401 	 * If there are any active vnodes on this file system,
   2402 	 * then the file system is busy and can't be unmounted.
   2403 	 */
   2404 	if (check_rtable4(vfsp)) {
   2405 		nfs4_ephemeral_umount_unlock(&must_unlock, &must_rele,
   2406 		    &eph_tree);
   2407 
   2408 		mutex_enter(&mi->mi_async_lock);
   2409 		mi->mi_max_threads = omax;
   2410 		mutex_exit(&mi->mi_async_lock);
   2411 
   2412 		return (EBUSY);
   2413 	}
   2414 
   2415 	/*
   2416 	 * The unmount can't fail from now on, so record any
   2417 	 * ephemeral changes.
   2418 	 */
   2419 	nfs4_ephemeral_umount_activate(mi, &must_unlock,
   2420 	    &must_rele, &eph_tree);
   2421 
   2422 	/*
   2423 	 * There are no active files that could require over-the-wire
   2424 	 * calls to the server, so stop the async manager and the
   2425 	 * inactive thread.
   2426 	 */
   2427 	nfs4_async_manager_stop(vfsp);
   2428 
   2429 	/*
   2430 	 * Destroy all rnodes belonging to this file system from the
   2431 	 * rnode hash queues and purge any resources allocated to
   2432 	 * them.
   2433 	 */
   2434 	destroy_rtable4(vfsp, cr);
   2435 	vfsp->vfs_flag |= VFS_UNMOUNTED;
   2436 
   2437 	nfs4_remove_mi_from_server(mi, NULL);
   2438 	removed = nfs4_mi_zonelist_remove(mi);
   2439 	if (removed)
   2440 		zone_rele(mi->mi_zone);
   2441 
   2442 	return (0);
   2443 }
   2444 
   2445 /*
   2446  * find root of nfs
   2447  */
   2448 static int
   2449 nfs4_root(vfs_t *vfsp, vnode_t **vpp)
   2450 {
   2451 	mntinfo4_t *mi;
   2452 	vnode_t *vp;
   2453 	nfs4_fname_t *mfname;
   2454 	servinfo4_t *svp;
   2455 
   2456 	mi = VFTOMI4(vfsp);
   2457 
   2458 	if (nfs_zone() != mi->mi_zone)
   2459 		return (EPERM);
   2460 
   2461 	svp = mi->mi_curr_serv;
   2462 	if (svp) {
   2463 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   2464 		if (svp->sv_flags & SV4_ROOT_STALE) {
   2465 			nfs_rw_exit(&svp->sv_lock);
   2466 
   2467 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
   2468 			if (svp->sv_flags & SV4_ROOT_STALE) {
   2469 				svp->sv_flags &= ~SV4_ROOT_STALE;
   2470 				nfs_rw_exit(&svp->sv_lock);
   2471 				return (ENOENT);
   2472 			}
   2473 			nfs_rw_exit(&svp->sv_lock);
   2474 		} else
   2475 			nfs_rw_exit(&svp->sv_lock);
   2476 	}
   2477 
   2478 	mfname = mi->mi_fname;
   2479 	fn_hold(mfname);
   2480 	vp = makenfs4node_by_fh(mi->mi_rootfh, NULL, &mfname, NULL,
   2481 	    VFTOMI4(vfsp), CRED(), gethrtime());
   2482 
   2483 	if (VTOR4(vp)->r_flags & R4STALE) {
   2484 		VN_RELE(vp);
   2485 		return (ENOENT);
   2486 	}
   2487 
   2488 	ASSERT(vp->v_type == VNON || vp->v_type == mi->mi_type);
   2489 
   2490 	vp->v_type = mi->mi_type;
   2491 
   2492 	*vpp = vp;
   2493 
   2494 	return (0);
   2495 }
   2496 
   2497 static int
   2498 nfs4_statfs_otw(vnode_t *vp, struct statvfs64 *sbp, cred_t *cr)
   2499 {
   2500 	int error;
   2501 	nfs4_ga_res_t gar;
   2502 	nfs4_ga_ext_res_t ger;
   2503 
   2504 	gar.n4g_ext_res = &ger;
   2505 
   2506 	if (error = nfs4_attr_otw(vp, TAG_FSINFO, &gar,
   2507 	    NFS4_STATFS_ATTR_MASK, cr))
   2508 		return (error);
   2509 
   2510 	*sbp = gar.n4g_ext_res->n4g_sb;
   2511 
   2512 	return (0);
   2513 }
   2514 
   2515 /*
   2516  * Get file system statistics.
   2517  */
   2518 static int
   2519 nfs4_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
   2520 {
   2521 	int error;
   2522 	vnode_t *vp;
   2523 	cred_t *cr;
   2524 
   2525 	error = nfs4_root(vfsp, &vp);
   2526 	if (error)
   2527 		return (error);
   2528 
   2529 	cr = CRED();
   2530 
   2531 	error = nfs4_statfs_otw(vp, sbp, cr);
   2532 	if (!error) {
   2533 		(void) strncpy(sbp->f_basetype,
   2534 		    vfssw[vfsp->vfs_fstype].vsw_name, FSTYPSZ);
   2535 		sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
   2536 	} else {
   2537 		nfs4_purge_stale_fh(error, vp, cr);
   2538 	}
   2539 
   2540 	VN_RELE(vp);
   2541 
   2542 	return (error);
   2543 }
   2544 
   2545 static kmutex_t nfs4_syncbusy;
   2546 
   2547 /*
   2548  * Flush dirty nfs files for file system vfsp.
   2549  * If vfsp == NULL, all nfs files are flushed.
   2550  *
   2551  * SYNC_CLOSE in flag is passed to us to
   2552  * indicate that we are shutting down and or
   2553  * rebooting.
   2554  */
   2555 static int
   2556 nfs4_sync(vfs_t *vfsp, short flag, cred_t *cr)
   2557 {
   2558 	/*
   2559 	 * Cross-zone calls are OK here, since this translates to a
   2560 	 * VOP_PUTPAGE(B_ASYNC), which gets picked up by the right zone.
   2561 	 */
   2562 	if (!(flag & SYNC_ATTR) && mutex_tryenter(&nfs4_syncbusy) != 0) {
   2563 		r4flush(vfsp, cr);
   2564 		mutex_exit(&nfs4_syncbusy);
   2565 	}
   2566 
   2567 	/*
   2568 	 * if SYNC_CLOSE is set then we know that
   2569 	 * the system is rebooting, mark the mntinfo
   2570 	 * for later examination.
   2571 	 */
   2572 	if (vfsp && (flag & SYNC_CLOSE)) {
   2573 		mntinfo4_t *mi;
   2574 
   2575 		mi = VFTOMI4(vfsp);
   2576 		if (!(mi->mi_flags & MI4_SHUTDOWN)) {
   2577 			mutex_enter(&mi->mi_lock);
   2578 			mi->mi_flags |= MI4_SHUTDOWN;
   2579 			mutex_exit(&mi->mi_lock);
   2580 		}
   2581 	}
   2582 	return (0);
   2583 }
   2584 
   2585 /*
   2586  * vget is difficult, if not impossible, to support in v4 because we don't
   2587  * know the parent directory or name, which makes it impossible to create a
   2588  * useful shadow vnode.  And we need the shadow vnode for things like
   2589  * OPEN.
   2590  */
   2591 
   2592 /* ARGSUSED */
   2593 /*
   2594  * XXX Check nfs4_vget_pseudo() for dependency.
   2595  */
   2596 static int
   2597 nfs4_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
   2598 {
   2599 	return (EREMOTE);
   2600 }
   2601 
   2602 /*
   2603  * nfs4_mountroot get called in the case where we are diskless booting.  All
   2604  * we need from here is the ability to get the server info and from there we
   2605  * can simply call nfs4_rootvp.
   2606  */
   2607 /* ARGSUSED */
   2608 static int
   2609 nfs4_mountroot(vfs_t *vfsp, whymountroot_t why)
   2610 {
   2611 	vnode_t *rtvp;
   2612 	char root_hostname[SYS_NMLN+1];
   2613 	struct servinfo4 *svp;
   2614 	int error;
   2615 	int vfsflags;
   2616 	size_t size;
   2617 	char *root_path;
   2618 	struct pathname pn;
   2619 	char *name;
   2620 	cred_t *cr;
   2621 	mntinfo4_t *mi;
   2622 	struct nfs_args args;		/* nfs mount arguments */
   2623 	static char token[10];
   2624 	nfs4_error_t n4e;
   2625 
   2626 	bzero(&args, sizeof (args));
   2627 
   2628 	/* do this BEFORE getfile which causes xid stamps to be initialized */
   2629 	clkset(-1L);		/* hack for now - until we get time svc? */
   2630 
   2631 	if (why == ROOT_REMOUNT) {
   2632 		/*
   2633 		 * Shouldn't happen.
   2634 		 */
   2635 		panic("nfs4_mountroot: why == ROOT_REMOUNT");
   2636 	}
   2637 
   2638 	if (why == ROOT_UNMOUNT) {
   2639 		/*
   2640 		 * Nothing to do for NFS.
   2641 		 */
   2642 		return (0);
   2643 	}
   2644 
   2645 	/*
   2646 	 * why == ROOT_INIT
   2647 	 */
   2648 
   2649 	name = token;
   2650 	*name = 0;
   2651 	(void) getfsname("root", name, sizeof (token));
   2652 
   2653 	pn_alloc(&pn);
   2654 	root_path = pn.pn_path;
   2655 
   2656 	svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
   2657 	nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
   2658 	svp->sv_knconf = kmem_zalloc(sizeof (*svp->sv_knconf), KM_SLEEP);
   2659 	svp->sv_knconf->knc_protofmly = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
   2660 	svp->sv_knconf->knc_proto = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
   2661 
   2662 	/*
   2663 	 * Get server address
   2664 	 * Get the root path
   2665 	 * Get server's transport
   2666 	 * Get server's hostname
   2667 	 * Get options
   2668 	 */
   2669 	args.addr = &svp->sv_addr;
   2670 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   2671 	args.fh = (char *)&svp->sv_fhandle;
   2672 	args.knconf = svp->sv_knconf;
   2673 	args.hostname = root_hostname;
   2674 	vfsflags = 0;
   2675 	if (error = mount_root(*name ? name : "root", root_path, NFS_V4,
   2676 	    &args, &vfsflags)) {
   2677 		if (error == EPROTONOSUPPORT)
   2678 			nfs_cmn_err(error, CE_WARN, "nfs4_mountroot: "
   2679 			    "mount_root failed: server doesn't support NFS V4");
   2680 		else
   2681 			nfs_cmn_err(error, CE_WARN,
   2682 			    "nfs4_mountroot: mount_root failed: %m");
   2683 		nfs_rw_exit(&svp->sv_lock);
   2684 		sv4_free(svp);
   2685 		pn_free(&pn);
   2686 		return (error);
   2687 	}
   2688 	nfs_rw_exit(&svp->sv_lock);
   2689 	svp->sv_hostnamelen = (int)(strlen(root_hostname) + 1);
   2690 	svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
   2691 	(void) strcpy(svp->sv_hostname, root_hostname);
   2692 
   2693 	svp->sv_pathlen = (int)(strlen(root_path) + 1);
   2694 	svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
   2695 	(void) strcpy(svp->sv_path, root_path);
   2696 
   2697 	/*
   2698 	 * Force root partition to always be mounted with AUTH_UNIX for now
   2699 	 */
   2700 	svp->sv_secdata = kmem_alloc(sizeof (*svp->sv_secdata), KM_SLEEP);
   2701 	svp->sv_secdata->secmod = AUTH_UNIX;
   2702 	svp->sv_secdata->rpcflavor = AUTH_UNIX;
   2703 	svp->sv_secdata->data = NULL;
   2704 
   2705 	cr = crgetcred();
   2706 	rtvp = NULL;
   2707 
   2708 	error = nfs4rootvp(&rtvp, vfsp, svp, args.flags, cr, global_zone);
   2709 
   2710 	if (error) {
   2711 		crfree(cr);
   2712 		pn_free(&pn);
   2713 		sv4_free(svp);
   2714 		return (error);
   2715 	}
   2716 
   2717 	mi = VTOMI4(rtvp);
   2718 
   2719 	/*
   2720 	 * Send client id to the server, if necessary
   2721 	 */
   2722 	nfs4_error_zinit(&n4e);
   2723 	nfs4setclientid(mi, cr, FALSE, &n4e);
   2724 	error = n4e.error;
   2725 
   2726 	crfree(cr);
   2727 
   2728 	if (error) {
   2729 		pn_free(&pn);
   2730 		goto errout;
   2731 	}
   2732 
   2733 	error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, &args);
   2734 	if (error) {
   2735 		nfs_cmn_err(error, CE_WARN,
   2736 		    "nfs4_mountroot: invalid root mount options");
   2737 		pn_free(&pn);
   2738 		goto errout;
   2739 	}
   2740 
   2741 	(void) vfs_lock_wait(vfsp);
   2742 	vfs_add(NULL, vfsp, vfsflags);
   2743 	vfs_unlock(vfsp);
   2744 
   2745 	size = strlen(svp->sv_hostname);
   2746 	(void) strcpy(rootfs.bo_name, svp->sv_hostname);
   2747 	rootfs.bo_name[size] = ':';
   2748 	(void) strcpy(&rootfs.bo_name[size + 1], root_path);
   2749 
   2750 	pn_free(&pn);
   2751 
   2752 errout:
   2753 	if (error) {
   2754 		sv4_free(svp);
   2755 		nfs4_async_stop(vfsp);
   2756 		nfs4_async_manager_stop(vfsp);
   2757 	}
   2758 
   2759 	if (rtvp != NULL)
   2760 		VN_RELE(rtvp);
   2761 
   2762 	return (error);
   2763 }
   2764 
   2765 /*
   2766  * Initialization routine for VFS routines.  Should only be called once
   2767  */
   2768 int
   2769 nfs4_vfsinit(void)
   2770 {
   2771 	mutex_init(&nfs4_syncbusy, NULL, MUTEX_DEFAULT, NULL);
   2772 	nfs4setclientid_init();
   2773 	nfs4_ephemeral_init();
   2774 	return (0);
   2775 }
   2776 
   2777 void
   2778 nfs4_vfsfini(void)
   2779 {
   2780 	nfs4_ephemeral_fini();
   2781 	nfs4setclientid_fini();
   2782 	mutex_destroy(&nfs4_syncbusy);
   2783 }
   2784 
   2785 void
   2786 nfs4_freevfs(vfs_t *vfsp)
   2787 {
   2788 	mntinfo4_t *mi;
   2789 
   2790 	/* need to release the initial hold */
   2791 	mi = VFTOMI4(vfsp);
   2792 	MI4_RELE(mi);
   2793 }
   2794 
/*
 * Client side SETCLIENTID and SETCLIENTID_CONFIRM
 */

/*
 * Head of the list of nfs4_server structures.  It starts out with both
 * initialized members pointing back at itself; nfs4setclientid() links
 * new entries in with insque().
 */
struct nfs4_server nfs4_server_lst =
	{ &nfs4_server_lst, &nfs4_server_lst };

/* held around lookups in, and insertions into, nfs4_server_lst */
kmutex_t nfs4_server_lst_lock;
   2802 
   2803 static void
   2804 nfs4setclientid_init(void)
   2805 {
   2806 	mutex_init(&nfs4_server_lst_lock, NULL, MUTEX_DEFAULT, NULL);
   2807 }
   2808 
   2809 static void
   2810 nfs4setclientid_fini(void)
   2811 {
   2812 	mutex_destroy(&nfs4_server_lst_lock);
   2813 }
   2814 
/*
 * NOTE(review): presumably the delay between, and the maximum number of,
 * SETCLIENTID retries -- their use is not visible in this part of the
 * file; confirm against nfs4setclientid().
 */
int nfs4_retry_sclid_delay = NFS4_RETRY_SCLID_DELAY;
int nfs4_num_sclid_retries = NFS4_NUM_SCLID_RETRIES;
   2817 
   2818 /*
   2819  * Set the clientid for the server for "mi".  No-op if the clientid is
   2820  * already set.
   2821  *
   2822  * The recovery boolean should be set to TRUE if this function was called
   2823  * by the recovery code, and FALSE otherwise.  This is used to determine
   2824  * if we need to call nfs4_start/end_op as well as grab the mi_recovlock
   2825  * for adding a mntinfo4_t to a nfs4_server_t.
   2826  *
   2827  * Error is returned via 'n4ep'.  If there was a 'n4ep->stat' error, then
   2828  * 'n4ep->error' is set to geterrno4(n4ep->stat).
   2829  */
   2830 void
   2831 nfs4setclientid(mntinfo4_t *mi, cred_t *cr, bool_t recovery, nfs4_error_t *n4ep)
   2832 {
   2833 	struct nfs4_server *np;
   2834 	struct servinfo4 *svp = mi->mi_curr_serv;
   2835 	nfs4_recov_state_t recov_state;
   2836 	int num_retries = 0;
   2837 	bool_t retry;
   2838 	cred_t *lcr = NULL;
   2839 	int retry_inuse = 1; /* only retry once on NFS4ERR_CLID_INUSE */
   2840 	time_t lease_time = 0;
   2841 
   2842 	recov_state.rs_flags = 0;
   2843 	recov_state.rs_num_retry_despite_err = 0;
   2844 	ASSERT(n4ep != NULL);
   2845 
   2846 recov_retry:
   2847 	retry = FALSE;
   2848 	nfs4_error_zinit(n4ep);
   2849 	if (!recovery)
   2850 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
   2851 
   2852 	mutex_enter(&nfs4_server_lst_lock);
   2853 	np = servinfo4_to_nfs4_server(svp); /* This locks np if it is found */
   2854 	mutex_exit(&nfs4_server_lst_lock);
   2855 	if (!np) {
   2856 		struct nfs4_server *tnp;
   2857 		np = new_nfs4_server(svp, cr);
   2858 		mutex_enter(&np->s_lock);
   2859 
   2860 		mutex_enter(&nfs4_server_lst_lock);
   2861 		tnp = servinfo4_to_nfs4_server(svp);
   2862 		if (tnp) {
   2863 			/*
   2864 			 * another thread snuck in and put server on list.
   2865 			 * since we aren't adding it to the nfs4_server_list
   2866 			 * we need to set the ref count to 0 and destroy it.
   2867 			 */
   2868 			np->s_refcnt = 0;
   2869 			destroy_nfs4_server(np);
   2870 			np = tnp;
   2871 		} else {
   2872 			/*
   2873 			 * do not give list a reference until everything
   2874 			 * succeeds
   2875 			 */
   2876 			insque(np, &nfs4_server_lst);
   2877 		}
   2878 		mutex_exit(&nfs4_server_lst_lock);
   2879 	}
   2880 	ASSERT(MUTEX_HELD(&np->s_lock));
   2881 	/*
   2882 	 * If we find the server already has N4S_CLIENTID_SET, then
   2883 	 * just return, we've already done SETCLIENTID to that server
   2884 	 */
   2885 	if (np->s_flags & N4S_CLIENTID_SET) {
   2886 		/* add mi to np's mntinfo4_list */
   2887 		nfs4_add_mi_to_server(np, mi);
   2888 		if (!recovery)
   2889 			nfs_rw_exit(&mi->mi_recovlock);
   2890 		mutex_exit(&np->s_lock);
   2891 		nfs4_server_rele(np);
   2892 		return;
   2893 	}
   2894 	mutex_exit(&np->s_lock);
   2895 
   2896 
   2897 	/*
   2898 	 * Drop the mi_recovlock since nfs4_start_op will
   2899 	 * acquire it again for us.
   2900 	 */
   2901 	if (!recovery) {
   2902 		nfs_rw_exit(&mi->mi_recovlock);
   2903 
   2904 		n4ep->error = nfs4_start_op(mi, NULL, NULL, &recov_state);
   2905 		if (n4ep->error) {
   2906 			nfs4_server_rele(np);
   2907 			return;
   2908 		}
   2909 	}
   2910 
   2911 	mutex_enter(&np->s_lock);
   2912 	while (np->s_flags & N4S_CLIENTID_PEND) {
   2913 		if (!cv_wait_sig(&np->s_clientid_pend, &np->s_lock)) {
   2914 			mutex_exit(&np->s_lock);
   2915 			nfs4_server_rele(np);
   2916 			if (!recovery)
   2917 				nfs4_end_op(mi, NULL, NULL, &recov_state,
   2918 				    recovery);
   2919 			n4ep->error = EINTR;
   2920 			return;
   2921 		}
   2922 	}
   2923 
   2924 	if (np->s_flags & N4S_CLIENTID_SET) {
   2925 		/* XXX copied/pasted from above */
   2926 		/* add mi to np's mntinfo4_list */
   2927 		nfs4_add_mi_to_server(np, mi);
   2928 		mutex_exit(&np->s_lock);
   2929 		nfs4_server_rele(np);
   2930 		if (!recovery)
   2931 			nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
   2932 		return;
   2933 	}
   2934 
   2935 	/*
   2936 	 * Reset the N4S_CB_PINGED flag. This is used to
   2937 	 * indicate if we have received a CB_NULL from the
   2938 	 * server. Also we reset the waiter flag.
   2939 	 */
   2940 	np->s_flags &= ~(N4S_CB_PINGED | N4S_CB_WAITER);
   2941 	/* any failure must now clear this flag */
   2942 	np->s_flags |= N4S_CLIENTID_PEND;
   2943 	mutex_exit(&np->s_lock);
   2944 	nfs4setclientid_otw(mi, svp, cr, np, n4ep, &retry_inuse);
   2945 
   2946 	if (n4ep->error == EACCES) {
   2947 		/*
   2948 		 * If the uid is set then set the creds for secure mounts
   2949 		 * by proxy processes such as automountd.
   2950 		 */
   2951 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   2952 		if (svp->sv_secdata->uid != 0) {
   2953 			lcr = crdup(cr);
   2954 			(void) crsetugid(lcr, svp->sv_secdata->uid,
   2955 			    crgetgid(cr));
   2956 		}
   2957 		nfs_rw_exit(&svp->sv_lock);
   2958 
   2959 		if (lcr != NULL) {
   2960 			mutex_enter(&np->s_lock);
   2961 			crfree(np->s_cred);
   2962 			np->s_cred = lcr;
   2963 			mutex_exit(&np->s_lock);
   2964 			nfs4setclientid_otw(mi, svp, lcr, np, n4ep,
   2965 			    &retry_inuse);
   2966 		}
   2967 	}
   2968 	mutex_enter(&np->s_lock);
   2969 	lease_time = np->s_lease_time;
   2970 	np->s_flags &= ~N4S_CLIENTID_PEND;
   2971 	mutex_exit(&np->s_lock);
   2972 
   2973 	if (n4ep->error != 0 || n4ep->stat != NFS4_OK) {
   2974 		/*
   2975 		 * Start recovery if failover is a possibility.  If
   2976 		 * invoked by the recovery thread itself, then just
   2977 		 * return and let it handle the failover first.  NB:
   2978 		 * recovery is not allowed if the mount is in progress
   2979 		 * since the infrastructure is not sufficiently setup
   2980 		 * to allow it.  Just return the error (after suitable
   2981 		 * retries).
   2982 		 */
   2983 		if (FAILOVER_MOUNT4(mi) && nfs4_try_failover(n4ep)) {
   2984 			(void) nfs4_start_recovery(n4ep, mi, NULL,
   2985 			    NULL, NULL, NULL, OP_SETCLIENTID, NULL);
   2986 			/*
   2987 			 * Don't retry here, just return and let
   2988 			 * recovery take over.
   2989 			 */
   2990 			if (recovery)
   2991 				retry = FALSE;
   2992 		} else if (nfs4_rpc_retry_error(n4ep->error) ||
   2993 		    n4ep->stat == NFS4ERR_RESOURCE ||
   2994 		    n4ep->stat == NFS4ERR_STALE_CLIENTID) {
   2995 
   2996 			retry = TRUE;
   2997 			/*
   2998 			 * Always retry if in recovery or once had
   2999 			 * contact with the server (but now it's
   3000 			 * overloaded).
   3001 			 */
   3002 			if (recovery == TRUE ||
   3003 			    n4ep->error == ETIMEDOUT ||
   3004 			    n4ep->error == ECONNRESET)
   3005 				num_retries = 0;
   3006 		} else if (retry_inuse && n4ep->error == 0 &&
   3007 		    n4ep->stat == NFS4ERR_CLID_INUSE) {
   3008 			retry = TRUE;
   3009 			num_retries = 0;
   3010 		}
   3011 	} else {
   3012 		/*
   3013 		 * Since everything succeeded give the list a reference count if
   3014 		 * it hasn't been given one by add_new_nfs4_server() or if this
   3015 		 * is not a recovery situation in which case it is already on
   3016 		 * the list.
   3017 		 */
   3018 		mutex_enter(&np->s_lock);
   3019 		if ((np->s_flags & N4S_INSERTED) == 0) {
   3020 			np->s_refcnt++;
   3021 			np->s_flags |= N4S_INSERTED;
   3022 		}
   3023 		mutex_exit(&np->s_lock);
   3024 	}
   3025 
   3026 	if (!recovery)
   3027 		nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
   3028 
   3029 
   3030 	if (retry && num_retries++ < nfs4_num_sclid_retries) {
   3031 		if (retry_inuse) {
   3032 			delay(SEC_TO_TICK(lease_time + nfs4_retry_sclid_delay));
   3033 			retry_inuse = 0;
   3034 		} else
   3035 			delay(SEC_TO_TICK(nfs4_retry_sclid_delay));
   3036 
   3037 		nfs4_server_rele(np);
   3038 		goto recov_retry;
   3039 	}
   3040 
   3041 
   3042 	if (n4ep->error == 0)
   3043 		n4ep->error = geterrno4(n4ep->stat);
   3044 
   3045 	/* broadcast before release in case no other threads are waiting */
   3046 	cv_broadcast(&np->s_clientid_pend);
   3047 	nfs4_server_rele(np);
   3048 }
   3049 
   3050 int nfs4setclientid_otw_debug = 0;
   3051 
   3052 /*
   3053  * This function handles the recovery of STALE_CLIENTID for SETCLIENTID_CONFRIM,
   3054  * but nothing else; the calling function must be designed to handle those
   3055  * other errors.
   3056  */
   3057 static void
   3058 nfs4setclientid_otw(mntinfo4_t *mi, struct servinfo4 *svp,  cred_t *cr,
   3059     struct nfs4_server *np, nfs4_error_t *ep, int *retry_inusep)
   3060 {
   3061 	COMPOUND4args_clnt args;
   3062 	COMPOUND4res_clnt res;
   3063 	nfs_argop4 argop[3];
   3064 	SETCLIENTID4args *s_args;
   3065 	SETCLIENTID4resok *s_resok;
   3066 	int doqueue = 1;
   3067 	nfs4_ga_res_t *garp = NULL;
   3068 	timespec_t prop_time, after_time;
   3069 	verifier4 verf;
   3070 	clientid4 tmp_clientid;
   3071 
   3072 	ASSERT(!MUTEX_HELD(&np->s_lock));
   3073 
   3074 	args.ctag = TAG_SETCLIENTID;
   3075 
   3076 	args.array = argop;
   3077 	args.array_len = 3;
   3078 
   3079 	/* PUTROOTFH */
   3080 	argop[0].argop = OP_PUTROOTFH;
   3081 
   3082 	/* GETATTR */
   3083 	argop[1].argop = OP_GETATTR;
   3084 	argop[1].nfs_argop4_u.opgetattr.attr_request = FATTR4_LEASE_TIME_MASK;
   3085 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
   3086 
   3087 	/* SETCLIENTID */
   3088 	argop[2].argop = OP_SETCLIENTID;
   3089 
   3090 	s_args = &argop[2].nfs_argop4_u.opsetclientid;
   3091 
   3092 	mutex_enter(&np->s_lock);
   3093 
   3094 	s_args->client.verifier = np->clidtosend.verifier;
   3095 	s_args->client.id_len = np->clidtosend.id_len;
   3096 	ASSERT(s_args->client.id_len <= NFS4_OPAQUE_LIMIT);
   3097 	s_args->client.id_val = np->clidtosend.id_val;
   3098 
   3099 	/*
   3100 	 * Callback needs to happen on non-RDMA transport
   3101 	 * Check if we have saved the original knetconfig
   3102 	 * if so, use that instead.
   3103 	 */
   3104 	if (svp->sv_origknconf != NULL)
   3105 		nfs4_cb_args(np, svp->sv_origknconf, s_args);
   3106 	else
   3107 		nfs4_cb_args(np, svp->sv_knconf, s_args);
   3108 
   3109 	mutex_exit(&np->s_lock);
   3110 
   3111 	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
   3112 
   3113 	if (ep->error)
   3114 		return;
   3115 
   3116 	/* getattr lease_time res */
   3117 	if ((res.array_len >= 2) &&
   3118 	    (res.array[1].nfs_resop4_u.opgetattr.status == NFS4_OK)) {
   3119 		garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
   3120 
   3121 #ifndef _LP64
   3122 		/*
   3123 		 * The 32 bit client cannot handle a lease time greater than
   3124 		 * (INT32_MAX/1000000).  This is due to the use of the
   3125 		 * lease_time in calls to drv_usectohz() in
   3126 		 * nfs4_renew_lease_thread().  The problem is that
   3127 		 * drv_usectohz() takes a time_t (which is just a long = 4
   3128 		 * bytes) as its parameter.  The lease_time is multiplied by
   3129 		 * 1000000 to convert seconds to usecs for the parameter.  If
   3130 		 * a number bigger than (INT32_MAX/1000000) is used then we
   3131 		 * overflow on the 32bit client.
   3132 		 */
   3133 		if (garp->n4g_ext_res->n4g_leasetime > (INT32_MAX/1000000)) {
   3134 			garp->n4g_ext_res->n4g_leasetime = INT32_MAX/1000000;
   3135 		}
   3136 #endif
   3137 
   3138 		mutex_enter(&np->s_lock);
   3139 		np->s_lease_time = garp->n4g_ext_res->n4g_leasetime;
   3140 
   3141 		/*
   3142 		 * Keep track of the lease period for the mi's
   3143 		 * mi_msg_list.  We need an appropiate time
   3144 		 * bound to associate past facts with a current
   3145 		 * event.  The lease period is perfect for this.
   3146 		 */
   3147 		mutex_enter(&mi->mi_msg_list_lock);
   3148 		mi->mi_lease_period = np->s_lease_time;
   3149 		mutex_exit(&mi->mi_msg_list_lock);
   3150 		mutex_exit(&np->s_lock);
   3151 	}
   3152 
   3153 
   3154 	if (res.status == NFS4ERR_CLID_INUSE) {
   3155 		clientaddr4 *clid_inuse;
   3156 
   3157 		if (!(*retry_inusep)) {
   3158 			clid_inuse = &res.array->nfs_resop4_u.
   3159 			    opsetclientid.SETCLIENTID4res_u.client_using;
   3160 
   3161 			zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
   3162 			    "NFS4 mount (SETCLIENTID failed)."
   3163 			    "  nfs4_client_id.id is in"
   3164 			    "use already by: r_netid<%s> r_addr<%s>",
   3165 			    clid_inuse->r_netid, clid_inuse->r_addr);
   3166 		}
   3167 
   3168 		/*
   3169 		 * XXX - The client should be more robust in its
   3170 		 * handling of clientid in use errors (regen another
   3171 		 * clientid and try again?)
   3172 		 */
   3173 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3174 		return;
   3175 	}
   3176 
   3177 	if (res.status) {
   3178 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3179 		return;
   3180 	}
   3181 
   3182 	s_resok = &res.array[2].nfs_resop4_u.
   3183 	    opsetclientid.SETCLIENTID4res_u.resok4;
   3184 
   3185 	tmp_clientid = s_resok->clientid;
   3186 
   3187 	verf = s_resok->setclientid_confirm;
   3188 
   3189 #ifdef	DEBUG
   3190 	if (nfs4setclientid_otw_debug) {
   3191 		union {
   3192 			clientid4	clientid;
   3193 			int		foo[2];
   3194 		} cid;
   3195 
   3196 		cid.clientid = s_resok->clientid;
   3197 
   3198 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
   3199 		"nfs4setclientid_otw: OK, clientid = %x,%x, "
   3200 		"verifier = %" PRIx64 "\n", cid.foo[0], cid.foo[1], verf);
   3201 	}
   3202 #endif
   3203 
   3204 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3205 
   3206 	/* Confirm the client id and get the lease_time attribute */
   3207 
   3208 	args.ctag = TAG_SETCLIENTID_CF;
   3209 
   3210 	args.array = argop;
   3211 	args.array_len = 1;
   3212 
   3213 	argop[0].argop = OP_SETCLIENTID_CONFIRM;
   3214 
   3215 	argop[0].nfs_argop4_u.opsetclientid_confirm.clientid = tmp_clientid;
   3216 	argop[0].nfs_argop4_u.opsetclientid_confirm.setclientid_confirm = verf;
   3217 
   3218 	/* used to figure out RTT for np */
   3219 	gethrestime(&prop_time);
   3220 
   3221 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setlientid_otw: "
   3222 	    "start time: %ld sec %ld nsec", prop_time.tv_sec,
   3223 	    prop_time.tv_nsec));
   3224 
   3225 	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
   3226 
   3227 	gethrestime(&after_time);
   3228 	mutex_enter(&np->s_lock);
   3229 	np->propagation_delay.tv_sec =
   3230 	    MAX(1, after_time.tv_sec - prop_time.tv_sec);
   3231 	mutex_exit(&np->s_lock);
   3232 
   3233 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setlcientid_otw: "
   3234 	    "finish time: %ld sec ", after_time.tv_sec));
   3235 
   3236 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setclientid_otw: "
   3237 	    "propagation delay set to %ld sec",
   3238 	    np->propagation_delay.tv_sec));
   3239 
   3240 	if (ep->error)
   3241 		return;
   3242 
   3243 	if (res.status == NFS4ERR_CLID_INUSE) {
   3244 		clientaddr4 *clid_inuse;
   3245 
   3246 		if (!(*retry_inusep)) {
   3247 			clid_inuse = &res.array->nfs_resop4_u.
   3248 			    opsetclientid.SETCLIENTID4res_u.client_using;
   3249 
   3250 			zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
   3251 			    "SETCLIENTID_CONFIRM failed.  "
   3252 			    "nfs4_client_id.id is in use already by: "
   3253 			    "r_netid<%s> r_addr<%s>",
   3254 			    clid_inuse->r_netid, clid_inuse->r_addr);
   3255 		}
   3256 
   3257 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3258 		return;
   3259 	}
   3260 
   3261 	if (res.status) {
   3262 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3263 		return;
   3264 	}
   3265 
   3266 	mutex_enter(&np->s_lock);
   3267 	np->clientid = tmp_clientid;
   3268 	np->s_flags |= N4S_CLIENTID_SET;
   3269 
   3270 	/* Add mi to np's mntinfo4 list */
   3271 	nfs4_add_mi_to_server(np, mi);
   3272 
   3273 	if (np->lease_valid == NFS4_LEASE_NOT_STARTED) {
   3274 		/*
   3275 		 * Start lease management thread.
   3276 		 * Keep trying until we succeed.
   3277 		 */
   3278 
   3279 		np->s_refcnt++;		/* pass reference to thread */
   3280 		(void) zthread_create(NULL, 0, nfs4_renew_lease_thread, np, 0,
   3281 		    minclsyspri);
   3282 	}
   3283 	mutex_exit(&np->s_lock);
   3284 
   3285 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3286 }
   3287 
   3288 /*
   3289  * Add mi to sp's mntinfo4_list if it isn't already in the list.  Makes
   3290  * mi's clientid the same as sp's.
   3291  * Assumes sp is locked down.
   3292  */
   3293 void
   3294 nfs4_add_mi_to_server(nfs4_server_t *sp, mntinfo4_t *mi)
   3295 {
   3296 	mntinfo4_t *tmi;
   3297 	int in_list = 0;
   3298 
   3299 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
   3300 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
   3301 	ASSERT(sp != &nfs4_server_lst);
   3302 	ASSERT(MUTEX_HELD(&sp->s_lock));
   3303 
   3304 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3305 	    "nfs4_add_mi_to_server: add mi %p to sp %p",
   3306 	    (void*)mi, (void*)sp));
   3307 
   3308 	for (tmi = sp->mntinfo4_list;
   3309 	    tmi != NULL;
   3310 	    tmi = tmi->mi_clientid_next) {
   3311 		if (tmi == mi) {
   3312 			NFS4_DEBUG(nfs4_client_lease_debug,
   3313 			    (CE_NOTE,
   3314 			    "nfs4_add_mi_to_server: mi in list"));
   3315 			in_list = 1;
   3316 		}
   3317 	}
   3318 
   3319 	/*
   3320 	 * First put a hold on the mntinfo4's vfsp so that references via
   3321 	 * mntinfo4_list will be valid.
   3322 	 */
   3323 	if (!in_list)
   3324 		VFS_HOLD(mi->mi_vfsp);
   3325 
   3326 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4_add_mi_to_server: "
   3327 	    "hold vfs %p for mi: %p", (void*)mi->mi_vfsp, (void*)mi));
   3328 
   3329 	if (!in_list) {
   3330 		if (sp->mntinfo4_list)
   3331 			sp->mntinfo4_list->mi_clientid_prev = mi;
   3332 		mi->mi_clientid_next = sp->mntinfo4_list;
   3333 		mi->mi_srv = sp;
   3334 		sp->mntinfo4_list = mi;
   3335 		mi->mi_srvsettime = gethrestime_sec();
   3336 		mi->mi_srvset_cnt++;
   3337 	}
   3338 
   3339 	/* set mi's clientid to that of sp's for later matching */
   3340 	mi->mi_clientid = sp->clientid;
   3341 
   3342 	/*
   3343 	 * Update the clientid for any other mi's belonging to sp.  This
   3344 	 * must be done here while we hold sp->s_lock, so that
   3345 	 * find_nfs4_server() continues to work.
   3346 	 */
   3347 
   3348 	for (tmi = sp->mntinfo4_list;
   3349 	    tmi != NULL;
   3350 	    tmi = tmi->mi_clientid_next) {
   3351 		if (tmi != mi) {
   3352 			tmi->mi_clientid = sp->clientid;
   3353 		}
   3354 	}
   3355 }
   3356 
   3357 /*
   3358  * Remove the mi from sp's mntinfo4_list and release its reference.
   3359  * Exception: if mi still has open files, flag it for later removal (when
   3360  * all the files are closed).
   3361  *
   3362  * If this is the last mntinfo4 in sp's list then tell the lease renewal
   3363  * thread to exit.
   3364  */
   3365 static void
   3366 nfs4_remove_mi_from_server_nolock(mntinfo4_t *mi, nfs4_server_t *sp)
   3367 {
   3368 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3369 	    "nfs4_remove_mi_from_server_nolock: remove mi %p from sp %p",
   3370 	    (void*)mi, (void*)sp));
   3371 
   3372 	ASSERT(sp != NULL);
   3373 	ASSERT(MUTEX_HELD(&sp->s_lock));
   3374 	ASSERT(mi->mi_open_files >= 0);
   3375 
   3376 	/*
   3377 	 * First make sure this mntinfo4 can be taken off of the list,
   3378 	 * ie: it doesn't have any open files remaining.
   3379 	 */
   3380 	if (mi->mi_open_files > 0) {
   3381 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3382 		    "nfs4_remove_mi_from_server_nolock: don't "
   3383 		    "remove mi since it still has files open"));
   3384 
   3385 		mutex_enter(&mi->mi_lock);
   3386 		mi->mi_flags |= MI4_REMOVE_ON_LAST_CLOSE;
   3387 		mutex_exit(&mi->mi_lock);
   3388 		return;
   3389 	}
   3390 
   3391 	VFS_HOLD(mi->mi_vfsp);
   3392 	remove_mi(sp, mi);
   3393 	VFS_RELE(mi->mi_vfsp);
   3394 
   3395 	if (sp->mntinfo4_list == NULL) {
   3396 		/* last fs unmounted, kill the thread */
   3397 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
   3398 		    "remove_mi_from_nfs4_server_nolock: kill the thread"));
   3399 		nfs4_mark_srv_dead(sp);
   3400 	}
   3401 }
   3402 
   3403 /*
   3404  * Remove mi from sp's mntinfo4_list and release the vfs reference.
   3405  */
   3406 static void
   3407 remove_mi(nfs4_server_t *sp, mntinfo4_t *mi)
   3408 {
   3409 	ASSERT(MUTEX_HELD(&sp->s_lock));
   3410 
   3411 	/*
   3412 	 * We release a reference, and the caller must still have a
   3413 	 * reference.
   3414 	 */
   3415 	ASSERT(mi->mi_vfsp->vfs_count >= 2);
   3416 
   3417 	if (mi->mi_clientid_prev) {
   3418 		mi->mi_clientid_prev->mi_clientid_next = mi->mi_clientid_next;
   3419 	} else {
   3420 		/* This is the first mi in sp's mntinfo4_list */
   3421 		/*
   3422 		 * Make sure the first mntinfo4 in the list is the actual
   3423 		 * mntinfo4 passed in.
   3424 		 */
   3425 		ASSERT(sp->mntinfo4_list == mi);
   3426 
   3427 		sp->mntinfo4_list = mi->mi_clientid_next;
   3428 	}
   3429 	if (mi->mi_clientid_next)
   3430 		mi->mi_clientid_next->mi_clientid_prev = mi->mi_clientid_prev;
   3431 
   3432 	/* Now mark the mntinfo4's links as being removed */
   3433 	mi->mi_clientid_prev = mi->mi_clientid_next = NULL;
   3434 	mi->mi_srv = NULL;
   3435 	mi->mi_srvset_cnt++;
   3436 
   3437 	VFS_RELE(mi->mi_vfsp);
   3438 }
   3439 
   3440 /*
   3441  * Free all the entries in sp's mntinfo4_list.
   3442  */
   3443 static void
   3444 remove_all_mi(nfs4_server_t *sp)
   3445 {
   3446 	mntinfo4_t *mi;
   3447 
   3448 	ASSERT(MUTEX_HELD(&sp->s_lock));
   3449 
   3450 	while (sp->mntinfo4_list != NULL) {
   3451 		mi = sp->mntinfo4_list;
   3452 		/*
   3453 		 * Grab a reference in case there is only one left (which
   3454 		 * remove_mi() frees).
   3455 		 */
   3456 		VFS_HOLD(mi->mi_vfsp);
   3457 		remove_mi(sp, mi);
   3458 		VFS_RELE(mi->mi_vfsp);
   3459 	}
   3460 }
   3461 
   3462 /*
   3463  * Remove the mi from sp's mntinfo4_list as above, and rele the vfs.
   3464  *
   3465  * This version can be called with a null nfs4_server_t arg,
   3466  * and will either find the right one and handle locking, or
   3467  * do nothing because the mi wasn't added to an sp's mntinfo4_list.
   3468  */
   3469 void
   3470 nfs4_remove_mi_from_server(mntinfo4_t *mi, nfs4_server_t *esp)
   3471 {
   3472 	nfs4_server_t	*sp;
   3473 
   3474 	if (esp) {
   3475 		nfs4_remove_mi_from_server_nolock(mi, esp);
   3476 		return;
   3477 	}
   3478 
   3479 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
   3480 	if (sp = find_nfs4_server_all(mi, 1)) {
   3481 		nfs4_remove_mi_from_server_nolock(mi, sp);
   3482 		mutex_exit(&sp->s_lock);
   3483 		nfs4_server_rele(sp);
   3484 	}
   3485 	nfs_rw_exit(&mi->mi_recovlock);
   3486 }
   3487 
   3488 /*
   3489  * Return TRUE if the given server has any non-unmounted filesystems.
   3490  */
   3491 
   3492 bool_t
   3493 nfs4_fs_active(nfs4_server_t *sp)
   3494 {
   3495 	mntinfo4_t *mi;
   3496 
   3497 	ASSERT(MUTEX_HELD(&sp->s_lock));
   3498 
   3499 	for (mi = sp->mntinfo4_list; mi != NULL; mi = mi->mi_clientid_next) {
   3500 		if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
   3501 			return (TRUE);
   3502 	}
   3503 
   3504 	return (FALSE);
   3505 }
   3506 
/*
 * Mark sp as finished and notify any waiters.
 *
 * Sets s_thread_exit to NFS4_THREAD_EXIT, the state that the address
 * based lookups in this file skip, and wakes everything sleeping on
 * cv_thread_exit.  The caller holds sp->s_lock.
 */

void
nfs4_mark_srv_dead(nfs4_server_t *sp)
{
	ASSERT(MUTEX_HELD(&sp->s_lock));

	sp->s_thread_exit = NFS4_THREAD_EXIT;
	cv_broadcast(&sp->cv_thread_exit);
}
   3519 
   3520 /*
   3521  * Create a new nfs4_server_t structure.
   3522  * Returns new node unlocked and not in list, but with a reference count of
   3523  * 1.
   3524  */
   3525 struct nfs4_server *
   3526 new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
   3527 {
   3528 	struct nfs4_server *np;
   3529 	timespec_t tt;
   3530 	union {
   3531 		struct {
   3532 			uint32_t sec;
   3533 			uint32_t subsec;
   3534 		} un_curtime;
   3535 		verifier4	un_verifier;
   3536 	} nfs4clientid_verifier;
   3537 	char id_val[] = "Solaris: %s, NFSv4 kernel client";
   3538 	int len;
   3539 
   3540 	np = kmem_zalloc(sizeof (struct nfs4_server), KM_SLEEP);
   3541 	np->saddr.len = svp->sv_addr.len;
   3542 	np->saddr.maxlen = svp->sv_addr.maxlen;
   3543 	np->saddr.buf = kmem_alloc(svp->sv_addr.maxlen, KM_SLEEP);
   3544 	bcopy(svp->sv_addr.buf, np->saddr.buf, svp->sv_addr.len);
   3545 	np->s_refcnt = 1;
   3546 
   3547 	/*
   3548 	 * Build the nfs_client_id4 for this server mount.  Ensure
   3549 	 * the verifier is useful and that the identification is
   3550 	 * somehow based on the server's address for the case of
   3551 	 * multi-homed servers.
   3552 	 */
   3553 	nfs4clientid_verifier.un_verifier = 0;
   3554 	gethrestime(&tt);
   3555 	nfs4clientid_verifier.un_curtime.sec = (uint32_t)tt.tv_sec;
   3556 	nfs4clientid_verifier.un_curtime.subsec = (uint32_t)tt.tv_nsec;
   3557 	np->clidtosend.verifier = nfs4clientid_verifier.un_verifier;
   3558 
   3559 	/*
   3560 	 * calculate the length of the opaque identifier.  Subtract 2
   3561 	 * for the "%s" and add the traditional +1 for null
   3562 	 * termination.
   3563 	 */
   3564 	len = strlen(id_val) - 2 + strlen(uts_nodename()) + 1;
   3565 	np->clidtosend.id_len = len + np->saddr.maxlen;
   3566 
   3567 	np->clidtosend.id_val = kmem_alloc(np->clidtosend.id_len, KM_SLEEP);
   3568 	(void) sprintf(np->clidtosend.id_val, id_val, uts_nodename());
   3569 	bcopy(np->saddr.buf, &np->clidtosend.id_val[len], np->saddr.len);
   3570 
   3571 	np->s_flags = 0;
   3572 	np->mntinfo4_list = NULL;
   3573 	/* save cred for issuing rfs4calls inside the renew thread */
   3574 	crhold(cr);
   3575 	np->s_cred = cr;
   3576 	cv_init(&np->cv_thread_exit, NULL, CV_DEFAULT, NULL);
   3577 	mutex_init(&np->s_lock, NULL, MUTEX_DEFAULT, NULL);
   3578 	nfs_rw_init(&np->s_recovlock, NULL, RW_DEFAULT, NULL);
   3579 	list_create(&np->s_deleg_list, sizeof (rnode4_t),
   3580 	    offsetof(rnode4_t, r_deleg_link));
   3581 	np->s_thread_exit = 0;
   3582 	np->state_ref_count = 0;
   3583 	np->lease_valid = NFS4_LEASE_NOT_STARTED;
   3584 	cv_init(&np->s_cv_otw_count, NULL, CV_DEFAULT, NULL);
   3585 	cv_init(&np->s_clientid_pend, NULL, CV_DEFAULT, NULL);
   3586 	np->s_otw_call_count = 0;
   3587 	cv_init(&np->wait_cb_null, NULL, CV_DEFAULT, NULL);
   3588 	np->zoneid = getzoneid();
   3589 	np->zone_globals = nfs4_get_callback_globals();
   3590 	ASSERT(np->zone_globals != NULL);
   3591 	return (np);
   3592 }
   3593 
   3594 /*
   3595  * Create a new nfs4_server_t structure and add it to the list.
   3596  * Returns new node locked; reference must eventually be freed.
   3597  */
   3598 static struct nfs4_server *
   3599 add_new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
   3600 {
   3601 	nfs4_server_t *sp;
   3602 
   3603 	ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
   3604 	sp = new_nfs4_server(svp, cr);
   3605 	mutex_enter(&sp->s_lock);
   3606 	insque(sp, &nfs4_server_lst);
   3607 	sp->s_refcnt++;			/* list gets a reference */
   3608 	sp->s_flags |= N4S_INSERTED;
   3609 	sp->clientid = 0;
   3610 	return (sp);
   3611 }
   3612 
   3613 int nfs4_server_t_debug = 0;
   3614 
   3615 #ifdef lint
   3616 extern void
   3617 dumpnfs4slist(char *, mntinfo4_t *, clientid4, servinfo4_t *);
   3618 #endif
   3619 
   3620 #ifndef lint
   3621 #ifdef DEBUG
   3622 void
   3623 dumpnfs4slist(char *txt, mntinfo4_t *mi, clientid4 clientid, servinfo4_t *srv_p)
   3624 {
   3625 	int hash16(void *p, int len);
   3626 	nfs4_server_t *np;
   3627 
   3628 	NFS4_DEBUG(nfs4_server_t_debug, (CE_NOTE,
   3629 	    "dumping nfs4_server_t list in %s", txt));
   3630 	NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
   3631 	    "mi 0x%p, want clientid %llx, addr %d/%04X",
   3632 	    mi, (longlong_t)clientid, srv_p->sv_addr.len,
   3633 	    hash16((void *)srv_p->sv_addr.buf, srv_p->sv_addr.len)));
   3634 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst;
   3635 	    np = np->forw) {
   3636 		NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
   3637 		    "node 0x%p,    clientid %llx, addr %d/%04X, cnt %d",
   3638 		    np, (longlong_t)np->clientid, np->saddr.len,
   3639 		    hash16((void *)np->saddr.buf, np->saddr.len),
   3640 		    np->state_ref_count));
   3641 		if (np->saddr.len == srv_p->sv_addr.len &&
   3642 		    bcmp(np->saddr.buf, srv_p->sv_addr.buf,
   3643 		    np->saddr.len) == 0)
   3644 			NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
   3645 			    " - address matches"));
   3646 		if (np->clientid == clientid || np->clientid == 0)
   3647 			NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
   3648 			    " - clientid matches"));
   3649 		if (np->s_thread_exit != NFS4_THREAD_EXIT)
   3650 			NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
   3651 			    " - thread not exiting"));
   3652 	}
   3653 	delay(hz);
   3654 }
   3655 #endif
   3656 #endif
   3657 
   3658 
/*
 * Move a mntinfo4_t from one server list to another.
 * Locking of the two nfs4_server_t nodes will be done in list order.
 *
 * 'old' and 'new' are matched against the list by address, within the
 * caller's zone, skipping nodes marked NFS4_THREAD_EXIT.  If there is no
 * node for 'new' yet, one is created (with kcred) and inserted.
 *
 * Returns NULL if the current nfs4_server_t for the filesystem could not
 * be found (e.g., due to forced unmount).  Otherwise returns a reference
 * to the new nfs4_server_t, which must eventually be freed.
 */
nfs4_server_t *
nfs4_move_mi(mntinfo4_t *mi, servinfo4_t *old, servinfo4_t *new)
{
	nfs4_server_t *p, *op = NULL, *np = NULL;
	int num_open;
	zoneid_t zoneid = nfs_zoneid();

	ASSERT(nfs_zone() == mi->mi_zone);

	mutex_enter(&nfs4_server_lst_lock);
#ifdef DEBUG
	if (nfs4_server_t_debug)
		dumpnfs4slist("nfs4_move_mi", mi, (clientid4)0, new);
#endif
	/*
	 * One pass over the list finds and locks both nodes, each at the
	 * point where the walk reaches it.
	 * NOTE(review): if 'old' and 'new' carry the same address, both
	 * tests below match the same node and its s_lock is entered twice -
	 * confirm that callers never pass such a pair.
	 */
	for (p = nfs4_server_lst.forw; p != &nfs4_server_lst; p = p->forw) {
		if (p->zoneid != zoneid)
			continue;
		if (p->saddr.len == old->sv_addr.len &&
		    bcmp(p->saddr.buf, old->sv_addr.buf, p->saddr.len) == 0 &&
		    p->s_thread_exit != NFS4_THREAD_EXIT) {
			op = p;
			mutex_enter(&op->s_lock);
			/* our reference on the old node, dropped at the end */
			op->s_refcnt++;
		}
		if (p->saddr.len == new->sv_addr.len &&
		    bcmp(p->saddr.buf, new->sv_addr.buf, p->saddr.len) == 0 &&
		    p->s_thread_exit != NFS4_THREAD_EXIT) {
			np = p;
			mutex_enter(&np->s_lock);
		}
		if (op != NULL && np != NULL)
			break;
	}
	if (op == NULL) {
		/*
		 * Filesystem has been forcibly unmounted.  Bail out.
		 */
		if (np != NULL)
			mutex_exit(&np->s_lock);
		mutex_exit(&nfs4_server_lst_lock);
		return (NULL);
	}
	if (np != NULL) {
		/* the reference that is returned to the caller */
		np->s_refcnt++;
	} else {
#ifdef DEBUG
		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
		    "nfs4_move_mi: no target nfs4_server, will create."));
#endif
		/* comes back locked and already on the list */
		np = add_new_nfs4_server(new, kcred);
	}
	mutex_exit(&nfs4_server_lst_lock);

	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
	    "nfs4_move_mi: for mi 0x%p, "
	    "old servinfo4 0x%p, new servinfo4 0x%p, "
	    "old nfs4_server 0x%p, new nfs4_server 0x%p, ",
	    (void*)mi, (void*)old, (void*)new,
	    (void*)op, (void*)np));
	ASSERT(op != NULL && np != NULL);

	/* discard any delegations */
	nfs4_deleg_discard(mi, op);

	/*
	 * Transfer mi's open-file count from the old node's
	 * state_ref_count to the new one.  mi_open_files is zeroed around
	 * the removal because nfs4_remove_mi_from_server_nolock() only
	 * flags, and does not remove, an mi that has open files.
	 */
	num_open = mi->mi_open_files;
	mi->mi_open_files = 0;
	op->state_ref_count -= num_open;
	ASSERT(op->state_ref_count >= 0);
	np->state_ref_count += num_open;
	nfs4_remove_mi_from_server_nolock(mi, op);
	mi->mi_open_files = num_open;
	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
	    "nfs4_move_mi: mi_open_files %d, op->cnt %d, np->cnt %d",
	    mi->mi_open_files, op->state_ref_count, np->state_ref_count));

	nfs4_add_mi_to_server(np, mi);

	mutex_exit(&op->s_lock);
	mutex_exit(&np->s_lock);
	nfs4_server_rele(op);

	return (np);
}
   3750 
   3751 /*
   3752  * Need to have the nfs4_server_lst_lock.
   3753  * Search the nfs4_server list to find a match on this servinfo4
   3754  * based on its address.
   3755  *
   3756  * Returns NULL if no match is found.  Otherwise returns a reference (which
   3757  * must eventually be freed) to a locked nfs4_server.
   3758  */
   3759 nfs4_server_t *
   3760 servinfo4_to_nfs4_server(servinfo4_t *srv_p)
   3761 {
   3762 	nfs4_server_t *np;
   3763 	zoneid_t zoneid = nfs_zoneid();
   3764 
   3765 	ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
   3766 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
   3767 		if (np->zoneid == zoneid &&
   3768 		    np->saddr.len == srv_p->sv_addr.len &&
   3769 		    bcmp(np->saddr.buf, srv_p->sv_addr.buf,
   3770 		    np->saddr.len) == 0 &&
   3771 		    np->s_thread_exit != NFS4_THREAD_EXIT) {
   3772 			mutex_enter(&np->s_lock);
   3773 			np->s_refcnt++;
   3774 			return (np);
   3775 		}
   3776 	}
   3777 	return (NULL);
   3778 }
   3779 
   3780 /*
   3781  * Locks the nfs4_server down if it is found and returns a reference that
   3782  * must eventually be freed.
   3783  */
   3784 static nfs4_server_t *
   3785 lookup_nfs4_server(nfs4_server_t *sp, int any_state)
   3786 {
   3787 	nfs4_server_t *np;
   3788 
   3789 	mutex_enter(&nfs4_server_lst_lock);
   3790 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
   3791 		mutex_enter(&np->s_lock);
   3792 		if (np == sp && np->s_refcnt > 0 &&
   3793 		    (np->s_thread_exit != NFS4_THREAD_EXIT || any_state)) {
   3794 			mutex_exit(&nfs4_server_lst_lock);
   3795 			np->s_refcnt++;
   3796 			return (np);
   3797 		}
   3798 		mutex_exit(&np->s_lock);
   3799 	}
   3800 	mutex_exit(&nfs4_server_lst_lock);
   3801 
   3802 	return (NULL);
   3803 }
   3804 
   3805 /*
   3806  * The caller should be holding mi->mi_recovlock, and it should continue to
   3807  * hold the lock until done with the returned nfs4_server_t.  Once
   3808  * mi->mi_recovlock is released, there is no guarantee that the returned
   3809  * mi->nfs4_server_t will continue to correspond to mi.
   3810  */
   3811 nfs4_server_t *
   3812 find_nfs4_server(mntinfo4_t *mi)
   3813 {
   3814 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
   3815 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
   3816 
   3817 	return (lookup_nfs4_server(mi->mi_srv, 0));
   3818 }
   3819 
   3820 /*
   3821  * Same as above, but takes an "any_state" parameter which can be
   3822  * set to 1 if the caller wishes to find nfs4_server_t's which
   3823  * have been marked for termination by the exit of the renew
   3824  * thread.  This should only be used by operations which are
   3825  * cleaning up and will not cause an OTW op.
   3826  */
   3827 nfs4_server_t *
   3828 find_nfs4_server_all(mntinfo4_t *mi, int any_state)
   3829 {
   3830 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
   3831 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
   3832 
   3833 	return (lookup_nfs4_server(mi->mi_srv, any_state));
   3834 }
   3835 
   3836 /*
   3837  * Lock sp, but only if it's still active (in the list and hasn't been
   3838  * flagged as exiting) or 'any_state' is non-zero.
   3839  * Returns TRUE if sp got locked and adds a reference to sp.
   3840  */
   3841 bool_t
   3842 nfs4_server_vlock(nfs4_server_t *sp, int any_state)
   3843 {
   3844 	return (lookup_nfs4_server(sp, any_state) != NULL);
   3845 }
   3846 
   3847 /*
   3848  * Release the reference to sp and destroy it if that's the last one.
   3849  */
   3850 
   3851 void
   3852 nfs4_server_rele(nfs4_server_t *sp)
   3853 {
   3854 	mutex_enter(&sp->s_lock);
   3855 	ASSERT(sp->s_refcnt > 0);
   3856 	sp->s_refcnt--;
   3857 	if (sp->s_refcnt > 0) {
   3858 		mutex_exit(&sp->s_lock);
   3859 		return;
   3860 	}
   3861 	mutex_exit(&sp->s_lock);
   3862 
   3863 	mutex_enter(&nfs4_server_lst_lock);
   3864 	mutex_enter(&sp->s_lock);
   3865 	if (sp->s_refcnt > 0) {
   3866 		mutex_exit(&sp->s_lock);
   3867 		mutex_exit(&nfs4_server_lst_lock);
   3868 		return;
   3869 	}
   3870 	remque(sp);
   3871 	sp->forw = sp->back = NULL;
   3872 	mutex_exit(&nfs4_server_lst_lock);
   3873 	destroy_nfs4_server(sp);
   3874 }
   3875 
   3876 static void
   3877 destroy_nfs4_server(nfs4_server_t *sp)
   3878 {
   3879 	ASSERT(MUTEX_HELD(&sp->s_lock));
   3880 	ASSERT(sp->s_refcnt == 0);
   3881 	ASSERT(sp->s_otw_call_count == 0);
   3882 
   3883 	remove_all_mi(sp);
   3884 
   3885 	crfree(sp->s_cred);
   3886 	kmem_free(sp->saddr.buf, sp->saddr.maxlen);
   3887 	kmem_free(sp->clidtosend.id_val, sp->clidtosend.id_len);
   3888 	mutex_exit(&sp->s_lock);
   3889 
   3890 	/* destroy the nfs4_server */
   3891 	nfs4callback_destroy(sp);
   3892 	list_destroy(&sp->s_deleg_list);
   3893 	mutex_destroy(&sp->s_lock);
   3894 	cv_destroy(&sp->cv_thread_exit);
   3895 	cv_destroy(&sp->s_cv_otw_count);
   3896 	cv_destroy(&sp->s_clientid_pend);
   3897 	cv_destroy(&sp->wait_cb_null);
   3898 	nfs_rw_destroy(&sp->s_recovlock);
   3899 	kmem_free(sp, sizeof (*sp));
   3900 }
   3901 
   3902 /*
   3903  * Fork off a thread to free the data structures for a mount.
   3904  */
   3905 
   3906 static void
   3907 async_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
   3908 {
   3909 	freemountargs_t *args;
   3910 	args = kmem_alloc(sizeof (freemountargs_t), KM_SLEEP);
   3911 	args->fm_vfsp = vfsp;
   3912 	VFS_HOLD(vfsp);
   3913 	MI4_HOLD(VFTOMI4(vfsp));
   3914 	args->fm_flag = flag;
   3915 	args->fm_cr = cr;
   3916 	crhold(cr);
   3917 	(void) zthread_create(NULL, 0, nfs4_free_mount_thread, args, 0,
   3918 	    minclsyspri);
   3919 }
   3920 
   3921 static void
   3922 nfs4_free_mount_thread(freemountargs_t *args)
   3923 {
   3924 	mntinfo4_t *mi;
   3925 	nfs4_free_mount(args->fm_vfsp, args->fm_flag, args->fm_cr);
   3926 	mi = VFTOMI4(args->fm_vfsp);
   3927 	crfree(args->fm_cr);
   3928 	VFS_RELE(args->fm_vfsp);
   3929 	MI4_RELE(mi);
   3930 	kmem_free(args, sizeof (freemountargs_t));
   3931 	zthread_exit();
   3932 	/* NOTREACHED */
   3933 }
   3934 
   3935 /*
   3936  * Thread to free the data structures for a given filesystem.
   3937  */
   3938 static void
   3939 nfs4_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
   3940 {
   3941 	mntinfo4_t		*mi = VFTOMI4(vfsp);
   3942 	nfs4_server_t		*sp;
   3943 	callb_cpr_t		cpr_info;
   3944 	kmutex_t		cpr_lock;
   3945 	boolean_t		async_thread;
   3946 	int			removed;
   3947 
   3948 	bool_t			must_unlock;
   3949 	bool_t			must_rele;
   3950 	nfs4_ephemeral_tree_t	*eph_tree;
   3951 
   3952 	/*
   3953 	 * We need to participate in the CPR framework if this is a kernel
   3954 	 * thread.
   3955 	 */
   3956 	async_thread = (curproc == nfs_zone()->zone_zsched);
   3957 	if (async_thread) {
   3958 		mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
   3959 		CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
   3960 		    "nfsv4AsyncUnmount");
   3961 	}
   3962 
   3963 	/*
   3964 	 * We need to wait for all outstanding OTW calls
   3965 	 * and recovery to finish before we remove the mi
   3966 	 * from the nfs4_server_t, as current pending
   3967 	 * calls might still need this linkage (in order
   3968 	 * to find a nfs4_server_t from a mntinfo4_t).
   3969 	 */
   3970 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
   3971 	sp = find_nfs4_server(mi);
   3972 	nfs_rw_exit(&mi->mi_recovlock);
   3973 
   3974 	if (sp) {
   3975 		while (sp->s_otw_call_count != 0) {
   3976 			if (async_thread) {
   3977 				mutex_enter(&cpr_lock);
   3978 				CALLB_CPR_SAFE_BEGIN(&cpr_info);
   3979 				mutex_exit(&cpr_lock);
   3980 			}
   3981 			cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
   3982 			if (async_thread) {
   3983 				mutex_enter(&cpr_lock);
   3984 				CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
   3985 				mutex_exit(&cpr_lock);
   3986 			}
   3987 		}
   3988 		mutex_exit(&sp->s_lock);
   3989 		nfs4_server_rele(sp);
   3990 		sp = NULL;
   3991 	}
   3992 
   3993 	mutex_enter(&mi->mi_lock);
   3994 	while (mi->mi_in_recovery != 0) {
   3995 		if (async_thread) {
   3996 			mutex_enter(&cpr_lock);
   3997 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
   3998 			mutex_exit(&cpr_lock);
   3999 		}
   4000 		cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
   4001 		if (async_thread) {
   4002 			mutex_enter(&cpr_lock);
   4003 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
   4004 			mutex_exit(&cpr_lock);
   4005 		}
   4006 	}
   4007 	mutex_exit(&mi->mi_lock);
   4008 
   4009 	/*
   4010 	 * If we got an error, then do not nuke the
   4011 	 * tree. Either the harvester is busy reclaiming
   4012 	 * this node or we ran into some busy condition.
   4013 	 *
   4014 	 * The harvester will eventually come along and cleanup.
   4015 	 * The only problem would be the root mount point.
   4016 	 *
   4017 	 * Since the busy node can occur for a variety
   4018 	 * of reasons and can result in an entry staying
   4019 	 * in df output but no longer accessible from the
   4020 	 * directory tree, we are okay.
   4021 	 */
   4022 	if (!nfs4_ephemeral_umount(mi, flag, cr,
   4023 	    &must_unlock, &must_rele, &eph_tree))
   4024 		nfs4_ephemeral_umount_activate(mi, &must_unlock,
   4025 		    &must_rele, &eph_tree);
   4026 
   4027 	/*
   4028 	 * The original purge of the dnlc via 'dounmount'
   4029 	 * doesn't guarantee that another dnlc entry was not
   4030 	 * added while we waitied for all outstanding OTW
   4031 	 * and recovery calls to finish.  So re-purge the
   4032 	 * dnlc now.
   4033 	 */
   4034 	(void) dnlc_purge_vfsp(vfsp, 0);
   4035 
   4036 	/*
   4037 	 * We need to explicitly stop the manager thread; the asyc worker
   4038 	 * threads can timeout and exit on their own.
   4039 	 */
   4040 	mutex_enter(&mi->mi_async_lock);
   4041 	mi->mi_max_threads = 0;
   4042 	cv_broadcast(&mi->mi_async_work_cv);
   4043 	mutex_exit(&mi->mi_async_lock);
   4044 	if (mi->mi_manager_thread)
   4045 		nfs4_async_manager_stop(vfsp);
   4046 
   4047 	destroy_rtable4(vfsp, cr);
   4048 
   4049 	nfs4_remove_mi_from_server(mi, NULL);
   4050 
   4051 	if (async_thread) {
   4052 		mutex_enter(&cpr_lock);
   4053 		CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
   4054 		mutex_destroy(&cpr_lock);
   4055 	}
   4056 
   4057 	removed = nfs4_mi_zonelist_remove(mi);
   4058 	if (removed)
   4059 		zone_rele(mi->mi_zone);
   4060 }
   4061