Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
     29  * triggered from a "stub" rnode via a special set of vnodeops.
     30  */
     31 
     32 #include <sys/param.h>
     33 #include <sys/types.h>
     34 #include <sys/systm.h>
     35 #include <sys/cred.h>
     36 #include <sys/time.h>
     37 #include <sys/vnode.h>
     38 #include <sys/vfs.h>
     39 #include <sys/vfs_opreg.h>
     40 #include <sys/file.h>
     41 #include <sys/filio.h>
     42 #include <sys/uio.h>
     43 #include <sys/buf.h>
     44 #include <sys/mman.h>
     45 #include <sys/pathname.h>
     46 #include <sys/dirent.h>
     47 #include <sys/debug.h>
     48 #include <sys/vmsystm.h>
     49 #include <sys/fcntl.h>
     50 #include <sys/flock.h>
     51 #include <sys/swap.h>
     52 #include <sys/errno.h>
     53 #include <sys/strsubr.h>
     54 #include <sys/sysmacros.h>
     55 #include <sys/kmem.h>
     56 #include <sys/mount.h>
     57 #include <sys/cmn_err.h>
     58 #include <sys/pathconf.h>
     59 #include <sys/utsname.h>
     60 #include <sys/dnlc.h>
     61 #include <sys/acl.h>
     62 #include <sys/systeminfo.h>
     63 #include <sys/policy.h>
     64 #include <sys/sdt.h>
     65 #include <sys/list.h>
     66 #include <sys/stat.h>
     67 #include <sys/mntent.h>
     68 #include <sys/priv.h>
     69 
     70 #include <rpc/types.h>
     71 #include <rpc/auth.h>
     72 #include <rpc/clnt.h>
     73 
     74 #include <nfs/nfs.h>
     75 #include <nfs/nfs_clnt.h>
     76 #include <nfs/nfs_acl.h>
     77 #include <nfs/lm.h>
     78 #include <nfs/nfs4.h>
     79 #include <nfs/nfs4_kprot.h>
     80 #include <nfs/rnode4.h>
     81 #include <nfs/nfs4_clnt.h>
     82 #include <nfs/nfsid_map.h>
     83 #include <nfs/nfs4_idmap_impl.h>
     84 
     85 #include <vm/hat.h>
     86 #include <vm/as.h>
     87 #include <vm/page.h>
     88 #include <vm/pvn.h>
     89 #include <vm/seg.h>
     90 #include <vm/seg_map.h>
     91 #include <vm/seg_kpm.h>
     92 #include <vm/seg_vn.h>
     93 
     94 #include <fs/fs_subr.h>
     95 
     96 #include <sys/ddi.h>
     97 #include <sys/int_fmtio.h>
     98 
     99 #include <sys/sunddi.h>
    100 
    101 #include <sys/priv_names.h>
    102 
/* Zone keys owned by other parts of the NFS client. */
extern zone_key_t	nfs4clnt_zone_key;
extern zone_key_t	nfsidmap_zone_key;

/*
 * Tunable for the automatic unmounter thread.
 *
 * NOTE(review): presumably the interval at which that thread runs; the
 * consumer of this value is outside the part of the file reviewed here.
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */

/*
 * Just a default....
 *
 * NOTE(review): looks like the default ephemeral mount timeout (compare
 * ntg_mount_to in nfs4_trigger_globals_t); units are not stated here -
 * confirm against the code that consumes it.
 */
static uint_t nfs4_trigger_mount_to = 240;
    115 
/*
 * Per-zone state for ephemeral mounts. An instance is looked up with
 * zone_getspecific(nfs4_ephemeral_key, zone) in nfs4_trigger_mount().
 */
typedef struct nfs4_trigger_globals {
	/* held while ntg_forest is read or relinked */
	kmutex_t		ntg_forest_lock;
	/* NOTE(review): presumably seeded from nfs4_trigger_mount_to */
	uint_t			ntg_mount_to;
	/* NOTE(review): presumably set once the harvester has been started */
	int			ntg_thread_started;
	/* head of the list of ephemeral trees, chained through net_next */
	nfs4_ephemeral_tree_t	*ntg_forest;
} nfs4_trigger_globals_t;
    122 
/* NOTE(review): not used in the code reviewed here; confirm its role. */
kmutex_t	nfs4_ephemeral_thread_lock;

/*
 * Zone key under which each zone's nfs4_trigger_globals_t is stored; see
 * the zone_getspecific() call in nfs4_trigger_mount().
 */
zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;

/* Forward declaration; the definition is later in this file. */
static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
    128 
    129 /*
    130  * Used for ephemeral mounts; contains data either duplicated from
    131  * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
    132  *
    133  * It's intended that this structure is used solely for ephemeral
    134  * mount-type specific data, for passing this data to
    135  * nfs4_trigger_nargs_create().
    136  */
/*
 * Field notes below are inferred from the member names unless stated
 * otherwise; confirm against nfs4_trigger_esi_create*() before relying
 * on them.
 */
typedef struct ephemeral_servinfo {
	/* server name; copied into the failover host list by the caller */
	char			*esi_hostname;
	char			*esi_netname;
	/* NOTE(review): presumably the server-side path to mount */
	char			*esi_path;
	/* NOTE(review): presumably the allocated length of esi_path */
	int			esi_path_len;
	int			esi_mount_flags;
	struct netbuf		*esi_addr;
	struct netbuf		*esi_syncaddr;
	struct knetconfig	*esi_knconf;
} ephemeral_servinfo_t;
    147 
    148 /*
    149  * Collect together the mount-type specific and generic data args.
    150  */
/*
 * Built by nfs4_trigger_domount_args_create(), consumed by
 * nfs4_trigger_domount() and released with
 * nfs4_trigger_domount_args_destroy().
 */
typedef struct domount_args {
	/* mount-type specific data for the chosen server */
	ephemeral_servinfo_t	*dma_esi;
	char			*dma_hostlist; /* comma-sep. for RO failover */
	/* generic mount arguments; one nfs_args per usable server */
	struct nfs_args		*dma_nargs;
} domount_args_t;
    156 
    157 
    158 /*
    159  * The vnode ops functions for a trigger stub vnode
    160  */
/*
 * Forward declarations so that the vnodeops template can reference the
 * handlers before their definitions.
 */
static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
    caller_context_t *);
static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
    struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
    int *, pathname_t *);
static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
    enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
    vsecattr_t *);
static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
    int);
static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
    vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
    cred_t *, caller_context_t *, int);
/*
 * NOTE(review): nfs4_trigger_cmp() is declared here but is not installed
 * in nfs4_trigger_vnodeops_template[]; confirm whether it is still
 * defined and needed.
 */
static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
    189 
    190 /*
    191  * Regular NFSv4 vnodeops that we need to reference directly
    192  */
/*
 * These are defined elsewhere in the NFSv4 client. Stub vnodes call them
 * by name because going through VOP_*() on a stub would come straight
 * back to the trigger ops.
 */
extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
		    caller_context_t *);
extern void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
		    struct pathname *, int, vnode_t *, cred_t *,
		    caller_context_t *, int *, pathname_t *);
extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
		    caller_context_t *);
extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
		    caller_context_t *);
extern int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
extern int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
    207 
/*
 * Forward declarations for the helpers that decide whether a mount is
 * needed, assemble its arguments and carry it out.
 */
static int	nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
    cred_t *, vnode_t **);
static domount_args_t  *nfs4_trigger_domount_args_create(vnode_t *, cred_t *);
static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
    vnode_t *vp);
static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *,
    cred_t *);
static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
    servinfo4_t *);
static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *,
    cred_t *);
static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
    ephemeral_servinfo_t *);
static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
static char	*nfs4_trigger_create_mntopts(vfs_t *);
static void	nfs4_trigger_destroy_mntopts(char *);
static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
static enum clnt_stat nfs4_ping_server_common(struct knetconfig *,
    struct netbuf *, int);

/* Defined outside this file. */
extern int	umount2_engine(vfs_t *, int, cred_t *, int);

/*
 * The vnodeops vector for stub vnodes.
 * NOTE(review): presumably built from nfs4_trigger_vnodeops_template[]
 * during client initialisation - that code is not in this file section.
 */
vnodeops_t *nfs4_trigger_vnodeops;
    234 
    235 /*
    236  * These are the vnodeops that we must define for stub vnodes.
    237  *
    238  *
    239  * Many of the VOPs defined for NFSv4 do not need to be defined here,
    240  * for various reasons. This will result in the VFS default function being
    241  * used:
    242  *
    243  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
    244  *   lost the reference to the stub vnode, meaning these should not be called:
    245  *       close, read, write, ioctl, readdir, seek.
    246  *
    247  * - These VOPs are meaningless for vnodes without data pages. Since the
    248  *   stub vnode is of type VDIR, these should not be called:
    249  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
    250  *
    251  * - These VOPs are otherwise not applicable, and should not be called:
    252  *       dump, setsecattr.
    253  *
    254  *
    255  * These VOPs we do not want to define, but nor do we want the VFS default
    256  * action. Instead, we specify the VFS error function, with fs_error(), but
    257  * note that fs_error() is not actually called. Instead it results in the
    258  * use of the error function defined for the particular VOP, in vn_ops_table[]:
    259  *
    260  * -   frlock, dispose, shrlock.
    261  *
    262  *
    263  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
    264  * NOTE: if any of these ops involve an OTW call with the stub FH, then
    265  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
    266  * to protect the security data in the servinfo4_t for the "parent"
    267  * filesystem that contains the stub.
    268  *
    269  * - These VOPs should not trigger a mount, so that "ls -l" does not:
    270  *       pathconf, getsecattr.
    271  *
    272  * - These VOPs would not make sense to trigger:
    273  *       inactive, rwlock, rwunlock, fid, realvp.
    274  */
/*
 * Name/handler pairs describing the stub vnodeops; the list is closed by
 * a NULL, NULL pair.
 */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	/* handlers implemented in this file */
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	/* regular NFSv4 handlers, used as-is on the stub vnode */
	VOPNAME_INACTIVE, 	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	/* fs_error selects the per-VOP error function, as described above */
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	/* terminator */
	NULL, NULL
};
    302 
    303 static void
    304 nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
    305 {
    306 	ASSERT(mutex_owned(&net->net_cnt_lock));
    307 	net->net_refcnt++;
    308 	ASSERT(net->net_refcnt != 0);
    309 }
    310 
    311 static void
    312 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
    313 {
    314 	mutex_enter(&net->net_cnt_lock);
    315 	nfs4_ephemeral_tree_incr(net);
    316 	mutex_exit(&net->net_cnt_lock);
    317 }
    318 
    319 /*
    320  * We need a safe way to decrement the refcnt whilst the
    321  * lock is being held.
    322  */
    323 static void
    324 nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
    325 {
    326 	ASSERT(mutex_owned(&net->net_cnt_lock));
    327 	ASSERT(net->net_refcnt != 0);
    328 	net->net_refcnt--;
    329 }
    330 
    331 static void
    332 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
    333 {
    334 	mutex_enter(&net->net_cnt_lock);
    335 	nfs4_ephemeral_tree_decr(net);
    336 	mutex_exit(&net->net_cnt_lock);
    337 }
    338 
    339 /*
    340  * Trigger ops for stub vnodes; for mirror mounts, etc.
    341  *
    342  * The general idea is that a "triggering" op will first call
    343  * nfs4_trigger_mount(), which will find out whether a mount has already
    344  * been triggered.
    345  *
    346  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
    347  * of the covering vfs.
    348  *
    349  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
    350  * and again set newvp, as above.
    351  *
    352  * The triggering op may then re-issue the VOP by calling it on newvp.
    353  *
    354  * Note that some ops may perform custom action, and may or may not need
    355  * to trigger a mount.
    356  *
    357  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
    358  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
    359  * and that would just recurse. Instead, we call the v4 op directly,
    360  * by name.  This is OK, since we know that the vnode is for NFSv4,
    361  * otherwise it couldn't be a stub.
    362  *
    363  */
    364 
    365 static int
    366 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
    367 {
    368 	int error;
    369 	vnode_t *newvp;
    370 
    371 	error = nfs4_trigger_mount(*vpp, cr, &newvp);
    372 	if (error)
    373 		return (error);
    374 
    375 	/* Release the stub vnode, as we're losing the reference to it */
    376 	VN_RELE(*vpp);
    377 
    378 	/* Give the caller the root vnode of the newly-mounted fs */
    379 	*vpp = newvp;
    380 
    381 	/* return with VN_HELD(newvp) */
    382 	return (VOP_OPEN(vpp, flag, cr, ct));
    383 }
    384 
    385 void
    386 nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
    387 {
    388 	uint_t mask;
    389 	timespec_t now;
    390 
    391 	/*
    392 	 * Set some attributes here for referrals.
    393 	 */
    394 	mask = vap->va_mask;
    395 	bzero(vap, sizeof (struct vattr));
    396 	vap->va_mask	= mask;
    397 	vap->va_uid	= 0;
    398 	vap->va_gid	= 0;
    399 	vap->va_nlink	= 1;
    400 	vap->va_size	= 1;
    401 	gethrestime(&now);
    402 	vap->va_atime	= now;
    403 	vap->va_mtime	= now;
    404 	vap->va_ctime	= now;
    405 	vap->va_type	= VDIR;
    406 	vap->va_mode	= 0555;
    407 	vap->va_fsid	= vp->v_vfsp->vfs_dev;
    408 	vap->va_rdev	= 0;
    409 	vap->va_blksize	= MAXBSIZE;
    410 	vap->va_nblocks	= 1;
    411 	vap->va_seq	= 0;
    412 }
    413 
    414 /*
    415  * For the majority of cases, nfs4_trigger_getattr() will not trigger
    416  * a mount. However, if ATTR_TRIGGER is set, we are being informed
    417  * that we need to force the mount before we attempt to determine
    418  * the attributes. The intent is an atomic operation for security
    419  * testing.
    420  *
    421  * If we're not triggering a mount, we can still inquire about the
    422  * actual attributes from the server in the mirror mount case,
    423  * and will return manufactured attributes for a referral (see
    424  * the 'create' branch of find_referral_stubvp()).
    425  */
    426 static int
    427 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    428     caller_context_t *ct)
    429 {
    430 	int error;
    431 
    432 	if (flags & ATTR_TRIGGER) {
    433 		vnode_t	*newvp;
    434 
    435 		error = nfs4_trigger_mount(vp, cr, &newvp);
    436 		if (error)
    437 			return (error);
    438 
    439 		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
    440 		VN_RELE(newvp);
    441 
    442 	} else if (RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {
    443 
    444 		error = nfs4_getattr(vp, vap, flags, cr, ct);
    445 
    446 	} else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {
    447 
    448 		nfs4_fake_attrs(vp, vap);
    449 		error = 0;
    450 	}
    451 
    452 	return (error);
    453 }
    454 
    455 static int
    456 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    457 		caller_context_t *ct)
    458 {
    459 	int error;
    460 	vnode_t *newvp;
    461 
    462 	error = nfs4_trigger_mount(vp, cr, &newvp);
    463 	if (error)
    464 		return (error);
    465 
    466 	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
    467 	VN_RELE(newvp);
    468 
    469 	return (error);
    470 }
    471 
    472 static int
    473 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    474     caller_context_t *ct)
    475 {
    476 	int error;
    477 	vnode_t *newvp;
    478 
    479 	error = nfs4_trigger_mount(vp, cr, &newvp);
    480 	if (error)
    481 		return (error);
    482 
    483 	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
    484 	VN_RELE(newvp);
    485 
    486 	return (error);
    487 }
    488 
    489 static int
    490 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
    491     struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
    492     caller_context_t *ct, int *deflags, pathname_t *rpnp)
    493 {
    494 	int error;
    495 	vnode_t *newdvp;
    496 	rnode4_t *drp = VTOR4(dvp);
    497 
    498 	ASSERT(RP_ISSTUB(drp));
    499 
    500 	/*
    501 	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
    502 	 * that up. Instead, pass onto the regular op, regardless of whether
    503 	 * we've triggered a mount.
    504 	 */
    505 	if (strcmp(nm, "..") == 0)
    506 		if (RP_ISSTUB_MIRRORMOUNT(drp)) {
    507 			return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
    508 			    ct, deflags, rpnp));
    509 		} else if (RP_ISSTUB_REFERRAL(drp)) {
    510 			/* Return the parent vnode */
    511 			return (vtodv(dvp, vpp, cr, TRUE));
    512 		}
    513 
    514 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    515 	if (error)
    516 		return (error);
    517 
    518 	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
    519 	    deflags, rpnp);
    520 	VN_RELE(newdvp);
    521 
    522 	return (error);
    523 }
    524 
    525 static int
    526 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
    527     enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
    528     int flags, caller_context_t *ct, vsecattr_t *vsecp)
    529 {
    530 	int error;
    531 	vnode_t *newdvp;
    532 
    533 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    534 	if (error)
    535 		return (error);
    536 
    537 	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
    538 	    flags, ct, vsecp);
    539 	VN_RELE(newdvp);
    540 
    541 	return (error);
    542 }
    543 
    544 static int
    545 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
    546     int flags)
    547 {
    548 	int error;
    549 	vnode_t *newdvp;
    550 
    551 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    552 	if (error)
    553 		return (error);
    554 
    555 	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
    556 	VN_RELE(newdvp);
    557 
    558 	return (error);
    559 }
    560 
    561 static int
    562 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
    563     caller_context_t *ct, int flags)
    564 {
    565 	int error;
    566 	vnode_t *newtdvp;
    567 
    568 	error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
    569 	if (error)
    570 		return (error);
    571 
    572 	/*
    573 	 * We don't check whether svp is a stub. Let the NFSv4 code
    574 	 * detect that error, and return accordingly.
    575 	 */
    576 	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
    577 	VN_RELE(newtdvp);
    578 
    579 	return (error);
    580 }
    581 
    582 static int
    583 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    584     cred_t *cr, caller_context_t *ct, int flags)
    585 {
    586 	int error;
    587 	vnode_t *newsdvp;
    588 	rnode4_t *tdrp = VTOR4(tdvp);
    589 
    590 	/*
    591 	 * We know that sdvp is a stub, otherwise we would not be here.
    592 	 *
    593 	 * If tdvp is also be a stub, there are two possibilities: it
    594 	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
    595 	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
    596 	 *
    597 	 * In the former case, just trigger sdvp, and treat tdvp as
    598 	 * though it were not a stub.
    599 	 *
    600 	 * In the latter case, it might be a different stub for the
    601 	 * same server fs as sdvp, or for a different server fs.
    602 	 * Regardless, from the client perspective this would still
    603 	 * be a cross-filesystem rename, and should not be allowed,
    604 	 * so return EXDEV, without triggering either mount.
    605 	 */
    606 	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
    607 		return (EXDEV);
    608 
    609 	error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
    610 	if (error)
    611 		return (error);
    612 
    613 	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
    614 
    615 	VN_RELE(newsdvp);
    616 
    617 	return (error);
    618 }
    619 
    620 /* ARGSUSED */
    621 static int
    622 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
    623     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
    624 {
    625 	int error;
    626 	vnode_t *newdvp;
    627 
    628 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    629 	if (error)
    630 		return (error);
    631 
    632 	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
    633 	VN_RELE(newdvp);
    634 
    635 	return (error);
    636 }
    637 
    638 static int
    639 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    640     caller_context_t *ct, int flags)
    641 {
    642 	int error;
    643 	vnode_t *newdvp;
    644 
    645 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    646 	if (error)
    647 		return (error);
    648 
    649 	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
    650 	VN_RELE(newdvp);
    651 
    652 	return (error);
    653 }
    654 
    655 static int
    656 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
    657     cred_t *cr, caller_context_t *ct, int flags)
    658 {
    659 	int error;
    660 	vnode_t *newdvp;
    661 
    662 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    663 	if (error)
    664 		return (error);
    665 
    666 	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
    667 	VN_RELE(newdvp);
    668 
    669 	return (error);
    670 }
    671 
    672 static int
    673 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
    674     caller_context_t *ct)
    675 {
    676 	int error;
    677 	vnode_t *newvp;
    678 
    679 	error = nfs4_trigger_mount(vp, cr, &newvp);
    680 	if (error)
    681 		return (error);
    682 
    683 	error = VOP_READLINK(newvp, uiop, cr, ct);
    684 	VN_RELE(newvp);
    685 
    686 	return (error);
    687 }
    688 
    689 /* end of trigger vnode ops */
    690 
    691 /*
    692  * See if the mount has already been done by another caller.
    693  */
    694 static int
    695 nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
    696     bool_t *was_mounted, vfs_t **vfsp)
    697 {
    698 	int		error;
    699 	mntinfo4_t	*mi = VTOMI4(vp);
    700 
    701 	*was_mounted = FALSE;
    702 
    703 	error = vn_vfsrlock_wait(vp);
    704 	if (error)
    705 		return (error);
    706 
    707 	*vfsp = vn_mountedvfs(vp);
    708 	if (*vfsp != NULL) {
    709 		/* the mount has already occurred */
    710 		error = VFS_ROOT(*vfsp, newvpp);
    711 		if (!error) {
    712 			/* need to update the reference time  */
    713 			mutex_enter(&mi->mi_lock);
    714 			if (mi->mi_ephemeral)
    715 				mi->mi_ephemeral->ne_ref_time =
    716 				    gethrestime_sec();
    717 			mutex_exit(&mi->mi_lock);
    718 
    719 			*was_mounted = TRUE;
    720 		}
    721 	}
    722 
    723 	vn_vfsunlock(vp);
    724 	return (0);
    725 }
    726 
    727 /*
    728  * Mount upon a trigger vnode; for mirror-mounts, referrals, etc.
    729  *
    730  * The mount may have already occurred, via another thread. If not,
    731  * assemble the location information - which may require fetching - and
    732  * perform the mount.
    733  *
    734  * Sets newvp to be the root of the fs that is now covering vp. Note
    735  * that we return with VN_HELD(*newvp).
    736  *
    737  * The caller is responsible for passing the VOP onto the covering fs.
    738  */
    739 static int
    740 nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
    741 {
    742 	int			 error;
    743 	vfs_t			*vfsp;
    744 	rnode4_t		*rp = VTOR4(vp);
    745 	mntinfo4_t		*mi = VTOMI4(vp);
    746 	domount_args_t		*dma;
    747 
    748 	nfs4_ephemeral_tree_t	*net;
    749 
    750 	bool_t			must_unlock = FALSE;
    751 	bool_t			is_building = FALSE;
    752 	bool_t			was_mounted = FALSE;
    753 
    754 	cred_t			*mcred = NULL;
    755 
    756 	nfs4_trigger_globals_t	*ntg;
    757 
    758 	zone_t			*zone = curproc->p_zone;
    759 
    760 	ASSERT(RP_ISSTUB(rp));
    761 
    762 	*newvpp = NULL;
    763 
    764 	/*
    765 	 * Has the mount already occurred?
    766 	 */
    767 	error = nfs4_trigger_mounted_already(vp, newvpp,
    768 	    &was_mounted, &vfsp);
    769 	if (error || was_mounted)
    770 		goto done;
    771 
    772 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
    773 	ASSERT(ntg != NULL);
    774 
    775 	mutex_enter(&mi->mi_lock);
    776 
    777 	/*
    778 	 * We need to lock down the ephemeral tree.
    779 	 */
    780 	if (mi->mi_ephemeral_tree == NULL) {
    781 		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
    782 		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
    783 		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
    784 		net->net_refcnt = 1;
    785 		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
    786 		is_building = TRUE;
    787 
    788 		/*
    789 		 * We need to add it to the zone specific list for
    790 		 * automatic unmounting and harvesting of deadwood.
    791 		 */
    792 		mutex_enter(&ntg->ntg_forest_lock);
    793 		if (ntg->ntg_forest != NULL)
    794 			net->net_next = ntg->ntg_forest;
    795 		ntg->ntg_forest = net;
    796 		mutex_exit(&ntg->ntg_forest_lock);
    797 
    798 		/*
    799 		 * No lock order confusion with mi_lock because no
    800 		 * other node could have grabbed net_tree_lock.
    801 		 */
    802 		mutex_enter(&net->net_tree_lock);
    803 		mi->mi_ephemeral_tree = net;
    804 		net->net_mount = mi;
    805 		mutex_exit(&mi->mi_lock);
    806 
    807 		MI4_HOLD(mi);
    808 		VFS_HOLD(mi->mi_vfsp);
    809 	} else {
    810 		net = mi->mi_ephemeral_tree;
    811 		nfs4_ephemeral_tree_hold(net);
    812 
    813 		mutex_exit(&mi->mi_lock);
    814 
    815 		mutex_enter(&net->net_tree_lock);
    816 
    817 		/*
    818 		 * We can only procede if the tree is neither locked
    819 		 * nor being torn down.
    820 		 */
    821 		mutex_enter(&net->net_cnt_lock);
    822 		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
    823 			nfs4_ephemeral_tree_decr(net);
    824 			mutex_exit(&net->net_cnt_lock);
    825 			mutex_exit(&net->net_tree_lock);
    826 
    827 			return (EIO);
    828 		}
    829 		mutex_exit(&net->net_cnt_lock);
    830 	}
    831 
    832 	mutex_enter(&net->net_cnt_lock);
    833 	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
    834 	mutex_exit(&net->net_cnt_lock);
    835 
    836 	must_unlock = TRUE;
    837 
    838 	dma = nfs4_trigger_domount_args_create(vp, cr);
    839 	if (dma == NULL) {
    840 		error = EINVAL;
    841 		goto done;
    842 	}
    843 
    844 	/*
    845 	 * Note that since we define mirror mounts to work
    846 	 * for any user, we simply extend the privileges of
    847 	 * the user's credentials to allow the mount to
    848 	 * proceed.
    849 	 */
    850 	mcred = crdup(cr);
    851 	if (mcred == NULL) {
    852 		error = EINVAL;
    853 		goto done;
    854 	}
    855 
    856 	crset_zone_privall(mcred);
    857 	if (is_system_labeled())
    858 		(void) setpflags(NET_MAC_AWARE, 1, mcred);
    859 
    860 	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
    861 	nfs4_trigger_domount_args_destroy(dma, vp);
    862 
    863 	DTRACE_PROBE2(nfs4clnt__func__referral__mount,
    864 	    vnode_t *, vp, int, error);
    865 
    866 	crfree(mcred);
    867 
    868 done:
    869 
    870 	if (must_unlock) {
    871 		mutex_enter(&net->net_cnt_lock);
    872 		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
    873 
    874 		/*
    875 		 * REFCNT: If we are the root of the tree, then we need
    876 		 * to keep a reference because we malloced the tree and
    877 		 * this is where we tied it to our mntinfo.
    878 		 *
    879 		 * If we are not the root of the tree, then our tie to
    880 		 * the mntinfo occured elsewhere and we need to
    881 		 * decrement the reference to the tree.
    882 		 */
    883 		if (is_building)
    884 			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
    885 		else
    886 			nfs4_ephemeral_tree_decr(net);
    887 		mutex_exit(&net->net_cnt_lock);
    888 
    889 		mutex_exit(&net->net_tree_lock);
    890 	}
    891 
    892 	if (!error && (newvpp == NULL || *newvpp == NULL))
    893 		error = ENOSYS;
    894 
    895 	return (error);
    896 }
    897 
/*
 * Collect together both the generic & mount-type specific args.
 *
 * Pings the stub's current server and then walks mi->mi_servers, building
 * one nfs_args per responsive server (chained via nfs_ext_u.nfs_extB.next)
 * together with a comma-separated list of their hostnames. The esi of the
 * first responsive server is kept in the returned domount_args_t; the esi
 * of every other server is destroyed as soon as its nfs_args exist.
 *
 * Returns NULL only when the current server answered the ping but no esi
 * could be created for it. In every other case the function does not
 * return until at least one server has responded and yielded an esi,
 * sleeping one second between passes over the server list.
 *
 * The result is released with nfs4_trigger_domount_args_destroy().
 */
static domount_args_t *
nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr)
{
	int nointr;
	char *hostlist;
	servinfo4_t *svp;
	struct nfs_args *nargs, *nargs_head;
	enum clnt_stat status;
	ephemeral_servinfo_t *esi, *esi_first;
	domount_args_t *dma;
	mntinfo4_t *mi = VTOMI4(vp);

	nointr = !(mi->mi_flags & MI4_INT);
	/* MAXPATHLEN must match the size freed in ..._args_destroy() */
	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	svp = mi->mi_curr_serv;
	/* check if the current server is responding */
	status = nfs4_trigger_ping_server(svp, nointr);
	if (status == RPC_SUCCESS) {
		esi_first = nfs4_trigger_esi_create(vp, svp, cr);
		if (esi_first == NULL) {
			/* the only failure return: nothing else to undo yet */
			kmem_free(hostlist, MAXPATHLEN);
			return (NULL);
		}

		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);

		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
	} else {
		/* current server did not respond */
		esi_first = NULL;
		nargs_head = NULL;
	}
	nargs = nargs_head;

	/*
	 * NFS RO failover.
	 *
	 * If we have multiple servinfo4 structures, linked via sv_next,
	 * we must create one nfs_args for each, linking the nfs_args via
	 * nfs_ext_u.nfs_extB.next.
	 *
	 * We need to build a corresponding esi for each, too, but that is
	 * used solely for building nfs_args, and may be immediately
	 * discarded, as domount() requires the info from just one esi,
	 * but all the nfs_args.
	 *
	 * Currently, the NFS mount code will hang if not all servers
	 * requested are available. To avoid that, we need to ping each
	 * server, here, and remove it from the list if it is not
	 * responding. This has the side-effect of that server then
	 * being permanently unavailable for this failover mount, even if
	 * it recovers. That's unfortunate, but the best we can do until
	 * the mount code path is fixed.
	 */

	/*
	 * If the current server was down, loop indefinitely until we find
	 * at least one responsive server.
	 */
	do {
		/* no locking needed for sv_next; it is only set at fs mount */
		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
			struct nfs_args *next;

			/*
			 * nargs_head: the head of the nfs_args list
			 * nargs: the current tail of the list
			 * next: the newly-created element to be added
			 */

			/*
			 * We've already tried the current server, above;
			 * if it was responding, we have already included it
			 * and it may now be ignored.
			 *
			 * Otherwise, try it again, since it may now have
			 * recovered.
			 */
			if (svp == mi->mi_curr_serv && esi_first != NULL)
				continue;

			/* skip servers flagged as no longer in use */
			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			/* check if the server is responding */
			status = nfs4_trigger_ping_server(svp, nointr);
			/* if the server did not respond, ignore it */
			if (status != RPC_SUCCESS)
				continue;

			/* a server we cannot describe is treated like a dead one */
			esi = nfs4_trigger_esi_create(vp, svp, cr);
			if (esi == NULL)
				continue;

			/*
			 * If the original current server (mi_curr_serv)
			 * was down when we first tried it,
			 * (i.e. esi_first == NULL),
			 * we select this new server (svp) to be the server
			 * that we will actually contact (esi_first).
			 *
			 * Note that it's possible that mi_curr_serv == svp,
			 * if that mi_curr_serv was down but has now recovered.
			 */
			next = nfs4_trigger_nargs_create(mi, svp, esi);
			if (esi_first == NULL) {
				ASSERT(nargs == NULL);
				ASSERT(nargs_head == NULL);
				nargs_head = next;
				esi_first = esi;
				(void) strlcpy(hostlist,
				    esi_first->esi_hostname, MAXPATHLEN);
			} else {
				ASSERT(nargs_head != NULL);
				nargs->nfs_ext_u.nfs_extB.next = next;
				/*
				 * strlcat() truncates silently if the list
				 * outgrows MAXPATHLEN.
				 */
				(void) strlcat(hostlist, ",", MAXPATHLEN);
				(void) strlcat(hostlist, esi->esi_hostname,
				    MAXPATHLEN);
				/* esi was only needed for hostname & nargs */
				nfs4_trigger_esi_destroy(esi, vp);
			}

			/* the new element is now the tail of the list */
			nargs = next;
		}

		/* if we've had no response at all, wait a second */
		if (esi_first == NULL)
			delay(drv_usectohz(1000000));

	} while (esi_first == NULL);
	ASSERT(nargs_head != NULL);

	/* hand ownership of esi_first, hostlist and the nargs chain to dma */
	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
	dma->dma_esi = esi_first;
	dma->dma_hostlist = hostlist;
	dma->dma_nargs = nargs_head;

	return (dma);
}
   1045 
   1046 static void
   1047 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
   1048 {
   1049 	if (dma != NULL) {
   1050 		if (dma->dma_esi != NULL && vp != NULL)
   1051 			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
   1052 
   1053 		if (dma->dma_hostlist != NULL)
   1054 			kmem_free(dma->dma_hostlist, MAXPATHLEN);
   1055 
   1056 		if (dma->dma_nargs != NULL) {
   1057 			struct nfs_args *nargs = dma->dma_nargs;
   1058 
   1059 			do {
   1060 				struct nfs_args *next =
   1061 				    nargs->nfs_ext_u.nfs_extB.next;
   1062 
   1063 				nfs4_trigger_nargs_destroy(nargs);
   1064 				nargs = next;
   1065 			} while (nargs != NULL);
   1066 		}
   1067 
   1068 		kmem_free(dma, sizeof (domount_args_t));
   1069 	}
   1070 }
   1071 
   1072 /*
   1073  * The ephemeral_servinfo_t struct contains basic information we will need to
   1074  * perform the mount. Whilst the structure is generic across different
   1075  * types of ephemeral mount, the way we gather its contents differs.
   1076  */
   1077 static ephemeral_servinfo_t *
   1078 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr)
   1079 {
   1080 	ephemeral_servinfo_t *esi;
   1081 	rnode4_t *rp = VTOR4(vp);
   1082 
   1083 	ASSERT(RP_ISSTUB(rp));
   1084 
   1085 	/* Call the ephemeral type-specific routine */
   1086 	if (RP_ISSTUB_MIRRORMOUNT(rp))
   1087 		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
   1088 	else if (RP_ISSTUB_REFERRAL(rp))
   1089 		esi = nfs4_trigger_esi_create_referral(vp, cr);
   1090 	else
   1091 		esi = NULL;
   1092 	return (esi);
   1093 }
   1094 
   1095 static void
   1096 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
   1097 {
   1098 	rnode4_t *rp = VTOR4(vp);
   1099 
   1100 	ASSERT(RP_ISSTUB(rp));
   1101 
   1102 	/* Currently, no need for an ephemeral type-specific routine */
   1103 
   1104 	/*
   1105 	 * The contents of ephemeral_servinfo_t goes into nfs_args,
   1106 	 * and will be handled by nfs4_trigger_nargs_destroy().
   1107 	 * We need only free the structure itself.
   1108 	 */
   1109 	if (esi != NULL)
   1110 		kmem_free(esi, sizeof (ephemeral_servinfo_t));
   1111 }
   1112 
   1113 /*
   1114  * Some of this may turn out to be common with other ephemeral types,
   1115  * in which case it should be moved to nfs4_trigger_esi_create(), or a
   1116  * common function called.
   1117  */
   1118 
   1119 /*
   1120  * Mirror mounts case - should have all data available
   1121  */
   1122 static ephemeral_servinfo_t *
   1123 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
   1124 {
   1125 	char			*stubpath;
   1126 	struct knetconfig	*sikncp, *svkncp;
   1127 	struct netbuf		*bufp;
   1128 	ephemeral_servinfo_t	*esi;
   1129 
   1130 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
   1131 
   1132 	/* initially set to be our type of ephemeral mount; may be added to */
   1133 	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
   1134 
   1135 	/*
   1136 	 * We're copying info from the stub rnode's servinfo4, but
   1137 	 * we must create new copies, not pointers, since this information
   1138 	 * is to be associated with the new mount, which will be
   1139 	 * unmounted (and its structures freed) separately
   1140 	 */
   1141 
   1142 	/*
   1143 	 * Sizes passed to kmem_[z]alloc here must match those freed
   1144 	 * in nfs4_free_args()
   1145 	 */
   1146 
   1147 	/*
   1148 	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
   1149 	 * is difficult to avoid: as we need to read svp to calculate the
   1150 	 * sizes to be allocated.
   1151 	 */
   1152 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1153 
   1154 	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
   1155 	(void) strcat(esi->esi_hostname, svp->sv_hostname);
   1156 
   1157 	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
   1158 	bufp = esi->esi_addr;
   1159 	bufp->len = svp->sv_addr.len;
   1160 	bufp->maxlen = svp->sv_addr.maxlen;
   1161 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
   1162 	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
   1163 
   1164 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
   1165 	sikncp = esi->esi_knconf;
   1166 	svkncp = svp->sv_knconf;
   1167 	sikncp->knc_semantics = svkncp->knc_semantics;
   1168 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
   1169 	(void) strcat((char *)sikncp->knc_protofmly,
   1170 	    (char *)svkncp->knc_protofmly);
   1171 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
   1172 	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
   1173 	sikncp->knc_rdev = svkncp->knc_rdev;
   1174 
   1175 	/*
   1176 	 * Used when AUTH_DH is negotiated.
   1177 	 *
   1178 	 * This is ephemeral mount-type specific, since it contains the
   1179 	 * server's time-sync syncaddr.
   1180 	 */
   1181 	if (svp->sv_dhsec) {
   1182 		struct netbuf *bufp;
   1183 		sec_data_t *sdata;
   1184 		dh_k4_clntdata_t *data;
   1185 
   1186 		sdata = svp->sv_dhsec;
   1187 		data = (dh_k4_clntdata_t *)sdata->data;
   1188 		ASSERT(sdata->rpcflavor == AUTH_DH);
   1189 
   1190 		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
   1191 		bufp->len = data->syncaddr.len;
   1192 		bufp->maxlen = data->syncaddr.maxlen;
   1193 		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
   1194 		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
   1195 		esi->esi_syncaddr = bufp;
   1196 
   1197 		if (data->netname != NULL) {
   1198 			int nmlen = data->netnamelen;
   1199 
   1200 			/*
   1201 			 * We need to copy from a dh_k4_clntdata_t
   1202 			 * netname/netnamelen pair to a NUL-terminated
   1203 			 * netname string suitable for putting in nfs_args,
   1204 			 * where the latter has no netnamelen field.
   1205 			 */
   1206 			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
   1207 			bcopy(data->netname, esi->esi_netname, nmlen);
   1208 		}
   1209 	} else {
   1210 		esi->esi_syncaddr = NULL;
   1211 		esi->esi_netname = NULL;
   1212 	}
   1213 
   1214 	stubpath = fn_path(VTOSV(vp)->sv_name);
   1215 	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
   1216 	ASSERT(*stubpath == '.');
   1217 	stubpath += 1;
   1218 
   1219 	/* for nfs_args->fh */
   1220 	esi->esi_path_len = strlen(stubpath) + 1;
   1221 	if (strcmp(svp->sv_path, "/") != 0)
   1222 		esi->esi_path_len += strlen(svp->sv_path);
   1223 	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
   1224 	if (strcmp(svp->sv_path, "/") != 0)
   1225 		(void) strcat(esi->esi_path, svp->sv_path);
   1226 	(void) strcat(esi->esi_path, stubpath);
   1227 
   1228 	stubpath -= 1;
   1229 	/* stubpath allocated by fn_path() */
   1230 	kmem_free(stubpath, strlen(stubpath) + 1);
   1231 
   1232 	nfs_rw_exit(&svp->sv_lock);
   1233 
   1234 	return (esi);
   1235 }
   1236 
/*
 * Makes an upcall to NFSMAPID daemon to resolve hostname of NFS server to
 * get network information required to do the mount call.
 *
 * server: the host name to resolve; it is XDR-encoded into the door
 *	   arguments.
 * resp:   zeroed and then filled in by decoding the daemon's reply when
 *	   the reply reports a status of 0.
 *
 * Returns 0 on success. Otherwise returns:
 *	ECONNREFUSED	the current zone is shutting down
 *	EINVAL		no daemon door handle is registered, or the reply
 *			could not be decoded
 *	1		the server name could not be XDR-encoded
 *	other		the error from door_ki_upcall(), or the non-zero
 *			res_status reported by the daemon
 */
int
nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
{
	door_arg_t	door_args;
	door_handle_t	dh;
	XDR		xdr;
	refd_door_args_t *xdr_argsp;
	refd_door_res_t  *orig_resp;
	k_sigset_t	smask;
	int		xdr_len = 0;
	int 		res_len = 16; /* length of an ip address */
	int		orig_reslen = res_len;
	int		error = 0;
	struct nfsidmap_globals *nig;

	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
		return (ECONNREFUSED);

	nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
	ASSERT(nig != NULL);

	/* take our own hold on the door handle while the lock is held */
	mutex_enter(&nig->nfsidmap_daemon_lock);
	dh = nig->nfsidmap_daemon_dh;
	if (dh == NULL) {
		mutex_exit(&nig->nfsidmap_daemon_lock);
		cmn_err(CE_NOTE,
		    "nfs4_callmapid: nfsmapid daemon not " \
		    "running unable to resolve host name\n");
		return (EINVAL);
	}
	door_ki_hold(dh);
	mutex_exit(&nig->nfsidmap_daemon_lock);

	/* the argument buffer is a fixed header followed by the XDR data */
	xdr_len = xdr_sizeof(&(xdr_utf8string), server);

	xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
	xdr_argsp->xdr_len = xdr_len;
	xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;

	xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
	    xdr_len, XDR_ENCODE);

	if (!xdr_utf8string(&xdr, server)) {
		kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
		door_ki_rele(dh);
		return (1);
	}

	/* orig_reslen is always 16 here, so orig_resp is always allocated */
	if (orig_reslen)
		orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);

	door_args.data_ptr = (char *)xdr_argsp;
	door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
	door_args.desc_ptr = NULL;
	door_args.desc_num = 0;
	door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
	door_args.rsize = res_len;

	sigintr(&smask, 1);
	error = door_ki_upcall(dh, &door_args);
	sigunintr(&smask);

	door_ki_rele(dh);

	/* the encoded arguments are no longer needed, whatever the outcome */
	kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
	if (error) {
		kmem_free(orig_resp, orig_reslen);
		/*
		 * There is no door to connect to. The referral daemon
		 * must not be running yet.
		 */
		cmn_err(CE_WARN,
		    "nfsmapid not running cannot resolve host name");
		goto out;
	}

	/*
	 * If the results buffer passed back are not the same as
	 * what was sent free the old buffer and use the new one.
	 */
	if (orig_resp && orig_reslen) {
		refd_door_res_t *door_resp;

		door_resp = (refd_door_res_t *)door_args.rbuf;
		if ((void *)door_args.rbuf != orig_resp)
			kmem_free(orig_resp, orig_reslen);
		if (door_resp->res_status == 0) {
			/*
			 * NOTE(review): door_resp->xdr_len is taken from
			 * the reply without being checked against
			 * door_args.rsize - confirm the daemon is trusted
			 * to keep it in range.
			 */
			xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
			    door_resp->xdr_len, XDR_DECODE);
			bzero(resp, sizeof (struct nfs_fsl_info));
			if (!xdr_nfs_fsl_info(&xdr, resp)) {
				DTRACE_PROBE2(
				    nfs4clnt__debug__referral__upcall__xdrfail,
				    struct nfs_fsl_info *, resp,
				    char *, "nfs4_callmapid");
				error = EINVAL;
			}
		} else {
			DTRACE_PROBE2(
			    nfs4clnt__debug__referral__upcall__badstatus,
			    int, door_resp->res_status,
			    char *, "nfs4_callmapid");
			error = door_resp->res_status;
		}
		/* frees whichever buffer rbuf now refers to, sized by rsize */
		kmem_free(door_args.rbuf, door_args.rsize);
	}
out:
	DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
	    char *, server, int, error);
	return (error);
}
   1352 
/*
 * Fetches the fs_locations attribute. Typically called
 * from a Replication/Migration/Referrals/Mirror-mount context
 *
 * Issues a PUTFH(sfh) / LOOKUP(nm) / GETATTR compound that requests the
 * fsid, fs_locations and mounted_on_fileid attributes.
 *
 * Fills in the attributes in garp. The caller is assumed
 * to have allocated memory for garp. On success the compound result is
 * also copied to *callres (which must not be NULL), and the caller is
 * responsible for releasing it with xdr_free(). On failure nothing is
 * left for the caller to free.
 *
 * lock: if TRUE, this routine itself acquires s_recovlock and
 *	 mi_recovlock (as writer) around the rfs4call() and drops them
 *	 afterwards. If FALSE, the caller is expected to hold them
 *	 already; that is only verified by ASSERT.
 *
 * Returns
 * 	1	 for success
 * 	0	 for failure
 */
int
nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
    cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 *argop;
	int argoplist_size = 3 * sizeof (nfs_argop4);
	nfs4_server_t *sp = NULL;
	int doqueue = 1;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	int retval = 1;
	struct nfs4_clnt *nfscl;

	/* mi_recovlock is held (by us or by the caller) across the lookup */
	if (lock == TRUE)
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	else
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	sp = find_nfs4_server(mi);
	if (lock == TRUE)
		nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * presumably find_nfs4_server() returns with s_lock held and a
	 * reference taken; the reference is dropped via
	 * nfs4_server_rele() below - TODO confirm
	 */
	if (sp != NULL)
		mutex_exit(&sp->s_lock);

	if (lock == TRUE) {
		/* s_recovlock first, then mi_recovlock; released in reverse */
		if (sp != NULL)
			(void) nfs_rw_enter_sig(&sp->s_recovlock,
			    RW_WRITER, 0);
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
	} else {
		if (sp != NULL) {
			ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
			    nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
		}
	}

	/*
	 * Do we want to do the setup for recovery here?
	 *
	 * We know that the server responded to a null ping a very
	 * short time ago, and we know that we intend to do a
	 * single stateless operation - we want to fetch attributes,
	 * so we know we can't encounter errors about state.  If
	 * something goes wrong with the GETATTR, like not being
	 * able to get a response from the server or getting any
	 * kind of FH error, we should fail the mount.
	 *
	 * We may want to revisit this at a later time.
	 */
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	args.ctag = TAG_GETATTR_FSLOCATION;
	/* PUTFH LOOKUP GETATTR */
	args.array_len = 3;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = sfh;

	/* 1. lookup name, can't be dotdot */
	argop[1].argop = OP_CLOOKUP;
	argop[1].nfs_argop4_u.opclookup.cname = nm;

	/* 2. file attrs */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request =
	    FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
	    FATTR4_MOUNTED_ON_FILEID_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (lock == TRUE) {
		nfs_rw_exit(&mi->mi_recovlock);
		if (sp != NULL)
			nfs_rw_exit(&sp->s_recovlock);
	}

	/* count the referral attempt in the per-zone client statistics */
	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
	nfscl->nfscl_stat.referrals.value.ui64++;
	DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
	    nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);

	if (e.error != 0) {
		/*
		 * res is not freed on this path; presumably rfs4call()
		 * leaves nothing to free when e.error is set - TODO confirm
		 */
		if (sp != NULL)
			nfs4_server_rele(sp);
		kmem_free(argop, argoplist_size);
		return (0);
	}

	/*
	 * Check for all possible error conditions.
	 * For valid replies without an ops array or for illegal
	 * replies, return a failure.
	 */
	if (res.status != NFS4_OK || res.array_len < 3 ||
	    res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
		retval = 0;
		goto exit;
	}

	/*
	 * There isn't much value in putting the attributes
	 * in the attr cache since fs_locations4 aren't
	 * encountered very frequently, so just make them
	 * available to the caller.
	 */
	*garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;

	DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
	    nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");

	/* No fs_locations? -- return a failure */
	if (garp->n4g_ext_res == NULL ||
	    garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
		retval = 0;
		goto exit;
	}

	/* the fsid is required as well */
	if (!garp->n4g_fsid_valid)
		retval = 0;

exit:
	if (retval == 0) {
		/* the call was ok but failed validating the call results */
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	} else {
		/* ownership of the decoded result passes to the caller */
		ASSERT(callres != NULL);
		*callres = res;
	}

	if (sp != NULL)
		nfs4_server_rele(sp);
	kmem_free(argop, argoplist_size);
	return (retval);
}
   1508 
/*
 * Tunable to disable referral mounts: when non-zero,
 * find_referral_stubvp() returns NULL without contacting the server.
 */
int nfs4_no_referrals = 0;
   1511 
   1512 /*
   1513  * Returns NULL if the vnode cannot be created or found.
   1514  */
   1515 vnode_t *
   1516 find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
   1517 {
   1518 	nfs_fh4 *stub_fh, *dfh;
   1519 	nfs4_sharedfh_t *sfhp;
   1520 	char *newfhval;
   1521 	vnode_t *vp = NULL;
   1522 	fattr4_mounted_on_fileid mnt_on_fileid;
   1523 	nfs4_ga_res_t garp;
   1524 	mntinfo4_t *mi;
   1525 	COMPOUND4res_clnt callres;
   1526 	hrtime_t t;
   1527 
   1528 	if (nfs4_no_referrals)
   1529 		return (NULL);
   1530 
   1531 	/*
   1532 	 * Get the mounted_on_fileid, unique on that server::fsid
   1533 	 */
   1534 	mi = VTOMI4(dvp);
   1535 	if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
   1536 	    &garp, &callres, FALSE) == 0)
   1537 		return (NULL);
   1538 	mnt_on_fileid = garp.n4g_mon_fid;
   1539 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
   1540 
   1541 	/*
   1542 	 * Build a fake filehandle from the dir FH and the mounted_on_fileid
   1543 	 */
   1544 	dfh = &VTOR4(dvp)->r_fh->sfh_fh;
   1545 	stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
   1546 	stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
   1547 	    sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
   1548 	newfhval = stub_fh->nfs_fh4_val;
   1549 
   1550 	/* copy directory's file handle */
   1551 	bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
   1552 	stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
   1553 	newfhval = newfhval + dfh->nfs_fh4_len;
   1554 
   1555 	/* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
   1556 	bcopy((char *)&mnt_on_fileid, newfhval,
   1557 	    sizeof (fattr4_mounted_on_fileid));
   1558 	stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);
   1559 
   1560 	sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
   1561 	kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
   1562 	    sizeof (fattr4_mounted_on_fileid));
   1563 	kmem_free(stub_fh, sizeof (nfs_fh4));
   1564 	if (sfhp == NULL)
   1565 		return (NULL);
   1566 
   1567 	t = gethrtime();
   1568 	garp.n4g_va.va_type = VDIR;
   1569 	vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t,
   1570 	    cr, dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
   1571 
   1572 	if (vp != NULL)
   1573 		vp->v_type = VDIR;
   1574 
   1575 	sfh4_rele(&sfhp);
   1576 	return (vp);
   1577 }
   1578 
   1579 int
   1580 nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
   1581 {
   1582 	vnode_t *nvp;
   1583 	rnode4_t *rp;
   1584 
   1585 	if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
   1586 		return (EINVAL);
   1587 
   1588 	rp = VTOR4(nvp);
   1589 	mutex_enter(&rp->r_statelock);
   1590 	r4_stub_referral(rp);
   1591 	mutex_exit(&rp->r_statelock);
   1592 	dnlc_enter(dvp, nm, nvp);
   1593 
   1594 	if (*vpp != NULL)
   1595 		VN_RELE(*vpp);	/* no longer need this vnode */
   1596 
   1597 	*vpp = nvp;
   1598 
   1599 	return (0);
   1600 }
   1601 
   1602 /*
   1603  * Fetch the location information and resolve the new server.
   1604  * Caller needs to free up the XDR data which is returned.
   1605  * Input: mount info, shared filehandle, nodename
   1606  * Return: Index to the result or Error(-1)
   1607  * Output: FsLocations Info, Resolved Server Info.
   1608  */
   1609 int
   1610 nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
   1611     char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
   1612     struct nfs_fsl_info *fsloc)
   1613 {
   1614 	fs_location4 *fsp;
   1615 	struct nfs_fsl_info nfsfsloc;
   1616 	int ret, i, error;
   1617 	nfs4_ga_res_t garp;
   1618 	COMPOUND4res_clnt callres;
   1619 	struct knetconfig *knc;
   1620 
   1621 	ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
   1622 	if (ret == 0)
   1623 		return (-1);
   1624 
   1625 	/*
   1626 	 * As a lame attempt to figuring out if we're
   1627 	 * handling a migration event or a referral,
   1628 	 * look for rnodes with this fsid in the rnode
   1629 	 * cache.
   1630 	 *
   1631 	 * If we can find one or more such rnodes, it
   1632 	 * means we're handling a migration event and
   1633 	 * we want to bail out in that case.
   1634 	 */
   1635 	if (r4find_by_fsid(mi, &garp.n4g_fsid)) {
   1636 		DTRACE_PROBE3(nfs4clnt__debug__referral__migration,
   1637 		    mntinfo4_t *, mi, nfs4_ga_res_t *, &garp,
   1638 		    char *, "nfs4_process_referral");
   1639 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
   1640 		return (-1);
   1641 	}
   1642 
   1643 	/*
   1644 	 * Find the first responsive server to mount.  When we find
   1645 	 * one, fsp will point to it.
   1646 	 */
   1647 	for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) {
   1648 
   1649 		fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i];
   1650 		if (fsp->server_len == 0 || fsp->server_val == NULL)
   1651 			continue;
   1652 
   1653 		error = nfs4_callmapid(fsp->server_val, &nfsfsloc);
   1654 		if (error != 0)
   1655 			continue;
   1656 
   1657 		error = nfs4_ping_server_common(nfsfsloc.knconf,
   1658 		    nfsfsloc.addr, !(mi->mi_flags & MI4_INT));
   1659 		if (error == RPC_SUCCESS)
   1660 			break;
   1661 
   1662 		DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr,
   1663 		    sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf,
   1664 		    char *, "nfs4_process_referral");
   1665 
   1666 		(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
   1667 	}
   1668 	knc = nfsfsloc.knconf;
   1669 	if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) ||
   1670 	    (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) {
   1671 		DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc,
   1672 		    nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral");
   1673 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
   1674 		return (-1);
   1675 	}
   1676 
   1677 	/* Send the results back */
   1678 	*fsloc = nfsfsloc;
   1679 	*grp = garp;
   1680 	*res = callres;
   1681 	return (i);
   1682 }
   1683 
/*
 * Referrals case - need to fetch referral data and then upcall to
 * user-level to get complete mount data.
 *
 * Determines the stub's parent directory and name, asks
 * nfs4_process_referral() to choose a location, and copies that
 * location's hostname, address, knetconfig, netname and root path into
 * a newly allocated ephemeral_servinfo_t.
 *
 * Returns NULL if vp is not a referral stub, if the parent directory,
 * its r_rwlock or the stub's name cannot be obtained, if no usable
 * location is found, or if the root path does not fit in MAXPATHLEN.
 */
static ephemeral_servinfo_t *
nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr)
{
	struct knetconfig	*sikncp, *svkncp;
	struct netbuf		*bufp;
	ephemeral_servinfo_t	*esi;
	vnode_t			*dvp;
	rnode4_t		*drp;
	fs_location4		*fsp;
	struct nfs_fsl_info	nfsfsloc;
	nfs4_ga_res_t		garp;
	char			*p;
	char			fn[MAXNAMELEN];
	int			i, index = -1;
	mntinfo4_t		*mi;
	COMPOUND4res_clnt	callres;

	/*
	 * If we're passed in a stub vnode that
	 * isn't a "referral" stub, bail out
	 * and return a failure
	 */
	if (!RP_ISSTUB_REFERRAL(VTOR4(vp)))
		return (NULL);

	/*
	 * NOTE(review): the lookup of the parent uses CRED() rather than
	 * the cr passed in - confirm this is intentional.
	 */
	if (vtodv(vp, &dvp, CRED(), TRUE) != 0)
		return (NULL);

	/* from here on dvp is held and must be released on every path */
	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
		VN_RELE(dvp);
		return (NULL);
	}

	if (vtoname(vp, fn, MAXNAMELEN) != 0) {
		nfs_rw_exit(&drp->r_rwlock);
		VN_RELE(dvp);
		return (NULL);
	}

	mi = VTOMI4(dvp);
	index = nfs4_process_referral(mi, drp->r_fh, fn, cr,
	    &garp, &callres, &nfsfsloc);
	nfs_rw_exit(&drp->r_rwlock);
	VN_RELE(dvp);
	if (index < 0)
		return (NULL);

	/* index selects the location that nfs4_process_referral() chose */
	fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);

	/* initially set to be our type of ephemeral mount; may be added to */
	esi->esi_mount_flags = NFSMNT_REFERRAL;

	/* the utf8string carries a length; add room for a terminating NUL */
	esi->esi_hostname =
	    kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP);
	bcopy(fsp->server_val->utf8string_val, esi->esi_hostname,
	    fsp->server_val->utf8string_len);
	esi->esi_hostname[fsp->server_val->utf8string_len] = '\0';

	/* kmem_alloc is sufficient: every netbuf field is assigned below */
	bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
	bufp->len = nfsfsloc.addr->len;
	bufp->maxlen = nfsfsloc.addr->maxlen;
	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
	bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len);
	esi->esi_addr = bufp;

	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
	sikncp = esi->esi_knconf;

	DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc,
	    struct nfs_fsl_info *, &nfsfsloc,
	    char *, "nfs4_trigger_esi_create_referral");

	/* private copy of the knetconfig returned by the upcall */
	svkncp = nfsfsloc.knconf;
	sikncp->knc_semantics = svkncp->knc_semantics;
	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strlcat((char *)sikncp->knc_protofmly,
	    (char *)svkncp->knc_protofmly, KNC_STRSIZE);
	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto,
	    KNC_STRSIZE);
	sikncp->knc_rdev = svkncp->knc_rdev;

	DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf,
	    struct knetconfig *, sikncp,
	    char *, "nfs4_trigger_esi_create_referral");

	/*
	 * presumably netnm_len already accounts for the terminating NUL,
	 * since no extra byte is added here - TODO confirm
	 */
	esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP);
	bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len);
	esi->esi_syncaddr = NULL;

	/*
	 * Assemble "/comp1/comp2/..." from the rootpath components in a
	 * MAXPATHLEN scratch buffer. Each component is followed by '/';
	 * the final '/' is overwritten with the terminator afterwards.
	 */
	esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	esi->esi_path_len = MAXPATHLEN;
	*p++ = '/';
	for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
		component4 *comp;

		comp = &fsp->rootpath.pathname4_val[i];
		/* If the component does not fit, fail the whole esi */
		if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN)
			goto err;
		bcopy(comp->utf8string_val, p, comp->utf8string_len);
		p += comp->utf8string_len;
		*p++ = '/';
	}
	if (fsp->rootpath.pathname4_len != 0)
		*(p - 1) = '\0';
	else
		*p = '\0';
	/* swap the scratch buffer for an exactly-sized copy of the path */
	p = esi->esi_path;
	esi->esi_path = strdup(p);
	esi->esi_path_len = strlen(p) + 1;
	kmem_free(p, MAXPATHLEN);

	/* Allocated in nfs4_process_referral() */
	(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);

	return (esi);
err:
	/*
	 * Undo every allocation made above, using the same sizes that
	 * were allocated; esi_path is still the MAXPATHLEN scratch buffer.
	 */
	kmem_free(esi->esi_path, esi->esi_path_len);
	kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1);
	kmem_free(esi->esi_addr->buf, esi->esi_addr->len);
	kmem_free(esi->esi_addr, sizeof (struct netbuf));
	kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE);
	kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE);
	kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf));
	kmem_free(esi->esi_netname, nfsfsloc.netnm_len);
	kmem_free(esi, sizeof (ephemeral_servinfo_t));
	(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
	return (NULL);
}
   1822 
   1823 /*
   1824  * Assemble the args, and call the generic VFS mount function to
   1825  * finally perform the ephemeral mount.
   1826  */
   1827 static int
   1828 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
   1829     cred_t *cr, vnode_t **newvpp)
   1830 {
   1831 	struct mounta	*uap;
   1832 	char		*mntpt, *orig_path, *path;
   1833 	const char	*orig_mntpt;
   1834 	int		retval;
   1835 	int		mntpt_len;
   1836 	int		spec_len;
   1837 	zone_t		*zone = curproc->p_zone;
   1838 	bool_t		has_leading_slash;
   1839 	int		i;
   1840 
   1841 	vfs_t			*stubvfsp = stubvp->v_vfsp;
   1842 	ephemeral_servinfo_t	*esi = dma->dma_esi;
   1843 	struct nfs_args		*nargs = dma->dma_nargs;
   1844 
   1845 	/* first, construct the mount point for the ephemeral mount */
   1846 	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
   1847 	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
   1848 
   1849 	if (*orig_path == '.')
   1850 		orig_path++;
   1851 
   1852 	/*
   1853 	 * Get rid of zone's root path
   1854 	 */
   1855 	if (zone != global_zone) {
   1856 		/*
   1857 		 * -1 for trailing '/' and -1 for EOS.
   1858 		 */
   1859 		if (strncmp(zone->zone_rootpath, orig_mntpt,
   1860 		    zone->zone_rootpathlen - 1) == 0) {
   1861 			orig_mntpt += (zone->zone_rootpathlen - 2);
   1862 		}
   1863 	}
   1864 
   1865 	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
   1866 	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
   1867 	(void) strcat(mntpt, orig_mntpt);
   1868 	(void) strcat(mntpt, orig_path);
   1869 
   1870 	kmem_free(path, strlen(path) + 1);
   1871 	path = esi->esi_path;
   1872 	if (*path == '.')
   1873 		path++;
   1874 	if (path[0] == '/' && path[1] == '/')
   1875 		path++;
   1876 	has_leading_slash = (*path == '/');
   1877 
   1878 	spec_len = strlen(dma->dma_hostlist);
   1879 	spec_len += strlen(path);
   1880 
   1881 	/* We are going to have to add this in */
   1882 	if (!has_leading_slash)
   1883 		spec_len++;
   1884 
   1885 	/* We need to get the ':' for dma_hostlist:esi_path */
   1886 	spec_len++;
   1887 
   1888 	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
   1889 	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
   1890 	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
   1891 	    has_leading_slash ? "" : "/", path);
   1892 
   1893 	uap->dir = mntpt;
   1894 
   1895 	uap->flags = MS_SYSSPACE | MS_DATA;
   1896 	/* fstype-independent mount options not covered elsewhere */
   1897 	/* copy parent's mount(1M) "-m" flag */
   1898 	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
   1899 		uap->flags |= MS_NOMNTTAB;
   1900 
   1901 	uap->fstype = MNTTYPE_NFS4;
   1902 	uap->dataptr = (char *)nargs;
   1903 	/* not needed for MS_SYSSPACE */
   1904 	uap->datalen = 0;
   1905 
   1906 	/* use optptr to pass in extra mount options */
   1907 	uap->flags |= MS_OPTIONSTR;
   1908 	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
   1909 	if (uap->optptr == NULL) {
   1910 		retval = EINVAL;
   1911 		goto done;
   1912 	}
   1913 
   1914 	/* domount() expects us to count the trailing NUL */
   1915 	uap->optlen = strlen(uap->optptr) + 1;
   1916 
   1917 	/*
   1918 	 * If we get EBUSY, we try again once to see if we can perform
   1919 	 * the mount. We do this because of a spurious race condition.
   1920 	 */
   1921 	for (i = 0; i < 2; i++) {
   1922 		int	error;
   1923 		bool_t	was_mounted;
   1924 
   1925 		retval = domount(NULL, uap, stubvp, cr, vfsp);
   1926 		if (retval == 0) {
   1927 			retval = VFS_ROOT(*vfsp, newvpp);
   1928 			VFS_RELE(*vfsp);
   1929 			break;
   1930 		} else if (retval != EBUSY) {
   1931 			break;
   1932 		}
   1933 
   1934 		/*
   1935 		 * We might find it mounted by the other racer...
   1936 		 */
   1937 		error = nfs4_trigger_mounted_already(stubvp,
   1938 		    newvpp, &was_mounted, vfsp);
   1939 		if (error) {
   1940 			goto done;
   1941 		} else if (was_mounted) {
   1942 			retval = 0;
   1943 			break;
   1944 		}
   1945 	}
   1946 
   1947 done:
   1948 	if (uap->optptr)
   1949 		nfs4_trigger_destroy_mntopts(uap->optptr);
   1950 
   1951 	kmem_free(uap->spec, spec_len + 1);
   1952 	kmem_free(uap, sizeof (struct mounta));
   1953 	kmem_free(mntpt, mntpt_len + 1);
   1954 
   1955 	return (retval);
   1956 }
   1957 
   1958 /*
   1959  * Build an nfs_args structure for passing to domount().
   1960  *
   1961  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
   1962  * generic data - common to all ephemeral mount types - is read directly
   1963  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
   1964  */
   1965 static struct nfs_args *
   1966 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
   1967     ephemeral_servinfo_t *esi)
   1968 {
   1969 	sec_data_t *secdata;
   1970 	struct nfs_args *nargs;
   1971 
   1972 	/* setup the nfs args */
   1973 	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
   1974 
   1975 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1976 
   1977 	nargs->addr = esi->esi_addr;
   1978 
   1979 	/* for AUTH_DH by negotiation */
   1980 	if (esi->esi_syncaddr || esi->esi_netname) {
   1981 		nargs->flags |= NFSMNT_SECURE;
   1982 		nargs->syncaddr = esi->esi_syncaddr;
   1983 		nargs->netname = esi->esi_netname;
   1984 	}
   1985 
   1986 	nargs->flags |= NFSMNT_KNCONF;
   1987 	nargs->knconf = esi->esi_knconf;
   1988 	nargs->flags |= NFSMNT_HOSTNAME;
   1989 	nargs->hostname = esi->esi_hostname;
   1990 	nargs->fh = esi->esi_path;
   1991 
   1992 	/* general mount settings, all copied from parent mount */
   1993 	mutex_enter(&mi->mi_lock);
   1994 
   1995 	if (!(mi->mi_flags & MI4_HARD))
   1996 		nargs->flags |= NFSMNT_SOFT;
   1997 
   1998 	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
   1999 	    NFSMNT_RETRANS;
   2000 	nargs->wsize = mi->mi_stsize;
   2001 	nargs->rsize = mi->mi_tsize;
   2002 	nargs->timeo = mi->mi_timeo;
   2003 	nargs->retrans = mi->mi_retrans;
   2004 
   2005 	if (mi->mi_flags & MI4_INT)
   2006 		nargs->flags |= NFSMNT_INT;
   2007 	if (mi->mi_flags & MI4_NOAC)
   2008 		nargs->flags |= NFSMNT_NOAC;
   2009 
   2010 	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
   2011 	    NFSMNT_ACDIRMAX;
   2012 	nargs->acregmin = HR2SEC(mi->mi_acregmin);
   2013 	nargs->acregmax = HR2SEC(mi->mi_acregmax);
   2014 	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
   2015 	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
   2016 
   2017 	/* add any specific flags for this type of ephemeral mount */
   2018 	nargs->flags |= esi->esi_mount_flags;
   2019 
   2020 	if (mi->mi_flags & MI4_NOCTO)
   2021 		nargs->flags |= NFSMNT_NOCTO;
   2022 	if (mi->mi_flags & MI4_GRPID)
   2023 		nargs->flags |= NFSMNT_GRPID;
   2024 	if (mi->mi_flags & MI4_LLOCK)
   2025 		nargs->flags |= NFSMNT_LLOCK;
   2026 	if (mi->mi_flags & MI4_NOPRINT)
   2027 		nargs->flags |= NFSMNT_NOPRINT;
   2028 	if (mi->mi_flags & MI4_DIRECTIO)
   2029 		nargs->flags |= NFSMNT_DIRECTIO;
   2030 	if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT)
   2031 		nargs->flags |= NFSMNT_PUBLIC;
   2032 
   2033 	/* Do some referral-specific option tweaking */
   2034 	if (nargs->flags & NFSMNT_REFERRAL) {
   2035 		nargs->flags &= ~NFSMNT_DORDMA;
   2036 		nargs->flags |= NFSMNT_TRYRDMA;
   2037 	}
   2038 
   2039 	mutex_exit(&mi->mi_lock);
   2040 
   2041 	/*
   2042 	 * Security data & negotiation policy.
   2043 	 *
   2044 	 * For mirror mounts, we need to preserve the parent mount's
   2045 	 * preference for security negotiation, translating SV4_TRYSECDEFAULT
   2046 	 * to NFSMNT_SECDEFAULT if present.
   2047 	 *
   2048 	 * For referrals, we always want security negotiation and will
   2049 	 * set NFSMNT_SECDEFAULT and we will not copy current secdata.
   2050 	 * The reason is that we can't negotiate down from a parent's
   2051 	 * Kerberos flavor to AUTH_SYS.
   2052 	 *
   2053 	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
   2054 	 * security flavour was requested, with data in sv_secdata, and that
   2055 	 * no negotiation should occur. If this specified flavour fails, that's
   2056 	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
   2057 	 *
   2058 	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
   2059 	 * default flavour, in sv_secdata, but then negotiate a new flavour.
   2060 	 * Possible flavours are recorded in an array in sv_secinfo, with
   2061 	 * currently in-use flavour pointed to by sv_currsec.
   2062 	 *
   2063 	 * If sv_currsec is set, i.e. if negotiation has already occurred,
   2064 	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
   2065 	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
   2066 	 */
   2067 	if (nargs->flags & NFSMNT_REFERRAL) {
   2068 		/* enable negotiation for referral mount */
   2069 		nargs->flags |= NFSMNT_SECDEFAULT;
   2070 		secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP);
   2071 		secdata->secmod = secdata->rpcflavor = AUTH_SYS;
   2072 		secdata->data = NULL;
   2073 	} else if (svp->sv_flags & SV4_TRYSECDEFAULT) {
   2074 		/* enable negotiation for mirror mount */
   2075 		nargs->flags |= NFSMNT_SECDEFAULT;
   2076 
   2077 		/*
   2078 		 * As a starting point for negotiation, copy parent
   2079 		 * mount's negotiated flavour (sv_currsec) if available,
   2080 		 * or its passed-in flavour (sv_secdata) if not.
   2081 		 */
   2082 		if (svp->sv_currsec != NULL)
   2083 			secdata = copy_sec_data(svp->sv_currsec);
   2084 		else if (svp->sv_secdata != NULL)
   2085 			secdata = copy_sec_data(svp->sv_secdata);
   2086 		else
   2087 			secdata = NULL;
   2088 	} else {
   2089 		/* do not enable negotiation; copy parent's passed-in flavour */
   2090 		if (svp->sv_secdata != NULL)
   2091 			secdata = copy_sec_data(svp->sv_secdata);
   2092 		else
   2093 			secdata = NULL;
   2094 	}
   2095 
   2096 	nfs_rw_exit(&svp->sv_lock);
   2097 
   2098 	nargs->flags |= NFSMNT_NEWARGS;
   2099 	nargs->nfs_args_ext = NFS_ARGS_EXTB;
   2100 	nargs->nfs_ext_u.nfs_extB.secdata = secdata;
   2101 
   2102 	/* for NFS RO failover; caller will set if necessary */
   2103 	nargs->nfs_ext_u.nfs_extB.next = NULL;
   2104 
   2105 	return (nargs);
   2106 }
   2107 
   2108 static void
   2109 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
   2110 {
   2111 	/*
   2112 	 * Either the mount failed, in which case the data is not needed, or
   2113 	 * nfs4_mount() has either taken copies of what it needs or,
   2114 	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
   2115 	 * whereby nfs4_free_args() will ignore it.
   2116 	 */
   2117 	nfs4_free_args(nargs);
   2118 	kmem_free(nargs, sizeof (struct nfs_args));
   2119 }
   2120 
   2121 /*
   2122  * When we finally get into the mounting, we need to add this
   2123  * node to the ephemeral tree.
   2124  *
   2125  * This is called from nfs4_mount().
   2126  */
   2127 int
   2128 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
   2129 {
   2130 	mntinfo4_t		*mi_parent;
   2131 	nfs4_ephemeral_t	*eph;
   2132 	nfs4_ephemeral_tree_t	*net;
   2133 
   2134 	nfs4_ephemeral_t	*prior;
   2135 	nfs4_ephemeral_t	*child;
   2136 
   2137 	nfs4_ephemeral_t	*peer;
   2138 
   2139 	nfs4_trigger_globals_t	*ntg;
   2140 	zone_t			*zone = curproc->p_zone;
   2141 
   2142 	int			rc = 0;
   2143 
   2144 	mi_parent = VTOMI4(mvp);
   2145 
   2146 	/*
   2147 	 * Get this before grabbing anything else!
   2148 	 */
   2149 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
   2150 	if (!ntg->ntg_thread_started) {
   2151 		nfs4_ephemeral_start_harvester(ntg);
   2152 	}
   2153 
   2154 	mutex_enter(&mi_parent->mi_lock);
   2155 	mutex_enter(&mi->mi_lock);
   2156 
   2157 	net = mi->mi_ephemeral_tree =
   2158 	    mi_parent->mi_ephemeral_tree;
   2159 
   2160 	/*
   2161 	 * If the mi_ephemeral_tree is NULL, then it
   2162 	 * means that either the harvester or a manual
   2163 	 * umount has cleared the tree out right before
   2164 	 * we got here.
   2165 	 *
   2166 	 * There is nothing we can do here, so return
   2167 	 * to the caller and let them decide whether they
   2168 	 * try again.
   2169 	 */
   2170 	if (net == NULL) {
   2171 		mutex_exit(&mi->mi_lock);
   2172 		mutex_exit(&mi_parent->mi_lock);
   2173 
   2174 		return (EBUSY);
   2175 	}
   2176 
   2177 	/*
   2178 	 * We've just tied the mntinfo to the tree, so
   2179 	 * now we bump the refcnt and hold it there until
   2180 	 * this mntinfo is removed from the tree.
   2181 	 */
   2182 	nfs4_ephemeral_tree_hold(net);
   2183 
   2184 	/*
   2185 	 * We need to tack together the ephemeral mount
   2186 	 * with this new mntinfo.
   2187 	 */
   2188 	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
   2189 	eph->ne_mount = mi;
   2190 	MI4_HOLD(mi);
   2191 	VFS_HOLD(mi->mi_vfsp);
   2192 	eph->ne_ref_time = gethrestime_sec();
   2193 
   2194 	/*
   2195 	 * We need to tell the ephemeral mount when
   2196 	 * to time out.
   2197 	 */
   2198 	eph->ne_mount_to = ntg->ntg_mount_to;
   2199 
   2200 	mi->mi_ephemeral = eph;
   2201 
   2202 	/*
   2203 	 * If the enclosing mntinfo4 is also ephemeral,
   2204 	 * then we need to point to its enclosing parent.
   2205 	 * Else the enclosing mntinfo4 is the enclosing parent.
   2206 	 *
   2207 	 * We also need to weave this ephemeral node
   2208 	 * into the tree.
   2209 	 */
   2210 	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
   2211 		/*
   2212 		 * We need to decide if we are
   2213 		 * the root node of this branch
   2214 		 * or if we are a sibling of this
   2215 		 * branch.
   2216 		 */
   2217 		prior = mi_parent->mi_ephemeral;
   2218 		if (prior == NULL) {
   2219 			/*
   2220 			 * Race condition, clean up, and
   2221 			 * let caller handle mntinfo.
   2222 			 */
   2223 			mi->mi_flags &= ~MI4_EPHEMERAL;
   2224 			mi->mi_ephemeral = NULL;
   2225 			kmem_free(eph, sizeof (*eph));
   2226 			VFS_RELE(mi->mi_vfsp);
   2227 			MI4_RELE(mi);
   2228 			nfs4_ephemeral_tree_rele(net);
   2229 			rc = EBUSY;
   2230 		} else {
   2231 			if (prior->ne_child == NULL) {
   2232 				prior->ne_child = eph;
   2233 			} else {
   2234 				child = prior->ne_child;
   2235 
   2236 				prior->ne_child = eph;
   2237 				eph->ne_peer = child;
   2238 
   2239 				child->ne_prior = eph;
   2240 			}
   2241 
   2242 			eph->ne_prior = prior;
   2243 		}
   2244 	} else {
   2245 		/*
   2246 		 * The parent mntinfo4 is the non-ephemeral
   2247 		 * root of the ephemeral tree. We
   2248 		 * need to decide if we are the root
   2249 		 * node of that tree or if we are a
   2250 		 * sibling of the root node.
   2251 		 *
   2252 		 * We are the root if there is no
   2253 		 * other node.
   2254 		 */
   2255 		if (net->net_root == NULL) {
   2256 			net->net_root = eph;
   2257 		} else {
   2258 			eph->ne_peer = peer = net->net_root;
   2259 			ASSERT(peer != NULL);
   2260 			net->net_root = eph;
   2261 
   2262 			peer->ne_prior = eph;
   2263 		}
   2264 
   2265 		eph->ne_prior = NULL;
   2266 	}
   2267 
   2268 	mutex_exit(&mi->mi_lock);
   2269 	mutex_exit(&mi_parent->mi_lock);
   2270 
   2271 	return (rc);
   2272 }
   2273 
   2274 /*
   2275  * Commit the changes to the ephemeral tree for removing this node.
   2276  */
   2277 static void
   2278 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
   2279 {
   2280 	nfs4_ephemeral_t	*e = eph;
   2281 	nfs4_ephemeral_t	*peer;
   2282 	nfs4_ephemeral_t	*prior;
   2283 
   2284 	peer = eph->ne_peer;
   2285 	prior = e->ne_prior;
   2286 
   2287 	/*
   2288 	 * If this branch root was not the
   2289 	 * tree root, then we need to fix back pointers.
   2290 	 */
   2291 	if (prior) {
   2292 		if (prior->ne_child == e) {
   2293 			prior->ne_child = peer;
   2294 		} else {
   2295 			prior->ne_peer = peer;
   2296 		}
   2297 
   2298 		if (peer)
   2299 			peer->ne_prior = prior;
   2300 	} else if (peer) {
   2301 		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
   2302 		peer->ne_prior = NULL;
   2303 	} else {
   2304 		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
   2305 	}
   2306 }
   2307 
   2308 /*
   2309  * We want to avoid recursion at all costs. So we need to
   2310  * unroll the tree. We do this by a depth first traversal to
   2311  * leaf nodes. We blast away the leaf and work our way back
   2312  * up and down the tree.
   2313  */
   2314 static int
   2315 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
   2316     int isTreeRoot, int flag, cred_t *cr)
   2317 {
   2318 	nfs4_ephemeral_t	*e = eph;
   2319 	nfs4_ephemeral_t	*prior;
   2320 	mntinfo4_t		*mi;
   2321 	vfs_t			*vfsp;
   2322 	int			error;
   2323 
   2324 	/*
   2325 	 * We use the loop while unrolling the ephemeral tree.
   2326 	 */
   2327 	for (;;) {
   2328 		/*
   2329 		 * First we walk down the child.
   2330 		 */
   2331 		if (e->ne_child) {
   2332 			prior = e;
   2333 			e = e->ne_child;
   2334 			continue;
   2335 		}
   2336 
   2337 		/*
   2338 		 * If we are the root of the branch we are removing,
   2339 		 * we end it here. But if the branch is the root of
   2340 		 * the tree, we have to forge on. We do not consider
   2341 		 * the peer list for the root because while it may
   2342 		 * be okay to remove, it is both extra work and a
   2343 		 * potential for a false-positive error to stall the
   2344 		 * unmount attempt.
   2345 		 */
   2346 		if (e == eph && isTreeRoot == FALSE)
   2347 			return (0);
   2348 
   2349 		/*
   2350 		 * Next we walk down the peer list.
   2351 		 */
   2352 		if (e->ne_peer) {
   2353 			prior = e;
   2354 			e = e->ne_peer;
   2355 			continue;
   2356 		}
   2357 
   2358 		/*
   2359 		 * We can only remove the node passed in by the
   2360 		 * caller if it is the root of the ephemeral tree.
   2361 		 * Otherwise, the caller will remove it.
   2362 		 */
   2363 		if (e == eph && isTreeRoot == FALSE)
   2364 			return (0);
   2365 
   2366 		/*
   2367 		 * Okay, we have a leaf node, time
   2368 		 * to prune it!
   2369 		 *
   2370 		 * Note that prior can only be NULL if
   2371 		 * and only if it is the root of the
   2372 		 * ephemeral tree.
   2373 		 */
   2374 		prior = e->ne_prior;
   2375 
   2376 		mi = e->ne_mount;
   2377 		mutex_enter(&mi->mi_lock);
   2378 		vfsp = mi->mi_vfsp;
   2379 		ASSERT(vfsp != NULL);
   2380 
   2381 		/*
   2382 		 * Cleared by umount2_engine.
   2383 		 */
   2384 		VFS_HOLD(vfsp);
   2385 
   2386 		/*
   2387 		 * Inform nfs4_unmount to not recursively
   2388 		 * descend into this node's children when it
   2389 		 * gets processed.
   2390 		 */
   2391 		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
   2392 		mutex_exit(&mi->mi_lock);
   2393 
   2394 		error = umount2_engine(vfsp, flag, cr, FALSE);
   2395 		if (error) {
   2396 			/*
   2397 			 * We need to reenable nfs4_unmount's ability
   2398 			 * to recursively descend on this node.
   2399 			 */
   2400 			mutex_enter(&mi->mi_lock);
   2401 			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
   2402 			mutex_exit(&mi->mi_lock);
   2403 
   2404 			return (error);
   2405 		}
   2406 
   2407 		/*
   2408 		 * If we are the current node, we do not want to
   2409 		 * touch anything else. At this point, the only
   2410 		 * way the current node can have survived to here
   2411 		 * is if it is the root of the ephemeral tree and
   2412 		 * we are unmounting the enclosing mntinfo4.
   2413 		 */
   2414 		if (e == eph) {
   2415 			ASSERT(prior == NULL);
   2416 			return (0);
   2417 		}
   2418 
   2419 		/*
   2420 		 * Stitch up the prior node. Note that since
   2421 		 * we have handled the root of the tree, prior
   2422 		 * must be non-NULL.
   2423 		 */
   2424 		ASSERT(prior != NULL);
   2425 		if (prior->ne_child == e) {
   2426 			prior->ne_child = NULL;
   2427 		} else {
   2428 			ASSERT(prior->ne_peer == e);
   2429 
   2430 			prior->ne_peer = NULL;
   2431 		}
   2432 
   2433 		e = prior;
   2434 	}
   2435 
   2436 	/* NOTREACHED */
   2437 }
   2438 
   2439 /*
   2440  * Common code to safely release net_cnt_lock and net_tree_lock
   2441  */
   2442 void
   2443 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
   2444     nfs4_ephemeral_tree_t **pnet)
   2445 {
   2446 	nfs4_ephemeral_tree_t	*net = *pnet;
   2447 
   2448 	if (*pmust_unlock) {
   2449 		mutex_enter(&net->net_cnt_lock);
   2450 		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
   2451 		mutex_exit(&net->net_cnt_lock);
   2452 
   2453 		mutex_exit(&net->net_tree_lock);
   2454 
   2455 		*pmust_unlock = FALSE;
   2456 	}
   2457 }
   2458 
   2459 /*
   2460  * While we may have removed any child or sibling nodes of this
   2461  * ephemeral node, we can not nuke it until we know that there
   2462  * were no actived vnodes on it. This will do that final
   2463  * work once we know it is not busy.
   2464  */
   2465 void
   2466 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
   2467     nfs4_ephemeral_tree_t **pnet)
   2468 {
   2469 	/*
   2470 	 * Now we need to get rid of the ephemeral data if it exists.
   2471 	 */
   2472 	mutex_enter(&mi->mi_lock);
   2473 	if (mi->mi_ephemeral) {
   2474 		/*
   2475 		 * If we are the root node of an ephemeral branch
   2476 		 * which is being removed, then we need to fixup
   2477 		 * pointers into and out of the node.
   2478 		 */
   2479 		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
   2480 			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
   2481 
   2482 		nfs4_ephemeral_tree_rele(*pnet);
   2483 		ASSERT(mi->mi_ephemeral != NULL);
   2484 
   2485 		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
   2486 		mi->mi_ephemeral = NULL;
   2487 		VFS_RELE(mi->mi_vfsp);
   2488 		MI4_RELE(mi);
   2489 	}
   2490 	mutex_exit(&mi->mi_lock);
   2491 
   2492 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
   2493 }
   2494 
   2495 /*
   2496  * Unmount an ephemeral node.
   2497  *
   2498  * Note that if this code fails, then it must unlock.
   2499  *
   2500  * If it succeeds, then the caller must be prepared to do so.
   2501  */
   2502 int
   2503 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
   2504     bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
   2505 {
   2506 	int			error = 0;
   2507 	nfs4_ephemeral_t	*eph;
   2508 	nfs4_ephemeral_tree_t	*net;
   2509 	int			is_derooting = FALSE;
   2510 	int			is_recursed = FALSE;
   2511 	int			was_locked = FALSE;
   2512 
   2513 	/*
   2514 	 * Make sure to set the default state for cleaning
   2515 	 * up the tree in the caller (and on the way out).
   2516 	 */
   2517 	*pmust_unlock = FALSE;
   2518 
   2519 	/*
   2520 	 * The active vnodes on this file system may be ephemeral
   2521 	 * children. We need to check for and try to unmount them
   2522 	 * here. If any can not be unmounted, we are going
   2523 	 * to return EBUSY.
   2524 	 */
   2525 	mutex_enter(&mi->mi_lock);
   2526 
   2527 	/*
   2528 	 * If an ephemeral tree, we need to check to see if
   2529 	 * the lock is already held. If it is, then we need
   2530 	 * to see if we are being called as a result of
   2531 	 * the recursive removal of some node of the tree or
   2532 	 * if we are another attempt to remove the tree.
   2533 	 *
   2534 	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
   2535 	 * node. mi_ephemeral being non-NULL also does this.
   2536 	 *
   2537 	 * mi_ephemeral_tree being non-NULL is sufficient
   2538 	 * to also indicate either it is an ephemeral node
   2539 	 * or the enclosing mntinfo4.
   2540 	 *
   2541 	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
   2542 	 * when we delete the ephemeral node and need to
   2543 	 * differentiate from an ephemeral node and the
   2544 	 * enclosing root node.
   2545 	 */
   2546 	*pnet = net = mi->mi_ephemeral_tree;
   2547 	if (net == NULL) {
   2548 		mutex_exit(&mi->mi_lock);
   2549 		return (0);
   2550 	}
   2551 
   2552 	eph = mi->mi_ephemeral;
   2553 	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
   2554 	is_derooting = (eph == NULL);
   2555 
   2556 	mutex_enter(&net->net_cnt_lock);
   2557 
   2558 	/*
   2559 	 * If this is not recursion, then we need to
   2560 	 * check to see if a harvester thread has
   2561 	 * already grabbed the lock.
   2562 	 *
   2563 	 * After we exit this branch, we may not
   2564 	 * blindly return, we need to jump to
   2565 	 * is_busy!
   2566 	 */
   2567 	if (!is_recursed) {
   2568 		if (net->net_status &
   2569 		    NFS4_EPHEMERAL_TREE_LOCKED) {
   2570 			/*
   2571 			 * If the tree is locked, we need
   2572 			 * to decide whether we are the
   2573 			 * harvester or some explicit call
   2574 			 * for a umount. The only way that
   2575 			 * we are the harvester is if
   2576 			 * MS_SYSSPACE is set.
   2577 			 *
   2578 			 * We only let the harvester through
   2579 			 * at this point.
   2580 			 *
   2581 			 * We return EBUSY so that the
   2582 			 * caller knows something is
   2583 			 * going on. Note that by that
   2584 			 * time, the umount in the other
   2585 			 * thread may have already occured.
   2586 			 */
   2587 			if (!(flag & MS_SYSSPACE)) {
   2588 				mutex_exit(&net->net_cnt_lock);
   2589 				mutex_exit(&mi->mi_lock);
   2590 
   2591 				return (EBUSY);
   2592 			}
   2593 
   2594 			was_locked = TRUE;
   2595 		}
   2596 	}
   2597 
   2598 	mutex_exit(&net->net_cnt_lock);
   2599 	mutex_exit(&mi->mi_lock);
   2600 
   2601 	/*
   2602 	 * If we are not the harvester, we need to check
   2603 	 * to see if we need to grab the tree lock.
   2604 	 */
   2605 	if (was_locked == FALSE) {
   2606 		/*
   2607 		 * If we grab the lock, it means that no other
   2608 		 * operation is working on the tree. If we don't
   2609 		 * grab it, we need to decide if this is because
   2610 		 * we are a recursive call or a new operation.
   2611 		 */
   2612 		if (mutex_tryenter(&net->net_tree_lock)) {
   2613 			*pmust_unlock = TRUE;
   2614 		} else {
   2615 			/*
   2616 			 * If we are a recursive call, we can
   2617 			 * proceed without the lock.
   2618 			 * Otherwise we have to wait until
   2619 			 * the lock becomes free.
   2620 			 */
   2621 			if (!is_recursed) {
   2622 				mutex_enter(&net->net_cnt_lock);
   2623 				if (net->net_status &
   2624 				    (NFS4_EPHEMERAL_TREE_DEROOTING
   2625 				    | NFS4_EPHEMERAL_TREE_INVALID)) {
   2626 					mutex_exit(&net->net_cnt_lock);
   2627 					goto is_busy;
   2628 				}
   2629 				mutex_exit(&net->net_cnt_lock);
   2630 
   2631 				/*
   2632 				 * We can't hold any other locks whilst
   2633 				 * we wait on this to free up.
   2634 				 */
   2635 				mutex_enter(&net->net_tree_lock);
   2636 
   2637 				/*
   2638 				 * Note that while mi->mi_ephemeral
   2639 				 * may change and thus we have to
   2640 				 * update eph, it is the case that
   2641 				 * we have tied down net and
   2642 				 * do not care if mi->mi_ephemeral_tree
   2643 				 * has changed.
   2644 				 */
   2645 				mutex_enter(&mi->mi_lock);
   2646 				eph = mi->mi_ephemeral;
   2647 				mutex_exit(&mi->mi_lock);
   2648 
   2649 				/*
   2650 				 * Okay, we need to see if either the
   2651 				 * tree got nuked or the current node
   2652 				 * got nuked. Both of which will cause
   2653 				 * an error.
   2654 				 *
   2655 				 * Note that a subsequent retry of the
   2656 				 * umount shall work.
   2657 				 */
   2658 				mutex_enter(&net->net_cnt_lock);
   2659 				if (net->net_status &
   2660 				    NFS4_EPHEMERAL_TREE_INVALID ||
   2661 				    (!is_derooting && eph == NULL)) {
   2662 					mutex_exit(&net->net_cnt_lock);
   2663 					mutex_exit(&net->net_tree_lock);
   2664 					goto is_busy;
   2665 				}
   2666 				mutex_exit(&net->net_cnt_lock);
   2667 				*pmust_unlock = TRUE;
   2668 			}
   2669 		}
   2670 	}
   2671 
   2672 	/*
   2673 	 * Only once we have grabbed the lock can we mark what we
   2674 	 * are planning on doing to the ephemeral tree.
   2675 	 */
   2676 	if (*pmust_unlock) {
   2677 		mutex_enter(&net->net_cnt_lock);
   2678 		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
   2679 
   2680 		/*
   2681 		 * Check to see if we are nuking the root.
   2682 		 */
   2683 		if (is_derooting)
   2684 			net->net_status |=
   2685 			    NFS4_EPHEMERAL_TREE_DEROOTING;
   2686 		mutex_exit(&net->net_cnt_lock);
   2687 	}
   2688 
   2689 	if (!is_derooting) {
   2690 		/*
   2691 		 * Only work on children if the caller has not already
   2692 		 * done so.
   2693 		 */
   2694 		if (!is_recursed) {
   2695 			ASSERT(eph != NULL);
   2696 
   2697 			error = nfs4_ephemeral_unmount_engine(eph,
   2698 			    FALSE, flag, cr);
   2699 			if (error)
   2700 				goto is_busy;
   2701 		}
   2702 	} else {
   2703 		eph = net->net_root;
   2704 
   2705 		/*
   2706 		 * Only work if there is something there.
   2707 		 */
   2708 		if (eph) {
   2709 			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
   2710 			    flag, cr);
   2711 			if (error) {
   2712 				mutex_enter(&net->net_cnt_lock);
   2713 				net->net_status &=
   2714 				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
   2715 				mutex_exit(&net->net_cnt_lock);
   2716 				goto is_busy;
   2717 			}
   2718 
   2719 			/*
   2720 			 * Nothing else which goes wrong will
   2721 			 * invalidate the blowing away of the
   2722 			 * ephmeral tree.
   2723 			 */
   2724 			net->net_root = NULL;
   2725 		}
   2726 
   2727 		/*
   2728 		 * We have derooted and we have caused the tree to be
   2729 		 * invalidated.
   2730 		 */
   2731 		mutex_enter(&net->net_cnt_lock);
   2732 		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
   2733 		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
   2734 		DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
   2735 		    uint_t, net->net_refcnt);
   2736 
   2737 		/*
   2738 		 * We will not finalize this node, so safe to
   2739 		 * release it.
   2740 		 */
   2741 		nfs4_ephemeral_tree_decr(net);
   2742 		mutex_exit(&net->net_cnt_lock);
   2743 
   2744 		if (was_locked == FALSE)
   2745 			mutex_exit(&net->net_tree_lock);
   2746 
   2747 		/*
   2748 		 * We have just blown away any notation of this
   2749 		 * tree being locked or having a refcnt.
   2750 		 * We can't let the caller try to clean things up.
   2751 		 */
   2752 		*pmust_unlock = FALSE;
   2753 
   2754 		/*
   2755 		 * At this point, the tree should no longer be
   2756 		 * associated with the mntinfo4. We need to pull
   2757 		 * it off there and let the harvester take
   2758 		 * care of it once the refcnt drops.
   2759 		 */
   2760 		mutex_enter(&mi->mi_lock);
   2761 		mi->mi_ephemeral_tree = NULL;
   2762 		mutex_exit(&mi->mi_lock);
   2763 	}
   2764 
   2765 	return (0);
   2766 
   2767 is_busy:
   2768 
   2769 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
   2770 
   2771 	return (error);
   2772 }
   2773 
   2774 /*
   2775  * Do the umount and record any error in the parent.
   2776  */
   2777 static void
   2778 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
   2779     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
   2780 {
   2781 	int	error;
   2782 
   2783 	/*
   2784 	 * Only act on if the fs is still mounted.
   2785 	 */
   2786 	if (vfsp == NULL)
   2787 		return;
   2788 
   2789 	error = umount2_engine(vfsp, flag, kcred, FALSE);
   2790 	if (error) {
   2791 		if (prior) {
   2792 			if (prior->ne_child == e)
   2793 				prior->ne_state |=
   2794 				    NFS4_EPHEMERAL_CHILD_ERROR;
   2795 			else
   2796 				prior->ne_state |=
   2797 				    NFS4_EPHEMERAL_PEER_ERROR;
   2798 		}
   2799 	}
   2800 }
   2801 
   2802 /*
   2803  * For each tree in the forest (where the forest is in
   2804  * effect all of the ephemeral trees for this zone),
   2805  * scan to see if a node can be unmounted. Note that
   2806  * unlike nfs4_ephemeral_unmount_engine(), we do
   2807  * not process the current node before children or
   2808  * siblings. I.e., if a node can be unmounted, we
   2809  * do not recursively check to see if the nodes
   2810  * hanging off of it can also be unmounted.
   2811  *
   2812  * Instead, we delve down deep to try and remove the
   2813  * children first. Then, because we share code with
   2814  * nfs4_ephemeral_unmount_engine(), we will try
   2815  * them again. This could be a performance issue in
   2816  * the future.
   2817  *
   2818  * Also note that unlike nfs4_ephemeral_unmount_engine(),
   2819  * we do not halt on an error. We will not remove the
   2820  * current node, but we will keep on trying to remove
   2821  * the others.
   2822  *
   2823  * force indicates that we want the unmount to occur
   2824  * even if there is something blocking it.
   2825  *
   2826  * time_check indicates that we want to see if the
   2827  * mount has expired past mount_to or not. Typically
   2828  * we want to do this and only on a shutdown of the
   2829  * zone would we want to ignore the check.
   2830  */
   2831 static void
   2832 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
   2833     bool_t force, bool_t time_check)
   2834 {
   2835 	nfs4_ephemeral_tree_t	*net;
   2836 	nfs4_ephemeral_tree_t	*prev = NULL;
   2837 	nfs4_ephemeral_tree_t	*next;
   2838 	nfs4_ephemeral_t	*e;
   2839 	nfs4_ephemeral_t	*prior;
   2840 	time_t			now = gethrestime_sec();
   2841 
   2842 	nfs4_ephemeral_tree_t	*harvest = NULL;
   2843 
   2844 	int			flag;
   2845 
   2846 	mntinfo4_t		*mi;
   2847 	vfs_t			*vfsp;
   2848 
   2849 	if (force)
   2850 		flag = MS_FORCE | MS_SYSSPACE;
   2851 	else
   2852 		flag = MS_SYSSPACE;
   2853 
   2854 	mutex_enter(&ntg->ntg_forest_lock);
   2855 	for (net = ntg->ntg_forest; net != NULL; net = next) {
   2856 		next = net->net_next;
   2857 
   2858 		nfs4_ephemeral_tree_hold(net);
   2859 
   2860 		mutex_enter(&net->net_tree_lock);
   2861 
   2862 		/*
   2863 		 * Let the unmount code know that the
   2864 		 * tree is already locked!
   2865 		 */
   2866 		mutex_enter(&net->net_cnt_lock);
   2867 		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
   2868 		mutex_exit(&net->net_cnt_lock);
   2869 
   2870 		/*
   2871 		 * If the intent is force all ephemeral nodes to
   2872 		 * be unmounted in this zone, we can short circuit a
   2873 		 * lot of tree traversal and simply zap the root node.
   2874 		 */
   2875 		if (force) {
   2876 			if (net->net_root) {
   2877 				mi = net->net_root->ne_mount;
   2878 
   2879 				vfsp = mi->mi_vfsp;
   2880 				ASSERT(vfsp != NULL);
   2881 
   2882 				/*
   2883 				 * Cleared by umount2_engine.
   2884 				 */
   2885 				VFS_HOLD(vfsp);
   2886 
   2887 				(void) umount2_engine(vfsp, flag,
   2888 				    kcred, FALSE);
   2889 
   2890 				goto check_done;
   2891 			}
   2892 		}
   2893 
   2894 		e = net->net_root;
   2895 		if (e)
   2896 			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
   2897 
   2898 		while (e) {
   2899 			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
   2900 				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
   2901 				if (e->ne_child) {
   2902 					e = e->ne_child;
   2903 					e->ne_state =
   2904 					    NFS4_EPHEMERAL_VISIT_CHILD;
   2905 				}
   2906 
   2907 				continue;
   2908 			} else if (e->ne_state ==
   2909 			    NFS4_EPHEMERAL_VISIT_SIBLING) {
   2910 				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
   2911 				if (e->ne_peer) {
   2912 					e = e->ne_peer;
   2913 					e->ne_state =
   2914 					    NFS4_EPHEMERAL_VISIT_CHILD;
   2915 				}
   2916 
   2917 				continue;
   2918 			} else if (e->ne_state ==
   2919 			    NFS4_EPHEMERAL_CHILD_ERROR) {
   2920 				prior = e->ne_prior;
   2921 
   2922 				/*
   2923 				 * If a child reported an error, do
   2924 				 * not bother trying to unmount.
   2925 				 *
   2926 				 * If your prior node is a parent,
   2927 				 * pass the error up such that they
   2928 				 * also do not try to unmount.
   2929 				 *
   2930 				 * However, if your prior is a sibling,
   2931 				 * let them try to unmount if they can.
   2932 				 */
   2933 				if (prior) {
   2934 					if (prior->ne_child == e)
   2935 						prior->ne_state |=
   2936 						    NFS4_EPHEMERAL_CHILD_ERROR;
   2937 					else
   2938 						prior->ne_state |=
   2939 						    NFS4_EPHEMERAL_PEER_ERROR;
   2940 				}
   2941 
   2942 				/*
   2943 				 * Clear the error and if needed, process peers.
   2944 				 *
   2945 				 * Once we mask out the error, we know whether
   2946 				 * or we have to process another node.
   2947 				 */
   2948 				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
   2949 				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
   2950 					e = prior;
   2951 
   2952 				continue;
   2953 			} else if (e->ne_state ==
   2954 			    NFS4_EPHEMERAL_PEER_ERROR) {
   2955 				prior = e->ne_prior;
   2956 
   2957 				if (prior) {
   2958 					if (prior->ne_child == e)
   2959 						prior->ne_state =
   2960 						    NFS4_EPHEMERAL_CHILD_ERROR;
   2961 					else
   2962 						prior->ne_state =
   2963 						    NFS4_EPHEMERAL_PEER_ERROR;
   2964 				}
   2965 
   2966 				/*
   2967 				 * Clear the error from this node and do the
   2968 				 * correct processing.
   2969 				 */
   2970 				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
   2971 				continue;
   2972 			}
   2973 
   2974 			prior = e->ne_prior;
   2975 			e->ne_state = NFS4_EPHEMERAL_OK;
   2976 
   2977 			/*
   2978 			 * It must be the case that we need to process
   2979 			 * this node.
   2980 			 */
   2981 			if (!time_check ||
   2982 			    now - e->ne_ref_time > e->ne_mount_to) {
   2983 				mi = e->ne_mount;
   2984 				vfsp = mi->mi_vfsp;
   2985 
   2986 				/*
   2987 				 * Cleared by umount2_engine.
   2988 				 */
   2989 				if (vfsp != NULL)
   2990 					VFS_HOLD(vfsp);
   2991 
   2992 				/*
   2993 				 * Note that we effectively work down to the
   2994 				 * leaf nodes first, try to unmount them,
   2995 				 * then work our way back up into the leaf
   2996 				 * nodes.
   2997 				 *
   2998 				 * Also note that we deal with a lot of
   2999 				 * complexity by sharing the work with
   3000 				 * the manual unmount code.
   3001 				 */
   3002 				nfs4_ephemeral_record_umount(vfsp, flag,
   3003 				    e, prior);
   3004 			}
   3005 
   3006 			e = prior;
   3007 		}
   3008 
   3009 check_done:
   3010 
   3011 		/*
   3012 		 * At this point we are done processing this tree.
   3013 		 *
   3014 		 * If the tree is invalid and we were the only reference
   3015 		 * to it, then we push it on the local linked list
   3016 		 * to remove it at the end. We avoid that action now
   3017 		 * to keep the tree processing going along at a fair clip.
   3018 		 *
   3019 		 * Else, even if we were the only reference, we
   3020 		 * allow it to be reused as needed.
   3021 		 */
   3022 		mutex_enter(&net->net_cnt_lock);
   3023 		nfs4_ephemeral_tree_decr(net);
   3024 		if (net->net_refcnt == 0 &&
   3025 		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
   3026 			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
   3027 			mutex_exit(&net->net_cnt_lock);
   3028 			mutex_exit(&net->net_tree_lock);
   3029 
   3030 			if (prev)
   3031 				prev->net_next = net->net_next;
   3032 			else
   3033 				ntg->ntg_forest = net->net_next;
   3034 
   3035 			net->net_next = harvest;
   3036 			harvest = net;
   3037 
   3038 			VFS_RELE(net->net_mount->mi_vfsp);
   3039 			MI4_RELE(net->net_mount);
   3040 
   3041 			continue;
   3042 		}
   3043 
   3044 		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
   3045 		mutex_exit(&net->net_cnt_lock);
   3046 		mutex_exit(&net->net_tree_lock);
   3047 
   3048 		prev = net;
   3049 	}
   3050 	mutex_exit(&ntg->ntg_forest_lock);
   3051 
   3052 	for (net = harvest; net != NULL; net = next) {
   3053 		next = net->net_next;
   3054 
   3055 		mutex_destroy(&net->net_tree_lock);
   3056 		mutex_destroy(&net->net_cnt_lock);
   3057 		kmem_free(net, sizeof (*net));
   3058 	}
   3059 }
   3060 
   3061 /*
   3062  * This is the thread which decides when the harvesting
   3063  * can proceed and when to kill it off for this zone.
   3064  */
   3065 static void
   3066 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
   3067 {
   3068 	clock_t		timeleft;
   3069 	zone_t		*zone = curproc->p_zone;
   3070 
   3071 	for (;;) {
   3072 		timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
   3073 		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
   3074 
   3075 		/*
   3076 		 * zone is exiting...
   3077 		 */
   3078 		if (timeleft != -1) {
   3079 			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
   3080 			zthread_exit();
   3081 			/* NOTREACHED */
   3082 		}
   3083 
   3084 		/*
   3085 		 * Only bother scanning if there is potential
   3086 		 * work to be done.
   3087 		 */
   3088 		if (ntg->ntg_forest == NULL)
   3089 			continue;
   3090 
   3091 		/*
   3092 		 * Now scan the list and get rid of everything which
   3093 		 * is old.
   3094 		 */
   3095 		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
   3096 	}
   3097 
   3098 	/* NOTREACHED */
   3099 }
   3100 
   3101 /*
   3102  * The zone specific glue needed to start the unmount harvester.
   3103  *
   3104  * Note that we want to avoid holding the mutex as long as possible,
   3105  * hence the multiple checks.
   3106  *
   3107  * The caller should avoid us getting down here in the first
   3108  * place.
   3109  */
   3110 static void
   3111 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
   3112 {
   3113 	/*
   3114 	 * It got started before we got here...
   3115 	 */
   3116 	if (ntg->ntg_thread_started)
   3117 		return;
   3118 
   3119 	mutex_enter(&nfs4_ephemeral_thread_lock);
   3120 
   3121 	if (ntg->ntg_thread_started) {
   3122 		mutex_exit(&nfs4_ephemeral_thread_lock);
   3123 		return;
   3124 	}
   3125 
   3126 	/*
   3127 	 * Start the unmounter harvester thread for this zone.
   3128 	 */
   3129 	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
   3130 	    ntg, 0, minclsyspri);
   3131 
   3132 	ntg->ntg_thread_started = TRUE;
   3133 	mutex_exit(&nfs4_ephemeral_thread_lock);
   3134 }
   3135 
   3136 /*ARGSUSED*/
   3137 static void *
   3138 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
   3139 {
   3140 	nfs4_trigger_globals_t	*ntg;
   3141 
   3142 	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
   3143 	ntg->ntg_thread_started = FALSE;
   3144 
   3145 	/*
   3146 	 * This is the default....
   3147 	 */
   3148 	ntg->ntg_mount_to = nfs4_trigger_thread_timer;
   3149 
   3150 	mutex_init(&ntg->ntg_forest_lock, NULL,
   3151 	    MUTEX_DEFAULT, NULL);
   3152 
   3153 	return (ntg);
   3154 }
   3155 
   3156 /*
   3157  * Try a nice gentle walk down the forest and convince
   3158  * all of the trees to gracefully give it up.
   3159  */
   3160 /*ARGSUSED*/
   3161 static void
   3162 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
   3163 {
   3164 	nfs4_trigger_globals_t	*ntg = arg;
   3165 
   3166 	if (!ntg)
   3167 		return;
   3168 
   3169 	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
   3170 }
   3171 
   3172 /*
   3173  * Race along the forest and rip all of the trees out by
   3174  * their rootballs!
   3175  */
   3176 /*ARGSUSED*/
   3177 static void
   3178 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
   3179 {
   3180 	nfs4_trigger_globals_t	*ntg = arg;
   3181 
   3182 	if (!ntg)
   3183 		return;
   3184 
   3185 	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
   3186 
   3187 	mutex_destroy(&ntg->ntg_forest_lock);
   3188 	kmem_free(ntg, sizeof (*ntg));
   3189 }
   3190 
   3191 /*
   3192  * This is the zone independent cleanup needed for
   3193  * emphemeral mount processing.
   3194  */
   3195 void
   3196 nfs4_ephemeral_fini(void)
   3197 {
   3198 	(void) zone_key_delete(nfs4_ephemeral_key);
   3199 	mutex_destroy(&nfs4_ephemeral_thread_lock);
   3200 }
   3201 
   3202 /*
   3203  * This is the zone independent initialization needed for
   3204  * emphemeral mount processing.
   3205  */
   3206 void
   3207 nfs4_ephemeral_init(void)
   3208 {
   3209 	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
   3210 	    NULL);
   3211 
   3212 	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
   3213 	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
   3214 }
   3215 
   3216 /*
   3217  * nfssys() calls this function to set the per-zone
   3218  * value of mount_to to drive when an ephemeral mount is
   3219  * timed out. Each mount will grab a copy of this value
   3220  * when mounted.
   3221  */
   3222 void
   3223 nfs4_ephemeral_set_mount_to(uint_t mount_to)
   3224 {
   3225 	nfs4_trigger_globals_t	*ntg;
   3226 	zone_t			*zone = curproc->p_zone;
   3227 
   3228 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
   3229 
   3230 	ntg->ntg_mount_to = mount_to;
   3231 }
   3232 
   3233 /*
   3234  * Walk the list of v4 mount options; if they are currently set in vfsp,
   3235  * append them to a new comma-separated mount option string, and return it.
   3236  *
   3237  * Caller should free by calling nfs4_trigger_destroy_mntopts().
   3238  */
   3239 static char *
   3240 nfs4_trigger_create_mntopts(vfs_t *vfsp)
   3241 {
   3242 	uint_t i;
   3243 	char *mntopts;
   3244 	struct vfssw *vswp;
   3245 	mntopts_t *optproto;
   3246 
   3247 	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
   3248 
   3249 	/* get the list of applicable mount options for v4; locks *vswp */
   3250 	vswp = vfs_getvfssw(MNTTYPE_NFS4);
   3251 	optproto = &vswp->vsw_optproto;
   3252 
   3253 	for (i = 0; i < optproto->mo_count; i++) {
   3254 		struct mntopt *mop = &optproto->mo_list[i];
   3255 
   3256 		if (mop->mo_flags & MO_EMPTY)
   3257 			continue;
   3258 
   3259 		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
   3260 			kmem_free(mntopts, MAX_MNTOPT_STR);
   3261 			vfs_unrefvfssw(vswp);
   3262 			return (NULL);
   3263 		}
   3264 	}
   3265 
   3266 	vfs_unrefvfssw(vswp);
   3267 
   3268 	/*
   3269 	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
   3270 	 * and it may only be passed via MS_OPTIONSTR, so we
   3271 	 * must handle it here.
   3272 	 *
   3273 	 * Ideally, it would be in the list, but NFS does not specify its
   3274 	 * own opt proto list, it uses instead the default one. Since
   3275 	 * not all filesystems support extended attrs, it would not be
   3276 	 * appropriate to add it there.
   3277 	 */
   3278 	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
   3279 	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
   3280 		kmem_free(mntopts, MAX_MNTOPT_STR);
   3281 		return (NULL);
   3282 	}
   3283 
   3284 	return (mntopts);
   3285 }
   3286 
   3287 static void
   3288 nfs4_trigger_destroy_mntopts(char *mntopts)
   3289 {
   3290 	if (mntopts)
   3291 		kmem_free(mntopts, MAX_MNTOPT_STR);
   3292 }
   3293 
   3294 /*
   3295  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
   3296  */
   3297 static int
   3298 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
   3299 {
   3300 	if (mntopts == NULL || optname == NULL || vfsp == NULL)
   3301 		return (EINVAL);
   3302 
   3303 	if (vfs_optionisset(vfsp, optname, NULL)) {
   3304 		size_t mntoptslen = strlen(mntopts);
   3305 		size_t optnamelen = strlen(optname);
   3306 
   3307 		/* +1 for ',', +1 for NUL */
   3308 		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
   3309 			return (EOVERFLOW);
   3310 
   3311 		/* first or subsequent mount option? */
   3312 		if (*mntopts != '\0')
   3313 			(void) strcat(mntopts, ",");
   3314 
   3315 		(void) strcat(mntopts, optname);
   3316 	}
   3317 
   3318 	return (0);
   3319 }
   3320 
   3321 static enum clnt_stat
   3322 nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr, int nointr)
   3323 {
   3324 	int retries;
   3325 	uint_t max_msgsize;
   3326 	enum clnt_stat status;
   3327 	CLIENT *cl;
   3328 	struct timeval timeout;
   3329 
   3330 	/* as per recov_newserver() */
   3331 	max_msgsize = 0;
   3332 	retries = 1;
   3333 	timeout.tv_sec = 2;
   3334 	timeout.tv_usec = 0;
   3335 
   3336 	if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
   3337 	    max_msgsize, retries, CRED(), &cl) != 0)
   3338 		return (RPC_FAILED);
   3339 
   3340 	if (nointr)
   3341 		cl->cl_nosignal = TRUE;
   3342 	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
   3343 	    timeout);
   3344 	if (nointr)
   3345 		cl->cl_nosignal = FALSE;
   3346 
   3347 	AUTH_DESTROY(cl->cl_auth);
   3348 	CLNT_DESTROY(cl);
   3349 
   3350 	return (status);
   3351 }
   3352 
   3353 static enum clnt_stat
   3354 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
   3355 {
   3356 	return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr, nointr));
   3357 }
   3358