Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
     29  * triggered from a "stub" rnode via a special set of vnodeops.
     30  */
     31 
     32 #include <sys/param.h>
     33 #include <sys/types.h>
     34 #include <sys/systm.h>
     35 #include <sys/cred.h>
     36 #include <sys/time.h>
     37 #include <sys/vnode.h>
     38 #include <sys/vfs.h>
     39 #include <sys/vfs_opreg.h>
     40 #include <sys/file.h>
     41 #include <sys/filio.h>
     42 #include <sys/uio.h>
     43 #include <sys/buf.h>
     44 #include <sys/mman.h>
     45 #include <sys/pathname.h>
     46 #include <sys/dirent.h>
     47 #include <sys/debug.h>
     48 #include <sys/vmsystm.h>
     49 #include <sys/fcntl.h>
     50 #include <sys/flock.h>
     51 #include <sys/swap.h>
     52 #include <sys/errno.h>
     53 #include <sys/strsubr.h>
     54 #include <sys/sysmacros.h>
     55 #include <sys/kmem.h>
     56 #include <sys/mount.h>
     57 #include <sys/cmn_err.h>
     58 #include <sys/pathconf.h>
     59 #include <sys/utsname.h>
     60 #include <sys/dnlc.h>
     61 #include <sys/acl.h>
     62 #include <sys/systeminfo.h>
     63 #include <sys/policy.h>
     64 #include <sys/sdt.h>
     65 #include <sys/list.h>
     66 #include <sys/stat.h>
     67 #include <sys/mntent.h>
     68 
     69 #include <rpc/types.h>
     70 #include <rpc/auth.h>
     71 #include <rpc/clnt.h>
     72 
     73 #include <nfs/nfs.h>
     74 #include <nfs/nfs_clnt.h>
     75 #include <nfs/nfs_acl.h>
     76 #include <nfs/lm.h>
     77 #include <nfs/nfs4.h>
     78 #include <nfs/nfs4_kprot.h>
     79 #include <nfs/rnode4.h>
     80 #include <nfs/nfs4_clnt.h>
     81 
     82 #include <vm/hat.h>
     83 #include <vm/as.h>
     84 #include <vm/page.h>
     85 #include <vm/pvn.h>
     86 #include <vm/seg.h>
     87 #include <vm/seg_map.h>
     88 #include <vm/seg_kpm.h>
     89 #include <vm/seg_vn.h>
     90 
     91 #include <fs/fs_subr.h>
     92 
     93 #include <sys/ddi.h>
     94 #include <sys/int_fmtio.h>
     95 
     96 #include <sys/sunddi.h>
     97 
     98 #include <sys/priv_names.h>
     99 
/*
 * The automatic unmounter thread stuff!
 *
 * NOTE(review): presumably the interval used by the harvester thread; the
 * consumer of this tunable is not visible in this part of the file.
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */

/*
 * Just a default....
 *
 * NOTE(review): presumably the default ephemeral mount timeout that seeds
 * ntg_mount_to; units look like seconds -- confirm against the zone
 * initialisation code.
 */
static uint_t nfs4_trigger_mount_to = 240;
    109 
/*
 * Ephemeral mount state; an instance is retrieved with
 * zone_getspecific(nfs4_ephemeral_key, zone).
 */
typedef struct nfs4_trigger_globals {
	/* held while ntg_forest is being modified */
	kmutex_t		ntg_forest_lock;
	/* NOTE(review): presumably the mount timeout; not used in view */
	uint_t			ntg_mount_to;
	/* NOTE(review): presumably "harvester is running"; not used in view */
	int			ntg_thread_started;
	/* head of the list of ephemeral trees, linked through net_next */
	nfs4_ephemeral_tree_t	*ntg_forest;
} nfs4_trigger_globals_t;
    116 
/* NOTE(review): users of this lock are not visible in this part of the file */
kmutex_t	nfs4_ephemeral_thread_lock;

/* zone key under which each zone's nfs4_trigger_globals_t is stored */
zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;

/* forward declaration; the definition is further down the file */
static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
    122 
    123 /*
    124  * Used for ephemeral mounts; contains data either duplicated from
    125  * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
    126  *
    127  * It's intended that this structure is used solely for ephemeral
    128  * mount-type specific data, for passing this data to
    129  * nfs4_trigger_nargs_create().
    130  */
typedef struct ephemeral_servinfo {
	/* server name; this is what gets copied into the mount's hostlist */
	char			*esi_hostname;
	char			*esi_netname;
	char			*esi_path;
	/* NOTE(review): presumably the allocated size of esi_path -- confirm */
	int			esi_path_len;
	int			esi_mount_flags;
	struct netbuf		*esi_addr;
	struct netbuf		*esi_syncaddr;
	struct knetconfig	*esi_knconf;
} ephemeral_servinfo_t;
    141 
    142 /*
    143  * Collect together the mount-type specific and generic data args.
    144  */
typedef struct domount_args {
	/* mount-type specific data for the one server actually contacted */
	ephemeral_servinfo_t	*dma_esi;
	char			*dma_hostlist; /* comma-sep. for RO failover */
	/* head of the nfs_args list; one element per usable server */
	struct nfs_args		*dma_nargs;
} domount_args_t;
    150 
    151 
/*
 * The vnode ops functions for a trigger stub vnode.
 *
 * These are forward declarations for the handlers installed through
 * nfs4_trigger_vnodeops_template[]. nfs4_trigger_cmp() is declared here
 * as well, but it is neither defined nor referenced in the part of the
 * file covered by this review.
 */
static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
    caller_context_t *);
static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
    struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
    int *, pathname_t *);
static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
    enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
    vsecattr_t *);
static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
    int);
static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
    vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
    183 
/*
 * Regular NFSv4 vnodeops that we need to reference directly.
 *
 * They are either installed as-is in nfs4_trigger_vnodeops_template[] or
 * called by name from a trigger op (nfs4_getattr() and nfs4_lookup()),
 * because going through VOP_*() on a stub vnode would simply re-enter the
 * trigger op.
 */
extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
		    caller_context_t *);
extern void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
		    struct pathname *, int, vnode_t *, cred_t *,
		    caller_context_t *, int *, pathname_t *);
extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
		    caller_context_t *);
extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
		    caller_context_t *);
extern int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
extern int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
    201 
/*
 * Forward declarations for the internal helpers that detect, assemble the
 * arguments for, and perform an ephemeral mount.
 */
static int	nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
    cred_t *, vnode_t **);
static domount_args_t  *nfs4_trigger_domount_args_create(vnode_t *);
static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
    vnode_t *vp);
static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *);
static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
    servinfo4_t *);
static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
    ephemeral_servinfo_t *);
static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
static char	*nfs4_trigger_create_mntopts(vfs_t *);
static void	nfs4_trigger_destroy_mntopts(char *);
static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);

/* Defined outside this file. */
extern int	umount2_engine(vfs_t *, int, cred_t *, int);
    221 
    222 
/*
 * Vnode operations vector for trigger stub vnodes.
 * NOTE(review): presumably built from nfs4_trigger_vnodeops_template[] at
 * initialisation time; the code that sets it is not visible here.
 */
vnodeops_t *nfs4_trigger_vnodeops;
    224 
    225 /*
    226  * These are the vnodeops that we must define for stub vnodes.
    227  *
    228  *
    229  * Many of the VOPs defined for NFSv4 do not need to be defined here,
    230  * for various reasons. This will result in the VFS default function being
    231  * used:
    232  *
    233  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
    234  *   lost the reference to the stub vnode, meaning these should not be called:
    235  *       close, read, write, ioctl, readdir, seek.
    236  *
    237  * - These VOPs are meaningless for vnodes without data pages. Since the
    238  *   stub vnode is of type VDIR, these should not be called:
    239  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
    240  *
    241  * - These VOPs are otherwise not applicable, and should not be called:
    242  *       dump, setsecattr.
    243  *
    244  *
    245  * These VOPs we do not want to define, but nor do we want the VFS default
    246  * action. Instead, we specify the VFS error function, with fs_error(), but
    247  * note that fs_error() is not actually called. Instead it results in the
    248  * use of the error function defined for the particular VOP, in vn_ops_table[]:
    249  *
    250  * -   frlock, dispose, shrlock.
    251  *
    252  *
    253  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
    254  * NOTE: if any of these ops involve an OTW call with the stub FH, then
    255  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
    256  * to protect the security data in the servinfo4_t for the "parent"
    257  * filesystem that contains the stub.
    258  *
    259  * - These VOPs should not trigger a mount, so that "ls -l" does not:
    260  *       pathconf, getsecattr.
    261  *
    262  * - These VOPs would not make sense to trigger:
    263  *       inactive, rwlock, rwunlock, fid, realvp.
    264  */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	/* ops handled by the nfs4_trigger_*() wrappers in this file */
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	/* ops that use the regular NFSv4 vnodeop directly */
	VOPNAME_INACTIVE, 	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	/* ops for which the VFS error function is requested */
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	/* vnevent goes to the generic fs_vnevent_support() */
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	/* end of table */
	NULL, NULL
};
    292 
    293 static void
    294 nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
    295 {
    296 	ASSERT(mutex_owned(&net->net_cnt_lock));
    297 	net->net_refcnt++;
    298 	ASSERT(net->net_refcnt != 0);
    299 }
    300 
    301 static void
    302 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
    303 {
    304 	mutex_enter(&net->net_cnt_lock);
    305 	nfs4_ephemeral_tree_incr(net);
    306 	mutex_exit(&net->net_cnt_lock);
    307 }
    308 
    309 /*
    310  * We need a safe way to decrement the refcnt whilst the
    311  * lock is being held.
    312  */
    313 static void
    314 nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
    315 {
    316 	ASSERT(mutex_owned(&net->net_cnt_lock));
    317 	ASSERT(net->net_refcnt != 0);
    318 	net->net_refcnt--;
    319 }
    320 
    321 static void
    322 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
    323 {
    324 	mutex_enter(&net->net_cnt_lock);
    325 	nfs4_ephemeral_tree_decr(net);
    326 	mutex_exit(&net->net_cnt_lock);
    327 }
    328 
    329 /*
    330  * Trigger ops for stub vnodes; for mirror mounts, etc.
    331  *
    332  * The general idea is that a "triggering" op will first call
    333  * nfs4_trigger_mount(), which will find out whether a mount has already
    334  * been triggered.
    335  *
    336  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
    337  * of the covering vfs.
    338  *
    339  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
    340  * and again set newvp, as above.
    341  *
    342  * The triggering op may then re-issue the VOP by calling it on newvp.
    343  *
    344  * Note that some ops may perform custom action, and may or may not need
    345  * to trigger a mount.
    346  *
    347  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
    348  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
    349  * and that would just recurse. Instead, we call the v4 op directly,
    350  * by name.  This is OK, since we know that the vnode is for NFSv4,
    351  * otherwise it couldn't be a stub.
    352  *
    353  */
    354 
    355 static int
    356 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
    357 {
    358 	int error;
    359 	vnode_t *newvp;
    360 
    361 	error = nfs4_trigger_mount(*vpp, cr, &newvp);
    362 	if (error)
    363 		return (error);
    364 
    365 	/* Release the stub vnode, as we're losing the reference to it */
    366 	VN_RELE(*vpp);
    367 
    368 	/* Give the caller the root vnode of the newly-mounted fs */
    369 	*vpp = newvp;
    370 
    371 	/* return with VN_HELD(newvp) */
    372 	return (VOP_OPEN(vpp, flag, cr, ct));
    373 }
    374 
    375 /*
    376  * For the majority of cases, nfs4_trigger_getattr() will not trigger
    377  * a mount. However, if ATTR_TRIGGER is set, we are being informed
    378  * that we need to force the mount before we attempt to determine
    379  * the attributes. The intent is an atomic operation for security
    380  * testing.
    381  */
    382 static int
    383 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    384     caller_context_t *ct)
    385 {
    386 	int error;
    387 
    388 	if (flags & ATTR_TRIGGER) {
    389 		vnode_t	*newvp;
    390 
    391 		error = nfs4_trigger_mount(vp, cr, &newvp);
    392 		if (error)
    393 			return (error);
    394 
    395 		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
    396 		VN_RELE(newvp);
    397 	} else {
    398 		error = nfs4_getattr(vp, vap, flags, cr, ct);
    399 	}
    400 
    401 	return (error);
    402 }
    403 
    404 static int
    405 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    406 		caller_context_t *ct)
    407 {
    408 	int error;
    409 	vnode_t *newvp;
    410 
    411 	error = nfs4_trigger_mount(vp, cr, &newvp);
    412 	if (error)
    413 		return (error);
    414 
    415 	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
    416 	VN_RELE(newvp);
    417 
    418 	return (error);
    419 }
    420 
    421 static int
    422 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    423     caller_context_t *ct)
    424 {
    425 	int error;
    426 	vnode_t *newvp;
    427 
    428 	error = nfs4_trigger_mount(vp, cr, &newvp);
    429 	if (error)
    430 		return (error);
    431 
    432 	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
    433 	VN_RELE(newvp);
    434 
    435 	return (error);
    436 }
    437 
    438 static int
    439 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
    440     struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
    441     caller_context_t *ct, int *deflags, pathname_t *rpnp)
    442 {
    443 	int error;
    444 	vnode_t *newdvp;
    445 	rnode4_t *drp = VTOR4(dvp);
    446 
    447 	ASSERT(RP_ISSTUB(drp));
    448 
    449 	/* for now, we only support mirror-mounts */
    450 	ASSERT(RP_ISSTUB_MIRRORMOUNT(drp));
    451 
    452 	/*
    453 	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
    454 	 * that up. Instead, pass onto the regular op, regardless of whether
    455 	 * we've triggered a mount.
    456 	 */
    457 	if (strcmp(nm, "..") == 0)
    458 		return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
    459 		    ct, deflags, rpnp));
    460 
    461 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    462 	if (error)
    463 		return (error);
    464 
    465 	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
    466 	    deflags, rpnp);
    467 	VN_RELE(newdvp);
    468 
    469 	return (error);
    470 }
    471 
    472 static int
    473 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
    474     enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
    475     int flags, caller_context_t *ct, vsecattr_t *vsecp)
    476 {
    477 	int error;
    478 	vnode_t *newdvp;
    479 
    480 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    481 	if (error)
    482 		return (error);
    483 
    484 	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
    485 	    flags, ct, vsecp);
    486 	VN_RELE(newdvp);
    487 
    488 	return (error);
    489 }
    490 
    491 static int
    492 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
    493     int flags)
    494 {
    495 	int error;
    496 	vnode_t *newdvp;
    497 
    498 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    499 	if (error)
    500 		return (error);
    501 
    502 	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
    503 	VN_RELE(newdvp);
    504 
    505 	return (error);
    506 }
    507 
    508 static int
    509 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
    510     caller_context_t *ct, int flags)
    511 {
    512 	int error;
    513 	vnode_t *newtdvp;
    514 
    515 	error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
    516 	if (error)
    517 		return (error);
    518 
    519 	/*
    520 	 * We don't check whether svp is a stub. Let the NFSv4 code
    521 	 * detect that error, and return accordingly.
    522 	 */
    523 	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
    524 	VN_RELE(newtdvp);
    525 
    526 	return (error);
    527 }
    528 
    529 static int
    530 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    531     cred_t *cr, caller_context_t *ct, int flags)
    532 {
    533 	int error;
    534 	vnode_t *newsdvp;
    535 	rnode4_t *tdrp = VTOR4(tdvp);
    536 
    537 	/*
    538 	 * We know that sdvp is a stub, otherwise we would not be here.
    539 	 *
    540 	 * If tdvp is also be a stub, there are two possibilities: it
    541 	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
    542 	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
    543 	 *
    544 	 * In the former case, just trigger sdvp, and treat tdvp as
    545 	 * though it were not a stub.
    546 	 *
    547 	 * In the latter case, it might be a different stub for the
    548 	 * same server fs as sdvp, or for a different server fs.
    549 	 * Regardless, from the client perspective this would still
    550 	 * be a cross-filesystem rename, and should not be allowed,
    551 	 * so return EXDEV, without triggering either mount.
    552 	 */
    553 	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
    554 		return (EXDEV);
    555 
    556 	error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
    557 	if (error)
    558 		return (error);
    559 
    560 	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
    561 
    562 	VN_RELE(newsdvp);
    563 
    564 	return (error);
    565 }
    566 
    567 /* ARGSUSED */
    568 static int
    569 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
    570     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
    571 {
    572 	int error;
    573 	vnode_t *newdvp;
    574 
    575 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    576 	if (error)
    577 		return (error);
    578 
    579 	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
    580 	VN_RELE(newdvp);
    581 
    582 	return (error);
    583 }
    584 
    585 static int
    586 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    587     caller_context_t *ct, int flags)
    588 {
    589 	int error;
    590 	vnode_t *newdvp;
    591 
    592 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    593 	if (error)
    594 		return (error);
    595 
    596 	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
    597 	VN_RELE(newdvp);
    598 
    599 	return (error);
    600 }
    601 
    602 static int
    603 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
    604     cred_t *cr, caller_context_t *ct, int flags)
    605 {
    606 	int error;
    607 	vnode_t *newdvp;
    608 
    609 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
    610 	if (error)
    611 		return (error);
    612 
    613 	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
    614 	VN_RELE(newdvp);
    615 
    616 	return (error);
    617 }
    618 
    619 static int
    620 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
    621     caller_context_t *ct)
    622 {
    623 	int error;
    624 	vnode_t *newvp;
    625 
    626 	error = nfs4_trigger_mount(vp, cr, &newvp);
    627 	if (error)
    628 		return (error);
    629 
    630 	error = VOP_READLINK(newvp, uiop, cr, ct);
    631 	VN_RELE(newvp);
    632 
    633 	return (error);
    634 }
    635 
    636 /* end of trigger vnode ops */
    637 
    638 /*
    639  * See if the mount has already been done by another caller.
    640  */
    641 static int
    642 nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
    643     bool_t *was_mounted, vfs_t **vfsp)
    644 {
    645 	int		error;
    646 	mntinfo4_t	*mi = VTOMI4(vp);
    647 
    648 	*was_mounted = FALSE;
    649 
    650 	error = vn_vfsrlock_wait(vp);
    651 	if (error)
    652 		return (error);
    653 
    654 	*vfsp = vn_mountedvfs(vp);
    655 	if (*vfsp != NULL) {
    656 		/* the mount has already occurred */
    657 		error = VFS_ROOT(*vfsp, newvpp);
    658 		if (!error) {
    659 			/* need to update the reference time  */
    660 			mutex_enter(&mi->mi_lock);
    661 			if (mi->mi_ephemeral)
    662 				mi->mi_ephemeral->ne_ref_time =
    663 				    gethrestime_sec();
    664 			mutex_exit(&mi->mi_lock);
    665 
    666 			*was_mounted = TRUE;
    667 		}
    668 	}
    669 
    670 	vn_vfsunlock(vp);
    671 	return (0);
    672 }
    673 
    674 /*
    675  * Mount upon a trigger vnode; for mirror-mounts, etc.
    676  *
    677  * The mount may have already occurred, via another thread. If not,
    678  * assemble the location information - which may require fetching - and
    679  * perform the mount.
    680  *
    681  * Sets newvp to be the root of the fs that is now covering vp. Note
    682  * that we return with VN_HELD(*newvp).
    683  *
    684  * The caller is responsible for passing the VOP onto the covering fs.
    685  */
    686 static int
    687 nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
    688 {
    689 	int			 error;
    690 	vfs_t			*vfsp;
    691 	rnode4_t		*rp = VTOR4(vp);
    692 	mntinfo4_t		*mi = VTOMI4(vp);
    693 	domount_args_t		*dma;
    694 
    695 	nfs4_ephemeral_tree_t	*net;
    696 
    697 	bool_t			must_unlock = FALSE;
    698 	bool_t			is_building = FALSE;
    699 	bool_t			was_mounted = FALSE;
    700 
    701 	cred_t			*mcred = NULL;
    702 
    703 	nfs4_trigger_globals_t	*ntg;
    704 
    705 	zone_t			*zone = curproc->p_zone;
    706 
    707 	ASSERT(RP_ISSTUB(rp));
    708 
    709 	/* for now, we only support mirror-mounts */
    710 	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));
    711 
    712 	*newvpp = NULL;
    713 
    714 	/*
    715 	 * Has the mount already occurred?
    716 	 */
    717 	error = nfs4_trigger_mounted_already(vp, newvpp,
    718 	    &was_mounted, &vfsp);
    719 	if (error || was_mounted)
    720 		goto done;
    721 
    722 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
    723 	ASSERT(ntg != NULL);
    724 
    725 	mutex_enter(&mi->mi_lock);
    726 
    727 	/*
    728 	 * We need to lock down the ephemeral tree.
    729 	 */
    730 	if (mi->mi_ephemeral_tree == NULL) {
    731 		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
    732 		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
    733 		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
    734 		net->net_refcnt = 1;
    735 		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
    736 		is_building = TRUE;
    737 
    738 		/*
    739 		 * We need to add it to the zone specific list for
    740 		 * automatic unmounting and harvesting of deadwood.
    741 		 */
    742 		mutex_enter(&ntg->ntg_forest_lock);
    743 		if (ntg->ntg_forest != NULL)
    744 			net->net_next = ntg->ntg_forest;
    745 		ntg->ntg_forest = net;
    746 		mutex_exit(&ntg->ntg_forest_lock);
    747 
    748 		/*
    749 		 * No lock order confusion with mi_lock because no
    750 		 * other node could have grabbed net_tree_lock.
    751 		 */
    752 		mutex_enter(&net->net_tree_lock);
    753 		mi->mi_ephemeral_tree = net;
    754 		net->net_mount = mi;
    755 		mutex_exit(&mi->mi_lock);
    756 	} else {
    757 		net = mi->mi_ephemeral_tree;
    758 		nfs4_ephemeral_tree_hold(net);
    759 
    760 		mutex_exit(&mi->mi_lock);
    761 
    762 		mutex_enter(&net->net_tree_lock);
    763 
    764 		/*
    765 		 * We can only procede if the tree is neither locked
    766 		 * nor being torn down.
    767 		 */
    768 		mutex_enter(&net->net_cnt_lock);
    769 		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
    770 			nfs4_ephemeral_tree_decr(net);
    771 			mutex_exit(&net->net_cnt_lock);
    772 			mutex_exit(&net->net_tree_lock);
    773 
    774 			return (EIO);
    775 		}
    776 		mutex_exit(&net->net_cnt_lock);
    777 	}
    778 
    779 	mutex_enter(&net->net_cnt_lock);
    780 	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
    781 	mutex_exit(&net->net_cnt_lock);
    782 
    783 	must_unlock = TRUE;
    784 
    785 	dma = nfs4_trigger_domount_args_create(vp);
    786 	if (dma == NULL) {
    787 		error = EINVAL;
    788 		goto done;
    789 	}
    790 
    791 	/*
    792 	 * Note that since we define mirror mounts to work
    793 	 * for any user, we simply extend the privileges of
    794 	 * the user's credentials to allow the mount to
    795 	 * proceed.
    796 	 */
    797 	mcred = crdup(cr);
    798 	if (mcred == NULL) {
    799 		error = EINVAL;
    800 		goto done;
    801 	}
    802 
    803 	crset_zone_privall(mcred);
    804 
    805 	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
    806 	nfs4_trigger_domount_args_destroy(dma, vp);
    807 
    808 	crfree(mcred);
    809 
    810 done:
    811 
    812 	if (must_unlock) {
    813 		mutex_enter(&net->net_cnt_lock);
    814 		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
    815 		if (is_building)
    816 			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
    817 		nfs4_ephemeral_tree_decr(net);
    818 		mutex_exit(&net->net_cnt_lock);
    819 
    820 		mutex_exit(&net->net_tree_lock);
    821 	}
    822 
    823 	if (!error && (newvpp == NULL || *newvpp == NULL))
    824 		error = ENOSYS;
    825 
    826 	return (error);
    827 }
    828 
    829 /*
    830  * Collect together both the generic & mount-type specific args.
    831  */
    832 static domount_args_t *
    833 nfs4_trigger_domount_args_create(vnode_t *vp)
    834 {
    835 	int nointr;
    836 	char *hostlist;
    837 	servinfo4_t *svp;
    838 	struct nfs_args *nargs, *nargs_head;
    839 	enum clnt_stat status;
    840 	ephemeral_servinfo_t *esi, *esi_first;
    841 	domount_args_t *dma;
    842 	mntinfo4_t *mi = VTOMI4(vp);
    843 
    844 	nointr = !(mi->mi_flags & MI4_INT);
    845 	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
    846 
    847 	svp = mi->mi_curr_serv;
    848 	/* check if the current server is responding */
    849 	status = nfs4_trigger_ping_server(svp, nointr);
    850 	if (status == RPC_SUCCESS) {
    851 		esi_first = nfs4_trigger_esi_create(vp, svp);
    852 		if (esi_first == NULL) {
    853 			kmem_free(hostlist, MAXPATHLEN);
    854 			return (NULL);
    855 		}
    856 
    857 		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
    858 
    859 		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
    860 	} else {
    861 		/* current server did not respond */
    862 		esi_first = NULL;
    863 		nargs_head = NULL;
    864 	}
    865 	nargs = nargs_head;
    866 
    867 	/*
    868 	 * NFS RO failover.
    869 	 *
    870 	 * If we have multiple servinfo4 structures, linked via sv_next,
    871 	 * we must create one nfs_args for each, linking the nfs_args via
    872 	 * nfs_ext_u.nfs_extB.next.
    873 	 *
    874 	 * We need to build a corresponding esi for each, too, but that is
    875 	 * used solely for building nfs_args, and may be immediately
    876 	 * discarded, as domount() requires the info from just one esi,
    877 	 * but all the nfs_args.
    878 	 *
    879 	 * Currently, the NFS mount code will hang if not all servers
    880 	 * requested are available. To avoid that, we need to ping each
    881 	 * server, here, and remove it from the list if it is not
    882 	 * responding. This has the side-effect of that server then
    883 	 * being permanently unavailable for this failover mount, even if
    884 	 * it recovers. That's unfortunate, but the best we can do until
    885 	 * the mount code path is fixed.
    886 	 */
    887 
    888 	/*
    889 	 * If the current server was down, loop indefinitely until we find
    890 	 * at least one responsive server.
    891 	 */
    892 	do {
    893 		/* no locking needed for sv_next; it is only set at fs mount */
    894 		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
    895 			struct nfs_args *next;
    896 
    897 			/*
    898 			 * nargs_head: the head of the nfs_args list
    899 			 * nargs: the current tail of the list
    900 			 * next: the newly-created element to be added
    901 			 */
    902 
    903 			/*
    904 			 * We've already tried the current server, above;
    905 			 * if it was responding, we have already included it
    906 			 * and it may now be ignored.
    907 			 *
    908 			 * Otherwise, try it again, since it may now have
    909 			 * recovered.
    910 			 */
    911 			if (svp == mi->mi_curr_serv && esi_first != NULL)
    912 				continue;
    913 
    914 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
    915 			if (svp->sv_flags & SV4_NOTINUSE) {
    916 				nfs_rw_exit(&svp->sv_lock);
    917 				continue;
    918 			}
    919 			nfs_rw_exit(&svp->sv_lock);
    920 
    921 			/* check if the server is responding */
    922 			status = nfs4_trigger_ping_server(svp, nointr);
    923 			/* if the server did not respond, ignore it */
    924 			if (status != RPC_SUCCESS)
    925 				continue;
    926 
    927 			esi = nfs4_trigger_esi_create(vp, svp);
    928 			if (esi == NULL)
    929 				continue;
    930 
    931 			/*
    932 			 * If the original current server (mi_curr_serv)
    933 			 * was down when we first tried it,
    934 			 * (i.e. esi_first == NULL),
    935 			 * we select this new server (svp) to be the server
    936 			 * that we will actually contact (esi_first).
    937 			 *
    938 			 * Note that it's possible that mi_curr_serv == svp,
    939 			 * if that mi_curr_serv was down but has now recovered.
    940 			 */
    941 			next = nfs4_trigger_nargs_create(mi, svp, esi);
    942 			if (esi_first == NULL) {
    943 				ASSERT(nargs == NULL);
    944 				ASSERT(nargs_head == NULL);
    945 				nargs_head = next;
    946 				esi_first = esi;
    947 				(void) strlcpy(hostlist,
    948 				    esi_first->esi_hostname, MAXPATHLEN);
    949 			} else {
    950 				ASSERT(nargs_head != NULL);
    951 				nargs->nfs_ext_u.nfs_extB.next = next;
    952 				(void) strlcat(hostlist, ",", MAXPATHLEN);
    953 				(void) strlcat(hostlist, esi->esi_hostname,
    954 				    MAXPATHLEN);
    955 				/* esi was only needed for hostname & nargs */
    956 				nfs4_trigger_esi_destroy(esi, vp);
    957 			}
    958 
    959 			nargs = next;
    960 		}
    961 
    962 		/* if we've had no response at all, wait a second */
    963 		if (esi_first == NULL)
    964 			delay(drv_usectohz(1000000));
    965 
    966 	} while (esi_first == NULL);
    967 	ASSERT(nargs_head != NULL);
    968 
    969 	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
    970 	dma->dma_esi = esi_first;
    971 	dma->dma_hostlist = hostlist;
    972 	dma->dma_nargs = nargs_head;
    973 
    974 	return (dma);
    975 }
    976 
    977 static void
    978 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
    979 {
    980 	if (dma != NULL) {
    981 		if (dma->dma_esi != NULL && vp != NULL)
    982 			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
    983 
    984 		if (dma->dma_hostlist != NULL)
    985 			kmem_free(dma->dma_hostlist, MAXPATHLEN);
    986 
    987 		if (dma->dma_nargs != NULL) {
    988 			struct nfs_args *nargs = dma->dma_nargs;
    989 
    990 			do {
    991 				struct nfs_args *next =
    992 				    nargs->nfs_ext_u.nfs_extB.next;
    993 
    994 				nfs4_trigger_nargs_destroy(nargs);
    995 				nargs = next;
    996 			} while (nargs != NULL);
    997 		}
    998 
    999 		kmem_free(dma, sizeof (domount_args_t));
   1000 	}
   1001 }
   1002 
   1003 /*
   1004  * The ephemeral_servinfo_t struct contains basic information we will need to
   1005  * perform the mount. Whilst the structure is generic across different
   1006  * types of ephemeral mount, the way we gather its contents differs.
   1007  */
   1008 static ephemeral_servinfo_t *
   1009 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp)
   1010 {
   1011 	ephemeral_servinfo_t *esi;
   1012 	rnode4_t *rp = VTOR4(vp);
   1013 
   1014 	ASSERT(RP_ISSTUB(rp));
   1015 
   1016 	/* Call the ephemeral type-specific routine */
   1017 	if (RP_ISSTUB_MIRRORMOUNT(rp))
   1018 		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
   1019 	else
   1020 		esi = NULL;
   1021 
   1022 	/* for now, we only support mirror-mounts */
   1023 	ASSERT(esi != NULL);
   1024 
   1025 	return (esi);
   1026 }
   1027 
   1028 static void
   1029 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
   1030 {
   1031 	rnode4_t *rp = VTOR4(vp);
   1032 
   1033 	ASSERT(RP_ISSTUB(rp));
   1034 
   1035 	/* for now, we only support mirror-mounts */
   1036 	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));
   1037 
   1038 	/* Currently, no need for an ephemeral type-specific routine */
   1039 
   1040 	/*
   1041 	 * The contents of ephemeral_servinfo_t goes into nfs_args,
   1042 	 * and will be handled by nfs4_trigger_nargs_destroy().
   1043 	 * We need only free the structure itself.
   1044 	 */
   1045 	if (esi != NULL)
   1046 		kmem_free(esi, sizeof (ephemeral_servinfo_t));
   1047 }
   1048 
   1049 /*
   1050  * Some of this may turn out to be common with other ephemeral types,
   1051  * in which case it should be moved to nfs4_trigger_esi_create(), or a
   1052  * common function called.
   1053  */
   1054 static ephemeral_servinfo_t *
   1055 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
   1056 {
   1057 	char			*stubpath;
   1058 	struct knetconfig	*sikncp, *svkncp;
   1059 	struct netbuf		*bufp;
   1060 	ephemeral_servinfo_t	*esi;
   1061 
   1062 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
   1063 
   1064 	/* initially set to be our type of ephemeral mount; may be added to */
   1065 	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
   1066 
   1067 	/*
   1068 	 * We're copying info from the stub rnode's servinfo4, but
   1069 	 * we must create new copies, not pointers, since this information
   1070 	 * is to be associated with the new mount, which will be
   1071 	 * unmounted (and its structures freed) separately
   1072 	 */
   1073 
   1074 	/*
   1075 	 * Sizes passed to kmem_[z]alloc here must match those freed
   1076 	 * in nfs4_free_args()
   1077 	 */
   1078 
   1079 	/*
   1080 	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
   1081 	 * is difficult to avoid: as we need to read svp to calculate the
   1082 	 * sizes to be allocated.
   1083 	 */
   1084 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1085 
   1086 	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
   1087 	(void) strcat(esi->esi_hostname, svp->sv_hostname);
   1088 
   1089 	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
   1090 	bufp = esi->esi_addr;
   1091 	bufp->len = svp->sv_addr.len;
   1092 	bufp->maxlen = svp->sv_addr.maxlen;
   1093 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
   1094 	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
   1095 
   1096 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
   1097 	sikncp = esi->esi_knconf;
   1098 	svkncp = svp->sv_knconf;
   1099 	sikncp->knc_semantics = svkncp->knc_semantics;
   1100 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
   1101 	(void) strcat((char *)sikncp->knc_protofmly,
   1102 	    (char *)svkncp->knc_protofmly);
   1103 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
   1104 	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
   1105 	sikncp->knc_rdev = svkncp->knc_rdev;
   1106 
   1107 	/*
   1108 	 * Used when AUTH_DH is negotiated.
   1109 	 *
   1110 	 * This is ephemeral mount-type specific, since it contains the
   1111 	 * server's time-sync syncaddr.
   1112 	 */
   1113 	if (svp->sv_dhsec) {
   1114 		struct netbuf *bufp;
   1115 		sec_data_t *sdata;
   1116 		dh_k4_clntdata_t *data;
   1117 
   1118 		sdata = svp->sv_dhsec;
   1119 		data = (dh_k4_clntdata_t *)sdata->data;
   1120 		ASSERT(sdata->rpcflavor == AUTH_DH);
   1121 
   1122 		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
   1123 		bufp->len = data->syncaddr.len;
   1124 		bufp->maxlen = data->syncaddr.maxlen;
   1125 		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
   1126 		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
   1127 		esi->esi_syncaddr = bufp;
   1128 
   1129 		if (data->netname != NULL) {
   1130 			int nmlen = data->netnamelen;
   1131 
   1132 			/*
   1133 			 * We need to copy from a dh_k4_clntdata_t
   1134 			 * netname/netnamelen pair to a NUL-terminated
   1135 			 * netname string suitable for putting in nfs_args,
   1136 			 * where the latter has no netnamelen field.
   1137 			 */
   1138 			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
   1139 			bcopy(data->netname, esi->esi_netname, nmlen);
   1140 		}
   1141 	} else {
   1142 		esi->esi_syncaddr = NULL;
   1143 		esi->esi_netname = NULL;
   1144 	}
   1145 
   1146 	stubpath = fn_path(VTOSV(vp)->sv_name);
   1147 	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
   1148 	ASSERT(*stubpath == '.');
   1149 	stubpath += 1;
   1150 
   1151 	/* for nfs_args->fh */
   1152 	esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1;
   1153 	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
   1154 	(void) strcat(esi->esi_path, svp->sv_path);
   1155 	(void) strcat(esi->esi_path, stubpath);
   1156 
   1157 	stubpath -= 1;
   1158 	/* stubpath allocated by fn_path() */
   1159 	kmem_free(stubpath, strlen(stubpath) + 1);
   1160 
   1161 	nfs_rw_exit(&svp->sv_lock);
   1162 
   1163 	return (esi);
   1164 }
   1165 
   1166 /*
   1167  * Assemble the args, and call the generic VFS mount function to
   1168  * finally perform the ephemeral mount.
   1169  */
   1170 static int
   1171 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
   1172     cred_t *cr, vnode_t **newvpp)
   1173 {
   1174 	struct mounta	*uap;
   1175 	char		*mntpt, *orig_path, *path;
   1176 	const char	*orig_mntpt;
   1177 	int		retval;
   1178 	int		mntpt_len;
   1179 	int		spec_len;
   1180 	zone_t		*zone = curproc->p_zone;
   1181 	bool_t		has_leading_slash;
   1182 	int		i;
   1183 
   1184 	vfs_t			*stubvfsp = stubvp->v_vfsp;
   1185 	ephemeral_servinfo_t	*esi = dma->dma_esi;
   1186 	struct nfs_args		*nargs = dma->dma_nargs;
   1187 
   1188 	/* first, construct the mount point for the ephemeral mount */
   1189 	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
   1190 	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
   1191 
   1192 	if (*orig_path == '.')
   1193 		orig_path++;
   1194 
   1195 	/*
   1196 	 * Get rid of zone's root path
   1197 	 */
   1198 	if (zone != global_zone) {
   1199 		/*
   1200 		 * -1 for trailing '/' and -1 for EOS.
   1201 		 */
   1202 		if (strncmp(zone->zone_rootpath, orig_mntpt,
   1203 		    zone->zone_rootpathlen - 1) == 0) {
   1204 			orig_mntpt += (zone->zone_rootpathlen - 2);
   1205 		}
   1206 	}
   1207 
   1208 	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
   1209 	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
   1210 	(void) strcat(mntpt, orig_mntpt);
   1211 	(void) strcat(mntpt, orig_path);
   1212 
   1213 	kmem_free(path, strlen(path) + 1);
   1214 	path = esi->esi_path;
   1215 	if (*path == '.')
   1216 		path++;
   1217 	if (path[0] == '/' && path[1] == '/')
   1218 		path++;
   1219 	has_leading_slash = (*path == '/');
   1220 
   1221 	spec_len = strlen(dma->dma_hostlist);
   1222 	spec_len += strlen(path);
   1223 
   1224 	/* We are going to have to add this in */
   1225 	if (!has_leading_slash)
   1226 		spec_len++;
   1227 
   1228 	/* We need to get the ':' for dma_hostlist:esi_path */
   1229 	spec_len++;
   1230 
   1231 	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
   1232 	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
   1233 	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
   1234 	    has_leading_slash ? "" : "/", path);
   1235 
   1236 	uap->dir = mntpt;
   1237 
   1238 	uap->flags = MS_SYSSPACE | MS_DATA;
   1239 	/* fstype-independent mount options not covered elsewhere */
   1240 	/* copy parent's mount(1M) "-m" flag */
   1241 	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
   1242 		uap->flags |= MS_NOMNTTAB;
   1243 
   1244 	uap->fstype = MNTTYPE_NFS4;
   1245 	uap->dataptr = (char *)nargs;
   1246 	/* not needed for MS_SYSSPACE */
   1247 	uap->datalen = 0;
   1248 
   1249 	/* use optptr to pass in extra mount options */
   1250 	uap->flags |= MS_OPTIONSTR;
   1251 	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
   1252 	if (uap->optptr == NULL) {
   1253 		retval = EINVAL;
   1254 		goto done;
   1255 	}
   1256 
   1257 	/* domount() expects us to count the trailing NUL */
   1258 	uap->optlen = strlen(uap->optptr) + 1;
   1259 
   1260 	/*
   1261 	 * If we get EBUSY, we try again once to see if we can perform
   1262 	 * the mount. We do this because of a spurious race condition.
   1263 	 */
   1264 	for (i = 0; i < 2; i++) {
   1265 		int	error;
   1266 		bool_t	was_mounted;
   1267 
   1268 		retval = domount(NULL, uap, stubvp, cr, vfsp);
   1269 		if (retval == 0) {
   1270 			retval = VFS_ROOT(*vfsp, newvpp);
   1271 			VFS_RELE(*vfsp);
   1272 			break;
   1273 		} else if (retval != EBUSY) {
   1274 			break;
   1275 		}
   1276 
   1277 		/*
   1278 		 * We might find it mounted by the other racer...
   1279 		 */
   1280 		error = nfs4_trigger_mounted_already(stubvp,
   1281 		    newvpp, &was_mounted, vfsp);
   1282 		if (error) {
   1283 			goto done;
   1284 		} else if (was_mounted) {
   1285 			retval = 0;
   1286 			break;
   1287 		}
   1288 	}
   1289 
   1290 done:
   1291 	if (uap->optptr)
   1292 		nfs4_trigger_destroy_mntopts(uap->optptr);
   1293 
   1294 	kmem_free(uap->spec, spec_len + 1);
   1295 	kmem_free(uap, sizeof (struct mounta));
   1296 	kmem_free(mntpt, mntpt_len + 1);
   1297 
   1298 	return (retval);
   1299 }
   1300 
   1301 /*
   1302  * Build an nfs_args structure for passing to domount().
   1303  *
   1304  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
   1305  * generic data - common to all ephemeral mount types - is read directly
   1306  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
   1307  */
   1308 static struct nfs_args *
   1309 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
   1310     ephemeral_servinfo_t *esi)
   1311 {
   1312 	sec_data_t *secdata;
   1313 	struct nfs_args *nargs;
   1314 
   1315 	/* setup the nfs args */
   1316 	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
   1317 
   1318 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1319 
   1320 	nargs->addr = esi->esi_addr;
   1321 
   1322 	/* for AUTH_DH by negotiation */
   1323 	if (esi->esi_syncaddr || esi->esi_netname) {
   1324 		nargs->flags |= NFSMNT_SECURE;
   1325 		nargs->syncaddr = esi->esi_syncaddr;
   1326 		nargs->netname = esi->esi_netname;
   1327 	}
   1328 
   1329 	nargs->flags |= NFSMNT_KNCONF;
   1330 	nargs->knconf = esi->esi_knconf;
   1331 	nargs->flags |= NFSMNT_HOSTNAME;
   1332 	nargs->hostname = esi->esi_hostname;
   1333 	nargs->fh = esi->esi_path;
   1334 
   1335 	/* general mount settings, all copied from parent mount */
   1336 	mutex_enter(&mi->mi_lock);
   1337 
   1338 	if (!(mi->mi_flags & MI4_HARD))
   1339 		nargs->flags |= NFSMNT_SOFT;
   1340 
   1341 	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
   1342 	    NFSMNT_RETRANS;
   1343 	nargs->wsize = mi->mi_stsize;
   1344 	nargs->rsize = mi->mi_tsize;
   1345 	nargs->timeo = mi->mi_timeo;
   1346 	nargs->retrans = mi->mi_retrans;
   1347 
   1348 	if (mi->mi_flags & MI4_INT)
   1349 		nargs->flags |= NFSMNT_INT;
   1350 	if (mi->mi_flags & MI4_NOAC)
   1351 		nargs->flags |= NFSMNT_NOAC;
   1352 
   1353 	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
   1354 	    NFSMNT_ACDIRMAX;
   1355 	nargs->acregmin = HR2SEC(mi->mi_acregmin);
   1356 	nargs->acregmax = HR2SEC(mi->mi_acregmax);
   1357 	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
   1358 	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
   1359 
   1360 	if (mi->mi_flags & MI4_NOCTO)
   1361 		nargs->flags |= NFSMNT_NOCTO;
   1362 	if (mi->mi_flags & MI4_GRPID)
   1363 		nargs->flags |= NFSMNT_GRPID;
   1364 	if (mi->mi_flags & MI4_LLOCK)
   1365 		nargs->flags |= NFSMNT_LLOCK;
   1366 	if (mi->mi_flags & MI4_NOPRINT)
   1367 		nargs->flags |= NFSMNT_NOPRINT;
   1368 	if (mi->mi_flags & MI4_DIRECTIO)
   1369 		nargs->flags |= NFSMNT_DIRECTIO;
   1370 	if (mi->mi_flags & MI4_PUBLIC)
   1371 		nargs->flags |= NFSMNT_PUBLIC;
   1372 
   1373 	mutex_exit(&mi->mi_lock);
   1374 
   1375 	/* add any specific flags for this type of ephemeral mount */
   1376 	nargs->flags |= esi->esi_mount_flags;
   1377 
   1378 	/*
   1379 	 * Security data & negotiation policy.
   1380 	 *
   1381 	 * We need to preserve the parent mount's preference for security
   1382 	 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT.
   1383 	 *
   1384 	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
   1385 	 * security flavour was requested, with data in sv_secdata, and that
   1386 	 * no negotiation should occur. If this specified flavour fails, that's
   1387 	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
   1388 	 *
   1389 	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
   1390 	 * default flavour, in sv_secdata, but then negotiate a new flavour.
   1391 	 * Possible flavours are recorded in an array in sv_secinfo, with
   1392 	 * currently in-use flavour pointed to by sv_currsec.
   1393 	 *
   1394 	 * If sv_currsec is set, i.e. if negotiation has already occurred,
   1395 	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
   1396 	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
   1397 	 */
   1398 	if (svp->sv_flags & SV4_TRYSECDEFAULT) {
   1399 		/* enable negotiation for ephemeral mount */
   1400 		nargs->flags |= NFSMNT_SECDEFAULT;
   1401 
   1402 		/*
   1403 		 * As a starting point for negotiation, copy parent
   1404 		 * mount's negotiated flavour (sv_currsec) if available,
   1405 		 * or its passed-in flavour (sv_secdata) if not.
   1406 		 */
   1407 		if (svp->sv_currsec != NULL)
   1408 			secdata = copy_sec_data(svp->sv_currsec);
   1409 		else if (svp->sv_secdata != NULL)
   1410 			secdata = copy_sec_data(svp->sv_secdata);
   1411 		else
   1412 			secdata = NULL;
   1413 	} else {
   1414 		/* do not enable negotiation; copy parent's passed-in flavour */
   1415 		if (svp->sv_secdata != NULL)
   1416 			secdata = copy_sec_data(svp->sv_secdata);
   1417 		else
   1418 			secdata = NULL;
   1419 	}
   1420 
   1421 	nfs_rw_exit(&svp->sv_lock);
   1422 
   1423 	nargs->flags |= NFSMNT_NEWARGS;
   1424 	nargs->nfs_args_ext = NFS_ARGS_EXTB;
   1425 	nargs->nfs_ext_u.nfs_extB.secdata = secdata;
   1426 
   1427 	/* for NFS RO failover; caller will set if necessary */
   1428 	nargs->nfs_ext_u.nfs_extB.next = NULL;
   1429 
   1430 	return (nargs);
   1431 }
   1432 
   1433 static void
   1434 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
   1435 {
   1436 	/*
   1437 	 * Either the mount failed, in which case the data is not needed, or
   1438 	 * nfs4_mount() has either taken copies of what it needs or,
   1439 	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
   1440 	 * whereby nfs4_free_args() will ignore it.
   1441 	 */
   1442 	nfs4_free_args(nargs);
   1443 	kmem_free(nargs, sizeof (struct nfs_args));
   1444 }
   1445 
   1446 /*
   1447  * When we finally get into the mounting, we need to add this
   1448  * node to the ephemeral tree.
   1449  *
   1450  * This is called from nfs4_mount().
   1451  */
   1452 int
   1453 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
   1454 {
   1455 	mntinfo4_t		*mi_parent;
   1456 	nfs4_ephemeral_t	*eph;
   1457 	nfs4_ephemeral_tree_t	*net;
   1458 
   1459 	nfs4_ephemeral_t	*prior;
   1460 	nfs4_ephemeral_t	*child;
   1461 
   1462 	nfs4_ephemeral_t	*peer;
   1463 
   1464 	nfs4_trigger_globals_t	*ntg;
   1465 	zone_t			*zone = curproc->p_zone;
   1466 
   1467 	int			rc = 0;
   1468 
   1469 	mi_parent = VTOMI4(mvp);
   1470 
   1471 	/*
   1472 	 * Get this before grabbing anything else!
   1473 	 */
   1474 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
   1475 	if (!ntg->ntg_thread_started) {
   1476 		nfs4_ephemeral_start_harvester(ntg);
   1477 	}
   1478 
   1479 	mutex_enter(&mi_parent->mi_lock);
   1480 	mutex_enter(&mi->mi_lock);
   1481 
   1482 	net = mi->mi_ephemeral_tree =
   1483 	    mi_parent->mi_ephemeral_tree;
   1484 
   1485 	/*
   1486 	 * If the mi_ephemeral_tree is NULL, then it
   1487 	 * means that either the harvester or a manual
   1488 	 * umount has cleared the tree out right before
   1489 	 * we got here.
   1490 	 *
   1491 	 * There is nothing we can do here, so return
   1492 	 * to the caller and let them decide whether they
   1493 	 * try again.
   1494 	 */
   1495 	if (net == NULL) {
   1496 		mutex_exit(&mi->mi_lock);
   1497 		mutex_exit(&mi_parent->mi_lock);
   1498 
   1499 		return (EBUSY);
   1500 	}
   1501 
   1502 	nfs4_ephemeral_tree_hold(net);
   1503 
   1504 	/*
   1505 	 * We need to tack together the ephemeral mount
   1506 	 * with this new mntinfo.
   1507 	 */
   1508 	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
   1509 	eph->ne_mount = mi;
   1510 	eph->ne_ref_time = gethrestime_sec();
   1511 
   1512 	/*
   1513 	 * We need to tell the ephemeral mount when
   1514 	 * to time out.
   1515 	 */
   1516 	eph->ne_mount_to = ntg->ntg_mount_to;
   1517 
   1518 	mi->mi_flags |= MI4_EPHEMERAL;
   1519 	mi->mi_ephemeral = eph;
   1520 
   1521 	/*
   1522 	 * If the enclosing mntinfo4 is also ephemeral,
   1523 	 * then we need to point to its enclosing parent.
   1524 	 * Else the enclosing mntinfo4 is the enclosing parent.
   1525 	 *
   1526 	 * We also need to weave this ephemeral node
   1527 	 * into the tree.
   1528 	 */
   1529 	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
   1530 		/*
   1531 		 * We need to decide if we are
   1532 		 * the root node of this branch
   1533 		 * or if we are a sibling of this
   1534 		 * branch.
   1535 		 */
   1536 		prior = mi_parent->mi_ephemeral;
   1537 		if (prior == NULL) {
   1538 			/*
   1539 			 * Race condition, clean up, and
   1540 			 * let caller handle mntinfo.
   1541 			 */
   1542 			mi->mi_flags &= ~MI4_EPHEMERAL;
   1543 			mi->mi_ephemeral = NULL;
   1544 			kmem_free(eph, sizeof (*eph));
   1545 			rc = EBUSY;
   1546 		} else {
   1547 			if (prior->ne_child == NULL) {
   1548 				prior->ne_child = eph;
   1549 			} else {
   1550 				child = prior->ne_child;
   1551 
   1552 				prior->ne_child = eph;
   1553 				eph->ne_peer = child;
   1554 
   1555 				child->ne_prior = eph;
   1556 			}
   1557 
   1558 			eph->ne_prior = prior;
   1559 		}
   1560 	} else {
   1561 		/*
   1562 		 * The parent mntinfo4 is the non-ephemeral
   1563 		 * root of the ephemeral tree. We
   1564 		 * need to decide if we are the root
   1565 		 * node of that tree or if we are a
   1566 		 * sibling of the root node.
   1567 		 *
   1568 		 * We are the root if there is no
   1569 		 * other node.
   1570 		 */
   1571 		if (net->net_root == NULL) {
   1572 			net->net_root = eph;
   1573 		} else {
   1574 			eph->ne_peer = peer = net->net_root;
   1575 			ASSERT(peer != NULL);
   1576 			net->net_root = eph;
   1577 
   1578 			peer->ne_prior = eph;
   1579 		}
   1580 
   1581 		eph->ne_prior = NULL;
   1582 	}
   1583 
   1584 	nfs4_ephemeral_tree_rele(net);
   1585 
   1586 	mutex_exit(&mi->mi_lock);
   1587 	mutex_exit(&mi_parent->mi_lock);
   1588 
   1589 	return (rc);
   1590 }
   1591 
   1592 /*
   1593  * Commit the changes to the ephemeral tree for removing this node.
   1594  */
   1595 static void
   1596 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
   1597 {
   1598 	nfs4_ephemeral_t	*e = eph;
   1599 	nfs4_ephemeral_t	*peer;
   1600 	nfs4_ephemeral_t	*prior;
   1601 
   1602 	peer = eph->ne_peer;
   1603 	prior = e->ne_prior;
   1604 
   1605 	/*
   1606 	 * If this branch root was not the
   1607 	 * tree root, then we need to fix back pointers.
   1608 	 */
   1609 	if (prior) {
   1610 		if (prior->ne_child == e) {
   1611 			prior->ne_child = peer;
   1612 		} else {
   1613 			prior->ne_peer = peer;
   1614 		}
   1615 
   1616 		if (peer)
   1617 			peer->ne_prior = prior;
   1618 	} else if (peer) {
   1619 		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
   1620 		peer->ne_prior = NULL;
   1621 	} else {
   1622 		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
   1623 	}
   1624 }
   1625 
   1626 /*
   1627  * We want to avoid recursion at all costs. So we need to
   1628  * unroll the tree. We do this by a depth first traversal to
   1629  * leaf nodes. We blast away the leaf and work our way back
   1630  * up and down the tree.
   1631  */
   1632 static int
   1633 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
   1634     int isTreeRoot, int flag, cred_t *cr)
   1635 {
   1636 	nfs4_ephemeral_t	*e = eph;
   1637 	nfs4_ephemeral_t	*prior;
   1638 	mntinfo4_t		*mi;
   1639 	vfs_t			*vfsp;
   1640 	int			error;
   1641 
   1642 	/*
   1643 	 * We use the loop while unrolling the ephemeral tree.
   1644 	 */
   1645 	for (;;) {
   1646 		/*
   1647 		 * First we walk down the child.
   1648 		 */
   1649 		if (e->ne_child) {
   1650 			prior = e;
   1651 			e = e->ne_child;
   1652 			continue;
   1653 		}
   1654 
   1655 		/*
   1656 		 * If we are the root of the branch we are removing,
   1657 		 * we end it here. But if the branch is the root of
   1658 		 * the tree, we have to forge on. We do not consider
   1659 		 * the peer list for the root because while it may
   1660 		 * be okay to remove, it is both extra work and a
   1661 		 * potential for a false-positive error to stall the
   1662 		 * unmount attempt.
   1663 		 */
   1664 		if (e == eph && isTreeRoot == FALSE)
   1665 			return (0);
   1666 
   1667 		/*
   1668 		 * Next we walk down the peer list.
   1669 		 */
   1670 		if (e->ne_peer) {
   1671 			prior = e;
   1672 			e = e->ne_peer;
   1673 			continue;
   1674 		}
   1675 
   1676 		/*
   1677 		 * We can only remove the node passed in by the
   1678 		 * caller if it is the root of the ephemeral tree.
   1679 		 * Otherwise, the caller will remove it.
   1680 		 */
   1681 		if (e == eph && isTreeRoot == FALSE)
   1682 			return (0);
   1683 
   1684 		/*
   1685 		 * Okay, we have a leaf node, time
   1686 		 * to prune it!
   1687 		 *
   1688 		 * Note that prior can only be NULL if
   1689 		 * and only if it is the root of the
   1690 		 * ephemeral tree.
   1691 		 */
   1692 		prior = e->ne_prior;
   1693 
   1694 		mi = e->ne_mount;
   1695 		mutex_enter(&mi->mi_lock);
   1696 		vfsp = mi->mi_vfsp;
   1697 
   1698 		/*
   1699 		 * Cleared by umount2_engine.
   1700 		 */
   1701 		VFS_HOLD(vfsp);
   1702 
   1703 		/*
   1704 		 * Inform nfs4_unmount to not recursively
   1705 		 * descend into this node's children when it
   1706 		 * gets processed.
   1707 		 */
   1708 		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
   1709 		mutex_exit(&mi->mi_lock);
   1710 
   1711 		error = umount2_engine(vfsp, flag, cr, FALSE);
   1712 		if (error) {
   1713 			/*
   1714 			 * We need to reenable nfs4_unmount's ability
   1715 			 * to recursively descend on this node.
   1716 			 */
   1717 			mutex_enter(&mi->mi_lock);
   1718 			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
   1719 			mutex_exit(&mi->mi_lock);
   1720 
   1721 			return (error);
   1722 		}
   1723 
   1724 		/*
   1725 		 * If we are the current node, we do not want to
   1726 		 * touch anything else. At this point, the only
   1727 		 * way the current node can have survived to here
   1728 		 * is if it is the root of the ephemeral tree and
   1729 		 * we are unmounting the enclosing mntinfo4.
   1730 		 */
   1731 		if (e == eph) {
   1732 			ASSERT(prior == NULL);
   1733 			return (0);
   1734 		}
   1735 
   1736 		/*
   1737 		 * Stitch up the prior node. Note that since
   1738 		 * we have handled the root of the tree, prior
   1739 		 * must be non-NULL.
   1740 		 */
   1741 		ASSERT(prior != NULL);
   1742 		if (prior->ne_child == e) {
   1743 			prior->ne_child = NULL;
   1744 		} else {
   1745 			ASSERT(prior->ne_peer == e);
   1746 
   1747 			prior->ne_peer = NULL;
   1748 		}
   1749 
   1750 		e = prior;
   1751 	}
   1752 
   1753 	/* NOTREACHED */
   1754 }
   1755 
   1756 /*
   1757  * Common code to safely release net_cnt_lock and net_tree_lock
   1758  */
   1759 void
   1760 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
   1761     bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
   1762 {
   1763 	nfs4_ephemeral_tree_t	*net = *pnet;
   1764 
   1765 	if (*pmust_unlock) {
   1766 		mutex_enter(&net->net_cnt_lock);
   1767 		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
   1768 		if (*pmust_rele)
   1769 			nfs4_ephemeral_tree_decr(net);
   1770 		mutex_exit(&net->net_cnt_lock);
   1771 
   1772 		mutex_exit(&net->net_tree_lock);
   1773 
   1774 		*pmust_unlock = FALSE;
   1775 	}
   1776 }
   1777 
   1778 /*
   1779  * While we may have removed any child or sibling nodes of this
   1780  * ephemeral node, we can not nuke it until we know that there
   1781  * were no actived vnodes on it. This will do that final
   1782  * work once we know it is not busy.
   1783  */
   1784 void
   1785 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
   1786     bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
   1787 {
   1788 	/*
   1789 	 * Now we need to get rid of the ephemeral data if it exists.
   1790 	 */
   1791 	mutex_enter(&mi->mi_lock);
   1792 	if (mi->mi_ephemeral) {
   1793 		/*
   1794 		 * If we are the root node of an ephemeral branch
   1795 		 * which is being removed, then we need to fixup
   1796 		 * pointers into and out of the node.
   1797 		 */
   1798 		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
   1799 			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
   1800 
   1801 		ASSERT(mi->mi_ephemeral != NULL);
   1802 
   1803 		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
   1804 		mi->mi_ephemeral = NULL;
   1805 	}
   1806 	mutex_exit(&mi->mi_lock);
   1807 
   1808 	nfs4_ephemeral_umount_unlock(pmust_unlock, pmust_rele, pnet);
   1809 }
   1810 
   1811 /*
   1812  * Unmount an ephemeral node.
   1813  */
   1814 int
   1815 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
   1816     bool_t *pmust_unlock, bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
   1817 {
   1818 	int			error = 0;
   1819 	nfs4_ephemeral_t	*eph;
   1820 	nfs4_ephemeral_tree_t	*net;
   1821 	int			is_derooting = FALSE;
   1822 	int			is_recursed = FALSE;
   1823 	int			was_locked = FALSE;
   1824 
   1825 	/*
   1826 	 * Make sure to set the default state for cleaning
   1827 	 * up the tree in the caller (and on the way out).
   1828 	 */
   1829 	*pmust_unlock = *pmust_rele = FALSE;
   1830 
   1831 	/*
   1832 	 * The active vnodes on this file system may be ephemeral
   1833 	 * children. We need to check for and try to unmount them
   1834 	 * here. If any can not be unmounted, we are going
   1835 	 * to return EBUSY.
   1836 	 */
   1837 	mutex_enter(&mi->mi_lock);
   1838 
   1839 	/*
   1840 	 * If an ephemeral tree, we need to check to see if
   1841 	 * the lock is already held. If it is, then we need
   1842 	 * to see if we are being called as a result of
   1843 	 * the recursive removal of some node of the tree or
   1844 	 * if we are another attempt to remove the tree.
   1845 	 *
   1846 	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
   1847 	 * node. mi_ephemeral being non-NULL also does this.
   1848 	 *
   1849 	 * mi_ephemeral_tree being non-NULL is sufficient
   1850 	 * to also indicate either it is an ephemeral node
   1851 	 * or the enclosing mntinfo4.
   1852 	 *
   1853 	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
   1854 	 * when we delete the ephemeral node and need to
   1855 	 * differentiate from an ephemeral node and the
   1856 	 * enclosing root node.
   1857 	 */
   1858 	*pnet = net = mi->mi_ephemeral_tree;
   1859 	if (net == NULL) {
   1860 		mutex_exit(&mi->mi_lock);
   1861 		return (0);
   1862 	}
   1863 
   1864 	eph = mi->mi_ephemeral;
   1865 	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
   1866 	is_derooting = (eph == NULL);
   1867 
   1868 	/*
   1869 	 * If this is not recursion, then we need to
   1870 	 * grab a ref count.
   1871 	 *
   1872 	 * But wait, we also do not want to do that
   1873 	 * if a harvester thread has already grabbed
   1874 	 * the lock.
   1875 	 */
   1876 	if (!is_recursed) {
   1877 		mutex_enter(&net->net_cnt_lock);
   1878 		if (net->net_status &
   1879 		    NFS4_EPHEMERAL_TREE_LOCKED) {
   1880 			/*
   1881 			 * If the tree is locked, we need
   1882 			 * to decide whether we are the
   1883 			 * harvester or some explicit call
   1884 			 * for a umount. The only way that
   1885 			 * we are the harvester is if
   1886 			 * MS_SYSSPACE is set.
   1887 			 *
   1888 			 * We only let the harvester through
   1889 			 * at this point.
   1890 			 *
   1891 			 * We return EBUSY so that the
   1892 			 * caller knows something is
   1893 			 * going on. Note that by that
   1894 			 * time, the umount in the other
   1895 			 * thread may have already occured.
   1896 			 */
   1897 			if (!(flag & MS_SYSSPACE)) {
   1898 				mutex_exit(&net->net_cnt_lock);
   1899 				mutex_exit(&mi->mi_lock);
   1900 
   1901 				return (EBUSY);
   1902 			}
   1903 
   1904 			was_locked = TRUE;
   1905 		} else {
   1906 			nfs4_ephemeral_tree_incr(net);
   1907 			*pmust_rele = TRUE;
   1908 		}
   1909 
   1910 		mutex_exit(&net->net_cnt_lock);
   1911 	}
   1912 	mutex_exit(&mi->mi_lock);
   1913 
   1914 	/*
   1915 	 * If we are not the harvester, we need to check
   1916 	 * to see if we need to grab the tree lock.
   1917 	 */
   1918 	if (was_locked == FALSE) {
   1919 		/*
   1920 		 * If we grab the lock, it means that no other
   1921 		 * operation is working on the tree. If we don't
   1922 		 * grab it, we need to decide if this is because
   1923 		 * we are a recursive call or a new operation.
   1924 		 */
   1925 		if (mutex_tryenter(&net->net_tree_lock)) {
   1926 			*pmust_unlock = TRUE;
   1927 		} else {
   1928 			/*
   1929 			 * If we are a recursive call, we can
   1930 			 * proceed without the lock.
   1931 			 * Otherwise we have to wait until
   1932 			 * the lock becomes free.
   1933 			 */
   1934 			if (!is_recursed) {
   1935 				mutex_enter(&net->net_cnt_lock);
   1936 				if (net->net_status &
   1937 				    (NFS4_EPHEMERAL_TREE_DEROOTING
   1938 				    | NFS4_EPHEMERAL_TREE_INVALID)) {
   1939 					nfs4_ephemeral_tree_decr(net);
   1940 					mutex_exit(&net->net_cnt_lock);
   1941 					*pmust_rele = FALSE;
   1942 					goto is_busy;
   1943 				}
   1944 				mutex_exit(&net->net_cnt_lock);
   1945 
   1946 				/*
   1947 				 * We can't hold any other locks whilst
   1948 				 * we wait on this to free up.
   1949 				 */
   1950 				mutex_enter(&net->net_tree_lock);
   1951 
   1952 				/*
   1953 				 * Note that while mi->mi_ephemeral
   1954 				 * may change and thus we have to
   1955 				 * update eph, it is the case that
   1956 				 * we have tied down net and
   1957 				 * do not care if mi->mi_ephemeral_tree
   1958 				 * has changed.
   1959 				 */
   1960 				mutex_enter(&mi->mi_lock);
   1961 				eph = mi->mi_ephemeral;
   1962 				mutex_exit(&mi->mi_lock);
   1963 
   1964 				/*
   1965 				 * Okay, we need to see if either the
   1966 				 * tree got nuked or the current node
   1967 				 * got nuked. Both of which will cause
   1968 				 * an error.
   1969 				 *
   1970 				 * Note that a subsequent retry of the
   1971 				 * umount shall work.
   1972 				 */
   1973 				mutex_enter(&net->net_cnt_lock);
   1974 				if (net->net_status &
   1975 				    NFS4_EPHEMERAL_TREE_INVALID ||
   1976 				    (!is_derooting && eph == NULL)) {
   1977 					nfs4_ephemeral_tree_decr(net);
   1978 					mutex_exit(&net->net_cnt_lock);
   1979 					mutex_exit(&net->net_tree_lock);
   1980 					*pmust_rele = FALSE;
   1981 					goto is_busy;
   1982 				}
   1983 				mutex_exit(&net->net_cnt_lock);
   1984 				*pmust_unlock = TRUE;
   1985 			}
   1986 		}
   1987 	}
   1988 
   1989 	/*
   1990 	 * Only once we have grabbed the lock can we mark what we
   1991 	 * are planning on doing to the ephemeral tree.
   1992 	 */
   1993 	if (*pmust_unlock) {
   1994 		mutex_enter(&net->net_cnt_lock);
   1995 		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
   1996 
   1997 		/*
   1998 		 * Check to see if we are nuking the root.
   1999 		 */
   2000 		if (is_derooting)
   2001 			net->net_status |=
   2002 			    NFS4_EPHEMERAL_TREE_DEROOTING;
   2003 		mutex_exit(&net->net_cnt_lock);
   2004 	}
   2005 
   2006 	if (!is_derooting) {
   2007 		/*
   2008 		 * Only work on children if the caller has not already
   2009 		 * done so.
   2010 		 */
   2011 		if (!is_recursed) {
   2012 			ASSERT(eph != NULL);
   2013 
   2014 			error = nfs4_ephemeral_unmount_engine(eph,
   2015 			    FALSE, flag, cr);
   2016 			if (error)
   2017 				goto is_busy;
   2018 		}
   2019 	} else {
   2020 		eph = net->net_root;
   2021 
   2022 		/*
   2023 		 * Only work if there is something there.
   2024 		 */
   2025 		if (eph) {
   2026 			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
   2027 			    flag, cr);
   2028 			if (error) {
   2029 				mutex_enter(&net->net_cnt_lock);
   2030 				net->net_status &=
   2031 				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
   2032 				mutex_exit(&net->net_cnt_lock);
   2033 				goto is_busy;
   2034 			}
   2035 
   2036 			/*
   2037 			 * Nothing else which goes wrong will
   2038 			 * invalidate the blowing away of the
   2039 			 * ephmeral tree.
   2040 			 */
   2041 			net->net_root = NULL;
   2042 		}
   2043 
   2044 		/*
   2045 		 * We have derooted and we have caused the tree to be
   2046 		 * invalidated.
   2047 		 */
   2048 		mutex_enter(&net->net_cnt_lock);
   2049 		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
   2050 		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
   2051 		if (was_locked == FALSE)
   2052 			nfs4_ephemeral_tree_decr(net);
   2053 		mutex_exit(&net->net_cnt_lock);
   2054 
   2055 		if (was_locked == FALSE)
   2056 			mutex_exit(&net->net_tree_lock);
   2057 
   2058 		/*
   2059 		 * We have just blown away any notation of this
   2060 		 * tree being locked. We can't let the caller
   2061 		 * try to clean things up.
   2062 		 */
   2063 		*pmust_unlock = FALSE;
   2064 
   2065 		/*
   2066 		 * At this point, the tree should no longer be
   2067 		 * associated with the mntinfo4. We need to pull
   2068 		 * it off there and let the harvester take
   2069 		 * care of it once the refcnt drops.
   2070 		 */
   2071 		mutex_enter(&mi->mi_lock);
   2072 		mi->mi_ephemeral_tree = NULL;
   2073 		mutex_exit(&mi->mi_lock);
   2074 	}
   2075 
   2076 	return (0);
   2077 
   2078 is_busy:
   2079 
   2080 	nfs4_ephemeral_umount_unlock(pmust_unlock, pmust_rele,
   2081 	    pnet);
   2082 
   2083 	return (error);
   2084 }
   2085 
   2086 /*
   2087  * Do the umount and record any error in the parent.
   2088  */
   2089 static void
   2090 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
   2091     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
   2092 {
   2093 	int	error;
   2094 
   2095 	error = umount2_engine(vfsp, flag, kcred, FALSE);
   2096 	if (error) {
   2097 		if (prior) {
   2098 			if (prior->ne_child == e)
   2099 				prior->ne_state |=
   2100 				    NFS4_EPHEMERAL_CHILD_ERROR;
   2101 			else
   2102 				prior->ne_state |=
   2103 				    NFS4_EPHEMERAL_PEER_ERROR;
   2104 		}
   2105 	}
   2106 }
   2107 
   2108 /*
   2109  * For each tree in the forest (where the forest is in
   2110  * effect all of the ephemeral trees for this zone),
   2111  * scan to see if a node can be unmounted. Note that
   2112  * unlike nfs4_ephemeral_unmount_engine(), we do
   2113  * not process the current node before children or
   2114  * siblings. I.e., if a node can be unmounted, we
   2115  * do not recursively check to see if the nodes
   2116  * hanging off of it can also be unmounted.
   2117  *
   2118  * Instead, we delve down deep to try and remove the
   2119  * children first. Then, because we share code with
   2120  * nfs4_ephemeral_unmount_engine(), we will try
   2121  * them again. This could be a performance issue in
   2122  * the future.
   2123  *
   2124  * Also note that unlike nfs4_ephemeral_unmount_engine(),
   2125  * we do not halt on an error. We will not remove the
   2126  * current node, but we will keep on trying to remove
   2127  * the others.
   2128  *
   2129  * force indicates that we want the unmount to occur
   2130  * even if there is something blocking it.
   2131  *
   2132  * time_check indicates that we want to see if the
   2133  * mount has expired past mount_to or not. Typically
   2134  * we want to do this and only on a shutdown of the
   2135  * zone would we want to ignore the check.
         *
         * ntg supplies the forest list (ntg_forest) and the
         * ntg_forest_lock, which is held for the whole walk.
         * Trees found to be invalid with no reference other than
         * ours are unlinked during the walk and freed only after
         * ntg_forest_lock has been dropped.
   2136  */
   2137 static void
   2138 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
   2139     bool_t force, bool_t time_check)
   2140 {
   2141 	nfs4_ephemeral_tree_t	*net;
   2142 	nfs4_ephemeral_tree_t	*prev = NULL;
   2143 	nfs4_ephemeral_tree_t	*next;
   2144 	nfs4_ephemeral_t	*e;
   2145 	nfs4_ephemeral_t	*prior;
   2146 	time_t			now = gethrestime_sec();
   2147 
        	/* Local list of unlinked trees, freed at the very end. */
   2148 	nfs4_ephemeral_tree_t	*harvest = NULL;
   2149 
   2150 	int			flag;
   2151 
   2152 	mntinfo4_t		*mi;
   2153 	vfs_t			*vfsp;
   2154 
        	/*
        	 * MS_SYSSPACE is always passed; MS_FORCE is added only
        	 * when the caller asked for a forced unmount.
        	 */
   2155 	if (force)
   2156 		flag = MS_FORCE | MS_SYSSPACE;
   2157 	else
   2158 		flag = MS_SYSSPACE;
   2159 
   2160 	mutex_enter(&ntg->ntg_forest_lock);
   2161 	for (net = ntg->ntg_forest; net != NULL; net = next) {
        		/*
        		 * Remember the successor now: net->net_next is
        		 * overwritten if net is moved to the harvest list.
        		 */
   2162 		next = net->net_next;
   2163 
        		/*
        		 * Our hold on the tree; given back with
        		 * nfs4_ephemeral_tree_decr() at check_done.
        		 */
   2164 		nfs4_ephemeral_tree_hold(net);
   2165 
   2166 		mutex_enter(&net->net_tree_lock);
   2167 
   2168 		/*
   2169 		 * Let the unmount code know that the
   2170 		 * tree is already locked!
   2171 		 */
   2172 		mutex_enter(&net->net_cnt_lock);
   2173 		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
   2174 		mutex_exit(&net->net_cnt_lock);
   2175 
   2176 		/*
   2177 		 * If the intent is force all ephemeral nodes to
   2178 		 * be unmounted in this zone, we can short circuit a
   2179 		 * lot of tree traversal and simply zap the root node.
   2180 		 */
   2181 		if (force) {
   2182 			if (net->net_root) {
   2183 				mi = net->net_root->ne_mount;
   2184 				vfsp = mi->mi_vfsp;
   2185 
   2186 				/*
   2187 				 * Cleared by umount2_engine.
   2188 				 */
   2189 				VFS_HOLD(vfsp);
   2190 
        				/* Best effort: the result is ignored. */
   2191 				(void) umount2_engine(vfsp, flag,
   2192 				    kcred, FALSE);
   2193 
   2194 				goto check_done;
   2195 			}
   2196 		}
   2197 
        		/*
        		 * Iterative walk driven by ne_state. A node starts
        		 * in VISIT_CHILD (descend to ne_child if any), moves
        		 * to VISIT_SIBLING (step to ne_peer if any) and then
        		 * to PROCESS_ME, which falls through to the unmount
        		 * attempt at the bottom of the loop. After that we
        		 * step back to ne_prior; the loop ends when e becomes
        		 * NULL.
        		 */
   2198 		e = net->net_root;
   2199 		if (e)
   2200 			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
   2201 
   2202 		while (e) {
   2203 			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
   2204 				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
   2205 				if (e->ne_child) {
   2206 					e = e->ne_child;
   2207 					e->ne_state =
   2208 					    NFS4_EPHEMERAL_VISIT_CHILD;
   2209 				}
   2210 
   2211 				continue;
   2212 			} else if (e->ne_state ==
   2213 			    NFS4_EPHEMERAL_VISIT_SIBLING) {
   2214 				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
   2215 				if (e->ne_peer) {
   2216 					e = e->ne_peer;
   2217 					e->ne_state =
   2218 					    NFS4_EPHEMERAL_VISIT_CHILD;
   2219 				}
   2220 
   2221 				continue;
   2222 			} else if (e->ne_state ==
   2223 			    NFS4_EPHEMERAL_CHILD_ERROR) {
   2224 				prior = e->ne_prior;
   2225 
   2226 				/*
   2227 				 * If a child reported an error, do
   2228 				 * not bother trying to unmount.
   2229 				 *
   2230 				 * If your prior node is a parent,
   2231 				 * pass the error up such that they
   2232 				 * also do not try to unmount.
   2233 				 *
   2234 				 * However, if your prior is a sibling,
   2235 				 * let them try to unmount if they can.
   2236 				 */
   2237 				if (prior) {
   2238 					if (prior->ne_child == e)
   2239 						prior->ne_state |=
   2240 						    NFS4_EPHEMERAL_CHILD_ERROR;
   2241 					else
   2242 						prior->ne_state |=
   2243 						    NFS4_EPHEMERAL_PEER_ERROR;
   2244 				}
   2245 
   2246 				/*
   2247 				 * Clear the error and if needed, process peers.
   2248 				 *
   2249 				 * Once we mask out the error, we know whether
   2250 				 * or not we have to process another node.
   2251 				 */
   2252 				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
   2253 				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
   2254 					e = prior;
   2255 
   2256 				continue;
   2257 			} else if (e->ne_state ==
   2258 			    NFS4_EPHEMERAL_PEER_ERROR) {
   2259 				prior = e->ne_prior;
   2260 
        				/*
        				 * NOTE(review): the prior's state is
        				 * assigned here, whereas the CHILD_ERROR
        				 * branch above ORs the error in - confirm
        				 * the difference is intended.
        				 */
   2261 				if (prior) {
   2262 					if (prior->ne_child == e)
   2263 						prior->ne_state =
   2264 						    NFS4_EPHEMERAL_CHILD_ERROR;
   2265 					else
   2266 						prior->ne_state =
   2267 						    NFS4_EPHEMERAL_PEER_ERROR;
   2268 				}
   2269 
   2270 				/*
   2271 				 * Clear the error from this node and do the
   2272 				 * correct processing.
   2273 				 */
   2274 				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
   2275 				continue;
   2276 			}
   2277 
        			/*
        			 * Grab the prior node before the unmount attempt
        			 * and reset this node's walk state.
        			 */
   2278 			prior = e->ne_prior;
   2279 			e->ne_state = NFS4_EPHEMERAL_OK;
   2280 
   2281 			/*
   2282 			 * It must be the case that we need to process
   2283 			 * this node.
   2284 			 */
   2285 			if (!time_check ||
   2286 			    now - e->ne_ref_time > e->ne_mount_to) {
   2287 				mi = e->ne_mount;
   2288 				vfsp = mi->mi_vfsp;
   2289 
   2290 				/*
   2291 				 * Cleared by umount2_engine.
   2292 				 */
   2293 				VFS_HOLD(vfsp);
   2294 
   2295 				/*
   2296 				 * Note that we effectively work down to the
   2297 				 * leaf nodes first, try to unmount them,
   2298 				 * then work our way back up through their
   2299 				 * prior nodes.
   2300 				 *
   2301 				 * Also note that we deal with a lot of
   2302 				 * complexity by sharing the work with
   2303 				 * the manual unmount code.
   2304 				 */
   2305 				nfs4_ephemeral_record_umount(vfsp, flag,
   2306 				    e, prior);
   2307 			}
   2308 
   2309 			e = prior;
   2310 		}
   2311 
   2312 check_done:
   2313 
   2314 		/*
   2315 		 * At this point we are done processing this tree.
   2316 		 *
   2317 		 * If the tree is invalid and we are the only reference
   2318 		 * to it, then we push it on the local linked list
   2319 		 * to remove it at the end. We avoid that action now
   2320 		 * to keep the tree processing going along at a fair clip.
   2321 		 *
   2322 		 * Else, even if we are the only reference, we drop
   2323 		 * our hold on the current tree and allow it to be
   2324 		 * reused as needed.
   2325 		 */
   2326 		mutex_enter(&net->net_cnt_lock);
   2327 		if (net->net_refcnt == 1 &&
   2328 		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
   2329 			nfs4_ephemeral_tree_decr(net);
   2330 			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
   2331 			mutex_exit(&net->net_cnt_lock);
   2332 			mutex_exit(&net->net_tree_lock);
   2333 
        			/*
        			 * Unlink net from the forest (ntg_forest_lock
        			 * is still held) and push it on the harvest
        			 * list. prev is left alone since net is gone.
        			 */
   2334 			if (prev)
   2335 				prev->net_next = net->net_next;
   2336 			else
   2337 				ntg->ntg_forest = net->net_next;
   2338 
   2339 			net->net_next = harvest;
   2340 			harvest = net;
   2341 			continue;
   2342 		}
   2343 
   2344 		nfs4_ephemeral_tree_decr(net);
   2345 		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
   2346 		mutex_exit(&net->net_cnt_lock);
   2347 		mutex_exit(&net->net_tree_lock);
   2348 
   2349 		prev = net;
   2350 	}
   2351 	mutex_exit(&ntg->ntg_forest_lock);
   2352 
        	/*
        	 * Destroy and free the harvested trees now that the
        	 * forest lock is no longer held.
        	 */
   2353 	for (net = harvest; net != NULL; net = next) {
   2354 		next = net->net_next;
   2355 
   2356 		mutex_destroy(&net->net_tree_lock);
   2357 		mutex_destroy(&net->net_cnt_lock);
   2358 		kmem_free(net, sizeof (*net));
   2359 	}
   2360 }
   2361 
   2362 /*
   2363  * This is the thread which decides when the harvesting
   2364  * can proceed and when to kill it off for this zone.
   2365  */
   2366 static void
   2367 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
   2368 {
   2369 	clock_t		timeleft;
   2370 	zone_t		*zone = curproc->p_zone;
   2371 
   2372 	for (;;) {
   2373 		timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
   2374 		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
   2375 
   2376 		/*
   2377 		 * zone is exiting...
   2378 		 */
   2379 		if (timeleft != -1) {
   2380 			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
   2381 			zthread_exit();
   2382 			/* NOTREACHED */
   2383 		}
   2384 
   2385 		/*
   2386 		 * Only bother scanning if there is potential
   2387 		 * work to be done.
   2388 		 */
   2389 		if (ntg->ntg_forest == NULL)
   2390 			continue;
   2391 
   2392 		/*
   2393 		 * Now scan the list and get rid of everything which
   2394 		 * is old.
   2395 		 */
   2396 		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
   2397 	}
   2398 
   2399 	/* NOTREACHED */
   2400 }
   2401 
   2402 /*
   2403  * The zone specific glue needed to start the unmount harvester.
   2404  *
   2405  * Note that we want to avoid holding the mutex as long as possible,
   2406  * hence the multiple checks.
   2407  *
   2408  * The caller should avoid us getting down here in the first
   2409  * place.
   2410  */
   2411 static void
   2412 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
   2413 {
   2414 	/*
   2415 	 * It got started before we got here...
   2416 	 */
   2417 	if (ntg->ntg_thread_started)
   2418 		return;
   2419 
   2420 	mutex_enter(&nfs4_ephemeral_thread_lock);
   2421 
   2422 	if (ntg->ntg_thread_started) {
   2423 		mutex_exit(&nfs4_ephemeral_thread_lock);
   2424 		return;
   2425 	}
   2426 
   2427 	/*
   2428 	 * Start the unmounter harvester thread for this zone.
   2429 	 */
   2430 	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
   2431 	    ntg, 0, minclsyspri);
   2432 
   2433 	ntg->ntg_thread_started = TRUE;
   2434 	mutex_exit(&nfs4_ephemeral_thread_lock);
   2435 }
   2436 
   2437 /*ARGSUSED*/
   2438 static void *
   2439 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
   2440 {
   2441 	nfs4_trigger_globals_t	*ntg;
   2442 
   2443 	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
   2444 	ntg->ntg_thread_started = FALSE;
   2445 
   2446 	/*
   2447 	 * This is the default....
   2448 	 */
   2449 	ntg->ntg_mount_to = nfs4_trigger_thread_timer;
   2450 
   2451 	mutex_init(&ntg->ntg_forest_lock, NULL,
   2452 	    MUTEX_DEFAULT, NULL);
   2453 
   2454 	return (ntg);
   2455 }
   2456 
   2457 /*
   2458  * Try a nice gentle walk down the forest and convince
   2459  * all of the trees to gracefully give it up.
   2460  */
   2461 /*ARGSUSED*/
   2462 static void
   2463 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
   2464 {
   2465 	nfs4_trigger_globals_t	*ntg = arg;
   2466 
   2467 	if (!ntg)
   2468 		return;
   2469 
   2470 	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
   2471 }
   2472 
   2473 /*
   2474  * Race along the forest and rip all of the trees out by
   2475  * their rootballs!
   2476  */
   2477 /*ARGSUSED*/
   2478 static void
   2479 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
   2480 {
   2481 	nfs4_trigger_globals_t	*ntg = arg;
   2482 
   2483 	if (!ntg)
   2484 		return;
   2485 
   2486 	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
   2487 
   2488 	mutex_destroy(&ntg->ntg_forest_lock);
   2489 	kmem_free(ntg, sizeof (*ntg));
   2490 }
   2491 
   2492 /*
   2493  * This is the zone independent cleanup needed for
   2494  * ephemeral mount processing: the zone key is deleted and
         * the harvester thread lock is destroyed.
   2495  */
   2496 void
   2497 nfs4_ephemeral_fini(void)
   2498 {
        	/* The result of the key deletion is deliberately ignored. */
   2499 	(void) zone_key_delete(nfs4_ephemeral_key);
   2500 	mutex_destroy(&nfs4_ephemeral_thread_lock);
   2501 }
   2502 
   2503 /*
   2504  * This is the zone independent initialization needed for
   2505  * ephemeral mount processing: the harvester thread lock is
         * initialized and the zone key is created with its
         * create/shutdown/destroy callbacks.
   2506  */
   2507 void
   2508 nfs4_ephemeral_init(void)
   2509 {
   2510 	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
   2511 	    NULL);
   2512 
   2513 	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
   2514 	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
   2515 }
   2516 
   2517 /*
   2518  * nfssys() calls this function to set the per-zone
   2519  * value of mount_to to drive when an ephemeral mount is
   2520  * timed out. Each mount will grab a copy of this value
   2521  * when mounted.
   2522  */
   2523 void
   2524 nfs4_ephemeral_set_mount_to(uint_t mount_to)
   2525 {
   2526 	nfs4_trigger_globals_t	*ntg;
   2527 	zone_t			*zone = curproc->p_zone;
   2528 
   2529 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
   2530 
   2531 	ntg->ntg_mount_to = mount_to;
   2532 }
   2533 
   2534 /*
   2535  * Walk the list of v4 mount options; if they are currently set in vfsp,
   2536  * append them to a new comma-separated mount option string, and return it.
   2537  *
   2538  * Caller should free by calling nfs4_trigger_destroy_mntopts().
   2539  */
   2540 static char *
   2541 nfs4_trigger_create_mntopts(vfs_t *vfsp)
   2542 {
   2543 	uint_t i;
   2544 	char *mntopts;
   2545 	struct vfssw *vswp;
   2546 	mntopts_t *optproto;
   2547 
   2548 	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
   2549 
   2550 	/* get the list of applicable mount options for v4; locks *vswp */
   2551 	vswp = vfs_getvfssw(MNTTYPE_NFS4);
   2552 	optproto = &vswp->vsw_optproto;
   2553 
   2554 	for (i = 0; i < optproto->mo_count; i++) {
   2555 		struct mntopt *mop = &optproto->mo_list[i];
   2556 
   2557 		if (mop->mo_flags & MO_EMPTY)
   2558 			continue;
   2559 
   2560 		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
   2561 			kmem_free(mntopts, MAX_MNTOPT_STR);
   2562 			vfs_unrefvfssw(vswp);
   2563 			return (NULL);
   2564 		}
   2565 	}
   2566 
   2567 	vfs_unrefvfssw(vswp);
   2568 
   2569 	/*
   2570 	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
   2571 	 * and it may only be passed via MS_OPTIONSTR, so we
   2572 	 * must handle it here.
   2573 	 *
   2574 	 * Ideally, it would be in the list, but NFS does not specify its
   2575 	 * own opt proto list, it uses instead the default one. Since
   2576 	 * not all filesystems support extended attrs, it would not be
   2577 	 * appropriate to add it there.
   2578 	 */
   2579 	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
   2580 	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
   2581 		kmem_free(mntopts, MAX_MNTOPT_STR);
   2582 		return (NULL);
   2583 	}
   2584 
   2585 	return (mntopts);
   2586 }
   2587 
   2588 static void
   2589 nfs4_trigger_destroy_mntopts(char *mntopts)
   2590 {
   2591 	if (mntopts)
   2592 		kmem_free(mntopts, MAX_MNTOPT_STR);
   2593 }
   2594 
   2595 /*
   2596  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
   2597  */
   2598 static int
   2599 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
   2600 {
   2601 	if (mntopts == NULL || optname == NULL || vfsp == NULL)
   2602 		return (EINVAL);
   2603 
   2604 	if (vfs_optionisset(vfsp, optname, NULL)) {
   2605 		size_t mntoptslen = strlen(mntopts);
   2606 		size_t optnamelen = strlen(optname);
   2607 
   2608 		/* +1 for ',', +1 for NUL */
   2609 		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
   2610 			return (EOVERFLOW);
   2611 
   2612 		/* first or subsequent mount option? */
   2613 		if (*mntopts != '\0')
   2614 			(void) strcat(mntopts, ",");
   2615 
   2616 		(void) strcat(mntopts, optname);
   2617 	}
   2618 
   2619 	return (0);
   2620 }
   2621 
   2622 static enum clnt_stat
   2623 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
   2624 {
   2625 	int retries, error;
   2626 	uint_t max_msgsize;
   2627 	enum clnt_stat status;
   2628 	CLIENT *cl;
   2629 	struct timeval timeout;
   2630 
   2631 	/* as per recov_newserver() */
   2632 	max_msgsize = 0;
   2633 	retries = 1;
   2634 	timeout.tv_sec = 2;
   2635 	timeout.tv_usec = 0;
   2636 
   2637 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM,
   2638 	    NFS_V4, max_msgsize, retries, CRED(), &cl);
   2639 	if (error)
   2640 		return (RPC_FAILED);
   2641 
   2642 	if (nointr)
   2643 		cl->cl_nosignal = TRUE;
   2644 	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
   2645 	    timeout);
   2646 	if (nointr)
   2647 		cl->cl_nosignal = FALSE;
   2648 
   2649 	AUTH_DESTROY(cl->cl_auth);
   2650 	CLNT_DESTROY(cl);
   2651 
   2652 	return (status);
   2653 }
   2654