Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  *	Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
     28  *	All Rights Reserved
     29  */
     30 
     31 #include <sys/param.h>
     32 #include <sys/types.h>
     33 #include <sys/systm.h>
     34 #include <sys/cred.h>
     35 #include <sys/time.h>
     36 #include <sys/vnode.h>
     37 #include <sys/vfs.h>
     38 #include <sys/vfs_opreg.h>
     39 #include <sys/file.h>
     40 #include <sys/filio.h>
     41 #include <sys/uio.h>
     42 #include <sys/buf.h>
     43 #include <sys/mman.h>
     44 #include <sys/pathname.h>
     45 #include <sys/dirent.h>
     46 #include <sys/debug.h>
     47 #include <sys/vmsystm.h>
     48 #include <sys/fcntl.h>
     49 #include <sys/flock.h>
     50 #include <sys/swap.h>
     51 #include <sys/errno.h>
     52 #include <sys/strsubr.h>
     53 #include <sys/sysmacros.h>
     54 #include <sys/kmem.h>
     55 #include <sys/cmn_err.h>
     56 #include <sys/pathconf.h>
     57 #include <sys/utsname.h>
     58 #include <sys/dnlc.h>
     59 #include <sys/acl.h>
     60 #include <sys/systeminfo.h>
     61 #include <sys/policy.h>
     62 #include <sys/sdt.h>
     63 #include <sys/list.h>
     64 #include <sys/stat.h>
     65 #include <sys/zone.h>
     66 
     67 #include <rpc/types.h>
     68 #include <rpc/auth.h>
     69 #include <rpc/clnt.h>
     70 
     71 #include <nfs/nfs.h>
     72 #include <nfs/nfs_clnt.h>
     73 #include <nfs/nfs_acl.h>
     74 #include <nfs/lm.h>
     75 #include <nfs/nfs4.h>
     76 #include <nfs/nfs4_kprot.h>
     77 #include <nfs/rnode4.h>
     78 #include <nfs/nfs4_clnt.h>
     79 
     80 #include <vm/hat.h>
     81 #include <vm/as.h>
     82 #include <vm/page.h>
     83 #include <vm/pvn.h>
     84 #include <vm/seg.h>
     85 #include <vm/seg_map.h>
     86 #include <vm/seg_kpm.h>
     87 #include <vm/seg_vn.h>
     88 
     89 #include <fs/fs_subr.h>
     90 
     91 #include <sys/ddi.h>
     92 #include <sys/int_fmtio.h>
     93 
/*
 * Directory attribute bundle handed to nfs4_update_dircaches() (see its
 * prototype in this file).
 */
typedef struct {
	/* GETATTR results; presumably for the directory -- confirm at callers */
	nfs4_ga_res_t	*di_garp;
	/* credential; presumably the one used for the OTW call -- confirm */
	cred_t		*di_cred;
	/* hrtime_t timestamp; presumably when the call was issued -- confirm */
	hrtime_t	di_time_call;
} dirattr_info_t;
     99 
/*
 * Kind of ACL request (get vs. set); this is the second argument of
 * nfs4_is_acl_mask_valid().
 */
typedef enum nfs4_acl_op {
	NFS4_ACL_GET,
	NFS4_ACL_SET
} nfs4_acl_op_t;
    104 
    105 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi);
    106 
    107 static void	nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
    108 			char *, dirattr_info_t *);
    109 
    110 static void	nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
    111 		    nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
    112 		    nfs4_error_t *, int *);
    113 static int	nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
    114 			cred_t *);
    115 static int	nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
    116 			stable_how4 *);
    117 static int	nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
    118 			cred_t *, bool_t, struct uio *);
    119 static int	nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
    120 			vsecattr_t *);
    121 static int	nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
    122 static int	nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
    123 static int	nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
    124 static int	nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
    125 static int	nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
    126 static int	nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
    127 			int, vnode_t **, cred_t *);
    128 static int	nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
    129 			cred_t *, int, int, enum createmode4, int);
    130 static int	nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
    131 			caller_context_t *);
    132 static int	nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
    133 			vnode_t *, char *, cred_t *, nfsstat4 *);
    134 static int	nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
    135 			vnode_t *, char *, cred_t *, nfsstat4 *);
    136 static int	do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
    137 static void	nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
    138 static int	nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
    139 static int	nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
    140 			page_t *[], size_t, struct seg *, caddr_t,
    141 			enum seg_rw, cred_t *);
    142 static void	nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
    143 			cred_t *);
    144 static int	nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
    145 			int, cred_t *);
    146 static int	nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
    147 			int, cred_t *);
    148 static int	nfs4_commit(vnode_t *, offset4, count4, cred_t *);
    149 static void	nfs4_set_mod(vnode_t *);
    150 static void	nfs4_get_commit(vnode_t *);
    151 static void	nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
    152 static int	nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
    153 static int	nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
    154 static int	nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
    155 			cred_t *);
    156 static void	do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
    157 			cred_t *);
    158 static int	nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
    159 			hrtime_t, vnode_t *, cred_t *);
    160 static int	nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
    161 static int	nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
    162 static void	nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
    163 			u_offset_t);
    164 static int 	nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
    165 static int	nfs4_block_and_wait(clock_t *, rnode4_t *);
    166 static cred_t  *state_to_cred(nfs4_open_stream_t *);
    167 static int	vtoname(vnode_t *, char *, ssize_t);
    168 static void	denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
    169 static pid_t	lo_to_pid(lock_owner4 *);
    170 static void	nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
    171 			cred_t *, nfs4_lock_owner_t *);
    172 static void	push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
    173 			nfs4_lock_owner_t *);
    174 static int 	open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
    175 static void	nfs4_delmap_callback(struct as *, void *, uint_t);
    176 static void	nfs4_free_delmapcall(nfs4_delmapcall_t *);
    177 static nfs4_delmapcall_t	*nfs4_init_delmapcall();
    178 static int	nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
    179 static int	nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
    180 static int	nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
    181 			uid_t, gid_t, int);
    182 
    183 /*
    184  * Routines that implement the setting of v4 args for the misc. ops
    185  */
    186 static void	nfs4args_lock_free(nfs_argop4 *);
    187 static void	nfs4args_lockt_free(nfs_argop4 *);
    188 static void	nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
    189 			int, rnode4_t *, cred_t *, bitmap4, int *,
    190 			nfs4_stateid_types_t *);
    191 static void	nfs4args_setattr_free(nfs_argop4 *);
    192 static int	nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
    193 			bitmap4);
    194 static void	nfs4args_verify_free(nfs_argop4 *);
    195 static void	nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
    196 			WRITE4args **, nfs4_stateid_types_t *);
    197 
    198 /*
    199  * These are the vnode ops functions that implement the vnode interface to
    200  * the networked file system.  See more comments below at nfs4_vnodeops.
    201  */
    202 static int	nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
    203 static int	nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
    204 			caller_context_t *);
    205 static int	nfs4_read(vnode_t *, struct uio *, int, cred_t *,
    206 			caller_context_t *);
    207 static int	nfs4_write(vnode_t *, struct uio *, int, cred_t *,
    208 			caller_context_t *);
    209 static int	nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
    210 			caller_context_t *);
    211 static int	nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
    212 			caller_context_t *);
    213 static int	nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
    214 static int	nfs4_readlink(vnode_t *, struct uio *, cred_t *,
    215 			caller_context_t *);
    216 static int	nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
    217 static int	nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
    218 			int, vnode_t **, cred_t *, int, caller_context_t *,
    219 			vsecattr_t *);
    220 static int	nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
    221 			int);
    222 static int	nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
    223 			caller_context_t *, int);
    224 static int	nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
    225 			caller_context_t *, int);
    226 static int	nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
    227 			cred_t *, caller_context_t *, int, vsecattr_t *);
    228 static int	nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
    229 			caller_context_t *, int);
    230 static int	nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
    231 			cred_t *, caller_context_t *, int);
    232 static int	nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
    233 			caller_context_t *, int);
    234 static int	nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
    235 static int	nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
    236 			page_t *[], size_t, struct seg *, caddr_t,
    237 			enum seg_rw, cred_t *, caller_context_t *);
    238 static int	nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
    239 			caller_context_t *);
    240 static int	nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
    241 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
    242 static int	nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
    243 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
    244 static int	nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
    245 static int	nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
    246 			struct flk_callback *, cred_t *, caller_context_t *);
    247 static int	nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
    248 			cred_t *, caller_context_t *);
    249 static int	nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
    250 			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
    251 static int	nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
    252 			cred_t *, caller_context_t *);
    253 static void	nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
    254 			caller_context_t *);
    255 static int	nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
    256 			caller_context_t *);
    257 /*
    258  * These vnode ops are required to be called from outside this source file,
    259  * e.g. by ephemeral mount stub vnode ops, and so may not be declared
    260  * as static.
    261  */
    262 int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
    263 	    caller_context_t *);
    264 void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
    265 int	nfs4_lookup(vnode_t *, char *, vnode_t **,
    266 	    struct pathname *, int, vnode_t *, cred_t *,
    267 	    caller_context_t *, int *, pathname_t *);
    268 int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
    269 int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
    270 void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
    271 int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
    272 int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
    273 	    caller_context_t *);
    274 int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
    275 	    caller_context_t *);
    276 int	nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
    277 	    caller_context_t *);
    278 
    279 /*
    280  * Used for nfs4_commit_vp() to indicate if we should
    281  * wait on pending writes.
    282  */
    283 #define	NFS4_WRITE_NOWAIT	0
    284 #define	NFS4_WRITE_WAIT		1
    285 
    286 #define	NFS4_BASE_WAIT_TIME 1	/* 1 second */
    287 
    288 /*
    289  * Error flags used to pass information about certain special errors
    290  * which need to be handled specially.
    291  */
    292 #define	NFS_EOF			-98
    293 #define	NFS_VERF_MISMATCH	-97
    294 
    295 /*
    296  * Flags used to differentiate between which operation drove the
    297  * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
    298  */
    299 #define	NFS4_CLOSE_OP		0x1
    300 #define	NFS4_DELMAP_OP		0x2
    301 #define	NFS4_INACTIVE_OP	0x3
    302 
    303 #define	ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))
    304 
    305 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */
    306 #define	ALIGN64(x, ptr, sz)						\
    307 	x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);		\
    308 	if (x) {							\
    309 		x = sizeof (uint64_t) - (x);				\
    310 		sz -= (x);						\
    311 		ptr += (x);						\
    312 	}
    313 
/*
 * Debug-only switches and counters; none of these exist unless the file
 * is built with DEBUG defined.  The *_debug flags are handed to
 * NFS4_DEBUG() as its first argument (for example
 * nfs4_client_state_debug in nfs4_open() and nfs4_lost_rqst_debug in
 * nfs4open_save_lost_rqst()).  Non-static ones are visible to other
 * source files.
 */
#ifdef DEBUG
int nfs4_client_attr_debug = 0;
int nfs4_client_state_debug = 0;
int nfs4_client_shadow_debug = 0;
int nfs4_client_lock_debug = 0;
int nfs4_seqid_sync = 0;
int nfs4_client_map_debug = 0;
static int nfs4_pageio_debug = 0;
int nfs4_client_inactive_debug = 0;
int nfs4_client_recov_debug = 0;
int nfs4_client_failover_debug = 0;
int nfs4_client_call_debug = 0;
int nfs4_client_lookup_debug = 0;
int nfs4_client_zone_debug = 0;
int nfs4_lost_rqst_debug = 0;
int nfs4_rdattrerr_debug = 0;
int nfs4_open_stream_debug = 0;

/* presumably an error-injection knob for nfs4read() -- confirm at its use */
int nfs4read_error_inject;

static int nfs4_create_misses = 0;

static int nfs4_readdir_cache_shorts = 0;
static int nfs4_readdir_readahead = 0;

static int nfs4_bio_do_stop = 0;

static int nfs4_lostpage = 0;	/* number of times we lost original page */

int nfs4_mmap_debug = 0;

static int nfs4_pathconf_cache_hits = 0;
static int nfs4_pathconf_cache_misses = 0;

int nfs4close_all_cnt;
int nfs4close_one_debug = 0;
int nfs4close_notw_debug = 0;

int denied_to_flk_debug = 0;
void *lockt_denied_debug;

#endif
    356 
/*
 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
 * or NFS4ERR_RESOURCE.  Value is in seconds.
 */
static int confirm_retry_sec = 30;

/* presumably enables negative name-lookup caching when non-zero -- confirm */
static int nfs4_lookup_neg_cache = 1;

/*
 * number of pages to read ahead
 * optimized for 100 base-T.
 */
static int nfs4_nra = 4;

/* presumably enables caching of symlink contents when non-zero -- confirm */
static int nfs4_do_symlink_cache = 1;

/* presumably bypasses the pathconf cache when non-zero -- confirm */
static int nfs4_pathconf_disable_cache = 0;
    374 
    375 /*
    376  * These are the vnode ops routines which implement the vnode interface to
    377  * the networked file system.  These routines just take their parameters,
    378  * make them look networkish by putting the right info into interface structs,
    379  * and then calling the appropriate remote routine(s) to do the work.
    380  *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
    382  * we purge the directory cache relative to that vnode.  This way, the
    383  * user won't get burned by the cache repeatedly.  See <nfs/rnode4.h> for
    384  * more details on rnode locking.
    385  */
    386 
/* The NFSv4 vnode operations vector; handed out by nfs4_getvnodeops(). */
struct vnodeops *nfs4_vnodeops;

/*
 * Table pairing each VOPNAME_* operation name with the routine that
 * implements it for NFSv4.  The list is terminated by a NULL/NULL entry.
 */
const fs_operation_def_t nfs4_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs4_close },
	VOPNAME_READ,		{ .vop_read = nfs4_read },
	VOPNAME_WRITE,		{ .vop_write = nfs4_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs4_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs4_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs4_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs4_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs4_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs4_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs4_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs4_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs4_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs4_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs4_delmap },
	/* no separate nfs4_dump */
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs4_pageio },
	VOPNAME_DISPOSE,	{ .vop_dispose = nfs4_dispose },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs4_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs4_shrlock },
	/* vnode events are handled by the generic fs_vnevent_support */
	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
    433 
    434 /*
    435  * The following are subroutines and definitions to set args or get res
    436  * for the different nfsv4 ops
    437  */
    438 
    439 void
    440 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
    441 {
    442 	int		i;
    443 
    444 	for (i = 0; i < arglen; i++) {
    445 		if (argop[i].argop == OP_LOOKUP) {
    446 			kmem_free(
    447 			    argop[i].nfs_argop4_u.oplookup.
    448 			    objname.utf8string_val,
    449 			    argop[i].nfs_argop4_u.oplookup.
    450 			    objname.utf8string_len);
    451 		}
    452 	}
    453 }
    454 
    455 static void
    456 nfs4args_lock_free(nfs_argop4 *argop)
    457 {
    458 	locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
    459 
    460 	if (locker->new_lock_owner == TRUE) {
    461 		open_to_lock_owner4 *open_owner;
    462 
    463 		open_owner = &locker->locker4_u.open_owner;
    464 		if (open_owner->lock_owner.owner_val != NULL) {
    465 			kmem_free(open_owner->lock_owner.owner_val,
    466 			    open_owner->lock_owner.owner_len);
    467 		}
    468 	}
    469 }
    470 
    471 static void
    472 nfs4args_lockt_free(nfs_argop4 *argop)
    473 {
    474 	lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
    475 
    476 	if (lowner->owner_val != NULL) {
    477 		kmem_free(lowner->owner_val, lowner->owner_len);
    478 	}
    479 }
    480 
    481 static void
    482 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
    483     rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
    484     nfs4_stateid_types_t *sid_types)
    485 {
    486 	fattr4		*attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
    487 	mntinfo4_t	*mi;
    488 
    489 	argop->argop = OP_SETATTR;
    490 	/*
    491 	 * The stateid is set to 0 if client is not modifying the size
    492 	 * and otherwise to whatever nfs4_get_stateid() returns.
    493 	 *
    494 	 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
    495 	 * state struct could be found for the process/file pair.  We may
    496 	 * want to change this in the future (by OPENing the file).  See
    497 	 * bug # 4474852.
    498 	 */
    499 	if (vap->va_mask & AT_SIZE) {
    500 
    501 		ASSERT(rp != NULL);
    502 		mi = VTOMI4(RTOV4(rp));
    503 
    504 		argop->nfs_argop4_u.opsetattr.stateid =
    505 		    nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
    506 		    OP_SETATTR, sid_types, FALSE);
    507 	} else {
    508 		bzero(&argop->nfs_argop4_u.opsetattr.stateid,
    509 		    sizeof (stateid4));
    510 	}
    511 
    512 	*error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
    513 	if (*error)
    514 		bzero(attr, sizeof (*attr));
    515 }
    516 
    517 static void
    518 nfs4args_setattr_free(nfs_argop4 *argop)
    519 {
    520 	nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
    521 }
    522 
    523 static int
    524 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
    525     bitmap4 supp)
    526 {
    527 	fattr4 *attr;
    528 	int error = 0;
    529 
    530 	argop->argop = op;
    531 	switch (op) {
    532 	case OP_VERIFY:
    533 		attr = &argop->nfs_argop4_u.opverify.obj_attributes;
    534 		break;
    535 	case OP_NVERIFY:
    536 		attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
    537 		break;
    538 	default:
    539 		return (EINVAL);
    540 	}
    541 	if (!error)
    542 		error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
    543 	if (error)
    544 		bzero(attr, sizeof (*attr));
    545 	return (error);
    546 }
    547 
    548 static void
    549 nfs4args_verify_free(nfs_argop4 *argop)
    550 {
    551 	switch (argop->argop) {
    552 	case OP_VERIFY:
    553 		nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
    554 		break;
    555 	case OP_NVERIFY:
    556 		nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
    557 		break;
    558 	default:
    559 		break;
    560 	}
    561 }
    562 
    563 static void
    564 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
    565     WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
    566 {
    567 	WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
    568 	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
    569 
    570 	argop->argop = OP_WRITE;
    571 	wargs->stable = stable;
    572 	wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
    573 	    mi, OP_WRITE, sid_tp);
    574 	wargs->mblk = NULL;
    575 	*wargs_pp = wargs;
    576 }
    577 
    578 void
    579 nfs4args_copen_free(OPEN4cargs *open_args)
    580 {
    581 	if (open_args->owner.owner_val) {
    582 		kmem_free(open_args->owner.owner_val,
    583 		    open_args->owner.owner_len);
    584 	}
    585 	if ((open_args->opentype == OPEN4_CREATE) &&
    586 	    (open_args->mode != EXCLUSIVE4)) {
    587 		nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
    588 	}
    589 }
    590 
    591 /*
    592  * XXX:  This is referenced in modstubs.s
    593  */
    594 struct vnodeops *
    595 nfs4_getvnodeops(void)
    596 {
    597 	return (nfs4_vnodeops);
    598 }
    599 
    600 /*
    601  * The OPEN operation opens a regular file.
    602  */
    603 /*ARGSUSED3*/
    604 static int
    605 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
    606 {
    607 	vnode_t *dvp = NULL;
    608 	rnode4_t *rp, *drp;
    609 	int error;
    610 	int just_been_created;
    611 	char fn[MAXNAMELEN];
    612 
    613 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
    614 	if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
    615 		return (EIO);
    616 	rp = VTOR4(*vpp);
    617 
    618 	/*
    619 	 * Check to see if opening something besides a regular file;
    620 	 * if so skip the OTW call
    621 	 */
    622 	if ((*vpp)->v_type != VREG) {
    623 		error = nfs4_open_non_reg_file(vpp, flag, cr);
    624 		return (error);
    625 	}
    626 
    627 	/*
    628 	 * XXX - would like a check right here to know if the file is
    629 	 * executable or not, so as to skip OTW
    630 	 */
    631 
    632 	if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
    633 		return (error);
    634 
    635 	drp = VTOR4(dvp);
    636 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
    637 		return (EINTR);
    638 
    639 	if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
    640 		nfs_rw_exit(&drp->r_rwlock);
    641 		return (error);
    642 	}
    643 
    644 	/*
    645 	 * See if this file has just been CREATEd.
    646 	 * If so, clear the flag and update the dnlc, which was previously
    647 	 * skipped in nfs4_create.
    648 	 * XXX need better serilization on this.
    649 	 * XXX move this into the nf4open_otw call, after we have
    650 	 * XXX acquired the open owner seqid sync.
    651 	 */
    652 	mutex_enter(&rp->r_statev4_lock);
    653 	if (rp->created_v4) {
    654 		rp->created_v4 = 0;
    655 		mutex_exit(&rp->r_statev4_lock);
    656 
    657 		dnlc_update(dvp, fn, *vpp);
    658 		/* This is needed so we don't bump the open ref count */
    659 		just_been_created = 1;
    660 	} else {
    661 		mutex_exit(&rp->r_statev4_lock);
    662 		just_been_created = 0;
    663 	}
    664 
    665 	/*
    666 	 * If caller specified O_TRUNC/FTRUNC, then be sure to set
    667 	 * FWRITE (to drive successful setattr(size=0) after open)
    668 	 */
    669 	if (flag & FTRUNC)
    670 		flag |= FWRITE;
    671 
    672 	error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
    673 	    just_been_created);
    674 
    675 	if (!error && !((*vpp)->v_flag & VROOT))
    676 		dnlc_update(dvp, fn, *vpp);
    677 
    678 	nfs_rw_exit(&drp->r_rwlock);
    679 
    680 	/* release the hold from vtodv */
    681 	VN_RELE(dvp);
    682 
    683 	/* exchange the shadow for the master vnode, if needed */
    684 
    685 	if (error == 0 && IS_SHADOW(*vpp, rp))
    686 		sv_exchange(vpp);
    687 
    688 	return (error);
    689 }
    690 
    691 /*
    692  * See if there's a "lost open" request to be saved and recovered.
    693  */
    694 static void
    695 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
    696     nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
    697     vnode_t *dvp, OPEN4cargs *open_args)
    698 {
    699 	vfs_t *vfsp;
    700 	char *srccfp;
    701 
    702 	vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
    703 
    704 	if (error != ETIMEDOUT && error != EINTR &&
    705 	    !NFS4_FRC_UNMT_ERR(error, vfsp)) {
    706 		lost_rqstp->lr_op = 0;
    707 		return;
    708 	}
    709 
    710 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
    711 	    "nfs4open_save_lost_rqst: error %d", error));
    712 
    713 	lost_rqstp->lr_op = OP_OPEN;
    714 
    715 	/*
    716 	 * The vp (if it is not NULL) and dvp are held and rele'd via
    717 	 * the recovery code.  See nfs4_save_lost_rqst.
    718 	 */
    719 	lost_rqstp->lr_vp = vp;
    720 	lost_rqstp->lr_dvp = dvp;
    721 	lost_rqstp->lr_oop = oop;
    722 	lost_rqstp->lr_osp = NULL;
    723 	lost_rqstp->lr_lop = NULL;
    724 	lost_rqstp->lr_cr = cr;
    725 	lost_rqstp->lr_flk = NULL;
    726 	lost_rqstp->lr_oacc = open_args->share_access;
    727 	lost_rqstp->lr_odeny = open_args->share_deny;
    728 	lost_rqstp->lr_oclaim = open_args->claim;
    729 	if (open_args->claim == CLAIM_DELEGATE_CUR) {
    730 		lost_rqstp->lr_ostateid =
    731 		    open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
    732 		srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
    733 	} else {
    734 		srccfp = open_args->open_claim4_u.cfile;
    735 	}
    736 	lost_rqstp->lr_ofile.utf8string_len = 0;
    737 	lost_rqstp->lr_ofile.utf8string_val = NULL;
    738 	(void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
    739 	lost_rqstp->lr_putfirst = FALSE;
    740 }
    741 
/*
 * Seconds/nanoseconds pair.  nfs4open_otw() declares a local "verf" of
 * this type; presumably it is used to build the exclusive-create
 * verifier -- confirm in that function.
 */
struct nfs4_excl_time {
	uint32 seconds;
	uint32 nseconds;
};
    746 
    747 /*
    748  * The OPEN operation creates and/or opens a regular file
    749  *
    750  * ARGSUSED
    751  */
    752 static int
    753 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
    754     vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
    755     enum createmode4 createmode, int file_just_been_created)
    756 {
    757 	rnode4_t *rp;
    758 	rnode4_t *drp = VTOR4(dvp);
    759 	vnode_t *vp = NULL;
    760 	vnode_t *vpi = *vpp;
    761 	bool_t needrecov = FALSE;
    762 
    763 	int doqueue = 1;
    764 
    765 	COMPOUND4args_clnt args;
    766 	COMPOUND4res_clnt res;
    767 	nfs_argop4 *argop;
    768 	nfs_resop4 *resop;
    769 	int argoplist_size;
    770 	int idx_open, idx_fattr;
    771 
    772 	GETFH4res *gf_res = NULL;
    773 	OPEN4res *op_res = NULL;
    774 	nfs4_ga_res_t *garp;
    775 	fattr4 *attr = NULL;
    776 	struct nfs4_excl_time verf;
    777 	bool_t did_excl_setup = FALSE;
    778 	int created_osp;
    779 
    780 	OPEN4cargs *open_args;
    781 	nfs4_open_owner_t	*oop = NULL;
    782 	nfs4_open_stream_t	*osp = NULL;
    783 	seqid4 seqid = 0;
    784 	bool_t retry_open = FALSE;
    785 	nfs4_recov_state_t recov_state;
    786 	nfs4_lost_rqst_t lost_rqst;
    787 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
    788 	hrtime_t t;
    789 	int acc = 0;
    790 	cred_t *cred_otw = NULL;	/* cred used to do the RPC call */
    791 	cred_t *ncr = NULL;
    792 
    793 	nfs4_sharedfh_t *otw_sfh;
    794 	nfs4_sharedfh_t *orig_sfh;
    795 	int fh_differs = 0;
    796 	int numops, setgid_flag;
    797 	int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
    798 
    799 	/*
    800 	 * Make sure we properly deal with setting the right gid on
    801 	 * a newly created file to reflect the parent's setgid bit
    802 	 */
    803 	setgid_flag = 0;
    804 	if (create_flag && in_va) {
    805 
    806 		/*
    807 		 * If there is grpid mount flag used or
    808 		 * the parent's directory has the setgid bit set
    809 		 * _and_ the client was able to get a valid mapping
    810 		 * for the parent dir's owner_group, we want to
    811 		 * append NVERIFY(owner_group == dva.va_gid) and
    812 		 * SETATTR to the CREATE compound.
    813 		 */
    814 		mutex_enter(&drp->r_statelock);
    815 		if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
    816 		    drp->r_attr.va_mode & VSGID) &&
    817 		    drp->r_attr.va_gid != GID_NOBODY) {
    818 			in_va->va_mask |= AT_GID;
    819 			in_va->va_gid = drp->r_attr.va_gid;
    820 			setgid_flag = 1;
    821 		}
    822 		mutex_exit(&drp->r_statelock);
    823 	}
    824 
    825 	/*
    826 	 * Normal/non-create compound:
    827 	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
    828 	 *
    829 	 * Open(create) compound no setgid:
    830 	 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
    831 	 * RESTOREFH + GETATTR
    832 	 *
    833 	 * Open(create) setgid:
    834 	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
    835 	 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
    836 	 * NVERIFY(grp) + SETATTR
    837 	 */
    838 	if (setgid_flag) {
    839 		numops = 10;
    840 		idx_open = 1;
    841 		idx_fattr = 3;
    842 	} else if (create_flag) {
    843 		numops = 7;
    844 		idx_open = 2;
    845 		idx_fattr = 4;
    846 	} else {
    847 		numops = 4;
    848 		idx_open = 1;
    849 		idx_fattr = 3;
    850 	}
    851 
    852 	args.array_len = numops;
    853 	argoplist_size = numops * sizeof (nfs_argop4);
    854 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
    855 
    856 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
    857 	    "open %s open flag 0x%x cred %p", file_name, open_flag,
    858 	    (void *)cr));
    859 
    860 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
    861 	if (create_flag) {
    862 		/*
    863 		 * We are to create a file.  Initialize the passed in vnode
    864 		 * pointer.
    865 		 */
    866 		vpi = NULL;
    867 	} else {
    868 		/*
    869 		 * Check to see if the client owns a read delegation and is
    870 		 * trying to open for write.  If so, then return the delegation
    871 		 * to avoid the server doing a cb_recall and returning DELAY.
    872 		 * NB - we don't use the statev4_lock here because we'd have
    873 		 * to drop the lock anyway and the result would be stale.
    874 		 */
    875 		if ((open_flag & FWRITE) &&
    876 		    VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
    877 			(void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
    878 
    879 		/*
    880 		 * If the file has a delegation, then do an access check up
    881 		 * front.  This avoids having to do an access check later after
    882 		 * we've already done start_op, which could deadlock.
    883 		 */
    884 		if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
    885 			if (open_flag & FREAD &&
    886 			    nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
    887 				acc |= VREAD;
    888 			if (open_flag & FWRITE &&
    889 			    nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
    890 				acc |= VWRITE;
    891 		}
    892 	}
    893 
    894 	drp = VTOR4(dvp);
    895 
    896 	recov_state.rs_flags = 0;
    897 	recov_state.rs_num_retry_despite_err = 0;
    898 	cred_otw = cr;
    899 
    900 recov_retry:
    901 	fh_differs = 0;
    902 	nfs4_error_zinit(&e);
    903 
    904 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
    905 	if (e.error) {
    906 		if (ncr != NULL)
    907 			crfree(ncr);
    908 		kmem_free(argop, argoplist_size);
    909 		return (e.error);
    910 	}
    911 
    912 	args.ctag = TAG_OPEN;
    913 	args.array_len = numops;
    914 	args.array = argop;
    915 
    916 	/* putfh directory fh */
    917 	argop[0].argop = OP_CPUTFH;
    918 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
    919 
    920 	/* OPEN: either op 1 or op 2 depending upon create/setgid flags */
    921 	argop[idx_open].argop = OP_COPEN;
    922 	open_args = &argop[idx_open].nfs_argop4_u.opcopen;
    923 	open_args->claim = CLAIM_NULL;
    924 
    925 	/* name of file */
    926 	open_args->open_claim4_u.cfile = file_name;
    927 	open_args->owner.owner_len = 0;
    928 	open_args->owner.owner_val = NULL;
    929 
    930 	if (create_flag) {
    931 		/* CREATE a file */
    932 		open_args->opentype = OPEN4_CREATE;
    933 		open_args->mode = createmode;
    934 		if (createmode == EXCLUSIVE4) {
    935 			if (did_excl_setup == FALSE) {
    936 				verf.seconds = zone_get_hostid(NULL);
    937 				if (verf.seconds != 0)
    938 					verf.nseconds = newnum();
    939 				else {
    940 					timestruc_t now;
    941 
    942 					gethrestime(&now);
    943 					verf.seconds = now.tv_sec;
    944 					verf.nseconds = now.tv_nsec;
    945 				}
    946 				/*
    947 				 * Since the server will use this value for the
    948 				 * mtime, make sure that it can't overflow. Zero
    949 				 * out the MSB. The actual value does not matter
    950 				 * here, only its uniqueness.
    951 				 */
    952 				verf.seconds &= INT32_MAX;
    953 				did_excl_setup = TRUE;
    954 			}
    955 
    956 			/* Now copy over verifier to OPEN4args. */
    957 			open_args->createhow4_u.createverf = *(uint64_t *)&verf;
    958 		} else {
    959 			int v_error;
    960 			bitmap4 supp_attrs;
    961 			servinfo4_t *svp;
    962 
    963 			attr = &open_args->createhow4_u.createattrs;
    964 
    965 			svp = drp->r_server;
    966 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
    967 			supp_attrs = svp->sv_supp_attrs;
    968 			nfs_rw_exit(&svp->sv_lock);
    969 
    970 			/* GUARDED4 or UNCHECKED4 */
    971 			v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
    972 			    supp_attrs);
    973 			if (v_error) {
    974 				bzero(attr, sizeof (*attr));
    975 				nfs4args_copen_free(open_args);
    976 				nfs4_end_op(VTOMI4(dvp), dvp, vpi,
    977 				    &recov_state, FALSE);
    978 				if (ncr != NULL)
    979 					crfree(ncr);
    980 				kmem_free(argop, argoplist_size);
    981 				return (v_error);
    982 			}
    983 		}
    984 	} else {
    985 		/* NO CREATE */
    986 		open_args->opentype = OPEN4_NOCREATE;
    987 	}
    988 
    989 	if (recov_state.rs_sp != NULL) {
    990 		mutex_enter(&recov_state.rs_sp->s_lock);
    991 		open_args->owner.clientid = recov_state.rs_sp->clientid;
    992 		mutex_exit(&recov_state.rs_sp->s_lock);
    993 	} else {
    994 		/* XXX should we just fail here? */
    995 		open_args->owner.clientid = 0;
    996 	}
    997 
    998 	/*
    999 	 * This increments oop's ref count or creates a temporary 'just_created'
   1000 	 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
   1001 	 * completes.
   1002 	 */
   1003 	mutex_enter(&VTOMI4(dvp)->mi_lock);
   1004 
   1005 	/* See if a permanent or just created open owner exists */
   1006 	oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
   1007 	if (!oop) {
   1008 		/*
   1009 		 * This open owner does not exist so create a temporary
   1010 		 * just created one.
   1011 		 */
   1012 		oop = create_open_owner(cr, VTOMI4(dvp));
   1013 		ASSERT(oop != NULL);
   1014 	}
   1015 	mutex_exit(&VTOMI4(dvp)->mi_lock);
   1016 
   1017 	/* this length never changes, do alloc before seqid sync */
   1018 	open_args->owner.owner_len = sizeof (oop->oo_name);
   1019 	open_args->owner.owner_val =
   1020 	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
   1021 
   1022 	e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
   1023 	if (e.error == EAGAIN) {
   1024 		open_owner_rele(oop);
   1025 		nfs4args_copen_free(open_args);
   1026 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
   1027 		if (ncr != NULL) {
   1028 			crfree(ncr);
   1029 			ncr = NULL;
   1030 		}
   1031 		goto recov_retry;
   1032 	}
   1033 
   1034 	/* Check to see if we need to do the OTW call */
   1035 	if (!create_flag) {
   1036 		if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
   1037 		    file_just_been_created, &e.error, acc, &recov_state)) {
   1038 
   1039 			/*
   1040 			 * The OTW open is not necessary.  Either
   1041 			 * the open can succeed without it (eg.
   1042 			 * delegation, error == 0) or the open
   1043 			 * must fail due to an access failure
   1044 			 * (error != 0).  In either case, tidy
   1045 			 * up and return.
   1046 			 */
   1047 
   1048 			nfs4_end_open_seqid_sync(oop);
   1049 			open_owner_rele(oop);
   1050 			nfs4args_copen_free(open_args);
   1051 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
   1052 			if (ncr != NULL)
   1053 				crfree(ncr);
   1054 			kmem_free(argop, argoplist_size);
   1055 			return (e.error);
   1056 		}
   1057 	}
   1058 
   1059 	bcopy(&oop->oo_name, open_args->owner.owner_val,
   1060 	    open_args->owner.owner_len);
   1061 
   1062 	seqid = nfs4_get_open_seqid(oop) + 1;
   1063 	open_args->seqid = seqid;
   1064 	open_args->share_access = 0;
   1065 	if (open_flag & FREAD)
   1066 		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
   1067 	if (open_flag & FWRITE)
   1068 		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
   1069 	open_args->share_deny = OPEN4_SHARE_DENY_NONE;
   1070 
   1071 
   1072 
   1073 	/*
   1074 	 * getfh w/sanity check for idx_open/idx_fattr
   1075 	 */
   1076 	ASSERT((idx_open + 1) == (idx_fattr - 1));
   1077 	argop[idx_open + 1].argop = OP_GETFH;
   1078 
   1079 	/* getattr */
   1080 	argop[idx_fattr].argop = OP_GETATTR;
   1081 	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   1082 	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
   1083 
   1084 	if (setgid_flag) {
   1085 		vattr_t	_v;
   1086 		servinfo4_t *svp;
   1087 		bitmap4	supp_attrs;
   1088 
   1089 		svp = drp->r_server;
   1090 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1091 		supp_attrs = svp->sv_supp_attrs;
   1092 		nfs_rw_exit(&svp->sv_lock);
   1093 
   1094 		/*
   1095 		 * For setgid case, we need to:
   1096 		 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
   1097 		 */
   1098 		argop[4].argop = OP_SAVEFH;
   1099 
   1100 		argop[5].argop = OP_CPUTFH;
   1101 		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
   1102 
   1103 		argop[6].argop = OP_GETATTR;
   1104 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   1105 		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
   1106 
   1107 		argop[7].argop = OP_RESTOREFH;
   1108 
   1109 		/*
   1110 		 * nverify
   1111 		 */
   1112 		_v.va_mask = AT_GID;
   1113 		_v.va_gid = in_va->va_gid;
   1114 		if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
   1115 		    supp_attrs))) {
   1116 
   1117 			/*
   1118 			 * setattr
   1119 			 *
   1120 			 * We _know_ we're not messing with AT_SIZE or
   1121 			 * AT_XTIME, so no need for stateid or flags.
   1122 			 * Also we specify NULL rp since we're only
   1123 			 * interested in setting owner_group attributes.
   1124 			 */
   1125 			nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
   1126 			    supp_attrs, &e.error, 0);
   1127 			if (e.error)
   1128 				nfs4args_verify_free(&argop[8]);
   1129 		}
   1130 
   1131 		if (e.error) {
   1132 			/*
   1133 			 * XXX - Revisit the last argument to nfs4_end_op()
   1134 			 *	 once 5020486 is fixed.
   1135 			 */
   1136 			nfs4_end_open_seqid_sync(oop);
   1137 			open_owner_rele(oop);
   1138 			nfs4args_copen_free(open_args);
   1139 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
   1140 			if (ncr != NULL)
   1141 				crfree(ncr);
   1142 			kmem_free(argop, argoplist_size);
   1143 			return (e.error);
   1144 		}
   1145 	} else if (create_flag) {
   1146 		argop[1].argop = OP_SAVEFH;
   1147 
   1148 		argop[5].argop = OP_RESTOREFH;
   1149 
   1150 		argop[6].argop = OP_GETATTR;
   1151 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   1152 		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
   1153 	}
   1154 
   1155 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
   1156 	    "nfs4open_otw: %s call, nm %s, rp %s",
   1157 	    needrecov ? "recov" : "first", file_name,
   1158 	    rnode4info(VTOR4(dvp))));
   1159 
   1160 	t = gethrtime();
   1161 
   1162 	rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
   1163 
   1164 	if (!e.error && nfs4_need_to_bump_seqid(&res))
   1165 		nfs4_set_open_seqid(seqid, oop, args.ctag);
   1166 
   1167 	needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
   1168 
   1169 	if (e.error || needrecov) {
   1170 		bool_t abort = FALSE;
   1171 
   1172 		if (needrecov) {
   1173 			nfs4_bseqid_entry_t *bsep = NULL;
   1174 
   1175 			nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
   1176 			    cred_otw, vpi, dvp, open_args);
   1177 
   1178 			if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
   1179 				bsep = nfs4_create_bseqid_entry(oop, NULL,
   1180 				    vpi, 0, args.ctag, open_args->seqid);
   1181 				num_bseqid_retry--;
   1182 			}
   1183 
   1184 			abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
   1185 			    NULL, lost_rqst.lr_op == OP_OPEN ?
   1186 			    &lost_rqst : NULL, OP_OPEN, bsep);
   1187 
   1188 			if (bsep)
   1189 				kmem_free(bsep, sizeof (*bsep));
   1190 			/* give up if we keep getting BAD_SEQID */
   1191 			if (num_bseqid_retry == 0)
   1192 				abort = TRUE;
   1193 			if (abort == TRUE && e.error == 0)
   1194 				e.error = geterrno4(res.status);
   1195 		}
   1196 		nfs4_end_open_seqid_sync(oop);
   1197 		open_owner_rele(oop);
   1198 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
   1199 		nfs4args_copen_free(open_args);
   1200 		if (setgid_flag) {
   1201 			nfs4args_verify_free(&argop[8]);
   1202 			nfs4args_setattr_free(&argop[9]);
   1203 		}
   1204 		if (!e.error)
   1205 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1206 		if (ncr != NULL) {
   1207 			crfree(ncr);
   1208 			ncr = NULL;
   1209 		}
   1210 		if (!needrecov || abort == TRUE || e.error == EINTR ||
   1211 		    NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
   1212 			kmem_free(argop, argoplist_size);
   1213 			return (e.error);
   1214 		}
   1215 		goto recov_retry;
   1216 	}
   1217 
   1218 	/*
   1219 	 * Will check and update lease after checking the rflag for
   1220 	 * OPEN_CONFIRM in the successful OPEN call.
   1221 	 */
   1222 	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
   1223 
   1224 		/*
   1225 		 * XXX what if we're crossing mount points from server1:/drp
   1226 		 * to server2:/drp/rp.
   1227 		 */
   1228 
   1229 		/* Signal our end of use of the open seqid */
   1230 		nfs4_end_open_seqid_sync(oop);
   1231 
   1232 		/*
   1233 		 * This will destroy the open owner if it was just created,
   1234 		 * and no one else has put a reference on it.
   1235 		 */
   1236 		open_owner_rele(oop);
   1237 		if (create_flag && (createmode != EXCLUSIVE4) &&
   1238 		    res.status == NFS4ERR_BADOWNER)
   1239 			nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
   1240 
   1241 		e.error = geterrno4(res.status);
   1242 		nfs4args_copen_free(open_args);
   1243 		if (setgid_flag) {
   1244 			nfs4args_verify_free(&argop[8]);
   1245 			nfs4args_setattr_free(&argop[9]);
   1246 		}
   1247 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1248 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
   1249 		/*
   1250 		 * If the reply is NFS4ERR_ACCESS, it may be because
   1251 		 * we are root (no root net access).  If the real uid
   1252 		 * is not root, then retry with the real uid instead.
   1253 		 */
   1254 		if (ncr != NULL) {
   1255 			crfree(ncr);
   1256 			ncr = NULL;
   1257 		}
   1258 		if (res.status == NFS4ERR_ACCESS &&
   1259 		    (ncr = crnetadjust(cred_otw)) != NULL) {
   1260 			cred_otw = ncr;
   1261 			goto recov_retry;
   1262 		}
   1263 		kmem_free(argop, argoplist_size);
   1264 		return (e.error);
   1265 	}
   1266 
   1267 	resop = &res.array[idx_open];  /* open res */
   1268 	op_res = &resop->nfs_resop4_u.opopen;
   1269 
   1270 #ifdef DEBUG
   1271 	/*
   1272 	 * verify attrset bitmap
   1273 	 */
   1274 	if (create_flag &&
   1275 	    (createmode == UNCHECKED4 || createmode == GUARDED4)) {
   1276 		/* make sure attrset returned is what we asked for */
   1277 		/* XXX Ignore this 'error' for now */
   1278 		if (attr->attrmask != op_res->attrset)
   1279 			/* EMPTY */;
   1280 	}
   1281 #endif
   1282 
   1283 	if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
   1284 		mutex_enter(&VTOMI4(dvp)->mi_lock);
   1285 		VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
   1286 		mutex_exit(&VTOMI4(dvp)->mi_lock);
   1287 	}
   1288 
   1289 	resop = &res.array[idx_open + 1];  /* getfh res */
   1290 	gf_res = &resop->nfs_resop4_u.opgetfh;
   1291 
   1292 	otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
   1293 
   1294 	/*
   1295 	 * The open stateid has been updated on the server but not
   1296 	 * on the client yet.  There is a path: makenfs4node->nfs4_attr_cache->
   1297 	 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
   1298 	 * WRITE call.  That, however, will use the old stateid, so go ahead
   1299 	 * and update the open stateid now, before any call to makenfs4node.
   1300 	 */
   1301 	if (vpi) {
   1302 		nfs4_open_stream_t	*tmp_osp;
   1303 		rnode4_t		*tmp_rp = VTOR4(vpi);
   1304 
   1305 		tmp_osp = find_open_stream(oop, tmp_rp);
   1306 		if (tmp_osp) {
   1307 			tmp_osp->open_stateid = op_res->stateid;
   1308 			mutex_exit(&tmp_osp->os_sync_lock);
   1309 			open_stream_rele(tmp_osp, tmp_rp);
   1310 		}
   1311 
   1312 		/*
   1313 		 * We must determine if the file handle given by the otw open
   1314 		 * is the same as the file handle which was passed in with
   1315 		 * *vpp.  This case can be reached if the file we are trying
   1316 		 * to open has been removed and another file has been created
   1317 		 * having the same file name.  The passed in vnode is released
   1318 		 * later.
   1319 		 */
   1320 		orig_sfh = VTOR4(vpi)->r_fh;
   1321 		fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
   1322 	}
   1323 
   1324 	garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
   1325 
   1326 	if (create_flag || fh_differs) {
   1327 		int rnode_err = 0;
   1328 
   1329 		vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
   1330 		    dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
   1331 
   1332 		if (e.error)
   1333 			PURGE_ATTRCACHE4(vp);
   1334 		/*
   1335 		 * For the newly created vp case, make sure the rnode
   1336 		 * isn't bad before using it.
   1337 		 */
   1338 		mutex_enter(&(VTOR4(vp))->r_statelock);
   1339 		if (VTOR4(vp)->r_flags & R4RECOVERR)
   1340 			rnode_err = EIO;
   1341 		mutex_exit(&(VTOR4(vp))->r_statelock);
   1342 
   1343 		if (rnode_err) {
   1344 			nfs4_end_open_seqid_sync(oop);
   1345 			nfs4args_copen_free(open_args);
   1346 			if (setgid_flag) {
   1347 				nfs4args_verify_free(&argop[8]);
   1348 				nfs4args_setattr_free(&argop[9]);
   1349 			}
   1350 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1351 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
   1352 			    needrecov);
   1353 			open_owner_rele(oop);
   1354 			VN_RELE(vp);
   1355 			if (ncr != NULL)
   1356 				crfree(ncr);
   1357 			sfh4_rele(&otw_sfh);
   1358 			kmem_free(argop, argoplist_size);
   1359 			return (EIO);
   1360 		}
   1361 	} else {
   1362 		vp = vpi;
   1363 	}
   1364 	sfh4_rele(&otw_sfh);
   1365 
   1366 	/*
   1367 	 * It seems odd to get a full set of attrs and then not update
   1368 	 * the object's attrcache in the non-create case.  Create case uses
   1369 	 * the attrs since makenfs4node checks to see if the attrs need to
   1370 	 * be updated (and then updates them).  The non-create case should
   1371 	 * update attrs also.
   1372 	 */
   1373 	if (! create_flag && ! fh_differs && !e.error) {
   1374 		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
   1375 	}
   1376 
   1377 	nfs4_error_zinit(&e);
   1378 	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
   1379 		/* This does not do recovery for vp explicitly. */
   1380 		nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
   1381 		    &retry_open, oop, FALSE, &e, &num_bseqid_retry);
   1382 
   1383 		if (e.error || e.stat) {
   1384 			nfs4_end_open_seqid_sync(oop);
   1385 			nfs4args_copen_free(open_args);
   1386 			if (setgid_flag) {
   1387 				nfs4args_verify_free(&argop[8]);
   1388 				nfs4args_setattr_free(&argop[9]);
   1389 			}
   1390 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1391 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
   1392 			    needrecov);
   1393 			open_owner_rele(oop);
   1394 			if (create_flag || fh_differs) {
   1395 				/* rele the makenfs4node */
   1396 				VN_RELE(vp);
   1397 			}
   1398 			if (ncr != NULL) {
   1399 				crfree(ncr);
   1400 				ncr = NULL;
   1401 			}
   1402 			if (retry_open == TRUE) {
   1403 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1404 				    "nfs4open_otw: retry the open since OPEN "
   1405 				    "CONFIRM failed with error %d stat %d",
   1406 				    e.error, e.stat));
   1407 				if (create_flag && createmode == GUARDED4) {
   1408 					NFS4_DEBUG(nfs4_client_recov_debug,
   1409 					    (CE_NOTE, "nfs4open_otw: switch "
   1410 					    "createmode from GUARDED4 to "
   1411 					    "UNCHECKED4"));
   1412 					createmode = UNCHECKED4;
   1413 				}
   1414 				goto recov_retry;
   1415 			}
   1416 			if (!e.error) {
   1417 				if (create_flag && (createmode != EXCLUSIVE4) &&
   1418 				    e.stat == NFS4ERR_BADOWNER)
   1419 					nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
   1420 
   1421 				e.error = geterrno4(e.stat);
   1422 			}
   1423 			kmem_free(argop, argoplist_size);
   1424 			return (e.error);
   1425 		}
   1426 	}
   1427 
   1428 	rp = VTOR4(vp);
   1429 
   1430 	mutex_enter(&rp->r_statev4_lock);
   1431 	if (create_flag)
   1432 		rp->created_v4 = 1;
   1433 	mutex_exit(&rp->r_statev4_lock);
   1434 
   1435 	mutex_enter(&oop->oo_lock);
   1436 	/* Doesn't matter if 'oo_just_created' already was set as this */
   1437 	oop->oo_just_created = NFS4_PERM_CREATED;
   1438 	if (oop->oo_cred_otw)
   1439 		crfree(oop->oo_cred_otw);
   1440 	oop->oo_cred_otw = cred_otw;
   1441 	crhold(oop->oo_cred_otw);
   1442 	mutex_exit(&oop->oo_lock);
   1443 
   1444 	/* returns with 'os_sync_lock' held */
   1445 	osp = find_or_create_open_stream(oop, rp, &created_osp);
   1446 	if (!osp) {
   1447 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
   1448 		    "nfs4open_otw: failed to create an open stream"));
   1449 		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
   1450 		    "signal our end of use of the open seqid"));
   1451 
   1452 		nfs4_end_open_seqid_sync(oop);
   1453 		open_owner_rele(oop);
   1454 		nfs4args_copen_free(open_args);
   1455 		if (setgid_flag) {
   1456 			nfs4args_verify_free(&argop[8]);
   1457 			nfs4args_setattr_free(&argop[9]);
   1458 		}
   1459 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1460 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
   1461 		if (create_flag || fh_differs)
   1462 			VN_RELE(vp);
   1463 		if (ncr != NULL)
   1464 			crfree(ncr);
   1465 
   1466 		kmem_free(argop, argoplist_size);
   1467 		return (EINVAL);
   1468 
   1469 	}
   1470 
   1471 	osp->open_stateid = op_res->stateid;
   1472 
   1473 	if (open_flag & FREAD)
   1474 		osp->os_share_acc_read++;
   1475 	if (open_flag & FWRITE)
   1476 		osp->os_share_acc_write++;
   1477 	osp->os_share_deny_none++;
   1478 
   1479 	/*
   1480 	 * Need to reset this bitfield for the possible case where we were
   1481 	 * going to OTW CLOSE the file, got a non-recoverable error, and before
   1482 	 * we could retry the CLOSE, OPENed the file again.
   1483 	 */
   1484 	ASSERT(osp->os_open_owner->oo_seqid_inuse);
   1485 	osp->os_final_close = 0;
   1486 	osp->os_force_close = 0;
   1487 #ifdef DEBUG
   1488 	if (osp->os_failed_reopen)
   1489 		NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
   1490 		    " clearing os_failed_reopen for osp %p, cr %p, rp %s",
   1491 		    (void *)osp, (void *)cr, rnode4info(rp)));
   1492 #endif
   1493 	osp->os_failed_reopen = 0;
   1494 
   1495 	mutex_exit(&osp->os_sync_lock);
   1496 
   1497 	nfs4_end_open_seqid_sync(oop);
   1498 
   1499 	if (created_osp && recov_state.rs_sp != NULL) {
   1500 		mutex_enter(&recov_state.rs_sp->s_lock);
   1501 		nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
   1502 		mutex_exit(&recov_state.rs_sp->s_lock);
   1503 	}
   1504 
   1505 	/* get rid of our reference to find oop */
   1506 	open_owner_rele(oop);
   1507 
   1508 	open_stream_rele(osp, rp);
   1509 
   1510 	/* accept delegation, if any */
   1511 	nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
   1512 
   1513 	nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
   1514 
   1515 	if (createmode == EXCLUSIVE4 &&
   1516 	    (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
   1517 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
   1518 		    " EXCLUSIVE4: sending a SETATTR"));
   1519 		/*
   1520 		 * If doing an exclusive create, then generate
   1521 		 * a SETATTR to set the initial attributes.
   1522 		 * Try to set the mtime and the atime to the
   1523 		 * server's current time.  It is somewhat
   1524 		 * expected that these fields will be used to
   1525 		 * store the exclusive create cookie.  If not,
   1526 		 * server implementors will need to know that
   1527 		 * a SETATTR will follow an exclusive create
   1528 		 * and the cookie should be destroyed if
   1529 		 * appropriate.
   1530 		 *
   1531 		 * The AT_GID and AT_SIZE bits are turned off
   1532 		 * so that the SETATTR request will not attempt
   1533 		 * to process these.  The gid will be set
   1534 		 * separately if appropriate.  The size is turned
   1535 		 * off because it is assumed that a new file will
   1536 		 * be created empty and if the file wasn't empty,
   1537 		 * then the exclusive create will have failed
   1538 		 * because the file must have existed already.
   1539 		 * Therefore, no truncate operation is needed.
   1540 		 */
   1541 		in_va->va_mask &= ~(AT_GID | AT_SIZE);
   1542 		in_va->va_mask |= (AT_MTIME | AT_ATIME);
   1543 
   1544 		e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
   1545 		if (e.error) {
   1546 			/*
   1547 			 * Couldn't correct the attributes of
   1548 			 * the newly created file and the
   1549 			 * attributes are wrong.  Remove the
   1550 			 * file and return an error to the
   1551 			 * application.
   1552 			 */
   1553 			/* XXX will this take care of client state ? */
   1554 			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
   1555 			    "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
   1556 			    " remove file", e.error));
   1557 			VN_RELE(vp);
   1558 			(void) nfs4_remove(dvp, file_name, cr, NULL, 0);
   1559 			/*
   1560 			 * Since we've reled the vnode and removed
   1561 			 * the file we now need to return the error.
   1562 			 * At this point we don't want to update the
   1563 			 * dircaches, call nfs4_waitfor_purge_complete
   1564 			 * or set vpp to vp so we need to skip these
   1565 			 * as well.
   1566 			 */
   1567 			goto skip_update_dircaches;
   1568 		}
   1569 	}
   1570 
   1571 	/*
   1572 	 * If we created or found the correct vnode, due to create_flag or
   1573 	 * fh_differs being set, then update directory cache attribute, readdir
   1574 	 * and dnlc caches.
   1575 	 */
   1576 	if (create_flag || fh_differs) {
   1577 		dirattr_info_t dinfo, *dinfop;
   1578 
   1579 		/*
   1580 		 * Make sure getattr succeeded before using results.
   1581 		 * note: op 7 (array index 6) is getattr(dir) for both
   1582 		 * flavors of open(create).
   1583 		 */
   1584 		if (create_flag && res.status == NFS4_OK) {
   1585 			dinfo.di_time_call = t;
   1586 			dinfo.di_cred = cr;
   1587 			dinfo.di_garp =
   1588 			    &res.array[6].nfs_resop4_u.opgetattr.ga_res;
   1589 			dinfop = &dinfo;
   1590 		} else {
   1591 			dinfop = NULL;
   1592 		}
   1593 
   1594 		nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
   1595 		    dinfop);
   1596 	}
   1597 
   1598 	/*
   1599 	 * If the page cache for this file was flushed from actions
   1600 	 * above, it was done asynchronously and if that is true,
   1601 	 * there is a need to wait here for it to complete.  This must
   1602 	 * be done outside of start_fop/end_fop.
   1603 	 */
   1604 	(void) nfs4_waitfor_purge_complete(vp);
   1605 
   1606 	/*
   1607 	 * It is implicit that we are in the open case (create_flag == 0) since
   1608 	 * fh_differs can only be set to a non-zero value in the open case.
   1609 	 */
   1610 	if (fh_differs != 0 && vpi != NULL)
   1611 		VN_RELE(vpi);
   1612 
   1613 	/*
   1614 	 * Be sure to set *vpp to the correct value before returning.
   1615 	 */
   1616 	*vpp = vp;
   1617 
   1618 skip_update_dircaches:
   1619 
   1620 	nfs4args_copen_free(open_args);
   1621 	if (setgid_flag) {
   1622 		nfs4args_verify_free(&argop[8]);
   1623 		nfs4args_setattr_free(&argop[9]);
   1624 	}
   1625 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1626 
   1627 	if (ncr)
   1628 		crfree(ncr);
   1629 	kmem_free(argop, argoplist_size);
   1630 	return (e.error);
   1631 }
   1632 
   1633 /*
   1634  * Reopen an open instance.  cf. nfs4open_otw().
   1635  *
   1636  * Errors are returned by the nfs4_error_t parameter.
   1637  * - ep->error contains an errno value or zero.
   1638  * - if it is zero, ep->stat is set to an NFS status code, if any.
   1639  *   If the file could not be reopened, but the caller should continue, the
   1640  *   file is marked dead and no error values are returned.  If the caller
   1641  *   should stop recovering open files and start over, either the ep->error
   1642  *   value or ep->stat will indicate an error (either something that requires
   1643  *   recovery or EAGAIN).  Note that some recovery (e.g., expired volatile
   1644  *   filehandles) may be handled silently by this routine.
   1645  * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
   1646  *   will be started, so the caller should not do it.
   1647  *
   1648  * Gotos:
   1649  * - kill_file : reopen failed in a way that requires marking the file
   1650  *   dead and setting the open stream's 'os_failed_reopen' to 1.  This
   1651  *   is for cases where recovery is not possible.
   1652  * - failed_reopen : same as above, except that the file has already been
   1653  *   marked dead, so no need to do it again.
   1654  * - bailout : reopen failed but we are able to recover and retry the reopen -
   1655  *   either within this function immediately or via the calling function.
   1656  */
   1657 
   1658 void
   1659 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
   1660     open_claim_type4 claim, bool_t frc_use_claim_previous,
   1661     bool_t is_recov)
   1662 {
   1663 	COMPOUND4args_clnt args;
   1664 	COMPOUND4res_clnt res;
   1665 	nfs_argop4 argop[4];
   1666 	nfs_resop4 *resop;
   1667 	OPEN4res *op_res = NULL;
   1668 	OPEN4cargs *open_args;
   1669 	GETFH4res *gf_res;
   1670 	rnode4_t *rp = VTOR4(vp);
   1671 	int doqueue = 1;
   1672 	cred_t *cr = NULL, *cred_otw = NULL;
   1673 	nfs4_open_owner_t *oop = NULL;
   1674 	seqid4 seqid;
   1675 	nfs4_ga_res_t *garp;
   1676 	char fn[MAXNAMELEN];
   1677 	nfs4_recov_state_t recov = {NULL, 0};
   1678 	nfs4_lost_rqst_t lost_rqst;
   1679 	mntinfo4_t *mi = VTOMI4(vp);
   1680 	bool_t abort;
   1681 	char *failed_msg = "";
   1682 	int fh_different;
   1683 	hrtime_t t;
   1684 	nfs4_bseqid_entry_t *bsep = NULL;
   1685 
   1686 	ASSERT(nfs4_consistent_type(vp));
   1687 	ASSERT(nfs_zone() == mi->mi_zone);
   1688 
   1689 	nfs4_error_zinit(ep);
   1690 
   1691 	/* this is the cred used to find the open owner */
   1692 	cr = state_to_cred(osp);
   1693 	if (cr == NULL) {
   1694 		failed_msg = "Couldn't reopen: no cred";
   1695 		goto kill_file;
   1696 	}
   1697 	/* use this cred for OTW operations */
   1698 	cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
   1699 
   1700 top:
   1701 	nfs4_error_zinit(ep);
   1702 
   1703 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
   1704 		/* File system has been unmounted, quit */
   1705 		ep->error = EIO;
   1706 		failed_msg = "Couldn't reopen: file system has been unmounted";
   1707 		goto kill_file;
   1708 	}
   1709 
   1710 	oop = osp->os_open_owner;
   1711 
   1712 	ASSERT(oop != NULL);
   1713 	if (oop == NULL) {	/* be defensive in non-DEBUG */
   1714 		failed_msg = "can't reopen: no open owner";
   1715 		goto kill_file;
   1716 	}
   1717 	open_owner_hold(oop);
   1718 
   1719 	ep->error = nfs4_start_open_seqid_sync(oop, mi);
   1720 	if (ep->error) {
   1721 		open_owner_rele(oop);
   1722 		oop = NULL;
   1723 		goto bailout;
   1724 	}
   1725 
   1726 	/*
   1727 	 * If the rnode has a delegation and the delegation has been
   1728 	 * recovered and the server didn't request a recall and the caller
   1729 	 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
   1730 	 * recovery) and the rnode hasn't been marked dead, then install
   1731 	 * the delegation stateid in the open stream.  Otherwise, proceed
   1732 	 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
   1733 	 */
   1734 	mutex_enter(&rp->r_statev4_lock);
   1735 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
   1736 	    !rp->r_deleg_return_pending &&
   1737 	    (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
   1738 	    !rp->r_deleg_needs_recall &&
   1739 	    claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
   1740 	    !(rp->r_flags & R4RECOVERR)) {
   1741 		mutex_enter(&osp->os_sync_lock);
   1742 		osp->os_delegation = 1;
   1743 		osp->open_stateid = rp->r_deleg_stateid;
   1744 		mutex_exit(&osp->os_sync_lock);
   1745 		mutex_exit(&rp->r_statev4_lock);
   1746 		goto bailout;
   1747 	}
   1748 	mutex_exit(&rp->r_statev4_lock);
   1749 
   1750 	/*
   1751 	 * If the file failed recovery, just quit.  This failure need not
   1752 	 * affect other reopens, so don't return an error.
   1753 	 */
   1754 	mutex_enter(&rp->r_statelock);
   1755 	if (rp->r_flags & R4RECOVERR) {
   1756 		mutex_exit(&rp->r_statelock);
   1757 		ep->error = 0;
   1758 		goto failed_reopen;
   1759 	}
   1760 	mutex_exit(&rp->r_statelock);
   1761 
   1762 	/*
   1763 	 * argop is empty here
   1764 	 *
   1765 	 * PUTFH, OPEN, GETATTR
   1766 	 */
   1767 	args.ctag = TAG_REOPEN;
   1768 	args.array_len = 4;
   1769 	args.array = argop;
   1770 
   1771 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1772 	    "nfs4_reopen: file is type %d, id %s",
   1773 	    vp->v_type, rnode4info(VTOR4(vp))));
   1774 
   1775 	argop[0].argop = OP_CPUTFH;
   1776 
   1777 	if (claim != CLAIM_PREVIOUS) {
   1778 		/*
   1779 		 * if this is a file mount then
   1780 		 * use the mntinfo parentfh
   1781 		 */
   1782 		argop[0].nfs_argop4_u.opcputfh.sfh =
   1783 		    (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
   1784 		    VTOSV(vp)->sv_dfh;
   1785 	} else {
   1786 		/* putfh fh to reopen */
   1787 		argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
   1788 	}
   1789 
   1790 	argop[1].argop = OP_COPEN;
   1791 	open_args = &argop[1].nfs_argop4_u.opcopen;
   1792 	open_args->claim = claim;
   1793 
   1794 	if (claim == CLAIM_NULL) {
   1795 
   1796 		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
   1797 			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
   1798 			    "failed for vp 0x%p for CLAIM_NULL with %m",
   1799 			    (void *)vp);
   1800 			failed_msg = "Couldn't reopen: vtoname failed for "
   1801 			    "CLAIM_NULL";
   1802 			/* nothing allocated yet */
   1803 			goto kill_file;
   1804 		}
   1805 
   1806 		open_args->open_claim4_u.cfile = fn;
   1807 	} else if (claim == CLAIM_PREVIOUS) {
   1808 
   1809 		/*
   1810 		 * We have two cases to deal with here:
   1811 		 * 1) We're being called to reopen files in order to satisfy
   1812 		 *    a lock operation request which requires us to explicitly
   1813 		 *    reopen files which were opened under a delegation.  If
   1814 		 *    we're in recovery, we *must* use CLAIM_PREVIOUS.  In
   1815 		 *    that case, frc_use_claim_previous is TRUE and we must
   1816 		 *    use the rnode's current delegation type (r_deleg_type).
   1817 		 * 2) We're reopening files during some form of recovery.
   1818 		 *    In this case, frc_use_claim_previous is FALSE and we
   1819 		 *    use the delegation type appropriate for recovery
   1820 		 *    (r_deleg_needs_recovery).
   1821 		 */
   1822 		mutex_enter(&rp->r_statev4_lock);
   1823 		open_args->open_claim4_u.delegate_type =
   1824 		    frc_use_claim_previous ?
   1825 		    rp->r_deleg_type :
   1826 		    rp->r_deleg_needs_recovery;
   1827 		mutex_exit(&rp->r_statev4_lock);
   1828 
   1829 	} else if (claim == CLAIM_DELEGATE_CUR) {
   1830 
   1831 		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
   1832 			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
   1833 			    "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
   1834 			    "with %m", (void *)vp);
   1835 			failed_msg = "Couldn't reopen: vtoname failed for "
   1836 			    "CLAIM_DELEGATE_CUR";
   1837 			/* nothing allocated yet */
   1838 			goto kill_file;
   1839 		}
   1840 
   1841 		mutex_enter(&rp->r_statev4_lock);
   1842 		open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
   1843 		    rp->r_deleg_stateid;
   1844 		mutex_exit(&rp->r_statev4_lock);
   1845 
   1846 		open_args->open_claim4_u.delegate_cur_info.cfile = fn;
   1847 	}
   1848 	open_args->opentype = OPEN4_NOCREATE;
   1849 	open_args->owner.clientid = mi2clientid(mi);
   1850 	open_args->owner.owner_len = sizeof (oop->oo_name);
   1851 	open_args->owner.owner_val =
   1852 	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
   1853 	bcopy(&oop->oo_name, open_args->owner.owner_val,
   1854 	    open_args->owner.owner_len);
   1855 	open_args->share_access = 0;
   1856 	open_args->share_deny = 0;
   1857 
   1858 	mutex_enter(&osp->os_sync_lock);
   1859 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
   1860 	    "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
   1861 	    "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
   1862 	    (void *)osp, (void *)rp, osp->os_share_acc_read,
   1863 	    osp->os_share_acc_write, osp->os_open_ref_count,
   1864 	    osp->os_mmap_read, osp->os_mmap_write, claim));
   1865 
   1866 	if (osp->os_share_acc_read || osp->os_mmap_read)
   1867 		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
   1868 	if (osp->os_share_acc_write || osp->os_mmap_write)
   1869 		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
   1870 	if (osp->os_share_deny_read)
   1871 		open_args->share_deny |= OPEN4_SHARE_DENY_READ;
   1872 	if (osp->os_share_deny_write)
   1873 		open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
   1874 	mutex_exit(&osp->os_sync_lock);
   1875 
   1876 	seqid = nfs4_get_open_seqid(oop) + 1;
   1877 	open_args->seqid = seqid;
   1878 
   1879 	/* Construct the getfh part of the compound */
   1880 	argop[2].argop = OP_GETFH;
   1881 
   1882 	/* Construct the getattr part of the compound */
   1883 	argop[3].argop = OP_GETATTR;
   1884 	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   1885 	argop[3].nfs_argop4_u.opgetattr.mi = mi;
   1886 
   1887 	t = gethrtime();
   1888 
   1889 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
   1890 
   1891 	if (ep->error) {
   1892 		if (!is_recov && !frc_use_claim_previous &&
   1893 		    (ep->error == EINTR || ep->error == ETIMEDOUT ||
   1894 		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
   1895 			nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
   1896 			    cred_otw, vp, NULL, open_args);
   1897 			abort = nfs4_start_recovery(ep,
   1898 			    VTOMI4(vp), vp, NULL, NULL,
   1899 			    lost_rqst.lr_op == OP_OPEN ?
   1900 			    &lost_rqst : NULL, OP_OPEN, NULL);
   1901 			nfs4args_copen_free(open_args);
   1902 			goto bailout;
   1903 		}
   1904 
   1905 		nfs4args_copen_free(open_args);
   1906 
   1907 		if (ep->error == EACCES && cred_otw != cr) {
   1908 			crfree(cred_otw);
   1909 			cred_otw = cr;
   1910 			crhold(cred_otw);
   1911 			nfs4_end_open_seqid_sync(oop);
   1912 			open_owner_rele(oop);
   1913 			oop = NULL;
   1914 			goto top;
   1915 		}
   1916 		if (ep->error == ETIMEDOUT)
   1917 			goto bailout;
   1918 		failed_msg = "Couldn't reopen: rpc error";
   1919 		goto kill_file;
   1920 	}
   1921 
   1922 	if (nfs4_need_to_bump_seqid(&res))
   1923 		nfs4_set_open_seqid(seqid, oop, args.ctag);
   1924 
   1925 	switch (res.status) {
   1926 	case NFS4_OK:
   1927 		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
   1928 			mutex_enter(&rp->r_statelock);
   1929 			rp->r_delay_interval = 0;
   1930 			mutex_exit(&rp->r_statelock);
   1931 		}
   1932 		break;
   1933 	case NFS4ERR_BAD_SEQID:
   1934 		bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
   1935 		    args.ctag, open_args->seqid);
   1936 
   1937 		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
   1938 		    NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
   1939 		    NULL, OP_OPEN, bsep);
   1940 
   1941 		nfs4args_copen_free(open_args);
   1942 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1943 		nfs4_end_open_seqid_sync(oop);
   1944 		open_owner_rele(oop);
   1945 		oop = NULL;
   1946 		kmem_free(bsep, sizeof (*bsep));
   1947 
   1948 		goto kill_file;
   1949 	case NFS4ERR_NO_GRACE:
   1950 		nfs4args_copen_free(open_args);
   1951 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1952 		nfs4_end_open_seqid_sync(oop);
   1953 		open_owner_rele(oop);
   1954 		oop = NULL;
   1955 		if (claim == CLAIM_PREVIOUS) {
   1956 			/*
   1957 			 * Retry as a plain open. We don't need to worry about
   1958 			 * checking the changeinfo: it is acceptable for a
   1959 			 * client to re-open a file and continue processing
   1960 			 * (in the absence of locks).
   1961 			 */
   1962 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1963 			    "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
   1964 			    "will retry as CLAIM_NULL"));
   1965 			claim = CLAIM_NULL;
   1966 			nfs4_mi_kstat_inc_no_grace(mi);
   1967 			goto top;
   1968 		}
   1969 		failed_msg =
   1970 		    "Couldn't reopen: tried reclaim outside grace period. ";
   1971 		goto kill_file;
   1972 	case NFS4ERR_GRACE:
   1973 		nfs4_set_grace_wait(mi);
   1974 		nfs4args_copen_free(open_args);
   1975 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1976 		nfs4_end_open_seqid_sync(oop);
   1977 		open_owner_rele(oop);
   1978 		oop = NULL;
   1979 		ep->error = nfs4_wait_for_grace(mi, &recov);
   1980 		if (ep->error != 0)
   1981 			goto bailout;
   1982 		goto top;
   1983 	case NFS4ERR_DELAY:
   1984 		nfs4_set_delay_wait(vp);
   1985 		nfs4args_copen_free(open_args);
   1986 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   1987 		nfs4_end_open_seqid_sync(oop);
   1988 		open_owner_rele(oop);
   1989 		oop = NULL;
   1990 		ep->error = nfs4_wait_for_delay(vp, &recov);
   1991 		nfs4_mi_kstat_inc_delay(mi);
   1992 		if (ep->error != 0)
   1993 			goto bailout;
   1994 		goto top;
   1995 	case NFS4ERR_FHEXPIRED:
   1996 		/* recover filehandle and retry */
   1997 		abort = nfs4_start_recovery(ep,
   1998 		    mi, vp, NULL, NULL, NULL, OP_OPEN, NULL);
   1999 		nfs4args_copen_free(open_args);
   2000 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   2001 		nfs4_end_open_seqid_sync(oop);
   2002 		open_owner_rele(oop);
   2003 		oop = NULL;
   2004 		if (abort == FALSE)
   2005 			goto top;
   2006 		failed_msg = "Couldn't reopen: recovery aborted";
   2007 		goto kill_file;
   2008 	case NFS4ERR_RESOURCE:
   2009 	case NFS4ERR_STALE_CLIENTID:
   2010 	case NFS4ERR_WRONGSEC:
   2011 	case NFS4ERR_EXPIRED:
   2012 		/*
   2013 		 * Do not mark the file dead and let the calling
   2014 		 * function initiate recovery.
   2015 		 */
   2016 		nfs4args_copen_free(open_args);
   2017 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   2018 		nfs4_end_open_seqid_sync(oop);
   2019 		open_owner_rele(oop);
   2020 		oop = NULL;
   2021 		goto bailout;
   2022 	case NFS4ERR_ACCESS:
   2023 		if (cred_otw != cr) {
   2024 			crfree(cred_otw);
   2025 			cred_otw = cr;
   2026 			crhold(cred_otw);
   2027 			nfs4args_copen_free(open_args);
   2028 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   2029 			nfs4_end_open_seqid_sync(oop);
   2030 			open_owner_rele(oop);
   2031 			oop = NULL;
   2032 			goto top;
   2033 		}
   2034 		/* fall through */
   2035 	default:
   2036 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   2037 		    "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
   2038 		    (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
   2039 		    rnode4info(VTOR4(vp))));
   2040 		failed_msg = "Couldn't reopen: NFSv4 error";
   2041 		nfs4args_copen_free(open_args);
   2042 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   2043 		goto kill_file;
   2044 	}
   2045 
   2046 	resop = &res.array[1];  /* open res */
   2047 	op_res = &resop->nfs_resop4_u.opopen;
   2048 
   2049 	garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
   2050 
   2051 	/*
   2052 	 * Check if the path we reopened really is the same
   2053 	 * file. We could end up in a situation where the file
   2054 	 * was removed and a new file created with the same name.
   2055 	 */
   2056 	resop = &res.array[2];
   2057 	gf_res = &resop->nfs_resop4_u.opgetfh;
   2058 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
   2059 	fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
   2060 	if (fh_different) {
   2061 		if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
   2062 		    mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
   2063 			/* Oops, we don't have the same file */
   2064 			if (mi->mi_fh_expire_type == FH4_PERSISTENT)
   2065 				failed_msg = "Couldn't reopen: Persistent "
   2066 				    "file handle changed";
   2067 			else
   2068 				failed_msg = "Couldn't reopen: Volatile "
   2069 				    "(no expire on open) file handle changed";
   2070 
   2071 			nfs4args_copen_free(open_args);
   2072 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   2073 			nfs_rw_exit(&mi->mi_fh_lock);
   2074 			goto kill_file;
   2075 
   2076 		} else {
   2077 			/*
   2078 			 * We have volatile file handles that don't compare.
   2079 			 * If the fids are the same then we assume that the
   2080 			 * file handle expired but the rnode still refers to
   2081 			 * the same file object.
   2082 			 *
   2083 			 * First check that we have fids or not.
   2084 			 * If we don't we have a dumb server so we will
   2085 			 * just assume every thing is ok for now.
   2086 			 */
   2087 			if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
   2088 			    rp->r_attr.va_mask & AT_NODEID &&
   2089 			    rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
   2090 				/*
   2091 				 * We have fids, but they don't
   2092 				 * compare. So kill the file.
   2093 				 */
   2094 				failed_msg =
   2095 				    "Couldn't reopen: file handle changed"
   2096 				    " due to mismatched fids";
   2097 				nfs4args_copen_free(open_args);
   2098 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   2099 				    (caddr_t)&res);
   2100 				nfs_rw_exit(&mi->mi_fh_lock);
   2101 				goto kill_file;
   2102 			} else {
   2103 				/*
   2104 				 * We have volatile file handles that refers
   2105 				 * to the same file (at least they have the
   2106 				 * same fid) or we don't have fids so we
   2107 				 * can't tell. :(. We'll be a kind and accepting
   2108 				 * client so we'll update the rnode's file
   2109 				 * handle with the otw handle.
   2110 				 *
   2111 				 * We need to drop mi->mi_fh_lock since
   2112 				 * sh4_update acquires it. Since there is
   2113 				 * only one recovery thread there is no
   2114 				 * race.
   2115 				 */
   2116 				nfs_rw_exit(&mi->mi_fh_lock);
   2117 				sfh4_update(rp->r_fh, &gf_res->object);
   2118 			}
   2119 		}
   2120 	} else {
   2121 		nfs_rw_exit(&mi->mi_fh_lock);
   2122 	}
   2123 
   2124 	ASSERT(nfs4_consistent_type(vp));
   2125 
   2126 	/*
   2127 	 * If the server wanted an OPEN_CONFIRM but that fails, just start
   2128 	 * over.  Presumably if there is a persistent error it will show up
   2129 	 * when we resend the OPEN.
   2130 	 */
   2131 	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
   2132 		bool_t retry_open = FALSE;
   2133 
   2134 		nfs4open_confirm(vp, &seqid, &op_res->stateid,
   2135 		    cred_otw, is_recov, &retry_open,
   2136 		    oop, FALSE, ep, NULL);
   2137 		if (ep->error || ep->stat) {
   2138 			nfs4args_copen_free(open_args);
   2139 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   2140 			nfs4_end_open_seqid_sync(oop);
   2141 			open_owner_rele(oop);
   2142 			oop = NULL;
   2143 			goto top;
   2144 		}
   2145 	}
   2146 
   2147 	mutex_enter(&osp->os_sync_lock);
   2148 	osp->open_stateid = op_res->stateid;
   2149 	osp->os_delegation = 0;
   2150 	/*
   2151 	 * Need to reset this bitfield for the possible case where we were
   2152 	 * going to OTW CLOSE the file, got a non-recoverable error, and before
   2153 	 * we could retry the CLOSE, OPENed the file again.
   2154 	 */
   2155 	ASSERT(osp->os_open_owner->oo_seqid_inuse);
   2156 	osp->os_final_close = 0;
   2157 	osp->os_force_close = 0;
   2158 	if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
   2159 		osp->os_dc_openacc = open_args->share_access;
   2160 	mutex_exit(&osp->os_sync_lock);
   2161 
   2162 	nfs4_end_open_seqid_sync(oop);
   2163 
   2164 	/* accept delegation, if any */
   2165 	nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
   2166 
   2167 	nfs4args_copen_free(open_args);
   2168 
   2169 	nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
   2170 
   2171 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   2172 
   2173 	ASSERT(nfs4_consistent_type(vp));
   2174 
   2175 	open_owner_rele(oop);
   2176 	crfree(cr);
   2177 	crfree(cred_otw);
   2178 	return;
   2179 
   2180 kill_file:
   2181 	nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
   2182 failed_reopen:
   2183 	NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
   2184 	    "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
   2185 	    (void *)osp, (void *)cr, rnode4info(rp)));
   2186 	mutex_enter(&osp->os_sync_lock);
   2187 	osp->os_failed_reopen = 1;
   2188 	mutex_exit(&osp->os_sync_lock);
   2189 bailout:
   2190 	if (oop != NULL) {
   2191 		nfs4_end_open_seqid_sync(oop);
   2192 		open_owner_rele(oop);
   2193 	}
   2194 	if (cr != NULL)
   2195 		crfree(cr);
   2196 	if (cred_otw != NULL)
   2197 		crfree(cred_otw);
   2198 }
   2199 
   2200 /* for . and .. OPENs */
   2201 /* ARGSUSED */
   2202 static int
   2203 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
   2204 {
   2205 	rnode4_t *rp;
   2206 	nfs4_ga_res_t gar;
   2207 
   2208 	ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
   2209 
   2210 	/*
   2211 	 * If close-to-open consistency checking is turned off or
   2212 	 * if there is no cached data, we can avoid
   2213 	 * the over the wire getattr.  Otherwise, force a
   2214 	 * call to the server to get fresh attributes and to
   2215 	 * check caches. This is required for close-to-open
   2216 	 * consistency.
   2217 	 */
   2218 	rp = VTOR4(*vpp);
   2219 	if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
   2220 	    (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
   2221 		return (0);
   2222 
   2223 	gar.n4g_va.va_mask = AT_ALL;
   2224 	return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
   2225 }
   2226 
   2227 /*
   2228  * CLOSE a file
   2229  */
   2230 /* ARGSUSED */
   2231 static int
   2232 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
   2233 	caller_context_t *ct)
   2234 {
   2235 	rnode4_t	*rp;
   2236 	int		 error = 0;
   2237 	int		 r_error = 0;
   2238 	int		 n4error = 0;
   2239 	nfs4_error_t	 e = { 0, NFS4_OK, RPC_SUCCESS };
   2240 
   2241 	/*
   2242 	 * Remove client state for this (lockowner, file) pair.
   2243 	 * Issue otw v4 call to have the server do the same.
   2244 	 */
   2245 
   2246 	rp = VTOR4(vp);
   2247 
   2248 	/*
   2249 	 * zone_enter(2) prevents processes from changing zones with NFS files
   2250 	 * open; if we happen to get here from the wrong zone we can't do
   2251 	 * anything over the wire.
   2252 	 */
   2253 	if (VTOMI4(vp)->mi_zone != nfs_zone()) {
   2254 		/*
   2255 		 * We could attempt to clean up locks, except we're sure
   2256 		 * that the current process didn't acquire any locks on
   2257 		 * the file: any attempt to lock a file belong to another zone
   2258 		 * will fail, and one can't lock an NFS file and then change
   2259 		 * zones, as that fails too.
   2260 		 *
   2261 		 * Returning an error here is the sane thing to do.  A
   2262 		 * subsequent call to VN_RELE() which translates to a
   2263 		 * nfs4_inactive() will clean up state: if the zone of the
   2264 		 * vnode's origin is still alive and kicking, the inactive
   2265 		 * thread will handle the request (from the correct zone), and
   2266 		 * everything (minus the OTW close call) should be OK.  If the
   2267 		 * zone is going away nfs4_async_inactive() will throw away
   2268 		 * delegations, open streams and cached pages inline.
   2269 		 */
   2270 		return (EIO);
   2271 	}
   2272 
   2273 	/*
   2274 	 * If we are using local locking for this filesystem, then
   2275 	 * release all of the SYSV style record locks.  Otherwise,
   2276 	 * we are doing network locking and we need to release all
   2277 	 * of the network locks.  All of the locks held by this
   2278 	 * process on this file are released no matter what the
   2279 	 * incoming reference count is.
   2280 	 */
   2281 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
   2282 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
   2283 		cleanshares(vp, ttoproc(curthread)->p_pid);
   2284 	} else
   2285 		e.error = nfs4_lockrelease(vp, flag, offset, cr);
   2286 
   2287 	if (e.error) {
   2288 		struct lm_sysid *lmsid;
   2289 		lmsid = nfs4_find_sysid(VTOMI4(vp));
   2290 		if (lmsid == NULL) {
   2291 			DTRACE_PROBE2(unknown__sysid, int, e.error,
   2292 			    vnode_t *, vp);
   2293 		} else {
   2294 			cleanlocks(vp, ttoproc(curthread)->p_pid,
   2295 			    (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
   2296 		}
   2297 		return (e.error);
   2298 	}
   2299 
   2300 	if (count > 1)
   2301 		return (0);
   2302 
   2303 	/*
   2304 	 * If the file has been `unlinked', then purge the
   2305 	 * DNLC so that this vnode will get reycled quicker
   2306 	 * and the .nfs* file on the server will get removed.
   2307 	 */
   2308 	if (rp->r_unldvp != NULL)
   2309 		dnlc_purge_vp(vp);
   2310 
   2311 	/*
   2312 	 * If the file was open for write and there are pages,
   2313 	 * do a synchronous flush and commit of all of the
   2314 	 * dirty and uncommitted pages.
   2315 	 */
   2316 	ASSERT(!e.error);
   2317 	if ((flag & FWRITE) && nfs4_has_pages(vp))
   2318 		error = nfs4_putpage_commit(vp, 0, 0, cr);
   2319 
   2320 	mutex_enter(&rp->r_statelock);
   2321 	r_error = rp->r_error;
   2322 	rp->r_error = 0;
   2323 	mutex_exit(&rp->r_statelock);
   2324 
   2325 	/*
   2326 	 * If this file type is one for which no explicit 'open' was
   2327 	 * done, then bail now (ie. no need for protocol 'close'). If
   2328 	 * there was an error w/the vm subsystem, return _that_ error,
   2329 	 * otherwise, return any errors that may've been reported via
   2330 	 * the rnode.
   2331 	 */
   2332 	if (vp->v_type != VREG)
   2333 		return (error ? error : r_error);
   2334 
   2335 	/*
   2336 	 * The sync putpage commit may have failed above, but since
   2337 	 * we're working w/a regular file, we need to do the protocol
   2338 	 * 'close' (nfs4close_one will figure out if an otw close is
   2339 	 * needed or not). Report any errors _after_ doing the protocol
   2340 	 * 'close'.
   2341 	 */
   2342 	nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
   2343 	n4error = e.error ? e.error : geterrno4(e.stat);
   2344 
   2345 	/*
   2346 	 * Error reporting prio (Hi -> Lo)
   2347 	 *
   2348 	 *   i) nfs4_putpage_commit (error)
   2349 	 *  ii) rnode's (r_error)
   2350 	 * iii) nfs4close_one (n4error)
   2351 	 */
   2352 	return (error ? error : (r_error ? r_error : n4error));
   2353 }
   2354 
   2355 /*
   2356  * Initialize *lost_rqstp.
   2357  */
   2358 
   2359 static void
   2360 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
   2361     nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
   2362     vnode_t *vp)
   2363 {
   2364 	if (error != ETIMEDOUT && error != EINTR &&
   2365 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
   2366 		lost_rqstp->lr_op = 0;
   2367 		return;
   2368 	}
   2369 
   2370 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
   2371 	    "nfs4close_save_lost_rqst: error %d", error));
   2372 
   2373 	lost_rqstp->lr_op = OP_CLOSE;
   2374 	/*
   2375 	 * The vp is held and rele'd via the recovery code.
   2376 	 * See nfs4_save_lost_rqst.
   2377 	 */
   2378 	lost_rqstp->lr_vp = vp;
   2379 	lost_rqstp->lr_dvp = NULL;
   2380 	lost_rqstp->lr_oop = oop;
   2381 	lost_rqstp->lr_osp = osp;
   2382 	ASSERT(osp != NULL);
   2383 	ASSERT(mutex_owned(&osp->os_sync_lock));
   2384 	osp->os_pending_close = 1;
   2385 	lost_rqstp->lr_lop = NULL;
   2386 	lost_rqstp->lr_cr = cr;
   2387 	lost_rqstp->lr_flk = NULL;
   2388 	lost_rqstp->lr_putfirst = FALSE;
   2389 }
   2390 
   2391 /*
   2392  * Assumes you already have the open seqid sync grabbed as well as the
   2393  * 'os_sync_lock'.  Note: this will release the open seqid sync and
   2394  * 'os_sync_lock' if client recovery starts.  Calling functions have to
   2395  * be prepared to handle this.
   2396  *
   2397  * 'recov' is returned as 1 if the CLOSE operation detected client recovery
   2398  * was needed and was started, and that the calling function should retry
   2399  * this function; otherwise it is returned as 0.
   2400  *
   2401  * Errors are returned via the nfs4_error_t parameter.
   2402  */
   2403 static void
   2404 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
   2405     nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
   2406     nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
   2407 {
   2408 	COMPOUND4args_clnt args;
   2409 	COMPOUND4res_clnt res;
   2410 	CLOSE4args *close_args;
   2411 	nfs_resop4 *resop;
   2412 	nfs_argop4 argop[3];
   2413 	int doqueue = 1;
   2414 	mntinfo4_t *mi;
   2415 	seqid4 seqid;
   2416 	vnode_t *vp;
   2417 	bool_t needrecov = FALSE;
   2418 	nfs4_lost_rqst_t lost_rqst;
   2419 	hrtime_t t;
   2420 
   2421 	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
   2422 
   2423 	ASSERT(MUTEX_HELD(&osp->os_sync_lock));
   2424 
   2425 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
   2426 
   2427 	/* Only set this to 1 if recovery is started */
   2428 	*recov = 0;
   2429 
   2430 	/* do the OTW call to close the file */
   2431 
   2432 	if (close_type == CLOSE_RESEND)
   2433 		args.ctag = TAG_CLOSE_LOST;
   2434 	else if (close_type == CLOSE_AFTER_RESEND)
   2435 		args.ctag = TAG_CLOSE_UNDO;
   2436 	else
   2437 		args.ctag = TAG_CLOSE;
   2438 
   2439 	args.array_len = 3;
   2440 	args.array = argop;
   2441 
   2442 	vp = RTOV4(rp);
   2443 
   2444 	mi = VTOMI4(vp);
   2445 
   2446 	/* putfh target fh */
   2447 	argop[0].argop = OP_CPUTFH;
   2448 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
   2449 
   2450 	argop[1].argop = OP_GETATTR;
   2451 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   2452 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
   2453 
   2454 	argop[2].argop = OP_CLOSE;
   2455 	close_args = &argop[2].nfs_argop4_u.opclose;
   2456 
   2457 	seqid = nfs4_get_open_seqid(oop) + 1;
   2458 
   2459 	close_args->seqid = seqid;
   2460 	close_args->open_stateid = osp->open_stateid;
   2461 
   2462 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
   2463 	    "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
   2464 	    rnode4info(rp)));
   2465 
   2466 	t = gethrtime();
   2467 
   2468 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
   2469 
   2470 	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
   2471 		nfs4_set_open_seqid(seqid, oop, args.ctag);
   2472 	}
   2473 
   2474 	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
   2475 	if (ep->error && !needrecov) {
   2476 		/*
   2477 		 * if there was an error and no recovery is to be done
   2478 		 * then then set up the file to flush its cache if
   2479 		 * needed for the next caller.
   2480 		 */
   2481 		mutex_enter(&rp->r_statelock);
   2482 		PURGE_ATTRCACHE4_LOCKED(rp);
   2483 		rp->r_flags &= ~R4WRITEMODIFIED;
   2484 		mutex_exit(&rp->r_statelock);
   2485 		return;
   2486 	}
   2487 
   2488 	if (needrecov) {
   2489 		bool_t abort;
   2490 		nfs4_bseqid_entry_t *bsep = NULL;
   2491 
   2492 		if (close_type != CLOSE_RESEND)
   2493 			nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
   2494 			    osp, cred_otw, vp);
   2495 
   2496 		if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
   2497 			bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
   2498 			    0, args.ctag, close_args->seqid);
   2499 
   2500 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   2501 		    "nfs4close_otw: initiating recovery. error %d "
   2502 		    "res.status %d", ep->error, res.status));
   2503 
   2504 		/*
   2505 		 * Drop the 'os_sync_lock' here so we don't hit
   2506 		 * a potential recursive mutex_enter via an
   2507 		 * 'open_stream_hold()'.
   2508 		 */
   2509 		mutex_exit(&osp->os_sync_lock);
   2510 		*have_sync_lockp = 0;
   2511 		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
   2512 		    (close_type != CLOSE_RESEND &&
   2513 		    lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
   2514 		    OP_CLOSE, bsep);
   2515 
   2516 		/* drop open seq sync, and let the calling function regrab it */
   2517 		nfs4_end_open_seqid_sync(oop);
   2518 		*did_start_seqid_syncp = 0;
   2519 
   2520 		if (bsep)
   2521 			kmem_free(bsep, sizeof (*bsep));
   2522 		/*
   2523 		 * For signals, the caller wants to quit, so don't say to
   2524 		 * retry.  For forced unmount, if it's a user thread, it
   2525 		 * wants to quit.  If it's a recovery thread, the retry
   2526 		 * will happen higher-up on the call stack.  Either way,
   2527 		 * don't say to retry.
   2528 		 */
   2529 		if (abort == FALSE && ep->error != EINTR &&
   2530 		    !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
   2531 		    close_type != CLOSE_RESEND &&
   2532 		    close_type != CLOSE_AFTER_RESEND)
   2533 			*recov = 1;
   2534 		else
   2535 			*recov = 0;
   2536 
   2537 		if (!ep->error)
   2538 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   2539 		return;
   2540 	}
   2541 
   2542 	if (res.status) {
   2543 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   2544 		return;
   2545 	}
   2546 
   2547 	mutex_enter(&rp->r_statev4_lock);
   2548 	rp->created_v4 = 0;
   2549 	mutex_exit(&rp->r_statev4_lock);
   2550 
   2551 	resop = &res.array[2];
   2552 	osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
   2553 	osp->os_valid = 0;
   2554 
   2555 	/*
   2556 	 * This removes the reference obtained at OPEN; ie, when the
   2557 	 * open stream structure was created.
   2558 	 *
   2559 	 * We don't have to worry about calling 'open_stream_rele'
   2560 	 * since we our currently holding a reference to the open
   2561 	 * stream which means the count cannot go to 0 with this
   2562 	 * decrement.
   2563 	 */
   2564 	ASSERT(osp->os_ref_count >= 2);
   2565 	osp->os_ref_count--;
   2566 
   2567 	if (!ep->error)
   2568 		nfs4_attr_cache(vp,
   2569 		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
   2570 		    t, cred_otw, TRUE, NULL);
   2571 
   2572 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
   2573 	    " returning %d", ep->error));
   2574 
   2575 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   2576 }
   2577 
   2578 /* ARGSUSED */
   2579 static int
   2580 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
   2581     caller_context_t *ct)
   2582 {
   2583 	rnode4_t *rp;
   2584 	u_offset_t off;
   2585 	offset_t diff;
   2586 	uint_t on;
   2587 	uint_t n;
   2588 	caddr_t base;
   2589 	uint_t flags;
   2590 	int error;
   2591 	mntinfo4_t *mi;
   2592 
   2593 	rp = VTOR4(vp);
   2594 
   2595 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
   2596 
   2597 	if (IS_SHADOW(vp, rp))
   2598 		vp = RTOV4(rp);
   2599 
   2600 	if (vp->v_type != VREG)
   2601 		return (EISDIR);
   2602 
   2603 	mi = VTOMI4(vp);
   2604 
   2605 	if (nfs_zone() != mi->mi_zone)
   2606 		return (EIO);
   2607 
   2608 	if (uiop->uio_resid == 0)
   2609 		return (0);
   2610 
   2611 	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
   2612 		return (EINVAL);
   2613 
   2614 	mutex_enter(&rp->r_statelock);
   2615 	if (rp->r_flags & R4RECOVERRP)
   2616 		error = (rp->r_error ? rp->r_error : EIO);
   2617 	else
   2618 		error = 0;
   2619 	mutex_exit(&rp->r_statelock);
   2620 	if (error)
   2621 		return (error);
   2622 
   2623 	/*
   2624 	 * Bypass VM if caching has been disabled (e.g., locking) or if
   2625 	 * using client-side direct I/O and the file is not mmap'd and
   2626 	 * there are no cached pages.
   2627 	 */
   2628 	if ((vp->v_flag & VNOCACHE) ||
   2629 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
   2630 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
   2631 		size_t resid = 0;
   2632 
   2633 		return (nfs4read(vp, NULL, uiop->uio_loffset,
   2634 		    uiop->uio_resid, &resid, cr, FALSE, uiop));
   2635 	}
   2636 
   2637 	error = 0;
   2638 
   2639 	do {
   2640 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
   2641 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
   2642 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
   2643 
   2644 		if (error = nfs4_validate_caches(vp, cr))
   2645 			break;
   2646 
   2647 		mutex_enter(&rp->r_statelock);
   2648 		while (rp->r_flags & R4INCACHEPURGE) {
   2649 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
   2650 				mutex_exit(&rp->r_statelock);
   2651 				return (EINTR);
   2652 			}
   2653 		}
   2654 		diff = rp->r_size - uiop->uio_loffset;
   2655 		mutex_exit(&rp->r_statelock);
   2656 		if (diff <= 0)
   2657 			break;
   2658 		if (diff < n)
   2659 			n = (uint_t)diff;
   2660 
   2661 		if (vpm_enable) {
   2662 			/*
   2663 			 * Copy data.
   2664 			 */
   2665 			error = vpm_data_copy(vp, off + on, n, uiop,
   2666 			    1, NULL, 0, S_READ);
   2667 		} else {
   2668 			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
   2669 			    S_READ);
   2670 
   2671 			error = uiomove(base + on, n, UIO_READ, uiop);
   2672 		}
   2673 
   2674 		if (!error) {
   2675 			/*
   2676 			 * If read a whole block or read to eof,
   2677 			 * won't need this buffer again soon.
   2678 			 */
   2679 			mutex_enter(&rp->r_statelock);
   2680 			if (n + on == MAXBSIZE ||
   2681 			    uiop->uio_loffset == rp->r_size)
   2682 				flags = SM_DONTNEED;
   2683 			else
   2684 				flags = 0;
   2685 			mutex_exit(&rp->r_statelock);
   2686 			if (vpm_enable) {
   2687 				error = vpm_sync_pages(vp, off, n, flags);
   2688 			} else {
   2689 				error = segmap_release(segkmap, base, flags);
   2690 			}
   2691 		} else {
   2692 			if (vpm_enable) {
   2693 				(void) vpm_sync_pages(vp, off, n, 0);
   2694 			} else {
   2695 				(void) segmap_release(segkmap, base, 0);
   2696 			}
   2697 		}
   2698 	} while (!error && uiop->uio_resid > 0);
   2699 
   2700 	return (error);
   2701 }
   2702 
   2703 /* ARGSUSED */
   2704 static int
   2705 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
   2706     caller_context_t *ct)
   2707 {
   2708 	rlim64_t limit = uiop->uio_llimit;
   2709 	rnode4_t *rp;
   2710 	u_offset_t off;
   2711 	caddr_t base;
   2712 	uint_t flags;
   2713 	int remainder;
   2714 	size_t n;
   2715 	int on;
   2716 	int error;
   2717 	int resid;
   2718 	u_offset_t offset;
   2719 	mntinfo4_t *mi;
   2720 	uint_t bsize;
   2721 
   2722 	rp = VTOR4(vp);
   2723 
   2724 	if (IS_SHADOW(vp, rp))
   2725 		vp = RTOV4(rp);
   2726 
   2727 	if (vp->v_type != VREG)
   2728 		return (EISDIR);
   2729 
   2730 	mi = VTOMI4(vp);
   2731 
   2732 	if (nfs_zone() != mi->mi_zone)
   2733 		return (EIO);
   2734 
   2735 	if (uiop->uio_resid == 0)
   2736 		return (0);
   2737 
   2738 	mutex_enter(&rp->r_statelock);
   2739 	if (rp->r_flags & R4RECOVERRP)
   2740 		error = (rp->r_error ? rp->r_error : EIO);
   2741 	else
   2742 		error = 0;
   2743 	mutex_exit(&rp->r_statelock);
   2744 	if (error)
   2745 		return (error);
   2746 
   2747 	if (ioflag & FAPPEND) {
   2748 		struct vattr va;
   2749 
   2750 		/*
   2751 		 * Must serialize if appending.
   2752 		 */
   2753 		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
   2754 			nfs_rw_exit(&rp->r_rwlock);
   2755 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
   2756 			    INTR4(vp)))
   2757 				return (EINTR);
   2758 		}
   2759 
   2760 		va.va_mask = AT_SIZE;
   2761 		error = nfs4getattr(vp, &va, cr);
   2762 		if (error)
   2763 			return (error);
   2764 		uiop->uio_loffset = va.va_size;
   2765 	}
   2766 
   2767 	offset = uiop->uio_loffset + uiop->uio_resid;
   2768 
   2769 	if (uiop->uio_loffset < (offset_t)0 || offset < 0)
   2770 		return (EINVAL);
   2771 
   2772 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
   2773 		limit = MAXOFFSET_T;
   2774 
   2775 	/*
   2776 	 * Check to make sure that the process will not exceed
   2777 	 * its limit on file size.  It is okay to write up to
   2778 	 * the limit, but not beyond.  Thus, the write which
   2779 	 * reaches the limit will be short and the next write
   2780 	 * will return an error.
   2781 	 */
   2782 	remainder = 0;
   2783 	if (offset > uiop->uio_llimit) {
   2784 		remainder = offset - uiop->uio_llimit;
   2785 		uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
   2786 		if (uiop->uio_resid <= 0) {
   2787 			proc_t *p = ttoproc(curthread);
   2788 
   2789 			uiop->uio_resid += remainder;
   2790 			mutex_enter(&p->p_lock);
   2791 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
   2792 			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
   2793 			mutex_exit(&p->p_lock);
   2794 			return (EFBIG);
   2795 		}
   2796 	}
   2797 
   2798 	/* update the change attribute, if we have a write delegation */
   2799 
   2800 	mutex_enter(&rp->r_statev4_lock);
   2801 	if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
   2802 		rp->r_deleg_change++;
   2803 
   2804 	mutex_exit(&rp->r_statev4_lock);
   2805 
   2806 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
   2807 		return (EINTR);
   2808 
   2809 	/*
   2810 	 * Bypass VM if caching has been disabled (e.g., locking) or if
   2811 	 * using client-side direct I/O and the file is not mmap'd and
   2812 	 * there are no cached pages.
   2813 	 */
   2814 	if ((vp->v_flag & VNOCACHE) ||
   2815 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
   2816 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
   2817 		size_t bufsize;
   2818 		int count;
   2819 		u_offset_t org_offset;
   2820 		stable_how4 stab_comm;
   2821 nfs4_fwrite:
   2822 		if (rp->r_flags & R4STALE) {
   2823 			resid = uiop->uio_resid;
   2824 			offset = uiop->uio_loffset;
   2825 			error = rp->r_error;
   2826 			/*
   2827 			 * A close may have cleared r_error, if so,
   2828 			 * propagate ESTALE error return properly
   2829 			 */
   2830 			if (error == 0)
   2831 				error = ESTALE;
   2832 			goto bottom;
   2833 		}
   2834 
   2835 		bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
   2836 		base = kmem_alloc(bufsize, KM_SLEEP);
   2837 		do {
   2838 			if (ioflag & FDSYNC)
   2839 				stab_comm = DATA_SYNC4;
   2840 			else
   2841 				stab_comm = FILE_SYNC4;
   2842 			resid = uiop->uio_resid;
   2843 			offset = uiop->uio_loffset;
   2844 			count = MIN(uiop->uio_resid, bufsize);
   2845 			org_offset = uiop->uio_loffset;
   2846 			error = uiomove(base, count, UIO_WRITE, uiop);
   2847 			if (!error) {
   2848 				error = nfs4write(vp, base, org_offset,
   2849 				    count, cr, &stab_comm);
   2850 				if (!error) {
   2851 					mutex_enter(&rp->r_statelock);
   2852 					if (rp->r_size < uiop->uio_loffset)
   2853 						rp->r_size = uiop->uio_loffset;
   2854 					mutex_exit(&rp->r_statelock);
   2855 				}
   2856 			}
   2857 		} while (!error && uiop->uio_resid > 0);
   2858 		kmem_free(base, bufsize);
   2859 		goto bottom;
   2860 	}
   2861 
   2862 	bsize = vp->v_vfsp->vfs_bsize;
   2863 
   2864 	do {
   2865 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
   2866 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
   2867 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
   2868 
   2869 		resid = uiop->uio_resid;
   2870 		offset = uiop->uio_loffset;
   2871 
   2872 		if (rp->r_flags & R4STALE) {
   2873 			error = rp->r_error;
   2874 			/*
   2875 			 * A close may have cleared r_error, if so,
   2876 			 * propagate ESTALE error return properly
   2877 			 */
   2878 			if (error == 0)
   2879 				error = ESTALE;
   2880 			break;
   2881 		}
   2882 
   2883 		/*
   2884 		 * Don't create dirty pages faster than they
   2885 		 * can be cleaned so that the system doesn't
   2886 		 * get imbalanced.  If the async queue is
   2887 		 * maxed out, then wait for it to drain before
   2888 		 * creating more dirty pages.  Also, wait for
   2889 		 * any threads doing pagewalks in the vop_getattr
   2890 		 * entry points so that they don't block for
   2891 		 * long periods.
   2892 		 */
   2893 		mutex_enter(&rp->r_statelock);
   2894 		while ((mi->mi_max_threads != 0 &&
   2895 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
   2896 		    rp->r_gcount > 0) {
   2897 			if (INTR4(vp)) {
   2898 				klwp_t *lwp = ttolwp(curthread);
   2899 
   2900 				if (lwp != NULL)
   2901 					lwp->lwp_nostop++;
   2902 				if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
   2903 					mutex_exit(&rp->r_statelock);
   2904 					if (lwp != NULL)
   2905 						lwp->lwp_nostop--;
   2906 					error = EINTR;
   2907 					goto bottom;
   2908 				}
   2909 				if (lwp != NULL)
   2910 					lwp->lwp_nostop--;
   2911 			} else
   2912 				cv_wait(&rp->r_cv, &rp->r_statelock);
   2913 		}
   2914 		mutex_exit(&rp->r_statelock);
   2915 
   2916 		/*
   2917 		 * Touch the page and fault it in if it is not in core
   2918 		 * before segmap_getmapflt or vpm_data_copy can lock it.
   2919 		 * This is to avoid the deadlock if the buffer is mapped
   2920 		 * to the same file through mmap which we want to write.
   2921 		 */
   2922 		uio_prefaultpages((long)n, uiop);
   2923 
   2924 		if (vpm_enable) {
   2925 			/*
   2926 			 * It will use kpm mappings, so no need to
   2927 			 * pass an address.
   2928 			 */
   2929 			error = writerp4(rp, NULL, n, uiop, 0);
   2930 		} else  {
   2931 			if (segmap_kpm) {
   2932 				int pon = uiop->uio_loffset & PAGEOFFSET;
   2933 				size_t pn = MIN(PAGESIZE - pon,
   2934 				    uiop->uio_resid);
   2935 				int pagecreate;
   2936 
   2937 				mutex_enter(&rp->r_statelock);
   2938 				pagecreate = (pon == 0) && (pn == PAGESIZE ||
   2939 				    uiop->uio_loffset + pn >= rp->r_size);
   2940 				mutex_exit(&rp->r_statelock);
   2941 
   2942 				base = segmap_getmapflt(segkmap, vp, off + on,
   2943 				    pn, !pagecreate, S_WRITE);
   2944 
   2945 				error = writerp4(rp, base + pon, n, uiop,
   2946 				    pagecreate);
   2947 
   2948 			} else {
   2949 				base = segmap_getmapflt(segkmap, vp, off + on,
   2950 				    n, 0, S_READ);
   2951 				error = writerp4(rp, base + on, n, uiop, 0);
   2952 			}
   2953 		}
   2954 
   2955 		if (!error) {
   2956 			if (mi->mi_flags & MI4_NOAC)
   2957 				flags = SM_WRITE;
   2958 			else if ((uiop->uio_loffset % bsize) == 0 ||
   2959 			    IS_SWAPVP(vp)) {
   2960 				/*
   2961 				 * Have written a whole block.
   2962 				 * Start an asynchronous write
   2963 				 * and mark the buffer to
   2964 				 * indicate that it won't be
   2965 				 * needed again soon.
   2966 				 */
   2967 				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
   2968 			} else
   2969 				flags = 0;
   2970 			if ((ioflag & (FSYNC|FDSYNC)) ||
   2971 			    (rp->r_flags & R4OUTOFSPACE)) {
   2972 				flags &= ~SM_ASYNC;
   2973 				flags |= SM_WRITE;
   2974 			}
   2975 			if (vpm_enable) {
   2976 				error = vpm_sync_pages(vp, off, n, flags);
   2977 			} else {
   2978 				error = segmap_release(segkmap, base, flags);
   2979 			}
   2980 		} else {
   2981 			if (vpm_enable) {
   2982 				(void) vpm_sync_pages(vp, off, n, 0);
   2983 			} else {
   2984 				(void) segmap_release(segkmap, base, 0);
   2985 			}
   2986 			/*
   2987 			 * In the event that we got an access error while
   2988 			 * faulting in a page for a write-only file just
   2989 			 * force a write.
   2990 			 */
   2991 			if (error == EACCES)
   2992 				goto nfs4_fwrite;
   2993 		}
   2994 	} while (!error && uiop->uio_resid > 0);
   2995 
   2996 bottom:
   2997 	if (error) {
   2998 		uiop->uio_resid = resid + remainder;
   2999 		uiop->uio_loffset = offset;
   3000 	} else {
   3001 		uiop->uio_resid += remainder;
   3002 
   3003 		mutex_enter(&rp->r_statev4_lock);
   3004 		if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
   3005 			gethrestime(&rp->r_attr.va_mtime);
   3006 			rp->r_attr.va_ctime = rp->r_attr.va_mtime;
   3007 		}
   3008 		mutex_exit(&rp->r_statev4_lock);
   3009 	}
   3010 
   3011 	nfs_rw_exit(&rp->r_lkserlock);
   3012 
   3013 	return (error);
   3014 }
   3015 
   3016 /*
   3017  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
   3018  */
   3019 static int
   3020 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
   3021     int flags, cred_t *cr)
   3022 {
   3023 	struct buf *bp;
   3024 	int error;
   3025 	page_t *savepp;
   3026 	uchar_t fsdata;
   3027 	stable_how4 stab_comm;
   3028 
   3029 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
   3030 	bp = pageio_setup(pp, len, vp, flags);
   3031 	ASSERT(bp != NULL);
   3032 
   3033 	/*
   3034 	 * pageio_setup should have set b_addr to 0.  This
   3035 	 * is correct since we want to do I/O on a page
   3036 	 * boundary.  bp_mapin will use this addr to calculate
   3037 	 * an offset, and then set b_addr to the kernel virtual
   3038 	 * address it allocated for us.
   3039 	 */
   3040 	ASSERT(bp->b_un.b_addr == 0);
   3041 
   3042 	bp->b_edev = 0;
   3043 	bp->b_dev = 0;
   3044 	bp->b_lblkno = lbtodb(off);
   3045 	bp->b_file = vp;
   3046 	bp->b_offset = (offset_t)off;
   3047 	bp_mapin(bp);
   3048 
   3049 	if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
   3050 	    freemem > desfree)
   3051 		stab_comm = UNSTABLE4;
   3052 	else
   3053 		stab_comm = FILE_SYNC4;
   3054 
   3055 	error = nfs4_bio(bp, &stab_comm, cr, FALSE);
   3056 
   3057 	bp_mapout(bp);
   3058 	pageio_done(bp);
   3059 
   3060 	if (stab_comm == UNSTABLE4)
   3061 		fsdata = C_DELAYCOMMIT;
   3062 	else
   3063 		fsdata = C_NOCOMMIT;
   3064 
   3065 	savepp = pp;
   3066 	do {
   3067 		pp->p_fsdata = fsdata;
   3068 	} while ((pp = pp->p_next) != savepp);
   3069 
   3070 	return (error);
   3071 }
   3072 
   3073 /*
   3074  */
   3075 static int
   3076 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
   3077 {
   3078 	nfs4_open_owner_t	*oop;
   3079 	nfs4_open_stream_t	*osp;
   3080 	rnode4_t		*rp = VTOR4(vp);
   3081 	mntinfo4_t 		*mi = VTOMI4(vp);
   3082 	int 			reopen_needed;
   3083 
   3084 	ASSERT(nfs_zone() == mi->mi_zone);
   3085 
   3086 
   3087 	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
   3088 	if (!oop)
   3089 		return (EIO);
   3090 
   3091 	/* returns with 'os_sync_lock' held */
   3092 	osp = find_open_stream(oop, rp);
   3093 	if (!osp) {
   3094 		open_owner_rele(oop);
   3095 		return (EIO);
   3096 	}
   3097 
   3098 	if (osp->os_failed_reopen) {
   3099 		mutex_exit(&osp->os_sync_lock);
   3100 		open_stream_rele(osp, rp);
   3101 		open_owner_rele(oop);
   3102 		return (EIO);
   3103 	}
   3104 
   3105 	/*
   3106 	 * Determine whether a reopen is needed.  If this
   3107 	 * is a delegation open stream, then the os_delegation bit
   3108 	 * should be set.
   3109 	 */
   3110 
   3111 	reopen_needed = osp->os_delegation;
   3112 
   3113 	mutex_exit(&osp->os_sync_lock);
   3114 	open_owner_rele(oop);
   3115 
   3116 	if (reopen_needed) {
   3117 		nfs4_error_zinit(ep);
   3118 		nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
   3119 		mutex_enter(&osp->os_sync_lock);
   3120 		if (ep->error || ep->stat || osp->os_failed_reopen) {
   3121 			mutex_exit(&osp->os_sync_lock);
   3122 			open_stream_rele(osp, rp);
   3123 			return (EIO);
   3124 		}
   3125 		mutex_exit(&osp->os_sync_lock);
   3126 	}
   3127 	open_stream_rele(osp, rp);
   3128 
   3129 	return (0);
   3130 }
   3131 
   3132 /*
   3133  * Write to file.  Writes to remote server in largest size
   3134  * chunks that the server can handle.  Write is synchronous.
   3135  */
   3136 static int
   3137 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
   3138     stable_how4 *stab_comm)
   3139 {
   3140 	mntinfo4_t *mi;
   3141 	COMPOUND4args_clnt args;
   3142 	COMPOUND4res_clnt res;
   3143 	WRITE4args *wargs;
   3144 	WRITE4res *wres;
   3145 	nfs_argop4 argop[2];
   3146 	nfs_resop4 *resop;
   3147 	int tsize;
   3148 	stable_how4 stable;
   3149 	rnode4_t *rp;
   3150 	int doqueue = 1;
   3151 	bool_t needrecov;
   3152 	nfs4_recov_state_t recov_state;
   3153 	nfs4_stateid_types_t sid_types;
   3154 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   3155 	int recov;
   3156 
   3157 	rp = VTOR4(vp);
   3158 	mi = VTOMI4(vp);
   3159 
   3160 	ASSERT(nfs_zone() == mi->mi_zone);
   3161 
   3162 	stable = *stab_comm;
   3163 	*stab_comm = FILE_SYNC4;
   3164 
   3165 	needrecov = FALSE;
   3166 	recov_state.rs_flags = 0;
   3167 	recov_state.rs_num_retry_despite_err = 0;
   3168 	nfs4_init_stateid_types(&sid_types);
   3169 
   3170 	/* Is curthread the recovery thread? */
   3171 	mutex_enter(&mi->mi_lock);
   3172 	recov = (mi->mi_recovthread == curthread);
   3173 	mutex_exit(&mi->mi_lock);
   3174 
   3175 recov_retry:
   3176 	args.ctag = TAG_WRITE;
   3177 	args.array_len = 2;
   3178 	args.array = argop;
   3179 
   3180 	if (!recov) {
   3181 		e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
   3182 		    &recov_state, NULL);
   3183 		if (e.error)
   3184 			return (e.error);
   3185 	}
   3186 
   3187 	/* 0. putfh target fh */
   3188 	argop[0].argop = OP_CPUTFH;
   3189 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
   3190 
   3191 	/* 1. write */
   3192 	nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);
   3193 
   3194 	do {
   3195 
   3196 		wargs->offset = (offset4)offset;
   3197 		wargs->data_val = base;
   3198 
   3199 		if (mi->mi_io_kstats) {
   3200 			mutex_enter(&mi->mi_lock);
   3201 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   3202 			mutex_exit(&mi->mi_lock);
   3203 		}
   3204 
   3205 		if ((vp->v_flag & VNOCACHE) ||
   3206 		    (rp->r_flags & R4DIRECTIO) ||
   3207 		    (mi->mi_flags & MI4_DIRECTIO))
   3208 			tsize = MIN(mi->mi_stsize, count);
   3209 		else
   3210 			tsize = MIN(mi->mi_curwrite, count);
   3211 		wargs->data_len = (uint_t)tsize;
   3212 		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
   3213 
   3214 		if (mi->mi_io_kstats) {
   3215 			mutex_enter(&mi->mi_lock);
   3216 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
   3217 			mutex_exit(&mi->mi_lock);
   3218 		}
   3219 
   3220 		if (!recov) {
   3221 			needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
   3222 			if (e.error && !needrecov) {
   3223 				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
   3224 				    &recov_state, needrecov);
   3225 				return (e.error);
   3226 			}
   3227 		} else {
   3228 			if (e.error)
   3229 				return (e.error);
   3230 		}
   3231 
   3232 		/*
   3233 		 * Do handling of OLD_STATEID outside
   3234 		 * of the normal recovery framework.
   3235 		 *
   3236 		 * If write receives a BAD stateid error while using a
   3237 		 * delegation stateid, retry using the open stateid (if it
   3238 		 * exists).  If it doesn't have an open stateid, reopen the
   3239 		 * file first, then retry.
   3240 		 */
   3241 		if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
   3242 		    sid_types.cur_sid_type != SPEC_SID) {
   3243 			nfs4_save_stateid(&wargs->stateid, &sid_types);
   3244 			if (!recov)
   3245 				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
   3246 				    &recov_state, needrecov);
   3247 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3248 			goto recov_retry;
   3249 		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
   3250 		    sid_types.cur_sid_type == DEL_SID) {
   3251 			nfs4_save_stateid(&wargs->stateid, &sid_types);
   3252 			mutex_enter(&rp->r_statev4_lock);
   3253 			rp->r_deleg_return_pending = TRUE;
   3254 			mutex_exit(&rp->r_statev4_lock);
   3255 			if (nfs4rdwr_check_osid(vp, &e, cr)) {
   3256 				if (!recov)
   3257 					nfs4_end_fop(mi, vp, NULL, OH_WRITE,
   3258 					    &recov_state, needrecov);
   3259 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   3260 				    (caddr_t)&res);
   3261 				return (EIO);
   3262 			}
   3263 			if (!recov)
   3264 				nfs4_end_fop(mi, vp, NULL, OH_WRITE,
   3265 				    &recov_state, needrecov);
   3266 			/* hold needed for nfs4delegreturn_thread */
   3267 			VN_HOLD(vp);
   3268 			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
   3269 			    NFS4_DR_DISCARD), FALSE);
   3270 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3271 			goto recov_retry;
   3272 		}
   3273 
   3274 		if (needrecov) {
   3275 			bool_t abort;
   3276 
   3277 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   3278 			    "nfs4write: client got error %d, res.status %d"
   3279 			    ", so start recovery", e.error, res.status));
   3280 
   3281 			abort = nfs4_start_recovery(&e,
   3282 			    VTOMI4(vp), vp, NULL, &wargs->stateid,
   3283 			    NULL, OP_WRITE, NULL);
   3284 			if (!e.error) {
   3285 				e.error = geterrno4(res.status);
   3286 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   3287 				    (caddr_t)&res);
   3288 			}
   3289 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
   3290 			    &recov_state, needrecov);
   3291 			if (abort == FALSE)
   3292 				goto recov_retry;
   3293 			return (e.error);
   3294 		}
   3295 
   3296 		if (res.status) {
   3297 			e.error = geterrno4(res.status);
   3298 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3299 			if (!recov)
   3300 				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
   3301 				    &recov_state, needrecov);
   3302 			return (e.error);
   3303 		}
   3304 
   3305 		resop = &res.array[1];	/* write res */
   3306 		wres = &resop->nfs_resop4_u.opwrite;
   3307 
   3308 		if ((int)wres->count > tsize) {
   3309 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3310 
   3311 			zcmn_err(getzoneid(), CE_WARN,
   3312 			    "nfs4write: server wrote %u, requested was %u",
   3313 			    (int)wres->count, tsize);
   3314 			if (!recov)
   3315 				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
   3316 				    &recov_state, needrecov);
   3317 			return (EIO);
   3318 		}
   3319 		if (wres->committed == UNSTABLE4) {
   3320 			*stab_comm = UNSTABLE4;
   3321 			if (wargs->stable == DATA_SYNC4 ||
   3322 			    wargs->stable == FILE_SYNC4) {
   3323 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   3324 				    (caddr_t)&res);
   3325 				zcmn_err(getzoneid(), CE_WARN,
   3326 				    "nfs4write: server %s did not commit "
   3327 				    "to stable storage",
   3328 				    rp->r_server->sv_hostname);
   3329 				if (!recov)
   3330 					nfs4_end_fop(VTOMI4(vp), vp, NULL,
   3331 					    OH_WRITE, &recov_state, needrecov);
   3332 				return (EIO);
   3333 			}
   3334 		}
   3335 
   3336 		tsize = (int)wres->count;
   3337 		count -= tsize;
   3338 		base += tsize;
   3339 		offset += tsize;
   3340 		if (mi->mi_io_kstats) {
   3341 			mutex_enter(&mi->mi_lock);
   3342 			KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
   3343 			KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
   3344 			    tsize;
   3345 			mutex_exit(&mi->mi_lock);
   3346 		}
   3347 		lwp_stat_update(LWP_STAT_OUBLK, 1);
   3348 		mutex_enter(&rp->r_statelock);
   3349 		if (rp->r_flags & R4HAVEVERF) {
   3350 			if (rp->r_writeverf != wres->writeverf) {
   3351 				nfs4_set_mod(vp);
   3352 				rp->r_writeverf = wres->writeverf;
   3353 			}
   3354 		} else {
   3355 			rp->r_writeverf = wres->writeverf;
   3356 			rp->r_flags |= R4HAVEVERF;
   3357 		}
   3358 		PURGE_ATTRCACHE4_LOCKED(rp);
   3359 		rp->r_flags |= R4WRITEMODIFIED;
   3360 		gethrestime(&rp->r_attr.va_mtime);
   3361 		rp->r_attr.va_ctime = rp->r_attr.va_mtime;
   3362 		mutex_exit(&rp->r_statelock);
   3363 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3364 	} while (count);
   3365 
   3366 	if (!recov)
   3367 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state,
   3368 		    needrecov);
   3369 
   3370 	return (e.error);
   3371 }
   3372 
   3373 /*
   3374  * Read from a file.  Reads data in largest chunks our interface can handle.
   3375  */
   3376 static int
   3377 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
   3378     size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
   3379 {
   3380 	mntinfo4_t *mi;
   3381 	COMPOUND4args_clnt args;
   3382 	COMPOUND4res_clnt res;
   3383 	READ4args *rargs;
   3384 	nfs_argop4 argop[2];
   3385 	int tsize;
   3386 	int doqueue;
   3387 	rnode4_t *rp;
   3388 	int data_len;
   3389 	bool_t is_eof;
   3390 	bool_t needrecov = FALSE;
   3391 	nfs4_recov_state_t recov_state;
   3392 	nfs4_stateid_types_t sid_types;
   3393 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   3394 
   3395 	rp = VTOR4(vp);
   3396 	mi = VTOMI4(vp);
   3397 	doqueue = 1;
   3398 
   3399 	ASSERT(nfs_zone() == mi->mi_zone);
   3400 
   3401 	args.ctag = async ? TAG_READAHEAD : TAG_READ;
   3402 
   3403 	args.array_len = 2;
   3404 	args.array = argop;
   3405 
   3406 	nfs4_init_stateid_types(&sid_types);
   3407 
   3408 	recov_state.rs_flags = 0;
   3409 	recov_state.rs_num_retry_despite_err = 0;
   3410 
   3411 recov_retry:
   3412 	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
   3413 	    &recov_state, NULL);
   3414 	if (e.error)
   3415 		return (e.error);
   3416 
   3417 	/* putfh target fh */
   3418 	argop[0].argop = OP_CPUTFH;
   3419 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
   3420 
   3421 	/* read */
   3422 	argop[1].argop = OP_READ;
   3423 	rargs = &argop[1].nfs_argop4_u.opread;
   3424 	rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
   3425 	    OP_READ, &sid_types, async);
   3426 
   3427 	do {
   3428 		if (mi->mi_io_kstats) {
   3429 			mutex_enter(&mi->mi_lock);
   3430 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   3431 			mutex_exit(&mi->mi_lock);
   3432 		}
   3433 
   3434 		NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
   3435 		    "nfs4read: %s call, rp %s",
   3436 		    needrecov ? "recov" : "first",
   3437 		    rnode4info(rp)));
   3438 
   3439 		if ((vp->v_flag & VNOCACHE) ||
   3440 		    (rp->r_flags & R4DIRECTIO) ||
   3441 		    (mi->mi_flags & MI4_DIRECTIO))
   3442 			tsize = MIN(mi->mi_tsize, count);
   3443 		else
   3444 			tsize = MIN(mi->mi_curread, count);
   3445 
   3446 		rargs->offset = (offset4)offset;
   3447 		rargs->count = (count4)tsize;
   3448 		rargs->res_data_val_alt = NULL;
   3449 		rargs->res_mblk = NULL;
   3450 		rargs->res_uiop = NULL;
   3451 		rargs->res_maxsize = 0;
   3452 		rargs->wlist = NULL;
   3453 
   3454 		if (uiop)
   3455 			rargs->res_uiop = uiop;
   3456 		else
   3457 			rargs->res_data_val_alt = base;
   3458 		rargs->res_maxsize = tsize;
   3459 
   3460 		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
   3461 #ifdef	DEBUG
   3462 		if (nfs4read_error_inject) {
   3463 			res.status = nfs4read_error_inject;
   3464 			nfs4read_error_inject = 0;
   3465 		}
   3466 #endif
   3467 
   3468 		if (mi->mi_io_kstats) {
   3469 			mutex_enter(&mi->mi_lock);
   3470 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
   3471 			mutex_exit(&mi->mi_lock);
   3472 		}
   3473 
   3474 		needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
   3475 		if (e.error != 0 && !needrecov) {
   3476 			nfs4_end_fop(mi, vp, NULL, OH_READ,
   3477 			    &recov_state, needrecov);
   3478 			return (e.error);
   3479 		}
   3480 
   3481 		/*
   3482 		 * Do proper retry for OLD and BAD stateid errors outside
   3483 		 * of the normal recovery framework.  There are two differences
   3484 		 * between async and sync reads.  The first is that we allow
   3485 		 * retry on BAD_STATEID for async reads, but not sync reads.
   3486 		 * The second is that we mark the file dead for a failed
   3487 		 * attempt with a special stateid for sync reads, but just
   3488 		 * return EIO for async reads.
   3489 		 *
   3490 		 * If a sync read receives a BAD stateid error while using a
   3491 		 * delegation stateid, retry using the open stateid (if it
   3492 		 * exists).  If it doesn't have an open stateid, reopen the
   3493 		 * file first, then retry.
   3494 		 */
   3495 		if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
   3496 		    res.status == NFS4ERR_BAD_STATEID) && async) {
   3497 			nfs4_end_fop(mi, vp, NULL, OH_READ,
   3498 			    &recov_state, needrecov);
   3499 			if (sid_types.cur_sid_type == SPEC_SID) {
   3500 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   3501 				    (caddr_t)&res);
   3502 				return (EIO);
   3503 			}
   3504 			nfs4_save_stateid(&rargs->stateid, &sid_types);
   3505 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3506 			goto recov_retry;
   3507 		} else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
   3508 		    !async && sid_types.cur_sid_type != SPEC_SID) {
   3509 			nfs4_save_stateid(&rargs->stateid, &sid_types);
   3510 			nfs4_end_fop(mi, vp, NULL, OH_READ,
   3511 			    &recov_state, needrecov);
   3512 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3513 			goto recov_retry;
   3514 		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
   3515 		    sid_types.cur_sid_type == DEL_SID) {
   3516 			nfs4_save_stateid(&rargs->stateid, &sid_types);
   3517 			mutex_enter(&rp->r_statev4_lock);
   3518 			rp->r_deleg_return_pending = TRUE;
   3519 			mutex_exit(&rp->r_statev4_lock);
   3520 			if (nfs4rdwr_check_osid(vp, &e, cr)) {
   3521 				nfs4_end_fop(mi, vp, NULL, OH_READ,
   3522 				    &recov_state, needrecov);
   3523 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   3524 				    (caddr_t)&res);
   3525 				return (EIO);
   3526 			}
   3527 			nfs4_end_fop(mi, vp, NULL, OH_READ,
   3528 			    &recov_state, needrecov);
   3529 			/* hold needed for nfs4delegreturn_thread */
   3530 			VN_HOLD(vp);
   3531 			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
   3532 			    NFS4_DR_DISCARD), FALSE);
   3533 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3534 			goto recov_retry;
   3535 		}
   3536 		if (needrecov) {
   3537 			bool_t abort;
   3538 
   3539 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   3540 			    "nfs4read: initiating recovery\n"));
   3541 			abort = nfs4_start_recovery(&e,
   3542 			    mi, vp, NULL, &rargs->stateid,
   3543 			    NULL, OP_READ, NULL);
   3544 			nfs4_end_fop(mi, vp, NULL, OH_READ,
   3545 			    &recov_state, needrecov);
   3546 			/*
   3547 			 * Do not retry if we got OLD_STATEID using a special
   3548 			 * stateid.  This avoids looping with a broken server.
   3549 			 */
   3550 			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
   3551 			    sid_types.cur_sid_type == SPEC_SID)
   3552 				abort = TRUE;
   3553 
   3554 			if (abort == FALSE) {
   3555 				/*
   3556 				 * Need to retry all possible stateids in
   3557 				 * case the recovery error wasn't stateid
   3558 				 * related or the stateids have become
   3559 				 * stale (server reboot).
   3560 				 */
   3561 				nfs4_init_stateid_types(&sid_types);
   3562 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   3563 				    (caddr_t)&res);
   3564 				goto recov_retry;
   3565 			}
   3566 
   3567 			if (!e.error) {
   3568 				e.error = geterrno4(res.status);
   3569 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   3570 				    (caddr_t)&res);
   3571 			}
   3572 			return (e.error);
   3573 		}
   3574 
   3575 		if (res.status) {
   3576 			e.error = geterrno4(res.status);
   3577 			nfs4_end_fop(mi, vp, NULL, OH_READ,
   3578 			    &recov_state, needrecov);
   3579 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3580 			return (e.error);
   3581 		}
   3582 
   3583 		data_len = res.array[1].nfs_resop4_u.opread.data_len;
   3584 		count -= data_len;
   3585 		if (base)
   3586 			base += data_len;
   3587 		offset += data_len;
   3588 		if (mi->mi_io_kstats) {
   3589 			mutex_enter(&mi->mi_lock);
   3590 			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
   3591 			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
   3592 			mutex_exit(&mi->mi_lock);
   3593 		}
   3594 		lwp_stat_update(LWP_STAT_INBLK, 1);
   3595 		is_eof = res.array[1].nfs_resop4_u.opread.eof;
   3596 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   3597 
   3598 	} while (count && !is_eof);
   3599 
   3600 	*residp = count;
   3601 
   3602 	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
   3603 
   3604 	return (e.error);
   3605 }
   3606 
   3607 /* ARGSUSED */
   3608 static int
   3609 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
   3610 	caller_context_t *ct)
   3611 {
   3612 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
   3613 		return (EIO);
   3614 	switch (cmd) {
   3615 		case _FIODIRECTIO:
   3616 			return (nfs4_directio(vp, (int)arg, cr));
   3617 		default:
   3618 			return (ENOTTY);
   3619 	}
   3620 }
   3621 
   3622 /* ARGSUSED */
   3623 int
   3624 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
   3625     caller_context_t *ct)
   3626 {
   3627 	int error;
   3628 	rnode4_t *rp = VTOR4(vp);
   3629 
   3630 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
   3631 		return (EIO);
   3632 	/*
   3633 	 * If it has been specified that the return value will
   3634 	 * just be used as a hint, and we are only being asked
   3635 	 * for size, fsid or rdevid, then return the client's
   3636 	 * notion of these values without checking to make sure
   3637 	 * that the attribute cache is up to date.
   3638 	 * The whole point is to avoid an over the wire GETATTR
   3639 	 * call.
   3640 	 */
   3641 	if (flags & ATTR_HINT) {
   3642 		if (vap->va_mask ==
   3643 		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
   3644 			mutex_enter(&rp->r_statelock);
   3645 			if (vap->va_mask | AT_SIZE)
   3646 				vap->va_size = rp->r_size;
   3647 			if (vap->va_mask | AT_FSID)
   3648 				vap->va_fsid = rp->r_attr.va_fsid;
   3649 			if (vap->va_mask | AT_RDEV)
   3650 				vap->va_rdev = rp->r_attr.va_rdev;
   3651 			mutex_exit(&rp->r_statelock);
   3652 			return (0);
   3653 		}
   3654 	}
   3655 
   3656 	/*
   3657 	 * Only need to flush pages if asking for the mtime
   3658 	 * and if there any dirty pages or any outstanding
   3659 	 * asynchronous (write) requests for this file.
   3660 	 */
   3661 	if (vap->va_mask & AT_MTIME) {
   3662 		rp = VTOR4(vp);
   3663 		if (nfs4_has_pages(vp)) {
   3664 			mutex_enter(&rp->r_statev4_lock);
   3665 			if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
   3666 				mutex_exit(&rp->r_statev4_lock);
   3667 				if (rp->r_flags & R4DIRTY ||
   3668 				    rp->r_awcount > 0) {
   3669 					mutex_enter(&rp->r_statelock);
   3670 					rp->r_gcount++;
   3671 					mutex_exit(&rp->r_statelock);
   3672 					error =
   3673 					    nfs4_putpage(vp, (u_offset_t)0,
   3674 					    0, 0, cr, NULL);
   3675 					mutex_enter(&rp->r_statelock);
   3676 					if (error && (error == ENOSPC ||
   3677 					    error == EDQUOT)) {
   3678 						if (!rp->r_error)
   3679 							rp->r_error = error;
   3680 					}
   3681 					if (--rp->r_gcount == 0)
   3682 						cv_broadcast(&rp->r_cv);
   3683 					mutex_exit(&rp->r_statelock);
   3684 				}
   3685 			} else {
   3686 				mutex_exit(&rp->r_statev4_lock);
   3687 			}
   3688 		}
   3689 	}
   3690 	return (nfs4getattr(vp, vap, cr));
   3691 }
   3692 
   3693 int
   3694 nfs4_compare_modes(mode_t from_server, mode_t on_client)
   3695 {
   3696 	/*
   3697 	 * If these are the only two bits cleared
   3698 	 * on the server then return 0 (OK) else
   3699 	 * return 1 (BAD).
   3700 	 */
   3701 	on_client &= ~(S_ISUID|S_ISGID);
   3702 	if (on_client == from_server)
   3703 		return (0);
   3704 	else
   3705 		return (1);
   3706 }
   3707 
   3708 /*ARGSUSED4*/
   3709 static int
   3710 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
   3711     caller_context_t *ct)
   3712 {
   3713 	if (vap->va_mask & AT_NOSET)
   3714 		return (EINVAL);
   3715 
   3716 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
   3717 		return (EIO);
   3718 
   3719 	/*
   3720 	 * Don't call secpolicy_vnode_setattr, the client cannot
   3721 	 * use its cached attributes to make security decisions
   3722 	 * as the server may be faking mode bits or mapping uid/gid.
   3723 	 * Always just let the server to the checking.
   3724 	 * If we provide the ability to remove basic priviledges
   3725 	 * to setattr (e.g. basic without chmod) then we will
   3726 	 * need to add a check here before calling the server.
   3727 	 */
   3728 
   3729 	return (nfs4setattr(vp, vap, flags, cr, NULL));
   3730 }
   3731 
/*
 * To replace the "guarded" version 3 setattr, we use two types of compound
 * setattr requests:
 * 1. The "normal" setattr, used when the size of the file isn't being
 *    changed - { Putfh <fh>; Setattr; Getattr }.
 * 2. If the size is changed, precede Setattr with: Getattr; Verify
 *    with only ctime as the argument. If the server ctime differs from
 *    what is cached on the client, the verify will fail, but we would
 *    already have the ctime from the preceding getattr, so just set it
 *    and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
 *	Setattr; Getattr }.
 *
 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
 * this setattr and NULL if they are not.
 *
 * Parameters:
 *	vp	- vnode whose attributes are being set
 *	vap	- attributes to set; va_mask selects which ones
 *	flags	- passed through to nfs4args_setattr()
 *	cr	- credentials used for the over-the-wire call and page flush
 *	vsap	- ACL to set, or NULL
 *
 * Returns 0 or an errno value.  After a successful compound the return
 * value is the result of nfs4_update_attrcache().
 */
static int
nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    vsecattr_t *vsap)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs4_ga_res_t *garp = NULL;
	int numops = 3;			/* { Putfh; Setattr; Getattr } */
	nfs_argop4 argop[5];
	int verify_argop = -1;
	int setattr_argop = 1;
	nfs_resop4 *resop;
	vattr_t va;
	rnode4_t *rp;
	int doqueue = 1;
	uint_t mask = vap->va_mask;
	mode_t omode;
	vsecattr_t *vsp;
	timestruc_t ctime;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	stateid4 stateid;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	servinfo4_t *svp;
	bitmap4 supp_attrs;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	rp = VTOR4(vp);
	nfs4_init_stateid_types(&sid_types);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with R4DIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (nfs4_has_pages(vp) &&
	    ((rp->r_flags & R4DIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
		/*
		 * Only ENOSPC/EDQUOT are recorded in the rnode; e.error
		 * itself is overwritten by nfs4_start_op() below.
		 */
		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	}

	if (mask & AT_SIZE) {
		/*
		 * Verification setattr compound for non-deleg AT_SIZE:
		 *	{ Putfh; Getattr; Verify; Setattr; Getattr }
		 * Set ctime local here (outside the do_again label)
		 * so that subsequent retries (after failed VERIFY)
		 * will use ctime from GETATTR results (from failed
		 * verify compound) as VERIFY arg.
		 * If file has delegation, then VERIFY(time_metadata)
		 * is of little added value, so don't bother.
		 */
		mutex_enter(&rp->r_statev4_lock);
		if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
		    rp->r_deleg_return_pending) {
			numops = 5;
			ctime = rp->r_attr.va_ctime;
		}
		mutex_exit(&rp->r_statev4_lock);
	}

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	args.ctag = TAG_SETATTR;
	/*
	 * Both labels mark the same point: every retry rebuilds the whole
	 * argument array from scratch.
	 */
do_again:
recov_retry:
	/* SETATTR is always the second-to-last op; the last is GETATTR. */
	setattr_argop = numops - 2;

	args.array = argop;
	args.array_len = numops;

	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
	if (e.error)
		return (e.error);


	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	if (numops == 5) {
		/*
		 * We only care about the ctime, but need to get mtime
		 * and size for proper cache update.
		 */
		/* getattr */
		argop[1].argop = OP_GETATTR;
		argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

		/* verify - set later in loop */
		verify_argop = 2;
	}

	/* setattr */
	/* Snapshot the server's supported-attribute bitmap under sv_lock. */
	svp = rp->r_server;
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
	supp_attrs = svp->sv_supp_attrs;
	nfs_rw_exit(&svp->sv_lock);

	nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
	    supp_attrs, &e.error, &sid_types);
	/* Keep the stateid that was used, for the OLD_STATEID retry below. */
	stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
	if (e.error) {
		/* req time field(s) overflow - return immediately */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		return (e.error);
	}
	/*
	 * Remember the cached mode from before the call; it is compared
	 * with the server's mode in the uid/gid fix-up at the end.
	 */
	omode = rp->r_attr.va_mode;

	/* getattr */
	argop[numops-1].argop = OP_GETATTR;
	argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	/*
	 * If we are setting the ACL (indicated only by vsap != NULL), request
	 * the ACL in this getattr.  The ACL returned from this getattr will be
	 * used in updating the ACL cache.
	 */
	if (vsap != NULL)
		argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
		    FATTR4_ACL_MASK;
	argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	/*
	 * setattr iterates if the object size is set and the cached ctime
	 * does not match the file ctime. In that case, verify the ctime first.
	 */

	do {
		if (verify_argop != -1) {
			/*
			 * Verify that the ctime match before doing setattr.
			 */
			va.va_mask = AT_CTIME;
			va.va_ctime = ctime;
			svp = rp->r_server;
			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			supp_attrs = svp->sv_supp_attrs;
			nfs_rw_exit(&svp->sv_lock);
			e.error = nfs4args_verify(&argop[verify_argop], &va,
			    OP_VERIFY, supp_attrs);
			if (e.error) {
				/* req time field(s) overflow - return */
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		doqueue = 1;

		/*
		 * Timestamp taken just before the call; handed to
		 * nfs4_update_attrcache() once the compound completes.
		 */
		t = gethrtime();

		rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);

		/*
		 * Purge the access cache and ACL cache if changing either the
		 * owner of the file, the group owner, or the mode.  These may
		 * change the access permissions of the file, so purge old
		 * information and start over again.
		 */
		if (mask & (AT_UID | AT_GID | AT_MODE)) {
			(void) nfs4_access_purge_rp(rp);
			if (rp->r_secattr != NULL) {
				/*
				 * Detach the cached ACL under r_statelock and
				 * free it after the lock is dropped.
				 */
				mutex_enter(&rp->r_statelock);
				vsp = rp->r_secattr;
				rp->r_secattr = NULL;
				mutex_exit(&rp->r_statelock);
				if (vsp != NULL)
					nfs4_acl_free_cache(vsp);
			}
		}

		/*
		 * If res.array_len == numops, then everything succeeded,
		 * except for possibly the final getattr.  If only the
		 * last getattr failed, give up, and don't try recovery.
		 */
		if (res.array_len == numops) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			if (! e.error)
				resp = &res;
			break;
		}

		/*
		 * if either rpc call failed or completely succeeded - done
		 */
		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			if (!needrecov) {
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		/*
		 * Do proper retry for OLD_STATEID outside of the normal
		 * recovery framework.
		 */
		if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    sid_types.cur_sid_type != SPEC_SID &&
		    sid_types.cur_sid_type != NO_SID) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			nfs4_save_stateid(&stateid, &sid_types);
			/* Release this attempt's args and results before retry. */
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}

		if (needrecov) {
			bool_t abort;

			abort = nfs4_start_recovery(&e,
			    VTOMI4(vp), vp, NULL, NULL, NULL,
			    OP_SETATTR, NULL);
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			/*
			 * Do not retry if we failed with OLD_STATEID using
			 * a special stateid.  This is done to avoid looping
			 * with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    (sid_types.cur_sid_type == SPEC_SID ||
			    sid_types.cur_sid_type == NO_SID))
				abort = TRUE;
			/*
			 * The results are only freed when the RPC itself
			 * succeeded (e.error == 0).
			 */
			if (!e.error) {
				if (res.status == NFS4ERR_BADOWNER)
					nfs4_log_badowner(VTOMI4(vp),
					    OP_SETATTR);

				e.error = geterrno4(res.status);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			}
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				goto recov_retry;
			}
			return (e.error);
		}

		/*
		 * Need to call nfs4_end_op before nfs4getattr to
		 * avoid potential nfs4_start_op deadlock. See RFE
		 * 4777612.  Calls to nfs4_invalidate_pages() and
		 * nfs4_purge_stale_fh() might also generate over the
		 * wire calls which may cause nfs4_start_op() deadlock.
		 */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);

		/*
		 * Check to update lease.
		 */
		resp = &res;
		if (res.status == NFS4_OK) {
			break;
		}

		/*
		 * Check if verify failed to see if try again
		 */
		/*
		 * With verify_argop == 2, an array_len of 3 means the
		 * results end at the VERIFY op, i.e. the compound stopped
		 * there.
		 */
		if ((verify_argop == -1) || (res.array_len != 3)) {
			/*
			 * can't continue...
			 */
			if (res.status == NFS4ERR_BADOWNER)
				nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);

			e.error = geterrno4(res.status);
		} else {
			/*
			 * When the verify request fails, the client ctime is
			 * not in sync with the server. This is the same as
			 * the version 3 "not synchronized" error, and we
			 * handle it in a similar manner (XXX do we need to???).
			 * Use the ctime returned in the first getattr for
			 * the input to the next verify.
			 * If we couldn't get the attributes, then we give up
			 * because we can't complete the operation as required.
			 */
			garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
		}
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			nfs4_purge_stale_fh(e.error, vp, cr);
		} else {
			/*
			 * retry with a new verify value
			 */
			ctime = garp->n4g_va.va_ctime;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			resp = NULL;
		}
		if (!e.error) {
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			/*
			 * NOTE(review): res was already handed to xdr_free()
			 * in the else branch just above, so this looks like a
			 * second free of the same results - confirm that
			 * repeating xdr_free() on res is harmless.
			 */
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto do_again;
		}
	} while (!e.error);

	if (e.error) {
		/*
		 * If we are here, rfs4call has an irrecoverable error - return
		 */
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		if (verify_argop != -1) {
			nfs4args_verify_free(&argop[verify_argop]);
			verify_argop = -1;
		}
		/* resp is non-NULL only when res still holds results to free. */
		if (resp)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
		return (e.error);
	}



	/*
	 * If changing the size of the file, invalidate
	 * any local cached data which is no longer part
	 * of the file.  We also possibly invalidate the
	 * last page in the file.  We could use
	 * pvn_vpzero(), but this would mark the page as
	 * modified and require it to be written back to
	 * the server for no particularly good reason.
	 * This way, if we access it, then we bring it
	 * back in.  A read should be cheaper than a
	 * write.
	 */
	if (mask & AT_SIZE) {
		nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
	}

	/* either no error or one of the postop getattr failed */

	/*
	 * XXX Perform a simplified version of wcc checking. Instead of
	 * have another getattr to get pre-op, just purge cache if
	 * any of the ops prior to and including the getattr failed.
	 * If the getattr succeeded then update the attrcache accordingly.
	 */

	garp = NULL;
	if (res.status == NFS4_OK) {
		/*
		 * Last getattr
		 */
		resop = &res.array[numops - 1];
		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
	}
	/*
	 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
	 * rather than filling it.  See the function itself for details.
	 */
	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
	if (garp != NULL) {
		if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
			nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
			vs_ace4_destroy(&garp->n4g_vsa);
		} else {
			if (vsap != NULL) {
				/*
				 * The ACL was supposed to be set and to be
				 * returned in the last getattr of this
				 * compound, but for some reason the getattr
				 * result doesn't contain the ACL.  In this
				 * case, purge the ACL cache.
				 */
				if (rp->r_secattr != NULL) {
					mutex_enter(&rp->r_statelock);
					vsp = rp->r_secattr;
					rp->r_secattr = NULL;
					mutex_exit(&rp->r_statelock);
					if (vsp != NULL)
						nfs4_acl_free_cache(vsp);
				}
			}
		}
	}

	if (res.status == NFS4_OK && (mask & AT_SIZE)) {
		/*
		 * Set the size, rather than relying on getting it updated
		 * via a GETATTR.  With delegations the client tries to
		 * suppress GETATTR calls.
		 */
		mutex_enter(&rp->r_statelock);
		rp->r_size = vap->va_size;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Can free up request args and res
	 */
	nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
	    opsetattr.obj_attributes);
	if (verify_argop != -1) {
		nfs4args_verify_free(&argop[verify_argop]);
		verify_argop = -1;
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	/*
	 * Some servers will change the mode to clear the setuid
	 * and setgid bits when changing the uid or gid.  The
	 * client needs to compensate appropriately.
	 */
	if (mask & (AT_UID | AT_GID)) {
		int terror, do_setattr;

		do_setattr = 0;
		va.va_mask = AT_MODE;
		terror = nfs4getattr(vp, &va, cr);
		/*
		 * Act only when the mode now reported differs from the one
		 * requested (AT_MODE set) or from the pre-call mode (not set).
		 */
		if (!terror &&
		    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
		    (!(mask & AT_MODE) && va.va_mode != omode))) {
			va.va_mask = AT_MODE;
			if (mask & AT_MODE) {
				/*
				 * We asked the mode to be changed and what
				 * we just got from the server in getattr is
				 * not what we wanted it to be, so set it now.
				 */
				va.va_mode = vap->va_mode;
				do_setattr = 1;
			} else {
				/*
				 * We did not ask the mode to be changed,
				 * Check to see that the server just cleared
				 * I_SUID and I_GUID from it. If not then
				 * set mode to omode with UID/GID cleared.
				 */
				if (nfs4_compare_modes(va.va_mode, omode)) {
					omode &= ~(S_ISUID|S_ISGID);
					va.va_mode = omode;
					do_setattr = 1;
				}
			}

			/*
			 * Recursive call with only AT_MODE set; its result
			 * is deliberately ignored.
			 */
			if (do_setattr)
				(void) nfs4setattr(vp, &va, 0, cr, NULL);
		}
	}

	return (e.error);
}
   4240 
/*
 * Access check entry point for NFSv4 vnodes.
 *
 * The VREAD/VWRITE/VEXEC bits in mode are translated to ACCESS4_* bits.
 * The answer comes from the access cache when nfs4_access_check() can
 * decide; otherwise a { Putfh; Access; [Getattr] } compound is sent and
 * its result is cached.  The Getattr op is only included when no
 * delegation is held.  If access is denied with the caller's cred and
 * crnetadjust() produced an alternate cred, the check is repeated once
 * with that cred.
 *
 * Returns 0 when access is granted, EACCES when denied, EROFS for a
 * write check on a read-only filesystem (non-device vnode), EIO when
 * called from the wrong zone, or another errno from the call path.
 */
/* ARGSUSED */
static int
nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	uint32_t acc, resacc, argacc;
	rnode4_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	nfs4_access_type_t cacc;
	int num_ops;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	bool_t needrecov = FALSE, do_getattr;
	nfs4_recov_state_t recov_state;
	int rpc_error;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	mntinfo4_t *mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	/* Build the set of ACCESS4 bits the caller actually needs. */
	acc = 0;
	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS4_LOOKUP;
		else
			acc |= ACCESS4_EXECUTE;
	}

	/* Validate the caches only if an access cache exists. */
	if (VTOR4(vp)->r_acache != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
	}

	rp = VTOR4(vp);
	/*
	 * The over-the-wire request asks for every bit relevant to the
	 * vnode type, not just the bits in acc, and that full answer is
	 * what gets cached.
	 */
	if (vp->v_type == VDIR)
		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
	else
		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
		    ACCESS4_EXECUTE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree not NULL when exiting means
	 * that we need to release it
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;

tryagain:
	cacc = nfs4_access_check(rp, acc, cred);
	if (cacc == NFS4_ACCESS_ALLOWED) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (0);
	}
	if (cacc == NFS4_ACCESS_DENIED) {
		/*
		 * If the cred can be adjusted, try again
		 * with the new cred.
		 */
		if (ncr != NULL) {
			cred = ncr;
			/* Clearing ncr limits this to a single retry. */
			ncr = NULL;
			goto tryagain;
		}
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (EACCES);
	}

	/* Neither allowed nor denied from the cache: go over the wire. */
recov_retry:
	/*
	 * Don't bother taking r_statev4_lock here. r_deleg_type could
	 * change as soon as lock is released.  Since it is an int,
	 * there is no atomicity issue.
	 */
	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
	num_ops = do_getattr ? 3 : 2;

	args.ctag = TAG_ACCESS;

	args.array_len = num_ops;
	args.array = argop;

	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
	    &recov_state, NULL)) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (e.error);
	}

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* access */
	argop[1].argop = OP_ACCESS;
	argop[1].nfs_argop4_u.opaccess.access = argacc;

	/* getattr */
	if (do_getattr) {
		argop[2].argop = OP_GETATTR;
		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[2].nfs_argop4_u.opgetattr.mi = mi;
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	doqueue = 1;
	/* Timestamp taken before the call; passed to nfs4_attr_cache(). */
	t = gethrtime();
	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
	/*
	 * Remember whether the RPC itself failed: e.error may be
	 * overwritten below, and res is only freed at "out" when
	 * rpc_error is 0.
	 */
	rpc_error = e.error;

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_access: initiating recovery\n"));

		/* A FALSE return means "do not abort": retry the call. */
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_ACCESS, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
	}
	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);

	if (e.error)
		goto out;

	if (res.status) {
		e.error = geterrno4(res.status);
		/*
		 * This might generate over the wire calls through
		 * nfs4_invalidate_pages.  nfs4_end_fop() has already been
		 * called above, which avoids a deadlock here.
		 */
		nfs4_purge_stale_fh(e.error, vp, cr);
		goto out;
	}
	resop = &res.array[1];	/* access res */

	resacc = resop->nfs_resop4_u.opaccess.access;

	if (do_getattr) {
		resop++;	/* getattr res */
		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);
	}

	/*
	 * e.error is necessarily 0 at this point: both error paths
	 * above jump to "out".
	 */
	if (!e.error) {
		nfs4_access_cache(rp, argacc, resacc, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		/* XXX check the supported bits too? */
		if ((acc & resacc) != acc) {
			/*
			 * The following code implements the semantic
			 * that a setuid root program has *at least* the
			 * permissions of the user that is running the
			 * program.  See rfs3call() for more portions
			 * of the implementation of this functionality.
			 */
			/* XXX-LP */
			if (ncr != NULL) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			e.error = EACCES;
		}
	}

out:
	/* res holds decoded results only when the RPC itself succeeded. */
	if (!rpc_error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (e.error);
}
   4456 
   4457 /* ARGSUSED */
   4458 static int
   4459 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
   4460 {
   4461 	COMPOUND4args_clnt args;
   4462 	COMPOUND4res_clnt res;
   4463 	int doqueue;
   4464 	rnode4_t *rp;
   4465 	nfs_argop4 argop[3];
   4466 	nfs_resop4 *resop;
   4467 	READLINK4res *lr_res;
   4468 	nfs4_ga_res_t *garp;
   4469 	uint_t len;
   4470 	char *linkdata;
   4471 	bool_t needrecov = FALSE;
   4472 	nfs4_recov_state_t recov_state;
   4473 	hrtime_t t;
   4474 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   4475 
   4476 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
   4477 		return (EIO);
   4478 	/*
   4479 	 * Can't readlink anything other than a symbolic link.
   4480 	 */
   4481 	if (vp->v_type != VLNK)
   4482 		return (EINVAL);
   4483 
   4484 	rp = VTOR4(vp);
   4485 	if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
   4486 		e.error = nfs4_validate_caches(vp, cr);
   4487 		if (e.error)
   4488 			return (e.error);
   4489 		mutex_enter(&rp->r_statelock);
   4490 		if (rp->r_symlink.contents != NULL) {
   4491 			e.error = uiomove(rp->r_symlink.contents,
   4492 			    rp->r_symlink.len, UIO_READ, uiop);
   4493 			mutex_exit(&rp->r_statelock);
   4494 			return (e.error);
   4495 		}
   4496 		mutex_exit(&rp->r_statelock);
   4497 	}
   4498 	recov_state.rs_flags = 0;
   4499 	recov_state.rs_num_retry_despite_err = 0;
   4500 
   4501 recov_retry:
   4502 	args.array_len = 3;
   4503 	args.array = argop;
   4504 	args.ctag = TAG_READLINK;
   4505 
   4506 	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
   4507 	if (e.error) {
   4508 		return (e.error);
   4509 	}
   4510 
   4511 	/* 0. putfh symlink fh */
   4512 	argop[0].argop = OP_CPUTFH;
   4513 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
   4514 
   4515 	/* 1. readlink */
   4516 	argop[1].argop = OP_READLINK;
   4517 
   4518 	/* 2. getattr */
   4519 	argop[2].argop = OP_GETATTR;
   4520 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   4521 	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
   4522 
   4523 	doqueue = 1;
   4524 
   4525 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
   4526 	    "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
   4527 	    rnode4info(VTOR4(vp))));
   4528 
   4529 	t = gethrtime();
   4530 
   4531 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
   4532 
   4533 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
   4534 	if (needrecov) {
   4535 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   4536 		    "nfs4_readlink: initiating recovery\n"));
   4537 
   4538 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
   4539 		    NULL, OP_READLINK, NULL) == FALSE) {
   4540 			if (!e.error)
   4541 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   4542 				    (caddr_t)&res);
   4543 
   4544 			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
   4545 			    needrecov);
   4546 			goto recov_retry;
   4547 		}
   4548 	}
   4549 
   4550 	nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
   4551 
   4552 	if (e.error)
   4553 		return (e.error);
   4554 
   4555 	/*
   4556 	 * There is an path in the code below which calls
   4557 	 * nfs4_purge_stale_fh(), which may generate otw calls through
   4558 	 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
   4559 	 * here to avoid nfs4_start_op() deadlock.
   4560 	 */
   4561 
   4562 	if (res.status && (res.array_len < args.array_len)) {
   4563 		/*
   4564 		 * either Putfh or Link failed
   4565 		 */
   4566 		e.error = geterrno4(res.status);
   4567 		nfs4_purge_stale_fh(e.error, vp, cr);
   4568 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   4569 		return (e.error);
   4570 	}
   4571 
   4572 	resop = &res.array[1];	/* readlink res */
   4573 	lr_res = &resop->nfs_resop4_u.opreadlink;
   4574 
   4575 	/*
   4576 	 * treat symlink names as data
   4577 	 */
   4578 	linkdata = utf8_to_str(&lr_res->link, &len, NULL);
   4579 	if (linkdata != NULL) {
   4580 		int uio_len = len - 1;
   4581 		/* len includes null byte, which we won't uiomove */
   4582 		e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
   4583 		if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
   4584 			mutex_enter(&rp->r_statelock);
   4585 			if (rp->r_symlink.contents == NULL) {
   4586 				rp->r_symlink.contents = linkdata;
   4587 				rp->r_symlink.len = uio_len;
   4588 				rp->r_symlink.size = len;
   4589 				mutex_exit(&rp->r_statelock);
   4590 			} else {
   4591 				mutex_exit(&rp->r_statelock);
   4592 				kmem_free(linkdata, len);
   4593 			}
   4594 		} else {
   4595 			kmem_free(linkdata, len);
   4596 		}
   4597 	}
   4598 	if (res.status == NFS4_OK) {
   4599 		resop++;	/* getattr res */
   4600 		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
   4601 	}
   4602 	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
   4603 
   4604 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   4605 
   4606 	/*
   4607 	 * The over the wire error for attempting to readlink something
   4608 	 * other than a symbolic link is ENXIO.  However, we need to
   4609 	 * return EINVAL instead of ENXIO, so we map it here.
   4610 	 */
   4611 	return (e.error == ENXIO ? EINVAL : e.error);
   4612 }
   4613 
   4614 /*
   4615  * Flush local dirty pages to stable storage on the server.
   4616  *
   4617  * If FNODSYNC is specified, then there is nothing to do because
   4618  * metadata changes are not cached on the client before being
   4619  * sent to the server.
   4620  */
   4621 /* ARGSUSED */
   4622 static int
   4623 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
   4624 {
   4625 	int error;
   4626 
   4627 	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
   4628 		return (0);
   4629 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
   4630 		return (EIO);
   4631 	error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
   4632 	if (!error)
   4633 		error = VTOR4(vp)->r_error;
   4634 	return (error);
   4635 }
   4636 
   4637 /*
   4638  * Weirdness: if the file was removed or the target of a rename
   4639  * operation while it was open, it got renamed instead.  Here we
   4640  * remove the renamed file.
   4641  */
   4642 /* ARGSUSED */
   4643 void
   4644 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
   4645 {
   4646 	rnode4_t *rp;
   4647 
   4648 	ASSERT(vp != DNLC_NO_VNODE);
   4649 
   4650 	rp = VTOR4(vp);
   4651 
   4652 	if (IS_SHADOW(vp, rp)) {
   4653 		sv_inactive(vp);
   4654 		return;
   4655 	}
   4656 
   4657 	/*
   4658 	 * If this is coming from the wrong zone, we let someone in the right
   4659 	 * zone take care of it asynchronously.  We can get here due to
   4660 	 * VN_RELE() being called from pageout() or fsflush().  This call may
   4661 	 * potentially turn into an expensive no-op if, for instance, v_count
   4662 	 * gets incremented in the meantime, but it's still correct.
   4663 	 */
   4664 	if (nfs_zone() != VTOMI4(vp)->mi_zone) {
   4665 		nfs4_async_inactive(vp, cr);
   4666 		return;
   4667 	}
   4668 
   4669 	/*
   4670 	 * Some of the cleanup steps might require over-the-wire
   4671 	 * operations.  Since VOP_INACTIVE can get called as a result of
   4672 	 * other over-the-wire operations (e.g., an attribute cache update
   4673 	 * can lead to a DNLC purge), doing those steps now would lead to a
   4674 	 * nested call to the recovery framework, which can deadlock.  So
   4675 	 * do any over-the-wire cleanups asynchronously, in a separate
   4676 	 * thread.
   4677 	 */
   4678 
   4679 	mutex_enter(&rp->r_os_lock);
   4680 	mutex_enter(&rp->r_statelock);
   4681 	mutex_enter(&rp->r_statev4_lock);
   4682 
   4683 	if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
   4684 		mutex_exit(&rp->r_statev4_lock);
   4685 		mutex_exit(&rp->r_statelock);
   4686 		mutex_exit(&rp->r_os_lock);
   4687 		nfs4_async_inactive(vp, cr);
   4688 		return;
   4689 	}
   4690 
   4691 	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
   4692 	    rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
   4693 		mutex_exit(&rp->r_statev4_lock);
   4694 		mutex_exit(&rp->r_statelock);
   4695 		mutex_exit(&rp->r_os_lock);
   4696 		nfs4_async_inactive(vp, cr);
   4697 		return;
   4698 	}
   4699 
   4700 	if (rp->r_unldvp != NULL) {
   4701 		mutex_exit(&rp->r_statev4_lock);
   4702 		mutex_exit(&rp->r_statelock);
   4703 		mutex_exit(&rp->r_os_lock);
   4704 		nfs4_async_inactive(vp, cr);
   4705 		return;
   4706 	}
   4707 	mutex_exit(&rp->r_statev4_lock);
   4708 	mutex_exit(&rp->r_statelock);
   4709 	mutex_exit(&rp->r_os_lock);
   4710 
   4711 	rp4_addfree(rp, cr);
   4712 }
   4713 
   4714 /*
   4715  * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
   4716  * various bits of state.  The caller must not refer to vp after this call.
   4717  */
   4718 
   4719 void
   4720 nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
   4721 {
   4722 	rnode4_t *rp = VTOR4(vp);
   4723 	nfs4_recov_state_t recov_state;
   4724 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   4725 	vnode_t *unldvp;
   4726 	char *unlname;
   4727 	cred_t *unlcred;
   4728 	COMPOUND4args_clnt args;
   4729 	COMPOUND4res_clnt res, *resp;
   4730 	nfs_argop4 argop[2];
   4731 	int doqueue;
   4732 #ifdef DEBUG
   4733 	char *name;
   4734 #endif
   4735 
   4736 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
   4737 	ASSERT(!IS_SHADOW(vp, rp));
   4738 
   4739 #ifdef DEBUG
   4740 	name = fn_name(VTOSV(vp)->sv_name);
   4741 	NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
   4742 	    "release vnode %s", name));
   4743 	kmem_free(name, MAXNAMELEN);
   4744 #endif
   4745 
   4746 	if (vp->v_type == VREG) {
   4747 		bool_t recov_failed = FALSE;
   4748 
   4749 		e.error = nfs4close_all(vp, cr);
   4750 		if (e.error) {
   4751 			/* Check to see if recovery failed */
   4752 			mutex_enter(&(VTOMI4(vp)->mi_lock));
   4753 			if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
   4754 				recov_failed = TRUE;
   4755 			mutex_exit(&(VTOMI4(vp)->mi_lock));
   4756 			if (!recov_failed) {
   4757 				mutex_enter(&rp->r_statelock);
   4758 				if (rp->r_flags & R4RECOVERR)
   4759 					recov_failed = TRUE;
   4760 				mutex_exit(&rp->r_statelock);
   4761 			}
   4762 			if (recov_failed) {
   4763 				NFS4_DEBUG(nfs4_client_recov_debug,
   4764 				    (CE_NOTE, "nfs4_inactive_otw: "
   4765 				    "close failed (recovery failure)"));
   4766 			}
   4767 		}
   4768 	}
   4769 
   4770 redo:
   4771 	if (rp->r_unldvp == NULL) {
   4772 		rp4_addfree(rp, cr);
   4773 		return;
   4774 	}
   4775 
   4776 	/*
   4777 	 * Save the vnode pointer for the directory where the
   4778 	 * unlinked-open file got renamed, then set it to NULL
   4779 	 * to prevent another thread from getting here before
   4780 	 * we're done with the remove.  While we have the
   4781 	 * statelock, make local copies of the pertinent rnode
   4782 	 * fields.  If we weren't to do this in an atomic way, the
   4783 	 * the unl* fields could become inconsistent with respect
   4784 	 * to each other due to a race condition between this
   4785 	 * code and nfs_remove().  See bug report 1034328.
   4786 	 */
   4787 	mutex_enter(&rp->r_statelock);
   4788 	if (rp->r_unldvp == NULL) {
   4789 		mutex_exit(&rp->r_statelock);
   4790 		rp4_addfree(rp, cr);
   4791 		return;
   4792 	}
   4793 
   4794 	unldvp = rp->r_unldvp;
   4795 	rp->r_unldvp = NULL;
   4796 	unlname = rp->r_unlname;
   4797 	rp->r_unlname = NULL;
   4798 	unlcred = rp->r_unlcred;
   4799 	rp->r_unlcred = NULL;
   4800 	mutex_exit(&rp->r_statelock);
   4801 
   4802 	/*
   4803 	 * If there are any dirty pages left, then flush
   4804 	 * them.  This is unfortunate because they just
   4805 	 * may get thrown away during the remove operation,
   4806 	 * but we have to do this for correctness.
   4807 	 */
   4808 	if (nfs4_has_pages(vp) &&
   4809 	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
   4810 		ASSERT(vp->v_type != VCHR);
   4811 		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
   4812 		if (e.error) {
   4813 			mutex_enter(&rp->r_statelock);
   4814 			if (!rp->r_error)
   4815 				rp->r_error = e.error;
   4816 			mutex_exit(&rp->r_statelock);
   4817 		}
   4818 	}
   4819 
   4820 	recov_state.rs_flags = 0;
   4821 	recov_state.rs_num_retry_despite_err = 0;
   4822 recov_retry_remove:
   4823 	/*
   4824 	 * Do the remove operation on the renamed file
   4825 	 */
   4826 	args.ctag = TAG_INACTIVE;
   4827 
   4828 	/*
   4829 	 * Remove ops: putfh dir; remove
   4830 	 */
   4831 	args.array_len = 2;
   4832 	args.array = argop;
   4833 
   4834 	e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
   4835 	if (e.error) {
   4836 		kmem_free(unlname, MAXNAMELEN);
   4837 		crfree(unlcred);
   4838 		VN_RELE(unldvp);
   4839 		/*
   4840 		 * Try again; this time around r_unldvp will be NULL, so we'll
   4841 		 * just call rp4_addfree() and return.
   4842 		 */
   4843 		goto redo;
   4844 	}
   4845 
   4846 	/* putfh directory */
   4847 	argop[0].argop = OP_CPUTFH;
   4848 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
   4849 
   4850 	/* remove */
   4851 	argop[1].argop = OP_CREMOVE;
   4852 	argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
   4853 
   4854 	doqueue = 1;
   4855 	resp = &res;
   4856 
   4857 #if 0 /* notyet */
   4858 	/*
   4859 	 * Can't do this yet.  We may be being called from
   4860 	 * dnlc_purge_XXX while that routine is holding a
   4861 	 * mutex lock to the nc_rele list.  The calls to
   4862 	 * nfs3_cache_wcc_data may result in calls to
   4863 	 * dnlc_purge_XXX.  This will result in a deadlock.
   4864 	 */
   4865 	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
   4866 	if (e.error) {
   4867 		PURGE_ATTRCACHE4(unldvp);
   4868 		resp = NULL;
   4869 	} else if (res.status) {
   4870 		e.error = geterrno4(res.status);
   4871 		PURGE_ATTRCACHE4(unldvp);
   4872 		/*
   4873 		 * This code is inactive right now
   4874 		 * but if made active there should
   4875 		 * be a nfs4_end_op() call before
   4876 		 * nfs4_purge_stale_fh to avoid start_op()
   4877 		 * deadlock. See BugId: 4948726
   4878 		 */
   4879 		nfs4_purge_stale_fh(error, unldvp, cr);
   4880 	} else {
   4881 		nfs_resop4 *resop;
   4882 		REMOVE4res *rm_res;
   4883 
   4884 		resop = &res.array[1];
   4885 		rm_res = &resop->nfs_resop4_u.opremove;
   4886 		/*
   4887 		 * Update directory cache attribute,
   4888 		 * readdir and dnlc caches.
   4889 		 */
   4890 		nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
   4891 	}
   4892 #else
   4893 	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
   4894 
   4895 	PURGE_ATTRCACHE4(unldvp);
   4896 #endif
   4897 
   4898 	if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
   4899 		if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
   4900 		    NULL, NULL, OP_REMOVE, NULL) == FALSE) {
   4901 			if (!e.error)
   4902 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   4903 				    (caddr_t)&res);
   4904 			nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
   4905 			    &recov_state, TRUE);
   4906 			goto recov_retry_remove;
   4907 		}
   4908 	}
   4909 	nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);
   4910 
   4911 	/*
   4912 	 * Release stuff held for the remove
   4913 	 */
   4914 	VN_RELE(unldvp);
   4915 	if (!e.error && resp)
   4916 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
   4917 
   4918 	kmem_free(unlname, MAXNAMELEN);
   4919 	crfree(unlcred);
   4920 	goto redo;
   4921 }
   4922 
   4923 /*
   4924  * Remote file system operations having to do with directory manipulation.
   4925  */
   4926 /* ARGSUSED3 */
   4927 int
   4928 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
   4929     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
   4930     int *direntflags, pathname_t *realpnp)
   4931 {
   4932 	int error;
   4933 	vnode_t *vp, *avp = NULL;
   4934 	rnode4_t *drp;
   4935 
   4936 	*vpp = NULL;
   4937 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
   4938 		return (EPERM);
   4939 	/*
   4940 	 * if LOOKUP_XATTR, must replace dvp (object) with
   4941 	 * object's attrdir before continuing with lookup
   4942 	 */
   4943 	if (flags & LOOKUP_XATTR) {
   4944 		error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
   4945 		if (error)
   4946 			return (error);
   4947 
   4948 		dvp = avp;
   4949 
   4950 		/*
   4951 		 * If lookup is for "", just return dvp now.  The attrdir
   4952 		 * has already been activated (from nfs4lookup_xattr), and
   4953 		 * the caller will RELE the original dvp -- not
   4954 		 * the attrdir.  So, set vpp and return.
   4955 		 * Currently, when the LOOKUP_XATTR flag is
   4956 		 * passed to VOP_LOOKUP, the name is always empty, and
   4957 		 * shortcircuiting here avoids 3 unneeded lock/unlock
   4958 		 * pairs.
   4959 		 *
   4960 		 * If a non-empty name was provided, then it is the
   4961 		 * attribute name, and it will be looked up below.
   4962 		 */
   4963 		if (*nm == '\0') {
   4964 			*vpp = dvp;
   4965 			return (0);
   4966 		}
   4967 
   4968 		/*
   4969 		 * The vfs layer never sends a name when asking for the
   4970 		 * attrdir, so we should never get here (unless of course
   4971 		 * name is passed at some time in future -- at which time
   4972 		 * we'll blow up here).
   4973 		 */
   4974 		ASSERT(0);
   4975 	}
   4976 
   4977 	drp = VTOR4(dvp);
   4978 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
   4979 		return (EINTR);
   4980 
   4981 	error = nfs4lookup(dvp, nm, vpp, cr, 0);
   4982 	nfs_rw_exit(&drp->r_rwlock);
   4983 
   4984 	/*
   4985 	 * If vnode is a device, create special vnode.
   4986 	 */
   4987 	if (!error && ISVDEV((*vpp)->v_type)) {
   4988 		vp = *vpp;
   4989 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
   4990 		VN_RELE(vp);
   4991 	}
   4992 
   4993 	return (error);
   4994 }
   4995 
   4996 /* ARGSUSED */
   4997 static int
   4998 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
   4999 {
   5000 	int error;
   5001 	rnode4_t *drp;
   5002 	int cflag = ((flags & CREATE_XATTR_DIR) != 0);
   5003 	mntinfo4_t *mi;
   5004 
   5005 	mi = VTOMI4(dvp);
   5006 	if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
   5007 	    !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
   5008 		return (EINVAL);
   5009 
   5010 	drp = VTOR4(dvp);
   5011 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
   5012 		return (EINTR);
   5013 
   5014 	mutex_enter(&drp->r_statelock);
   5015 	/*
   5016 	 * If the server doesn't support xattrs just return EINVAL
   5017 	 */
   5018 	if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
   5019 		mutex_exit(&drp->r_statelock);
   5020 		nfs_rw_exit(&drp->r_rwlock);
   5021 		return (EINVAL);
   5022 	}
   5023 
   5024 	/*
   5025 	 * If there is a cached xattr directory entry,
   5026 	 * use it as long as the attributes are valid. If the
   5027 	 * attributes are not valid, take the simple approach and
   5028 	 * free the cached value and re-fetch a new value.
   5029 	 *
   5030 	 * We don't negative entry cache for now, if we did we
   5031 	 * would need to check if the file has changed on every
   5032 	 * lookup. But xattrs don't exist very often and failing
   5033 	 * an openattr is not much more expensive than and NVERIFY or GETATTR
   5034 	 * so do an openattr over the wire for now.
   5035 	 */
   5036 	if (drp->r_xattr_dir != NULL) {
   5037 		if (ATTRCACHE4_VALID(dvp)) {
   5038 			VN_HOLD(drp->r_xattr_dir);
   5039 			*vpp = drp->r_xattr_dir;
   5040 			mutex_exit(&drp->r_statelock);
   5041 			nfs_rw_exit(&drp->r_rwlock);
   5042 			return (0);
   5043 		}
   5044 		VN_RELE(drp->r_xattr_dir);
   5045 		drp->r_xattr_dir = NULL;
   5046 	}
   5047 	mutex_exit(&drp->r_statelock);
   5048 
   5049 	error = nfs4openattr(dvp, vpp, cflag, cr);
   5050 
   5051 	nfs_rw_exit(&drp->r_rwlock);
   5052 
   5053 	return (error);
   5054 }
   5055 
   5056 static int
   5057 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
   5058 {
   5059 	int error;
   5060 	rnode4_t *drp;
   5061 
   5062 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
   5063 
   5064 	/*
   5065 	 * If lookup is for "", just return dvp.  Don't need
   5066 	 * to send it over the wire, look it up in the dnlc,
   5067 	 * or perform any access checks.
   5068 	 */
   5069 	if (*nm == '\0') {
   5070 		VN_HOLD(dvp);
   5071 		*vpp = dvp;
   5072 		return (0);
   5073 	}
   5074 
   5075 	/*
   5076 	 * Can't do lookups in non-directories.
   5077 	 */
   5078 	if (dvp->v_type != VDIR)
   5079 		return (ENOTDIR);
   5080 
   5081 	/*
   5082 	 * If lookup is for ".", just return dvp.  Don't need
   5083 	 * to send it over the wire or look it up in the dnlc,
   5084 	 * just need to check access.
   5085 	 */
   5086 	if (nm[0] == '.' && nm[1] == '\0') {
   5087 		error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
   5088 		if (error)
   5089 			return (error);
   5090 		VN_HOLD(dvp);
   5091 		*vpp = dvp;
   5092 		return (0);
   5093 	}
   5094 
   5095 	drp = VTOR4(dvp);
   5096 	if (!(drp->r_flags & R4LOOKUP)) {
   5097 		mutex_enter(&drp->r_statelock);
   5098 		drp->r_flags |= R4LOOKUP;
   5099 		mutex_exit(&drp->r_statelock);
   5100 	}
   5101 
   5102 	*vpp = NULL;
   5103 	/*
   5104 	 * Lookup this name in the DNLC.  If there is no entry
   5105 	 * lookup over the wire.
   5106 	 */
   5107 	if (!skipdnlc)
   5108 		*vpp = dnlc_lookup(dvp, nm);
   5109 	if (*vpp == NULL) {
   5110 		/*
   5111 		 * We need to go over the wire to lookup the name.
   5112 		 */
   5113 		return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
   5114 	}
   5115 
   5116 	/*
   5117 	 * We hit on the dnlc
   5118 	 */
   5119 	if (*vpp != DNLC_NO_VNODE ||
   5120 	    (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
   5121 		/*
   5122 		 * But our attrs may not be valid.
   5123 		 */
   5124 		if (ATTRCACHE4_VALID(dvp)) {
   5125 			error = nfs4_waitfor_purge_complete(dvp);
   5126 			if (error) {
   5127 				VN_RELE(*vpp);
   5128 				*vpp = NULL;
   5129 				return (error);
   5130 			}
   5131 
   5132 			/*
   5133 			 * If after the purge completes, check to make sure
   5134 			 * our attrs are still valid.
   5135 			 */
   5136 			if (ATTRCACHE4_VALID(dvp)) {
   5137 				/*
   5138 				 * If we waited for a purge we may have
   5139 				 * lost our vnode so look it up again.
   5140 				 */
   5141 				VN_RELE(*vpp);
   5142 				*vpp = dnlc_lookup(dvp, nm);
   5143 				if (*vpp == NULL)
   5144 					return (nfs4lookupnew_otw(dvp,
   5145 					    nm, vpp, cr));
   5146 
   5147 				/*
   5148 				 * The access cache should almost always hit
   5149 				 */
   5150 				error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
   5151 
   5152 				if (error) {
   5153 					VN_RELE(*vpp);
   5154 					*vpp = NULL;
   5155 					return (error);
   5156 				}
   5157 				if (*vpp == DNLC_NO_VNODE) {
   5158 					VN_RELE(*vpp);
   5159 					*vpp = NULL;
   5160 					return (ENOENT);
   5161 				}
   5162 				return (0);
   5163 			}
   5164 		}
   5165 	}
   5166 
   5167 	ASSERT(*vpp != NULL);
   5168 
   5169 	/*
   5170 	 * We may have gotten here we have one of the following cases:
   5171 	 *	1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
   5172 	 *		need to validate them.
   5173 	 *	2) vpp == DNLC_NO_VNODE, a negative entry that we always
   5174 	 *		must validate.
   5175 	 *
   5176 	 * Go to the server and check if the directory has changed, if
   5177 	 * it hasn't we are done and can use the dnlc entry.
   5178 	 */
   5179 	return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
   5180 }
   5181 
   5182 /*
   5183  * Go to the server and check if the directory has changed, if
   5184  * it hasn't we are done and can use the dnlc entry.  If it
   5185  * has changed we get a new copy of its attributes and check
   5186  * the access for VEXEC, then relookup the filename and
   5187  * get its filehandle and attributes.
   5188  *
   5189  * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
   5190  *	if the NVERIFY failed we must
   5191  *		purge the caches
   5192  *		cache new attributes (will set r_time_attr_inval)
   5193  *		cache new access
   5194  *		recheck VEXEC access
   5195  *		add name to dnlc, possibly negative
   5196  *		if LOOKUP succeeded
   5197  *			cache new attributes
   5198  *	else
   5199  *		set a new r_time_attr_inval for dvp
   5200  *		check to make sure we have access
   5201  *
   5202  * The vpp returned is the vnode passed in if the directory is valid,
   5203  * a new vnode if successful lookup, or NULL on error.
   5204  */
   5205 static int
   5206 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
   5207 {
   5208 	COMPOUND4args_clnt args;
   5209 	COMPOUND4res_clnt res;
   5210 	fattr4 *ver_fattr;
   5211 	fattr4_change dchange;
   5212 	int32_t *ptr;
   5213 	int argoplist_size  = 7 * sizeof (nfs_argop4);
   5214 	nfs_argop4 *argop;
   5215 	int doqueue;
   5216 	mntinfo4_t *mi;
   5217 	nfs4_recov_state_t recov_state;
   5218 	hrtime_t t;
   5219 	int isdotdot;
   5220 	vnode_t *nvp;
   5221 	nfs_fh4 *fhp;
   5222 	nfs4_sharedfh_t *sfhp;
   5223 	nfs4_access_type_t cacc;
   5224 	rnode4_t *nrp;
   5225 	rnode4_t *drp = VTOR4(dvp);
   5226 	nfs4_ga_res_t *garp = NULL;
   5227 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   5228 
   5229 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
   5230 	ASSERT(nm != NULL);
   5231 	ASSERT(nm[0] != '\0');
   5232 	ASSERT(dvp->v_type == VDIR);
   5233 	ASSERT(nm[0] != '.' || nm[1] != '\0');
   5234 	ASSERT(*vpp != NULL);
   5235 
   5236 	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
   5237 		isdotdot = 1;
   5238 		args.ctag = TAG_LOOKUP_VPARENT;
   5239 	} else {
   5240 		/*
   5241 		 * If dvp were a stub, it should have triggered and caused
   5242 		 * a mount for us to get this far.
   5243 		 */
   5244 		ASSERT(!RP_ISSTUB(VTOR4(dvp)));
   5245 
   5246 		isdotdot = 0;
   5247 		args.ctag = TAG_LOOKUP_VALID;
   5248 	}
   5249 
   5250 	mi = VTOMI4(dvp);
   5251 	recov_state.rs_flags = 0;
   5252 	recov_state.rs_num_retry_despite_err = 0;
   5253 
   5254 	nvp = NULL;
   5255 
   5256 	/* Save the original mount point security information */
   5257 	(void) save_mnt_secinfo(mi->mi_curr_serv);
   5258 
   5259 recov_retry:
   5260 	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
   5261 	    &recov_state, NULL);
   5262 	if (e.error) {
   5263 		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
   5264 		VN_RELE(*vpp);
   5265 		*vpp = NULL;
   5266 		return (e.error);
   5267 	}
   5268 
   5269 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
   5270 
   5271 	/* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
   5272 	args.array_len = 7;
   5273 	args.array = argop;
   5274 
   5275 	/* 0. putfh file */
   5276 	argop[0].argop = OP_CPUTFH;
   5277 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
   5278 
   5279 	/* 1. nverify the change info */
   5280 	argop[1].argop = OP_NVERIFY;
   5281 	ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
   5282 	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
   5283 	ver_fattr->attrlist4 = (char *)&dchange;
   5284 	ptr = (int32_t *)&dchange;
   5285 	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
   5286 	ver_fattr->attrlist4_len = sizeof (fattr4_change);
   5287 
   5288 	/* 2. getattr directory */
   5289 	argop[2].argop = OP_GETATTR;
   5290 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   5291 	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
   5292 
   5293 	/* 3. access directory */
   5294 	argop[3].argop = OP_ACCESS;
   5295 	argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
   5296 	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
   5297 
   5298 	/* 4. lookup name */
   5299 	if (isdotdot) {
   5300 		argop[4].argop = OP_LOOKUPP;
   5301 	} else {
   5302 		argop[4].argop = OP_CLOOKUP;
   5303 		argop[4].nfs_argop4_u.opclookup.cname = nm;
   5304 	}
   5305 
   5306 	/* 5. resulting file handle */
   5307 	argop[5].argop = OP_GETFH;
   5308 
   5309 	/* 6. resulting file attributes */
   5310 	argop[6].argop = OP_GETATTR;
   5311 	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   5312 	argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
   5313 
   5314 	doqueue = 1;
   5315 	t = gethrtime();
   5316 
   5317 	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
   5318 
   5319 	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
   5320 		/*
   5321 		 * For WRONGSEC of a non-dotdot case, send secinfo directly
   5322 		 * from this thread, do not go thru the recovery thread since
   5323 		 * we need the nm information.
   5324 		 *
   5325 		 * Not doing dotdot case because there is no specification
   5326 		 * for (PUTFH, SECINFO "..") yet.
   5327 		 */
   5328 		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
   5329 			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
   5330 				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
   5331 				    &recov_state, FALSE);
   5332 			else
   5333 				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
   5334 				    &recov_state, TRUE);
   5335 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   5336 			kmem_free(argop, argoplist_size);
   5337 			if (!e.error)
   5338 				goto recov_retry;
   5339 			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
   5340 			VN_RELE(*vpp);
   5341 			*vpp = NULL;
   5342 			return (e.error);
   5343 		}
   5344 
   5345 		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
   5346 		    OP_LOOKUP, NULL) == FALSE) {
   5347 			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
   5348 			    &recov_state, TRUE);
   5349 
   5350 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   5351 			kmem_free(argop, argoplist_size);
   5352 			goto recov_retry;
   5353 		}
   5354 	}
   5355 
   5356 	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
   5357 
   5358 	if (e.error || res.array_len == 0) {
   5359 		/*
   5360 		 * If e.error isn't set, then reply has no ops (or we couldn't
   5361 		 * be here).  The only legal way to reply without an op array
   5362 		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
   5363 		 * be in the reply for all other status values.
   5364 		 *
   5365 		 * For valid replies without an ops array, return ENOTSUP
   5366 		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
   5367 		 * return EIO -- don't trust status.
   5368 		 */
   5369 		if (e.error == 0)
   5370 			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
   5371 			    ENOTSUP : EIO;
   5372 		VN_RELE(*vpp);
   5373 		*vpp = NULL;
   5374 		kmem_free(argop, argoplist_size);
   5375 		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
   5376 		return (e.error);
   5377 	}
   5378 
   5379 	if (res.status != NFS4ERR_SAME) {
   5380 		e.error = geterrno4(res.status);
   5381 
   5382 		/*
   5383 		 * The NVERIFY "failed" so the directory has changed
   5384 		 * First make sure PUTFH succeeded and NVERIFY "failed"
   5385 		 * cleanly.
   5386 		 */
   5387 		if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
   5388 		    (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
   5389 			nfs4_purge_stale_fh(e.error, dvp, cr);
   5390 			VN_RELE(*vpp);
   5391 			*vpp = NULL;
   5392 			goto exit;
   5393 		}
   5394 
   5395 		/*
   5396 		 * We know the NVERIFY "failed" so we must:
   5397 		 *	purge the caches (access and indirectly dnlc if needed)
   5398 		 */
   5399 		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
   5400 
   5401 		if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
   5402 			nfs4_purge_stale_fh(e.error, dvp, cr);
   5403 			VN_RELE(*vpp);
   5404 			*vpp = NULL;
   5405 			goto exit;
   5406 		}
   5407 
   5408 		/*
   5409 		 * Install new cached attributes for the directory
   5410 		 */
   5411 		nfs4_attr_cache(dvp,
   5412 		    &res.array[2].nfs_resop4_u.opgetattr.ga_res,
   5413 		    t, cr, FALSE, NULL);
   5414 
   5415 		if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
   5416 			nfs4_purge_stale_fh(e.error, dvp, cr);
   5417 			VN_RELE(*vpp);
   5418 			*vpp = NULL;
   5419 			e.error = geterrno4(res.status);
   5420 			goto exit;
   5421 		}
   5422 
   5423 		/*
   5424 		 * Now we know the directory is valid,
   5425 		 * cache new directory access
   5426 		 */
   5427 		nfs4_access_cache(drp,
   5428 		    args.array[3].nfs_argop4_u.opaccess.access,
   5429 		    res.array[3].nfs_resop4_u.opaccess.access, cr);
   5430 
   5431 		/*
   5432 		 * recheck VEXEC access
   5433 		 */
   5434 		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
   5435 		if (cacc != NFS4_ACCESS_ALLOWED) {
   5436 			/*
   5437 			 * Directory permissions might have been revoked
   5438 			 */
   5439 			if (cacc == NFS4_ACCESS_DENIED) {
   5440 				e.error = EACCES;
   5441 				VN_RELE(*vpp);
   5442 				*vpp = NULL;
   5443 				goto exit;
   5444 			}
   5445 
   5446 			/*
   5447 			 * Somehow we must not have asked for enough
   5448 			 * so try a singleton ACCESS, should never happen.
   5449 			 */
   5450 			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
   5451 			if (e.error) {
   5452 				VN_RELE(*vpp);
   5453 				*vpp = NULL;
   5454 				goto exit;
   5455 			}
   5456 		}
   5457 
   5458 		e.error = geterrno4(res.status);
   5459 		if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
   5460 			/*
   5461 			 * The lookup failed, probably no entry
   5462 			 */
   5463 			if (e.error == ENOENT && nfs4_lookup_neg_cache) {
   5464 				dnlc_update(dvp, nm, DNLC_NO_VNODE);
   5465 			} else {
   5466 				/*
   5467 				 * Might be some other error, so remove
   5468 				 * the dnlc entry to make sure we start all
   5469 				 * over again, next time.
   5470 				 */
   5471 				dnlc_remove(dvp, nm);
   5472 			}
   5473 			VN_RELE(*vpp);
   5474 			*vpp = NULL;
   5475 			goto exit;
   5476 		}
   5477 
   5478 		if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
   5479 			/*
   5480 			 * The file exists but we can't get its fh for
   5481 			 * some unknown reason.  Remove it from the dnlc
   5482 			 * and error out to be safe.
   5483 			 */
   5484 			dnlc_remove(dvp, nm);
   5485 			VN_RELE(*vpp);
   5486 			*vpp = NULL;
   5487 			goto exit;
   5488 		}
   5489 		fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
   5490 		if (fhp->nfs_fh4_len == 0) {
   5491 			/*
   5492 			 * The file exists but the server returned a bogus
   5493 			 * (zero-length) fh for some unknown reason.  Remove it from the dnlc
   5494 			 * and error out to be safe.
   5495 			 */
   5496 			e.error = ENOENT;
   5497 			dnlc_remove(dvp, nm);
   5498 			VN_RELE(*vpp);
   5499 			*vpp = NULL;
   5500 			goto exit;
   5501 		}
   5502 		sfhp = sfh4_get(fhp, mi);
   5503 
   5504 		if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
   5505 			garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
   5506 
   5507 		/*
   5508 		 * Make the new rnode
   5509 		 */
   5510 		if (isdotdot) {
   5511 			e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
   5512 			if (e.error) {
   5513 				sfh4_rele(&sfhp);
   5514 				VN_RELE(*vpp);
   5515 				*vpp = NULL;
   5516 				goto exit;
   5517 			}
   5518 			/*
   5519 			 * XXX if nfs4_make_dotdot uses an existing rnode
   5520 			 * XXX it doesn't update the attributes.
   5521 			 * XXX for now just save them again to save an OTW
   5522 			 */
   5523 			nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
   5524 		} else {
   5525 			nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
   5526 			    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
   5527 			/*
   5528 			 * If v_type == VNON, then garp was NULL because
   5529 			 * the last op in the compound failed and makenfs4node
   5530 			 * could not find the vnode for sfhp. It created
   5531 			 * a new vnode, so we have nothing to purge here.
   5532 			 */
   5533 			if (nvp->v_type == VNON) {
   5534 				vattr_t vattr;
   5535 
   5536 				vattr.va_mask = AT_TYPE;
   5537 				/*
   5538 				 * N.B. We've already called nfs4_end_fop above.
   5539 				 */
   5540 				e.error = nfs4getattr(nvp, &vattr, cr);
   5541 				if (e.error) {
   5542 					sfh4_rele(&sfhp);
   5543 					VN_RELE(*vpp);
   5544 					*vpp = NULL;
   5545 					VN_RELE(nvp);
   5546 					goto exit;
   5547 				}
   5548 				nvp->v_type = vattr.va_type;
   5549 			}
   5550 		}
   5551 		sfh4_rele(&sfhp);
   5552 
   5553 		nrp = VTOR4(nvp);
   5554 		mutex_enter(&nrp->r_statev4_lock);
   5555 		if (!nrp->created_v4) {
   5556 			mutex_exit(&nrp->r_statev4_lock);
   5557 			dnlc_update(dvp, nm, nvp);
   5558 		} else
   5559 			mutex_exit(&nrp->r_statev4_lock);
   5560 
   5561 		VN_RELE(*vpp);
   5562 		*vpp = nvp;
   5563 	} else {
   5564 		hrtime_t now;
   5565 		hrtime_t delta = 0;
   5566 
   5567 		e.error = 0;
   5568 
   5569 		/*
   5570 		 * Because the NVERIFY "succeeded" we know that the
   5571 		 * directory attributes are still valid
   5572 		 * so update r_time_attr_inval
   5573 		 */
   5574 		now = gethrtime();
   5575 		mutex_enter(&drp->r_statelock);
   5576 		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
   5577 			delta = now - drp->r_time_attr_saved;
   5578 			if (delta < mi->mi_acdirmin)
   5579 				delta = mi->mi_acdirmin;
   5580 			else if (delta > mi->mi_acdirmax)
   5581 				delta = mi->mi_acdirmax;
   5582 		}
   5583 		drp->r_time_attr_inval = now + delta;
   5584 		mutex_exit(&drp->r_statelock);
   5585 		dnlc_update(dvp, nm, *vpp);
   5586 
   5587 		/*
   5588 		 * Even though we have a valid directory attr cache
   5589 		 * and dnlc entry, we may not have access.
   5590 		 * This should almost always hit the cache.
   5591 		 */
   5592 		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
   5593 		if (e.error) {
   5594 			VN_RELE(*vpp);
   5595 			*vpp = NULL;
   5596 		}
   5597 
   5598 		if (*vpp == DNLC_NO_VNODE) {
   5599 			VN_RELE(*vpp);
   5600 			*vpp = NULL;
   5601 			e.error = ENOENT;
   5602 		}
   5603 	}
   5604 
   5605 exit:
   5606 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   5607 	kmem_free(argop, argoplist_size);
   5608 	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
   5609 	return (e.error);
   5610 }
   5611 
   5612 /*
   5613  * nfs4lookupnew_otw - go over the wire to look up the name nm in
   5614  * directory dvp and, while we are there, verify the directory has not
   5615  * changed; if it has, get new attributes and check access.
   5616  *
   5617  * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
   5618  *					NVERIFY GETATTR ACCESS
   5619  *
         * The NVERIFY carries the directory's cached change attribute
         * (r_change).  A result of NFS4ERR_SAME is treated below as "directory
         * unchanged" (results 7 and 8 are then not examined); a result of
         * NFS4_OK is treated as "directory changed", which the comments in
         * this function call the NVERIFY having "failed".
         *
   5620  * With the results:
   5621  *	if the NVERIFY "failed" we must purge the caches, add new attributes,
   5622  *		and cache new access.
   5623  *	set a new r_time_attr_inval
   5624  *	add name to dnlc, possibly negative
   5625  *	if LOOKUP succeeded
   5626  *		cache new attributes
         *
         * Arguments:
         *	dvp	directory to search; asserted to be a VDIR
         *	nm	component name; asserted non-empty and not "."
         *		(".." is sent as LOOKUPP instead of LOOKUP)
         *	vpp	asserted to point at NULL on entry; set to the new
         *		vnode only when the whole lookup succeeds
         *	cr	credentials used for the call and the access checks
         *
         * Returns the errno-style value left in e.error (expected to be 0
         * whenever *vpp has been set); on every error path *vpp stays NULL.
   5627  */
   5628 static int
   5629 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
   5630 {
   5631 	COMPOUND4args_clnt args;
   5632 	COMPOUND4res_clnt res;
   5633 	fattr4 *ver_fattr;
   5634 	fattr4_change dchange;
   5635 	int32_t *ptr;
   5636 	nfs4_ga_res_t *garp = NULL;
   5637 	int argoplist_size  = 9 * sizeof (nfs_argop4);
   5638 	nfs_argop4 *argop;
   5639 	int doqueue;
   5640 	mntinfo4_t *mi;
   5641 	nfs4_recov_state_t recov_state;
   5642 	hrtime_t t;
   5643 	int isdotdot;
   5644 	vnode_t *nvp;
   5645 	nfs_fh4 *fhp;
   5646 	nfs4_sharedfh_t *sfhp;
   5647 	nfs4_access_type_t cacc;
   5648 	rnode4_t *nrp;
   5649 	rnode4_t *drp = VTOR4(dvp);
   5650 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   5651 
   5652 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
   5653 	ASSERT(nm != NULL);
   5654 	ASSERT(nm[0] != '\0');
   5655 	ASSERT(dvp->v_type == VDIR);
   5656 	ASSERT(nm[0] != '.' || nm[1] != '\0');
   5657 	ASSERT(*vpp == NULL);
   5658 
        	/* ".." gets its own compound tag and, below, LOOKUPP as op 2 */
   5659 	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
   5660 		isdotdot = 1;
   5661 		args.ctag = TAG_LOOKUP_PARENT;
   5662 	} else {
   5663 		/*
   5664 		 * If dvp were a stub, it should have triggered and caused
   5665 		 * a mount for us to get this far.
   5666 		 */
   5667 		ASSERT(!RP_ISSTUB(VTOR4(dvp)));
   5668 
   5669 		isdotdot = 0;
   5670 		args.ctag = TAG_LOOKUP;
   5671 	}
   5672 
   5673 	mi = VTOMI4(dvp);
   5674 	recov_state.rs_flags = 0;
   5675 	recov_state.rs_num_retry_despite_err = 0;
   5676 
        	/* nvp stays NULL on every error path; it is handed to check_mnt_secinfo */
   5677 	nvp = NULL;
   5678 
   5679 	/* Save the original mount point security information */
   5680 	(void) save_mnt_secinfo(mi->mi_curr_serv);
   5681 
        	/*
        	 * Re-entered after a successful secinfo exchange or after recovery
        	 * was started; both of those paths free argop and res first, so a
        	 * fresh op array is allocated on each pass.
        	 */
   5682 recov_retry:
   5683 	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
   5684 	    &recov_state, NULL);
   5685 	if (e.error) {
   5686 		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
   5687 		return (e.error);
   5688 	}
   5689 
   5690 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
   5691 
   5692 	/* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
   5693 	args.array_len = 9;
   5694 	args.array = argop;
   5695 
   5696 	/* 0. putfh directory */
   5697 	argop[0].argop = OP_CPUTFH;
   5698 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
   5699 
   5700 	/* 1. savefh for the nverify */
   5701 	argop[1].argop = OP_SAVEFH;
   5702 
   5703 	/* 2. lookup name */
   5704 	if (isdotdot) {
   5705 		argop[2].argop = OP_LOOKUPP;
   5706 	} else {
   5707 		argop[2].argop = OP_CLOOKUP;
   5708 		argop[2].nfs_argop4_u.opclookup.cname = nm;
   5709 	}
   5710 
   5711 	/* 3. resulting file handle */
   5712 	argop[3].argop = OP_GETFH;
   5713 
   5714 	/* 4. resulting file attributes */
   5715 	argop[4].argop = OP_GETATTR;
   5716 	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   5717 	argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
   5718 
   5719 	/* 5. restorefh back the directory for the nverify */
   5720 	argop[5].argop = OP_RESTOREFH;
   5721 
   5722 	/*
        	 * 6. nverify the change info: dchange receives the cached r_change
        	 * of the directory via IXDR_PUT_HYPER and is passed as the
        	 * attribute list to compare.
        	 */
   5723 	argop[6].argop = OP_NVERIFY;
   5724 	ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
   5725 	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
   5726 	ver_fattr->attrlist4 = (char *)&dchange;
   5727 	ptr = (int32_t *)&dchange;
   5728 	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
   5729 	ver_fattr->attrlist4_len = sizeof (fattr4_change);
   5730 
   5731 	/* 7. getattr directory */
   5732 	argop[7].argop = OP_GETATTR;
   5733 	argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   5734 	argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
   5735 
   5736 	/* 8. access directory */
   5737 	argop[8].argop = OP_ACCESS;
   5738 	argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
   5739 	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
   5740 
   5741 	doqueue = 1;
        	/* timestamp taken before the call; handed to the attr-cache calls below */
   5742 	t = gethrtime();
   5743 
   5744 	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
   5745 
   5746 	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
   5747 		/*
   5748 		 * For WRONGSEC of a non-dotdot case, send secinfo directly
   5749 		 * from this thread, do not go thru the recovery thread since
   5750 		 * we need the nm information.
   5751 		 *
   5752 		 * Not doing dotdot case because there is no specification
   5753 		 * for (PUTFH, SECINFO "..") yet.
   5754 		 */
   5755 		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
   5756 			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
   5757 				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
   5758 				    &recov_state, FALSE);
   5759 			else
   5760 				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
   5761 				    &recov_state, TRUE);
   5762 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   5763 			kmem_free(argop, argoplist_size);
   5764 			if (!e.error)
   5765 				goto recov_retry;
   5766 			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
   5767 			return (e.error);
   5768 		}
   5769 
        		/*
        		 * A FALSE return means retry the whole compound; any other
        		 * return falls through and the error is reported below.
        		 */
   5770 		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
   5771 		    OP_LOOKUP, NULL) == FALSE) {
   5772 			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
   5773 			    &recov_state, TRUE);
   5774 
   5775 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   5776 			kmem_free(argop, argoplist_size);
   5777 			goto recov_retry;
   5778 		}
   5779 	}
   5780 
   5781 	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
   5782 
   5783 	if (e.error || res.array_len == 0) {
   5784 		/*
   5785 		 * If e.error isn't set, then reply has no ops (or we couldn't
   5786 		 * be here).  The only legal way to reply without an op array
   5787 		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
   5788 		 * be in the reply for all other status values.
   5789 		 *
   5790 		 * For valid replies without an ops array, return ENOTSUP
   5791 		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
   5792 		 * return EIO -- don't trust status.
   5793 		 */
   5794 		if (e.error == 0)
   5795 			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
   5796 			    ENOTSUP : EIO;
   5797 
   5798 		kmem_free(argop, argoplist_size);
   5799 		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
   5800 		return (e.error);
   5801 	}
   5802 
        	/*
        	 * Default error for the early exits below is the translation of the
        	 * overall compound status; individual checks override it as needed.
        	 */
   5803 	e.error = geterrno4(res.status);
   5804 
   5805 	/*
   5806 	 * The PUTFH and SAVEFH may have failed.
   5807 	 */
   5808 	if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
   5809 	    (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
   5810 		nfs4_purge_stale_fh(e.error, dvp, cr);
   5811 		goto exit;
   5812 	}
   5813 
   5814 	/*
   5815 	 * Check if the file exists, if it does delay entering
   5816 	 * into the dnlc until after we update the directory
   5817 	 * attributes so we don't cause it to get purged immediately.
   5818 	 */
   5819 	if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
   5820 		/*
   5821 		 * The lookup failed, probably no entry.  Record a negative
        		 * dnlc entry only for ENOENT and only when negative
        		 * caching (nfs4_lookup_neg_cache) is enabled.
   5822 		 */
   5823 		if (e.error == ENOENT && nfs4_lookup_neg_cache)
   5824 			dnlc_update(dvp, nm, DNLC_NO_VNODE);
   5825 		goto exit;
   5826 	}
   5827 
   5828 	if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
   5829 		/*
   5830 		 * The file exists but we can't get its fh for
   5831 		 * some unknown reason. Error out to be safe.
   5832 		 */
   5833 		goto exit;
   5834 	}
   5835 
   5836 	fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
   5837 	if (fhp->nfs_fh4_len == 0) {
   5838 		/*
   5839 		 * The file exists but the server returned a bogus
   5840 		 * (zero-length) fh for some unknown reason.  Error out to be safe.
   5841 		 */
   5842 		e.error = EIO;
   5843 		goto exit;
   5844 	}
        	/*
        	 * From here on sfhp is released with sfh4_rele() on every path,
        	 * error or success.
        	 */
   5845 	sfhp = sfh4_get(fhp, mi);
   5846 
   5847 	if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
   5848 		sfh4_rele(&sfhp);
   5849 		goto exit;
   5850 	}
   5851 	garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
   5852 
   5853 	/*
   5854 	 * The RESTOREFH may have failed
   5855 	 */
   5856 	if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
   5857 		sfh4_rele(&sfhp);
   5858 		e.error = EIO;
   5859 		goto exit;
   5860 	}
   5861 
   5862 	if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
   5863 		/*
   5864 		 * First make sure the NVERIFY failed as we expected,
   5865 		 * if it didn't then be conservative and error out
   5866 		 * as we can't trust the directory.
   5867 		 */
   5868 		if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
   5869 			sfh4_rele(&sfhp);
   5870 			e.error = EIO;
   5871 			goto exit;
   5872 		}
   5873 
   5874 		/*
   5875 		 * We know the NVERIFY "failed" so the directory has changed,
   5876 		 * so we must:
   5877 		 *	purge the caches (access and indirectly dnlc if needed)
   5878 		 */
   5879 		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
   5880 
   5881 		if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
   5882 			sfh4_rele(&sfhp);
   5883 			goto exit;
   5884 		}
        		/* Install the new cached attributes for the directory */
   5885 		nfs4_attr_cache(dvp,
   5886 		    &res.array[7].nfs_resop4_u.opgetattr.ga_res,
   5887 		    t, cr, FALSE, NULL);
   5888 
   5889 		if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
   5890 			nfs4_purge_stale_fh(e.error, dvp, cr);
   5891 			sfh4_rele(&sfhp);
   5892 			e.error = geterrno4(res.status);
   5893 			goto exit;
   5894 		}
   5895 
   5896 		/*
   5897 		 * Now we know the directory is valid,
   5898 		 * cache new directory access
   5899 		 */
   5900 		nfs4_access_cache(drp,
   5901 		    args.array[8].nfs_argop4_u.opaccess.access,
   5902 		    res.array[8].nfs_resop4_u.opaccess.access, cr);
   5903 
   5904 		/*
   5905 		 * recheck VEXEC access
   5906 		 */
   5907 		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
   5908 		if (cacc != NFS4_ACCESS_ALLOWED) {
   5909 			/*
   5910 			 * Directory permissions might have been revoked
   5911 			 */
   5912 			if (cacc == NFS4_ACCESS_DENIED) {
   5913 				sfh4_rele(&sfhp);
   5914 				e.error = EACCES;
   5915 				goto exit;
   5916 			}
   5917 
   5918 			/*
   5919 			 * Somehow we must not have asked for enough
   5920 			 * so try a singleton ACCESS; should never happen.
   5921 			 */
   5922 			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
   5923 			if (e.error) {
   5924 				sfh4_rele(&sfhp);
   5925 				goto exit;
   5926 			}
   5927 		}
   5928 
        		/* restore the compound's own status after the access checks */
   5929 		e.error = geterrno4(res.status);
   5930 	} else {
   5931 		hrtime_t now;
   5932 		hrtime_t delta = 0;
   5933 
   5934 		e.error = 0;
   5935 
   5936 		/*
   5937 		 * Because the NVERIFY "succeeded" we know that the
   5938 		 * directory attributes are still valid
   5939 		 * so update r_time_attr_inval.  Unless attribute caching
        		 * is off (MI4_NOAC or VNOCACHE), the extension is the time
        		 * since r_time_attr_saved clamped to
        		 * [mi_acdirmin, mi_acdirmax]; otherwise delta stays 0.
   5940 		 */
   5941 		now = gethrtime();
   5942 		mutex_enter(&drp->r_statelock);
   5943 		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
   5944 			delta = now - drp->r_time_attr_saved;
   5945 			if (delta < mi->mi_acdirmin)
   5946 				delta = mi->mi_acdirmin;
   5947 			else if (delta > mi->mi_acdirmax)
   5948 				delta = mi->mi_acdirmax;
   5949 		}
   5950 		drp->r_time_attr_inval = now + delta;
   5951 		mutex_exit(&drp->r_statelock);
   5952 
   5953 		/*
   5954 		 * Even though we have a valid directory attr cache,
   5955 		 * we may not have access.
   5956 		 * This should almost always hit the cache.
   5957 		 */
   5958 		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
   5959 		if (e.error) {
   5960 			sfh4_rele(&sfhp);
   5961 			goto exit;
   5962 		}
   5963 	}
   5964 
   5965 	/*
   5966 	 * Now we have successfully completed the lookup, if the
   5967 	 * directory has changed we now have the valid attributes.
   5968 	 * We also know we have directory access.
   5969 	 * Create the new rnode and insert it in the dnlc.
   5970 	 */
   5971 	if (isdotdot) {
   5972 		e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
   5973 		if (e.error) {
   5974 			sfh4_rele(&sfhp);
   5975 			goto exit;
   5976 		}
   5977 		/*
   5978 		 * XXX if nfs4_make_dotdot uses an existing rnode
   5979 		 * XXX it doesn't update the attributes.
   5980 		 * XXX for now just save them again to save an OTW
   5981 		 */
   5982 		nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
   5983 	} else {
   5984 		nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
   5985 		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
   5986 	}
   5987 	sfh4_rele(&sfhp);
   5988 
        	/*
        	 * The name is entered in the dnlc only when the rnode's created_v4
        	 * flag is clear; the flag is read under r_statev4_lock and the
        	 * lock is dropped before dnlc_update() is called.
        	 */
   5989 	nrp = VTOR4(nvp);
   5990 	mutex_enter(&nrp->r_statev4_lock);
   5991 	if (!nrp->created_v4) {
   5992 		mutex_exit(&nrp->r_statev4_lock);
   5993 		dnlc_update(dvp, nm, nvp);
   5994 	} else
   5995 		mutex_exit(&nrp->r_statev4_lock);
   5996 
   5997 	*vpp = nvp;
   5998 
        	/* Common cleanup: free the decoded reply and the op array */
   5999 exit:
   6000 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   6001 	kmem_free(argop, argoplist_size);
   6002 	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
   6003 	return (e.error);
   6004 }
   6005 
   6006 #ifdef DEBUG
        /*
         * nfs4lookup_dump_compound - DEBUG-only aid that logs a compound's ops.
         *
         * Emits one CE_NOTE header line naming `where', then one CE_NOTE line
         * for each of the first argcnt entries of argbase, through zcmn_err()
         * for the zone id returned by getzoneid().  Ops without a dedicated
         * case are logged by numeric opcode.
         *
         *	where	label identifying the caller in the log output
         *	argbase	first element of the op array to dump
         *	argcnt	number of ops to dump; nothing past the header line is
         *		logged when it is zero or negative
         */
   6007 void
   6008 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
   6009 {
        	/*
        	 * i is signed to match both argcnt and the %d conversions below; an
        	 * unsigned index would turn a negative argcnt into a huge bound.
        	 */
   6010 	int i;
        	uint_t len;
   6011 	zoneid_t zoneid = getzoneid();
   6012 	char *s;
   6013 
   6014 	zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
   6015 	for (i = 0; i < argcnt; i++) {
   6016 		nfs_argop4 *op = &argbase[i];
   6017 		switch (op->argop) {
   6018 		case OP_CPUTFH:
   6019 		case OP_PUTFH:
   6020 			zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
   6021 			break;
   6022 		case OP_PUTROOTFH:
   6023 			zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
   6024 			break;
   6025 		case OP_CLOOKUP:
   6026 			s = op->nfs_argop4_u.opclookup.cname;
   6027 			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
   6028 			break;
   6029 		case OP_LOOKUP:
        			/*
        			 * utf8_to_str() is defined elsewhere; don't rely on
        			 * it having produced a buffer or stored a length
        			 * before handing both to kmem_free().
        			 */
        			len = 0;
   6030 			s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
   6031 			    &len, NULL);
        			if (s == NULL) {
        				zcmn_err(zoneid, CE_NOTE,
        				    "\t op %d, lookup <null>", i);
        				break;
        			}
   6032 			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
   6033 			kmem_free(s, len);
   6034 			break;
   6035 		case OP_LOOKUPP:
   6036 			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
   6037 			break;
   6038 		case OP_GETFH:
   6039 			zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
   6040 			break;
   6041 		case OP_GETATTR:
   6042 			zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
   6043 			break;
   6044 		case OP_OPENATTR:
   6045 			zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
   6046 			break;
   6047 		default:
   6048 			zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
   6049 			    op->argop);
   6050 			break;
   6051 		}
   6052 	}
   6053 }
   6054 #endif
   6055 
   6056 /*
   6057  * nfs4lookup_setup - constructs a multi-lookup compound request.
   6058  *
   6059  * Given the path "nm1/nm2/.../nmn", the following compound requests
   6060  * may be created:
   6061  *
   6062  * Note: Getfh is not strictly needed because the filehandle attr is
   6063  * mandatory, but it is faster, for now.
   6064  *
   6065  * l4_getattrs indicates the type of compound requested.
   6066  *
   6067  * LKP4_NO_ATTRIBUTES - no attributes (used by secinfo):
   6068  *
   6069  *	compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ...  Lookup {nmn} }
   6070  *
   6071  *   total number of ops is n + 1.
   6072  *
   6073  * LKP4_LAST_NAMED_ATTR - multi-component path for a named
   6074  *      attribute: create lookups plus one OPENATTR/GETFH/GETATTR
   6075  *      before the last component, and only get attributes
   6076  *      for the last component.  Note that the second-to-last
   6077  *	pathname component is XATTR_RPATH, which does NOT go
   6078  *	over-the-wire as a lookup.
   6079  *
   6080  *      compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
   6081  *		Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
   6082  *
   6083  *   and total number of ops is n + 5.
   6084  *
   6085  * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
   6086  *      attribute directory: create lookups plus an OPENATTR
   6087  *	replacing the last lookup.  Note that the last pathname
   6088  *	component is XATTR_RPATH, which does NOT go over-the-wire
   6089  *	as a lookup.
   6090  *
   6091  *      compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
   6092  *		Openattr; Getfh; Getattr }
   6093  *
   6094  *   and total number of ops is n + 5.
   6095  *
   6096  * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
   6097  *	nodes too.
   6098  *
   6099  *	compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
   6100  *		Lookup {nm2}; ...  Lookup {nmn}; Getfh; Getattr }
   6101  *
   6102  *   and total number of ops is 3*n + 1.
   6103  *
         * Arguments:
         *	nm		NUL-terminated path.  It is edited in place while
         *			being parsed (each separator is briefly replaced
         *			by NUL and then restored), so it must be writable.
         *	lookupargp	supplies l4_getattrs, header_len, trailer_len,
         *			ga_bits and mi; receives resp->status (NFS4_OK),
         *			argsp->array, argsp->array_len (ops actually
         *			used, including header and trailer) and arglen
         *			(ops allocated).
         *	needgetfh	non-zero to place a Getfh ahead of the final
         *			post-lookup Getattr.
         *
         * The op array is allocated here with kmem_alloc(KM_SLEEP) and is not
         * freed here.  The first header_len slots and the trailer slots are
         * skipped, not filled in, by this routine.
         *
   6104  * All cases: returns the index in the arg array of the final LOOKUP op, or
   6105  * -1 if no LOOKUPs were used.
   6106  */
   6107 int
   6108 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
   6109 {
   6110 	enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
   6111 	nfs_argop4 *argbase, *argop;
   6112 	int arglen, argcnt;
   6113 	int n = 1;	/* number of components */
   6114 	int nga = 1;	/* number of Getattr's in request */
   6115 	char c = '\0', *s, *p;
   6116 	int lookup_idx = -1;
   6117 	int argoplist_size;
   6118 
   6119 	/* set lookuparg response result to 0 */
   6120 	lookupargp->resp->status = NFS4_OK;
   6121 
   6122 	/* skip leading "/" or "." e.g. ".//./" if there is */
   6123 	for (; ; nm++) {
   6124 		if (*nm != '/' && *nm != '.')
   6125 			break;
   6126 
   6127 		/* ".." is counted as 1 component */
   6128 		if (*nm == '.' && *(nm + 1) != '/')
   6129 			break;
   6130 	}
   6131 
   6132 	/*
   6133 	 * Find n = number of components - nm must be null terminated
   6134 	 * Skip "." components.
   6135 	 */
   6136 	if (*nm != '\0')
   6137 		for (n = 1, s = nm; *s != '\0'; s++) {
   6138 			if ((*s == '/') && (*(s + 1) != '/') &&
   6139 			    (*(s + 1) != '\0') &&
   6140 			    !(*(s + 1) == '.' && (*(s + 2) == '/' ||
   6141 			    *(s + 2) == '\0')))
   6142 				n++;
   6143 		}
   6144 	else
   6145 		n = 0;
   6146 
   6147 	/*
   6148 	 * nga is number of components that need Getfh+Getattr
   6149 	 */
   6150 	switch (l4_getattrs) {
   6151 	case LKP4_NO_ATTRIBUTES:
   6152 		nga = 0;
   6153 		break;
   6154 	case LKP4_ALL_ATTRIBUTES:
   6155 		nga = n;
   6156 		/*
   6157 		 * Always have at least 1 getfh, getattr pair
   6158 		 */
   6159 		if (nga == 0)
   6160 			nga++;
   6161 		break;
   6162 	case LKP4_LAST_ATTRDIR:
   6163 	case LKP4_LAST_NAMED_ATTR:
   6164 		nga = n+1;
   6165 		break;
   6166 	}
   6167 
   6168 	/*
   6169 	 * If change to use the filehandle attr instead of getfh
   6170 	 * the following line can be deleted.
   6171 	 */
   6172 	nga *= 2;
   6173 
   6174 	/*
   6175 	 * calculate number of ops in request as
   6176 	 * header + trailer + lookups + getattrs
   6177 	 */
   6178 	arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;
   6179 
   6180 	argoplist_size = arglen * sizeof (nfs_argop4);
   6181 	argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
   6182 	lookupargp->argsp->array = argop;
   6183 
        	/* leave the header slots for the caller to fill in */
   6184 	argcnt = lookupargp->header_len;
   6185 	argop += argcnt;
   6186 
   6187 	/*
   6188 	 * loop and create a lookup op and possibly getattr/getfh for
   6189 	 * each component. Skip "." components.
   6190 	 */
   6191 	for (s = nm; *s != '\0'; s = p) {
   6192 		/*
   6193 		 * Set up a pathname struct for each component if needed
   6194 		 */
   6195 		while (*s == '/')
   6196 			s++;
   6197 		if (*s == '\0')
   6198 			break;
   6199 
        		/*
        		 * Temporarily NUL-terminate the component in place; the
        		 * saved character c is put back on every path below.
        		 */
   6200 		for (p = s; (*p != '/') && (*p != '\0'); p++)
   6201 			;
   6202 		c = *p;
   6203 		*p = '\0';
   6204 
   6205 		if (s[0] == '.' && s[1] == '\0') {
   6206 			*p = c;
   6207 			continue;
   6208 		}
   6209 		if (l4_getattrs == LKP4_LAST_ATTRDIR &&
   6210 		    strcmp(s, XATTR_RPATH) == 0) {
   6211 			/* getfh XXX may not be needed in future */
   6212 			argop->argop = OP_GETFH;
   6213 			argop++;
   6214 			argcnt++;
   6215 
   6216 			/* getattr */
   6217 			argop->argop = OP_GETATTR;
   6218 			argop->nfs_argop4_u.opgetattr.attr_request =
   6219 			    lookupargp->ga_bits;
   6220 			argop->nfs_argop4_u.opgetattr.mi =
   6221 			    lookupargp->mi;
   6222 			argop++;
   6223 			argcnt++;
   6224 
   6225 			/*
        			 * openattr.  The op array comes from kmem_alloc()
        			 * and is not zeroed, so createdir must be set
        			 * explicitly; a lookup must not ask the server to
        			 * create the attribute directory.
        			 */
   6226 			argop->argop = OP_OPENATTR;
        			argop->nfs_argop4_u.opopenattr.createdir = FALSE;
   6227 		} else if (l4_getattrs == LKP4_LAST_NAMED_ATTR &&
   6228 		    strcmp(s, XATTR_RPATH) == 0) {
   6229 			/* openattr; createdir set explicitly, as above */
   6230 			argop->argop = OP_OPENATTR;
        			argop->nfs_argop4_u.opopenattr.createdir = FALSE;
   6231 			argop++;
   6232 			argcnt++;
   6233 
   6234 			/* getfh XXX may not be needed in future */
   6235 			argop->argop = OP_GETFH;
   6236 			argop++;
   6237 			argcnt++;
   6238 
   6239 			/* getattr */
   6240 			argop->argop = OP_GETATTR;
   6241 			argop->nfs_argop4_u.opgetattr.attr_request =
   6242 			    lookupargp->ga_bits;
   6243 			argop->nfs_argop4_u.opgetattr.mi =
   6244 			    lookupargp->mi;
   6245 			argop++;
   6246 			argcnt++;
   6247 			*p = c;
   6248 			continue;
   6249 		} else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') {
   6250 			/* lookupp */
   6251 			argop->argop = OP_LOOKUPP;
   6252 		} else {
   6253 			/* lookup */
   6254 			argop->argop = OP_LOOKUP;
   6255 			(void) str_to_utf8(s,
   6256 			    &argop->nfs_argop4_u.oplookup.objname);
   6257 		}
        		/*
        		 * Remember the most recent lookup-position op.  Note that
        		 * the LKP4_LAST_ATTRDIR OPENATTR above also lands here.
        		 */
   6258 		lookup_idx = argcnt;
   6259 		argop++;
   6260 		argcnt++;
   6261 
   6262 		*p = c;
   6263 
   6264 		if (l4_getattrs == LKP4_ALL_ATTRIBUTES) {
   6265 			/* getfh XXX may not be needed in future */
   6266 			argop->argop = OP_GETFH;
   6267 			argop++;
   6268 			argcnt++;
   6269 
   6270 			/* getattr */
   6271 			argop->argop = OP_GETATTR;
   6272 			argop->nfs_argop4_u.opgetattr.attr_request =
   6273 			    lookupargp->ga_bits;
   6274 			argop->nfs_argop4_u.opgetattr.mi =
   6275 			    lookupargp->mi;
   6276 			argop++;
   6277 			argcnt++;
   6278 		}
   6279 	}
   6280 
        	/*
        	 * A trailing [Getfh] Getattr is needed unless no attributes were
        	 * requested, or LKP4_ALL_ATTRIBUTES already appended a pair after
        	 * at least one lookup.
        	 */
   6281 	if ((l4_getattrs != LKP4_NO_ATTRIBUTES) &&
   6282 	    ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) {
   6283 		if (needgetfh) {
   6284 			/* stick in a post-lookup getfh */
   6285 			argop->argop = OP_GETFH;
   6286 			argcnt++;
   6287 			argop++;
   6288 		}
   6289 		/* post-lookup getattr */
   6290 		argop->argop = OP_GETATTR;
   6291 		argop->nfs_argop4_u.opgetattr.attr_request =
   6292 		    lookupargp->ga_bits;
   6293 		argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi;
   6294 		argcnt++;
   6295 	}
   6296 	argcnt += lookupargp->trailer_len;	/* actual op count */
   6297 	lookupargp->argsp->array_len = argcnt;
   6298 	lookupargp->arglen = arglen;
   6299 
   6300 #ifdef DEBUG
   6301 	if (nfs4_client_lookup_debug)
   6302 		nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt);
   6303 #endif
   6304 
   6305 	return (lookup_idx);
   6306 }
   6307 
   6308 static int
   6309 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr)
   6310 {
   6311 	COMPOUND4args_clnt	args;
   6312 	COMPOUND4res_clnt	res;
   6313 	GETFH4res	*gf_res = NULL;
   6314 	nfs_argop4	argop[4];
   6315 	nfs_resop4	*resop = NULL;
   6316 	nfs4_sharedfh_t *sfhp;
   6317 	hrtime_t t;
   6318 	nfs4_error_t	e;
   6319 
   6320 	rnode4_t	*drp;
   6321 	int		doqueue = 1;
   6322 	vnode_t		*vp;
   6323 	int		needrecov = 0;
   6324 	nfs4_recov_state_t recov_state;
   6325 
   6326 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
   6327 
   6328 	*avp = NULL;
   6329 	recov_state.rs_flags = 0;
   6330 	recov_state.rs_num_retry_despite_err = 0;
   6331 
   6332 recov_retry:
   6333 	/* COMPOUND: putfh, openattr, getfh, getattr */
   6334 	args.array_len = 4;
   6335 	args.array = argop;
   6336 	args.ctag = TAG_OPENATTR;
   6337 
   6338 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
   6339 	if (e.error)
   6340 		return (e.error);
   6341 
   6342 	drp = VTOR4(dvp);
   6343 
   6344 	/* putfh */
   6345 	argop[0].argop = OP_CPUTFH;
   6346 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
   6347 
   6348 	/* openattr */
   6349 	argop[1].argop = OP_OPENATTR;
   6350 	argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);
   6351 
   6352 	/* getfh */
   6353 	argop[2].argop = OP_GETFH;
   6354 
   6355 	/* getattr */
   6356 	argop[3].argop = OP_GETATTR;
   6357 	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   6358 	argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
   6359 
   6360 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
   6361 	    "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
   6362 	    rnode4info(drp)));
   6363 
   6364 	t = gethrtime();
   6365 
   6366 	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
   6367 
   6368 	needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
   6369 	if (needrecov) {
   6370 		bool_t abort;
   6371 
   6372 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   6373 		    "nfs4openattr: initiating recovery\n"));
   6374 
   6375 		abort = nfs4_start_recovery(&e,
   6376 		    VTOMI4(dvp), dvp, NULL, NULL, NULL,
   6377 		    OP_OPENATTR, NULL);
   6378 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
   6379 		if (!e.error) {
   6380 			e.error = geterrno4(res.status);
   6381 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   6382 		}
   6383 		if (abort == FALSE)
   6384 			goto recov_retry;
   6385 		return (e.error);
   6386 	}
   6387 
   6388 	if (e.error) {
   6389 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
   6390 		return (e.error);
   6391 	}
   6392 
   6393 	if (res.status) {
   6394 		/*
   6395 		 * If OTW error is NOTSUPP, then it should be
   6396 		 * translated to EINVAL.  All Solaris file system
   6397 		 * implementations return EINVAL to the syscall layer
   6398 		 * when the attrdir cannot be created due to an
   6399 		 * implementation restriction or noxattr mount option.
   6400 		 */
   6401 		if (res.status == NFS4ERR_NOTSUPP) {
   6402 			mutex_enter(&drp->r_statelock);
   6403 			if (drp->r_xattr_dir)
   6404 				VN_RELE(drp->r_xattr_dir);
   6405 			VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
   6406 			drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
   6407 			mutex_exit(&drp->r_statelock);
   6408 
   6409 			e.error = EINVAL;
   6410 		} else {
   6411 			e.error = geterrno4(res.status);
   6412 		}
   6413 
   6414 		if (e.error) {
   6415 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   6416 			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
   6417 			    needrecov);
   6418 			return (e.error);
   6419 		}
   6420 	}
   6421 
   6422 	resop = &res.array[0];  /* putfh res */
   6423 	ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);
   6424 
   6425 	resop = &res.array[1];  /* openattr res */
   6426 	ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);
   6427 
   6428 	resop = &res.array[2];  /* getfh res */
   6429 	gf_res = &resop->nfs_resop4_u.opgetfh;
   6430 	if (gf_res->object.nfs_fh4_len == 0) {
   6431 		*avp = NULL;
   6432 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   6433 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
   6434 		return (ENOENT);
   6435 	}
   6436 
   6437 	sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
   6438 	vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
   6439 	    dvp->v_vfsp, t, cr, dvp,
   6440 	    fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp));
   6441 	sfh4_rele(&sfhp);
   6442 
   6443 	if (e.error)
   6444 		PURGE_ATTRCACHE4(vp);
   6445 
   6446 	mutex_enter(&vp->v_lock);
   6447 	vp->v_flag |= V_XATTRDIR;
   6448 	mutex_exit(&vp->v_lock);
   6449 
   6450 	*avp = vp;
   6451 
   6452 	mutex_enter(&drp->r_statelock);
   6453 	if (drp->r_xattr_dir)
   6454 		VN_RELE(drp->r_xattr_dir);
   6455 	VN_HOLD(vp);
   6456 	drp->r_xattr_dir = vp;
   6457 
   6458 	/*
   6459 	 * Invalidate pathconf4 cache because r_xattr_dir is no longer
   6460 	 * NULL.  xattrs could be created at any time, and we have no
   6461 	 * way to update pc4_xattr_exists in the base object if/when
   6462 	 * it happens.
   6463 	 */
   6464 	drp->r_pathconf.pc4_xattr_valid = 0;
   6465 
   6466 	mutex_exit(&drp->r_statelock);
   6467 
   6468 	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
   6469 
   6470 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   6471 
   6472 	return (0);
   6473 }
   6474 
   6475 /* ARGSUSED */
   6476 static int
   6477 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
   6478 	int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
   6479 	vsecattr_t *vsecp)
   6480 {
   6481 	int error;
   6482 	vnode_t *vp = NULL;
   6483 	rnode4_t *rp;
   6484 	struct vattr vattr;
   6485 	rnode4_t *drp;
   6486 	vnode_t *tempvp;
   6487 	enum createmode4 createmode;
   6488 	bool_t must_trunc = FALSE;
   6489 	int	truncating = 0;
   6490 
   6491 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
   6492 		return (EPERM);
   6493 	if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
   6494 		return (EINVAL);
   6495 	}
   6496 
   6497 	/* . and .. have special meaning in the protocol, reject them. */
   6498 
   6499 	if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
   6500 		return (EISDIR);
   6501 
   6502 	drp = VTOR4(dvp);
   6503 
   6504 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
   6505 		return (EINTR);
   6506 
   6507 top:
   6508 	/*
   6509 	 * We make a copy of the attributes because the caller does not
   6510 	 * expect us to change what va points to.
   6511 	 */
   6512 	vattr = *va;
   6513 
   6514 	/*
   6515 	 * If the pathname is "", then dvp is the root vnode of
   6516 	 * a remote file mounted over a local directory.
   6517 	 * All that needs to be done is access
   6518 	 * checking and truncation.  Note that we avoid doing
   6519 	 * open w/ create because the parent directory might
   6520 	 * be in pseudo-fs and the open would fail.
   6521 	 */
   6522 	if (*nm == '\0') {
   6523 		error = 0;
   6524 		VN_HOLD(dvp);
   6525 		vp = dvp;
   6526 		must_trunc = TRUE;
   6527 	} else {
   6528 		/*
   6529 		 * We need to go over the wire, just to be sure whether the
   6530 		 * file exists or not.  Using the DNLC can be dangerous in
   6531 		 * this case when making a decision regarding existence.
   6532 		 */
   6533 		error = nfs4lookup(dvp, nm, &vp, cr, 1);
   6534 	}
   6535 
   6536 	if (exclusive)
   6537 		createmode = EXCLUSIVE4;
   6538 	else
   6539 		createmode = GUARDED4;
   6540 
   6541 	/*
   6542 	 * error would be set if the file does not exist on the
   6543 	 * server, so lets go create it.
   6544 	 */
   6545 	if (error) {
   6546 		goto create_otw;
   6547 	}
   6548 
   6549 	/*
   6550 	 * File does exist on the server
   6551 	 */
   6552 	if (exclusive == EXCL)
   6553 		error = EEXIST;
   6554 	else if (vp->v_type == VDIR && (mode & VWRITE))
   6555 		error = EISDIR;
   6556 	else {
   6557 		/*
   6558 		 * If vnode is a device, create special vnode.
   6559 		 */
   6560 		if (ISVDEV(vp->v_type)) {
   6561 			tempvp = vp;
   6562 			vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
   6563 			VN_RELE(tempvp);
   6564 		}
   6565 		if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
   6566 			if ((vattr.va_mask & AT_SIZE) &&
   6567 			    vp->v_type == VREG) {
   6568 				rp = VTOR4(vp);
   6569 				/*
   6570 				 * Check here for large file handled
   6571 				 * by LF-unaware process (as
   6572 				 * ufs_create() does)
   6573 				 */
   6574 				if (!(flags & FOFFMAX)) {
   6575 					mutex_enter(&rp->r_statelock);
   6576 					if (rp->r_size > MAXOFF32_T)
   6577 						error = EOVERFLOW;
   6578 					mutex_exit(&rp->r_statelock);
   6579 				}
   6580 
   6581 				/* if error is set then we need to return */
   6582 				if (error) {
   6583 					nfs_rw_exit(&drp->r_rwlock);
   6584 					VN_RELE(vp);
   6585 					return (error);
   6586 				}
   6587 
   6588 				if (must_trunc) {
   6589 					vattr.va_mask = AT_SIZE;
   6590 					error = nfs4setattr(vp, &vattr, 0, cr,
   6591 					    NULL);
   6592 				} else {
   6593 				/*
   6594 				 * we know we have a regular file that already
   6595 				 * exists and we may end up truncating the file
   6596 				 * as a result of the open_otw, so flush out
   6597 				 * any dirty pages for this file first.
   6598 				 */
   6599 					if (nfs4_has_pages(vp) &&
   6600 					    ((rp->r_flags & R4DIRTY) ||
   6601 					    rp->r_count > 0 ||
   6602 					    rp->r_mapcnt > 0)) {
   6603 						error = nfs4_putpage(vp,
   6604 						    (offset_t)0, 0, 0, cr, ct);
   6605 						if (error && (error == ENOSPC ||
   6606 						    error == EDQUOT)) {
   6607 							mutex_enter(
   6608 							    &rp->r_statelock);
   6609 							if (!rp->r_error)
   6610 								rp->r_error =
   6611 								    error;
   6612 							mutex_exit(
   6613 							    &rp->r_statelock);
   6614 						}
   6615 					}
   6616 					vattr.va_mask = (AT_SIZE |
   6617 					    AT_TYPE | AT_MODE);
   6618 					vattr.va_type = VREG;
   6619 					createmode = UNCHECKED4;
   6620 					truncating = 1;
   6621 					goto create_otw;
   6622 				}
   6623 			}
   6624 		}
   6625 	}
   6626 	nfs_rw_exit(&drp->r_rwlock);
   6627 	if (error) {
   6628 		VN_RELE(vp);
   6629 	} else {
   6630 		vnode_t *tvp;
   6631 		rnode4_t *trp;
   6632 		/*
   6633 		 * existing file got truncated, notify.
   6634 		 */
   6635 		tvp = vp;
   6636 		if (vp->v_type == VREG) {
   6637 			trp = VTOR4(vp);
   6638 			if (IS_SHADOW(vp, trp))
   6639 				tvp = RTOV4(trp);
   6640 		}
   6641 		vnevent_create(tvp, ct);
   6642 		*vpp = vp;
   6643 	}
   6644 	return (error);
   6645 
   6646 create_otw:
   6647 	dnlc_remove(dvp, nm);
   6648 
   6649 	ASSERT(vattr.va_mask & AT_TYPE);
   6650 
   6651 	/*
   6652 	 * If not a regular file let nfs4mknod() handle it.
   6653 	 */
   6654 	if (vattr.va_type != VREG) {
   6655 		error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
   6656 		nfs_rw_exit(&drp->r_rwlock);
   6657 		return (error);
   6658 	}
   6659 
   6660 	/*
   6661 	 * It _is_ a regular file.
   6662 	 */
   6663 	ASSERT(vattr.va_mask & AT_MODE);
   6664 	if (MANDMODE(vattr.va_mode)) {
   6665 		nfs_rw_exit(&drp->r_rwlock);
   6666 		return (EACCES);
   6667 	}
   6668 
   6669 	/*
   6670 	 * If this happens to be a mknod of a regular file, then flags will
   6671 	 * have neither FREAD or FWRITE.  However, we must set at least one
   6672 	 * for the call to nfs4open_otw.  If it's open(O_CREAT) driving
   6673 	 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
   6674 	 * set (based on openmode specified by app).
   6675 	 */
   6676 	if ((flags & (FREAD|FWRITE)) == 0)
   6677 		flags |= (FREAD|FWRITE);
   6678 
   6679 	error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);
   6680 
   6681 	if (vp != NULL) {
   6682 		/* if create was successful, throw away the file's pages */
   6683 		if (!error && (vattr.va_mask & AT_SIZE))
   6684 			nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
   6685 			    cr);
   6686 		/* release the lookup hold */
   6687 		VN_RELE(vp);
   6688 		vp = NULL;
   6689 	}
   6690 
   6691 	/*
   6692 	 * validate that we opened a regular file. This handles a misbehaving
   6693 	 * server that returns an incorrect FH.
   6694 	 */
   6695 	if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
   6696 		error = EISDIR;
   6697 		VN_RELE(*vpp);
   6698 	}
   6699 
   6700 	/*
   6701 	 * If this is not an exclusive create, then the CREATE
   6702 	 * request will be made with the GUARDED mode set.  This
   6703 	 * means that the server will return EEXIST if the file
   6704 	 * exists.  The file could exist because of a retransmitted
   6705 	 * request.  In this case, we recover by starting over and
   6706 	 * checking to see whether the file exists.  This second
   6707 	 * time through it should and a CREATE request will not be
   6708 	 * sent.
   6709 	 *
   6710 	 * This handles the problem of a dangling CREATE request
   6711 	 * which contains attributes which indicate that the file
   6712 	 * should be truncated.  This retransmitted request could
   6713 	 * possibly truncate valid data in the file if not caught
   6714 	 * by the duplicate request mechanism on the server or if
   6715 	 * not caught by other means.  The scenario is:
   6716 	 *
   6717 	 * Client transmits CREATE request with size = 0
   6718 	 * Client times out, retransmits request.
   6719 	 * Response to the first request arrives from the server
   6720 	 *  and the client proceeds on.
   6721 	 * Client writes data to the file.
   6722 	 * The server now processes retransmitted CREATE request
   6723 	 *  and truncates file.
   6724 	 *
   6725 	 * The use of the GUARDED CREATE request prevents this from
   6726 	 * happening because the retransmitted CREATE would fail
   6727 	 * with EEXIST and would not truncate the file.
   6728 	 */
   6729 	if (error == EEXIST && exclusive == NONEXCL) {
   6730 #ifdef DEBUG
   6731 		nfs4_create_misses++;
   6732 #endif
   6733 		goto top;
   6734 	}
   6735 	nfs_rw_exit(&drp->r_rwlock);
   6736 	if (truncating && !error && *vpp) {
   6737 		vnode_t *tvp;
   6738 		rnode4_t *trp;
   6739 		/*
   6740 		 * existing file got truncated, notify.
   6741 		 */
   6742 		tvp = *vpp;
   6743 		trp = VTOR4(tvp);
   6744 		if (IS_SHADOW(tvp, trp))
   6745 			tvp = RTOV4(trp);
   6746 		vnevent_create(tvp, ct);
   6747 	}
   6748 	return (error);
   6749 }
   6750 
   6751 /*
   6752  * Create compound (for mkdir, mknod, symlink):
   6753  * { Putfh <dfh>; Create; Getfh; Getattr }
   6754  * It's okay if setattr failed to set gid - this is not considered
   6755  * an error, but purge attrs in that case.
   6756  */
   6757 static int
   6758 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
   6759     vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
   6760 {
   6761 	int need_end_op = FALSE;
   6762 	COMPOUND4args_clnt args;
   6763 	COMPOUND4res_clnt res, *resp = NULL;
   6764 	nfs_argop4 *argop;
   6765 	nfs_resop4 *resop;
   6766 	int doqueue;
   6767 	mntinfo4_t *mi;
   6768 	rnode4_t *drp = VTOR4(dvp);
   6769 	change_info4 *cinfo;
   6770 	GETFH4res *gf_res;
   6771 	struct vattr vattr;
   6772 	vnode_t *vp;
   6773 	fattr4 *crattr;
   6774 	bool_t needrecov = FALSE;
   6775 	nfs4_recov_state_t recov_state;
   6776 	nfs4_sharedfh_t *sfhp = NULL;
   6777 	hrtime_t t;
   6778 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   6779 	int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
   6780 	dirattr_info_t dinfo, *dinfop;
   6781 	servinfo4_t *svp;
   6782 	bitmap4 supp_attrs;
   6783 
   6784 	ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
   6785 	    type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
   6786 
   6787 	mi = VTOMI4(dvp);
   6788 
   6789 	/*
   6790 	 * Make sure we properly deal with setting the right gid
   6791 	 * on a new directory to reflect the parent's setgid bit
   6792 	 */
   6793 	setgid_flag = 0;
   6794 	if (type == NF4DIR) {
   6795 		struct vattr dva;
   6796 
   6797 		va->va_mode &= ~VSGID;
   6798 		dva.va_mask = AT_MODE | AT_GID;
   6799 		if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) {
   6800 
   6801 			/*
   6802 			 * If the parent's directory has the setgid bit set
   6803 			 * _and_ the client was able to get a valid mapping
   6804 			 * for the parent dir's owner_group, we want to
   6805 			 * append NVERIFY(owner_group == dva.va_gid) and
   6806 			 * SETTATTR to the CREATE compound.
   6807 			 */
   6808 			if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
   6809 				setgid_flag = 1;
   6810 				va->va_mode |= VSGID;
   6811 				if (dva.va_gid != GID_NOBODY) {
   6812 					va->va_mask |= AT_GID;
   6813 					va->va_gid = dva.va_gid;
   6814 				}
   6815 			}
   6816 		}
   6817 	}
   6818 
   6819 	/*
   6820 	 * Create ops:
   6821 	 *	0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
   6822 	 *	5:restorefh(dir) 6:getattr(dir)
   6823 	 *
   6824 	 * if (setgid)
   6825 	 *	0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
   6826 	 *	4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
   6827 	 *	8:nverify 9:setattr
   6828 	 */
   6829 	if (setgid_flag) {
   6830 		numops = 10;
   6831 		idx_create = 1;
   6832 		idx_fattr = 3;
   6833 	} else {
   6834 		numops = 7;
   6835 		idx_create = 2;
   6836 		idx_fattr = 4;
   6837 	}
   6838 
   6839 	ASSERT(nfs_zone() == mi->mi_zone);
   6840 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
   6841 		return (EINTR);
   6842 	}
   6843 	recov_state.rs_flags = 0;
   6844 	recov_state.rs_num_retry_despite_err = 0;
   6845 
   6846 	argoplist_size = numops * sizeof (nfs_argop4);
   6847 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
   6848 
   6849 recov_retry:
   6850 	if (type == NF4LNK)
   6851 		args.ctag = TAG_SYMLINK;
   6852 	else if (type == NF4DIR)
   6853 		args.ctag = TAG_MKDIR;
   6854 	else
   6855 		args.ctag = TAG_MKNOD;
   6856 
   6857 	args.array_len = numops;
   6858 	args.array = argop;
   6859 
   6860 	if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
   6861 		nfs_rw_exit(&drp->r_rwlock);
   6862 		kmem_free(argop, argoplist_size);
   6863 		return (e.error);
   6864 	}
   6865 	need_end_op = TRUE;
   6866 
   6867 
   6868 	/* 0: putfh directory */
   6869 	argop[0].argop = OP_CPUTFH;
   6870 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
   6871 
   6872 	/* 1/2: Create object */
   6873 	argop[idx_create].argop = OP_CCREATE;
   6874 	argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
   6875 	argop[idx_create].nfs_argop4_u.opccreate.type = type;
   6876 	if (type == NF4LNK) {
   6877 		/*
   6878 		 * symlink, treat name as data
   6879 		 */
   6880 		ASSERT(data != NULL);
   6881 		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata =
   6882 		    (char *)data;
   6883 	}
   6884 	if (type == NF4BLK || type == NF4CHR) {
   6885 		ASSERT(data != NULL);
   6886 		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata =
   6887 		    *((specdata4 *)data);
   6888 	}
   6889 
   6890 	crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs;
   6891 
   6892 	svp = drp->r_server;
   6893 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   6894 	supp_attrs = svp->sv_supp_attrs;
   6895 	nfs_rw_exit(&svp->sv_lock);
   6896 
   6897 	if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) {
   6898 		nfs_rw_exit(&drp->r_rwlock);
   6899 		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
   6900 		e.error = EINVAL;
   6901 		kmem_free(argop, argoplist_size);
   6902 		return (e.error);
   6903 	}
   6904 
   6905 	/* 2/3: getfh fh of created object */
   6906 	ASSERT(idx_create + 1 == idx_fattr - 1);
   6907 	argop[idx_create + 1].argop = OP_GETFH;
   6908 
   6909 	/* 3/4: getattr of new object */
   6910 	argop[idx_fattr].argop = OP_GETATTR;
   6911 	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   6912 	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi;
   6913 
   6914 	if (setgid_flag) {
   6915 		vattr_t	_v;
   6916 
   6917 		argop[4].argop = OP_SAVEFH;
   6918 
   6919 		argop[5].argop = OP_CPUTFH;
   6920 		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
   6921 
   6922 		argop[6].argop = OP_GETATTR;
   6923 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   6924 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
   6925 
   6926 		argop[7].argop = OP_RESTOREFH;
   6927 
   6928 		/*
   6929 		 * nverify
   6930 		 *
   6931 		 * XXX - Revisit the last argument to nfs4_end_op()
   6932 		 *	 once 5020486 is fixed.
   6933 		 */
   6934 		_v.va_mask = AT_GID;
   6935 		_v.va_gid = va->va_gid;
   6936 		if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
   6937 		    supp_attrs)) {
   6938 			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
   6939 			nfs_rw_exit(&drp->r_rwlock);
   6940 			nfs4_fattr4_free(crattr);
   6941 			kmem_free(argop, argoplist_size);
   6942 			return (e.error);
   6943 		}
   6944 
   6945 		/*
   6946 		 * setattr
   6947 		 *
   6948 		 * We _know_ we're not messing with AT_SIZE or AT_XTIME,
   6949 		 * so no need for stateid or flags. Also we specify NULL
   6950 		 * rp since we're only interested in setting owner_group
   6951 		 * attributes.
   6952 		 */
   6953 		nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs,
   6954 		    &e.error, 0);
   6955 
   6956 		if (e.error) {
   6957 			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
   6958 			nfs_rw_exit(&drp->r_rwlock);
   6959 			nfs4_fattr4_free(crattr);
   6960 			nfs4args_verify_free(&argop[8]);
   6961 			kmem_free(argop, argoplist_size);
   6962 			return (e.error);
   6963 		}
   6964 	} else {
   6965 		argop[1].argop = OP_SAVEFH;
   6966 
   6967 		argop[5].argop = OP_RESTOREFH;
   6968 
   6969 		argop[6].argop = OP_GETATTR;
   6970 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   6971 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
   6972 	}
   6973 
   6974 	dnlc_remove(dvp, nm);
   6975 
   6976 	doqueue = 1;
   6977 	t = gethrtime();
   6978 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
   6979 
   6980 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
   6981 	if (e.error) {
   6982 		PURGE_ATTRCACHE4(dvp);
   6983 		if (!needrecov)
   6984 			goto out;
   6985 	}
   6986 
   6987 	if (needrecov) {
   6988 		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
   6989 		    OP_CREATE, NULL) == FALSE) {
   6990 			nfs4_end_op(mi, dvp, NULL, &recov_state,
   6991 			    needrecov);
   6992 			need_end_op = FALSE;
   6993 			nfs4_fattr4_free(crattr);
   6994 			if (setgid_flag) {
   6995 				nfs4args_verify_free(&argop[8]);
   6996 				nfs4args_setattr_free(&argop[9]);
   6997 			}
   6998 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
   6999 			goto recov_retry;
   7000 		}
   7001 	}
   7002 
   7003 	resp = &res;
   7004 
   7005 	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
   7006 
   7007 		if (res.status == NFS4ERR_BADOWNER)
   7008 			nfs4_log_badowner(mi, OP_CREATE);
   7009 
   7010 		e.error = geterrno4(res.status);
   7011 
   7012 		/*
   7013 		 * This check is left over from when create was implemented
   7014 		 * using a setattr op (instead of createattrs).  If the
   7015 		 * putfh/create/getfh failed, the error was returned.  If
   7016 		 * setattr/getattr failed, we keep going.
   7017 		 *
   7018 		 * It might be better to get rid of the GETFH also, and just
   7019 		 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory.
   7020 		 * Then if any of the operations failed, we could return the
   7021 		 * error now, and remove much of the error code below.
   7022 		 */
   7023 		if (res.array_len <= idx_fattr) {
   7024 			/*
   7025 			 * Either Putfh, Create or Getfh failed.
   7026 			 */
   7027 			PURGE_ATTRCACHE4(dvp);
   7028 			/*
   7029 			 * nfs4_purge_stale_fh() may generate otw calls through
   7030 			 * nfs4_invalidate_pages. Hence the need to call
   7031 			 * nfs4_end_op() here to avoid nfs4_start_op() deadlock.
   7032 			 */
   7033 			nfs4_end_op(mi, dvp, NULL, &recov_state,
   7034 			    needrecov);
   7035 			need_end_op = FALSE;
   7036 			nfs4_purge_stale_fh(e.error, dvp, cr);
   7037 			goto out;
   7038 		}
   7039 	}
   7040 
   7041 	resop = &res.array[idx_create];	/* create res */
   7042 	cinfo = &resop->nfs_resop4_u.opcreate.cinfo;
   7043 
   7044 	resop = &res.array[idx_create + 1]; /* getfh res */
   7045 	gf_res = &resop->nfs_resop4_u.opgetfh;
   7046 
   7047 	sfhp = sfh4_get(&gf_res->object, mi);
   7048 	if (e.error) {
   7049 		*vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp,
   7050 		    fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
   7051 		if (vp->v_type == VNON) {
   7052 			vattr.va_mask = AT_TYPE;
   7053 			/*
   7054 			 * Need to call nfs4_end_op before nfs4getattr to avoid
   7055 			 * potential nfs4_start_op deadlock. See RFE 4777612.
   7056 			 */
   7057 			nfs4_end_op(mi, dvp, NULL, &recov_state,
   7058 			    needrecov);
   7059 			need_end_op = FALSE;
   7060 			e.error = nfs4getattr(vp, &vattr, cr);
   7061 			if (e.error) {
   7062 				VN_RELE(vp);
   7063 				*vpp = NULL;
   7064 				goto out;
   7065 			}
   7066 			vp->v_type = vattr.va_type;
   7067 		}
   7068 		e.error = 0;
   7069 	} else {
   7070 		*vpp = vp = makenfs4node(sfhp,
   7071 		    &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res,
   7072 		    dvp->v_vfsp, t, cr,
   7073 		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
   7074 	}
   7075 
   7076 	/*
   7077 	 * If compound succeeded, then update dir attrs
   7078 	 */
   7079 	if (res.status == NFS4_OK) {
   7080 		dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
   7081 		dinfo.di_cred = cr;
   7082 		dinfo.di_time_call = t;
   7083 		dinfop = &dinfo;
   7084 	} else
   7085 		dinfop = NULL;
   7086 
   7087 	/* Update directory cache attribute, readdir and dnlc caches */
   7088 	nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop);
   7089 
   7090 out:
   7091 	if (sfhp != NULL)
   7092 		sfh4_rele(&sfhp);
   7093 	nfs_rw_exit(&drp->r_rwlock);
   7094 	nfs4_fattr4_free(crattr);
   7095 	if (setgid_flag) {
   7096 		nfs4args_verify_free(&argop[8]);
   7097 		nfs4args_setattr_free(&argop[9]);
   7098 	}
   7099 	if (resp)
   7100 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
   7101 	if (need_end_op)
   7102 		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
   7103 
   7104 	kmem_free(argop, argoplist_size);
   7105 	return (e.error);
   7106 }
   7107 
   7108 /* ARGSUSED */
   7109 static int
   7110 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
   7111     int mode, vnode_t **vpp, cred_t *cr)
   7112 {
   7113 	int error;
   7114 	vnode_t *vp;
   7115 	nfs_ftype4 type;
   7116 	specdata4 spec, *specp = NULL;
   7117 
   7118 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
   7119 
   7120 	switch (va->va_type) {
   7121 	case VCHR:
   7122 	case VBLK:
   7123 		type = (va->va_type == VCHR) ? NF4CHR : NF4BLK;
   7124 		spec.specdata1 = getmajor(va->va_rdev);
   7125 		spec.specdata2 = getminor(va->va_rdev);
   7126 		specp = &spec;
   7127 		break;
   7128 
   7129 	case VFIFO:
   7130 		type = NF4FIFO;
   7131 		break;
   7132 	case VSOCK:
   7133 		type = NF4SOCK;
   7134 		break;
   7135 
   7136 	default:
   7137 		return (EINVAL);
   7138 	}
   7139 
   7140 	error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type);
   7141 	if (error) {
   7142 		return (error);
   7143 	}
   7144 
   7145 	/*
   7146 	 * This might not be needed any more; special case to deal
   7147 	 * with problematic v2/v3 servers.  Since create was unable
   7148 	 * to set group correctly, not sure what hope setattr has.
   7149 	 */
   7150 	if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
   7151 		va->va_mask = AT_GID;
   7152 		(void) nfs4setattr(vp, va, 0, cr, NULL);
   7153 	}
   7154 
   7155 	/*
   7156 	 * If vnode is a device create special vnode
   7157 	 */
   7158 	if (ISVDEV(vp->v_type)) {
   7159 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
   7160 		VN_RELE(vp);
   7161 	} else {
   7162 		*vpp = vp;
   7163 	}
   7164 	return (error);
   7165 }
   7166 
   7167 /*
   7168  * Remove requires that the current fh be the target directory.
   7169  * After the operation, the current fh is unchanged.
   7170  * The compound op structure is:
   7171  *      PUTFH(targetdir), REMOVE
   7172  *
   7173  * Weirdness: if the vnode to be removed is open
   7174  * we rename it instead of removing it and nfs_inactive
   7175  * will remove the new name.
   7176  */
   7177 /* ARGSUSED */
   7178 static int
   7179 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
   7180 {
   7181 	COMPOUND4args_clnt args;
   7182 	COMPOUND4res_clnt res, *resp = NULL;
   7183 	REMOVE4res *rm_res;
   7184 	nfs_argop4 argop[3];
   7185 	nfs_resop4 *resop;
   7186 	vnode_t *vp;
   7187 	char *tmpname;
   7188 	int doqueue;
   7189 	mntinfo4_t *mi;
   7190 	rnode4_t *rp;
   7191 	rnode4_t *drp;
   7192 	int needrecov = 0;
   7193 	nfs4_recov_state_t recov_state;
   7194 	int isopen;
   7195 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   7196 	dirattr_info_t dinfo;
   7197 
   7198 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
   7199 		return (EPERM);
   7200 	drp = VTOR4(dvp);
   7201 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
   7202 		return (EINTR);
   7203 
   7204 	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
   7205 	if (e.error) {
   7206 		nfs_rw_exit(&drp->r_rwlock);
   7207 		return (e.error);
   7208 	}
   7209 
   7210 	if (vp->v_type == VDIR) {
   7211 		VN_RELE(vp);
   7212 		nfs_rw_exit(&drp->r_rwlock);
   7213 		return (EISDIR);
   7214 	}
   7215 
   7216 	/*
   7217 	 * First just remove the entry from the name cache, as it
   7218 	 * is most likely the only entry for this vp.
   7219 	 */
   7220 	dnlc_remove(dvp, nm);
   7221 
   7222 	rp = VTOR4(vp);
   7223 
   7224 	/*
   7225 	 * For regular file types, check to see if the file is open by looking
   7226 	 * at the open streams.
   7227 	 * For all other types, check the reference count on the vnode.  Since
   7228 	 * they are not opened OTW they never have an open stream.
   7229 	 *
   7230 	 * If the file is open, rename it to .nfsXXXX.
   7231 	 */
   7232 	if (vp->v_type != VREG) {
   7233 		/*
   7234 		 * If the file has a v_count > 1 then there may be more than one
   7235 		 * entry in the name cache due multiple links or an open file,
   7236 		 * but we don't have the real reference count so flush all
   7237 		 * possible entries.
   7238 		 */
   7239 		if (vp->v_count > 1)
   7240 			dnlc_purge_vp(vp);
   7241 
   7242 		/*
   7243 		 * Now we have the real reference count.
   7244 		 */
   7245 		isopen = vp->v_count > 1;
   7246 	} else {
   7247 		mutex_enter(&rp->r_os_lock);
   7248 		isopen = list_head(&rp->r_open_streams) != NULL;
   7249 		mutex_exit(&rp->r_os_lock);
   7250 	}
   7251 
   7252 	mutex_enter(&rp->r_statelock);
   7253 	if (isopen &&
   7254 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
   7255 		mutex_exit(&rp->r_statelock);
   7256 		tmpname = newname();
   7257 		e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct);
   7258 		if (e.error)
   7259 			kmem_free(tmpname, MAXNAMELEN);
   7260 		else {
   7261 			mutex_enter(&rp->r_statelock);
   7262 			if (rp->r_unldvp == NULL) {
   7263 				VN_HOLD(dvp);
   7264 				rp->r_unldvp = dvp;
   7265 				if (rp->r_unlcred != NULL)
   7266 					crfree(rp->r_unlcred);
   7267 				crhold(cr);
   7268 				rp->r_unlcred = cr;
   7269 				rp->r_unlname = tmpname;
   7270 			} else {
   7271 				kmem_free(rp->r_unlname, MAXNAMELEN);
   7272 				rp->r_unlname = tmpname;
   7273 			}
   7274 			mutex_exit(&rp->r_statelock);
   7275 		}
   7276 		VN_RELE(vp);
   7277 		nfs_rw_exit(&drp->r_rwlock);
   7278 		return (e.error);
   7279 	}
   7280 	/*
   7281 	 * Actually remove the file/dir
   7282 	 */
   7283 	mutex_exit(&rp->r_statelock);
   7284 
   7285 	/*
   7286 	 * We need to flush any dirty pages which happen to
   7287 	 * be hanging around before removing the file.
   7288 	 * This shouldn't happen very often since in NFSv4
   7289 	 * we should be close to open consistent.
   7290 	 */
   7291 	if (nfs4_has_pages(vp) &&
   7292 	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
   7293 		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct);
   7294 		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
   7295 			mutex_enter(&rp->r_statelock);
   7296 			if (!rp->r_error)
   7297 				rp->r_error = e.error;
   7298 			mutex_exit(&rp->r_statelock);
   7299 		}
   7300 	}
   7301 
   7302 	mi = VTOMI4(dvp);
   7303 
   7304 	(void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
   7305 	recov_state.rs_flags = 0;
   7306 	recov_state.rs_num_retry_despite_err = 0;
   7307 
   7308 recov_retry:
   7309 	/*
   7310 	 * Remove ops: putfh dir; remove
   7311 	 */
   7312 	args.ctag = TAG_REMOVE;
   7313 	args.array_len = 3;
   7314 	args.array = argop;
   7315 
   7316 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
   7317 	if (e.error) {
   7318 		nfs_rw_exit(&drp->r_rwlock);
   7319 		VN_RELE(vp);
   7320 		return (e.error);
   7321 	}
   7322 
   7323 	/* putfh directory */
   7324 	argop[0].argop = OP_CPUTFH;
   7325 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
   7326 
   7327 	/* remove */
   7328 	argop[1].argop = OP_CREMOVE;
   7329 	argop[1].nfs_argop4_u.opcremove.ctarget = nm;
   7330 
   7331 	/* getattr dir */
   7332 	argop[2].argop = OP_GETATTR;
   7333 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   7334 	argop[2].nfs_argop4_u.opgetattr.mi = mi;
   7335 
   7336 	doqueue = 1;
   7337 	dinfo.di_time_call = gethrtime();
   7338 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
   7339 
   7340 	PURGE_ATTRCACHE4(vp);
   7341 
   7342 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
   7343 	if (e.error)
   7344 		PURGE_ATTRCACHE4(dvp);
   7345 
   7346 	if (needrecov) {
   7347 		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
   7348 		    NULL, NULL, NULL, OP_REMOVE, NULL) == FALSE) {
   7349 			if (!e.error)
   7350 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   7351 				    (caddr_t)&res);
   7352 			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
   7353 			    needrecov);
   7354 			goto recov_retry;
   7355 		}
   7356 	}
   7357 
   7358 	/*
   7359 	 * Matching nfs4_end_op() for start_op() above.
   7360 	 * There is a path in the code below which calls
   7361 	 * nfs4_purge_stale_fh(), which may generate otw calls through
   7362 	 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
   7363 	 * here to avoid nfs4_start_op() deadlock.
   7364 	 */
   7365 	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
   7366 
   7367 	if (!e.error) {
   7368 		resp = &res;
   7369 
   7370 		if (res.status) {
   7371 			e.error = geterrno4(res.status);
   7372 			PURGE_ATTRCACHE4(dvp);
   7373 			nfs4_purge_stale_fh(e.error, dvp, cr);
   7374 		} else {
   7375 			resop = &res.array[1];	/* remove res */
   7376 			rm_res = &resop->nfs_resop4_u.opremove;
   7377 
   7378 			dinfo.di_garp =
   7379 			    &res.array[2].nfs_resop4_u.opgetattr.ga_res;
   7380 			dinfo.di_cred = cr;
   7381 
   7382 			/* Update directory attr, readdir and dnlc caches */
   7383 			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
   7384 			    &dinfo);
   7385 		}
   7386 	}
   7387 	nfs_rw_exit(&drp->r_rwlock);
   7388 	if (resp)
   7389 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
   7390 
   7391 	if (e.error == 0) {
   7392 		vnode_t *tvp;
   7393 		rnode4_t *trp;
   7394 		trp = VTOR4(vp);
   7395 		tvp = vp;
   7396 		if (IS_SHADOW(vp, trp))
   7397 			tvp = RTOV4(trp);
   7398 		vnevent_remove(tvp, dvp, nm, ct);
   7399 	}
   7400 	VN_RELE(vp);
   7401 	return (e.error);
   7402 }
   7403 
   7404 /*
   7405  * Link requires that the current fh be the target directory and the
   7406  * saved fh be the source fh. After the operation, the current fh is unchanged.
   7407  * Thus the compound op structure is:
   7408  *	PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, GETATTR(targetdir),
   7409  *	RESTOREFH, GETATTR(file)
   7410  */
   7411 /* ARGSUSED */
   7412 static int
   7413 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
   7414     caller_context_t *ct, int flags)
   7415 {
   7416 	COMPOUND4args_clnt args;
   7417 	COMPOUND4res_clnt res, *resp = NULL;
   7418 	LINK4res *ln_res;
   7419 	int argoplist_size  = 7 * sizeof (nfs_argop4);
   7420 	nfs_argop4 *argop;
   7421 	nfs_resop4 *resop;
   7422 	vnode_t *realvp, *nvp;
   7423 	int doqueue;
   7424 	mntinfo4_t *mi;
   7425 	rnode4_t *tdrp;
   7426 	bool_t needrecov = FALSE;
   7427 	nfs4_recov_state_t recov_state;
   7428 	hrtime_t t;
   7429 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   7430 	dirattr_info_t dinfo;
   7431 
   7432 	ASSERT(*tnm != '\0');
   7433 	ASSERT(tdvp->v_type == VDIR);
   7434 	ASSERT(nfs4_consistent_type(tdvp));
   7435 	ASSERT(nfs4_consistent_type(svp));
   7436 
   7437 	if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
   7438 		return (EPERM);
   7439 	if (VOP_REALVP(svp, &realvp, ct) == 0) {
   7440 		svp = realvp;
   7441 		ASSERT(nfs4_consistent_type(svp));
   7442 	}
   7443 
   7444 	tdrp = VTOR4(tdvp);
   7445 	mi = VTOMI4(svp);
   7446 
   7447 	if (!(mi->mi_flags & MI4_LINK)) {
   7448 		return (EOPNOTSUPP);
   7449 	}
   7450 	recov_state.rs_flags = 0;
   7451 	recov_state.rs_num_retry_despite_err = 0;
   7452 
   7453 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
   7454 		return (EINTR);
   7455 
   7456 recov_retry:
   7457 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
   7458 
   7459 	args.ctag = TAG_LINK;
   7460 
   7461 	/*
   7462 	 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
   7463 	 * restorefh; getattr(fl)
   7464 	 */
   7465 	args.array_len = 7;
   7466 	args.array = argop;
   7467 
   7468 	e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
   7469 	if (e.error) {
   7470 		kmem_free(argop, argoplist_size);
   7471 		nfs_rw_exit(&tdrp->r_rwlock);
   7472 		return (e.error);
   7473 	}
   7474 
   7475 	/* 0. putfh file */
   7476 	argop[0].argop = OP_CPUTFH;
   7477 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;
   7478 
   7479 	/* 1. save current fh to free up the space for the dir */
   7480 	argop[1].argop = OP_SAVEFH;
   7481 
   7482 	/* 2. putfh targetdir */
   7483 	argop[2].argop = OP_CPUTFH;
   7484 	argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;
   7485 
   7486 	/* 3. link: current_fh is targetdir, saved_fh is source */
   7487 	argop[3].argop = OP_CLINK;
   7488 	argop[3].nfs_argop4_u.opclink.cnewname = tnm;
   7489 
   7490 	/* 4. Get attributes of dir */
   7491 	argop[4].argop = OP_GETATTR;
   7492 	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   7493 	argop[4].nfs_argop4_u.opgetattr.mi = mi;
   7494 
   7495 	/* 5. If link was successful, restore current vp to file */
   7496 	argop[5].argop = OP_RESTOREFH;
   7497 
   7498 	/* 6. Get attributes of linked object */
   7499 	argop[6].argop = OP_GETATTR;
   7500 	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
   7501 	argop[6].nfs_argop4_u.opgetattr.mi = mi;
   7502 
   7503 	dnlc_remove(tdvp, tnm);
   7504 
   7505 	doqueue = 1;
   7506 	t = gethrtime();
   7507 
   7508 	rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);
   7509 
   7510 	needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
   7511 	if (e.error != 0 && !needrecov) {
   7512 		PURGE_ATTRCACHE4(tdvp);
   7513 		PURGE_ATTRCACHE4(svp);
   7514 		nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
   7515 		goto out;
   7516 	}
   7517 
   7518 	if (needrecov) {
   7519 		bool_t abort;
   7520 
   7521 		abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
   7522 		    NULL, NULL, OP_LINK, NULL);
   7523 		if (abort == FALSE) {
   7524 			nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
   7525 			    needrecov);
   7526 			kmem_free(argop, argoplist_size);
   7527 			if (!e.error)
   7528 				(void) xdr_free(xdr_COMPOUND4res_clnt,
   7529 				    (caddr_t)&res);
   7530 			goto recov_retry;
   7531 		} else {
   7532 			if (e.error != 0) {
   7533 				PURGE_ATTRCACHE4(tdvp);
   7534 				PURGE_ATTRCACHE4(svp);
   7535 				nfs4_end_op(VTOMI4(svp), svp, tdvp,
   7536 				    &recov_state, needrecov);
   7537 				goto out;
   7538 			}
   7539 			/* fall through for res.status case */
   7540 		}
   7541 	}
   7542 
   7543 	nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
   7544 
   7545 	resp = &res;
   7546 	if (res.status) {
   7547 		/* If link succeeded, then don't return error */
   7548 		e.error = geterrno4(res.status);
   7549 		if (res.array_len <= 4) {
   7550 			/*
   7551 			 * Either Putfh, Savefh, Putfh dir, or Link failed
   7552 			 */
   7553 			PURGE_ATTRCACHE4(svp);
   7554 			PURGE_ATTRCACHE4(tdvp);
   7555 			if (e.error == EOPNOTSUPP) {
   7556 				mutex_enter(&mi->mi_lock);
   7557 				mi->mi_flags &= ~MI4_LINK;
   7558