Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  *
     25  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
     26  *	All rights reserved.
     27  */
     28 
     29 #include <sys/param.h>
     30 #include <sys/types.h>
     31 #include <sys/systm.h>
     32 #include <sys/cred.h>
     33 #include <sys/time.h>
     34 #include <sys/vnode.h>
     35 #include <sys/vfs.h>
     36 #include <sys/vfs_opreg.h>
     37 #include <sys/file.h>
     38 #include <sys/filio.h>
     39 #include <sys/uio.h>
     40 #include <sys/buf.h>
     41 #include <sys/mman.h>
     42 #include <sys/pathname.h>
     43 #include <sys/dirent.h>
     44 #include <sys/debug.h>
     45 #include <sys/vmsystm.h>
     46 #include <sys/fcntl.h>
     47 #include <sys/flock.h>
     48 #include <sys/swap.h>
     49 #include <sys/errno.h>
     50 #include <sys/strsubr.h>
     51 #include <sys/sysmacros.h>
     52 #include <sys/kmem.h>
     53 #include <sys/cmn_err.h>
     54 #include <sys/pathconf.h>
     55 #include <sys/utsname.h>
     56 #include <sys/dnlc.h>
     57 #include <sys/acl.h>
     58 #include <sys/atomic.h>
     59 #include <sys/policy.h>
     60 #include <sys/sdt.h>
     61 
     62 #include <rpc/types.h>
     63 #include <rpc/auth.h>
     64 #include <rpc/clnt.h>
     65 
     66 #include <nfs/nfs.h>
     67 #include <nfs/nfs_clnt.h>
     68 #include <nfs/rnode.h>
     69 #include <nfs/nfs_acl.h>
     70 #include <nfs/lm.h>
     71 
     72 #include <vm/hat.h>
     73 #include <vm/as.h>
     74 #include <vm/page.h>
     75 #include <vm/pvn.h>
     76 #include <vm/seg.h>
     77 #include <vm/seg_map.h>
     78 #include <vm/seg_kpm.h>
     79 #include <vm/seg_vn.h>
     80 
     81 #include <fs/fs_subr.h>
     82 
     83 #include <sys/ddi.h>
     84 
     85 static int	nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
     86 			cred_t *);
     87 static int	nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
     88 static int	nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
     89 static int	nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
     90 static int	nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
     91 static int	nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
     92 static int	nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *,
     93 			caller_context_t *);
     94 static int	nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
     95 static int	nfs_bio(struct buf *, cred_t *);
     96 static int	nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
     97 			page_t *[], size_t, struct seg *, caddr_t,
     98 			enum seg_rw, cred_t *);
     99 static void	nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
    100 			cred_t *);
    101 static int	nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
    102 			int, cred_t *);
    103 static int	nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
    104 			int, cred_t *);
    105 static void	nfs_delmap_callback(struct as *, void *, uint_t);
    106 
    107 /*
    108  * Error flags used to pass information about certain special errors
    109  * which need to be handled specially.
    110  */
    111 #define	NFS_EOF			-98
    112 
/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then call the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 * more details on rnode locking.
 */
    124 
    125 static int	nfs_open(vnode_t **, int, cred_t *, caller_context_t *);
    126 static int	nfs_close(vnode_t *, int, int, offset_t, cred_t *,
    127 			caller_context_t *);
    128 static int	nfs_read(vnode_t *, struct uio *, int, cred_t *,
    129 			caller_context_t *);
    130 static int	nfs_write(vnode_t *, struct uio *, int, cred_t *,
    131 			caller_context_t *);
    132 static int	nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
    133 			caller_context_t *);
    134 static int	nfs_getattr(vnode_t *, struct vattr *, int, cred_t *,
    135 			caller_context_t *);
    136 static int	nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
    137 			caller_context_t *);
    138 static int	nfs_access(vnode_t *, int, int, cred_t *, caller_context_t *);
    139 static int	nfs_accessx(void *, int, cred_t *);
    140 static int	nfs_readlink(vnode_t *, struct uio *, cred_t *,
    141 			caller_context_t *);
    142 static int	nfs_fsync(vnode_t *, int, cred_t *, caller_context_t *);
    143 static void	nfs_inactive(vnode_t *, cred_t *, caller_context_t *);
    144 static int	nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
    145 			int, vnode_t *, cred_t *, caller_context_t *,
    146 			int *, pathname_t *);
    147 static int	nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
    148 			int, vnode_t **, cred_t *, int, caller_context_t *,
    149 			vsecattr_t *);
    150 static int	nfs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
    151 			int);
    152 static int	nfs_link(vnode_t *, vnode_t *, char *, cred_t *,
    153 			caller_context_t *, int);
    154 static int	nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
    155 			caller_context_t *, int);
    156 static int	nfs_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
    157 			cred_t *, caller_context_t *, int, vsecattr_t *);
    158 static int	nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
    159 			caller_context_t *, int);
    160 static int	nfs_symlink(vnode_t *, char *, struct vattr *, char *,
    161 			cred_t *, caller_context_t *, int);
    162 static int	nfs_readdir(vnode_t *, struct uio *, cred_t *, int *,
    163 			caller_context_t *, int);
    164 static int	nfs_fid(vnode_t *, fid_t *, caller_context_t *);
    165 static int	nfs_rwlock(vnode_t *, int, caller_context_t *);
    166 static void	nfs_rwunlock(vnode_t *, int, caller_context_t *);
    167 static int	nfs_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
    168 static int	nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
    169 			page_t *[], size_t, struct seg *, caddr_t,
    170 			enum seg_rw, cred_t *, caller_context_t *);
    171 static int	nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
    172 			caller_context_t *);
    173 static int	nfs_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
    174 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
    175 static int	nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
    176 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
    177 static int	nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
    178 			struct flk_callback *, cred_t *, caller_context_t *);
    179 static int	nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
    180 			cred_t *, caller_context_t *);
    181 static int	nfs_realvp(vnode_t *, vnode_t **, caller_context_t *);
    182 static int	nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
    183 			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
    184 static int	nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *,
    185 			caller_context_t *);
    186 static int	nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
    187 			cred_t *, caller_context_t *);
    188 static int	nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
    189 			caller_context_t *);
    190 static int	nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
    191 			caller_context_t *);
    192 static int	nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
    193 			caller_context_t *);
    194 
    195 struct vnodeops *nfs_vnodeops;
    196 
    197 const fs_operation_def_t nfs_vnodeops_template[] = {
    198 	VOPNAME_OPEN,		{ .vop_open = nfs_open },
    199 	VOPNAME_CLOSE,		{ .vop_close = nfs_close },
    200 	VOPNAME_READ,		{ .vop_read = nfs_read },
    201 	VOPNAME_WRITE,		{ .vop_write = nfs_write },
    202 	VOPNAME_IOCTL,		{ .vop_ioctl = nfs_ioctl },
    203 	VOPNAME_GETATTR,	{ .vop_getattr = nfs_getattr },
    204 	VOPNAME_SETATTR,	{ .vop_setattr = nfs_setattr },
    205 	VOPNAME_ACCESS,		{ .vop_access = nfs_access },
    206 	VOPNAME_LOOKUP,		{ .vop_lookup = nfs_lookup },
    207 	VOPNAME_CREATE,		{ .vop_create = nfs_create },
    208 	VOPNAME_REMOVE,		{ .vop_remove = nfs_remove },
    209 	VOPNAME_LINK,		{ .vop_link = nfs_link },
    210 	VOPNAME_RENAME,		{ .vop_rename = nfs_rename },
    211 	VOPNAME_MKDIR,		{ .vop_mkdir = nfs_mkdir },
    212 	VOPNAME_RMDIR,		{ .vop_rmdir = nfs_rmdir },
    213 	VOPNAME_READDIR,	{ .vop_readdir = nfs_readdir },
    214 	VOPNAME_SYMLINK,	{ .vop_symlink = nfs_symlink },
    215 	VOPNAME_READLINK,	{ .vop_readlink = nfs_readlink },
    216 	VOPNAME_FSYNC,		{ .vop_fsync = nfs_fsync },
    217 	VOPNAME_INACTIVE,	{ .vop_inactive = nfs_inactive },
    218 	VOPNAME_FID,		{ .vop_fid = nfs_fid },
    219 	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs_rwlock },
    220 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs_rwunlock },
    221 	VOPNAME_SEEK,		{ .vop_seek = nfs_seek },
    222 	VOPNAME_FRLOCK,		{ .vop_frlock = nfs_frlock },
    223 	VOPNAME_SPACE,		{ .vop_space = nfs_space },
    224 	VOPNAME_REALVP,		{ .vop_realvp = nfs_realvp },
    225 	VOPNAME_GETPAGE,	{ .vop_getpage = nfs_getpage },
    226 	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs_putpage },
    227 	VOPNAME_MAP,		{ .vop_map = nfs_map },
    228 	VOPNAME_ADDMAP,		{ .vop_addmap = nfs_addmap },
    229 	VOPNAME_DELMAP,		{ .vop_delmap = nfs_delmap },
    230 	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
    231 	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs_pathconf },
    232 	VOPNAME_PAGEIO,		{ .vop_pageio = nfs_pageio },
    233 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs_setsecattr },
    234 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs_getsecattr },
    235 	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs_shrlock },
    236 	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
    237 	NULL,			NULL
    238 };
    239 
    240 /*
    241  * XXX:  This is referenced in modstubs.s
    242  */
    243 struct vnodeops *
    244 nfs_getvnodeops(void)
    245 {
    246 	return (nfs_vnodeops);
    247 }
    248 
    249 /* ARGSUSED */
    250 static int
    251 nfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
    252 {
    253 	int error;
    254 	struct vattr va;
    255 	rnode_t *rp;
    256 	vnode_t *vp;
    257 
    258 	vp = *vpp;
    259 	rp = VTOR(vp);
    260 	if (nfs_zone() != VTOMI(vp)->mi_zone)
    261 		return (EIO);
    262 	mutex_enter(&rp->r_statelock);
    263 	if (rp->r_cred == NULL) {
    264 		crhold(cr);
    265 		rp->r_cred = cr;
    266 	}
    267 	mutex_exit(&rp->r_statelock);
    268 
    269 	/*
    270 	 * If there is no cached data or if close-to-open
    271 	 * consistency checking is turned off, we can avoid
    272 	 * the over the wire getattr.  Otherwise, if the
    273 	 * file system is mounted readonly, then just verify
    274 	 * the caches are up to date using the normal mechanism.
    275 	 * Else, if the file is not mmap'd, then just mark
    276 	 * the attributes as timed out.  They will be refreshed
    277 	 * and the caches validated prior to being used.
    278 	 * Else, the file system is mounted writeable so
    279 	 * force an over the wire GETATTR in order to ensure
    280 	 * that all cached data is valid.
    281 	 */
    282 	if (vp->v_count > 1 ||
    283 	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
    284 	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
    285 		if (vn_is_readonly(vp))
    286 			error = nfs_validate_caches(vp, cr);
    287 		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
    288 			PURGE_ATTRCACHE(vp);
    289 			error = 0;
    290 		} else {
    291 			va.va_mask = AT_ALL;
    292 			error = nfs_getattr_otw(vp, &va, cr);
    293 		}
    294 	} else
    295 		error = 0;
    296 
    297 	return (error);
    298 }
    299 
    300 /* ARGSUSED */
    301 static int
    302 nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    303 	caller_context_t *ct)
    304 {
    305 	rnode_t *rp;
    306 	int error;
    307 	struct vattr va;
    308 
    309 	/*
    310 	 * zone_enter(2) prevents processes from changing zones with NFS files
    311 	 * open; if we happen to get here from the wrong zone we can't do
    312 	 * anything over the wire.
    313 	 */
    314 	if (VTOMI(vp)->mi_zone != nfs_zone()) {
    315 		/*
    316 		 * We could attempt to clean up locks, except we're sure
    317 		 * that the current process didn't acquire any locks on
    318 		 * the file: any attempt to lock a file belong to another zone
    319 		 * will fail, and one can't lock an NFS file and then change
    320 		 * zones, as that fails too.
    321 		 *
    322 		 * Returning an error here is the sane thing to do.  A
    323 		 * subsequent call to VN_RELE() which translates to a
    324 		 * nfs_inactive() will clean up state: if the zone of the
    325 		 * vnode's origin is still alive and kicking, an async worker
    326 		 * thread will handle the request (from the correct zone), and
    327 		 * everything (minus the final nfs_getattr_otw() call) should
    328 		 * be OK. If the zone is going away nfs_async_inactive() will
    329 		 * throw away cached pages inline.
    330 		 */
    331 		return (EIO);
    332 	}
    333 
    334 	/*
    335 	 * If we are using local locking for this filesystem, then
    336 	 * release all of the SYSV style record locks.  Otherwise,
    337 	 * we are doing network locking and we need to release all
    338 	 * of the network locks.  All of the locks held by this
    339 	 * process on this file are released no matter what the
    340 	 * incoming reference count is.
    341 	 */
    342 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
    343 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
    344 		cleanshares(vp, ttoproc(curthread)->p_pid);
    345 	} else
    346 		nfs_lockrelease(vp, flag, offset, cr);
    347 
    348 	if (count > 1)
    349 		return (0);
    350 
    351 	/*
    352 	 * If the file has been `unlinked', then purge the
    353 	 * DNLC so that this vnode will get reycled quicker
    354 	 * and the .nfs* file on the server will get removed.
    355 	 */
    356 	rp = VTOR(vp);
    357 	if (rp->r_unldvp != NULL)
    358 		dnlc_purge_vp(vp);
    359 
    360 	/*
    361 	 * If the file was open for write and there are pages,
    362 	 * then if the file system was mounted using the "no-close-
    363 	 *	to-open" semantics, then start an asynchronous flush
    364 	 *	of the all of the pages in the file.
    365 	 * else the file system was not mounted using the "no-close-
    366 	 *	to-open" semantics, then do a synchronous flush and
    367 	 *	commit of all of the dirty and uncommitted pages.
    368 	 *
    369 	 * The asynchronous flush of the pages in the "nocto" path
    370 	 * mostly just associates a cred pointer with the rnode so
    371 	 * writes which happen later will have a better chance of
    372 	 * working.  It also starts the data being written to the
    373 	 * server, but without unnecessarily delaying the application.
    374 	 */
    375 	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
    376 		if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
    377 			error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC,
    378 			    cr, ct);
    379 			if (error == EAGAIN)
    380 				error = 0;
    381 		} else
    382 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
    383 		if (!error) {
    384 			mutex_enter(&rp->r_statelock);
    385 			error = rp->r_error;
    386 			rp->r_error = 0;
    387 			mutex_exit(&rp->r_statelock);
    388 		}
    389 	} else {
    390 		mutex_enter(&rp->r_statelock);
    391 		error = rp->r_error;
    392 		rp->r_error = 0;
    393 		mutex_exit(&rp->r_statelock);
    394 	}
    395 
    396 	/*
    397 	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
    398 	 * refresh the attribute cache with a set of attributes which
    399 	 * weren't returned from a WRITE.  This will enable the close-
    400 	 * to-open processing to work.
    401 	 */
    402 	if (rp->r_flags & RWRITEATTR)
    403 		(void) nfs_getattr_otw(vp, &va, cr);
    404 
    405 	return (error);
    406 }
    407 
    408 /* ARGSUSED */
    409 static int
    410 nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
    411 	caller_context_t *ct)
    412 {
    413 	rnode_t *rp;
    414 	u_offset_t off;
    415 	offset_t diff;
    416 	int on;
    417 	size_t n;
    418 	caddr_t base;
    419 	uint_t flags;
    420 	int error;
    421 	mntinfo_t *mi;
    422 
    423 	rp = VTOR(vp);
    424 	mi = VTOMI(vp);
    425 
    426 	if (nfs_zone() != mi->mi_zone)
    427 		return (EIO);
    428 
    429 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
    430 
    431 	if (vp->v_type != VREG)
    432 		return (EISDIR);
    433 
    434 	if (uiop->uio_resid == 0)
    435 		return (0);
    436 
    437 	if (uiop->uio_loffset > MAXOFF32_T)
    438 		return (EFBIG);
    439 
    440 	if (uiop->uio_loffset < 0 ||
    441 	    uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
    442 		return (EINVAL);
    443 
    444 	/*
    445 	 * Bypass VM if caching has been disabled (e.g., locking) or if
    446 	 * using client-side direct I/O and the file is not mmap'd and
    447 	 * there are no cached pages.
    448 	 */
    449 	if ((vp->v_flag & VNOCACHE) ||
    450 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
    451 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
    452 	    !vn_has_cached_data(vp))) {
    453 		size_t bufsize;
    454 		size_t resid = 0;
    455 
    456 		/*
    457 		 * Let's try to do read in as large a chunk as we can
    458 		 * (Filesystem (NFS client) bsize if possible/needed).
    459 		 * For V3, this is 32K and for V2, this is 8K.
    460 		 */
    461 		bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
    462 		base = kmem_alloc(bufsize, KM_SLEEP);
    463 		do {
    464 			n = MIN(uiop->uio_resid, bufsize);
    465 			error = nfsread(vp, base, uiop->uio_offset, n,
    466 			    &resid, cr);
    467 			if (!error) {
    468 				n -= resid;
    469 				error = uiomove(base, n, UIO_READ, uiop);
    470 			}
    471 		} while (!error && uiop->uio_resid > 0 && n > 0);
    472 		kmem_free(base, bufsize);
    473 		return (error);
    474 	}
    475 
    476 	error = 0;
    477 
    478 	do {
    479 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
    480 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
    481 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
    482 
    483 		error = nfs_validate_caches(vp, cr);
    484 		if (error)
    485 			break;
    486 
    487 		mutex_enter(&rp->r_statelock);
    488 		while (rp->r_flags & RINCACHEPURGE) {
    489 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
    490 				mutex_exit(&rp->r_statelock);
    491 				return (EINTR);
    492 			}
    493 		}
    494 		diff = rp->r_size - uiop->uio_loffset;
    495 		mutex_exit(&rp->r_statelock);
    496 		if (diff <= 0)
    497 			break;
    498 		if (diff < n)
    499 			n = (size_t)diff;
    500 
    501 		if (vpm_enable) {
    502 			/*
    503 			 * Copy data.
    504 			 */
    505 			error = vpm_data_copy(vp, off + on, n, uiop,
    506 			    1, NULL, 0, S_READ);
    507 		} else {
    508 			base = segmap_getmapflt(segkmap, vp, off + on, n,
    509 			    1, S_READ);
    510 			error = uiomove(base + on, n, UIO_READ, uiop);
    511 		}
    512 
    513 		if (!error) {
    514 			/*
    515 			 * If read a whole block or read to eof,
    516 			 * won't need this buffer again soon.
    517 			 */
    518 			mutex_enter(&rp->r_statelock);
    519 			if (n + on == MAXBSIZE ||
    520 			    uiop->uio_loffset == rp->r_size)
    521 				flags = SM_DONTNEED;
    522 			else
    523 				flags = 0;
    524 			mutex_exit(&rp->r_statelock);
    525 			if (vpm_enable) {
    526 				error = vpm_sync_pages(vp, off, n, flags);
    527 			} else {
    528 				error = segmap_release(segkmap, base, flags);
    529 			}
    530 		} else {
    531 			if (vpm_enable) {
    532 				(void) vpm_sync_pages(vp, off, n, 0);
    533 			} else {
    534 				(void) segmap_release(segkmap, base, 0);
    535 			}
    536 		}
    537 	} while (!error && uiop->uio_resid > 0);
    538 
    539 	return (error);
    540 }
    541 
    542 /* ARGSUSED */
    543 static int
    544 nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
    545 	caller_context_t *ct)
    546 {
    547 	rnode_t *rp;
    548 	u_offset_t off;
    549 	caddr_t base;
    550 	uint_t flags;
    551 	int remainder;
    552 	size_t n;
    553 	int on;
    554 	int error;
    555 	int resid;
    556 	offset_t offset;
    557 	rlim_t limit;
    558 	mntinfo_t *mi;
    559 
    560 	rp = VTOR(vp);
    561 
    562 	mi = VTOMI(vp);
    563 	if (nfs_zone() != mi->mi_zone)
    564 		return (EIO);
    565 	if (vp->v_type != VREG)
    566 		return (EISDIR);
    567 
    568 	if (uiop->uio_resid == 0)
    569 		return (0);
    570 
    571 	if (ioflag & FAPPEND) {
    572 		struct vattr va;
    573 
    574 		/*
    575 		 * Must serialize if appending.
    576 		 */
    577 		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
    578 			nfs_rw_exit(&rp->r_rwlock);
    579 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
    580 			    INTR(vp)))
    581 				return (EINTR);
    582 		}
    583 
    584 		va.va_mask = AT_SIZE;
    585 		error = nfsgetattr(vp, &va, cr);
    586 		if (error)
    587 			return (error);
    588 		uiop->uio_loffset = va.va_size;
    589 	}
    590 
    591 	if (uiop->uio_loffset > MAXOFF32_T)
    592 		return (EFBIG);
    593 
    594 	offset = uiop->uio_loffset + uiop->uio_resid;
    595 
    596 	if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
    597 		return (EINVAL);
    598 
    599 	if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
    600 		limit = MAXOFF32_T;
    601 	} else {
    602 		limit = (rlim_t)uiop->uio_llimit;
    603 	}
    604 
    605 	/*
    606 	 * Check to make sure that the process will not exceed
    607 	 * its limit on file size.  It is okay to write up to
    608 	 * the limit, but not beyond.  Thus, the write which
    609 	 * reaches the limit will be short and the next write
    610 	 * will return an error.
    611 	 */
    612 	remainder = 0;
    613 	if (offset > limit) {
    614 		remainder = offset - limit;
    615 		uiop->uio_resid = limit - uiop->uio_offset;
    616 		if (uiop->uio_resid <= 0) {
    617 			proc_t *p = ttoproc(curthread);
    618 
    619 			uiop->uio_resid += remainder;
    620 			mutex_enter(&p->p_lock);
    621 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
    622 			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
    623 			mutex_exit(&p->p_lock);
    624 			return (EFBIG);
    625 		}
    626 	}
    627 
    628 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
    629 		return (EINTR);
    630 
    631 	/*
    632 	 * Bypass VM if caching has been disabled (e.g., locking) or if
    633 	 * using client-side direct I/O and the file is not mmap'd and
    634 	 * there are no cached pages.
    635 	 */
    636 	if ((vp->v_flag & VNOCACHE) ||
    637 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
    638 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
    639 	    !vn_has_cached_data(vp))) {
    640 		size_t bufsize;
    641 		int count;
    642 		uint_t org_offset;
    643 
    644 nfs_fwrite:
    645 		if (rp->r_flags & RSTALE) {
    646 			resid = uiop->uio_resid;
    647 			offset = uiop->uio_loffset;
    648 			error = rp->r_error;
    649 			/*
    650 			 * A close may have cleared r_error, if so,
    651 			 * propagate ESTALE error return properly
    652 			 */
    653 			if (error == 0)
    654 				error = ESTALE;
    655 			goto bottom;
    656 		}
    657 		bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
    658 		base = kmem_alloc(bufsize, KM_SLEEP);
    659 		do {
    660 			resid = uiop->uio_resid;
    661 			offset = uiop->uio_loffset;
    662 			count = MIN(uiop->uio_resid, bufsize);
    663 			org_offset = uiop->uio_offset;
    664 			error = uiomove(base, count, UIO_WRITE, uiop);
    665 			if (!error) {
    666 				error = nfswrite(vp, base, org_offset,
    667 				    count, cr);
    668 			}
    669 		} while (!error && uiop->uio_resid > 0);
    670 		kmem_free(base, bufsize);
    671 		goto bottom;
    672 	}
    673 
    674 	do {
    675 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
    676 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
    677 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
    678 
    679 		resid = uiop->uio_resid;
    680 		offset = uiop->uio_loffset;
    681 
    682 		if (rp->r_flags & RSTALE) {
    683 			error = rp->r_error;
    684 			/*
    685 			 * A close may have cleared r_error, if so,
    686 			 * propagate ESTALE error return properly
    687 			 */
    688 			if (error == 0)
    689 				error = ESTALE;
    690 			break;
    691 		}
    692 
    693 		/*
    694 		 * Don't create dirty pages faster than they
    695 		 * can be cleaned so that the system doesn't
    696 		 * get imbalanced.  If the async queue is
    697 		 * maxed out, then wait for it to drain before
    698 		 * creating more dirty pages.  Also, wait for
    699 		 * any threads doing pagewalks in the vop_getattr
    700 		 * entry points so that they don't block for
    701 		 * long periods.
    702 		 */
    703 		mutex_enter(&rp->r_statelock);
    704 		while ((mi->mi_max_threads != 0 &&
    705 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
    706 		    rp->r_gcount > 0) {
    707 			if (INTR(vp)) {
    708 				klwp_t *lwp = ttolwp(curthread);
    709 
    710 				if (lwp != NULL)
    711 					lwp->lwp_nostop++;
    712 				if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
    713 					mutex_exit(&rp->r_statelock);
    714 					if (lwp != NULL)
    715 						lwp->lwp_nostop--;
    716 					error = EINTR;
    717 					goto bottom;
    718 				}
    719 				if (lwp != NULL)
    720 					lwp->lwp_nostop--;
    721 			} else
    722 				cv_wait(&rp->r_cv, &rp->r_statelock);
    723 		}
    724 		mutex_exit(&rp->r_statelock);
    725 
    726 		/*
    727 		 * Touch the page and fault it in if it is not in core
    728 		 * before segmap_getmapflt or vpm_data_copy can lock it.
    729 		 * This is to avoid the deadlock if the buffer is mapped
    730 		 * to the same file through mmap which we want to write.
    731 		 */
    732 		uio_prefaultpages((long)n, uiop);
    733 
    734 		if (vpm_enable) {
    735 			/*
    736 			 * It will use kpm mappings, so no need to
    737 			 * pass an address.
    738 			 */
    739 			error = writerp(rp, NULL, n, uiop, 0);
    740 		} else  {
    741 			if (segmap_kpm) {
    742 				int pon = uiop->uio_loffset & PAGEOFFSET;
    743 				size_t pn = MIN(PAGESIZE - pon,
    744 				    uiop->uio_resid);
    745 				int pagecreate;
    746 
    747 				mutex_enter(&rp->r_statelock);
    748 				pagecreate = (pon == 0) && (pn == PAGESIZE ||
    749 				    uiop->uio_loffset + pn >= rp->r_size);
    750 				mutex_exit(&rp->r_statelock);
    751 
    752 				base = segmap_getmapflt(segkmap, vp, off + on,
    753 				    pn, !pagecreate, S_WRITE);
    754 
    755 				error = writerp(rp, base + pon, n, uiop,
    756 				    pagecreate);
    757 
    758 			} else {
    759 				base = segmap_getmapflt(segkmap, vp, off + on,
    760 				    n, 0, S_READ);
    761 				error = writerp(rp, base + on, n, uiop, 0);
    762 			}
    763 		}
    764 
    765 		if (!error) {
    766 			if (mi->mi_flags & MI_NOAC)
    767 				flags = SM_WRITE;
    768 			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
    769 				/*
    770 				 * Have written a whole block.
    771 				 * Start an asynchronous write
    772 				 * and mark the buffer to
    773 				 * indicate that it won't be
    774 				 * needed again soon.
    775 				 */
    776 				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
    777 			} else
    778 				flags = 0;
    779 			if ((ioflag & (FSYNC|FDSYNC)) ||
    780 			    (rp->r_flags & ROUTOFSPACE)) {
    781 				flags &= ~SM_ASYNC;
    782 				flags |= SM_WRITE;
    783 			}
    784 			if (vpm_enable) {
    785 				error = vpm_sync_pages(vp, off, n, flags);
    786 			} else {
    787 				error = segmap_release(segkmap, base, flags);
    788 			}
    789 		} else {
    790 			if (vpm_enable) {
    791 				(void) vpm_sync_pages(vp, off, n, 0);
    792 			} else {
    793 				(void) segmap_release(segkmap, base, 0);
    794 			}
    795 			/*
    796 			 * In the event that we got an access error while
    797 			 * faulting in a page for a write-only file just
    798 			 * force a write.
    799 			 */
    800 			if (error == EACCES)
    801 				goto nfs_fwrite;
    802 		}
    803 	} while (!error && uiop->uio_resid > 0);
    804 
    805 bottom:
    806 	if (error) {
    807 		uiop->uio_resid = resid + remainder;
    808 		uiop->uio_loffset = offset;
    809 	} else
    810 		uiop->uio_resid += remainder;
    811 
    812 	nfs_rw_exit(&rp->r_lkserlock);
    813 
    814 	return (error);
    815 }
    816 
    817 /*
    818  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
    819  */
    820 static int
    821 nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    822 	int flags, cred_t *cr)
    823 {
    824 	struct buf *bp;
    825 	int error;
    826 
    827 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
    828 	bp = pageio_setup(pp, len, vp, flags);
    829 	ASSERT(bp != NULL);
    830 
    831 	/*
    832 	 * pageio_setup should have set b_addr to 0.  This
    833 	 * is correct since we want to do I/O on a page
    834 	 * boundary.  bp_mapin will use this addr to calculate
    835 	 * an offset, and then set b_addr to the kernel virtual
    836 	 * address it allocated for us.
    837 	 */
    838 	ASSERT(bp->b_un.b_addr == 0);
    839 
    840 	bp->b_edev = 0;
    841 	bp->b_dev = 0;
    842 	bp->b_lblkno = lbtodb(off);
    843 	bp->b_file = vp;
    844 	bp->b_offset = (offset_t)off;
    845 	bp_mapin(bp);
    846 
    847 	error = nfs_bio(bp, cr);
    848 
    849 	bp_mapout(bp);
    850 	pageio_done(bp);
    851 
    852 	return (error);
    853 }
    854 
    855 /*
    856  * Write to file.  Writes to remote server in largest size
    857  * chunks that the server can handle.  Write is synchronous.
    858  */
    859 static int
    860 nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
    861 {
    862 	rnode_t *rp;
    863 	mntinfo_t *mi;
    864 	struct nfswriteargs wa;
    865 	struct nfsattrstat ns;
    866 	int error;
    867 	int tsize;
    868 	int douprintf;
    869 
    870 	douprintf = 1;
    871 
    872 	rp = VTOR(vp);
    873 	mi = VTOMI(vp);
    874 
    875 	ASSERT(nfs_zone() == mi->mi_zone);
    876 
    877 	wa.wa_args = &wa.wa_args_buf;
    878 	wa.wa_fhandle = *VTOFH(vp);
    879 
    880 	do {
    881 		tsize = MIN(mi->mi_curwrite, count);
    882 		wa.wa_data = base;
    883 		wa.wa_begoff = offset;
    884 		wa.wa_totcount = tsize;
    885 		wa.wa_count = tsize;
    886 		wa.wa_offset = offset;
    887 
    888 		if (mi->mi_io_kstats) {
    889 			mutex_enter(&mi->mi_lock);
    890 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
    891 			mutex_exit(&mi->mi_lock);
    892 		}
    893 		wa.wa_mblk = NULL;
    894 		do {
    895 			error = rfs2call(mi, RFS_WRITE,
    896 			    xdr_writeargs, (caddr_t)&wa,
    897 			    xdr_attrstat, (caddr_t)&ns, cr,
    898 			    &douprintf, &ns.ns_status, 0, NULL);
    899 		} while (error == ENFS_TRYAGAIN);
    900 		if (mi->mi_io_kstats) {
    901 			mutex_enter(&mi->mi_lock);
    902 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
    903 			mutex_exit(&mi->mi_lock);
    904 		}
    905 
    906 		if (!error) {
    907 			error = geterrno(ns.ns_status);
    908 			/*
    909 			 * Can't check for stale fhandle and purge caches
    910 			 * here because pages are held by nfs_getpage.
    911 			 * Just mark the attribute cache as timed out
    912 			 * and set RWRITEATTR to indicate that the file
    913 			 * was modified with a WRITE operation.
    914 			 */
    915 			if (!error) {
    916 				count -= tsize;
    917 				base += tsize;
    918 				offset += tsize;
    919 				if (mi->mi_io_kstats) {
    920 					mutex_enter(&mi->mi_lock);
    921 					KSTAT_IO_PTR(mi->mi_io_kstats)->
    922 					    writes++;
    923 					KSTAT_IO_PTR(mi->mi_io_kstats)->
    924 					    nwritten += tsize;
    925 					mutex_exit(&mi->mi_lock);
    926 				}
    927 				lwp_stat_update(LWP_STAT_OUBLK, 1);
    928 				mutex_enter(&rp->r_statelock);
    929 				PURGE_ATTRCACHE_LOCKED(rp);
    930 				rp->r_flags |= RWRITEATTR;
    931 				mutex_exit(&rp->r_statelock);
    932 			}
    933 		}
    934 	} while (!error && count);
    935 
    936 	return (error);
    937 }
    938 
    939 /*
    940  * Read from a file.  Reads data in largest chunks our interface can handle.
    941  */
    942 static int
    943 nfsread(vnode_t *vp, caddr_t base, uint_t offset,
    944     int count, size_t *residp, cred_t *cr)
    945 {
    946 	mntinfo_t *mi;
    947 	struct nfsreadargs ra;
    948 	struct nfsrdresult rr;
    949 	int tsize;
    950 	int error;
    951 	int douprintf;
    952 	failinfo_t fi;
    953 	rnode_t *rp;
    954 	struct vattr va;
    955 	hrtime_t t;
    956 
    957 	rp = VTOR(vp);
    958 	mi = VTOMI(vp);
    959 
    960 	ASSERT(nfs_zone() == mi->mi_zone);
    961 
    962 	douprintf = 1;
    963 
    964 	ra.ra_fhandle = *VTOFH(vp);
    965 
    966 	fi.vp = vp;
    967 	fi.fhp = (caddr_t)&ra.ra_fhandle;
    968 	fi.copyproc = nfscopyfh;
    969 	fi.lookupproc = nfslookup;
    970 	fi.xattrdirproc = acl_getxattrdir2;
    971 
    972 	do {
    973 		if (mi->mi_io_kstats) {
    974 			mutex_enter(&mi->mi_lock);
    975 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
    976 			mutex_exit(&mi->mi_lock);
    977 		}
    978 
    979 		do {
    980 			tsize = MIN(mi->mi_curread, count);
    981 			rr.rr_data = base;
    982 			ra.ra_offset = offset;
    983 			ra.ra_totcount = tsize;
    984 			ra.ra_count = tsize;
    985 			ra.ra_data = base;
    986 			t = gethrtime();
    987 			error = rfs2call(mi, RFS_READ,
    988 			    xdr_readargs, (caddr_t)&ra,
    989 			    xdr_rdresult, (caddr_t)&rr, cr,
    990 			    &douprintf, &rr.rr_status, 0, &fi);
    991 		} while (error == ENFS_TRYAGAIN);
    992 
    993 		if (mi->mi_io_kstats) {
    994 			mutex_enter(&mi->mi_lock);
    995 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
    996 			mutex_exit(&mi->mi_lock);
    997 		}
    998 
    999 		if (!error) {
   1000 			error = geterrno(rr.rr_status);
   1001 			if (!error) {
   1002 				count -= rr.rr_count;
   1003 				base += rr.rr_count;
   1004 				offset += rr.rr_count;
   1005 				if (mi->mi_io_kstats) {
   1006 					mutex_enter(&mi->mi_lock);
   1007 					KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
   1008 					KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
   1009 					    rr.rr_count;
   1010 					mutex_exit(&mi->mi_lock);
   1011 				}
   1012 				lwp_stat_update(LWP_STAT_INBLK, 1);
   1013 			}
   1014 		}
   1015 	} while (!error && count && rr.rr_count == tsize);
   1016 
   1017 	*residp = count;
   1018 
   1019 	if (!error) {
   1020 		/*
   1021 		 * Since no error occurred, we have the current
   1022 		 * attributes and we need to do a cache check and then
   1023 		 * potentially update the cached attributes.  We can't
   1024 		 * use the normal attribute check and cache mechanisms
   1025 		 * because they might cause a cache flush which would
   1026 		 * deadlock.  Instead, we just check the cache to see
   1027 		 * if the attributes have changed.  If it is, then we
   1028 		 * just mark the attributes as out of date.  The next
   1029 		 * time that the attributes are checked, they will be
   1030 		 * out of date, new attributes will be fetched, and
   1031 		 * the page cache will be flushed.  If the attributes
   1032 		 * weren't changed, then we just update the cached
   1033 		 * attributes with these attributes.
   1034 		 */
   1035 		/*
   1036 		 * If NFS_ACL is supported on the server, then the
   1037 		 * attributes returned by server may have minimal
   1038 		 * permissions sometimes denying access to users having
   1039 		 * proper access.  To get the proper attributes, mark
   1040 		 * the attributes as expired so that they will be
   1041 		 * regotten via the NFS_ACL GETATTR2 procedure.
   1042 		 */
   1043 		error = nattr_to_vattr(vp, &rr.rr_attr, &va);
   1044 		mutex_enter(&rp->r_statelock);
   1045 		if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
   1046 		    (mi->mi_flags & MI_ACL)) {
   1047 			mutex_exit(&rp->r_statelock);
   1048 			PURGE_ATTRCACHE(vp);
   1049 		} else {
   1050 			if (rp->r_mtime <= t) {
   1051 				nfs_attrcache_va(vp, &va);
   1052 			}
   1053 			mutex_exit(&rp->r_statelock);
   1054 		}
   1055 	}
   1056 
   1057 	return (error);
   1058 }
   1059 
   1060 /* ARGSUSED */
   1061 static int
   1062 nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
   1063 	caller_context_t *ct)
   1064 {
   1065 
   1066 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   1067 		return (EIO);
   1068 	switch (cmd) {
   1069 		case _FIODIRECTIO:
   1070 			return (nfs_directio(vp, (int)arg, cr));
   1071 		default:
   1072 			return (ENOTTY);
   1073 	}
   1074 }
   1075 
   1076 /* ARGSUSED */
   1077 static int
   1078 nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
   1079 	caller_context_t *ct)
   1080 {
   1081 	int error;
   1082 	rnode_t *rp;
   1083 
   1084 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   1085 		return (EIO);
   1086 	/*
   1087 	 * If it has been specified that the return value will
   1088 	 * just be used as a hint, and we are only being asked
   1089 	 * for size, fsid or rdevid, then return the client's
   1090 	 * notion of these values without checking to make sure
   1091 	 * that the attribute cache is up to date.
   1092 	 * The whole point is to avoid an over the wire GETATTR
   1093 	 * call.
   1094 	 */
   1095 	rp = VTOR(vp);
   1096 	if (flags & ATTR_HINT) {
   1097 		if (vap->va_mask ==
   1098 		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
   1099 			mutex_enter(&rp->r_statelock);
   1100 			if (vap->va_mask | AT_SIZE)
   1101 				vap->va_size = rp->r_size;
   1102 			if (vap->va_mask | AT_FSID)
   1103 				vap->va_fsid = rp->r_attr.va_fsid;
   1104 			if (vap->va_mask | AT_RDEV)
   1105 				vap->va_rdev = rp->r_attr.va_rdev;
   1106 			mutex_exit(&rp->r_statelock);
   1107 			return (0);
   1108 		}
   1109 	}
   1110 
   1111 	/*
   1112 	 * Only need to flush pages if asking for the mtime
   1113 	 * and if there any dirty pages or any outstanding
   1114 	 * asynchronous (write) requests for this file.
   1115 	 */
   1116 	if (vap->va_mask & AT_MTIME) {
   1117 		if (vn_has_cached_data(vp) &&
   1118 		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
   1119 			mutex_enter(&rp->r_statelock);
   1120 			rp->r_gcount++;
   1121 			mutex_exit(&rp->r_statelock);
   1122 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
   1123 			mutex_enter(&rp->r_statelock);
   1124 			if (error && (error == ENOSPC || error == EDQUOT)) {
   1125 				if (!rp->r_error)
   1126 					rp->r_error = error;
   1127 			}
   1128 			if (--rp->r_gcount == 0)
   1129 				cv_broadcast(&rp->r_cv);
   1130 			mutex_exit(&rp->r_statelock);
   1131 		}
   1132 	}
   1133 
   1134 	return (nfsgetattr(vp, vap, cr));
   1135 }
   1136 
   1137 /*ARGSUSED4*/
   1138 static int
   1139 nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
   1140 		caller_context_t *ct)
   1141 {
   1142 	int error;
   1143 	uint_t mask;
   1144 	struct vattr va;
   1145 
   1146 	mask = vap->va_mask;
   1147 
   1148 	if (mask & AT_NOSET)
   1149 		return (EINVAL);
   1150 
   1151 	if ((mask & AT_SIZE) &&
   1152 	    vap->va_type == VREG &&
   1153 	    vap->va_size > MAXOFF32_T)
   1154 		return (EFBIG);
   1155 
   1156 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   1157 		return (EIO);
   1158 
   1159 	va.va_mask = AT_UID | AT_MODE;
   1160 
   1161 	error = nfsgetattr(vp, &va, cr);
   1162 	if (error)
   1163 		return (error);
   1164 
   1165 	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
   1166 	    vp);
   1167 
   1168 	if (error)
   1169 		return (error);
   1170 
   1171 	return (nfssetattr(vp, vap, flags, cr));
   1172 }
   1173 
   1174 static int
   1175 nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
   1176 {
   1177 	int error;
   1178 	uint_t mask;
   1179 	struct nfssaargs args;
   1180 	struct nfsattrstat ns;
   1181 	int douprintf;
   1182 	rnode_t *rp;
   1183 	struct vattr va;
   1184 	mode_t omode;
   1185 	mntinfo_t *mi;
   1186 	vsecattr_t *vsp;
   1187 	hrtime_t t;
   1188 
   1189 	mask = vap->va_mask;
   1190 
   1191 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
   1192 
   1193 	rp = VTOR(vp);
   1194 
   1195 	/*
   1196 	 * Only need to flush pages if there are any pages and
   1197 	 * if the file is marked as dirty in some fashion.  The
   1198 	 * file must be flushed so that we can accurately
   1199 	 * determine the size of the file and the cached data
   1200 	 * after the SETATTR returns.  A file is considered to
   1201 	 * be dirty if it is either marked with RDIRTY, has
   1202 	 * outstanding i/o's active, or is mmap'd.  In this
   1203 	 * last case, we can't tell whether there are dirty
   1204 	 * pages, so we flush just to be sure.
   1205 	 */
   1206 	if (vn_has_cached_data(vp) &&
   1207 	    ((rp->r_flags & RDIRTY) ||
   1208 	    rp->r_count > 0 ||
   1209 	    rp->r_mapcnt > 0)) {
   1210 		ASSERT(vp->v_type != VCHR);
   1211 		error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
   1212 		if (error && (error == ENOSPC || error == EDQUOT)) {
   1213 			mutex_enter(&rp->r_statelock);
   1214 			if (!rp->r_error)
   1215 				rp->r_error = error;
   1216 			mutex_exit(&rp->r_statelock);
   1217 		}
   1218 	}
   1219 
   1220 	/*
   1221 	 * If the system call was utime(2) or utimes(2) and the
   1222 	 * application did not specify the times, then set the
   1223 	 * mtime nanosecond field to 1 billion.  This will get
   1224 	 * translated from 1 billion nanoseconds to 1 million
   1225 	 * microseconds in the over the wire request.  The
   1226 	 * server will use 1 million in the microsecond field
   1227 	 * to tell whether both the mtime and atime should be
   1228 	 * set to the server's current time.
   1229 	 *
   1230 	 * This is an overload of the protocol and should be
   1231 	 * documented in the NFS Version 2 protocol specification.
   1232 	 */
   1233 	if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
   1234 		vap->va_mtime.tv_nsec = 1000000000;
   1235 		if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
   1236 		    NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
   1237 			error = vattr_to_sattr(vap, &args.saa_sa);
   1238 		} else {
   1239 			/*
   1240 			 * Use server times. vap time values will not be used.
   1241 			 * To ensure no time overflow, make sure vap has
   1242 			 * valid values, but retain the original values.
   1243 			 */
   1244 			timestruc_t	mtime = vap->va_mtime;
   1245 			timestruc_t	atime = vap->va_atime;
   1246 			time_t		now;
   1247 
   1248 			now = gethrestime_sec();
   1249 			if (NFS_TIME_T_OK(now)) {
   1250 				/* Just in case server does not know of this */
   1251 				vap->va_mtime.tv_sec = now;
   1252 				vap->va_atime.tv_sec = now;
   1253 			} else {
   1254 				vap->va_mtime.tv_sec = 0;
   1255 				vap->va_atime.tv_sec = 0;
   1256 			}
   1257 			error = vattr_to_sattr(vap, &args.saa_sa);
   1258 			/* set vap times back on */
   1259 			vap->va_mtime = mtime;
   1260 			vap->va_atime = atime;
   1261 		}
   1262 	} else {
   1263 		/* Either do not set times or use the client specified times */
   1264 		error = vattr_to_sattr(vap, &args.saa_sa);
   1265 	}
   1266 	if (error) {
   1267 		/* req time field(s) overflow - return immediately */
   1268 		return (error);
   1269 	}
   1270 	args.saa_fh = *VTOFH(vp);
   1271 
   1272 	va.va_mask = AT_MODE;
   1273 	error = nfsgetattr(vp, &va, cr);
   1274 	if (error)
   1275 		return (error);
   1276 	omode = va.va_mode;
   1277 
   1278 	mi = VTOMI(vp);
   1279 
   1280 	douprintf = 1;
   1281 
   1282 	t = gethrtime();
   1283 
   1284 	error = rfs2call(mi, RFS_SETATTR,
   1285 	    xdr_saargs, (caddr_t)&args,
   1286 	    xdr_attrstat, (caddr_t)&ns, cr,
   1287 	    &douprintf, &ns.ns_status, 0, NULL);
   1288 
   1289 	/*
   1290 	 * Purge the access cache and ACL cache if changing either the
   1291 	 * owner of the file, the group owner, or the mode.  These may
   1292 	 * change the access permissions of the file, so purge old
   1293 	 * information and start over again.
   1294 	 */
   1295 	if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
   1296 		(void) nfs_access_purge_rp(rp);
   1297 		if (rp->r_secattr != NULL) {
   1298 			mutex_enter(&rp->r_statelock);
   1299 			vsp = rp->r_secattr;
   1300 			rp->r_secattr = NULL;
   1301 			mutex_exit(&rp->r_statelock);
   1302 			if (vsp != NULL)
   1303 				nfs_acl_free(vsp);
   1304 		}
   1305 	}
   1306 
   1307 	if (!error) {
   1308 		error = geterrno(ns.ns_status);
   1309 		if (!error) {
   1310 			/*
   1311 			 * If changing the size of the file, invalidate
   1312 			 * any local cached data which is no longer part
   1313 			 * of the file.  We also possibly invalidate the
   1314 			 * last page in the file.  We could use
   1315 			 * pvn_vpzero(), but this would mark the page as
   1316 			 * modified and require it to be written back to
   1317 			 * the server for no particularly good reason.
   1318 			 * This way, if we access it, then we bring it
   1319 			 * back in.  A read should be cheaper than a
   1320 			 * write.
   1321 			 */
   1322 			if (mask & AT_SIZE) {
   1323 				nfs_invalidate_pages(vp,
   1324 				    (vap->va_size & PAGEMASK), cr);
   1325 			}
   1326 			(void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
   1327 			/*
   1328 			 * If NFS_ACL is supported on the server, then the
   1329 			 * attributes returned by server may have minimal
   1330 			 * permissions sometimes denying access to users having
   1331 			 * proper access.  To get the proper attributes, mark
   1332 			 * the attributes as expired so that they will be
   1333 			 * regotten via the NFS_ACL GETATTR2 procedure.
   1334 			 */
   1335 			if (mi->mi_flags & MI_ACL) {
   1336 				PURGE_ATTRCACHE(vp);
   1337 			}
   1338 			/*
   1339 			 * This next check attempts to deal with NFS
   1340 			 * servers which can not handle increasing
   1341 			 * the size of the file via setattr.  Most
   1342 			 * of these servers do not return an error,
   1343 			 * but do not change the size of the file.
   1344 			 * Hence, this check and then attempt to set
   1345 			 * the file size by writing 1 byte at the
   1346 			 * offset of the end of the file that we need.
   1347 			 */
   1348 			if ((mask & AT_SIZE) &&
   1349 			    ns.ns_attr.na_size < (uint32_t)vap->va_size) {
   1350 				char zb = '\0';
   1351 
   1352 				error = nfswrite(vp, &zb,
   1353 				    vap->va_size - sizeof (zb),
   1354 				    sizeof (zb), cr);
   1355 			}
   1356 			/*
   1357 			 * Some servers will change the mode to clear the setuid
   1358 			 * and setgid bits when changing the uid or gid.  The
   1359 			 * client needs to compensate appropriately.
   1360 			 */
   1361 			if (mask & (AT_UID | AT_GID)) {
   1362 				int terror;
   1363 
   1364 				va.va_mask = AT_MODE;
   1365 				terror = nfsgetattr(vp, &va, cr);
   1366 				if (!terror &&
   1367 				    (((mask & AT_MODE) &&
   1368 				    va.va_mode != vap->va_mode) ||
   1369 				    (!(mask & AT_MODE) &&
   1370 				    va.va_mode != omode))) {
   1371 					va.va_mask = AT_MODE;
   1372 					if (mask & AT_MODE)
   1373 						va.va_mode = vap->va_mode;
   1374 					else
   1375 						va.va_mode = omode;
   1376 					(void) nfssetattr(vp, &va, 0, cr);
   1377 				}
   1378 			}
   1379 		} else {
   1380 			PURGE_ATTRCACHE(vp);
   1381 			PURGE_STALE_FH(error, vp, cr);
   1382 		}
   1383 	} else {
   1384 		PURGE_ATTRCACHE(vp);
   1385 	}
   1386 
   1387 	return (error);
   1388 }
   1389 
   1390 static int
   1391 nfs_accessx(void *vp, int mode, cred_t *cr)
   1392 {
   1393 	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
   1394 	return (nfs_access(vp, mode, 0, cr, NULL));
   1395 }
   1396 
   1397 /* ARGSUSED */
   1398 static int
   1399 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
   1400 {
   1401 	struct vattr va;
   1402 	int error;
   1403 	mntinfo_t *mi;
   1404 	int shift = 0;
   1405 
   1406 	mi = VTOMI(vp);
   1407 
   1408 	if (nfs_zone() != mi->mi_zone)
   1409 		return (EIO);
   1410 	if (mi->mi_flags & MI_ACL) {
   1411 		error = acl_access2(vp, mode, flags, cr);
   1412 		if (mi->mi_flags & MI_ACL)
   1413 			return (error);
   1414 	}
   1415 
   1416 	va.va_mask = AT_MODE | AT_UID | AT_GID;
   1417 	error = nfsgetattr(vp, &va, cr);
   1418 	if (error)
   1419 		return (error);
   1420 
   1421 	/*
   1422 	 * Disallow write attempts on read-only
   1423 	 * file systems, unless the file is a
   1424 	 * device node.
   1425 	 */
   1426 	if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
   1427 		return (EROFS);
   1428 
   1429 	/*
   1430 	 * Disallow attempts to access mandatory lock files.
   1431 	 */
   1432 	if ((mode & (VWRITE | VREAD | VEXEC)) &&
   1433 	    MANDLOCK(vp, va.va_mode))
   1434 		return (EACCES);
   1435 
   1436 	/*
   1437 	 * Access check is based on only
   1438 	 * one of owner, group, public.
   1439 	 * If not owner, then check group.
   1440 	 * If not a member of the group,
   1441 	 * then check public access.
   1442 	 */
   1443 	if (crgetuid(cr) != va.va_uid) {
   1444 		shift += 3;
   1445 		if (!groupmember(va.va_gid, cr))
   1446 			shift += 3;
   1447 	}
   1448 found:
   1449 	mode &= ~(va.va_mode << shift);
   1450 	if (mode == 0)
   1451 		return (0);
   1452 
   1453 	return (secpolicy_vnode_access(cr, vp, va.va_uid, mode));
   1454 }
   1455 
   1456 static int nfs_do_symlink_cache = 1;
   1457 
   1458 /* ARGSUSED */
   1459 static int
   1460 nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
   1461 {
   1462 	int error;
   1463 	struct nfsrdlnres rl;
   1464 	rnode_t *rp;
   1465 	int douprintf;
   1466 	failinfo_t fi;
   1467 
   1468 	/*
   1469 	 * We want to be consistent with UFS semantics so we will return
   1470 	 * EINVAL instead of ENXIO. This violates the XNFS spec and
   1471 	 * the RFC 1094, which are wrong any way. BUGID 1138002.
   1472 	 */
   1473 	if (vp->v_type != VLNK)
   1474 		return (EINVAL);
   1475 
   1476 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   1477 		return (EIO);
   1478 
   1479 	rp = VTOR(vp);
   1480 	if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
   1481 		error = nfs_validate_caches(vp, cr);
   1482 		if (error)
   1483 			return (error);
   1484 		mutex_enter(&rp->r_statelock);
   1485 		if (rp->r_symlink.contents != NULL) {
   1486 			error = uiomove(rp->r_symlink.contents,
   1487 			    rp->r_symlink.len, UIO_READ, uiop);
   1488 			mutex_exit(&rp->r_statelock);
   1489 			return (error);
   1490 		}
   1491 		mutex_exit(&rp->r_statelock);
   1492 	}
   1493 
   1494 
   1495 	rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
   1496 
   1497 	fi.vp = vp;
   1498 	fi.fhp = NULL;		/* no need to update, filehandle not copied */
   1499 	fi.copyproc = nfscopyfh;
   1500 	fi.lookupproc = nfslookup;
   1501 	fi.xattrdirproc = acl_getxattrdir2;
   1502 
   1503 	douprintf = 1;
   1504 
   1505 	error = rfs2call(VTOMI(vp), RFS_READLINK,
   1506 	    xdr_readlink, (caddr_t)VTOFH(vp),
   1507 	    xdr_rdlnres, (caddr_t)&rl, cr,
   1508 	    &douprintf, &rl.rl_status, 0, &fi);
   1509 
   1510 	if (error) {
   1511 
   1512 		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
   1513 		return (error);
   1514 	}
   1515 
   1516 	error = geterrno(rl.rl_status);
   1517 	if (!error) {
   1518 		error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
   1519 		if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
   1520 			mutex_enter(&rp->r_statelock);
   1521 			if (rp->r_symlink.contents == NULL) {
   1522 				rp->r_symlink.contents = rl.rl_data;
   1523 				rp->r_symlink.len = (int)rl.rl_count;
   1524 				rp->r_symlink.size = NFS_MAXPATHLEN;
   1525 				mutex_exit(&rp->r_statelock);
   1526 			} else {
   1527 				mutex_exit(&rp->r_statelock);
   1528 
   1529 				kmem_free((void *)rl.rl_data,
   1530 				    NFS_MAXPATHLEN);
   1531 			}
   1532 		} else {
   1533 
   1534 			kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
   1535 		}
   1536 	} else {
   1537 		PURGE_STALE_FH(error, vp, cr);
   1538 
   1539 		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
   1540 	}
   1541 
   1542 	/*
   1543 	 * Conform to UFS semantics (see comment above)
   1544 	 */
   1545 	return (error == ENXIO ? EINVAL : error);
   1546 }
   1547 
   1548 /*
   1549  * Flush local dirty pages to stable storage on the server.
   1550  *
   1551  * If FNODSYNC is specified, then there is nothing to do because
   1552  * metadata changes are not cached on the client before being
   1553  * sent to the server.
   1554  */
   1555 /* ARGSUSED */
   1556 static int
   1557 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
   1558 {
   1559 	int error;
   1560 
   1561 	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
   1562 		return (0);
   1563 
   1564 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   1565 		return (EIO);
   1566 
   1567 	error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
   1568 	if (!error)
   1569 		error = VTOR(vp)->r_error;
   1570 	return (error);
   1571 }
   1572 
   1573 
   1574 /*
   1575  * Weirdness: if the file was removed or the target of a rename
   1576  * operation while it was open, it got renamed instead.  Here we
   1577  * remove the renamed file.
   1578  */
   1579 /* ARGSUSED */
   1580 static void
   1581 nfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
   1582 {
   1583 	rnode_t *rp;
   1584 
   1585 	ASSERT(vp != DNLC_NO_VNODE);
   1586 
   1587 	/*
   1588 	 * If this is coming from the wrong zone, we let someone in the right
   1589 	 * zone take care of it asynchronously.  We can get here due to
   1590 	 * VN_RELE() being called from pageout() or fsflush().  This call may
   1591 	 * potentially turn into an expensive no-op if, for instance, v_count
   1592 	 * gets incremented in the meantime, but it's still correct.
   1593 	 */
   1594 	if (nfs_zone() != VTOMI(vp)->mi_zone) {
   1595 		nfs_async_inactive(vp, cr, nfs_inactive);
   1596 		return;
   1597 	}
   1598 
   1599 	rp = VTOR(vp);
   1600 redo:
   1601 	if (rp->r_unldvp != NULL) {
   1602 		/*
   1603 		 * Save the vnode pointer for the directory where the
   1604 		 * unlinked-open file got renamed, then set it to NULL
   1605 		 * to prevent another thread from getting here before
   1606 		 * we're done with the remove.  While we have the
   1607 		 * statelock, make local copies of the pertinent rnode
   1608 		 * fields.  If we weren't to do this in an atomic way, the
   1609 		 * the unl* fields could become inconsistent with respect
   1610 		 * to each other due to a race condition between this
   1611 		 * code and nfs_remove().  See bug report 1034328.
   1612 		 */
   1613 		mutex_enter(&rp->r_statelock);
   1614 		if (rp->r_unldvp != NULL) {
   1615 			vnode_t *unldvp;
   1616 			char *unlname;
   1617 			cred_t *unlcred;
   1618 			struct nfsdiropargs da;
   1619 			enum nfsstat status;
   1620 			int douprintf;
   1621 			int error;
   1622 
   1623 			unldvp = rp->r_unldvp;
   1624 			rp->r_unldvp = NULL;
   1625 			unlname = rp->r_unlname;
   1626 			rp->r_unlname = NULL;
   1627 			unlcred = rp->r_unlcred;
   1628 			rp->r_unlcred = NULL;
   1629 			mutex_exit(&rp->r_statelock);
   1630 
   1631 			/*
   1632 			 * If there are any dirty pages left, then flush
   1633 			 * them.  This is unfortunate because they just
   1634 			 * may get thrown away during the remove operation,
   1635 			 * but we have to do this for correctness.
   1636 			 */
   1637 			if (vn_has_cached_data(vp) &&
   1638 			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
   1639 				ASSERT(vp->v_type != VCHR);
   1640 				error = nfs_putpage(vp, (offset_t)0, 0, 0,
   1641 				    cr, ct);
   1642 				if (error) {
   1643 					mutex_enter(&rp->r_statelock);
   1644 					if (!rp->r_error)
   1645 						rp->r_error = error;
   1646 					mutex_exit(&rp->r_statelock);
   1647 				}
   1648 			}
   1649 
   1650 			/*
   1651 			 * Do the remove operation on the renamed file
   1652 			 */
   1653 			setdiropargs(&da, unlname, unldvp);
   1654 
   1655 			douprintf = 1;
   1656 
   1657 			(void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
   1658 			    xdr_diropargs, (caddr_t)&da,
   1659 			    xdr_enum, (caddr_t)&status, unlcred,
   1660 			    &douprintf, &status, 0, NULL);
   1661 
   1662 			if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
   1663 				nfs_purge_rddir_cache(unldvp);
   1664 			PURGE_ATTRCACHE(unldvp);
   1665 
   1666 			/*
   1667 			 * Release stuff held for the remove
   1668 			 */
   1669 			VN_RELE(unldvp);
   1670 			kmem_free(unlname, MAXNAMELEN);
   1671 			crfree(unlcred);
   1672 			goto redo;
   1673 		}
   1674 		mutex_exit(&rp->r_statelock);
   1675 	}
   1676 
   1677 	rp_addfree(rp, cr);
   1678 }
   1679 
   1680 /*
   1681  * Remote file system operations having to do with directory manipulation.
   1682  */
   1683 
   1684 /* ARGSUSED */
   1685 static int
   1686 nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
   1687 	int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
   1688 	int *direntflags, pathname_t *realpnp)
   1689 {
   1690 	int error;
   1691 	vnode_t *vp;
   1692 	vnode_t *avp = NULL;
   1693 	rnode_t *drp;
   1694 
   1695 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
   1696 		return (EPERM);
   1697 
   1698 	drp = VTOR(dvp);
   1699 
   1700 	/*
   1701 	 * Are we looking up extended attributes?  If so, "dvp" is
   1702 	 * the file or directory for which we want attributes, and
   1703 	 * we need a lookup of the hidden attribute directory
   1704 	 * before we lookup the rest of the path.
   1705 	 */
   1706 	if (flags & LOOKUP_XATTR) {
   1707 		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
   1708 		mntinfo_t *mi;
   1709 
   1710 		mi = VTOMI(dvp);
   1711 		if (!(mi->mi_flags & MI_EXTATTR))
   1712 			return (EINVAL);
   1713 
   1714 		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
   1715 			return (EINTR);
   1716 
   1717 		(void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
   1718 		if (avp == NULL)
   1719 			error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
   1720 		else
   1721 			error = 0;
   1722 
   1723 		nfs_rw_exit(&drp->r_rwlock);
   1724 
   1725 		if (error) {
   1726 			if (mi->mi_flags & MI_EXTATTR)
   1727 				return (error);
   1728 			return (EINVAL);
   1729 		}
   1730 		dvp = avp;
   1731 		drp = VTOR(dvp);
   1732 	}
   1733 
   1734 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
   1735 		error = EINTR;
   1736 		goto out;
   1737 	}
   1738 
   1739 	error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
   1740 
   1741 	nfs_rw_exit(&drp->r_rwlock);
   1742 
   1743 	/*
   1744 	 * If vnode is a device, create special vnode.
   1745 	 */
   1746 	if (!error && IS_DEVVP(*vpp)) {
   1747 		vp = *vpp;
   1748 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
   1749 		VN_RELE(vp);
   1750 	}
   1751 
   1752 out:
   1753 	if (avp != NULL)
   1754 		VN_RELE(avp);
   1755 
   1756 	return (error);
   1757 }
   1758 
   1759 static int nfs_lookup_neg_cache = 1;
   1760 
   1761 #ifdef DEBUG
   1762 static int nfs_lookup_dnlc_hits = 0;
   1763 static int nfs_lookup_dnlc_misses = 0;
   1764 static int nfs_lookup_dnlc_neg_hits = 0;
   1765 static int nfs_lookup_dnlc_disappears = 0;
   1766 static int nfs_lookup_dnlc_lookups = 0;
   1767 #endif
   1768 
   1769 /* ARGSUSED */
   1770 int
   1771 nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
   1772 	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
   1773 {
   1774 	int error;
   1775 
   1776 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
   1777 
   1778 	/*
   1779 	 * If lookup is for "", just return dvp.  Don't need
   1780 	 * to send it over the wire, look it up in the dnlc,
   1781 	 * or perform any access checks.
   1782 	 */
   1783 	if (*nm == '\0') {
   1784 		VN_HOLD(dvp);
   1785 		*vpp = dvp;
   1786 		return (0);
   1787 	}
   1788 
   1789 	/*
   1790 	 * Can't do lookups in non-directories.
   1791 	 */
   1792 	if (dvp->v_type != VDIR)
   1793 		return (ENOTDIR);
   1794 
   1795 	/*
   1796 	 * If we're called with RFSCALL_SOFT, it's important that
   1797 	 * the only rfscall is one we make directly; if we permit
   1798 	 * an access call because we're looking up "." or validating
   1799 	 * a dnlc hit, we'll deadlock because that rfscall will not
   1800 	 * have the RFSCALL_SOFT set.
   1801 	 */
   1802 	if (rfscall_flags & RFSCALL_SOFT)
   1803 		goto callit;
   1804 
   1805 	/*
   1806 	 * If lookup is for ".", just return dvp.  Don't need
   1807 	 * to send it over the wire or look it up in the dnlc,
   1808 	 * just need to check access.
   1809 	 */
   1810 	if (strcmp(nm, ".") == 0) {
   1811 		error = nfs_access(dvp, VEXEC, 0, cr, NULL);
   1812 		if (error)
   1813 			return (error);
   1814 		VN_HOLD(dvp);
   1815 		*vpp = dvp;
   1816 		return (0);
   1817 	}
   1818 
   1819 	/*
   1820 	 * Lookup this name in the DNLC.  If there was a valid entry,
   1821 	 * then return the results of the lookup.
   1822 	 */
   1823 	error = nfslookup_dnlc(dvp, nm, vpp, cr);
   1824 	if (error || *vpp != NULL)
   1825 		return (error);
   1826 
   1827 callit:
   1828 	error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);
   1829 
   1830 	return (error);
   1831 }
   1832 
   1833 static int
   1834 nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
   1835 {
   1836 	int error;
   1837 	vnode_t *vp;
   1838 
   1839 	ASSERT(*nm != '\0');
   1840 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
   1841 
   1842 	/*
   1843 	 * Lookup this name in the DNLC.  If successful, then validate
   1844 	 * the caches and then recheck the DNLC.  The DNLC is rechecked
   1845 	 * just in case this entry got invalidated during the call
   1846 	 * to nfs_validate_caches.
   1847 	 *
   1848 	 * An assumption is being made that it is safe to say that a
   1849 	 * file exists which may not on the server.  Any operations to
   1850 	 * the server will fail with ESTALE.
   1851 	 */
   1852 #ifdef DEBUG
   1853 	nfs_lookup_dnlc_lookups++;
   1854 #endif
   1855 	vp = dnlc_lookup(dvp, nm);
   1856 	if (vp != NULL) {
   1857 		VN_RELE(vp);
   1858 		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
   1859 			PURGE_ATTRCACHE(dvp);
   1860 		}
   1861 		error = nfs_validate_caches(dvp, cr);
   1862 		if (error)
   1863 			return (error);
   1864 		vp = dnlc_lookup(dvp, nm);
   1865 		if (vp != NULL) {
   1866 			error = nfs_access(dvp, VEXEC, 0, cr, NULL);
   1867 			if (error) {
   1868 				VN_RELE(vp);
   1869 				return (error);
   1870 			}
   1871 			if (vp == DNLC_NO_VNODE) {
   1872 				VN_RELE(vp);
   1873 #ifdef DEBUG
   1874 				nfs_lookup_dnlc_neg_hits++;
   1875 #endif
   1876 				return (ENOENT);
   1877 			}
   1878 			*vpp = vp;
   1879 #ifdef DEBUG
   1880 			nfs_lookup_dnlc_hits++;
   1881 #endif
   1882 			return (0);
   1883 		}
   1884 #ifdef DEBUG
   1885 		nfs_lookup_dnlc_disappears++;
   1886 #endif
   1887 	}
   1888 #ifdef DEBUG
   1889 	else
   1890 		nfs_lookup_dnlc_misses++;
   1891 #endif
   1892 
   1893 	*vpp = NULL;
   1894 
   1895 	return (0);
   1896 }
   1897 
   1898 static int
   1899 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
   1900 	int rfscall_flags)
   1901 {
   1902 	int error;
   1903 	struct nfsdiropargs da;
   1904 	struct nfsdiropres dr;
   1905 	int douprintf;
   1906 	failinfo_t fi;
   1907 	hrtime_t t;
   1908 
   1909 	ASSERT(*nm != '\0');
   1910 	ASSERT(dvp->v_type == VDIR);
   1911 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
   1912 
   1913 	setdiropargs(&da, nm, dvp);
   1914 
   1915 	fi.vp = dvp;
   1916 	fi.fhp = NULL;		/* no need to update, filehandle not copied */
   1917 	fi.copyproc = nfscopyfh;
   1918 	fi.lookupproc = nfslookup;
   1919 	fi.xattrdirproc = acl_getxattrdir2;
   1920 
   1921 	douprintf = 1;
   1922 
   1923 	t = gethrtime();
   1924 
   1925 	error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
   1926 	    xdr_diropargs, (caddr_t)&da,
   1927 	    xdr_diropres, (caddr_t)&dr, cr,
   1928 	    &douprintf, &dr.dr_status, rfscall_flags, &fi);
   1929 
   1930 	if (!error) {
   1931 		error = geterrno(dr.dr_status);
   1932 		if (!error) {
   1933 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
   1934 			    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
   1935 			/*
   1936 			 * If NFS_ACL is supported on the server, then the
   1937 			 * attributes returned by server may have minimal
   1938 			 * permissions sometimes denying access to users having
   1939 			 * proper access.  To get the proper attributes, mark
   1940 			 * the attributes as expired so that they will be
   1941 			 * regotten via the NFS_ACL GETATTR2 procedure.
   1942 			 */
   1943 			if (VTOMI(*vpp)->mi_flags & MI_ACL) {
   1944 				PURGE_ATTRCACHE(*vpp);
   1945 			}
   1946 			if (!(rfscall_flags & RFSCALL_SOFT))
   1947 				dnlc_update(dvp, nm, *vpp);
   1948 		} else {
   1949 			PURGE_STALE_FH(error, dvp, cr);
   1950 			if (error == ENOENT && nfs_lookup_neg_cache)
   1951 				dnlc_enter(dvp, nm, DNLC_NO_VNODE);
   1952 		}
   1953 	}
   1954 
   1955 	return (error);
   1956 }
   1957 
   1958 /* ARGSUSED */
   1959 static int
   1960 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
   1961 	int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
   1962 	vsecattr_t *vsecp)
   1963 {
   1964 	int error;
   1965 	struct nfscreatargs args;
   1966 	struct nfsdiropres dr;
   1967 	int douprintf;
   1968 	vnode_t *vp;
   1969 	rnode_t *rp;
   1970 	struct vattr vattr;
   1971 	rnode_t *drp;
   1972 	vnode_t *tempvp;
   1973 	hrtime_t t;
   1974 
   1975 	drp = VTOR(dvp);
   1976 
   1977 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
   1978 		return (EPERM);
   1979 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
   1980 		return (EINTR);
   1981 
   1982 	/*
   1983 	 * We make a copy of the attributes because the caller does not
   1984 	 * expect us to change what va points to.
   1985 	 */
   1986 	vattr = *va;
   1987 
   1988 	/*
   1989 	 * If the pathname is "", just use dvp.  Don't need
   1990 	 * to send it over the wire, look it up in the dnlc,
   1991 	 * or perform any access checks.
   1992 	 */
   1993 	if (*nm == '\0') {
   1994 		error = 0;
   1995 		VN_HOLD(dvp);
   1996 		vp = dvp;
   1997 	/*
   1998 	 * If the pathname is ".", just use dvp.  Don't need
   1999 	 * to send it over the wire or look it up in the dnlc,
   2000 	 * just need to check access.
   2001 	 */
   2002 	} else if (strcmp(nm, ".") == 0) {
   2003 		error = nfs_access(dvp, VEXEC, 0, cr, ct);
   2004 		if (error) {
   2005 			nfs_rw_exit(&drp->r_rwlock);
   2006 			return (error);
   2007 		}
   2008 		VN_HOLD(dvp);
   2009 		vp = dvp;
   2010 	/*
   2011 	 * We need to go over the wire, just to be sure whether the
   2012 	 * file exists or not.  Using the DNLC can be dangerous in
   2013 	 * this case when making a decision regarding existence.
   2014 	 */
   2015 	} else {
   2016 		error = nfslookup_otw(dvp, nm, &vp, cr, 0);
   2017 	}
   2018 	if (!error) {
   2019 		if (exclusive == EXCL)
   2020 			error = EEXIST;
   2021 		else if (vp->v_type == VDIR && (mode & VWRITE))
   2022 			error = EISDIR;
   2023 		else {
   2024 			/*
   2025 			 * If vnode is a device, create special vnode.
   2026 			 */
   2027 			if (IS_DEVVP(vp)) {
   2028 				tempvp = vp;
   2029 				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
   2030 				VN_RELE(tempvp);
   2031 			}
   2032 			if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
   2033 				if ((vattr.va_mask & AT_SIZE) &&
   2034 				    vp->v_type == VREG) {
   2035 					vattr.va_mask = AT_SIZE;
   2036 					error = nfssetattr(vp, &vattr, 0, cr);
   2037 				}
   2038 			}
   2039 		}
   2040 		nfs_rw_exit(&drp->r_rwlock);
   2041 		if (error) {
   2042 			VN_RELE(vp);
   2043 		} else {
   2044 			/*
   2045 			 * existing file got truncated, notify.
   2046 			 */
   2047 			vnevent_create(vp, ct);
   2048 			*vpp = vp;
   2049 		}
   2050 		return (error);
   2051 	}
   2052 
   2053 	ASSERT(vattr.va_mask & AT_TYPE);
   2054 	if (vattr.va_type == VREG) {
   2055 		ASSERT(vattr.va_mask & AT_MODE);
   2056 		if (MANDMODE(vattr.va_mode)) {
   2057 			nfs_rw_exit(&drp->r_rwlock);
   2058 			return (EACCES);
   2059 		}
   2060 	}
   2061 
   2062 	dnlc_remove(dvp, nm);
   2063 
   2064 	setdiropargs(&args.ca_da, nm, dvp);
   2065 
   2066 	/*
   2067 	 * Decide what the group-id of the created file should be.
   2068 	 * Set it in attribute list as advisory...then do a setattr
   2069 	 * if the server didn't get it right the first time.
   2070 	 */
   2071 	error = setdirgid(dvp, &vattr.va_gid, cr);
   2072 	if (error) {
   2073 		nfs_rw_exit(&drp->r_rwlock);
   2074 		return (error);
   2075 	}
   2076 	vattr.va_mask |= AT_GID;
   2077 
   2078 	/*
   2079 	 * This is a completely gross hack to make mknod
   2080 	 * work over the wire until we can wack the protocol
   2081 	 */
   2082 #define	IFCHR		0020000		/* character special */
   2083 #define	IFBLK		0060000		/* block special */
   2084 #define	IFSOCK		0140000		/* socket */
   2085 
   2086 	/*
   2087 	 * dev_t is uint_t in 5.x and short in 4.x. Both 4.x
   2088 	 * supports 8 bit majors. 5.x supports 14 bit majors. 5.x supports 18
   2089 	 * bits in the minor number where 4.x supports 8 bits.  If the 5.x
   2090 	 * minor/major numbers <= 8 bits long, compress the device
   2091 	 * number before sending it. Otherwise, the 4.x server will not
   2092 	 * create the device with the correct device number and nothing can be
   2093 	 * done about this.
   2094 	 */
   2095 	if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
   2096 		dev_t d = vattr.va_rdev;
   2097 		dev32_t dev32;
   2098 
   2099 		if (vattr.va_type == VCHR)
   2100 			vattr.va_mode |= IFCHR;
   2101 		else
   2102 			vattr.va_mode |= IFBLK;
   2103 
   2104 		(void) cmpldev(&dev32, d);
   2105 		if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
   2106 			vattr.va_size = (u_offset_t)dev32;
   2107 		else
   2108 			vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
   2109 
   2110 		vattr.va_mask |= AT_MODE|AT_SIZE;
   2111 	} else if (vattr.va_type == VFIFO) {
   2112 		vattr.va_mode |= IFCHR;		/* xtra kludge for namedpipe */
   2113 		vattr.va_size = (u_offset_t)NFS_FIFO_DEV;	/* blech */
   2114 		vattr.va_mask |= AT_MODE|AT_SIZE;
   2115 	} else if (vattr.va_type == VSOCK) {
   2116 		vattr.va_mode |= IFSOCK;
   2117 		/*
   2118 		 * To avoid triggering bugs in the servers set AT_SIZE
   2119 		 * (all other RFS_CREATE calls set this).
   2120 		 */
   2121 		vattr.va_size = 0;
   2122 		vattr.va_mask |= AT_MODE|AT_SIZE;
   2123 	}
   2124 
   2125 	args.ca_sa = &args.ca_sa_buf;
   2126 	error = vattr_to_sattr(&vattr, args.ca_sa);
   2127 	if (error) {
   2128 		/* req time field(s) overflow - return immediately */
   2129 		nfs_rw_exit(&drp->r_rwlock);
   2130 		return (error);
   2131 	}
   2132 
   2133 	douprintf = 1;
   2134 
   2135 	t = gethrtime();
   2136 
   2137 	error = rfs2call(VTOMI(dvp), RFS_CREATE,
   2138 	    xdr_creatargs, (caddr_t)&args,
   2139 	    xdr_diropres, (caddr_t)&dr, cr,
   2140 	    &douprintf, &dr.dr_status, 0, NULL);
   2141 
   2142 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
   2143 
   2144 	if (!error) {
   2145 		error = geterrno(dr.dr_status);
   2146 		if (!error) {
   2147 			if (HAVE_RDDIR_CACHE(drp))
   2148 				nfs_purge_rddir_cache(dvp);
   2149 			vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
   2150 			    dvp->v_vfsp, t, cr, NULL, NULL);
   2151 			/*
   2152 			 * If NFS_ACL is supported on the server, then the
   2153 			 * attributes returned by server may have minimal
   2154 			 * permissions sometimes denying access to users having
   2155 			 * proper access.  To get the proper attributes, mark
   2156 			 * the attributes as expired so that they will be
   2157 			 * regotten via the NFS_ACL GETATTR2 procedure.
   2158 			 */
   2159 			if (VTOMI(vp)->mi_flags & MI_ACL) {
   2160 				PURGE_ATTRCACHE(vp);
   2161 			}
   2162 			dnlc_update(dvp, nm, vp);
   2163 			rp = VTOR(vp);
   2164 			if (vattr.va_size == 0) {
   2165 				mutex_enter(&rp->r_statelock);
   2166 				rp->r_size = 0;
   2167 				mutex_exit(&rp->r_statelock);
   2168 				if (vn_has_cached_data(vp)) {
   2169 					ASSERT(vp->v_type != VCHR);
   2170 					nfs_invalidate_pages(vp,
   2171 					    (u_offset_t)0, cr);
   2172 				}
   2173 			}
   2174 
   2175 			/*
   2176 			 * Make sure the gid was set correctly.
   2177 			 * If not, try to set it (but don't lose
   2178 			 * any sleep over it).
   2179 			 */
   2180 			if (vattr.va_gid != rp->r_attr.va_gid) {
   2181 				vattr.va_mask = AT_GID;
   2182 				(void) nfssetattr(vp, &vattr, 0, cr);
   2183 			}
   2184 
   2185 			/*
   2186 			 * If vnode is a device create special vnode
   2187 			 */
   2188 			if (IS_DEVVP(vp)) {
   2189 				*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
   2190 				VN_RELE(vp);
   2191 			} else
   2192 				*vpp = vp;
   2193 		} else {
   2194 			PURGE_STALE_FH(error, dvp, cr);
   2195 		}
   2196 	}
   2197 
   2198 	nfs_rw_exit(&drp->r_rwlock);
   2199 
   2200 	return (error);
   2201 }
   2202 
   2203 /*
   2204  * Weirdness: if the vnode to be removed is open
   2205  * we rename it instead of removing it and nfs_inactive
   2206  * will remove the new name.
   2207  */
   2208 /* ARGSUSED */
   2209 static int
   2210 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
   2211 {
   2212 	int error;
   2213 	struct nfsdiropargs da;
   2214 	enum nfsstat status;
   2215 	vnode_t *vp;
   2216 	char *tmpname;
   2217 	int douprintf;
   2218 	rnode_t *rp;
   2219 	rnode_t *drp;
   2220 
   2221 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
   2222 		return (EPERM);
   2223 	drp = VTOR(dvp);
   2224 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
   2225 		return (EINTR);
   2226 
   2227 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
   2228 	if (error) {
   2229 		nfs_rw_exit(&drp->r_rwlock);
   2230 		return (error);
   2231 	}
   2232 
   2233 	if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
   2234 		VN_RELE(vp);
   2235 		nfs_rw_exit(&drp->r_rwlock);
   2236 		return (EPERM);
   2237 	}
   2238 
   2239 	/*
   2240 	 * First just remove the entry from the name cache, as it
   2241 	 * is most likely the only entry for this vp.
   2242 	 */
   2243 	dnlc_remove(dvp, nm);
   2244 
   2245 	/*
   2246 	 * If the file has a v_count > 1 then there may be more than one
   2247 	 * entry in the name cache due multiple links or an open file,
   2248 	 * but we don't have the real reference count so flush all
   2249 	 * possible entries.
   2250 	 */
   2251 	if (vp->v_count > 1)
   2252 		dnlc_purge_vp(vp);
   2253 
   2254 	/*
   2255 	 * Now we have the real reference count on the vnode
   2256 	 */
   2257 	rp = VTOR(vp);
   2258 	mutex_enter(&rp->r_statelock);
   2259 	if (vp->v_count > 1 &&
   2260 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
   2261 		mutex_exit(&rp->r_statelock);
   2262 		tmpname = newname();
   2263 		error = nfsrename(dvp, nm, dvp, tmpname, cr, ct);
   2264 		if (error)
   2265 			kmem_free(tmpname, MAXNAMELEN);
   2266 		else {
   2267 			mutex_enter(&rp->r_statelock);
   2268 			if (rp->r_unldvp == NULL) {
   2269 				VN_HOLD(dvp);
   2270 				rp->r_unldvp = dvp;
   2271 				if (rp->r_unlcred != NULL)
   2272 					crfree(rp->r_unlcred);
   2273 				crhold(cr);
   2274 				rp->r_unlcred = cr;
   2275 				rp->r_unlname = tmpname;
   2276 			} else {
   2277 				kmem_free(rp->r_unlname, MAXNAMELEN);
   2278 				rp->r_unlname = tmpname;
   2279 			}
   2280 			mutex_exit(&rp->r_statelock);
   2281 		}
   2282 	} else {
   2283 		mutex_exit(&rp->r_statelock);
   2284 		/*
   2285 		 * We need to flush any dirty pages which happen to
   2286 		 * be hanging around before removing the file.  This
   2287 		 * shouldn't happen very often and mostly on file
   2288 		 * systems mounted "nocto".
   2289 		 */
   2290 		if (vn_has_cached_data(vp) &&
   2291 		    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
   2292 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
   2293 			if (error && (error == ENOSPC || error == EDQUOT)) {
   2294 				mutex_enter(&rp->r_statelock);
   2295 				if (!rp->r_error)
   2296 					rp->r_error = error;
   2297 				mutex_exit(&rp->r_statelock);
   2298 			}
   2299 		}
   2300 
   2301 		setdiropargs(&da, nm, dvp);
   2302 
   2303 		douprintf = 1;
   2304 
   2305 		error = rfs2call(VTOMI(dvp), RFS_REMOVE,
   2306 		    xdr_diropargs, (caddr_t)&da,
   2307 		    xdr_enum, (caddr_t)&status, cr,
   2308 		    &douprintf, &status, 0, NULL);
   2309 
   2310 		/*
   2311 		 * The xattr dir may be gone after last attr is removed,
   2312 		 * so flush it from dnlc.
   2313 		 */
   2314 		if (dvp->v_flag & V_XATTRDIR)
   2315 			dnlc_purge_vp(dvp);
   2316 
   2317 		PURGE_ATTRCACHE(dvp);	/* mod time changed */
   2318 		PURGE_ATTRCACHE(vp);	/* link count changed */
   2319 
   2320 		if (!error) {
   2321 			error = geterrno(status);
   2322 			if (!error) {
   2323 				if (HAVE_RDDIR_CACHE(drp))
   2324 					nfs_purge_rddir_cache(dvp);
   2325 			} else {
   2326 				PURGE_STALE_FH(error, dvp, cr);
   2327 			}
   2328 		}
   2329 	}
   2330 
   2331 	if (error == 0) {
   2332 		vnevent_remove(vp, dvp, nm, ct);
   2333 	}
   2334 	VN_RELE(vp);
   2335 
   2336 	nfs_rw_exit(&drp->r_rwlock);
   2337 
   2338 	return (error);
   2339 }
   2340 
   2341 /* ARGSUSED */
   2342 static int
   2343 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
   2344 	caller_context_t *ct, int flags)
   2345 {
   2346 	int error;
   2347 	struct nfslinkargs args;
   2348 	enum nfsstat status;
   2349 	vnode_t *realvp;
   2350 	int douprintf;
   2351 	rnode_t *tdrp;
   2352 
   2353 	if (nfs_zone() != VTOMI(tdvp)->mi_zone)
   2354 		return (EPERM);
   2355 	if (VOP_REALVP(svp, &realvp, ct) == 0)
   2356 		svp = realvp;
   2357 
   2358 	args.la_from = VTOFH(svp);
   2359 	setdiropargs(&args.la_to, tnm, tdvp);
   2360 
   2361 	tdrp = VTOR(tdvp);
   2362 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
   2363 		return (EINTR);
   2364 
   2365 	dnlc_remove(tdvp, tnm);
   2366 
   2367 	douprintf = 1;
   2368 
   2369 	error = rfs2call(VTOMI(svp), RFS_LINK,
   2370 	    xdr_linkargs, (caddr_t)&args,
   2371 	    xdr_enum, (caddr_t)&status, cr,
   2372 	    &douprintf, &status, 0, NULL);
   2373 
   2374 	PURGE_ATTRCACHE(tdvp);	/* mod time changed */
   2375 	PURGE_ATTRCACHE(svp);	/* link count changed */
   2376 
   2377 	if (!error) {
   2378 		error = geterrno(status);
   2379 		if (!error) {
   2380 			if (HAVE_RDDIR_CACHE(tdrp))
   2381 				nfs_purge_rddir_cache(tdvp);
   2382 		}
   2383 	}
   2384 
   2385 	nfs_rw_exit(&tdrp->r_rwlock);
   2386 
   2387 	if (!error) {
   2388 		/*
   2389 		 * Notify the source file of this link operation.
   2390 		 */
   2391 		vnevent_link(svp, ct);
   2392 	}
   2393 	return (error);
   2394 }
   2395 
   2396 /* ARGSUSED */
   2397 static int
   2398 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
   2399 	caller_context_t *ct, int flags)
   2400 {
   2401 	vnode_t *realvp;
   2402 
   2403 	if (nfs_zone() != VTOMI(odvp)->mi_zone)
   2404 		return (EPERM);
   2405 	if (VOP_REALVP(ndvp, &realvp, ct) == 0)
   2406 		ndvp = realvp;
   2407 
   2408 	return (nfsrename(odvp, onm, ndvp, nnm, cr, ct));
   2409 }
   2410 
   2411 /*
   2412  * nfsrename does the real work of renaming in NFS Version 2.
   2413  */
   2414 static int
   2415 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
   2416     caller_context_t *ct)
   2417 {
   2418 	int error;
   2419 	enum nfsstat status;
   2420 	struct nfsrnmargs args;
   2421 	int douprintf;
   2422 	vnode_t *nvp = NULL;
   2423 	vnode_t *ovp = NULL;
   2424 	char *tmpname;
   2425 	rnode_t *rp;
   2426 	rnode_t *odrp;
   2427 	rnode_t *ndrp;
   2428 
   2429 	ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
   2430 	if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
   2431 	    strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
   2432 		return (EINVAL);
   2433 
   2434 	odrp = VTOR(odvp);
   2435 	ndrp = VTOR(ndvp);
   2436 	if ((intptr_t)odrp < (intptr_t)ndrp) {
   2437 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
   2438 			return (EINTR);
   2439 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
   2440 			nfs_rw_exit(&odrp->r_rwlock);
   2441 			return (EINTR);
   2442 		}
   2443 	} else {
   2444 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
   2445 			return (EINTR);
   2446 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
   2447 			nfs_rw_exit(&ndrp->r_rwlock);
   2448 			return (EINTR);
   2449 		}
   2450 	}
   2451 
   2452 	/*
   2453 	 * Lookup the target file.  If it exists, it needs to be
   2454 	 * checked to see whether it is a mount point and whether
   2455 	 * it is active (open).
   2456 	 */
   2457 	error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
   2458 	if (!error) {
   2459 		/*
   2460 		 * If this file has been mounted on, then just
   2461 		 * return busy because renaming to it would remove
   2462 		 * the mounted file system from the name space.
   2463 		 */
   2464 		if (vn_mountedvfs(nvp) != NULL) {
   2465 			VN_RELE(nvp);
   2466 			nfs_rw_exit(&odrp->r_rwlock);
   2467 			nfs_rw_exit(&ndrp->r_rwlock);
   2468 			return (EBUSY);
   2469 		}
   2470 
   2471 		/*
   2472 		 * Purge the name cache of all references to this vnode
   2473 		 * so that we can check the reference count to infer
   2474 		 * whether it is active or not.
   2475 		 */
   2476 		/*
   2477 		 * First just remove the entry from the name cache, as it
   2478 		 * is most likely the only entry for this vp.
   2479 		 */
   2480 		dnlc_remove(ndvp, nnm);
   2481 		/*
   2482 		 * If the file has a v_count > 1 then there may be more
   2483 		 * than one entry in the name cache due multiple links
   2484 		 * or an open file, but we don't have the real reference
   2485 		 * count so flush all possible entries.
   2486 		 */
   2487 		if (nvp->v_count > 1)
   2488 			dnlc_purge_vp(nvp);
   2489 
   2490 		/*
   2491 		 * If the vnode is active and is not a directory,
   2492 		 * arrange to rename it to a
   2493 		 * temporary file so that it will continue to be
   2494 		 * accessible.  This implements the "unlink-open-file"
   2495 		 * semantics for the target of a rename operation.
   2496 		 * Before doing this though, make sure that the
   2497 		 * source and target files are not already the same.
   2498 		 */
   2499 		if (nvp->v_count > 1 && nvp->v_type != VDIR) {
   2500 			/*
   2501 			 * Lookup the source name.
   2502 			 */
   2503 			error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
   2504 			    cr, 0);
   2505 
   2506 			/*
   2507 			 * The source name *should* already exist.
   2508 			 */
   2509 			if (error) {
   2510 				VN_RELE(nvp);
   2511 				nfs_rw_exit(&odrp->r_rwlock);
   2512 				nfs_rw_exit(&ndrp->r_rwlock);
   2513 				return (error);
   2514 			}
   2515 
   2516 			/*
   2517 			 * Compare the two vnodes.  If they are the same,
   2518 			 * just release all held vnodes and return success.
   2519 			 */
   2520 			if (ovp == nvp) {
   2521 				VN_RELE(ovp);
   2522 				VN_RELE(nvp);
   2523 				nfs_rw_exit(&odrp->r_rwlock);
   2524 				nfs_rw_exit(&ndrp->r_rwlock);
   2525 				return (0);
   2526 			}
   2527 
   2528 			/*
   2529 			 * Can't mix and match directories and non-
   2530 			 * directories in rename operations.  We already
   2531 			 * know that the target is not a directory.  If
   2532 			 * the source is a directory, return an error.
   2533 			 */
   2534 			if (ovp->v_type == VDIR) {
   2535 				VN_RELE(ovp);
   2536 				VN_RELE(nvp);
   2537 				nfs_rw_exit(&odrp->r_rwlock);
   2538 				nfs_rw_exit(&ndrp->r_rwlock);
   2539 				return (ENOTDIR);
   2540 			}
   2541 
   2542 			/*
   2543 			 * The target file exists, is not the same as
   2544 			 * the source file, and is active.  Link it
   2545 			 * to a temporary filename to avoid having
   2546 			 * the server removing the file completely.
   2547 			 */
   2548 			tmpname = newname();
   2549 			error = nfs_link(ndvp, nvp, tmpname, cr, NULL, 0);
   2550 			if (error == EOPNOTSUPP) {
   2551 				error = nfs_rename(ndvp, nnm, ndvp, tmpname,
   2552 				    cr, NULL, 0);
   2553 			}
   2554 			if (error) {
   2555 				kmem_free(tmpname, MAXNAMELEN);
   2556 				VN_RELE(ovp);
   2557 				VN_RELE(nvp);
   2558 				nfs_rw_exit(&odrp->r_rwlock);
   2559 				nfs_rw_exit(&ndrp->r_rwlock);
   2560 				return (error);
   2561 			}
   2562 			rp = VTOR(nvp);
   2563 			mutex_enter(&rp->r_statelock);
   2564 			if (rp->r_unldvp == NULL) {
   2565 				VN_HOLD(ndvp);
   2566 				rp->r_unldvp = ndvp;
   2567 				if (rp->r_unlcred != NULL)
   2568 					crfree(rp->r_unlcred);
   2569 				crhold(cr);
   2570 				rp->r_unlcred = cr;
   2571 				rp->r_unlname = tmpname;
   2572 			} else {
   2573 				kmem_free(rp->r_unlname, MAXNAMELEN);
   2574 				rp->r_unlname = tmpname;
   2575 			}
   2576 			mutex_exit(&rp->r_statelock);
   2577 		}
   2578 	}
   2579 
   2580 	if (ovp == NULL) {
   2581 		/*
   2582 		 * When renaming directories to be a subdirectory of a
   2583 		 * different parent, the dnlc entry for ".." will no
   2584 		 * longer be valid, so it must be removed.
   2585 		 *
   2586 		 * We do a lookup here to determine whether we are renaming
   2587 		 * a directory and we need to check if we are renaming
   2588 		 * an unlinked file.  This might have already been done
   2589 		 * in previous code, so we check ovp == NULL to avoid
   2590 		 * doing it twice.
   2591 		 */
   2592 
   2593 		error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
   2594 
   2595 		/*
   2596 		 * The source name *should* already exist.
   2597 		 */
   2598 		if (error) {
   2599 			nfs_rw_exit(&odrp->r_rwlock);
   2600 			nfs_rw_exit(&ndrp->r_rwlock);
   2601 			if (nvp) {
   2602 				VN_RELE(nvp);
   2603 			}
   2604 			return (error);
   2605 		}
   2606 		ASSERT(ovp != NULL);
   2607 	}
   2608 
   2609 	dnlc_remove(odvp, onm);
   2610 	dnlc_remove(ndvp, nnm);
   2611 
   2612 	setdiropargs(&args.rna_from, onm, odvp);
   2613 	setdiropargs(&args.rna_to, nnm, ndvp);
   2614 
   2615 	douprintf = 1;
   2616 
   2617 	error = rfs2call(VTOMI(odvp), RFS_RENAME,
   2618 	    xdr_rnmargs, (caddr_t)&args,
   2619 	    xdr_enum, (caddr_t)&status, cr,
   2620 	    &douprintf, &status, 0, NULL);
   2621 
   2622 	PURGE_ATTRCACHE(odvp);	/* mod time changed */
   2623 	PURGE_ATTRCACHE(ndvp);	/* mod time changed */
   2624 
   2625 	if (!error) {
   2626 		error = geterrno(status);
   2627 		if (!error) {
   2628 			if (HAVE_RDDIR_CACHE(odrp))
   2629 				nfs_purge_rddir_cache(odvp);
   2630 			if (HAVE_RDDIR_CACHE(ndrp))
   2631 				nfs_purge_rddir_cache(ndvp);
   2632 			/*
   2633 			 * when renaming directories to be a subdirectory of a
   2634 			 * different parent, the dnlc entry for ".." will no
   2635 			 * longer be valid, so it must be removed
   2636 			 */
   2637 			rp = VTOR(ovp);
   2638 			if (ndvp != odvp) {
   2639 				if (ovp->v_type == VDIR) {
   2640 					dnlc_remove(ovp, "..");
   2641 					if (HAVE_RDDIR_CACHE(rp))
   2642 						nfs_purge_rddir_cache(ovp);
   2643 				}
   2644 			}
   2645 
   2646 			/*
   2647 			 * If we are renaming the unlinked file, update the
   2648 			 * r_unldvp and r_unlname as needed.
   2649 			 */
   2650 			mutex_enter(&rp->r_statelock);
   2651 			if (rp->r_unldvp != NULL) {
   2652 				if (strcmp(rp->r_unlname, onm) == 0) {
   2653 					(void) strncpy(rp->r_unlname,
   2654 					    nnm, MAXNAMELEN);
   2655 					rp->r_unlname[MAXNAMELEN - 1] = '\0';
   2656 
   2657 					if (ndvp != rp->r_unldvp) {
   2658 						VN_RELE(rp->r_unldvp);
   2659 						rp->r_unldvp = ndvp;
   2660 						VN_HOLD(ndvp);
   2661 					}
   2662 				}
   2663 			}
   2664 			mutex_exit(&rp->r_statelock);
   2665 		} else {
   2666 			/*
   2667 			 * System V defines rename to return EEXIST, not
   2668 			 * ENOTEMPTY if the target directory is not empty.
   2669 			 * Over the wire, the error is NFSERR_ENOTEMPTY
   2670 			 * which geterrno maps to ENOTEMPTY.
   2671 			 */
   2672 			if (error == ENOTEMPTY)
   2673 				error = EEXIST;
   2674 		}
   2675 	}
   2676 
   2677 	if (error == 0) {
   2678 		if (nvp)
   2679 			vnevent_rename_dest(nvp, ndvp, nnm, ct);
   2680 
   2681 		if (odvp != ndvp)
   2682 			vnevent_rename_dest_dir(ndvp, ct);
   2683 
   2684 		ASSERT(ovp != NULL);
   2685 		vnevent_rename_src(ovp, odvp, onm, ct);
   2686 	}
   2687 
   2688 	if (nvp) {
   2689 		VN_RELE(nvp);
   2690 	}
   2691 	VN_RELE(ovp);
   2692 
   2693 	nfs_rw_exit(&odrp->r_rwlock);
   2694 	nfs_rw_exit(&ndrp->r_rwlock);
   2695 
   2696 	return (error);
   2697 }
   2698 
   2699 /* ARGSUSED */
   2700 static int
   2701 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
   2702 	caller_context_t *ct, int flags, vsecattr_t *vsecp)
   2703 {
   2704 	int error;
   2705 	struct nfscreatargs args;
   2706 	struct nfsdiropres dr;
   2707 	int douprintf;
   2708 	rnode_t *drp;
   2709 	hrtime_t t;
   2710 
   2711 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
   2712 		return (EPERM);
   2713 
   2714 	setdiropargs(&args.ca_da, nm, dvp);
   2715 
   2716 	/*
   2717 	 * Decide what the group-id and set-gid bit of the created directory
   2718 	 * should be.  May have to do a setattr to get the gid right.
   2719 	 */
   2720 	error = setdirgid(dvp, &va->va_gid, cr);
   2721 	if (error)
   2722 		return (error);
   2723 	error = setdirmode(dvp, &va->va_mode, cr);
   2724 	if (error)
   2725 		return (error);
   2726 	va->va_mask |= AT_MODE|AT_GID;
   2727 
   2728 	args.ca_sa = &args.ca_sa_buf;
   2729 	error = vattr_to_sattr(va, args.ca_sa);
   2730 	if (error) {
   2731 		/* req time field(s) overflow - return immediately */
   2732 		return (error);
   2733 	}
   2734 
   2735 	drp = VTOR(dvp);
   2736 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
   2737 		return (EINTR);
   2738 
   2739 	dnlc_remove(dvp, nm);
   2740 
   2741 	douprintf = 1;
   2742 
   2743 	t = gethrtime();
   2744 
   2745 	error = rfs2call(VTOMI(dvp), RFS_MKDIR,
   2746 	    xdr_creatargs, (caddr_t)&args,
   2747 	    xdr_diropres, (caddr_t)&dr, cr,
   2748 	    &douprintf, &dr.dr_status, 0, NULL);
   2749 
   2750 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
   2751 
   2752 	if (!error) {
   2753 		error = geterrno(dr.dr_status);
   2754 		if (!error) {
   2755 			if (HAVE_RDDIR_CACHE(drp))
   2756 				nfs_purge_rddir_cache(dvp);
   2757 			/*
   2758 			 * The attributes returned by RFS_MKDIR can not
   2759 			 * be depended upon, so mark the attribute cache
   2760 			 * as purged.  A subsequent GETATTR will get the
   2761 			 * correct attributes from the server.
   2762 			 */
   2763 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
   2764 			    dvp->v_vfsp, t, cr, NULL, NULL);
   2765 			PURGE_ATTRCACHE(*vpp);
   2766 			dnlc_update(dvp, nm, *vpp);
   2767 
   2768 			/*
   2769 			 * Make sure the gid was set correctly.
   2770 			 * If not, try to set it (but don't lose
   2771 			 * any sleep over it).
   2772 			 */
   2773 			if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
   2774 				va->va_mask = AT_GID;
   2775 				(void) nfssetattr(*vpp, va, 0, cr);
   2776 			}
   2777 		} else {
   2778 			PURGE_STALE_FH(error, dvp, cr);
   2779 		}
   2780 	}
   2781 
   2782 	nfs_rw_exit(&drp->r_rwlock);
   2783 
   2784 	return (error);
   2785 }
   2786 
   2787 /* ARGSUSED */
   2788 static int
   2789 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
   2790 	caller_context_t *ct, int flags)
   2791 {
   2792 	int error;
   2793 	enum nfsstat status;
   2794 	struct nfsdiropargs da;
   2795 	vnode_t *vp;
   2796 	int douprintf;
   2797 	rnode_t *drp;
   2798 
   2799 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
   2800 		return (EPERM);
   2801 	drp = VTOR(dvp);
   2802 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
   2803 		return (EINTR);
   2804 
   2805 	/*
   2806 	 * Attempt to prevent a rmdir(".") from succeeding.
   2807 	 */
   2808 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
   2809 	if (error) {
   2810 		nfs_rw_exit(&drp->r_rwlock);
   2811 		return (error);
   2812 	}
   2813 
   2814 	if (vp == cdir) {
   2815 		VN_RELE(vp);
   2816 		nfs_rw_exit(&drp->r_rwlock);
   2817 		return (EINVAL);
   2818 	}
   2819 
   2820 	setdiropargs(&da, nm, dvp);
   2821 
   2822 	/*
   2823 	 * First just remove the entry from the name cache, as it
   2824 	 * is most likely an entry for this vp.
   2825 	 */
   2826 	dnlc_remove(dvp, nm);
   2827 
   2828 	/*
   2829 	 * If there vnode reference count is greater than one, then
   2830 	 * there may be additional references in the DNLC which will
   2831 	 * need to be purged.  First, trying removing the entry for
   2832 	 * the parent directory and see if that removes the additional
   2833 	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
   2834 	 * to completely remove any references to the directory which
   2835 	 * might still exist in the DNLC.
   2836 	 */
   2837 	if (vp->v_count > 1) {
   2838 		dnlc_remove(vp, "..");
   2839 		if (vp->v_count > 1)
   2840 			dnlc_purge_vp(vp);
   2841 	}
   2842 
   2843 	douprintf = 1;
   2844 
   2845 	error = rfs2call(VTOMI(dvp), RFS_RMDIR,
   2846 	    xdr_diropargs, (caddr_t)&da,
   2847 	    xdr_enum, (caddr_t)&status, cr,
   2848 	    &douprintf, &status, 0, NULL);
   2849 
   2850 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
   2851 
   2852 	if (error) {
   2853 		VN_RELE(vp);
   2854 		nfs_rw_exit(&drp->r_rwlock);
   2855 		return (error);
   2856 	}
   2857 
   2858 	error = geterrno(status);
   2859 	if (!error) {
   2860 		if (HAVE_RDDIR_CACHE(drp))
   2861 			nfs_purge_rddir_cache(dvp);
   2862 		if (HAVE_RDDIR_CACHE(VTOR(vp)))
   2863 			nfs_purge_rddir_cache(vp);
   2864 	} else {
   2865 		PURGE_STALE_FH(error, dvp, cr);
   2866 		/*
   2867 		 * System V defines rmdir to return EEXIST, not
   2868 		 * ENOTEMPTY if the directory is not empty.  Over
   2869 		 * the wire, the error is NFSERR_ENOTEMPTY which
   2870 		 * geterrno maps to ENOTEMPTY.
   2871 		 */
   2872 		if (error == ENOTEMPTY)
   2873 			error = EEXIST;
   2874 	}
   2875 
   2876 	if (error == 0) {
   2877 		vnevent_rmdir(vp, dvp, nm, ct);
   2878 	}
   2879 	VN_RELE(vp);
   2880 
   2881 	nfs_rw_exit(&drp->r_rwlock);
   2882 
   2883 	return (error);
   2884 }
   2885 
   2886 /* ARGSUSED */
   2887 static int
   2888 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
   2889 	caller_context_t *ct, int flags)
   2890 {
   2891 	int error;
   2892 	struct nfsslargs args;
   2893 	enum nfsstat status;
   2894 	int douprintf;
   2895 	rnode_t *drp;
   2896 
   2897 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
   2898 		return (EPERM);
   2899 	setdiropargs(&args.sla_from, lnm, dvp);
   2900 	args.sla_sa = &args.sla_sa_buf;
   2901 	error = vattr_to_sattr(tva, args.sla_sa);
   2902 	if (error) {
   2903 		/* req time field(s) overflow - return immediately */
   2904 		return (error);
   2905 	}
   2906 	args.sla_tnm = tnm;
   2907 
   2908 	drp = VTOR(dvp);
   2909 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
   2910 		return (EINTR);
   2911 
   2912 	dnlc_remove(dvp, lnm);
   2913 
   2914 	douprintf = 1;
   2915 
   2916 	error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
   2917 	    xdr_slargs, (caddr_t)&args,
   2918 	    xdr_enum, (caddr_t)&status, cr,
   2919 	    &douprintf, &status, 0, NULL);
   2920 
   2921 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
   2922 
   2923 	if (!error) {
   2924 		error = geterrno(status);
   2925 		if (!error) {
   2926 			if (HAVE_RDDIR_CACHE(drp))
   2927 				nfs_purge_rddir_cache(dvp);
   2928 		} else {
   2929 			PURGE_STALE_FH(error, dvp, cr);
   2930 		}
   2931 	}
   2932 
   2933 	nfs_rw_exit(&drp->r_rwlock);
   2934 
   2935 	return (error);
   2936 }
   2937 
   2938 #ifdef DEBUG
        /*
         * Readdir cache counters, present only in DEBUG builds.  All are
         * incremented by nfs_readdir():
         *   hits      - a filled cache entry was found and this call had not
         *               already gone over the wire
         *   shorts    - the end-of-directory short circuit was taken
         *   waits     - the caller slept on an entry another thread is filling
         *   misses    - a synchronous nfsreaddir() call was made
         *   readahead - an asynchronous readdir was started for the next cookie
         */
   2939 static int nfs_readdir_cache_hits = 0;
   2940 static int nfs_readdir_cache_shorts = 0;
   2941 static int nfs_readdir_cache_waits = 0;
   2942 static int nfs_readdir_cache_misses = 0;
   2943 static int nfs_readdir_readahead = 0;
   2944 #endif
   2945 
        /*
         * When nonzero, nfs_readdir() limits each request to 0x400 bytes rather
         * than NFS_MAXDATA (see the compatibility note in nfs_readdir()).
         */
   2946 static int nfs_shrinkreaddir = 0;
   2947 
   2948 /*
   2949  * Read directory entries.
   2950  * There are some weird things to look out for here.  The uio_offset
   2951  * field is either 0 or it is the offset returned from a previous
   2952  * readdir.  It is an opaque value used by the server to find the
   2953  * correct directory block to read. The count field is the number
   2954  * of blocks to read on the server.  This is advisory only, the server
   2955  * may return only one block's worth of entries.  Entries may be compressed
   2956  * on the server.
         *
         * Requests are served from the per-rnode readdir cache (the r_dir AVL
         * tree), keyed by cookie and requested byte count.  On a miss a new
         * entry is inserted and filled synchronously by nfsreaddir(), then the
         * lookup is repeated.  On a hit that is not end-of-directory, an
         * asynchronous readdir for the following cookie may be started.
         *
         * The caller must hold rp->r_rwlock as reader (asserted below); the
         * lock is dropped and re-taken while waiting for another thread to
         * finish filling an entry.  If eofp is non-NULL it receives the
         * end-of-directory indication.  Returns 0 or an errno value (EIO for
         * a zone mismatch, EINTR if the wait is interrupted, or the error
         * recorded for / returned by the fill).
   2957  */
   2958 /* ARGSUSED */
   2959 static int
   2960 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
   2961 	caller_context_t *ct, int flags)
   2962 {
   2963 	int error;
   2964 	size_t count;
   2965 	rnode_t *rp;
   2966 	rddir_cache *rdc;
   2967 	rddir_cache *nrdc;
   2968 	rddir_cache *rrdc;
   2969 #ifdef DEBUG
   2970 	int missed;
   2971 #endif
   2972 	rddir_cache srdc;
   2973 	avl_index_t where;
   2974 
   2975 	rp = VTOR(vp);
   2976 
   2977 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
   2978 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   2979 		return (EIO);
   2980 	/*
   2981 	 * Make sure that the directory cache is valid.
   2982 	 */
   2983 	if (HAVE_RDDIR_CACHE(rp)) {
   2984 		if (nfs_disable_rddir_cache) {
   2985 			/*
   2986 			 * Setting nfs_disable_rddir_cache in /etc/system
   2987 			 * allows interoperability with servers that do not
   2988 			 * properly update the attributes of directories.
   2989 			 * Any cached information gets purged before an
   2990 			 * access is made to it.
   2991 			 */
   2992 			nfs_purge_rddir_cache(vp);
   2993 		} else {
   2994 			error = nfs_validate_caches(vp, cr);
   2995 			if (error)
   2996 				return (error);
   2997 		}
   2998 	}
   2999 
   3000 	/*
   3001 	 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
   3002 	 * RFS_READDIR request with rda_count set to more than 0x400. So
   3003 	 * we reduce the request size here purely for compatibility.
   3004 	 *
   3005 	 * In general, this is no longer required.  However, if a server
   3006 	 * is discovered which can not handle requests larger than 1024,
   3007 	 * nfs_shrinkreaddir can be set to 1 to enable this backwards
   3008 	 * compatibility.
   3009 	 *
   3010 	 * In any case, the request size is limited to NFS_MAXDATA bytes.
   3011 	 */
   3012 	count = MIN(uiop->uio_iov->iov_len,
   3013 	    nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
   3014 
        	/*
        	 * nrdc carries a preallocated, not yet inserted cache entry
        	 * across passes through "top"; every exit path that does not
        	 * insert it must release it.
        	 */
   3015 	nrdc = NULL;
   3016 #ifdef DEBUG
   3017 	missed = 0;
   3018 #endif
   3019 top:
   3020 	/*
   3021 	 * Short circuit last readdir which always returns 0 bytes.
   3022 	 * This can be done after the directory has been read through
   3023 	 * completely at least once.  This will set r_direof which
   3024 	 * can be used to find the value of the last cookie.
   3025 	 */
   3026 	mutex_enter(&rp->r_statelock);
   3027 	if (rp->r_direof != NULL &&
   3028 	    uiop->uio_offset == rp->r_direof->nfs_ncookie) {
   3029 		mutex_exit(&rp->r_statelock);
   3030 #ifdef DEBUG
   3031 		nfs_readdir_cache_shorts++;
   3032 #endif
   3033 		if (eofp)
   3034 			*eofp = 1;
   3035 		if (nrdc != NULL)
   3036 			rddir_cache_rele(nrdc);
   3037 		return (0);
   3038 	}
   3039 	/*
   3040 	 * Look for a cache entry.  Cache entries are identified
   3041 	 * by the NFS cookie value and the byte count requested.
   3042 	 */
   3043 	srdc.nfs_cookie = uiop->uio_offset;
   3044 	srdc.buflen = count;
   3045 	rdc = avl_find(&rp->r_dir, &srdc, &where);
   3046 	if (rdc != NULL) {
   3047 		rddir_cache_hold(rdc);
   3048 		/*
   3049 		 * If the cache entry is in the process of being
   3050 		 * filled in, wait until this completes.  The
   3051 		 * RDDIRWAIT bit is set to indicate that someone
   3052 		 * is waiting and when the thread currently
   3053 		 * filling the entry is done, it should do a
   3054 		 * cv_broadcast to wakeup all of the threads
   3055 		 * waiting for it to finish.
   3056 		 */
   3057 		if (rdc->flags & RDDIR) {
        			/* Drop r_rwlock while sleeping; re-take it after. */
   3058 			nfs_rw_exit(&rp->r_rwlock);
   3059 			rdc->flags |= RDDIRWAIT;
   3060 #ifdef DEBUG
   3061 			nfs_readdir_cache_waits++;
   3062 #endif
   3063 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
   3064 				/*
   3065 				 * We got interrupted, probably
   3066 				 * the user typed ^C or an alarm
   3067 				 * fired.  We free the new entry
   3068 				 * if we allocated one.
   3069 				 */
   3070 				mutex_exit(&rp->r_statelock);
   3071 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
   3072 				    RW_READER, FALSE);
   3073 				rddir_cache_rele(rdc);
   3074 				if (nrdc != NULL)
   3075 					rddir_cache_rele(nrdc);
   3076 				return (EINTR);
   3077 			}
   3078 			mutex_exit(&rp->r_statelock);
   3079 			(void) nfs_rw_enter_sig(&rp->r_rwlock,
   3080 			    RW_READER, FALSE);
   3081 			rddir_cache_rele(rdc);
   3082 			goto top;
   3083 		}
   3084 		/*
   3085 		 * Check to see if a readdir is required to
   3086 		 * fill the entry.  If so, mark this entry
   3087 		 * as being filled, remove our reference,
   3088 		 * and branch to the code to fill the entry.
   3089 		 */
   3090 		if (rdc->flags & RDDIRREQ) {
   3091 			rdc->flags &= ~RDDIRREQ;
   3092 			rdc->flags |= RDDIR;
   3093 			if (nrdc != NULL)
   3094 				rddir_cache_rele(nrdc);
   3095 			nrdc = rdc;
   3096 			mutex_exit(&rp->r_statelock);
   3097 			goto bottom;
   3098 		}
   3099 #ifdef DEBUG
   3100 		if (!missed)
   3101 			nfs_readdir_cache_hits++;
   3102 #endif
   3103 		/*
   3104 		 * If an error occurred while attempting
   3105 		 * to fill the cache entry, just return it.
   3106 		 */
   3107 		if (rdc->error) {
   3108 			error = rdc->error;
   3109 			mutex_exit(&rp->r_statelock);
   3110 			rddir_cache_rele(rdc);
   3111 			if (nrdc != NULL)
   3112 				rddir_cache_rele(nrdc);
   3113 			return (error);
   3114 		}
   3115 
   3116 		/*
   3117 		 * The cache entry is complete and good,
   3118 		 * copyout the dirent structs to the calling
   3119 		 * thread.
   3120 		 */
   3121 		error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
   3122 
   3123 		/*
   3124 		 * If no error occurred during the copyout,
   3125 		 * update the offset in the uio struct to
   3126 		 * contain the value of the next cookie
   3127 		 * and set the eof value appropriately.
   3128 		 */
   3129 		if (!error) {
   3130 			uiop->uio_offset = rdc->nfs_ncookie;
   3131 			if (eofp)
   3132 				*eofp = rdc->eof;
   3133 		}
   3134 
   3135 		/*
   3136 		 * Decide whether to do readahead.  Don't if
   3137 		 * have already read to the end of directory.
   3138 		 */
   3139 		if (rdc->eof) {
   3140 			rp->r_direof = rdc;
   3141 			mutex_exit(&rp->r_statelock);
   3142 			rddir_cache_rele(rdc);
   3143 			if (nrdc != NULL)
   3144 				rddir_cache_rele(nrdc);
   3145 			return (error);
   3146 		}
   3147 
   3148 		/*
   3149 		 * Check to see whether we found an entry
   3150 		 * for the readahead.  If so, we don't need
   3151 		 * to do anything further, so free the new
   3152 		 * entry if one was allocated.  Otherwise,
   3153 		 * allocate a new entry, add it to the cache,
   3154 		 * and then initiate an asynchronous readdir
   3155 		 * operation to fill it.
   3156 		 */
   3157 		srdc.nfs_cookie = rdc->nfs_ncookie;
   3158 		srdc.buflen = count;
   3159 		rrdc = avl_find(&rp->r_dir, &srdc, &where);
   3160 		if (rrdc != NULL) {
   3161 			if (nrdc != NULL)
   3162 				rddir_cache_rele(nrdc);
   3163 		} else {
        			/*
        			 * Reuse the preallocated entry if there is one;
        			 * otherwise try a non-sleeping allocation, since
        			 * r_statelock is held here.  Readahead is simply
        			 * skipped if that allocation fails.
        			 */
   3164 			if (nrdc != NULL)
   3165 				rrdc = nrdc;
   3166 			else {
   3167 				rrdc = rddir_cache_alloc(KM_NOSLEEP);
   3168 			}
   3169 			if (rrdc != NULL) {
   3170 				rrdc->nfs_cookie = rdc->nfs_ncookie;
   3171 				rrdc->buflen = count;
   3172 				avl_insert(&rp->r_dir, rrdc, where);
   3173 				rddir_cache_hold(rrdc);
   3174 				mutex_exit(&rp->r_statelock);
   3175 				rddir_cache_rele(rdc);
   3176 #ifdef DEBUG
   3177 				nfs_readdir_readahead++;
   3178 #endif
   3179 				nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
   3180 				return (error);
   3181 			}
   3182 		}
   3183 
   3184 		mutex_exit(&rp->r_statelock);
   3185 		rddir_cache_rele(rdc);
   3186 		return (error);
   3187 	}
   3188 
   3189 	/*
   3190 	 * Didn't find an entry in the cache.  Construct a new empty
   3191 	 * entry and link it into the cache.  Other processes attempting
   3192 	 * to access this entry will need to wait until it is filled in.
   3193 	 *
   3194 	 * Since kmem_alloc may block, another pass through the cache
   3195 	 * will need to be taken to make sure that another process
   3196 	 * hasn't already added an entry to the cache for this request.
   3197 	 */
   3198 	if (nrdc == NULL) {
   3199 		mutex_exit(&rp->r_statelock);
   3200 		nrdc = rddir_cache_alloc(KM_SLEEP);
   3201 		nrdc->nfs_cookie = uiop->uio_offset;
   3202 		nrdc->buflen = count;
   3203 		goto top;
   3204 	}
   3205 
   3206 	/*
   3207 	 * Add this entry to the cache.
   3208 	 */
   3209 	avl_insert(&rp->r_dir, nrdc, where);
   3210 	rddir_cache_hold(nrdc);
   3211 	mutex_exit(&rp->r_statelock);
   3212 
   3213 bottom:
   3214 #ifdef DEBUG
   3215 	missed = 1;
   3216 	nfs_readdir_cache_misses++;
   3217 #endif
   3218 	/*
   3219 	 * Do the readdir.
        	 * nfsreaddir() releases one reference on nrdc before it
        	 * returns, whether or not it succeeds.
   3220 	 */
   3221 	error = nfsreaddir(vp, nrdc, cr);
   3222 
   3223 	/*
   3224 	 * If this operation failed, just return the error which occurred.
   3225 	 */
   3226 	if (error != 0)
   3227 		return (error);
   3228 
   3229 	/*
   3230 	 * Since the RPC operation will have taken sometime and blocked
   3231 	 * this process, another pass through the cache will need to be
   3232 	 * taken to find the correct cache entry.  It is possible that
   3233 	 * the correct cache entry will not be there (although one was
   3234 	 * added) because the directory changed during the RPC operation
   3235 	 * and the readdir cache was flushed.  In this case, just start
   3236 	 * over.  It is hoped that this will not happen too often... :-)
   3237 	 */
   3238 	nrdc = NULL;
   3239 	goto top;
   3240 	/* NOTREACHED */
   3241 }
   3242 
        /*
         * Fill the readdir cache entry rdc for directory vp by issuing an
         * RFS_READDIR call for rdc->nfs_cookie with a reply buffer of
         * rdc->buflen bytes.
         *
         * On success the returned entries are copied into a newly allocated
         * rdc->entries and nfs_ncookie, eof and entlen are recorded.  On failure
         * rdc->entries is set to NULL, the error is stored in rdc->error and
         * RDDIRREQ is set again so that the entry is refilled later.  In both
         * cases RDDIR is cleared, any waiters are woken, and one reference on
         * rdc is released.  The entry must have RDDIR set on entry (asserted).
         *
         * Returns 0 or an errno value.
         */
   3243 static int
   3244 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
   3245 {
   3246 	int error;
   3247 	struct nfsrddirargs rda;
   3248 	struct nfsrddirres rd;
   3249 	rnode_t *rp;
   3250 	mntinfo_t *mi;
   3251 	uint_t count;
   3252 	int douprintf;
   3253 	failinfo_t fi, *fip;
   3254 
   3255 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
   3256 	count = rdc->buflen;
   3257 
   3258 	rp = VTOR(vp);
   3259 	mi = VTOMI(vp);
   3260 
   3261 	rda.rda_fh = *VTOFH(vp);
   3262 	rda.rda_offset = rdc->nfs_cookie;
   3263 
   3264 	/*
   3265 	 * NFS client failover support
   3266 	 * suppress failover unless we have a zero cookie
   3267 	 */
   3268 	if (rdc->nfs_cookie == (off_t)0) {
   3269 		fi.vp = vp;
   3270 		fi.fhp = (caddr_t)&rda.rda_fh;
   3271 		fi.copyproc = nfscopyfh;
   3272 		fi.lookupproc = nfslookup;
   3273 		fi.xattrdirproc = acl_getxattrdir2;
   3274 		fip = &fi;
   3275 	} else {
   3276 		fip = NULL;
   3277 	}
   3278 
        	/* Temporary reply buffer; freed below with the same length. */
   3279 	rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP);
   3280 	rd.rd_size = count;
   3281 	rd.rd_offset = rda.rda_offset;
   3282 
   3283 	douprintf = 1;
   3284 
   3285 	if (mi->mi_io_kstats) {
   3286 		mutex_enter(&mi->mi_lock);
   3287 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
   3288 		mutex_exit(&mi->mi_lock);
   3289 	}
   3290 
        	/*
        	 * Re-read mi_curread on every attempt and retry for as long as
        	 * rfs2call() returns ENFS_TRYAGAIN.
        	 */
   3291 	do {
   3292 		rda.rda_count = MIN(count, mi->mi_curread);
   3293 		error = rfs2call(mi, RFS_READDIR,
   3294 		    xdr_rddirargs, (caddr_t)&rda,
   3295 		    xdr_getrddirres, (caddr_t)&rd, cr,
   3296 		    &douprintf, &rd.rd_status, 0, fip);
   3297 	} while (error == ENFS_TRYAGAIN);
   3298 
   3299 	if (mi->mi_io_kstats) {
   3300 		mutex_enter(&mi->mi_lock);
   3301 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
   3302 		mutex_exit(&mi->mi_lock);
   3303 	}
   3304 
   3305 	/*
   3306 	 * Since we are actually doing a READDIR RPC, we must have
   3307 	 * exclusive access to the cache entry being filled.  Thus,
   3308 	 * it is safe to update all fields except for the flags
   3309 	 * field.  The r_statelock in the rnode must be held to
   3310 	 * prevent two different threads from simultaneously
   3311 	 * attempting to update the flags field.  This can happen
   3312 	 * if we are turning off RDDIR and the other thread is
   3313 	 * trying to set RDDIRWAIT.
   3314 	 */
   3315 	ASSERT(rdc->flags & RDDIR);
   3316 	if (!error) {
   3317 		error = geterrno(rd.rd_status);
   3318 		if (!error) {
   3319 			rdc->nfs_ncookie = rd.rd_offset;
   3320 			rdc->eof = rd.rd_eof ? 1 : 0;
   3321 			rdc->entlen = rd.rd_size;
   3322 			ASSERT(rdc->entlen <= rdc->buflen);
   3323 #ifdef DEBUG
   3324 			rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
   3325 			    KM_SLEEP);
   3326 #else
   3327 			rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
   3328 #endif
   3329 			bcopy(rd.rd_entries, rdc->entries, rdc->entlen);
   3330 			rdc->error = 0;
   3331 			if (mi->mi_io_kstats) {
   3332 				mutex_enter(&mi->mi_lock);
   3333 				KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
   3334 				KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
   3335 				    rd.rd_size;
   3336 				mutex_exit(&mi->mi_lock);
   3337 			}
   3338 		} else {
   3339 			PURGE_STALE_FH(error, vp, cr);
   3340 		}
   3341 	}
   3342 	if (error) {
   3343 		rdc->entries = NULL;
   3344 		rdc->error = error;
   3345 	}
   3346 	kmem_free(rd.rd_entries, rdc->buflen);
   3347 
        	/*
        	 * Flag updates happen under r_statelock: the fill is finished,
        	 * wake any waiters, and on error ask for the entry to be
        	 * refilled.
        	 */
   3348 	mutex_enter(&rp->r_statelock);
   3349 	rdc->flags &= ~RDDIR;
   3350 	if (rdc->flags & RDDIRWAIT) {
   3351 		rdc->flags &= ~RDDIRWAIT;
   3352 		cv_broadcast(&rdc->cv);
   3353 	}
   3354 	if (error)
   3355 		rdc->flags |= RDDIRREQ;
   3356 	mutex_exit(&rp->r_statelock);
   3357 
   3358 	rddir_cache_rele(rdc);
   3359 
   3360 	return (error);
   3361 }
   3362 
   3363 #ifdef DEBUG
        /*
         * DEBUG only: when nonzero, nfs_bio() calls debug_enter() after warning
         * about a zero length write.
         */
   3364 static int nfs_bio_do_stop = 0;
   3365 #endif
   3366 
        /*
         * Perform the I/O described by bp over the wire: nfsread() when B_READ
         * is set in b_flags, nfswrite() otherwise.
         *
         * The credential cached in rp->r_cred is used if there is one; otherwise
         * cr is cached there and used.  If the operation fails with EACCES and
         * the credential used was not cr, rp->r_cred is replaced by cr and the
         * operation is retried.
         *
         * Returns 0, an errno value, or NFS_EOF when a read transferred nothing
         * and the offset is at or beyond rp->r_size.  B_ERROR is set in b_flags
         * for any nonzero result other than NFS_EOF.  If the rnode is already
         * marked RSTALE, a write is not attempted and rp->r_error (or ESTALE if
         * that is zero) is returned.
         */
   3367 static int
   3368 nfs_bio(struct buf *bp, cred_t *cr)
   3369 {
   3370 	rnode_t *rp = VTOR(bp->b_vp);
   3371 	int count;
   3372 	int error;
   3373 	cred_t *cred;
   3374 	uint_t offset;
   3375 
   3376 	DTRACE_IO1(start, struct buf *, bp);
   3377 
   3378 	ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
   3379 	offset = dbtob(bp->b_blkno);
   3380 
   3381 	if (bp->b_flags & B_READ) {
        		/*
        		 * Pick the credential under r_statelock.  "cred" always
        		 * carries its own hold, separate from the hold that
        		 * belongs to rp->r_cred.
        		 */
   3382 		mutex_enter(&rp->r_statelock);
   3383 		if (rp->r_cred != NULL) {
   3384 			cred = rp->r_cred;
   3385 			crhold(cred);
   3386 		} else {
   3387 			rp->r_cred = cr;
   3388 			crhold(cr);
   3389 			cred = cr;
   3390 			crhold(cred);
   3391 		}
   3392 		mutex_exit(&rp->r_statelock);
   3393 	read_again:
   3394 		error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
   3395 		    offset, bp->b_bcount, &bp->b_resid, cred);
   3396 
        		/*
        		 * The hold on cred is dropped here; below, cred is only
        		 * compared against cr, never dereferenced, until it is
        		 * re-held for a retry.
        		 */
   3397 		crfree(cred);
   3398 		if (!error) {
   3399 			if (bp->b_resid) {
   3400 				/*
   3401 				 * Didn't get it all because we hit EOF,
   3402 				 * zero all the memory beyond the EOF.
   3403 				 */
   3404 				/* bzero(rdaddr + */
   3405 				bzero(bp->b_un.b_addr +
   3406 				    bp->b_bcount - bp->b_resid, bp->b_resid);
   3407 			}
   3408 			mutex_enter(&rp->r_statelock);
   3409 			if (bp->b_resid == bp->b_bcount &&
   3410 			    offset >= rp->r_size) {
   3411 				/*
   3412 				 * We didn't read anything at all as we are
   3413 				 * past EOF.  Return an error indicator back
   3414 				 * but don't destroy the pages (yet).
   3415 				 */
   3416 				error = NFS_EOF;
   3417 			}
   3418 			mutex_exit(&rp->r_statelock);
   3419 		} else if (error == EACCES) {
        			/*
        			 * The cached credential was refused; switch the
        			 * rnode to the caller's credential and try again.
        			 */
   3420 			mutex_enter(&rp->r_statelock);
   3421 			if (cred != cr) {
   3422 				if (rp->r_cred != NULL)
   3423 					crfree(rp->r_cred);
   3424 				rp->r_cred = cr;
   3425 				crhold(cr);
   3426 				cred = cr;
   3427 				crhold(cred);
   3428 				mutex_exit(&rp->r_statelock);
   3429 				goto read_again;
   3430 			}
   3431 			mutex_exit(&rp->r_statelock);
   3432 		}
   3433 	} else {
   3434 		if (!(rp->r_flags & RSTALE)) {
   3435 			mutex_enter(&rp->r_statelock);
   3436 			if (rp->r_cred != NULL) {
   3437 				cred = rp->r_cred;
   3438 				crhold(cred);
   3439 			} else {
   3440 				rp->r_cred = cr;
   3441 				crhold(cr);
   3442 				cred = cr;
   3443 				crhold(cred);
   3444 			}
   3445 			mutex_exit(&rp->r_statelock);
   3446 		write_again:
        			/*
        			 * Limit the write to what lies between offset
        			 * and the current r_size.
        			 */
   3447 			mutex_enter(&rp->r_statelock);
   3448 			count = MIN(bp->b_bcount, rp->r_size - offset);
   3449 			mutex_exit(&rp->r_statelock);
   3450 			if (count < 0)
   3451 				cmn_err(CE_PANIC, "nfs_bio: write count < 0");
   3452 #ifdef DEBUG
   3453 			if (count == 0) {
   3454 				zcmn_err(getzoneid(), CE_WARN,
   3455 				    "nfs_bio: zero length write at %d",
   3456 				    offset);
   3457 				nfs_printfhandle(&rp->r_fh);
   3458 				if (nfs_bio_do_stop)
   3459 					debug_enter("nfs_bio");
   3460 			}
   3461 #endif
   3462 			error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
   3463 			    count, cred);
   3464 			if (error == EACCES) {
   3465 				mutex_enter(&rp->r_statelock);
   3466 				if (cred != cr) {
   3467 					if (rp->r_cred != NULL)
   3468 						crfree(rp->r_cred);
   3469 					rp->r_cred = cr;
   3470 					crhold(cr);
   3471 					crfree(cred);
   3472 					cred = cr;
   3473 					crhold(cred);
   3474 					mutex_exit(&rp->r_statelock);
   3475 					goto write_again;
   3476 				}
   3477 				mutex_exit(&rp->r_statelock);
   3478 			}
   3479 			bp->b_error = error;
   3480 			if (error && error != EINTR) {
   3481 				/*
   3482 				 * Don't print EDQUOT errors on the console.
   3483 				 * Don't print asynchronous EACCES errors.
   3484 				 * Don't print EFBIG errors.
   3485 				 * Print all other write errors.
   3486 				 */
   3487 				if (error != EDQUOT && error != EFBIG &&
   3488 				    (error != EACCES ||
   3489 				    !(bp->b_flags & B_ASYNC)))
   3490 					nfs_write_error(bp->b_vp, error, cred);
   3491 				/*
   3492 				 * Update r_error and r_flags as appropriate.
   3493 				 * If the error was ESTALE, then mark the
   3494 				 * rnode as not being writeable and save
   3495 				 * the error status.  Otherwise, save any
   3496 				 * errors which occur from asynchronous
   3497 				 * page invalidations.  Any errors occurring
   3498 				 * from other operations should be saved
   3499 				 * by the caller.
   3500 				 */
   3501 				mutex_enter(&rp->r_statelock);
   3502 				if (error == ESTALE) {
   3503 					rp->r_flags |= RSTALE;
   3504 					if (!rp->r_error)
   3505 						rp->r_error = error;
   3506 				} else if (!rp->r_error &&
   3507 				    (bp->b_flags &
   3508 				    (B_INVAL|B_FORCE|B_ASYNC)) ==
   3509 				    (B_INVAL|B_FORCE|B_ASYNC)) {
   3510 					rp->r_error = error;
   3511 				}
   3512 				mutex_exit(&rp->r_statelock);
   3513 			}
   3514 			crfree(cred);
   3515 		} else {
   3516 			error = rp->r_error;
   3517 			/*
   3518 			 * A close may have cleared r_error, if so,
   3519 			 * propagate ESTALE error return properly
   3520 			 */
   3521 			if (error == 0)
   3522 				error = ESTALE;
   3523 		}
   3524 	}
   3525 
        	/* NFS_EOF is an indicator for the caller, not an I/O failure. */
   3526 	if (error != 0 && error != NFS_EOF)
   3527 		bp->b_flags |= B_ERROR;
   3528 
   3529 	DTRACE_IO1(done, struct buf *, bp);
   3530 
   3531 	return (error);
   3532 }
   3533 
   3534 /* ARGSUSED */
   3535 static int
   3536 nfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
   3537 {
   3538 	struct nfs_fid *fp;
   3539 	rnode_t *rp;
   3540 
   3541 	rp = VTOR(vp);
   3542 
   3543 	if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
   3544 		fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
   3545 		return (ENOSPC);
   3546 	}
   3547 	fp = (struct nfs_fid *)fidp;
   3548 	fp->nf_pad = 0;
   3549 	fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
   3550 	bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
   3551 	return (0);
   3552 }
   3553 
   3554 /* ARGSUSED2 */
   3555 static int
   3556 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
   3557 {
   3558 	rnode_t *rp = VTOR(vp);
   3559 
   3560 	if (!write_lock) {
   3561 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
   3562 		return (V_WRITELOCK_FALSE);
   3563 	}
   3564 
   3565 	if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
   3566 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
   3567 		if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
   3568 			return (V_WRITELOCK_FALSE);
   3569 		nfs_rw_exit(&rp->r_rwlock);
   3570 	}
   3571 
   3572 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
   3573 	return (V_WRITELOCK_TRUE);
   3574 }
   3575 
   3576 /* ARGSUSED */
   3577 static void
   3578 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
   3579 {
   3580 	rnode_t *rp = VTOR(vp);
   3581 
   3582 	nfs_rw_exit(&rp->r_rwlock);
   3583 }
   3584 
   3585 /* ARGSUSED */
   3586 static int
   3587 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
   3588 {
   3589 
   3590 	/*
   3591 	 * Because we stuff the readdir cookie into the offset field
   3592 	 * someone may attempt to do an lseek with the cookie which
   3593 	 * we want to succeed.
   3594 	 */
   3595 	if (vp->v_type == VDIR)
   3596 		return (0);
   3597 	if (*noffp < 0 || *noffp > MAXOFF32_T)
   3598 		return (EINVAL);
   3599 	return (0);
   3600 }
   3601 
   3602 /*
   3603  * number of NFS_MAXDATA blocks to read ahead
   3604  * optimized for 100 base-T.
         *
         * nfs_getapage() uses this as the largest number of asynchronous
         * readaheads it will queue for a single request.
   3605  */
   3606 static int nfs_nra = 4;
   3607 
   3608 #ifdef DEBUG
   3609 static int nfs_lostpage = 0;	/* number of times we lost original page */
   3610 #endif
   3611 
   3612 /*
   3613  * Return all the pages from [off..off+len) in file
   3614  */
   3615 /* ARGSUSED */
   3616 static int
   3617 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
   3618 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
   3619 	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
   3620 {
   3621 	rnode_t *rp;
   3622 	int error;
   3623 	mntinfo_t *mi;
   3624 
   3625 	if (vp->v_flag & VNOMAP)
   3626 		return (ENOSYS);
   3627 
   3628 	ASSERT(off <= MAXOFF32_T);
   3629 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   3630 		return (EIO);
   3631 	if (protp != NULL)
   3632 		*protp = PROT_ALL;
   3633 
   3634 	/*
   3635 	 * Now valididate that the caches are up to date.
   3636 	 */
   3637 	error = nfs_validate_caches(vp, cr);
   3638 	if (error)
   3639 		return (error);
   3640 
   3641 	rp = VTOR(vp);
   3642 	mi = VTOMI(vp);
   3643 retry:
   3644 	mutex_enter(&rp->r_statelock);
   3645 
   3646 	/*
   3647 	 * Don't create dirty pages faster than they
   3648 	 * can be cleaned so that the system doesn't
   3649 	 * get imbalanced.  If the async queue is
   3650 	 * maxed out, then wait for it to drain before
   3651 	 * creating more dirty pages.  Also, wait for
   3652 	 * any threads doing pagewalks in the vop_getattr
   3653 	 * entry points so that they don't block for
   3654 	 * long periods.
   3655 	 */
   3656 	if (rw == S_CREATE) {
   3657 		while ((mi->mi_max_threads != 0 &&
   3658 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
   3659 		    rp->r_gcount > 0)
   3660 			cv_wait(&rp->r_cv, &rp->r_statelock);
   3661 	}
   3662 
   3663 	/*
   3664 	 * If we are getting called as a side effect of an nfs_write()
   3665 	 * operation the local file size might not be extended yet.
   3666 	 * In this case we want to be able to return pages of zeroes.
   3667 	 */
   3668 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
   3669 		mutex_exit(&rp->r_statelock);
   3670 		return (EFAULT);		/* beyond EOF */
   3671 	}
   3672 
   3673 	mutex_exit(&rp->r_statelock);
   3674 
   3675 	if (len <= PAGESIZE) {
   3676 		error = nfs_getapage(vp, off, len, protp, pl, plsz,
   3677 		    seg, addr, rw, cr);
   3678 	} else {
   3679 		error = pvn_getpages(nfs_getapage, vp, off, len, protp,
   3680 		    pl, plsz, seg, addr, rw, cr);
   3681 	}
   3682 
   3683 	switch (error) {
   3684 	case NFS_EOF:
   3685 		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
   3686 		goto retry;
   3687 	case ESTALE:
   3688 		PURGE_STALE_FH(error, vp, cr);
   3689 	}
   3690 
   3691 	return (error);
   3692 }
   3693 
   3694 /*
   3695  * Called from pvn_getpages or nfs_getpage to get a particular page.
   3696  */
   3697 /* ARGSUSED */
   3698 static int
   3699 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
   3700 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
   3701 	enum seg_rw rw, cred_t *cr)
   3702 {
   3703 	rnode_t *rp;
   3704 	uint_t bsize;
   3705 	struct buf *bp;
   3706 	page_t *pp;
   3707 	u_offset_t lbn;
   3708 	u_offset_t io_off;
   3709 	u_offset_t blkoff;
   3710 	u_offset_t rablkoff;
   3711 	size_t io_len;
   3712 	uint_t blksize;
   3713 	int error;
   3714 	int readahead;
   3715 	int readahead_issued = 0;
   3716 	int ra_window; /* readahead window */
   3717 	page_t *pagefound;
   3718 
   3719 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   3720 		return (EIO);
   3721 	rp = VTOR(vp);
   3722 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
   3723 
   3724 reread:
   3725 	bp = NULL;
   3726 	pp = NULL;
   3727 	pagefound = NULL;
   3728 
   3729 	if (pl != NULL)
   3730 		pl[0] = NULL;
   3731 
   3732 	error = 0;
   3733 	lbn = off / bsize;
   3734 	blkoff = lbn * bsize;
   3735 
   3736 	/*
   3737 	 * Queueing up the readahead before doing the synchronous read
   3738 	 * results in a significant increase in read throughput because
   3739 	 * of the increased parallelism between the async threads and
   3740 	 * the process context.
   3741 	 */
   3742 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
   3743 	    rw != S_CREATE &&
   3744 	    !(vp->v_flag & VNOCACHE)) {
   3745 		mutex_enter(&rp->r_statelock);
   3746 
   3747 		/*
   3748 		 * Calculate the number of readaheads to do.
   3749 		 * a) No readaheads at offset = 0.
   3750 		 * b) Do maximum(nfs_nra) readaheads when the readahead
   3751 		 *    window is closed.
   3752 		 * c) Do readaheads between 1 to (nfs_nra - 1) depending
   3753 		 *    upon how far the readahead window is open or close.
   3754 		 * d) No readaheads if rp->r_nextr is not within the scope
   3755 		 *    of the readahead window (random i/o).
   3756 		 */
   3757 
   3758 		if (off == 0)
   3759 			readahead = 0;
   3760 		else if (blkoff == rp->r_nextr)
   3761 			readahead = nfs_nra;
   3762 		else if (rp->r_nextr > blkoff &&
   3763 		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
   3764 		    <= (nfs_nra - 1)))
   3765 			readahead = nfs_nra - ra_window;
   3766 		else
   3767 			readahead = 0;
   3768 
   3769 		rablkoff = rp->r_nextr;
   3770 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
   3771 			mutex_exit(&rp->r_statelock);
   3772 			if (nfs_async_readahead(vp, rablkoff + bsize,
   3773 			    addr + (rablkoff + bsize - off), seg, cr,
   3774 			    nfs_readahead) < 0) {
   3775 				mutex_enter(&rp->r_statelock);
   3776 				break;
   3777 			}
   3778 			readahead--;
   3779 			rablkoff += bsize;
   3780 			/*
   3781 			 * Indicate that we did a readahead so
   3782 			 * readahead offset is not updated
   3783 			 * by the synchronous read below.
   3784 			 */
   3785 			readahead_issued = 1;
   3786 			mutex_enter(&rp->r_statelock);
   3787 			/*
   3788 			 * set readahead offset to
   3789 			 * offset of last async readahead
   3790 			 * request.
   3791 			 */
   3792 			rp->r_nextr = rablkoff;
   3793 		}
   3794 		mutex_exit(&rp->r_statelock);
   3795 	}
   3796 
   3797 again:
   3798 	if ((pagefound = page_exists(vp, off)) == NULL) {
   3799 		if (pl == NULL) {
   3800 			(void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
   3801 			    nfs_readahead);
   3802 		} else if (rw == S_CREATE) {
   3803 			/*
   3804 			 * Block for this page is not allocated, or the offset
   3805 			 * is beyond the current allocation size, or we're
   3806 			 * allocating a swap slot and the page was not found,
   3807 			 * so allocate it and return a zero page.
   3808 			 */
   3809 			if ((pp = page_create_va(vp, off,
   3810 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
   3811 				cmn_err(CE_PANIC, "nfs_getapage: page_create");
   3812 			io_len = PAGESIZE;
   3813 			mutex_enter(&rp->r_statelock);
   3814 			rp->r_nextr = off + PAGESIZE;
   3815 			mutex_exit(&rp->r_statelock);
   3816 		} else {
   3817 			/*
   3818 			 * Need to go to server to get a BLOCK, exception to
   3819 			 * that being while reading at offset = 0 or doing
   3820 			 * random i/o, in that case read only a PAGE.
   3821 			 */
   3822 			mutex_enter(&rp->r_statelock);
   3823 			if (blkoff < rp->r_size &&
   3824 			    blkoff + bsize >= rp->r_size) {
   3825 				/*
   3826 				 * If only a block or less is left in
   3827 				 * the file, read all that is remaining.
   3828 				 */
   3829 				if (rp->r_size <= off) {
   3830 					/*
   3831 					 * Trying to access beyond EOF,
   3832 					 * set up to get at least one page.
   3833 					 */
   3834 					blksize = off + PAGESIZE - blkoff;
   3835 				} else
   3836 					blksize = rp->r_size - blkoff;
   3837 			} else if ((off == 0) ||
   3838 			    (off != rp->r_nextr && !readahead_issued)) {
   3839 				blksize = PAGESIZE;
   3840 				blkoff = off; /* block = page here */
   3841 			} else
   3842 				blksize = bsize;
   3843 			mutex_exit(&rp->r_statelock);
   3844 
   3845 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
   3846 			    &io_len, blkoff, blksize, 0);
   3847 
   3848 			/*
   3849 			 * Some other thread has entered the page,
   3850 			 * so just use it.
   3851 			 */
   3852 			if (pp == NULL)
   3853 				goto again;
   3854 
   3855 			/*
   3856 			 * Now round the request size up to page boundaries.
   3857 			 * This ensures that the entire page will be
   3858 			 * initialized to zeroes if EOF is encountered.
   3859 			 */
   3860 			io_len = ptob(btopr(io_len));
   3861 
   3862 			bp = pageio_setup(pp, io_len, vp, B_READ);
   3863 			ASSERT(bp != NULL);
   3864 
   3865 			/*
   3866 			 * pageio_setup should have set b_addr to 0.  This
   3867 			 * is correct since we want to do I/O on a page
   3868 			 * boundary.  bp_mapin will use this addr to calculate
   3869 			 * an offset, and then set b_addr to the kernel virtual
   3870 			 * address it allocated for us.
   3871 			 */
   3872 			ASSERT(bp->b_un.b_addr == 0);
   3873 
   3874 			bp->b_edev = 0;
   3875 			bp->b_dev = 0;
   3876 			bp->b_lblkno = lbtodb(io_off);
   3877 			bp->b_file = vp;
   3878 			bp->b_offset = (offset_t)off;
   3879 			bp_mapin(bp);
   3880 
   3881 			/*
   3882 			 * If doing a write beyond what we believe is EOF,
   3883 			 * don't bother trying to read the pages from the
   3884 			 * server, we'll just zero the pages here.  We
   3885 			 * don't check that the rw flag is S_WRITE here
   3886 			 * because some implementations may attempt a
   3887 			 * read access to the buffer before copying data.
   3888 			 */
   3889 			mutex_enter(&rp->r_statelock);
   3890 			if (io_off >= rp->r_size && seg == segkmap) {
   3891 				mutex_exit(&rp->r_statelock);
   3892 				bzero(bp->b_un.b_addr, io_len);
   3893 			} else {
   3894 				mutex_exit(&rp->r_statelock);
   3895 				error = nfs_bio(bp, cr);
   3896 			}
   3897 
   3898 			/*
   3899 			 * Unmap the buffer before freeing it.
   3900 			 */
   3901 			bp_mapout(bp);
   3902 			pageio_done(bp);
   3903 
   3904 			if (error == NFS_EOF) {
   3905 				/*
   3906 				 * If doing a write system call just return
   3907 				 * zeroed pages, else user tried to get pages
   3908 				 * beyond EOF, return error.  We don't check
   3909 				 * that the rw flag is S_WRITE here because
   3910 				 * some implementations may attempt a read
   3911 				 * access to the buffer before copying data.
   3912 				 */
   3913 				if (seg == segkmap)
   3914 					error = 0;
   3915 				else
   3916 					error = EFAULT;
   3917 			}
   3918 
   3919 			if (!readahead_issued && !error) {
   3920 				mutex_enter(&rp->r_statelock);
   3921 				rp->r_nextr = io_off + io_len;
   3922 				mutex_exit(&rp->r_statelock);
   3923 			}
   3924 		}
   3925 	}
   3926 
   3927 out:
   3928 	if (pl == NULL)
   3929 		return (error);
   3930 
   3931 	if (error) {
   3932 		if (pp != NULL)
   3933 			pvn_read_done(pp, B_ERROR);
   3934 		return (error);
   3935 	}
   3936 
   3937 	if (pagefound) {
   3938 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
   3939 
   3940 		/*
   3941 		 * Page exists in the cache, acquire the appropriate lock.
   3942 		 * If this fails, start all over again.
   3943 		 */
   3944 		if ((pp = page_lookup(vp, off, se)) == NULL) {
   3945 #ifdef DEBUG
   3946 			nfs_lostpage++;
   3947 #endif
   3948 			goto reread;
   3949 		}
   3950 		pl[0] = pp;
   3951 		pl[1] = NULL;
   3952 		return (0);
   3953 	}
   3954 
   3955 	if (pp != NULL)
   3956 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
   3957 
   3958 	return (error);
   3959 }
   3960 
   3961 static void
   3962 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
   3963 	cred_t *cr)
   3964 {
   3965 	int error;
   3966 	page_t *pp;
   3967 	u_offset_t io_off;
   3968 	size_t io_len;
   3969 	struct buf *bp;
   3970 	uint_t bsize, blksize;
   3971 	rnode_t *rp = VTOR(vp);
   3972 
   3973 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
   3974 
   3975 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
   3976 
   3977 	mutex_enter(&rp->r_statelock);
   3978 	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
   3979 		/*
   3980 		 * If less than a block left in file read less
   3981 		 * than a block.
   3982 		 */
   3983 		blksize = rp->r_size - blkoff;
   3984 	} else
   3985 		blksize = bsize;
   3986 	mutex_exit(&rp->r_statelock);
   3987 
   3988 	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
   3989 	    &io_off, &io_len, blkoff, blksize, 1);
   3990 	/*
   3991 	 * The isra flag passed to the kluster function is 1, we may have
   3992 	 * gotten a return value of NULL for a variety of reasons (# of free
   3993 	 * pages < minfree, someone entered the page on the vnode etc). In all
   3994 	 * cases, we want to punt on the readahead.
   3995 	 */
   3996 	if (pp == NULL)
   3997 		return;
   3998 
   3999 	/*
   4000 	 * Now round the request size up to page boundaries.
   4001 	 * This ensures that the entire page will be
   4002 	 * initialized to zeroes if EOF is encountered.
   4003 	 */
   4004 	io_len = ptob(btopr(io_len));
   4005 
   4006 	bp = pageio_setup(pp, io_len, vp, B_READ);
   4007 	ASSERT(bp != NULL);
   4008 
   4009 	/*
   4010 	 * pageio_setup should have set b_addr to 0.  This is correct since
   4011 	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
   4012 	 * to calculate an offset, and then set b_addr to the kernel virtual
   4013 	 * address it allocated for us.
   4014 	 */
   4015 	ASSERT(bp->b_un.b_addr == 0);
   4016 
   4017 	bp->b_edev = 0;
   4018 	bp->b_dev = 0;
   4019 	bp->b_lblkno = lbtodb(io_off);
   4020 	bp->b_file = vp;
   4021 	bp->b_offset = (offset_t)blkoff;
   4022 	bp_mapin(bp);
   4023 
   4024 	/*
   4025 	 * If doing a write beyond what we believe is EOF, don't bother trying
   4026 	 * to read the pages from the server, we'll just zero the pages here.
   4027 	 * We don't check that the rw flag is S_WRITE here because some
   4028 	 * implementations may attempt a read access to the buffer before
   4029 	 * copying data.
   4030 	 */
   4031 	mutex_enter(&rp->r_statelock);
   4032 	if (io_off >= rp->r_size && seg == segkmap) {
   4033 		mutex_exit(&rp->r_statelock);
   4034 		bzero(bp->b_un.b_addr, io_len);
   4035 		error = 0;
   4036 	} else {
   4037 		mutex_exit(&rp->r_statelock);
   4038 		error = nfs_bio(bp, cr);
   4039 		if (error == NFS_EOF)
   4040 			error = 0;
   4041 	}
   4042 
   4043 	/*
   4044 	 * Unmap the buffer before freeing it.
   4045 	 */
   4046 	bp_mapout(bp);
   4047 	pageio_done(bp);
   4048 
   4049 	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
   4050 
   4051 	/*
   4052 	 * In case of error set readahead offset
   4053 	 * to the lowest offset.
   4054 	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
   4055 	 */
   4056 	if (error && rp->r_nextr > io_off) {
   4057 		mutex_enter(&rp->r_statelock);
   4058 		if (rp->r_nextr > io_off)
   4059 			rp->r_nextr = io_off;
   4060 		mutex_exit(&rp->r_statelock);
   4061 	}
   4062 }
   4063 
   4064 /*
   4065  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
   4066  * If len == 0, do from off to EOF.
   4067  *
   4068  * The normal cases should be len == 0 && off == 0 (entire vp list),
   4069  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
   4070  * (from pageout).
   4071  */
   4072 /* ARGSUSED */
   4073 static int
   4074 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
   4075 	caller_context_t *ct)
   4076 {
   4077 	int error;
   4078 	rnode_t *rp;
   4079 
   4080 	ASSERT(cr != NULL);
   4081 
   4082 	/*
   4083 	 * XXX - Why should this check be made here?
   4084 	 */
   4085 	if (vp->v_flag & VNOMAP)
   4086 		return (ENOSYS);
   4087 
   4088 	if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
   4089 		return (0);
   4090 
   4091 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
   4092 		return (EIO);
   4093 	ASSERT(off <= MAXOFF32_T);
   4094 
   4095 	rp = VTOR(vp);
   4096 	mutex_enter(&rp->r_statelock);
   4097 	rp->r_count++;
   4098 	mutex_exit(&rp->r_statelock);
   4099 	error = nfs_putpages(vp, off, len, flags, cr);
   4100 	mutex_enter(&rp->r_statelock);
   4101 	rp->r_count--;
   4102 	cv_broadcast(&rp->r_cv);
   4103 	mutex_exit(&rp->r_statelock);
   4104 
   4105 	return (error);
   4106 }
   4107 
   4108 /*
   4109  * Write out a single page, possibly klustering adjacent dirty pages.
   4110  */
   4111 int
   4112 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
   4113 	int flags, cred_t *cr)
   4114 {
   4115 	u_offset_t io_off;
   4116 	u_offset_t lbn_off;
   4117 	u_offset_t lbn;
   4118 	size_t io_len;
   4119 	uint_t bsize;
   4120 	int error;
   4121 	rnode_t *rp;
   4122 
   4123 	ASSERT(!vn_is_readonly(vp));
   4124 	ASSERT(pp != NULL);
   4125 	ASSERT(cr != NULL);
   4126 	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
   4127 
   4128 	rp = VTOR(vp);
   4129 	ASSERT(rp->r_count > 0);
   4130 
   4131 	ASSERT(pp->p_offset <= MAXOFF32_T);
   4132 
   4133 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
   4134 	lbn = pp->p_offset / bsize;
   4135 	lbn_off = lbn * bsize;
   4136 
   4137 	/*
   4138 	 * Find a kluster that fits in one block, or in
   4139 	 * one page if pages are bigger than blocks.  If
   4140 	 * there is less file space allocated than a whole
   4141 	 * page, we'll shorten the i/o request below.
   4142 	 */
   4143 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
   4144 	    roundup(bsize, PAGESIZE), flags);
   4145 
   4146 	/*
   4147 	 * pvn_write_kluster shouldn't have returned a page with offset
   4148 	 * behind the original page we were given.  Verify that.
   4149 	 */
   4150 	ASSERT((pp->p_offset / bsize) >= lbn);
   4151 
   4152 	/*
   4153 	 * Now pp will have the list of kept dirty pages marked for
   4154 	 * write back.  It will also handle invalidation and freeing
   4155 	 * of pages that are not dirty.  Check for page length rounding
   4156 	 * problems.
   4157 	 */
   4158 	if (io_off + io_len > lbn_off + bsize) {
   4159 		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
   4160 		io_len = lbn_off + bsize - io_off;
   4161 	}
   4162 	/*
   4163 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
   4164 	 * consistent value of r_size. RMODINPROGRESS is set in writerp().
   4165 	 * When RMODINPROGRESS is set it indicates that a uiomove() is in
   4166 	 * progress and the r_size has not been made consistent with the
   4167 	 * new size of the file. When the uiomove() completes the r_size is
   4168 	 * updated and the RMODINPROGRESS flag is cleared.
   4169 	 *
   4170 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
   4171 	 * consistent value of r_size. Without this handshaking, it is
   4172 	 * possible that nfs(3)_bio() picks  up the old value of r_size
   4173 	 * before the uiomove() in writerp() completes. This will result
   4174 	 * in the write through nfs(3)_bio() being dropped.
   4175 	 *
   4176 	 * More precisely, there is a window between the time the uiomove()
   4177 	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
   4178 	 * operation intervenes in this window, the page will be picked up,
   4179 	 * because it is dirty (it will be unlocked, unless it was
   4180 	 * pagecreate'd). When the page is picked up as dirty, the dirty
   4181 	 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
   4182 	 * checked. This will still be the old size. Therefore the page will
   4183 	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
   4184 	 * the page will be found to be clean and the write will be dropped.
   4185 	 */
   4186 	if (rp->r_flags & RMODINPROGRESS) {
   4187 		mutex_enter(&rp->r_statelock);
   4188 		if ((rp->r_flags & RMODINPROGRESS) &&
   4189 		    rp->r_modaddr + MAXBSIZE > io_off &&
   4190 		    rp->r_modaddr < io_off + io_len) {
   4191 			page_t *plist;
   4192 			/*
   4193 			 * A write is in progress for this region of the file.
   4194 			 * If we did not detect RMODINPROGRESS here then this
   4195 			 * path through nfs_putapage() would eventually go to
   4196 			 * nfs(3)_bio() and may not write out all of the data
   4197 			 * in the pages. We end up losing data. So we decide
   4198 			 * to set the modified bit on each page in the page
   4199 			 * list and mark the rnode with RDIRTY. This write
   4200 			 * will be restarted at some later time.
   4201 			 */
   4202 			plist = pp;
   4203 			while (plist != NULL) {
   4204 				pp = plist;
   4205 				page_sub(&plist, pp);
   4206 				hat_setmod(pp);
   4207 				page_io_unlock(pp);
   4208 				page_unlock(pp);
   4209 			}
   4210 			rp->r_flags |= RDIRTY;
   4211 			mutex_exit(&rp->r_statelock);
   4212 			if (offp)
   4213 				*offp = io_off;
   4214 			if (lenp)
   4215 				*lenp = io_len;
   4216 			return (0);
   4217 		}
   4218 		mutex_exit(&rp->r_statelock);
   4219 	}
   4220 
   4221 	if (flags & B_ASYNC) {
   4222 		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
   4223 		    nfs_sync_putapage);
   4224 	} else
   4225 		error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);
   4226 
   4227 	if (offp)
   4228 		*offp = io_off;
   4229 	if (lenp)
   4230 		*lenp = io_len;
   4231 	return (error);
   4232 }
   4233 
   4234 static int
   4235 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
   4236 	int flags, cred_t *cr)
   4237 {
   4238 	int error;
   4239 	rnode_t *rp;
   4240 
   4241 	flags |= B_WRITE;
   4242 
   4243 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
   4244 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
   4245 
   4246 	rp = VTOR(vp);
   4247 
   4248 	if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
   4249 	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
   4250 		if (!(rp->r_flags & ROUTOFSPACE)) {
   4251 			mutex_enter(&rp->r_statelock);
   4252 			rp->r_flags |= ROUTOFSPACE;
   4253 			mutex_exit(&rp->r_statelock);
   4254 		}
   4255 		flags |= B_ERROR;
   4256 		pvn_write_done(pp, flags);
   4257 		/*
   4258 		 * If this was not an async thread, then try again to
   4259 		 * write out the pages, but this time, also destroy
   4260 		 * them whether or not the write is successful.  This
   4261 		 * will prevent memory from filling up with these
   4262 		 * pages and destroying them is the only alternative
   4263 		 * if they can't be written out.
   4264 		 *
   4265 		 * Don't do this if this is an async thread because
   4266 		 * when the pages are unlocked in pvn_write_done,
   4267 		 * some other thread could have come along, locked
   4268 		 * them, and queued for an async thread.  It would be
   4269 		 * possible for all of the async threads to be tied
   4270 		 * up waiting to lock the pages again and they would
   4271 		 * all already be locked and waiting for an async
   4272 		 * thread to handle them.  Deadlock.
   4273 		 */
   4274 		if (!(flags & B_ASYNC)) {
   4275 			error = nfs_putpage(vp, io_off, io_len,
   4276 			    B_INVAL | B_FORCE, cr, NULL);
   4277 		}
   4278 	} else {
   4279 		if (error)
   4280 			flags |= B_ERROR;
   4281 		else if (rp->r_flags & ROUTOFSPACE) {
   4282 			mutex_enter(&rp->r_statelock);
   4283 			rp->r_flags &= ~ROUTOFSPACE;
   4284 			mutex_exit(&rp->r_statelock);
   4285 		}
   4286 		pvn_write_done(pp, flags);
   4287 	}
   4288 
   4289 	return (error);
   4290 }
   4291 
   4292 /* ARGSUSED */
   4293 static int
   4294 nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
   4295 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
   4296 	caller_context_t *ct)
   4297 {
   4298 	struct segvn_crargs vn_a;
   4299 	int error;
   4300 	rnode_t *rp;
   4301 	struct vattr va;
   4302 
   4303 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   4304 		return (EIO);
   4305 
   4306 	if (vp->v_flag & VNOMAP)
   4307 		return (ENOSYS);
   4308 
   4309 	if (off > MAXOFF32_T)
   4310 		return (EFBIG);
   4311 
   4312 	if (off < 0 || off + len < 0)
   4313 		return (ENXIO);
   4314 
   4315 	if (vp->v_type != VREG)
   4316 		return (ENODEV);
   4317 
   4318 	/*
   4319 	 * If there is cached data and if close-to-open consistency
   4320 	 * checking is not turned off and if the file system is not
   4321 	 * mounted readonly, then force an over the wire getattr.
   4322 	 * Otherwise, just invoke nfsgetattr to get a copy of the
   4323 	 * attributes.  The attribute cache will be used unless it
   4324 	 * is timed out and if it is, then an over the wire getattr
   4325 	 * will be issued.
   4326 	 */
   4327 	va.va_mask = AT_ALL;
   4328 	if (vn_has_cached_data(vp) &&
   4329 	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
   4330 		error = nfs_getattr_otw(vp, &va, cr);
   4331 	else
   4332 		error = nfsgetattr(vp, &va, cr);
   4333 	if (error)
   4334 		return (error);
   4335 
   4336 	/*
   4337 	 * Check to see if the vnode is currently marked as not cachable.
   4338 	 * This means portions of the file are locked (through VOP_FRLOCK).
   4339 	 * In this case the map request must be refused.  We use
   4340 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
   4341 	 */
   4342 	rp = VTOR(vp);
   4343 
   4344 	/*
   4345 	 * Atomically increment r_inmap after acquiring r_rwlock. The
   4346 	 * idea here is to acquire r_rwlock to block read/write and
   4347 	 * not to protect r_inmap. r_inmap will inform nfs_read/write()
   4348 	 * that we are in nfs_map(). Now, r_rwlock is acquired in order
   4349 	 * and we can prevent the deadlock that would have occurred
   4350 	 * when nfs_addmap() would have acquired it out of order.
   4351 	 *
   4352 	 * Since we are not protecting r_inmap by any lock, we do not
   4353 	 * hold any lock when we decrement it. We atomically decrement
   4354 	 * r_inmap after we release r_lkserlock.
   4355 	 */
   4356 
   4357 	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
   4358 		return (EINTR);
   4359 	atomic_add_int(&rp->r_inmap, 1);
   4360 	nfs_rw_exit(&rp->r_rwlock);
   4361 
   4362 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
   4363 		atomic_add_int(&rp->r_inmap, -1);
   4364 		return (EINTR);
   4365 	}
   4366 	if (vp->v_flag & VNOCACHE) {
   4367 		error = EAGAIN;
   4368 		goto done;
   4369 	}
   4370 
   4371 	/*
   4372 	 * Don't allow concurrent locks and mapping if mandatory locking is
   4373 	 * enabled.
   4374 	 */
   4375 	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
   4376 	    MANDLOCK(vp, va.va_mode)) {
   4377 		error = EAGAIN;
   4378 		goto done;
   4379 	}
   4380 
   4381 	as_rangelock(as);
   4382 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
   4383 	if (error != 0) {
   4384 		as_rangeunlock(as);
   4385 		goto done;
   4386 	}
   4387 
   4388 	vn_a.vp = vp;
   4389 	vn_a.offset = off;
   4390 	vn_a.type = (flags & MAP_TYPE);
   4391 	vn_a.prot = (uchar_t)prot;
   4392 	vn_a.maxprot = (uchar_t)maxprot;
   4393 	vn_a.flags = (flags & ~MAP_TYPE);
   4394 	vn_a.cred = cr;
   4395 	vn_a.amp = NULL;
   4396 	vn_a.szc = 0;
   4397 	vn_a.lgrp_mem_policy_flags = 0;
   4398 
   4399 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
   4400 	as_rangeunlock(as);
   4401 
   4402 done:
   4403 	nfs_rw_exit(&rp->r_lkserlock);
   4404 	atomic_add_int(&rp->r_inmap, -1);
   4405 	return (error);
   4406 }
   4407 
   4408 /* ARGSUSED */
   4409 static int
   4410 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
   4411 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
   4412 	caller_context_t *ct)
   4413 {
   4414 	rnode_t *rp;
   4415 
   4416 	if (vp->v_flag & VNOMAP)
   4417 		return (ENOSYS);
   4418 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   4419 		return (EIO);
   4420 
   4421 	rp = VTOR(vp);
   4422 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
   4423 
   4424 	return (0);
   4425 }
   4426 
   4427 /* ARGSUSED */
   4428 static int
   4429 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
   4430 	struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
   4431 {
   4432 	netobj lm_fh;
   4433 	int rc;
   4434 	u_offset_t start, end;
   4435 	rnode_t *rp;
   4436 	int error = 0, intr = INTR(vp);
   4437 
   4438 	/* check for valid cmd parameter */
   4439 	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
   4440 		return (EINVAL);
   4441 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   4442 		return (EIO);
   4443 
   4444 	/* Verify l_type. */
   4445 	switch (bfp->l_type) {
   4446 	case F_RDLCK:
   4447 		if (cmd != F_GETLK && !(flag & FREAD))
   4448 			return (EBADF);
   4449 		break;
   4450 	case F_WRLCK:
   4451 		if (cmd != F_GETLK && !(flag & FWRITE))
   4452 			return (EBADF);
   4453 		break;
   4454 	case F_UNLCK:
   4455 		intr = 0;
   4456 		break;
   4457 
   4458 	default:
   4459 		return (EINVAL);
   4460 	}
   4461 
   4462 	/* check the validity of the lock range */
   4463 	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
   4464 		return (rc);
   4465 	if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
   4466 		return (rc);
   4467 
   4468 	/*
   4469 	 * If the filesystem is mounted using local locking, pass the
   4470 	 * request off to the local locking code.
   4471 	 */
   4472 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
   4473 		if (offset > MAXOFF32_T)
   4474 			return (EFBIG);
   4475 		if (cmd == F_SETLK || cmd == F_SETLKW) {
   4476 			/*
   4477 			 * For complete safety, we should be holding
   4478 			 * r_lkserlock.  However, we can't call
   4479 			 * lm_safelock and then fs_frlock while
   4480 			 * holding r_lkserlock, so just invoke
   4481 			 * lm_safelock and expect that this will
   4482 			 * catch enough of the cases.
   4483 			 */
   4484 			if (!lm_safelock(vp, bfp, cr))
   4485 				return (EAGAIN);
   4486 		}
   4487 		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
   4488 	}
   4489 
   4490 	rp = VTOR(vp);
   4491 
   4492 	/*
   4493 	 * Check whether the given lock request can proceed, given the
   4494 	 * current file mappings.
   4495 	 */
   4496 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
   4497 		return (EINTR);
   4498 	if (cmd == F_SETLK || cmd == F_SETLKW) {
   4499 		if (!lm_safelock(vp, bfp, cr)) {
   4500 			rc = EAGAIN;
   4501 			goto done;
   4502 		}
   4503 	}
   4504 
   4505 	/*
   4506 	 * Flush the cache after waiting for async I/O to finish.  For new
   4507 	 * locks, this is so that the process gets the latest bits from the
   4508 	 * server.  For unlocks, this is so that other clients see the
   4509 	 * latest bits once the file has been unlocked.  If currently dirty
   4510 	 * pages can't be flushed, then don't allow a lock to be set.  But
   4511 	 * allow unlocks to succeed, to avoid having orphan locks on the
   4512 	 * server.
   4513 	 */
   4514 	if (cmd != F_GETLK) {
   4515 		mutex_enter(&rp->r_statelock);
   4516 		while (rp->r_count > 0) {
   4517 			if (intr) {
   4518 				klwp_t *lwp = ttolwp(curthread);
   4519 
   4520 				if (lwp != NULL)
   4521 					lwp->lwp_nostop++;
   4522 				if (cv_wait_sig(&rp->r_cv, &rp->r_statelock)
   4523 				    == 0) {
   4524 					if (lwp != NULL)
   4525 						lwp->lwp_nostop--;
   4526 					rc = EINTR;
   4527 					break;
   4528 				}
   4529 				if (lwp != NULL)
   4530 					lwp->lwp_nostop--;
   4531 			} else
   4532 			cv_wait(&rp->r_cv, &rp->r_statelock);
   4533 		}
   4534 		mutex_exit(&rp->r_statelock);
   4535 		if (rc != 0)
   4536 			goto done;
   4537 		error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
   4538 		if (error) {
   4539 			if (error == ENOSPC || error == EDQUOT) {
   4540 				mutex_enter(&rp->r_statelock);
   4541 				if (!rp->r_error)
   4542 					rp->r_error = error;
   4543 				mutex_exit(&rp->r_statelock);
   4544 			}
   4545 			if (bfp->l_type != F_UNLCK) {
   4546 				rc = ENOLCK;
   4547 				goto done;
   4548 			}
   4549 		}
   4550 	}
   4551 
   4552 	lm_fh.n_len = sizeof (fhandle_t);
   4553 	lm_fh.n_bytes = (char *)VTOFH(vp);
   4554 
   4555 	/*
   4556 	 * Call the lock manager to do the real work of contacting
   4557 	 * the server and obtaining the lock.
   4558 	 */
   4559 	rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);
   4560 
   4561 	if (rc == 0)
   4562 		nfs_lockcompletion(vp, cmd);
   4563 
   4564 done:
   4565 	nfs_rw_exit(&rp->r_lkserlock);
   4566 	return (rc);
   4567 }
   4568 
   4569 /*
   4570  * Free storage space associated with the specified vnode.  The portion
   4571  * to be freed is specified by bfp->l_start and bfp->l_len (already
   4572  * normalized to a "whence" of 0).
   4573  *
   4574  * This is an experimental facility whose continued existence is not
   4575  * guaranteed.  Currently, we only support the special case
   4576  * of l_len == 0, meaning free to end of file.
   4577  */
   4578 /* ARGSUSED */
   4579 static int
   4580 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
   4581 	offset_t offset, cred_t *cr, caller_context_t *ct)
   4582 {
   4583 	int error;
   4584 
   4585 	ASSERT(vp->v_type == VREG);
   4586 	if (cmd != F_FREESP)
   4587 		return (EINVAL);
   4588 
   4589 	if (offset > MAXOFF32_T)
   4590 		return (EFBIG);
   4591 
   4592 	if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
   4593 	    (bfp->l_len > MAXOFF32_T))
   4594 		return (EFBIG);
   4595 
   4596 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   4597 		return (EIO);
   4598 
   4599 	error = convoff(vp, bfp, 0, offset);
   4600 	if (!error) {
   4601 		ASSERT(bfp->l_start >= 0);
   4602 		if (bfp->l_len == 0) {
   4603 			struct vattr va;
   4604 
   4605 			/*
   4606 			 * ftruncate should not change the ctime and
   4607 			 * mtime if we truncate the file to its
   4608 			 * previous size.
   4609 			 */
   4610 			va.va_mask = AT_SIZE;
   4611 			error = nfsgetattr(vp, &va, cr);
   4612 			if (error || va.va_size == bfp->l_start)
   4613 				return (error);
   4614 			va.va_mask = AT_SIZE;
   4615 			va.va_size = bfp->l_start;
   4616 			error = nfssetattr(vp, &va, 0, cr);
   4617 		} else
   4618 			error = EINVAL;
   4619 	}
   4620 
   4621 	return (error);
   4622 }
   4623 
   4624 /* ARGSUSED */
   4625 static int
   4626 nfs_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
   4627 {
   4628 
   4629 	return (EINVAL);
   4630 }
   4631 
   4632 /*
   4633  * Setup and add an address space callback to do the work of the delmap call.
   4634  * The callback will (and must be) deleted in the actual callback function.
   4635  *
   4636  * This is done in order to take care of the problem that we have with holding
   4637  * the address space's a_lock for a long period of time (e.g. if the NFS server
   4638  * is down).  Callbacks will be executed in the address space code while the
   4639  * a_lock is not held.	Holding the address space's a_lock causes things such
   4640  * as ps and fork to hang because they are trying to acquire this lock as well.
   4641  */
   4642 /* ARGSUSED */
   4643 static int
   4644 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
   4645 	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
   4646 	caller_context_t *ct)
   4647 {
   4648 	int			caller_found;
   4649 	int			error;
   4650 	rnode_t			*rp;
   4651 	nfs_delmap_args_t	*dmapp;
   4652 	nfs_delmapcall_t	*delmap_call;
   4653 
   4654 	if (vp->v_flag & VNOMAP)
   4655 		return (ENOSYS);
   4656 	/*
   4657 	 * A process may not change zones if it has NFS pages mmap'ed
   4658 	 * in, so we can't legitimately get here from the wrong zone.
   4659 	 */
   4660 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
   4661 
   4662 	rp = VTOR(vp);
   4663 
   4664 	/*
   4665 	 * The way that the address space of this process deletes its mapping
   4666 	 * of this file is via the following call chains:
   4667 	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
   4668 	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
   4669 	 *
   4670 	 * With the use of address space callbacks we are allowed to drop the
   4671 	 * address space lock, a_lock, while executing the NFS operations that
   4672 	 * need to go over the wire.  Returning EAGAIN to the caller of this
   4673 	 * function is what drives the execution of the callback that we add
   4674 	 * below.  The callback will be executed by the address space code
   4675 	 * after dropping the a_lock.  When the callback is finished, since
   4676 	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
   4677 	 * is called again on the same segment to finish the rest of the work
   4678 	 * that needs to happen during unmapping.
   4679 	 *
   4680 	 * This action of calling back into the segment driver causes
   4681 	 * nfs_delmap() to get called again, but since the callback was
   4682 	 * already executed at this point, it already did the work and there
   4683 	 * is nothing left for us to do.
   4684 	 *
   4685 	 * To Summarize:
   4686 	 * - The first time nfs_delmap is called by the current thread is when
   4687 	 * we add the caller associated with this delmap to the delmap caller
   4688 	 * list, add the callback, and return EAGAIN.
   4689 	 * - The second time in this call chain when nfs_delmap is called we
   4690 	 * will find this caller in the delmap caller list and realize there
   4691 	 * is no more work to do thus removing this caller from the list and
   4692 	 * returning the error that was set in the callback execution.
   4693 	 */
   4694 	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
   4695 	if (caller_found) {
   4696 		/*
   4697 		 * 'error' is from the actual delmap operations.  To avoid
   4698 		 * hangs, we need to handle the return of EAGAIN differently
   4699 		 * since this is what drives the callback execution.
   4700 		 * In this case, we don't want to return EAGAIN and do the
   4701 		 * callback execution because there are none to execute.
   4702 		 */
   4703 		if (error == EAGAIN)
   4704 			return (0);
   4705 		else
   4706 			return (error);
   4707 	}
   4708 
   4709 	/* current caller was not in the list */
   4710 	delmap_call = nfs_init_delmapcall();
   4711 
   4712 	mutex_enter(&rp->r_statelock);
   4713 	list_insert_tail(&rp->r_indelmap, delmap_call);
   4714 	mutex_exit(&rp->r_statelock);
   4715 
   4716 	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
   4717 
   4718 	dmapp->vp = vp;
   4719 	dmapp->off = off;
   4720 	dmapp->addr = addr;
   4721 	dmapp->len = len;
   4722 	dmapp->prot = prot;
   4723 	dmapp->maxprot = maxprot;
   4724 	dmapp->flags = flags;
   4725 	dmapp->cr = cr;
   4726 	dmapp->caller = delmap_call;
   4727 
   4728 	error = as_add_callback(as, nfs_delmap_callback, dmapp,
   4729 	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
   4730 
   4731 	return (error ? error : EAGAIN);
   4732 }
   4733 
   4734 /*
   4735  * Remove some pages from an mmap'd vnode.  Just update the
   4736  * count of pages.  If doing close-to-open, then flush all
   4737  * of the pages associated with this file.  Otherwise, start
   4738  * an asynchronous page flush to write out any dirty pages.
   4739  * This will also associate a credential with the rnode which
   4740  * can be used to write the pages.
   4741  */
   4742 /* ARGSUSED */
   4743 static void
   4744 nfs_delmap_callback(struct as *as, void *arg, uint_t event)
   4745 {
   4746 	int			error;
   4747 	rnode_t			*rp;
   4748 	mntinfo_t		*mi;
   4749 	nfs_delmap_args_t	*dmapp = (nfs_delmap_args_t *)arg;
   4750 
   4751 	rp = VTOR(dmapp->vp);
   4752 	mi = VTOMI(dmapp->vp);
   4753 
   4754 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
   4755 	ASSERT(rp->r_mapcnt >= 0);
   4756 
   4757 	/*
   4758 	 * Initiate a page flush if there are pages, the file system
   4759 	 * was not mounted readonly, the segment was mapped shared, and
   4760 	 * the pages themselves were writeable.
   4761 	 */
   4762 	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
   4763 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
   4764 		mutex_enter(&rp->r_statelock);
   4765 		rp->r_flags |= RDIRTY;
   4766 		mutex_exit(&rp->r_statelock);
   4767 		/*
   4768 		 * If this is a cross-zone access a sync putpage won't work, so
   4769 		 * the best we can do is try an async putpage.  That seems
   4770 		 * better than something more draconian such as discarding the
   4771 		 * dirty pages.
   4772 		 */
   4773 		if ((mi->mi_flags & MI_NOCTO) ||
   4774 		    nfs_zone() != mi->mi_zone)
   4775 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
   4776 			    B_ASYNC, dmapp->cr, NULL);
   4777 		else
   4778 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
   4779 			    0, dmapp->cr, NULL);
   4780 		if (!error) {
   4781 			mutex_enter(&rp->r_statelock);
   4782 			error = rp->r_error;
   4783 			rp->r_error = 0;
   4784 			mutex_exit(&rp->r_statelock);
   4785 		}
   4786 	} else
   4787 		error = 0;
   4788 
   4789 	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
   4790 		(void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
   4791 		    B_INVAL, dmapp->cr, NULL);
   4792 
   4793 	dmapp->caller->error = error;
   4794 	(void) as_delete_callback(as, arg);
   4795 	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
   4796 }
   4797 
   4798 /* ARGSUSED */
   4799 static int
   4800 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
   4801 	caller_context_t *ct)
   4802 {
   4803 	int error = 0;
   4804 
   4805 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   4806 		return (EIO);
   4807 	/*
   4808 	 * This looks a little weird because it's written in a general
   4809 	 * manner but we make little use of cases.  If cntl() ever gets
   4810 	 * widely used, the outer switch will make more sense.
   4811 	 */
   4812 
   4813 	switch (cmd) {
   4814 
   4815 	/*
   4816 	 * Large file spec - need to base answer new query with
   4817 	 * hardcoded constant based on the protocol.
   4818 	 */
   4819 	case _PC_FILESIZEBITS:
   4820 		*valp = 32;
   4821 		return (0);
   4822 
   4823 	case _PC_LINK_MAX:
   4824 	case _PC_NAME_MAX:
   4825 	case _PC_PATH_MAX:
   4826 	case _PC_SYMLINK_MAX:
   4827 	case _PC_CHOWN_RESTRICTED:
   4828 	case _PC_NO_TRUNC: {
   4829 		mntinfo_t *mi;
   4830 		struct pathcnf *pc;
   4831 
   4832 		if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
   4833 			return (EINVAL);
   4834 		error = _PC_ISSET(cmd, pc->pc_mask);    /* error or bool */
   4835 		switch (cmd) {
   4836 		case _PC_LINK_MAX:
   4837 			*valp = pc->pc_link_max;
   4838 			break;
   4839 		case _PC_NAME_MAX:
   4840 			*valp = pc->pc_name_max;
   4841 			break;
   4842 		case _PC_PATH_MAX:
   4843 		case _PC_SYMLINK_MAX:
   4844 			*valp = pc->pc_path_max;
   4845 			break;
   4846 		case _PC_CHOWN_RESTRICTED:
   4847 			/*
   4848 			 * if we got here, error is really a boolean which
   4849 			 * indicates whether cmd is set or not.
   4850 			 */
   4851 			*valp = error ? 1 : 0;	/* see above */
   4852 			error = 0;
   4853 			break;
   4854 		case _PC_NO_TRUNC:
   4855 			/*
   4856 			 * if we got here, error is really a boolean which
   4857 			 * indicates whether cmd is set or not.
   4858 			 */
   4859 			*valp = error ? 1 : 0;	/* see above */
   4860 			error = 0;
   4861 			break;
   4862 		}
   4863 		return (error ? EINVAL : 0);
   4864 		}
   4865 
   4866 	case _PC_XATTR_EXISTS:
   4867 		*valp = 0;
   4868 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
   4869 			vnode_t *avp;
   4870 			rnode_t *rp;
   4871 			mntinfo_t *mi = VTOMI(vp);
   4872 
   4873 			if (!(mi->mi_flags & MI_EXTATTR))
   4874 				return (0);
   4875 
   4876 			rp = VTOR(vp);
   4877 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
   4878 			    INTR(vp)))
   4879 				return (EINTR);
   4880 
   4881 			error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
   4882 			if (error || avp == NULL)
   4883 				error = acl_getxattrdir2(vp, &avp, 0, cr, 0);
   4884 
   4885 			nfs_rw_exit(&rp->r_rwlock);
   4886 
   4887 			if (error == 0 && avp != NULL) {
   4888 				error = do_xattr_exists_check(avp, valp, cr);
   4889 				VN_RELE(avp);
   4890 			}
   4891 		}
   4892 		return (error ? EINVAL : 0);
   4893 
   4894 	case _PC_ACL_ENABLED:
   4895 		*valp = _ACL_ACLENT_ENABLED;
   4896 		return (0);
   4897 
   4898 	default:
   4899 		return (EINVAL);
   4900 	}
   4901 }
   4902 
   4903 /*
   4904  * Called by async thread to do synchronous pageio. Do the i/o, wait
   4905  * for it to complete, and cleanup the page list when done.
   4906  */
   4907 static int
   4908 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
   4909 	int flags, cred_t *cr)
   4910 {
   4911 	int error;
   4912 
   4913 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
   4914 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
   4915 	if (flags & B_READ)
   4916 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
   4917 	else
   4918 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
   4919 	return (error);
   4920 }
   4921 
   4922 /* ARGSUSED */
   4923 static int
   4924 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
   4925 	int flags, cred_t *cr, caller_context_t *ct)
   4926 {
   4927 	int error;
   4928 	rnode_t *rp;
   4929 
   4930 	if (pp == NULL)
   4931 		return (EINVAL);
   4932 
   4933 	if (io_off > MAXOFF32_T)
   4934 		return (EFBIG);
   4935 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   4936 		return (EIO);
   4937 	rp = VTOR(vp);
   4938 	mutex_enter(&rp->r_statelock);
   4939 	rp->r_count++;
   4940 	mutex_exit(&rp->r_statelock);
   4941 
   4942 	if (flags & B_ASYNC) {
   4943 		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
   4944 		    nfs_sync_pageio);
   4945 	} else
   4946 		error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
   4947 	mutex_enter(&rp->r_statelock);
   4948 	rp->r_count--;
   4949 	cv_broadcast(&rp->r_cv);
   4950 	mutex_exit(&rp->r_statelock);
   4951 	return (error);
   4952 }
   4953 
   4954 /* ARGSUSED */
   4955 static int
   4956 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
   4957 	caller_context_t *ct)
   4958 {
   4959 	int error;
   4960 	mntinfo_t *mi;
   4961 
   4962 	mi = VTOMI(vp);
   4963 
   4964 	if (nfs_zone() != mi->mi_zone)
   4965 		return (EIO);
   4966 	if (mi->mi_flags & MI_ACL) {
   4967 		error = acl_setacl2(vp, vsecattr, flag, cr);
   4968 		if (mi->mi_flags & MI_ACL)
   4969 			return (error);
   4970 	}
   4971 
   4972 	return (ENOSYS);
   4973 }
   4974 
   4975 /* ARGSUSED */
   4976 static int
   4977 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
   4978 	caller_context_t *ct)
   4979 {
   4980 	int error;
   4981 	mntinfo_t *mi;
   4982 
   4983 	mi = VTOMI(vp);
   4984 
   4985 	if (nfs_zone() != mi->mi_zone)
   4986 		return (EIO);
   4987 	if (mi->mi_flags & MI_ACL) {
   4988 		error = acl_getacl2(vp, vsecattr, flag, cr);
   4989 		if (mi->mi_flags & MI_ACL)
   4990 			return (error);
   4991 	}
   4992 
   4993 	return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
   4994 }
   4995 
   4996 /* ARGSUSED */
   4997 static int
   4998 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
   4999 	caller_context_t *ct)
   5000 {
   5001 	int error;
   5002 	struct shrlock nshr;
   5003 	struct nfs_owner nfs_owner;
   5004 	netobj lm_fh;
   5005 
   5006 	if (nfs_zone() != VTOMI(vp)->mi_zone)
   5007 		return (EIO);
   5008 
   5009 	/*
   5010 	 * check for valid cmd parameter
   5011 	 */
   5012 	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
   5013 		return (EINVAL);
   5014 
   5015 	/*
   5016 	 * Check access permissions
   5017 	 */
   5018 	if (cmd == F_SHARE &&
   5019 	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
   5020 	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
   5021 		return (EBADF);
   5022 
   5023 	/*
   5024 	 * If the filesystem is mounted using local locking, pass the
   5025 	 * request off to the local share code.
   5026 	 */
   5027 	if (VTOMI(vp)->mi_flags & MI_LLOCK)
   5028 		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
   5029 
   5030 	switch (cmd) {
   5031 	case F_SHARE:
   5032 	case F_UNSHARE:
   5033 		lm_fh.n_len = sizeof (fhandle_t);
   5034 		lm_fh.n_bytes = (char *)VTOFH(vp);
   5035 
   5036 		/*
   5037 		 * If passed an owner that is too large to fit in an
   5038 		 * nfs_owner it is likely a recursive call from the
   5039 		 * lock manager client and pass it straight through.  If
   5040 		 * it is not a nfs_owner then simply return an error.
   5041 		 */
   5042 		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
   5043 			if (((struct nfs_owner *)shr->s_owner)->magic !=
   5044 			    NFS_OWNER_MAGIC)
   5045 				return (EINVAL);
   5046 
   5047 			if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
   5048 				error = set_errno(error);
   5049 			}
   5050 			return (error);
   5051 		}
   5052 		/*
   5053 		 * Remote share reservations owner is a combination of
   5054 		 * a magic number, hostname, and the local owner
   5055 		 */
   5056 		bzero(&nfs_owner, sizeof (nfs_owner));
   5057 		nfs_owner.magic = NFS_OWNER_MAGIC;
   5058 		(void) strncpy(nfs_owner.hname, uts_nodename(),
   5059 		    sizeof (nfs_owner.hname));
   5060 		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
   5061 		nshr.s_access = shr->s_access;
   5062 		nshr.s_deny = shr->s_deny;
   5063 		nshr.s_sysid = 0;
   5064 		nshr.s_pid = ttoproc(curthread)->p_pid;
   5065 		nshr.s_own_len = sizeof (nfs_owner);
   5066 		nshr.s_owner = (caddr_t)&nfs_owner;
   5067 
   5068 		if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
   5069 			error = set_errno(error);
   5070 		}
   5071 
   5072 		break;
   5073 
   5074 	case F_HASREMOTELOCKS:
   5075 		/*
   5076 		 * NFS client can't store remote locks itself
   5077 		 */
   5078 		shr->s_access = 0;
   5079 		error = 0;
   5080 		break;
   5081 
   5082 	default:
   5083 		error = EINVAL;
   5084 		break;
   5085 	}
   5086 
   5087 	return (error);
   5088 }
   5089