Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
     28  *	All rights reserved.
     29  */
     30 
     31 #include <sys/param.h>
     32 #include <sys/types.h>
     33 #include <sys/systm.h>
     34 #include <sys/cred.h>
     35 #include <sys/buf.h>
     36 #include <sys/vfs.h>
     37 #include <sys/vnode.h>
     38 #include <sys/uio.h>
     39 #include <sys/stat.h>
     40 #include <sys/errno.h>
     41 #include <sys/sysmacros.h>
     42 #include <sys/statvfs.h>
     43 #include <sys/kmem.h>
     44 #include <sys/kstat.h>
     45 #include <sys/dirent.h>
     46 #include <sys/cmn_err.h>
     47 #include <sys/debug.h>
     48 #include <sys/vtrace.h>
     49 #include <sys/mode.h>
     50 #include <sys/acl.h>
     51 #include <sys/nbmlock.h>
     52 #include <sys/policy.h>
     53 #include <sys/sdt.h>
     54 
     55 #include <rpc/types.h>
     56 #include <rpc/auth.h>
     57 #include <rpc/svc.h>
     58 
     59 #include <nfs/nfs.h>
     60 #include <nfs/export.h>
     61 #include <nfs/nfs_cmd.h>
     62 
     63 #include <vm/hat.h>
     64 #include <vm/as.h>
     65 #include <vm/seg.h>
     66 #include <vm/seg_map.h>
     67 #include <vm/seg_kmem.h>
     68 
     69 #include <sys/strsubr.h>
     70 
     71 /*
     72  * These are the interface routines for the server side of the
     73  * Network File System.  See the NFS version 2 protocol specification
     74  * for a description of this interface.
     75  */
     76 
/*
 * Forward declarations of file-local helpers called by the request
 * handlers below.  Their definitions are not in this part of the file.
 */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);
     80 
/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 *
 * The values are octal bit patterns; IFMT is the mask that covers the
 * file type bits.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */
     89 
/*
 * Caller id that the handlers in this file store in cc_caller_id of the
 * caller_context_t they pass to VOP calls.  It is not assigned in this
 * part of the file.
 */
u_longlong_t nfs2_srv_caller_id;
     91 
     92 /*
     93  * Get file attributes.
     94  * Returns the current attributes of the file with the given fhandle.
     95  */
     96 /* ARGSUSED */
     97 void
     98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
     99 	struct svc_req *req, cred_t *cr)
    100 {
    101 	int error;
    102 	vnode_t *vp;
    103 	struct vattr va;
    104 
    105 	vp = nfs_fhtovp(fhp, exi);
    106 	if (vp == NULL) {
    107 		ns->ns_status = NFSERR_STALE;
    108 		return;
    109 	}
    110 
    111 	/*
    112 	 * Do the getattr.
    113 	 */
    114 	va.va_mask = AT_ALL;	/* we want all the attributes */
    115 
    116 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
    117 
    118 	/* check for overflows */
    119 	if (!error) {
    120 		acl_perm(vp, exi, &va, cr);
    121 		error = vattr_to_nattr(&va, &ns->ns_attr);
    122 	}
    123 
    124 	VN_RELE(vp);
    125 
    126 	ns->ns_status = puterrno(error);
    127 }
    128 void *
    129 rfs_getattr_getfh(fhandle_t *fhp)
    130 {
    131 	return (fhp);
    132 }
    133 
    134 /*
    135  * Set file attributes.
    136  * Sets the attributes of the file with the given fhandle.  Returns
    137  * the new attributes.
    138  */
    139 void
    140 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    141 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
    142 {
    143 	int error;
    144 	int flag;
    145 	int in_crit = 0;
    146 	vnode_t *vp;
    147 	struct vattr va;
    148 	struct vattr bva;
    149 	struct flock64 bf;
    150 	caller_context_t ct;
    151 
    152 
    153 	vp = nfs_fhtovp(&args->saa_fh, exi);
    154 	if (vp == NULL) {
    155 		ns->ns_status = NFSERR_STALE;
    156 		return;
    157 	}
    158 
    159 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
    160 		VN_RELE(vp);
    161 		ns->ns_status = NFSERR_ROFS;
    162 		return;
    163 	}
    164 
    165 	error = sattr_to_vattr(&args->saa_sa, &va);
    166 	if (error) {
    167 		VN_RELE(vp);
    168 		ns->ns_status = puterrno(error);
    169 		return;
    170 	}
    171 
    172 	/*
    173 	 * If the client is requesting a change to the mtime,
    174 	 * but the nanosecond field is set to 1 billion, then
    175 	 * this is a flag to the server that it should set the
    176 	 * atime and mtime fields to the server's current time.
    177 	 * The 1 billion number actually came from the client
    178 	 * as 1 million, but the units in the over the wire
    179 	 * request are microseconds instead of nanoseconds.
    180 	 *
    181 	 * This is an overload of the protocol and should be
    182 	 * documented in the NFS Version 2 protocol specification.
    183 	 */
    184 	if (va.va_mask & AT_MTIME) {
    185 		if (va.va_mtime.tv_nsec == 1000000000) {
    186 			gethrestime(&va.va_mtime);
    187 			va.va_atime = va.va_mtime;
    188 			va.va_mask |= AT_ATIME;
    189 			flag = 0;
    190 		} else
    191 			flag = ATTR_UTIME;
    192 	} else
    193 		flag = 0;
    194 
    195 	/*
    196 	 * If the filesystem is exported with nosuid, then mask off
    197 	 * the setuid and setgid bits.
    198 	 */
    199 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
    200 	    (exi->exi_export.ex_flags & EX_NOSUID))
    201 		va.va_mode &= ~(VSUID | VSGID);
    202 
    203 	ct.cc_sysid = 0;
    204 	ct.cc_pid = 0;
    205 	ct.cc_caller_id = nfs2_srv_caller_id;
    206 	ct.cc_flags = CC_DONTBLOCK;
    207 
    208 	/*
    209 	 * We need to specially handle size changes because it is
    210 	 * possible for the client to create a file with modes
    211 	 * which indicate read-only, but with the file opened for
    212 	 * writing.  If the client then tries to set the size of
    213 	 * the file, then the normal access checking done in
    214 	 * VOP_SETATTR would prevent the client from doing so,
    215 	 * although it should be legal for it to do so.  To get
    216 	 * around this, we do the access checking for ourselves
    217 	 * and then use VOP_SPACE which doesn't do the access
    218 	 * checking which VOP_SETATTR does. VOP_SPACE can only
    219 	 * operate on VREG files, let VOP_SETATTR handle the other
    220 	 * extremely rare cases.
    221 	 * Also the client should not be allowed to change the
    222 	 * size of the file if there is a conflicting non-blocking
    223 	 * mandatory lock in the region of change.
    224 	 */
    225 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
    226 		if (nbl_need_check(vp)) {
    227 			nbl_start_crit(vp, RW_READER);
    228 			in_crit = 1;
    229 		}
    230 
    231 		bva.va_mask = AT_UID | AT_SIZE;
    232 
    233 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
    234 
    235 		if (error) {
    236 			if (in_crit)
    237 				nbl_end_crit(vp);
    238 			VN_RELE(vp);
    239 			ns->ns_status = puterrno(error);
    240 			return;
    241 		}
    242 
    243 		if (in_crit) {
    244 			u_offset_t offset;
    245 			ssize_t length;
    246 
    247 			if (va.va_size < bva.va_size) {
    248 				offset = va.va_size;
    249 				length = bva.va_size - va.va_size;
    250 			} else {
    251 				offset = bva.va_size;
    252 				length = va.va_size - bva.va_size;
    253 			}
    254 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
    255 			    NULL)) {
    256 				error = EACCES;
    257 			}
    258 		}
    259 
    260 		if (crgetuid(cr) == bva.va_uid && !error &&
    261 		    va.va_size != bva.va_size) {
    262 			va.va_mask &= ~AT_SIZE;
    263 			bf.l_type = F_WRLCK;
    264 			bf.l_whence = 0;
    265 			bf.l_start = (off64_t)va.va_size;
    266 			bf.l_len = 0;
    267 			bf.l_sysid = 0;
    268 			bf.l_pid = 0;
    269 
    270 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
    271 			    (offset_t)va.va_size, cr, &ct);
    272 		}
    273 		if (in_crit)
    274 			nbl_end_crit(vp);
    275 	} else
    276 		error = 0;
    277 
    278 	/*
    279 	 * Do the setattr.
    280 	 */
    281 	if (!error && va.va_mask) {
    282 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
    283 	}
    284 
    285 	/*
    286 	 * check if the monitor on either vop_space or vop_setattr detected
    287 	 * a delegation conflict and if so, mark the thread flag as
    288 	 * wouldblock so that the response is dropped and the client will
    289 	 * try again.
    290 	 */
    291 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
    292 		VN_RELE(vp);
    293 		curthread->t_flag |= T_WOULDBLOCK;
    294 		return;
    295 	}
    296 
    297 	if (!error) {
    298 		va.va_mask = AT_ALL;	/* get everything */
    299 
    300 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
    301 
    302 		/* check for overflows */
    303 		if (!error) {
    304 			acl_perm(vp, exi, &va, cr);
    305 			error = vattr_to_nattr(&va, &ns->ns_attr);
    306 		}
    307 	}
    308 
    309 	ct.cc_flags = 0;
    310 
    311 	/*
    312 	 * Force modified metadata out to stable storage.
    313 	 */
    314 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
    315 
    316 	VN_RELE(vp);
    317 
    318 	ns->ns_status = puterrno(error);
    319 }
    320 void *
    321 rfs_setattr_getfh(struct nfssaargs *args)
    322 {
    323 	return (&args->saa_fh);
    324 }
    325 
    326 /*
    327  * Directory lookup.
    328  * Returns an fhandle and file attributes for file name in a directory.
    329  */
    330 /* ARGSUSED */
    331 void
    332 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    333 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
    334 {
    335 	int error;
    336 	vnode_t *dvp;
    337 	vnode_t *vp;
    338 	struct vattr va;
    339 	fhandle_t *fhp = da->da_fhandle;
    340 	struct sec_ol sec = {0, 0};
    341 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
    342 	char *name;
    343 	struct sockaddr *ca;
    344 
    345 	/*
    346 	 * Trusted Extension doesn't support NFSv2. MOUNT
    347 	 * will reject v2 clients. Need to prevent v2 client
    348 	 * access via WebNFS here.
    349 	 */
    350 	if (is_system_labeled() && req->rq_vers == 2) {
    351 		dr->dr_status = NFSERR_ACCES;
    352 		return;
    353 	}
    354 
    355 	/*
    356 	 * Disallow NULL paths
    357 	 */
    358 	if (da->da_name == NULL || *da->da_name == '\0') {
    359 		dr->dr_status = NFSERR_ACCES;
    360 		return;
    361 	}
    362 
    363 	/*
    364 	 * Allow lookups from the root - the default
    365 	 * location of the public filehandle.
    366 	 */
    367 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
    368 		dvp = rootdir;
    369 		VN_HOLD(dvp);
    370 	} else {
    371 		dvp = nfs_fhtovp(fhp, exi);
    372 		if (dvp == NULL) {
    373 			dr->dr_status = NFSERR_STALE;
    374 			return;
    375 		}
    376 	}
    377 
    378 	/*
    379 	 * Not allow lookup beyond root.
    380 	 * If the filehandle matches a filehandle of the exi,
    381 	 * then the ".." refers beyond the root of an exported filesystem.
    382 	 */
    383 	if (strcmp(da->da_name, "..") == 0 &&
    384 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
    385 		VN_RELE(dvp);
    386 		dr->dr_status = NFSERR_NOENT;
    387 		return;
    388 	}
    389 
    390 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
    391 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
    392 	    MAXPATHLEN);
    393 
    394 	if (name == NULL) {
    395 		dr->dr_status = NFSERR_ACCES;
    396 		return;
    397 	}
    398 
    399 	/*
    400 	 * If the public filehandle is used then allow
    401 	 * a multi-component lookup, i.e. evaluate
    402 	 * a pathname and follow symbolic links if
    403 	 * necessary.
    404 	 *
    405 	 * This may result in a vnode in another filesystem
    406 	 * which is OK as long as the filesystem is exported.
    407 	 */
    408 	if (PUBLIC_FH2(fhp)) {
    409 		publicfh_flag = TRUE;
    410 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
    411 		    &sec);
    412 	} else {
    413 		/*
    414 		 * Do a normal single component lookup.
    415 		 */
    416 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
    417 		    NULL, NULL, NULL);
    418 	}
    419 
    420 	if (name != da->da_name)
    421 		kmem_free(name, MAXPATHLEN);
    422 
    423 
    424 	if (!error) {
    425 		va.va_mask = AT_ALL;	/* we want everything */
    426 
    427 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
    428 
    429 		/* check for overflows */
    430 		if (!error) {
    431 			acl_perm(vp, exi, &va, cr);
    432 			error = vattr_to_nattr(&va, &dr->dr_attr);
    433 			if (!error) {
    434 				if (sec.sec_flags & SEC_QUERY)
    435 					error = makefh_ol(&dr->dr_fhandle, exi,
    436 					    sec.sec_index);
    437 				else {
    438 					error = makefh(&dr->dr_fhandle, vp,
    439 					    exi);
    440 					if (!error && publicfh_flag &&
    441 					    !chk_clnt_sec(exi, req))
    442 						auth_weak = TRUE;
    443 				}
    444 			}
    445 		}
    446 		VN_RELE(vp);
    447 	}
    448 
    449 	VN_RELE(dvp);
    450 
    451 	/*
    452 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
    453 	 * and have obtained a new exportinfo in exi which needs to be
    454 	 * released. Note the the original exportinfo pointed to by exi
    455 	 * will be released by the caller, comon_dispatch.
    456 	 */
    457 	if (publicfh_flag && exi != NULL)
    458 		exi_rele(exi);
    459 
    460 	/*
    461 	 * If it's public fh, no 0x81, and client's flavor is
    462 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
    463 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
    464 	 */
    465 	if (auth_weak)
    466 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
    467 	else
    468 		dr->dr_status = puterrno(error);
    469 }
    470 void *
    471 rfs_lookup_getfh(struct nfsdiropargs *da)
    472 {
    473 	return (da->da_fhandle);
    474 }
    475 
    476 /*
    477  * Read symbolic link.
    478  * Returns the string in the symbolic link at the given fhandle.
    479  */
    480 /* ARGSUSED */
    481 void
    482 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    483 	struct svc_req *req, cred_t *cr)
    484 {
    485 	int error;
    486 	struct iovec iov;
    487 	struct uio uio;
    488 	vnode_t *vp;
    489 	struct vattr va;
    490 	struct sockaddr *ca;
    491 	char *name = NULL;
    492 
    493 	vp = nfs_fhtovp(fhp, exi);
    494 	if (vp == NULL) {
    495 		rl->rl_data = NULL;
    496 		rl->rl_status = NFSERR_STALE;
    497 		return;
    498 	}
    499 
    500 	va.va_mask = AT_MODE;
    501 
    502 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
    503 
    504 	if (error) {
    505 		VN_RELE(vp);
    506 		rl->rl_data = NULL;
    507 		rl->rl_status = puterrno(error);
    508 		return;
    509 	}
    510 
    511 	if (MANDLOCK(vp, va.va_mode)) {
    512 		VN_RELE(vp);
    513 		rl->rl_data = NULL;
    514 		rl->rl_status = NFSERR_ACCES;
    515 		return;
    516 	}
    517 
    518 	/*
    519 	 * XNFS and RFC1094 require us to return ENXIO if argument
    520 	 * is not a link. BUGID 1138002.
    521 	 */
    522 	if (vp->v_type != VLNK) {
    523 		VN_RELE(vp);
    524 		rl->rl_data = NULL;
    525 		rl->rl_status = NFSERR_NXIO;
    526 		return;
    527 	}
    528 
    529 	/*
    530 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
    531 	 */
    532 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
    533 
    534 	/*
    535 	 * Set up io vector to read sym link data
    536 	 */
    537 	iov.iov_base = rl->rl_data;
    538 	iov.iov_len = NFS_MAXPATHLEN;
    539 	uio.uio_iov = &iov;
    540 	uio.uio_iovcnt = 1;
    541 	uio.uio_segflg = UIO_SYSSPACE;
    542 	uio.uio_extflg = UIO_COPY_CACHED;
    543 	uio.uio_loffset = (offset_t)0;
    544 	uio.uio_resid = NFS_MAXPATHLEN;
    545 
    546 	/*
    547 	 * Do the readlink.
    548 	 */
    549 	error = VOP_READLINK(vp, &uio, cr, NULL);
    550 
    551 	VN_RELE(vp);
    552 
    553 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
    554 	rl->rl_data[rl->rl_count] = '\0';
    555 
    556 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
    557 	name = nfscmd_convname(ca, exi, rl->rl_data,
    558 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
    559 
    560 	if (name != NULL && name != rl->rl_data) {
    561 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
    562 		rl->rl_data = name;
    563 	}
    564 
    565 	/*
    566 	 * XNFS and RFC1094 require us to return ENXIO if argument
    567 	 * is not a link. UFS returns EINVAL if this is the case,
    568 	 * so we do the mapping here. BUGID 1138002.
    569 	 */
    570 	if (error == EINVAL)
    571 		rl->rl_status = NFSERR_NXIO;
    572 	else
    573 		rl->rl_status = puterrno(error);
    574 
    575 }
    576 void *
    577 rfs_readlink_getfh(fhandle_t *fhp)
    578 {
    579 	return (fhp);
    580 }
    581 /*
    582  * Free data allocated by rfs_readlink
    583  */
    584 void
    585 rfs_rlfree(struct nfsrdlnres *rl)
    586 {
    587 	if (rl->rl_data != NULL)
    588 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
    589 }
    590 
/*
 * Forward declaration; rfs_read() calls this on the RDMA write-list
 * path and treats a zero return as failure.  The definition is not in
 * this part of the file.
 */
static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
    592 
    593 /*
    594  * Read data.
    595  * Returns some data read from the file at the given fhandle.
    596  */
    597 /* ARGSUSED */
    598 void
    599 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    600 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
    601 {
    602 	vnode_t *vp;
    603 	int error;
    604 	struct vattr va;
    605 	struct iovec iov;
    606 	struct uio uio;
    607 	mblk_t *mp;
    608 	int alloc_err = 0;
    609 	int in_crit = 0;
    610 	caller_context_t ct;
    611 
    612 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
    613 	if (vp == NULL) {
    614 		rr->rr_data = NULL;
    615 		rr->rr_status = NFSERR_STALE;
    616 		return;
    617 	}
    618 
    619 	if (vp->v_type != VREG) {
    620 		VN_RELE(vp);
    621 		rr->rr_data = NULL;
    622 		rr->rr_status = NFSERR_ISDIR;
    623 		return;
    624 	}
    625 
    626 	ct.cc_sysid = 0;
    627 	ct.cc_pid = 0;
    628 	ct.cc_caller_id = nfs2_srv_caller_id;
    629 	ct.cc_flags = CC_DONTBLOCK;
    630 
    631 	/*
    632 	 * Enter the critical region before calling VOP_RWLOCK
    633 	 * to avoid a deadlock with write requests.
    634 	 */
    635 	if (nbl_need_check(vp)) {
    636 		nbl_start_crit(vp, RW_READER);
    637 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
    638 		    0, NULL)) {
    639 			nbl_end_crit(vp);
    640 			VN_RELE(vp);
    641 			rr->rr_data = NULL;
    642 			rr->rr_status = NFSERR_ACCES;
    643 			return;
    644 		}
    645 		in_crit = 1;
    646 	}
    647 
    648 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
    649 
    650 	/* check if a monitor detected a delegation conflict */
    651 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
    652 		VN_RELE(vp);
    653 		/* mark as wouldblock so response is dropped */
    654 		curthread->t_flag |= T_WOULDBLOCK;
    655 
    656 		rr->rr_data = NULL;
    657 		return;
    658 	}
    659 
    660 	va.va_mask = AT_ALL;
    661 
    662 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
    663 
    664 	if (error) {
    665 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    666 		if (in_crit)
    667 			nbl_end_crit(vp);
    668 
    669 		VN_RELE(vp);
    670 		rr->rr_data = NULL;
    671 		rr->rr_status = puterrno(error);
    672 
    673 		return;
    674 	}
    675 
    676 	/*
    677 	 * This is a kludge to allow reading of files created
    678 	 * with no read permission.  The owner of the file
    679 	 * is always allowed to read it.
    680 	 */
    681 	if (crgetuid(cr) != va.va_uid) {
    682 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
    683 
    684 		if (error) {
    685 			/*
    686 			 * Exec is the same as read over the net because
    687 			 * of demand loading.
    688 			 */
    689 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
    690 		}
    691 		if (error) {
    692 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    693 			if (in_crit)
    694 				nbl_end_crit(vp);
    695 			VN_RELE(vp);
    696 			rr->rr_data = NULL;
    697 			rr->rr_status = puterrno(error);
    698 
    699 			return;
    700 		}
    701 	}
    702 
    703 	if (MANDLOCK(vp, va.va_mode)) {
    704 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    705 		if (in_crit)
    706 			nbl_end_crit(vp);
    707 
    708 		VN_RELE(vp);
    709 		rr->rr_data = NULL;
    710 		rr->rr_status = NFSERR_ACCES;
    711 
    712 		return;
    713 	}
    714 
    715 	rr->rr_ok.rrok_wlist_len = 0;
    716 	rr->rr_ok.rrok_wlist = NULL;
    717 
    718 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
    719 		rr->rr_count = 0;
    720 		rr->rr_data = NULL;
    721 		/*
    722 		 * In this case, status is NFS_OK, but there is no data
    723 		 * to encode. So set rr_mp to NULL.
    724 		 */
    725 		rr->rr_mp = NULL;
    726 		rr->rr_ok.rrok_wlist = ra->ra_wlist;
    727 		if (rr->rr_ok.rrok_wlist)
    728 			clist_zero_len(rr->rr_ok.rrok_wlist);
    729 		goto done;
    730 	}
    731 
    732 	if (ra->ra_wlist) {
    733 		mp = NULL;
    734 		rr->rr_mp = NULL;
    735 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
    736 	} else {
    737 		/*
    738 		 * mp will contain the data to be sent out in the read reply.
    739 		 * This will be freed after the reply has been sent out (by the
    740 		 * driver).
    741 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
    742 		 * that the call to xdrmblk_putmblk() never fails.
    743 		 */
    744 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
    745 		    &alloc_err);
    746 		ASSERT(mp != NULL);
    747 		ASSERT(alloc_err == 0);
    748 
    749 		rr->rr_mp = mp;
    750 
    751 		/*
    752 		 * Set up io vector
    753 		 */
    754 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
    755 		iov.iov_len = ra->ra_count;
    756 	}
    757 
    758 	uio.uio_iov = &iov;
    759 	uio.uio_iovcnt = 1;
    760 	uio.uio_segflg = UIO_SYSSPACE;
    761 	uio.uio_extflg = UIO_COPY_CACHED;
    762 	uio.uio_loffset = (offset_t)ra->ra_offset;
    763 	uio.uio_resid = ra->ra_count;
    764 
    765 	error = VOP_READ(vp, &uio, 0, cr, &ct);
    766 
    767 	if (error) {
    768 		if (mp)
    769 			freeb(mp);
    770 
    771 		/*
    772 		 * check if a monitor detected a delegation conflict and
    773 		 * mark as wouldblock so response is dropped
    774 		 */
    775 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
    776 			curthread->t_flag |= T_WOULDBLOCK;
    777 		else
    778 			rr->rr_status = puterrno(error);
    779 
    780 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    781 		if (in_crit)
    782 			nbl_end_crit(vp);
    783 
    784 		VN_RELE(vp);
    785 		rr->rr_data = NULL;
    786 
    787 		return;
    788 	}
    789 
    790 	/*
    791 	 * Get attributes again so we can send the latest access
    792 	 * time to the client side for his cache.
    793 	 */
    794 	va.va_mask = AT_ALL;
    795 
    796 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
    797 
    798 	if (error) {
    799 		if (mp)
    800 			freeb(mp);
    801 
    802 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    803 		if (in_crit)
    804 			nbl_end_crit(vp);
    805 
    806 		VN_RELE(vp);
    807 		rr->rr_data = NULL;
    808 		rr->rr_status = puterrno(error);
    809 
    810 		return;
    811 	}
    812 
    813 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
    814 
    815 	if (mp) {
    816 		rr->rr_data = (char *)mp->b_datap->db_base;
    817 	} else {
    818 		if (ra->ra_wlist) {
    819 			rr->rr_data = (caddr_t)iov.iov_base;
    820 			if (!rdma_setup_read_data2(ra, rr)) {
    821 				rr->rr_data = NULL;
    822 				rr->rr_status = puterrno(NFSERR_INVAL);
    823 			}
    824 		}
    825 	}
    826 done:
    827 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    828 	if (in_crit)
    829 		nbl_end_crit(vp);
    830 
    831 	acl_perm(vp, exi, &va, cr);
    832 
    833 	/* check for overflows */
    834 	error = vattr_to_nattr(&va, &rr->rr_attr);
    835 
    836 	VN_RELE(vp);
    837 
    838 	rr->rr_status = puterrno(error);
    839 }
    840 
    841 /*
    842  * Free data allocated by rfs_read
    843  */
    844 void
    845 rfs_rdfree(struct nfsrdresult *rr)
    846 {
    847 	mblk_t *mp;
    848 
    849 	if (rr->rr_status == NFS_OK) {
    850 		mp = rr->rr_mp;
    851 		if (mp != NULL)
    852 			freeb(mp);
    853 	}
    854 }
    855 
    856 void *
    857 rfs_read_getfh(struct nfsreadargs *ra)
    858 {
    859 	return (&ra->ra_fhandle);
    860 }
    861 
/*
 * Size of the iovec array rfs_write_sync() keeps on its stack.  An mblk
 * chain with more segments than this gets a kmem_alloc()ed array.
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/*
 * DEBUG-only counters kept by rfs_write_sync(): "hits" counts mblk-chain
 * writes that fit in the on-stack iovec array, "misses" those that
 * needed an allocated one.
 */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
    868 
    869 /*
    870  * Write data to file.
    871  * Returns attributes of a file after writing some data to it.
    872  *
    873  * Any changes made here, especially in error handling might have
    874  * to also be done in rfs_write (which clusters write requests).
    875  */
    876 void
    877 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
    878 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
    879 {
    880 	int error;
    881 	vnode_t *vp;
    882 	rlim64_t rlimit;
    883 	struct vattr va;
    884 	struct uio uio;
    885 	struct iovec iov[MAX_IOVECS];
    886 	mblk_t *m;
    887 	struct iovec *iovp;
    888 	int iovcnt;
    889 	cred_t *savecred;
    890 	int in_crit = 0;
    891 	caller_context_t ct;
    892 
    893 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
    894 	if (vp == NULL) {
    895 		ns->ns_status = NFSERR_STALE;
    896 		return;
    897 	}
    898 
    899 	if (rdonly(exi, req)) {
    900 		VN_RELE(vp);
    901 		ns->ns_status = NFSERR_ROFS;
    902 		return;
    903 	}
    904 
    905 	if (vp->v_type != VREG) {
    906 		VN_RELE(vp);
    907 		ns->ns_status = NFSERR_ISDIR;
    908 		return;
    909 	}
    910 
    911 	ct.cc_sysid = 0;
    912 	ct.cc_pid = 0;
    913 	ct.cc_caller_id = nfs2_srv_caller_id;
    914 	ct.cc_flags = CC_DONTBLOCK;
    915 
    916 	va.va_mask = AT_UID|AT_MODE;
    917 
    918 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
    919 
    920 	if (error) {
    921 		VN_RELE(vp);
    922 		ns->ns_status = puterrno(error);
    923 
    924 		return;
    925 	}
    926 
    927 	if (crgetuid(cr) != va.va_uid) {
    928 		/*
    929 		 * This is a kludge to allow writes of files created
    930 		 * with read only permission.  The owner of the file
    931 		 * is always allowed to write it.
    932 		 */
    933 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
    934 
    935 		if (error) {
    936 			VN_RELE(vp);
    937 			ns->ns_status = puterrno(error);
    938 			return;
    939 		}
    940 	}
    941 
    942 	/*
    943 	 * Can't access a mandatory lock file.  This might cause
    944 	 * the NFS service thread to block forever waiting for a
    945 	 * lock to be released that will never be released.
    946 	 */
    947 	if (MANDLOCK(vp, va.va_mode)) {
    948 		VN_RELE(vp);
    949 		ns->ns_status = NFSERR_ACCES;
    950 		return;
    951 	}
    952 
    953 	/*
    954 	 * We have to enter the critical region before calling VOP_RWLOCK
    955 	 * to avoid a deadlock with ufs.
    956 	 */
    957 	if (nbl_need_check(vp)) {
    958 		nbl_start_crit(vp, RW_READER);
    959 		in_crit = 1;
    960 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
    961 		    wa->wa_count, 0, NULL)) {
    962 			error = EACCES;
    963 			goto out;
    964 		}
    965 	}
    966 
    967 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
    968 
    969 	/* check if a monitor detected a delegation conflict */
    970 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
    971 		VN_RELE(vp);
    972 		/* mark as wouldblock so response is dropped */
    973 		curthread->t_flag |= T_WOULDBLOCK;
    974 		return;
    975 	}
    976 
    977 	if (wa->wa_data || wa->wa_rlist) {
    978 		/* Do the RDMA thing if necessary */
    979 		if (wa->wa_rlist) {
    980 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
    981 			iov[0].iov_len = wa->wa_count;
    982 		} else  {
    983 			iov[0].iov_base = wa->wa_data;
    984 			iov[0].iov_len = wa->wa_count;
    985 		}
    986 		uio.uio_iov = iov;
    987 		uio.uio_iovcnt = 1;
    988 		uio.uio_segflg = UIO_SYSSPACE;
    989 		uio.uio_extflg = UIO_COPY_DEFAULT;
    990 		uio.uio_loffset = (offset_t)wa->wa_offset;
    991 		uio.uio_resid = wa->wa_count;
    992 		/*
    993 		 * The limit is checked on the client. We
    994 		 * should allow any size writes here.
    995 		 */
    996 		uio.uio_llimit = curproc->p_fsz_ctl;
    997 		rlimit = uio.uio_llimit - wa->wa_offset;
    998 		if (rlimit < (rlim64_t)uio.uio_resid)
    999 			uio.uio_resid = (uint_t)rlimit;
   1000 
   1001 		/*
   1002 		 * for now we assume no append mode
   1003 		 */
   1004 		/*
   1005 		 * We're changing creds because VM may fault and we need
   1006 		 * the cred of the current thread to be used if quota
   1007 		 * checking is enabled.
   1008 		 */
   1009 		savecred = curthread->t_cred;
   1010 		curthread->t_cred = cr;
   1011 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
   1012 		curthread->t_cred = savecred;
   1013 	} else {
   1014 		iovcnt = 0;
   1015 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
   1016 			iovcnt++;
   1017 		if (iovcnt <= MAX_IOVECS) {
   1018 #ifdef DEBUG
   1019 			rfs_write_sync_hits++;
   1020 #endif
   1021 			iovp = iov;
   1022 		} else {
   1023 #ifdef DEBUG
   1024 			rfs_write_sync_misses++;
   1025 #endif
   1026 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
   1027 		}
   1028 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
   1029 		uio.uio_iov = iovp;
   1030 		uio.uio_iovcnt = iovcnt;
   1031 		uio.uio_segflg = UIO_SYSSPACE;
   1032 		uio.uio_extflg = UIO_COPY_DEFAULT;
   1033 		uio.uio_loffset = (offset_t)wa->wa_offset;
   1034 		uio.uio_resid = wa->wa_count;
   1035 		/*
   1036 		 * The limit is checked on the client. We
   1037 		 * should allow any size writes here.
   1038 		 */
   1039 		uio.uio_llimit = curproc->p_fsz_ctl;
   1040 		rlimit = uio.uio_llimit - wa->wa_offset;
   1041 		if (rlimit < (rlim64_t)uio.uio_resid)
   1042 			uio.uio_resid = (uint_t)rlimit;
   1043 
   1044 		/*
   1045 		 * For now we assume no append mode.
   1046 		 */
   1047 		/*
   1048 		 * We're changing creds because VM may fault and we need
   1049 		 * the cred of the current thread to be used if quota
   1050 		 * checking is enabled.
   1051 		 */
   1052 		savecred = curthread->t_cred;
   1053 		curthread->t_cred = cr;
   1054 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
   1055 		curthread->t_cred = savecred;
   1056 
   1057 		if (iovp != iov)
   1058 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
   1059 	}
   1060 
   1061 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
   1062 
   1063 	if (!error) {
   1064 		/*
   1065 		 * Get attributes again so we send the latest mod
   1066 		 * time to the client side for his cache.
   1067 		 */
   1068 		va.va_mask = AT_ALL;	/* now we want everything */
   1069 
   1070 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
   1071 
   1072 		/* check for overflows */
   1073 		if (!error) {
   1074 			acl_perm(vp, exi, &va, cr);
   1075 			error = vattr_to_nattr(&va, &ns->ns_attr);
   1076 		}
   1077 	}
   1078 
   1079 out:
   1080 	if (in_crit)
   1081 		nbl_end_crit(vp);
   1082 	VN_RELE(vp);
   1083 
   1084 	/* check if a monitor detected a delegation conflict */
   1085 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
   1086 		/* mark as wouldblock so response is dropped */
   1087 		curthread->t_flag |= T_WOULDBLOCK;
   1088 	else
   1089 		ns->ns_status = puterrno(error);
   1090 
   1091 }
   1092 
   1093 struct rfs_async_write {
   1094 	struct nfswriteargs *wa;
   1095 	struct nfsattrstat *ns;
   1096 	struct svc_req *req;
   1097 	cred_t *cr;
   1098 	kthread_t *thread;
   1099 	struct rfs_async_write *list;
   1100 };
   1101 
   1102 struct rfs_async_write_list {
   1103 	fhandle_t *fhp;
   1104 	kcondvar_t cv;
   1105 	struct rfs_async_write *list;
   1106 	struct rfs_async_write_list *next;
   1107 };
   1108 
   1109 static struct rfs_async_write_list *rfs_async_write_head = NULL;
   1110 static kmutex_t rfs_async_write_lock;
   1111 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
   1112 
   1113 #define	MAXCLIOVECS	42
   1114 #define	RFSWRITE_INITVAL (enum nfsstat) -1
   1115 
   1116 #ifdef DEBUG
   1117 static int rfs_write_hits = 0;
   1118 static int rfs_write_misses = 0;
   1119 #endif
   1120 
   1121 /*
   1122  * Write data to file.
   1123  * Returns attributes of a file after writing some data to it.
   1124  */
   1125 void
   1126 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
   1127 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   1128 {
   1129 	int error;
   1130 	vnode_t *vp;
   1131 	rlim64_t rlimit;
   1132 	struct vattr va;
   1133 	struct uio uio;
   1134 	struct rfs_async_write_list *lp;
   1135 	struct rfs_async_write_list *nlp;
   1136 	struct rfs_async_write *rp;
   1137 	struct rfs_async_write *nrp;
   1138 	struct rfs_async_write *trp;
   1139 	struct rfs_async_write *lrp;
   1140 	int data_written;
   1141 	int iovcnt;
   1142 	mblk_t *m;
   1143 	struct iovec *iovp;
   1144 	struct iovec *niovp;
   1145 	struct iovec iov[MAXCLIOVECS];
   1146 	int count;
   1147 	int rcount;
   1148 	uint_t off;
   1149 	uint_t len;
   1150 	struct rfs_async_write nrpsp;
   1151 	struct rfs_async_write_list nlpsp;
   1152 	ushort_t t_flag;
   1153 	cred_t *savecred;
   1154 	int in_crit = 0;
   1155 	caller_context_t ct;
   1156 
   1157 	if (!rfs_write_async) {
   1158 		rfs_write_sync(wa, ns, exi, req, cr);
   1159 		return;
   1160 	}
   1161 
   1162 	/*
   1163 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
   1164 	 * is considered an OK.
   1165 	 */
   1166 	ns->ns_status = RFSWRITE_INITVAL;
   1167 
   1168 	nrp = &nrpsp;
   1169 	nrp->wa = wa;
   1170 	nrp->ns = ns;
   1171 	nrp->req = req;
   1172 	nrp->cr = cr;
   1173 	nrp->thread = curthread;
   1174 
   1175 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
   1176 
   1177 	/*
   1178 	 * Look to see if there is already a cluster started
   1179 	 * for this file.
   1180 	 */
   1181 	mutex_enter(&rfs_async_write_lock);
   1182 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
   1183 		if (bcmp(&wa->wa_fhandle, lp->fhp,
   1184 		    sizeof (fhandle_t)) == 0)
   1185 			break;
   1186 	}
   1187 
   1188 	/*
   1189 	 * If lp is non-NULL, then there is already a cluster
   1190 	 * started.  We need to place ourselves in the cluster
   1191 	 * list in the right place as determined by starting
   1192 	 * offset.  Conflicts with non-blocking mandatory locked
   1193 	 * regions will be checked when the cluster is processed.
   1194 	 */
   1195 	if (lp != NULL) {
   1196 		rp = lp->list;
   1197 		trp = NULL;
   1198 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
   1199 			trp = rp;
   1200 			rp = rp->list;
   1201 		}
   1202 		nrp->list = rp;
   1203 		if (trp == NULL)
   1204 			lp->list = nrp;
   1205 		else
   1206 			trp->list = nrp;
   1207 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
   1208 			cv_wait(&lp->cv, &rfs_async_write_lock);
   1209 		mutex_exit(&rfs_async_write_lock);
   1210 
   1211 		return;
   1212 	}
   1213 
   1214 	/*
   1215 	 * No cluster started yet, start one and add ourselves
   1216 	 * to the list of clusters.
   1217 	 */
   1218 	nrp->list = NULL;
   1219 
   1220 	nlp = &nlpsp;
   1221 	nlp->fhp = &wa->wa_fhandle;
   1222 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
   1223 	nlp->list = nrp;
   1224 	nlp->next = NULL;
   1225 
   1226 	if (rfs_async_write_head == NULL) {
   1227 		rfs_async_write_head = nlp;
   1228 	} else {
   1229 		lp = rfs_async_write_head;
   1230 		while (lp->next != NULL)
   1231 			lp = lp->next;
   1232 		lp->next = nlp;
   1233 	}
   1234 	mutex_exit(&rfs_async_write_lock);
   1235 
   1236 	/*
   1237 	 * Convert the file handle common to all of the requests
   1238 	 * in this cluster to a vnode.
   1239 	 */
   1240 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
   1241 	if (vp == NULL) {
   1242 		mutex_enter(&rfs_async_write_lock);
   1243 		if (rfs_async_write_head == nlp)
   1244 			rfs_async_write_head = nlp->next;
   1245 		else {
   1246 			lp = rfs_async_write_head;
   1247 			while (lp->next != nlp)
   1248 				lp = lp->next;
   1249 			lp->next = nlp->next;
   1250 		}
   1251 		t_flag = curthread->t_flag & T_WOULDBLOCK;
   1252 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
   1253 			rp->ns->ns_status = NFSERR_STALE;
   1254 			rp->thread->t_flag |= t_flag;
   1255 		}
   1256 		cv_broadcast(&nlp->cv);
   1257 		mutex_exit(&rfs_async_write_lock);
   1258 
   1259 		return;
   1260 	}
   1261 
   1262 	/*
   1263 	 * Can only write regular files.  Attempts to write any
   1264 	 * other file types fail with EISDIR.
   1265 	 */
   1266 	if (vp->v_type != VREG) {
   1267 		VN_RELE(vp);
   1268 		mutex_enter(&rfs_async_write_lock);
   1269 		if (rfs_async_write_head == nlp)
   1270 			rfs_async_write_head = nlp->next;
   1271 		else {
   1272 			lp = rfs_async_write_head;
   1273 			while (lp->next != nlp)
   1274 				lp = lp->next;
   1275 			lp->next = nlp->next;
   1276 		}
   1277 		t_flag = curthread->t_flag & T_WOULDBLOCK;
   1278 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
   1279 			rp->ns->ns_status = NFSERR_ISDIR;
   1280 			rp->thread->t_flag |= t_flag;
   1281 		}
   1282 		cv_broadcast(&nlp->cv);
   1283 		mutex_exit(&rfs_async_write_lock);
   1284 
   1285 		return;
   1286 	}
   1287 
   1288 	/*
   1289 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
   1290 	 * deadlock with ufs.
   1291 	 */
   1292 	if (nbl_need_check(vp)) {
   1293 		nbl_start_crit(vp, RW_READER);
   1294 		in_crit = 1;
   1295 	}
   1296 
   1297 	ct.cc_sysid = 0;
   1298 	ct.cc_pid = 0;
   1299 	ct.cc_caller_id = nfs2_srv_caller_id;
   1300 	ct.cc_flags = CC_DONTBLOCK;
   1301 
   1302 	/*
   1303 	 * Lock the file for writing.  This operation provides
   1304 	 * the delay which allows clusters to grow.
   1305 	 */
   1306 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
   1307 
   1308 	/* check if a monitor detected a delegation conflict */
   1309 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
   1310 		if (in_crit)
   1311 			nbl_end_crit(vp);
   1312 		VN_RELE(vp);
   1313 		/* mark as wouldblock so response is dropped */
   1314 		curthread->t_flag |= T_WOULDBLOCK;
   1315 		mutex_enter(&rfs_async_write_lock);
   1316 		if (rfs_async_write_head == nlp)
   1317 			rfs_async_write_head = nlp->next;
   1318 		else {
   1319 			lp = rfs_async_write_head;
   1320 			while (lp->next != nlp)
   1321 				lp = lp->next;
   1322 			lp->next = nlp->next;
   1323 		}
   1324 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
   1325 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
   1326 				rp->ns->ns_status = puterrno(error);
   1327 				rp->thread->t_flag |= T_WOULDBLOCK;
   1328 			}
   1329 		}
   1330 		cv_broadcast(&nlp->cv);
   1331 		mutex_exit(&rfs_async_write_lock);
   1332 
   1333 		return;
   1334 	}
   1335 
   1336 	/*
   1337 	 * Disconnect this cluster from the list of clusters.
   1338 	 * The cluster that is being dealt with must be fixed
   1339 	 * in size after this point, so there is no reason
   1340 	 * to leave it on the list so that new requests can
   1341 	 * find it.
   1342 	 *
   1343 	 * The algorithm is that the first write request will
   1344 	 * create a cluster, convert the file handle to a
   1345 	 * vnode pointer, and then lock the file for writing.
   1346 	 * This request is not likely to be clustered with
   1347 	 * any others.  However, the next request will create
   1348 	 * a new cluster and be blocked in VOP_RWLOCK while
   1349 	 * the first request is being processed.  This delay
   1350 	 * will allow more requests to be clustered in this
   1351 	 * second cluster.
   1352 	 */
   1353 	mutex_enter(&rfs_async_write_lock);
   1354 	if (rfs_async_write_head == nlp)
   1355 		rfs_async_write_head = nlp->next;
   1356 	else {
   1357 		lp = rfs_async_write_head;
   1358 		while (lp->next != nlp)
   1359 			lp = lp->next;
   1360 		lp->next = nlp->next;
   1361 	}
   1362 	mutex_exit(&rfs_async_write_lock);
   1363 
   1364 	/*
   1365 	 * Step through the list of requests in this cluster.
   1366 	 * We need to check permissions to make sure that all
   1367 	 * of the requests have sufficient permission to write
   1368 	 * the file.  A cluster can be composed of requests
   1369 	 * from different clients and different users on each
   1370 	 * client.
   1371 	 *
   1372 	 * As a side effect, we also calculate the size of the
   1373 	 * byte range that this cluster encompasses.
   1374 	 */
   1375 	rp = nlp->list;
   1376 	off = rp->wa->wa_offset;
   1377 	len = (uint_t)0;
   1378 	do {
   1379 		if (rdonly(exi, rp->req)) {
   1380 			rp->ns->ns_status = NFSERR_ROFS;
   1381 			t_flag = curthread->t_flag & T_WOULDBLOCK;
   1382 			rp->thread->t_flag |= t_flag;
   1383 			continue;
   1384 		}
   1385 
   1386 		va.va_mask = AT_UID|AT_MODE;
   1387 
   1388 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
   1389 
   1390 		if (!error) {
   1391 			if (crgetuid(rp->cr) != va.va_uid) {
   1392 				/*
   1393 				 * This is a kludge to allow writes of files
   1394 				 * created with read only permission.  The
   1395 				 * owner of the file is always allowed to
   1396 				 * write it.
   1397 				 */
   1398 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
   1399 			}
   1400 			if (!error && MANDLOCK(vp, va.va_mode))
   1401 				error = EACCES;
   1402 		}
   1403 
   1404 		/*
   1405 		 * Check for a conflict with a nbmand-locked region.
   1406 		 */
   1407 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
   1408 		    rp->wa->wa_count, 0, NULL)) {
   1409 			error = EACCES;
   1410 		}
   1411 
   1412 		if (error) {
   1413 			rp->ns->ns_status = puterrno(error);
   1414 			t_flag = curthread->t_flag & T_WOULDBLOCK;
   1415 			rp->thread->t_flag |= t_flag;
   1416 			continue;
   1417 		}
   1418 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
   1419 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
   1420 	} while ((rp = rp->list) != NULL);
   1421 
   1422 	/*
   1423 	 * Step through the cluster attempting to gather as many
   1424 	 * requests which are contiguous as possible.  These
   1425 	 * contiguous requests are handled via one call to VOP_WRITE
   1426 	 * instead of different calls to VOP_WRITE.  We also keep
   1427 	 * track of the fact that any data was written.
   1428 	 */
   1429 	rp = nlp->list;
   1430 	data_written = 0;
   1431 	do {
   1432 		/*
   1433 		 * Skip any requests which are already marked as having an
   1434 		 * error.
   1435 		 */
   1436 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
   1437 			rp = rp->list;
   1438 			continue;
   1439 		}
   1440 
   1441 		/*
   1442 		 * Count the number of iovec's which are required
   1443 		 * to handle this set of requests.  One iovec is
   1444 		 * needed for each data buffer, whether addressed
   1445 		 * by wa_data or by the b_rptr pointers in the
   1446 		 * mblk chains.
   1447 		 */
   1448 		iovcnt = 0;
   1449 		lrp = rp;
   1450 		for (;;) {
   1451 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
   1452 				iovcnt++;
   1453 			else {
   1454 				m = lrp->wa->wa_mblk;
   1455 				while (m != NULL) {
   1456 					iovcnt++;
   1457 					m = m->b_cont;
   1458 				}
   1459 			}
   1460 			if (lrp->list == NULL ||
   1461 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
   1462 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
   1463 			    lrp->list->wa->wa_offset) {
   1464 				lrp = lrp->list;
   1465 				break;
   1466 			}
   1467 			lrp = lrp->list;
   1468 		}
   1469 
   1470 		if (iovcnt <= MAXCLIOVECS) {
   1471 #ifdef DEBUG
   1472 			rfs_write_hits++;
   1473 #endif
   1474 			niovp = iov;
   1475 		} else {
   1476 #ifdef DEBUG
   1477 			rfs_write_misses++;
   1478 #endif
   1479 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
   1480 		}
   1481 		/*
   1482 		 * Put together the scatter/gather iovecs.
   1483 		 */
   1484 		iovp = niovp;
   1485 		trp = rp;
   1486 		count = 0;
   1487 		do {
   1488 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
   1489 				if (trp->wa->wa_rlist) {
   1490 					iovp->iov_base =
   1491 					    (char *)((trp->wa->wa_rlist)->
   1492 					    u.c_daddr3);
   1493 					iovp->iov_len = trp->wa->wa_count;
   1494 				} else  {
   1495 					iovp->iov_base = trp->wa->wa_data;
   1496 					iovp->iov_len = trp->wa->wa_count;
   1497 				}
   1498 				iovp++;
   1499 			} else {
   1500 				m = trp->wa->wa_mblk;
   1501 				rcount = trp->wa->wa_count;
   1502 				while (m != NULL) {
   1503 					iovp->iov_base = (caddr_t)m->b_rptr;
   1504 					iovp->iov_len = (m->b_wptr - m->b_rptr);
   1505 					rcount -= iovp->iov_len;
   1506 					if (rcount < 0)
   1507 						iovp->iov_len += rcount;
   1508 					iovp++;
   1509 					if (rcount <= 0)
   1510 						break;
   1511 					m = m->b_cont;
   1512 				}
   1513 			}
   1514 			count += trp->wa->wa_count;
   1515 			trp = trp->list;
   1516 		} while (trp != lrp);
   1517 
   1518 		uio.uio_iov = niovp;
   1519 		uio.uio_iovcnt = iovcnt;
   1520 		uio.uio_segflg = UIO_SYSSPACE;
   1521 		uio.uio_extflg = UIO_COPY_DEFAULT;
   1522 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
   1523 		uio.uio_resid = count;
   1524 		/*
   1525 		 * The limit is checked on the client. We
   1526 		 * should allow any size writes here.
   1527 		 */
   1528 		uio.uio_llimit = curproc->p_fsz_ctl;
   1529 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
   1530 		if (rlimit < (rlim64_t)uio.uio_resid)
   1531 			uio.uio_resid = (uint_t)rlimit;
   1532 
   1533 		/*
   1534 		 * For now we assume no append mode.
   1535 		 */
   1536 
   1537 		/*
   1538 		 * We're changing creds because VM may fault
   1539 		 * and we need the cred of the current
   1540 		 * thread to be used if quota * checking is
   1541 		 * enabled.
   1542 		 */
   1543 		savecred = curthread->t_cred;
   1544 		curthread->t_cred = cr;
   1545 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
   1546 		curthread->t_cred = savecred;
   1547 
   1548 		/* check if a monitor detected a delegation conflict */
   1549 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
   1550 			/* mark as wouldblock so response is dropped */
   1551 			curthread->t_flag |= T_WOULDBLOCK;
   1552 
   1553 		if (niovp != iov)
   1554 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
   1555 
   1556 		if (!error) {
   1557 			data_written = 1;
   1558 			/*
   1559 			 * Get attributes again so we send the latest mod
   1560 			 * time to the client side for his cache.
   1561 			 */
   1562 			va.va_mask = AT_ALL;	/* now we want everything */
   1563 
   1564 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
   1565 
   1566 			if (!error)
   1567 				acl_perm(vp, exi, &va, rp->cr);
   1568 		}
   1569 
   1570 		/*
   1571 		 * Fill in the status responses for each request
   1572 		 * which was just handled.  Also, copy the latest
   1573 		 * attributes in to the attribute responses if
   1574 		 * appropriate.
   1575 		 */
   1576 		t_flag = curthread->t_flag & T_WOULDBLOCK;
   1577 		do {
   1578 			rp->thread->t_flag |= t_flag;
   1579 			/* check for overflows */
   1580 			if (!error) {
   1581 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
   1582 			}
   1583 			rp->ns->ns_status = puterrno(error);
   1584 			rp = rp->list;
   1585 		} while (rp != lrp);
   1586 	} while (rp != NULL);
   1587 
   1588 	/*
   1589 	 * If any data was written at all, then we need to flush
   1590 	 * the data and metadata to stable storage.
   1591 	 */
   1592 	if (data_written) {
   1593 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
   1594 
   1595 		if (!error) {
   1596 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
   1597 		}
   1598 	}
   1599 
   1600 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
   1601 
   1602 	if (in_crit)
   1603 		nbl_end_crit(vp);
   1604 	VN_RELE(vp);
   1605 
   1606 	t_flag = curthread->t_flag & T_WOULDBLOCK;
   1607 	mutex_enter(&rfs_async_write_lock);
   1608 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
   1609 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
   1610 			rp->ns->ns_status = puterrno(error);
   1611 			rp->thread->t_flag |= t_flag;
   1612 		}
   1613 	}
   1614 	cv_broadcast(&nlp->cv);
   1615 	mutex_exit(&rfs_async_write_lock);
   1616 
   1617 }
   1618 
   1619 void *
   1620 rfs_write_getfh(struct nfswriteargs *wa)
   1621 {
   1622 	return (&wa->wa_fhandle);
   1623 }
   1624 
   1625 /*
   1626  * Create a file.
   1627  * Creates a file with given attributes and returns those attributes
   1628  * and an fhandle for the new file.
   1629  */
   1630 void
   1631 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
   1632 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   1633 {
   1634 	int error;
   1635 	int lookuperr;
   1636 	int in_crit = 0;
   1637 	struct vattr va;
   1638 	vnode_t *vp;
   1639 	vnode_t *realvp;
   1640 	vnode_t *dvp;
   1641 	char *name = args->ca_da.da_name;
   1642 	vnode_t *tvp = NULL;
   1643 	int mode;
   1644 	int lookup_ok;
   1645 	bool_t trunc;
   1646 	struct sockaddr *ca;
   1647 
   1648 	/*
   1649 	 * Disallow NULL paths
   1650 	 */
   1651 	if (name == NULL || *name == '\0') {
   1652 		dr->dr_status = NFSERR_ACCES;
   1653 		return;
   1654 	}
   1655 
   1656 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
   1657 	if (dvp == NULL) {
   1658 		dr->dr_status = NFSERR_STALE;
   1659 		return;
   1660 	}
   1661 
   1662 	error = sattr_to_vattr(args->ca_sa, &va);
   1663 	if (error) {
   1664 		dr->dr_status = puterrno(error);
   1665 		return;
   1666 	}
   1667 
   1668 	/*
   1669 	 * Must specify the mode.
   1670 	 */
   1671 	if (!(va.va_mask & AT_MODE)) {
   1672 		VN_RELE(dvp);
   1673 		dr->dr_status = NFSERR_INVAL;
   1674 		return;
   1675 	}
   1676 
   1677 	/*
   1678 	 * This is a completely gross hack to make mknod
   1679 	 * work over the wire until we can wack the protocol
   1680 	 */
   1681 	if ((va.va_mode & IFMT) == IFCHR) {
   1682 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
   1683 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
   1684 		else {
   1685 			va.va_type = VCHR;
   1686 			/*
   1687 			 * uncompress the received dev_t
   1688 			 * if the top half is zero indicating a request
   1689 			 * from an `older style' OS.
   1690 			 */
   1691 			if ((va.va_size & 0xffff0000) == 0)
   1692 				va.va_rdev = nfsv2_expdev(va.va_size);
   1693 			else
   1694 				va.va_rdev = (dev_t)va.va_size;
   1695 		}
   1696 		va.va_mask &= ~AT_SIZE;
   1697 	} else if ((va.va_mode & IFMT) == IFBLK) {
   1698 		va.va_type = VBLK;
   1699 		/*
   1700 		 * uncompress the received dev_t
   1701 		 * if the top half is zero indicating a request
   1702 		 * from an `older style' OS.
   1703 		 */
   1704 		if ((va.va_size & 0xffff0000) == 0)
   1705 			va.va_rdev = nfsv2_expdev(va.va_size);
   1706 		else
   1707 			va.va_rdev = (dev_t)va.va_size;
   1708 		va.va_mask &= ~AT_SIZE;
   1709 	} else if ((va.va_mode & IFMT) == IFSOCK) {
   1710 		va.va_type = VSOCK;
   1711 	} else {
   1712 		va.va_type = VREG;
   1713 	}
   1714 	va.va_mode &= ~IFMT;
   1715 	va.va_mask |= AT_TYPE;
   1716 
   1717 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
   1718 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
   1719 	    MAXPATHLEN);
   1720 	if (name == NULL) {
   1721 		dr->dr_status = puterrno(EINVAL);
   1722 		return;
   1723 	}
   1724 
   1725 	/*
   1726 	 * Why was the choice made to use VWRITE as the mode to the
   1727 	 * call to VOP_CREATE ? This results in a bug.  When a client
   1728 	 * opens a file that already exists and is RDONLY, the second
   1729 	 * open fails with an EACESS because of the mode.
   1730 	 * bug ID 1054648.
   1731 	 */
   1732 	lookup_ok = 0;
   1733 	mode = VWRITE;
   1734 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
   1735 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
   1736 		    NULL, NULL, NULL);
   1737 		if (!error) {
   1738 			struct vattr at;
   1739 
   1740 			lookup_ok = 1;
   1741 			at.va_mask = AT_MODE;
   1742 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
   1743 			if (!error)
   1744 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
   1745 			VN_RELE(tvp);
   1746 			tvp = NULL;
   1747 		}
   1748 	}
   1749 
   1750 	if (!lookup_ok) {
   1751 		if (rdonly(exi, req)) {
   1752 			error = EROFS;
   1753 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
   1754 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
   1755 			error = EPERM;
   1756 		} else {
   1757 			error = 0;
   1758 		}
   1759 	}
   1760 
   1761 	/*
   1762 	 * If file size is being modified on an already existing file
   1763 	 * make sure that there are no conflicting non-blocking mandatory
   1764 	 * locks in the region being manipulated. Return EACCES if there
   1765 	 * are conflicting locks.
   1766 	 */
   1767 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
   1768 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
   1769 		    NULL, NULL, NULL);
   1770 
   1771 		if (!lookuperr &&
   1772 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
   1773 			VN_RELE(tvp);
   1774 			curthread->t_flag |= T_WOULDBLOCK;
   1775 			goto out;
   1776 		}
   1777 
   1778 		if (!lookuperr && nbl_need_check(tvp)) {
   1779 			/*
   1780 			 * The file exists. Now check if it has any
   1781 			 * conflicting non-blocking mandatory locks
   1782 			 * in the region being changed.
   1783 			 */
   1784 			struct vattr bva;
   1785 			u_offset_t offset;
   1786 			ssize_t length;
   1787 
   1788 			nbl_start_crit(tvp, RW_READER);
   1789 			in_crit = 1;
   1790 
   1791 			bva.va_mask = AT_SIZE;
   1792 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
   1793 			if (!error) {
   1794 				if (va.va_size < bva.va_size) {
   1795 					offset = va.va_size;
   1796 					length = bva.va_size - va.va_size;
   1797 				} else {
   1798 					offset = bva.va_size;
   1799 					length = va.va_size - bva.va_size;
   1800 				}
   1801 				if (length) {
   1802 					if (nbl_conflict(tvp, NBL_WRITE,
   1803 					    offset, length, 0, NULL)) {
   1804 						error = EACCES;
   1805 					}
   1806 				}
   1807 			}
   1808 			if (error) {
   1809 				nbl_end_crit(tvp);
   1810 				VN_RELE(tvp);
   1811 				in_crit = 0;
   1812 			}
   1813 		} else if (tvp != NULL) {
   1814 			VN_RELE(tvp);
   1815 		}
   1816 	}
   1817 
   1818 	if (!error) {
   1819 		/*
   1820 		 * If filesystem is shared with nosuid the remove any
   1821 		 * setuid/setgid bits on create.
   1822 		 */
   1823 		if (va.va_type == VREG &&
   1824 		    exi->exi_export.ex_flags & EX_NOSUID)
   1825 			va.va_mode &= ~(VSUID | VSGID);
   1826 
   1827 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
   1828 		    NULL, NULL);
   1829 
   1830 		if (!error) {
   1831 
   1832 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
   1833 				trunc = TRUE;
   1834 			else
   1835 				trunc = FALSE;
   1836 
   1837 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
   1838 				VN_RELE(vp);
   1839 				curthread->t_flag |= T_WOULDBLOCK;
   1840 				goto out;
   1841 			}
   1842 			va.va_mask = AT_ALL;
   1843 
   1844 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
   1845 
   1846 			/* check for overflows */
   1847 			if (!error) {
   1848 				acl_perm(vp, exi, &va, cr);
   1849 				error = vattr_to_nattr(&va, &dr->dr_attr);
   1850 				if (!error) {
   1851 					error = makefh(&dr->dr_fhandle, vp,
   1852 					    exi);
   1853 				}
   1854 			}
   1855 			/*
   1856 			 * Force modified metadata out to stable storage.
   1857 			 *
   1858 			 * if a underlying vp exists, pass it to VOP_FSYNC
   1859 			 */
   1860 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
   1861 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
   1862 			else
   1863 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
   1864 			VN_RELE(vp);
   1865 		}
   1866 
   1867 		if (in_crit) {
   1868 			nbl_end_crit(tvp);
   1869 			VN_RELE(tvp);
   1870 		}
   1871 	}
   1872 
   1873 	/*
   1874 	 * Force modified data and metadata out to stable storage.
   1875 	 */
   1876 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
   1877 
   1878 out:
   1879 
   1880 	VN_RELE(dvp);
   1881 
   1882 	dr->dr_status = puterrno(error);
   1883 
   1884 	if (name != args->ca_da.da_name)
   1885 		kmem_free(name, MAXPATHLEN);
   1886 }
   1887 void *
   1888 rfs_create_getfh(struct nfscreatargs *args)
   1889 {
   1890 	return (args->ca_da.da_fhandle);
   1891 }
   1892 
   1893 /*
   1894  * Remove a file.
   1895  * Remove named file from parent directory.
   1896  */
   1897 void
   1898 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
   1899 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   1900 {
   1901 	int error = 0;
   1902 	vnode_t *vp;
   1903 	vnode_t *targvp;
   1904 	int in_crit = 0;
   1905 
   1906 	/*
   1907 	 * Disallow NULL paths
   1908 	 */
   1909 	if (da->da_name == NULL || *da->da_name == '\0') {
   1910 		*status = NFSERR_ACCES;
   1911 		return;
   1912 	}
   1913 
   1914 	vp = nfs_fhtovp(da->da_fhandle, exi);
   1915 	if (vp == NULL) {
   1916 		*status = NFSERR_STALE;
   1917 		return;
   1918 	}
   1919 
   1920 	if (rdonly(exi, req)) {
   1921 		VN_RELE(vp);
   1922 		*status = NFSERR_ROFS;
   1923 		return;
   1924 	}
   1925 
   1926 	/*
   1927 	 * Check for a conflict with a non-blocking mandatory share reservation.
   1928 	 */
   1929 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
   1930 	    NULL, cr, NULL, NULL, NULL);
   1931 	if (error != 0) {
   1932 		VN_RELE(vp);
   1933 		*status = puterrno(error);
   1934 		return;
   1935 	}
   1936 
   1937 	/*
   1938 	 * If the file is delegated to an v4 client, then initiate
   1939 	 * recall and drop this request (by setting T_WOULDBLOCK).
   1940 	 * The client will eventually re-transmit the request and
   1941 	 * (hopefully), by then, the v4 client will have returned
   1942 	 * the delegation.
   1943 	 */
   1944 
   1945 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
   1946 		VN_RELE(vp);
   1947 		VN_RELE(targvp);
   1948 		curthread->t_flag |= T_WOULDBLOCK;
   1949 		return;
   1950 	}
   1951 
   1952 	if (nbl_need_check(targvp)) {
   1953 		nbl_start_crit(targvp, RW_READER);
   1954 		in_crit = 1;
   1955 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
   1956 			error = EACCES;
   1957 			goto out;
   1958 		}
   1959 	}
   1960 
   1961 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
   1962 
   1963 	/*
   1964 	 * Force modified data and metadata out to stable storage.
   1965 	 */
   1966 	(void) VOP_FSYNC(vp, 0, cr, NULL);
   1967 
   1968 out:
   1969 	if (in_crit)
   1970 		nbl_end_crit(targvp);
   1971 	VN_RELE(targvp);
   1972 	VN_RELE(vp);
   1973 
   1974 	*status = puterrno(error);
   1975 
   1976 }
   1977 
   1978 void *
   1979 rfs_remove_getfh(struct nfsdiropargs *da)
   1980 {
   1981 	return (da->da_fhandle);
   1982 }
   1983 
   1984 /*
   1985  * rename a file
   1986  * Give a file (from) a new name (to).
   1987  */
   1988 void
   1989 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
   1990 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   1991 {
   1992 	int error = 0;
   1993 	vnode_t *fromvp;
   1994 	vnode_t *tovp;
   1995 	struct exportinfo *to_exi;
   1996 	fhandle_t *fh;
   1997 	vnode_t *srcvp;
   1998 	vnode_t *targvp;
   1999 	int in_crit = 0;
   2000 
   2001 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
   2002 	if (fromvp == NULL) {
   2003 		*status = NFSERR_STALE;
   2004 		return;
   2005 	}
   2006 
   2007 	fh = args->rna_to.da_fhandle;
   2008 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
   2009 	if (to_exi == NULL) {
   2010 		VN_RELE(fromvp);
   2011 		*status = NFSERR_ACCES;
   2012 		return;
   2013 	}
   2014 	exi_rele(to_exi);
   2015 
   2016 	if (to_exi != exi) {
   2017 		VN_RELE(fromvp);
   2018 		*status = NFSERR_XDEV;
   2019 		return;
   2020 	}
   2021 
   2022 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
   2023 	if (tovp == NULL) {
   2024 		VN_RELE(fromvp);
   2025 		*status = NFSERR_STALE;
   2026 		return;
   2027 	}
   2028 
   2029 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
   2030 		VN_RELE(tovp);
   2031 		VN_RELE(fromvp);
   2032 		*status = NFSERR_NOTDIR;
   2033 		return;
   2034 	}
   2035 
   2036 	/*
   2037 	 * Disallow NULL paths
   2038 	 */
   2039 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
   2040 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
   2041 		VN_RELE(tovp);
   2042 		VN_RELE(fromvp);
   2043 		*status = NFSERR_ACCES;
   2044 		return;
   2045 	}
   2046 
   2047 	if (rdonly(exi, req)) {
   2048 		VN_RELE(tovp);
   2049 		VN_RELE(fromvp);
   2050 		*status = NFSERR_ROFS;
   2051 		return;
   2052 	}
   2053 
   2054 	/*
   2055 	 * Check for a conflict with a non-blocking mandatory share reservation.
   2056 	 */
   2057 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
   2058 	    NULL, cr, NULL, NULL, NULL);
   2059 	if (error != 0) {
   2060 		VN_RELE(tovp);
   2061 		VN_RELE(fromvp);
   2062 		*status = puterrno(error);
   2063 		return;
   2064 	}
   2065 
   2066 	/* Check for delegations on the source file */
   2067 
   2068 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
   2069 		VN_RELE(tovp);
   2070 		VN_RELE(fromvp);
   2071 		VN_RELE(srcvp);
   2072 		curthread->t_flag |= T_WOULDBLOCK;
   2073 		return;
   2074 	}
   2075 
   2076 	/* Check for delegation on the file being renamed over, if it exists */
   2077 
   2078 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
   2079 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
   2080 	    NULL, NULL, NULL) == 0) {
   2081 
   2082 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
   2083 			VN_RELE(tovp);
   2084 			VN_RELE(fromvp);
   2085 			VN_RELE(srcvp);
   2086 			VN_RELE(targvp);
   2087 			curthread->t_flag |= T_WOULDBLOCK;
   2088 			return;
   2089 		}
   2090 		VN_RELE(targvp);
   2091 	}
   2092 
   2093 
   2094 	if (nbl_need_check(srcvp)) {
   2095 		nbl_start_crit(srcvp, RW_READER);
   2096 		in_crit = 1;
   2097 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
   2098 			error = EACCES;
   2099 			goto out;
   2100 		}
   2101 	}
   2102 
   2103 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
   2104 	    tovp, args->rna_to.da_name, cr, NULL, 0);
   2105 
   2106 	if (error == 0)
   2107 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
   2108 		    strlen(args->rna_to.da_name));
   2109 
   2110 	/*
   2111 	 * Force modified data and metadata out to stable storage.
   2112 	 */
   2113 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
   2114 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
   2115 
   2116 out:
   2117 	if (in_crit)
   2118 		nbl_end_crit(srcvp);
   2119 	VN_RELE(srcvp);
   2120 	VN_RELE(tovp);
   2121 	VN_RELE(fromvp);
   2122 
   2123 	*status = puterrno(error);
   2124 
   2125 }
   2126 void *
   2127 rfs_rename_getfh(struct nfsrnmargs *args)
   2128 {
   2129 	return (args->rna_from.da_fhandle);
   2130 }
   2131 
   2132 /*
   2133  * Link to a file.
   2134  * Create a file (to) which is a hard link to the given file (from).
   2135  */
   2136 void
   2137 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
   2138 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   2139 {
   2140 	int error;
   2141 	vnode_t *fromvp;
   2142 	vnode_t *tovp;
   2143 	struct exportinfo *to_exi;
   2144 	fhandle_t *fh;
   2145 
   2146 	fromvp = nfs_fhtovp(args->la_from, exi);
   2147 	if (fromvp == NULL) {
   2148 		*status = NFSERR_STALE;
   2149 		return;
   2150 	}
   2151 
   2152 	fh = args->la_to.da_fhandle;
   2153 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
   2154 	if (to_exi == NULL) {
   2155 		VN_RELE(fromvp);
   2156 		*status = NFSERR_ACCES;
   2157 		return;
   2158 	}
   2159 	exi_rele(to_exi);
   2160 
   2161 	if (to_exi != exi) {
   2162 		VN_RELE(fromvp);
   2163 		*status = NFSERR_XDEV;
   2164 		return;
   2165 	}
   2166 
   2167 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
   2168 	if (tovp == NULL) {
   2169 		VN_RELE(fromvp);
   2170 		*status = NFSERR_STALE;
   2171 		return;
   2172 	}
   2173 
   2174 	if (tovp->v_type != VDIR) {
   2175 		VN_RELE(tovp);
   2176 		VN_RELE(fromvp);
   2177 		*status = NFSERR_NOTDIR;
   2178 		return;
   2179 	}
   2180 	/*
   2181 	 * Disallow NULL paths
   2182 	 */
   2183 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
   2184 		VN_RELE(tovp);
   2185 		VN_RELE(fromvp);
   2186 		*status = NFSERR_ACCES;
   2187 		return;
   2188 	}
   2189 
   2190 	if (rdonly(exi, req)) {
   2191 		VN_RELE(tovp);
   2192 		VN_RELE(fromvp);
   2193 		*status = NFSERR_ROFS;
   2194 		return;
   2195 	}
   2196 
   2197 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
   2198 
   2199 	/*
   2200 	 * Force modified data and metadata out to stable storage.
   2201 	 */
   2202 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
   2203 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
   2204 
   2205 	VN_RELE(tovp);
   2206 	VN_RELE(fromvp);
   2207 
   2208 	*status = puterrno(error);
   2209 
   2210 }
   2211 void *
   2212 rfs_link_getfh(struct nfslinkargs *args)
   2213 {
   2214 	return (args->la_from);
   2215 }
   2216 
   2217 /*
   2218  * Symbolicly link to a file.
   2219  * Create a file (to) with the given attributes which is a symbolic link
   2220  * to the given path name (to).
   2221  */
   2222 void
   2223 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
   2224 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   2225 {
   2226 	int error;
   2227 	struct vattr va;
   2228 	vnode_t *vp;
   2229 	vnode_t *svp;
   2230 	int lerror;
   2231 	struct sockaddr *ca;
   2232 	char *name = NULL;
   2233 
   2234 	/*
   2235 	 * Disallow NULL paths
   2236 	 */
   2237 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
   2238 		*status = NFSERR_ACCES;
   2239 		return;
   2240 	}
   2241 
   2242 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
   2243 	if (vp == NULL) {
   2244 		*status = NFSERR_STALE;
   2245 		return;
   2246 	}
   2247 
   2248 	if (rdonly(exi, req)) {
   2249 		VN_RELE(vp);
   2250 		*status = NFSERR_ROFS;
   2251 		return;
   2252 	}
   2253 
   2254 	error = sattr_to_vattr(args->sla_sa, &va);
   2255 	if (error) {
   2256 		VN_RELE(vp);
   2257 		*status = puterrno(error);
   2258 		return;
   2259 	}
   2260 
   2261 	if (!(va.va_mask & AT_MODE)) {
   2262 		VN_RELE(vp);
   2263 		*status = NFSERR_INVAL;
   2264 		return;
   2265 	}
   2266 
   2267 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
   2268 	name = nfscmd_convname(ca, exi, args->sla_tnm,
   2269 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
   2270 
   2271 	if (name == NULL) {
   2272 		*status = NFSERR_ACCES;
   2273 		return;
   2274 	}
   2275 
   2276 	va.va_type = VLNK;
   2277 	va.va_mask |= AT_TYPE;
   2278 
   2279 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
   2280 
   2281 	/*
   2282 	 * Force new data and metadata out to stable storage.
   2283 	 */
   2284 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
   2285 	    NULL, cr, NULL, NULL, NULL);
   2286 
   2287 	if (!lerror) {
   2288 		(void) VOP_FSYNC(svp, 0, cr, NULL);
   2289 		VN_RELE(svp);
   2290 	}
   2291 
   2292 	/*
   2293 	 * Force modified data and metadata out to stable storage.
   2294 	 */
   2295 	(void) VOP_FSYNC(vp, 0, cr, NULL);
   2296 
   2297 	VN_RELE(vp);
   2298 
   2299 	*status = puterrno(error);
   2300 	if (name != args->sla_tnm)
   2301 		kmem_free(name, MAXPATHLEN);
   2302 
   2303 }
   2304 void *
   2305 rfs_symlink_getfh(struct nfsslargs *args)
   2306 {
   2307 	return (args->sla_from.da_fhandle);
   2308 }
   2309 
   2310 /*
   2311  * Make a directory.
   2312  * Create a directory with the given name, parent directory, and attributes.
   2313  * Returns a file handle and attributes for the new directory.
   2314  */
   2315 void
   2316 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
   2317 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   2318 {
   2319 	int error;
   2320 	struct vattr va;
   2321 	vnode_t *dvp = NULL;
   2322 	vnode_t *vp;
   2323 	char *name = args->ca_da.da_name;
   2324 
   2325 	/*
   2326 	 * Disallow NULL paths
   2327 	 */
   2328 	if (name == NULL || *name == '\0') {
   2329 		dr->dr_status = NFSERR_ACCES;
   2330 		return;
   2331 	}
   2332 
   2333 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
   2334 	if (vp == NULL) {
   2335 		dr->dr_status = NFSERR_STALE;
   2336 		return;
   2337 	}
   2338 
   2339 	if (rdonly(exi, req)) {
   2340 		VN_RELE(vp);
   2341 		dr->dr_status = NFSERR_ROFS;
   2342 		return;
   2343 	}
   2344 
   2345 	error = sattr_to_vattr(args->ca_sa, &va);
   2346 	if (error) {
   2347 		VN_RELE(vp);
   2348 		dr->dr_status = puterrno(error);
   2349 		return;
   2350 	}
   2351 
   2352 	if (!(va.va_mask & AT_MODE)) {
   2353 		VN_RELE(vp);
   2354 		dr->dr_status = NFSERR_INVAL;
   2355 		return;
   2356 	}
   2357 
   2358 	va.va_type = VDIR;
   2359 	va.va_mask |= AT_TYPE;
   2360 
   2361 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
   2362 
   2363 	if (!error) {
   2364 		/*
   2365 		 * Attribtutes of the newly created directory should
   2366 		 * be returned to the client.
   2367 		 */
   2368 		va.va_mask = AT_ALL; /* We want everything */
   2369 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
   2370 
   2371 		/* check for overflows */
   2372 		if (!error) {
   2373 			acl_perm(vp, exi, &va, cr);
   2374 			error = vattr_to_nattr(&va, &dr->dr_attr);
   2375 			if (!error) {
   2376 				error = makefh(&dr->dr_fhandle, dvp, exi);
   2377 			}
   2378 		}
   2379 		/*
   2380 		 * Force new data and metadata out to stable storage.
   2381 		 */
   2382 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
   2383 		VN_RELE(dvp);
   2384 	}
   2385 
   2386 	/*
   2387 	 * Force modified data and metadata out to stable storage.
   2388 	 */
   2389 	(void) VOP_FSYNC(vp, 0, cr, NULL);
   2390 
   2391 	VN_RELE(vp);
   2392 
   2393 	dr->dr_status = puterrno(error);
   2394 
   2395 }
   2396 void *
   2397 rfs_mkdir_getfh(struct nfscreatargs *args)
   2398 {
   2399 	return (args->ca_da.da_fhandle);
   2400 }
   2401 
   2402 /*
   2403  * Remove a directory.
   2404  * Remove the given directory name from the given parent directory.
   2405  */
   2406 void
   2407 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
   2408 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   2409 {
   2410 	int error;
   2411 	vnode_t *vp;
   2412 
   2413 
   2414 	/*
   2415 	 * Disallow NULL paths
   2416 	 */
   2417 	if (da->da_name == NULL || *da->da_name == '\0') {
   2418 		*status = NFSERR_ACCES;
   2419 		return;
   2420 	}
   2421 
   2422 	vp = nfs_fhtovp(da->da_fhandle, exi);
   2423 	if (vp == NULL) {
   2424 		*status = NFSERR_STALE;
   2425 		return;
   2426 	}
   2427 
   2428 	if (rdonly(exi, req)) {
   2429 		VN_RELE(vp);
   2430 		*status = NFSERR_ROFS;
   2431 		return;
   2432 	}
   2433 
   2434 	/*
   2435 	 * VOP_RMDIR now takes a new third argument (the current
   2436 	 * directory of the process).  That's because someone
   2437 	 * wants to return EINVAL if one tries to remove ".".
   2438 	 * Of course, NFS servers have no idea what their
   2439 	 * clients' current directories are.  We fake it by
   2440 	 * supplying a vnode known to exist and illegal to
   2441 	 * remove.
   2442 	 */
   2443 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
   2444 
   2445 	/*
   2446 	 * Force modified data and metadata out to stable storage.
   2447 	 */
   2448 	(void) VOP_FSYNC(vp, 0, cr, NULL);
   2449 
   2450 	VN_RELE(vp);
   2451 
   2452 	/*
   2453 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
   2454 	 * if the directory is not empty.  A System V NFS server
   2455 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
   2456 	 * over the wire.
   2457 	 */
   2458 	if (error == EEXIST)
   2459 		*status = NFSERR_NOTEMPTY;
   2460 	else
   2461 		*status = puterrno(error);
   2462 
   2463 }
   2464 void *
   2465 rfs_rmdir_getfh(struct nfsdiropargs *da)
   2466 {
   2467 	return (da->da_fhandle);
   2468 }
   2469 
   2470 /* ARGSUSED */
   2471 void
   2472 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
   2473 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   2474 {
   2475 	int error;
   2476 	int iseof;
   2477 	struct iovec iov;
   2478 	struct uio uio;
   2479 	vnode_t *vp;
   2480 	char *ndata = NULL;
   2481 	struct sockaddr *ca;
   2482 	size_t nents;
   2483 	int ret;
   2484 
   2485 	vp = nfs_fhtovp(&rda->rda_fh, exi);
   2486 	if (vp == NULL) {
   2487 		rd->rd_entries = NULL;
   2488 		rd->rd_status = NFSERR_STALE;
   2489 		return;
   2490 	}
   2491 
   2492 	if (vp->v_type != VDIR) {
   2493 		VN_RELE(vp);
   2494 		rd->rd_entries = NULL;
   2495 		rd->rd_status = NFSERR_NOTDIR;
   2496 		return;
   2497 	}
   2498 
   2499 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
   2500 
   2501 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
   2502 
   2503 	if (error) {
   2504 		rd->rd_entries = NULL;
   2505 		goto bad;
   2506 	}
   2507 
   2508 	if (rda->rda_count == 0) {
   2509 		rd->rd_entries = NULL;
   2510 		rd->rd_size = 0;
   2511 		rd->rd_eof = FALSE;
   2512 		goto bad;
   2513 	}
   2514 
   2515 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
   2516 
   2517 	/*
   2518 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
   2519 	 */
   2520 	rd->rd_bufsize = (uint_t)rda->rda_count;
   2521 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
   2522 
   2523 	/*
   2524 	 * Set up io vector to read directory data
   2525 	 */
   2526 	iov.iov_base = (caddr_t)rd->rd_entries;
   2527 	iov.iov_len = rda->rda_count;
   2528 	uio.uio_iov = &iov;
   2529 	uio.uio_iovcnt = 1;
   2530 	uio.uio_segflg = UIO_SYSSPACE;
   2531 	uio.uio_extflg = UIO_COPY_CACHED;
   2532 	uio.uio_loffset = (offset_t)rda->rda_offset;
   2533 	uio.uio_resid = rda->rda_count;
   2534 
   2535 	/*
   2536 	 * read directory
   2537 	 */
   2538 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
   2539 
   2540 	/*
   2541 	 * Clean up
   2542 	 */
   2543 	if (!error) {
   2544 		/*
   2545 		 * set size and eof
   2546 		 */
   2547 		if (uio.uio_resid == rda->rda_count) {
   2548 			rd->rd_size = 0;
   2549 			rd->rd_eof = TRUE;
   2550 		} else {
   2551 			rd->rd_size = (uint32_t)(rda->rda_count -
   2552 			    uio.uio_resid);
   2553 			rd->rd_eof = iseof ? TRUE : FALSE;
   2554 		}
   2555 	}
   2556 
   2557 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
   2558 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
   2559 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
   2560 	    rda->rda_count, &ndata);
   2561 
   2562 	if (ret != 0) {
   2563 		size_t dropbytes;
   2564 		/*
   2565 		 * We had to drop one or more entries in order to fit
   2566 		 * during the character conversion.  We need to patch
   2567 		 * up the size and eof info.
   2568 		 */
   2569 		if (rd->rd_eof)
   2570 			rd->rd_eof = FALSE;
   2571 		dropbytes = nfscmd_dropped_entrysize(
   2572 		    (struct dirent64 *)rd->rd_entries, nents, ret);
   2573 		rd->rd_size -= dropbytes;
   2574 	}
   2575 	if (ndata == NULL) {
   2576 		ndata = (char *)rd->rd_entries;
   2577 	} else if (ndata != (char *)rd->rd_entries) {
   2578 		kmem_free(rd->rd_entries, rd->rd_bufsize);
   2579 		rd->rd_entries = (void *)ndata;
   2580 		rd->rd_bufsize = rda->rda_count;
   2581 	}
   2582 
   2583 bad:
   2584 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
   2585 
   2586 #if 0 /* notyet */
   2587 	/*
   2588 	 * Don't do this.  It causes local disk writes when just
   2589 	 * reading the file and the overhead is deemed larger
   2590 	 * than the benefit.
   2591 	 */
   2592 	/*
   2593 	 * Force modified metadata out to stable storage.
   2594 	 */
   2595 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
   2596 #endif
   2597 
   2598 	VN_RELE(vp);
   2599 
   2600 	rd->rd_status = puterrno(error);
   2601 
   2602 }
   2603 void *
   2604 rfs_readdir_getfh(struct nfsrddirargs *rda)
   2605 {
   2606 	return (&rda->rda_fh);
   2607 }
   2608 void
   2609 rfs_rddirfree(struct nfsrddirres *rd)
   2610 {
   2611 	if (rd->rd_entries != NULL)
   2612 		kmem_free(rd->rd_entries, rd->rd_bufsize);
   2613 }
   2614 
   2615 /* ARGSUSED */
   2616 void
   2617 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
   2618 	struct svc_req *req, cred_t *cr)
   2619 {
   2620 	int error;
   2621 	struct statvfs64 sb;
   2622 	vnode_t *vp;
   2623 
   2624 	vp = nfs_fhtovp(fh, exi);
   2625 	if (vp == NULL) {
   2626 		fs->fs_status = NFSERR_STALE;
   2627 		return;
   2628 	}
   2629 
   2630 	error = VFS_STATVFS(vp->v_vfsp, &sb);
   2631 
   2632 	if (!error) {
   2633 		fs->fs_tsize = nfstsize();
   2634 		fs->fs_bsize = sb.f_frsize;
   2635 		fs->fs_blocks = sb.f_blocks;
   2636 		fs->fs_bfree = sb.f_bfree;
   2637 		fs->fs_bavail = sb.f_bavail;
   2638 	}
   2639 
   2640 	VN_RELE(vp);
   2641 
   2642 	fs->fs_status = puterrno(error);
   2643 
   2644 }
   2645 void *
   2646 rfs_statfs_getfh(fhandle_t *fh)
   2647 {
   2648 	return (fh);
   2649 }
   2650 
   2651 static int
   2652 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
   2653 {
   2654 	vap->va_mask = 0;
   2655 
   2656 	/*
   2657 	 * There was a sign extension bug in some VFS based systems
   2658 	 * which stored the mode as a short.  When it would get
   2659 	 * assigned to a u_long, no sign extension would occur.
   2660 	 * It needed to, but this wasn't noticed because sa_mode
   2661 	 * would then get assigned back to the short, thus ignoring
   2662 	 * the upper 16 bits of sa_mode.
   2663 	 *
   2664 	 * To make this implementation work for both broken
   2665 	 * clients and good clients, we check for both versions
   2666 	 * of the mode.
   2667 	 */
   2668 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
   2669 	    sa->sa_mode != (uint32_t)-1) {
   2670 		vap->va_mask |= AT_MODE;
   2671 		vap->va_mode = sa->sa_mode;
   2672 	}
   2673 	if (sa->sa_uid != (uint32_t)-1) {
   2674 		vap->va_mask |= AT_UID;
   2675 		vap->va_uid = sa->sa_uid;
   2676 	}
   2677 	if (sa->sa_gid != (uint32_t)-1) {
   2678 		vap->va_mask |= AT_GID;
   2679 		vap->va_gid = sa->sa_gid;
   2680 	}
   2681 	if (sa->sa_size != (uint32_t)-1) {
   2682 		vap->va_mask |= AT_SIZE;
   2683 		vap->va_size = sa->sa_size;
   2684 	}
   2685 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
   2686 	    sa->sa_atime.tv_usec != (int32_t)-1) {
   2687 #ifndef _LP64
   2688 		/* return error if time overflow */
   2689 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
   2690 			return (EOVERFLOW);
   2691 #endif
   2692 		vap->va_mask |= AT_ATIME;
   2693 		/*
   2694 		 * nfs protocol defines times as unsigned so don't extend sign,
   2695 		 * unless sysadmin set nfs_allow_preepoch_time.
   2696 		 */
   2697 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
   2698 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
   2699 	}
   2700 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
   2701 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
   2702 #ifndef _LP64
   2703 		/* return error if time overflow */
   2704 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
   2705 			return (EOVERFLOW);
   2706 #endif
   2707 		vap->va_mask |= AT_MTIME;
   2708 		/*
   2709 		 * nfs protocol defines times as unsigned so don't extend sign,
   2710 		 * unless sysadmin set nfs_allow_preepoch_time.
   2711 		 */
   2712 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
   2713 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
   2714 	}
   2715 	return (0);
   2716 }
   2717 
   2718 static enum nfsftype vt_to_nf[] = {
   2719 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
   2720 };
   2721 
   2722 /*
   2723  * check the following fields for overflow: nodeid, size, and time.
   2724  * There could be a problem when converting 64-bit LP64 fields
   2725  * into 32-bit ones.  Return an error if there is an overflow.
   2726  */
   2727 int
   2728 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
   2729 {
   2730 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
   2731 	na->na_type = vt_to_nf[vap->va_type];
   2732 
   2733 	if (vap->va_mode == (unsigned short) -1)
   2734 		na->na_mode = (uint32_t)-1;
   2735 	else
   2736 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
   2737 
   2738 	if (vap->va_uid == (unsigned short)(-1))
   2739 		na->na_uid = (uint32_t)(-1);
   2740 	else if (vap->va_uid == UID_NOBODY)
   2741 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
   2742 	else
   2743 		na->na_uid = vap->va_uid;
   2744 
   2745 	if (vap->va_gid == (unsigned short)(-1))
   2746 		na->na_gid = (uint32_t)-1;
   2747 	else if (vap->va_gid == GID_NOBODY)
   2748 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
   2749 	else
   2750 		na->na_gid = vap->va_gid;
   2751 
   2752 	/*
   2753 	 * Do we need to check fsid for overflow?  It is 64-bit in the
   2754 	 * vattr, but are bigger than 32 bit values supported?
   2755 	 */
   2756 	na->na_fsid = vap->va_fsid;
   2757 
   2758 	na->na_nodeid = vap->va_nodeid;
   2759 
   2760 	/*
   2761 	 * Check to make sure that the nodeid is representable over the
   2762 	 * wire without losing bits.
   2763 	 */
   2764 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
   2765 		return (EFBIG);
   2766 	na->na_nlink = vap->va_nlink;
   2767 
   2768 	/*
   2769 	 * Check for big files here, instead of at the caller.  See
   2770 	 * comments in cstat for large special file explanation.
   2771 	 */
   2772 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
   2773 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
   2774 			return (EFBIG);
   2775 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
   2776 			/* UNKNOWN_SIZE | OVERFLOW */
   2777 			na->na_size = MAXOFF32_T;
   2778 		} else
   2779 			na->na_size = vap->va_size;
   2780 	} else
   2781 		na->na_size = vap->va_size;
   2782 
   2783 	/*
   2784 	 * If the vnode times overflow the 32-bit times that NFS2
   2785 	 * uses on the wire then return an error.
   2786 	 */
   2787 	if (!NFS_VAP_TIME_OK(vap)) {
   2788 		return (EOVERFLOW);
   2789 	}
   2790 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
   2791 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
   2792 
   2793 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
   2794 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
   2795 
   2796 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
   2797 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
   2798 
   2799 	/*
   2800 	 * If the dev_t will fit into 16 bits then compress
   2801 	 * it, otherwise leave it alone. See comments in
   2802 	 * nfs_client.c.
   2803 	 */
   2804 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
   2805 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
   2806 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
   2807 	else
   2808 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
   2809 
   2810 	na->na_blocks = vap->va_nblocks;
   2811 	na->na_blocksize = vap->va_blksize;
   2812 
   2813 	/*
   2814 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
   2815 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
   2816 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
   2817 	 *
   2818 	 * BUYER BEWARE:
   2819 	 *  If you are porting the NFS to a non-Sun server, you probably
   2820 	 *  don't want to include the following block of code.  The
   2821 	 *  over-the-wire special file types will be changing with the
   2822 	 *  NFS Protocol Revision.
   2823 	 */
   2824 	if (vap->va_type == VFIFO)
   2825 		NA_SETFIFO(na);
   2826 	return (0);
   2827 }
   2828 
   2829 /*
   2830  * acl v2 support: returns approximate permission.
   2831  *	default: returns minimal permission (more restrictive)
   2832  *	aclok: returns maximal permission (less restrictive)
   2833  *	This routine changes the permissions that are alaredy in *va.
   2834  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
   2835  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
   2836  */
   2837 static void
   2838 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
   2839 {
   2840 	vsecattr_t	vsa;
   2841 	int		aclcnt;
   2842 	aclent_t	*aclentp;
   2843 	mode_t		mask_perm;
   2844 	mode_t		grp_perm;
   2845 	mode_t		other_perm;
   2846 	mode_t		other_orig;
   2847 	int		error;
   2848 
   2849 	/* dont care default acl */
   2850 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
   2851 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
   2852 
   2853 	if (!error) {
   2854 		aclcnt = vsa.vsa_aclcnt;
   2855 		if (aclcnt > MIN_ACL_ENTRIES) {
   2856 			/* non-trivial ACL */
   2857 			aclentp = vsa.vsa_aclentp;
   2858 			if (exi->exi_export.ex_flags & EX_ACLOK) {
   2859 				/* maximal permissions */
   2860 				grp_perm = 0;
   2861 				other_perm = 0;
   2862 				for (; aclcnt > 0; aclcnt--, aclentp++) {
   2863 					switch (aclentp->a_type) {
   2864 					case USER_OBJ:
   2865 						break;
   2866 					case USER:
   2867 						grp_perm |=
   2868 						    aclentp->a_perm << 3;
   2869 						other_perm |= aclentp->a_perm;
   2870 						break;
   2871 					case GROUP_OBJ:
   2872 						grp_perm |=
   2873 						    aclentp->a_perm << 3;
   2874 						break;
   2875 					case GROUP:
   2876 						other_perm |= aclentp->a_perm;
   2877 						break;
   2878 					case OTHER_OBJ:
   2879 						other_orig = aclentp->a_perm;
   2880 						break;
   2881 					case CLASS_OBJ:
   2882 						mask_perm = aclentp->a_perm;
   2883 						break;
   2884 					default:
   2885 						break;
   2886 					}
   2887 				}
   2888 				grp_perm &= mask_perm << 3;
   2889 				other_perm &= mask_perm;
   2890 				other_perm |= other_orig;
   2891 
   2892 			} else {
   2893 				/* minimal permissions */
   2894 				grp_perm = 070;
   2895 				other_perm = 07;
   2896 				for (; aclcnt > 0; aclcnt--, aclentp++) {
   2897 					switch (aclentp->a_type) {
   2898 					case USER_OBJ:
   2899 						break;
   2900 					case USER:
   2901 					case CLASS_OBJ:
   2902 						grp_perm &=
   2903 						    aclentp->a_perm << 3;
   2904 						other_perm &=
   2905 						    aclentp->a_perm;
   2906 						break;
   2907 					case GROUP_OBJ:
   2908 						grp_perm &=
   2909 						    aclentp->a_perm << 3;
   2910 						break;
   2911 					case GROUP:
   2912 						other_perm &=
   2913 						    aclentp->a_perm;
   2914 						break;
   2915 					case OTHER_OBJ:
   2916 						other_perm &=
   2917 						    aclentp->a_perm;
   2918 						break;
   2919 					default:
   2920 						break;
   2921 					}
   2922 				}
   2923 			}
   2924 			/* copy to va */
   2925 			va->va_mode &= ~077;
   2926 			va->va_mode |= grp_perm | other_perm;
   2927 		}
   2928 		if (vsa.vsa_aclcnt)
   2929 			kmem_free(vsa.vsa_aclentp,
   2930 			    vsa.vsa_aclcnt * sizeof (aclent_t));
   2931 	}
   2932 }
   2933 
/*
 * Set up the state initialized here for the NFS version 2 server:
 * the rfs_async_write_lock mutex and the caller id stored in
 * nfs2_srv_caller_id.
 */
void
rfs_srvrinit(void)
{
	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
	nfs2_srv_caller_id = fs_new_caller_id();
}
   2940 
/*
 * Tear down what rfs_srvrinit() set up that needs explicit cleanup:
 * the rfs_async_write_lock mutex.
 */
void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
   2946 
   2947 static int
   2948 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
   2949 {
   2950 	struct clist	*wcl;
   2951 	int		wlist_len;
   2952 	uint32_t	count = rr->rr_count;
   2953 
   2954 	wcl = ra->ra_wlist;
   2955 
   2956 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
   2957 		return (FALSE);
   2958 	}
   2959 
   2960 	wcl = ra->ra_wlist;
   2961 	rr->rr_ok.rrok_wlist_len = wlist_len;
   2962 	rr->rr_ok.rrok_wlist = wcl;
   2963 
   2964 	return (TRUE);
   2965 }
   2966