Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
     28  *	All rights reserved.
     29  */
     30 
     31 #include <sys/param.h>
     32 #include <sys/types.h>
     33 #include <sys/systm.h>
     34 #include <sys/cred.h>
     35 #include <sys/buf.h>
     36 #include <sys/vfs.h>
     37 #include <sys/vnode.h>
     38 #include <sys/uio.h>
     39 #include <sys/stat.h>
     40 #include <sys/errno.h>
     41 #include <sys/sysmacros.h>
     42 #include <sys/statvfs.h>
     43 #include <sys/kmem.h>
     44 #include <sys/kstat.h>
     45 #include <sys/dirent.h>
     46 #include <sys/cmn_err.h>
     47 #include <sys/debug.h>
     48 #include <sys/vtrace.h>
     49 #include <sys/mode.h>
     50 #include <sys/acl.h>
     51 #include <sys/nbmlock.h>
     52 #include <sys/policy.h>
     53 #include <sys/sdt.h>
     54 
     55 #include <rpc/types.h>
     56 #include <rpc/auth.h>
     57 #include <rpc/svc.h>
     58 
     59 #include <nfs/nfs.h>
     60 #include <nfs/export.h>
     61 #include <nfs/nfs_cmd.h>
     62 
     63 #include <vm/hat.h>
     64 #include <vm/as.h>
     65 #include <vm/seg.h>
     66 #include <vm/seg_map.h>
     67 #include <vm/seg_kmem.h>
     68 
     69 #include <sys/strsubr.h>
     70 
     71 /*
     72  * These are the interface routines for the server side of the
     73  * Network File System.  See the NFS version 2 protocol specification
     74  * for a description of this interface.
     75  */
     76 
/*
 * Forward declarations for helpers used by the routines below.
 *
 * sattr_to_vattr() fills in a vattr from the over-the-wire nfssattr and
 * returns a non-zero errno value on failure (see rfs_setattr()).
 *
 * acl_perm() is called on a vattr just before it is converted for the
 * reply; its definition is not in this part of the file.
 */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Caller id placed in caller_context_t.cc_caller_id for the VOP calls
 * issued by the NFSv2 server routines in this file.  It is only read
 * here; the assignment is not in this part of the file.
 */
u_longlong_t nfs2_srv_caller_id;
     91 
     92 /*
     93  * Get file attributes.
     94  * Returns the current attributes of the file with the given fhandle.
     95  */
     96 /* ARGSUSED */
     97 void
     98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
     99 	struct svc_req *req, cred_t *cr)
    100 {
    101 	int error;
    102 	vnode_t *vp;
    103 	struct vattr va;
    104 
    105 	vp = nfs_fhtovp(fhp, exi);
    106 	if (vp == NULL) {
    107 		ns->ns_status = NFSERR_STALE;
    108 		return;
    109 	}
    110 
    111 	/*
    112 	 * Do the getattr.
    113 	 */
    114 	va.va_mask = AT_ALL;	/* we want all the attributes */
    115 
    116 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
    117 
    118 	/* check for overflows */
    119 	if (!error) {
    120 		/* Lie about the object type for a referral */
    121 		if (vn_is_nfs_reparse(vp, cr))
    122 			va.va_type = VLNK;
    123 
    124 		acl_perm(vp, exi, &va, cr);
    125 		error = vattr_to_nattr(&va, &ns->ns_attr);
    126 	}
    127 
    128 	VN_RELE(vp);
    129 
    130 	ns->ns_status = puterrno(error);
    131 }
    132 void *
    133 rfs_getattr_getfh(fhandle_t *fhp)
    134 {
    135 	return (fhp);
    136 }
    137 
    138 /*
    139  * Set file attributes.
    140  * Sets the attributes of the file with the given fhandle.  Returns
    141  * the new attributes.
    142  */
    143 void
    144 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    145 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
    146 {
    147 	int error;
    148 	int flag;
    149 	int in_crit = 0;
    150 	vnode_t *vp;
    151 	struct vattr va;
    152 	struct vattr bva;
    153 	struct flock64 bf;
    154 	caller_context_t ct;
    155 
    156 
    157 	vp = nfs_fhtovp(&args->saa_fh, exi);
    158 	if (vp == NULL) {
    159 		ns->ns_status = NFSERR_STALE;
    160 		return;
    161 	}
    162 
    163 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
    164 		VN_RELE(vp);
    165 		ns->ns_status = NFSERR_ROFS;
    166 		return;
    167 	}
    168 
    169 	error = sattr_to_vattr(&args->saa_sa, &va);
    170 	if (error) {
    171 		VN_RELE(vp);
    172 		ns->ns_status = puterrno(error);
    173 		return;
    174 	}
    175 
    176 	/*
    177 	 * If the client is requesting a change to the mtime,
    178 	 * but the nanosecond field is set to 1 billion, then
    179 	 * this is a flag to the server that it should set the
    180 	 * atime and mtime fields to the server's current time.
    181 	 * The 1 billion number actually came from the client
    182 	 * as 1 million, but the units in the over the wire
    183 	 * request are microseconds instead of nanoseconds.
    184 	 *
    185 	 * This is an overload of the protocol and should be
    186 	 * documented in the NFS Version 2 protocol specification.
    187 	 */
    188 	if (va.va_mask & AT_MTIME) {
    189 		if (va.va_mtime.tv_nsec == 1000000000) {
    190 			gethrestime(&va.va_mtime);
    191 			va.va_atime = va.va_mtime;
    192 			va.va_mask |= AT_ATIME;
    193 			flag = 0;
    194 		} else
    195 			flag = ATTR_UTIME;
    196 	} else
    197 		flag = 0;
    198 
    199 	/*
    200 	 * If the filesystem is exported with nosuid, then mask off
    201 	 * the setuid and setgid bits.
    202 	 */
    203 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
    204 	    (exi->exi_export.ex_flags & EX_NOSUID))
    205 		va.va_mode &= ~(VSUID | VSGID);
    206 
    207 	ct.cc_sysid = 0;
    208 	ct.cc_pid = 0;
    209 	ct.cc_caller_id = nfs2_srv_caller_id;
    210 	ct.cc_flags = CC_DONTBLOCK;
    211 
    212 	/*
    213 	 * We need to specially handle size changes because it is
    214 	 * possible for the client to create a file with modes
    215 	 * which indicate read-only, but with the file opened for
    216 	 * writing.  If the client then tries to set the size of
    217 	 * the file, then the normal access checking done in
    218 	 * VOP_SETATTR would prevent the client from doing so,
    219 	 * although it should be legal for it to do so.  To get
    220 	 * around this, we do the access checking for ourselves
    221 	 * and then use VOP_SPACE which doesn't do the access
    222 	 * checking which VOP_SETATTR does. VOP_SPACE can only
    223 	 * operate on VREG files, let VOP_SETATTR handle the other
    224 	 * extremely rare cases.
    225 	 * Also the client should not be allowed to change the
    226 	 * size of the file if there is a conflicting non-blocking
    227 	 * mandatory lock in the region of change.
    228 	 */
    229 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
    230 		if (nbl_need_check(vp)) {
    231 			nbl_start_crit(vp, RW_READER);
    232 			in_crit = 1;
    233 		}
    234 
    235 		bva.va_mask = AT_UID | AT_SIZE;
    236 
    237 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
    238 
    239 		if (error) {
    240 			if (in_crit)
    241 				nbl_end_crit(vp);
    242 			VN_RELE(vp);
    243 			ns->ns_status = puterrno(error);
    244 			return;
    245 		}
    246 
    247 		if (in_crit) {
    248 			u_offset_t offset;
    249 			ssize_t length;
    250 
    251 			if (va.va_size < bva.va_size) {
    252 				offset = va.va_size;
    253 				length = bva.va_size - va.va_size;
    254 			} else {
    255 				offset = bva.va_size;
    256 				length = va.va_size - bva.va_size;
    257 			}
    258 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
    259 			    NULL)) {
    260 				error = EACCES;
    261 			}
    262 		}
    263 
    264 		if (crgetuid(cr) == bva.va_uid && !error &&
    265 		    va.va_size != bva.va_size) {
    266 			va.va_mask &= ~AT_SIZE;
    267 			bf.l_type = F_WRLCK;
    268 			bf.l_whence = 0;
    269 			bf.l_start = (off64_t)va.va_size;
    270 			bf.l_len = 0;
    271 			bf.l_sysid = 0;
    272 			bf.l_pid = 0;
    273 
    274 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
    275 			    (offset_t)va.va_size, cr, &ct);
    276 		}
    277 		if (in_crit)
    278 			nbl_end_crit(vp);
    279 	} else
    280 		error = 0;
    281 
    282 	/*
    283 	 * Do the setattr.
    284 	 */
    285 	if (!error && va.va_mask) {
    286 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
    287 	}
    288 
    289 	/*
    290 	 * check if the monitor on either vop_space or vop_setattr detected
    291 	 * a delegation conflict and if so, mark the thread flag as
    292 	 * wouldblock so that the response is dropped and the client will
    293 	 * try again.
    294 	 */
    295 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
    296 		VN_RELE(vp);
    297 		curthread->t_flag |= T_WOULDBLOCK;
    298 		return;
    299 	}
    300 
    301 	if (!error) {
    302 		va.va_mask = AT_ALL;	/* get everything */
    303 
    304 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
    305 
    306 		/* check for overflows */
    307 		if (!error) {
    308 			acl_perm(vp, exi, &va, cr);
    309 			error = vattr_to_nattr(&va, &ns->ns_attr);
    310 		}
    311 	}
    312 
    313 	ct.cc_flags = 0;
    314 
    315 	/*
    316 	 * Force modified metadata out to stable storage.
    317 	 */
    318 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
    319 
    320 	VN_RELE(vp);
    321 
    322 	ns->ns_status = puterrno(error);
    323 }
    324 void *
    325 rfs_setattr_getfh(struct nfssaargs *args)
    326 {
    327 	return (&args->saa_fh);
    328 }
    329 
    330 /*
    331  * Directory lookup.
    332  * Returns an fhandle and file attributes for file name in a directory.
    333  */
    334 /* ARGSUSED */
    335 void
    336 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    337 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
    338 {
    339 	int error;
    340 	vnode_t *dvp;
    341 	vnode_t *vp;
    342 	struct vattr va;
    343 	fhandle_t *fhp = da->da_fhandle;
    344 	struct sec_ol sec = {0, 0};
    345 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
    346 	char *name;
    347 	struct sockaddr *ca;
    348 
    349 	/*
    350 	 * Trusted Extension doesn't support NFSv2. MOUNT
    351 	 * will reject v2 clients. Need to prevent v2 client
    352 	 * access via WebNFS here.
    353 	 */
    354 	if (is_system_labeled() && req->rq_vers == 2) {
    355 		dr->dr_status = NFSERR_ACCES;
    356 		return;
    357 	}
    358 
    359 	/*
    360 	 * Disallow NULL paths
    361 	 */
    362 	if (da->da_name == NULL || *da->da_name == '\0') {
    363 		dr->dr_status = NFSERR_ACCES;
    364 		return;
    365 	}
    366 
    367 	/*
    368 	 * Allow lookups from the root - the default
    369 	 * location of the public filehandle.
    370 	 */
    371 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
    372 		dvp = rootdir;
    373 		VN_HOLD(dvp);
    374 	} else {
    375 		dvp = nfs_fhtovp(fhp, exi);
    376 		if (dvp == NULL) {
    377 			dr->dr_status = NFSERR_STALE;
    378 			return;
    379 		}
    380 	}
    381 
    382 	/*
    383 	 * Not allow lookup beyond root.
    384 	 * If the filehandle matches a filehandle of the exi,
    385 	 * then the ".." refers beyond the root of an exported filesystem.
    386 	 */
    387 	if (strcmp(da->da_name, "..") == 0 &&
    388 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
    389 		VN_RELE(dvp);
    390 		dr->dr_status = NFSERR_NOENT;
    391 		return;
    392 	}
    393 
    394 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
    395 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
    396 	    MAXPATHLEN);
    397 
    398 	if (name == NULL) {
    399 		dr->dr_status = NFSERR_ACCES;
    400 		return;
    401 	}
    402 
    403 	/*
    404 	 * If the public filehandle is used then allow
    405 	 * a multi-component lookup, i.e. evaluate
    406 	 * a pathname and follow symbolic links if
    407 	 * necessary.
    408 	 *
    409 	 * This may result in a vnode in another filesystem
    410 	 * which is OK as long as the filesystem is exported.
    411 	 */
    412 	if (PUBLIC_FH2(fhp)) {
    413 		publicfh_flag = TRUE;
    414 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
    415 		    &sec);
    416 	} else {
    417 		/*
    418 		 * Do a normal single component lookup.
    419 		 */
    420 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
    421 		    NULL, NULL, NULL);
    422 	}
    423 
    424 	if (name != da->da_name)
    425 		kmem_free(name, MAXPATHLEN);
    426 
    427 
    428 	if (!error) {
    429 		va.va_mask = AT_ALL;	/* we want everything */
    430 
    431 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
    432 
    433 		/* check for overflows */
    434 		if (!error) {
    435 			acl_perm(vp, exi, &va, cr);
    436 			error = vattr_to_nattr(&va, &dr->dr_attr);
    437 			if (!error) {
    438 				if (sec.sec_flags & SEC_QUERY)
    439 					error = makefh_ol(&dr->dr_fhandle, exi,
    440 					    sec.sec_index);
    441 				else {
    442 					error = makefh(&dr->dr_fhandle, vp,
    443 					    exi);
    444 					if (!error && publicfh_flag &&
    445 					    !chk_clnt_sec(exi, req))
    446 						auth_weak = TRUE;
    447 				}
    448 			}
    449 		}
    450 		VN_RELE(vp);
    451 	}
    452 
    453 	VN_RELE(dvp);
    454 
    455 	/*
    456 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
    457 	 * and have obtained a new exportinfo in exi which needs to be
    458 	 * released. Note the the original exportinfo pointed to by exi
    459 	 * will be released by the caller, comon_dispatch.
    460 	 */
    461 	if (publicfh_flag && exi != NULL)
    462 		exi_rele(exi);
    463 
    464 	/*
    465 	 * If it's public fh, no 0x81, and client's flavor is
    466 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
    467 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
    468 	 */
    469 	if (auth_weak)
    470 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
    471 	else
    472 		dr->dr_status = puterrno(error);
    473 }
    474 void *
    475 rfs_lookup_getfh(struct nfsdiropargs *da)
    476 {
    477 	return (da->da_fhandle);
    478 }
    479 
    480 /*
    481  * Read symbolic link.
    482  * Returns the string in the symbolic link at the given fhandle.
    483  */
    484 /* ARGSUSED */
    485 void
    486 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    487 	struct svc_req *req, cred_t *cr)
    488 {
    489 	int error;
    490 	struct iovec iov;
    491 	struct uio uio;
    492 	vnode_t *vp;
    493 	struct vattr va;
    494 	struct sockaddr *ca;
    495 	char *name = NULL;
    496 	int is_referral = 0;
    497 
    498 	vp = nfs_fhtovp(fhp, exi);
    499 	if (vp == NULL) {
    500 		rl->rl_data = NULL;
    501 		rl->rl_status = NFSERR_STALE;
    502 		return;
    503 	}
    504 
    505 	va.va_mask = AT_MODE;
    506 
    507 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
    508 
    509 	if (error) {
    510 		VN_RELE(vp);
    511 		rl->rl_data = NULL;
    512 		rl->rl_status = puterrno(error);
    513 		return;
    514 	}
    515 
    516 	if (MANDLOCK(vp, va.va_mode)) {
    517 		VN_RELE(vp);
    518 		rl->rl_data = NULL;
    519 		rl->rl_status = NFSERR_ACCES;
    520 		return;
    521 	}
    522 
    523 	/* We lied about the object type for a referral */
    524 	if (vn_is_nfs_reparse(vp, cr))
    525 		is_referral = 1;
    526 
    527 	/*
    528 	 * XNFS and RFC1094 require us to return ENXIO if argument
    529 	 * is not a link. BUGID 1138002.
    530 	 */
    531 	if (vp->v_type != VLNK && !is_referral) {
    532 		VN_RELE(vp);
    533 		rl->rl_data = NULL;
    534 		rl->rl_status = NFSERR_NXIO;
    535 		return;
    536 	}
    537 
    538 	/*
    539 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
    540 	 */
    541 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
    542 
    543 	if (is_referral) {
    544 		char *s;
    545 		size_t strsz;
    546 
    547 		/* Get an artificial symlink based on a referral */
    548 		s = build_symlink(vp, cr, &strsz);
    549 		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
    550 		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
    551 		    vnode_t *, vp, char *, s);
    552 		if (s == NULL)
    553 			error = EINVAL;
    554 		else {
    555 			error = 0;
    556 			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
    557 			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
    558 			kmem_free(s, strsz);
    559 		}
    560 
    561 	} else {
    562 
    563 		/*
    564 		 * Set up io vector to read sym link data
    565 		 */
    566 		iov.iov_base = rl->rl_data;
    567 		iov.iov_len = NFS_MAXPATHLEN;
    568 		uio.uio_iov = &iov;
    569 		uio.uio_iovcnt = 1;
    570 		uio.uio_segflg = UIO_SYSSPACE;
    571 		uio.uio_extflg = UIO_COPY_CACHED;
    572 		uio.uio_loffset = (offset_t)0;
    573 		uio.uio_resid = NFS_MAXPATHLEN;
    574 
    575 		/*
    576 		 * Do the readlink.
    577 		 */
    578 		error = VOP_READLINK(vp, &uio, cr, NULL);
    579 
    580 		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
    581 
    582 		if (!error)
    583 			rl->rl_data[rl->rl_count] = '\0';
    584 
    585 	}
    586 
    587 
    588 	VN_RELE(vp);
    589 
    590 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
    591 	name = nfscmd_convname(ca, exi, rl->rl_data,
    592 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
    593 
    594 	if (name != NULL && name != rl->rl_data) {
    595 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
    596 		rl->rl_data = name;
    597 	}
    598 
    599 	/*
    600 	 * XNFS and RFC1094 require us to return ENXIO if argument
    601 	 * is not a link. UFS returns EINVAL if this is the case,
    602 	 * so we do the mapping here. BUGID 1138002.
    603 	 */
    604 	if (error == EINVAL)
    605 		rl->rl_status = NFSERR_NXIO;
    606 	else
    607 		rl->rl_status = puterrno(error);
    608 
    609 }
    610 void *
    611 rfs_readlink_getfh(fhandle_t *fhp)
    612 {
    613 	return (fhp);
    614 }
    615 /*
    616  * Free data allocated by rfs_readlink
    617  */
    618 void
    619 rfs_rlfree(struct nfsrdlnres *rl)
    620 {
    621 	if (rl->rl_data != NULL)
    622 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
    623 }
    624 
/*
 * Forward declaration.  rfs_read() calls this on the RDMA write-list
 * path once the read has succeeded and treats a zero return as failure.
 */
static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
    626 
    627 /*
    628  * Read data.
    629  * Returns some data read from the file at the given fhandle.
    630  */
    631 /* ARGSUSED */
    632 void
    633 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    634 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
    635 {
    636 	vnode_t *vp;
    637 	int error;
    638 	struct vattr va;
    639 	struct iovec iov;
    640 	struct uio uio;
    641 	mblk_t *mp;
    642 	int alloc_err = 0;
    643 	int in_crit = 0;
    644 	caller_context_t ct;
    645 
    646 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
    647 	if (vp == NULL) {
    648 		rr->rr_data = NULL;
    649 		rr->rr_status = NFSERR_STALE;
    650 		return;
    651 	}
    652 
    653 	if (vp->v_type != VREG) {
    654 		VN_RELE(vp);
    655 		rr->rr_data = NULL;
    656 		rr->rr_status = NFSERR_ISDIR;
    657 		return;
    658 	}
    659 
    660 	ct.cc_sysid = 0;
    661 	ct.cc_pid = 0;
    662 	ct.cc_caller_id = nfs2_srv_caller_id;
    663 	ct.cc_flags = CC_DONTBLOCK;
    664 
    665 	/*
    666 	 * Enter the critical region before calling VOP_RWLOCK
    667 	 * to avoid a deadlock with write requests.
    668 	 */
    669 	if (nbl_need_check(vp)) {
    670 		nbl_start_crit(vp, RW_READER);
    671 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
    672 		    0, NULL)) {
    673 			nbl_end_crit(vp);
    674 			VN_RELE(vp);
    675 			rr->rr_data = NULL;
    676 			rr->rr_status = NFSERR_ACCES;
    677 			return;
    678 		}
    679 		in_crit = 1;
    680 	}
    681 
    682 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
    683 
    684 	/* check if a monitor detected a delegation conflict */
    685 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
    686 		VN_RELE(vp);
    687 		/* mark as wouldblock so response is dropped */
    688 		curthread->t_flag |= T_WOULDBLOCK;
    689 
    690 		rr->rr_data = NULL;
    691 		return;
    692 	}
    693 
    694 	va.va_mask = AT_ALL;
    695 
    696 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
    697 
    698 	if (error) {
    699 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    700 		if (in_crit)
    701 			nbl_end_crit(vp);
    702 
    703 		VN_RELE(vp);
    704 		rr->rr_data = NULL;
    705 		rr->rr_status = puterrno(error);
    706 
    707 		return;
    708 	}
    709 
    710 	/*
    711 	 * This is a kludge to allow reading of files created
    712 	 * with no read permission.  The owner of the file
    713 	 * is always allowed to read it.
    714 	 */
    715 	if (crgetuid(cr) != va.va_uid) {
    716 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
    717 
    718 		if (error) {
    719 			/*
    720 			 * Exec is the same as read over the net because
    721 			 * of demand loading.
    722 			 */
    723 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
    724 		}
    725 		if (error) {
    726 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    727 			if (in_crit)
    728 				nbl_end_crit(vp);
    729 			VN_RELE(vp);
    730 			rr->rr_data = NULL;
    731 			rr->rr_status = puterrno(error);
    732 
    733 			return;
    734 		}
    735 	}
    736 
    737 	if (MANDLOCK(vp, va.va_mode)) {
    738 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    739 		if (in_crit)
    740 			nbl_end_crit(vp);
    741 
    742 		VN_RELE(vp);
    743 		rr->rr_data = NULL;
    744 		rr->rr_status = NFSERR_ACCES;
    745 
    746 		return;
    747 	}
    748 
    749 	rr->rr_ok.rrok_wlist_len = 0;
    750 	rr->rr_ok.rrok_wlist = NULL;
    751 
    752 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
    753 		rr->rr_count = 0;
    754 		rr->rr_data = NULL;
    755 		/*
    756 		 * In this case, status is NFS_OK, but there is no data
    757 		 * to encode. So set rr_mp to NULL.
    758 		 */
    759 		rr->rr_mp = NULL;
    760 		rr->rr_ok.rrok_wlist = ra->ra_wlist;
    761 		if (rr->rr_ok.rrok_wlist)
    762 			clist_zero_len(rr->rr_ok.rrok_wlist);
    763 		goto done;
    764 	}
    765 
    766 	if (ra->ra_wlist) {
    767 		mp = NULL;
    768 		rr->rr_mp = NULL;
    769 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
    770 	} else {
    771 		/*
    772 		 * mp will contain the data to be sent out in the read reply.
    773 		 * This will be freed after the reply has been sent out (by the
    774 		 * driver).
    775 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
    776 		 * that the call to xdrmblk_putmblk() never fails.
    777 		 */
    778 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
    779 		    &alloc_err);
    780 		ASSERT(mp != NULL);
    781 		ASSERT(alloc_err == 0);
    782 
    783 		rr->rr_mp = mp;
    784 
    785 		/*
    786 		 * Set up io vector
    787 		 */
    788 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
    789 		iov.iov_len = ra->ra_count;
    790 	}
    791 
    792 	uio.uio_iov = &iov;
    793 	uio.uio_iovcnt = 1;
    794 	uio.uio_segflg = UIO_SYSSPACE;
    795 	uio.uio_extflg = UIO_COPY_CACHED;
    796 	uio.uio_loffset = (offset_t)ra->ra_offset;
    797 	uio.uio_resid = ra->ra_count;
    798 
    799 	error = VOP_READ(vp, &uio, 0, cr, &ct);
    800 
    801 	if (error) {
    802 		if (mp)
    803 			freeb(mp);
    804 
    805 		/*
    806 		 * check if a monitor detected a delegation conflict and
    807 		 * mark as wouldblock so response is dropped
    808 		 */
    809 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
    810 			curthread->t_flag |= T_WOULDBLOCK;
    811 		else
    812 			rr->rr_status = puterrno(error);
    813 
    814 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    815 		if (in_crit)
    816 			nbl_end_crit(vp);
    817 
    818 		VN_RELE(vp);
    819 		rr->rr_data = NULL;
    820 
    821 		return;
    822 	}
    823 
    824 	/*
    825 	 * Get attributes again so we can send the latest access
    826 	 * time to the client side for his cache.
    827 	 */
    828 	va.va_mask = AT_ALL;
    829 
    830 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
    831 
    832 	if (error) {
    833 		if (mp)
    834 			freeb(mp);
    835 
    836 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    837 		if (in_crit)
    838 			nbl_end_crit(vp);
    839 
    840 		VN_RELE(vp);
    841 		rr->rr_data = NULL;
    842 		rr->rr_status = puterrno(error);
    843 
    844 		return;
    845 	}
    846 
    847 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
    848 
    849 	if (mp) {
    850 		rr->rr_data = (char *)mp->b_datap->db_base;
    851 	} else {
    852 		if (ra->ra_wlist) {
    853 			rr->rr_data = (caddr_t)iov.iov_base;
    854 			if (!rdma_setup_read_data2(ra, rr)) {
    855 				rr->rr_data = NULL;
    856 				rr->rr_status = puterrno(NFSERR_INVAL);
    857 			}
    858 		}
    859 	}
    860 done:
    861 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
    862 	if (in_crit)
    863 		nbl_end_crit(vp);
    864 
    865 	acl_perm(vp, exi, &va, cr);
    866 
    867 	/* check for overflows */
    868 	error = vattr_to_nattr(&va, &rr->rr_attr);
    869 
    870 	VN_RELE(vp);
    871 
    872 	rr->rr_status = puterrno(error);
    873 }
    874 
    875 /*
    876  * Free data allocated by rfs_read
    877  */
    878 void
    879 rfs_rdfree(struct nfsrdresult *rr)
    880 {
    881 	mblk_t *mp;
    882 
    883 	if (rr->rr_status == NFS_OK) {
    884 		mp = rr->rr_mp;
    885 		if (mp != NULL)
    886 			freeb(mp);
    887 	}
    888 }
    889 
    890 void *
    891 rfs_read_getfh(struct nfsreadargs *ra)
    892 {
    893 	return (&ra->ra_fhandle);
    894 }
    895 
/*
 * Number of iovec entries rfs_write_sync() keeps on its stack.  A write
 * whose mblk chain needs more entries than this gets a temporary array
 * from kmem_alloc() instead.
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/*
 * Counters for rfs_write_sync(): "hits" counts writes whose mblk chain
 * fit in the on-stack iovec array, "misses" those that needed an
 * allocated one.
 */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
    902 
    903 /*
    904  * Write data to file.
    905  * Returns attributes of a file after writing some data to it.
    906  *
    907  * Any changes made here, especially in error handling might have
    908  * to also be done in rfs_write (which clusters write requests).
    909  */
    910 void
    911 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
    912 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
    913 {
    914 	int error;
    915 	vnode_t *vp;
    916 	rlim64_t rlimit;
    917 	struct vattr va;
    918 	struct uio uio;
    919 	struct iovec iov[MAX_IOVECS];
    920 	mblk_t *m;
    921 	struct iovec *iovp;
    922 	int iovcnt;
    923 	cred_t *savecred;
    924 	int in_crit = 0;
    925 	caller_context_t ct;
    926 
    927 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
    928 	if (vp == NULL) {
    929 		ns->ns_status = NFSERR_STALE;
    930 		return;
    931 	}
    932 
    933 	if (rdonly(exi, req)) {
    934 		VN_RELE(vp);
    935 		ns->ns_status = NFSERR_ROFS;
    936 		return;
    937 	}
    938 
    939 	if (vp->v_type != VREG) {
    940 		VN_RELE(vp);
    941 		ns->ns_status = NFSERR_ISDIR;
    942 		return;
    943 	}
    944 
    945 	ct.cc_sysid = 0;
    946 	ct.cc_pid = 0;
    947 	ct.cc_caller_id = nfs2_srv_caller_id;
    948 	ct.cc_flags = CC_DONTBLOCK;
    949 
    950 	va.va_mask = AT_UID|AT_MODE;
    951 
    952 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
    953 
    954 	if (error) {
    955 		VN_RELE(vp);
    956 		ns->ns_status = puterrno(error);
    957 
    958 		return;
    959 	}
    960 
    961 	if (crgetuid(cr) != va.va_uid) {
    962 		/*
    963 		 * This is a kludge to allow writes of files created
    964 		 * with read only permission.  The owner of the file
    965 		 * is always allowed to write it.
    966 		 */
    967 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
    968 
    969 		if (error) {
    970 			VN_RELE(vp);
    971 			ns->ns_status = puterrno(error);
    972 			return;
    973 		}
    974 	}
    975 
    976 	/*
    977 	 * Can't access a mandatory lock file.  This might cause
    978 	 * the NFS service thread to block forever waiting for a
    979 	 * lock to be released that will never be released.
    980 	 */
    981 	if (MANDLOCK(vp, va.va_mode)) {
    982 		VN_RELE(vp);
    983 		ns->ns_status = NFSERR_ACCES;
    984 		return;
    985 	}
    986 
    987 	/*
    988 	 * We have to enter the critical region before calling VOP_RWLOCK
    989 	 * to avoid a deadlock with ufs.
    990 	 */
    991 	if (nbl_need_check(vp)) {
    992 		nbl_start_crit(vp, RW_READER);
    993 		in_crit = 1;
    994 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
    995 		    wa->wa_count, 0, NULL)) {
    996 			error = EACCES;
    997 			goto out;
    998 		}
    999 	}
   1000 
   1001 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
   1002 
   1003 	/* check if a monitor detected a delegation conflict */
   1004 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
   1005 		VN_RELE(vp);
   1006 		/* mark as wouldblock so response is dropped */
   1007 		curthread->t_flag |= T_WOULDBLOCK;
   1008 		return;
   1009 	}
   1010 
   1011 	if (wa->wa_data || wa->wa_rlist) {
   1012 		/* Do the RDMA thing if necessary */
   1013 		if (wa->wa_rlist) {
   1014 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
   1015 			iov[0].iov_len = wa->wa_count;
   1016 		} else  {
   1017 			iov[0].iov_base = wa->wa_data;
   1018 			iov[0].iov_len = wa->wa_count;
   1019 		}
   1020 		uio.uio_iov = iov;
   1021 		uio.uio_iovcnt = 1;
   1022 		uio.uio_segflg = UIO_SYSSPACE;
   1023 		uio.uio_extflg = UIO_COPY_DEFAULT;
   1024 		uio.uio_loffset = (offset_t)wa->wa_offset;
   1025 		uio.uio_resid = wa->wa_count;
   1026 		/*
   1027 		 * The limit is checked on the client. We
   1028 		 * should allow any size writes here.
   1029 		 */
   1030 		uio.uio_llimit = curproc->p_fsz_ctl;
   1031 		rlimit = uio.uio_llimit - wa->wa_offset;
   1032 		if (rlimit < (rlim64_t)uio.uio_resid)
   1033 			uio.uio_resid = (uint_t)rlimit;
   1034 
   1035 		/*
   1036 		 * for now we assume no append mode
   1037 		 */
   1038 		/*
   1039 		 * We're changing creds because VM may fault and we need
   1040 		 * the cred of the current thread to be used if quota
   1041 		 * checking is enabled.
   1042 		 */
   1043 		savecred = curthread->t_cred;
   1044 		curthread->t_cred = cr;
   1045 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
   1046 		curthread->t_cred = savecred;
   1047 	} else {
   1048 		iovcnt = 0;
   1049 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
   1050 			iovcnt++;
   1051 		if (iovcnt <= MAX_IOVECS) {
   1052 #ifdef DEBUG
   1053 			rfs_write_sync_hits++;
   1054 #endif
   1055 			iovp = iov;
   1056 		} else {
   1057 #ifdef DEBUG
   1058 			rfs_write_sync_misses++;
   1059 #endif
   1060 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
   1061 		}
   1062 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
   1063 		uio.uio_iov = iovp;
   1064 		uio.uio_iovcnt = iovcnt;
   1065 		uio.uio_segflg = UIO_SYSSPACE;
   1066 		uio.uio_extflg = UIO_COPY_DEFAULT;
   1067 		uio.uio_loffset = (offset_t)wa->wa_offset;
   1068 		uio.uio_resid = wa->wa_count;
   1069 		/*
   1070 		 * The limit is checked on the client. We
   1071 		 * should allow any size writes here.
   1072 		 */
   1073 		uio.uio_llimit = curproc->p_fsz_ctl;
   1074 		rlimit = uio.uio_llimit - wa->wa_offset;
   1075 		if (rlimit < (rlim64_t)uio.uio_resid)
   1076 			uio.uio_resid = (uint_t)rlimit;
   1077 
   1078 		/*
   1079 		 * For now we assume no append mode.
   1080 		 */
   1081 		/*
   1082 		 * We're changing creds because VM may fault and we need
   1083 		 * the cred of the current thread to be used if quota
   1084 		 * checking is enabled.
   1085 		 */
   1086 		savecred = curthread->t_cred;
   1087 		curthread->t_cred = cr;
   1088 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
   1089 		curthread->t_cred = savecred;
   1090 
   1091 		if (iovp != iov)
   1092 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
   1093 	}
   1094 
   1095 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
   1096 
   1097 	if (!error) {
   1098 		/*
   1099 		 * Get attributes again so we send the latest mod
   1100 		 * time to the client side for his cache.
   1101 		 */
   1102 		va.va_mask = AT_ALL;	/* now we want everything */
   1103 
   1104 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
   1105 
   1106 		/* check for overflows */
   1107 		if (!error) {
   1108 			acl_perm(vp, exi, &va, cr);
   1109 			error = vattr_to_nattr(&va, &ns->ns_attr);
   1110 		}
   1111 	}
   1112 
   1113 out:
   1114 	if (in_crit)
   1115 		nbl_end_crit(vp);
   1116 	VN_RELE(vp);
   1117 
   1118 	/* check if a monitor detected a delegation conflict */
   1119 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
   1120 		/* mark as wouldblock so response is dropped */
   1121 		curthread->t_flag |= T_WOULDBLOCK;
   1122 	else
   1123 		ns->ns_status = puterrno(error);
   1124 
   1125 }
   1126 
   1127 struct rfs_async_write {
   1128 	struct nfswriteargs *wa;
   1129 	struct nfsattrstat *ns;
   1130 	struct svc_req *req;
   1131 	cred_t *cr;
   1132 	kthread_t *thread;
   1133 	struct rfs_async_write *list;
   1134 };
   1135 
   1136 struct rfs_async_write_list {
   1137 	fhandle_t *fhp;
   1138 	kcondvar_t cv;
   1139 	struct rfs_async_write *list;
   1140 	struct rfs_async_write_list *next;
   1141 };
   1142 
   1143 static struct rfs_async_write_list *rfs_async_write_head = NULL;
   1144 static kmutex_t rfs_async_write_lock;
   1145 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
   1146 
   /*
    * MAXCLIOVECS is the size of the on-stack iovec array in rfs_write();
    * larger gathers fall back to kmem_alloc().
    *
    * RFSWRITE_INITVAL is the "not yet processed" marker stored in
    * ns_status, chosen because 0 means success.  The replacement list is
    * fully parenthesized so the macro is a single primary expression
    * wherever it is used (e.g. as a sizeof operand).
    */
#define	MAXCLIOVECS	42
#define	RFSWRITE_INITVAL ((enum nfsstat)-1)
   1149 
   #ifdef DEBUG
/*
 * Counters bumped by rfs_write(): a "hit" is a gather that fit in the
 * on-stack iovec array, a "miss" is one that needed kmem_alloc().
 * Both start at zero (static storage).
 */
static int rfs_write_hits;
static int rfs_write_misses;
#endif
   1154 
   1155 /*
   1156  * Write data to file.
   1157  * Returns attributes of a file after writing some data to it.
   1158  */
   1159 void
   1160 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
   1161 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   1162 {
   1163 	int error;
   1164 	vnode_t *vp;
   1165 	rlim64_t rlimit;
   1166 	struct vattr va;
   1167 	struct uio uio;
   1168 	struct rfs_async_write_list *lp;
   1169 	struct rfs_async_write_list *nlp;
   1170 	struct rfs_async_write *rp;
   1171 	struct rfs_async_write *nrp;
   1172 	struct rfs_async_write *trp;
   1173 	struct rfs_async_write *lrp;
   1174 	int data_written;
   1175 	int iovcnt;
   1176 	mblk_t *m;
   1177 	struct iovec *iovp;
   1178 	struct iovec *niovp;
   1179 	struct iovec iov[MAXCLIOVECS];
   1180 	int count;
   1181 	int rcount;
   1182 	uint_t off;
   1183 	uint_t len;
   1184 	struct rfs_async_write nrpsp;
   1185 	struct rfs_async_write_list nlpsp;
   1186 	ushort_t t_flag;
   1187 	cred_t *savecred;
   1188 	int in_crit = 0;
   1189 	caller_context_t ct;
   1190 
   1191 	if (!rfs_write_async) {
   1192 		rfs_write_sync(wa, ns, exi, req, cr);
   1193 		return;
   1194 	}
   1195 
   1196 	/*
   1197 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
   1198 	 * is considered an OK.
   1199 	 */
   1200 	ns->ns_status = RFSWRITE_INITVAL;
   1201 
   1202 	nrp = &nrpsp;
   1203 	nrp->wa = wa;
   1204 	nrp->ns = ns;
   1205 	nrp->req = req;
   1206 	nrp->cr = cr;
   1207 	nrp->thread = curthread;
   1208 
   1209 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
   1210 
   1211 	/*
   1212 	 * Look to see if there is already a cluster started
   1213 	 * for this file.
   1214 	 */
   1215 	mutex_enter(&rfs_async_write_lock);
   1216 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
   1217 		if (bcmp(&wa->wa_fhandle, lp->fhp,
   1218 		    sizeof (fhandle_t)) == 0)
   1219 			break;
   1220 	}
   1221 
   1222 	/*
   1223 	 * If lp is non-NULL, then there is already a cluster
   1224 	 * started.  We need to place ourselves in the cluster
   1225 	 * list in the right place as determined by starting
   1226 	 * offset.  Conflicts with non-blocking mandatory locked
   1227 	 * regions will be checked when the cluster is processed.
   1228 	 */
   1229 	if (lp != NULL) {
   1230 		rp = lp->list;
   1231 		trp = NULL;
   1232 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
   1233 			trp = rp;
   1234 			rp = rp->list;
   1235 		}
   1236 		nrp->list = rp;
   1237 		if (trp == NULL)
   1238 			lp->list = nrp;
   1239 		else
   1240 			trp->list = nrp;
   1241 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
   1242 			cv_wait(&lp->cv, &rfs_async_write_lock);
   1243 		mutex_exit(&rfs_async_write_lock);
   1244 
   1245 		return;
   1246 	}
   1247 
   1248 	/*
   1249 	 * No cluster started yet, start one and add ourselves
   1250 	 * to the list of clusters.
   1251 	 */
   1252 	nrp->list = NULL;
   1253 
   1254 	nlp = &nlpsp;
   1255 	nlp->fhp = &wa->wa_fhandle;
   1256 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
   1257 	nlp->list = nrp;
   1258 	nlp->next = NULL;
   1259 
   1260 	if (rfs_async_write_head == NULL) {
   1261 		rfs_async_write_head = nlp;
   1262 	} else {
   1263 		lp = rfs_async_write_head;
   1264 		while (lp->next != NULL)
   1265 			lp = lp->next;
   1266 		lp->next = nlp;
   1267 	}
   1268 	mutex_exit(&rfs_async_write_lock);
   1269 
   1270 	/*
   1271 	 * Convert the file handle common to all of the requests
   1272 	 * in this cluster to a vnode.
   1273 	 */
   1274 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
   1275 	if (vp == NULL) {
   1276 		mutex_enter(&rfs_async_write_lock);
   1277 		if (rfs_async_write_head == nlp)
   1278 			rfs_async_write_head = nlp->next;
   1279 		else {
   1280 			lp = rfs_async_write_head;
   1281 			while (lp->next != nlp)
   1282 				lp = lp->next;
   1283 			lp->next = nlp->next;
   1284 		}
   1285 		t_flag = curthread->t_flag & T_WOULDBLOCK;
   1286 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
   1287 			rp->ns->ns_status = NFSERR_STALE;
   1288 			rp->thread->t_flag |= t_flag;
   1289 		}
   1290 		cv_broadcast(&nlp->cv);
   1291 		mutex_exit(&rfs_async_write_lock);
   1292 
   1293 		return;
   1294 	}
   1295 
   1296 	/*
   1297 	 * Can only write regular files.  Attempts to write any
   1298 	 * other file types fail with EISDIR.
   1299 	 */
   1300 	if (vp->v_type != VREG) {
   1301 		VN_RELE(vp);
   1302 		mutex_enter(&rfs_async_write_lock);
   1303 		if (rfs_async_write_head == nlp)
   1304 			rfs_async_write_head = nlp->next;
   1305 		else {
   1306 			lp = rfs_async_write_head;
   1307 			while (lp->next != nlp)
   1308 				lp = lp->next;
   1309 			lp->next = nlp->next;
   1310 		}
   1311 		t_flag = curthread->t_flag & T_WOULDBLOCK;
   1312 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
   1313 			rp->ns->ns_status = NFSERR_ISDIR;
   1314 			rp->thread->t_flag |= t_flag;
   1315 		}
   1316 		cv_broadcast(&nlp->cv);
   1317 		mutex_exit(&rfs_async_write_lock);
   1318 
   1319 		return;
   1320 	}
   1321 
   1322 	/*
   1323 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
   1324 	 * deadlock with ufs.
   1325 	 */
   1326 	if (nbl_need_check(vp)) {
   1327 		nbl_start_crit(vp, RW_READER);
   1328 		in_crit = 1;
   1329 	}
   1330 
   1331 	ct.cc_sysid = 0;
   1332 	ct.cc_pid = 0;
   1333 	ct.cc_caller_id = nfs2_srv_caller_id;
   1334 	ct.cc_flags = CC_DONTBLOCK;
   1335 
   1336 	/*
   1337 	 * Lock the file for writing.  This operation provides
   1338 	 * the delay which allows clusters to grow.
   1339 	 */
   1340 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
   1341 
   1342 	/* check if a monitor detected a delegation conflict */
   1343 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
   1344 		if (in_crit)
   1345 			nbl_end_crit(vp);
   1346 		VN_RELE(vp);
   1347 		/* mark as wouldblock so response is dropped */
   1348 		curthread->t_flag |= T_WOULDBLOCK;
   1349 		mutex_enter(&rfs_async_write_lock);
   1350 		if (rfs_async_write_head == nlp)
   1351 			rfs_async_write_head = nlp->next;
   1352 		else {
   1353 			lp = rfs_async_write_head;
   1354 			while (lp->next != nlp)
   1355 				lp = lp->next;
   1356 			lp->next = nlp->next;
   1357 		}
   1358 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
   1359 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
   1360 				rp->ns->ns_status = puterrno(error);
   1361 				rp->thread->t_flag |= T_WOULDBLOCK;
   1362 			}
   1363 		}
   1364 		cv_broadcast(&nlp->cv);
   1365 		mutex_exit(&rfs_async_write_lock);
   1366 
   1367 		return;
   1368 	}
   1369 
   1370 	/*
   1371 	 * Disconnect this cluster from the list of clusters.
   1372 	 * The cluster that is being dealt with must be fixed
   1373 	 * in size after this point, so there is no reason
   1374 	 * to leave it on the list so that new requests can
   1375 	 * find it.
   1376 	 *
   1377 	 * The algorithm is that the first write request will
   1378 	 * create a cluster, convert the file handle to a
   1379 	 * vnode pointer, and then lock the file for writing.
   1380 	 * This request is not likely to be clustered with
   1381 	 * any others.  However, the next request will create
   1382 	 * a new cluster and be blocked in VOP_RWLOCK while
   1383 	 * the first request is being processed.  This delay
   1384 	 * will allow more requests to be clustered in this
   1385 	 * second cluster.
   1386 	 */
   1387 	mutex_enter(&rfs_async_write_lock);
   1388 	if (rfs_async_write_head == nlp)
   1389 		rfs_async_write_head = nlp->next;
   1390 	else {
   1391 		lp = rfs_async_write_head;
   1392 		while (lp->next != nlp)
   1393 			lp = lp->next;
   1394 		lp->next = nlp->next;
   1395 	}
   1396 	mutex_exit(&rfs_async_write_lock);
   1397 
   1398 	/*
   1399 	 * Step through the list of requests in this cluster.
   1400 	 * We need to check permissions to make sure that all
   1401 	 * of the requests have sufficient permission to write
   1402 	 * the file.  A cluster can be composed of requests
   1403 	 * from different clients and different users on each
   1404 	 * client.
   1405 	 *
   1406 	 * As a side effect, we also calculate the size of the
   1407 	 * byte range that this cluster encompasses.
   1408 	 */
   1409 	rp = nlp->list;
   1410 	off = rp->wa->wa_offset;
   1411 	len = (uint_t)0;
   1412 	do {
   1413 		if (rdonly(exi, rp->req)) {
   1414 			rp->ns->ns_status = NFSERR_ROFS;
   1415 			t_flag = curthread->t_flag & T_WOULDBLOCK;
   1416 			rp->thread->t_flag |= t_flag;
   1417 			continue;
   1418 		}
   1419 
   1420 		va.va_mask = AT_UID|AT_MODE;
   1421 
   1422 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
   1423 
   1424 		if (!error) {
   1425 			if (crgetuid(rp->cr) != va.va_uid) {
   1426 				/*
   1427 				 * This is a kludge to allow writes of files
   1428 				 * created with read only permission.  The
   1429 				 * owner of the file is always allowed to
   1430 				 * write it.
   1431 				 */
   1432 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
   1433 			}
   1434 			if (!error && MANDLOCK(vp, va.va_mode))
   1435 				error = EACCES;
   1436 		}
   1437 
   1438 		/*
   1439 		 * Check for a conflict with a nbmand-locked region.
   1440 		 */
   1441 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
   1442 		    rp->wa->wa_count, 0, NULL)) {
   1443 			error = EACCES;
   1444 		}
   1445 
   1446 		if (error) {
   1447 			rp->ns->ns_status = puterrno(error);
   1448 			t_flag = curthread->t_flag & T_WOULDBLOCK;
   1449 			rp->thread->t_flag |= t_flag;
   1450 			continue;
   1451 		}
   1452 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
   1453 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
   1454 	} while ((rp = rp->list) != NULL);
   1455 
   1456 	/*
   1457 	 * Step through the cluster attempting to gather as many
   1458 	 * requests which are contiguous as possible.  These
   1459 	 * contiguous requests are handled via one call to VOP_WRITE
   1460 	 * instead of different calls to VOP_WRITE.  We also keep
   1461 	 * track of the fact that any data was written.
   1462 	 */
   1463 	rp = nlp->list;
   1464 	data_written = 0;
   1465 	do {
   1466 		/*
   1467 		 * Skip any requests which are already marked as having an
   1468 		 * error.
   1469 		 */
   1470 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
   1471 			rp = rp->list;
   1472 			continue;
   1473 		}
   1474 
   1475 		/*
   1476 		 * Count the number of iovec's which are required
   1477 		 * to handle this set of requests.  One iovec is
   1478 		 * needed for each data buffer, whether addressed
   1479 		 * by wa_data or by the b_rptr pointers in the
   1480 		 * mblk chains.
   1481 		 */
   1482 		iovcnt = 0;
   1483 		lrp = rp;
   1484 		for (;;) {
   1485 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
   1486 				iovcnt++;
   1487 			else {
   1488 				m = lrp->wa->wa_mblk;
   1489 				while (m != NULL) {
   1490 					iovcnt++;
   1491 					m = m->b_cont;
   1492 				}
   1493 			}
   1494 			if (lrp->list == NULL ||
   1495 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
   1496 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
   1497 			    lrp->list->wa->wa_offset) {
   1498 				lrp = lrp->list;
   1499 				break;
   1500 			}
   1501 			lrp = lrp->list;
   1502 		}
   1503 
   1504 		if (iovcnt <= MAXCLIOVECS) {
   1505 #ifdef DEBUG
   1506 			rfs_write_hits++;
   1507 #endif
   1508 			niovp = iov;
   1509 		} else {
   1510 #ifdef DEBUG
   1511 			rfs_write_misses++;
   1512 #endif
   1513 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
   1514 		}
   1515 		/*
   1516 		 * Put together the scatter/gather iovecs.
   1517 		 */
   1518 		iovp = niovp;
   1519 		trp = rp;
   1520 		count = 0;
   1521 		do {
   1522 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
   1523 				if (trp->wa->wa_rlist) {
   1524 					iovp->iov_base =
   1525 					    (char *)((trp->wa->wa_rlist)->
   1526 					    u.c_daddr3);
   1527 					iovp->iov_len = trp->wa->wa_count;
   1528 				} else  {
   1529 					iovp->iov_base = trp->wa->wa_data;
   1530 					iovp->iov_len = trp->wa->wa_count;
   1531 				}
   1532 				iovp++;
   1533 			} else {
   1534 				m = trp->wa->wa_mblk;
   1535 				rcount = trp->wa->wa_count;
   1536 				while (m != NULL) {
   1537 					iovp->iov_base = (caddr_t)m->b_rptr;
   1538 					iovp->iov_len = (m->b_wptr - m->b_rptr);
   1539 					rcount -= iovp->iov_len;
   1540 					if (rcount < 0)
   1541 						iovp->iov_len += rcount;
   1542 					iovp++;
   1543 					if (rcount <= 0)
   1544 						break;
   1545 					m = m->b_cont;
   1546 				}
   1547 			}
   1548 			count += trp->wa->wa_count;
   1549 			trp = trp->list;
   1550 		} while (trp != lrp);
   1551 
   1552 		uio.uio_iov = niovp;
   1553 		uio.uio_iovcnt = iovcnt;
   1554 		uio.uio_segflg = UIO_SYSSPACE;
   1555 		uio.uio_extflg = UIO_COPY_DEFAULT;
   1556 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
   1557 		uio.uio_resid = count;
   1558 		/*
   1559 		 * The limit is checked on the client. We
   1560 		 * should allow any size writes here.
   1561 		 */
   1562 		uio.uio_llimit = curproc->p_fsz_ctl;
   1563 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
   1564 		if (rlimit < (rlim64_t)uio.uio_resid)
   1565 			uio.uio_resid = (uint_t)rlimit;
   1566 
   1567 		/*
   1568 		 * For now we assume no append mode.
   1569 		 */
   1570 
   1571 		/*
   1572 		 * We're changing creds because VM may fault
   1573 		 * and we need the cred of the current
   1574 		 * thread to be used if quota * checking is
   1575 		 * enabled.
   1576 		 */
   1577 		savecred = curthread->t_cred;
   1578 		curthread->t_cred = cr;
   1579 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
   1580 		curthread->t_cred = savecred;
   1581 
   1582 		/* check if a monitor detected a delegation conflict */
   1583 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
   1584 			/* mark as wouldblock so response is dropped */
   1585 			curthread->t_flag |= T_WOULDBLOCK;
   1586 
   1587 		if (niovp != iov)
   1588 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
   1589 
   1590 		if (!error) {
   1591 			data_written = 1;
   1592 			/*
   1593 			 * Get attributes again so we send the latest mod
   1594 			 * time to the client side for his cache.
   1595 			 */
   1596 			va.va_mask = AT_ALL;	/* now we want everything */
   1597 
   1598 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
   1599 
   1600 			if (!error)
   1601 				acl_perm(vp, exi, &va, rp->cr);
   1602 		}
   1603 
   1604 		/*
   1605 		 * Fill in the status responses for each request
   1606 		 * which was just handled.  Also, copy the latest
   1607 		 * attributes in to the attribute responses if
   1608 		 * appropriate.
   1609 		 */
   1610 		t_flag = curthread->t_flag & T_WOULDBLOCK;
   1611 		do {
   1612 			rp->thread->t_flag |= t_flag;
   1613 			/* check for overflows */
   1614 			if (!error) {
   1615 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
   1616 			}
   1617 			rp->ns->ns_status = puterrno(error);
   1618 			rp = rp->list;
   1619 		} while (rp != lrp);
   1620 	} while (rp != NULL);
   1621 
   1622 	/*
   1623 	 * If any data was written at all, then we need to flush
   1624 	 * the data and metadata to stable storage.
   1625 	 */
   1626 	if (data_written) {
   1627 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
   1628 
   1629 		if (!error) {
   1630 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
   1631 		}
   1632 	}
   1633 
   1634 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
   1635 
   1636 	if (in_crit)
   1637 		nbl_end_crit(vp);
   1638 	VN_RELE(vp);
   1639 
   1640 	t_flag = curthread->t_flag & T_WOULDBLOCK;
   1641 	mutex_enter(&rfs_async_write_lock);
   1642 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
   1643 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
   1644 			rp->ns->ns_status = puterrno(error);
   1645 			rp->thread->t_flag |= t_flag;
   1646 		}
   1647 	}
   1648 	cv_broadcast(&nlp->cv);
   1649 	mutex_exit(&rfs_async_write_lock);
   1650 
   1651 }
   1652 
   1653 void *
   1654 rfs_write_getfh(struct nfswriteargs *wa)
   1655 {
   1656 	return (&wa->wa_fhandle);
   1657 }
   1658 
   1659 /*
   1660  * Create a file.
   1661  * Creates a file with given attributes and returns those attributes
   1662  * and an fhandle for the new file.
   1663  */
   1664 void
   1665 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
   1666 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   1667 {
   1668 	int error;
   1669 	int lookuperr;
   1670 	int in_crit = 0;
   1671 	struct vattr va;
   1672 	vnode_t *vp;
   1673 	vnode_t *realvp;
   1674 	vnode_t *dvp;
   1675 	char *name = args->ca_da.da_name;
   1676 	vnode_t *tvp = NULL;
   1677 	int mode;
   1678 	int lookup_ok;
   1679 	bool_t trunc;
   1680 	struct sockaddr *ca;
   1681 
   1682 	/*
   1683 	 * Disallow NULL paths
   1684 	 */
   1685 	if (name == NULL || *name == '\0') {
   1686 		dr->dr_status = NFSERR_ACCES;
   1687 		return;
   1688 	}
   1689 
   1690 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
   1691 	if (dvp == NULL) {
   1692 		dr->dr_status = NFSERR_STALE;
   1693 		return;
   1694 	}
   1695 
   1696 	error = sattr_to_vattr(args->ca_sa, &va);
   1697 	if (error) {
   1698 		dr->dr_status = puterrno(error);
   1699 		return;
   1700 	}
   1701 
   1702 	/*
   1703 	 * Must specify the mode.
   1704 	 */
   1705 	if (!(va.va_mask & AT_MODE)) {
   1706 		VN_RELE(dvp);
   1707 		dr->dr_status = NFSERR_INVAL;
   1708 		return;
   1709 	}
   1710 
   1711 	/*
   1712 	 * This is a completely gross hack to make mknod
   1713 	 * work over the wire until we can wack the protocol
   1714 	 */
   1715 	if ((va.va_mode & IFMT) == IFCHR) {
   1716 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
   1717 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
   1718 		else {
   1719 			va.va_type = VCHR;
   1720 			/*
   1721 			 * uncompress the received dev_t
   1722 			 * if the top half is zero indicating a request
   1723 			 * from an `older style' OS.
   1724 			 */
   1725 			if ((va.va_size & 0xffff0000) == 0)
   1726 				va.va_rdev = nfsv2_expdev(va.va_size);
   1727 			else
   1728 				va.va_rdev = (dev_t)va.va_size;
   1729 		}
   1730 		va.va_mask &= ~AT_SIZE;
   1731 	} else if ((va.va_mode & IFMT) == IFBLK) {
   1732 		va.va_type = VBLK;
   1733 		/*
   1734 		 * uncompress the received dev_t
   1735 		 * if the top half is zero indicating a request
   1736 		 * from an `older style' OS.
   1737 		 */
   1738 		if ((va.va_size & 0xffff0000) == 0)
   1739 			va.va_rdev = nfsv2_expdev(va.va_size);
   1740 		else
   1741 			va.va_rdev = (dev_t)va.va_size;
   1742 		va.va_mask &= ~AT_SIZE;
   1743 	} else if ((va.va_mode & IFMT) == IFSOCK) {
   1744 		va.va_type = VSOCK;
   1745 	} else {
   1746 		va.va_type = VREG;
   1747 	}
   1748 	va.va_mode &= ~IFMT;
   1749 	va.va_mask |= AT_TYPE;
   1750 
   1751 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
   1752 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
   1753 	    MAXPATHLEN);
   1754 	if (name == NULL) {
   1755 		dr->dr_status = puterrno(EINVAL);
   1756 		return;
   1757 	}
   1758 
   1759 	/*
   1760 	 * Why was the choice made to use VWRITE as the mode to the
   1761 	 * call to VOP_CREATE ? This results in a bug.  When a client
   1762 	 * opens a file that already exists and is RDONLY, the second
   1763 	 * open fails with an EACESS because of the mode.
   1764 	 * bug ID 1054648.
   1765 	 */
   1766 	lookup_ok = 0;
   1767 	mode = VWRITE;
   1768 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
   1769 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
   1770 		    NULL, NULL, NULL);
   1771 		if (!error) {
   1772 			struct vattr at;
   1773 
   1774 			lookup_ok = 1;
   1775 			at.va_mask = AT_MODE;
   1776 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
   1777 			if (!error)
   1778 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
   1779 			VN_RELE(tvp);
   1780 			tvp = NULL;
   1781 		}
   1782 	}
   1783 
   1784 	if (!lookup_ok) {
   1785 		if (rdonly(exi, req)) {
   1786 			error = EROFS;
   1787 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
   1788 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
   1789 			error = EPERM;
   1790 		} else {
   1791 			error = 0;
   1792 		}
   1793 	}
   1794 
   1795 	/*
   1796 	 * If file size is being modified on an already existing file
   1797 	 * make sure that there are no conflicting non-blocking mandatory
   1798 	 * locks in the region being manipulated. Return EACCES if there
   1799 	 * are conflicting locks.
   1800 	 */
   1801 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
   1802 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
   1803 		    NULL, NULL, NULL);
   1804 
   1805 		if (!lookuperr &&
   1806 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
   1807 			VN_RELE(tvp);
   1808 			curthread->t_flag |= T_WOULDBLOCK;
   1809 			goto out;
   1810 		}
   1811 
   1812 		if (!lookuperr && nbl_need_check(tvp)) {
   1813 			/*
   1814 			 * The file exists. Now check if it has any
   1815 			 * conflicting non-blocking mandatory locks
   1816 			 * in the region being changed.
   1817 			 */
   1818 			struct vattr bva;
   1819 			u_offset_t offset;
   1820 			ssize_t length;
   1821 
   1822 			nbl_start_crit(tvp, RW_READER);
   1823 			in_crit = 1;
   1824 
   1825 			bva.va_mask = AT_SIZE;
   1826 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
   1827 			if (!error) {
   1828 				if (va.va_size < bva.va_size) {
   1829 					offset = va.va_size;
   1830 					length = bva.va_size - va.va_size;
   1831 				} else {
   1832 					offset = bva.va_size;
   1833 					length = va.va_size - bva.va_size;
   1834 				}
   1835 				if (length) {
   1836 					if (nbl_conflict(tvp, NBL_WRITE,
   1837 					    offset, length, 0, NULL)) {
   1838 						error = EACCES;
   1839 					}
   1840 				}
   1841 			}
   1842 			if (error) {
   1843 				nbl_end_crit(tvp);
   1844 				VN_RELE(tvp);
   1845 				in_crit = 0;
   1846 			}
   1847 		} else if (tvp != NULL) {
   1848 			VN_RELE(tvp);
   1849 		}
   1850 	}
   1851 
   1852 	if (!error) {
   1853 		/*
   1854 		 * If filesystem is shared with nosuid the remove any
   1855 		 * setuid/setgid bits on create.
   1856 		 */
   1857 		if (va.va_type == VREG &&
   1858 		    exi->exi_export.ex_flags & EX_NOSUID)
   1859 			va.va_mode &= ~(VSUID | VSGID);
   1860 
   1861 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
   1862 		    NULL, NULL);
   1863 
   1864 		if (!error) {
   1865 
   1866 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
   1867 				trunc = TRUE;
   1868 			else
   1869 				trunc = FALSE;
   1870 
   1871 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
   1872 				VN_RELE(vp);
   1873 				curthread->t_flag |= T_WOULDBLOCK;
   1874 				goto out;
   1875 			}
   1876 			va.va_mask = AT_ALL;
   1877 
   1878 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
   1879 
   1880 			/* check for overflows */
   1881 			if (!error) {
   1882 				acl_perm(vp, exi, &va, cr);
   1883 				error = vattr_to_nattr(&va, &dr->dr_attr);
   1884 				if (!error) {
   1885 					error = makefh(&dr->dr_fhandle, vp,
   1886 					    exi);
   1887 				}
   1888 			}
   1889 			/*
   1890 			 * Force modified metadata out to stable storage.
   1891 			 *
   1892 			 * if a underlying vp exists, pass it to VOP_FSYNC
   1893 			 */
   1894 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
   1895 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
   1896 			else
   1897 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
   1898 			VN_RELE(vp);
   1899 		}
   1900 
   1901 		if (in_crit) {
   1902 			nbl_end_crit(tvp);
   1903 			VN_RELE(tvp);
   1904 		}
   1905 	}
   1906 
   1907 	/*
   1908 	 * Force modified data and metadata out to stable storage.
   1909 	 */
   1910 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
   1911 
   1912 out:
   1913 
   1914 	VN_RELE(dvp);
   1915 
   1916 	dr->dr_status = puterrno(error);
   1917 
   1918 	if (name != args->ca_da.da_name)
   1919 		kmem_free(name, MAXPATHLEN);
   1920 }
   1921 void *
   1922 rfs_create_getfh(struct nfscreatargs *args)
   1923 {
   1924 	return (args->ca_da.da_fhandle);
   1925 }
   1926 
   1927 /*
   1928  * Remove a file.
   1929  * Remove named file from parent directory.
   1930  */
   1931 void
   1932 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
   1933 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   1934 {
   1935 	int error = 0;
   1936 	vnode_t *vp;
   1937 	vnode_t *targvp;
   1938 	int in_crit = 0;
   1939 
   1940 	/*
   1941 	 * Disallow NULL paths
   1942 	 */
   1943 	if (da->da_name == NULL || *da->da_name == '\0') {
   1944 		*status = NFSERR_ACCES;
   1945 		return;
   1946 	}
   1947 
   1948 	vp = nfs_fhtovp(da->da_fhandle, exi);
   1949 	if (vp == NULL) {
   1950 		*status = NFSERR_STALE;
   1951 		return;
   1952 	}
   1953 
   1954 	if (rdonly(exi, req)) {
   1955 		VN_RELE(vp);
   1956 		*status = NFSERR_ROFS;
   1957 		return;
   1958 	}
   1959 
   1960 	/*
   1961 	 * Check for a conflict with a non-blocking mandatory share reservation.
   1962 	 */
   1963 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
   1964 	    NULL, cr, NULL, NULL, NULL);
   1965 	if (error != 0) {
   1966 		VN_RELE(vp);
   1967 		*status = puterrno(error);
   1968 		return;
   1969 	}
   1970 
   1971 	/*
   1972 	 * If the file is delegated to an v4 client, then initiate
   1973 	 * recall and drop this request (by setting T_WOULDBLOCK).
   1974 	 * The client will eventually re-transmit the request and
   1975 	 * (hopefully), by then, the v4 client will have returned
   1976 	 * the delegation.
   1977 	 */
   1978 
   1979 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
   1980 		VN_RELE(vp);
   1981 		VN_RELE(targvp);
   1982 		curthread->t_flag |= T_WOULDBLOCK;
   1983 		return;
   1984 	}
   1985 
   1986 	if (nbl_need_check(targvp)) {
   1987 		nbl_start_crit(targvp, RW_READER);
   1988 		in_crit = 1;
   1989 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
   1990 			error = EACCES;
   1991 			goto out;
   1992 		}
   1993 	}
   1994 
   1995 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
   1996 
   1997 	/*
   1998 	 * Force modified data and metadata out to stable storage.
   1999 	 */
   2000 	(void) VOP_FSYNC(vp, 0, cr, NULL);
   2001 
   2002 out:
   2003 	if (in_crit)
   2004 		nbl_end_crit(targvp);
   2005 	VN_RELE(targvp);
   2006 	VN_RELE(vp);
   2007 
   2008 	*status = puterrno(error);
   2009 
   2010 }
   2011 
   2012 void *
   2013 rfs_remove_getfh(struct nfsdiropargs *da)
   2014 {
   2015 	return (da->da_fhandle);
   2016 }
   2017 
   2018 /*
   2019  * rename a file
   2020  * Give a file (from) a new name (to).
   2021  */
   2022 void
   2023 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
   2024 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   2025 {
   2026 	int error = 0;
   2027 	vnode_t *fromvp;
   2028 	vnode_t *tovp;
   2029 	struct exportinfo *to_exi;
   2030 	fhandle_t *fh;
   2031 	vnode_t *srcvp;
   2032 	vnode_t *targvp;
   2033 	int in_crit = 0;
   2034 
   2035 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
   2036 	if (fromvp == NULL) {
   2037 		*status = NFSERR_STALE;
   2038 		return;
   2039 	}
   2040 
   2041 	fh = args->rna_to.da_fhandle;
   2042 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
   2043 	if (to_exi == NULL) {
   2044 		VN_RELE(fromvp);
   2045 		*status = NFSERR_ACCES;
   2046 		return;
   2047 	}
   2048 	exi_rele(to_exi);
   2049 
   2050 	if (to_exi != exi) {
   2051 		VN_RELE(fromvp);
   2052 		*status = NFSERR_XDEV;
   2053 		return;
   2054 	}
   2055 
   2056 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
   2057 	if (tovp == NULL) {
   2058 		VN_RELE(fromvp);
   2059 		*status = NFSERR_STALE;
   2060 		return;
   2061 	}
   2062 
   2063 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
   2064 		VN_RELE(tovp);
   2065 		VN_RELE(fromvp);
   2066 		*status = NFSERR_NOTDIR;
   2067 		return;
   2068 	}
   2069 
   2070 	/*
   2071 	 * Disallow NULL paths
   2072 	 */
   2073 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
   2074 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
   2075 		VN_RELE(tovp);
   2076 		VN_RELE(fromvp);
   2077 		*status = NFSERR_ACCES;
   2078 		return;
   2079 	}
   2080 
   2081 	if (rdonly(exi, req)) {
   2082 		VN_RELE(tovp);
   2083 		VN_RELE(fromvp);
   2084 		*status = NFSERR_ROFS;
   2085 		return;
   2086 	}
   2087 
   2088 	/*
   2089 	 * Check for a conflict with a non-blocking mandatory share reservation.
   2090 	 */
   2091 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
   2092 	    NULL, cr, NULL, NULL, NULL);
   2093 	if (error != 0) {
   2094 		VN_RELE(tovp);
   2095 		VN_RELE(fromvp);
   2096 		*status = puterrno(error);
   2097 		return;
   2098 	}
   2099 
   2100 	/* Check for delegations on the source file */
   2101 
   2102 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
   2103 		VN_RELE(tovp);
   2104 		VN_RELE(fromvp);
   2105 		VN_RELE(srcvp);
   2106 		curthread->t_flag |= T_WOULDBLOCK;
   2107 		return;
   2108 	}
   2109 
   2110 	/* Check for delegation on the file being renamed over, if it exists */
   2111 
   2112 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
   2113 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
   2114 	    NULL, NULL, NULL) == 0) {
   2115 
   2116 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
   2117 			VN_RELE(tovp);
   2118 			VN_RELE(fromvp);
   2119 			VN_RELE(srcvp);
   2120 			VN_RELE(targvp);
   2121 			curthread->t_flag |= T_WOULDBLOCK;
   2122 			return;
   2123 		}
   2124 		VN_RELE(targvp);
   2125 	}
   2126 
   2127 
   2128 	if (nbl_need_check(srcvp)) {
   2129 		nbl_start_crit(srcvp, RW_READER);
   2130 		in_crit = 1;
   2131 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
   2132 			error = EACCES;
   2133 			goto out;
   2134 		}
   2135 	}
   2136 
   2137 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
   2138 	    tovp, args->rna_to.da_name, cr, NULL, 0);
   2139 
   2140 	if (error == 0)
   2141 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
   2142 		    strlen(args->rna_to.da_name));
   2143 
   2144 	/*
   2145 	 * Force modified data and metadata out to stable storage.
   2146 	 */
   2147 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
   2148 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
   2149 
   2150 out:
   2151 	if (in_crit)
   2152 		nbl_end_crit(srcvp);
   2153 	VN_RELE(srcvp);
   2154 	VN_RELE(tovp);
   2155 	VN_RELE(fromvp);
   2156 
   2157 	*status = puterrno(error);
   2158 
   2159 }
   2160 void *
   2161 rfs_rename_getfh(struct nfsrnmargs *args)
   2162 {
   2163 	return (args->rna_from.da_fhandle);
   2164 }
   2165 
   2166 /*
   2167  * Link to a file.
   2168  * Create a file (to) which is a hard link to the given file (from).
   2169  */
   2170 void
   2171 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
   2172 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   2173 {
   2174 	int error;
   2175 	vnode_t *fromvp;
   2176 	vnode_t *tovp;
   2177 	struct exportinfo *to_exi;
   2178 	fhandle_t *fh;
   2179 
   2180 	fromvp = nfs_fhtovp(args->la_from, exi);
   2181 	if (fromvp == NULL) {
   2182 		*status = NFSERR_STALE;
   2183 		return;
   2184 	}
   2185 
   2186 	fh = args->la_to.da_fhandle;
   2187 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
   2188 	if (to_exi == NULL) {
   2189 		VN_RELE(fromvp);
   2190 		*status = NFSERR_ACCES;
   2191 		return;
   2192 	}
   2193 	exi_rele(to_exi);
   2194 
   2195 	if (to_exi != exi) {
   2196 		VN_RELE(fromvp);
   2197 		*status = NFSERR_XDEV;
   2198 		return;
   2199 	}
   2200 
   2201 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
   2202 	if (tovp == NULL) {
   2203 		VN_RELE(fromvp);
   2204 		*status = NFSERR_STALE;
   2205 		return;
   2206 	}
   2207 
   2208 	if (tovp->v_type != VDIR) {
   2209 		VN_RELE(tovp);
   2210 		VN_RELE(fromvp);
   2211 		*status = NFSERR_NOTDIR;
   2212 		return;
   2213 	}
   2214 	/*
   2215 	 * Disallow NULL paths
   2216 	 */
   2217 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
   2218 		VN_RELE(tovp);
   2219 		VN_RELE(fromvp);
   2220 		*status = NFSERR_ACCES;
   2221 		return;
   2222 	}
   2223 
   2224 	if (rdonly(exi, req)) {
   2225 		VN_RELE(tovp);
   2226 		VN_RELE(fromvp);
   2227 		*status = NFSERR_ROFS;
   2228 		return;
   2229 	}
   2230 
   2231 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
   2232 
   2233 	/*
   2234 	 * Force modified data and metadata out to stable storage.
   2235 	 */
   2236 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
   2237 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
   2238 
   2239 	VN_RELE(tovp);
   2240 	VN_RELE(fromvp);
   2241 
   2242 	*status = puterrno(error);
   2243 
   2244 }
   2245 void *
   2246 rfs_link_getfh(struct nfslinkargs *args)
   2247 {
   2248 	return (args->la_from);
   2249 }
   2250 
   2251 /*
   2252  * Symbolicly link to a file.
   2253  * Create a file (to) with the given attributes which is a symbolic link
   2254  * to the given path name (to).
   2255  */
   2256 void
   2257 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
   2258 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   2259 {
   2260 	int error;
   2261 	struct vattr va;
   2262 	vnode_t *vp;
   2263 	vnode_t *svp;
   2264 	int lerror;
   2265 	struct sockaddr *ca;
   2266 	char *name = NULL;
   2267 
   2268 	/*
   2269 	 * Disallow NULL paths
   2270 	 */
   2271 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
   2272 		*status = NFSERR_ACCES;
   2273 		return;
   2274 	}
   2275 
   2276 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
   2277 	if (vp == NULL) {
   2278 		*status = NFSERR_STALE;
   2279 		return;
   2280 	}
   2281 
   2282 	if (rdonly(exi, req)) {
   2283 		VN_RELE(vp);
   2284 		*status = NFSERR_ROFS;
   2285 		return;
   2286 	}
   2287 
   2288 	error = sattr_to_vattr(args->sla_sa, &va);
   2289 	if (error) {
   2290 		VN_RELE(vp);
   2291 		*status = puterrno(error);
   2292 		return;
   2293 	}
   2294 
   2295 	if (!(va.va_mask & AT_MODE)) {
   2296 		VN_RELE(vp);
   2297 		*status = NFSERR_INVAL;
   2298 		return;
   2299 	}
   2300 
   2301 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
   2302 	name = nfscmd_convname(ca, exi, args->sla_tnm,
   2303 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
   2304 
   2305 	if (name == NULL) {
   2306 		*status = NFSERR_ACCES;
   2307 		return;
   2308 	}
   2309 
   2310 	va.va_type = VLNK;
   2311 	va.va_mask |= AT_TYPE;
   2312 
   2313 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
   2314 
   2315 	/*
   2316 	 * Force new data and metadata out to stable storage.
   2317 	 */
   2318 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
   2319 	    NULL, cr, NULL, NULL, NULL);
   2320 
   2321 	if (!lerror) {
   2322 		(void) VOP_FSYNC(svp, 0, cr, NULL);
   2323 		VN_RELE(svp);
   2324 	}
   2325 
   2326 	/*
   2327 	 * Force modified data and metadata out to stable storage.
   2328 	 */
   2329 	(void) VOP_FSYNC(vp, 0, cr, NULL);
   2330 
   2331 	VN_RELE(vp);
   2332 
   2333 	*status = puterrno(error);
   2334 	if (name != args->sla_tnm)
   2335 		kmem_free(name, MAXPATHLEN);
   2336 
   2337 }
   2338 void *
   2339 rfs_symlink_getfh(struct nfsslargs *args)
   2340 {
   2341 	return (args->sla_from.da_fhandle);
   2342 }
   2343 
   2344 /*
   2345  * Make a directory.
   2346  * Create a directory with the given name, parent directory, and attributes.
   2347  * Returns a file handle and attributes for the new directory.
   2348  */
   2349 void
   2350 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
   2351 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   2352 {
   2353 	int error;
   2354 	struct vattr va;
   2355 	vnode_t *dvp = NULL;
   2356 	vnode_t *vp;
   2357 	char *name = args->ca_da.da_name;
   2358 
   2359 	/*
   2360 	 * Disallow NULL paths
   2361 	 */
   2362 	if (name == NULL || *name == '\0') {
   2363 		dr->dr_status = NFSERR_ACCES;
   2364 		return;
   2365 	}
   2366 
   2367 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
   2368 	if (vp == NULL) {
   2369 		dr->dr_status = NFSERR_STALE;
   2370 		return;
   2371 	}
   2372 
   2373 	if (rdonly(exi, req)) {
   2374 		VN_RELE(vp);
   2375 		dr->dr_status = NFSERR_ROFS;
   2376 		return;
   2377 	}
   2378 
   2379 	error = sattr_to_vattr(args->ca_sa, &va);
   2380 	if (error) {
   2381 		VN_RELE(vp);
   2382 		dr->dr_status = puterrno(error);
   2383 		return;
   2384 	}
   2385 
   2386 	if (!(va.va_mask & AT_MODE)) {
   2387 		VN_RELE(vp);
   2388 		dr->dr_status = NFSERR_INVAL;
   2389 		return;
   2390 	}
   2391 
   2392 	va.va_type = VDIR;
   2393 	va.va_mask |= AT_TYPE;
   2394 
   2395 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
   2396 
   2397 	if (!error) {
   2398 		/*
   2399 		 * Attribtutes of the newly created directory should
   2400 		 * be returned to the client.
   2401 		 */
   2402 		va.va_mask = AT_ALL; /* We want everything */
   2403 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
   2404 
   2405 		/* check for overflows */
   2406 		if (!error) {
   2407 			acl_perm(vp, exi, &va, cr);
   2408 			error = vattr_to_nattr(&va, &dr->dr_attr);
   2409 			if (!error) {
   2410 				error = makefh(&dr->dr_fhandle, dvp, exi);
   2411 			}
   2412 		}
   2413 		/*
   2414 		 * Force new data and metadata out to stable storage.
   2415 		 */
   2416 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
   2417 		VN_RELE(dvp);
   2418 	}
   2419 
   2420 	/*
   2421 	 * Force modified data and metadata out to stable storage.
   2422 	 */
   2423 	(void) VOP_FSYNC(vp, 0, cr, NULL);
   2424 
   2425 	VN_RELE(vp);
   2426 
   2427 	dr->dr_status = puterrno(error);
   2428 
   2429 }
   2430 void *
   2431 rfs_mkdir_getfh(struct nfscreatargs *args)
   2432 {
   2433 	return (args->ca_da.da_fhandle);
   2434 }
   2435 
   2436 /*
   2437  * Remove a directory.
   2438  * Remove the given directory name from the given parent directory.
   2439  */
   2440 void
   2441 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
   2442 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   2443 {
   2444 	int error;
   2445 	vnode_t *vp;
   2446 
   2447 
   2448 	/*
   2449 	 * Disallow NULL paths
   2450 	 */
   2451 	if (da->da_name == NULL || *da->da_name == '\0') {
   2452 		*status = NFSERR_ACCES;
   2453 		return;
   2454 	}
   2455 
   2456 	vp = nfs_fhtovp(da->da_fhandle, exi);
   2457 	if (vp == NULL) {
   2458 		*status = NFSERR_STALE;
   2459 		return;
   2460 	}
   2461 
   2462 	if (rdonly(exi, req)) {
   2463 		VN_RELE(vp);
   2464 		*status = NFSERR_ROFS;
   2465 		return;
   2466 	}
   2467 
   2468 	/*
   2469 	 * VOP_RMDIR now takes a new third argument (the current
   2470 	 * directory of the process).  That's because someone
   2471 	 * wants to return EINVAL if one tries to remove ".".
   2472 	 * Of course, NFS servers have no idea what their
   2473 	 * clients' current directories are.  We fake it by
   2474 	 * supplying a vnode known to exist and illegal to
   2475 	 * remove.
   2476 	 */
   2477 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
   2478 
   2479 	/*
   2480 	 * Force modified data and metadata out to stable storage.
   2481 	 */
   2482 	(void) VOP_FSYNC(vp, 0, cr, NULL);
   2483 
   2484 	VN_RELE(vp);
   2485 
   2486 	/*
   2487 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
   2488 	 * if the directory is not empty.  A System V NFS server
   2489 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
   2490 	 * over the wire.
   2491 	 */
   2492 	if (error == EEXIST)
   2493 		*status = NFSERR_NOTEMPTY;
   2494 	else
   2495 		*status = puterrno(error);
   2496 
   2497 }
   2498 void *
   2499 rfs_rmdir_getfh(struct nfsdiropargs *da)
   2500 {
   2501 	return (da->da_fhandle);
   2502 }
   2503 
   2504 /* ARGSUSED */
   2505 void
   2506 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
   2507 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
   2508 {
   2509 	int error;
   2510 	int iseof;
   2511 	struct iovec iov;
   2512 	struct uio uio;
   2513 	vnode_t *vp;
   2514 	char *ndata = NULL;
   2515 	struct sockaddr *ca;
   2516 	size_t nents;
   2517 	int ret;
   2518 
   2519 	vp = nfs_fhtovp(&rda->rda_fh, exi);
   2520 	if (vp == NULL) {
   2521 		rd->rd_entries = NULL;
   2522 		rd->rd_status = NFSERR_STALE;
   2523 		return;
   2524 	}
   2525 
   2526 	if (vp->v_type != VDIR) {
   2527 		VN_RELE(vp);
   2528 		rd->rd_entries = NULL;
   2529 		rd->rd_status = NFSERR_NOTDIR;
   2530 		return;
   2531 	}
   2532 
   2533 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
   2534 
   2535 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
   2536 
   2537 	if (error) {
   2538 		rd->rd_entries = NULL;
   2539 		goto bad;
   2540 	}
   2541 
   2542 	if (rda->rda_count == 0) {
   2543 		rd->rd_entries = NULL;
   2544 		rd->rd_size = 0;
   2545 		rd->rd_eof = FALSE;
   2546 		goto bad;
   2547 	}
   2548 
   2549 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
   2550 
   2551 	/*
   2552 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
   2553 	 */
   2554 	rd->rd_bufsize = (uint_t)rda->rda_count;
   2555 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
   2556 
   2557 	/*
   2558 	 * Set up io vector to read directory data
   2559 	 */
   2560 	iov.iov_base = (caddr_t)rd->rd_entries;
   2561 	iov.iov_len = rda->rda_count;
   2562 	uio.uio_iov = &iov;
   2563 	uio.uio_iovcnt = 1;
   2564 	uio.uio_segflg = UIO_SYSSPACE;
   2565 	uio.uio_extflg = UIO_COPY_CACHED;
   2566 	uio.uio_loffset = (offset_t)rda->rda_offset;
   2567 	uio.uio_resid = rda->rda_count;
   2568 
   2569 	/*
   2570 	 * read directory
   2571 	 */
   2572 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
   2573 
   2574 	/*
   2575 	 * Clean up
   2576 	 */
   2577 	if (!error) {
   2578 		/*
   2579 		 * set size and eof
   2580 		 */
   2581 		if (uio.uio_resid == rda->rda_count) {
   2582 			rd->rd_size = 0;
   2583 			rd->rd_eof = TRUE;
   2584 		} else {
   2585 			rd->rd_size = (uint32_t)(rda->rda_count -
   2586 			    uio.uio_resid);
   2587 			rd->rd_eof = iseof ? TRUE : FALSE;
   2588 		}
   2589 	}
   2590 
   2591 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
   2592 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
   2593 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
   2594 	    rda->rda_count, &ndata);
   2595 
   2596 	if (ret != 0) {
   2597 		size_t dropbytes;
   2598 		/*
   2599 		 * We had to drop one or more entries in order to fit
   2600 		 * during the character conversion.  We need to patch
   2601 		 * up the size and eof info.
   2602 		 */
   2603 		if (rd->rd_eof)
   2604 			rd->rd_eof = FALSE;
   2605 		dropbytes = nfscmd_dropped_entrysize(
   2606 		    (struct dirent64 *)rd->rd_entries, nents, ret);
   2607 		rd->rd_size -= dropbytes;
   2608 	}
   2609 	if (ndata == NULL) {
   2610 		ndata = (char *)rd->rd_entries;
   2611 	} else if (ndata != (char *)rd->rd_entries) {
   2612 		kmem_free(rd->rd_entries, rd->rd_bufsize);
   2613 		rd->rd_entries = (void *)ndata;
   2614 		rd->rd_bufsize = rda->rda_count;
   2615 	}
   2616 
   2617 bad:
   2618 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
   2619 
   2620 #if 0 /* notyet */
   2621 	/*
   2622 	 * Don't do this.  It causes local disk writes when just
   2623 	 * reading the file and the overhead is deemed larger
   2624 	 * than the benefit.
   2625 	 */
   2626 	/*
   2627 	 * Force modified metadata out to stable storage.
   2628 	 */
   2629 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
   2630 #endif
   2631 
   2632 	VN_RELE(vp);
   2633 
   2634 	rd->rd_status = puterrno(error);
   2635 
   2636 }
   2637 void *
   2638 rfs_readdir_getfh(struct nfsrddirargs *rda)
   2639 {
   2640 	return (&rda->rda_fh);
   2641 }
   2642 void
   2643 rfs_rddirfree(struct nfsrddirres *rd)
   2644 {
   2645 	if (rd->rd_entries != NULL)
   2646 		kmem_free(rd->rd_entries, rd->rd_bufsize);
   2647 }
   2648 
   2649 /* ARGSUSED */
   2650 void
   2651 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
   2652 	struct svc_req *req, cred_t *cr)
   2653 {
   2654 	int error;
   2655 	struct statvfs64 sb;
   2656 	vnode_t *vp;
   2657 
   2658 	vp = nfs_fhtovp(fh, exi);
   2659 	if (vp == NULL) {
   2660 		fs->fs_status = NFSERR_STALE;
   2661 		return;
   2662 	}
   2663 
   2664 	error = VFS_STATVFS(vp->v_vfsp, &sb);
   2665 
   2666 	if (!error) {
   2667 		fs->fs_tsize = nfstsize();
   2668 		fs->fs_bsize = sb.f_frsize;
   2669 		fs->fs_blocks = sb.f_blocks;
   2670 		fs->fs_bfree = sb.f_bfree;
   2671 		fs->fs_bavail = sb.f_bavail;
   2672 	}
   2673 
   2674 	VN_RELE(vp);
   2675 
   2676 	fs->fs_status = puterrno(error);
   2677 
   2678 }
   2679 void *
   2680 rfs_statfs_getfh(fhandle_t *fh)
   2681 {
   2682 	return (fh);
   2683 }
   2684 
   2685 static int
   2686 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
   2687 {
   2688 	vap->va_mask = 0;
   2689 
   2690 	/*
   2691 	 * There was a sign extension bug in some VFS based systems
   2692 	 * which stored the mode as a short.  When it would get
   2693 	 * assigned to a u_long, no sign extension would occur.
   2694 	 * It needed to, but this wasn't noticed because sa_mode
   2695 	 * would then get assigned back to the short, thus ignoring
   2696 	 * the upper 16 bits of sa_mode.
   2697 	 *
   2698 	 * To make this implementation work for both broken
   2699 	 * clients and good clients, we check for both versions
   2700 	 * of the mode.
   2701 	 */
   2702 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
   2703 	    sa->sa_mode != (uint32_t)-1) {
   2704 		vap->va_mask |= AT_MODE;
   2705 		vap->va_mode = sa->sa_mode;
   2706 	}
   2707 	if (sa->sa_uid != (uint32_t)-1) {
   2708 		vap->va_mask |= AT_UID;
   2709 		vap->va_uid = sa->sa_uid;
   2710 	}
   2711 	if (sa->sa_gid != (uint32_t)-1) {
   2712 		vap->va_mask |= AT_GID;
   2713 		vap->va_gid = sa->sa_gid;
   2714 	}
   2715 	if (sa->sa_size != (uint32_t)-1) {
   2716 		vap->va_mask |= AT_SIZE;
   2717 		vap->va_size = sa->sa_size;
   2718 	}
   2719 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
   2720 	    sa->sa_atime.tv_usec != (int32_t)-1) {
   2721 #ifndef _LP64
   2722 		/* return error if time overflow */
   2723 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
   2724 			return (EOVERFLOW);
   2725 #endif
   2726 		vap->va_mask |= AT_ATIME;
   2727 		/*
   2728 		 * nfs protocol defines times as unsigned so don't extend sign,
   2729 		 * unless sysadmin set nfs_allow_preepoch_time.
   2730 		 */
   2731 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
   2732 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
   2733 	}
   2734 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
   2735 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
   2736 #ifndef _LP64
   2737 		/* return error if time overflow */
   2738 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
   2739 			return (EOVERFLOW);
   2740 #endif
   2741 		vap->va_mask |= AT_MTIME;
   2742 		/*
   2743 		 * nfs protocol defines times as unsigned so don't extend sign,
   2744 		 * unless sysadmin set nfs_allow_preepoch_time.
   2745 		 */
   2746 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
   2747 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
   2748 	}
   2749 	return (0);
   2750 }
   2751 
   2752 static enum nfsftype vt_to_nf[] = {
   2753 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
   2754 };
   2755 
   2756 /*
   2757  * check the following fields for overflow: nodeid, size, and time.
   2758  * There could be a problem when converting 64-bit LP64 fields
   2759  * into 32-bit ones.  Return an error if there is an overflow.
   2760  */
   2761 int
   2762 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
   2763 {
   2764 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
   2765 	na->na_type = vt_to_nf[vap->va_type];
   2766 
   2767 	if (vap->va_mode == (unsigned short) -1)
   2768 		na->na_mode = (uint32_t)-1;
   2769 	else
   2770 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
   2771 
   2772 	if (vap->va_uid == (unsigned short)(-1))
   2773 		na->na_uid = (uint32_t)(-1);
   2774 	else if (vap->va_uid == UID_NOBODY)
   2775 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
   2776 	else
   2777 		na->na_uid = vap->va_uid;
   2778 
   2779 	if (vap->va_gid == (unsigned short)(-1))
   2780 		na->na_gid = (uint32_t)-1;
   2781 	else if (vap->va_gid == GID_NOBODY)
   2782 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
   2783 	else
   2784 		na->na_gid = vap->va_gid;
   2785 
   2786 	/*
   2787 	 * Do we need to check fsid for overflow?  It is 64-bit in the
   2788 	 * vattr, but are bigger than 32 bit values supported?
   2789 	 */
   2790 	na->na_fsid = vap->va_fsid;
   2791 
   2792 	na->na_nodeid = vap->va_nodeid;
   2793 
   2794 	/*
   2795 	 * Check to make sure that the nodeid is representable over the
   2796 	 * wire without losing bits.
   2797 	 */
   2798 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
   2799 		return (EFBIG);
   2800 	na->na_nlink = vap->va_nlink;
   2801 
   2802 	/*
   2803 	 * Check for big files here, instead of at the caller.  See
   2804 	 * comments in cstat for large special file explanation.
   2805 	 */
   2806 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
   2807 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
   2808 			return (EFBIG);
   2809 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
   2810 			/* UNKNOWN_SIZE | OVERFLOW */
   2811 			na->na_size = MAXOFF32_T;
   2812 		} else
   2813 			na->na_size = vap->va_size;
   2814 	} else
   2815 		na->na_size = vap->va_size;
   2816 
   2817 	/*
   2818 	 * If the vnode times overflow the 32-bit times that NFS2
   2819 	 * uses on the wire then return an error.
   2820 	 */
   2821 	if (!NFS_VAP_TIME_OK(vap)) {
   2822 		return (EOVERFLOW);
   2823 	}
   2824 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
   2825 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
   2826 
   2827 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
   2828 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
   2829 
   2830 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
   2831 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
   2832 
   2833 	/*
   2834 	 * If the dev_t will fit into 16 bits then compress
   2835 	 * it, otherwise leave it alone. See comments in
   2836 	 * nfs_client.c.
   2837 	 */
   2838 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
   2839 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
   2840 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
   2841 	else
   2842 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
   2843 
   2844 	na->na_blocks = vap->va_nblocks;
   2845 	na->na_blocksize = vap->va_blksize;
   2846 
   2847 	/*
   2848 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
   2849 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
   2850 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
   2851 	 *
   2852 	 * BUYER BEWARE:
   2853 	 *  If you are porting the NFS to a non-Sun server, you probably
   2854 	 *  don't want to include the following block of code.  The
   2855 	 *  over-the-wire special file types will be changing with the
   2856 	 *  NFS Protocol Revision.
   2857 	 */
   2858 	if (vap->va_type == VFIFO)
   2859 		NA_SETFIFO(na);
   2860 	return (0);
   2861 }
   2862 
   2863 /*
   2864  * acl v2 support: returns approximate permission.
   2865  *	default: returns minimal permission (more restrictive)
   2866  *	aclok: returns maximal permission (less restrictive)
   2867  *	This routine changes the permissions that are alaredy in *va.
   2868  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
   2869  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
   2870  */
   2871 static void
   2872 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
   2873 {
   2874 	vsecattr_t	vsa;
   2875 	int		aclcnt;
   2876 	aclent_t	*aclentp;
   2877 	mode_t		mask_perm;
   2878 	mode_t		grp_perm;
   2879 	mode_t		other_perm;
   2880 	mode_t		other_orig;
   2881 	int		error;
   2882 
   2883 	/* dont care default acl */
   2884 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
   2885 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
   2886 
   2887 	if (!error) {
   2888 		aclcnt = vsa.vsa_aclcnt;
   2889 		if (aclcnt > MIN_ACL_ENTRIES) {
   2890 			/* non-trivial ACL */
   2891 			aclentp = vsa.vsa_aclentp;
   2892 			if (exi->exi_export.ex_flags & EX_ACLOK) {
   2893 				/* maximal permissions */
   2894 				grp_perm = 0;
   2895 				other_perm = 0;
   2896 				for (; aclcnt > 0; aclcnt--, aclentp++) {
   2897 					switch (aclentp->a_type) {
   2898 					case USER_OBJ:
   2899 						break;
   2900 					case USER:
   2901 						grp_perm |=
   2902 						    aclentp->a_perm << 3;
   2903 						other_perm |= aclentp->a_perm;
   2904 						break;
   2905 					case GROUP_OBJ:
   2906 						grp_perm |=
   2907 						    aclentp->a_perm << 3;
   2908 						break;
   2909 					case GROUP:
   2910 						other_perm |= aclentp->a_perm;
   2911 						break;
   2912 					case OTHER_OBJ:
   2913 						other_orig = aclentp->a_perm;
   2914 						break;
   2915 					case CLASS_OBJ:
   2916 						mask_perm = aclentp->a_perm;
   2917 						break;
   2918 					default:
   2919 						break;
   2920 					}
   2921 				}
   2922 				grp_perm &= mask_perm << 3;
   2923 				other_perm &= mask_perm;
   2924 				other_perm |= other_orig;
   2925 
   2926 			} else {
   2927 				/* minimal permissions */
   2928 				grp_perm = 070;
   2929 				other_perm = 07;
   2930 				for (; aclcnt > 0; aclcnt--, aclentp++) {
   2931 					switch (aclentp->a_type) {
   2932 					case USER_OBJ:
   2933 						break;
   2934 					case USER:
   2935 					case CLASS_OBJ:
   2936 						grp_perm &=
   2937 						    aclentp->a_perm << 3;
   2938 						other_perm &=
   2939 						    aclentp->a_perm;
   2940 						break;
   2941 					case GROUP_OBJ:
   2942 						grp_perm &=
   2943 						    aclentp->a_perm << 3;
   2944 						break;
   2945 					case GROUP:
   2946 						other_perm &=
   2947 						    aclentp->a_perm;
   2948 						break;
   2949 					case OTHER_OBJ:
   2950 						other_perm &=
   2951 						    aclentp->a_perm;
   2952 						break;
   2953 					default:
   2954 						break;
   2955 					}
   2956 				}
   2957 			}
   2958 			/* copy to va */
   2959 			va->va_mode &= ~077;
   2960 			va->va_mode |= grp_perm | other_perm;
   2961 		}
   2962 		if (vsa.vsa_aclcnt)
   2963 			kmem_free(vsa.vsa_aclentp,
   2964 			    vsa.vsa_aclcnt * sizeof (aclent_t));
   2965 	}
   2966 }
   2967 
   2968 void
   2969 rfs_srvrinit(void)
   2970 {
   2971 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
   2972 	nfs2_srv_caller_id = fs_new_caller_id();
   2973 }
   2974 
   2975 void
   2976 rfs_srvrfini(void)
   2977 {
   2978 	mutex_destroy(&rfs_async_write_lock);
   2979 }
   2980 
   2981 static int
   2982 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
   2983 {
   2984 	struct clist	*wcl;
   2985 	int		wlist_len;
   2986 	uint32_t	count = rr->rr_count;
   2987 
   2988 	wcl = ra->ra_wlist;
   2989 
   2990 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
   2991 		return (FALSE);
   2992 	}
   2993 
   2994 	wcl = ra->ra_wlist;
   2995 	rr->rr_ok.rrok_wlist_len = wlist_len;
   2996 	rr->rr_ok.rrok_wlist = wcl;
   2997 
   2998 	return (TRUE);
   2999 }
   3000