Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /* Portions Copyright 2007 Jeremy Teo */
     27 
     28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     29 
     30 #include <sys/types.h>
     31 #include <sys/param.h>
     32 #include <sys/time.h>
     33 #include <sys/systm.h>
     34 #include <sys/sysmacros.h>
     35 #include <sys/resource.h>
     36 #include <sys/vfs.h>
     37 #include <sys/vfs_opreg.h>
     38 #include <sys/vnode.h>
     39 #include <sys/file.h>
     40 #include <sys/stat.h>
     41 #include <sys/kmem.h>
     42 #include <sys/taskq.h>
     43 #include <sys/uio.h>
     44 #include <sys/vmsystm.h>
     45 #include <sys/atomic.h>
     46 #include <sys/vm.h>
     47 #include <vm/seg_vn.h>
     48 #include <vm/pvn.h>
     49 #include <vm/as.h>
     50 #include <sys/mman.h>
     51 #include <sys/pathname.h>
     52 #include <sys/cmn_err.h>
     53 #include <sys/errno.h>
     54 #include <sys/unistd.h>
     55 #include <sys/zfs_dir.h>
     56 #include <sys/zfs_acl.h>
     57 #include <sys/zfs_ioctl.h>
     58 #include <sys/fs/zfs.h>
     59 #include <sys/dmu.h>
     60 #include <sys/spa.h>
     61 #include <sys/txg.h>
     62 #include <sys/dbuf.h>
     63 #include <sys/zap.h>
     64 #include <sys/dirent.h>
     65 #include <sys/policy.h>
     66 #include <sys/sunddi.h>
     67 #include <sys/filio.h>
     68 #include "fs/fs_subr.h"
     69 #include <sys/zfs_ctldir.h>
     70 #include <sys/zfs_fuid.h>
     71 #include <sys/dnlc.h>
     72 #include <sys/zfs_rlock.h>
     73 #include <sys/extdirent.h>
     74 #include <sys/kidmap.h>
     75 #include <sys/cred_impl.h>
     76 #include <sys/attr.h>
     77 
     78 /*
     79  * Programming rules.
     80  *
     81  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
     82  * properly lock its in-core state, create a DMU transaction, do the work,
     83  * record this work in the intent log (ZIL), commit the DMU transaction,
     84  * and wait for the intent log to commit if it is a synchronous operation.
     85  * Moreover, the vnode ops must work in both normal and log replay context.
     86  * The ordering of events is important to avoid deadlocks and references
     87  * to freed memory.  The example below illustrates the following Big Rules:
     88  *
     89  *  (1) A check must be made in each zfs thread for a mounted file system.
     90  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
     91  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
     92  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
     93  *      can return EIO from the calling function.
     94  *
     95  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
     96  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
     97  *	First, if it's the last reference, the vnode/znode
     98  *	can be freed, so the zp may point to freed memory.  Second, the last
     99  *	reference will call zfs_zinactive(), which may induce a lot of work --
    100  *	pushing cached pages (which acquires range locks) and syncing out
    101  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
    102  *	which could deadlock the system if you were already holding one.
    103  *
    104  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
    105  *	as they can span dmu_tx_assign() calls.
    106  *
    107  *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
    108  *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
    109  *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
    110  *	This is critical because we don't want to block while holding locks.
    111  *	Note, in particular, that if a lock is sometimes acquired before
    112  *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
    113  *	use a non-blocking assign can deadlock the system.  The scenario:
    114  *
    115  *	Thread A has grabbed a lock before calling dmu_tx_assign().
    116  *	Thread B is in an already-assigned tx, and blocks for this lock.
    117  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
    118  *	forever, because the previous txg can't quiesce until B's tx commits.
    119  *
    120  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
    121  *	then drop all locks, call dmu_tx_wait(), and try again.
    122  *
    123  *  (5)	If the operation succeeded, generate the intent log entry for it
    124  *	before dropping locks.  This ensures that the ordering of events
    125  *	in the intent log matches the order in which they actually occurred.
    126  *
    127  *  (6)	At the end of each vnode op, the DMU tx must always commit,
    128  *	regardless of whether there were any errors.
    129  *
    130  *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
    131  *	to ensure that synchronous semantics are provided when necessary.
    132  *
    133  * In general, this is how things should be ordered in each vnode op:
    134  *
    135  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
    136  * top:
    137  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
    138  *	rw_enter(...);			// grab any other locks you need
    139  *	tx = dmu_tx_create(...);	// get DMU tx
    140  *	dmu_tx_hold_*();		// hold each object you might modify
    141  *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
    142  *	if (error) {
    143  *		rw_exit(...);		// drop locks
    144  *		zfs_dirent_unlock(dl);	// unlock directory entry
    145  *		VN_RELE(...);		// release held vnodes
    146  *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
    147  *			dmu_tx_wait(tx);
    148  *			dmu_tx_abort(tx);
    149  *			goto top;
    150  *		}
    151  *		dmu_tx_abort(tx);	// abort DMU tx
    152  *		ZFS_EXIT(zfsvfs);	// finished in zfs
    153  *		return (error);		// really out of space
    154  *	}
    155  *	error = do_real_work();		// do whatever this VOP does
    156  *	if (error == 0)
    157  *		zfs_log_*(...);		// on success, make ZIL entry
    158  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
    159  *	rw_exit(...);			// drop locks
    160  *	zfs_dirent_unlock(dl);		// unlock directory entry
    161  *	VN_RELE(...);			// release held vnodes
    162  *	zil_commit(zilog, seq, foid);	// synchronous when necessary
    163  *	ZFS_EXIT(zfsvfs);		// finished in zfs
    164  *	return (error);			// done, report error
    165  */
    166 
    167 /* ARGSUSED */
    168 static int
    169 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
    170 {
    171 	znode_t	*zp = VTOZ(*vpp);
    172 
    173 	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
    174 	    ((flag & FAPPEND) == 0)) {
    175 		return (EPERM);
    176 	}
    177 
    178 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
    179 	    ZTOV(zp)->v_type == VREG &&
    180 	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
    181 	    zp->z_phys->zp_size > 0)
    182 		if (fs_vscan(*vpp, cr, 0) != 0)
    183 			return (EACCES);
    184 
    185 	/* Keep a count of the synchronous opens in the znode */
    186 	if (flag & (FSYNC | FDSYNC))
    187 		atomic_inc_32(&zp->z_sync_cnt);
    188 
    189 	return (0);
    190 }
    191 
    192 /* ARGSUSED */
    193 static int
    194 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    195     caller_context_t *ct)
    196 {
    197 	znode_t	*zp = VTOZ(vp);
    198 
    199 	/* Decrement the synchronous opens in the znode */
    200 	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
    201 		atomic_dec_32(&zp->z_sync_cnt);
    202 
    203 	/*
    204 	 * Clean up any locks held by this process on the vp.
    205 	 */
    206 	cleanlocks(vp, ddi_get_pid(), 0);
    207 	cleanshares(vp, ddi_get_pid());
    208 
    209 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
    210 	    ZTOV(zp)->v_type == VREG &&
    211 	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
    212 	    zp->z_phys->zp_size > 0)
    213 		VERIFY(fs_vscan(vp, cr, 1) == 0);
    214 
    215 	return (0);
    216 }
    217 
    218 /*
    219  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
    220  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
    221  */
    222 static int
    223 zfs_holey(vnode_t *vp, int cmd, offset_t *off)
    224 {
    225 	znode_t	*zp = VTOZ(vp);
    226 	uint64_t noff = (uint64_t)*off; /* new offset */
    227 	uint64_t file_sz;
    228 	int error;
    229 	boolean_t hole;
    230 
    231 	file_sz = zp->z_phys->zp_size;
    232 	if (noff >= file_sz)  {
    233 		return (ENXIO);
    234 	}
    235 
    236 	if (cmd == _FIO_SEEK_HOLE)
    237 		hole = B_TRUE;
    238 	else
    239 		hole = B_FALSE;
    240 
    241 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
    242 
    243 	/* end of file? */
    244 	if ((error == ESRCH) || (noff > file_sz)) {
    245 		/*
    246 		 * Handle the virtual hole at the end of file.
    247 		 */
    248 		if (hole) {
    249 			*off = file_sz;
    250 			return (0);
    251 		}
    252 		return (ENXIO);
    253 	}
    254 
    255 	if (noff < *off)
    256 		return (error);
    257 	*off = noff;
    258 	return (error);
    259 }
    260 
    261 /* ARGSUSED */
    262 static int
    263 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    264     int *rvalp, caller_context_t *ct)
    265 {
    266 	offset_t off;
    267 	int error;
    268 	zfsvfs_t *zfsvfs;
    269 	znode_t *zp;
    270 
    271 	switch (com) {
    272 	case _FIOFFS:
    273 		return (zfs_sync(vp->v_vfsp, 0, cred));
    274 
    275 		/*
    276 		 * The following two ioctls are used by bfu.  Faking out,
    277 		 * necessary to avoid bfu errors.
    278 		 */
    279 	case _FIOGDIO:
    280 	case _FIOSDIO:
    281 		return (0);
    282 
    283 	case _FIO_SEEK_DATA:
    284 	case _FIO_SEEK_HOLE:
    285 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
    286 			return (EFAULT);
    287 
    288 		zp = VTOZ(vp);
    289 		zfsvfs = zp->z_zfsvfs;
    290 		ZFS_ENTER(zfsvfs);
    291 		ZFS_VERIFY_ZP(zp);
    292 
    293 		/* offset parameter is in/out */
    294 		error = zfs_holey(vp, com, &off);
    295 		ZFS_EXIT(zfsvfs);
    296 		if (error)
    297 			return (error);
    298 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
    299 			return (EFAULT);
    300 		return (0);
    301 	}
    302 	return (ENOTTY);
    303 }
    304 
    305 /*
    306  * When a file is memory mapped, we must keep the IO data synchronized
    307  * between the DMU cache and the memory mapped pages.  What this means:
    308  *
    309  * On Write:	If we find a memory mapped page, we write to *both*
    310  *		the page and the dmu buffer.
    311  *
    312  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
    313  *	the file is memory mapped.
    314  */
    315 static int
    316 mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
    317 {
    318 	znode_t	*zp = VTOZ(vp);
    319 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    320 	int64_t	start, off;
    321 	int len = nbytes;
    322 	int error = 0;
    323 
    324 	start = uio->uio_loffset;
    325 	off = start & PAGEOFFSET;
    326 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
    327 		page_t *pp;
    328 		uint64_t bytes = MIN(PAGESIZE - off, len);
    329 		uint64_t woff = uio->uio_loffset;
    330 
    331 		/*
    332 		 * We don't want a new page to "appear" in the middle of
    333 		 * the file update (because it may not get the write
    334 		 * update data), so we grab a lock to block
    335 		 * zfs_getpage().
    336 		 */
    337 		rw_enter(&zp->z_map_lock, RW_WRITER);
    338 		if (pp = page_lookup(vp, start, SE_SHARED)) {
    339 			caddr_t va;
    340 
    341 			rw_exit(&zp->z_map_lock);
    342 			va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L);
    343 			error = uiomove(va+off, bytes, UIO_WRITE, uio);
    344 			if (error == 0) {
    345 				dmu_write(zfsvfs->z_os, zp->z_id,
    346 				    woff, bytes, va+off, tx);
    347 			}
    348 			ppmapout(va);
    349 			page_unlock(pp);
    350 		} else {
    351 			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
    352 			    uio, bytes, tx);
    353 			rw_exit(&zp->z_map_lock);
    354 		}
    355 		len -= bytes;
    356 		off = 0;
    357 		if (error)
    358 			break;
    359 	}
    360 	return (error);
    361 }
    362 
    363 /*
    364  * When a file is memory mapped, we must keep the IO data synchronized
    365  * between the DMU cache and the memory mapped pages.  What this means:
    366  *
    367  * On Read:	We "read" preferentially from memory mapped pages,
    368  *		else we default from the dmu buffer.
    369  *
    370  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
    371  *	the file is memory mapped.
    372  */
    373 static int
    374 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
    375 {
    376 	znode_t *zp = VTOZ(vp);
    377 	objset_t *os = zp->z_zfsvfs->z_os;
    378 	int64_t	start, off;
    379 	int len = nbytes;
    380 	int error = 0;
    381 
    382 	start = uio->uio_loffset;
    383 	off = start & PAGEOFFSET;
    384 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
    385 		page_t *pp;
    386 		uint64_t bytes = MIN(PAGESIZE - off, len);
    387 
    388 		if (pp = page_lookup(vp, start, SE_SHARED)) {
    389 			caddr_t va;
    390 
    391 			va = ppmapin(pp, PROT_READ, (caddr_t)-1L);
    392 			error = uiomove(va + off, bytes, UIO_READ, uio);
    393 			ppmapout(va);
    394 			page_unlock(pp);
    395 		} else {
    396 			error = dmu_read_uio(os, zp->z_id, uio, bytes);
    397 		}
    398 		len -= bytes;
    399 		off = 0;
    400 		if (error)
    401 			break;
    402 	}
    403 	return (error);
    404 }
    405 
/*
 * Largest number of bytes zfs_read() passes to a single mappedread() or
 * dmu_read_uio() call; zfs_read() also cuts each chunk so that it does
 * not cross a multiple of this value.
 *
 * NOTE(review): zfs_read() applies P2PHASE() to this value, so it
 * presumably has to stay a power of two -- confirm before tuning.
 */
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
    407 
    408 /*
    409  * Read bytes from specified file into supplied buffer.
    410  *
    411  *	IN:	vp	- vnode of file to be read from.
    412  *		uio	- structure supplying read location, range info,
    413  *			  and return buffer.
    414  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
    415  *		cr	- credentials of caller.
    416  *		ct	- caller context
    417  *
    418  *	OUT:	uio	- updated offset and range, buffer filled.
    419  *
    420  *	RETURN:	0 if success
    421  *		error code if failure
    422  *
    423  * Side Effects:
    424  *	vp - atime updated if byte count > 0
    425  */
    426 /* ARGSUSED */
    427 static int
    428 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
    429 {
    430 	znode_t		*zp = VTOZ(vp);
    431 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
    432 	objset_t	*os;
    433 	ssize_t		n, nbytes;
    434 	int		error;
    435 	rl_t		*rl;
    436 
    437 	ZFS_ENTER(zfsvfs);
    438 	ZFS_VERIFY_ZP(zp);
    439 	os = zfsvfs->z_os;
    440 
    441 	if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
    442 		ZFS_EXIT(zfsvfs);
    443 		return (EACCES);
    444 	}
    445 
    446 	/*
    447 	 * Validate file offset
    448 	 */
    449 	if (uio->uio_loffset < (offset_t)0) {
    450 		ZFS_EXIT(zfsvfs);
    451 		return (EINVAL);
    452 	}
    453 
    454 	/*
    455 	 * Fasttrack empty reads
    456 	 */
    457 	if (uio->uio_resid == 0) {
    458 		ZFS_EXIT(zfsvfs);
    459 		return (0);
    460 	}
    461 
    462 	/*
    463 	 * Check for mandatory locks
    464 	 */
    465 	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
    466 		if (error = chklock(vp, FREAD,
    467 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
    468 			ZFS_EXIT(zfsvfs);
    469 			return (error);
    470 		}
    471 	}
    472 
    473 	/*
    474 	 * If we're in FRSYNC mode, sync out this znode before reading it.
    475 	 */
    476 	if (ioflag & FRSYNC)
    477 		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
    478 
    479 	/*
    480 	 * Lock the range against changes.
    481 	 */
    482 	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
    483 
    484 	/*
    485 	 * If we are reading past end-of-file we can skip
    486 	 * to the end; but we might still need to set atime.
    487 	 */
    488 	if (uio->uio_loffset >= zp->z_phys->zp_size) {
    489 		error = 0;
    490 		goto out;
    491 	}
    492 
    493 	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
    494 	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
    495 
    496 	while (n > 0) {
    497 		nbytes = MIN(n, zfs_read_chunk_size -
    498 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
    499 
    500 		if (vn_has_cached_data(vp))
    501 			error = mappedread(vp, nbytes, uio);
    502 		else
    503 			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
    504 		if (error)
    505 			break;
    506 
    507 		n -= nbytes;
    508 	}
    509 
    510 out:
    511 	zfs_range_unlock(rl);
    512 
    513 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
    514 	ZFS_EXIT(zfsvfs);
    515 	return (error);
    516 }
    517 
    518 /*
    519  * Fault in the pages of the first n bytes specified by the uio structure.
    520  * 1 byte in each page is touched and the uio struct is unmodified.
    521  * Any error will exit this routine as this is only a best
    522  * attempt to get the pages resident. This is a copy of ufs_trans_touch().
    523  */
    524 static void
    525 zfs_prefault_write(ssize_t n, struct uio *uio)
    526 {
    527 	struct iovec *iov;
    528 	ulong_t cnt, incr;
    529 	caddr_t p;
    530 	uint8_t tmp;
    531 
    532 	iov = uio->uio_iov;
    533 
    534 	while (n) {
    535 		cnt = MIN(iov->iov_len, n);
    536 		if (cnt == 0) {
    537 			/* empty iov entry */
    538 			iov++;
    539 			continue;
    540 		}
    541 		n -= cnt;
    542 		/*
    543 		 * touch each page in this segment.
    544 		 */
    545 		p = iov->iov_base;
    546 		while (cnt) {
    547 			switch (uio->uio_segflg) {
    548 			case UIO_USERSPACE:
    549 			case UIO_USERISPACE:
    550 				if (fuword8(p, &tmp))
    551 					return;
    552 				break;
    553 			case UIO_SYSSPACE:
    554 				if (kcopy(p, &tmp, 1))
    555 					return;
    556 				break;
    557 			}
    558 			incr = MIN(cnt, PAGESIZE);
    559 			p += incr;
    560 			cnt -= incr;
    561 		}
    562 		/*
    563 		 * touch the last byte in case it straddles a page.
    564 		 */
    565 		p--;
    566 		switch (uio->uio_segflg) {
    567 		case UIO_USERSPACE:
    568 		case UIO_USERISPACE:
    569 			if (fuword8(p, &tmp))
    570 				return;
    571 			break;
    572 		case UIO_SYSSPACE:
    573 			if (kcopy(p, &tmp, 1))
    574 				return;
    575 			break;
    576 		}
    577 		iov++;
    578 	}
    579 }
    580 
/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND flag set if in append mode; FSYNC or
 *			  FDSYNC request a zil_commit() before returning.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success (including a partial write outside of
 *		    replay mode)
 *		EIO from ZFS_ENTER()/ZFS_VERIFY_ZP()
 *		EPERM if the file is immutable or read-only, or is
 *		    append-only and the write is not an append
 *		EINVAL if the starting offset is negative
 *		EFBIG if the starting offset is at or beyond the limit
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	uint64_t	pflags;
	int		error;

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	/*
	 * Never let the file size limit exceed the largest offset.
	 */
	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If immutable or not appending then return EPERM
	 */
	pflags = zp->z_phys->zp_flags;
	if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_phys->zp_size))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	zilog = zfsvfs->z_log;

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 */
	zfs_prefault_write(n, uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Range lock for a file append:
		 * The value for the start of range will be determined by
		 * zfs_range_lock() (to guarantee append semantics).
		 * If this write will cause the block size to increase,
		 * zfs_range_lock() will lock the entire file, so we must
		 * later reduce the range after we grow the block size.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		if (rl->r_len == UINT64_MAX) {
			/* overlocked, zp_size can't change */
			woff = uio->uio_loffset = zp->z_phys->zp_size;
		} else {
			woff = uio->uio_loffset = rl->r_off;
		}
	} else {
		woff = uio->uio_loffset;
		/*
		 * Validate file offset
		 */
		if (woff < 0) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * If we need to grow the block size then zfs_range_lock()
		 * will lock a wider range than we request here.
		 * Later after growing the block size we reduce the range.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	/*
	 * From here on the range lock is held, so every exit path has
	 * to drop it.
	 */
	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	/*
	 * Trim the request so that it ends at the limit.
	 */
	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	end_size = MAX(zp->z_phys->zp_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		/*
		 * Start a transaction.
		 */
		woff = uio->uio_loffset;
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			/*
			 * Nothing has been consumed from the uio and n is
			 * unchanged, so after waiting the same chunk is
			 * simply retried with a fresh tx.
			 */
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				continue;
			}
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
		rw_enter(&zp->z_map_lock, RW_READER);

		/*
		 * tx_bytes ends up as the number of bytes this pass
		 * actually consumed from the uio (resid before - after).
		 */
		tx_bytes = uio->uio_resid;
		if (vn_has_cached_data(vp)) {
			rw_exit(&zp->z_map_lock);
			error = mappedwrite(vp, nbytes, uio, tx);
		} else {
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
			    uio, nbytes, tx);
			rw_exit(&zp->z_map_lock);
		}
		tx_bytes -= uio->uio_resid;

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
		    zp->z_phys->zp_uid == 0) != 0) {
			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
		}
		mutex_exit(&zp->z_acl_lock);

		/*
		 * Update time stamp.  NOTE: This marks the bonus buffer as
		 * dirty, so we don't have to do it again for zp_size.
		 */
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
			    uio->uio_loffset);
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Synchronous write: wait for the intent log to commit.
	 */
	if (ioflag & (FSYNC | FDSYNC))
		zil_commit(zilog, zp->z_last_itx, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}
    840 
    841 void
    842 zfs_get_done(dmu_buf_t *db, void *vzgd)
    843 {
    844 	zgd_t *zgd = (zgd_t *)vzgd;
    845 	rl_t *rl = zgd->zgd_rl;
    846 	vnode_t *vp = ZTOV(rl->r_zp);
    847 
    848 	dmu_buf_rele(db, vzgd);
    849 	zfs_range_unlock(rl);
    850 	VN_RELE(vp);
    851 	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
    852 	kmem_free(zgd, sizeof (zgd_t));
    853 }
    854 
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 *	IN:	arg	- the zfsvfs_t of the file system.
 *		lr	- write log record; lr_foid, lr_offset, lr_length
 *			  and lr_common.lrc_txg are read.
 *		buf	- destination for the data of an immediate write,
 *			  or NULL to request an indirect write.
 *		zio	- passed to dmu_sync() for an indirect write.
 *
 *	OUT:	buf	- filled with lr_length bytes (immediate write).
 *		lr	- lr_blkoff and lr_blkptr set (indirect write).
 *
 *	RETURN:	0 if success, or if dmu_sync() returned EINPROGRESS, in
 *		    which case zfs_get_done() does the cleanup later
 *		ENOENT if the file cannot be found, has been unlinked,
 *		    or no longer extends to lr_offset
 *		otherwise the error from dmu_sync()
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t off = lr->lr_offset;
	dmu_buf_t *db;
	rl_t *rl;
	zgd_t *zgd;
	int dlen = lr->lr_length;		/* length of user data */
	int error = 0;

	ASSERT(zio);
	ASSERT(dlen != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		VN_RELE(ZTOV(zp));
		return (ENOENT);
	}

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		rl = zfs_range_lock(zp, off, dlen, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
	} else { /* indirect write */
		uint64_t boff; /* block starting offset */

		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			if (ISP2(zp->z_blksz)) {
				boff = P2ALIGN_TYPED(off, zp->z_blksz,
				    uint64_t);
			} else {
				boff = 0;
			}
			dlen = zp->z_blksz;
			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
			if (zp->z_blksz == dlen)
				break;
			zfs_range_unlock(rl);
		}
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		/*
		 * The zgd carries what zfs_get_done() needs for its
		 * cleanup; it also serves as the tag for the dbuf hold.
		 */
		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
		zgd->zgd_rl = rl;
		zgd->zgd_zilog = zfsvfs->z_log;
		zgd->zgd_bp = &lr->lr_blkptr;
		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
		ASSERT(boff == db->db_offset);
		lr->lr_blkoff = off - boff;
		error = dmu_sync(zio, db, &lr->lr_blkptr,
		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
		ASSERT((error && error != EINPROGRESS) ||
		    lr->lr_length <= zp->z_blksz);
		if (error == 0)
			zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
		/*
		 * If we get EINPROGRESS, then we need to wait for a
		 * write IO initiated by dmu_sync() to complete before
		 * we can release this dbuf.  We will finish everything
		 * up in the zfs_get_done() callback.
		 */
		if (error == EINPROGRESS)
			return (0);
		dmu_buf_rele(db, zgd);
		kmem_free(zgd, sizeof (zgd_t));
	}
out:
	/* Common exit: drop the range lock and the hold from zfs_zget(). */
	zfs_range_unlock(rl);
	VN_RELE(ZTOV(zp));
	return (error);
}
    955 
    956 /*ARGSUSED*/
    957 static int
    958 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    959     caller_context_t *ct)
    960 {
    961 	znode_t *zp = VTOZ(vp);
    962 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    963 	int error;
    964 
    965 	ZFS_ENTER(zfsvfs);
    966 	ZFS_VERIFY_ZP(zp);
    967 
    968 	if (flag & V_ACE_MASK)
    969 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
    970 	else
    971 		error = zfs_zaccess_rwx(zp, mode, flag, cr);
    972 
    973 	ZFS_EXIT(zfsvfs);
    974 	return (error);
    975 }
    976 
    977 /*
    978  * Lookup an entry in a directory, or an extended attribute directory.
    979  * If it exists, return a held vnode reference for it.
    980  *
    981  *	IN:	dvp	- vnode of directory to search.
    982  *		nm	- name of entry to lookup.
    983  *		pnp	- full pathname to lookup [UNUSED].
    984  *		flags	- LOOKUP_XATTR set if looking for an attribute.
    985  *		rdir	- root directory vnode [UNUSED].
    986  *		cr	- credentials of caller.
    987  *		ct	- caller context
    988  *		direntflags - directory lookup flags
    989  *		realpnp - returned pathname.
    990  *
    991  *	OUT:	vpp	- vnode of located entry, NULL if not found.
    992  *
    993  *	RETURN:	0 if success
    994  *		error code if failure
    995  *
    996  * Timestamps:
    997  *	NA
    998  */
    999 /* ARGSUSED */
   1000 static int
   1001 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
   1002     int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
   1003     int *direntflags, pathname_t *realpnp)
   1004 {
   1005 	znode_t *zdp = VTOZ(dvp);
   1006 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
   1007 	int	error;
   1008 
   1009 	ZFS_ENTER(zfsvfs);
   1010 	ZFS_VERIFY_ZP(zdp);
   1011 
   1012 	*vpp = NULL;
   1013 
   1014 	if (flags & LOOKUP_XATTR) {
   1015 		/*
   1016 		 * If the xattr property is off, refuse the lookup request.
   1017 		 */
   1018 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
   1019 			ZFS_EXIT(zfsvfs);
   1020 			return (EINVAL);
   1021 		}
   1022 
   1023 		/*
   1024 		 * We don't allow recursive attributes..
   1025 		 * Maybe someday we will.
   1026 		 */
   1027 		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
   1028 			ZFS_EXIT(zfsvfs);
   1029 			return (EINVAL);
   1030 		}
   1031 
   1032 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
   1033 			ZFS_EXIT(zfsvfs);
   1034 			return (error);
   1035 		}
   1036 
   1037 		/*
   1038 		 * Do we have permission to get into attribute directory?
   1039 		 */
   1040 
   1041 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
   1042 		    B_FALSE, cr)) {
   1043 			VN_RELE(*vpp);
   1044 			*vpp = NULL;
   1045 		}
   1046 
   1047 		ZFS_EXIT(zfsvfs);
   1048 		return (error);
   1049 	}
   1050 
   1051 	if (dvp->v_type != VDIR) {
   1052 		ZFS_EXIT(zfsvfs);
   1053 		return (ENOTDIR);
   1054 	}
   1055 
   1056 	/*
   1057 	 * Check accessibility of directory.
   1058 	 */
   1059 
   1060 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
   1061 		ZFS_EXIT(zfsvfs);
   1062 		return (error);
   1063 	}
   1064 
   1065 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
   1066 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
   1067 		ZFS_EXIT(zfsvfs);
   1068 		return (EILSEQ);
   1069 	}
   1070 
   1071 	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
   1072 	if (error == 0) {
   1073 		/*
   1074 		 * Convert device special files
   1075 		 */
   1076 		if (IS_DEVVP(*vpp)) {
   1077 			vnode_t	*svp;
   1078 
   1079 			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
   1080 			VN_RELE(*vpp);
   1081 			if (svp == NULL)
   1082 				error = ENOSYS;
   1083 			else
   1084 				*vpp = svp;
   1085 		}
   1086 	}
   1087 
   1088 	ZFS_EXIT(zfsvfs);
   1089 	return (error);
   1090 }
   1091 
   1092 /*
   1093  * Attempt to create a new entry in a directory.  If the entry
   1094  * already exists, truncate the file if permissible, else return
   1095  * an error.  Return the vp of the created or trunc'd file.
   1096  *
   1097  *	IN:	dvp	- vnode of directory to put new file entry in.
   1098  *		name	- name of new file entry.
   1099  *		vap	- attributes of new file.
   1100  *		excl	- flag indicating exclusive or non-exclusive mode.
   1101  *		mode	- mode to open file with.
   1102  *		cr	- credentials of caller.
   1103  *		flag	- large file flag [UNUSED].
   1104  *		ct	- caller context
   1105  *		vsecp 	- ACL to be set
   1106  *
   1107  *	OUT:	vpp	- vnode of created or trunc'd entry.
   1108  *
   1109  *	RETURN:	0 if success
   1110  *		error code if failure
   1111  *
   1112  * Timestamps:
   1113  *	dvp - ctime|mtime updated if new entry created
   1114  *	 vp - ctime|mtime always, atime if new
   1115  */
   1116 
   1117 /* ARGSUSED */
   1118 static int
   1119 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
   1120     int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
   1121     vsecattr_t *vsecp)
   1122 {
   1123 	znode_t		*zp, *dzp = VTOZ(dvp);
   1124 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
   1125 	zilog_t		*zilog;
   1126 	objset_t	*os;
   1127 	zfs_dirlock_t	*dl;
   1128 	dmu_tx_t	*tx;
   1129 	int		error;
   1130 	zfs_acl_t	*aclp = NULL;
   1131 	zfs_fuid_info_t *fuidp = NULL;
   1132 
   1133 	/*
   1134 	 * If we have an ephemeral id, ACL, or XVATTR then
   1135 	 * make sure file system is at proper version
   1136 	 */
   1137