Home | History | Annotate | Download | only in zfs
      1    789     ahrens /*
      2    789     ahrens  * CDDL HEADER START
      3    789     ahrens  *
      4    789     ahrens  * The contents of this file are subject to the terms of the
      5   1460      marks  * Common Development and Distribution License (the "License").
      6   1460      marks  * You may not use this file except in compliance with the License.
      7    789     ahrens  *
      8    789     ahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9    789     ahrens  * or http://www.opensolaris.org/os/licensing.
     10    789     ahrens  * See the License for the specific language governing permissions
     11    789     ahrens  * and limitations under the License.
     12    789     ahrens  *
     13    789     ahrens  * When distributing Covered Code, include this CDDL HEADER in each
     14    789     ahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15    789     ahrens  * If applicable, add the following below this CDDL HEADER, with the
     16    789     ahrens  * fields enclosed by brackets "[]" replaced with your own identifying
     17    789     ahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
     18    789     ahrens  *
     19    789     ahrens  * CDDL HEADER END
     20    789     ahrens  */
     21    789     ahrens /*
     22   8636       Mark  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23    789     ahrens  * Use is subject to license terms.
     24    789     ahrens  */
     25   4144      peteh 
     26   4144      peteh /* Portions Copyright 2007 Jeremy Teo */
     27    789     ahrens 
     28    789     ahrens #include <sys/types.h>
     29    789     ahrens #include <sys/param.h>
     30    789     ahrens #include <sys/time.h>
     31    789     ahrens #include <sys/systm.h>
     32    789     ahrens #include <sys/sysmacros.h>
     33    789     ahrens #include <sys/resource.h>
     34    789     ahrens #include <sys/vfs.h>
     35   3898        rsb #include <sys/vfs_opreg.h>
     36    789     ahrens #include <sys/vnode.h>
     37    789     ahrens #include <sys/file.h>
     38    789     ahrens #include <sys/stat.h>
     39    789     ahrens #include <sys/kmem.h>
     40    789     ahrens #include <sys/taskq.h>
     41    789     ahrens #include <sys/uio.h>
     42    789     ahrens #include <sys/vmsystm.h>
     43    789     ahrens #include <sys/atomic.h>
     44   2688     maybee #include <sys/vm.h>
     45    789     ahrens #include <vm/seg_vn.h>
     46    789     ahrens #include <vm/pvn.h>
     47    789     ahrens #include <vm/as.h>
     48   7315   Jonathan #include <vm/kpm.h>
     49   7315   Jonathan #include <vm/seg_kpm.h>
     50    789     ahrens #include <sys/mman.h>
     51    789     ahrens #include <sys/pathname.h>
     52    789     ahrens #include <sys/cmn_err.h>
     53    789     ahrens #include <sys/errno.h>
     54    789     ahrens #include <sys/unistd.h>
     55    789     ahrens #include <sys/zfs_dir.h>
     56    789     ahrens #include <sys/zfs_acl.h>
     57    789     ahrens #include <sys/zfs_ioctl.h>
     58    789     ahrens #include <sys/fs/zfs.h>
     59    789     ahrens #include <sys/dmu.h>
     60    789     ahrens #include <sys/spa.h>
     61    789     ahrens #include <sys/txg.h>
     62    789     ahrens #include <sys/dbuf.h>
     63    789     ahrens #include <sys/zap.h>
     64    789     ahrens #include <sys/dirent.h>
     65    789     ahrens #include <sys/policy.h>
     66    789     ahrens #include <sys/sunddi.h>
     67    789     ahrens #include <sys/filio.h>
     68   7847       Mark #include <sys/sid.h>
     69    789     ahrens #include "fs/fs_subr.h"
     70    789     ahrens #include <sys/zfs_ctldir.h>
     71   5331        amw #include <sys/zfs_fuid.h>
     72   1484   ek110237 #include <sys/dnlc.h>
     73   1669     perrin #include <sys/zfs_rlock.h>
     74   5331        amw #include <sys/extdirent.h>
     75   5331        amw #include <sys/kidmap.h>
     76  11134     Casper #include <sys/cred.h>
     77   5663   ck153898 #include <sys/attr.h>
     78    789     ahrens 
     79    789     ahrens /*
     80    789     ahrens  * Programming rules.
     81    789     ahrens  *
     82    789     ahrens  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
     83    789     ahrens  * properly lock its in-core state, create a DMU transaction, do the work,
     84    789     ahrens  * record this work in the intent log (ZIL), commit the DMU transaction,
     85   5331        amw  * and wait for the intent log to commit if it is a synchronous operation.
     86   5331        amw  * Moreover, the vnode ops must work in both normal and log replay context.
     87    789     ahrens  * The ordering of events is important to avoid deadlocks and references
     88    789     ahrens  * to freed memory.  The example below illustrates the following Big Rules:
     89    789     ahrens  *
     90    789     ahrens  *  (1) A check must be made in each zfs thread for a mounted file system.
     91   5367     ahrens  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
     92   5367     ahrens  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
     93   5367     ahrens  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
     94   5367     ahrens  *      can return EIO from the calling function.
     95    789     ahrens  *
     96    789     ahrens  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
     97   2638     perrin  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
     98    789     ahrens  *	First, if it's the last reference, the vnode/znode
     99    789     ahrens  *	can be freed, so the zp may point to freed memory.  Second, the last
    100    789     ahrens  *	reference will call zfs_zinactive(), which may induce a lot of work --
    101   1669     perrin  *	pushing cached pages (which acquires range locks) and syncing out
    102    789     ahrens  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
    103    789     ahrens  *	which could deadlock the system if you were already holding one.
    104   9321       Neil  *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
    105    789     ahrens  *
    106   1757     perrin  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
    107   1757     perrin  *	as they can span dmu_tx_assign() calls.
    108   1757     perrin  *
    109   8227       Neil  *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
    110    789     ahrens  *	This is critical because we don't want to block while holding locks.
    111    789     ahrens  *	Note, in particular, that if a lock is sometimes acquired before
    112    789     ahrens  *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
    113    789     ahrens  *	use a non-blocking assign can deadlock the system.  The scenario:
    114    789     ahrens  *
    115    789     ahrens  *	Thread A has grabbed a lock before calling dmu_tx_assign().
    116    789     ahrens  *	Thread B is in an already-assigned tx, and blocks for this lock.
    117    789     ahrens  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
    118    789     ahrens  *	forever, because the previous txg can't quiesce until B's tx commits.
    119    789     ahrens  *
 *	If dmu_tx_assign(tx, TXG_NOWAIT) returns ERESTART, then drop all locks,
 *	call dmu_tx_wait(), and try again.
    122    789     ahrens  *
    123   1757     perrin  *  (5)	If the operation succeeded, generate the intent log entry for it
    124    789     ahrens  *	before dropping locks.  This ensures that the ordering of events
    125    789     ahrens  *	in the intent log matches the order in which they actually occurred.
    126   8227       Neil  *      During ZIL replay the zfs_log_* functions will update the sequence
    127   8227       Neil  *	number to indicate the zil transaction has replayed.
    128    789     ahrens  *
    129   1757     perrin  *  (6)	At the end of each vnode op, the DMU tx must always commit,
    130    789     ahrens  *	regardless of whether there were any errors.
    131    789     ahrens  *
    132   2638     perrin  *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
    133    789     ahrens  *	to ensure that synchronous semantics are provided when necessary.
    134    789     ahrens  *
    135    789     ahrens  * In general, this is how things should be ordered in each vnode op:
    136    789     ahrens  *
    137    789     ahrens  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
    138    789     ahrens  * top:
    139    789     ahrens  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
    140    789     ahrens  *	rw_enter(...);			// grab any other locks you need
    141    789     ahrens  *	tx = dmu_tx_create(...);	// get DMU tx
    142    789     ahrens  *	dmu_tx_hold_*();		// hold each object you might modify
    143   8227       Neil  *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
    144    789     ahrens  *	if (error) {
    145    789     ahrens  *		rw_exit(...);		// drop locks
    146    789     ahrens  *		zfs_dirent_unlock(dl);	// unlock directory entry
    147    789     ahrens  *		VN_RELE(...);		// release held vnodes
    148   8227       Neil  *		if (error == ERESTART) {
    149   2113     ahrens  *			dmu_tx_wait(tx);
    150   2113     ahrens  *			dmu_tx_abort(tx);
    151    789     ahrens  *			goto top;
    152    789     ahrens  *		}
    153   2113     ahrens  *		dmu_tx_abort(tx);	// abort DMU tx
    154    789     ahrens  *		ZFS_EXIT(zfsvfs);	// finished in zfs
    155    789     ahrens  *		return (error);		// really out of space
    156    789     ahrens  *	}
    157    789     ahrens  *	error = do_real_work();		// do whatever this VOP does
    158    789     ahrens  *	if (error == 0)
    159   2638     perrin  *		zfs_log_*(...);		// on success, make ZIL entry
    160    789     ahrens  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
    161    789     ahrens  *	rw_exit(...);			// drop locks
    162    789     ahrens  *	zfs_dirent_unlock(dl);		// unlock directory entry
    163    789     ahrens  *	VN_RELE(...);			// release held vnodes
    164   2638     perrin  *	zil_commit(zilog, seq, foid);	// synchronous when necessary
    165    789     ahrens  *	ZFS_EXIT(zfsvfs);		// finished in zfs
    166    789     ahrens  *	return (error);			// done, report error
    167    789     ahrens  */
    168   5367     ahrens 
    169    789     ahrens /* ARGSUSED */
    170    789     ahrens static int
    171   5331        amw zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
    172    789     ahrens {
    173   3063     perrin 	znode_t	*zp = VTOZ(*vpp);
    174   7844       Mark 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    175   7844       Mark 
    176   7844       Mark 	ZFS_ENTER(zfsvfs);
    177   7844       Mark 	ZFS_VERIFY_ZP(zp);
    178   5331        amw 
    179   5331        amw 	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
    180   5331        amw 	    ((flag & FAPPEND) == 0)) {
    181   7844       Mark 		ZFS_EXIT(zfsvfs);
    182   5331        amw 		return (EPERM);
    183   5331        amw 	}
    184   5331        amw 
    185   5331        amw 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
    186   5331        amw 	    ZTOV(zp)->v_type == VREG &&
    187   5331        amw 	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
    188   7844       Mark 	    zp->z_phys->zp_size > 0) {
    189   7844       Mark 		if (fs_vscan(*vpp, cr, 0) != 0) {
    190   7844       Mark 			ZFS_EXIT(zfsvfs);
    191   5331        amw 			return (EACCES);
    192   7844       Mark 		}
    193   7844       Mark 	}
    194   3063     perrin 
    195   3063     perrin 	/* Keep a count of the synchronous opens in the znode */
    196   3063     perrin 	if (flag & (FSYNC | FDSYNC))
    197   3063     perrin 		atomic_inc_32(&zp->z_sync_cnt);
    198   5331        amw 
    199   7844       Mark 	ZFS_EXIT(zfsvfs);
    200   5331        amw 	return (0);
    201   5331        amw }
    202   5331        amw 
    203   5331        amw /* ARGSUSED */
    204   5331        amw static int
    205   5331        amw zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    206   5331        amw     caller_context_t *ct)
    207    789     ahrens {
    208   3063     perrin 	znode_t	*zp = VTOZ(vp);
    209   7844       Mark 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    210   7844       Mark 
    211   9909      chris 	/*
    212   9909      chris 	 * Clean up any locks held by this process on the vp.
    213   9909      chris 	 */
    214   9909      chris 	cleanlocks(vp, ddi_get_pid(), 0);
    215   9909      chris 	cleanshares(vp, ddi_get_pid());
    216   9909      chris 
    217   7844       Mark 	ZFS_ENTER(zfsvfs);
    218   7844       Mark 	ZFS_VERIFY_ZP(zp);
    219   3063     perrin 
    220   3063     perrin 	/* Decrement the synchronous opens in the znode */
    221   4339     perrin 	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
    222   3063     perrin 		atomic_dec_32(&zp->z_sync_cnt);
    223   5331        amw 
    224   5331        amw 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
    225   5331        amw 	    ZTOV(zp)->v_type == VREG &&
    226   5331        amw 	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
    227   5331        amw 	    zp->z_phys->zp_size > 0)
    228   5331        amw 		VERIFY(fs_vscan(vp, cr, 1) == 0);
    229    789     ahrens 
    230   7844       Mark 	ZFS_EXIT(zfsvfs);
    231    789     ahrens 	return (0);
    232    789     ahrens }
    233    789     ahrens 
    234    789     ahrens /*
    235    789     ahrens  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
    236    789     ahrens  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
    237    789     ahrens  */
    238    789     ahrens static int
    239    789     ahrens zfs_holey(vnode_t *vp, int cmd, offset_t *off)
    240    789     ahrens {
    241    789     ahrens 	znode_t	*zp = VTOZ(vp);
    242    789     ahrens 	uint64_t noff = (uint64_t)*off; /* new offset */
    243    789     ahrens 	uint64_t file_sz;
    244    789     ahrens 	int error;
    245    789     ahrens 	boolean_t hole;
    246    789     ahrens 
    247    789     ahrens 	file_sz = zp->z_phys->zp_size;
    248    789     ahrens 	if (noff >= file_sz)  {
    249    789     ahrens 		return (ENXIO);
    250    789     ahrens 	}
    251    789     ahrens 
    252    789     ahrens 	if (cmd == _FIO_SEEK_HOLE)
    253    789     ahrens 		hole = B_TRUE;
    254    789     ahrens 	else
    255    789     ahrens 		hole = B_FALSE;
    256    789     ahrens 
    257    789     ahrens 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
    258    789     ahrens 
    259    789     ahrens 	/* end of file? */
    260    789     ahrens 	if ((error == ESRCH) || (noff > file_sz)) {
    261    789     ahrens 		/*
    262    789     ahrens 		 * Handle the virtual hole at the end of file.
    263    789     ahrens 		 */
    264    789     ahrens 		if (hole) {
    265    789     ahrens 			*off = file_sz;
    266    789     ahrens 			return (0);
    267    789     ahrens 		}
    268    789     ahrens 		return (ENXIO);
    269    789     ahrens 	}
    270    789     ahrens 
    271    789     ahrens 	if (noff < *off)
    272    789     ahrens 		return (error);
    273    789     ahrens 	*off = noff;
    274    789     ahrens 	return (error);
    275    789     ahrens }
    276    789     ahrens 
    277    789     ahrens /* ARGSUSED */
    278    789     ahrens static int
    279    789     ahrens zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    280   5331        amw     int *rvalp, caller_context_t *ct)
    281    789     ahrens {
    282    789     ahrens 	offset_t off;
    283    789     ahrens 	int error;
    284    789     ahrens 	zfsvfs_t *zfsvfs;
    285   5326   ek110237 	znode_t *zp;
    286    789     ahrens 
    287    789     ahrens 	switch (com) {
    288   4339     perrin 	case _FIOFFS:
    289    789     ahrens 		return (zfs_sync(vp->v_vfsp, 0, cred));
    290    789     ahrens 
    291   1544   eschrock 		/*
    292   1544   eschrock 		 * The following two ioctls are used by bfu.  Faking out,
    293   1544   eschrock 		 * necessary to avoid bfu errors.
    294   1544   eschrock 		 */
    295   4339     perrin 	case _FIOGDIO:
    296   4339     perrin 	case _FIOSDIO:
    297   1544   eschrock 		return (0);
    298   1544   eschrock 
    299   4339     perrin 	case _FIO_SEEK_DATA:
    300   4339     perrin 	case _FIO_SEEK_HOLE:
    301    789     ahrens 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
    302    789     ahrens 			return (EFAULT);
    303    789     ahrens 
    304   5326   ek110237 		zp = VTOZ(vp);
    305   5326   ek110237 		zfsvfs = zp->z_zfsvfs;
    306   5367     ahrens 		ZFS_ENTER(zfsvfs);
    307   5367     ahrens 		ZFS_VERIFY_ZP(zp);
    308    789     ahrens 
    309    789     ahrens 		/* offset parameter is in/out */
    310    789     ahrens 		error = zfs_holey(vp, com, &off);
    311    789     ahrens 		ZFS_EXIT(zfsvfs);
    312    789     ahrens 		if (error)
    313    789     ahrens 			return (error);
    314    789     ahrens 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
    315    789     ahrens 			return (EFAULT);
    316    789     ahrens 		return (0);
    317    789     ahrens 	}
    318    789     ahrens 	return (ENOTTY);
    319    789     ahrens }
    320    789     ahrens 
    321    789     ahrens /*
    322   7315   Jonathan  * Utility functions to map and unmap a single physical page.  These
    323   7315   Jonathan  * are used to manage the mappable copies of ZFS file data, and therefore
    324   7315   Jonathan  * do not update ref/mod bits.
    325   7315   Jonathan  */
    326   7315   Jonathan caddr_t
    327   7315   Jonathan zfs_map_page(page_t *pp, enum seg_rw rw)
    328   7315   Jonathan {
    329   7315   Jonathan 	if (kpm_enable)
    330   7315   Jonathan 		return (hat_kpm_mapin(pp, 0));
    331   7315   Jonathan 	ASSERT(rw == S_READ || rw == S_WRITE);
    332   7315   Jonathan 	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
    333   7315   Jonathan 	    (caddr_t)-1));
    334   7315   Jonathan }
    335   7315   Jonathan 
    336   7315   Jonathan void
    337   7315   Jonathan zfs_unmap_page(page_t *pp, caddr_t addr)
    338   7315   Jonathan {
    339   7315   Jonathan 	if (kpm_enable) {
    340   7315   Jonathan 		hat_kpm_mapout(pp, 0, addr);
    341   7315   Jonathan 	} else {
    342   7315   Jonathan 		ppmapout(addr);
    343   7315   Jonathan 	}
    344   7315   Jonathan }
    345   7315   Jonathan 
    346   7315   Jonathan /*
    347    789     ahrens  * When a file is memory mapped, we must keep the IO data synchronized
    348    789     ahrens  * between the DMU cache and the memory mapped pages.  What this means:
    349    789     ahrens  *
    350    789     ahrens  * On Write:	If we find a memory mapped page, we write to *both*
    351    789     ahrens  *		the page and the dmu buffer.
    352   8636       Mark  */
    353   8636       Mark static void
    354   8636       Mark update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
    355   8636       Mark {
    356   8636       Mark 	int64_t	off;
    357   8636       Mark 
    358    789     ahrens 	off = start & PAGEOFFSET;
    359    789     ahrens 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
    360    789     ahrens 		page_t *pp;
    361   8636       Mark 		uint64_t nbytes = MIN(PAGESIZE - off, len);
    362   8636       Mark 
    363    789     ahrens 		if (pp = page_lookup(vp, start, SE_SHARED)) {
    364    789     ahrens 			caddr_t va;
    365    789     ahrens 
    366   7315   Jonathan 			va = zfs_map_page(pp, S_WRITE);
    367   9512       Neil 			(void) dmu_read(os, oid, start+off, nbytes, va+off,
    368   9512       Neil 			    DMU_READ_PREFETCH);
    369   7315   Jonathan 			zfs_unmap_page(pp, va);
    370    789     ahrens 			page_unlock(pp);
    371   8636       Mark 		}
    372   8636       Mark 		len -= nbytes;
    373    789     ahrens 		off = 0;
    374   8636       Mark 	}
    375    789     ahrens }
    376    789     ahrens 
    377    789     ahrens /*
    378    789     ahrens  * When a file is memory mapped, we must keep the IO data synchronized
    379    789     ahrens  * between the DMU cache and the memory mapped pages.  What this means:
    380    789     ahrens  *
    381    789     ahrens  * On Read:	We "read" preferentially from memory mapped pages,
    382    789     ahrens  *		else we default from the dmu buffer.
    383    789     ahrens  *
    384    789     ahrens  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
    385    789     ahrens  *	the file is memory mapped.
    386    789     ahrens  */
    387    789     ahrens static int
    388   3638      billm mappedread(vnode_t *vp, int nbytes, uio_t *uio)
    389    789     ahrens {
    390   3638      billm 	znode_t *zp = VTOZ(vp);
    391   3638      billm 	objset_t *os = zp->z_zfsvfs->z_os;
    392   3638      billm 	int64_t	start, off;
    393    789     ahrens 	int len = nbytes;
    394    789     ahrens 	int error = 0;
    395    789     ahrens 
    396    789     ahrens 	start = uio->uio_loffset;
    397    789     ahrens 	off = start & PAGEOFFSET;
    398    789     ahrens 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
    399    789     ahrens 		page_t *pp;
    400   3638      billm 		uint64_t bytes = MIN(PAGESIZE - off, len);
    401    789     ahrens 
    402    789     ahrens 		if (pp = page_lookup(vp, start, SE_SHARED)) {
    403    789     ahrens 			caddr_t va;
    404    789     ahrens 
    405   7315   Jonathan 			va = zfs_map_page(pp, S_READ);
    406    789     ahrens 			error = uiomove(va + off, bytes, UIO_READ, uio);
    407   7315   Jonathan 			zfs_unmap_page(pp, va);
    408    789     ahrens 			page_unlock(pp);
    409    789     ahrens 		} else {
    410   3638      billm 			error = dmu_read_uio(os, zp->z_id, uio, bytes);
    411    789     ahrens 		}
    412    789     ahrens 		len -= bytes;
    413    789     ahrens 		off = 0;
    414    789     ahrens 		if (error)
    415    789     ahrens 			break;
    416    789     ahrens 	}
    417    789     ahrens 	return (error);
    418    789     ahrens }
    419    789     ahrens 
    420   3638      billm offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
    421    789     ahrens 
    422    789     ahrens /*
    423    789     ahrens  * Read bytes from specified file into supplied buffer.
    424    789     ahrens  *
    425    789     ahrens  *	IN:	vp	- vnode of file to be read from.
    426    789     ahrens  *		uio	- structure supplying read location, range info,
    427    789     ahrens  *			  and return buffer.
    428    789     ahrens  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
    429    789     ahrens  *		cr	- credentials of caller.
    430   5331        amw  *		ct	- caller context
    431    789     ahrens  *
    432    789     ahrens  *	OUT:	uio	- updated offset and range, buffer filled.
    433    789     ahrens  *
    434    789     ahrens  *	RETURN:	0 if success
    435    789     ahrens  *		error code if failure
    436    789     ahrens  *
    437    789     ahrens  * Side Effects:
    438    789     ahrens  *	vp - atime updated if byte count > 0
    439    789     ahrens  */
    440    789     ahrens /* ARGSUSED */
    441    789     ahrens static int
    442    789     ahrens zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
    443    789     ahrens {
    444    789     ahrens 	znode_t		*zp = VTOZ(vp);
    445    789     ahrens 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
    446   5326   ek110237 	objset_t	*os;
    447   3638      billm 	ssize_t		n, nbytes;
    448   3638      billm 	int		error;
    449   1669     perrin 	rl_t		*rl;
    450    789     ahrens 
    451   5367     ahrens 	ZFS_ENTER(zfsvfs);
    452   5367     ahrens 	ZFS_VERIFY_ZP(zp);
    453   5326   ek110237 	os = zfsvfs->z_os;
    454    789     ahrens 
    455   5929      marks 	if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
    456   5929      marks 		ZFS_EXIT(zfsvfs);
    457   5929      marks 		return (EACCES);
    458   5929      marks 	}
    459   5929      marks 
    460    789     ahrens 	/*
    461    789     ahrens 	 * Validate file offset
    462    789     ahrens 	 */
    463    789     ahrens 	if (uio->uio_loffset < (offset_t)0) {
    464    789     ahrens 		ZFS_EXIT(zfsvfs);
    465    789     ahrens 		return (EINVAL);
    466    789     ahrens 	}
    467    789     ahrens 
    468    789     ahrens 	/*
    469    789     ahrens 	 * Fasttrack empty reads
    470    789     ahrens 	 */
    471    789     ahrens 	if (uio->uio_resid == 0) {
    472    789     ahrens 		ZFS_EXIT(zfsvfs);
    473    789     ahrens 		return (0);
    474    789     ahrens 	}
    475    789     ahrens 
    476    789     ahrens 	/*
    477   1669     perrin 	 * Check for mandatory locks
    478    789     ahrens 	 */
    479    789     ahrens 	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
    480    789     ahrens 		if (error = chklock(vp, FREAD,
    481    789     ahrens 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
    482    789     ahrens 			ZFS_EXIT(zfsvfs);
    483    789     ahrens 			return (error);
    484    789     ahrens 		}
    485    789     ahrens 	}
    486    789     ahrens 
    487    789     ahrens 	/*
    488    789     ahrens 	 * If we're in FRSYNC mode, sync out this znode before reading it.
    489    789     ahrens 	 */
    490   2638     perrin 	if (ioflag & FRSYNC)
    491   2638     perrin 		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
    492    789     ahrens 
    493    789     ahrens 	/*
    494   1669     perrin 	 * Lock the range against changes.
    495    789     ahrens 	 */
    496   1669     perrin 	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
    497   1669     perrin 
    498    789     ahrens 	/*
    499    789     ahrens 	 * If we are reading past end-of-file we can skip
    500    789     ahrens 	 * to the end; but we might still need to set atime.
    501    789     ahrens 	 */
    502    789     ahrens 	if (uio->uio_loffset >= zp->z_phys->zp_size) {
    503    789     ahrens 		error = 0;
    504    789     ahrens 		goto out;
    505    789     ahrens 	}
    506    789     ahrens 
    507   3638      billm 	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
    508   3638      billm 	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
    509    789     ahrens 
    510   3638      billm 	while (n > 0) {
    511   3638      billm 		nbytes = MIN(n, zfs_read_chunk_size -
    512   3638      billm 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
    513   3638      billm 
    514   3638      billm 		if (vn_has_cached_data(vp))
    515   3638      billm 			error = mappedread(vp, nbytes, uio);
    516   3638      billm 		else
    517   3638      billm 			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
    518   7294     perrin 		if (error) {
    519   7294     perrin 			/* convert checksum errors into IO errors */
    520   7294     perrin 			if (error == ECKSUM)
    521   7294     perrin 				error = EIO;
    522   7294     perrin 			break;
    523   7294     perrin 		}
    524    789     ahrens 
    525   3638      billm 		n -= nbytes;
    526    789     ahrens 	}
    527   3638      billm 
    528    789     ahrens out:
    529   2237     maybee 	zfs_range_unlock(rl);
    530    789     ahrens 
    531    789     ahrens 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
    532    789     ahrens 	ZFS_EXIT(zfsvfs);
    533    789     ahrens 	return (error);
    534    789     ahrens }
    535    789     ahrens 
    536    789     ahrens /*
    537    789     ahrens  * Write the bytes to a file.
    538    789     ahrens  *
    539    789     ahrens  *	IN:	vp	- vnode of file to be written to.
    540    789     ahrens  *		uio	- structure supplying write location, range info,
    541    789     ahrens  *			  and data buffer.
    542    789     ahrens  *		ioflag	- FAPPEND flag set if in append mode.
    543    789     ahrens  *		cr	- credentials of caller.
    544   5331        amw  *		ct	- caller context (NFS/CIFS fem monitor only)
    545    789     ahrens  *
    546    789     ahrens  *	OUT:	uio	- updated offset and range.
    547    789     ahrens  *
    548    789     ahrens  *	RETURN:	0 if success
    549    789     ahrens  *		error code if failure
    550    789     ahrens  *
    551    789     ahrens  * Timestamps:
    552    789     ahrens  *	vp - ctime|mtime updated if byte count > 0
    553    789     ahrens  */
    554    789     ahrens /* ARGSUSED */
    555    789     ahrens static int
    556    789     ahrens zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
    557    789     ahrens {
    558    789     ahrens 	znode_t		*zp = VTOZ(vp);
    559    789     ahrens 	rlim64_t	limit = uio->uio_llimit;
    560    789     ahrens 	ssize_t		start_resid = uio->uio_resid;
    561    789     ahrens 	ssize_t		tx_bytes;
    562    789     ahrens 	uint64_t	end_size;
    563    789     ahrens 	dmu_tx_t	*tx;
    564    789     ahrens 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
    565   5326   ek110237 	zilog_t		*zilog;
    566    789     ahrens 	offset_t	woff;
    567    789     ahrens 	ssize_t		n, nbytes;
    568   1669     perrin 	rl_t		*rl;
    569    789     ahrens 	int		max_blksz = zfsvfs->z_max_blksz;
    570   6743      marks 	uint64_t	pflags;
    571   6743      marks 	int		error;
    572   9412  Aleksandr 	arc_buf_t	*abuf;
    573   6743      marks 
    574   6743      marks 	/*
    575   6743      marks 	 * Fasttrack empty write
    576   6743      marks 	 */
    577   6743      marks 	n = start_resid;
    578   6743      marks 	if (n == 0)
    579   6743      marks 		return (0);
    580   6743      marks 
    581   6743      marks 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
    582   6743      marks 		limit = MAXOFFSET_T;
    583   6743      marks 
    584   6743      marks 	ZFS_ENTER(zfsvfs);
    585   6743      marks 	ZFS_VERIFY_ZP(zp);
    586   5331        amw 
    587   5331        amw 	/*
    588   5331        amw 	 * If immutable or not appending then return EPERM
    589   5331        amw 	 */
    590   6743      marks 	pflags = zp->z_phys->zp_flags;
    591   5331        amw 	if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
    592   5331        amw 	    ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
    593   6743      marks 	    (uio->uio_loffset < zp->z_phys->zp_size))) {
    594   6743      marks 		ZFS_EXIT(zfsvfs);
    595   6743      marks 		return (EPERM);
    596   6743      marks 	}
    597   6743      marks 
    598   5326   ek110237 	zilog = zfsvfs->z_log;
    599    789     ahrens 
    600    789     ahrens 	/*
    601  11083    william 	 * Validate file offset
    602  11083    william 	 */
    603  11083    william 	woff = ioflag & FAPPEND ? zp->z_phys->zp_size : uio->uio_loffset;
    604  11083    william 	if (woff < 0) {
    605  11083    william 		ZFS_EXIT(zfsvfs);
    606  11083    william 		return (EINVAL);
    607  11083    william 	}
    608  11083    william 
    609  11083    william 	/*
    610  11083    william 	 * Check for mandatory locks before calling zfs_range_lock()
    611  11083    william 	 * in order to prevent a deadlock with locks set via fcntl().
    612  11083    william 	 */
    613  11083    william 	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
    614  11083    william 	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
    615  11083    william 		ZFS_EXIT(zfsvfs);
    616  11083    william 		return (error);
    617  11083    william 	}
    618  11083    william 
    619  11083    william 	/*
    620   2237     maybee 	 * Pre-fault the pages to ensure slow (eg NFS) pages
    621   1669     perrin 	 * don't hold up txg.
    622    789     ahrens 	 */
    623   8059    Donghai 	uio_prefaultpages(n, uio);
    624    789     ahrens 
    625    789     ahrens 	/*
    626    789     ahrens 	 * If in append mode, set the io offset pointer to eof.
    627    789     ahrens 	 */
    628   1669     perrin 	if (ioflag & FAPPEND) {
    629   1669     perrin 		/*
    630  11083    william 		 * Obtain an appending range lock to guarantee file append
    631  11083    william 		 * semantics.  We reset the write offset once we have the lock.
    632   1669     perrin 		 */
    633   1669     perrin 		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
    634  11083    william 		woff = rl->r_off;
    635   1669     perrin 		if (rl->r_len == UINT64_MAX) {
    636  11083    william 			/*
    637  11083    william 			 * We overlocked the file because this write will cause
    638  11083    william 			 * the file block size to increase.
    639  11083    william 			 * Note that zp_size cannot change with this lock held.
    640  11083    william 			 */
    641  11083    william 			woff = zp->z_phys->zp_size;
    642  11083    william 		}
    643  11083    william 		uio->uio_loffset = woff;
    644  11083    william 	} else {
    645  11083    william 		/*
    646  11083    william 		 * Note that if the file block size will change as a result of
    647  11083    william 		 * this write, then this range lock will lock the entire file
    648  11083    william 		 * so that we can re-write the block safely.
    649    789     ahrens 		 */
    650   1669     perrin 		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
    651    789     ahrens 	}
    652    789     ahrens 
    653    789     ahrens 	if (woff >= limit) {
    654   3638      billm 		zfs_range_unlock(rl);
    655   3638      billm 		ZFS_EXIT(zfsvfs);
    656   3638      billm 		return (EFBIG);
    657    789     ahrens 	}
    658    789     ahrens 
    659    789     ahrens 	if ((woff + n) > limit || woff > (limit - n))
    660    789     ahrens 		n = limit - woff;
    661    789     ahrens 
    662   1669     perrin 	end_size = MAX(zp->z_phys->zp_size, woff + n);
    663    789     ahrens 
    664   1669     perrin 	/*
    665   3638      billm 	 * Write the file in reasonable size chunks.  Each chunk is written
    666   3638      billm 	 * in a separate transaction; this keeps the intent log records small
    667   3638      billm 	 * and allows us to do more fine-grained space accounting.
    668    789     ahrens 	 */
    669    789     ahrens 	while (n > 0) {
    670   9412  Aleksandr 		abuf = NULL;
    671   9412  Aleksandr 		woff = uio->uio_loffset;
    672   9412  Aleksandr 
    673   9412  Aleksandr again:
    674   9396    Matthew 		if (zfs_usergroup_overquota(zfsvfs,
    675   9396    Matthew 		    B_FALSE, zp->z_phys->zp_uid) ||
    676   9396    Matthew 		    zfs_usergroup_overquota(zfsvfs,
    677   9396    Matthew 		    B_TRUE, zp->z_phys->zp_gid)) {
    678   9412  Aleksandr 			if (abuf != NULL)
    679   9412  Aleksandr 				dmu_return_arcbuf(abuf);
    680   9396    Matthew 			error = EDQUOT;
    681   9396    Matthew 			break;
    682   9396    Matthew 		}
    683   9412  Aleksandr 
    684   9412  Aleksandr 		/*
    685   9412  Aleksandr 		 * If dmu_assign_arcbuf() is expected to execute with minimum
    686   9412  Aleksandr 		 * overhead loan an arc buffer and copy user data to it before
    687   9412  Aleksandr 		 * we enter a txg.  This avoids holding a txg forever while we
    688   9412  Aleksandr 		 * pagefault on a hanging NFS server mapping.
    689   9412  Aleksandr 		 */
    690   9412  Aleksandr 		if (abuf == NULL && n >= max_blksz &&
    691   9412  Aleksandr 		    woff >= zp->z_phys->zp_size &&
    692   9412  Aleksandr 		    P2PHASE(woff, max_blksz) == 0 &&
    693   9412  Aleksandr 		    zp->z_blksz == max_blksz) {
    694   9412  Aleksandr 			size_t cbytes;
    695   9412  Aleksandr 
    696   9412  Aleksandr 			abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
    697   9412  Aleksandr 			ASSERT(abuf != NULL);
    698   9412  Aleksandr 			ASSERT(arc_buf_size(abuf) == max_blksz);
    699   9412  Aleksandr 			if (error = uiocopy(abuf->b_data, max_blksz,
    700   9412  Aleksandr 			    UIO_WRITE, uio, &cbytes)) {
    701   9412  Aleksandr 				dmu_return_arcbuf(abuf);
    702   9412  Aleksandr 				break;
    703   9412  Aleksandr 			}
    704   9412  Aleksandr 			ASSERT(cbytes == max_blksz);
    705   9412  Aleksandr 		}
    706   9412  Aleksandr 
    707   9412  Aleksandr 		/*
    708   9412  Aleksandr 		 * Start a transaction.
    709   9412  Aleksandr 		 */
    710   3638      billm 		tx = dmu_tx_create(zfsvfs->z_os);
    711   3638      billm 		dmu_tx_hold_bonus(tx, zp->z_id);
    712   3638      billm 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
    713   8227       Neil 		error = dmu_tx_assign(tx, TXG_NOWAIT);
    714   8227       Neil 		if (error) {
    715   8227       Neil 			if (error == ERESTART) {
    716   3638      billm 				dmu_tx_wait(tx);
    717   3638      billm 				dmu_tx_abort(tx);
    718   9412  Aleksandr 				goto again;
    719   9412  Aleksandr 			}
    720   9412  Aleksandr 			dmu_tx_abort(tx);
    721   9412  Aleksandr 			if (abuf != NULL)
    722   9412  Aleksandr 				dmu_return_arcbuf(abuf);
    723   3638      billm 			break;
    724   3638      billm 		}
    725   3638      billm 
    726   3638      billm 		/*
    727   3638      billm 		 * If zfs_range_lock() over-locked we grow the blocksize
    728   3638      billm 		 * and then reduce the lock range.  This will only happen
    729   3638      billm 		 * on the first iteration since zfs_range_reduce() will
    730   3638      billm 		 * shrink down r_len to the appropriate size.
    731   3638      billm 		 */
    732   3638      billm 		if (rl->r_len == UINT64_MAX) {
    733   3638      billm 			uint64_t new_blksz;
    734   3638      billm 
    735   3638      billm 			if (zp->z_blksz > max_blksz) {
    736   3638      billm 				ASSERT(!ISP2(zp->z_blksz));
    737   3638      billm 				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
    738   3638      billm 			} else {
    739   3638      billm 				new_blksz = MIN(end_size, max_blksz);
    740   3638      billm 			}
    741   3638      billm 			zfs_grow_blocksize(zp, new_blksz, tx);
    742   3638      billm 			zfs_range_reduce(rl, woff, n);
    743   3638      billm 		}
    744   3638      billm 
    745    789     ahrens 		/*
    746    789     ahrens 		 * XXX - should we really limit each write to z_max_blksz?
    747    789     ahrens 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
    748    789     ahrens 		 */
    749    789     ahrens 		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
    750    789     ahrens 
    751   9412  Aleksandr 		if (abuf == NULL) {
    752   9412  Aleksandr 			tx_bytes = uio->uio_resid;
    753   9412  Aleksandr 			error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
    754   9412  Aleksandr 			    nbytes, tx);
    755   9412  Aleksandr 			tx_bytes -= uio->uio_resid;
    756   9412  Aleksandr 		} else {
    757   9412  Aleksandr 			tx_bytes = nbytes;
    758   9412  Aleksandr 			ASSERT(tx_bytes == max_blksz);
    759   9412  Aleksandr 			dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
    760   9412  Aleksandr 			ASSERT(tx_bytes <= uio->uio_resid);
    761   9412  Aleksandr 			uioskip(uio, tx_bytes);
    762   9412  Aleksandr 		}
    763   9412  Aleksandr 		if (tx_bytes && vn_has_cached_data(vp)) {
    764   8636       Mark 			update_pages(vp, woff,
    765   8636       Mark 			    tx_bytes, zfsvfs->z_os, zp->z_id);
    766   9412  Aleksandr 		}
    767    789     ahrens 
    768   3638      billm 		/*
    769   3638      billm 		 * If we made no progress, we're done.  If we made even
    770   3638      billm 		 * partial progress, update the znode and ZIL accordingly.
    771   3638      billm 		 */
    772   3638      billm 		if (tx_bytes == 0) {
    773   3897     maybee 			dmu_tx_commit(tx);
    774   3638      billm 			ASSERT(error != 0);
    775    789     ahrens 			break;
    776    789     ahrens 		}
    777   1576      marks 
    778   1576      marks 		/*
    779   1576      marks 		 * Clear Set-UID/Set-GID bits on successful write if not
     780   1576      marks 		 * privileged and at least one of the execute bits is set.
     781   1576      marks 		 *
     782   1576      marks 		 * It would be nice to do this after all writes have
    783   1576      marks 		 * been done, but that would still expose the ISUID/ISGID
    784   1576      marks 		 * to another app after the partial write is committed.
    785   5331        amw 		 *
    786   5331        amw 		 * Note: we don't call zfs_fuid_map_id() here because
    787   5331        amw 		 * user 0 is not an ephemeral uid.
    788   1576      marks 		 */
    789   1576      marks 		mutex_enter(&zp->z_acl_lock);
    790   1576      marks 		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
    791   1576      marks 		    (S_IXUSR >> 6))) != 0 &&
    792   1576      marks 		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
    793   1576      marks 		    secpolicy_vnode_setid_retain(cr,
    794   1576      marks 		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
    795   1576      marks 		    zp->z_phys->zp_uid == 0) != 0) {
    796   4339     perrin 			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
    797   1576      marks 		}
    798   1576      marks 		mutex_exit(&zp->z_acl_lock);
    799    789     ahrens 
    800   3638      billm 		/*
    801   3638      billm 		 * Update time stamp.  NOTE: This marks the bonus buffer as
    802   3638      billm 		 * dirty, so we don't have to do it again for zp_size.
    803   3638      billm 		 */
    804   3638      billm 		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
    805    789     ahrens 
    806    789     ahrens 		/*
    807   3638      billm 		 * Update the file size (zp_size) if it has changed;
    808   3638      billm 		 * account for possible concurrent updates.
    809    789     ahrens 		 */
    810   3638      billm 		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
    811    789     ahrens 			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
    812    789     ahrens 			    uio->uio_loffset);
    813   3638      billm 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
    814    789     ahrens 		dmu_tx_commit(tx);
    815    789     ahrens 
    816   3638      billm 		if (error != 0)
    817   3638      billm 			break;
    818   3638      billm 		ASSERT(tx_bytes == nbytes);
    819   3638      billm 		n -= nbytes;
    820    789     ahrens 	}
    821    789     ahrens 
    822   2237     maybee 	zfs_range_unlock(rl);
    823    789     ahrens 
    824    789     ahrens 	/*
    825    789     ahrens 	 * If we're in replay mode, or we made no progress, return error.
    826    789     ahrens 	 * Otherwise, it's at least a partial write, so it's successful.
    827    789     ahrens 	 */
    828   8227       Neil 	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
    829    789     ahrens 		ZFS_EXIT(zfsvfs);
    830    789     ahrens 		return (error);
    831    789     ahrens 	}
    832    789     ahrens 
    833   2638     perrin 	if (ioflag & (FSYNC | FDSYNC))
    834   2638     perrin 		zil_commit(zilog, zp->z_last_itx, zp->z_id);
    835    789     ahrens 
    836    789     ahrens 	ZFS_EXIT(zfsvfs);
    837    789     ahrens 	return (0);
    838    789     ahrens }
    839    789     ahrens 
    840   2237     maybee void
    841  10922       Jeff zfs_get_done(zgd_t *zgd, int error)
    842  10922       Jeff {
    843  10922       Jeff 	znode_t *zp = zgd->zgd_private;
    844  10922       Jeff 	objset_t *os = zp->z_zfsvfs->z_os;
    845  10922       Jeff 
    846  10922       Jeff 	if (zgd->zgd_db)
    847  10922       Jeff 		dmu_buf_rele(zgd->zgd_db, zgd);
    848  10922       Jeff 
    849  10922       Jeff 	zfs_range_unlock(zgd->zgd_rl);
    850  10922       Jeff 
    851   9321       Neil 	/*
    852   9321       Neil 	 * Release the vnode asynchronously as we currently have the
    853   9321       Neil 	 * txg stopped from syncing.
    854   9321       Neil 	 */
    855  10922       Jeff 	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
    856  10922       Jeff 
    857  10922       Jeff 	if (error == 0 && zgd->zgd_bp)
    858  10922       Jeff 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
    859  10922       Jeff 
    860   3063     perrin 	kmem_free(zgd, sizeof (zgd_t));
    861   2237     maybee }
    862   2237     maybee 
    863  10209       Mark #ifdef DEBUG
    864  10209       Mark static int zil_fault_io = 0;
    865  10209       Mark #endif
    866  10209       Mark 
    867    789     ahrens /*
    868    789     ahrens  * Get data to generate a TX_WRITE intent log record.
    869    789     ahrens  */
    870    789     ahrens int
    871   2237     maybee zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
    872    789     ahrens {
    873    789     ahrens 	zfsvfs_t *zfsvfs = arg;
    874    789     ahrens 	objset_t *os = zfsvfs->z_os;
    875    789     ahrens 	znode_t *zp;
    876  10922       Jeff 	uint64_t object = lr->lr_foid;
    877  10922       Jeff 	uint64_t offset = lr->lr_offset;
    878  10922       Jeff 	uint64_t size = lr->lr_length;
    879  10922       Jeff 	blkptr_t *bp = &lr->lr_blkptr;
    880   2237     maybee 	dmu_buf_t *db;
    881   3063     perrin 	zgd_t *zgd;
    882    789     ahrens 	int error = 0;
    883    789     ahrens 
    884  10922       Jeff 	ASSERT(zio != NULL);
    885  10922       Jeff 	ASSERT(size != 0);
    886    789     ahrens 
    887    789     ahrens 	/*
    888   1669     perrin 	 * Nothing to do if the file has been removed
    889    789     ahrens 	 */
    890  10922       Jeff 	if (zfs_zget(zfsvfs, object, &zp) != 0)
    891    789     ahrens 		return (ENOENT);
    892   3461     ahrens 	if (zp->z_unlinked) {
    893   9321       Neil 		/*
    894   9321       Neil 		 * Release the vnode asynchronously as we currently have the
    895   9321       Neil 		 * txg stopped from syncing.
    896   9321       Neil 		 */
    897   9321       Neil 		VN_RELE_ASYNC(ZTOV(zp),
    898   9321       Neil 		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
    899    789     ahrens 		return (ENOENT);
    900    789     ahrens 	}
    901  10922       Jeff 
    902  10922       Jeff 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
    903  10922       Jeff 	zgd->zgd_zilog = zfsvfs->z_log;
    904  10922       Jeff 	zgd->zgd_private = zp;
    905    789     ahrens 
    906    789     ahrens 	/*
    907    789     ahrens 	 * Write records come in two flavors: immediate and indirect.
    908    789     ahrens 	 * For small writes it's cheaper to store the data with the
    909    789     ahrens 	 * log record (immediate); for large writes it's cheaper to
    910    789     ahrens 	 * sync the data and get a pointer to it (indirect) so that
    911    789     ahrens 	 * we don't have to write the data twice.
    912    789     ahrens 	 */
    913   1669     perrin 	if (buf != NULL) { /* immediate write */
    914  10922       Jeff 		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
    915   1669     perrin 		/* test for truncation needs to be done while range locked */
    916  10922       Jeff 		if (offset >= zp->z_phys->zp_size) {
    917   1669     perrin 			error = ENOENT;
    918  10922       Jeff 		} else {
    919  10922       Jeff 			error = dmu_read(os, object, offset, size, buf,
    920  10922       Jeff 			    DMU_READ_NO_PREFETCH);
    921  10922       Jeff 		}
    922  10922       Jeff 		ASSERT(error == 0 || error == ENOENT);
    923   1669     perrin 	} else { /* indirect write */
    924    789     ahrens 		/*
    925   1669     perrin 		 * Have to lock the whole block to ensure when it's
    926   1669     perrin 		 * written out and it's checksum is being calculated
    927   1669     perrin 		 * that no one can change the data. We need to re-check
    928   1669     perrin 		 * blocksize after we get the lock in case it's changed!
    929    789     ahrens 		 */
    930   1669     perrin 		for (;;) {
    931  10922       Jeff 			uint64_t blkoff;
    932  10922       Jeff 			size = zp->z_blksz;
    933  10945       Jeff 			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
    934  10922       Jeff 			offset -= blkoff;
    935  10922       Jeff 			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
    936  10922       Jeff 			    RL_READER);
    937  10922       Jeff 			if (zp->z_blksz == size)
    938   1669     perrin 				break;
    939  10922       Jeff 			offset += blkoff;
    940  10922       Jeff 			zfs_range_unlock(zgd->zgd_rl);
    941   1669     perrin 		}
    942   1669     perrin 		/* test for truncation needs to be done while range locked */
    943  10945       Jeff 		if (lr->lr_offset >= zp->z_phys->zp_size)
    944   1669     perrin 			error = ENOENT;
    945  10209       Mark #ifdef DEBUG
    946  10209       Mark 		if (zil_fault_io) {
    947  10209       Mark 			error = EIO;
    948  10209       Mark 			zil_fault_io = 0;
    949  10922       Jeff 		}
    950  10209       Mark #endif
    951  10922       Jeff 		if (error == 0)
    952  10922       Jeff 			error = dmu_buf_hold(os, object, offset, zgd, &db);
    953  10922       Jeff 
    954  10800       Neil 		if (error == 0) {
    955  10922       Jeff 			zgd->zgd_db = db;
    956  10922       Jeff 			zgd->zgd_bp = bp;
    957  10922       Jeff 
    958  10922       Jeff 			ASSERT(db->db_offset == offset);
    959  10922       Jeff 			ASSERT(db->db_size == size);
    960  10922       Jeff 
    961  10922       Jeff 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
    962  10922       Jeff 			    zfs_get_done, zgd);
    963  10922       Jeff 			ASSERT(error || lr->lr_length <= zp->z_blksz);
    964  10922       Jeff 
    965  10922       Jeff 			/*
    966  10922       Jeff 			 * On success, we need to wait for the write I/O
    967  10922       Jeff 			 * initiated by dmu_sync() to complete before we can
    968  10922       Jeff 			 * release this dbuf.  We will finish everything up
    969  10922       Jeff 			 * in the zfs_get_done() callback.
    970  10922       Jeff 			 */
    971  10922       Jeff 			if (error == 0)
    972  10922       Jeff 				return (0);
    973  10922       Jeff 
    974  10922       Jeff 			if (error == EALREADY) {
    975  10922       Jeff 				lr->lr_common.lrc_txtype = TX_WRITE2;
    976  10922       Jeff 				error = 0;
    977  10922       Jeff 			}
    978  10922       Jeff 		}
    979  10922       Jeff 	}
    980  10922       Jeff 
    981  10922       Jeff 	zfs_get_done(zgd, error);
    982  10922       Jeff 
    983    789     ahrens 	return (error);
    984    789     ahrens }
    985    789     ahrens 
    986    789     ahrens /*ARGSUSED*/
    987    789     ahrens static int
    988   5331        amw zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    989   5331        amw     caller_context_t *ct)
    990   5331        amw {
    991   5331        amw 	znode_t *zp = VTOZ(vp);
    992   5331        amw 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    993   5331        amw 	int error;
    994   5331        amw 
    995   5367     ahrens 	ZFS_ENTER(zfsvfs);
    996   5367     ahrens 	ZFS_VERIFY_ZP(zp);
    997   5331        amw 
    998   5331        amw 	if (flag & V_ACE_MASK)
    999   5331        amw 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
   1000   5331        amw 	else
   1001   5331        amw 		error = zfs_zaccess_rwx(zp, mode, flag, cr);
   1002   5331        amw 
   1003    789     ahrens 	ZFS_EXIT(zfsvfs);
   1004    789     ahrens 	return (error);
   1005    789     ahrens }
   1006    789     ahrens 
   1007    789     ahrens /*
   1008   9981        Tim  * If vnode is for a device return a specfs vnode instead.
   1009   9981        Tim  */
   1010   9981        Tim static int
   1011   9981        Tim specvp_check(vnode_t **vpp, cred_t *cr)
   1012   9981        Tim {
   1013   9981        Tim 	int error = 0;
   1014   9981        Tim 
   1015   9981        Tim 	if (IS_DEVVP(*vpp)) {
   1016   9981        Tim 		struct vnode *svp;
   1017   9981        Tim 
   1018   9981        Tim 		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
   1019   9981        Tim 		VN_RELE(*vpp);
   1020   9981        Tim 		if (svp == NULL)
   1021   9981        Tim 			error = ENOSYS;
   1022   9981        Tim 		*vpp = svp;
   1023   9981        Tim 	}
   1024   9981        Tim 	return (error);
   1025   9981        Tim }
   1026   9981        Tim 
   1027   9981        Tim 
   1028   9981        Tim /*
   1029    789     ahrens  * Lookup an entry in a directory, or an extended attribute directory.
   1030    789     ahrens  * If it exists, return a held vnode reference for it.
   1031    789     ahrens  *
   1032    789     ahrens  *	IN:	dvp	- vnode of directory to search.
   1033    789     ahrens  *		nm	- name of entry to lookup.
   1034    789     ahrens  *		pnp	- full pathname to lookup [UNUSED].
   1035    789     ahrens  *		flags	- LOOKUP_XATTR set if looking for an attribute.
   1036    789     ahrens  *		rdir	- root directory vnode [UNUSED].
   1037    789     ahrens  *		cr	- credentials of caller.
   1038   5331        amw  *		ct	- caller context
   1039   5331        amw  *		direntflags - directory lookup flags
   1040   5331        amw  *		realpnp - returned pathname.
   1041    789     ahrens  *
   1042    789     ahrens  *	OUT:	vpp	- vnode of located entry, NULL if not found.
   1043    789     ahrens  *
   1044    789     ahrens  *	RETURN:	0 if success
   1045    789     ahrens  *		error code if failure
   1046    789     ahrens  *
   1047    789     ahrens  * Timestamps:
   1048    789     ahrens  *	NA
   1049    789     ahrens  */
   1050    789     ahrens /* ARGSUSED */
   1051    789     ahrens static int
   1052    789     ahrens zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
   1053   5331        amw     int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
   1054   5331        amw     int *direntflags, pathname_t *realpnp)
   1055   5331        amw {
   1056    789     ahrens 	znode_t *zdp = VTOZ(dvp);
   1057    789     ahrens 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
   1058   9981        Tim 	int	error = 0;
   1059   9981        Tim 
   1060   9981        Tim 	/* fast path */
   1061   9981        Tim 	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
   1062   9981        Tim 
   1063   9981        Tim 		if (dvp->v_type != VDIR) {
   1064   9981        Tim 			return (ENOTDIR);
   1065   9981        Tim 		} else if (zdp->z_dbuf == NULL) {
   1066   9981        Tim 			return (EIO);
   1067   9981        Tim 		}
   1068   9981        Tim 
   1069   9981        Tim 		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
   1070   9981        Tim 			error = zfs_fastaccesschk_execute(zdp, cr);
   1071   9981        Tim 			if (!error) {
   1072   9981        Tim 				*vpp = dvp;
   1073   9981        Tim 				VN_HOLD(*vpp);
   1074   9981        Tim 				return (0);
   1075   9981        Tim 			}
   1076   9981        Tim 			return (error);
   1077   9981        Tim 		} else {
   1078   9981        Tim 			vnode_t *tvp = dnlc_lookup(dvp, nm);
   1079   9981        Tim 
   1080   9981        Tim 			if (tvp) {
   1081   9981        Tim 				error = zfs_fastaccesschk_execute(zdp, cr);
   1082   9981        Tim 				if (error) {
   1083   9981        Tim 					VN_RELE(tvp);
   1084   9981        Tim 					return (error);
   1085   9981        Tim 				}
   1086   9981        Tim 				if (tvp == DNLC_NO_VNODE) {
   1087   9981        Tim 					VN_RELE(tvp);
   1088   9981        Tim 					return (ENOENT);
   1089   9981        Tim 				} else {
   1090   9981        Tim 					*vpp = tvp;
   1091   9981        Tim 					return (specvp_check(vpp, cr));
   1092   9981        Tim 				}
   1093   9981        Tim 			}
   1094   9981        Tim 		}
   1095   9981        Tim 	}
   1096   9981        Tim 
   1097   9981        Tim 	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
   1098    789     ahrens 
   1099   5367     ahrens 	ZFS_ENTER(zfsvfs);
   1100   5367     ahrens 	ZFS_VERIFY_ZP(zdp);
   1101    789     ahrens 
   1102    789     ahrens 	*vpp = NULL;
   1103    789     ahrens 
   1104    789     ahrens 	if (flags & LOOKUP_XATTR) {
   1105   3234   ck153898 		/*
   1106   3234   ck153898 		 * If the xattr property is off, refuse the lookup request.
   1107   3234   ck153898 		 */
   1108   3234   ck153898 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
   1109   3234   ck153898 			ZFS_EXIT(zfsvfs);
   1110   3234   ck153898 			return (EINVAL);
   1111   3234   ck153898 		}
   1112   3234   ck153898 
   1113    789     ahrens 		/*
   1114    789     ahrens 		 * We don't allow recursive attributes..
   1115    789     ahrens 		 * Maybe someday we will.
   1116    789     ahrens 		 */
   1117    789     ahrens 		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
   1118    789     ahrens 			ZFS_EXIT(zfsvfs);
   1119    789     ahrens 			return (EINVAL);
   1120    789     ahrens 		}
   1121    789     ahrens 
   1122   3280   ck153898 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
   1123    789     ahrens 			ZFS_EXIT(zfsvfs);
   1124    789     ahrens 			return (error);
   1125    789     ahrens 		}
   1126    789     ahrens 
   1127    789     ahrens 		/*
   1128    789     ahrens 		 * Do we have permission to get into attribute directory?
   1129    789     ahrens 		 */
   1130    789     ahrens 
   1131   5331        amw 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
   1132   5331        amw 		    B_FALSE, cr)) {
   1133    789     ahrens 			VN_RELE(*vpp);
   1134   5331        amw 			*vpp = NULL;
   1135    789     ahrens 		}
   1136    789     ahrens 
   1137    789     ahrens 		ZFS_EXIT(zfsvfs);
   1138    789     ahrens 		return (error);
   1139    789     ahrens 	}
   1140   1460      marks 
   1141   1512   ek110237 	if (dvp->v_type != VDIR) {
   1142   1512   ek110237 		ZFS_EXIT(zfsvfs);
   1143   1460      marks 		return (ENOTDIR);
   1144   1512   ek110237 	}
   1145    789     ahrens 
   1146    789     ahrens 	/*
   1147    789     ahrens 	 * Check accessibility of directory.
   1148    789     ahrens 	 */
   1149    789     ahrens 
   1150   5331        amw 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
   1151   5331        amw 		ZFS_EXIT(zfsvfs);
   1152   5331        amw 		return (error);
   1153   5331        amw 	}
   1154   5331        amw 
   1155   5498       timh 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
   1156   5331        amw 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
   1157   5331        amw 		ZFS_EXIT(zfsvfs);
   1158   5331        amw 		return (EILSEQ);
   1159   5331        amw 	}
   1160   5331        amw 
   1161   5331        amw 	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
   1162   9981        Tim 	if (error == 0)
   1163   9981        Tim 		error = specvp_check(vpp, cr);
   1164    789     ahrens 
   1165    789     ahrens 	ZFS_EXIT(zfsvfs);
   1166    789     ahrens 	return (error);
   1167    789     ahrens }
   1168    789     ahrens 
   1169    789     ahrens /*
   1170    789     ahrens  * Attempt to create a new entry in a directory.  If the entry
   1171    789     ahrens  * already exists, truncate the file if permissible, else return
   1172    789     ahrens  * an error.  Return the vp of the created or trunc'd file.
   1173    789     ahrens  *
   1174    789     ahrens  *	IN:	dvp	- vnode of directory to put new file entry in.
   1175    789     ahrens  *		name	- name of new file entry.
   1176    789     ahrens  *		vap	- attributes of new file.
   1177    789     ahrens  *		excl	- flag indicating exclusive or non-exclusive mode.
   1178    789     ahrens  *		mode	- mode to open file with.
   1179    789     ahrens  *		cr	- credentials of caller.
   1180    789     ahrens  *		flag	- large file flag [UNUSED].
   1181   5331        amw  *		ct	- caller context
   1182   5331        amw  *		vsecp 	- ACL to be set
   1183    789     ahrens  *
   1184    789     ahrens  *	OUT:	vpp	- vnode of created or trunc'd entry.
   1185    789     ahrens  *
   1186    789     ahrens  *	RETURN:	0 if success
   1187    789     ahrens  *		error code if failure
   1188    789     ahrens  *
   1189    789     ahrens  * Timestamps:
   1190    789     ahrens  *	dvp - ctime|mtime updated if new entry created
   1191    789     ahrens  *	 vp - ctime|mtime always, atime if new
   1192    789     ahrens  */
   1193   5331        amw 
   1194    789     ahrens /* ARGSUSED */
   1195    789     ahrens static int
   1196    789     ahrens zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
   1197   5331        amw     int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
   1198   5331        amw     vsecattr_t *vsecp)
   1199    789     ahrens {
   1200    789     ahrens 	znode_t		*zp, *dzp = VTOZ(dvp);
   1201    789     ahrens 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
   1202   5326   ek110237 	zilog_t		*zilog;
   1203   5326   ek110237 	objset_t	*os;
   1204    789     ahrens 	zfs_dirlock_t	*dl;
   1205    789     ahrens 	dmu_tx_t	*tx;
   1206    789     ahrens 	int		error;
   1207   7847       Mark 	ksid_t		*ksid;
   1208   7847       Mark 	uid_t		uid;
   1209   7847       Mark 	gid_t		gid = crgetgid(cr);
   1210   9179       Mark 	zfs_acl_ids_t	acl_ids;
   1211   9179       Mark 	boolean_t	fuid_dirtied;
   1212   5331        amw 
   1213   5331        amw 	/*
   1214   5331        amw 	 * If we have an ephemeral id, ACL, or XVATTR then
   1215   5331        amw 	 * make sure file system is at proper version
   1216   5331        amw 	 */
   1217   5331        amw 
   1218   7847       Mark 	ksid = crgetsid(cr, KSID_OWNER);
   1219   7847       Mark 	if (ksid)
   1220   7847       Mark 		uid = ksid_getid(ksid);
   1221   7847       Mark 	else
   1222   7847       Mark 		uid = crgetuid(cr);
   1223   7847       Mark 
   1224   5331        amw 	if (zfsvfs->z_use_fuids == B_FALSE &&
   1225   5331        amw 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
   1226   7847       Mark 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
   1227   5331        amw 		return (EINVAL);
   1228    789     ahrens 
   1229   5367     ahrens 	ZFS_ENTER(zfsvfs);
   1230   5367     ahrens 	ZFS_VERIFY_ZP(dzp);
   1231   5326   ek110237 	os = zfsvfs->z_os;
   1232   5326   ek110237 	zilog = zfsvfs->z_log;
   1233    789     ahrens 
   1234   5498       timh 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
   1235   5331        amw 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
   1236   5331        amw 		ZFS_EXIT(zfsvfs);
   1237   5331        amw 		return (EILSEQ);
   1238   5331        amw 	}
   1239   5331        amw 
   1240   5331        amw 	if (vap->va_mask & AT_XVATTR) {
   1241   5331        amw 		if ((error = secpolicy_xvattr((xvattr_t *)vap,
   1242   5331        amw 		    crgetuid(cr), cr, vap->va_type)) != 0) {
   1243   5331        amw 			ZFS_EXIT(zfsvfs);
   1244   5331        amw 			return (error);
   1245   5331        amw 		}
   1246   5331        amw 	}
   1247    789     ahrens top:
   1248    789     ahrens 	*vpp = NULL;
   1249    789     ahrens 
   1250    789     ahrens 	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
   1251    789     ahrens 		vap->va_mode &= ~VSVTX;
   1252    789     ahrens 
   1253    789     ahrens 	if (*name == '\0') {
   1254    789     ahrens 		/*
   1255    789     ahrens 		 * Null component name refers to the directory itself.
   1256    789     ahrens 		 */
   1257    789     ahrens 		VN_HOLD(dvp);
   1258    789     ahrens 		zp = dzp;
   1259    789     ahrens 		dl = NULL;
   1260    789     ahrens 		error = 0;
   1261    789     ahrens 	} else {
   1262    789     ahrens 		/* possible VN_HOLD(zp) */
   1263   5331        amw 		int zflg = 0;
   1264   5331        amw 
   1265   5331        amw 		if (flag & FIGNORECASE)
   1266   5331        amw 			zflg |= ZCILOOK;
   1267   5331        amw 
   1268   5331        amw 		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
   1269   5331        amw 		    NULL, NULL);
   1270   5331        amw 		if (error) {
   1271    789     ahrens 			if (strcmp(name, "..") == 0)
   1272    789     ahrens 				error = EISDIR;
   1273    789     ahrens 			ZFS_EXIT(zfsvfs);
   1274   9179       Mark 			return (error);
   1275   9179       Mark 		}
   1276   9179       Mark 	}
   1277    789     ahrens 	if (zp == NULL) {
   1278   5331        amw 		uint64_t txtype;
   1279   5331        amw 
   1280    789     ahrens 		/*
   1281    789     ahrens 		 * Create a new file object and update the directory
   1282    789     ahrens 		 * to reference it.
   1283    789     ahrens 		 */
   1284   5331        amw 		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
   1285    789     ahrens 			goto out;
   1286    789     ahrens 		}
   1287    789     ahrens 
   1288    789     ahrens 		/*
   1289    789     ahrens 		 * We only support the creation of regular files in
   1290    789     ahrens 		 * extended attribute directories.
   1291    789     ahrens 		 */
   1292    789     ahrens 		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
   1293    789     ahrens 		    (vap->va_type != VREG)) {
   1294    789     ahrens 			error = EINVAL;
   1295    789     ahrens 			goto out;
   1296    789     ahrens 		}
   1297    789     ahrens 
   1298   9179       Mark 		if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
   1299   9179       Mark 		    &acl_ids)) != 0)
   1300   9179       Mark 			goto out;
   1301   9396    Matthew 		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
   1302  10143        Tim 			zfs_acl_ids_free(&acl_ids);
   1303   9396    Matthew 			error = EDQUOT;
   1304   9396    Matthew 			goto out;
   1305   9396    Matthew 		}
   1306   9179       Mark 
   1307    789     ahrens 		tx = dmu_tx_create(os);
   1308    789     ahrens 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
   1309   9179       Mark 		fuid_dirtied = zfsvfs->z_fuid_dirty;
   1310   9396    Matthew 		if (fuid_dirtied)
   1311   9396    Matthew 			zfs_fuid_txhold(zfsvfs, tx);
   1312    789     ahrens 		dmu_tx_hold_bonus(tx, dzp->z_id);
   1313   1544   eschrock 		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
   1314   9179       Mark 		if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
   1315    789     ahrens 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
   1316    789     ahrens 			    0, SPA_MAXBLOCKSIZE);
   1317   5331        amw 		}
   1318   8227       Neil 		error = dmu_tx_assign(tx, TXG_NOWAIT);
   1319    789     ahrens 		if (error) {
   1320   9179       Mark 			zfs_acl_ids_free(&acl_ids);
   1321    789     ahrens 			zfs_dirent_unlock(dl);
   1322   8227       Neil 			if (error == ERESTART) {
   1323   2113     ahrens 				dmu_tx_wait(tx);
   1324   2113     ahrens 				dmu_tx_abort(tx);
   1325    789     ahrens 				goto top;
   1326    789     ahrens 			}
   1327   2113     ahrens 			dmu_tx_abort(tx);
   1328    789     ahrens 			ZFS_EXIT(zfsvfs);
   1329   9179       Mark 			return (error);
   1330   9179       Mark 		}
   1331   9179       Mark 		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
   1332   9179       Mark 
   1333   9179       Mark 		if (fuid_dirtied)
   1334   9179       Mark 			zfs_fuid_sync(zfsvfs, tx);
   1335   9179       Mark 
   1336    789     ahrens 		(void) zfs_link_create(dl, zp, tx, ZNEW);
   1337   9179       Mark 
   1338   5331        amw 		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
   1339   5331        amw 		if (flag & FIGNORECASE)
   1340   5331        amw 			txtype |= TX_CI;
   1341   5331        amw 		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
   1342   9179       Mark 		    vsecp, acl_ids.z_fuidp, vap);
   1343   9179       Mark 		zfs_acl_ids_free(&acl_ids);
   1344    789     ahrens 		dmu_tx_commit(tx);
   1345    789     ahrens 	} else {
   1346   5331        amw 		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
   1347   5331        amw 
   1348    789     ahrens 		/*
   1349    789     ahrens 		 * A directory entry already exists for this name.
   1350    789     ahrens 		 */
   1351    789     ahrens 		/*
   1352    789     ahrens 		 * Can't truncate an existing file if in exclusive mode.
   1353    789     ahrens 		 */
   1354    789     ahrens 		if (excl == EXCL) {
   1355    789     ahrens 			error = EEXIST;
   1356    789     ahrens 			goto out;
   1357    789     ahrens 		}
   1358    789     ahrens 		/*
   1359    789     ahrens 		 * Can't open a directory for writing.
   1360    789     ahrens 		 */
   1361    789     ahrens 		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
   1362    789     ahrens 			error = EISDIR;
   1363    789     ahrens 			goto out;
   1364    789     ahrens 		}
   1365    789     ahrens 		/*
   1366    789     ahrens 		 * Verify requested access to file.
   1367    789     ahrens 		 */
   1368   5331        amw 		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
   1369    789     ahrens 			goto out;
   1370    789     ahrens 		}
   1371    789     ahrens 
   1372    789     ahrens 		mutex_enter(&dzp->z_lock);
   1373    789     ahrens 		dzp->z_seq++;
   1374    789     ahrens 		mutex_exit(&dzp->z_lock);
   1375    789     ahrens 
   1376   1878     maybee 		/*
   1377   1878     maybee 		 * Truncate regular files if requested.
   1378   1878     maybee 		 */
   1379   1878     maybee 		if ((ZTOV(zp)->v_type == VREG) &&
   1380    789     ahrens 		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
   1381   6992     maybee 			/* we can't hold any locks when calling zfs_freesp() */
   1382   6992     maybee 			zfs_dirent_unlock(dl);
   1383   6992     maybee 			dl = NULL;
   1384   1878     maybee 			error = zfs_freesp(zp, 0, 0, mode, TRUE);
   1385   4863      praks 			if (error == 0) {
   1386   5331        amw 				vnevent_create(ZTOV(zp), ct);
   1387   4863      praks 			}
   1388    789     ahrens 		}
   1389    789     ahrens 	}
   1390    789     ahrens out:
   1391    789     ahrens 
   1392    789     ahrens 	if (dl)
   1393    789     ahrens 		zfs_dirent_unlock(dl);
   1394    789     ahrens 
   1395    789     ahrens 	if (error) {
   1396    789     ahrens 		if (zp)
   1397    789     ahrens 			VN_RELE(ZTOV(zp));
   1398    789     ahrens 	} else {
   1399    789     ahrens 		*vpp = ZTOV(zp);
   1400   9981        Tim 		error = specvp_check(vpp, cr);
   1401    789     ahrens 	}
   1402    789     ahrens 
   1403    789     ahrens 	ZFS_EXIT(zfsvfs);
   1404    789     ahrens 	return (error);
   1405    789     ahrens }
   1406    789     ahrens 
   1407    789     ahrens /*
   1408    789     ahrens  * Remove an entry from a directory.
 *
 * Directories are refused with EPERM; zfs_rmdir() handles those.  When
 * dmu_tx_assign() fails with ERESTART, the directory entry lock and the
 * vnode hold are dropped and the operation restarts from the lookup.
   1409    789     ahrens  *
   1410    789     ahrens  *	IN:	dvp	- vnode of directory to remove entry from.
   1411    789     ahrens  *		name	- name of entry to remove.
   1412    789     ahrens  *		cr	- credentials of caller.
   1413   5331        amw  *		ct	- caller context
   1414   5331        amw  *		flags	- case flags
   1415    789     ahrens  *
   1416    789     ahrens  *	RETURN:	0 if success
   1417    789     ahrens  *		error code if failure
   1418    789     ahrens  *
   1419    789     ahrens  * Timestamps:
   1420    789     ahrens  *	dvp - ctime|mtime
   1421    789     ahrens  *	 vp - ctime (if nlink > 0)
   1422    789     ahrens  */
   1423   5331        amw /*ARGSUSED*/
   1424   5331        amw static int
   1425   5331        amw zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
   1426   5331        amw     int flags)
   1427    789     ahrens {
   1428    789     ahrens 	znode_t		*zp, *dzp = VTOZ(dvp);
   1429    789     ahrens 	znode_t		*xzp = NULL;
   1430    789     ahrens 	vnode_t		*vp;
   1431    789     ahrens 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
   1432   5326   ek110237 	zilog_t		*zilog;
   1433    789     ahrens 	uint64_t	acl_obj, xattr_obj;
   1434    789     ahrens 	zfs_dirlock_t	*dl;
   1435    789     ahrens 	dmu_tx_t	*tx;
   1436   3461     ahrens 	boolean_t	may_delete_now, delete_now = FALSE;
   1437   6992     maybee 	boolean_t	unlinked, toobig = FALSE;
   1438   5331        amw 	uint64_t	txtype;
   1439   5331        amw 	pathname_t	*realnmp = NULL;
   1440   5331        amw 	pathname_t	realnm;
   1441   5331        amw 	int		error;
   1442   5331        amw 	int		zflg = ZEXISTS;
   1443   5331        amw 
   1444   5367     ahrens 	ZFS_ENTER(zfsvfs);
   1445   5367     ahrens 	ZFS_VERIFY_ZP(dzp);
   1446   5331        amw 	zilog = zfsvfs->z_log;
   1447   5331        amw 
	/*
	 * For a case-insensitive remove, pass a pathname buffer to the
	 * lookup; its pn_buf is the name later purged from the DNLC.
	 */
   1448   5331        amw 	if (flags & FIGNORECASE) {
   1449   5331        amw 		zflg |= ZCILOOK;
   1450   5331        amw 		pn_alloc(&realnm);
   1451   5331        amw 		realnmp = &realnm;
   1452   5331        amw 	}
   1453    789     ahrens 
   1454    789     ahrens top:
   1455    789     ahrens 	/*
   1456    789     ahrens 	 * Attempt to lock directory; fail if entry doesn't exist.
   1457    789     ahrens 	 */
   1458   5331        amw 	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
   1459   5331        amw 	    NULL, realnmp)) {
   1460   5331        amw 		if (realnmp)
   1461   5331        amw 			pn_free(realnmp);
   1462    789     ahrens 		ZFS_EXIT(zfsvfs);
   1463    789     ahrens 		return (error);
   1464    789     ahrens 	}
   1465    789     ahrens 
   1466    789     ahrens 	vp = ZTOV(zp);
   1467    789     ahrens 
   1468    789     ahrens 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
   1469    789     ahrens 		goto out;
   1470    789     ahrens 	}
   1471    789     ahrens 
   1472    789     ahrens 	/*
   1473    789     ahrens 	 * Need to use rmdir for removing directories.
   1474    789     ahrens 	 */
   1475    789     ahrens 	if (vp->v_type == VDIR) {
   1476    789     ahrens 		error = EPERM;
   1477    789     ahrens 		goto out;
   1478    789     ahrens 	}
   1479    789     ahrens 
   1480   5331        amw 	vnevent_remove(vp, dvp, name, ct);
   1481   5331        amw 
   1482   5331        amw 	if (realnmp)
   1483   6492       timh 		dnlc_remove(dvp, realnmp->pn_buf);
   1484   5331        amw 	else
   1485   5331        amw 		dnlc_remove(dvp, name);
   1486    789     ahrens 
	/*
	 * First check, under v_lock, for a sole hold and no cached data.
	 * This is re-checked once the link has been destroyed.
	 */
   1487    789     ahrens 	mutex_enter(&vp->v_lock);
   1488    789     ahrens 	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
   1489    789     ahrens 	mutex_exit(&vp->v_lock);
   1490    789     ahrens 
   1491    789     ahrens 	/*
   1492   3461     ahrens 	 * We may delete the znode now, or we may put it in the unlinked set;
   1493    789     ahrens 	 * it depends on whether we're the last link, and on whether there are
   1494    789     ahrens 	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
   1495    789     ahrens 	 * allow for either case.
   1496    789     ahrens 	 */
   1497    789     ahrens 	tx = dmu_tx_create(zfsvfs->z_os);
   1498   1544   eschrock 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
   1499    789     ahrens 	dmu_tx_hold_bonus(tx, zp->z_id);
   1500   6992     maybee 	if (may_delete_now) {
   1501   6992     maybee 		toobig =
   1502   6992     maybee 		    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
   1503   6992     maybee 		/* if the file is too big, only hold_free a token amount */
   1504   6992     maybee 		dmu_tx_hold_free(tx, zp->z_id, 0,
   1505   6992     maybee 		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
   1506   6992     maybee 	}
   1507    789     ahrens 
   1508    789     ahrens 	/* are there any extended attributes? */
   1509    789     ahrens 	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
   1510    789     ahrens 		/* XXX - do we need this if we are deleting? */
   1511    789     ahrens 		dmu_tx_hold_bonus(tx, xattr_obj);
   1512    789     ahrens 	}
   1513    789     ahrens 
   1514    789     ahrens 	/* are there any additional acls */
   1515    789     ahrens 	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
   1516    789     ahrens 	    may_delete_now)
   1517    789     ahrens 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
   1518    789     ahrens 
   1519    789     ahrens 	/* charge as an update -- would be nice not to charge at all */
   1520   3461     ahrens 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
   1521    789     ahrens 
   1522   8227       Neil 	error = dmu_tx_assign(tx, TXG_NOWAIT);
   1523    789     ahrens 	if (error) {
   1524    789     ahrens 		zfs_dirent_unlock(dl);
   1525    789     ahrens 		VN_RELE(vp);
		/* Retry from the lookup; realnm stays allocated for reuse. */
   1526   8227       Neil 		if (error == ERESTART) {
   1527   2113     ahrens 			dmu_tx_wait(tx);
   1528   2113     ahrens 			dmu_tx_abort(tx);
   1529    789     ahrens 			goto top;
   1530    789     ahrens 		}
   1531   5331        amw 		if (realnmp)
   1532   5331        amw 			pn_free(realnmp);
   1533   2113     ahrens 		dmu_tx_abort(tx);
   1534    789     ahrens 		ZFS_EXIT(zfsvfs);
   1535    789     ahrens 		return (error);
   1536    789     ahrens 	}
   1537    789     ahrens 
   1538    789     ahrens 	/*
   1539    789     ahrens 	 * Remove the directory entry.
   1540    789     ahrens 	 */
   1541   5331        amw 	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
   1542    789     ahrens 
	/* The assigned tx is committed, not aborted, on this failure path. */
   1543    789     ahrens 	if (error) {
   1544    789     ahrens 		dmu_tx_commit(tx);
   1545    789     ahrens 		goto out;
   1546    789     ahrens 	}
   1547    789     ahrens 
	/*
	 * Decide on immediate deletion: the earlier conditions must still
	 * hold and the xattr/external-ACL object numbers must match the
	 * ones sampled when the tx holds were taken.
	 */
   1548   3461     ahrens 	if (unlinked) {
   1549    789     ahrens 		mutex_enter(&vp->v_lock);
   1550   6992     maybee 		delete_now = may_delete_now && !toobig &&
   1551    789     ahrens 		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
   1552    789     ahrens 		    zp->z_phys->zp_xattr == xattr_obj &&
   1553    789     ahrens 		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
   1554    789     ahrens 		mutex_exit(&vp->v_lock);
   1555    789     ahrens 	}
   1556    789     ahrens 
   1557    789     ahrens 	if (delete_now) {
   1558    789     ahrens 		if (zp->z_phys->zp_xattr) {
			/*
			 * Mark the xattr znode unlinked with zero links and
			 * add it to the unlinked set in this same tx.
			 */
   1559    789     ahrens 			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
   1560    789     ahrens 			ASSERT3U(error, ==, 0);
   1561    789     ahrens 			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
   1562    789     ahrens 			dmu_buf_will_dirty(xzp->z_dbuf, tx);
   1563    789     ahrens 			mutex_enter(&xzp->z_lock);
   1564   3461     ahrens 			xzp->z_unlinked = 1;
   1565    789     ahrens 			xzp->z_phys->zp_links = 0;
   1566    789     ahrens 			mutex_exit(&xzp->z_lock);
   1567   3461     ahrens 			zfs_unlinked_add(xzp, tx);
   1568    789     ahrens 			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
   1569    789     ahrens 		}
		/*
		 * Drop our vnode hold by hand (asserted to be the last);
		 * the exit path skips VN_RELE(vp) when delete_now is set.
		 */
   1570    789     ahrens 		mutex_enter(&zp->z_lock);
   1571    789     ahrens 		mutex_enter(&vp->v_lock);
   1572    789     ahrens 		vp->v_count--;
   1573    789     ahrens 		ASSERT3U(vp->v_count, ==, 0);
   1574    789     ahrens 		mutex_exit(&vp->v_lock);
   1575    789     ahrens 		mutex_exit(&zp->z_lock);
   1576    789     ahrens 		zfs_znode_delete(zp, tx);
   1577   3461     ahrens 	} else if (unlinked) {
   1578   3461     ahrens 		zfs_unlinked_add(zp, tx);
   1579    789     ahrens 	}
   1580    789     ahrens 
   1581   5331        amw 	txtype = TX_REMOVE;
   1582   5331        amw 	if (flags & FIGNORECASE)
   1583   5331        amw 		txtype |= TX_CI;
   1584   5331        amw 	zfs_log_remove(zilog, tx, txtype, dzp, name);
   1585   5331        amw 
   1586   5331        amw 	dmu_tx_commit(tx);
   1587   5331        amw out:
   1588   5331        amw 	if (realnmp)
   1589   5331        amw 		pn_free(realnmp);
   1590   5331        amw 
   1591    789     ahrens 	zfs_dirent_unlock(dl);
   1592    789     ahrens 
   1593    789     ahrens 	if (!delete_now) {
   1594    789     ahrens 		VN_RELE(vp);
   1595    789     ahrens 	} else if (xzp) {
   1596   6992     maybee 		/* this rele is delayed to prevent nesting transactions */
   1597    789     ahrens 		VN_RELE(ZTOV(xzp));
   1598    789     ahrens 	}
   1599    789     ahrens 
   1600    789     ahrens 	ZFS_EXIT(zfsvfs);
   1601    789     ahrens 	return (error);
   1602    789     ahrens }
   1603    789     ahrens 
   1604    789     ahrens /*
   1605    789     ahrens  * Create a new directory and insert it into dvp using the name
   1606    789     ahrens  * provided.  Return a pointer to the inserted directory.
   1607    789     ahrens  *
   1608    789     ahrens  *	IN:	dvp	- vnode of directory to add subdir to.
   1609    789     ahrens  *		dirname	- name of new directory.
   1610    789     ahrens  *		vap	- attributes of new directory.
   1611    789     ahrens  *		cr	- credentials of caller.
   1612   5331        amw  *		ct	- caller context
 *		flags	- case flags (FIGNORECASE adds ZCILOOK to the lookup
 *			  and TX_CI to the log record)
   1613   5331        amw  *		vsecp	- ACL to be set
   1614    789     ahrens  *
   1615    789     ahrens  *	OUT:	vpp	- vnode of created directory.
   1616    789     ahrens  *
   1617    789     ahrens  *	RETURN:	0 if success
   1618    789     ahrens  *		error code if failure
 *
 * Fails with EINVAL when dvp is an extended attribute directory, or when
 * z_use_fuids is B_FALSE but an ACL, XVATTR, or ephemeral uid/gid is
 * involved; with EILSEQ when z_utf8 is set and dirname does not validate;
 * and with EDQUOT when zfs_acl_ids_overquota() returns nonzero.  A
 * dmu_tx_assign() failure of ERESTART restarts the operation from the
 * directory entry lookup.
   1619    789     ahrens  *
   1620    789     ahrens  * Timestamps:
   1621    789     ahrens  *	dvp - ctime|mtime updated
   1622    789     ahrens  *	 vp - ctime|mtime|atime updated
   1623    789     ahrens  */
   1624   5331        amw /*ARGSUSED*/
   1625   5331        amw static int
   1626   5331        amw zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
   1627   5331        amw     caller_context_t *ct, int flags, vsecattr_t *vsecp)
   1628    789     ahrens {
   1629    789     ahrens 	znode_t		*zp, *dzp = VTOZ(dvp);
   1630    789     ahrens 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
   1631   5326   ek110237 	zilog_t		*zilog;
   1632    789     ahrens 	zfs_dirlock_t	*dl;
   1633   5331        amw 	uint64_t	txtype;
   1634   5331        amw 	dmu_tx_t	*tx;
   1635   5331        amw 	int		error;
   1636   5331        amw 	int		zf = ZNEW;
   1637   7847       Mark 	ksid_t		*ksid;
   1638   7847       Mark 	uid_t		uid;
   1639   7847       Mark 	gid_t		gid = crgetgid(cr);
   1640   9179       Mark 	zfs_acl_ids_t	acl_ids;
   1641   9179       Mark 	boolean_t	fuid_dirtied;
   1642    789     ahrens 
   1643    789     ahrens 	ASSERT(vap->va_type == VDIR);
   1644    789     ahrens 
   1645   5331        amw 	/*
   1646   5331        amw 	 * If we have an ephemeral id, ACL, or XVATTR then
   1647   5331        amw 	 * make sure file system is at proper version
   1648   5331        amw 	 */
   1649   5331        amw 
	/* Use the owner SID's id when the cred has one, else the plain uid. */
   1650   7847       Mark 	ksid = crgetsid(cr, KSID_OWNER);
   1651   7847       Mark 	if (ksid)
   1652   7847       Mark 		uid = ksid_getid(ksid);
   1653   7847       Mark 	else
   1654   7847       Mark 		uid = crgetuid(cr);
   1655   5331        amw 	if (zfsvfs->z_use_fuids == B_FALSE &&
   1656   7847       Mark 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
   1657   7876       Mark 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
   1658   5331        amw 		return (EINVAL);
   1659   5331        amw 
   1660   5367     ahrens 	ZFS_ENTER(zfsvfs);
   1661   5367     ahrens 	ZFS_VERIFY_ZP(dzp);
   1662   5326   ek110237 	zilog = zfsvfs->z_log;
   1663    789     ahrens 
	/* No subdirectories inside an extended attribute directory. */
   1664    789     ahrens 	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
   1665    789     ahrens 		ZFS_EXIT(zfsvfs);
   1666    789     ahrens 		return (EINVAL);
   1667    789     ahrens 	}
   1668   5331        amw 
	/* When z_utf8 is set, the name must pass UTF-8 validation. */
   1669   5498       timh 	if (zfsvfs->z_utf8 && u8_validate(dirname,
   1670   5331        amw 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
   1671   5331        amw 		ZFS_EXIT(zfsvfs);
   1672   5331        amw 		return (EILSEQ);
   1673   5331        amw 	}
   1674   5331        amw 	if (flags & FIGNORECASE)
   1675   5331        amw 		zf |= ZCILOOK;
   1676   5331        amw 
   1677   5331        amw 	if (vap->va_mask & AT_XVATTR)
   1678   5331        amw 		if ((error = secpolicy_xvattr((xvattr_t *)vap,
   1679   5331        amw 		    crgetuid(cr), cr, vap->va_type)) != 0) {
   1680   5331        amw 			ZFS_EXIT(zfsvfs);
   1681   5331        amw 			return (error);
   1682   5331        amw 		}
   1683   5331        amw 
   1684   5331        amw 	/*
   1685   5331        amw 	 * First make sure the new directory doesn't exist.
   1686   5331        amw 	 */
   1687    789     ahrens top:
   1688    789     ahrens 	*vpp = NULL;
   1689    789     ahrens 
   1690   5331        amw 	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
   1691   5331        amw 	    NULL, NULL)) {
   1692   5331        amw 		ZFS_EXIT(zfsvfs);
   1693   5331        amw 		return (error);
   1694   5331        amw 	}
   1695   5331        amw 
   1696   5331        amw 	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
   1697   5331        amw 		zfs_dirent_unlock(dl);
   1698   5331        amw 		ZFS_EXIT(zfsvfs);
   1699   5331        amw 		return (error);
   1700   5331        amw 	}
   1701   5331        amw 
	/*
	 * Once zfs_acl_ids_create() succeeds, each later exit path in this
	 * function calls zfs_acl_ids_free() on acl_ids.
	 */
   1702   9179       Mark 	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
   1703   9179       Mark 	    &acl_ids)) != 0) {
   1704   9179       Mark 		zfs_dirent_unlock(dl);
   1705   9179       Mark 		ZFS_EXIT(zfsvfs);
   1706   9179       Mark 		return (error);
   1707   9179       Mark 	}
   1708   9396    Matthew 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
   1709  10143        Tim 		zfs_acl_ids_free(&acl_ids);
   1710   9396    Matthew 		zfs_dirent_unlock(dl);
   1711   9396    Matthew 		ZFS_EXIT(zfsvfs);
   1712   9396    Matthew 		return (EDQUOT);
   1713   9396    Matthew 	}
   1714   9179       Mark 
   1715    789     ahrens 	/*
   1716    789     ahrens 	 * Add a new entry to the directory.
   1717    789     ahrens 	 */
   1718    789     ahrens 	tx = dmu_tx_create(zfsvfs->z_os);
   1719   1544   eschrock 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
   1720   1544   eschrock 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
   1721   9179       Mark 	fuid_dirtied = zfsvfs->z_fuid_dirty;
   1722   9396    Matthew 	if (fuid_dirtied)
   1723   9396    Matthew 		zfs_fuid_txhold(zfsvfs, tx);
	/* An ACL larger than ZFS_ACE_SPACE gets an extra write hold. */
   1724   9179       Mark 	if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
   1725    789     ahrens 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
   1726    789     ahrens 		    0, SPA_MAXBLOCKSIZE);
   1727   8227       Neil 	error = dmu_tx_assign(tx, TXG_NOWAIT);
   1728   8227       Neil 	if (error) {
   1729   9179       Mark 		zfs_acl_ids_free(&acl_ids);
   1730   9179       Mark 		zfs_dirent_unlock(dl);
		/* acl_ids and dl were released above; redo the lookup. */
   1731   9179       Mark 		if (error == ERESTART) {
   1732   9179       Mark 			dmu_tx_wait(tx);
   1733   9179       Mark 			dmu_tx_abort(tx);
   1734   9179       Mark 			goto top;
   1735   9179       Mark 		}
   1736   9179       Mark 		dmu_tx_abort(tx);
   1737   9179       Mark 		ZFS_EXIT(zfsvfs);
   1738    789     ahrens 		return (error);
   1739    789     ahrens 	}
   1740    789     ahrens 
   1741    789     ahrens 	/*
   1742    789     ahrens 	 * Create new node.
   1743    789     ahrens 	 */
   1744   9179       Mark 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
   1745   9179       Mark 
   1746   9179       Mark 	if (fuid_dirtied)
   1747   9179       Mark 		zfs_fuid_sync(zfsvfs, tx);
   1748    789     ahrens 	/*
   1749    789     ahrens 	 * Now put new name in parent dir.
   1750    789     ahrens 	 */
   1751    789     ahrens 	(void) zfs_link_create(dl, zp, tx, ZNEW);
   1752    789     ahrens 
   1753    789     ahrens 	*vpp = ZTOV(zp);
   1754    789     ahrens 
   1755   5331        amw 	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
   1756   5331        amw 	if (flags & FIGNORECASE)
   1757   5331        amw 		txtype |= TX_CI;
   1758   9179       Mark 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
   1759   9179       Mark 	    acl_ids.z_fuidp, vap);
   1760   9179       Mark 
	/* acl_ids is freed only after the log record has used z_fuidp. */
   1761   9179       Mark 	zfs_acl_ids_free(&acl_ids);
   1762    789     ahrens 	dmu_tx_commit(tx);
   1763    789     ahrens 
   1764    789     ahrens 	zfs_dirent_unlock(dl);
   1765    789     ahrens 
   1766    789     ahrens 	ZFS_EXIT(zfsvfs);
   1767    789     ahrens 	return (0);
   1768    789     ahrens }
   1769    789     ahrens 
   1770    789     ahrens /*
   1771    789     ahrens  * Remove a directory subdir entry.  If the current working
   1772    789     ahrens  * directory is the same as the subdir to be removed, the
   1773    789     ahrens  * remove will fail.
   1774    789     ahrens  *
   1775    789     ahrens  *	IN:	dvp	- vnode of directory to remove from.
   1776    789     ahrens  *		name	- name of directory to be removed.
   1777    789     ahrens  *		cwd	- vnode of current working directory.
   1778    789     ahrens  *		cr	- credentials of caller.
   1779   5331        amw  *		ct	- caller context
   1780   5331        amw  *		flags	- case flags
   1781    789     ahrens  *
   1782    789     ahrens  *	RETURN:	0 if success
   1783    789     ahrens  *		error code if failure
 *
 * Fails with ENOTDIR if the entry is not a directory and with EINVAL if
 * its vnode is cwd.  A dmu_tx_assign() failure of ERESTART releases all
 * locks and the vnode hold, then restarts from the directory entry lookup.
   1784    789     ahrens  *
   1785    789     ahrens  * Timestamps:
   1786    789     ahrens  *	dvp - ctime|mtime updated
   1787    789     ahrens  */
   1788   5331        amw /*ARGSUSED*/
   1789   5331        amw static int
   1790   5331        amw zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
   1791   5331        amw     caller_context_t *ct, int flags)
   1792    789     ahrens {
   1793    789     ahrens 	znode_t		*dzp = VTOZ(dvp);
   1794    789     ahrens 	znode_t		*zp;
   1795    789     ahrens 	vnode_t		*vp;
   1796    789     ahrens 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
   1797   5326   ek110237 	zilog_t		*zilog;
   1798    789     ahrens 	zfs_dirlock_t	*dl;
   1799    789     ahrens 	dmu_tx_t	*tx;
   1800    789     ahrens 	int		error;
   1801   5331        amw 	int		zflg = ZEXISTS;
   1802   5331        amw 
   1803   5367     ahrens 	ZFS_ENTER(zfsvfs);
   1804   5367     ahrens 	ZFS_VERIFY_ZP(dzp);
   1805   5331        amw 	zilog = zfsvfs->z_log;
   1806   5331        amw 
   1807   5331        amw 	if (flags & FIGNORECASE)
   1808   5331        amw 		zflg |= ZCILOOK;
   1809    789     ahrens top:
   1810    789     ahrens 	zp = NULL;
   1811    789     ahrens 
   1812    789     ahrens 	/*
   1813    789     ahrens 	 * Attempt to lock directory; fail if entry doesn't exist.
   1814    789     ahrens 	 */
   1815   5331        amw 	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
   1816   5331        amw 	    NULL, NULL)) {
   1817    789     ahrens 		ZFS_EXIT(zfsvfs);
   1818    789     ahrens 		return (error);
   1819    789     ahrens 	}
   1820    789     ahrens 
   1821    789     ahrens 	vp = ZTOV(zp);
   1822    789     ahrens 
   1823    789     ahrens 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
   1824    789     ahrens 		goto out;
   1825    789     ahrens 	}
   1826    789     ahrens 
   1827    789     ahrens 	if (vp->v_type != VDIR) {
   1828    789     ahrens 		error = ENOTDIR;
   1829    789     ahrens 		goto out;
   1830    789     ahrens 	}
   1831    789     ahrens 
   1832    789     ahrens 	if (vp == cwd) {
   1833    789     ahrens 		error = EINVAL;
   1834    789     ahrens 		goto out;
   1835    789     ahrens 	}
   1836    789     ahrens 
   1837   5331        amw 	vnevent_rmdir(vp, dvp, name, ct);
   1838    789     ahrens 
   1839    789     ahrens 	/*
   1840   3897     maybee 	 * Grab a lock on the directory to make sure that no one is
   1841   3897     maybee 	 * trying to add (or lookup) entries while we are removing it.
   1842   3897     maybee 	 */
   1843   3897     maybee 	rw_enter(&zp->z_name_lock, RW_WRITER);
   1844   3897     maybee 
   1845   3897     maybee 	/*
   1846   3897     maybee 	 * Grab a lock on the parent pointer to make sure we play well
   1847    789     ahrens 	 * with the treewalk and directory rename code.
   1848    789     ahrens 	 */
   1849    789     ahrens 	rw_enter(&zp->z_parent_lock, RW_WRITER);
   1850    789     ahrens 
   1851    789     ahrens 	tx = dmu_tx_create(zfsvfs->z_os);
   1852   1544   eschrock 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
   1853    789     ahrens 	dmu_tx_hold_bonus(tx, zp->z_id);
   1854   3461     ahrens 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
   1855   8227       Neil 	error = dmu_tx_assign(tx, TXG_NOWAIT);
   1856    789     ahrens 	if (error) {
		/* Release in reverse order of acquisition, then retry or fail. */
   1857    789     ahrens 		rw_exit(&zp->z_parent_lock);
   1858   3897     maybee 		rw_exit(&zp->z_name_lock);
   1859    789     ahrens 		zfs_dirent_unlock(dl);
   1860    789     ahrens 		VN_RELE(vp);
   1861   8227       Neil 		if (error == ERESTART) {
   1862   2113     ahrens 			dmu_tx_wait(tx);
   1863   2113     ahrens 			dmu_tx_abort(tx);
   1864    789     ahrens 			goto top;
   1865    789     ahrens 		}
   1866   2113     ahrens 		dmu_tx_abort(tx);
   1867    789     ahrens 		ZFS_EXIT(zfsvfs);
   1868    789     ahrens 		return (error);
   1869    789     ahrens 	}
   1870    789     ahrens 
   1871   5331        amw 	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
   1872   5331        amw 
	/* Log the rmdir only if the link was destroyed; commit regardless. */
   1873   5331        amw 	if (error == 0) {
   1874   5331        amw 		uint64_t txtype = TX_RMDIR;
   1875   5331        amw 		if (flags & FIGNORECASE)
   1876   5331        amw 			txtype |= TX_CI;
   1877   5331        amw 		zfs_log_remove(zilog, tx, txtype, dzp, name);
   1878   5331        amw 	}
   1879    789     ahrens 
   1880    789     ahrens 	dmu_tx_commit(tx);
   1881    789     ahrens 
   1882    789     ahrens 	rw_exit(&zp->z_parent_lock);
   1883   3897     maybee 	rw_exit(&zp->z_name_lock);
   1884    789     ahrens out:
	/* On every path here, only dl and the hold on vp remain to drop. */
   1885    789     ahrens 	zfs_dirent_unlock(dl);
   1886    789     ahrens 
   1887    789     ahrens 	VN_RELE(vp);
   1888    789     ahrens 
   1889    789     ahrens 	ZFS_EXIT(zfsvfs);
   1890    789     ahrens 	return (error);
   1891    789     ahrens }
   1892    789     ahrens 
   1893    789     ahrens /*
   1894    789     ahrens  * Read as many directory entries as will fit into the provided
   1895    789     ahrens  * buffer from the given directory cursor position (specified in
   1896    789     ahrens  * the uio structure).
   1897    789     ahrens  *
   1898    789     ahrens  *	IN:	vp	- vnode of directory to read.
   1899    789     ahrens  *		uio	- structure supplying read location, range info,
   1900    789     ahrens  *			  and return buffer.
   1901    789     ahrens  *		cr	- credentials of caller.
   1902   5331        amw  *		ct	- caller context
   1903   5331        amw  *		flags	- case flags
   1904    789     ahrens  *
   1905    789     ahrens  *	OUT:	uio	- updated offset and range, buffer filled.
   1906    789     ahrens  *		eofp	- set to true if end-of-file detected.
   1907    789     ahrens  *
   1908    789     ahrens  *	RETURN:	0 if success
   1909    789     ahrens  *		error code if failure
   1910    789     ahrens  *
   1911    789     ahrens  * Timestamps:
   1912    789     ahrens  *	vp - atime updated
   1913    789     ahrens  *
   1914    789     ahrens  * Note that the low 4 bits of the cookie returned by zap is always zero.
   1915    789     ahrens  * This allows us to use the low range for "special" directory entries:
   1916    789     ahrens  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
   1917    789     ahrens  * we use the offset 2 for the '.zfs' directory.
   1918    789     ahrens  */
   1919    789     ahrens /* ARGSUSED */
   1920    789     ahrens static int
   1921   5331        amw zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
   1922   5331        amw     caller_context_t *ct, int flags)
   1923    789     ahrens {
   1924    789     ahrens 	znode_t		*zp = VTOZ(vp);
   1925    789     ahrens 	iovec_t		*iovp;
   1926   5331        amw 	edirent_t	*eodp;
   1927    789     ahrens 	dirent64_t	*odp;
   1928    789     ahrens 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
   1929    869     perrin 	objset_t	*os;
   1930    789     ahrens 	caddr_t		outbuf;
   1931    789     ahrens 	size_t		bufsize;
   1932    789     ahrens 	zap_cursor_t	zc;
   1933    789     ahrens 	zap_attribute_t	zap;
   1934    789     ahrens 	uint_t		bytes_wanted;
   1935    789     ahrens 	uint64_t	offset; /* must be unsigned; checks for < 1 */
   1936    789     ahrens 	int		local_eof;
   1937    869     perrin 	int		outcount;
   1938    869     perrin 	int		error;
   1939    869     perrin 	uint8_t		prefetch;
   1940   5663   ck153898 	boolean_t	check_sysattrs;
   1941    789     ahrens 
   1942   5367     ahrens 	ZFS_ENTER(zfsvfs);
   1943   5367     ahrens 	ZFS_VERIFY_ZP(zp);
   1944    789     ahrens 
   1945    789     ahrens 	/*
   1946    789     ahrens 	 * If we are not given an eof variable,
   1947    789     ahrens 	 * use a local one.
   1948    789     ahrens 	 */
   1949    789     ahrens 	if (eofp == NULL)
   1950    789     ahrens 		eofp = &local_eof;
   1951    789     ahrens 
   1952    789     ahrens 	/*
   1953    789     ahrens 	 * Check for valid iov_len.
   1954    789     ahrens 	 */
   1955    789     ahrens 	if (uio->uio_iov->iov_len <= 0) {
   1956    789     ahrens 		ZFS_EXIT(zfsvfs);
   1957    789     ahrens 		return (EINVAL);
   1958    789     ahrens 	}
   1959    789     ahrens 
   1960    789     ahrens 	/*
   1961    789     ahrens 	 * Quit if directory has been removed (posix)
   1962    789     ahrens 	 */
   1963   3461     ahrens 	if ((*eofp = zp->z_unlinked) != 0) {
   1964    789     ahrens 		ZFS_EXIT(zfsvfs);
   1965    789     ahrens 		return (0);
   1966    789     ahrens 	}
   1967    789     ahrens 
   1968    869     perrin 	error = 0;
   1969    869     perrin 	os = zfsvfs->z_os;
   1970    869     perrin 	offset = uio->uio_loffset;
   1971    869     perrin 	prefetch = zp->z_zn_prefetch;
   1972    869     perrin 
   1973    789     ahrens 	/*
   1974    789     ahrens 	 * Initialize the iterator cursor.
   1975    789     ahrens 	 */
   1976    789     ahrens 	if (offset <= 3) {
   1977    789     ahrens 		/*
   1978    789     ahrens 		 * Start iteration from the beginning of the directory.
   1979    789     ahrens 		 */
   1980    869     perrin 		zap_cursor_init(&zc, os, zp->z_id);
   1981    789     ahrens 	} else {
   1982    789     ahrens 		/*
   1983    789     ahrens 		 * The offset is a serialized cursor.
   1984    789     ahrens 		 */
   1985    869     perrin 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
   1986    789     ahrens 	}
   1987    789     ahrens 
   1988    789     ahrens 	/*
   1989    789     ahrens 	 * Get space to change directory entries into fs independent format.
   1990    789     ahrens 	 */
   1991    789     ahrens 	iovp = uio->uio_iov;
   1992    789     ahrens 	bytes_wanted = iovp->iov_len;
   1993    789     ahrens 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
   1994    789     ahrens 		bufsize = bytes_wanted;
   1995    789     ahrens 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
   1996    789     ahrens 		odp = (struct dirent64 *)outbuf;
   1997    789     ahrens 	} else {
   1998    789     ahrens 		bufsize = bytes_wanted;
   1999    789     ahrens 		odp = (struct dirent64 *)iovp->iov_base;
   2000    789     ahrens 	}
   2001   5331        amw 	eodp = (struct edirent *)odp;
   2002   5663   ck153898 
   2003   5663   ck153898 	/*
   2004   7757     Janice 	 * If this VFS supports the system attribute view interface; and
   2005   7757     Janice 	 * we're looking at an extended attribute directory; and we care
   2006   7757     Janice 	 * about normalization conflicts on this vfs; then we must check
   2007   7757     Janice 	 * for normalization conflicts with the sysattr name space.
   2008   7757     Janice 	 */
   2009   7757     Janice 	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
   2010   5663   ck153898 	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
   2011   5663   ck153898 	    (flags & V_RDDIR_ENTFLAGS);
   2012    789     ahrens 
   2013    789     ahrens 	/*
   2014    789     ahrens 	 * Transform to file-system independent format
   2015    789     ahrens 	 */
   2016    789     ahrens 	outcount = 0;
   2017    789     ahrens 	while (outcount < bytes_wanted) {
   2018   3912      lling 		ino64_t objnum;
   2019   3912      lling 		ushort_t reclen;
   2020   3912      lling 		off64_t *next;
   2021   3912      lling 
   2022    789     ahrens 		/*
   2023    789     ahrens 		 * Special case `.', `..', and `.zfs'.
   2024    789     ahrens 		 */
   2025    789     ahrens 		if (offset == 0) {
   2026    789     ahrens 			(void) strcpy(zap.za_name, ".");
   2027   5331        amw 			zap.za_normalization_conflict = 0;
   2028   3912      lling 			objnum = zp->z_id;
   2029    789     ahrens 		} else if (offset == 1) {
   2030    789     ahrens 			(void) strcpy(zap.za_name, "..");
   2031   5331        amw 			zap.za_normalization_conflict = 0;
   2032   3912      lling 			objnum = zp->z_phys->zp_parent;
   2033    789     ahrens 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
   2034    789     ahrens 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
   2035   5331        amw 			zap.za_normalization_conflict = 0;
   2036   3912      lling 			objnum = ZFSCTL_INO_ROOT;
   2037    789     ahrens 		} else {
   2038    789     ahrens 			/*
   2039    789     ahrens 			 * Grab next entry.
   2040    789     ahrens 			 */
   2041    789     ahrens 			if (error = zap_cursor_retrieve(&zc, &zap)) {
   2042    789     ahrens 				if ((*eofp = (error == ENOENT)) != 0)
   2043    789     ahrens 					break;
   2044    789     ahrens 				else
   2045    789     ahrens 					goto update;
   2046    789     ahrens 			}
   2047    789     ahrens 
   2048    789     ahrens 			if (zap.za_integer_length != 8 ||
   2049    789     ahrens 			    zap.za_num_integers != 1) {
   2050    789     ahrens 				cmn_err(CE_WARN, "zap_readdir: bad directory "
   2051    789     ahrens 				    "entry, obj = %lld, offset = %lld\n",
   2052    789     ahrens 				    (u_longlong_t)zp->z_id,
   2053    789     ahrens 				    (u_longlong_t)offset);
   2054    789     ahrens 				error = ENXIO;
   2055    789     ahrens 				goto update;
   2056    789     ahrens 			}
   2057   3912      lling 
   2058   3912      lling 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
   2059   3912      lling 			/*
   2060   3912      lling 			 * MacOS X can extract the object type here such as:
   2061   3912      lling 			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
   2062   3912      lling 			 */
   2063   5663   ck153898 
   2064   5663   ck153898 			if (check_sysattrs && !zap.za_normalization_conflict) {
   2065   5663   ck153898 				zap.za_normalization_conflict =
   2066   5663   ck153898 				    xattr_sysattr_casechk(zap.za_name);
   2067   5663   ck153898 			}
   2068    789     ahrens 		}
   2069   5331        amw 
   2070   9749        Tim 		if (flags & V_RDDIR_ACCFILTER) {
   2071   9749        Tim 			/*
   2072   9749        Tim 			 * If we have no access at all, don't include
   2073   9749        Tim 			 * this entry in the returned information
   2074   9749        Tim 			 */
   2075   9749        Tim 			znode_t	*ezp;
   2076   9749        Tim 			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
   2077   9749        Tim 				goto skip_entry;
   2078   9749        Tim 			if (!zfs_has_access(ezp, cr)) {
   2079   9749        Tim 				VN_RELE(ZTOV(ezp));
   2080   9749        Tim 				goto skip_entry;
   2081   9749        Tim 			}
   2082   9749        Tim 			VN_RELE(ZTOV(ezp));
   2083   9749        Tim 		}
   2084   9749        Tim 
   2085   5331        amw 		if (flags & V_RDDIR_ENTFLAGS)
   2086   5331        amw 			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
   2087   5331        amw 		else
   2088   5331        amw 			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
   2089    789     ahrens 
   2090    789     ahrens 		/*
   2091    789     ahrens 		 * Will this entry fit in the buffer?
   2092    789     ahrens 		 */
   2093   3912      lling 		if (outcount + reclen > bufsize) {
   2094    789     ahrens 			/*
   2095    789     ahrens 			 * Did we manage to fit anything in the buffer?
   2096    789     ahrens 			 */
   2097    789     ahrens 			if (!outcount) {
   2098    789     ahrens 				error = EINVAL;
   2099    789     ahrens 				goto update;
   2100    789     ahrens 			}
   2101    789     ahrens 			break;
   2102    789     ahrens 		}
   2103   5331        amw 		if (flags & V_RDDIR_ENTFLAGS) {
   2104   5331        amw 			/*
   2105   5331        amw 			 * Add extended flag entry:
   2106   5331        amw 			 */
   2107   5331        amw 			eodp->ed_ino = objnum;
   2108   5331        amw 			eodp->ed_reclen = reclen;
   2109   5331        amw 			/* NOTE: ed_off is the offset for the *next* entry */
   2110   5331        amw 			next = &(eodp->ed_off);
   2111   5331        amw 			eodp->ed_eflags = zap.za_normalization_conflict ?
   2112   5331        amw 			    ED_CASE_CONFLICT : 0;
   2113   5331        amw 			(void) strncpy(eodp->ed_name, zap.za_name,
   2114   5331        amw 			    EDIRENT_NAMELEN(reclen));
   2115   5331        amw 			eodp = (edirent_t *)((intptr_t)eodp + reclen);
   2116   5331        amw 		} else {
   2117   5331        amw 			/*
   2118   5331        amw 			 * Add normal entry:
   2119   5331        amw 			 */
   2120   5331        amw 			odp->d_ino = objnum;
   2121   5331        amw 			odp->d_reclen = reclen;
   2122   5331        amw 			/* NOTE: d_off is the offset for the *next* entry */
   2123   5331        amw 			next = &(odp->d_off);
   2124   5331        amw 			(void) strncpy(odp->d_name, zap.za_name,
   2125   5331        amw 			    DIRENT64_NAMELEN(reclen));
   2126   5331        amw 			odp = (dirent64_t *)((intptr_t)odp + reclen);
   2127   5331        amw 		}
   2128   3912      lling 		outcount += reclen;
   2129    789     ahrens 
   2130    789     ahrens 		ASSERT(outcount <= bufsize);
   2131    789     ahrens 
   2132    789     ahrens 		/* Prefetch znode */
   2133    869     perrin 		if (prefetch)
   2134   3912      lling 			dmu_prefetch(os, objnum, 0, 0);
   2135    789     ahrens 
   2136   9749        Tim 	skip_entry:
   2137    789     ahrens 		/*
   2138    789     ahrens 		 * Move to the next entry, fill in the previous offset.
   2139    789     ahrens 		 */
   2140    789     ahrens 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
   2141    789     ahrens 			zap_cursor_advance(&zc);
   2142    789     ahrens 			offset = zap_cursor_serialize(&zc);
   2143    789     ahrens 		} else {
   2144    789     ahrens 			offset += 1;
   2145    789     ahrens 		}
   2146    789     ahrens 		*next = offset;
   2147    789     ahrens 	}
   2148    869     perrin 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
   2149    789     ahrens 
   2150    789     ahrens 	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
   2151    789     ahrens 		iovp->iov_base += outcount;
   2152    789     ahrens 		iovp->iov_len -= outcount;
   2153    789     ahrens 		uio->uio_resid -= outcount;
   2154    789     ahrens 	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
   2155    789     ahrens 		/*
   2156    789     ahrens 		 * Reset the pointer.
   2157    789     ahrens 		 */
   2158    789     ahrens 		offset = uio->uio_loffset;
   2159    789     ahrens 	}
   2160    789     ahrens 
   2161    789     ahrens update:
   2162    885     ahrens 	zap_cursor_fini(&zc);
   2163    789     ahrens 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
   2164    789     ahrens 		kmem_free(outbuf, bufsize);
   2165    789     ahrens 
   2166    789     ahrens 	if (error == ENOENT)
   2167    789     ahrens 		error = 0;
   2168    789     ahrens 
   2169    789     ahrens 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
   2170    789     ahrens 
   2171    789     ahrens 	uio->uio_loffset = offset;
   2172    789     ahrens 	ZFS_EXIT(zfsvfs);
   2173    789     ahrens 	return (error);
   2174    789     ahrens }
   2175    789     ahrens 
                          /*
                           * Value stored in the calling thread's zfs_fsyncer_key slot on every
                           * zfs_fsync() call.
                           * NOTE(review): presumably a count that the consumer of zfs_fsyncer_key
                           * uses to treat this thread's next few writes specially -- confirm
                           * against that consumer, which is not in this function.
                           */
   2176   4720   fr157268 ulong_t zfs_fsync_sync_cnt = 4;
   2177   4720   fr157268 
                          /*
                           * Start an asynchronous push of any cached pages for the file, then
                           * call zil_commit() on the filesystem's intent log for this znode.
                           *
                           *	IN:	vp	- vnode of file to sync.
                           *		syncflag - if FNODSYNC is set, the page push is skipped.
                           *		cr	- credentials of caller.
                           *		ct	- caller context
                           *
                           *	RETURN:	0 from the code visible here.
                           *		NOTE(review): ZFS_ENTER/ZFS_VERIFY_ZP are macros defined
                           *		elsewhere and may return early -- confirm.
                           */
   2178    789     ahrens static int
   2179   5331        amw zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
   2180    789     ahrens {
   2181    789     ahrens 	znode_t	*zp = VTOZ(vp);
   2182    789     ahrens 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
   2183   1773   eschrock 
   2184   1773   eschrock 	/*
   2185   1773   eschrock 	 * Regardless of whether this is required for standards conformance,
   2186   1773   eschrock 	 * this is the logical behavior when fsync() is called on a file with
   2187   1773   eschrock 	 * dirty pages.  We use B_ASYNC since the ZIL transactions are already
   2188   1773   eschrock 	 * going to be pushed out as part of the zil_commit().
   2189   1773   eschrock 	 */
   2190   1773   eschrock 	if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
   2191   1773   eschrock 	    (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
   2192   5331        amw 		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
   2193   4720   fr157268 
                          	/* Recorded before ZFS_ENTER, so it happens on every call. */
   2194   4720   fr157268 	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
   2195    789     ahrens 
   2196   5367     ahrens 	ZFS_ENTER(zfsvfs);
   2197   5367     ahrens 	ZFS_VERIFY_ZP(zp);
   2198   2638     perrin 	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
   2199    789     ahrens 	ZFS_EXIT(zfsvfs);
   2200    789     ahrens 	return (0);
   2201    789     ahrens }
   2202   5331        amw 
   2203    789     ahrens 
   2204    789     ahrens /*
   2205    789     ahrens  * Get the requested file attributes and place them in the provided
   2206    789     ahrens  * vattr structure.
   2207    789     ahrens  *
   2208    789     ahrens  *	IN:	vp	- vnode of file.
   2209    789     ahrens  *		vap	- va_mask identifies requested attributes.
   2210   5331        amw  *			  If AT_XVATTR set, then optional attrs are requested
   2211   5331        amw  *		flags	- ATTR_NOACLCHECK (CIFS server context)
   2212   5331        amw  *		cr	- credentials of caller.
   2213   5331        amw  *		ct	- caller context
   2214    789     ahrens  *
   2215    789     ahrens  *	OUT:	vap	- attribute values.
   2216    789     ahrens  *
   2217    789     ahrens  *	RETURN:	0 if success
                           *		error code from zfs_zaccess() if the
                           *		ACE_READ_ATTRIBUTES check is made and fails
   2218    789     ahrens  */
   2219    789     ahrens /* ARGSUSED */
   2220    789     ahrens static int
   2221   5331        amw zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
   2222   5331        amw     caller_context_t *ct)
   2223    789     ahrens {
   2224    789     ahrens 	znode_t *zp = VTOZ(vp);
   2225    789     ahrens 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
   2226   5326   ek110237 	znode_phys_t *pzp;
   2227   5331        amw 	int	error = 0;
   2228   4543      marks 	uint64_t links;
   2229   5331        amw 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
   2230   5331        amw 	xoptattr_t *xoap = NULL;
   2231   5331        amw 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
   2232    789     ahrens 
   2233   5367     ahrens 	ZFS_ENTER(zfsvfs);
   2234   5367     ahrens 	ZFS_VERIFY_ZP(zp);
   2235   5326   ek110237 	pzp = zp->z_phys;
   2236   5331        amw 
   2237   5331        amw 	/*
   2238   5331        amw 	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
   2239   5331        amw 	 * Also, if we are the owner don't bother, since owner should
   2240   5331        amw 	 * always be allowed to read basic attributes of file.
   2241   5331        amw 	 */
   2242   5331        amw 	if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
   2243   5331        amw 	    (pzp->zp_uid != crgetuid(cr))) {
   2244   5331        amw 		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
   2245   5331        amw 		    skipaclchk, cr)) {
   2246   5331        amw 			ZFS_EXIT(zfsvfs);
   2247   5331        amw 			return (error);
   2248   5331        amw 		}
   2249   5331        amw 	}
   2250    789     ahrens 
   2251    789     ahrens 	/*
   2252    789     ahrens 	 * Return all attributes.  It's cheaper to provide the answer
   2253    789     ahrens 	 * than to determine whether we were asked the question.
   2254    789     ahrens 	 */
   2255    789     ahrens 
                          	/* z_lock is held across every copy out of the znode below. */
   2256   9774        Ray 	mutex_enter(&zp->z_lock);
   2257    789     ahrens 	vap->va_type = vp->v_type;
   2258    789     ahrens 	vap->va_mode = pzp->zp_mode & MODEMASK;
   2259   5771   jp151216 	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
   2260    789     ahrens 	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
   2261    789     ahrens 	vap->va_nodeid = zp->z_id;
                          	/*
                          	 * The root directory reports one extra link when the control
                          	 * directory is being shown.
                          	 */
   2262   4543      marks 	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
   2263   4543      marks 		links = pzp->zp_links + 1;
   2264   4543      marks 	else
   2265   4543      marks 		links = pzp->zp_links;
   2266   4543      marks 	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
   2267    789     ahrens 	vap->va_size = pzp->zp_size;
   2268   1816      marks 	vap->va_rdev = vp->v_rdev;
   2269    789     ahrens 	vap->va_seq = zp->z_seq;
   2270    789     ahrens 
   2271   5331        amw 	/*
   2272   5331        amw 	 * Add in any requested optional attributes and the create time.
   2273   5331        amw 	 * Also set the corresponding bits in the returned attribute bitmap.
                          	 * Nothing is returned here unless xva_getxoptattr() yields a
                          	 * structure and the filesystem has z_use_fuids set.
   2274   5331        amw 	 */
   2275   5331        amw 	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
   2276   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
   2277   5331        amw 			xoap->xoa_archive =
   2278   5331        amw 			    ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
   2279   5331        amw 			XVA_SET_RTN(xvap, XAT_ARCHIVE);
   2280   5331        amw 		}
   2281   5331        amw 
   2282   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
   2283   5331        amw 			xoap->xoa_readonly =
   2284   5331        amw 			    ((pzp->zp_flags & ZFS_READONLY) != 0);
   2285   5331        amw 			XVA_SET_RTN(xvap, XAT_READONLY);
   2286   5331        amw 		}
   2287   5331        amw 
   2288   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
   2289   5331        amw 			xoap->xoa_system =
   2290   5331        amw 			    ((pzp->zp_flags & ZFS_SYSTEM) != 0);
   2291   5331        amw 			XVA_SET_RTN(xvap, XAT_SYSTEM);
   2292   5331        amw 		}
   2293   5331        amw 
   2294   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
   2295   5331        amw 			xoap->xoa_hidden =
   2296   5331        amw 			    ((pzp->zp_flags & ZFS_HIDDEN) != 0);
   2297   5331        amw 			XVA_SET_RTN(xvap, XAT_HIDDEN);
   2298   5331        amw 		}
   2299   5331        amw 
   2300   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
   2301   5331        amw 			xoap->xoa_nounlink =
   2302   5331        amw 			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
   2303   5331        amw 			XVA_SET_RTN(xvap, XAT_NOUNLINK);
   2304   5331        amw 		}
   2305   5331        amw 
   2306   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
   2307   5331        amw 			xoap->xoa_immutable =
   2308   5331        amw 			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
   2309   5331        amw 			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
   2310   5331        amw 		}
   2311   5331        amw 
   2312   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
   2313   5331        amw 			xoap->xoa_appendonly =
   2314   5331        amw 			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
   2315   5331        amw 			XVA_SET_RTN(xvap, XAT_APPENDONLY);
   2316   5331        amw 		}
   2317   5331        amw 
   2318   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
   2319   5331        amw 			xoap->xoa_nodump =
   2320   5331        amw 			    ((pzp->zp_flags & ZFS_NODUMP) != 0);
   2321   5331        amw 			XVA_SET_RTN(xvap, XAT_NODUMP);
   2322   5331        amw 		}
   2323   5331        amw 
   2324   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
   2325   5331        amw 			xoap->xoa_opaque =
   2326   5331        amw 			    ((pzp->zp_flags & ZFS_OPAQUE) != 0);
   2327   5331        amw 			XVA_SET_RTN(xvap, XAT_OPAQUE);
   2328   5331        amw 		}
   2329   5331        amw 
   2330   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
   2331   5331        amw 			xoap->xoa_av_quarantined =
   2332   5331        amw 			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
   2333   5331        amw 			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
   2334   5331        amw 		}
   2335   5331        amw 
   2336   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
   2337   5331        amw 			xoap->xoa_av_modified =
   2338   5331        amw 			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
   2339   5331        amw 			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
   2340   5331        amw 		}
   2341   5331        amw 
   2342   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
   2343   5331        amw 		    vp->v_type == VREG &&
   2344   5331        amw 		    (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
   2345   5331        amw 			size_t len;
   2346   5331        amw 			dmu_object_info_t doi;
   2347   5331        amw 
   2348   5331        amw 			/*
   2349   5331        amw 			 * Only VREG files have anti-virus scanstamps, so we
   2350   5331        amw 			 * won't conflict with symlinks in the bonus buffer.
   2351   5331        amw 			 */
   2352   5331        amw 			dmu_object_info_from_db(zp->z_dbuf, &doi);
   2353   5331        amw 			len = sizeof (xoap->xoa_av_scanstamp) +
   2354   5331        amw 			    sizeof (znode_phys_t);
                          			/* Copy only if the bonus buffer holds both pieces. */
   2355   5331        amw 			if (len <= doi.doi_bonus_size) {
   2356   5331        amw 				/*
   2357   5331        amw 				 * pzp points to the start of the
   2358   5331        amw 				 * znode_phys_t. pzp + 1 points to the
   2359   5331        amw 				 * first byte after the znode_phys_t.
   2360   5331        amw 				 */
   2361   5331        amw 				(void) memcpy(xoap->xoa_av_scanstamp,
   2362   5331        amw 				    pzp + 1,
   2363   5331        amw 				    sizeof (xoap->xoa_av_scanstamp));
   2364   5331        amw 				XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
   2365   5331        amw 			}
   2366   5331        amw 		}
   2367   5331        amw 
   2368   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
   2369   5331        amw 			ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
   2370   5331        amw 			XVA_SET_RTN(xvap, XAT_CREATETIME);
   2371   5331        amw 		}
   2372  10793        dai 
   2373  10793        dai 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
   2374  10793        dai 			xoap->xoa_reparse =
   2375  10793        dai 			    ((pzp->zp_flags & ZFS_REPARSE) != 0);
   2376  10793        dai 			XVA_SET_RTN(xvap, XAT_REPARSE);
   2377  10793        dai 		}
   2378   5331        amw 	}
   2379   5331        amw 
   2380    789     ahrens 	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
   2381    789     ahrens 	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
   2382    789     ahrens 	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
   2383    789     ahrens 
   2384    789     ahrens 	mutex_exit(&zp->z_lock);
   2385    789     ahrens 
                          	/* Block size and block count are taken from the dbuf, unlocked. */
   2386    789     ahrens 	dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks);
   2387    789     ahrens 
   2388    789     ahrens 	if (zp->z_blksz == 0) {
   2389    789     ahrens 		/*
   2390    789     ahrens 		 * Block size hasn't been set; suggest maximal I/O transfers.
   2391    789     ahrens 		 */
   2392    789     ahrens 		vap->va_blksize = zfsvfs->z_max_blksz;
   2393    789     ahrens 	}
   2394    789     ahrens 
   2395    789     ahrens 	ZFS_EXIT(zfsvfs);
   2396    789     ahrens 	return (0);
   2397    789     ahrens }
   2398    789     ahrens 
   2399    789     ahrens /*
   2400    789     ahrens  * Set the file attributes to the values contained in the
   2401    789     ahrens  * vattr structure.
   2402    789     ahrens  *
   2403    789     ahrens  *	IN:	vp	- vnode of file to be modified.
   2404    789     ahrens  *		vap	- new attribute values.
   2405   5331        amw  *			  If AT_XVATTR set, then optional attrs are being set
   2406    789     ahrens  *		flags	- ATTR_UTIME set if non-default time values provided.
   2407   5331        amw  *			- ATTR_NOACLCHECK (CIFS context only).
   2408   5331        amw  *		cr	- credentials of caller.
   2409   5331        amw  *		ct	- caller context
   2410    789     ahrens  *
   2411    789     ahrens  *	RETURN:	0 if success
   2412    789     ahrens  *		error code if failure
   2413    789     ahrens  *
   2414    789     ahrens  * Timestamps:
   2415    789     ahrens  *	vp - ctime updated, mtime updated if size changed.
   2416    789     ahrens  */
   2417    789     ahrens /* ARGSUSED */
   2418    789     ahrens static int
   2419    789     ahrens zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
   2420    789     ahrens 	caller_context_t *ct)
   2421    789     ahrens {
   2422   5326   ek110237 	znode_t		*zp = VTOZ(vp);
   2423   5326   ek110237 	znode_phys_t	*pzp;
   2424    789     ahrens 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
   2425   5326   ek110237 	zilog_t		*zilog;
   2426    789     ahrens 	dmu_tx_t	*tx;
   2427   1878     maybee 	vattr_t		oldva;
   2428   8190       Mark 	xvattr_t	tmpxvattr;
   2429    789     ahrens 	uint_t		mask = vap->va_mask;
   2430   1878     maybee 	uint_t		saved_mask;
   2431   2796      marks 	int		trim_mask = 0;
   2432    789     ahrens 	uint64_t	new_mode;
   2433   9179       Mark 	uint64_t	new_uid, new_gid;
   2434   1231      marks 	znode_t		*attrzp;
   2435    789     ahrens 	int		need_policy = FALSE;
   2436    789     ahrens 	int		err;
   2437   5331        amw 	zfs_fuid_info_t *fuidp = NULL;
   2438   5331        amw 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
   2439   5331        amw 	xoptattr_t	*xoap;
   2440   5824      marks 	zfs_acl_t	*aclp = NULL;
   2441   5331        amw 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
   2442   9179       Mark 	boolean_t fuid_dirtied = B_FALSE;
   2443    789     ahrens 
   2444    789     ahrens 	if (mask == 0)
   2445    789     ahrens 		return (0);
   2446    789     ahrens 
   2447    789     ahrens 	if (mask & AT_NOSET)
   2448   5331        amw 		return (EINVAL);
   2449   5331        amw 
   2450   5367     ahrens 	ZFS_ENTER(zfsvfs);
   2451   5367     ahrens 	ZFS_VERIFY_ZP(zp);
   2452   5331        amw 
   2453   5331        amw 	pzp = zp->z_phys;
   2454   5331        amw 	zilog = zfsvfs->z_log;
   2455   5331        amw 
   2456   5331        amw 	/*
   2457   5331        amw 	 * Make sure that if we have ephemeral uid/gid or xvattr specified
   2458   5331        amw 	 * that file system is at proper version level
   2459   5331        amw 	 */
   2460   5331        amw 
   2461   5331        amw 	if (zfsvfs->z_use_fuids == B_FALSE &&
   2462   5331        amw 	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
   2463   5331        amw 	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
   2464   5386       timh 	    (mask & AT_XVATTR))) {
   2465   5386       timh 		ZFS_EXIT(zfsvfs);
   2466   5386       timh 		return (EINVAL);
   2467   5386       timh 	}
   2468   5386       timh 
   2469   5386       timh 	if (mask & AT_SIZE && vp->v_type == VDIR) {
   2470   5386       timh 		ZFS_EXIT(zfsvfs);
   2471    789     ahrens 		return (EISDIR);
   2472   5386       timh 	}
   2473   5386       timh 
   2474   5386       timh 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
   2475   5386       timh 		ZFS_EXIT(zfsvfs);
   2476   5386       timh 		return (EINVAL);
   2477   5386       timh 	}
   2478    789     ahrens 
   2479   5331        amw 	/*
   2480   5331        amw 	 * If this is an xvattr_t, then get a pointer to the structure of
   2481   5331        amw 	 * optional attributes.  If this is NULL, then we have a vattr_t.
   2482   5331        amw 	 */
   2483   5331        amw 	xoap = xva_getxoptattr(xvap);
   2484   5331        amw 
   2485   8190       Mark 	xva_init(&tmpxvattr);
   2486   8190       Mark 
   2487   5331        amw 	/*
   2488   5331        amw 	 * Immutable files can only alter immutable bit and atime
   2489   5331        amw 	 */
   2490   5331        amw 	if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
   2491   5331        amw 	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
   2492   5386       timh 	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
   2493   5386       timh 		ZFS_EXIT(zfsvfs);
   2494   5386       timh 		return (EPERM);
   2495   5386       timh 	}
   2496   5386       timh 
   2497   5386       timh 	if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
   2498   5386       timh 		ZFS_EXIT(zfsvfs);
   2499   5386       timh 		return (EPERM);
   2500   5386       timh 	}
   2501    789     ahrens 
   2502   6064      marks 	/*
   2503   6064      marks 	 * Verify timestamps doesn't overflow 32 bits.
   2504   6064      marks 	 * ZFS can handle large timestamps, but 32bit syscalls can't
   2505   6064      marks 	 * handle times greater than 2039.  This check should be removed
   2506   6064      marks 	 * once large timestamps are fully supported.
   2507   6064      marks 	 */
   2508   6064      marks 	if (mask & (AT_ATIME | AT_MTIME)) {
   2509   6064      marks 		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
   2510   6064      marks 		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
   2511   6064      marks 			ZFS_EXIT(zfsvfs);
   2512   6064      marks 			return (EOVERFLOW);
   2513   6064      marks 		}
   2514   6064      marks 	}
   2515   6064      marks 
   2516    789     ahrens top:
   2517   1231      marks 	attrzp = NULL;
   2518    789     ahrens 
   2519   9981        Tim 	/* Can this be moved to before the top label? */
   2520    789     ahrens 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
   2521    789     ahrens 		ZFS_EXIT(zfsvfs);
   2522    789     ahrens 		return (EROFS);
   2523    789     ahrens 	}
   2524    789     ahrens 
   2525    789     ahrens 	/*
   2526    789     ahrens 	 * First validate permissions
   2527    789     ahrens 	 */
   2528    789     ahrens 
   2529    789     ahrens 	if (mask & AT_SIZE) {
   2530   5331        amw 		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
   2531   1878     maybee 		if (err) {
   2532   1878     maybee 			ZFS_EXIT(zfsvfs);
   2533   1878     maybee 			return (err);
   2534   1878     maybee 		}
   2535   1878     maybee 		/*
   2536   1878     maybee 		 * XXX - Note, we are not providing any open
   2537   1878     maybee 		 * mode flags here (like FNDELAY), so we may
   2538   1878     maybee 		 * block if there are locks present... this
   2539   1878     maybee 		 * should be addressed in openat().
   2540   1878     maybee 		 */
   2541   6992     maybee 		/* XXX - would it be OK to generate a log record here? */
   2542   6992     maybee 		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
   2543    789     ahrens 		if (err) {
   2544    789     ahrens 			ZFS_EXIT(zfsvfs);
   2545    789     ahrens 			return (err);
   2546    789     ahrens 		}
   2547    789     ahrens 	}
   2548    789     ahrens 
   2549   5331        amw 	if (mask & (AT_ATIME|AT_MTIME) ||
   2550   5331        amw 	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
   2551   5331        amw 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
   2552   5331        amw 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
   2553   5331        amw 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
   2554   5331        amw 	    XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
   2555   5331        amw 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
   2556   5331        amw 		    skipaclchk, cr);
   2557    789     ahrens 
   2558    789     ahrens 	if (mask & (AT_UID|AT_GID)) {
   2559    789     ahrens 		int	idmask = (mask & (AT_UID|AT_GID));
   2560    789     ahrens 		int	take_owner;
   2561    789     ahrens 		int	take_group;
   2562    789     ahrens 
   2563    789     ahrens 		/*
   2564    913      marks 		 * NOTE: even if a new mode is being set,
   2565    913      marks 		 * we may clear S_ISUID/S_ISGID bits.
   2566    913      marks 		 */
   2567    913      marks 
   2568    913      marks 		if (!(mask & AT_MODE))
   2569    913      marks 			vap->va_mode = pzp->zp_mode;
   2570    913      marks 
   2571    913      marks 		/*
   2572    789     ahrens 		 * Take ownership or chgrp to group we are a member of
   2573    789     ahrens 		 */
   2574    789     ahrens 
   2575    789     ahrens 		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
   2576   5331        amw 		take_group = (mask & AT_GID) &&
   2577   5331        amw 		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
   2578    789     ahrens 
   2579    789     ahrens 		/*
   2580    789     ahrens 		 * If both AT_UID and AT_GID are set then take_owner and
   2581    789     ahrens 		 * take_group must both be set in order to allow taking
   2582    789     ahrens 		 * ownership.
   2583    789     ahrens 		 *
   2584    789     ahrens 		 * Otherwise, send the check through secpolicy_vnode_setattr()
   2585    789     ahrens 		 *
   2586    789     ahrens 		 */
   2587    789     ahrens 
   2588    789     ahrens 		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
   2589    789     ahrens 		    ((idmask == AT_UID) && take_owner) ||
   2590    789     ahrens 		    ((idmask == AT_GID) && take_group)) {
   2591   5331        amw 			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
   2592   5331        amw 			    skipaclchk, cr) == 0) {
   2593    789     ahrens 				/*
   2594    789     ahrens 				 * Remove setuid/setgid for non-privileged users
   2595    789     ahrens 				 */
   2596   1115      marks 				secpolicy_setid_clear(vap, cr);
   2597   2796      marks 				trim_mask = (mask & (AT_UID|AT_GID));
   2598    789     ahrens 			} else {
   2599    789     ahrens 				need_policy =  TRUE;
   2600    789     ahrens 			}
   2601    789     ahrens 		} else {
   2602    789     ahrens 			need_policy =  TRUE;
   2603    789     ahrens 		}
   2604    789     ahrens 	}
   2605    789     ahrens 
   2606   2796      marks 	mutex_enter(&zp->z_lock);
   2607   2796      marks 	oldva.va_mode = pzp->zp_mode;
   2608   5771   jp151216 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
   2609   5331        amw 	if (mask & AT_XVATTR) {
   2610   8190       Mark 		/*
   2611   8190       Mark 		 * Update xvattr mask to include only those attributes
   2612   8190       Mark 		 * that are actually changing.
   2613   8190       Mark 		 *
   2614   8190       Mark 		 * the bits will be restored prior to actually setting
   2615   8190       Mark 		 * the attributes so the caller thinks they were set.
   2616   8190       Mark 		 */
   2617   8190       Mark 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
   2618   8190       Mark 			if (xoap->xoa_appendonly !=
   2619   8190       Mark 			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
   2620   8190       Mark 				need_policy = TRUE;
   2621   8190       Mark 			} else {
   2622   8190       Mark 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
   2623   8190       Mark 				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
   2624   8190       Mark 			}
   2625   8190       Mark 		}
   2626   8190       Mark 
   2627   8190       Mark 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
   2628   8190       Mark 			if (xoap->xoa_nounlink !=
   2629   8190       Mark 			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
   2630   8190       Mark 				need_policy = TRUE;
   2631   8190       Mark 			} else {
   2632   8190       Mark 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
   2633   8190       Mark 				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
   2634   8190       Mark 			}
   2635   8190       Mark 		}
   2636   8190       Mark 
   2637   8190       Mark 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
   2638   8190       Mark 			if (xoap->xoa_immutable !=
   2639   8190       Mark 			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
   2640   8190       Mark 				need_policy = TRUE;
   2641   8190       Mark 			} else {
   2642   8190       Mark 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
   2643   8190       Mark 				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
   2644   8190       Mark 			}
   2645   8190       Mark 		}
   2646   8190       Mark 
   2647   8190       Mark 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
   2648   8190       Mark 			if (xoap->xoa_nodump !=
   2649   8190       Mark 			    ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
   2650   8190       Mark 				need_policy = TRUE;
   2651   8190       Mark 			} else {
   2652   8190       Mark 				XVA_CLR_REQ(xvap, XAT_NODUMP);
   2653   8190       Mark 				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
   2654   8190       Mark 			}
   2655   8190       Mark 		}
   2656   8190       Mark 
   2657   8190       Mark 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
   2658   8190       Mark 			if (xoap->xoa_av_modified !=
   2659   8190       Mark 			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
   2660   8190       Mark 				need_policy = TRUE;
   2661   8190       Mark 			} else {
   2662   8190       Mark 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
   2663   8190       Mark 				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
   2664   8190       Mark 			}
   2665   8190       Mark 		}
   2666   8190       Mark 
   2667   8190       Mark 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
   2668   8190       Mark 			if ((vp->v_type != VREG &&
   2669   8190       Mark 			    xoap->xoa_av_quarantined) ||
   2670   8190       Mark 			    xoap->xoa_av_quarantined !=
   2671   8190       Mark 			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
   2672   8190       Mark 				need_policy = TRUE;
   2673   8190       Mark 			} else {
   2674   8190       Mark 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
   2675   8190       Mark 				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
   2676   8190       Mark 			}
   2677   8190       Mark 		}
   2678   8190       Mark 
   2679  10793        dai 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
   2680  10793        dai 			mutex_exit(&zp->z_lock);
   2681  10793        dai 			ZFS_EXIT(zfsvfs);
   2682  10793        dai 			return (EPERM);
   2683  10793        dai 		}
   2684  10793        dai 
   2685   8190       Mark 		if (need_policy == FALSE &&
   2686   8190       Mark 		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
   2687   8190       Mark 		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
   2688   5331        amw 			need_policy = TRUE;
   2689   5331        amw 		}
   2690   5331        amw 	}
   2691   5331        amw 
   2692   2796      marks 	mutex_exit(&zp->z_lock);
   2693   2796      marks 
   2694   2796      marks 	if (mask & AT_MODE) {
   2695   5331        amw 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
   2696   2796      marks 			err = secpolicy_setid_setsticky_clear(vp, vap,
   2697   2796      marks 			    &oldva, cr);
   2698   2796      marks 			if (err) {
   2699   2796      marks 				ZFS_EXIT(zfsvfs);
   2700   2796      marks 				return (err);
   2701   2796      marks 			}
   2702   2796      marks 			trim_mask |= AT_MODE;
   2703   2796      marks 		} else {
   2704   2796      marks 			need_policy = TRUE;
   2705   2796      marks 		}
   2706   2796      marks 	}
   2707    789     ahrens 
   2708    789     ahrens 	if (need_policy) {
   2709   1115      marks 		/*
   2710   1115      marks 		 * If trim_mask is set then take ownership
   2711   2796      marks 		 * has been granted or write_acl is present and user
   2712   2796      marks 		 * has the ability to modify mode.  In that case remove
   2713   2796      marks 		 * UID|GID and or MODE from mask so that
   2714   1115      marks 		 * secpolicy_vnode_setattr() doesn't revoke it.
   2715   1115      marks 		 */
   2716   1115      marks 
   2717   2796      marks 		if (trim_mask) {
   2718   2796      marks 			saved_mask = vap->va_mask;
   2719   2796      marks 			vap->va_mask &= ~trim_mask;
   2720   2796      marks 		}
   2721    789     ahrens 		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
   2722   5331        amw 		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
   2723    789     ahrens 		if (err) {
   2724    789     ahrens 			ZFS_EXIT(zfsvfs);
   2725    789     ahrens 			return (err);
   2726    789     ahrens 		}
   2727   1115      marks 
   2728   1115      marks 		if (trim_mask)
   2729   2796      marks 			vap->va_mask |= saved_mask;
   2730    789     ahrens 	}
   2731    789     ahrens 
   2732    789     ahrens 	/*
   2733    789     ahrens 	 * secpolicy_vnode_setattr, or take ownership may have
   2734    789     ahrens 	 * changed va_mask
   2735    789     ahrens 	 */
   2736    789     ahrens 	mask = vap->va_mask;
   2737    789     ahrens 
   2738    789     ahrens 	tx = dmu_tx_create(zfsvfs->z_os);
   2739    789     ahrens 	dmu_tx_hold_bonus(tx, zp->z_id);
   2740    789     ahrens 
   2741    789     ahrens 	if (mask & AT_MODE) {
   2742   1576      marks 		uint64_t pmode = pzp->zp_mode;
   2743    789     ahrens 
   2744   1576      marks 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
   2745    789     ahrens 
   2746   9396    Matthew 		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
   2747   9396    Matthew 			goto out;
   2748   5331        amw 		if (pzp->zp_acl.z_acl_extern_obj) {
   2749   5331        amw 			/* Are we upgrading ACL from old V0 format to new V1 */
   2750   5331        amw 			if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
   2751   5331        amw 			    pzp->zp_acl.z_acl_version ==
   2752   5331        amw 			    ZFS_ACL_VERSION_INITIAL) {
   2753   5331        amw 				dmu_tx_hold_free(tx,
   2754   5331        amw 				    pzp->zp_acl.z_acl_extern_obj, 0,
   2755   5331        amw 				    DMU_OBJECT_END);
   2756   5331        amw 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
   2757   5824      marks 				    0, aclp->z_acl_bytes);
   2758   5331        amw 			} else {
   2759   5331        amw 				dmu_tx_hold_write(tx,
   2760   5331        amw 				    pzp->zp_acl.z_acl_extern_obj, 0,
   2761   5824      marks 				    aclp->z_acl_bytes);
   2762   5824      marks 			}
   2763   6180      marks 		} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
   2764   6180      marks 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
   2765   6180      marks 			    0, aclp->z_acl_bytes);
   2766   5331        amw 		}
   2767   5331        amw 	}
   2768   5331        amw 
   2769   9179       Mark 	if (mask & (AT_UID | AT_GID)) {
   2770   9179       Mark 		if (pzp->zp_xattr) {
   2771   9179       Mark 			err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
   2772   9396    Matthew 			if (err)
   2773   9396    Matthew 				goto out;
   2774   9179       Mark 			dmu_tx_hold_bonus(tx, attrzp->z_id);
   2775   9179       Mark 		}
   2776   9179       Mark 		if (mask & AT_UID) {
   2777   9179       Mark 			new_uid = zfs_fuid_create(zfsvfs,
   2778   9179       Mark 			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
   2779   9396    Matthew 			if (new_uid != pzp->zp_uid &&
   2780   9396    Matthew 			    zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
   2781   9396    Matthew 				err = EDQUOT;
   2782   9396    Matthew 				goto out;
   2783   9396    Matthew 			}
   2784   9396    Matthew 		}
   2785   9396    Matthew 
   2786   9179       Mark 		if (mask & AT_GID) {
   2787   9179       Mark 			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
   2788   9179       Mark 			    cr, ZFS_GROUP, &fuidp);
   2789   9396    Matthew 			if (new_gid != pzp->zp_gid &&
   2790   9396    Matthew 			    zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
   2791   9396    Matthew 				err = EDQUOT;
   2792   9396    Matthew 				goto out;
   2793   9396    Matthew 			}
   2794   9179       Mark 		}
   2795   9179       Mark 		fuid_dirtied = zfsvfs->z_fuid_dirty;
   2796   9179       Mark 		if (fuid_dirtied) {
   2797   9179       Mark 			if (zfsvfs->z_fuid_obj == 0) {
   2798   9179       Mark 				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
   2799   9179       Mark 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
   2800   9179       Mark 				    FUID_SIZE_ESTIMATE(zfsvfs));
   2801   9179       Mark 				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
   2802   9179       Mark 				    FALSE, NULL);
   2803   9179       Mark 			} else {
   2804   9179       Mark 				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
   2805   9179       Mark 				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
   2806   9179       Mark 				    FUID_SIZE_ESTIMATE(zfsvfs));
   2807   9179       Mark 			}
   2808   9179       Mark 		}
   2809   1231      marks 	}
   2810   1231      marks 
   2811   8227       Neil 	err = dmu_tx_assign(tx, TXG_NOWAIT);
   2812    789     ahrens 	if (err) {
   2813   9396    Matthew 		if (err == ERESTART)
   2814   9396    Matthew 			dmu_tx_wait(tx);
   2815   9396    Matthew 		goto out;
   2816    789     ahrens 	}
   2817    789     ahrens 
   2818    789     ahrens 	dmu_buf_will_dirty(zp->z_dbuf, tx);
   2819    789     ahrens 
   2820    789     ahrens 	/*
   2821    789     ahrens 	 * Set each attribute requested.
   2822    789     ahrens 	 * We group settings according to the locks they need to acquire.
   2823    789     ahrens 	 *
   2824    789     ahrens 	 * Note: you cannot set ctime directly, although it will be
   2825    789     ahrens 	 * updated as a side-effect of calling this function.
   2826    789     ahrens 	 */
   2827    789     ahrens 
   2828    789     ahrens 	mutex_enter(&zp->z_lock);
   2829    789     ahrens 
   2830    789     ahrens 	if (mask & AT_MODE) {
   2831   5824      marks 		mutex_enter(&zp->z_acl_lock);
   2832   5824      marks 		zp->z_phys->zp_mode = new_mode;
   2833   9179       Mark 		err = zfs_aclset_common(zp, aclp, cr, tx);
   2834    789     ahrens 		ASSERT3U(err, ==, 0);
   2835  10143        Tim 		zp->z_acl_cached = aclp;
   2836  10143        Tim 		aclp = NULL;
   2837   5824      marks 		mutex_exit(&zp->z_acl_lock);
   2838    789     ahrens 	}
   2839    789     ahrens 
   2840   1231      marks 	if (attrzp)
   2841   1231      marks 		mutex_enter(&attrzp->z_lock);
   2842   1231      marks 
   2843   1231      marks 	if (mask & AT_UID) {
   2844   9179       Mark 		pzp->zp_uid = new_uid;
   2845   9179       Mark 		if (attrzp)
   2846   9179       Mark 			attrzp->z_phys->zp_uid = new_uid;
   2847   1231      marks 	}
   2848    789     ahrens 
   2849   1231      marks 	if (mask & AT_GID) {
   2850   9179       Mark 		pzp->zp_gid = new_gid;
   2851   1231      marks 		if (attrzp)
   2852   9179       Mark 			attrzp->z_phys->zp_gid = new_gid;
   2853   1231      marks 	}
   2854   5824      marks 
   2855   1231      marks 	if (attrzp)
   2856   1231      marks 		mutex_exit(&attrzp->z_lock);
   2857    789     ahrens 
   2858    789     ahrens 	if (mask & AT_ATIME)
   2859    789     ahrens 		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
   2860    789     ahrens 
   2861    789     ahrens 	if (mask & AT_MTIME)
   2862    789     ahrens 		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
   2863    789     ahrens 
   2864   6992     maybee 	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
   2865   1878     maybee 	if (mask & AT_SIZE)
   2866    789     ahrens 		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
   2867   1878     maybee 	else if (mask != 0)
   2868    789     ahrens 		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
   2869   5331        amw 	/*
   2870   5331        amw 	 * Do this after setting timestamps to prevent timestamp
   2871   5331        amw 	 * update from toggling bit
   2872   5331        amw 	 */
   2873   5331        amw 
   2874   5331        amw 	if (xoap && (mask & AT_XVATTR)) {
   2875   8190       Mark 
   2876   8190       Mark 		/*
   2877   8190       Mark 		 * restore trimmed off masks
   2878   8190       Mark 		 * so that return masks can be set for caller.
   2879   8190       Mark 		 */
   2880   8190       Mark 
   2881   8190       Mark 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
   2882   8190       Mark 			XVA_SET_REQ(xvap, XAT_APPENDONLY);
   2883   8190       Mark 		}
   2884   8190       Mark 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
   2885   8190       Mark 			XVA_SET_REQ(xvap, XAT_NOUNLINK);
   2886   8190       Mark 		}
   2887   8190       Mark 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
   2888   8190       Mark 			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
   2889   8190       Mark 		}
   2890   8190       Mark 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
   2891   8190       Mark 			XVA_SET_REQ(xvap, XAT_NODUMP);
   2892   8190       Mark 		}
   2893   8190       Mark 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
   2894   8190       Mark 			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
   2895   8190       Mark 		}
   2896   8190       Mark 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
   2897   8190       Mark 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
   2898   8190       Mark 		}
   2899   8190       Mark 
   2900   5331        amw 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
   2901   5331        amw 			size_t len;
   2902   5331        amw 			dmu_object_info_t doi;
   2903   5331        amw 
   2904   5331        amw 			ASSERT(vp->v_type == VREG);
   2905   5331        amw 
   2906   5331        amw 			/* Grow the bonus buffer if necessary. */
   2907   5331        amw 			dmu_object_info_from_db(zp->z_dbuf, &doi);
   2908   5331        amw 			len = sizeof (xoap->xoa_av_scanstamp) +
   2909   5331        amw 			    sizeof (znode_phys_t);
   2910   5331        amw 			if (len > doi.doi_bonus_size)
   2911   5331        amw 				VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
   2912   5331        amw 		}
   2913   5331        amw 		zfs_xvattr_set(zp, xvap);
   2914   5331        amw 	}
   2915    789     ahrens 
   2916   9179       Mark 	if (fuid_dirtied)
   2917   9179       Mark 		zfs_fuid_sync(zfsvfs, tx);
   2918   9179       Mark 
   2919   1878     maybee 	if (mask != 0)
   2920   5331        amw 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
   2921   5331        amw 
   2922    789     ahrens 	mutex_exit(&zp->z_lock);
   2923   1231      marks 
   2924   9396    Matthew out:
   2925   1231      marks 	if (attrzp)
   2926   1231      marks 		VN_RELE(ZTOV(attrzp));
   2927  10143        Tim 
   2928  10143        Tim 	if (aclp)
   2929  10143        Tim 		zfs_acl_free(aclp);
   2930   9396    Matthew 
   2931   9396    Matthew 	if (fuidp) {
   2932   9396    Matthew 		zfs_fuid_info_free(fuidp);
   2933   9396    Matthew 		fuidp = NULL;
   2934   9396    Matthew 	}
   2935   9396    Matthew 
   2936   9396    Matthew 	if (err)
   2937   9396    Matthew 		dmu_tx_abort(tx);
   2938   9396    Matthew 	else
   2939   9396    Matthew 		dmu_tx_commit(tx);
   2940   9396    Matthew 
   2941   9396    Matthew 	if (err == ERESTART)
   2942   9396    Matthew 		goto top;
   2943    789     ahrens 
   2944    789     ahrens 	ZFS_EXIT(zfsvfs);
   2945    789     ahrens 	return (err);
   2946    789     ahrens }
   2947    789     ahrens 
   2948   3271     maybee typedef struct zfs_zlock {
   2949   3271     maybee 	krwlock_t	*zl_rwlock;	/* lock we acquired */
   2950   3271     maybee 	znode_t		*zl_znode;	/* znode we held */
   2951   3271     maybee 	struct zfs_zlock *zl_next;	/* next in list */
   2952   3271     maybee } zfs_zlock_t;
   2953   3271     maybee 
   2954   3271     maybee /*
   2955   3271     maybee  * Drop locks and release vnodes that were held by zfs_rename_lock().
   2956   3271     maybee  */
   2957   3271     maybee static void
   2958   3271     maybee zfs_rename_unlock(zfs_zlock_t **zlpp)
   2959   3271     maybee {
   2960   3271     maybee 	zfs_zlock_t *zl;
   2961   3271     maybee 
   2962   3271     maybee 	while ((zl = *zlpp) != NULL) {
   2963   3271     maybee 		if (zl->zl_znode != NULL)
   2964   3271     maybee 			VN_RELE(ZTOV(zl->zl_znode));
   2965   3271     maybee 		rw_exit(zl->zl_rwlock);
   2966   3271     maybee 		*zlpp = zl->zl_next;
   2967   3271     maybee 		kmem_free(zl, sizeof (*zl));
   2968   3271     maybee 	}
   2969   3271     maybee }
   2970   3271     maybee 
   2971    789     ahrens /*
   2972    789     ahrens  * Search back through the directory tree, using the ".." entries.
   2973    789     ahrens  * Lock each directory in the chain to prevent concurrent renames.
   2974    789     ahrens  * Fail any attempt to move a directory into one of its own descendants.
   2975    789     ahrens  * XXX - z_parent_lock can overlap with map or grow locks
   2976    789     ahrens  */
   2977    789     ahrens static int
   2978    789     ahrens zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
   2979    789     ahrens {
   2980    789     ahrens 	zfs_zlock_t	*zl;
   2981   3638      billm 	znode_t		*zp = tdzp;
   2982    789     ahrens 	uint64_t	rootid = zp->z_zfsvfs->z_root;
   2983    789     ahrens 	uint64_t	*oidp = &zp->z_id;
   2984    789     ahrens 	krwlock_t	*rwlp = &szp->z_parent_lock;
   2985    789     ahrens 	krw_t		rw = RW_WRITER;
   2986    789     ahrens 
   2987    789     ahrens 	/*
   2988    789     ahrens 	 * First pass write-locks szp and compares to zp->z_id.
   2989    789     ahrens 	 * Later passes read-lock zp and compare to zp->z_parent.
   2990    789     ahrens 	 */
   2991    789     ahrens 	do {
   2992   3271     maybee 		if (!rw_tryenter(rwlp, rw)) {
   2993   3271     maybee 			/*
   2994   3271     maybee 			 * Another thread is renaming in this path.
   2995   3271     maybee 			 * Note that if we are a WRITER, we don't have any
   2996   3271     maybee 			 * parent_locks held yet.
   2997   3271     maybee 			 */
   2998   3271     maybee 			if (rw == RW_READER && zp->z_id > szp->z_id) {
   2999   3271     maybee 				/*
   3000   3271     maybee 				 * Drop our locks and restart
   3001   3271     maybee 				 */
   3002   3271     maybee 				zfs_rename_unlock(&zl);
   3003   3271     maybee 				*zlpp = NULL;
   3004   3271     maybee 				zp = tdzp;
   3005   3271     maybee 				oidp = &zp->z_id;
   3006   3271     maybee 				rwlp = &szp->z_parent_lock;
   3007   3271     maybee 				rw = RW_WRITER;
   3008   3271     maybee 				continue;
   3009   3271     maybee 			} else {
   3010   3271     maybee 				/*
   3011   3271     maybee 				 * Wait for other thread to drop its locks
   3012   3271     maybee 				 */
   3013   3271     maybee 				rw_enter(rwlp, rw);
   3014   3271     maybee 			}
   3015   3271     maybee 		}
   3016   3271     maybee 
   3017    789     ahrens 		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
   3018    789     ahrens 		zl->zl_rwlock = rwlp;
   3019    789     ahrens 		zl->zl_znode = NULL;
   3020    789     ahrens 		zl->zl_next = *zlpp;
   3021    789     ahrens 		*zlpp = zl;
   3022    789     ahrens 
   3023    789     ahrens 		if (*oidp == szp->z_id)		/* We're a descendant of szp */
   3024    789     ahrens 			return (EINVAL);
   3025    789     ahrens 
   3026    789     ahrens 		if (*oidp == rootid)		/* We've hit the top */
   3027    789     ahrens 			return (0);
   3028    789     ahrens 
   3029    789     ahrens 		if (rw == RW_READER) {		/* i.e. not the first pass */
   3030    789     ahrens 			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
   3031    789     ahrens 			if (error)
   3032    789     ahrens 				return (error);
   3033    789     ahrens 			zl->zl_znode = zp;
   3034    789     ahrens 		}
   3035    789     ahrens 		oidp = &zp->z_phys->zp_parent;
   3036    789     ahrens 		rwlp = &zp->z_parent_lock;
   3037    789     ahrens 		rw = RW_READER;
   3038    789     ahrens 
   3039    789     ahrens 	} while (zp->z_id != sdzp->z_id);
   3040    789     ahrens 
   3041    789     ahrens 	return (0);
   3042    789     ahrens }
   3043    789     ahrens 
   3044    789     ahrens /*
   3045    789     ahrens  * Move an entry from the provided source directory to the target
   3046    789     ahrens  * directory.  Change the entry name as indicated.
   3047    789     ahrens  *
   3048    789     ahrens  *	IN:	sdvp	- Source directory containing the "old entry".
   3049    789     ahrens  *		snm	- Old entry name.
   3050    789     ahrens  *		tdvp	- Target directory to contain the "new entry".
   3051    789     ahrens  *		tnm	- New entry name.
   3052    789     ahrens  *		cr	- credentials of caller.
   3053   5331        amw  *		ct	- caller context
   3054   5331        amw  *		flags	- case flags
   3055 <