Home | History | Annotate | Download | only in zfs
      1   789    ahrens /*
      2   789    ahrens  * CDDL HEADER START
      3   789    ahrens  *
      4   789    ahrens  * The contents of this file are subject to the terms of the
      5  1484  ek110237  * Common Development and Distribution License (the "License").
      6  1484  ek110237  * You may not use this file except in compliance with the License.
      7   789    ahrens  *
      8   789    ahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9   789    ahrens  * or http://www.opensolaris.org/os/licensing.
     10   789    ahrens  * See the License for the specific language governing permissions
     11   789    ahrens  * and limitations under the License.
     12   789    ahrens  *
     13   789    ahrens  * When distributing Covered Code, include this CDDL HEADER in each
     14   789    ahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15   789    ahrens  * If applicable, add the following below this CDDL HEADER, with the
     16   789    ahrens  * fields enclosed by brackets "[]" replaced with your own identifying
     17   789    ahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
     18   789    ahrens  *
     19   789    ahrens  * CDDL HEADER END
     20   789    ahrens  */
     21   789    ahrens /*
     22  1231     marks  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
     23   789    ahrens  * Use is subject to license terms.
     24   789    ahrens  */
     25   789    ahrens 
     26   789    ahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27   789    ahrens 
     28   789    ahrens #include <sys/types.h>
     29   789    ahrens #include <sys/param.h>
     30   789    ahrens #include <sys/time.h>
     31   789    ahrens #include <sys/systm.h>
     32   789    ahrens #include <sys/sysmacros.h>
     33   789    ahrens #include <sys/resource.h>
     34   789    ahrens #include <sys/vfs.h>
     35   789    ahrens #include <sys/vnode.h>
     36   789    ahrens #include <sys/file.h>
     37   789    ahrens #include <sys/mode.h>
     38   789    ahrens #include <sys/kmem.h>
     39   789    ahrens #include <sys/uio.h>
     40   789    ahrens #include <sys/pathname.h>
     41   789    ahrens #include <sys/cmn_err.h>
     42   789    ahrens #include <sys/errno.h>
     43   789    ahrens #include <sys/stat.h>
     44   789    ahrens #include <sys/unistd.h>
     45   789    ahrens #include <sys/random.h>
     46   789    ahrens #include <sys/policy.h>
     47   789    ahrens #include <sys/zfs_dir.h>
     48   789    ahrens #include <sys/zfs_acl.h>
     49   789    ahrens #include <sys/fs/zfs.h>
     50   789    ahrens #include "fs/fs_subr.h"
     51   789    ahrens #include <sys/zap.h>
     52   789    ahrens #include <sys/dmu.h>
     53   789    ahrens #include <sys/atomic.h>
     54   789    ahrens #include <sys/zfs_ctldir.h>
     55  1484  ek110237 #include <sys/dnlc.h>
     56   789    ahrens 
     57   789    ahrens /*
     58   789    ahrens  * Lock a directory entry.  A dirlock on <dzp, name> protects that name
     59   789    ahrens  * in dzp's directory zap object.  As long as you hold a dirlock, you can
     60   789    ahrens  * assume two things: (1) dzp cannot be reaped, and (2) no other thread
     61   789    ahrens  * can change the zap entry for (i.e. link or unlink) this name.
     62   789    ahrens  *
     63   789    ahrens  * Input arguments:
     64   789    ahrens  *	dzp	- znode for directory
     65   789    ahrens  *	name	- name of entry to lock
     66   789    ahrens  *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
     67   789    ahrens  *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
     68   789    ahrens  *		  ZSHARED: allow concurrent access with other ZSHARED callers.
     69   789    ahrens  *		  ZXATTR: we want dzp's xattr directory
     70   789    ahrens  *
     71   789    ahrens  * Output arguments:
     72   789    ahrens  *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
     73   789    ahrens  *	dlpp	- pointer to the dirlock for this entry (NULL on error)
     74   789    ahrens  *
     75   789    ahrens  * Return value: 0 on success or errno on failure.
     76   789    ahrens  *
     77   789    ahrens  * NOTE: Always checks for, and rejects, '.' and '..'.
     78   789    ahrens  */
     79   789    ahrens int
     80   789    ahrens zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
     81   789    ahrens 	int flag)
     82   789    ahrens {
     83   789    ahrens 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
     84   789    ahrens 	zfs_dirlock_t	*dl;
     85   789    ahrens 	uint64_t	zoid;
     86   789    ahrens 	int		error;
     87  1484  ek110237 	vnode_t		*vp;
     88   789    ahrens 
     89   789    ahrens 	*zpp = NULL;
     90   789    ahrens 	*dlpp = NULL;
     91   789    ahrens 
     92   789    ahrens 	/*
     93   789    ahrens 	 * Verify that we are not trying to lock '.', '..', or '.zfs'
     94   789    ahrens 	 */
     95   789    ahrens 	if (name[0] == '.' &&
     96   789    ahrens 	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
     97   789    ahrens 	    zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
     98   789    ahrens 		return (EEXIST);
     99   789    ahrens 
    100   789    ahrens 	/*
    101   789    ahrens 	 * Wait until there are no locks on this name.
    102   789    ahrens 	 */
    103   789    ahrens 	mutex_enter(&dzp->z_lock);
    104   789    ahrens 	for (;;) {
    105   789    ahrens 		if (dzp->z_reap) {
    106   789    ahrens 			mutex_exit(&dzp->z_lock);
    107   789    ahrens 			return (ENOENT);
    108   789    ahrens 		}
    109   789    ahrens 		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next)
    110   789    ahrens 			if (strcmp(name, dl->dl_name) == 0)
    111   789    ahrens 				break;
    112   789    ahrens 		if (dl == NULL)	{
    113   789    ahrens 			/*
    114   789    ahrens 			 * Allocate a new dirlock and add it to the list.
    115   789    ahrens 			 */
    116   789    ahrens 			dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
    117   789    ahrens 			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
    118   789    ahrens 			dl->dl_name = name;
    119   789    ahrens 			dl->dl_sharecnt = 0;
    120   789    ahrens 			dl->dl_namesize = 0;
    121   789    ahrens 			dl->dl_dzp = dzp;
    122   789    ahrens 			dl->dl_next = dzp->z_dirlocks;
    123   789    ahrens 			dzp->z_dirlocks = dl;
    124   789    ahrens 			break;
    125   789    ahrens 		}
    126   789    ahrens 		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
    127   789    ahrens 			break;
    128   789    ahrens 		cv_wait(&dl->dl_cv, &dzp->z_lock);
    129   789    ahrens 	}
    130   789    ahrens 
    131   789    ahrens 	if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
    132   789    ahrens 		/*
    133   789    ahrens 		 * We're the second shared reference to dl.  Make a copy of
    134   789    ahrens 		 * dl_name in case the first thread goes away before we do.
    135   789    ahrens 		 * Note that we initialize the new name before storing its
    136   789    ahrens 		 * pointer into dl_name, because the first thread may load
    137   789    ahrens 		 * dl->dl_name at any time.  He'll either see the old value,
    138   789    ahrens 		 * which is his, or the new shared copy; either is OK.
    139   789    ahrens 		 */
    140   789    ahrens 		dl->dl_namesize = strlen(dl->dl_name) + 1;
    141   789    ahrens 		name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
    142   789    ahrens 		bcopy(dl->dl_name, name, dl->dl_namesize);
    143   789    ahrens 		dl->dl_name = name;
    144   789    ahrens 	}
    145   789    ahrens 
    146   789    ahrens 	mutex_exit(&dzp->z_lock);
    147   789    ahrens 
    148   789    ahrens 	/*
    149   789    ahrens 	 * We have a dirlock on the name.  (Note that it is the dirlock,
    150   789    ahrens 	 * not the dzp's z_lock, that protects the name in the zap object.)
    151   789    ahrens 	 * See if there's an object by this name; if so, put a hold on it.
    152   789    ahrens 	 */
    153   789    ahrens 	if (flag & ZXATTR) {
    154   789    ahrens 		zoid = dzp->z_phys->zp_xattr;
    155   789    ahrens 		error = (zoid == 0 ? ENOENT : 0);
    156   789    ahrens 	} else {
    157  1484  ek110237 		vp = dnlc_lookup(ZTOV(dzp), name);
    158  1484  ek110237 		if (vp == DNLC_NO_VNODE) {
    159  1484  ek110237 			VN_RELE(vp);
    160  1484  ek110237 			error = ENOENT;
    161  1484  ek110237 		} else if (vp) {
    162  1484  ek110237 			if (flag & ZNEW) {
    163  1484  ek110237 				zfs_dirent_unlock(dl);
    164  1484  ek110237 				VN_RELE(vp);
    165  1484  ek110237 				return (EEXIST);
    166  1484  ek110237 			}
    167  1484  ek110237 			*dlpp = dl;
    168  1484  ek110237 			*zpp = VTOZ(vp);
    169  1484  ek110237 			return (0);
    170  1484  ek110237 		} else {
    171  1484  ek110237 			error = zap_lookup(zfsvfs->z_os, dzp->z_id, name,
    172  1484  ek110237 			    8, 1, &zoid);
    173  1484  ek110237 			if (error == ENOENT)
    174  1484  ek110237 				dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
    175  1484  ek110237 		}
    176   789    ahrens 	}
    177   789    ahrens 	if (error) {
    178   789    ahrens 		if (error != ENOENT || (flag & ZEXISTS)) {
    179   789    ahrens 			zfs_dirent_unlock(dl);
    180   789    ahrens 			return (error);
    181   789    ahrens 		}
    182   789    ahrens 	} else {
    183   789    ahrens 		if (flag & ZNEW) {
    184   789    ahrens 			zfs_dirent_unlock(dl);
    185   789    ahrens 			return (EEXIST);
    186   789    ahrens 		}
    187   789    ahrens 		error = zfs_zget(zfsvfs, zoid, zpp);
    188   789    ahrens 		if (error) {
    189   789    ahrens 			zfs_dirent_unlock(dl);
    190   789    ahrens 			return (error);
    191   789    ahrens 		}
    192  1484  ek110237 		if (!(flag & ZXATTR))
    193  1484  ek110237 			dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
    194   789    ahrens 	}
    195   789    ahrens 
    196   789    ahrens 	*dlpp = dl;
    197   789    ahrens 
    198   789    ahrens 	return (0);
    199   789    ahrens }
    200   789    ahrens 
    201   789    ahrens /*
    202   789    ahrens  * Unlock this directory entry and wake anyone who was waiting for it.
    203   789    ahrens  */
    204   789    ahrens void
    205   789    ahrens zfs_dirent_unlock(zfs_dirlock_t *dl)
    206   789    ahrens {
    207   789    ahrens 	znode_t *dzp = dl->dl_dzp;
    208   789    ahrens 	zfs_dirlock_t **prev_dl, *cur_dl;
    209   789    ahrens 
    210   789    ahrens 	mutex_enter(&dzp->z_lock);
    211   789    ahrens 	if (dl->dl_sharecnt > 1) {
    212   789    ahrens 		dl->dl_sharecnt--;
    213   789    ahrens 		mutex_exit(&dzp->z_lock);
    214   789    ahrens 		return;
    215   789    ahrens 	}
    216   789    ahrens 	prev_dl = &dzp->z_dirlocks;
    217   789    ahrens 	while ((cur_dl = *prev_dl) != dl)
    218   789    ahrens 		prev_dl = &cur_dl->dl_next;
    219   789    ahrens 	*prev_dl = dl->dl_next;
    220   789    ahrens 	cv_broadcast(&dl->dl_cv);
    221   789    ahrens 	mutex_exit(&dzp->z_lock);
    222   789    ahrens 
    223   789    ahrens 	if (dl->dl_namesize != 0)
    224   789    ahrens 		kmem_free(dl->dl_name, dl->dl_namesize);
    225   789    ahrens 	cv_destroy(&dl->dl_cv);
    226   789    ahrens 	kmem_free(dl, sizeof (*dl));
    227   789    ahrens }
    228   789    ahrens 
    229   789    ahrens /*
    230   789    ahrens  * Look up an entry in a directory.
    231   789    ahrens  *
    232   789    ahrens  * NOTE: '.' and '..' are handled as special cases because
    233   789    ahrens  *	no directory entries are actually stored for them.  If this is
    234   789    ahrens  *	the root of a filesystem, then '.zfs' is also treated as a
    235   789    ahrens  *	special pseudo-directory.
    236   789    ahrens  */
    237   789    ahrens int
    238   789    ahrens zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp)
    239   789    ahrens {
    240   789    ahrens 	zfs_dirlock_t *dl;
    241   789    ahrens 	znode_t *zp;
    242   789    ahrens 	int error = 0;
    243   789    ahrens 
    244   789    ahrens 	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
    245   789    ahrens 		*vpp = ZTOV(dzp);
    246   789    ahrens 		VN_HOLD(*vpp);
    247   789    ahrens 	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
    248   789    ahrens 		zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
    249   789    ahrens 		/*
    250   789    ahrens 		 * If we are a snapshot mounted under .zfs, return
    251   789    ahrens 		 * the vp for the snapshot directory.
    252   789    ahrens 		 */
    253  1878    maybee 		if (dzp->z_phys->zp_parent == dzp->z_id &&
    254  1878    maybee 		    zfsvfs->z_parent != zfsvfs) {
    255   789    ahrens 			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
    256   789    ahrens 			    "snapshot", vpp, NULL, 0, NULL, kcred);
    257   789    ahrens 			return (error);
    258   789    ahrens 		}
    259   789    ahrens 		rw_enter(&dzp->z_parent_lock, RW_READER);
    260   789    ahrens 		error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
    261   789    ahrens 		if (error == 0)
    262   789    ahrens 			*vpp = ZTOV(zp);
    263   789    ahrens 		rw_exit(&dzp->z_parent_lock);
    264   789    ahrens 	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
    265   789    ahrens 		*vpp = zfsctl_root(dzp);
    266   789    ahrens 	} else {
    267   789    ahrens 		error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
    268   789    ahrens 		if (error == 0) {
    269   789    ahrens 			*vpp = ZTOV(zp);
    270   789    ahrens 			zfs_dirent_unlock(dl);
    271   869    perrin 			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
    272   789    ahrens 		}
    273   789    ahrens 	}
    274   789    ahrens 
    275   789    ahrens 	return (error);
    276   789    ahrens }
    277   789    ahrens 
    278   789    ahrens static char *
    279   789    ahrens zfs_dq_hexname(char namebuf[17], uint64_t x)
    280   789    ahrens {
    281   789    ahrens 	char *name = &namebuf[16];
    282   789    ahrens 	const char digits[16] = "0123456789abcdef";
    283   789    ahrens 
    284   789    ahrens 	*name = '\0';
    285   789    ahrens 	do {
    286   789    ahrens 		*--name = digits[x & 0xf];
    287   789    ahrens 		x >>= 4;
    288   789    ahrens 	} while (x != 0);
    289   789    ahrens 
    290   789    ahrens 	return (name);
    291   789    ahrens }
    292   789    ahrens 
/*
 * Delete Queue Error Handling
 *
 * When dealing with the delete queue, we dmu_tx_hold_zap(), but we
 * don't specify the name of the entry that we will be manipulating.  We
 * also fib and say that we won't be adding any new entries to the
 * delete queue, even though we might (this is to lower the minimum file
 * size that can be deleted in a full filesystem).  So on the small
 * chance that the delete queue is using a fat zap (i.e. it has more
 * than 2000 entries), we *may* not pre-read a block that is needed.
 * Therefore it is remotely possible for some of the delete queue
 * assertions below to fail due to an i/o error.  On a nondebug system,
 * this will result in the space being leaked.
 */
    307  1544  eschrock 
    308   789    ahrens void
    309   789    ahrens zfs_dq_add(znode_t *zp, dmu_tx_t *tx)
    310   789    ahrens {
    311   789    ahrens 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    312   789    ahrens 	char obj_name[17];
    313   789    ahrens 	int error;
    314   789    ahrens 
    315   789    ahrens 	ASSERT(zp->z_reap);
    316   789    ahrens 	ASSERT3U(zp->z_phys->zp_links, ==, 0);
    317   789    ahrens 
    318   789    ahrens 	error = zap_add(zfsvfs->z_os, zfsvfs->z_dqueue,
    319   789    ahrens 	    zfs_dq_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
    320   789    ahrens 	ASSERT3U(error, ==, 0);
    321   789    ahrens }
    322   789    ahrens 
    323   789    ahrens /*
    324   789    ahrens  * Delete the entire contents of a directory.  Return a count
    325   789    ahrens  * of the number of entries that could not be deleted.
    326   789    ahrens  *
    327   789    ahrens  * NOTE: this function assumes that the directory is inactive,
    328   789    ahrens  *	so there is no need to lock its entries before deletion.
    329   789    ahrens  *	Also, it assumes the directory contents is *only* regular
    330   789    ahrens  *	files.
    331   789    ahrens  */
    332   789    ahrens static int
    333   789    ahrens zfs_purgedir(znode_t *dzp)
    334   789    ahrens {
    335   789    ahrens 	zap_cursor_t	zc;
    336   789    ahrens 	zap_attribute_t	zap;
    337   789    ahrens 	znode_t		*xzp;
    338   789    ahrens 	dmu_tx_t	*tx;
    339   789    ahrens 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
    340   789    ahrens 	zfs_dirlock_t	dl;
    341   789    ahrens 	int skipped = 0;
    342   789    ahrens 	int error;
    343   789    ahrens 
    344   789    ahrens 
    345   789    ahrens 	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
    346   789    ahrens 	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
    347   789    ahrens 	    zap_cursor_advance(&zc)) {
    348   789    ahrens 		error = zfs_zget(zfsvfs, zap.za_first_integer, &xzp);
    349   789    ahrens 		ASSERT3U(error, ==, 0);
    350   789    ahrens 
    351   789    ahrens 		ASSERT((ZTOV(xzp)->v_type == VREG) ||
    352   789    ahrens 		    (ZTOV(xzp)->v_type == VLNK));
    353   789    ahrens 
    354   789    ahrens 		tx = dmu_tx_create(zfsvfs->z_os);
    355   789    ahrens 		dmu_tx_hold_bonus(tx, dzp->z_id);
    356  1544  eschrock 		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
    357   789    ahrens 		dmu_tx_hold_bonus(tx, xzp->z_id);
    358  1544  eschrock 		dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
    359   789    ahrens 		error = dmu_tx_assign(tx, TXG_WAIT);
    360   789    ahrens 		if (error) {
    361   789    ahrens 			dmu_tx_abort(tx);
    362   789    ahrens 			VN_RELE(ZTOV(xzp));
    363   789    ahrens 			skipped += 1;
    364   789    ahrens 			continue;
    365   789    ahrens 		}
    366   789    ahrens 		bzero(&dl, sizeof (dl));
    367   789    ahrens 		dl.dl_dzp = dzp;
    368   789    ahrens 		dl.dl_name = zap.za_name;
    369   789    ahrens 
    370   789    ahrens 		error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
    371   789    ahrens 		ASSERT3U(error, ==, 0);
    372   789    ahrens 		dmu_tx_commit(tx);
    373   789    ahrens 
    374   789    ahrens 		VN_RELE(ZTOV(xzp));
    375   789    ahrens 	}
    376   885    ahrens 	zap_cursor_fini(&zc);
    377   789    ahrens 	ASSERT(error == ENOENT);
    378   789    ahrens 	return (skipped);
    379   789    ahrens }
    380   789    ahrens 
    381   789    ahrens /*
    382   789    ahrens  * Special function to requeue the znodes for deletion that were
    383   789    ahrens  * in progress when we either crashed or umounted the file system.
    384  2245     marks  *
    385  2245     marks  * returns 1 if queue was drained.
    386   789    ahrens  */
    387  2245     marks static int
    388   789    ahrens zfs_drain_dq(zfsvfs_t *zfsvfs)
    389   789    ahrens {
    390   789    ahrens 	zap_cursor_t	zc;
    391   789    ahrens 	zap_attribute_t zap;
    392   789    ahrens 	dmu_object_info_t doi;
    393   789    ahrens 	znode_t		*zp;
    394   789    ahrens 	int		error;
    395   789    ahrens 
    396   789    ahrens 	/*
    397   789    ahrens 	 * Interate over the contents of the delete queue.
    398   789    ahrens 	 */
    399   789    ahrens 	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_dqueue);
    400   789    ahrens 	    zap_cursor_retrieve(&zc, &zap) == 0;
    401   789    ahrens 	    zap_cursor_advance(&zc)) {
    402   789    ahrens 
    403   789    ahrens 		/*
    404  2245     marks 		 * Create more threads if necessary to balance the load.
    405  2245     marks 		 * quit if the delete threads have been shut down.
    406   789    ahrens 		 */
    407   789    ahrens 		if (zfs_delete_thread_target(zfsvfs, -1) != 0)
    408  2245     marks 			return (0);
    409   789    ahrens 
    410   789    ahrens 		/*
    411   789    ahrens 		 * See what kind of object we have in queue
    412   789    ahrens 		 */
    413   789    ahrens 
    414   789    ahrens 		error = dmu_object_info(zfsvfs->z_os,
    415   789    ahrens 		    zap.za_first_integer, &doi);
    416   789    ahrens 		if (error != 0)
    417   789    ahrens 			continue;
    418   789    ahrens 
    419   789    ahrens 		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
    420   789    ahrens 		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
    421   789    ahrens 		/*
    422   789    ahrens 		 * We need to re-mark these queue entries for reaping,
    423   789    ahrens 		 * so we pull them back into core and set zp->z_reap.
    424   789    ahrens 		 */
    425   789    ahrens 		error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
    426   789    ahrens 
    427   789    ahrens 		/*
    428   789    ahrens 		 * We may pick up znodes that are already marked for reaping.
    429   789    ahrens 		 * This could happen during the purge of an extended attribute
    430   789    ahrens 		 * directory.  All we need to do is skip over them, since they
    431  2245     marks 		 * are already in the system to be processed by the delete
    432  2245     marks 		 * thread(s).
    433   789    ahrens 		 */
    434   789    ahrens 		if (error != 0) {
    435   789    ahrens 			continue;
    436   789    ahrens 		}
    437  2245     marks 
    438   789    ahrens 		zp->z_reap = 1;
    439   789    ahrens 		VN_RELE(ZTOV(zp));
    440   789    ahrens 	}
    441   885    ahrens 	zap_cursor_fini(&zc);
    442  2245     marks 	return (1);
    443   789    ahrens }
    444   789    ahrens 
    445   789    ahrens void
    446   789    ahrens zfs_delete_thread(void *arg)
    447   789    ahrens {
    448   789    ahrens 	zfsvfs_t	*zfsvfs = arg;
    449   789    ahrens 	zfs_delete_t 	*zd = &zfsvfs->z_delete_head;
    450   789    ahrens 	znode_t		*zp;
    451   789    ahrens 	callb_cpr_t	cprinfo;
    452  2245     marks 	int		drained;
    453   789    ahrens 
    454   789    ahrens 	CALLB_CPR_INIT(&cprinfo, &zd->z_mutex, callb_generic_cpr, "zfs_delete");
    455   789    ahrens 
    456   789    ahrens 	mutex_enter(&zd->z_mutex);
    457   789    ahrens 
    458   789    ahrens 	if (!zd->z_drained && !zd->z_draining) {
    459   789    ahrens 		zd->z_draining = B_TRUE;
    460   789    ahrens 		mutex_exit(&zd->z_mutex);
    461  2245     marks 		drained = zfs_drain_dq(zfsvfs);
    462   789    ahrens 		mutex_enter(&zd->z_mutex);
    463   789    ahrens 		zd->z_draining = B_FALSE;
    464  2245     marks 		zd->z_drained = drained;
    465   789    ahrens 		cv_broadcast(&zd->z_quiesce_cv);
    466   789    ahrens 	}
    467   789    ahrens 
    468   789    ahrens 	while (zd->z_thread_count <= zd->z_thread_target) {
    469   789    ahrens 		zp = list_head(&zd->z_znodes);
    470   789    ahrens 		if (zp == NULL) {
    471   789    ahrens 			ASSERT(zd->z_znode_count == 0);
    472   789    ahrens 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
    473   789    ahrens 			cv_wait(&zd->z_cv, &zd->z_mutex);
    474   789    ahrens 			CALLB_CPR_SAFE_END(&cprinfo, &zd->z_mutex);
    475   789    ahrens 			continue;
    476   789    ahrens 		}
    477   789    ahrens 		ASSERT(zd->z_znode_count != 0);
    478   789    ahrens 		list_remove(&zd->z_znodes, zp);
    479   789    ahrens 		if (--zd->z_znode_count == 0)
    480   789    ahrens 			cv_broadcast(&zd->z_quiesce_cv);
    481   789    ahrens 		mutex_exit(&zd->z_mutex);
    482   789    ahrens 		zfs_rmnode(zp);
    483   789    ahrens 		(void) zfs_delete_thread_target(zfsvfs, -1);
    484   789    ahrens 		mutex_enter(&zd->z_mutex);
    485   789    ahrens 	}
    486   789    ahrens 
    487   789    ahrens 	ASSERT(zd->z_thread_count != 0);
    488   789    ahrens 	if (--zd->z_thread_count == 0)
    489   789    ahrens 		cv_broadcast(&zd->z_cv);
    490   789    ahrens 
    491   789    ahrens 	CALLB_CPR_EXIT(&cprinfo);	/* NB: drops z_mutex */
    492   789    ahrens 	thread_exit();
    493   789    ahrens }
    494   789    ahrens 
    495   789    ahrens static int zfs_work_per_thread_shift = 11;	/* 2048 (2^11) per thread */
    496   789    ahrens 
    497   789    ahrens /*
    498   789    ahrens  * Set the target number of delete threads to 'nthreads'.
    499   789    ahrens  * If nthreads == -1, choose a number based on current workload.
    500   789    ahrens  * If nthreads == 0, don't return until the threads have exited.
    501   789    ahrens  */
    502   789    ahrens int
    503   789    ahrens zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads)
    504   789    ahrens {
    505   789    ahrens 	zfs_delete_t *zd = &zfsvfs->z_delete_head;
    506   789    ahrens 
    507   789    ahrens 	mutex_enter(&zd->z_mutex);
    508   789    ahrens 
    509   789    ahrens 	if (nthreads == -1) {
    510   789    ahrens 		if (zd->z_thread_target == 0) {
    511   789    ahrens 			mutex_exit(&zd->z_mutex);
    512   789    ahrens 			return (EBUSY);
    513   789    ahrens 		}
    514   789    ahrens 		nthreads = zd->z_znode_count >> zfs_work_per_thread_shift;
    515   789    ahrens 		nthreads = MIN(nthreads, ncpus << 1);
    516   789    ahrens 		nthreads = MAX(nthreads, 1);
    517   789    ahrens 		nthreads += !!zd->z_draining;
    518   789    ahrens 	}
    519   789    ahrens 
    520   789    ahrens 	zd->z_thread_target = nthreads;
    521   789    ahrens 
    522   789    ahrens 	while (zd->z_thread_count < zd->z_thread_target) {
    523   789    ahrens 		(void) thread_create(NULL, 0, zfs_delete_thread, zfsvfs,
    524   789    ahrens 		    0, &p0, TS_RUN, minclsyspri);
    525   789    ahrens 		zd->z_thread_count++;
    526   789    ahrens 	}
    527   789    ahrens 
    528   789    ahrens 	while (zd->z_thread_count > zd->z_thread_target && nthreads == 0) {
    529   789    ahrens 		cv_broadcast(&zd->z_cv);
    530   789    ahrens 		cv_wait(&zd->z_cv, &zd->z_mutex);
    531   789    ahrens 	}
    532   789    ahrens 
    533   789    ahrens 	mutex_exit(&zd->z_mutex);
    534   789    ahrens 
    535   789    ahrens 	return (0);
    536   789    ahrens }
    537   789    ahrens 
    538   789    ahrens /*
    539   789    ahrens  * Wait until everything that's been queued has been deleted.
    540   789    ahrens  */
    541   789    ahrens void
    542   789    ahrens zfs_delete_wait_empty(zfsvfs_t *zfsvfs)
    543   789    ahrens {
    544   789    ahrens 	zfs_delete_t *zd = &zfsvfs->z_delete_head;
    545   789    ahrens 
    546   789    ahrens 	mutex_enter(&zd->z_mutex);
    547   789    ahrens 	ASSERT(zd->z_thread_target != 0);
    548   789    ahrens 	while (!zd->z_drained || zd->z_znode_count != 0) {
    549   789    ahrens 		ASSERT(zd->z_thread_target != 0);
    550   789    ahrens 		cv_wait(&zd->z_quiesce_cv, &zd->z_mutex);
    551   789    ahrens 	}
    552   789    ahrens 	mutex_exit(&zd->z_mutex);
    553   789    ahrens }
    554   789    ahrens 
    555   789    ahrens void
    556   789    ahrens zfs_rmnode(znode_t *zp)
    557   789    ahrens {
    558   789    ahrens 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
    559   789    ahrens 	objset_t	*os = zfsvfs->z_os;
    560   789    ahrens 	znode_t		*xzp = NULL;
    561   789    ahrens 	char		obj_name[17];
    562   789    ahrens 	dmu_tx_t	*tx;
    563   789    ahrens 	uint64_t	acl_obj;
    564   789    ahrens 	int		error;
    565   789    ahrens 
    566   789    ahrens 	ASSERT(ZTOV(zp)->v_count == 0);
    567   789    ahrens 	ASSERT(zp->z_phys->zp_links == 0);
    568   789    ahrens 
    569   789    ahrens 	/*
    570   789    ahrens 	 * If this is an attribute directory, purge its contents.
    571   789    ahrens 	 */
    572   789    ahrens 	if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR))
    573   789    ahrens 		if (zfs_purgedir(zp) != 0) {
    574   789    ahrens 			zfs_delete_t *delq = &zfsvfs->z_delete_head;
    575   789    ahrens 			/*
    576   789    ahrens 			 * Add this back to the delete list to be retried later.
    577   789    ahrens 			 *
    578   789    ahrens 			 * XXX - this could just busy loop on us...
    579   789    ahrens 			 */
    580   789    ahrens 			mutex_enter(&delq->z_mutex);
    581   789    ahrens 			list_insert_tail(&delq->z_znodes, zp);
    582   789    ahrens 			delq->z_znode_count++;
    583   789    ahrens 			mutex_exit(&delq->z_mutex);
    584   789    ahrens 			return;
    585   789    ahrens 		}
    586   789    ahrens 
    587   789    ahrens 	/*
    588   789    ahrens 	 * If the file has extended attributes, unlink the xattr dir.
    589   789    ahrens 	 */
    590   789    ahrens 	if (zp->z_phys->zp_xattr) {
    591   789    ahrens 		error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
    592   789    ahrens 		ASSERT(error == 0);
    593   789    ahrens 	}
    594   789    ahrens 
    595   789    ahrens 	acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
    596   789    ahrens 
    597   789    ahrens 	/*
    598   789    ahrens 	 * Set up the transaction.
    599   789    ahrens 	 */
    600   789    ahrens 	tx = dmu_tx_create(os);
    601   789    ahrens 	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
    602  1544  eschrock 	dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
    603   789    ahrens 	if (xzp) {
    604   789    ahrens 		dmu_tx_hold_bonus(tx, xzp->z_id);
    605  1544  eschrock 		dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, TRUE, NULL);
    606   789    ahrens 	}
    607   789    ahrens 	if (acl_obj)
    608   789    ahrens 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
    609   789    ahrens 	error = dmu_tx_assign(tx, TXG_WAIT);
    610   789    ahrens 	if (error) {
    611   789    ahrens 		zfs_delete_t *delq = &zfsvfs->z_delete_head;
    612   789    ahrens 
    613   789    ahrens 		dmu_tx_abort(tx);
    614   789    ahrens 		/*
    615   789    ahrens 		 * Add this back to the delete list to be retried later.
    616   789    ahrens 		 *
    617   789    ahrens 		 * XXX - this could just busy loop on us...
    618   789    ahrens 		 */
    619   789    ahrens 		mutex_enter(&delq->z_mutex);
    620   789    ahrens 		list_insert_tail(&delq->z_znodes, zp);
    621   789    ahrens 		delq->z_znode_count++;
    622   789    ahrens 		mutex_exit(&delq->z_mutex);
    623   789    ahrens 		return;
    624   789    ahrens 	}
    625   789    ahrens 
    626   789    ahrens 	if (xzp) {
    627   789    ahrens 		dmu_buf_will_dirty(xzp->z_dbuf, tx);
    628   789    ahrens 		mutex_enter(&xzp->z_lock);
    629   789    ahrens 		xzp->z_reap = 1;		/* mark xzp for deletion */
    630   789    ahrens 		xzp->z_phys->zp_links = 0;	/* no more links to it */
    631   789    ahrens 		mutex_exit(&xzp->z_lock);
    632   789    ahrens 		zfs_dq_add(xzp, tx);		/* add xzp to delete queue */
    633   789    ahrens 	}
    634   789    ahrens 
    635   789    ahrens 	/*
    636   789    ahrens 	 * Remove this znode from delete queue
    637   789    ahrens 	 */
    638   789    ahrens 	error = zap_remove(os, zfsvfs->z_dqueue,
    639   789    ahrens 	    zfs_dq_hexname(obj_name, zp->z_id), tx);
    640   789    ahrens 	ASSERT3U(error, ==, 0);
    641   789    ahrens 
    642   789    ahrens 	zfs_znode_delete(zp, tx);
    643   789    ahrens 
    644   789    ahrens 	dmu_tx_commit(tx);
    645   789    ahrens 
    646   789    ahrens 	if (xzp)
    647   789    ahrens 		VN_RELE(ZTOV(xzp));
    648   789    ahrens }
    649   789    ahrens 
    650   789    ahrens /*
    651   789    ahrens  * Link zp into dl.  Can only fail if zp has been reaped.
    652   789    ahrens  */
    653   789    ahrens int
    654   789    ahrens zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
    655   789    ahrens {
    656   789    ahrens 	znode_t *dzp = dl->dl_dzp;
    657   789    ahrens 	vnode_t *vp = ZTOV(zp);
    658   789    ahrens 	int zp_is_dir = (vp->v_type == VDIR);
    659   789    ahrens 	int error;
    660   789    ahrens 
    661   789    ahrens 	dmu_buf_will_dirty(zp->z_dbuf, tx);
    662   789    ahrens 	mutex_enter(&zp->z_lock);
    663   789    ahrens 
    664   789    ahrens 	if (!(flag & ZRENAMING)) {
    665   789    ahrens 		if (zp->z_reap) {	/* no new links to reaped zp */
    666   789    ahrens 			ASSERT(!(flag & (ZNEW | ZEXISTS)));
    667   789    ahrens 			mutex_exit(&zp->z_lock);
    668   789    ahrens 			return (ENOENT);
    669   789    ahrens 		}
    670   789    ahrens 		zp->z_phys->zp_links++;
    671   789    ahrens 	}
    672   789    ahrens 	zp->z_phys->zp_parent = dzp->z_id;	/* dzp is now zp's parent */
    673   789    ahrens 
    674   789    ahrens 	if (!(flag & ZNEW))
    675   789    ahrens 		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
    676   789    ahrens 	mutex_exit(&zp->z_lock);
    677   789    ahrens 
    678   789    ahrens 	dmu_buf_will_dirty(dzp->z_dbuf, tx);
    679   789    ahrens 	mutex_enter(&dzp->z_lock);
    680   789    ahrens 	dzp->z_phys->zp_size++;			/* one dirent added */
    681   789    ahrens 	dzp->z_phys->zp_links += zp_is_dir;	/* ".." link from zp */
    682   789    ahrens 	zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
    683   789    ahrens 	mutex_exit(&dzp->z_lock);
    684   789    ahrens 
    685   789    ahrens 	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
    686   789    ahrens 	    8, 1, &zp->z_id, tx);
    687   789    ahrens 	ASSERT(error == 0);
    688   789    ahrens 
    689  1484  ek110237 	dnlc_update(ZTOV(dzp), dl->dl_name, vp);
    690  1484  ek110237 
    691   789    ahrens 	return (0);
    692   789    ahrens }
    693   789    ahrens 
    694   789    ahrens /*
    695   789    ahrens  * Unlink zp from dl, and mark zp for reaping if this was the last link.
    696   789    ahrens  * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
    697   789    ahrens  * If 'reaped_ptr' is NULL, we put reaped znodes on the delete queue.
    698   789    ahrens  * If it's non-NULL, we use it to indicate whether the znode needs reaping,
    699   789    ahrens  * and it's the caller's job to do it.
    700   789    ahrens  */
    701   789    ahrens int
    702   789    ahrens zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
    703   789    ahrens 	int *reaped_ptr)
    704   789    ahrens {
    705   789    ahrens 	znode_t *dzp = dl->dl_dzp;
    706   789    ahrens 	vnode_t *vp = ZTOV(zp);
    707   789    ahrens 	int zp_is_dir = (vp->v_type == VDIR);
    708   789    ahrens 	int reaped = 0;
    709   789    ahrens 	int error;
    710  1484  ek110237 
    711  1484  ek110237 	dnlc_remove(ZTOV(dzp), dl->dl_name);
    712   789    ahrens 
    713   789    ahrens 	if (!(flag & ZRENAMING)) {
    714   789    ahrens 		dmu_buf_will_dirty(zp->z_dbuf, tx);
    715   789    ahrens 
    716   789    ahrens 		if (vn_vfswlock(vp))		/* prevent new mounts on zp */
    717   789    ahrens 			return (EBUSY);
    718   789    ahrens 
    719   789    ahrens 		if (vn_ismntpt(vp)) {		/* don't remove mount point */
    720   789    ahrens 			vn_vfsunlock(vp);
    721   789    ahrens 			return (EBUSY);
    722   789    ahrens 		}
    723   789    ahrens 
    724   789    ahrens 		mutex_enter(&zp->z_lock);
    725   789    ahrens 		if (zp_is_dir && !zfs_dirempty(zp)) {	/* dir not empty */
    726   789    ahrens 			mutex_exit(&zp->z_lock);
    727   789    ahrens 			vn_vfsunlock(vp);
    728   789    ahrens 			return (EEXIST);
    729   789    ahrens 		}
    730   789    ahrens 		ASSERT(zp->z_phys->zp_links > zp_is_dir);
    731   789    ahrens 		if (--zp->z_phys->zp_links == zp_is_dir) {
    732   789    ahrens 			zp->z_reap = 1;
    733   789    ahrens 			zp->z_phys->zp_links = 0;
    734   789    ahrens 			reaped = 1;
    735   789    ahrens 		} else {
    736   789    ahrens 			zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
    737   789    ahrens 		}
    738   789    ahrens 		mutex_exit(&zp->z_lock);
    739   789    ahrens 		vn_vfsunlock(vp);
    740   789    ahrens 	}
    741   789    ahrens 
    742   789    ahrens 	dmu_buf_will_dirty(dzp->z_dbuf, tx);
    743   789    ahrens 	mutex_enter(&dzp->z_lock);
    744   789    ahrens 	dzp->z_phys->zp_size--;			/* one dirent removed */
    745   789    ahrens 	dzp->z_phys->zp_links -= zp_is_dir;	/* ".." link from zp */
    746   789    ahrens 	zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
    747   789    ahrens 	mutex_exit(&dzp->z_lock);
    748   789    ahrens 
    749   789    ahrens 	error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx);
    750   789    ahrens 	ASSERT(error == 0);
    751   789    ahrens 
    752   789    ahrens 	if (reaped_ptr != NULL)
    753   789    ahrens 		*reaped_ptr = reaped;
    754   789    ahrens 	else if (reaped)
    755   789    ahrens 		zfs_dq_add(zp, tx);
    756   789    ahrens 
    757   789    ahrens 	return (0);
    758   789    ahrens }
    759   789    ahrens 
    760   789    ahrens /*
    761   789    ahrens  * Indicate whether the directory is empty.  Works with or without z_lock
    762   789    ahrens  * held, but can only be consider a hint in the latter case.  Returns true
    763   789    ahrens  * if only "." and ".." remain and there's no work in progress.
    764   789    ahrens  */
    765   789    ahrens boolean_t
    766   789    ahrens zfs_dirempty(znode_t *dzp)
    767   789    ahrens {
    768   789    ahrens 	return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
    769   789    ahrens }
    770   789    ahrens 
    771   789    ahrens int
    772   789    ahrens zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
    773   789    ahrens {
    774   789    ahrens 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    775   789    ahrens 	znode_t *xzp;
    776   789    ahrens 	dmu_tx_t *tx;
    777   789    ahrens 	uint64_t xoid;
    778   789    ahrens 	int error;
    779   789    ahrens 
    780   789    ahrens 	*xvpp = NULL;
    781   789    ahrens 
    782   789    ahrens 	if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr))
    783   789    ahrens 		return (error);
    784   789    ahrens 
    785   789    ahrens 	tx = dmu_tx_create(zfsvfs->z_os);
    786   789    ahrens 	dmu_tx_hold_bonus(tx, zp->z_id);
    787  1544  eschrock 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
    788   789    ahrens 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
    789   789    ahrens 	if (error) {
    790  2113    ahrens 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
    791  2113    ahrens 			dmu_tx_wait(tx);
    792   789    ahrens 		dmu_tx_abort(tx);
    793   789    ahrens 		return (error);
    794   789    ahrens 	}
    795   789    ahrens 	zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0);
    796   789    ahrens 	ASSERT(xzp->z_id == xoid);
    797   789    ahrens 	ASSERT(xzp->z_phys->zp_parent == zp->z_id);
    798   789    ahrens 	dmu_buf_will_dirty(zp->z_dbuf, tx);
    799   789    ahrens 	zp->z_phys->zp_xattr = xoid;
    800   789    ahrens 
    801   789    ahrens 	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "");
    802   789    ahrens 	dmu_tx_commit(tx);
    803   789    ahrens 
    804   789    ahrens 	*xvpp = ZTOV(xzp);
    805   789    ahrens 
    806   789    ahrens 	return (0);
    807   789    ahrens }
    808   789    ahrens 
    809   789    ahrens /*
    810   789    ahrens  * Return a znode for the extended attribute directory for zp.
    811   789    ahrens  * ** If the directory does not already exist, it is created **
    812   789    ahrens  *
    813   789    ahrens  *	IN:	zp	- znode to obtain attribute directory from
    814   789    ahrens  *		cr	- credentials of caller
    815   789    ahrens  *
    816   789    ahrens  *	OUT:	xzpp	- pointer to extended attribute znode
    817   789    ahrens  *
    818   789    ahrens  *	RETURN:	0 on success
    819   789    ahrens  *		error number on failure
    820   789    ahrens  */
    821   789    ahrens int
    822   789    ahrens zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr)
    823   789    ahrens {
    824   789    ahrens 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
    825   789    ahrens 	znode_t		*xzp;
    826   789    ahrens 	zfs_dirlock_t	*dl;
    827   789    ahrens 	vattr_t		va;
    828   789    ahrens 	int		error;
    829   789    ahrens top:
    830   789    ahrens 	error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR);
    831   789    ahrens 	if (error)
    832   789    ahrens 		return (error);
    833   789    ahrens 
    834   789    ahrens 	if (xzp != NULL) {
    835   789    ahrens 		*xvpp = ZTOV(xzp);
    836   789    ahrens 		zfs_dirent_unlock(dl);
    837   789    ahrens 		return (0);
    838   789    ahrens 	}
    839   789    ahrens 
    840   789    ahrens 	ASSERT(zp->z_phys->zp_xattr == 0);
    841   789    ahrens 
    842   789    ahrens 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
    843   789    ahrens 		zfs_dirent_unlock(dl);
    844   789    ahrens 		return (EROFS);
    845   789    ahrens 	}
    846   789    ahrens 
    847   789    ahrens 	/*
    848   789    ahrens 	 * The ability to 'create' files in an attribute
    849   789    ahrens 	 * directory comes from the write_xattr permission on the base file.
    850   789    ahrens 	 *
    851   789    ahrens 	 * The ability to 'search' an attribute directory requires
    852   789    ahrens 	 * read_xattr permission on the base file.
    853   789    ahrens 	 *
    854   789    ahrens 	 * Once in a directory the ability to read/write attributes
    855   789    ahrens 	 * is controlled by the permissions on the attribute file.
    856   789    ahrens 	 */
    857   789    ahrens 	va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
    858   789    ahrens 	va.va_type = VDIR;
    859  1231     marks 	va.va_mode = S_IFDIR | S_ISVTX | 0777;
    860   789    ahrens 	va.va_uid = (uid_t)zp->z_phys->zp_uid;
    861   789    ahrens 	va.va_gid = (gid_t)zp->z_phys->zp_gid;
    862   789    ahrens 
    863   789    ahrens 	error = zfs_make_xattrdir(zp, &va, xvpp, cr);
    864   789    ahrens 	zfs_dirent_unlock(dl);
    865   789    ahrens 
    866   789    ahrens 	if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
    867  2113    ahrens 		/* NB: we already did dmu_tx_wait() if necessary */
    868   789    ahrens 		goto top;
    869   789    ahrens 	}
    870   789    ahrens 
    871   789    ahrens 	return (error);
    872   789    ahrens }
    873   789    ahrens 
    874   789    ahrens /*
    875   789    ahrens  * Decide whether it is okay to remove within a sticky directory.
    876   789    ahrens  *
    877   789    ahrens  * In sticky directories, write access is not sufficient;
    878   789    ahrens  * you can remove entries from a directory only if:
    879   789    ahrens  *
    880   789    ahrens  *	you own the directory,
    881   789    ahrens  *	you own the entry,
    882   789    ahrens  *	the entry is a plain file and you have write access,
    883   789    ahrens  *	or you are privileged (checked in secpolicy...).
    884   789    ahrens  *
    885   789    ahrens  * The function returns 0 if remove access is granted.
    886   789    ahrens  */
    887   789    ahrens int
    888   789    ahrens zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
    889   789    ahrens {
    890   789    ahrens 	uid_t  		uid;
    891   789    ahrens 
    892   789    ahrens 	if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL)	/* ZIL replay */
    893   789    ahrens 		return (0);
    894   789    ahrens 
    895   789    ahrens 	if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 ||
    896   789    ahrens 	    (uid = crgetuid(cr)) == zdp->z_phys->zp_uid ||
    897   789    ahrens 	    uid == zp->z_phys->zp_uid ||
    898   789    ahrens 	    (ZTOV(zp)->v_type == VREG &&
    899   789    ahrens 	    zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0))
    900   789    ahrens 		return (0);
    901   789    ahrens 	else
    902   789    ahrens 		return (secpolicy_vnode_remove(cr));
    903   789    ahrens }
    904