Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 /*
     40  * Directory manipulation routines.
     41  *
     42  * When manipulating directories, the i_rwlock provides serialization
     43  * since directories cannot be mmapped. The i_contents lock is redundant.
     44  */
     45 
     46 #include <sys/types.h>
     47 #include <sys/t_lock.h>
     48 #include <sys/param.h>
     49 #include <sys/systm.h>
     50 #include <sys/signal.h>
     51 #include <sys/cred.h>
     52 #include <sys/proc.h>
     53 #include <sys/disp.h>
     54 #include <sys/user.h>
     55 #include <sys/vfs.h>
     56 #include <sys/vnode.h>
     57 #include <sys/stat.h>
     58 #include <sys/mode.h>
     59 #include <sys/buf.h>
     60 #include <sys/uio.h>
     61 #include <sys/dnlc.h>
     62 #include <sys/fs/ufs_inode.h>
     63 #include <sys/fs/ufs_fs.h>
     64 #include <sys/mount.h>
     65 #include <sys/fs/ufs_fsdir.h>
     66 #include <sys/fs/ufs_trans.h>
     67 #include <sys/fs/ufs_panic.h>
     68 #include <sys/fs/ufs_quota.h>
     69 #include <sys/errno.h>
     70 #include <sys/debug.h>
     71 #include <vm/seg.h>
     72 #include <sys/sysmacros.h>
     73 #include <sys/cmn_err.h>
     74 #include <sys/cpuvar.h>
     75 #include <sys/unistd.h>
     76 #include <sys/policy.h>
     77 
     78 /*
     79  * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
     80  */
     81 #if !ISP2(DIRBLKSIZ)
     82 #error	"DIRBLKSIZ not a power of 2"
     83 #endif
     84 
     85 /*
     86  * A virgin directory.
         *
         * Template holding the two entries "." and "..".  The two record
         * lengths given here (12 and DIRBLKSIZ - 12) add up to exactly one
         * DIRBLKSIZ, so together the entries span a whole directory block.
         *
         * NOTE(review): struct dirtemplate is declared outside this file; the
         * positional initializers presumably map, per entry, to (inode number,
         * record length, name length, name), with the inode numbers left as 0
         * to be filled in by whoever copies the template -- confirm against
         * the structure definition.
     87  */
     88 static struct dirtemplate mastertemplate = {
     89 	0, 12, 1, ".",
     90 	0, DIRBLKSIZ - 12, 2, ".."
     91 };
     92 
     93 #define	LDIRSIZ(len) \
     94 	((sizeof (struct direct) - (MAXNAMLEN + 1)) + ((len + 1 + 3) &~ 3))
     95 #define	MAX_DIR_NAME_LEN(len) \
     96 	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)
     97 
     98 /*
     99  * The dnlc directory cache allows a 64 bit handle for directory entries.
    100  * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
    101  * into the handle. Note, a 32 bit offset allows a 4GB directory, which
    102  * is way beyond what could be cached in memory by the directory
    103  * caching routines. So we are quite safe with this limit.
    104  * The macros below pack and unpack the handle.
    105  */
    106 #define	H_TO_INO(h) (uint32_t)((h) & UINT_MAX)
    107 #define	H_TO_OFF(h) (off_t)((h) >> 32)
    108 #define	INO_OFF_TO_H(ino, off) (uint64_t)(((uint64_t)(off) << 32) | (ino))
    109 
    110 /*
    111  * The average size of a typical on disk directory entry is about 16 bytes
    112  * and so defines AV_DIRECT_SHIFT : log2(16)
    113  * This define is only used to approximate the number of entries
    114  * in a directory. This is needed for dnlc_dir_start() which will immediately
    115  * return an error if the value is not within its acceptable range of
    116  * number of files in a directory.
    117  */
    118 #define	AV_DIRECT_SHIFT 4
    119 /*
    120  * If the directory size (from i_size) is greater than or equal to the
    121  * ufs_min_dir_cache tunable then we request dnlc directory caching.
    122  * This has been found to be profitable after 1024 file names.
         * The default is therefore 1024 average-sized entries expressed in bytes.
    123  */
    124 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT;
    125 
    126 /*
         * The time point the dnlc directory caching was disabled.
         * This is a single file-wide timestamp (gethrtime() units), shared by
         * every directory whose caching was turned off for lack of memory.
         */
    127 static hrtime_t ufs_dc_disable_at;
    128 /*
         * directory caching disable duration: once this much time has passed
         * since ufs_dc_disable_at, a lookup re-enables caching on a directory
         * that was disabled for lack of memory.  Presumably five seconds,
         * NANOSEC being the number of nanoseconds per second.
         */
    129 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5;
    130 
        /*
         * dirchk: when non-zero, every directory entry scanned by a lookup is
         * run through the full dirmangled() validation rather than only the
         * entries whose record length looks misaligned.  On by default only in
         * DEBUG builds because the full checks are slow.
         */
    131 #ifdef DEBUG
    132 int dirchk = 1;
    133 #else /* !DEBUG */
    134 int dirchk = 0;
    135 #endif /* DEBUG */
        /*
         * ufs_negative_cache: when non-zero, a failed lookup in a directory
         * that still has links enters a DNLC_NO_VNODE entry into the dnlc.
         */
    136 int ufs_negative_cache = 1;
        /*
         * NOTE(review): not referenced in the part of the file shown here;
         * by its name, presumably a count of retries in ufs_dirremove().
         */
    137 uint64_t ufs_dirremove_retry_cnt;
    138 
        /*
         * Forward declarations of the file-local helpers.  They are declared
         * old-style, without parameter lists, so the compiler performs no
         * argument checking on calls made through these declarations.
         */
    139 static void dirbad();
    140 static int ufs_dirrename();
    141 static int ufs_diraddentry();
    142 static int ufs_dirempty();
    143 static int ufs_dirscan();
    144 static int ufs_dirclrdotdot();
    145 static int ufs_dirfixdotdot();
    146 static int ufs_dirpurgedotdot();
    147 static int dirprepareentry();
    148 static int ufs_dirmakedirect();
    149 static int dirbadname();
    150 static int dirmangled();
    151 
    152 /*
    153  * Check accessibility of directory against inquired mode and type.
    154  * Execute access is required to search the directory.
    155  * Access for write is interpreted as allowing
    156  * deletion of files in the directory.
    157  * Note, the reader i_contents lock will be acquired in
    158  * ufs_iaccess().
    159  */
    160 int
    161 ufs_diraccess(struct inode *ip, int mode, struct cred *cr)
    162 {
    163 	if (((ip->i_mode & IFMT) != IFDIR) &&
    164 	    ((ip->i_mode & IFMT) != IFATTRDIR))
    165 		return (ENOTDIR);
    166 
    167 	return (ufs_iaccess(ip, mode, cr, 1));
    168 }
    169 
    170 /*
    171  * Look for a given name in a directory.  On successful return, *ipp
    172  * will point to the VN_HELD inode.
    173  * The caller is responsible for checking accessibility upfront
    174  * via ufs_diraccess().
         *
         * Arguments:
         *	dp	 - inode of the directory to search
         *	namep	 - component name; asserted to be non-empty
         *	ipp	 - receives the held inode of the matching entry
         *	cr	 - credentials passed on to ufs_iget_alloced()
         *	skipdnlc - non-zero to bypass the per-name dnlc_lookup()
         *
         * Returns 0 on success, ENOENT when no entry matches, EAGAIN when
         * indeadlock is set after a ufs_tryirwlock() attempt, or the error
         * reported by blkatoff() or ufs_iget_alloced().
         *
         * The lookup proceeds in three stages:
         *   1. dnlc_lookup() on (directory vnode, name), unless skipdnlc.
         *   2. dnlc_dir_lookup() on the directory's cache anchor, with
         *	dp->i_rwlock taken as reader.
         *   3. A linear scan of the directory blocks.  If the directory is at
         *	least ufs_min_dir_cache bytes and caching is enabled for it, the
         *	scan starts at offset 0, does not stop at the match, and feeds
         *	every entry and free-space run to the dnlc directory cache.
         *	Otherwise the scan starts at dp->i_diroff and wraps around to
         *	the beginning in a second pass.
         *
         * For ".." the i_rwlock is dropped around ufs_iget_alloced() and the
         * result is revalidated after the lock has been re-taken.  Every path
         * that returns with the lock acquired releases it first.
    175  */
    176 int
    177 ufs_dirlook(
    178 	struct inode *dp,
    179 	char *namep,
    180 	struct inode **ipp,
    181 	struct cred *cr,
    182 	int skipdnlc)			/* skip the 1st level dnlc */
    183 {
    184 	uint64_t handle;
    185 	struct fbuf *fbp;		/* a buffer of directory entries */
    186 	struct direct *ep;		/* the current directory entry */
    187 	struct vnode *vp;
    188 	struct vnode *dvp;		/* directory vnode ptr */
    189 	struct ulockfs *ulp;
    190 	dcanchor_t *dcap;
    191 	off_t endsearch;		/* offset to end directory search */
    192 	off_t offset;
    193 	off_t start_off;		/* starting offset from middle search */
    194 	off_t last_offset;		/* last offset */
    195 	int entryoffsetinblock;		/* offset of ep in addr's buffer */
    196 	int numdirpasses;		/* strategy for directory search */
    197 	int namlen;			/* length of name */
    198 	int err;
    199 	int doingchk;
    200 	int i;
    201 	int caching;
    202 	int indeadlock;
    203 	ino_t ep_ino;			/* entry i number */
    204 	ino_t chkino;
    205 	ushort_t ep_reclen;		/* direct local d_reclen */
    206 
    207 	ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */
    208 
        	/*
        	 * NOTE(review): ulp is assigned here and indeadlock is only ever
        	 * tested in this function; presumably the ufs_tryirwlock() macro
        	 * reads ulp and sets indeadlock -- confirm against the macro's
        	 * definition.  ulp is left uninitialized if dp->i_ufsvfs is NULL.
        	 */
    209 	if (dp->i_ufsvfs)
    210 		ulp = &dp->i_ufsvfs->vfs_ulockfs;
    211 
    212 	/*
    213 	 * Check the directory name lookup cache, first for individual files
    214 	 * then for complete directories.
    215 	 */
    216 	dvp = ITOV(dp);
    217 	if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
    218 		/* vp is already held from dnlc_lookup */
    219 		if (vp == DNLC_NO_VNODE) {
        			/* negative entry: give back the hold and fail */
    220 			VN_RELE(vp);
    221 			return (ENOENT);
    222 		}
    223 		*ipp = VTOI(vp);
    224 		return (0);
    225 	}
    226 
    227 	dcap = &dp->i_danchor;
    228 
    229 	/*
    230 	 * Grab the reader lock on the directory data before checking
    231 	 * the dnlc to avoid a race with ufs_dirremove() & friends.
    232 	 *
    233 	 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
    234 	 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
    235 	 * possible, retries the operation.
    236 	 */
    237 	ufs_tryirwlock((&dp->i_rwlock), RW_READER, retry_dircache);
    238 	if (indeadlock)
    239 		return (EAGAIN);
    240 
    241 	switch (dnlc_dir_lookup(dcap, namep, &handle)) {
    242 	case DFOUND:
        		/* the low 32 bits of the handle carry the inumber */
    243 		ep_ino = (ino_t)H_TO_INO(handle);
    244 		if (dp->i_number == ep_ino) {
    245 			VN_HOLD(dvp);	/* want ourself, "." */
    246 			*ipp = dp;
    247 			rw_exit(&dp->i_rwlock);
    248 			return (0);
    249 		}
    250 		if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
    251 			uint64_t handle2;
    252 			/*
    253 			 * release the lock on the dir we are searching
    254 			 * to avoid a deadlock when grabbing the
    255 			 * i_contents lock in ufs_iget_alloced().
    256 			 */
    257 			rw_exit(&dp->i_rwlock);
    258 			rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
    259 			err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
    260 			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
    261 			/*
    262 			 * must recheck as we dropped dp->i_rwlock
    263 			 */
    264 			ufs_tryirwlock(&dp->i_rwlock, RW_READER, retry_parent);
    265 			if (indeadlock) {
        				/* drop the inode we managed to get */
    266 				if (!err)
    267 					VN_RELE(ITOV(*ipp));
    268 				return (EAGAIN);
    269 			}
        			/*
        			 * Accept the inode only if the cache still maps
        			 * ".." to the very same handle as before.
        			 */
    270 			if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
    271 			    == DFOUND) && (handle == handle2)) {
    272 				dnlc_update(dvp, namep, ITOV(*ipp));
    273 				rw_exit(&dp->i_rwlock);
    274 				return (0);
    275 			}
    276 			/* check failed, read the actual directory */
    277 			if (!err) {
    278 				VN_RELE(ITOV(*ipp));
    279 			}
    280 			goto restart;
    281 		}
    282 		/* usual case of not "." nor ".." */
    283 		rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
    284 		err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
    285 		rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
    286 		if (err) {
    287 			rw_exit(&dp->i_rwlock);
    288 			return (err);
    289 		}
    290 		dnlc_update(dvp, namep, ITOV(*ipp));
    291 		rw_exit(&dp->i_rwlock);
    292 		return (0);
    293 	case DNOENT:
        		/* negative entries are only added while dp still has links */
    294 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
    295 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
    296 		}
    297 		rw_exit(&dp->i_rwlock);
    298 		return (ENOENT);
    299 	default:
        		/* any other answer: scan the directory itself */
    300 		break;
    301 	}
        	/*
        	 * The scan state is (re)initialized here; every jump to this
        	 * label is made with dp->i_rwlock held as reader.
        	 */
    302 restart:
    303 
    304 	fbp = NULL;
    305 	doingchk = 0;
    306 	chkino = 0;
    307 	caching = 0;
    308 
    309 	/*
    310 	 * Attempt to cache any directories greater than the tunable
    311 	 * ufs_min_dir_cache. If it fails due to memory shortage (DNOMEM),
    312 	 * disable caching for this directory and record the system time.
    313 	 * Any attempt after the disable time has expired will enable
    314 	 * the caching again.
    315 	 */
    316 	if (dp->i_size >= ufs_min_dir_cache) {
    317 		/*
    318 		 * if the directory caching disable time has expired
    319 		 * enable the caching again.
    320 		 */
    321 		if (dp->i_cachedir == CD_DISABLED_NOMEM &&
    322 		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
    323 			ufs_dc_disable_at = 0;
    324 			dp->i_cachedir = CD_ENABLED;
    325 		}
    326 		if (dp->i_cachedir == CD_ENABLED) {
        			/* entry count estimate: size / 2^AV_DIRECT_SHIFT */
    327 			switch (dnlc_dir_start(dcap, dp->i_size >>
    328 			    AV_DIRECT_SHIFT)) {
    329 			case DNOMEM:
    330 				dp->i_cachedir = CD_DISABLED_NOMEM;
    331 				ufs_dc_disable_at = gethrtime();
    332 				break;
    333 			case DTOOBIG:
    334 				dp->i_cachedir = CD_DISABLED_TOOBIG;
    335 				break;
    336 			case DOK:
    337 				caching = 1;
    338 				break;
    339 			default:
    340 				break;
    341 			}
    342 		}
    343 	}
    344 	/*
    345 	 * If caching we don't stop when the file has been
    346 	 * found, but need to know later, so clear *ipp now
    347 	 */
    348 	*ipp = NULL;
    349 
        	/*
        	 * Also the re-entry point for the ".." re-scan (doingchk set),
        	 * which is reached with caching already turned off.
        	 */
    350 recheck:
    351 	if (caching) {
        		/* the cache must see every entry: one pass from the top */
    352 		offset = 0;
    353 		entryoffsetinblock = 0;
    354 		numdirpasses = 1;
    355 	} else {
    356 		/*
    357 		 * Take care to look at dp->i_diroff only once, as it
    358 		 * may be changing due to other threads/cpus.
    359 		 */
    360 		offset = dp->i_diroff;
    361 		if (offset > dp->i_size) {
    362 			offset = 0;
    363 		}
    364 		if (offset == 0) {
    365 			entryoffsetinblock = 0;
    366 			numdirpasses = 1;
    367 		} else {
    368 			start_off = offset;
    369 
        			/*
        			 * Starting inside a block: the loop below only
        			 * maps a block at a block boundary, so map this
        			 * one now.
        			 */
    370 			entryoffsetinblock = blkoff(dp->i_fs, offset);
    371 			if (entryoffsetinblock != 0) {
    372 				err = blkatoff(dp, offset, (char **)0, &fbp);
    373 				if (err)
    374 					goto bad;
    375 			}
    376 			numdirpasses = 2;
    377 		}
    378 	}
        	/* search up to the end of the last DIRBLKSIZ chunk in use */
    379 	endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
    380 	namlen = strlen(namep);
    381 	last_offset = 0;
    382 
    383 searchloop:
    384 	while (offset < endsearch) {
    385 		/*
    386 		 * If offset is on a block boundary,
    387 		 * read the next directory block.
    388 		 * Release previous if it exists.
    389 		 */
    390 		if (blkoff(dp->i_fs, offset) == 0) {
    391 			if (fbp != NULL) {
    392 				fbrelse(fbp, S_OTHER);
    393 			}
    394 			err = blkatoff(dp, offset, (char **)0, &fbp);
    395 			if (err)
    396 				goto bad;
    397 			entryoffsetinblock = 0;
    398 		}
    399 
    400 		/*
    401 		 * If the offset to the next entry is invalid or if the
    402 		 * next entry is a zero length record or if the record
    403 		 * length is invalid, then skip to the next directory
    404 		 * block.  Complete validation checks are done if the
    405 		 * record length is invalid.
    406 		 *
    407 		 * Full validation checks are slow so they are disabled
    408 		 * by default.  Complete checks can be run by patching
    409 		 * "dirchk" to be true.
    410 		 *
    411 		 * We have to check the validity of entryoffsetinblock
    412 		 * here because it can be set to i_diroff above.
    413 		 */
    414 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
    415 		if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
    416 		    (dirchk || (ep->d_reclen & 0x3)) &&
    417 		    dirmangled(dp, ep, entryoffsetinblock, offset)) {
        			/* i = bytes left up to the next DIRBLKSIZ boundary */
    418 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
    419 			offset += i;
    420 			entryoffsetinblock += i;
        			/* a damaged chunk makes the cache incomplete */
    421 			if (caching) {
    422 				dnlc_dir_purge(dcap);
    423 				caching = 0;
    424 			}
    425 			continue;
    426 		}
    427 
    428 		ep_reclen = ep->d_reclen;
    429 
    430 		/*
    431 		 * Add named entries and free space into the directory cache
    432 		 */
    433 		if (caching) {
    434 			ushort_t extra;
    435 			off_t off2;
    436 
    437 			if (ep->d_ino == 0) {
        				/* unused record: all of it is free space */
    438 				extra = ep_reclen;
        				/*
        				 * An unused record that does not start a
        				 * DIRBLKSIZ chunk turns caching off for
        				 * this directory altogether.
        				 */
    439 				if (offset & (DIRBLKSIZ - 1)) {
    440 					dnlc_dir_purge(dcap);
    441 					dp->i_cachedir = CD_DISABLED;
    442 					caching = 0;
    443 				}
    444 			} else {
    445 				/*
    446 				 * entries hold the previous offset except the
    447 				 * 1st which holds the offset + 1
    448 				 */
    449 				if (offset & (DIRBLKSIZ - 1)) {
    450 					off2 = last_offset;
    451 				} else {
    452 					off2 = offset + 1;
    453 				}
    454 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
    455 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
        				/* slack behind the name is free space too */
    456 				extra = ep_reclen - DIRSIZ(ep);
    457 			}
        			/* only record space that could hold a 1 char name */
    458 			if (caching && (extra >= LDIRSIZ(1))) {
    459 				caching = (dnlc_dir_add_space(dcap, extra,
    460 				    (uint64_t)offset) == DOK);
    461 			}
    462 		}
    463 
    464 		/*
    465 		 * Check for a name match.
    466 		 * We have the parent inode read locked with i_rwlock.
    467 		 */
    468 		if (ep->d_ino && ep->d_namlen == namlen &&
    469 		    *namep == *ep->d_name &&	/* fast chk 1st chr */
    470 		    bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {
    471 
    472 			/*
    473 			 * We have to release the fbp early here to avoid
    474 			 * a possible deadlock situation where we have the
    475 			 * fbp and want the directory inode and someone doing
    476 			 * a ufs_direnter_* has the directory inode and wants
    477 			 * the fbp.  XXX - is this still needed?
    478 			 */
        			/* ep points into fbp: copy the inumber out first */
    479 			ep_ino = (ino_t)ep->d_ino;
    480 			ASSERT(fbp != NULL);
    481 			fbrelse(fbp, S_OTHER);
    482 			fbp = NULL;
    483 
    484 			/*
    485 			 * Atomic update (read lock held)
    486 			 */
    487 			dp->i_diroff = offset;
    488 
    489 			if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
    490 				struct timeval32 omtime;
    491 
    492 				if (caching) {
    493 					dnlc_dir_purge(dcap);
    494 					caching = 0;
    495 				}
    496 				if (doingchk) {
    497 					/*
    498 					 * if the inumber didn't change
    499 					 * continue with already found inode.
    500 					 */
    501 					if (ep_ino == chkino)
    502 						goto checkok;
    503 					else {
    504 						VN_RELE(ITOV(*ipp));
    505 						/* *ipp is nulled at restart */
    506 						goto restart;
    507 					}
    508 				}
    509 				/*
    510 				 * release the lock on the dir we are searching
    511 				 * to avoid a deadlock when grabbing the
    512 				 * i_contents lock in ufs_iget_alloced().
    513 				 */
        				/* remember mtime to detect changes meanwhile */
    514 				omtime = dp->i_mtime;
    515 				rw_exit(&dp->i_rwlock);
    516 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
    517 				    RW_READER);
    518 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
    519 				    cr);
    520 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
    521 				ufs_tryirwlock(&dp->i_rwlock, RW_READER,
    522 				    retry_disk);
    523 				if (indeadlock) {
    524 					if (!err)
    525 						VN_RELE(ITOV(*ipp));
    526 					return (EAGAIN);
    527 				}
    528 				if (err)
    529 					goto bad;
    530 				/*
    531 				 * Since we released the lock on the directory,
    532 				 * we must check that the same inode is still
    533 				 * the ".." entry for this directory.
    534 				 */
    535 				/*CSTYLED*/
    536 				if (timercmp(&omtime, &dp->i_mtime, !=)) {
    537 					/*
    538 					 * Modification time changed on the
    539 					 * directory, we must go check if
    540 					 * the inumber changed for ".."
    541 					 */
    542 					doingchk = 1;
    543 					chkino = ep_ino;
    544 					entryoffsetinblock = 0;
    545 					if (caching) {
    546 						/*
    547 						 * Forget directory caching
    548 						 * for this rare case
    549 						 */
    550 						dnlc_dir_purge(dcap);
    551 						caching = 0;
    552 					}
    553 					goto recheck;
    554 				}
    555 			} else if (dp->i_number == ep_ino) {
    556 				VN_HOLD(dvp);	/* want ourself, "." */
    557 				*ipp = dp;
    558 				if (caching) {
    559 					dnlc_dir_purge(dcap);
    560 					caching = 0;
    561 				}
    562 			} else {
    563 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
    564 				    RW_READER);
    565 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
    566 				    cr);
    567 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
    568 				if (err)
    569 					goto bad;
    570 			}
    571 checkok:
    572 			ASSERT(*ipp);
    573 			dnlc_update(dvp, namep, ITOV(*ipp));
    574 			/*
    575 			 * If we are not caching then just return the entry
    576 			 * otherwise complete loading up the cache
    577 			 */
    578 			if (!caching) {
    579 				rw_exit(&dp->i_rwlock);
    580 				return (0);
    581 			}
        			/*
        			 * Still caching: map the block again, as it was
        			 * released above, so the scan can carry on.
        			 */
    582 			err = blkatoff(dp, offset, (char **)0, &fbp);
    583 			if (err)
    584 				goto bad;
    585 		}
    586 		last_offset = offset;
    587 		offset += ep_reclen;
    588 		entryoffsetinblock += ep_reclen;
    589 	}
    590 	/*
    591 	 * If we started in the middle of the directory and failed
    592 	 * to find our target, we must check the beginning as well.
    593 	 */
    594 	if (numdirpasses == 2) {
    595 		numdirpasses--;
    596 		offset = 0;
    597 		endsearch = start_off;
    598 		goto searchloop;
    599 	}
    600 
    601 	/*
    602 	 * If whole directory caching is on (or was originally on) then
    603 	 * the entry may have been found.
    604 	 */
    605 	if (*ipp == NULL) {
    606 		err = ENOENT;
    607 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
    608 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
    609 		}
    610 	}
        	/* the whole directory was scanned: the cache is now complete */
    611 	if (caching) {
    612 		dnlc_dir_complete(dcap);
    613 		caching = 0;
    614 	}
    615 
        	/* common exit; also reached by falling through on success */
    616 bad:
    617 	if (err && *ipp) {
    618 		/*
    619 		 * err and *ipp can both be set if we were attempting to
    620 		 * cache the directory, and we found the entry, then later
    621 		 * while trying to complete the directory cache encountered
    622 		 * a error (eg reading a directory sector).
    623 		 */
    624 		VN_RELE(ITOV(*ipp));
    625 		*ipp = NULL;
    626 	}
    627 
    628 	if (fbp)
    629 		fbrelse(fbp, S_OTHER);
    630 	rw_exit(&dp->i_rwlock);
        	/* caching is still set only on an error path: discard the cache */
    631 	if (caching)
    632 		dnlc_dir_purge(dcap);
    633 	return (err);
    634 }
    635 
    636 /*
    637  * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
         *
         * The caller must hold tdp->i_rwlock as writer (asserted below).
         *
         * `flags' carries two things: the IQUIET bit, which is set in
         * tdp->i_flag for the duration of the inode allocation (per the local
         * declaration, to suppress the out of inodes message), and any
         * remaining bits, which are passed to ufs_dircheckforname() as
         * "noentry" (per the parameter comment: no entry exists).
         *
         * Returns:
         *	0	the entry was made; *ipp is the new inode
         *	EINVAL	tdp is an attribute directory and either op is DE_MKDIR
         *		or vap->va_type is VCHR, VBLK, VDOOR, VSOCK or VFIFO
         *	EACCES	the name contains a '/'
         *	ENOENT	the target directory has no links left
         *	EEXIST	the name is already present; *ipp is the held inode of
         *		the existing entry (this includes "." and "..")
         *	EAGAIN	indeadlock was set after a ufs_tryirwlock() attempt, or
         *		ufs_dirlook() returned EAGAIN
         *	other	whatever ufs_diraccess(), ufs_dirlook(),
         *		ufs_dircheckforname(), ufs_iaccess(), ufs_dirmakeinode()
         *		or ufs_diraddentry() returned
         *
         * NOTE(review): on the "." / ".." path tdp->i_rwlock is dropped for the
         * lookup.  When ufs_dirlook() itself returns EAGAIN the function
         * returns without taking the lock again, so the caller's writer hold is
         * gone at that point -- confirm that callers expect this.
    638  */
    639 int
    640 ufs_direnter_cm(
    641 	struct inode *tdp,	/* target directory to make entry in */
    642 	char *namep,		/* name of entry */
    643 	enum de_op op,		/* entry operation */
    644 	struct vattr *vap,	/* attributes if new inode needed */
    645 	struct inode **ipp,	/* return entered inode here */
    646 	struct cred *cr,	/* user credentials */
    647 	int flags)		/* no entry exists */
    648 {
    649 	struct inode *tip;	/* inode of (existing) target file */
    650 	char *s;
    651 	struct ufs_slot slot;	/* slot info to pass around */
    652 	int namlen;		/* length of name */
    653 	int err;		/* error number */
    654 	struct inode *nip;	/* new inode */
    655 	int do_rele_nip = 0;	/* release nip */
    656 	int noentry = flags & ~IQUIET;
    657 	int quiet = flags & IQUIET;	/* Suppress out of inodes message */
    658 	int indeadlock;
    659 	struct ulockfs *ulp;
    660 
    661 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
    662 
        	/* attribute directories hold neither directories nor special files */
    663 	if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
    664 	    ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
    665 	    (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
    666 	    (vap->va_type == VFIFO))))
    667 		return (EINVAL);
    668 
    669 	/* don't allow '/' characters in pathname component */
        	/* the same pass over the name also computes namlen */
    670 	for (s = namep, namlen = 0; *s; s++, namlen++)
    671 		if (*s == '/')
    672 			return (EACCES);
    673 	ASSERT(namlen);
    674 
    675 	/*
    676 	 * Check accessibility of target directory.
    677 	 */
    678 	if (err = ufs_diraccess(tdp, IEXEC, cr))
    679 		return (err);
    680 
    681 	/*
    682 	 * If name is "." or ".." then if this is a create look it up
    683 	 * and return EEXIST.
    684 	 */
    685 	if (namep[0] == '.' &&
    686 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
    687 		/*
    688 		 * ufs_dirlook will acquire the i_rwlock
    689 		 */
        		/*
        		 * NOTE(review): ulp is not used by name anywhere else in
        		 * this function; presumably the ufs_tryirwlock() macro
        		 * reads it and sets indeadlock -- confirm.  It stays
        		 * uninitialized if tdp->i_ufsvfs is NULL.
        		 */
    690 		if (tdp->i_ufsvfs)
    691 			ulp = &tdp->i_ufsvfs->vfs_ulockfs;
    692 		rw_exit(&tdp->i_rwlock);
    693 		if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) {
        			/* returns with tdp->i_rwlock no longer held */
    694 			if (err == EAGAIN)
    695 				return (err);
    696 
    697 			/*
    698 			 * ufs_tryirwlock uses rw_tryenter and checks for
    699 			 * SLOCK to avoid i_rwlock, ufs_lockfs_begin deadlock.
    700 			 * If deadlock possible, retries the operation.
    701 			 */
    702 			ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry_err);
    703 			if (indeadlock)
    704 				return (EAGAIN);
    705 
    706 			return (err);
    707 		}
        		/* found: give the caller back its writer lock */
    708 		ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry);
    709 		if (indeadlock) {
        			/* cannot return the inode along with EAGAIN */
    710 			VN_RELE(ITOV(*ipp));
    711 			return (EAGAIN);
    712 		}
    713 		return (EEXIST);
    714 	}
    715 
    716 	/*
    717 	 * If target directory has not been removed, then we can consider
    718 	 * allowing file to be created.
    719 	 */
    720 	if (tdp->i_nlink <= 0) {
    721 		return (ENOENT);
    722 	}
    723 
    724 	/*
    725 	 * Search for the entry. Return VN_HELD tip if found.
    726 	 */
    727 	tip = NULL;
    728 	slot.fbp = NULL;
    729 	slot.status = NONE;
        	/* both locks are held from here until the "out" label */
    730 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
    731 	rw_enter(&tdp->i_contents, RW_WRITER);
    732 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
    733 	if (err)
    734 		goto out;
    735 	if (tip) {
        		/* the caller claimed no entry existed, yet one was found */
    736 		ASSERT(!noentry);
    737 		*ipp = tip;
    738 		err = EEXIST;
    739 	} else {
    740 		/*
    741 		 * The entry does not exist. Check write permission in
    742 		 * directory to see if entry can be created.
    743 		 */
        		/* i_contents is already held here, hence the final 0 */
    744 		if (err = ufs_iaccess(tdp, IWRITE, cr, 0))
    745 			goto out;
    746 		/*
    747 		 * Make new inode and directory entry.
    748 		 */
        		/* IQUIET, if requested, stays set until "out" clears it */
    749 		tdp->i_flag |= quiet;
    750 		if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
        			/*
        			 * NOTE(review): nip has no initializer, so this
        			 * test relies on ufs_dirmakeinode() always
        			 * storing through its second argument, even on
        			 * failure -- confirm.
        			 */
    751 			if (nip != NULL)
    752 				do_rele_nip = 1;
    753 			goto out;
    754 		}
    755 		if (err = ufs_diraddentry(tdp, namep, op,
    756 		    namlen, &slot, nip, NULL, cr)) {
    757 			/*
    758 			 * Unmake the inode we just made.
    759 			 */
    760 			rw_enter(&nip->i_contents, RW_WRITER);
        			/*
        			 * A new directory had cost the parent one link;
        			 * take that link back.
        			 */
    761 			if (((nip->i_mode & IFMT) == IFDIR) ||
    762 			    ((nip->i_mode & IFMT) == IFATTRDIR)) {
    763 				tdp->i_nlink--;
    764 				ufs_setreclaim(tdp);
    765 				tdp->i_flag |= ICHG;
    766 				tdp->i_seq++;
    767 				TRANS_INODE(tdp->i_ufsvfs, tdp);
    768 				ITIMES_NOLOCK(tdp);
    769 			}
        			/* no links left on the new inode itself */
    770 			nip->i_nlink = 0;
    771 			ufs_setreclaim(nip);
    772 			TRANS_INODE(nip->i_ufsvfs, nip);
    773 			nip->i_flag |= ICHG;
    774 			nip->i_seq++;
    775 			ITIMES_NOLOCK(nip);
    776 			rw_exit(&nip->i_contents);
    777 			do_rele_nip = 1;
    778 		} else {
    779 			*ipp = nip;
    780 		}
    781 	}
    782 
    783 out:
    784 	if (slot.fbp)
    785 		fbrelse(slot.fbp, S_OTHER);
    786 
        	/* harmless when IQUIET was never set above */
    787 	tdp->i_flag &= ~quiet;
    788 	rw_exit(&tdp->i_contents);
    789 
    790 	/*
    791 	 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
    792 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
    793 	 */
    794 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
    795 
    796 	if (do_rele_nip) {
    797 		VN_RELE(ITOV(nip));
    798 	}
    799 
    800 	return (err);
    801 }
    802 
    803 /*
    804  * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
    805  * If tvpp is non-null, return with the pointer to the target vnode.
    806  */
    807 int
    808 ufs_direnter_lr(
    809 	struct inode *tdp,	/* target directory to make entry in */
    810 	char *namep,		/* name of entry */
    811 	enum de_op op,		/* entry operation */
    812 	struct inode *sdp,	/* source inode parent if rename */
    813 	struct inode *sip,	/* source inode */
    814 	struct cred *cr,	/* user credentials */
    815 	vnode_t **tvpp)		/* Return: (held) vnode of (existing) target */
    816 {
    817 	struct inode *tip;	/* inode of (existing) target file */
    818 	char *s;
    819 	struct ufs_slot slot;	/* slot info to pass around */
    820 	int namlen;		/* length of name */
    821 	int err;		/* error number */
    822 
    823 	/* don't allow '/' characters in pathname component */
    824 	for (s = namep, namlen = 0; *s; s++, namlen++)
    825 		if (*s == '/')
    826 			return (EACCES);
    827 	ASSERT(namlen);
    828 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
    829 
    830 	/*
    831 	 * If name is "." or ".." then if this is a create look it up
    832 	 * and return EEXIST.  Rename or link TO "." or ".." is forbidden.
    833 	 */
    834 	if (namep[0] == '.' &&
    835 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
    836 		if (op == DE_RENAME) {
    837 			return (EINVAL);	/* *SIGH* should be ENOTEMPTY */
    838 		}
    839 		return (EEXIST);
    840 	}
    841 	/*
    842 	 * For link and rename lock the source entry and check the link count
    843 	 * to see if it has been removed while it was unlocked.  If not, we
    844 	 * increment the link count and force the inode to disk to make sure
    845 	 * that it is there before any directory entry that points to it.
    846 	 *
    847 	 * In the case of a symbolic link, we are dealing with a new inode
    848 	 * which does not yet have any links.  We've created it with a link
    849 	 * count of 1, and we don't want to increment it since this will be
    850 	 * its first link.
    851 	 *
    852 	 * We are about to push the inode to disk. We make sure
    853 	 * that the inode's data blocks are flushed first so the
    854 	 * inode and its data blocks are always in sync.  This
    855 	 * adds some robustness in the event of a power failure
    856 	 * or panic where sync fails. If we panic before the
    857 	 * inode is updated, then the inode still refers to the
    858 	 * old data blocks (or none for a new file). If we panic
    859 	 * after the inode is updated, then the inode refers to
    860 	 * the new data blocks.
    861 	 *
    862 	 * We do this before grabbing the i_contents lock because
    863 	 * ufs_syncip() will want that lock. We could do the data
    864 	 * syncing after the removal checks, but upon return from
    865 	 * the data sync we would have to repeat the removal
    866 	 * checks.
    867 	 */
    868 	if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
    869 		return (err);
    870 	}
    871 
    872 	rw_enter(&sip->i_contents, RW_WRITER);
    873 	if (sip->i_nlink <= 0) {
    874 		rw_exit(&sip->i_contents);
    875 		return (ENOENT);
    876 	}
    877 	if (sip->i_nlink == MAXLINK) {
    878 		rw_exit(&sip->i_contents);
    879 		return (EMLINK);
    880 	}
    881 
    882 	/*
    883 	 * Sync the indirect blocks associated with the file
    884 	 * for the same reasons as described above.  Since this
    885 	 * call wants the i_contents lock held for it we can do
    886 	 * this here with no extra work.
    887 	 */
    888 	if (err = ufs_sync_indir(sip)) {
    889 		rw_exit(&sip->i_contents);
    890 		return (err);
    891 	}
    892 
    893 	if (op != DE_SYMLINK)
    894 		sip->i_nlink++;
    895 	TRANS_INODE(sip->i_ufsvfs, sip);
    896 	sip->i_flag |= ICHG;
    897 	sip->i_seq++;
    898 	ufs_iupdat(sip, I_SYNC);
    899 	rw_exit(&sip->i_contents);
    900 
    901 	/*
    902 	 * If target directory has not been removed, then we can consider
    903 	 * allowing file to be created.
    904 	 */
    905 	if (tdp->i_nlink <= 0) {
    906 		err = ENOENT;
    907 		goto out2;
    908 	}
    909 
    910 	/*
    911 	 * Check accessibility of target directory.
    912 	 */
    913 	if (err = ufs_diraccess(tdp, IEXEC, cr))
    914 		goto out2;
    915 
    916 	/*
    917 	 * Search for the entry. Return VN_HELD tip if found.
    918 	 */
    919 	tip = NULL;
    920 	slot.status = NONE;
    921 	slot.fbp = NULL;
    922 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
    923 	rw_enter(&tdp->i_contents, RW_WRITER);
    924 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
    925 	if (err)
    926 		goto out;
    927 
    928 	if (tip) {
    929 		switch (op) {
    930 		case DE_RENAME:
    931 			err = ufs_dirrename(sdp, sip, tdp, namep,
    932 			    tip, &slot, cr);
    933 			break;
    934 
    935 		case DE_LINK:
    936 		case DE_SYMLINK:
    937 			/*
    938 			 * Can't link to an existing file.
    939 			 */
    940 			err = EEXIST;
    941 			break;
    942 		default:
    943 			break;
    944 		}
    945 	} else {
    946 		/*
    947 		 * The entry does not exist. Check write permission in
    948 		 * directory to see if entry can be created.
    949 		 */
    950 		if (err = ufs_iaccess(tdp, IWRITE, cr, 0))
    951 			goto out;
    952 		err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
    953 		    cr);
    954 	}
    955 
    956 out:
    957 	if (slot.fbp)
    958 		fbrelse(slot.fbp, S_OTHER);
    959 
    960 	rw_exit(&tdp->i_contents);
    961 
    962 	/*
    963 	 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
    964 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
    965 	 */
    966 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
    967 
    968 	/*
    969 	 * If we renamed a file over the top of an existing file,
    970 	 * or linked a file to an existing file (or tried to),
    971 	 * then set *tvpp to the target vnode, if tvpp is non-null
    972 	 * otherwise, release and delete (or just release) the inode.
    973 	 *
    974 	 * N.B., by returning the target's vnode pointer to the caller,
    975 	 * that caller becomes responsible for doing the VN_RELE.
    976 	 */
    977 	if (tip) {
    978 		if ((err == 0) && (tvpp != NULL)) {
    979 			*tvpp = ITOV(tip);
    980 		} else {
    981 			VN_RELE(ITOV(tip));
    982 		}
    983 	}
    984 
    985 out2:
    986 	if (err) {
    987 		/*
    988 		 * Undo bumped link count.
    989 		 */
    990 		if (op != DE_SYMLINK) {
    991 			rw_enter(&sip->i_contents, RW_WRITER);
    992 			sip->i_nlink--;
    993 			ufs_setreclaim(sip);
    994 			TRANS_INODE(sip->i_ufsvfs, sip);
    995 			sip->i_flag |= ICHG;
    996 			sip->i_seq++;
    997 			ITIMES_NOLOCK(sip);
    998 			rw_exit(&sip->i_contents);
    999 		}
   1000 	}
   1001 	return (err);
   1002 }
   1003 
   1004 /*
   1005  * Check for the existence of a name in a directory (unless noentry
   1006  * is set) , or else of an empty
   1007  * slot in which an entry may be made.  If the requested name is found,
   1008  * then on return *ipp points at the inode and *offp contains
   1009  * its offset in the directory.  If the name is not found, then *ipp
   1010  * will be NULL and *slotp will contain information about a directory slot in
   1011  * which an entry may be made (either an empty slot, or the first position
   1012  * past the end of the directory).
   1013  * The target directory inode (tdp) is supplied write locked (i_rwlock).
   1014  *
   1015  * This may not be used on "." or "..", but aliases of "." are ok.
   1016  */
   1017 int
   1018 ufs_dircheckforname(
   1019 	struct inode *tdp,	/* inode of directory being checked */
   1020 	char *namep,		/* name we're checking for */
   1021 	int namlen,		/* length of name, excluding null */
   1022 	struct ufs_slot *slotp,	/* slot structure */
   1023 	struct inode **ipp,	/* return inode if we find one */
   1024 	struct cred *cr,
   1025 	int noentry)		/* noentry - just look for space */
   1026 {
   1027 	uint64_t handle;
   1028 	struct fbuf *fbp;	/* pointer to directory block */
   1029 	struct direct *ep;	/* directory entry */
   1030 	struct direct *nep;	/* next directory entry */
   1031 	dcanchor_t *dcap;
   1032 	vnode_t *dvp;		/* directory vnode ptr */
   1033 	off_t dirsize;		/* size of the directory */
   1034 	off_t offset;		/* offset in the directory */
   1035 	off_t last_offset;	/* last offset */
   1036 	off_t enduseful;	/* pointer past last used dir slot */
   1037 	int entryoffsetinblk;	/* offset of ep in fbp's buffer */
   1038 	int i;			/* length of mangled entry */
   1039 	int needed;
   1040 	int err;
   1041 	int first;
   1042 	int caching;
   1043 	int stat;
   1044 	ino_t ep_ino;
   1045 	slotstat_t initstat = slotp->status;
   1046 
   1047 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
   1048 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
   1049 	ASSERT(*ipp == NULL);
   1050 	fbp = NULL;
   1051 
   1052 	/*
   1053 	 * First check if there is a complete cache of the directory.
   1054 	 */
   1055 	dvp = ITOV(tdp);
   1056 
   1057 	dcap = &tdp->i_danchor;
   1058 	if (noentry) {
   1059 		/*
   1060 		 * We know from the 1st level dnlc cache that the entry
   1061 		 * doesn't exist, so don't bother searching the directory
   1062 		 * cache, but just look for space (possibly in the directory
   1063 		 * cache).
   1064 		 */
   1065 		stat = DNOENT;
   1066 	} else {
   1067 		stat = dnlc_dir_lookup(dcap, namep, &handle);
   1068 	}
   1069 	switch (stat) {
   1070 	case DFOUND:
   1071 		ep_ino = (ino_t)H_TO_INO(handle);
   1072 		if (tdp->i_number == ep_ino) {
   1073 			*ipp = tdp;	/* we want ourself, ie "." */
   1074 			VN_HOLD(dvp);
   1075 		} else {
   1076 			err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
   1077 			if (err)
   1078 				return (err);
   1079 		}
   1080 		offset = H_TO_OFF(handle);
   1081 		first = 0;
   1082 		if (offset & 1) {
   1083 			/* This is the first entry in the block */
   1084 			first = 1;
   1085 			offset -= 1;
   1086 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
   1087 		}
   1088 		err = blkatoff(tdp, offset, (char **)&ep, &fbp);
   1089 		if (err) {
   1090 			VN_RELE(ITOV(*ipp));
   1091 			*ipp = NULL;
   1092 			return (err);
   1093 		}
   1094 		/*
   1095 		 * Check the validity of the entry.
   1096 		 * If it's bad, then throw away the cache and
   1097 		 * continue without it. The dirmangled() routine
   1098 		 * will then be called upon it.
   1099 		 */
   1100 		if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
   1101 			VN_RELE(ITOV(*ipp));
   1102 			*ipp = NULL;
   1103 			dnlc_dir_purge(dcap);
   1104 			break;
   1105 		}
   1106 		/*
   1107 		 * Remember the returned offset is the offset of the
   1108 		 * preceding record (unless this is the 1st record
   1109 		 * in the DIRBLKSIZ sized block (disk sector)), then it's
   1110 		 * offset + 1. Note, no real offsets are on odd boundaries.
   1111 		 */
   1112 		if (first) {
   1113 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
   1114 			slotp->offset = offset;
   1115 			slotp->size = 0;
   1116 			slotp->ep = ep;
   1117 		} else {
   1118 			/* get the next entry */
   1119 			nep = (struct direct *)((char *)ep + ep->d_reclen);
   1120 			/*
   1121 			 * Check the validity of this entry as well
   1122 			 * If it's bad, then throw away the cache and
   1123 			 * continue without it. The dirmangled() routine
   1124 			 * will then be called upon it.
   1125 			 */
   1126 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
   1127 			    (nep->d_ino != ep_ino)) {
   1128 				VN_RELE(ITOV(*ipp));
   1129 				*ipp = NULL;
   1130 				dnlc_dir_purge(dcap);
   1131 				break;
   1132 			}
   1133 			slotp->offset = offset + ep->d_reclen;
   1134 			slotp->size = ep->d_reclen;
   1135 			slotp->ep = nep;
   1136 		}
   1137 		slotp->status = EXIST;
   1138 		slotp->fbp = fbp;
   1139 		slotp->endoff = 0;
   1140 		slotp->cached = 1;
   1141 		dnlc_update(dvp, namep, ITOV(*ipp));
   1142 		return (0);
   1143 	case DNOENT:
   1144 		/*
   1145 		 * The caller gets to set the initial slot status to
   1146 		 * indicate whether it's interested in getting a
   1147 		 * empty slot. For example, the status can be set
   1148 		 * to FOUND when an entry is being deleted.
   1149 		 */
   1150 		ASSERT(slotp->fbp == NULL);
   1151 		if (slotp->status == FOUND) {
   1152 			return (0);
   1153 		}
   1154 		switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
   1155 		    &handle)) {
   1156 		case DFOUND:
   1157 			offset = (off_t)handle;
   1158 			err = blkatoff(tdp, offset, (char **)&ep, &fbp);
   1159 			if (err) {
   1160 				dnlc_dir_purge(dcap);
   1161 				ASSERT(*ipp == NULL);
   1162 				return (err);
   1163 			}
   1164 			/*
   1165 			 * Check the validity of the entry.
   1166 			 * If it's bad, then throw away the cache and
   1167 			 * continue without it. The dirmangled() routine
   1168 			 * will then be called upon it.
   1169 			 */
   1170 			if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
   1171 				dnlc_dir_purge(dcap);
   1172 				break;
   1173 			}
   1174 			/*
   1175 			 * Remember the returned offset is the offset of the
   1176 			 * containing record.
   1177 			 */
   1178 			slotp->status = FOUND;
   1179 			slotp->ep = ep;
   1180 			slotp->offset = offset;
   1181 			slotp->fbp = fbp;
   1182 			slotp->size = ep->d_reclen;
   1183 			/*
   1184 			 * Set end offset to 0. Truncation is handled
   1185 			 * because the dnlc cache will blow away the
   1186 			 * cached directory when an entry is removed
   1187 			 * that drops the entries left to less than half
   1188 			 * the minumum number (dnlc_min_dir_cache).
   1189 			 */
   1190 			slotp->endoff = 0;
   1191 			slotp->cached = 1;
   1192 			return (0);
   1193 		case DNOENT:
   1194 			slotp->status = NONE;
   1195 			slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
   1196 			    DIRBLKSIZ, u_offset_t);
   1197 			slotp->size = DIRBLKSIZ;
   1198 			slotp->endoff = 0;
   1199 			slotp->cached = 1;
   1200 			return (0);
   1201 		default:
   1202 			break;
   1203 		}
   1204 		break;
   1205 	}
   1206 	slotp->cached = 0;
   1207 	caching = NULL;
   1208 	if (!noentry && tdp->i_size >= ufs_min_dir_cache) {
   1209 		/*
   1210 		 * if the directory caching disable time has expired
   1211 		 * enable caching again.
   1212 		 */
   1213 		if (tdp->i_cachedir == CD_DISABLED_NOMEM &&
   1214 		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
   1215 			ufs_dc_disable_at = 0;
   1216 			tdp->i_cachedir = CD_ENABLED;
   1217 		}
   1218 		/*
   1219 		 * Attempt to cache any directories greater than the tunable
   1220 		 * ufs_min_cache_dir. If it fails due to memory shortage
   1221 		 * (DNOMEM), disable caching for this directory and record
   1222 		 * the system time. Any attempt after the disable time has
   1223 		 * expired will enable the caching again.
   1224 		 */
   1225 		if (tdp->i_cachedir == CD_ENABLED) {
   1226 			switch (dnlc_dir_start(dcap,
   1227 			    tdp->i_size >> AV_DIRECT_SHIFT)) {
   1228 			case DNOMEM:
   1229 				tdp->i_cachedir = CD_DISABLED_NOMEM;
   1230 				ufs_dc_disable_at = gethrtime();
   1231 				break;
   1232 			case DTOOBIG:
   1233 				tdp->i_cachedir = CD_DISABLED_TOOBIG;
   1234 				break;
   1235 			case DOK:
   1236 				caching = 1;
   1237 				break;
   1238 			default:
   1239 				break;
   1240 			}
   1241 		}
   1242 	}
   1243 
   1244 	/*
   1245 	 * No point in using i_diroff since we must search whole directory
   1246 	 */
   1247 	dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t);
   1248 	enduseful = 0;
   1249 	offset = last_offset = 0;
   1250 	entryoffsetinblk = 0;
   1251 	needed = (int)LDIRSIZ(namlen);
   1252 	while (offset < dirsize) {
   1253 		/*
   1254 		 * If offset is on a block boundary,
   1255 		 * read the next directory block.
   1256 		 * Release previous if it exists.
   1257 		 */
   1258 		if (blkoff(tdp->i_fs, offset) == 0) {
   1259 			if (fbp != NULL)
   1260 				fbrelse(fbp, S_OTHER);
   1261 
   1262 			err = blkatoff(tdp, offset, (char **)0, &fbp);
   1263 			if (err) {
   1264 				ASSERT(*ipp == NULL);
   1265 				if (caching) {
   1266 					dnlc_dir_purge(dcap);
   1267 				}
   1268 				return (err);
   1269 			}
   1270 			entryoffsetinblk = 0;
   1271 		}
   1272 		/*
   1273 		 * If still looking for a slot, and at a DIRBLKSIZ
   1274 		 * boundary, have to start looking for free space
   1275 		 * again.
   1276 		 */
   1277 		if (slotp->status == NONE &&
   1278 		    (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
   1279 			slotp->offset = -1;
   1280 		}
   1281 		/*
   1282 		 * If the next entry is a zero length record or if the
   1283 		 * record length is invalid, then skip to the next
   1284 		 * directory block.  Complete validation checks are
   1285 		 * done if the record length is invalid.
   1286 		 *
   1287 		 * Full validation checks are slow so they are disabled
   1288 		 * by default.  Complete checks can be run by patching
   1289 		 * "dirchk" to be true.
   1290 		 *
   1291 		 * We do not have to check the validity of
   1292 		 * entryoffsetinblk here because it starts out as zero
   1293 		 * and is only incremented by d_reclen values that we
   1294 		 * validate here.
   1295 		 */
   1296 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
   1297 		if (ep->d_reclen == 0 ||
   1298 		    (dirchk || (ep->d_reclen & 0x3)) &&
   1299 		    dirmangled(tdp, ep, entryoffsetinblk, offset)) {
   1300 			i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
   1301 			offset += i;
   1302 			entryoffsetinblk += i;
   1303 			if (caching) {
   1304 				dnlc_dir_purge(dcap);
   1305 				caching = 0;
   1306 			}
   1307 			continue;
   1308 		}
   1309 
   1310 		/*
   1311 		 * Add named entries and free space into the directory cache
   1312 		 */
   1313 		if (caching) {
   1314 			ushort_t extra;
   1315 			off_t off2;
   1316 
   1317 			if (ep->d_ino == 0) {
   1318 				extra = ep->d_reclen;
   1319 				if (offset & (DIRBLKSIZ - 1)) {
   1320 					dnlc_dir_purge(dcap);
   1321 					caching = 0;
   1322 				}
   1323 			} else {
   1324 				/*
   1325 				 * entries hold the previous offset if
   1326 				 * not the 1st one
   1327 				 */
   1328 				if (offset & (DIRBLKSIZ - 1)) {
   1329 					off2 = last_offset;
   1330 				} else {
   1331 					off2 = offset + 1;
   1332 				}
   1333 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
   1334 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
   1335 				extra = ep->d_reclen - DIRSIZ(ep);
   1336 			}
   1337 			if (caching && (extra >= LDIRSIZ(1))) {
   1338 				caching = (dnlc_dir_add_space(dcap, extra,
   1339 				    (uint64_t)offset) == DOK);
   1340 			}
   1341 		}
   1342 
   1343 		/*
   1344 		 * If an appropriate sized slot has not yet been found,
   1345 		 * check to see if one is available.
   1346 		 */
   1347 		if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
   1348 			int size = ep->d_reclen;
   1349 
   1350 			if (ep->d_ino != 0)
   1351 				size -= DIRSIZ(ep);
   1352 			if (size > 0) {
   1353 				if (size >= needed) {
   1354 					slotp->offset = offset;
   1355 					slotp->size = ep->d_reclen;
   1356 					if (noentry) {
   1357 						slotp->ep = ep;
   1358 						slotp->fbp = fbp;
   1359 						slotp->status = FOUND;
   1360 						slotp->endoff = 0;
   1361 						return (0);
   1362 					}
   1363 					slotp->status = FOUND;
   1364 				} else if (slotp->status == NONE) {
   1365 					if (slotp->offset == -1)
   1366 						slotp->offset = offset;
   1367 				}
   1368 			}
   1369 		}
   1370 		/*
   1371 		 * Check for a name match.
   1372 		 */
   1373 		if (ep->d_ino && ep->d_namlen == namlen &&
   1374 		    *namep == *ep->d_name &&	/* fast chk 1st char */
   1375 		    bcmp(namep, ep->d_name, namlen) == 0) {
   1376 
   1377 			tdp->i_diroff = offset;
   1378 
   1379 			if (tdp->i_number == ep->d_ino) {
   1380 				*ipp = tdp;	/* we want ourself, ie "." */
   1381 				VN_HOLD(dvp);
   1382 			} else {
   1383 				err = ufs_iget_alloced(tdp->i_vfs,
   1384 				    (ino_t)ep->d_ino, ipp, cr);
   1385 				if (err) {
   1386 					fbrelse(fbp, S_OTHER);
   1387 					if (caching)
   1388 						dnlc_dir_purge(dcap);
   1389 					return (err);
   1390 				}
   1391 			}
   1392 			slotp->status = EXIST;
   1393 			slotp->offset = offset;
   1394 			slotp->size = (int)(offset - last_offset);
   1395 			slotp->fbp = fbp;
   1396 			slotp->ep = ep;
   1397 			slotp->endoff = 0;
   1398 			if (caching)
   1399 				dnlc_dir_purge(dcap);
   1400 			return (0);
   1401 		}
   1402 		last_offset = offset;
   1403 		offset += ep->d_reclen;
   1404 		entryoffsetinblk += ep->d_reclen;
   1405 		if (ep->d_ino)
   1406 			enduseful = offset;
   1407 	}
   1408 	if (fbp) {
   1409 		fbrelse(fbp, S_OTHER);
   1410 	}
   1411 
   1412 	if (caching) {
   1413 		dnlc_dir_complete(dcap);
   1414 		slotp->cached = 1;
   1415 		if (slotp->status == FOUND) {
   1416 			if (initstat == FOUND) {
   1417 				return (0);
   1418 			}
   1419 			(void) dnlc_dir_rem_space_by_handle(dcap,
   1420 			    slotp->offset);
   1421 			slotp->endoff = 0;
   1422 			return (0);
   1423 		}
   1424 	}
   1425 
   1426 	if (slotp->status == NONE) {
   1427 		/*
   1428 		 * We didn't find a slot; the new directory entry should be put
   1429 		 * at the end of the directory.  Return an indication of where
   1430 		 * this is, and set "endoff" to zero; since we're going to have
   1431 		 * to extend the directory, we're certainly not going to
   1432 		 * truncate it.
   1433 		 */
   1434 		slotp->offset = dirsize;
   1435 		slotp->size = DIRBLKSIZ;
   1436 		slotp->endoff = 0;
   1437 	} else {
   1438 		/*
   1439 		 * We found a slot, and will return an indication of where that
   1440 		 * slot is, as any new directory entry will be put there.
   1441 		 * Since that slot will become a useful entry, if the last
   1442 		 * useful entry we found was before this one, update the offset
   1443 		 * of the last useful entry.
   1444 		 */
   1445 		if (enduseful < slotp->offset + slotp->size)
   1446 			enduseful = slotp->offset + slotp->size;
   1447 		slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t);
   1448 	}
   1449 	*ipp = NULL;
   1450 	return (0);
   1451 }
   1452 
   1453 uint64_t ufs_dirrename_retry_cnt;
   1454 
   1455 /*
   1456  * Rename the entry in the directory tdp so that it points to
   1457  * sip instead of tip.
   1458  */
   1459 static int
   1460 ufs_dirrename(
   1461 	struct inode *sdp,	/* parent directory of source */
   1462 	struct inode *sip,	/* source inode */
   1463 	struct inode *tdp,	/* parent directory of target */
   1464 	char *namep,		/* entry we are trying to change */
   1465 	struct inode *tip,	/* target inode */
   1466 	struct ufs_slot *slotp,	/* slot for entry */
   1467 	struct cred *cr)	/* credentials */
   1468 {
   1469 	vnode_t *tdvp;
   1470 	off_t offset;
   1471 	int err;
   1472 	int doingdirectory;
   1473 
   1474 	ASSERT(sdp->i_ufsvfs != NULL);
   1475 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
   1476 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
   1477 	/*
   1478 	 * Short circuit rename of something to itself.
   1479 	 */
   1480 	if (sip->i_number == tip->i_number) {
   1481 		return (ESAME); /* special KLUDGE error code */
   1482 	}
   1483 
   1484 	/*
   1485 	 * We're locking 2 peer level locks, so must use tryenter
   1486 	 * on the 2nd to avoid deadlocks that would occur
   1487 	 * if we renamed a->b and b->a concurrently.
   1488 	 */
   1489 retry:
   1490 	rw_enter(&tip->i_contents, RW_WRITER);
   1491 	if (!rw_tryenter(&sip->i_contents, RW_READER)) {
   1492 		/*
   1493 		 * drop tip and wait (sleep) until we stand a chance
   1494 		 * of holding sip
   1495 		 */
   1496 		rw_exit(&tip->i_contents);
   1497 		rw_enter(&sip->i_contents, RW_READER);
   1498 		/*
   1499 		 * Reverse the lock grabs in case we have heavy
   1500 		 * contention on the 2nd lock.
   1501 		 */
   1502 		if (!rw_tryenter(&tip->i_contents, RW_WRITER)) {
   1503 			ufs_dirrename_retry_cnt++;
   1504 			rw_exit(&sip->i_contents);
   1505 			goto retry;
   1506 		}
   1507 	}
   1508 
   1509 	/*
   1510 	 * Check that everything is on the same filesystem.
   1511 	 */
   1512 	if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) ||
   1513 	    (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) {
   1514 		err = EXDEV;		/* XXX archaic */
   1515 		goto out;
   1516 	}
   1517 	/*
   1518 	 * Must have write permission to rewrite target entry.
   1519 	 * Perform additional checks for sticky directories.
   1520 	 */
   1521 	if ((err = ufs_iaccess(tdp, IWRITE, cr, 0)) != 0 ||
   1522 	    (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0)
   1523 		goto out;
   1524 
   1525 	/*
   1526 	 * Ensure source and target are compatible (both directories
   1527 	 * or both not directories).  If target is a directory it must
   1528 	 * be empty and have no links to it; in addition it must not
   1529 	 * be a mount point, and both the source and target must be
   1530 	 * writable.
   1531 	 */
   1532 	doingdirectory = (((sip->i_mode & IFMT) == IFDIR) ||
   1533 	    ((sip->i_mode & IFMT) == IFATTRDIR));
   1534 	if (((tip->i_mode & IFMT) == IFDIR) ||
   1535 	    ((tip->i_mode & IFMT) == IFATTRDIR)) {
   1536 		if (!doingdirectory) {
   1537 			err = EISDIR;
   1538 			goto out;
   1539 		}
   1540 		/*
   1541 		 * vn_vfsrlock will prevent mounts from using the directory
   1542 		 * until we are done.
   1543 		 */
   1544 		if (vn_vfsrlock(ITOV(tip))) {
   1545 			err = EBUSY;
   1546 			goto out;
   1547 		}
   1548 		if (vn_mountedvfs(ITOV(tip)) != NULL) {
   1549 			vn_vfsunlock(ITOV(tip));
   1550 			err = EBUSY;
   1551 			goto out;
   1552 		}
   1553 		if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) {
   1554 			vn_vfsunlock(ITOV(tip));
   1555 			err = EEXIST;	/* SIGH should be ENOTEMPTY */
   1556 			goto out;
   1557 		}
   1558 	} else if (doingdirectory) {
   1559 		err = ENOTDIR;
   1560 		goto out;
   1561 	}
   1562 
   1563 	/*
   1564 	 * Rewrite the inode pointer for target name entry
   1565 	 * from the target inode (ip) to the source inode (sip).
   1566 	 * This prevents the target entry from disappearing
   1567 	 * during a crash. Mark the directory inode to reflect the changes.
   1568 	 */
   1569 	tdvp = ITOV(tdp);
   1570 	slotp->ep->d_ino = (int32_t)sip->i_number;
   1571 	dnlc_update(tdvp, namep, ITOV(sip));
   1572 	if (slotp->size) {
   1573 		offset = slotp->offset - slotp->size;
   1574 	} else {
   1575 		offset = slotp->offset + 1;
   1576 	}
   1577 	if (slotp->cached) {
   1578 		(void) dnlc_dir_update(&tdp->i_danchor, namep,
   1579 		    INO_OFF_TO_H(slotp->ep->d_ino, offset));
   1580 	}
   1581 
   1582 	err = TRANS_DIR(tdp, slotp->offset);
   1583 	if (err)
   1584 		fbrelse(slotp->fbp, S_OTHER);
   1585 	else
   1586 		err = ufs_fbwrite(slotp->fbp, tdp);
   1587 
   1588 	slotp->fbp = NULL;
   1589 	if (err) {
   1590 		if (doingdirectory)
   1591 			vn_vfsunlock(ITOV(tip));
   1592 		goto out;
   1593 	}
   1594 
   1595 	TRANS_INODE(tdp->i_ufsvfs, tdp);
   1596 	tdp->i_flag |= IUPD|ICHG;
   1597 	tdp->i_seq++;
   1598 	ITIMES_NOLOCK(tdp);
   1599 
   1600 	/*
   1601 	 * Decrement the link count of the target inode.
   1602 	 * Fix the ".." entry in sip to point to dp.
   1603 	 * This is done after the new entry is on the disk.
   1604 	 */
   1605 	tip->i_nlink--;
   1606 	TRANS_INODE(tip->i_ufsvfs, tip);
   1607 	tip->i_flag |= ICHG;
   1608 	tip->i_seq++;
   1609 	ITIMES_NOLOCK(tip);
   1610 	if (doingdirectory) {
   1611 		/*
   1612 		 * The entry for tip no longer exists so I can unlock the
   1613 		 * vfslock.
   1614 		 */
   1615 		vn_vfsunlock(ITOV(tip));
   1616 		/*
   1617 		 * Decrement target link count once more if it was a directory.
   1618 		 */
   1619 		if (--tip->i_nlink != 0) {
   1620 			err = ufs_fault(ITOV(tip),
   1621 		    "ufs_dirrename: target directory link count != 0 (%s)",
   1622 			    tip->i_fs->fs_fsmnt);
   1623 			rw_exit(&tip->i_contents);
   1624 			return (err);
   1625 		}
   1626 		TRANS_INODE(tip->i_ufsvfs, tip);
   1627 		ufs_setreclaim(tip);
   1628 		/*
   1629 		 * Renaming a directory with the parent different
   1630 		 * requires that ".." be rewritten.  The window is
   1631 		 * still there for ".." to be inconsistent, but this
   1632 		 * is unavoidable, and a lot shorter than when it was
   1633 		 * done in a user process.  We decrement the link
   1634 		 * count in the new parent as appropriate to reflect
   1635 		 * the just-removed target.  If the parent is the
   1636 		 * same, this is appropriate since the original
   1637 		 * directory is going away.  If the new parent is
   1638 		 * different, ufs_dirfixdotdot() will bump the link count
   1639 		 * back.
   1640 		 */
   1641 		tdp->i_nlink--;
   1642 		ufs_setreclaim(tdp);
   1643 		TRANS_INODE(tdp->i_ufsvfs, tdp);
   1644 		tdp->i_flag |= ICHG;
   1645 		tdp->i_seq++;
   1646 		ITIMES_NOLOCK(tdp);
   1647 		if (sdp != tdp) {
   1648 			rw_exit(&tip->i_contents);
   1649 			rw_exit(&sip->i_contents);
   1650 			err = ufs_dirfixdotdot(sip, sdp, tdp);
   1651 			return (err);
   1652 		}
   1653 	} else
   1654 		ufs_setreclaim(tip);
   1655 out:
   1656 	rw_exit(&tip->i_contents);
   1657 	rw_exit(&sip->i_contents);
   1658 	return (err);
   1659 }
   1660 
   1661 /*
   1662  * Fix the ".." entry of the child directory so that it points
   1663  * to the new parent directory instead of the old one.  Routine
   1664  * assumes that dp is a directory and that all the inodes are on
   1665  * the same file system.
   1666  */
   1667 static int
   1668 ufs_dirfixdotdot(
   1669 	struct inode *dp,	/* child directory */
   1670 	struct inode *opdp,	/* old parent directory */
   1671 	struct inode *npdp)	/* new parent directory */
   1672 {
   1673 	struct fbuf *fbp;
   1674 	struct dirtemplate *dirp;
   1675 	vnode_t *dvp;
   1676 	int err;
   1677 
   1678 	ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
   1679 	ASSERT(RW_WRITE_HELD(&npdp->i_contents));
   1680 
   1681 	/*
   1682 	 * We hold the child directory's i_contents lock before calling
   1683 	 * blkatoff so that we honor correct locking protocol which is
   1684 	 * i_contents lock and then page lock. (blkatoff will call
   1685 	 * ufs_getpage where we want the page lock)
   1686 	 * We hold the child directory's i_rwlock before i_contents (as
   1687 	 * per the locking protocol) since we are modifying the ".." entry
   1688 	 * of the child directory.
   1689 	 * We hold the i_rwlock and i_contents lock until we record
   1690 	 * this directory delta to the log (via ufs_trans_dir) and have
   1691 	 * done fbrelse.
   1692 	 */
   1693 	rw_enter(&dp->i_rwlock, RW_WRITER);
   1694 	rw_enter(&dp->i_contents, RW_WRITER);
   1695 	err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
   1696 	if (err)
   1697 		goto bad;
   1698 
   1699 	if (dp->i_nlink <= 0 ||
   1700 	    dp->i_size < sizeof (struct dirtemplate)) {
   1701 		err = ENOENT;
   1702 		goto bad;
   1703 	}
   1704 
   1705 	if (dirp->dotdot_namlen != 2 ||
   1706 	    dirp->dotdot_name[0] != '.' ||
   1707 	    dirp->dotdot_name[1] != '.') {	/* Sanity check. */
   1708 		dirbad(dp, "mangled .. entry", (off_t)0);
   1709 		err = ENOTDIR;
   1710 		goto bad;
   1711 	}
   1712 
   1713 	/*
   1714 	 * Increment the link count in the new parent inode and force it out.
   1715 	 */
   1716 	if (npdp->i_nlink == MAXLINK) {
   1717 		err = EMLINK;
   1718 		goto bad;
   1719 	}
   1720 	npdp->i_nlink++;
   1721 	TRANS_INODE(npdp->i_ufsvfs, npdp);
   1722 	npdp->i_flag |= ICHG;
   1723 	npdp->i_seq++;
   1724 	ufs_iupdat(npdp, I_SYNC);
   1725 
   1726 	/*
   1727 	 * Rewrite the child ".." entry and force it out.
   1728 	 */
   1729 	dvp = ITOV(dp);
   1730 	dirp->dotdot_ino = (uint32_t)npdp->i_number;
   1731 	dnlc_update(dvp, "..", ITOV(npdp));
   1732 	(void) dnlc_dir_update(&dp->i_danchor, "..",
   1733 	    INO_OFF_TO_H(dirp->dotdot_ino, 0));
   1734 
   1735 	err = TRANS_DIR(dp, 0);
   1736 	if (err)
   1737 		fbrelse(fbp, S_OTHER);
   1738 	else
   1739 		err = ufs_fbwrite(fbp, dp);
   1740 
   1741 	fbp = NULL;
   1742 	if (err)
   1743 		goto bad;
   1744 
   1745 	rw_exit(&dp->i_contents);
   1746 	rw_exit(&dp->i_rwlock);
   1747 
   1748 	/*
   1749 	 * Decrement the link count of the old parent inode and force it out.
   1750 	 */
   1751 	ASSERT(opdp);
   1752 	rw_enter(&opdp->i_contents, RW_WRITER);
   1753 	ASSERT(opdp->i_nlink > 0);
   1754 	opdp->i_nlink--;
   1755 	ufs_setreclaim(opdp);
   1756 	TRANS_INODE(opdp->i_ufsvfs, opdp);
   1757 	opdp->i_flag |= ICHG;
   1758 	opdp->i_seq++;
   1759 	ufs_iupdat(opdp, I_SYNC);
   1760 	rw_exit(&opdp->i_contents);
   1761 	return (0);
   1762 
   1763 bad:
   1764 	if (fbp)
   1765 		fbrelse(fbp, S_OTHER);
   1766 	rw_exit(&dp->i_contents);
   1767 	rw_exit(&dp->i_rwlock);
   1768 	return (err);
   1769 }
   1770 
   1771 /*
   1772  * Enter the file sip in the directory tdp with name namep.
   1773  */
   1774 static int
   1775 ufs_diraddentry(
   1776 	struct inode *tdp,
   1777 	char *namep,
   1778 	enum de_op op,
   1779 	int namlen,
   1780 	struct ufs_slot *slotp,
   1781 	struct inode *sip,
   1782 	struct inode *sdp,
   1783 	struct cred *cr)
   1784 {
   1785 	struct direct *ep, *nep;
   1786 	vnode_t *tdvp;
   1787 	dcanchor_t *dcap = &tdp->i_danchor;
   1788 	off_t offset;
   1789 	int err;
   1790 	ushort_t extra;
   1791 
   1792 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
   1793 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
   1794 	/*
   1795 	 * Prepare a new entry.  If the caller has not supplied an
   1796 	 * existing inode, make a new one.
   1797 	 */
   1798 	err = dirprepareentry(tdp, slotp, cr);
   1799 	if (err) {
   1800 		if (slotp->fbp) {
   1801 			fbrelse(slotp->fbp, S_OTHER);
   1802 			slotp->fbp = NULL;
   1803 		}
   1804 		return (err);
   1805 	}
   1806 	/*
   1807 	 * Check inode to be linked to see if it is in the
   1808 	 * same filesystem.
   1809 	 */
   1810 	if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
   1811 		err = EXDEV;
   1812 		goto bad;
   1813 	}
   1814 
   1815 	/*
   1816 	 * If renaming a directory then fix up the ".." entry in the
   1817 	 * directory to point to the new parent.
   1818 	 */
   1819 	if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
   1820 	    ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
   1821 		err = ufs_dirfixdotdot(sip, sdp, tdp);
   1822 		if (err)
   1823 			goto bad;
   1824 	}
   1825 
   1826 	/*
   1827 	 * Fill in entry data.
   1828 	 */
   1829 	ep = slotp->ep;
   1830 	ep->d_namlen = (ushort_t)namlen;
   1831 	(void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
   1832 	ep->d_ino = (uint32_t)sip->i_number;
   1833 	tdvp = ITOV(tdp);
   1834 	dnlc_update(tdvp, namep, ITOV(sip));
   1835 	/*
   1836 	 * Note the offset supplied for any named entry is
   1837 	 * the offset of the previous one, unless it's the 1st.
   1838 	 * slotp->size is used to pass the length to
   1839 	 * the previous entry.
   1840 	 */
   1841 	if (slotp->size) {
   1842 		offset = slotp->offset - slotp->size;
   1843 	} else {
   1844 		offset = slotp->offset + 1;
   1845 	}
   1846 
   1847 	if (slotp->cached) {
   1848 		/*
   1849 		 * Add back any usable unused space to the dnlc directory
   1850 		 * cache.
   1851 		 */
   1852 		extra = ep->d_reclen - DIRSIZ(ep);
   1853 		if (extra >= LDIRSIZ(1)) {
   1854 			(void) dnlc_dir_add_space(dcap, extra,
   1855 			    (uint64_t)slotp->offset);
   1856 		}
   1857 
   1858 		(void) dnlc_dir_add_entry(dcap, namep,
   1859 		    INO_OFF_TO_H(ep->d_ino, offset));
   1860 
   1861 		/* adjust the previous offset of the next entry */
   1862 		nep = (struct direct *)((char *)ep + ep->d_reclen);
   1863 		if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
   1864 			/*
   1865 			 * Not a new block.
   1866 			 *
   1867 			 * Check the validity of the next entry.
   1868 			 * If it's bad, then throw away the cache, and
   1869 			 * continue as before directory caching.
   1870 			 */
   1871 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
   1872 			    dnlc_dir_update(dcap, nep->d_name,
   1873 			    INO_OFF_TO_H(nep->d_ino, slotp->offset))
   1874 			    == DNOENT) {
   1875 				dnlc_dir_purge(dcap);
   1876 				slotp->cached = 0;
   1877 			}
   1878 		}
   1879 	}
   1880 
   1881 	/*
   1882 	 * Write out the directory block.
   1883 	 */
   1884 	err = TRANS_DIR(tdp, slotp->offset);
   1885 	if (err)
   1886 		fbrelse(slotp->fbp, S_OTHER);
   1887 	else
   1888 		err = ufs_fbwrite(slotp->fbp, tdp);
   1889 
   1890 	slotp->fbp = NULL;
   1891 	/*
   1892 	 * If this is a rename of a directory, then we have already
   1893 	 * fixed the ".." entry to refer to the new parent. If err
   1894 	 * is true at this point, we have failed to update the new
   1895 	 * parent to refer to the renamed directory.
   1896 	 * XXX - we need to unwind the ".." fix.
   1897 	 */
   1898 	if (err)
   1899 		return (err);
   1900 
   1901 	/*
   1902 	 * Mark the directory inode to reflect the changes.
   1903 	 * Truncate the directory to chop off blocks of empty entries.
   1904 	 */
   1905 
   1906 	TRANS_INODE(tdp->i_ufsvfs, tdp);
   1907 	tdp->i_flag |= IUPD|ICHG;
   1908 	tdp->i_seq++;
   1909 	tdp->i_diroff = 0;
   1910 	ITIMES_NOLOCK(tdp);
   1911 	/*
   1912 	 * If the directory grew then dirprepareentry() will have
   1913 	 * set IATTCHG in tdp->i_flag, then the directory inode must
   1914 	 * be flushed out. This is because if fsync() is used later
   1915 	 * the directory size must be correct, otherwise a crash would
   1916 	 * cause fsck to move the file to lost+found. Also because later
   1917 	 * a file may be linked in more than one directory, then there
   1918 	 * is no way to flush the original directory. So it must be
   1919 	 * flushed out on creation. See bug 4293809.
   1920 	 */
   1921 	if (tdp->i_flag & IATTCHG) {
   1922 		ufs_iupdat(tdp, I_SYNC);
   1923 	}
   1924 
   1925 	if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
   1926 		if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
   1927 			(void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
   1928 			    cr);
   1929 		}
   1930 	}
   1931 
   1932 
   1933 	return (0);
   1934 
   1935 bad:
   1936 	if (slotp->cached) {
   1937 		dnlc_dir_purge(dcap);
   1938 		fbrelse(slotp->fbp, S_OTHER);
   1939 		slotp->cached = 0;
   1940 		slotp->fbp = NULL;
   1941 		return (err);
   1942 	}
   1943 
   1944 	/*
   1945 	 * Clear out entry prepared by dirprepareent.
   1946 	 */
   1947 	slotp->ep->d_ino = 0;
   1948 	slotp->ep->d_namlen = 0;
   1949 
   1950 	/*
   1951 	 * Don't touch err so we don't clobber the real error that got us here.
   1952 	 */
   1953 	if (TRANS_DIR(tdp, slotp->offset))
   1954 		fbrelse(slotp->fbp, S_OTHER);
   1955 	else
   1956 		(void) ufs_fbwrite(slotp->fbp, tdp);
   1957 	slotp->fbp = NULL;
   1958 	return (err);
   1959 }
   1960 
   1961 /*
   1962  * Prepare a directory slot to receive an entry.
   1963  */
   1964 static int
   1965 dirprepareentry(
   1966 	struct inode *dp,	/* directory we are working in */
   1967 	struct ufs_slot *slotp,	/* available slot info */
   1968 	struct cred *cr)
   1969 {
   1970 	struct direct *ep, *nep;
   1971 	off_t entryend;
   1972 	int err;
   1973 	slotstat_t status = slotp->status;
   1974 	ushort_t dsize;
   1975 
   1976 	ASSERT((status == NONE) || (status == FOUND));
   1977 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
   1978 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
   1979 	/*
   1980 	 * If we didn't find a slot, then indicate that the
   1981 	 * new slot belongs at the end of the directory.
   1982 	 * If we found a slot, then the new entry can be
   1983 	 * put at slotp->offset.
   1984 	 */
   1985 	entryend = slotp->offset + slotp->size;
   1986 	if (status == NONE) {
   1987 		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
   1988 		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
   1989 			err = ufs_fault(ITOV(dp),
   1990 			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
   1991 			    " > dp->i_fs->fs_fsize: %d (%s)",
   1992 			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
   1993 			return (err);
   1994 		}
   1995 		/*
   1996 		 * Allocate the new block.
   1997 		 */
   1998 		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
   1999 		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
   2000 		if (err) {
   2001 			return (err);
   2002 		}
   2003 		dp->i_size = entryend;
   2004 		TRANS_INODE(dp->i_ufsvfs, dp);
   2005 		dp->i_flag |= IUPD|ICHG|IATTCHG;
   2006 		dp->i_seq++;
   2007 		ITIMES_NOLOCK(dp);
   2008 	} else if (entryend > dp->i_size) {
   2009 		/*
   2010 		 * Adjust directory size, if needed. This should never
   2011 		 * push the size past a new multiple of DIRBLKSIZ.
   2012 		 * This is an artifact of the old (4.2BSD) way of initializing
   2013 		 * directory sizes to be less than DIRBLKSIZ.
   2014 		 */
   2015 		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
   2016 		TRANS_INODE(dp->i_ufsvfs, dp);
   2017 		dp->i_flag |= IUPD|ICHG|IATTCHG;
   2018 		dp->i_seq++;
   2019 		ITIMES_NOLOCK(dp);
   2020 	}
   2021 
   2022 	/*
   2023 	 * Get the block containing the space for the new directory entry.
   2024 	 */
   2025 	if (slotp->fbp == NULL) {
   2026 		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
   2027 		    &slotp->fbp);
   2028 		if (err) {
   2029 			return (err);
   2030 		}
   2031 	}
   2032 	ep = slotp->ep;
   2033 
   2034 	switch (status) {
   2035 	case NONE:
   2036 		/*
   2037 		 * No space in the directory. slotp->offset will be on a
   2038 		 * directory block boundary and we will write the new entry
   2039 		 * into a fresh block.
   2040 		 */
   2041 		ep->d_reclen = DIRBLKSIZ;
   2042 		slotp->size = 0; /* length of previous entry */
   2043 		break;
   2044 	case FOUND:
   2045 		/*
   2046 		 * An entry of the required size has been found. Use it.
   2047 		 */
   2048 		if (ep->d_ino == 0) {
   2049 			/* this is the 1st record in a block */
   2050 			slotp->size = 0; /* length of previous entry */
   2051 		} else {
   2052 			dsize = DIRSIZ(ep);
   2053 			nep = (struct direct *)((char *)ep + dsize);
   2054 			nep->d_reclen = ep->d_reclen - dsize;
   2055 			ep->d_reclen = dsize;
   2056 			slotp->ep = nep;
   2057 			slotp->offset += dsize;
   2058 			slotp->size = dsize; /* length of previous entry */
   2059 		}
   2060 		break;
   2061 	default:
   2062 		break;
   2063 	}
   2064 	return (0);
   2065 }
   2066 
   2067 /*
   2068  * Allocate and initialize a new inode that will go into directory tdp.
   2069  * This routine is called from ufs_symlink(), as well as within this file.
         *
         * tdp is the directory that will hold the new object, ipp receives the
         * new inode, vap supplies the type and mode plus optional ownership,
         * time and device attributes, op says what kind of entry is being made
         * (DE_CREATE, DE_MKDIR, DE_ATTRDIR or DE_SYMLINK) and cr holds the
         * caller's credentials.
         *
         * The caller must hold tdp->i_rwlock and tdp->i_contents as writer, and
         * vap->va_mask must include both AT_TYPE and AT_MODE.
         *
         * Returns 0 with *ipp set to the new inode after it has been pushed with
         * ufs_iupdat(I_SYNC).  If ufs_ialloc() fails, or the new inode already
         * has a dquot attached, *ipp is left NULL.  A failure that goes through
         * the fail label leaves *ipp pointing at the inode, whose link count has
         * been zeroed and which has been handed to ufs_setreclaim(); presumably
         * the caller releases it - TODO confirm against the callers.
         * The new inode's i_contents lock is taken here and always dropped
         * again before returning.
   2070  */
   2071 int
   2072 ufs_dirmakeinode(
   2073 	struct inode *tdp,
   2074 	struct inode **ipp,
   2075 	struct vattr *vap,
   2076 	enum de_op op,
   2077 	struct cred *cr)
   2078 {
   2079 	struct inode *ip;
   2080 	enum vtype type;
   2081 	int imode;			/* mode and format as in inode */
   2082 	ino_t ipref;
   2083 	int err;
   2084 	timestruc_t now;
   2085 
   2086 	ASSERT(vap != NULL);
   2087 	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
   2088 	    op == DE_SYMLINK);
   2089 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
   2090 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
   2091 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
   2092 	/*
   2093 	 * Allocate a new inode.
        	 * Directories take their preferred inode from dirpref(); every
        	 * other type uses the parent's inode number as the preference.
   2094 	 */
   2095 	type = vap->va_type;
   2096 	if (type == VDIR) {
   2097 		ipref = dirpref(tdp);
   2098 	} else {
   2099 		ipref = tdp->i_number;
   2100 	}
        	/* For DE_ATTRDIR, va_mode is used as is, without MAKEIMODE(). */
   2101 	if (op == DE_ATTRDIR)
   2102 		imode = vap->va_mode;
   2103 	else
   2104 		imode = MAKEIMODE(type, vap->va_mode);
   2105 	*ipp = NULL;
   2106 	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
   2107 	if (err)
   2108 		return (err);
   2109 
   2110 	/*
   2111 	 * We don't need to grab vfs_dqrwlock here because it is held
   2112 	 * in ufs_direnter_*() above us.
   2113 	 */
   2114 	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
   2115 	rw_enter(&ip->i_contents, RW_WRITER);
        	/* A freshly allocated inode must not already carry a dquot. */
   2116 	if (ip->i_dquot != NULL) {
   2117 		err = ufs_fault(ITOV(ip),
   2118 		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
   2119 		    tdp->i_fs->fs_fsmnt);
   2120 		rw_exit(&ip->i_contents);
   2121 		return (err);
   2122 	}
        	/* From here on the caller can see ip, even if we fail below. */
   2123 	*ipp = ip;
   2124 	ip->i_mode = (o_mode_t)imode;
   2125 	if (type == VBLK || type == VCHR) {
   2126 		dev_t d = vap->va_rdev;
   2127 		dev32_t dev32;
   2128 
   2129 		/*
   2130 		 * Don't allow a special file to be created with a
   2131 		 * dev_t that cannot be represented by this filesystem
   2132 		 * format on disk.
   2133 		 */
   2134 		if (!cmpldev(&dev32, d)) {
   2135 			err = EOVERFLOW;
   2136 			goto fail;
   2137 		}
   2138 
   2139 		ITOV(ip)->v_rdev = ip->i_rdev = d;
   2140 
   2141 		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
   2142 			ip->i_ordev = dev32; /* can't use old format */
   2143 		} else {
   2144 			ip->i_ordev = cmpdev(d);
   2145 		}
   2146 	}
   2147 	ITOV(ip)->v_type = type;
   2148 	ufs_reset_vnode(ip->i_vnode);
   2149 	if (type == VDIR) {
   2150 		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
   2151 	} else {
   2152 		ip->i_nlink = 1;
   2153 	}
   2154 
   2155 	if (op == DE_ATTRDIR) {
   2156 		ip->i_uid = vap->va_uid;
   2157 		ip->i_gid = vap->va_gid;
   2158 	} else
   2159 		ip->i_uid = crgetuid(cr);
   2160 	/*
   2161 	 * To determine the group-id of the created file:
   2162 	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
   2163 	 *	clients are not likely to set the gid), then use it if
   2164 	 *	the process is privileged, belongs to the target group,
   2165 	 *	or the group is the same as the parent directory.
   2166 	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
   2167 	 *	GRPID option, and the directory's set-gid bit is clear,
   2168 	 *	then use the process's gid.
   2169 	 *   3) Otherwise, set the group-id to the gid of the parent directory.
        	 *
        	 * NOTE(review): for DE_ATTRDIR the first test is always false, so
        	 * the else branch runs and replaces the i_gid that was just copied
        	 * from vap->va_gid above.  Confirm whether that is intended.
   2170 	 */
   2171 	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
   2172 	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
   2173 	    secpolicy_vnode_create_gid(cr) == 0)) {
   2174 		/*
   2175 		 * XXX - is this only the case when a 4.0 NFS client, or a
   2176 		 * client derived from that code, makes a call over the wire?
   2177 		 */
   2178 		ip->i_gid = vap->va_gid;
   2179 	} else
   2180 		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);
   2181 
   2182 	/*
   2183 	 * For SunOS 5.0->5.4, the lines below read:
   2184 	 *
   2185 	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
   2186 	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
   2187 	 *
   2188 	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
        	 *
        	 * Ids that do not fit in USHRT_MAX are stored in the short fields
        	 * as UID_LONG / GID_LONG.
   2189 	 */
   2190 	ip->i_suid =
   2191 	    (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? UID_LONG : ip->i_uid;
   2192 	ip->i_sgid =
   2193 	    (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? GID_LONG : ip->i_gid;
   2194 
   2195 	/*
   2196 	 * If we're creating a directory, and the parent directory has the
   2197 	 * set-GID bit set, set it on the new directory.
   2198 	 * Otherwise, if the user is neither privileged nor a member of the
   2199 	 * file's new group, clear the file's set-GID bit.
   2200 	 */
   2201 	if ((tdp->i_mode & ISGID) && (type == VDIR))
   2202 		ip->i_mode |= ISGID;
   2203 	else {
   2204 		if ((ip->i_mode & ISGID) &&
   2205 		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
   2206 			ip->i_mode &= ~ISGID;
   2207 	}
   2208 
        	/* Reject supplied times that trip TIMESPEC_OVERFLOW(). */
   2209 	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
   2210 	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
   2211 		err = EOVERFLOW;
   2212 		goto fail;
   2213 	}
   2214 
   2215 	/*
   2216 	 * Extended attribute directories are not subject to quotas.
   2217 	 */
   2218 	if (op != DE_ATTRDIR)
   2219 		ip->i_dquot = getinoquota(ip);
   2220 	else
   2221 		ip->i_dquot = NULL;
   2222 
        	/*
        	 * New directories get their "." and ".." entries written now; the
        	 * last argument tells ufs_dirmakedirect() whether this is an
        	 * attribute directory.
        	 */
   2223 	if (op == DE_MKDIR || op == DE_ATTRDIR) {
   2224 		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
   2225 		if (err)
   2226 			goto fail;
   2227 	}
   2228 
   2229 	/*
   2230 	 * generate the shadow inode and attach it to the new object
   2231 	 */
   2232 	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
   2233 	    (!tdp->i_shadow && !tdp->i_ufs_acl));
   2234 	if (tdp->i_shadow && tdp->i_ufs_acl &&
   2235 	    (((tdp->i_mode & IFMT) == IFDIR) ||
   2236 	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
   2237 		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
   2238 		if (err) {
   2239 			if (op == DE_MKDIR) {
   2240 				/*
   2241 				 * clean up parent directory
   2242 				 *
   2243 				 * tdp->i_contents already locked from
   2244 				 * ufs_direnter_*()
        				 *
        				 * This takes back the link count that
        				 * ufs_dirmakedirect() added to the parent
        				 * for the non-attribute directory case.
   2245 				 */
   2246 				tdp->i_nlink--;
   2247 				TRANS_INODE(tdp->i_ufsvfs, tdp);
   2248 				tdp->i_flag |= ICHG;
   2249 				tdp->i_seq++;
   2250 				ufs_iupdat(tdp, I_SYNC);
   2251 			}
   2252 			goto fail;
   2253 		}
   2254 	}
   2255 
   2256 	/*
   2257 	 * If the passed in attributes contain atime and/or mtime
   2258 	 * settings, then use them instead of using the current
   2259 	 * high resolution time.
        	 * A time that is not supplied is left to the IACC or IUPD|ICHG
        	 * flags instead.
   2260 	 */
   2261 	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
   2262 		if (vap->va_mask & AT_ATIME) {
   2263 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
   2264 			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
   2265 			ip->i_flag &= ~IACC;
   2266 		} else
   2267 			ip->i_flag |= IACC;
   2268 		if (vap->va_mask & AT_MTIME) {
   2269 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
   2270 			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
        			/* ctime is still stamped with the current time. */
   2271 			gethrestime(&now);
   2272 			if (now.tv_sec > TIME32_MAX) {
   2273 				/*
   2274 				 * In 2038, ctime sticks forever..
   2275 				 */
   2276 				ip->i_ctime.tv_sec = TIME32_MAX;
   2277 				ip->i_ctime.tv_usec = 0;
   2278 			} else {
   2279 				ip->i_ctime.tv_sec = now.tv_sec;
   2280 				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
   2281 			}
   2282 			ip->i_flag &= ~(IUPD|ICHG);
   2283 			ip->i_flag |= IMODTIME;
   2284 		} else
   2285 			ip->i_flag |= IUPD|ICHG;
   2286 		ip->i_flag |= IMOD;
   2287 	} else
   2288 		ip->i_flag |= IACC|IUPD|ICHG;
   2289 	ip->i_seq++;
   2290 
   2291 	/*
   2292 	 * If this is an attribute tag it as one.
        	 * (Anything created inside an IFATTRDIR parent gets IXATTR.)
   2293 	 */
   2294 	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
   2295 		ip->i_cflags |= IXATTR;
   2296 	}
   2297 
   2298 	/*
   2299 	 * push inode before it's name appears in a directory
   2300 	 */
   2301 	TRANS_INODE(ip->i_ufsvfs, ip);
   2302 	ufs_iupdat(ip, I_SYNC);
   2303 	rw_exit(&ip->i_contents);
   2304 	return (0);
   2305 
   2306 fail:
   2307 	/* Throw away inode we just allocated. */
   2308 	ip->i_nlink = 0;
   2309 	ufs_setreclaim(ip);
   2310 	TRANS_INODE(ip->i_ufsvfs, ip);
   2311 	ip->i_flag |= ICHG;
   2312 	ip->i_seq++;
   2313 	ITIMES_NOLOCK(ip);
   2314 	rw_exit(&ip->i_contents);
   2315 	return (err);
   2316 }
   2317 
   2318 /*
   2319  * Write a prototype directory into the empty inode ip, whose parent is dp.
   2320  */
   2321 static int
   2322 ufs_dirmakedirect(
   2323 	struct inode *ip,		/* new directory */
   2324 	struct inode *dp,		/* parent directory */
   2325 	int	attrdir,
   2326 	struct cred *cr)
   2327 {
   2328 	struct dirtemplate *dirp;
   2329 	struct fbuf *fbp;
   2330 	int err;
   2331 
   2332 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
   2333 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
   2334 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
   2335 	/*
   2336 	 * Allocate space for the directory we're creating.
   2337 	 */
   2338 	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
   2339 	if (err)
   2340 		return (err);
   2341 	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
   2342 		err = ufs_fault(ITOV(dp),
   2343 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
   2344 		    DIRBLKSIZ, dp->i_fs->fs_fsize,
   2345 		    dp->i_fs->fs_fsmnt);
   2346 		return (err);
   2347 	}
   2348 	ip->i_size = DIRBLKSIZ;
   2349 	TRANS_INODE(ip->i_ufsvfs, ip);
   2350 	ip->i_flag |= IUPD|ICHG|IATTCHG;
   2351 	ip->i_seq++;
   2352 	ITIMES_NOLOCK(ip);
   2353 	/*
   2354 	 * Update the tdp link count and write out the change.
   2355 	 * This reflects the ".." entry we'll soon write.
   2356 	 */
   2357 	if (dp->i_nlink == MAXLINK)
   2358 		return (EMLINK);
   2359 	if (attrdir == 0)
   2360 		dp->i_nlink++;
   2361 	TRANS_INODE(dp->i_ufsvfs, dp);
   2362 	dp->i_flag |= ICHG;
   2363 	dp->i_seq++;
   2364 	ufs_iupdat(dp, I_SYNC);
   2365 	/*
   2366 	 * Initialize directory with "."
   2367 	 * and ".." from static template.
   2368 	 *
   2369 	 * Since the parent directory is locked, we don't have to
   2370 	 * worry about anything changing when we drop the write
   2371 	 * lock on (ip).
   2372 	 *
   2373 	 */
   2374 	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
   2375 	    S_READ, &fbp);
   2376 
   2377 	if (err) {
   2378 		goto fail;
   2379 	}
   2380 	dirp = (struct dirtemplate *)fbp->fb_addr;
   2381 	/*
   2382 	 * Now initialize the directory we're creating
   2383 	 * with the "." and ".." entries.
   2384 	 */
   2385 	*dirp = mastertemplate;			/* structure assignment */
   2386 	dirp->dot_ino = (uint32_t)ip->i_number;
   2387 	dirp->dotdot_ino = (uint32_t)dp->i_number;
   2388 
   2389 	err = TRANS_DIR(ip, 0);
   2390 	if (err) {
   2391 		fbrelse(fbp, S_OTHER);
   2392 		goto fail;
   2393 	}
   2394 
   2395 	err = ufs_fbwrite(fbp, ip);
   2396 	if (err) {
   2397 		goto fail;
   2398 	}
   2399 
   2400 	return (0);
   2401 
   2402 fail:
   2403 	if (attrdir == 0)
   2404 		dp->i_nlink--;
   2405 	TRANS_INODE(dp->i_ufsvfs, dp);
   2406 	dp->i_flag |= ICHG;
   2407 	dp->i_seq++;
   2408 	ufs_iupdat(dp, I_SYNC);
   2409 	return (err);
   2410 }
   2411 
   2412 /*
   2413  * Delete a directory entry.  If oip is nonzero the entry is checked
   2414  * to make sure it still reflects oip.
   2415  *
   2416  * If vpp is non-null, return the ptr of the (held) vnode associated with
   2417  * the removed name.  The caller is responsible for doing the VN_RELE().
         *
         * dp is the directory to remove from and namep the name to remove.
         * cdir is compared against the victim's vnode for DR_RMDIR so that the
         * current directory is not removed.  op selects the extra checks made
         * (DR_RMDIR, DR_REMOVE, or DR_RENAME) and cr holds the caller's
         * credentials.
         *
         * The caller must hold dp->i_rwlock as writer.  That lock may be dropped
         * and re-acquired here if the victim directory's i_rwlock cannot be
         * taken immediately (see the retry path below).
         *
         * Returns 0 on success.  Failures seen directly in this routine are
         * ENOENT (empty name, no such entry, or entry no longer matches oip),
         * EINVAL ("." or an rmdir of dp itself or of cdir), EEXIST (".." or a
         * non-empty directory), ENOTDIR, EBUSY and EPERM, plus whatever the
         * helpers called here return.
   2418  */
   2419 int
   2420 ufs_dirremove(
   2421 	struct inode *dp,
   2422 	char *namep,
   2423 	struct inode *oip,
   2424 	struct vnode *cdir,
   2425 	enum dr_op op,
   2426 	struct cred *cr,
   2427 	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
   2428 {
   2429 	struct direct *ep, *pep, *nep;
   2430 	struct inode *ip;
   2431 	vnode_t *dvp, *vp;
   2432 	struct ufs_slot slot;
   2433 	int namlen;
   2434 	int err;
   2435 	int mode;
   2436 	ushort_t extra;
   2437 
   2438 	namlen = (int)strlen(namep);
   2439 	if (namlen == 0) {
   2440 		struct fs	*fs = dp->i_fs;
   2441 
   2442 		cmn_err(CE_WARN, "%s: ufs_dirremove: attempted to remove"
   2443 		    " nameless file in directory (directory inode %llu)",
   2444 		    fs->fs_fsmnt, (u_longlong_t)dp->i_number);
   2445 		ASSERT(namlen != 0);
   2446 
   2447 		return (ENOENT);
   2448 	}
   2449 
   2450 	/*
   2451 	 * return error when removing . and ..
   2452 	 */
   2453 	if (namep[0] == '.') {
   2454 		if (namlen == 1)
   2455 			return (EINVAL);
   2456 		else if (namlen == 2 && namep[1] == '.') {
   2457 			return (EEXIST);	/* SIGH should be ENOTEMPTY */
   2458 		}
   2459 	}
   2460 
   2461 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
   2462 
   2463 retry:
   2464 	/*
   2465 	 * Check accessibility of directory.
   2466 	 */
   2467 	if (err = ufs_diraccess(dp, IEXEC|IWRITE, cr))
   2468 		return (err);
   2469 
   2470 	ip = NULL;
   2471 	slot.fbp = NULL;
   2472 	slot.status = FOUND;	/* don't need to look for empty slot */
   2473 	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
   2474 	rw_enter(&dp->i_contents, RW_WRITER);
   2475 
        	/*
        	 * Look the name up.  On success ip is the entry's inode (NULL if
        	 * the name is absent) and slot describes where the entry lives.
        	 */
   2476 	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
   2477 	if (err)
   2478 		goto out_novfs;
   2479 	if (ip == NULL) {
   2480 		err = ENOENT;
   2481 		goto out_novfs;
   2482 	}
   2483 	vp = ITOV(ip);
   2484 	if (oip && oip != ip) {
   2485 		err = ENOENT;
   2486 		goto out_novfs;
   2487 	}
   2488 
        	/*
        	 * mode is sampled once here; every later cleanup path uses it to
        	 * decide whether the vfs lock and ip->i_rwlock were taken.
        	 */
   2489 	mode = ip->i_mode & IFMT;
   2490 	if (mode == IFDIR || mode == IFATTRDIR) {
   2491 
   2492 		/*
   2493 		 * vn_vfsrlock() prevents races between mount and rmdir.
   2494 		 */
   2495 		if (vn_vfsrlock(vp)) {
   2496 			err = EBUSY;
   2497 			goto out_novfs;
   2498 		}
   2499 		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
   2500 			err = EBUSY;
   2501 			goto out;
   2502 		}
   2503 		/*
   2504 		 * If we are removing a directory, get a lock on it.
   2505 		 * Taking a writer lock prevents a parallel ufs_dirlook from
   2506 		 * incorrectly entering a negative cache vnode entry in the dnlc
   2507 		 * If the directory is empty, it will stay empty until
   2508 		 * we can remove it.
   2509 		 */
   2510 		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
   2511 			/*
   2512 			 * It is possible that a thread in rename would have
   2513 			 * acquired this rwlock. To prevent a deadlock we
   2514 			 * do a rw_tryenter. If we fail to get the lock
   2515 			 * we drop all the locks we have acquired, wait
   2516 			 * for 2 ticks and reacquire the
   2517 			 * directory's (dp) i_rwlock and try again.
   2518 			 * If we don't drop dp's i_rwlock then we will panic
   2519 			 * with a "Deadlock: cycle in blocking chain"
   2520 			 * since in ufs_dircheckpath we want dp's i_rwlock.
   2521 			 * dp is guaranteed to exist since ufs_dirremove is
   2522 			 * called after a VN_HOLD(dp) has been done.
   2523 			 */
   2524 			ufs_dirremove_retry_cnt++;
   2525 			vn_vfsunlock(vp);
   2526 			if (slot.fbp)
   2527 				fbrelse(slot.fbp, S_OTHER);
   2528 			rw_exit(&dp->i_contents);
   2529 			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
   2530 			rw_exit(&dp->i_rwlock);
   2531 			VN_RELE(vp);
   2532 			delay(2);
   2533 			rw_enter(&dp->i_rwlock, RW_WRITER);
   2534 			goto retry;
   2535 		}
   2536 	}
   2537 	rw_enter(&ip->i_contents, RW_READER);
   2538 
   2539 	/*
   2540 	 * Now check the restrictions that apply on sticky directories.
   2541 	 */
   2542 	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
   2543 		rw_exit(&ip->i_contents);
   2544 		if (mode == IFDIR || mode == IFATTRDIR)
   2545 			rw_exit(&ip->i_rwlock);
   2546 		goto out;
   2547 	}
   2548 
   2549 	if (op == DR_RMDIR) {
   2550 		/*
   2551 		 * For rmdir(2), some special checks are required.
   2552 		 * (a) Don't remove any alias of the parent (e.g. ".").
   2553 		 * (b) Don't remove the current directory.
   2554 		 * (c) Make sure the entry is (still) a directory.
   2555 		 * (d) Make sure the directory is empty.
   2556 		 */
   2557 
   2558 		if (dp == ip || vp == cdir)
   2559 			err = EINVAL;
   2560 		else if (((ip->i_mode & IFMT) != IFDIR) &&
   2561 		    ((ip->i_mode & IFMT) != IFATTRDIR))
   2562 			err = ENOTDIR;
   2563 		else if ((ip->i_nlink > 2) ||
   2564 		    !ufs_dirempty(ip, dp->i_number, cr)) {
   2565 			err = EEXIST;	/* SIGH should be ENOTEMPTY */
   2566 		}
   2567 
   2568 		if (err) {
   2569 			rw_exit(&ip->i_contents);
   2570 			if (mode == IFDIR || mode == IFATTRDIR)
   2571 				rw_exit(&ip->i_rwlock);
   2572 			goto out;
   2573 		}
   2574 	} else if (op == DR_REMOVE)  {
   2575 		/*
   2576 		 * unlink(2) requires a different check: allow only
   2577 		 * privileged users to unlink a directory.
        		 *
        		 * NOTE(review): ip->i_rwlock is dropped here without
        		 * re-testing mode; this relies on v_type == VDIR implying
        		 * that the lock was taken above - confirm.
   2578 		 */
   2579 		if (vp->v_type == VDIR &&
   2580 		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
   2581 			err = EPERM;
   2582 			rw_exit(&ip->i_contents);
   2583 			rw_exit(&ip->i_rwlock);
   2584 			goto out;
   2585 		}
   2586 	}
   2587 
   2588 	rw_exit(&ip->i_contents);
   2589 
   2590 	/*
   2591 	 * Remove the cache'd entry, if any.
        	 * Clearing d_ino is what marks the on-disk record as unused.
   2592 	 */
   2593 	dvp = ITOV(dp);
   2594 	dnlc_remove(dvp, namep);
   2595 	ep = slot.ep;
   2596 	ep->d_ino = 0;
   2597 
   2598 	if (slot.cached) {
   2599 		dcanchor_t *dcap = &dp->i_danchor;
   2600 
        		/*
        		 * Keep the dnlc directory cache in step: drop the entry and
        		 * any free-space record this entry had at slot.offset.
        		 */
   2601 		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
   2602 		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
   2603 			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
   2604 		}
   2605 		if (slot.offset & (DIRBLKSIZ - 1)) {
   2606 			/*
   2607 			 * Collapse new free space into previous entry.
   2608 			 * Note, the previous entry has already been
   2609 			 * validated in ufs_dircheckforname().
   2610 			 */
   2611 			ASSERT(slot.size);
   2612 			pep = (struct direct *)((char *)ep - slot.size);
        			/*
        			 * An unused previous entry that is not at the start
        			 * of a block is unexpected; give up on the cache and
        			 * go straight to writing the block out.
        			 */
   2613 			if ((pep->d_ino == 0) &&
   2614 			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
   2615 				dnlc_dir_purge(dcap);
   2616 				slot.cached = 0;
   2617 				goto nocache;
   2618 			}
        			/* extra is the free space the previous entry had. */
   2619 			if (pep->d_ino) {
   2620 				extra = pep->d_reclen - DIRSIZ(pep);
   2621 			} else {
   2622 				extra = pep->d_reclen;
   2623 			}
   2624 			if (extra >= LDIRSIZ(1)) {
   2625 				(void) dnlc_dir_rem_space_by_handle(dcap,
   2626 				    (uint64_t)(slot.offset - slot.size));
   2627 			}
   2628 			pep->d_reclen += ep->d_reclen;
   2629 			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
   2630 			    (uint64_t)(slot.offset - slot.size));
   2631 			/* adjust the previous pointer in the next entry */
   2632 			nep = (struct direct *)((char *)ep + ep->d_reclen);
   2633 			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
   2634 				/*
   2635 				 * Not a new block.
   2636 				 *
   2637 				 * Check the validity of the entry.
   2638 				 * If it's bad, then throw away the cache and
   2639 				 * continue.
   2640 				 */
   2641 				if ((nep->d_reclen == 0) ||
   2642 				    (nep->d_reclen & 0x3) ||
   2643 				    (dnlc_dir_update(dcap, nep->d_name,
   2644 				    INO_OFF_TO_H(nep->d_ino,
   2645 				    slot.offset - slot.size)) == DNOENT)) {
   2646 					dnlc_dir_purge(dcap);
   2647 					slot.cached = 0;
   2648 				}
   2649 			}
   2650 		} else {
        			/* First record of a block: all of it is now free. */
   2651 			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
   2652 			    (uint64_t)slot.offset);
   2653 		}
   2654 	} else {
   2655 		/*
   2656 		 * If the entry isn't the first in the directory, we must
   2657 		 * reclaim the space of the now empty record by adding
   2658 		 * the record size to the size of the previous entry.
   2659 		 */
   2660 		if (slot.offset & (DIRBLKSIZ - 1)) {
   2661 			/*
   2662 			 * Collapse new free space into previous entry.
   2663 			 */
   2664 			pep = (struct direct *)((char *)ep - slot.size);
   2665 			pep->d_reclen += ep->d_reclen;
   2666 		}
   2667 	}
   2668 nocache:
   2669 
   2670 
        	/*
        	 * Write out the modified directory block.  On a TRANS_DIR()
        	 * failure the buffer is only released.  Either way slot.fbp is
        	 * cleared so the exit path does not release it a second time.
        	 */
   2671 	err = TRANS_DIR(dp, slot.offset);
   2672 	if (err)
   2673 		fbrelse(slot.fbp, S_OTHER);
   2674 	else
   2675 		err = ufs_fbwrite(slot.fbp, dp);
   2676 	slot.fbp = NULL;
   2677 
   2678 	/*
   2679 	 * If we were removing a directory, it is 'gone' now, but we cannot
   2680 	 * unlock it as a thread may be waiting for the lock in ufs_create. If
   2681 	 * we did, it could then create a file in a deleted directory.
   2682 	 */
   2683 
   2684 	if (err) {
   2685 		if (mode == IFDIR || mode == IFATTRDIR)
   2686 			rw_exit(&ip->i_rwlock);
   2687 		goto out;
   2688 	}
   2689 
   2690 	rw_enter(&ip->i_contents, RW_WRITER);
   2691 
   2692 	dp->i_flag |= IUPD|ICHG;
   2693 	dp->i_seq++;
   2694 	ip->i_flag |= ICHG;
   2695 	ip->i_seq++;
   2696 
   2697 	TRANS_INODE(dp->i_ufsvfs, dp);
   2698 	TRANS_INODE(ip->i_ufsvfs, ip);
   2699 	/*
   2700 	 * Now dispose of the inode.
   2701 	 */
   2702 	if (ip->i_nlink > 0) {
   2703 		/*
   2704 		 * This is not done for IFATTRDIR's because they don't
   2705 		 * have entries in the dnlc and the link counts are
   2706 		 * not incremented when they are created.
   2707 		 */
   2708 		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
   2709 			/*
   2710 			 * Decrement by 2 because we're trashing the "."
   2711 			 * entry as well as removing the entry in dp.
   2712 			 * Clear the directory entry, but there may be
   2713 			 * other hard links so don't free the inode.
   2714 			 * Decrement the dp linkcount because we're
   2715 			 * trashing the ".." entry.
   2716 			 */
   2717 			ip->i_nlink -= 2;
   2718 			dp->i_nlink--;
   2719 			ufs_setreclaim(dp);
   2720 			/*
   2721 			 * XXX need to discard negative cache entries
   2722 			 * for vp.  See comment in ufs_delete().
   2723 			 */
   2724 			dnlc_remove(vp, ".");
   2725 			dnlc_remove(vp, "..");
   2726 			/*
   2727 			 * The return value is ignored here because if
   2728 			 * the directory purge fails we don't want to
   2729 			 * stop the delete. If ufs_dirpurgedotdot fails
   2730 			 * the delete will continue with the preexisting
   2731 			 * behavior.
   2732 			 */
   2733 			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
   2734 		} else {
   2735 			ip->i_nlink--;
   2736 		}
   2737 		ufs_setreclaim(ip);
   2738 	}
   2739 	ITIMES_NOLOCK(dp);
   2740 	ITIMES_NOLOCK(ip);
   2741 
        	/* Without a transaction in effect, push both inodes synchronously. */
   2742 	if (!TRANS_ISTRANS(dp->i_ufsvfs))
   2743 		ufs_iupdat(dp, I_SYNC);
   2744 	if (!TRANS_ISTRANS(ip->i_ufsvfs))
   2745 		ufs_iupdat(ip, I_SYNC);
   2746 
   2747 	rw_exit(&ip->i_contents);
   2748 	if (mode == IFDIR || mode == IFATTRDIR)
   2749 		rw_exit(&ip->i_rwlock);
        	/*
        	 * out: reached with ip's locks already released; drops the vfs
        	 * lock taken by vn_vfsrlock() for directories.
        	 */
   2750 out:
   2751 	if (mode == IFDIR || mode == IFATTRDIR) {
   2752 		vn_vfsunlock(vp);
   2753 	}
        	/*
        	 * out_novfs: reached when vn_vfsrlock() was never taken (or has
        	 * just been dropped above).  Releases the directory buffer if it
        	 * is still held, then dp's i_contents and vfs_dqrwlock.
        	 */
   2754 out_novfs:
   2755 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
   2756 
   2757 	if (slot.fbp)
   2758 		fbrelse(slot.fbp, S_OTHER);
   2759 
   2760 	rw_exit(&dp->i_contents);
   2761 	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
   2762 
   2763 	/*
   2764 	 * If no error and vpp is non-NULL, return the vnode ptr to the caller.
   2765 	 * The caller becomes responsible for the VN_RELE().  Otherwise,
   2766 	 * Release (and delete) the inode after we drop vfs_dqrwlock to
   2767 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
        	 * vp is only assigned once ip is non-NULL, hence the test on ip.
   2768 	 */
   2769 	if (ip) {
   2770 		if ((err == 0) && (vpp != NULL)) {
   2771 			*vpp = ITOV(ip);
   2772 		} else {
   2773 			VN_RELE(vp);
   2774 		}
   2775 	}
   2776 
   2777 	return (err);
   2778 }
   2779 
   2780 /*
   2781  * Return buffer with contents of block "offset"
   2782  * from the beginning of directory "ip".  If "res"
   2783  * is non-zero, fill it in with a pointer to the
   2784  * remaining space in the directory.
   2785  *
   2786  */
   2787 
   2788 int
   2789 blkatoff(
   2790 	struct inode *ip,
   2791 	off_t offset,
   2792 	char **res,
   2793 	struct fbuf **fbpp)
   2794 {
   2795 	struct fs *fs;
   2796 	struct fbuf *fbp;
   2797 	daddr_t lbn;
   2798 	uint_t bsize;
   2799 	int err;
   2800 
   2801 	CPU_STATS_ADD_K(sys, ufsdirblk, 1);
   2802 	fs = ip->i_fs;
   2803 	lbn = (daddr_t)lblkno(fs, offset);
   2804 	bsize = (uint_t)blksize(fs, ip, lbn);
   2805 	err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
   2806 	    bsize, S_READ, &fbp);
   2807 	if (err) {
   2808 		*fbpp = (struct fbuf *)NULL;
   2809 		return (err);
   2810 	}
   2811 	if (res)
   2812 		*res = fbp->fb_addr + blkoff(fs, offset);
   2813 	*fbpp = fbp;
   2814 	return (0);
   2815 }
   2816 
   2817 /*
   2818  * Do consistency checking:
   2819  *	record length must be multiple of 4
   2820  *	entry must fit in rest of its DIRBLKSIZ block
   2821  *	record must be large enough to contain entry
   2822  *	name is not longer than MAXNAMLEN
   2823  *	name must be as long as advertised, and null terminated
   2824  * NOTE: record length must not be zero (should be checked previously).
   2825  *       This routine is only called if dirchk is true.
   2826  *       It would be nice to set the FSBAD flag in the super-block when
   2827  *       this routine fails so that a fsck is forced on next reboot,
   2828  *       but locking is a problem.
   2829  */
   2830 static int
   2831 dirmangled(
   2832 	struct inode *dp,
   2833 	struct direct *ep,
   2834 	int entryoffsetinblock,
   2835 	off_t offset)
   2836 {
   2837 	int i;
   2838 
   2839 	i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
   2840 	if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i ||
   2841 	    (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN ||
   2842 	    ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) {
   2843 		dirbad(dp, "mangled entry", offset);
   2844 		return (1);
   2845 	}
   2846 	return (0);
   2847 }
   2848 
   2849 static void
   2850 dirbad(struct inode *ip, char *how, off_t offset)
   2851 {
   2852 	cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s",
   2853 	    ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how);
   2854 }
   2855 
   2856 static int
   2857 dirbadname(char *sp, int l)
   2858 {
   2859 	while (l--) {			/* check for nulls */
   2860 		if (*sp++ == '\0') {
   2861 			return (1);
   2862 		}
   2863 	}
   2864 	return (*sp);			/* check for terminating null */
   2865 }
   2866 
   2867 /*
   2868  * Check if a directory is empty or not.
   2869  */
   2870 static int
   2871 ufs_dirempty(
   2872 	struct inode *ip,
   2873 	ino_t parentino,
   2874 	struct cred *cr)
   2875 {
   2876 	return (ufs_dirscan(ip, parentino, cr, 0));
   2877 }
   2878 
   2879 /*
   2880  * clear the .. directory entry.
   2881  */
   2882 static int
   2883 ufs_dirpurgedotdot(
   2884 	struct inode *ip,
   2885 	ino_t parentino,
   2886 	struct cred *cr)
   2887 {
   2888 	return (ufs_dirscan(ip, parentino, cr, 1));
   2889 }
   2890 
   2891 /*
   2892  * Scan the directoy. If clr_dotdot is true clear the ..
   2893  * directory else check to see if the directory is empty.
   2894  *
   2895  * Using a struct dirtemplate here is not precisely
   2896  * what we want, but better than using a struct direct.
   2897  *
   2898  * clr_dotdot is used as a flag to tell us if we need
   2899  * to clear the dotdot entry
   2900  *
   2901  * N.B.: does not handle corrupted directories.
   2902  */
   2903 static int
   2904 ufs_dirscan(
   2905 	struct inode *ip,
   2906 	ino_t parentino,
   2907 	struct cred *cr,
   2908 	int clr_dotdot)
   2909 {
   2910 	offset_t off;
   2911 	struct dirtemplate dbuf;
   2912 	struct direct *dp = (struct direct *)&dbuf;
   2913 	int err, count;
   2914 	int empty = 1;	/* Assume it's empty */
   2915 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
   2916 
   2917 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
   2918 
   2919 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
   2920 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
   2921 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
   2922 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
   2923 		/*
   2924 		 * Since we read MINDIRSIZ, residual must
   2925 		 * be 0 unless we're at end of file.
   2926 		 */
   2927 		if (err || count != 0 || dp->d_reclen == 0) {
   2928 			empty = 0;
   2929 			break;
   2930 		}
   2931 		/* skip empty entries */
   2932 		if (dp->d_ino == 0)
   2933 			continue;
   2934 		/* accept only "." and ".." */
   2935 		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
   2936 			empty = 0;
   2937 			break;
   2938 		}
   2939 		/*
   2940 		 * At this point d_namlen must be 1 or 2.
   2941 		 * 1 implies ".", 2 implies ".." if second
   2942 		 * char is also "."
   2943 		 */
   2944 		if (dp->d_namlen == 1)
   2945 			continue;
   2946 		if (dp->d_name[1] == '.' &&
   2947 		    (ino_t)dp->d_ino == parentino) {
   2948 			/*
   2949 			 * If we're doing a purge we need to check for
   2950 			 * the . and .. entries and clear the d_ino for ..
   2951 			 *
   2952 			 * if clr_dotdot is set ufs_dirscan does not
   2953 			 * check for an empty directory.
   2954 			 */
   2955 			if (clr_dotdot) {
   2956 				/*
   2957 				 * Have to actually zap the ..
   2958 				 * entry in the directory, as
   2959 				 * otherwise someone might have
   2960 				 * dp as its cwd and try to
   2961 				 * open .., which now points to
   2962 				 * an unallocated inode.
   2963 				 */
   2964 				empty = ufs_dirclrdotdot(ip, parentino);
   2965 				break;
   2966 			} else {
   2967 				continue;
   2968 			}
   2969 		}
   2970 		empty = 0;
   2971 		break;
   2972 	}
   2973 	return (empty);
   2974 }
   2975 
/*
 * Tunables/statistics for ufs_dircheckpath():
 *   retry_backoff_delay - value handed to delay() before i_rwlock is retried
 *   dircheck_retry_cnt  - number of times the retry path has been taken
 *                         (incremented without any locking)
 */
clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */
uint64_t dircheck_retry_cnt;
/*
 * Check if source directory inode is in the path of the target directory.
 * Target is supplied locked.
 *
 * The source and target inode's should be different upon entry.
 *
 * Arguments:
 *   source_ino - inode number of the directory being moved
 *   target     - directory to start walking up from; its i_rwlock is held
 *                by the caller and is never dropped here
 *   sdp        - directory currently containing the source; its i_rwlock
 *                is also held by the caller
 *   cr         - credentials passed to ufs_iget_alloced()
 *
 * Returns:
 *   0       - the walk reached the root (or sdp) without meeting source_ino
 *   EINVAL  - source_ino is target itself or one of its ancestors
 *   ENOTDIR - an inode on the path is not a sane directory, or its ".."
 *             entry is mangled
 *   EAGAIN  - an ancestor's i_rwlock could not be taken and appears to have
 *             a writer waiting; returned to avoid a deadlock
 *   other   - error from blkatoff() or ufs_iget_alloced()
 *
 * Every intermediate inode obtained here is unlocked and released before
 * returning; "target" itself is left exactly as it was supplied.
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	ip = target;
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs	*vfs;

		/* Map the first block of ip; it begins with "." and "..". */
		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		if (dotdotino == source_ino) {
			/* The source is an ancestor of the target. */
			err = EINVAL;
			break;
		}
		/* Reached the root without meeting the source; err is 0. */
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		/* Remember these before ip is (possibly) released below. */
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		/*
		 * Drop the directory we just examined, unless it is the
		 * caller's target, before going after its parent.
		 */
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			/* Nothing is held any more; skip the cleanup at out. */
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain. This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			/* err is 0 here: ufs_iget_alloced() just succeeded. */
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing the same code and in similar situation
		 * we end up in a 4 way deadlock. We need to make sure that
		 * the WRITE_WANTED bit is not set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock held as WRITER thats fine but if it
			 * has WRITE_WANTED bit set we might end up in a
			 * deadlock. If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
			    !(RW_WRITE_HELD(&ip->i_rwlock))) {
				err = EAGAIN;
				/*
				 * fbp was already released and cleared above,
				 * so this is purely defensive.
				 */
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				/*
				 * The lock on ip was never obtained, so only
				 * the hold needs to be dropped.
				 */
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held. We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	/* Release whatever block was still mapped when the loop ended. */
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	/*
	 * ip is NULL when nothing is held, target when only the caller's
	 * own lock/hold remain, or an ancestor that we locked and held.
	 */
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}
   3128 
   3129 int
   3130 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
   3131 {
   3132 	offset_t off;
   3133 	struct dirtemplate dbuf;
   3134 	struct direct *dp = (struct direct *)&dbuf;
   3135 	int err, count;
   3136 	int empty = 1;	/* Assume it's empty */
   3137 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
   3138 
   3139 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
   3140 
   3141 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
   3142 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
   3143 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
   3144 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
   3145 		/*
   3146 		 * Since we read MINDIRSIZ, residual must
   3147 		 * be 0 unless we're at end of file.
   3148 		 */
   3149 
   3150 		if (err || count != 0 || dp->d_reclen == 0) {
   3151 			empty = 0;
   3152 			break;
   3153 		}
   3154 		/* skip empty entries */
   3155 		if (dp->d_ino == 0)
   3156 			continue;
   3157 		/*
   3158 		 * At this point d_namlen must be 1 or 2.
   3159 		 * 1 implies ".", 2 implies ".." if second
   3160 		 * char is also "."
   3161 		 */
   3162 
   3163 		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
   3164 		    (ino_t)dp->d_ino == parentino)
   3165 			continue;
   3166 
   3167 		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
   3168 		    dp->d_name[1] == '.') {
   3169 			continue;
   3170 		}
   3171 		empty = 0;
   3172 		break;
   3173 	}
   3174 	return (empty);
   3175 }
   3176 
   3177 
   3178 /*
   3179  * Allocate and initialize a new shadow inode to contain extended attributes.
   3180  */
   3181 int
   3182 ufs_xattrmkdir(
   3183 	struct inode *tdp,
   3184 	struct inode **ipp,
   3185 	int flags,
   3186 	struct cred *cr)
   3187 {
   3188 	struct inode *ip;
   3189 	struct vattr va;
   3190 	int err;
   3191 	int retry = 1;
   3192 	struct ufsvfs *ufsvfsp;
   3193 	struct ulockfs *ulp;
   3194 	int issync;
   3195 	int trans_size;
   3196 	int dorwlock;		/* 0 = not yet taken, */
   3197 				/* 1 = taken outside the transaction, */
   3198 				/* 2 = taken inside the transaction */
   3199 
   3200 	/*
   3201 	 * Validate permission to create attribute directory
   3202 	 */
   3203 
   3204 	if ((err = ufs_iaccess(tdp, IWRITE, cr, 1)) != 0) {
   3205 		return (err);
   3206 	}
   3207 
   3208 	if (vn_is_readonly(ITOV(tdp)))
   3209 		return (EROFS);
   3210 
   3211 	/*
   3212 	 * No need to re-init err after again:, since it's set before
   3213 	 * the next use of it.
   3214 	 */
   3215 again:
   3216 	dorwlock = 0;
   3217 	va.va_type = VDIR;
   3218 	va.va_uid = tdp->i_uid;
   3219 	va.va_gid = tdp->i_gid;
   3220 
   3221 	if ((tdp->i_mode & IFMT) == IFDIR) {
   3222 		va.va_mode = (o_mode_t)IFATTRDIR;
   3223 		va.va_mode |= tdp->i_mode & 0777;
   3224 	} else {
   3225 		va.va_mode = (o_mode_t)IFATTRDIR|0700;
   3226 		if (tdp->i_mode & 0040)
   3227 			va.va_mode |= 0750;
   3228 		if (tdp->i_mode & 0004)
   3229 			va.va_mode |= 0705;
   3230 	}
   3231 	va.va_mask = AT_TYPE|AT_MODE;
   3232 
   3233 	ufsvfsp = tdp->i_ufsvfs;
   3234 
   3235 	err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
   3236 	if (err)
   3237 		return (err);
   3238 
   3239 	/*
   3240 	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
   3241 	 * This follows the protocol for read()/write().
   3242 	 */
   3243 	if (ITOV(tdp)->v_type != VDIR) {
   3244 		rw_enter(&tdp->i_rwlock, RW_WRITER);
   3245 		dorwlock = 1;
   3246 	}
   3247 
   3248 	if (ulp) {
   3249 		trans_size = (int)TOP_MKDIR_SIZE(tdp);
   3250 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size);
   3251 	}
   3252 
   3253 	/*
   3254 	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
   3255 	 * This follows the protocol established by
   3256 	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
   3257 	 */
   3258 	if (dorwlock == 0) {
   3259 		rw_enter(&tdp->i_rwlock, RW_WRITER);
   3260 		dorwlock = 2;
   3261 	}
   3262 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   3263 	rw_enter(&tdp->i_contents, RW_WRITER);
   3264 
   3265 	/*
   3266 	 * Suppress out of inodes messages if we will retry.
   3267 	 */
   3268 	if (retry)
   3269 		tdp->i_flag |= IQUIET;
   3270 	err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
   3271 	tdp->i_flag &= ~IQUIET;
   3272 
   3273 	if (err)
   3274 		goto fail;
   3275 
   3276 	if (flags) {
   3277 
   3278 		/*
   3279 		 * Now attach it to src file.
   3280 		 */
   3281 
   3282 		tdp->i_oeftflag = ip->i_number;
   3283 	}
   3284 
   3285 	ip->i_cflags |= IXATTR;
   3286 	ITOV(ip)->v_flag |= V_XATTRDIR;
   3287 	TRANS_INODE(ufsvfsp, tdp);
   3288 	tdp->i_flag |= ICHG | IUPD;
   3289 	tdp->i_seq++;
   3290 	ufs_iupdat(tdp, I_SYNC);
   3291 	rw_exit(&tdp->i_contents);
   3292 	rw_exit(&ufsvfsp->vfs_dqrwlock);
   3293 
   3294 	rw_enter(&ip->i_rwlock, RW_WRITER);
   3295 	rw_enter(&ip->i_contents, RW_WRITER);
   3296 	TRANS_INODE(ufsvfsp, ip);
   3297 	ip->i_flag |= ICHG| IUPD;
   3298 	ip->i_seq++;
   3299 	ufs_iupdat(ip, I_SYNC);
   3300 	rw_exit(&ip->i_contents);
   3301 	rw_exit(&ip->i_rwlock);
   3302 	if (dorwlock == 2)
   3303 		rw_exit(&tdp->i_rwlock);
   3304 	if (ulp) {
   3305 		int terr = 0;
   3306 
   3307 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
   3308 		ufs_lockfs_end(ulp);
   3309 		if (err == 0)
   3310 			err = terr;
   3311 	}
   3312 	if (dorwlock == 1)
   3313 		rw_exit(&tdp->i_rwlock);
   3314 	*ipp = ip;
   3315 	return (err);
   3316 
   3317 fail:
   3318 	rw_exit(&tdp->i_contents);
   3319 	rw_exit(&ufsvfsp->vfs_dqrwlock);
   3320 	if (dorwlock == 2)
   3321 		rw_exit(&tdp->i_rwlock);
   3322 	if (ulp) {
   3323 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
   3324 		ufs_lockfs_end(ulp);
   3325 	}
   3326 	if (dorwlock == 1)
   3327 		rw_exit(&tdp->i_rwlock);
   3328 	if (ip != NULL)
   3329 		VN_RELE(ITOV(ip));
   3330 
   3331 	/*
   3332 	 * No inodes?  See if any are tied up in pending deletions.
   3333 	 * This has to be done outside of any of the above, because
   3334 	 * the draining operation can't be done from inside a transaction.
   3335 	 */
   3336 	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
   3337 		ufs_delete_drain_wait(ufsvfsp, 1);
   3338 		retry = 0;
   3339 		goto again;
   3340 	}
   3341 
   3342 	return (err);
   3343 }
   3344 
   3345 /*
   3346  * clear the dotdot directory entry.
   3347  * Used by ufs_dirscan when clr_dotdot
   3348  * flag is set and we're deleting a
   3349  * directory.
   3350  */
   3351 static int
   3352 ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
   3353 {
   3354 	struct fbuf *fbp;
   3355 	struct direct *dotp, *dotdotp;
   3356 	int err = 0;
   3357 
   3358 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
   3359 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
   3360 	err = blkatoff(ip, 0, NULL, &fbp);
   3361 	if (err) {
   3362 		return (err);
   3363 	}
   3364 
   3365 	dotp = (struct direct *)fbp->fb_addr;
   3366 	if ((dotp->d_namlen < (MAXNAMLEN + 1)) &&
   3367 	    ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) {
   3368 		dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
   3369 		if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) &&
   3370 		    ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) {
   3371 
   3372 			dotp->d_reclen += dotdotp->d_reclen;
   3373 			if (parentino == dotdotp->d_ino) {
   3374 				dotdotp->d_ino = 0;
   3375 				dotdotp->d_namlen = 0;
   3376 				dotdotp->d_reclen = 0;
   3377 			}
   3378 
   3379 			err = TRANS_DIR(ip, 0);
   3380 			if (err) {
   3381 				fbrelse(fbp, S_OTHER);
   3382 			} else {
   3383 				err = ufs_fbwrite(fbp, ip);
   3384 			}
   3385 		}
   3386 	} else {
   3387 		err = -1;
   3388 	}
   3389 	return (err);
   3390 }
   3391