Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     28 /*	  All Rights Reserved  	*/
     29 
     30 /*
     31  * Portions of this source code were derived from Berkeley 4.3 BSD
     32  * under license from the Regents of the University of California.
     33  */
     34 
     35 #include <sys/types.h>
     36 #include <sys/t_lock.h>
     37 #include <sys/ksynch.h>
     38 #include <sys/param.h>
     39 #include <sys/time.h>
     40 #include <sys/systm.h>
     41 #include <sys/sysmacros.h>
     42 #include <sys/resource.h>
     43 #include <sys/signal.h>
     44 #include <sys/cred.h>
     45 #include <sys/user.h>
     46 #include <sys/buf.h>
     47 #include <sys/vfs.h>
     48 #include <sys/vfs_opreg.h>
     49 #include <sys/vnode.h>
     50 #include <sys/proc.h>
     51 #include <sys/disp.h>
     52 #include <sys/file.h>
     53 #include <sys/fcntl.h>
     54 #include <sys/flock.h>
     55 #include <sys/atomic.h>
     56 #include <sys/kmem.h>
     57 #include <sys/uio.h>
     58 #include <sys/dnlc.h>
     59 #include <sys/conf.h>
     60 #include <sys/mman.h>
     61 #include <sys/pathname.h>
     62 #include <sys/debug.h>
     63 #include <sys/vmsystm.h>
     64 #include <sys/cmn_err.h>
     65 #include <sys/filio.h>
     66 #include <sys/policy.h>
     67 
     68 #include <sys/fs/ufs_fs.h>
     69 #include <sys/fs/ufs_lockfs.h>
     70 #include <sys/fs/ufs_filio.h>
     71 #include <sys/fs/ufs_inode.h>
     72 #include <sys/fs/ufs_fsdir.h>
     73 #include <sys/fs/ufs_quota.h>
     74 #include <sys/fs/ufs_log.h>
     75 #include <sys/fs/ufs_snap.h>
     76 #include <sys/fs/ufs_trans.h>
     77 #include <sys/fs/ufs_panic.h>
     78 #include <sys/fs/ufs_bio.h>
     79 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
     80 #include <sys/errno.h>
     81 #include <sys/fssnap_if.h>
     82 #include <sys/unistd.h>
     83 #include <sys/sunddi.h>
     84 
     85 #include <sys/filio.h>		/* _FIOIO */
     86 
     87 #include <vm/hat.h>
     88 #include <vm/page.h>
     89 #include <vm/pvn.h>
     90 #include <vm/as.h>
     91 #include <vm/seg.h>
     92 #include <vm/seg_map.h>
     93 #include <vm/seg_vn.h>
     94 #include <vm/seg_kmem.h>
     95 #include <vm/rm.h>
     96 #include <sys/swap.h>
     97 
     98 #include <fs/fs_subr.h>
     99 
    100 #include <sys/fs/decomp.h>
    101 
        /*
         * UFS statistics counters.  In the code visible here only
         * ins.in_poc is touched: ufs_close() increments it each time it
         * pushes a delayed cluster.
         */
    102 static struct instats ins;
    103 
        /*
         * Forward declarations.
         *
         * Most of the ufs_* routines declared below are the vnode operation
         * entry points installed through ufs_vnodeops_template[].
         * ufs_getpage_ra(), ufs_getpage_miss(), ufs_putpages(), save_dblks()
         * and ufs_priv_access() are file-local routines that do not appear in
         * that table.  as_map_locked() is declared extern here.
         */
    104 static 	int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
    105 static	int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *,
    106 		caddr_t, struct page **, size_t, enum seg_rw, int);
    107 static	int ufs_open(struct vnode **, int, struct cred *, caller_context_t *);
    108 static	int ufs_close(struct vnode *, int, int, offset_t, struct cred *,
    109 		caller_context_t *);
    110 static	int ufs_read(struct vnode *, struct uio *, int, struct cred *,
    111 		struct caller_context *);
    112 static	int ufs_write(struct vnode *, struct uio *, int, struct cred *,
    113 		struct caller_context *);
    114 static	int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *,
    115 		int *, caller_context_t *);
    116 static	int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *,
    117 		caller_context_t *);
    118 static	int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *,
    119 		caller_context_t *);
    120 static	int ufs_access(struct vnode *, int, int, struct cred *,
    121 		caller_context_t *);
    122 static	int ufs_lookup(struct vnode *, char *, struct vnode **,
    123 		struct pathname *, int, struct vnode *, struct cred *,
    124 		caller_context_t *, int *, pathname_t *);
    125 static	int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl,
    126 		int, struct vnode **, struct cred *, int,
    127 		caller_context_t *, vsecattr_t  *);
    128 static	int ufs_remove(struct vnode *, char *, struct cred *,
    129 		caller_context_t *, int);
    130 static	int ufs_link(struct vnode *, struct vnode *, char *, struct cred *,
    131 		caller_context_t *, int);
    132 static	int ufs_rename(struct vnode *, char *, struct vnode *, char *,
    133 		struct cred *, caller_context_t *, int);
    134 static	int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **,
    135 		struct cred *, caller_context_t *, int, vsecattr_t *);
    136 static	int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *,
    137 		caller_context_t *, int);
    138 static	int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *,
    139 		caller_context_t *, int);
    140 static	int ufs_symlink(struct vnode *, char *, struct vattr *, char *,
    141 		struct cred *, caller_context_t *, int);
    142 static	int ufs_readlink(struct vnode *, struct uio *, struct cred *,
    143 		caller_context_t *);
    144 static	int ufs_fsync(struct vnode *, int, struct cred *, caller_context_t *);
    145 static	void ufs_inactive(struct vnode *, struct cred *, caller_context_t *);
    146 static	int ufs_fid(struct vnode *, struct fid *, caller_context_t *);
    147 static	int ufs_rwlock(struct vnode *, int, caller_context_t *);
    148 static	void ufs_rwunlock(struct vnode *, int, caller_context_t *);
    149 static	int ufs_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
    150 static	int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    151 		struct flk_callback *, struct cred *,
    152 		caller_context_t *);
    153 static  int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t,
    154 		cred_t *, caller_context_t *);
    155 static	int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *,
    156 		struct page **, size_t, struct seg *, caddr_t,
    157 		enum seg_rw, struct cred *, caller_context_t *);
    158 static	int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *,
    159 		caller_context_t *);
    160 static	int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *);
    161 static	int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    162 		uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
    163 static	int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
    164 		uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
    165 static	int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
    166 		uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
    167 static	int ufs_poll(vnode_t *, short, int, short *, struct pollhead **,
    168 		caller_context_t *);
    169 static	int ufs_dump(vnode_t *, caddr_t, offset_t, offset_t,
    170     caller_context_t *);
    171 static	int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *,
    172 		caller_context_t *);
    173 static	int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int,
    174 		struct cred *, caller_context_t *);
    175 static	int ufs_dumpctl(vnode_t *, int, offset_t *, caller_context_t *);
    176 static	daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *,
    177 		daddr32_t *, int, int);
    178 static	int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
    179 		caller_context_t *);
    180 static	int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
    181 		caller_context_t *);
    182 static	int ufs_priv_access(void *, int, struct cred *);
    183 extern int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
    184 
    185 /*
    186  * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions.
    187  *
    188  * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet.
    189  */
        /*
         * Pointer to the UFS vnode operations vector.  It is only declared
         * here.
         * NOTE(review): presumably built from ufs_vnodeops_template[] during
         * file system initialization - confirm in the UFS init code.
         */
    190 struct vnodeops *ufs_vnodeops;
    191 
    192 /* NOTE: "not blkd" below  means that the operation isn't blocked by lockfs */
        /*
         * Each entry pairs a VOPNAME_* operation name with the routine that
         * implements it for UFS; the final NULL/NULL pair ends the table.
         * VOPNAME_VNEVENT is the only entry served by a routine that is not a
         * ufs_* function (fs_vnevent_support).
         */
    193 const fs_operation_def_t ufs_vnodeops_template[] = {
    194 	VOPNAME_OPEN,		{ .vop_open = ufs_open },	/* not blkd */
    195 	VOPNAME_CLOSE,		{ .vop_close = ufs_close },	/* not blkd */
    196 	VOPNAME_READ,		{ .vop_read = ufs_read },
    197 	VOPNAME_WRITE,		{ .vop_write = ufs_write },
    198 	VOPNAME_IOCTL,		{ .vop_ioctl = ufs_ioctl },
    199 	VOPNAME_GETATTR,	{ .vop_getattr = ufs_getattr },
    200 	VOPNAME_SETATTR,	{ .vop_setattr = ufs_setattr },
    201 	VOPNAME_ACCESS,		{ .vop_access = ufs_access },
    202 	VOPNAME_LOOKUP,		{ .vop_lookup = ufs_lookup },
    203 	VOPNAME_CREATE,		{ .vop_create = ufs_create },
    204 	VOPNAME_REMOVE,		{ .vop_remove = ufs_remove },
    205 	VOPNAME_LINK,		{ .vop_link = ufs_link },
    206 	VOPNAME_RENAME,		{ .vop_rename = ufs_rename },
    207 	VOPNAME_MKDIR,		{ .vop_mkdir = ufs_mkdir },
    208 	VOPNAME_RMDIR,		{ .vop_rmdir = ufs_rmdir },
    209 	VOPNAME_READDIR,	{ .vop_readdir = ufs_readdir },
    210 	VOPNAME_SYMLINK,	{ .vop_symlink = ufs_symlink },
    211 	VOPNAME_READLINK,	{ .vop_readlink = ufs_readlink },
    212 	VOPNAME_FSYNC,		{ .vop_fsync = ufs_fsync },
    213 	VOPNAME_INACTIVE,	{ .vop_inactive = ufs_inactive }, /* not blkd */
    214 	VOPNAME_FID,		{ .vop_fid = ufs_fid },
    215 	VOPNAME_RWLOCK,		{ .vop_rwlock = ufs_rwlock },	/* not blkd */
    216 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = ufs_rwunlock }, /* not blkd */
    217 	VOPNAME_SEEK,		{ .vop_seek = ufs_seek },
    218 	VOPNAME_FRLOCK,		{ .vop_frlock = ufs_frlock },
    219 	VOPNAME_SPACE,		{ .vop_space = ufs_space },
    220 	VOPNAME_GETPAGE,	{ .vop_getpage = ufs_getpage },
    221 	VOPNAME_PUTPAGE,	{ .vop_putpage = ufs_putpage },
    222 	VOPNAME_MAP,		{ .vop_map = ufs_map },
    223 	VOPNAME_ADDMAP,		{ .vop_addmap = ufs_addmap },	/* not blkd */
    224 	VOPNAME_DELMAP,		{ .vop_delmap = ufs_delmap },	/* not blkd */
    225 	VOPNAME_POLL,		{ .vop_poll = ufs_poll },	/* not blkd */
    226 	VOPNAME_DUMP,		{ .vop_dump = ufs_dump },
    227 	VOPNAME_PATHCONF,	{ .vop_pathconf = ufs_l_pathconf },
    228 	VOPNAME_PAGEIO,		{ .vop_pageio = ufs_pageio },
    229 	VOPNAME_DUMPCTL,	{ .vop_dumpctl = ufs_dumpctl },
    230 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = ufs_getsecattr },
    231 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = ufs_setsecattr },
    232 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
    233 	NULL,			NULL
    234 };
    235 
        /*
         * NOTE(review): MAX_BACKFILE_COUNT is not referenced in the part of
         * this file reviewed here; locate its consumer before changing it.
         */
    236 #define	MAX_BACKFILE_COUNT	9999
    237 
    238 /*
    239  * Created by ufs_dumpctl() to store a file's disk block info into memory.
    240  * Used by ufs_dump() to dump data to disk directly.
        *
        * NOTE(review): dblk[1] looks like the trailing-array idiom, i.e. the
        * structure is presumably allocated with room for `fsbs' entries -
        * confirm against the allocation in ufs_dumpctl().
    241  */
    242 struct dump {
    243 	struct inode	*ip;		/* the file we contain */
    244 	daddr_t		fsbs;		/* number of blocks stored */
    245 	struct timeval32 time;		/* time stamp for the struct */
    246 	daddr32_t 	dblk[1];	/* place holder for block info */
    247 };
    248 
        /* The single dump description kept by this file; NULL until one exists. */
    249 static struct dump *dump_info = NULL;
    250 
    251 /*
    252  * Previously there was no special action required for ordinary files.
    253  * (Devices are handled through the device file system.)
    254  * Now we support Large Files and Large File API requires open to
    255  * fail if file is large.
    256  * We could take care to prevent data corruption
    257  * by doing an atomic check of size and truncate if file is opened with
    258  * FTRUNC flag set but traditionally this is being done by the vfs/vnode
    259  * layers. So taking care of truncation here is a change in the existing
    260  * semantics of VOP_OPEN and therefore we chose not to implement anything
    261  * here. The check for the size of the file > 2GB is being done at the
    262  * vfs layer in routine vn_open().
    263  */
    264 
        /*
         * ufs_open - VOP_OPEN entry point for UFS.
         *
         * None of the arguments are examined; the routine always returns 0.
         */
    265 /* ARGSUSED */
    266 static int
    267 ufs_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ct)
    268 {
    269 	return (0);
    270 }
    271 
    272 /*ARGSUSED*/
        /*
         * ufs_close - VOP_CLOSE entry point for UFS.
         *
         * Calls cleanlocks() and cleanshares() for the calling process's pid,
         * then, on what is approximately the last close of a vnode that is not
         * VBAD, starts an asynchronous push of the inode's delayed-write range
         * (i_delayoff, i_delaylen) and clears i_delaylen.
         *
         * flag, count, offset and ct are not used.  Always returns 0; the
         * result of ufs_putpages() is deliberately discarded.
         */
    273 static int
    274 ufs_close(struct vnode *vp, int flag, int count, offset_t offset,
    275 	struct cred *cr, caller_context_t *ct)
    276 {
    277 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
    278 	cleanshares(vp, ttoproc(curthread)->p_pid);
    279 
    280 	/*
    281 	 * Push partially filled cluster at last close.
    282 	 * ``last close'' is approximated because the dnlc
    283 	 * may have a hold on the vnode.
    284 	 * Checking for VBAD here will also act as a forced umount check.
    285 	 */
    286 	if (vp->v_count <= 2 && vp->v_type != VBAD) {
    287 		struct inode *ip = VTOI(vp);
    288 		if (ip->i_delaylen) {
        			/* count the push, then free the pages asynchronously */
    289 			ins.in_poc.value.ul++;
    290 			(void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen,
    291 			    B_ASYNC | B_FREE, cr);
    292 			ip->i_delaylen = 0;
    293 		}
    294 	}
    295 
    296 	return (0);
    297 }
    298 
    299 /*ARGSUSED*/
        /*
         * ufs_read - VOP_READ entry point for UFS.
         *
         * The caller must hold ip->i_rwlock as reader (asserted below).  The
         * data transfer itself is done by rdip() with ip->i_contents held as
         * reader.  Returns 0 or an error from chklock(), ufs_lockfs_begin() or
         * rdip(); the TRANS_*_SYNC macros are also handed `error'.
         *
         * NOTE(review): every use of ulp after a successful ufs_lockfs_begin()
         * is guarded by `if (ulp)', so the call can evidently succeed and leave
         * ulp NULL - confirm when that happens.
         */
    300 static int
    301 ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
    302 	struct caller_context *ct)
    303 {
    304 	struct inode *ip = VTOI(vp);
    305 	struct ufsvfs *ufsvfsp;
    306 	struct ulockfs *ulp = NULL;
    307 	int error = 0;
    308 	int intrans = 0;
    309 
    310 	ASSERT(RW_READ_HELD(&ip->i_rwlock));
    311 
    312 	/*
    313 	 * Mandatory locking needs to be done before ufs_lockfs_begin()
    314 	 * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep.
    315 	 */
    316 	if (MANDLOCK(vp, ip->i_mode)) {
    317 		/*
    318 		 * ufs_getattr ends up being called by chklock
    319 		 */
    320 		error = chklock(vp, FREAD, uiop->uio_loffset,
    321 		    uiop->uio_resid, uiop->uio_fmode, ct);
    322 		if (error)
    323 			goto out;
    324 	}
    325 
    326 	ufsvfsp = ip->i_ufsvfs;
    327 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
    328 	if (error)
    329 		goto out;
    330 
    331 	/*
    332 	 * In the case that a directory is opened for reading as a file
    333 	 * (eg "cat .") with the  O_RSYNC, O_SYNC and O_DSYNC flags set,
    334 	 * the locking order had to be changed to avoid a deadlock with
    335 	 * an update taking place on that directory at the same time.
    336 	 */
    337 	if ((ip->i_mode & IFMT) == IFDIR) {
    338 
    339 		rw_enter(&ip->i_contents, RW_READER);
    340 		error = rdip(ip, uiop, ioflag, cr);
    341 		rw_exit(&ip->i_contents);
    342 
    343 		if (error) {
    344 			if (ulp)
    345 				ufs_lockfs_end(ulp);
    346 			goto out;
    347 		}
    348 
        		/*
        		 * The read is already complete.  i_rwlock is dropped
        		 * around an otherwise empty sync transaction and taken
        		 * again as reader before returning.
        		 */
    349 		if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
    350 		    TRANS_ISTRANS(ufsvfsp)) {
    351 			rw_exit(&ip->i_rwlock);
    352 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
    353 			    error);
    354 			ASSERT(!error);
    355 			TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
    356 			    TOP_READ_SIZE);
    357 			rw_enter(&ip->i_rwlock, RW_READER);
    358 		}
    359 	} else {
    360 		/*
    361 		 * Only transact reads to files opened for sync-read and
    362 		 * sync-write on a file system that is not write locked.
    363 		 *
    364 		 * The ``not write locked'' check prevents problems with
    365 		 * enabling/disabling logging on a busy file system.  E.g.,
    366 		 * logging exists at the beginning of the read but does not
    367 		 * at the end.
    368 		 *
    369 		 */
    370 		if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
    371 		    TRANS_ISTRANS(ufsvfsp)) {
    372 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
    373 			    error);
    374 			ASSERT(!error);
        			/* remember to close the transaction after the read */
    375 			intrans = 1;
    376 		}
    377 
    378 		rw_enter(&ip->i_contents, RW_READER);
    379 		error = rdip(ip, uiop, ioflag, cr);
    380 		rw_exit(&ip->i_contents);
    381 
    382 		if (intrans) {
    383 			TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
    384 			    TOP_READ_SIZE);
    385 		}
    386 	}
    387 
    388 	if (ulp) {
    389 		ufs_lockfs_end(ulp);
    390 	}
    391 out:
    392 
    393 	return (error);
    394 }
    395 
        /*
         * Write throttling and shared-write controls.
         *
         * ufs_write() sleeps on ip->i_wrcv while ip->i_writes exceeds ufs_HW,
         * provided ufs_WRITES is non-zero, and bumps ufs_throttles each time it
         * waits.  ufs_allow_shared_writes is consulted by ufs_check_rewrite().
         * ufs_HW and ufs_LW are only declared here.
         * NOTE(review): ufs_LW is not used in the part of the file reviewed
         * here; presumably it is the wakeup threshold - confirm.
         */
    396 extern	int	ufs_HW;		/* high water mark */
    397 extern	int	ufs_LW;		/* low water mark */
    398 int	ufs_WRITES = 1;		/* XXX - enable/disable */
    399 int	ufs_throttles = 0;	/* throttling count */
    400 int	ufs_allow_shared_writes = 1;	/* directio shared writes */
    401 
        /*
         * ufs_check_rewrite - decide whether a write request may be treated as
         * a concurrent rewrite of data the file already holds.
         *
         *	ip	inode being written (i_mode and i_size are read)
         *	uiop	the request (uio_loffset and uio_resid are read)
         *	ioflag	FAPPEND, FSYNC and FDSYNC are examined
         *
         * Returns non-zero only when every condition of the filter below holds,
         * and 0 otherwise.  The routine itself modifies nothing.
         */
    402 static int
    403 ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag)
    404 {
    405 	int	shared_write;
    406 
    407 	/*
    408 	 * If the FDSYNC flag is set then ignore the global
    409 	 * ufs_allow_shared_writes in this case.
    410 	 */
    411 	shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes;
    412 
    413 	/*
    414 	 * Filter to determine if this request is suitable as a
    415 	 * concurrent rewrite. This write must not allocate blocks
    416 	 * by extending the file or filling in holes. No use trying
    417 	 * through FSYNC descriptors as the inode will be synchronously
    418 	 * updated after the write. The uio structure has not yet been
    419 	 * checked for sanity, so assume nothing.
        	 *
        	 * In order: regular file, not an append, non-negative offset
        	 * inside the file, non-empty request that ends at or before
        	 * end of file, not FSYNC, no holes, shared writes permitted.
    420 	 */
    421 	return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) &&
    422 	    (uiop->uio_loffset >= (offset_t)0) &&
    423 	    (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) &&
    424 	    ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) &&
    425 	    !(ioflag & FSYNC) && !bmap_has_holes(ip) &&
    426 	    shared_write);
    427 }
    428 
    429 /*ARGSUSED*/
        /*
         * ufs_write - VOP_WRITE entry point for UFS.
         *
         * Entered with ip->i_rwlock held (asserted below); returns 0 or an
         * errno value.  The routine works in stages:
         *
         *  1. Fail with EIO if the inode no longer has a ufsvfs, then run the
         *     mandatory-lock check, which requires i_rwlock held as writer.
         *  2. Fast path: when direct I/O is enabled for the inode or forced for
         *     the file system, i_rwlock is not write-held, and
         *     ufs_check_rewrite() accepts the request, try ufs_directio_write()
         *     under i_contents as reader.  On DIRECTIO_SUCCESS the inode time
         *     flags are updated and the routine is done.
         *  3. Otherwise obtain i_rwlock as writer (by upgrade, or by dropping
         *     and re-taking it, in which case a newly enabled mandatory lock
         *     restarts from stage 1), compute the log reservation, throttle
         *     against ufs_HW, open a transaction where one is needed, and
         *     write through TRANS_WRITE() or wrip() with vfs_dqrwlock held as
         *     reader and i_contents held as writer.
         *  4. On ENOSPC while TRANS_ISTRANS() is true, call
         *     ufs_delete_drain_wait() and retry once from stage 1.  ENOSPC
         *     after a partial transfer is reported as success.
         *
         * NOTE(review): ulp has no initializer; the code relies on
         * ufs_lockfs_begin() storing to it before any `if (ulp)' test - confirm.
         */
    430 static int
    431 ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr,
    432 	caller_context_t *ct)
    433 {
    434 	struct inode *ip = VTOI(vp);
    435 	struct ufsvfs *ufsvfsp;
    436 	struct ulockfs *ulp;
    437 	int retry = 1;
    438 	int error, resv, resid = 0;
    439 	int directio_status;
    440 	int exclusive;
    441 	int rewriteflg;
    442 	long start_resid = uiop->uio_resid;
    443 
    444 	ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
    445 
    446 retry_mandlock:
    447 	/*
    448 	 * Mandatory locking needs to be done before ufs_lockfs_begin()
    449 	 * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep.
    450 	 * Check for forced unmounts normally done in ufs_lockfs_begin().
    451 	 */
    452 	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
    453 		error = EIO;
    454 		goto out;
    455 	}
    456 	if (MANDLOCK(vp, ip->i_mode)) {
    457 
    458 		ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
    459 
    460 		/*
    461 		 * ufs_getattr ends up being called by chklock
    462 		 */
    463 		error = chklock(vp, FWRITE, uiop->uio_loffset,
    464 		    uiop->uio_resid, uiop->uio_fmode, ct);
    465 		if (error)
    466 			goto out;
    467 	}
    468 
    469 	/* i_rwlock can change in chklock */
    470 	exclusive = rw_write_held(&ip->i_rwlock);
    471 	rewriteflg = ufs_check_rewrite(ip, uiop, ioflag);
    472 
    473 	/*
    474 	 * Check for fast-path special case of directio re-writes.
    475 	 */
    476 	if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) &&
    477 	    !exclusive && rewriteflg) {
    478 
    479 		error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
    480 		if (error)
    481 			goto out;
    482 
    483 		rw_enter(&ip->i_contents, RW_READER);
    484 		error = ufs_directio_write(ip, uiop, ioflag, 1, cr,
    485 		    &directio_status);
    486 		if (directio_status == DIRECTIO_SUCCESS) {
    487 			uint_t i_flag_save;
    488 
        			/* some data moved: report success, not the error */
    489 			if (start_resid != uiop->uio_resid)
    490 				error = 0;
    491 			/*
    492 			 * Special treatment of access times for re-writes.
    493 			 * If IMOD is not already set, then convert it
    494 			 * to IMODACC for this operation. This defers
    495 			 * entering a delta into the log until the inode
    496 			 * is flushed. This mimics what is done for read
    497 			 * operations and inode access time.
    498 			 */
    499 			mutex_enter(&ip->i_tlock);
    500 			i_flag_save = ip->i_flag;
    501 			ip->i_flag |= IUPD | ICHG;
    502 			ip->i_seq++;
    503 			ITIMES_NOLOCK(ip);
    504 			if ((i_flag_save & IMOD) == 0) {
    505 				ip->i_flag &= ~IMOD;
    506 				ip->i_flag |= IMODACC;
    507 			}
    508 			mutex_exit(&ip->i_tlock);
    509 			rw_exit(&ip->i_contents);
    510 			if (ulp)
    511 				ufs_lockfs_end(ulp);
    512 			goto out;
    513 		}
        		/* direct I/O did not succeed: fall through to the slow path */
    514 		rw_exit(&ip->i_contents);
    515 		if (ulp)
    516 			ufs_lockfs_end(ulp);
    517 	}
    518 
    519 	if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) {
    520 		rw_exit(&ip->i_rwlock);
    521 		rw_enter(&ip->i_rwlock, RW_WRITER);
    522 		/*
    523 		 * Mandatory locking could have been enabled
    524 		 * after dropping the i_rwlock.
    525 		 */
    526 		if (MANDLOCK(vp, ip->i_mode))
    527 			goto retry_mandlock;
    528 	}
    529 
    530 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
    531 	if (error)
    532 		goto out;
    533 
    534 	/*
    535 	 * Amount of log space needed for this write
    536 	 */
    537 	if (!rewriteflg || !(ioflag & FDSYNC))
    538 		TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid);
    539 
    540 	/*
    541 	 * Throttle writes.
    542 	 */
    543 	if (ufs_WRITES && (ip->i_writes > ufs_HW)) {
    544 		mutex_enter(&ip->i_tlock);
    545 		while (ip->i_writes > ufs_HW) {
    546 			ufs_throttles++;
    547 			cv_wait(&ip->i_wrcv, &ip->i_tlock);
    548 		}
    549 		mutex_exit(&ip->i_tlock);
    550 	}
    551 
    552 	/*
    553 	 * Enter Transaction
    554 	 *
    555 	 * If the write is a rewrite there is no need to open a transaction
    556 	 * if the FDSYNC flag is set and not the FSYNC.  In this case just
    557 	 * set the IMODACC flag so that the update is done at a later time
    558 	 * thus avoiding the overhead of the logging transaction that is
    559 	 * not required.
    560 	 */
    561 	if (ioflag & (FSYNC|FDSYNC)) {
    562 		if (ulp) {
    563 			if (rewriteflg) {
    564 				uint_t i_flag_save;
    565 
    566 				rw_enter(&ip->i_contents, RW_READER);
    567 				mutex_enter(&ip->i_tlock);
    568 				i_flag_save = ip->i_flag;
    569 				ip->i_flag |= IUPD | ICHG;
    570 				ip->i_seq++;
    571 				ITIMES_NOLOCK(ip);
    572 				if ((i_flag_save & IMOD) == 0) {
    573 					ip->i_flag &= ~IMOD;
    574 					ip->i_flag |= IMODACC;
    575 				}
    576 				mutex_exit(&ip->i_tlock);
    577 				rw_exit(&ip->i_contents);
    578 			} else {
    579 				int terr = 0;
    580 				TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv,
    581 				    terr);
    582 				ASSERT(!terr);
    583 			}
    584 		}
    585 	} else {
    586 		if (ulp)
    587 			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
    588 	}
    589 
    590 	/*
    591 	 * Write the file
    592 	 */
    593 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
    594 	rw_enter(&ip->i_contents, RW_WRITER);
    595 	if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) {
    596 		/*
    597 		 * In append mode start at end of file.
    598 		 */
    599 		uiop->uio_loffset = ip->i_size;
    600 	}
    601 
    602 	/*
    603 	 * Mild optimisation, don't call ufs_trans_write() unless we have to
    604 	 * Also, suppress file system full messages if we will retry.
    605 	 */
    606 	if (retry)
    607 		ip->i_flag |= IQUIET;
    608 	if (resid) {
    609 		TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
    610 	} else {
    611 		error = wrip(ip, uiop, ioflag, cr);
    612 	}
    613 	ip->i_flag &= ~IQUIET;
    614 
    615 	rw_exit(&ip->i_contents);
    616 	rw_exit(&ufsvfsp->vfs_dqrwlock);
    617 
    618 	/*
    619 	 * Leave Transaction
        	 *
        	 * Mirrors the entry above: a sync rewrite opened no
        	 * transaction, so there is none to end.  A transaction error
        	 * is reported only when the write itself succeeded.
    620 	 */
    621 	if (ulp) {
    622 		if (ioflag & (FSYNC|FDSYNC)) {
    623 			if (!rewriteflg) {
    624 				int terr = 0;
    625 
    626 				TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC,
    627 				    resv);
    628 				if (error == 0)
    629 					error = terr;
    630 			}
    631 		} else {
    632 			TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
    633 		}
    634 		ufs_lockfs_end(ulp);
    635 	}
    636 out:
        	/*
        	 * ufsvfsp may be NULL here only on the EIO path, where the
        	 * ENOSPC test fails first and TRANS_ISTRANS() is not reached.
        	 * retry is cleared so the ENOSPC retry happens at most once.
        	 */
    637 	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
    638 		/*
    639 		 * Any blocks tied up in pending deletes?
    640 		 */
    641 		ufs_delete_drain_wait(ufsvfsp, 1);
    642 		retry = 0;
    643 		goto retry_mandlock;
    644 	}
    645 
        	/* a partial write that then ran out of space counts as success */
    646 	if (error == ENOSPC && (start_resid != uiop->uio_resid))
    647 		error = 0;
    648 
    649 	return (error);
    650 }
    651 
    652 /*
    653  * Don't cache write blocks to files with the sticky bit set.
    654  * Used to keep swap files from blowing the page cache on a server.
        *
        * NOTE(review): stickyhack is not read in the part of the file reviewed
        * here; non-zero presumably enables the behaviour - confirm at its use.
    655  */
    656 int stickyhack = 1;
    657 
    658 /*
    659  * Free behind hacks.  The pager is busted.
    660  * XXX - need to pass the information down to writedone() in a flag like B_SEQ
    661  * or B_FREE_IF_TIGHT_ON_MEMORY.
        *
        * NOTE(review): freebehind, smallfile and smallfile64 are only defined
        * here; their consumers lie outside the reviewed part of the file.
        * smallfile64 starts at 32K (32 * 1024) and smallfile at 0.
    662  */
    663 int	freebehind = 1;
    664 int	smallfile = 0;
    665 u_offset_t smallfile64 = 32 * 1024;
    666 
    667 /*
    668  * While we should, in most cases, cache the pages for write, we
    669  * may also want to cache the pages for read as long as they are
    670  * frequently re-usable.
    671  *
    672  * If cache_read_ahead = 1, the pages for read will go to the tail
    673  * of the cache list when they are released, otherwise go to the head.
    674  */
    675 int	cache_read_ahead = 0;
    676 
    677 /*
    678  * Freebehind exists  so that as we read  large files  sequentially we
    679  * don't consume most of memory with pages  from a few files. It takes
    680  * longer to re-read from disk multiple small files than it does reading
    681  * one large one sequentially.  As system  memory grows customers need
    682  * to retain bigger chunks   of files in  memory.   The advent of  the
    683  * cachelist opens up the possibility of freeing pages  to the head or
    684  * tail of the list.
    685  *
    686  * Not freeing a page is a bet that the page will be read again before
    687  * its segmap slot is needed for something else. If we lose the bet,
    688  * it means some  other thread is  burdened with the  page free we did
    689  * not do. If we win we save a free and reclaim.
    690  *
    691  * Freeing it at the tail  vs the head of cachelist  is a bet that the
    692  * page will survive until the next  read.  It's also saying that this
    693  * page is more likely to  be re-used than a  page freed some time ago
    694  * and never reclaimed.
    695  *
    696  * Freebehind maintains a  range of  file offset [smallfile1; smallfile2]
    697  *
    698  *            0 < offset < smallfile1 : pages are not freed.
    699  *   smallfile1 < offset < smallfile2 : pages freed to tail of cachelist.
    700  *   smallfile2 < offset              : pages freed to head of cachelist.
    701  *
    702  * The range  is  computed  at most  once  per second  and  depends on
    703  * freemem  and  ncpus_online.  Both parameters  are   bounded to be
    704  * >= smallfile && >= smallfile64.
    705  *
    706  * smallfile1 = (free memory / ncpu) / 1000
    707  * smallfile2 = (free memory / ncpu) / 10
    708  *
    709  * A few examples values:
    710  *
    711  *       Free Mem (in Bytes) [smallfile1; smallfile2]  [smallfile1; smallfile2]
    712  *                                 ncpus_online = 4          ncpus_online = 64
    713  *       ------------------  -----------------------   -----------------------
    714  *             1G                   [256K;  25M]               [32K; 1.5M]
    715  *            10G                   [2.5M; 250M]              [156K; 15M]
    716  *           100G                    [25M; 2.5G]              [1.5M; 150M]
    717  *
    718  */
    719 
        /*
         * State for the freebehind offset range.
         *
         * SMALLFILE1_D and SMALLFILE2_D are the initial values of the
         * divisors smallfile1_d and smallfile2_d (1000 and 10).  smallfile1
         * and smallfile2 both start at 32K (32 * 1024).
         * NOTE(review): the divisors are non-static variables rather than
         * constants, presumably so they can be tuned - confirm.  The code
         * that recomputes the range is outside the reviewed part of the
         * file.
         */
    720 #define	SMALLFILE1_D 1000
    721 #define	SMALLFILE2_D 10
    722 static u_offset_t smallfile1 = 32 * 1024;
    723 static u_offset_t smallfile2 = 32 * 1024;
    724 static clock_t smallfile_update = 0; /* lbolt value of when to recompute */
    725 uint_t smallfile1_d = SMALLFILE1_D;
    726 uint_t smallfile2_d = SMALLFILE2_D;
    727 
    728 /*
    729  * wrip does the real work of write requests for ufs.
    730  */
    731 int
    732 wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
    733 {
    734 	rlim64_t limit = uio->uio_llimit;
    735 	u_offset_t off;
    736 	u_offset_t old_i_size;
    737 	struct fs *fs;
    738 	struct vnode *vp;
    739 	struct ufsvfs *ufsvfsp;
    740 	caddr_t base;
    741 	long start_resid = uio->uio_resid;	/* save starting resid */
    742 	long premove_resid;			/* resid before uiomove() */
    743 	uint_t flags;
    744 	int newpage;
    745 	int iupdat_flag, directio_status;
    746 	int n, on, mapon;
    747 	int error, pagecreate;
    748 	int do_dqrwlock;		/* drop/reacquire vfs_dqrwlock */
    749 	int32_t	iblocks;
    750 	int	new_iblocks;
    751 
    752 	/*
    753 	 * ip->i_size is incremented before the uiomove
    754 	 * is done on a write.  If the move fails (bad user
    755 	 * address) reset ip->i_size.
    756 	 * The better way would be to increment ip->i_size
    757 	 * only if the uiomove succeeds.
    758 	 */
    759 	int i_size_changed = 0;
    760 	o_mode_t type;
    761 	int i_seq_needed = 0;
    762 
    763 	vp = ITOV(ip);
    764 
    765 	/*
    766 	 * check for forced unmount - should not happen as
    767 	 * the request passed the lockfs checks.
    768 	 */
    769 	if ((ufsvfsp = ip->i_ufsvfs) == NULL)
    770 		return (EIO);
    771 
    772 	fs = ip->i_fs;
    773 
    774 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
    775 
    776 	/* check for valid filetype */
    777 	type = ip->i_mode & IFMT;
    778 	if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
    779 	    (type != IFLNK) && (type != IFSHAD)) {
    780 		return (EIO);
    781 	}
    782 
    783 	/*
    784 	 * the actual limit of UFS file size
    785 	 * is UFS_MAXOFFSET_T
    786 	 */
    787 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
    788 		limit = MAXOFFSET_T;
    789 
    790 	if (uio->uio_loffset >= limit) {
    791 		proc_t *p = ttoproc(curthread);
    792 
    793 		mutex_enter(&p->p_lock);
    794 		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
    795 		    p, RCA_UNSAFE_SIGINFO);
    796 		mutex_exit(&p->p_lock);
    797 		return (EFBIG);
    798 	}
    799 
    800 	/*
    801 	 * if largefiles are disallowed, the limit is
    802 	 * the pre-largefiles value of 2GB
    803 	 */
    804 	if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
    805 		limit = MIN(UFS_MAXOFFSET_T, limit);
    806 	else
    807 		limit = MIN(MAXOFF32_T, limit);
    808 
    809 	if (uio->uio_loffset < (offset_t)0) {
    810 		return (EINVAL);
    811 	}
    812 	if (uio->uio_resid == 0) {
    813 		return (0);
    814 	}
    815 
    816 	if (uio->uio_loffset >= limit)
    817 		return (EFBIG);
    818 
    819 	ip->i_flag |= INOACC;	/* don't update ref time in getpage */
    820 
    821 	if (ioflag & (FSYNC|FDSYNC)) {
    822 		ip->i_flag |= ISYNC;
    823 		iupdat_flag = 1;
    824 	}
    825 	/*
    826 	 * Try to go direct
    827 	 */
    828 	if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
    829 		uio->uio_llimit = limit;
    830 		error = ufs_directio_write(ip, uio, ioflag, 0, cr,
    831 		    &directio_status);
    832 		/*
    833 		 * If ufs_directio wrote to the file or set the flags,
    834 		 * we need to update i_seq, but it may be deferred.
    835 		 */
    836 		if (start_resid != uio->uio_resid ||
    837 		    (ip->i_flag & (ICHG|IUPD))) {
    838 			i_seq_needed = 1;
    839 			ip->i_flag |= ISEQ;
    840 		}
    841 		if (directio_status == DIRECTIO_SUCCESS)
    842 			goto out;
    843 	}
    844 
    845 	/*
    846 	 * Behavior with respect to dropping/reacquiring vfs_dqrwlock:
    847 	 *
    848 	 * o shadow inodes: vfs_dqrwlock is not held at all
    849 	 * o quota updates: vfs_dqrwlock is read or write held
    850 	 * o other updates: vfs_dqrwlock is read held
    851 	 *
    852 	 * The first case is the only one where we do not hold
    853 	 * vfs_dqrwlock at all while entering wrip().
    854 	 * We must make sure not to downgrade/drop vfs_dqrwlock if we
    855 	 * have it as writer, i.e. if we are updating the quota inode.
    856 	 * There is no potential deadlock scenario in this case as
    857 	 * ufs_getpage() takes care of this and avoids reacquiring
    858 	 * vfs_dqrwlock in that case.
    859 	 *
    860 	 * This check is done here since the above conditions do not change
    861 	 * and we possibly loop below, so save a few cycles.
    862 	 */
    863 	if ((type == IFSHAD) ||
    864 	    (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) {
    865 		do_dqrwlock = 0;
    866 	} else {
    867 		do_dqrwlock = 1;
    868 	}
    869 
    870 	/*
    871 	 * Large Files: We cast MAXBMASK to offset_t
     872 	 * in order to mask out the higher bits. Since offset_t
    873 	 * is a signed value, the high order bit set in MAXBMASK
    874 	 * value makes it do the right thing by having all bits 1
    875 	 * in the higher word. May be removed for _SOLARIS64_.
    876 	 */
    877 
    878 	fs = ip->i_fs;
    879 	do {
    880 		u_offset_t uoff = uio->uio_loffset;
    881 		off = uoff & (offset_t)MAXBMASK;
    882 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
    883 		on = (int)blkoff(fs, uoff);
    884 		n = (int)MIN(fs->fs_bsize - on, uio->uio_resid);
    885 		new_iblocks = 1;
    886 
    887 		if (type == IFREG && uoff + n >= limit) {
    888 			if (uoff >= limit) {
    889 				error = EFBIG;
    890 				goto out;
    891 			}
    892 			/*
    893 			 * since uoff + n >= limit,
    894 			 * therefore n >= limit - uoff, and n is an int
    895 			 * so it is safe to cast it to an int
    896 			 */
    897 			n = (int)(limit - (rlim64_t)uoff);
    898 		}
    899 		if (uoff + n > ip->i_size) {
    900 			/*
    901 			 * We are extending the length of the file.
    902 			 * bmap is used so that we are sure that
    903 			 * if we need to allocate new blocks, that it
    904 			 * is done here before we up the file size.
    905 			 */
    906 			error = bmap_write(ip, uoff, (int)(on + n),
    907 			    mapon == 0, NULL, cr);
    908 			/*
    909 			 * bmap_write never drops i_contents so if
    910 			 * the flags are set it changed the file.
    911 			 */
    912 			if (ip->i_flag & (ICHG|IUPD)) {
    913 				i_seq_needed = 1;
    914 				ip->i_flag |= ISEQ;
    915 			}
    916 			if (error)
    917 				break;
    918 			/*
    919 			 * There is a window of vulnerability here.
    920 			 * The sequence of operations: allocate file
    921 			 * system blocks, uiomove the data into pages,
    922 			 * and then update the size of the file in the
    923 			 * inode, must happen atomically.  However, due
    924 			 * to current locking constraints, this can not
    925 			 * be done.
    926 			 */
    927 			ASSERT(ip->i_writer == NULL);
    928 			ip->i_writer = curthread;
    929 			i_size_changed = 1;
    930 			/*
    931 			 * If we are writing from the beginning of
    932 			 * the mapping, we can just create the
    933 			 * pages without having to read them.
    934 			 */
    935 			pagecreate = (mapon == 0);
    936 		} else if (n == MAXBSIZE) {
    937 			/*
    938 			 * Going to do a whole mappings worth,
    939 			 * so we can just create the pages w/o
    940 			 * having to read them in.  But before
    941 			 * we do that, we need to make sure any
    942 			 * needed blocks are allocated first.
    943 			 */
    944 			iblocks = ip->i_blocks;
    945 			error = bmap_write(ip, uoff, (int)(on + n),
    946 			    BI_ALLOC_ONLY, NULL, cr);
    947 			/*
    948 			 * bmap_write never drops i_contents so if
    949 			 * the flags are set it changed the file.
    950 			 */
    951 			if (ip->i_flag & (ICHG|IUPD)) {
    952 				i_seq_needed = 1;
    953 				ip->i_flag |= ISEQ;
    954 			}
    955 			if (error)
    956 				break;
    957 			pagecreate = 1;
    958 			/*
    959 			 * check if the new created page needed the
    960 			 * allocation of new disk blocks.
    961 			 */
    962 			if (iblocks == ip->i_blocks)
    963 				new_iblocks = 0; /* no new blocks allocated */
    964 		} else {
    965 			pagecreate = 0;
    966 			/*
    967 			 * In sync mode flush the indirect blocks which
    968 			 * may have been allocated and not written on
    969 			 * disk. In above cases bmap_write will allocate
    970 			 * in sync mode.
    971 			 */
    972 			if (ioflag & (FSYNC|FDSYNC)) {
    973 				error = ufs_indirblk_sync(ip, uoff);
    974 				if (error)
    975 					break;
    976 			}
    977 		}
    978 
    979 		/*
    980 		 * At this point we can enter ufs_getpage() in one
    981 		 * of two ways:
    982 		 * 1) segmap_getmapflt() calls ufs_getpage() when the
    983 		 *    forcefault parameter is true (pagecreate == 0)
    984 		 * 2) uiomove() causes a page fault.
    985 		 *
    986 		 * We have to drop the contents lock to prevent the VM
    987 		 * system from trying to reacquire it in ufs_getpage()
    988 		 * should the uiomove cause a pagefault.
    989 		 *
    990 		 * We have to drop the reader vfs_dqrwlock here as well.
    991 		 */
    992 		rw_exit(&ip->i_contents);
    993 		if (do_dqrwlock) {
    994 			ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
    995 			ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock)));
    996 			rw_exit(&ufsvfsp->vfs_dqrwlock);
    997 		}
    998 
    999 		newpage = 0;
   1000 		premove_resid = uio->uio_resid;
   1001 
   1002 		/*
   1003 		 * Touch the page and fault it in if it is not in core
   1004 		 * before segmap_getmapflt or vpm_data_copy can lock it.
   1005 		 * This is to avoid the deadlock if the buffer is mapped
   1006 		 * to the same file through mmap which we want to write.
   1007 		 */
   1008 		uio_prefaultpages((long)n, uio);
   1009 
   1010 		if (vpm_enable) {
   1011 			/*
   1012 			 * Copy data. If new pages are created, part of
    1013 			 * the page that is not written will be initialized
   1014 			 * with zeros.
   1015 			 */
   1016 			error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
   1017 			    uio, !pagecreate, &newpage, 0, S_WRITE);
   1018 		} else {
   1019 
   1020 			base = segmap_getmapflt(segkmap, vp, (off + mapon),
   1021 			    (uint_t)n, !pagecreate, S_WRITE);
   1022 
   1023 			/*
   1024 			 * segmap_pagecreate() returns 1 if it calls
   1025 			 * page_create_va() to allocate any pages.
   1026 			 */
   1027 
   1028 			if (pagecreate)
   1029 				newpage = segmap_pagecreate(segkmap, base,
   1030 				    (size_t)n, 0);
   1031 
   1032 			error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
   1033 		}
   1034 
   1035 		/*
   1036 		 * If "newpage" is set, then a new page was created and it
   1037 		 * does not contain valid data, so it needs to be initialized
   1038 		 * at this point.
   1039 		 * Otherwise the page contains old data, which was overwritten
   1040 		 * partially or as a whole in uiomove.
   1041 		 * If there is only one iovec structure within uio, then
   1042 		 * on error uiomove will not be able to update uio->uio_loffset
   1043 		 * and we would zero the whole page here!
   1044 		 *
   1045 		 * If uiomove fails because of an error, the old valid data
   1046 		 * is kept instead of filling the rest of the page with zero's.
   1047 		 */
   1048 		if (!vpm_enable && newpage &&
   1049 		    uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
   1050 			/*
   1051 			 * We created pages w/o initializing them completely,
   1052 			 * thus we need to zero the part that wasn't set up.
   1053 			 * This happens on most EOF write cases and if
   1054 			 * we had some sort of error during the uiomove.
   1055 			 */
   1056 			int nzero, nmoved;
   1057 
   1058 			nmoved = (int)(uio->uio_loffset - (off + mapon));
   1059 			ASSERT(nmoved >= 0 && nmoved <= n);
   1060 			nzero = roundup(on + n, PAGESIZE) - nmoved;
   1061 			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
   1062 			(void) kzero(base + mapon + nmoved, (uint_t)nzero);
   1063 		}
   1064 
   1065 		/*
   1066 		 * Unlock the pages allocated by page_create_va()
   1067 		 * in segmap_pagecreate()
   1068 		 */
   1069 		if (!vpm_enable && newpage)
   1070 			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
   1071 
   1072 		/*
   1073 		 * If the size of the file changed, then update the
   1074 		 * size field in the inode now.  This can't be done
   1075 		 * before the call to segmap_pageunlock or there is
   1076 		 * a potential deadlock with callers to ufs_putpage().
   1077 		 * They will be holding i_contents and trying to lock
   1078 		 * a page, while this thread is holding a page locked
   1079 		 * and trying to acquire i_contents.
   1080 		 */
   1081 		if (i_size_changed) {
   1082 			rw_enter(&ip->i_contents, RW_WRITER);
   1083 			old_i_size = ip->i_size;
   1084 			UFS_SET_ISIZE(uoff + n, ip);
   1085 			TRANS_INODE(ufsvfsp, ip);
   1086 			/*
   1087 			 * file has grown larger than 2GB. Set flag
   1088 			 * in superblock to indicate this, if it
   1089 			 * is not already set.
   1090 			 */
   1091 			if ((ip->i_size > MAXOFF32_T) &&
   1092 			    !(fs->fs_flags & FSLARGEFILES)) {
   1093 				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
   1094 				mutex_enter(&ufsvfsp->vfs_lock);
   1095 				fs->fs_flags |= FSLARGEFILES;
   1096 				ufs_sbwrite(ufsvfsp);
   1097 				mutex_exit(&ufsvfsp->vfs_lock);
   1098 			}
   1099 			mutex_enter(&ip->i_tlock);
   1100 			ip->i_writer = NULL;
   1101 			cv_broadcast(&ip->i_wrcv);
   1102 			mutex_exit(&ip->i_tlock);
   1103 			rw_exit(&ip->i_contents);
   1104 		}
   1105 
   1106 		if (error) {
   1107 			/*
   1108 			 * If we failed on a write, we may have already
   1109 			 * allocated file blocks as well as pages.  It's
   1110 			 * hard to undo the block allocation, but we must
   1111 			 * be sure to invalidate any pages that may have
   1112 			 * been allocated.
   1113 			 *
   1114 			 * If the page was created without initialization
   1115 			 * then we must check if it should be possible
   1116 			 * to destroy the new page and to keep the old data
   1117 			 * on the disk.
   1118 			 *
   1119 			 * It is possible to destroy the page without
   1120 			 * having to write back its contents only when
   1121 			 * - the size of the file keeps unchanged
   1122 			 * - bmap_write() did not allocate new disk blocks
   1123 			 *   it is possible to create big files using "seek" and
   1124 			 *   write to the end of the file. A "write" to a
   1125 			 *   position before the end of the file would not
   1126 			 *   change the size of the file but it would allocate
   1127 			 *   new disk blocks.
   1128 			 * - uiomove intended to overwrite the whole page.
   1129 			 * - a new page was created (newpage == 1).
   1130 			 */
   1131 
   1132 			if (i_size_changed == 0 && new_iblocks == 0 &&
   1133 			    newpage) {
   1134 
   1135 				/* unwind what uiomove eventually last did */
   1136 				uio->uio_resid = premove_resid;
   1137 
   1138 				/*
   1139 				 * destroy the page, do not write ambiguous
   1140 				 * data to the disk.
   1141 				 */
   1142 				flags = SM_DESTROY;
   1143 			} else {
   1144 				/*
   1145 				 * write the page back to the disk, if dirty,
   1146 				 * and remove the page from the cache.
   1147 				 */
   1148 				flags = SM_INVAL;
   1149 			}
   1150 
   1151 			if (vpm_enable) {
   1152 				/*
   1153 				 *  Flush pages.
   1154 				 */
   1155 				(void) vpm_sync_pages(vp, off, n, flags);
   1156 			} else {
   1157 				(void) segmap_release(segkmap, base, flags);
   1158 			}
   1159 		} else {
   1160 			flags = 0;
   1161 			/*
   1162 			 * Force write back for synchronous write cases.
   1163 			 */
   1164 			if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) {
   1165 				/*
   1166 				 * If the sticky bit is set but the
   1167 				 * execute bit is not set, we do a
   1168 				 * synchronous write back and free
   1169 				 * the page when done.  We set up swap
   1170 				 * files to be handled this way to
   1171 				 * prevent servers from keeping around
   1172 				 * the client's swap pages too long.
   1173 				 * XXX - there ought to be a better way.
   1174 				 */
   1175 				if (IS_SWAPVP(vp)) {
   1176 					flags = SM_WRITE | SM_FREE |
   1177 					    SM_DONTNEED;
   1178 					iupdat_flag = 0;
   1179 				} else {
   1180 					flags = SM_WRITE;
   1181 				}
   1182 			} else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
   1183 				/*
   1184 				 * Have written a whole block.
   1185 				 * Start an asynchronous write and
   1186 				 * mark the buffer to indicate that
   1187 				 * it won't be needed again soon.
   1188 				 */
   1189 				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
   1190 			}
   1191 			if (vpm_enable) {
   1192 				/*
   1193 				 * Flush pages.
   1194 				 */
   1195 				error = vpm_sync_pages(vp, off, n, flags);
   1196 			} else {
   1197 				error = segmap_release(segkmap, base, flags);
   1198 			}
   1199 			/*
   1200 			 * If the operation failed and is synchronous,
   1201 			 * then we need to unwind what uiomove() last
   1202 			 * did so we can potentially return an error to
   1203 			 * the caller.  If this write operation was
   1204 			 * done in two pieces and the first succeeded,
   1205 			 * then we won't return an error for the second
   1206 			 * piece that failed.  However, we only want to
   1207 			 * return a resid value that reflects what was
   1208 			 * really done.
   1209 			 *
   1210 			 * Failures for non-synchronous operations can
   1211 			 * be ignored since the page subsystem will
   1212 			 * retry the operation until it succeeds or the
   1213 			 * file system is unmounted.
   1214 			 */
   1215 			if (error) {
   1216 				if ((ioflag & (FSYNC | FDSYNC)) ||
   1217 				    type == IFDIR) {
   1218 					uio->uio_resid = premove_resid;
   1219 				} else {
   1220 					error = 0;
   1221 				}
   1222 			}
   1223 		}
   1224 
   1225 		/*
   1226 		 * Re-acquire contents lock.
   1227 		 * If it was dropped, reacquire reader vfs_dqrwlock as well.
   1228 		 */
   1229 		if (do_dqrwlock)
   1230 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   1231 		rw_enter(&ip->i_contents, RW_WRITER);
   1232 
   1233 		/*
   1234 		 * If the uiomove() failed or if a synchronous
   1235 		 * page push failed, fix up i_size.
   1236 		 */
   1237 		if (error) {
   1238 			if (i_size_changed) {
   1239 				/*
   1240 				 * The uiomove failed, and we
   1241 				 * allocated blocks,so get rid
   1242 				 * of them.
   1243 				 */
   1244 				(void) ufs_itrunc(ip, old_i_size, 0, cr);
   1245 			}
   1246 		} else {
   1247 			/*
   1248 			 * XXX - Can this be out of the loop?
   1249 			 */
   1250 			ip->i_flag |= IUPD | ICHG;
   1251 			/*
   1252 			 * Only do one increase of i_seq for multiple
   1253 			 * pieces.  Because we drop locks, record
   1254 			 * the fact that we changed the timestamp and
   1255 			 * are deferring the increase in case another thread
   1256 			 * pushes our timestamp update.
   1257 			 */
   1258 			i_seq_needed = 1;
   1259 			ip->i_flag |= ISEQ;
   1260 			if (i_size_changed)
   1261 				ip->i_flag |= IATTCHG;
   1262 			if ((ip->i_mode & (IEXEC | (IEXEC >> 3) |
   1263 			    (IEXEC >> 6))) != 0 &&
   1264 			    (ip->i_mode & (ISUID | ISGID)) != 0 &&
   1265 			    secpolicy_vnode_setid_retain(cr,
   1266 			    (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) {
   1267 				/*
   1268 				 * Clear Set-UID & Set-GID bits on
   1269 				 * successful write if not privileged
   1270 				 * and at least one of the execute bits
   1271 				 * is set.  If we always clear Set-GID,
   1272 				 * mandatory file and record locking is
   1273 				 * unuseable.
   1274 				 */
   1275 				ip->i_mode &= ~(ISUID | ISGID);
   1276 			}
   1277 		}
   1278 		/*
   1279 		 * In the case the FDSYNC flag is set and this is a
   1280 		 * "rewrite" we won't log a delta.
   1281 		 * The FSYNC flag overrides all cases.
   1282 		 */
   1283 		if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) {
   1284 			TRANS_INODE(ufsvfsp, ip);
   1285 		}
   1286 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
   1287 
   1288 out:
   1289 	/*
   1290 	 * Make sure i_seq is increased at least once per write
   1291 	 */
   1292 	if (i_seq_needed) {
   1293 		ip->i_seq++;
   1294 		ip->i_flag &= ~ISEQ;	/* no longer deferred */
   1295 	}
   1296 
   1297 	/*
   1298 	 * Inode is updated according to this table -
   1299 	 *
   1300 	 *   FSYNC	  FDSYNC(posix.4)
   1301 	 *   --------------------------
   1302 	 *   always@	  IATTCHG|IBDWRITE
   1303 	 *
   1304 	 * @ - 	If we are doing synchronous write the only time we should
   1305 	 *	not be sync'ing the ip here is if we have the stickyhack
   1306 	 *	activated, the file is marked with the sticky bit and
   1307 	 *	no exec bit, the file length has not been changed and
   1308 	 *	no new blocks have been allocated during this write.
   1309 	 */
   1310 
   1311 	if ((ip->i_flag & ISYNC) != 0) {
   1312 		/*
   1313 		 * we have eliminated nosync
   1314 		 */
   1315 		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
   1316 		    ((ioflag & FSYNC) && iupdat_flag)) {
   1317 			ufs_iupdat(ip, 1);
   1318 		}
   1319 	}
   1320 
   1321 	/*
   1322 	 * If we've already done a partial-write, terminate
   1323 	 * the write but return no error unless the error is ENOSPC
   1324 	 * because the caller can detect this and free resources and
   1325 	 * try again.
   1326 	 */
   1327 	if ((start_resid != uio->uio_resid) && (error != ENOSPC))
   1328 		error = 0;
   1329 
   1330 	ip->i_flag &= ~(INOACC | ISYNC);
   1331 	ITIMES_NOLOCK(ip);
   1332 	return (error);
   1333 }
   1334 
    1335 /*
    1336  * rdip does the real work of read requests for ufs.
          *
          * Data is moved from the file into uio one piece per loop pass; a piece
          * never crosses a file system block boundary (fs_bsize) and never
          * extends past ip->i_size.  Direct I/O is attempted first when the
          * inode has IDIRECTIO set or the mount has vfs_forcedirectio set;
          * otherwise each piece goes through vpm_data_copy() (vpm_enable) or
          * segmap_getmapflt() + uiomove().
          *
          * ip     - inode to read; the caller must hold ip->i_contents as
          *          reader or writer (asserted below).
          * uio    - request; uio_loffset is the starting file offset and
          *          uio_resid the byte count.
          * ioflag - FRSYNC together with FSYNC/FDSYNC selects the synchronous
          *          read handling described at the bottom of the function.
          * cr     - credentials, only handed on to ufs_directio_read().
          *
          * Returns 0 or an errno value.  EIO: no ufsvfs attached to the inode
          * or an unexpected file type.  EINVAL: negative offset.  Any other
          * error comes from the copy or page release routines.  If at least
          * part of the request was transferred, the error is dropped and 0 is
          * returned.
    1337  */
    1338 int
    1339 rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
    1340 {
    1341 	u_offset_t off;
    1342 	caddr_t base;
    1343 	struct fs *fs;
    1344 	struct ufsvfs *ufsvfsp;
    1345 	struct vnode *vp;
    1346 	long oresid = uio->uio_resid;	/* resid on entry; detects partial reads */
    1347 	u_offset_t n, on, mapon;
    1348 	int error = 0;
    1349 	int doupdate = 1;
    1350 	uint_t flags;
    1351 	int dofree, directio_status;
    1352 	krw_t rwtype;
    1353 	o_mode_t type;
    1354 	clock_t	now;
    1355 
    1356 	vp = ITOV(ip);
    1357 
    1358 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
    1359 
    1360 	ufsvfsp = ip->i_ufsvfs;
    1361 
         	/* no ufsvfs attached to the inode: nothing can be read */
    1362 	if (ufsvfsp == NULL)
    1363 		return (EIO);
    1364 
    1365 	fs = ufsvfsp->vfs_fs;
    1366 
    1367 	/* check for valid filetype */
    1368 	type = ip->i_mode & IFMT;
    1369 	if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
    1370 	    (type != IFLNK) && (type != IFSHAD)) {
    1371 		return (EIO);
    1372 	}
    1373 
         	/*
         	 * A read that starts beyond the largest UFS offset transfers
         	 * nothing and succeeds; it still goes through the exit path at
         	 * "out" rather than returning directly.
         	 */
    1374 	if (uio->uio_loffset > UFS_MAXOFFSET_T) {
    1375 		error = 0;
    1376 		goto out;
    1377 	}
    1378 	if (uio->uio_loffset < (offset_t)0) {
    1379 		return (EINVAL);
    1380 	}
    1381 	if (uio->uio_resid == 0) {
    1382 		return (0);
    1383 	}
    1384 
         	/*
         	 * Flag the inode as accessed (IACC, set under i_tlock) unless the
         	 * ULOCKFS_IS_NOIACC() test, a read-only file system or the
         	 * vfs_noatime setting rules it out.
         	 */
    1385 	if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) &&
    1386 	    (!ufsvfsp->vfs_noatime)) {
    1387 		mutex_enter(&ip->i_tlock);
    1388 		ip->i_flag |= IACC;
    1389 		mutex_exit(&ip->i_tlock);
    1390 	}
    1391 	/*
    1392 	 * Try to go direct
         	 *
         	 * If the direct path reports DIRECTIO_SUCCESS the request is
         	 * finished and error carries its result; any other status falls
         	 * through to the cached path below.
    1393 	 */
    1394 	if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
    1395 		error = ufs_directio_read(ip, uio, cr, &directio_status);
    1396 		if (directio_status == DIRECTIO_SUCCESS)
    1397 			goto out;
    1398 	}
    1399 
         	/*
         	 * Note how the caller holds i_contents; only a reader hold is
         	 * dropped and re-taken around the copy in the loop below.
         	 */
    1400 	rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
    1401 
    1402 	do {
    1403 		offset_t diff;
    1404 		u_offset_t uoff = uio->uio_loffset;
         		/*
         		 * off   - uoff masked with MAXBMASK; off + mapon is the
         		 *         offset handed to the mapping/copy routines
         		 * mapon - uoff masked with MAXBOFFSET
         		 * on    - offset of uoff within its file system block
         		 * n     - bytes for this pass: the rest of that block or
         		 *         the rest of the request, whichever is smaller
         		 */
    1405 		off = uoff & (offset_t)MAXBMASK;
    1406 		mapon = (u_offset_t)(uoff & (offset_t)MAXBOFFSET);
    1407 		on = (u_offset_t)blkoff(fs, uoff);
    1408 		n = MIN((u_offset_t)fs->fs_bsize - on,
    1409 		    (u_offset_t)uio->uio_resid);
    1410 
         		/*
         		 * diff is the distance from uoff to end of file.  At or
         		 * beyond EOF the read ends successfully; otherwise n is
         		 * trimmed so the copy stops at i_size.
         		 */
    1411 		diff = ip->i_size - uoff;
    1412 
    1413 		if (diff <= (offset_t)0) {
    1414 			error = 0;
    1415 			goto out;
    1416 		}
    1417 		if (diff < (offset_t)n)
    1418 			n = (int)diff;
    1419 
    1420 		/*
    1421 		 * We update smallfile2 and smallfile1 at most every second.
         		 * Both are derived from the free memory per online CPU
         		 * divided by smallfile1_d / smallfile2_d (defaulted when 0),
         		 * with smallfile1 floored at smallfile and smallfile64, and
         		 * smallfile2 floored at smallfile1.
    1422 		 */
    1423 		now = ddi_get_lbolt();
    1424 		if (now >= smallfile_update) {
    1425 			uint64_t percpufreeb;
    1426 			if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
    1427 			if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
    1428 			percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
    1429 			smallfile1 = percpufreeb / smallfile1_d;
    1430 			smallfile2 = percpufreeb / smallfile2_d;
    1431 			smallfile1 = MAX(smallfile1, smallfile);
    1432 			smallfile1 = MAX(smallfile1, smallfile64);
    1433 			smallfile2 = MAX(smallfile1, smallfile2);
    1434 			smallfile_update = now + hz;
    1435 		}
    1436 
         		/*
         		 * Free-behind is used only when the freebehind tunable is
         		 * set, ip->i_nextr equals the page-aligned off (presumably
         		 * meaning the read is sequential - confirm where i_nextr
         		 * is maintained) and off is above smallfile1.
         		 */
    1437 		dofree = freebehind &&
    1438 		    ip->i_nextr == (off & PAGEMASK) && off > smallfile1;
    1439 
    1440 		/*
    1441 		 * At this point we can enter ufs_getpage() in one of two
    1442 		 * ways:
    1443 		 * 1) segmap_getmapflt() calls ufs_getpage() when the
    1444 		 *    forcefault parameter is true (value of 1 is passed)
    1445 		 * 2) uiomove() causes a page fault.
    1446 		 *
    1447 		 * We cannot hold onto an i_contents reader lock without
    1448 		 * risking deadlock in ufs_getpage() so drop a reader lock.
    1449 		 * The ufs_getpage() dolock logic already allows for a
    1450 		 * thread holding i_contents as writer to work properly
    1451 		 * so we keep a writer lock.
    1452 		 */
    1453 		if (rwtype == RW_READER)
    1454 			rw_exit(&ip->i_contents);
    1455 
    1456 		if (vpm_enable) {
    1457 			/*
    1458 			 * Copy data.
    1459 			 */
    1460 			error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
    1461 			    uio, 1, NULL, 0, S_READ);
    1462 		} else {
         			/* map the range, then copy it out to the caller */
    1463 			base = segmap_getmapflt(segkmap, vp, (off + mapon),
    1464 			    (uint_t)n, 1, S_READ);
    1465 			error = uiomove(base + mapon, (long)n, UIO_READ, uio);
    1466 		}
    1467 
    1468 		flags = 0;
    1469 		if (!error) {
    1470 			/*
    1471 			 * If  reading sequential  we won't need  this
    1472 			 * buffer again  soon.  For  offsets in  range
    1473 			 * [smallfile1,  smallfile2] release the pages
    1474 			 * at   the  tail  of the   cache list, larger
    1475 			 * offsets are released at the head.
    1476 			 */
    1477 			if (dofree) {
    1478 				flags = SM_FREE | SM_ASYNC;
    1479 				if ((cache_read_ahead == 0) &&
    1480 				    (off > smallfile2))
    1481 					flags |=  SM_DONTNEED;
    1482 			}
    1483 			/*
    1484 			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
    1485 			 * we want to make sure that the page which has
    1486 			 * been read, is written on disk if it is dirty.
    1487 			 * And corresponding indirect blocks should also
    1488 			 * be flushed out.
    1489 			 */
    1490 			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
    1491 				flags &= ~SM_ASYNC;
    1492 				flags |= SM_WRITE;
    1493 			}
         			/* a failed release becomes the error for this pass */
    1494 			if (vpm_enable) {
    1495 				error = vpm_sync_pages(vp, off, n, flags);
    1496 			} else {
    1497 				error = segmap_release(segkmap, base, flags);
    1498 			}
    1499 		} else {
         			/*
         			 * The copy failed: release the mapping anyway (flags
         			 * is still 0) and ignore the release status so the
         			 * copy error is the one reported.
         			 */
    1500 			if (vpm_enable) {
    1501 				(void) vpm_sync_pages(vp, off, n, flags);
    1502 			} else {
    1503 				(void) segmap_release(segkmap, base, flags);
    1504 			}
    1505 		}
    1506 
         		/* re-take the reader hold that was dropped before the copy */
    1507 		if (rwtype == RW_READER)
    1508 			rw_enter(&ip->i_contents, rwtype);
    1509 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
    1510 out:
    1511 	/*
    1512 	 * Inode is updated according to this table if FRSYNC is set.
    1513 	 *
    1514 	 *   FSYNC	  FDSYNC(posix.4)
    1515 	 *   --------------------------
    1516 	 *   always	  IATTCHG|IBDWRITE
    1517 	 */
    1518 	/*
    1519 	 * The inode is not updated if we're logging and the inode is a
    1520 	 * directory with FRSYNC, FSYNC and FDSYNC flags set.
    1521 	 */
    1522 	if (ioflag & FRSYNC) {
    1523 		if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) {
    1524 			doupdate = 0;
    1525 		}
    1526 		if (doupdate) {
    1527 			if ((ioflag & FSYNC) ||
    1528 			    ((ioflag & FDSYNC) &&
    1529 			    (ip->i_flag & (IATTCHG|IBDWRITE)))) {
    1530 				ufs_iupdat(ip, 1);
    1531 			}
    1532 		}
    1533 	}
    1534 	/*
    1535 	 * If we've already done a partial read, terminate
    1536 	 * the read but return no error.
    1537 	 */
    1538 	if (oresid != uio->uio_resid)
    1539 		error = 0;
    1540 	ITIMES(ip);
    1541 
    1542 	return (error);
    1543 }
   1544 
   1545 /* ARGSUSED */
   1546 static int
   1547 ufs_ioctl(
   1548 	struct vnode	*vp,
   1549 	int		cmd,
   1550 	intptr_t	arg,
   1551 	int		flag,
   1552 	struct cred	*cr,
   1553 	int		*rvalp,
   1554 	caller_context_t *ct)
   1555 {
   1556 	struct lockfs	lockfs, lockfs_out;
   1557 	struct ufsvfs	*ufsvfsp = VTOI(vp)->i_ufsvfs;
   1558 	char		*comment, *original_comment;
   1559 	struct fs	*fs;
   1560 	struct ulockfs	*ulp;
   1561 	offset_t	off;
   1562 	extern int	maxphys;
   1563 	int		error;
   1564 	int		issync;
   1565 	int		trans_size;
   1566 
   1567 
   1568 	/*
   1569 	 * forcibly unmounted
   1570 	 */
   1571 	if (ufsvfsp == NULL || vp->v_vfsp == NULL ||
   1572 	    vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
   1573 		return (EIO);
   1574 	fs = ufsvfsp->vfs_fs;
   1575 
   1576 	if (cmd == Q_QUOTACTL) {
   1577 		error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK);
   1578 		if (error)
   1579 			return (error);
   1580 
   1581 		if (ulp) {
   1582 			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA,
   1583 			    TOP_SETQUOTA_SIZE(fs));
   1584 		}
   1585 
   1586 		error = quotactl(vp, arg, flag, cr);
   1587 
   1588 		if (ulp) {
   1589 			TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA,
   1590 			    TOP_SETQUOTA_SIZE(fs));
   1591 			ufs_lockfs_end(ulp);
   1592 		}
   1593 		return (error);
   1594 	}
   1595 
   1596 	switch (cmd) {
   1597 		case _FIOLFS:
   1598 			/*
   1599 			 * file system locking
   1600 			 */
   1601 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
   1602 				return (EPERM);
   1603 
   1604 			if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
   1605 				if (copyin((caddr_t)arg, &lockfs,
   1606 				    sizeof (struct lockfs)))
   1607 					return (EFAULT);
   1608 			}
   1609 #ifdef _SYSCALL32_IMPL
   1610 			else {
   1611 				struct lockfs32	lockfs32;
   1612 				/* Translate ILP32 lockfs to LP64 lockfs */
   1613 				if (copyin((caddr_t)arg, &lockfs32,
   1614 				    sizeof (struct lockfs32)))
   1615 					return (EFAULT);
   1616 				lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
   1617 				lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
   1618 				lockfs.lf_key = (ulong_t)lockfs32.lf_key;
   1619 				lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
   1620 				lockfs.lf_comment =
   1621 				    (caddr_t)(uintptr_t)lockfs32.lf_comment;
   1622 			}
   1623 #endif /* _SYSCALL32_IMPL */
   1624 
   1625 			if (lockfs.lf_comlen) {
   1626 				if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN)
   1627 					return (ENAMETOOLONG);
   1628 				comment =
   1629 				    kmem_alloc(lockfs.lf_comlen, KM_SLEEP);
   1630 				if (copyin(lockfs.lf_comment, comment,
   1631 				    lockfs.lf_comlen)) {
   1632 					kmem_free(comment, lockfs.lf_comlen);
   1633 					return (EFAULT);
   1634 				}
   1635 				original_comment = lockfs.lf_comment;
   1636 				lockfs.lf_comment = comment;
   1637 			}
   1638 			if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) {
   1639 				lockfs.lf_comment = original_comment;
   1640 
   1641 				if ((flag & DATAMODEL_MASK) ==
   1642 				    DATAMODEL_NATIVE) {
   1643 					(void) copyout(&lockfs, (caddr_t)arg,
   1644 					    sizeof (struct lockfs));
   1645 				}
   1646 #ifdef _SYSCALL32_IMPL
   1647 				else {
   1648 					struct lockfs32	lockfs32;
   1649 					/* Translate LP64 to ILP32 lockfs */
   1650 					lockfs32.lf_lock =
   1651 					    (uint32_t)lockfs.lf_lock;
   1652 					lockfs32.lf_flags =
   1653 					    (uint32_t)lockfs.lf_flags;
   1654 					lockfs32.lf_key =
   1655 					    (uint32_t)lockfs.lf_key;
   1656 					lockfs32.lf_comlen =
   1657 					    (uint32_t)lockfs.lf_comlen;
   1658 					lockfs32.lf_comment =
   1659 					    (uint32_t)(uintptr_t)
   1660 					    lockfs.lf_comment;
   1661 					(void) copyout(&lockfs32, (caddr_t)arg,
   1662 					    sizeof (struct lockfs32));
   1663 				}
   1664 #endif /* _SYSCALL32_IMPL */
   1665 
   1666 			} else {
   1667 				if (lockfs.lf_comlen)
   1668 					kmem_free(comment, lockfs.lf_comlen);
   1669 			}
   1670 			return (error);
   1671 
   1672 		case _FIOLFSS:
   1673 			/*
   1674 			 * get file system locking status
   1675 			 */
   1676 
   1677 			if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
   1678 				if (copyin((caddr_t)arg, &lockfs,
   1679 				    sizeof (struct lockfs)))
   1680 					return (EFAULT);
   1681 			}
   1682 #ifdef _SYSCALL32_IMPL
   1683 			else {
   1684 				struct lockfs32	lockfs32;
   1685 				/* Translate ILP32 lockfs to LP64 lockfs */
   1686 				if (copyin((caddr_t)arg, &lockfs32,
   1687 				    sizeof (struct lockfs32)))
   1688 					return (EFAULT);
   1689 				lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
   1690 				lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
   1691 				lockfs.lf_key = (ulong_t)lockfs32.lf_key;
   1692 				lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
   1693 				lockfs.lf_comment =
   1694 				    (caddr_t)(uintptr_t)lockfs32.lf_comment;
   1695 			}
   1696 #endif /* _SYSCALL32_IMPL */
   1697 
   1698 			if (error =  ufs_fiolfss(vp, &lockfs_out))
   1699 				return (error);
   1700 			lockfs.lf_lock = lockfs_out.lf_lock;
   1701 			lockfs.lf_key = lockfs_out.lf_key;
   1702 			lockfs.lf_flags = lockfs_out.lf_flags;
   1703 			lockfs.lf_comlen = MIN(lockfs.lf_comlen,
   1704 			    lockfs_out.lf_comlen);
   1705 
   1706 			if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
   1707 				if (copyout(&lockfs, (caddr_t)arg,
   1708 				    sizeof (struct lockfs)))
   1709 					return (EFAULT);
   1710 			}
   1711 #ifdef _SYSCALL32_IMPL
   1712 			else {
   1713 				/* Translate LP64 to ILP32 lockfs */
   1714 				struct lockfs32	lockfs32;
   1715 				lockfs32.lf_lock = (uint32_t)lockfs.lf_lock;
   1716 				lockfs32.lf_flags = (uint32_t)lockfs.lf_flags;
   1717 				lockfs32.lf_key = (uint32_t)lockfs.lf_key;
   1718 				lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen;
   1719 				lockfs32.lf_comment =
   1720 				    (uint32_t)(uintptr_t)lockfs.lf_comment;
   1721 				if (copyout(&lockfs32, (caddr_t)arg,
   1722 				    sizeof (struct lockfs32)))
   1723 					return (EFAULT);
   1724 			}
   1725 #endif /* _SYSCALL32_IMPL */
   1726 
   1727 			if (lockfs.lf_comlen &&
   1728 			    lockfs.lf_comment && lockfs_out.lf_comment)
   1729 				if (copyout(lockfs_out.lf_comment,
   1730 				    lockfs.lf_comment, lockfs.lf_comlen))
   1731 					return (EFAULT);
   1732 			return (0);
   1733 
   1734 		case _FIOSATIME:
   1735 			/*
   1736 			 * set access time
   1737 			 */
   1738 
   1739 			/*
   1740 			 * if mounted w/o atime, return quietly.
   1741 			 * I briefly thought about returning ENOSYS, but
   1742 			 * figured that most apps would consider this fatal
   1743 			 * but the idea is to make this as seamless as poss.
   1744 			 */
   1745 			if (ufsvfsp->vfs_noatime)
   1746 				return (0);
   1747 
   1748 			error = ufs_lockfs_begin(ufsvfsp, &ulp,
   1749 			    ULOCKFS_SETATTR_MASK);
   1750 			if (error)
   1751 				return (error);
   1752 
   1753 			if (ulp) {
   1754 				trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp));
   1755 				TRANS_BEGIN_CSYNC(ufsvfsp, issync,
   1756 				    TOP_SETATTR, trans_size);
   1757 			}
   1758 
   1759 			error = ufs_fiosatime(vp, (struct timeval *)arg,
   1760 			    flag, cr);
   1761 
   1762 			if (ulp) {
   1763 				TRANS_END_CSYNC(ufsvfsp, error, issync,
   1764 				    TOP_SETATTR, trans_size);
   1765 				ufs_lockfs_end(ulp);
   1766 			}
   1767 			return (error);
   1768 
   1769 		case _FIOSDIO:
   1770 			/*
   1771 			 * set delayed-io
   1772 			 */
   1773 			return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr));
   1774 
   1775 		case _FIOGDIO:
   1776 			/*
   1777 			 * get delayed-io
   1778 			 */
   1779 			return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr));
   1780 
   1781 		case _FIOIO:
   1782 			/*
   1783 			 * inode open
   1784 			 */
   1785 			error = ufs_lockfs_begin(ufsvfsp, &ulp,
   1786 			    ULOCKFS_VGET_MASK);
   1787 			if (error)
   1788 				return (error);
   1789 
   1790 			error = ufs_fioio(vp, (struct fioio *)arg, flag, cr);
   1791 
   1792 			if (ulp) {
   1793 				ufs_lockfs_end(ulp);
   1794 			}
   1795 			return (error);
   1796 
   1797 		case _FIOFFS:
   1798 			/*
   1799 			 * file system flush (push w/invalidate)
   1800 			 */
   1801 			if ((caddr_t)arg != NULL)
   1802 				return (EINVAL);
   1803 			return (ufs_fioffs(vp, NULL, cr));
   1804 
   1805 		case _FIOISBUSY:
   1806 			/*
   1807 			 * Contract-private interface for Legato
   1808 			 * Purge this vnode from the DNLC and decide
   1809 			 * if this vnode is busy (*arg == 1) or not
   1810 			 * (*arg == 0)
   1811 			 */
   1812 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
   1813 				return (EPERM);
   1814 			error = ufs_fioisbusy(vp, (int *)arg, cr);
   1815 			return (error);
   1816 
   1817 		case _FIODIRECTIO:
   1818 			return (ufs_fiodirectio(vp, (int)arg, cr));
   1819 
   1820 		case _FIOTUNE:
   1821 			/*
   1822 			 * Tune the file system (aka setting fs attributes)
   1823 			 */
   1824 			error = ufs_lockfs_begin(ufsvfsp, &ulp,
   1825 			    ULOCKFS_SETATTR_MASK);
   1826 			if (error)
   1827 				return (error);
   1828 
   1829 			error = ufs_fiotune(vp, (struct fiotune *)arg, cr);
   1830 
   1831 			if (ulp)
   1832 				ufs_lockfs_end(ulp);
   1833 			return (error);
   1834 
   1835 		case _FIOLOGENABLE:
   1836 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
   1837 				return (EPERM);
   1838 			return (ufs_fiologenable(vp, (void *)arg, cr, flag));
   1839 
   1840 		case _FIOLOGDISABLE:
   1841 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
   1842 				return (EPERM);
   1843 			return (ufs_fiologdisable(vp, (void *)arg, cr, flag));
   1844 
   1845 		case _FIOISLOG:
   1846 			return (ufs_fioislog(vp, (void *)arg, cr, flag));
   1847 
   1848 		case _FIOSNAPSHOTCREATE_MULTI:
   1849 		{
   1850 			struct fiosnapcreate_multi	fc, *fcp;
   1851 			size_t	fcm_size;
   1852 
   1853 			if (copyin((void *)arg, &fc, sizeof (fc)))
   1854 				return (EFAULT);
   1855 			if (fc.backfilecount > MAX_BACKFILE_COUNT)
   1856 				return (EINVAL);
   1857 			fcm_size = sizeof (struct fiosnapcreate_multi) +
   1858 			    (fc.backfilecount - 1) * sizeof (int);
   1859 			fcp = (struct fiosnapcreate_multi *)
   1860 			    kmem_alloc(fcm_size, KM_SLEEP);
   1861 			if (copyin((void *)arg, fcp, fcm_size)) {
   1862 				kmem_free(fcp, fcm_size);
   1863 				return (EFAULT);
   1864 			}
   1865 			error = ufs_snap_create(vp, fcp, cr);
   1866 			/*
   1867 			 * Do copyout even if there is an error because
   1868 			 * the details of error is stored in fcp.
   1869 			 */
   1870 			if (copyout(fcp, (void *)arg, fcm_size))
   1871 				error = EFAULT;
   1872 			kmem_free(fcp, fcm_size);
   1873 			return (error);
   1874 		}
   1875 
   1876 		case _FIOSNAPSHOTDELETE:
   1877 		{
   1878 			struct fiosnapdelete	fc;
   1879 
   1880 			if (copyin((void *)arg, &fc, sizeof (fc)))
   1881 				return (EFAULT);
   1882 			error = ufs_snap_delete(vp, &fc, cr);
   1883 			if (!error && copyout(&fc, (void *)arg, sizeof (fc)))
   1884 				error = EFAULT;
   1885 			return (error);
   1886 		}
   1887 
   1888 		case _FIOGETSUPERBLOCK:
   1889 			if (copyout(fs, (void *)arg, SBSIZE))
   1890 				return (EFAULT);
   1891 			return (0);
   1892 
   1893 		case _FIOGETMAXPHYS:
   1894 			if (copyout(&maxphys, (void *)arg, sizeof (maxphys)))
   1895 				return (EFAULT);
   1896 			return (0);
   1897 
   1898 		/*
   1899 		 * The following 3 ioctls are for TSufs support
   1900 		 * although could potentially be used elsewhere
   1901 		 */
   1902 		case _FIO_SET_LUFS_DEBUG:
   1903 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
   1904 				return (EPERM);
   1905 			lufs_debug = (uint32_t)arg;
   1906 			return (0);
   1907 
   1908 		case _FIO_SET_LUFS_ERROR:
   1909 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
   1910 				return (EPERM);
   1911 			TRANS_SETERROR(ufsvfsp);
   1912 			return (0);
   1913 
   1914 		case _FIO_GET_TOP_STATS:
   1915 		{
   1916 			fio_lufs_stats_t *ls;
   1917 			ml_unit_t *ul = ufsvfsp->vfs_log;
   1918 
   1919 			ls = kmem_zalloc(sizeof (*ls), KM_SLEEP);
   1920 			ls->ls_debug = ul->un_debug; /* return debug value */
   1921 			/* Copy stucture if statistics are being kept */
   1922 			if (ul->un_logmap->mtm_tops) {
   1923 				ls->ls_topstats = *(ul->un_logmap->mtm_tops);
   1924 			}
   1925 			error = 0;
   1926 			if (copyout(ls, (void *)arg, sizeof (*ls)))
   1927 				error = EFAULT;
   1928 			kmem_free(ls, sizeof (*ls));
   1929 			return (error);
   1930 		}
   1931 
   1932 		case _FIO_SEEK_DATA:
   1933 		case _FIO_SEEK_HOLE:
   1934 			if (ddi_copyin((void *)arg, &off, sizeof (off), flag))
   1935 				return (EFAULT);
   1936 			/* offset paramater is in/out */
   1937 			error = ufs_fio_holey(vp, cmd, &off);
   1938 			if (error)
   1939 				return (error);
   1940 			if (ddi_copyout(&off, (void *)arg, sizeof (off), flag))
   1941 				return (EFAULT);
   1942 			return (0);
   1943 
   1944 		case _FIO_COMPRESSED:
   1945 		{
   1946 			/*
   1947 			 * This is a project private ufs ioctl() to mark
   1948 			 * the inode as that belonging to a compressed
   1949 			 * file. This is used to mark individual
   1950 			 * compressed files in a miniroot archive.
   1951 			 * The files compressed in this manner are
   1952 			 * automatically decompressed by the dcfs filesystem
   1953 			 * (via an interception in ufs_lookup - see decompvp())
   1954 			 * which is layered on top of ufs on a system running
   1955 			 * from the archive. See uts/common/fs/dcfs for details.
   1956 			 * This ioctl only marks the file as compressed - the
   1957 			 * actual compression is done by fiocompress (a
   1958 			 * userland utility) which invokes this ioctl().
   1959 			 */
   1960 			struct inode *ip = VTOI(vp);
   1961 
   1962 			error = ufs_lockfs_begin(ufsvfsp, &ulp,
   1963 			    ULOCKFS_SETATTR_MASK);
   1964 			if (error)
   1965 				return (error);
   1966 
   1967 			if (ulp) {
   1968 				TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT,
   1969 				    TOP_IUPDAT_SIZE(ip));
   1970 			}
   1971 
   1972 			error = ufs_mark_compressed(vp);
   1973 
   1974 			if (ulp) {
   1975 				TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT,
   1976 				    TOP_IUPDAT_SIZE(ip));
   1977 				ufs_lockfs_end(ulp);
   1978 			}
   1979 
   1980 			return (error);
   1981 
   1982 		}
   1983 
   1984 		default:
   1985 			return (ENOTTY);
   1986 	}
   1987 }
   1988 
   1989 
   1990 /* ARGSUSED */
   1991 static int
   1992 ufs_getattr(struct vnode *vp, struct vattr *vap, int flags,
   1993 	struct cred *cr, caller_context_t *ct)
   1994 {
   1995 	struct inode *ip = VTOI(vp);
   1996 	struct ufsvfs *ufsvfsp;
   1997 	int err;
   1998 
   1999 	if (vap->va_mask == AT_SIZE) {
   2000 		/*
   2001 		 * for performance, if only the size is requested don't bother
   2002 		 * with anything else.
   2003 		 */
   2004 		UFS_GET_ISIZE(&vap->va_size, ip);
   2005 		return (0);
   2006 	}
   2007 
   2008 	/*
   2009 	 * inlined lockfs checks
   2010 	 */
   2011 	ufsvfsp = ip->i_ufsvfs;
   2012 	if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) {
   2013 		err = EIO;
   2014 		goto out;
   2015 	}
   2016 
   2017 	rw_enter(&ip->i_contents, RW_READER);
   2018 	/*
   2019 	 * Return all the attributes.  This should be refined so
   2020 	 * that it only returns what's asked for.
   2021 	 */
   2022 
   2023 	/*
   2024 	 * Copy from inode table.
   2025 	 */
   2026 	vap->va_type = vp->v_type;
   2027 	vap->va_mode = ip->i_mode & MODEMASK;
   2028 	/*
   2029 	 * If there is an ACL and there is a mask entry, then do the
   2030 	 * extra work that completes the equivalent of an acltomode(3)
   2031 	 * call.  According to POSIX P1003.1e, the acl mask should be
   2032 	 * returned in the group permissions field.
   2033 	 *
   2034 	 * - start with the original permission and mode bits (from above)
   2035 	 * - clear the group owner bits
   2036 	 * - add in the mask bits.
   2037 	 */
   2038 	if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) {
   2039 		vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3);
   2040 		vap->va_mode |=
   2041 		    (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3;
   2042 	}
   2043 	vap->va_uid = ip->i_uid;
   2044 	vap->va_gid = ip->i_gid;
   2045 	vap->va_fsid = ip->i_dev;
   2046 	vap->va_nodeid = (ino64_t)ip->i_number;
   2047 	vap->va_nlink = ip->i_nlink;
   2048 	vap->va_size = ip->i_size;
   2049 	if (vp->v_type == VCHR || vp->v_type == VBLK)
   2050 		vap->va_rdev = ip->i_rdev;
   2051 	else
   2052 		vap->va_rdev = 0;	/* not a b/c spec. */
   2053 	mutex_enter(&ip->i_tlock);
   2054 	ITIMES_NOLOCK(ip);	/* mark correct time in inode */
   2055 	vap->va_seq = ip->i_seq;
   2056 	vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
   2057 	vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000;
   2058 	vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
   2059 	vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000;
   2060 	vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
   2061 	vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000;
   2062 	mutex_exit(&ip->i_tlock);
   2063 
   2064 	switch (ip->i_mode & IFMT) {
   2065 
   2066 	case IFBLK:
   2067 		vap->va_blksize = MAXBSIZE;		/* was BLKDEV_IOSIZE */
   2068 		break;
   2069 
   2070 	case IFCHR:
   2071 		vap->va_blksize = MAXBSIZE;
   2072 		break;
   2073 
   2074 	default:
   2075 		vap->va_blksize = ip->i_fs->fs_bsize;
   2076 		break;
   2077 	}
   2078 	vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks;
   2079 	rw_exit(&ip->i_contents);
   2080 	err = 0;
   2081 
   2082 out:
   2083 	return (err);
   2084 }
   2085 
   2086 /*
   2087  * Special wrapper to provide a callback for secpolicy_vnode_setattr().
   2088  * The i_contents lock is already held by the caller and we need to
   2089  * declare the inode as 'void *' argument.
   2090  */
   2091 static int
   2092 ufs_priv_access(void *vip, int mode, struct cred *cr)
   2093 {
   2094 	struct inode *ip = vip;
   2095 
   2096 	return (ufs_iaccess(ip, mode, cr, 0));
   2097 }
   2098 
   2099 /*ARGSUSED4*/
   2100 static int
   2101 ufs_setattr(
   2102 	struct vnode *vp,
   2103 	struct vattr *vap,
   2104 	int flags,
   2105 	struct cred *cr,
   2106 	caller_context_t *ct)
   2107 {
   2108 	struct inode *ip = VTOI(vp);
   2109 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
   2110 	struct fs *fs;
   2111 	struct ulockfs *ulp;
   2112 	char *errmsg1;
   2113 	char *errmsg2;
   2114 	long blocks;
   2115 	long int mask = vap->va_mask;
   2116 	size_t len1, len2;
   2117 	int issync;
   2118 	int trans_size;
   2119 	int dotrans;
   2120 	int dorwlock;
   2121 	int error;
   2122 	int owner_change;
   2123 	int dodqlock;
   2124 	timestruc_t now;
   2125 	vattr_t oldva;
   2126 	int retry = 1;
   2127 	int indeadlock;
   2128 
   2129 	/*
   2130 	 * Cannot set these attributes.
   2131 	 */
   2132 	if ((mask & AT_NOSET) || (mask & AT_XVATTR))
   2133 		return (EINVAL);
   2134 
   2135 	/*
   2136 	 * check for forced unmount
   2137 	 */
   2138 	if (ufsvfsp == NULL)
   2139 		return (EIO);
   2140 
   2141 	fs = ufsvfsp->vfs_fs;
   2142 	if (fs->fs_ronly != 0)
   2143 		return (EROFS);
   2144 
   2145 again:
   2146 	errmsg1 = NULL;
   2147 	errmsg2 = NULL;
   2148 	dotrans = 0;
   2149 	dorwlock = 0;
   2150 	dodqlock = 0;
   2151 
   2152 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK);
   2153 	if (error)
   2154 		goto out;
   2155 
   2156 	/*
   2157 	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
   2158 	 * This follows the protocol for read()/write().
   2159 	 */
   2160 	if (vp->v_type != VDIR) {
   2161 		/*
   2162 		 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
   2163 		 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
   2164 		 * possible, retries the operation.
   2165 		 */
   2166 		ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_file);
   2167 		if (indeadlock) {
   2168 			if (ulp)
   2169 				ufs_lockfs_end(ulp);
   2170 			goto again;
   2171 		}
   2172 		dorwlock = 1;
   2173 	}
   2174 
   2175 	/*
   2176 	 * Truncate file.  Must have write permission and not be a directory.
   2177 	 */
   2178 	if (mask & AT_SIZE) {
   2179 		rw_enter(&ip->i_contents, RW_WRITER);
   2180 		if (vp->v_type == VDIR) {
   2181 			error = EISDIR;
   2182 			goto update_inode;
   2183 		}
   2184 		if (error = ufs_iaccess(ip, IWRITE, cr, 0))
   2185 			goto update_inode;
   2186 
   2187 		rw_exit(&ip->i_contents);
   2188 		error = TRANS_ITRUNC(ip, vap->va_size, 0, cr);
   2189 		if (error) {
   2190 			rw_enter(&ip->i_contents, RW_WRITER);
   2191 			goto update_inode;
   2192 		}
   2193 	}
   2194 
   2195 	if (ulp) {
   2196 		trans_size = (int)TOP_SETATTR_SIZE(ip);
   2197 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SETATTR, trans_size);
   2198 		++dotrans;
   2199 	}
   2200 
   2201 	/*
   2202 	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
   2203 	 * This follows the protocol established by
   2204 	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
   2205 	 */
   2206 	if (vp->v_type == VDIR) {
   2207 		ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_SETATTR,
   2208 		    retry_dir);
   2209 		if (indeadlock)
   2210 			goto again;
   2211 		dorwlock = 1;
   2212 	}
   2213 
   2214 	/*
   2215 	 * Grab quota lock if we are changing the file's owner.
   2216 	 */
   2217 	if (mask & AT_UID) {
   2218 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   2219 		dodqlock = 1;
   2220 	}
   2221 	rw_enter(&ip->i_contents, RW_WRITER);
   2222 
   2223 	oldva.va_mode = ip->i_mode;
   2224 	oldva.va_uid = ip->i_uid;
   2225 	oldva.va_gid = ip->i_gid;
   2226 
   2227 	vap->va_mask &= ~AT_SIZE;
   2228 
   2229 	error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
   2230 	    ufs_priv_access, ip);
   2231 	if (error)
   2232 		goto update_inode;
   2233 
   2234 	mask = vap->va_mask;
   2235 
   2236 	/*
   2237 	 * Change file access modes.
   2238 	 */
   2239 	if (mask & AT_MODE) {
   2240 		ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT);
   2241 		TRANS_INODE(ufsvfsp, ip);
   2242 		ip->i_flag |= ICHG;
   2243 		if (stickyhack) {
   2244 			mutex_enter(&vp->v_lock);
   2245 			if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
   2246 				vp->v_flag |= VSWAPLIKE;
   2247 			else
   2248 				vp->v_flag &= ~VSWAPLIKE;
   2249 			mutex_exit(&vp->v_lock);
   2250 		}
   2251 	}
   2252 	if (mask & (AT_UID|AT_GID)) {
   2253 		if (mask & AT_UID) {
   2254 			/*
   2255 			 * Don't change ownership of the quota inode.
   2256 			 */
   2257 			if (ufsvfsp->vfs_qinod == ip) {
   2258 				ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED);
   2259 				error = EINVAL;
   2260 				goto update_inode;
   2261 			}
   2262 
   2263 			/*
   2264 			 * No real ownership change.
   2265 			 */
   2266 			if (ip->i_uid == vap->va_uid) {
   2267 				blocks = 0;
   2268 				owner_change = 0;
   2269 			}
   2270 			/*
   2271 			 * Remove the blocks and the file, from the old user's
   2272 			 * quota.
   2273 			 */
   2274 			else {
   2275 				blocks = ip->i_blocks;
   2276 				owner_change = 1;
   2277 
   2278 				(void) chkdq(ip, -blocks, /* force */ 1, cr,
   2279 				    (char **)NULL, (size_t *)NULL);
   2280 				(void) chkiq(ufsvfsp, /* change */ -1, ip,
   2281 				    (uid_t)ip->i_uid, /* force */ 1, cr,
   2282 				    (char **)NULL, (size_t *)NULL);
   2283 				dqrele(ip->i_dquot);
   2284 			}
   2285 
   2286 			ip->i_uid = vap->va_uid;
   2287 
   2288 			/*
   2289 			 * There is a real ownership change.
   2290 			 */
   2291 			if (owner_change) {
   2292 				/*
   2293 				 * Add the blocks and the file to the new
   2294 				 * user's quota.
   2295 				 */
   2296 				ip->i_dquot = getinoquota(ip);
   2297 				(void) chkdq(ip, blocks, /* force */ 1, cr,
   2298 				    &errmsg1, &len1);
   2299 				(void) chkiq(ufsvfsp, /* change */ 1,
   2300 				    (struct inode *)NULL, (uid_t)ip->i_uid,
   2301 				    /* force */ 1, cr, &errmsg2, &len2);
   2302 			}
   2303 		}
   2304 		if (mask & AT_GID) {
   2305 			ip->i_gid = vap->va_gid;
   2306 		}
   2307 		TRANS_INODE(ufsvfsp, ip);
   2308 		ip->i_flag |= ICHG;
   2309 	}
   2310 	/*
   2311 	 * Change file access or modified times.
   2312 	 */
   2313 	if (mask & (AT_ATIME|AT_MTIME)) {
   2314 		/* Check that the time value is within ufs range */
   2315 		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
   2316 		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
   2317 			error = EOVERFLOW;
   2318 			goto update_inode;
   2319 		}
   2320 
   2321 		/*
   2322 		 * if the "noaccess" mount option is set and only atime
   2323 		 * update is requested, do nothing. No error is returned.
   2324 		 */
   2325 		if ((ufsvfsp->vfs_noatime) &&
   2326 		    ((mask & (AT_ATIME|AT_MTIME)) == AT_ATIME))
   2327 			goto skip_atime;
   2328 
   2329 		if (mask & AT_ATIME) {
   2330 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
   2331 			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
   2332 			ip->i_flag &= ~IACC;
   2333 		}
   2334 		if (mask & AT_MTIME) {
   2335 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
   2336 			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
   2337 			gethrestime(&now);
   2338 			if (now.tv_sec > TIME32_MAX) {
   2339 				/*
   2340 				 * In 2038, ctime sticks forever..
   2341 				 */
   2342 				ip->i_ctime.tv_sec = TIME32_MAX;
   2343 				ip->i_ctime.tv_usec = 0;
   2344 			} else {
   2345 				ip->i_ctime.tv_sec = now.tv_sec;
   2346 				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
   2347 			}
   2348 			ip->i_flag &= ~(IUPD|ICHG);
   2349 			ip->i_flag |= IMODTIME;
   2350 		}
   2351 		TRANS_INODE(ufsvfsp, ip);
   2352 		ip->i_flag |= IMOD;
   2353 	}
   2354 
   2355 skip_atime:
   2356 	/*
   2357 	 * The presence of a shadow inode may indicate an ACL, but does
   2358 	 * not imply an ACL.  Future FSD types should be handled here too
   2359 	 * and check for the presence of the attribute-specific data
   2360 	 * before referencing it.
   2361 	 */
   2362 	if (ip->i_shadow) {
   2363 		/*
   2364 		 * XXX if ufs_iupdat is changed to sandbagged write fix
   2365 		 * ufs_acl_setattr to push ip to keep acls consistent
   2366 		 *
   2367 		 * Suppress out of inodes messages if we will retry.
   2368 		 */
   2369 		if (retry)
   2370 			ip->i_flag |= IQUIET;
   2371 		error = ufs_acl_setattr(ip, vap, cr);
   2372 		ip->i_flag &= ~IQUIET;
   2373 	}
   2374 
   2375 update_inode:
   2376 	/*
   2377 	 * Setattr always increases the sequence number
   2378 	 */
   2379 	ip->i_seq++;
   2380 
   2381 	/*
   2382 	 * if nfsd and not logging; push synchronously
   2383 	 */
   2384 	if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) {
   2385 		ufs_iupdat(ip, 1);
   2386 	} else {
   2387 		ITIMES_NOLOCK(ip);
   2388 	}
   2389 
   2390 	rw_exit(&ip->i_contents);
   2391 	if (dodqlock) {
   2392 		rw_exit(&ufsvfsp->vfs_dqrwlock);
   2393 	}
   2394 	if (dorwlock)
   2395 		rw_exit(&ip->i_rwlock);
   2396 
   2397 	if (ulp) {
   2398 		if (dotrans) {
   2399 			int terr = 0;
   2400 			TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SETATTR,
   2401 			    trans_size);
   2402 			if (error == 0)
   2403 				error = terr;
   2404 		}
   2405 		ufs_lockfs_end(ulp);
   2406 	}
   2407 out:
   2408 	/*
   2409 	 * If out of inodes or blocks, see if we can free something
   2410 	 * up from the delete queue.
   2411 	 */
   2412 	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
   2413 		ufs_delete_drain_wait(ufsvfsp, 1);
   2414 		retry = 0;
   2415 		if (errmsg1 != NULL)
   2416 			kmem_free(errmsg1, len1);
   2417 		if (errmsg2 != NULL)
   2418 			kmem_free(errmsg2, len2);
   2419 		goto again;
   2420 	}
   2421 	if (errmsg1 != NULL) {
   2422 		uprintf(errmsg1);
   2423 		kmem_free(errmsg1, len1);
   2424 	}
   2425 	if (errmsg2 != NULL) {
   2426 		uprintf(errmsg2);
   2427 		kmem_free(errmsg2, len2);
   2428 	}
   2429 	return (error);
   2430 }
   2431 
   2432 /*ARGSUSED*/
   2433 static int
   2434 ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
   2435 	caller_context_t *ct)
   2436 {
   2437 	struct inode *ip = VTOI(vp);
   2438 
   2439 	if (ip->i_ufsvfs == NULL)
   2440 		return (EIO);
   2441 
   2442 	/*
   2443 	 * The ufs_iaccess function wants to be called with
   2444 	 * mode bits expressed as "ufs specific" bits.
   2445 	 * I.e., VWRITE|VREAD|VEXEC do not make sense to
   2446 	 * ufs_iaccess() but IWRITE|IREAD|IEXEC do.
   2447 	 * But since they're the same we just pass the vnode mode
   2448 	 * bit but just verify that assumption at compile time.
   2449 	 */
   2450 #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC
   2451 #error "ufs_access needs to map Vmodes to Imodes"
   2452 #endif
   2453 	return (ufs_iaccess(ip, mode, cr, 1));
   2454 }
   2455 
   2456 /* ARGSUSED */
   2457 static int
   2458 ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr,
   2459 	caller_context_t *ct)
   2460 {
   2461 	struct inode *ip = VTOI(vp);
   2462 	struct ufsvfs *ufsvfsp;
   2463 	struct ulockfs *ulp;
   2464 	int error;
   2465 	int fastsymlink;
   2466 
   2467 	if (vp->v_type != VLNK) {
   2468 		error = EINVAL;
   2469 		goto nolockout;
   2470 	}
   2471 
   2472 	/*
   2473 	 * If the symbolic link is empty there is nothing to read.
   2474 	 * Fast-track these empty symbolic links
   2475 	 */
   2476 	if (ip->i_size == 0) {
   2477 		error = 0;
   2478 		goto nolockout;
   2479 	}
   2480 
   2481 	ufsvfsp = ip->i_ufsvfs;
   2482 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK);
   2483 	if (error)
   2484 		goto nolockout;
   2485 	/*
   2486 	 * The ip->i_rwlock protects the data blocks used for FASTSYMLINK
   2487 	 */
   2488 again:
   2489 	fastsymlink = 0;
   2490 	if (ip->i_flag & IFASTSYMLNK) {
   2491 		rw_enter(&ip->i_rwlock, RW_READER);
   2492 		rw_enter(&ip->i_contents, RW_READER);
   2493 		if (ip->i_flag & IFASTSYMLNK) {
   2494 			if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
   2495 			    (ip->i_fs->fs_ronly == 0) &&
   2496 			    (!ufsvfsp->vfs_noatime)) {
   2497 				mutex_enter(&ip->i_tlock);
   2498 				ip->i_flag |= IACC;
   2499 				mutex_exit(&ip->i_tlock);
   2500 			}
   2501 			error = uiomove((caddr_t)&ip->i_db[1],
   2502 			    MIN(ip->i_size, uiop->uio_resid),
   2503 			    UIO_READ, uiop);
   2504 			ITIMES(ip);
   2505 			++fastsymlink;
   2506 		}
   2507 		rw_exit(&ip->i_contents);
   2508 		rw_exit(&ip->i_rwlock);
   2509 	}
   2510 	if (!fastsymlink) {
   2511 		ssize_t size;	/* number of bytes read  */
   2512 		caddr_t basep;	/* pointer to input data */
   2513 		ino_t ino;
   2514 		long  igen;
   2515 		struct uio tuio;	/* temp uio struct */
   2516 		struct uio *tuiop;
   2517 		iovec_t tiov;		/* temp iovec struct */
   2518 		char kbuf[FSL_SIZE];	/* buffer to hold fast symlink */
   2519 		int tflag = 0;		/* flag to indicate temp vars used */
   2520 
   2521 		ino = ip->i_number;
   2522 		igen = ip->i_gen;
   2523 		size = uiop->uio_resid;
   2524 		basep = uiop->uio_iov->iov_base;
   2525 		tuiop = uiop;
   2526 
   2527 		rw_enter(&ip->i_rwlock, RW_WRITER);
   2528 		rw_enter(&ip->i_contents, RW_WRITER);
   2529 		if (ip->i_flag & IFASTSYMLNK) {
   2530 			rw_exit(&ip->i_contents);
   2531 			rw_exit(&ip->i_rwlock);
   2532 			goto again;
   2533 		}
   2534 
   2535 		/* can this be a fast symlink and is it a user buffer? */
   2536 		if (ip->i_size <= FSL_SIZE &&
   2537 		    (uiop->uio_segflg == UIO_USERSPACE ||
   2538 		    uiop->uio_segflg == UIO_USERISPACE)) {
   2539 
   2540 			bzero(&tuio, sizeof (struct uio));
   2541 			/*
   2542 			 * setup a kernel buffer to read link into.  this
   2543 			 * is to fix a race condition where the user buffer
   2544 			 * got corrupted before copying it into the inode.
   2545 			 */
   2546 			size = ip->i_size;
   2547 			tiov.iov_len = size;
   2548 			tiov.iov_base = kbuf;
   2549 			tuio.uio_iov = &tiov;
   2550 			tuio.uio_iovcnt = 1;
   2551 			tuio.uio_offset = uiop->uio_offset;
   2552 			tuio.uio_segflg = UIO_SYSSPACE;
   2553 			tuio.uio_fmode = uiop->uio_fmode;
   2554 			tuio.uio_extflg = uiop->uio_extflg;
   2555 			tuio.uio_limit = uiop->uio_limit;
   2556 			tuio.uio_resid = size;
   2557 
   2558 			basep = tuio.uio_iov->iov_base;
   2559 			tuiop = &tuio;
   2560 			tflag = 1;
   2561 		}
   2562 
   2563 		error = rdip(ip, tuiop, 0, cr);
   2564 		if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) {
   2565 			rw_exit(&ip->i_contents);
   2566 			rw_exit(&ip->i_rwlock);
   2567 			goto out;
   2568 		}
   2569 
   2570 		if (tflag == 0)
   2571 			size -= uiop->uio_resid;
   2572 
   2573 		if ((tflag == 0 && ip->i_size <= FSL_SIZE &&
   2574 		    ip->i_size == size) || (tflag == 1 &&
   2575 		    tuio.uio_resid == 0)) {
   2576 			error = kcopy(basep, &ip->i_db[1], ip->i_size);
   2577 			if (error == 0) {
   2578 				ip->i_flag |= IFASTSYMLNK;
   2579 				/*
   2580 				 * free page
   2581 				 */
   2582 				(void) VOP_PUTPAGE(ITOV(ip),
   2583 				    (offset_t)0, PAGESIZE,
   2584 				    (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC),
   2585 				    cr, ct);
   2586 			} else {
   2587 				int i;
   2588 				/* error, clear garbage left behind */
   2589 				for (i = 1; i < NDADDR; i++)
   2590 					ip->i_db[i] = 0;
   2591 				for (i = 0; i < NIADDR; i++)
   2592 					ip->i_ib[i] = 0;
   2593 			}
   2594 		}
   2595 		if (tflag == 1) {
   2596 			/* now, copy it into the user buffer */
   2597 			error = uiomove((caddr_t)kbuf,
   2598 			    MIN(size, uiop->uio_resid),
   2599 			    UIO_READ, uiop);
   2600 		}
   2601 		rw_exit(&ip->i_contents);
   2602 		rw_exit(&ip->i_rwlock);
   2603 	}
   2604 out:
   2605 	if (ulp) {
   2606 		ufs_lockfs_end(ulp);
   2607 	}
   2608 nolockout:
   2609 	return (error);
   2610 }
   2611 
   2612 /* ARGSUSED */
   2613 static int
   2614 ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr,
   2615 	caller_context_t *ct)
   2616 {
   2617 	struct inode *ip = VTOI(vp);
   2618 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
   2619 	struct ulockfs *ulp;
   2620 	int error;
   2621 
   2622 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK);
   2623 	if (error)
   2624 		return (error);
   2625 
   2626 	if (TRANS_ISTRANS(ufsvfsp)) {
   2627 		/*
   2628 		 * First push out any data pages
   2629 		 */
   2630 		if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
   2631 		    (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) {
   2632 			error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
   2633 			    0, CRED(), ct);
   2634 			if (error)
   2635 				goto out;
   2636 		}
   2637 
   2638 		/*
   2639 		 * Delta any delayed inode times updates
   2640 		 * and push inode to log.
   2641 		 * All other inode deltas will have already been delta'd
   2642 		 * and will be pushed during the commit.
   2643 		 */
   2644 		if (!(syncflag & FDSYNC) &&
   2645 		    ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) {
   2646 			if (ulp) {
   2647 				TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC,
   2648 				    TOP_SYNCIP_SIZE);
   2649 			}
   2650 			rw_enter(&ip->i_contents, RW_READER);
   2651 			mutex_enter(&ip->i_tlock);
   2652 			ip->i_flag &= ~IMODTIME;
   2653 			mutex_exit(&ip->i_tlock);
   2654 			ufs_iupdat(ip, I_SYNC);
   2655 			rw_exit(&ip->i_contents);
   2656 			if (ulp) {
   2657 				TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC,
   2658 				    TOP_SYNCIP_SIZE);
   2659 			}
   2660 		}
   2661 
   2662 		/*
   2663 		 * Commit the Moby transaction
   2664 		 *
   2665 		 * Deltas have already been made so we just need to
   2666 		 * commit them with a synchronous transaction.
   2667 		 * TRANS_BEGIN_SYNC() will return an error
   2668 		 * if there are no deltas to commit, for an
   2669 		 * empty transaction.
   2670 		 */
   2671 		if (ulp) {
   2672 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE,
   2673 			    error);
   2674 			if (error) {
   2675 				error = 0; /* commit wasn't needed */
   2676 				goto out;
   2677 			}
   2678 			TRANS_END_SYNC(ufsvfsp, error, TOP_FSYNC,
   2679 			    TOP_COMMIT_SIZE);
   2680 		}
   2681 	} else {	/* not logging */
   2682 		if (!(IS_SWAPVP(vp)))
   2683 			if (syncflag & FNODSYNC) {
   2684 				/* Just update the inode only */
   2685 				TRANS_IUPDAT(ip, 1);
   2686 				error = 0;
   2687 			} else if (syncflag & FDSYNC)
   2688 				/* Do data-synchronous writes */
   2689 				error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC);
   2690 			else
   2691 				/* Do synchronous writes */
   2692 				error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC);
   2693 
   2694 		rw_enter(&ip->i_contents, RW_WRITER);
   2695 		if (!error)
   2696 			error = ufs_sync_indir(ip);
   2697 		rw_exit(&ip->i_contents);
   2698 	}
   2699 out:
   2700 	if (ulp) {
   2701 		ufs_lockfs_end(ulp);
   2702 	}
   2703 	return (error);
   2704 }
   2705 
   2706 /*ARGSUSED*/
   2707 static void
   2708 ufs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
   2709 {
   2710 	ufs_iinactive(VTOI(vp));
   2711 }
   2712 
   2713 /*
   2714  * Unix file system operations having to do with directory manipulation.
   2715  */
   2716 int ufs_lookup_idle_count = 2;	/* Number of inodes to idle each time */
   2717 /* ARGSUSED */
   2718 static int
   2719 ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
   2720 	struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr,
   2721 	caller_context_t *ct, int *direntflags, pathname_t *realpnp)
   2722 {
   2723 	struct inode *ip;
   2724 	struct inode *sip;
   2725 	struct inode *xip;
   2726 	struct ufsvfs *ufsvfsp;
   2727 	struct ulockfs *ulp;
   2728 	struct vnode *vp;
   2729 	int error;
   2730 
   2731 	/*
   2732 	 * Check flags for type of lookup (regular file or attribute file)
   2733 	 */
   2734 
   2735 	ip = VTOI(dvp);
   2736 
   2737 	if (flags & LOOKUP_XATTR) {
   2738 
   2739 		/*
   2740 		 * If not mounted with XATTR support then return EINVAL
   2741 		 */
   2742 
   2743 		if (!(ip->i_ufsvfs->vfs_vfs->vfs_flag & VFS_XATTR))
   2744 			return (EINVAL);
   2745 		/*
   2746 		 * We don't allow recursive attributes...
   2747 		 * Maybe someday we will.
   2748 		 */
   2749 		if ((ip->i_cflags & IXATTR)) {
   2750 			return (EINVAL);
   2751 		}
   2752 
   2753 		if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) {
   2754 			error = ufs_xattr_getattrdir(dvp, &sip, flags, cr);
   2755 			if (error) {
   2756 				*vpp = NULL;
   2757 				goto out;
   2758 			}
   2759 
   2760 			vp = ITOV(sip);
   2761 			dnlc_update(dvp, XATTR_DIR_NAME, vp);
   2762 		}
   2763 
   2764 		/*
   2765 		 * Check accessibility of directory.
   2766 		 */
   2767 		if (vp == DNLC_NO_VNODE) {
   2768 			VN_RELE(vp);
   2769 			error = ENOENT;
   2770 			goto out;
   2771 		}
   2772 		if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr, 1)) != 0) {
   2773 			VN_RELE(vp);
   2774 			goto out;
   2775 		}
   2776 
   2777 		*vpp = vp;
   2778 		return (0);
   2779 	}
   2780 
   2781 	/*
   2782 	 * Check for a null component, which we should treat as
   2783 	 * looking at dvp from within it's parent, so we don't
   2784 	 * need a call to ufs_iaccess(), as it has already been
   2785 	 * done.
   2786 	 */
   2787 	if (nm[0] == 0) {
   2788 		VN_HOLD(dvp);
   2789 		error = 0;
   2790 		*vpp = dvp;
   2791 		goto out;
   2792 	}
   2793 
   2794 	/*
   2795 	 * Check for "." ie itself. this is a quick check and
   2796 	 * avoids adding "." into the dnlc (which have been seen
   2797 	 * to occupy >10% of the cache).
   2798 	 */
   2799 	if ((nm[0] == '.') && (nm[1] == 0)) {
   2800 		/*
   2801 		 * Don't return without checking accessibility
   2802 		 * of the directory. We only need the lock if
   2803 		 * we are going to return it.
   2804 		 */
   2805 		if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) == 0) {
   2806 			VN_HOLD(dvp);
   2807 			*vpp = dvp;
   2808 		}
   2809 		goto out;
   2810 	}
   2811 
   2812 	/*
   2813 	 * Fast path: Check the directory name lookup cache.
   2814 	 */
   2815 	if (vp = dnlc_lookup(dvp, nm)) {
   2816 		/*
   2817 		 * Check accessibility of directory.
   2818 		 */
   2819 		if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) != 0) {
   2820 			VN_RELE(vp);
   2821 			goto out;
   2822 		}
   2823 		if (vp == DNLC_NO_VNODE) {
   2824 			VN_RELE(vp);
   2825 			error = ENOENT;
   2826 			goto out;
   2827 		}
   2828 		xip = VTOI(vp);
   2829 		ulp = NULL;
   2830 		goto fastpath;
   2831 	}
   2832 
   2833 	/*
   2834 	 * Keep the idle queue from getting too long by
   2835 	 * idling two inodes before attempting to allocate another.
   2836 	 *    This operation must be performed before entering
   2837 	 *    lockfs or a transaction.
   2838 	 */
   2839 	if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
   2840 		if ((curthread->t_flag & T_DONTBLOCK) == 0) {
   2841 			ins.in_lidles.value.ul += ufs_lookup_idle_count;
   2842 			ufs_idle_some(ufs_lookup_idle_count);
   2843 		}
   2844 
   2845 retry_lookup:
   2846 	/*
   2847 	 * Check accessibility of directory.
   2848 	 */
   2849 	if (error = ufs_diraccess(ip, IEXEC, cr))
   2850 		goto out;
   2851 
   2852 	ufsvfsp = ip->i_ufsvfs;
   2853 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK);
   2854 	if (error)
   2855 		goto out;
   2856 
   2857 	error = ufs_dirlook(ip, nm, &xip, cr, 1);
   2858 
   2859 fastpath:
   2860 	if (error == 0) {
   2861 		ip = xip;
   2862 		*vpp = ITOV(ip);
   2863 
   2864 		/*
   2865 		 * If vnode is a device return special vnode instead.
   2866 		 */
   2867 		if (IS_DEVVP(*vpp)) {
   2868 			struct vnode *newvp;
   2869 
   2870 			newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
   2871 			    cr);
   2872 			VN_RELE(*vpp);
   2873 			if (newvp == NULL)
   2874 				error = ENOSYS;
   2875 			else
   2876 				*vpp = newvp;
   2877 		} else if (ip->i_cflags & ICOMPRESS) {
   2878 			struct vnode *newvp;
   2879 
   2880 			/*
   2881 			 * Compressed file, substitute dcfs vnode
   2882 			 */
   2883 			newvp = decompvp(*vpp, cr, ct);
   2884 			VN_RELE(*vpp);
   2885 			if (newvp == NULL)
   2886 				error = ENOSYS;
   2887 			else
   2888 				*vpp = newvp;
   2889 		}
   2890 	}
   2891 	if (ulp) {
   2892 		ufs_lockfs_end(ulp);
   2893 	}
   2894 
   2895 	if (error == EAGAIN)
   2896 		goto retry_lookup;
   2897 
   2898 out:
   2899 	return (error);
   2900 }
   2901 
   2902 /*ARGSUSED*/
   2903 static int
   2904 ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl,
   2905 	int mode, struct vnode **vpp, struct cred *cr, int flag,
   2906 	caller_context_t *ct, vsecattr_t *vsecp)
   2907 {
   2908 	struct inode *ip;
   2909 	struct inode *xip;
   2910 	struct inode *dip;
   2911 	struct vnode *xvp;
   2912 	struct ufsvfs *ufsvfsp;
   2913 	struct ulockfs *ulp;
   2914 	int error;
   2915 	int issync;
   2916 	int truncflag;
   2917 	int trans_size;
   2918 	int noentry;
   2919 	int defer_dip_seq_update = 0;	/* need to defer update of dip->i_seq */
   2920 	int retry = 1;
   2921 	int indeadlock;
   2922 
   2923 again:
   2924 	ip = VTOI(dvp);
   2925 	ufsvfsp = ip->i_ufsvfs;
   2926 	truncflag = 0;
   2927 
   2928 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK);
   2929 	if (error)
   2930 		goto out;
   2931 
   2932 	if (ulp) {
   2933 		trans_size = (int)TOP_CREATE_SIZE(ip);
   2934 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_CREATE, trans_size);
   2935 	}
   2936 
   2937 	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
   2938 		vap->va_mode &= ~VSVTX;
   2939 
   2940 	if (*name == '\0') {
   2941 		/*
   2942 		 * Null component name refers to the directory itself.
   2943 		 */
   2944 		VN_HOLD(dvp);
   2945 		/*
   2946 		 * Even though this is an error case, we need to grab the
   2947 		 * quota lock since the error handling code below is common.
   2948 		 */
   2949 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   2950 		rw_enter(&ip->i_contents, RW_WRITER);
   2951 		error = EEXIST;
   2952 	} else {
   2953 		xip = NULL;
   2954 		noentry = 0;
   2955 		/*
   2956 		 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
   2957 		 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
   2958 		 * possible, retries the operation.
   2959 		 */
   2960 		ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_CREATE,
   2961 		    retry_dir);
   2962 		if (indeadlock)
   2963 			goto again;
   2964 
   2965 		xvp = dnlc_lookup(dvp, name);
   2966 		if (xvp == DNLC_NO_VNODE) {
   2967 			noentry = 1;
   2968 			VN_RELE(xvp);
   2969 			xvp = NULL;
   2970 		}
   2971 		if (xvp) {
   2972 			rw_exit(&ip->i_rwlock);
   2973 			if (error = ufs_iaccess(ip, IEXEC, cr, 1)) {
   2974 				VN_RELE(xvp);
   2975 			} else {
   2976 				error = EEXIST;
   2977 				xip = VTOI(xvp);
   2978 			}
   2979 		} else {
   2980 			/*
   2981 			 * Suppress file system full message if we will retry
   2982 			 */
   2983 			error = ufs_direnter_cm(ip, name, DE_CREATE,
   2984 			    vap, &xip, cr, (noentry | (retry ? IQUIET : 0)));
   2985 			if (error == EAGAIN) {
   2986 				if (ulp) {
   2987 					TRANS_END_CSYNC(ufsvfsp, error, issync,
   2988 					    TOP_CREATE, trans_size);
   2989 					ufs_lockfs_end(ulp);
   2990 				}
   2991 				goto again;
   2992 			}
   2993 			rw_exit(&ip->i_rwlock);
   2994 		}
   2995 		ip = xip;
   2996 		if (ip != NULL) {
   2997 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   2998 			rw_enter(&ip->i_contents, RW_WRITER);
   2999 		}
   3000 	}
   3001 
   3002 	/*
   3003 	 * If the file already exists and this is a non-exclusive create,
   3004 	 * check permissions and allow access for non-directories.
   3005 	 * Read-only create of an existing directory is also allowed.
   3006 	 * We fail an exclusive create of anything which already exists.
   3007 	 */
   3008 	if (error == EEXIST) {
   3009 		dip = VTOI(dvp);
   3010 		if (excl == NONEXCL) {
   3011 			if ((((ip->i_mode & IFMT) == IFDIR) ||
   3012 			    ((ip->i_mode & IFMT) == IFATTRDIR)) &&
   3013 			    (mode & IWRITE))
   3014 				error = EISDIR;
   3015 			else if (mode)
   3016 				error = ufs_iaccess(ip, mode, cr, 0);
   3017 			else
   3018 				error = 0;
   3019 		}
   3020 		if (error) {
   3021 			rw_exit(&ip->i_contents);
   3022 			rw_exit(&ufsvfsp->vfs_dqrwlock);
   3023 			VN_RELE(ITOV(ip));
   3024 			goto unlock;
   3025 		}
   3026 		/*
   3027 		 * If the error EEXIST was set, then i_seq can not
   3028 		 * have been updated. The sequence number interface
   3029 		 * is defined such that a non-error VOP_CREATE must
   3030 		 * increase the dir va_seq it by at least one. If we
   3031 		 * have cleared the error, increase i_seq. Note that
   3032 		 * we are increasing the dir i_seq and in rare cases
   3033 		 * ip may actually be from the dvp, so we already have
   3034 		 * the locks and it will not be subject to truncation.
   3035 		 * In case we have to update i_seq of the parent
   3036 		 * directory dip, we have to defer it till we have
   3037 		 * released our locks on ip due to lock ordering requirements.
   3038 		 */
   3039 		if (ip != dip)
   3040 			defer_dip_seq_update = 1;
   3041 		else
   3042 			ip->i_seq++;
   3043 
   3044 		if (((ip->i_mode & IFMT) == IFREG) &&
   3045 		    (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
   3046 			/*
   3047 			 * Truncate regular files, if requested by caller.
   3048 			 * Grab i_rwlock to make sure no one else is
   3049 			 * currently writing to the file (we promised
   3050 			 * bmap we would do this).
   3051 			 * Must get the locks in the correct order.
   3052 			 */
   3053 			if (ip->i_size == 0) {
   3054 				ip->i_flag |= ICHG | IUPD;
   3055 				ip->i_seq++;
   3056 				TRANS_INODE(ufsvfsp, ip);
   3057 			} else {
   3058 				/*
   3059 				 * Large Files: Why this check here?
   3060 				 * Though we do it in vn_create() we really
   3061 				 * want to guarantee that we do not destroy
   3062 				 * Large file data by atomically checking
   3063 				 * the size while holding the contents
   3064 				 * lock.
   3065 				 */
   3066 				if (flag && !(flag & FOFFMAX) &&
   3067 				    ((ip->i_mode & IFMT) == IFREG) &&
   3068 				    (ip->i_size > (offset_t)MAXOFF32_T)) {
   3069 					rw_exit(&ip->i_contents);
   3070 					rw_exit(&ufsvfsp->vfs_dqrwlock);
   3071 					error = EOVERFLOW;
   3072 					goto unlock;
   3073 				}
   3074 				if (TRANS_ISTRANS(ufsvfsp))
   3075 					truncflag++;
   3076 				else {
   3077 					rw_exit(&ip->i_contents);
   3078 					rw_exit(&ufsvfsp->vfs_dqrwlock);
   3079 					ufs_tryirwlock_trans(&ip->i_rwlock,
   3080 					    RW_WRITER, TOP_CREATE,
   3081 					    retry_file);
   3082 					if (indeadlock) {
   3083 						VN_RELE(ITOV(ip));
   3084 						goto again;
   3085 					}
   3086 					rw_enter(&ufsvfsp->vfs_dqrwlock,
   3087 					    RW_READER);
   3088 					rw_enter(&ip->i_contents, RW_WRITER);
   3089 					(void) ufs_itrunc(ip, (u_offset_t)0, 0,
   3090 					    cr);
   3091 					rw_exit(&ip->i_rwlock);
   3092 				}
   3093 
   3094 			}
   3095 			if (error == 0) {
   3096 				vnevent_create(ITOV(ip), ct);
   3097 			}
   3098 		}
   3099 	}
   3100 
   3101 	if (error) {
   3102 		if (ip != NULL) {
   3103 			rw_exit(&ufsvfsp->vfs_dqrwlock);
   3104 			rw_exit(&ip->i_contents);
   3105 		}
   3106 		goto unlock;
   3107 	}
   3108 
   3109 	*vpp = ITOV(ip);
   3110 	ITIMES(ip);
   3111 	rw_exit(&ip->i_contents);
   3112 	rw_exit(&ufsvfsp->vfs_dqrwlock);
   3113 
   3114 	/*
   3115 	 * If vnode is a device return special vnode instead.
   3116 	 */
   3117 	if (!error && IS_DEVVP(*vpp)) {
   3118 		struct vnode *newvp;
   3119 
   3120 		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
   3121 		VN_RELE(*vpp);
   3122 		if (newvp == NULL) {
   3123 			error = ENOSYS;
   3124 			goto unlock;
   3125 		}
   3126 		truncflag = 0;
   3127 		*vpp = newvp;
   3128 	}
   3129 unlock:
   3130 
   3131 	/*
   3132 	 * Do the deferred update of the parent directory's sequence
   3133 	 * number now.
   3134 	 */
   3135 	if (defer_dip_seq_update == 1) {
   3136 		rw_enter(&dip->i_contents, RW_READER);
   3137 		mutex_enter(&dip->i_tlock);
   3138 		dip->i_seq++;
   3139 		mutex_exit(&dip->i_tlock);
   3140 		rw_exit(&dip->i_contents);
   3141 	}
   3142 
   3143 	if (ulp) {
   3144 		int terr = 0;
   3145 
   3146 		TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_CREATE,
   3147 		    trans_size);
   3148 
   3149 		/*
   3150 		 * If we haven't had a more interesting failure
   3151 		 * already, then anything that might've happened
   3152 		 * here should be reported.
   3153 		 */
   3154 		if (error == 0)
   3155 			error = terr;
   3156 	}
   3157 
   3158 	if (!error && truncflag) {
   3159 		ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_trunc);
   3160 		if (indeadlock) {
   3161 			if (ulp)
   3162 				ufs_lockfs_end(ulp);
   3163 			VN_RELE(ITOV(ip));
   3164 			goto again;
   3165 		}
   3166 		(void) TRANS_ITRUNC(ip, (u_offset_t)0, 0, cr);
   3167 		rw_exit(&ip->i_rwlock);
   3168 	}
   3169 
   3170 	if (ulp)
   3171 		ufs_lockfs_end(ulp);
   3172 
   3173 	/*
   3174 	 * If no inodes available, try to free one up out of the
   3175 	 * pending delete queue.
   3176 	 */
   3177 	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
   3178 		ufs_delete_drain_wait(ufsvfsp, 1);
   3179 		retry = 0;
   3180 		goto again;
   3181 	}
   3182 
   3183 out:
   3184 	return (error);
   3185 }
   3186 
/*
 * Defined elsewhere.  ufs_remove() compares the length of the per-file
 * system delete queue against this value to decide whether to drain the
 * queue before removing another name.
 */
extern int ufs_idle_max;
   3188 /*ARGSUSED*/
   3189 static int
   3190 ufs_remove(struct vnode *vp, char *nm, struct cred *cr,
   3191 	caller_context_t *ct, int flags)
   3192 {
   3193 	struct inode *ip = VTOI(vp);
   3194 	struct ufsvfs *ufsvfsp	= ip->i_ufsvfs;
   3195 	struct ulockfs *ulp;
   3196 	vnode_t *rmvp = NULL;	/* Vnode corresponding to name being removed */
   3197 	int indeadlock;
   3198 	int error;
   3199 	int issync;
   3200 	int trans_size;
   3201 
   3202 	/*
   3203 	 * don't let the delete queue get too long
   3204 	 */
   3205 	if (ufsvfsp == NULL) {
   3206 		error = EIO;
   3207 		goto out;
   3208 	}
   3209 	if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
   3210 		ufs_delete_drain(vp->v_vfsp, 1, 1);
   3211 
   3212 retry_remove:
   3213 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK);
   3214 	if (error)
   3215 		goto out;
   3216 
   3217 	if (ulp)
   3218 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
   3219 		    trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp)));
   3220 
   3221 	/*
   3222 	 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
   3223 	 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
   3224 	 * possible, retries the operation.
   3225 	 */
   3226 	ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_REMOVE, retry);
   3227 	if (indeadlock)
   3228 		goto retry_remove;
   3229 	error = ufs_dirremove(ip, nm, (struct inode *)0, (struct vnode *)0,
   3230 	    DR_REMOVE, cr, &rmvp);
   3231 	rw_exit(&ip->i_rwlock);
   3232 
   3233 	if (ulp) {
   3234 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_REMOVE, trans_size);
   3235 		ufs_lockfs_end(ulp);
   3236 	}
   3237 
   3238 	/*
   3239 	 * This must be called after the remove transaction is closed.
   3240 	 */
   3241 	if (rmvp != NULL) {
   3242 		/* Only send the event if there were no errors */
   3243 		if (error == 0)
   3244 			vnevent_remove(rmvp, vp, nm, ct);
   3245 		VN_RELE(rmvp);
   3246 	}
   3247 out:
   3248 	return (error);
   3249 }
   3250 
   3251 /*
   3252  * Link a file or a directory.  Only privileged processes are allowed to
   3253  * make links to directories.
   3254  */
   3255 /*ARGSUSED*/
   3256 static int
   3257 ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr,
   3258 	caller_context_t *ct, int flags)
   3259 {
   3260 	struct inode *sip;
   3261 	struct inode *tdp = VTOI(tdvp);
   3262 	struct ufsvfs *ufsvfsp = tdp->i_ufsvfs;
   3263 	struct ulockfs *ulp;
   3264 	struct vnode *realvp;
   3265 	int error;
   3266 	int issync;
   3267 	int trans_size;
   3268 	int isdev;
   3269 	int indeadlock;
   3270 
   3271 retry_link:
   3272 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK);
   3273 	if (error)
   3274 		goto out;
   3275 
   3276 	if (ulp)
   3277 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_LINK,
   3278 		    trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp)));
   3279 
   3280 	if (VOP_REALVP(svp, &realvp, ct) == 0)
   3281 		svp = realvp;
   3282 
   3283 	/*
   3284 	 * Make sure link for extended attributes is valid
   3285 	 * We only support hard linking of attr in ATTRDIR to ATTRDIR
   3286 	 *
   3287 	 * Make certain we don't attempt to look at a device node as
   3288 	 * a ufs inode.
   3289 	 */
   3290 
   3291 	isdev = IS_DEVVP(svp);
   3292 	if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) &&
   3293 	    ((tdp->i_mode & IFMT) == IFATTRDIR)) ||
   3294 	    ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) &&
   3295 	    ((tdp->i_mode & IFMT) == IFDIR))) {
   3296 		error = EINVAL;
   3297 		goto unlock;
   3298 	}
   3299 
   3300 	sip = VTOI(svp);
   3301 	if ((svp->v_type == VDIR &&
   3302 	    secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) ||
   3303 	    (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) {
   3304 		error = EPERM;
   3305 		goto unlock;
   3306 	}
   3307 
   3308 	/*
   3309 	 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
   3310 	 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
   3311 	 * possible, retries the operation.
   3312 	 */
   3313 	ufs_tryirwlock_trans(&tdp->i_rwlock, RW_WRITER, TOP_LINK, retry);
   3314 	if (indeadlock)
   3315 		goto retry_link;
   3316 	error = ufs_direnter_lr(tdp, tnm, DE_LINK, (struct inode *)0,
   3317 	    sip, cr, NULL);
   3318 	rw_exit(&tdp->i_rwlock);
   3319 
   3320 unlock:
   3321 	if (ulp) {
   3322 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_LINK, trans_size);
   3323 		ufs_lockfs_end(ulp);
   3324 	}
   3325 
   3326 	if (!error) {
   3327 		vnevent_link(svp, ct);
   3328 	}
   3329 out:
   3330 	return (error);
   3331 }
   3332 
/*
 * Counters and tunable used by ufs_rename().
 *
 * ufs_rename_retry_cnt is bumped each time the second directory lock
 * could not be taken and the lock order is reversed for another try.
 * ufs_rename_upgrade_retry_cnt is bumped each time an upgrade of a
 * directory lock to writer fails and the locking is restarted.
 * ufs_rename_dircheck_retry_cnt is bumped each time ufs_dircheckpath()
 * returns EAGAIN and the locking is restarted.
 * ufs_rename_backoff_delay is the argument passed to delay() before
 * the latter two kinds of restart.
 */
uint64_t ufs_rename_retry_cnt;
uint64_t ufs_rename_upgrade_retry_cnt;
uint64_t ufs_rename_dircheck_retry_cnt;
clock_t	 ufs_rename_backoff_delay = 1;
   3337 
   3338 /*
   3339  * Rename a file or directory.
   3340  * We are given the vnode and entry string of the source and the
   3341  * vnode and entry string of the place we want to move the source
   3342  * to (the target). The essential operation is:
   3343  *	unlink(target);
   3344  *	link(source, target);
   3345  *	unlink(source);
   3346  * but "atomically".  Can't do full commit without saving state in
   3347  * the inode on disk, which isn't feasible at this time.  Best we
   3348  * can do is always guarantee that the TARGET exists.
   3349  */
   3350 
   3351 /*ARGSUSED*/
   3352 static int
   3353 ufs_rename(
   3354 	struct vnode *sdvp,		/* old (source) parent vnode */
   3355 	char *snm,			/* old (source) entry name */
   3356 	struct vnode *tdvp,		/* new (target) parent vnode */
   3357 	char *tnm,			/* new (target) entry name */
   3358 	struct cred *cr,
   3359 	caller_context_t *ct,
   3360 	int flags)
   3361 {
   3362 	struct inode *sip = NULL;	/* source inode */
   3363 	struct inode *ip = NULL;	/* check inode */
   3364 	struct inode *sdp;		/* old (source) parent inode */
   3365 	struct inode *tdp;		/* new (target) parent inode */
   3366 	struct vnode *tvp = NULL;	/* target vnode, if it exists */
   3367 	struct vnode *realvp;
   3368 	struct ufsvfs *ufsvfsp;
   3369 	struct ulockfs *ulp;
   3370 	struct ufs_slot slot;
   3371 	timestruc_t now;
   3372 	int error;
   3373 	int issync;
   3374 	int trans_size;
   3375 	krwlock_t *first_lock;
   3376 	krwlock_t *second_lock;
   3377 	krwlock_t *reverse_lock;
   3378 
   3379 	sdp = VTOI(sdvp);
   3380 	slot.fbp = NULL;
   3381 	ufsvfsp = sdp->i_ufsvfs;
   3382 retry_rename:
   3383 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK);
   3384 	if (error)
   3385 		goto out;
   3386 
   3387 	if (ulp)
   3388 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME,
   3389 		    trans_size = (int)TOP_RENAME_SIZE(sdp));
   3390 
   3391 	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
   3392 		tdvp = realvp;
   3393 
   3394 	tdp = VTOI(tdvp);
   3395 
   3396 	/*
   3397 	 * We only allow renaming of attributes from ATTRDIR to ATTRDIR.
   3398 	 */
   3399 	if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) {
   3400 		error = EINVAL;
   3401 		goto unlock;
   3402 	}
   3403 
   3404 	/*
   3405 	 * Check accessibility of directory.
   3406 	 */
   3407 	if (error = ufs_diraccess(sdp, IEXEC, cr))
   3408 		goto unlock;
   3409 
   3410 	/*
   3411 	 * Look up inode of file we're supposed to rename.
   3412 	 */
   3413 	gethrestime(&now);
   3414 	if (error = ufs_dirlook(sdp, snm, &sip, cr, 0)) {
   3415 		if (error == EAGAIN) {
   3416 			if (ulp) {
   3417 				TRANS_END_CSYNC(ufsvfsp, error, issync,
   3418 				    TOP_RENAME, trans_size);
   3419 				ufs_lockfs_end(ulp);
   3420 			}
   3421 			goto retry_rename;
   3422 		}
   3423 
   3424 		goto unlock;
   3425 	}
   3426 
   3427 	/*
   3428 	 * Lock both the source and target directories (they may be
   3429 	 * the same) to provide the atomicity semantics that was
   3430 	 * previously provided by the per file system vfs_rename_lock
   3431 	 *
   3432 	 * with vfs_rename_lock removed to allow simultaneous renames
   3433 	 * within a file system, ufs_dircheckpath can deadlock while
   3434 	 * traversing back to ensure that source is not a parent directory
   3435 	 * of target parent directory. This is because we get into
   3436 	 * ufs_dircheckpath with the sdp and tdp locks held as RW_WRITER.
   3437 	 * If the tdp and sdp of the simultaneous renames happen to be
   3438 	 * in the path of each other, it can lead to a deadlock. This
   3439 	 * can be avoided by getting the locks as RW_READER here and then
   3440 	 * upgrading to RW_WRITER after completing the ufs_dircheckpath.
   3441 	 *
   3442 	 * We hold the target directory's i_rwlock after calling
   3443 	 * ufs_lockfs_begin but in many other operations (like ufs_readdir)
   3444 	 * VOP_RWLOCK is explicitly called by the filesystem independent code
   3445 	 * before calling the file system operation. In these cases the order
   3446 	 * is reversed (i.e i_rwlock is taken first and then ufs_lockfs_begin
   3447 	 * is called). This is fine as long as ufs_lockfs_begin acts as a VOP
   3448 	 * counter but with ufs_quiesce setting the SLOCK bit this becomes a
   3449 	 * synchronizing object which might lead to a deadlock. So we use
   3450 	 * rw_tryenter instead of rw_enter. If we fail to get this lock and
   3451 	 * find that SLOCK bit is set, we call ufs_lockfs_end and restart the
   3452 	 * operation.
   3453 	 */
   3454 retry:
   3455 	first_lock = &tdp->i_rwlock;
   3456 	second_lock = &sdp->i_rwlock;
   3457 retry_firstlock:
   3458 	if (!rw_tryenter(first_lock, RW_READER)) {
   3459 		/*
   3460 		 * We didn't get the lock. Check if the SLOCK is set in the
   3461 		 * ufsvfs. If yes, we might be in a deadlock. Safer to give up
   3462 		 * and wait for SLOCK to be cleared.
   3463 		 */
   3464 
   3465 		if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
   3466 			TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
   3467 			    trans_size);
   3468 			ufs_lockfs_end(ulp);
   3469 			goto retry_rename;
   3470 
   3471 		} else {
   3472 			/*
   3473 			 * SLOCK isn't set so this is a genuine synchronization
   3474 			 * case. Let's try again after giving them a breather.
   3475 			 */
   3476 			delay(RETRY_LOCK_DELAY);
   3477 			goto  retry_firstlock;
   3478 		}
   3479 	}
   3480 	/*
   3481 	 * Need to check if the tdp and sdp are same !!!
   3482 	 */
   3483 	if ((tdp != sdp) && (!rw_tryenter(second_lock, RW_READER))) {
   3484 		/*
   3485 		 * We didn't get the lock. Check if the SLOCK is set in the
   3486 		 * ufsvfs. If yes, we might be in a deadlock. Safer to give up
   3487 		 * and wait for SLOCK to be cleared.
   3488 		 */
   3489 
   3490 		rw_exit(first_lock);
   3491 		if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
   3492 			TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
   3493 			    trans_size);
   3494 			ufs_lockfs_end(ulp);
   3495 			goto retry_rename;
   3496 
   3497 		} else {
   3498 			/*
   3499 			 * So we couldn't get the second level peer lock *and*
   3500 			 * the SLOCK bit isn't set. Too bad we can be
   3501 			 * contentding with someone wanting these locks otherway
   3502 			 * round. Reverse the locks in case there is a heavy
   3503 			 * contention for the second level lock.
   3504 			 */
   3505 			reverse_lock = first_lock;
   3506 			first_lock = second_lock;
   3507 			second_lock = reverse_lock;
   3508 			ufs_rename_retry_cnt++;
   3509 			goto  retry_firstlock;
   3510 		}
   3511 	}
   3512 
   3513 	if (sip == tdp) {
   3514 		error = EINVAL;
   3515 		goto errout;
   3516 	}
   3517 	/*
   3518 	 * Make sure we can delete the source entry.  This requires
   3519 	 * write permission on the containing directory.
   3520 	 * Check for sticky directories.
   3521 	 */
   3522 	rw_enter(&sdp->i_contents, RW_READER);
   3523 	rw_enter(&sip->i_contents, RW_READER);
   3524 	if ((error = ufs_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
   3525 	    (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) {
   3526 		rw_exit(&sip->i_contents);
   3527 		rw_exit(&sdp->i_contents);
   3528 		goto errout;
   3529 	}
   3530 
   3531 	/*
   3532 	 * If this is a rename of a directory and the parent is
   3533 	 * different (".." must be changed), then the source
   3534 	 * directory must not be in the directory hierarchy
   3535 	 * above the target, as this would orphan everything
   3536 	 * below the source directory.  Also the user must have
   3537 	 * write permission in the source so as to be able to
   3538 	 * change "..".
   3539 	 */
   3540 	if ((((sip->i_mode & IFMT) == IFDIR) ||
   3541 	    ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) {
   3542 		ino_t	inum;
   3543 
   3544 		if (error = ufs_iaccess(sip, IWRITE, cr, 0)) {
   3545 			rw_exit(&sip->i_contents);
   3546 			rw_exit(&sdp->i_contents);
   3547 			goto errout;
   3548 		}
   3549 		inum = sip->i_number;
   3550 		rw_exit(&sip->i_contents);
   3551 		rw_exit(&sdp->i_contents);
   3552 		if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) {
   3553 			/*
   3554 			 * If we got EAGAIN ufs_dircheckpath detected a
   3555 			 * potential deadlock and backed out. We need
   3556 			 * to retry the operation since sdp and tdp have
   3557 			 * to be released to avoid the deadlock.
   3558 			 */
   3559 			if (error == EAGAIN) {
   3560 				rw_exit(&tdp->i_rwlock);
   3561 				if (tdp != sdp)
   3562 					rw_exit(&sdp->i_rwlock);
   3563 				delay(ufs_rename_backoff_delay);
   3564 				ufs_rename_dircheck_retry_cnt++;
   3565 				goto retry;
   3566 			}
   3567 			goto errout;
   3568 		}
   3569 	} else {
   3570 		rw_exit(&sip->i_contents);
   3571 		rw_exit(&sdp->i_contents);
   3572 	}
   3573 
   3574 
   3575 	/*
   3576 	 * Check for renaming '.' or '..' or alias of '.'
   3577 	 */
   3578 	if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) {
   3579 		error = EINVAL;
   3580 		goto errout;
   3581 	}
   3582 
   3583 	/*
   3584 	 * Simultaneous renames can deadlock in ufs_dircheckpath since it
   3585 	 * tries to traverse back the file tree with both tdp and sdp held
   3586 	 * as RW_WRITER. To avoid that we have to hold the tdp and sdp locks
   3587 	 * as RW_READERS  till ufs_dircheckpath is done.
   3588 	 * Now that ufs_dircheckpath is done with, we can upgrade the locks
   3589 	 * to RW_WRITER.
   3590 	 */
   3591 	if (!rw_tryupgrade(&tdp->i_rwlock)) {
   3592 		/*
   3593 		 * The upgrade failed. We got to give away the lock
   3594 		 * as to avoid deadlocking with someone else who is
   3595 		 * waiting for writer lock. With the lock gone, we
   3596 		 * cannot be sure the checks done above will hold
   3597 		 * good when we eventually get them back as writer.
   3598 		 * So if we can't upgrade we drop the locks and retry
   3599 		 * everything again.
   3600 		 */
   3601 		rw_exit(&tdp->i_rwlock);
   3602 		if (tdp != sdp)
   3603 			rw_exit(&sdp->i_rwlock);
   3604 		delay(ufs_rename_backoff_delay);
   3605 		ufs_rename_upgrade_retry_cnt++;
   3606 		goto retry;
   3607 	}
   3608 	if (tdp != sdp) {
   3609 		if (!rw_tryupgrade(&sdp->i_rwlock)) {
   3610 			/*
   3611 			 * The upgrade failed. We got to give away the lock
   3612 			 * as to avoid deadlocking with someone else who is
   3613 			 * waiting for writer lock. With the lock gone, we
   3614 			 * cannot be sure the checks done above will hold
   3615 			 * good when we eventually get them back as writer.
   3616 			 * So if we can't upgrade we drop the locks and retry
   3617 			 * everything again.
   3618 			 */
   3619 			rw_exit(&tdp->i_rwlock);
   3620 			rw_exit(&sdp->i_rwlock);
   3621 			delay(ufs_rename_backoff_delay);
   3622 			ufs_rename_upgrade_retry_cnt++;
   3623 			goto retry;
   3624 		}
   3625 	}
   3626 
   3627 	/*
   3628 	 * Now that all the locks are held check to make sure another thread
   3629 	 * didn't slip in and take out the sip.
   3630 	 */
   3631 	slot.status = NONE;
   3632 	if ((sip->i_ctime.tv_usec * 1000) > now.tv_nsec ||
   3633 	    sip->i_ctime.tv_sec > now.tv_sec) {
   3634 		rw_enter(&sdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
   3635 		rw_enter(&sdp->i_contents, RW_WRITER);
   3636 		error = ufs_dircheckforname(sdp, snm, strlen(snm), &slot,
   3637 		    &ip, cr, 0);
   3638 		rw_exit(&sdp->i_contents);
   3639 		rw_exit(&sdp->i_ufsvfs->vfs_dqrwlock);
   3640 		if (error) {
   3641 			goto errout;
   3642 		}
   3643 		if (ip == NULL) {
   3644 			error = ENOENT;
   3645 			goto errout;
   3646 		} else {
   3647 			/*
   3648 			 * If the inode was found need to drop the v_count
   3649 			 * so as not to keep the filesystem from being
   3650 			 * unmounted at a later time.
   3651 			 */
   3652 			VN_RELE(ITOV(ip));
   3653 		}
   3654 
   3655 		/*
   3656 		 * Release the slot.fbp that has the page mapped and
   3657 		 * locked SE_SHARED, and could be used in in
   3658 		 * ufs_direnter_lr() which needs to get the SE_EXCL lock
   3659 		 * on said page.
   3660 		 */
   3661 		if (slot.fbp) {
   3662 			fbrelse(slot.fbp, S_OTHER);
   3663 			slot.fbp = NULL;
   3664 		}
   3665 	}
   3666 
   3667 	/*
   3668 	 * Link source to the target.  If a target exists, return its
   3669 	 * vnode pointer in tvp.  We'll release it after sending the
   3670 	 * vnevent.
   3671 	 */
   3672 	if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr, &tvp)) {
   3673 		/*
   3674 		 * ESAME isn't really an error; it indicates that the
   3675 		 * operation should not be done because the source and target
   3676 		 * are the same file, but that no error should be reported.
   3677 		 */
   3678 		if (error == ESAME)
   3679 			error = 0;
   3680 		goto errout;
   3681 	}
   3682 
   3683 	/*
   3684 	 * Unlink the source.
   3685 	 * Remove the source entry.  ufs_dirremove() checks that the entry
   3686 	 * still reflects sip, and returns an error if it doesn't.
   3687 	 * If the entry has changed just forget about it.  Release
   3688 	 * the source inode.
   3689 	 */
   3690 	if ((error = ufs_dirremove(sdp, snm, sip, (struct vnode *)0,
   3691 	    DR_RENAME, cr, NULL)) == ENOENT)
   3692 		error = 0;
   3693 
   3694 errout:
   3695 	if (slot.fbp)
   3696 		fbrelse(slot.fbp, S_OTHER);
   3697 
   3698 	rw_exit(&tdp->i_rwlock);
   3699 	if (sdp != tdp) {
   3700 		rw_exit(&sdp->i_rwlock);
   3701 	}
   3702 
   3703 unlock:
   3704 	if (ulp) {
   3705 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size);
   3706 		ufs_lockfs_end(ulp);
   3707 	}
   3708 
   3709 	/*
   3710 	 * If no errors, send the appropriate events on the source
   3711 	 * and destination (a.k.a, target) vnodes, if they exist.
   3712 	 * This has to be done after the rename transaction has closed.
   3713 	 */
   3714 	if (error == 0) {
   3715 		if (tvp != NULL)
   3716 			vnevent_rename_dest(tvp, tdvp, tnm, ct);
   3717 
   3718 		/*
   3719 		 * Notify the target directory of the rename event
   3720 		 * if source and target directories are not same.
   3721 		 */
   3722 		if (sdvp != tdvp)
   3723 			vnevent_rename_dest_dir(tdvp, ct);
   3724 
   3725 		/*
   3726 		 * Note that if ufs_direnter_lr() returned ESAME then
   3727 		 * this event will still be sent.  This isn't expected
   3728 		 * to be a problem for anticipated usage by consumers.
   3729 		 */
   3730 		if (sip != NULL)
   3731 			vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
   3732 	}
   3733 
   3734 	if (tvp != NULL)
   3735 		VN_RELE(tvp);
   3736 
   3737 	if (sip != NULL)
   3738 		VN_RELE(ITOV(sip));
   3739 
   3740 out:
   3741 	return (error);
   3742 }
   3743 
   3744 /*ARGSUSED*/
        /*
         * ufs_mkdir: create the directory `dirname' in the directory dvp using
         * the attributes in vap (AT_TYPE and AT_MODE must both be set).
         *
         * On success, *vpp is set to the vnode of the inode handed back by
         * ufs_direnter_cm().  Returns 0 or an errno value; EINVAL is returned
         * up front when dvp is an attribute directory (IFATTRDIR).
         *
         * The directory entry is made by ufs_direnter_cm(..., DE_MKDIR, ...)
         * with the parent's i_rwlock held as writer and, when
         * ufs_lockfs_begin() hands back a ulockfs, inside a TOP_MKDIR
         * transaction.  The whole operation restarts at `again' when
         * indeadlock is set, when ufs_direnter_cm() returns EAGAIN, or (once
         * only) after ENOSPC when TRANS_ISTRANS() is true.
         */
   3745 static int
   3746 ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap,
   3747 	struct vnode **vpp, struct cred *cr, caller_context_t *ct, int flags,
   3748 	vsecattr_t *vsecp)
   3749 {
   3750 	struct inode *ip;
   3751 	struct inode *xip;
   3752 	struct ufsvfs *ufsvfsp;
   3753 	struct ulockfs *ulp;
   3754 	int error;
   3755 	int issync;
   3756 	int trans_size;
   3757 	int indeadlock;
   3758 	int retry = 1;
   3759 
   3760 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
   3761 
   3762 	/*
   3763 	 * Can't make directory in attr hidden dir
   3764 	 */
   3765 	if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
   3766 		return (EINVAL);
   3767 
   3768 again:
        	/* ip is re-derived on every pass because it is reused for xip below */
   3769 	ip = VTOI(dvp);
   3770 	ufsvfsp = ip->i_ufsvfs;
   3771 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
   3772 	if (error)
   3773 		goto out;
   3774 	if (ulp)
   3775 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR,
   3776 		    trans_size = (int)TOP_MKDIR_SIZE(ip));
   3777 
   3778 	/*
   3779 	 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
   3780 	 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
   3781 	 * possible, retries the operation.
        	 *
        	 * NOTE(review): indeadlock is never assigned in the visible text
        	 * of this function; presumably the ufs_tryirwlock_trans() macro
        	 * sets it -- confirm against the macro definition.
   3782 	 */
   3783 	ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_MKDIR, retry);
   3784 	if (indeadlock)
   3785 		goto again;
   3786 
        	/* First pass (retry != 0) asks for quiet failure via IQUIET. */
   3787 	error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr,
   3788 	    (retry ? IQUIET : 0));
        	/*
        	 * On EAGAIN the transaction is closed and everything restarts.
        	 * NOTE(review): i_rwlock is not released on this path;
        	 * presumably ufs_direnter_cm() has already dropped it before
        	 * returning EAGAIN -- confirm.
        	 */
   3789 	if (error == EAGAIN) {
   3790 		if (ulp) {
   3791 			TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_MKDIR,
   3792 			    trans_size);
   3793 			ufs_lockfs_end(ulp);
   3794 		}
   3795 		goto again;
   3796 	}
   3797 
   3798 	rw_exit(&ip->i_rwlock);
        	/*
        	 * Success: hand the new inode's vnode back to the caller.
        	 * EEXIST: xip is released here rather than returned (presumably
        	 * it is the held inode of the existing entry -- confirm).
        	 */
   3799 	if (error == 0) {
   3800 		ip = xip;
   3801 		*vpp = ITOV(ip);
   3802 	} else if (error == EEXIST)
   3803 		VN_RELE(ITOV(xip));
   3804 
   3805 	if (ulp) {
        		/*
        		 * End the transaction with a scratch status so that it
        		 * only becomes the return value when the mkdir itself
        		 * succeeded; an earlier error is never overwritten.
        		 */
   3806 		int terr = 0;
   3807 		TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_MKDIR, trans_size);
   3808 		ufs_lockfs_end(ulp);
   3809 		if (error == 0)
   3810 			error = terr;
   3811 	}
   3812 out:
        	/*
        	 * One-shot ENOSPC recovery: wait for the delete queue to drain
        	 * and try again.  retry is cleared so this happens at most once,
        	 * and the second pass no longer passes IQUIET.
        	 */
   3813 	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
   3814 		ufs_delete_drain_wait(ufsvfsp, 1);
   3815 		retry = 0;
   3816 		goto again;
   3817 	}
   3818 
   3819 	return (error);
   3820 }
   3821 
   3822 /*ARGSUSED*/
        /*
         * ufs_rmdir: remove the directory entry `nm' from the directory vp.
         *
         * cdir is passed straight through to ufs_dirremove() (presumably the
         * caller's current directory -- confirm against ufs_dirremove()).
         * Returns 0 or an errno value; EIO when the inode no longer has a
         * ufsvfs attached.
         *
         * The removal runs with vp's i_rwlock held as writer and, when
         * ufs_lockfs_begin() hands back a ulockfs, inside a TOP_RMDIR
         * transaction.  The vnode of the removed directory comes back in rmvp;
         * the rmdir vnevent is sent and the hold dropped only after the
         * transaction has been closed.
         */
   3823 static int
   3824 ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr,
   3825 	caller_context_t *ct, int flags)
   3826 {
   3827 	struct inode *ip = VTOI(vp);
   3828 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
   3829 	struct ulockfs *ulp;
   3830 	vnode_t *rmvp = NULL;	/* Vnode of removed directory */
   3831 	int error;
   3832 	int issync;
   3833 	int trans_size;
   3834 	int indeadlock;
   3835 
   3836 	/*
   3837 	 * don't let the delete queue get too long
        	 * (a NULL ufsvfs is rejected with EIO before it is dereferenced)
   3838 	 */
   3839 	if (ufsvfsp == NULL) {
   3840 		error = EIO;
   3841 		goto out;
   3842 	}
   3843 	if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
   3844 		ufs_delete_drain(vp->v_vfsp, 1, 1);
   3845 
   3846 retry_rmdir:
   3847 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK);
   3848 	if (error)
   3849 		goto out;
   3850 
   3851 	if (ulp)
   3852 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RMDIR,
   3853 		    trans_size = TOP_RMDIR_SIZE);
   3854 
   3855 	/*
   3856 	 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
   3857 	 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
   3858 	 * possible, retries the operation.
        	 *
        	 * NOTE(review): `retry' is not a variable in this function, so
        	 * the macro evidently consumes its fourth argument as a bare
        	 * token (presumably a label name), and indeadlock is presumably
        	 * set by the macro as well -- confirm against its definition.
   3859 	 */
   3860 	ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_RMDIR, retry);
   3861 	if (indeadlock)
   3862 		goto retry_rmdir;
   3863 	error = ufs_dirremove(ip, nm, (struct inode *)0, cdir, DR_RMDIR, cr,
   3864 	    &rmvp);
   3865 	rw_exit(&ip->i_rwlock);
   3866 
   3867 	if (ulp) {
   3868 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RMDIR,
   3869 		    trans_size);
   3870 		ufs_lockfs_end(ulp);
   3871 	}
   3872 
   3873 	/*
   3874 	 * This must be done AFTER the rmdir transaction has closed.
   3875 	 */
   3876 	if (rmvp != NULL) {
   3877 		/* Only send the event if there were no errors */
   3878 		if (error == 0)
   3879 			vnevent_rmdir(rmvp, vp, nm, ct);
        		/* The hold on rmvp is dropped whether or not an event went out */
   3880 		VN_RELE(rmvp);
   3881 	}
   3882 out:
   3883 	return (error);
   3884 }
   3885 
   3886 /* ARGSUSED */
        /*
         * ufs_readdir: read directory entries from vp, starting at
         * uiop->uio_offset, convert each on-disk struct direct into a
         * struct dirent64 and deliver them through uiop.
         *
         * The caller must already hold the inode's i_rwlock as reader (this is
         * asserted, not acquired, here).  If eofp is non-NULL it is set to
         * non-zero when the resulting offset is at or past the directory size.
         *
         * Returns 0 or an errno value:
         *	EINVAL	the first iovec has zero length, or the buffer cannot
         *		hold even the first entry to be returned;
         *	ENXIO	a directory chunk starts with an all-zero entry;
         *	other	whatever ufs_lockfs_begin(), fbread() or uiomove()
         *		returned.
         */
   3887 static int
   3888 ufs_readdir(
   3889 	struct vnode *vp,
   3890 	struct uio *uiop,
   3891 	struct cred *cr,
   3892 	int *eofp,
   3893 	caller_context_t *ct,
   3894 	int flags)
   3895 {
   3896 	struct iovec *iovp;
   3897 	struct inode *ip;
   3898 	struct direct *idp;
   3899 	struct dirent64 *odp;
   3900 	struct fbuf *fbp;
   3901 	struct ufsvfs *ufsvfsp;
   3902 	struct ulockfs *ulp;
   3903 	caddr_t outbuf;
   3904 	size_t bufsize;
   3905 	uint_t offset;
   3906 	uint_t bytes_wanted, total_bytes_wanted;
   3907 	int incount = 0;
   3908 	int outcount = 0;
   3909 	int error;
   3910 
   3911 	ip = VTOI(vp);
   3912 	ASSERT(RW_READ_HELD(&ip->i_rwlock));
   3913 
        	/* Offsets at or beyond MAXOFF32_T are reported as end-of-directory */
   3914 	if (uiop->uio_loffset >= MAXOFF32_T) {
   3915 		if (eofp)
   3916 			*eofp = 1;
   3917 		return (0);
   3918 	}
   3919 
   3920 	/*
   3921 	 * Check if we have been called with a valid iov_len
   3922 	 * and bail out if not, otherwise we may potentially loop
   3923 	 * forever further down.
   3924 	 */
   3925 	if (uiop->uio_iov->iov_len <= 0) {
   3926 		error = EINVAL;
   3927 		goto out;
   3928 	}
   3929 
   3930 	/*
   3931 	 * Large Files: When we come here we are guaranteed that
   3932 	 * uio_offset can be used safely. The high word is zero.
   3933 	 */
   3934 
   3935 	ufsvfsp = ip->i_ufsvfs;
   3936 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK);
   3937 	if (error)
   3938 		goto out;
   3939 
   3940 	iovp = uiop->uio_iov;
   3941 	total_bytes_wanted = iovp->iov_len;
   3942 
   3943 	/* Large Files: directory files should not be "large" */
   3944 
   3945 	ASSERT(ip->i_size <= MAXOFF32_T);
   3946 
   3947 	/* Force offset to be valid (to guard against bogus lseek() values) */
   3948 	offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1);
   3949 
   3950 	/* Quit if at end of file or link count of zero (posix) */
        	/* (nothing has been allocated yet, so skip straight to unlock) */
   3951 	if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) {
   3952 		if (eofp)
   3953 			*eofp = 1;
   3954 		error = 0;
   3955 		goto unlock;
   3956 	}
   3957 
   3958 	/*
   3959 	 * Get space to change directory entries into fs independent format.
   3960 	 * Do fast alloc for the most commonly used-request size (filesystem
   3961 	 * block size).
        	 *
        	 * As written, both branches size the buffer to the full request.
        	 * A bounce buffer is allocated unless the request is a single
        	 * UIO_SYSSPACE iovec, in which case entries are built directly
        	 * in the caller's buffer.  The same condition is re-tested at
        	 * update_inode to decide whether outbuf must be freed.
   3962 	 */
   3963 	if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) {
   3964 		bufsize = total_bytes_wanted;
   3965 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
   3966 		odp = (struct dirent64 *)outbuf;
   3967 	} else {
   3968 		bufsize = total_bytes_wanted;
   3969 		odp = (struct dirent64 *)iovp->iov_base;
   3970 	}
   3971 
   3972 nextblk:
   3973 	bytes_wanted = total_bytes_wanted;
   3974 
   3975 	/* Truncate request to file size */
   3976 	if (offset + bytes_wanted > (int)ip->i_size)
   3977 		bytes_wanted = (int)(ip->i_size - offset);
   3978 
   3979 	/* Comply with MAXBSIZE boundary restrictions of fbread() */
   3980 	if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE)
   3981 		bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET);
   3982 
   3983 	/*
   3984 	 * Read in the next chunk.
   3985 	 * We are still holding the i_rwlock.
   3986 	 */
   3987 	error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp);
   3988 
   3989 	if (error)
   3990 		goto update_inode;
        	/*
        	 * Flag an access-time update unless the lockfs state, a
        	 * read-only file system or the noatime setting forbids it.
        	 */
   3991 	if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) &&
   3992 	    (!ufsvfsp->vfs_noatime)) {
   3993 		ip->i_flag |= IACC;
   3994 	}
        	/* incount tracks bytes consumed from this chunk only */
   3995 	incount = 0;
   3996 	idp = (struct direct *)fbp->fb_addr;
        	/* A chunk whose first entry is entirely zero is treated as corrupt */
   3997 	if (idp->d_ino == 0 && idp->d_reclen == 0 && idp->d_namlen == 0) {
   3998 		cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, "
   3999 		    "fs = %s\n",
   4000 		    (u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt);
   4001 		fbrelse(fbp, S_OTHER);
   4002 		error = ENXIO;
   4003 		goto update_inode;
   4004 	}
   4005 	/* Transform to file-system independent format */
   4006 	while (incount < bytes_wanted) {
   4007 		/*
   4008 		 * If the current directory entry is mangled, then skip
   4009 		 * to the next block.  It would be nice to set the FSBAD
   4010 		 * flag in the super-block so that a fsck is forced on
   4011 		 * next reboot, but locking is a problem.
   4012 		 */
   4013 		if (idp->d_reclen & 0x3) {
   4014 			offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
   4015 			break;
   4016 		}
   4017 
   4018 		/* Skip to requested offset and skip empty entries */
   4019 		if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) {
   4020 			ushort_t this_reclen =
   4021 			    DIRENT64_RECLEN(idp->d_namlen);
   4022 			/* Buffer too small for any entries */
   4023 			if (!outcount && this_reclen > bufsize) {
   4024 				fbrelse(fbp, S_OTHER);
   4025 				error = EINVAL;
   4026 				goto update_inode;
   4027 			}
   4028 			/* If would overrun the buffer, quit */
   4029 			if (outcount + this_reclen > bufsize) {
   4030 				break;
   4031 			}
   4032 			/* Take this entry */
   4033 			odp->d_ino = (ino64_t)idp->d_ino;
   4034 			odp->d_reclen = (ushort_t)this_reclen;
        			/* d_off is the directory offset of the next entry */
   4035 			odp->d_off = (offset_t)(offset + idp->d_reclen);
   4036 
   4037 			/* use strncpy(9f) to zero out uninitialized bytes */
   4038 
   4039 			ASSERT(strlen(idp->d_name) + 1 <=
   4040 			    DIRENT64_NAMELEN(this_reclen));
   4041 			(void) strncpy(odp->d_name, idp->d_name,
   4042 			    DIRENT64_NAMELEN(this_reclen));
   4043 			outcount += odp->d_reclen;
   4044 			odp = (struct dirent64 *)
   4045 			    ((intptr_t)odp + odp->d_reclen);
   4046 			ASSERT(outcount <= bufsize);
   4047 		}
        		/*
        		 * Step to the next on-disk entry; a zero record length
        		 * cannot be stepped over, so skip to the next DIRBLKSIZ
        		 * boundary instead.
        		 */
   4048 		if (idp->d_reclen) {
   4049 			incount += idp->d_reclen;
   4050 			offset += idp->d_reclen;
   4051 			idp = (struct direct *)((intptr_t)idp + idp->d_reclen);
   4052 		} else {
   4053 			offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
   4054 			break;
   4055 		}
   4056 	}
   4057 	/* Release the chunk */
   4058 	fbrelse(fbp, S_OTHER);
   4059 
   4060 	/* Read whole block, but got no entries, read another if not eof */
   4061 
   4062 	/*
   4063 	 * Large Files: casting i_size to int here is not a problem
   4064 	 * because directory sizes are always less than MAXOFF32_T.
   4065 	 * See assertion above.
   4066 	 */
   4067 
   4068 	if (offset < (int)ip->i_size && !outcount)
   4069 		goto nextblk;
   4070 
   4071 	/* Copy out the entry data */
        	/*
        	 * Direct case: the entries are already in the caller's buffer,
        	 * so only the iovec/uio bookkeeping is advanced by hand.
        	 * Otherwise uiomove() copies from the bounce buffer, and the
        	 * offset is only updated if that copy succeeded.
        	 */
   4072 	if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) {
   4073 		iovp->iov_base += outcount;
   4074 		iovp->iov_len -= outcount;
   4075 		uiop->uio_resid -= outcount;
   4076 		uiop->uio_offset = offset;
   4077 	} else if ((error = uiomove(outbuf, (long)outcount, UIO_READ,
   4078 	    uiop)) == 0)
   4079 		uiop->uio_offset = offset;
   4080 update_inode:
   4081 	ITIMES(ip);
        	/* Free the bounce buffer under the same condition it was allocated */
   4082 	if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1)
   4083 		kmem_free(outbuf, bufsize);
   4084 
   4085 	if (eofp && error == 0)
   4086 		*eofp = (uiop->uio_offset >= (int)ip->i_size);
   4087 unlock:
   4088 	if (ulp) {
   4089 		ufs_lockfs_end(ulp);
   4090 	}
   4091 out:
   4092 	return (error);
   4093 }
   4094 
   4095 /*ARGSUSED*/
        /*
         * ufs_symlink: create the symbolic link `linkname' in directory dvp
         * whose contents are the string `target'.
         *
         * vap->va_type and vap->va_rdev are overwritten (VLNK and 0).  Returns
         * 0 or an errno value; EINVAL up front when dvp is an attribute
         * directory (IFATTRDIR).
         *
         * The work is done in three stages, each under its own set of locks:
         *   1. ufs_dirmakeinode() creates the inode;
         *   2. ufs_rdwri() writes the target synchronously, and a short
         *      target is additionally cached in the inode (IFASTSYMLNK);
         *   3. ufs_direnter_lr() inserts the directory entry.
         * If stage 2 or 3 fails, the new inode's link count is dropped again
         * at `remove'.  After ENOSPC the whole operation is retried once when
         * TRANS_ISTRANS() is true.
         */
   4096 static int
   4097 ufs_symlink(
   4098 	struct vnode *dvp,		/* ptr to parent dir vnode */
   4099 	char *linkname,			/* name of symbolic link */
   4100 	struct vattr *vap,		/* attributes */
   4101 	char *target,			/* target path */
   4102 	struct cred *cr,		/* user credentials */
   4103 	caller_context_t *ct,
   4104 	int flags)
   4105 {
   4106 	struct inode *ip, *dip = VTOI(dvp);
   4107 	struct ufsvfs *ufsvfsp = dip->i_ufsvfs;
   4108 	struct ulockfs *ulp;
   4109 	int error;
   4110 	int issync;
   4111 	int trans_size;
   4112 	int residual;
   4113 	int ioflag;
   4114 	int retry = 1;
   4115 
   4116 	/*
   4117 	 * No symlinks in attrdirs at this time
   4118 	 */
   4119 	if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
   4120 		return (EINVAL);
   4121 
   4122 again:
        	/* ip is reset on every pass so the cleanup code can test it */
   4123 	ip = (struct inode *)NULL;
   4124 	vap->va_type = VLNK;
   4125 	vap->va_rdev = 0;
   4126 
   4127 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK);
   4128 	if (error)
   4129 		goto out;
   4130 
   4131 	if (ulp)
   4132 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SYMLINK,
   4133 		    trans_size = (int)TOP_SYMLINK_SIZE(dip));
   4134 
   4135 	/*
   4136 	 * We must create the inode before the directory entry, to avoid
   4137 	 * racing with readlink().  ufs_dirmakeinode requires that we
   4138 	 * hold the quota lock as reader, and directory locks as writer.
   4139 	 */
   4140 
   4141 	rw_enter(&dip->i_rwlock, RW_WRITER);
   4142 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   4143 	rw_enter(&dip->i_contents, RW_WRITER);
   4144 
   4145 	/*
   4146 	 * Suppress any out of inodes messages if we will retry on
   4147 	 * ENOSPC
   4148 	 */
   4149 	if (retry)
   4150 		dip->i_flag |= IQUIET;
   4151 
   4152 	error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr);
   4153 
   4154 	dip->i_flag &= ~IQUIET;
   4155 
        	/* Locks are released in the reverse of the order they were taken */
   4156 	rw_exit(&dip->i_contents);
   4157 	rw_exit(&ufsvfsp->vfs_dqrwlock);
   4158 	rw_exit(&dip->i_rwlock);
   4159 
   4160 	if (error)
   4161 		goto unlock;
   4162 
   4163 	/*
   4164 	 * OK.  The inode has been created.  Write out the data of the
   4165 	 * symbolic link.  Since symbolic links are metadata, and should
   4166 	 * remain consistent across a system crash, we need to force the
   4167 	 * data out synchronously.
   4168 	 *
   4169 	 * (This is a change from the semantics in earlier releases, which
   4170 	 * only created symbolic links synchronously if the semi-documented
   4171 	 * 'syncdir' option was set, or if we were being invoked by the NFS
   4172 	 * server, which requires symbolic links to be created synchronously.)
   4173 	 *
   4174 	 * We need to pass in a pointer for the residual length; otherwise
   4175 	 * ufs_rdwri() will always return EIO if it can't write the data,
   4176 	 * even if the error was really ENOSPC or EDQUOT.
   4177 	 */
   4178 
   4179 	ioflag = FWRITE | FDSYNC;
   4180 	residual = 0;
   4181 
   4182 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   4183 	rw_enter(&ip->i_contents, RW_WRITER);
   4184 
   4185 	/*
   4186 	 * Suppress file system full messages if we will retry
   4187 	 */
   4188 	if (retry)
   4189 		ip->i_flag |= IQUIET;
   4190 
   4191 	error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target),
   4192 	    (offset_t)0, UIO_SYSSPACE, &residual, cr);
   4193 
   4194 	ip->i_flag &= ~IQUIET;
   4195 
        	/* Write failed: drop the locks, then undo the inode at `remove' */
   4196 	if (error) {
   4197 		rw_exit(&ip->i_contents);
   4198 		rw_exit(&ufsvfsp->vfs_dqrwlock);
   4199 		goto remove;
   4200 	}
   4201 
   4202 	/*
   4203 	 * If the link's data is small enough, we can cache it in the inode.
   4204 	 * This is a "fast symbolic link".  We don't use the first direct
   4205 	 * block because that's actually used to point at the symbolic link's
   4206 	 * contents on disk; but we know that none of the other direct or
   4207 	 * indirect blocks can be used because symbolic links are restricted
   4208 	 * to be smaller than a file system block.
   4209 	 */
   4210 
   4211 	ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip)));
   4212 
   4213 	if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) {
        		/*
        		 * A failed kcopy() is not treated as an error: the link
        		 * simply does not become a fast symlink, and the block
        		 * pointer slots kcopy() may have partly written are
        		 * zeroed (i_db[0] is left alone).
        		 */
   4214 		if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) {
   4215 			ip->i_flag |= IFASTSYMLNK;
   4216 		} else {
   4217 			int i;
   4218 			/* error, clear garbage left behind */
   4219 			for (i = 1; i < NDADDR; i++)
   4220 				ip->i_db[i] = 0;
   4221 			for (i = 0; i < NIADDR; i++)
   4222 				ip->i_ib[i] = 0;
   4223 		}
   4224 	}
   4225 
   4226 	rw_exit(&ip->i_contents);
   4227 	rw_exit(&ufsvfsp->vfs_dqrwlock);
   4228 
   4229 	/*
   4230 	 * OK.  We've successfully created the symbolic link.  All that
   4231 	 * remains is to insert it into the appropriate directory.
   4232 	 */
   4233 
   4234 	rw_enter(&dip->i_rwlock, RW_WRITER);
   4235 	error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr, NULL);
   4236 	rw_exit(&dip->i_rwlock);
   4237 
   4238 	/*
   4239 	 * Fall through into remove-on-error code.  We're either done, or we
   4240 	 * need to remove the inode (if we couldn't insert it).
   4241 	 */
   4242 
   4243 remove:
        	/*
        	 * Undo the inode: drop the link count, mark the inode changed
        	 * and hand it to ufs_setreclaim().
        	 */
   4244 	if (error && (ip != NULL)) {
   4245 		rw_enter(&ip->i_contents, RW_WRITER);
   4246 		ip->i_nlink--;
   4247 		ip->i_flag |= ICHG;
   4248 		ip->i_seq++;
   4249 		ufs_setreclaim(ip);
   4250 		rw_exit(&ip->i_contents);
   4251 	}
   4252 
   4253 unlock:
        	/* The new inode's vnode is never returned to the caller; release it */
   4254 	if (ip != NULL)
   4255 		VN_RELE(ITOV(ip));
   4256 
   4257 	if (ulp) {
        		/* Scratch status: never overwrite an earlier error */
   4258 		int terr = 0;
   4259 
   4260 		TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SYMLINK,
   4261 		    trans_size);
   4262 		ufs_lockfs_end(ulp);
   4263 		if (error == 0)
   4264 			error = terr;
   4265 	}
   4266 
   4267 	/*
   4268 	 * We may have failed due to lack of an inode or of a block to
   4269 	 * store the target in.  Try flushing the delete queue to free
   4270 	 * logically-available things up and try again.
   4271 	 */
   4272 	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
   4273 		ufs_delete_drain_wait(ufsvfsp, 1);
   4274 		retry = 0;
   4275 		goto again;
   4276 	}
   4277 
   4278 out:
   4279 	return (error);
   4280 }
   4281 
   4282 /*
   4283  * Ufs specific routine used to do ufs io.
         *
         * Builds a single-iovec uio describing `len' bytes at `base' (in
         * address space `seg') at file offset `offset', then calls wrip() when
         * rw is UIO_WRITE and rdip() otherwise.  The caller must hold the
         * inode's i_contents lock (asserted below).
         *
         * If aresid is non-NULL the residual count is stored through it and
         * the underlying error is returned unchanged.  If aresid is NULL, any
         * non-zero residual turns the result into EIO.
   4284  */
   4285 int
   4286 ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base,
   4287 	ssize_t len, offset_t offset, enum uio_seg seg, int *aresid,
   4288 	struct cred *cr)
   4289 {
   4290 	struct uio auio;
   4291 	struct iovec aiov;
   4292 	int error;
   4293 
   4294 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
   4295 
   4296 	bzero((caddr_t)&auio, sizeof (uio_t));
   4297 	bzero((caddr_t)&aiov, sizeof (iovec_t));
   4298 
   4299 	aiov.iov_base = base;
   4300 	aiov.iov_len = len;
   4301 	auio.uio_iov = &aiov;
   4302 	auio.uio_iovcnt = 1;
   4303 	auio.uio_loffset = offset;
   4304 	auio.uio_segflg = (short)seg;
   4305 	auio.uio_resid = len;
   4306 
        	/* Writes are limited by curproc->p_fsz_ctl; reads are not limited */
   4307 	if (rw == UIO_WRITE) {
   4308 		auio.uio_fmode = FWRITE;
   4309 		auio.uio_extflg = UIO_COPY_DEFAULT;
   4310 		auio.uio_llimit = curproc->p_fsz_ctl;
   4311 		error = wrip(ip, &auio, ioflag, cr);
   4312 	} else {
   4313 		auio.uio_fmode = FREAD;
   4314 		auio.uio_extflg = UIO_COPY_CACHED;
   4315 		auio.uio_llimit = MAXOFFSET_T;
   4316 		error = rdip(ip, &auio, ioflag, cr);
   4317 	}
   4318 
        	/* Without a residual pointer, a short transfer is reported as EIO */
   4319 	if (aresid) {
   4320 		*aresid = auio.uio_resid;
   4321 	} else if (auio.uio_resid) {
   4322 		error = EIO;
   4323 	}
   4324 	return (error);
   4325 }
   4326 
   4327 /*ARGSUSED*/
        /*
         * ufs_fid: fill in the file identifier for vp.
         *
         * The caller's fid buffer is overlaid with a struct ufid holding the
         * inode number and generation.  The length stored is
         * sizeof (struct ufid) minus one ushort_t (presumably the length
         * field itself -- confirm against struct ufid).
         *
         * Returns 0 on success, EIO if the inode has no ufsvfs, or ENOSPC if
         * fidp->fid_len is too small; in the ENOSPC case fid_len is updated
         * to the length required.
         */
   4328 static int
   4329 ufs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
   4330 {
   4331 	struct ufid *ufid;
   4332 	struct inode *ip = VTOI(vp);
   4333 
   4334 	if (ip->i_ufsvfs == NULL)
   4335 		return (EIO);
   4336 
        	/* Tell the caller how much room is needed before failing */
   4337 	if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) {
   4338 		fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t);
   4339 		return (ENOSPC);
   4340 	}
   4341 
   4342 	ufid = (struct ufid *)fidp;
        	/* Zero the whole structure so no stale bytes are handed back */
   4343 	bzero((char *)ufid, sizeof (struct ufid));
   4344 	ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t);
   4345 	ufid->ufid_ino = ip->i_number;
   4346 	ufid->ufid_gen = ip->i_gen;
   4347 
   4348 	return (0);
   4349 }
   4350 
   4351 /* ARGSUSED2 */
        /*
         * ufs_rwlock: acquire the inode's i_rwlock on behalf of the caller.
         *
         * write_lock is the caller's request, but the lock actually taken may
         * be weaker.  Returns V_WRITELOCK_TRUE when i_rwlock was acquired as
         * writer and V_WRITELOCK_FALSE when it was acquired as reader.
         *
         * A write request is granted as reader only when all of these hold:
         * direct I/O is in effect (IDIRECTIO on the inode or forcedirectio on
         * the file system), ufs_allow_shared_writes is set, and mandatory
         * locking does not apply to the file.
         */
   4352 static int
   4353 ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
   4354 {
   4355 	struct inode	*ip = VTOI(vp);
   4356 	struct ufsvfs	*ufsvfsp;
   4357 	int		forcedirectio;
   4358 
   4359 	/*
   4360 	 * Read case is easy.
   4361 	 */
   4362 	if (!write_lock) {
   4363 		rw_enter(&ip->i_rwlock, RW_READER);
   4364 		return (V_WRITELOCK_FALSE);
   4365 	}
   4366 
   4367 	/*
   4368 	 * Caller has requested a writer lock, but that inhibits any
   4369 	 * concurrency in the VOPs that follow. Acquire the lock shared
   4370 	 * and defer exclusive access until it is known to be needed in
   4371 	 * other VOP handlers. Some cases can be determined here.
   4372 	 */
   4373 
   4374 	/*
   4375 	 * If directio is not set, there is no chance of concurrency,
   4376 	 * so just acquire the lock exclusive. Beware of a forced
   4377 	 * unmount before looking at the mount option.
   4378 	 */
   4379 	ufsvfsp = ip->i_ufsvfs;
        	/* A NULL ufsvfs is treated as "forcedirectio not set" */
   4380 	forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0;
   4381 	if (!(ip->i_flag & IDIRECTIO || forcedirectio) ||
   4382 	    !ufs_allow_shared_writes) {
   4383 		rw_enter(&ip->i_rwlock, RW_WRITER);
   4384 		return (V_WRITELOCK_TRUE);
   4385 	}
   4386 
   4387 	/*
   4388 	 * Mandatory locking forces acquiring i_rwlock exclusive.
   4389 	 */
   4390 	if (MANDLOCK(vp, ip->i_mode)) {
   4391 		rw_enter(&ip->i_rwlock, RW_WRITER);
   4392 		return (V_WRITELOCK_TRUE);
   4393 	}
   4394 
   4395 	/*
   4396 	 * Acquire the lock shared in case a concurrent write follows.
   4397 	 * Mandatory locking could have become enabled before the lock
   4398 	 * was acquired. Re-check and upgrade if needed.
        	 * The "upgrade" is a release followed by a fresh writer acquire,
        	 * so the lock is briefly not held in between.
   4399 	 */
   4400 	rw_enter(&ip->i_rwlock, RW_READER);
   4401 	if (MANDLOCK(vp, ip->i_mode)) {
   4402 		rw_exit(&ip->i_rwlock);
   4403 		rw_enter(&ip->i_rwlock, RW_WRITER);
   4404 		return (V_WRITELOCK_TRUE);
   4405 	}
   4406 	return (V_WRITELOCK_FALSE);
   4407 }
   4408 
   4409 /*ARGSUSED*/
        /*
         * ufs_rwunlock: release the inode's i_rwlock.
         *
         * write_lock is ignored; the same rw_exit() call is made no matter
         * which mode the caller believes it holds.
         */
   4410 static void
   4411 ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
   4412 {
   4413 	struct inode	*ip = VTOI(vp);
   4414 
   4415 	rw_exit(&ip->i_rwlock);
   4416 }
   4417 
   4418 /* ARGSUSED */
        /*
         * ufs_seek: validate a proposed new file offset.
         *
         * Only *noffp is examined; vp, ooff and ct are unused and *noffp is
         * never modified.  Returns EINVAL if the new offset is negative or
         * greater than MAXOFFSET_T, and 0 otherwise.
         */
   4419 static int
   4420 ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
   4421 	caller_context_t *ct)
   4422 {
   4423 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
   4424 }
   4425 
   4426 /* ARGSUSED */
        /*
         * ufs_frlock: file and record locking entry point.
         *
         * Returns EIO if the inode has no ufsvfs, and EAGAIN if the file is
         * currently mapped (i_mapcnt > 0) while mandatory locking applies to
         * it.  Otherwise all arguments are passed unchanged to fs_frlock()
         * and its result is returned.
         */
   4427 static int
   4428 ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
   4429 	offset_t offset, struct flk_callback *flk_cbp, struct cred *cr,
   4430 	caller_context_t *ct)
   4431 {
   4432 	struct inode *ip = VTOI(vp);
   4433 
   4434 	if (ip->i_ufsvfs == NULL)
   4435 		return (EIO);
   4436 
   4437 	/*
   4438 	 * If file is being mapped, disallow frlock.
   4439 	 * XXX I am not holding tlock while checking i_mapcnt because the
   4440 	 * current locking strategy drops all locks before calling fs_frlock.
   4441 	 * So, mapcnt could change before we enter fs_frlock making it
   4442 	 * meaningless to have held tlock in the first place.
   4443 	 */
   4444 	if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode))
   4445 		return (EAGAIN);
   4446 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
   4447 }
   4448 
   4449 /* ARGSUSED */
        /*
         * ufs_space: free (F_FREESP) or allocate (F_ALLOCSP) file space for
         * the range described by bfp.
         *
         * bfp is first normalised by convoff(); if that fails, its error is
         * returned and nothing else is done.  F_FREESP is handled by
         * ufs_freesp() and F_ALLOCSP by ufs_allocsp(), each between
         * ufs_lockfs_begin() and ufs_lockfs_end() with its own lockfs mask.
         * Any other cmd returns EINVAL.
         */
   4450 static int
   4451 ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
   4452 	offset_t offset, cred_t *cr, caller_context_t *ct)
   4453 {
   4454 	struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
   4455 	struct ulockfs *ulp;
   4456 	int error;
   4457 
   4458 	if ((error = convoff(vp, bfp, 0, offset)) == 0) {
   4459 		if (cmd == F_FREESP) {
   4460 			error = ufs_lockfs_begin(ufsvfsp, &ulp,
   4461 			    ULOCKFS_SPACE_MASK);
   4462 			if (error)
   4463 				return (error);
   4464 			error = ufs_freesp(vp, bfp, flag, cr);
   4465 		} else if (cmd == F_ALLOCSP) {
   4466 			error = ufs_lockfs_begin(ufsvfsp, &ulp,
   4467 			    ULOCKFS_FALLOCATE_MASK);
   4468 			if (error)
   4469 				return (error);
   4470 			error = ufs_allocsp(vp, bfp, cr);
   4471 		} else
   4472 			return (EINVAL); /* Command not handled here */
   4473 
        		/*
        		 * Only reached after a successful ufs_lockfs_begin(), so
        		 * ulp has been set by it; the early returns above happen
        		 * before any lockfs state is held.
        		 */
   4474 		if (ulp)
   4475 			ufs_lockfs_end(ulp);
   4476 
   4477 	}
   4478 	return (error);
   4479 }
   4480 
   4481 /*
   4482  * Used to determine if read ahead should be done. Also used
   4483  * to determine when write back occurs.
         * Expands to the vfs_ioclustsz field of the ufsvfs that the inode
         * `ip' points at, so ip->i_ufsvfs must be non-NULL where it is used.
   4484  */
   4485 #define	CLUSTSZ(ip)		((ip)->i_ufsvfs->vfs_ioclustsz)
   4486 
   4487 /*
   4488  * A faster version of ufs_getpage.
   4489  *
   4490  * We optimize by inlining the pvn_getpages iterator, eliminating
   4491  * calls to bmap_read if file doesn't have UFS holes, and avoiding
   4492  * the overhead of page_exists().
   4493  *
   4494  * When a file has UFS_HOLES and ufs_getpage is called with S_READ,
   4495  * we set *protp to PROT_READ to avoid calling bmap_read. This approach
   4496  * victimizes performance when a file with UFS holes is faulted
   4497  * first in the S_READ mode, and then in the S_WRITE mode. We will get
   4498  * two MMU faults in this case.
   4499  *
   4500  * XXX - the inode fields which control the sequential mode are not
   4501  *	 protected by any mutex. The read ahead will act wild if
   4502  *	 multiple processes access the file concurrently and
   4503  *	 some of them in sequential mode. One particularly bad case
   4504  *	 is if another thread changes the value of i_nextrio between
   4505  *	 the time this thread tests the i_nextrio value and then reads it
   4506  *	 again to use it as the offset for the read ahead.
   4507  */
   4508 /*ARGSUSED*/
   4509 static int
   4510 ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
   4511 	page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr,
   4512 	enum seg_rw rw, struct cred *cr, caller_context_t *ct)
   4513 {
   4514 	u_offset_t	uoff = (u_offset_t)off; /* type conversion */
   4515 	u_offset_t	pgoff;
   4516 	u_offset_t	eoff;
   4517 	struct inode 	*ip = VTOI(vp);
   4518 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
   4519 	struct fs 	*fs;
   4520 	struct ulockfs	*ulp;
   4521 	page_t		**pl;
   4522 	caddr_t		pgaddr;
   4523 	krw_t		rwtype;
   4524 	int 		err;
   4525 	int		has_holes;
   4526 	int		beyond_eof;
   4527 	int		seqmode;
   4528 	int		pgsize = PAGESIZE;
   4529 	int		dolock;
   4530 	int		do_qlock;
   4531 	int		trans_size;
   4532 
   4533 	ASSERT((uoff & PAGEOFFSET) == 0);
   4534 
   4535 	if (protp)
   4536 		*protp = PROT_ALL;
   4537 
   4538 	/*
   4539 	 * Obey the lockfs protocol
   4540 	 */
   4541 	err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg,
   4542 	    rw == S_READ || rw == S_EXEC, protp);
   4543 	if (err)
   4544 		goto out;
   4545 
   4546 	fs = ufsvfsp->vfs_fs;
   4547 
   4548 	if (ulp && (rw == S_CREATE || rw == S_WRITE) &&
   4549 	    !(vp->v_flag & VISSWAP)) {
   4550 		/*
   4551 		 * Try to start a transaction, will return if blocking is
   4552 		 * expected to occur and the address space is not the
   4553 		 * kernel address space.
   4554 		 */
   4555 		trans_size = TOP_GETPAGE_SIZE(ip);
   4556 		if (seg->s_as != &kas) {
   4557 			TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE,
   4558 			    trans_size, err)
   4559 			if (err == EWOULDBLOCK) {
   4560 				/*
   4561 				 * Use EDEADLK here because the VM code
   4562 				 * can normally never see this error.
   4563 				 */
   4564 				err = EDEADLK;
   4565 				ufs_lockfs_end(ulp);
   4566 				goto out;
   4567 			}
   4568 		} else {
   4569 			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
   4570 		}
   4571 	}
   4572 
   4573 	if (vp->v_flag & VNOMAP) {
   4574 		err = ENOSYS;
   4575 		goto unlock;
   4576 	}
   4577 
   4578 	seqmode = ip->i_nextr == uoff && rw != S_CREATE;
   4579 
   4580 	rwtype = RW_READER;		/* start as a reader */
   4581 	dolock = (rw_owner(&ip->i_contents) != curthread);
   4582 	/*
   4583 	 * If this thread owns the lock, i.e., this thread grabbed it
   4584 	 * as writer somewhere above, then we don't need to grab the
   4585 	 * lock as reader in this routine.
   4586 	 */
   4587 	do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread);
   4588 
   4589 retrylock:
   4590 	if (dolock) {
   4591 		/*
   4592 		 * Grab the quota lock if we need to call
   4593 		 * bmap_write() below (with i_contents as writer).
   4594 		 */
   4595 		if (do_qlock && rwtype == RW_WRITER)
   4596 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   4597 		rw_enter(&ip->i_contents, rwtype);
   4598 	}
   4599 
   4600 	/*
   4601 	 * We may be getting called as a side effect of a bmap using
   4602 	 * fbread() when the blocks might be being allocated and the
   4603 	 * size has not yet been up'ed.  In this case we want to be
   4604 	 * able to return zero pages if we get back UFS_HOLE from
   4605 	 * calling bmap for a non write case here.  We also might have
   4606 	 * to read some frags from the disk into a page if we are
   4607 	 * extending the number of frags for a given lbn in bmap().
   4608 	 * Large Files: The read of i_size here is atomic because
   4609 	 * i_contents is held here. If dolock is zero, the lock
   4610 	 * is held in bmap routines.
   4611 	 */
   4612 	beyond_eof = uoff + len >
   4613 	    P2ROUNDUP_TYPED(ip->i_size, PAGESIZE, u_offset_t);
   4614 	if (beyond_eof && seg != segkmap) {
   4615 		if (dolock) {
   4616 			rw_exit(&ip->i_contents);
   4617 			if (do_qlock && rwtype == RW_WRITER)
   4618 				rw_exit(&ufsvfsp->vfs_dqrwlock);
   4619 		}
   4620 		err = EFAULT;
   4621 		goto unlock;
   4622 	}
   4623 
   4624 	/*
   4625 	 * Must hold i_contents lock throughout the call to pvn_getpages
   4626 	 * since locked pages are returned from each call to ufs_getapage.
   4627 	 * Must *not* return locked pages and then try for contents lock
   4628 	 * due to lock ordering requirements (inode > page)
   4629 	 */
   4630 
   4631 	has_holes = bmap_has_holes(ip);
   4632 
   4633 	if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) {
   4634 		int	blk_size;
   4635 		u_offset_t offset;
   4636 
   4637 		/*
   4638 		 * We must acquire the RW_WRITER lock in order to
   4639 		 * call bmap_write().
   4640 		 */
   4641 		if (dolock && rwtype == RW_READER) {
   4642 			rwtype = RW_WRITER;
   4643 
   4644 			/*
   4645 			 * Grab the quota lock before
   4646 			 * upgrading i_contents, but if we can't grab it
   4647 			 * don't wait here due to lock order:
   4648 			 * vfs_dqrwlock > i_contents.
   4649 			 */
   4650 			if (do_qlock &&
   4651 			    rw_tryenter(&ufsvfsp->vfs_dqrwlock, RW_READER)
   4652 			    == 0) {
   4653 				rw_exit(&ip->i_contents);
   4654 				goto retrylock;
   4655 			}
   4656 			if (!rw_tryupgrade(&ip->i_contents)) {
   4657 				rw_exit(&ip->i_contents);
   4658 				if (do_qlock)
   4659 					rw_exit(&ufsvfsp->vfs_dqrwlock);
   4660 				goto retrylock;
   4661 			}
   4662 		}
   4663 
   4664 		/*
   4665 		 * May be allocating disk blocks for holes here as
   4666 		 * a result of mmap faults. write(2) does the bmap_write
   4667 		 * in rdip/wrip, not here. We are not dealing with frags
   4668 		 * in this case.
   4669 		 */
   4670 		/*
   4671 		 * Large Files: We cast fs_bmask field to offset_t
   4672 		 * just as we do for MAXBMASK because uoff is a 64-bit
   4673 		 * data type. fs_bmask will still be a 32-bit type
   4674 		 * as we cannot change any ondisk data structures.
   4675 		 */
   4676 
   4677 		offset = uoff & (offset_t)fs->fs_bmask;
   4678 		while (offset < uoff + len) {
   4679 			blk_size = (int)blksize(fs, ip, lblkno(fs, offset));
   4680 			err = bmap_write(ip, offset, blk_size,
   4681 			    BI_NORMAL, NULL, cr);
   4682 			if (ip->i_flag & (ICHG|IUPD))
   4683 				ip->i_seq++;
   4684 			if (err)
   4685 				goto update_inode;
   4686 			offset += blk_size; /* XXX - make this contig */
   4687 		}
   4688 	}
   4689 
   4690 	/*
   4691 	 * Can be a reader from now on.
   4692 	 */
   4693 	if (dolock && rwtype == RW_WRITER) {
   4694 		rw_downgrade(&ip->i_contents);
   4695 		/*
   4696 		 * We can release vfs_dqrwlock early so do it, but make
   4697 		 * sure we don't try to release it again at the bottom.
   4698 		 */
   4699 		if (do_qlock) {
   4700 			rw_exit(&ufsvfsp->vfs_dqrwlock);
   4701 			do_qlock = 0;
   4702 		}
   4703 	}
   4704 
   4705 	/*
   4706 	 * We remove PROT_WRITE in cases when the file has UFS holes
   4707 	 * because we don't  want to call bmap_read() to check each
   4708 	 * page if it is backed with a disk block.
   4709 	 */
   4710 	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE)
   4711 		*protp &= ~PROT_WRITE;
   4712 
   4713 	err = 0;
   4714 
   4715 	/*
   4716 	 * The loop looks up pages in the range [off, off + len).
   4717 	 * For each page, we first check if we should initiate an asynchronous
   4718 	 * read ahead before we call page_lookup (we may sleep in page_lookup
   4719 	 * for a previously initiated disk read).
   4720 	 */
   4721 	eoff = (uoff + len);
   4722 	for (pgoff = uoff, pgaddr = addr, pl = plarr;
   4723 	    pgoff < eoff; /* empty */) {
   4724 		page_t	*pp;
   4725 		u_offset_t	nextrio;
   4726 		se_t	se;
   4727 		int retval;
   4728 
   4729 		se = ((rw == S_CREATE || rw == S_OTHER) ? SE_EXCL : SE_SHARED);
   4730 
   4731 		/* Handle async getpage (faultahead) */
   4732 		if (plarr == NULL) {
   4733 			ip->i_nextrio = pgoff;
   4734 			(void) ufs_getpage_ra(vp, pgoff, seg, pgaddr);
   4735 			pgoff += pgsize;
   4736 			pgaddr += pgsize;
   4737 			continue;
   4738 		}
   4739 		/*
   4740 		 * Check if we should initiate read ahead of next cluster.
   4741 		 * We call page_exists only when we need to confirm that
   4742 		 * we have the current page before we initiate the read ahead.
   4743 		 */
   4744 		nextrio = ip->i_nextrio;
   4745 		if (seqmode &&
   4746 		    pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
   4747 		    nextrio < ip->i_size && page_exists(vp, pgoff)) {
   4748 			retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr);
   4749 			/*
   4750 			 * We always read ahead the next cluster of data
   4751 			 * starting from i_nextrio. If the page (vp,nextrio)
   4752 			 * is actually in core at this point, the routine
   4753 			 * ufs_getpage_ra() will stop pre-fetching data
   4754 			 * until we read that page in a synchronized manner
   4755 			 * through ufs_getpage_miss(). So, we should increase
   4756 			 * i_nextrio if the page (vp, nextrio) exists.
   4757 			 */
   4758 			if ((retval == 0) && page_exists(vp, nextrio)) {
   4759 				ip->i_nextrio = nextrio + pgsize;
   4760 			}
   4761 		}
   4762 
   4763 		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
   4764 			/*
   4765 			 * We found the page in the page cache.
   4766 			 */
   4767 			*pl++ = pp;
   4768 			pgoff += pgsize;
   4769 			pgaddr += pgsize;
   4770 			len -= pgsize;
   4771 			plsz -= pgsize;
   4772 		} else  {
   4773 			/*
   4774 			 * We have to create the page, or read it from disk.
   4775 			 */
   4776 			if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr,
   4777 			    pl, plsz, rw, seqmode))
   4778 				goto error;
   4779 
   4780 			while (*pl != NULL) {
   4781 				pl++;
   4782 				pgoff += pgsize;
   4783 				pgaddr += pgsize;
   4784 				len -= pgsize;
   4785 				plsz -= pgsize;
   4786 			}
   4787 		}
   4788 	}
   4789 
   4790 	/*
   4791 	 * Return pages up to plsz if they are in the page cache.
   4792 	 * We cannot return pages if there is a chance that they are
   4793 	 * backed with a UFS hole and rw is S_WRITE or S_CREATE.
   4794 	 */
   4795 	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
   4796 
   4797 		ASSERT((protp == NULL) ||
   4798 		    !(has_holes && (*protp & PROT_WRITE)));
   4799 
   4800 		eoff = pgoff + plsz;
   4801 		while (pgoff < eoff) {
   4802 			page_t		*pp;
   4803 
   4804 			if ((pp = page_lookup_nowait(vp, pgoff,
   4805 			    SE_SHARED)) == NULL)
   4806 				break;
   4807 
   4808 			*pl++ = pp;
   4809 			pgoff += pgsize;
   4810 			plsz -= pgsize;
   4811 		}
   4812 	}
   4813 
   4814 	if (plarr)
   4815 		*pl = NULL;			/* Terminate page list */
   4816 	ip->i_nextr = pgoff;
   4817 
   4818 error:
   4819 	if (err && plarr) {
   4820 		/*
   4821 		 * Release any pages we have locked.
   4822 		 */
   4823 		while (pl > &plarr[0])
   4824 			page_unlock(*--pl);
   4825 
   4826 		plarr[0] = NULL;
   4827 	}
   4828 
   4829 update_inode:
   4830 	/*
   4831 	 * If the inode is not already marked for IACC (in rdip() for read)
   4832 	 * and the inode is not marked for no access time update (in wrip()
   4833 	 * for write) then update the inode access time and mod time now.
   4834 	 */
   4835 	if ((ip->i_flag & (IACC | INOACC)) == 0) {
   4836 		if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) {
   4837 			if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
   4838 			    (fs->fs_ronly == 0) &&
   4839 			    (!ufsvfsp->vfs_noatime)) {
   4840 				mutex_enter(&ip->i_tlock);
   4841 				ip->i_flag |= IACC;
   4842 				ITIMES_NOLOCK(ip);
   4843 				mutex_exit(&ip->i_tlock);
   4844 			}
   4845 		}
   4846 	}
   4847 
   4848 	if (dolock) {
   4849 		rw_exit(&ip->i_contents);
   4850 		if (do_qlock && rwtype == RW_WRITER)
   4851 			rw_exit(&ufsvfsp->vfs_dqrwlock);
   4852 	}
   4853 
   4854 unlock:
   4855 	if (ulp) {
   4856 		if ((rw == S_CREATE || rw == S_WRITE) &&
   4857 		    !(vp->v_flag & VISSWAP)) {
   4858 			TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
   4859 		}
   4860 		ufs_lockfs_end(ulp);
   4861 	}
   4862 out:
   4863 	return (err);
   4864 }
   4865 
   4866 /*
   4867  * ufs_getpage_miss is called when ufs_getpage missed the page in the page
   4868  * cache. The page is either read from the disk, or it's created.
   4869  * A page is created (without disk read) if rw == S_CREATE, or if
   4870  * the page is not backed with a real disk block (UFS hole).
         *
         * A block that ISFALLOCBLK() identifies as fallocated is handled like a
         * hole: a page is created (and zeroed unless rw == S_CREATE) instead of
         * being read from the disk.
         *
         * Arguments:
         *	vp	vnode whose page was missed
         *	off	file offset of the missed page
         *	len	not referenced in this function
         *	seg	segment handed to page_create_va()/pvn_read_kluster()
         *	addr	address handed to page_create_va()/pvn_read_kluster()
         *	pl	page list filled in by pvn_plist_init()
         *	plsz	size handed to pvn_plist_init() along with pl
         *	rw	access type; S_CREATE forces page creation
         *	seq	nonzero when the access is treated as sequential
         *
         * Returns 0 after passing the page(s) to pvn_plist_init(). Also returns
         * 0, with pl[0] set to NULL, when pvn_read_kluster() comes back with no
         * page. Otherwise returns the error from bmap_read() or biowait(), or
         * the result of ufs_fault() when page_create_va() fails.
   4871  */
   4872 /* ARGSUSED */
   4873 static int
   4874 ufs_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg,
   4875 	caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq)
   4876 {
   4877 	struct inode	*ip = VTOI(vp);
   4878 	page_t		*pp;
   4879 	daddr_t		bn;
   4880 	size_t		io_len;
   4881 	int		crpage = 0;
   4882 	int		err;
   4883 	int		contig;
   4884 	int		bsize = ip->i_fs->fs_bsize;
   4885 
   4886 	/*
   4887 	 * Figure out whether the page can be created, or must be
   4888 	 * read from the disk.
   4889 	 */
   4890 	if (rw == S_CREATE)
   4891 		crpage = 1;
   4892 	else {
        		/*
        		 * bmap_read() fills in bn and contig; contig is later
        		 * handed to pvn_read_kluster() as the kluster length limit.
        		 */
   4893 		contig = 0;
   4894 		if (err = bmap_read(ip, off, &bn, &contig))
   4895 			return (err);
   4896 
   4897 		crpage = (bn == UFS_HOLE);
   4898 
   4899 		/*
   4900 		 * If it's also a fallocated block that hasn't been written to
   4901 		 * yet, we will treat it just like a UFS_HOLE and create
   4902 		 * a zero page for it
   4903 		 */
   4904 		if (ISFALLOCBLK(ip, bn))
   4905 			crpage = 1;
   4906 	}
   4907 
   4908 	if (crpage) {
   4909 		if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg,
   4910 		    addr)) == NULL) {
   4911 			return (ufs_fault(vp,
   4912 			    "ufs_getpage_miss: page_create == NULL"));
   4913 		}
   4914 
        		/*
        		 * Pages created for a hole or a fallocated block are zeroed;
        		 * a page created for S_CREATE is left as is.
        		 */
   4915 		if (rw != S_CREATE)
   4916 			pagezero(pp, 0, PAGESIZE);
   4917 
   4918 		io_len = PAGESIZE;
   4919 	} else {
   4920 		u_offset_t	io_off;
   4921 		uint_t	xlen;
   4922 		struct buf	*bp;
   4923 		ufsvfs_t	*ufsvfsp = ip->i_ufsvfs;
   4924 
   4925 		/*
   4926 		 * If access is not in sequential order, we read from disk
   4927 		 * in bsize units.
   4928 		 *
   4929 		 * We limit the size of the transfer to bsize if we are reading
   4930 		 * from the beginning of the file. Note in this situation we
   4931 		 * will hedge our bets and initiate an async read ahead of
   4932 		 * the second block.
   4933 		 */
   4934 		if (!seq || off == 0)
   4935 			contig = MIN(contig, bsize);
   4936 
   4937 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
   4938 		    &io_len, off, contig, 0);
   4939 
   4940 		/*
   4941 		 * Some other thread has entered the page.
   4942 		 * ufs_getpage will retry page_lookup.
   4943 		 */
   4944 		if (pp == NULL) {
   4945 			pl[0] = NULL;
   4946 			return (0);
   4947 		}
   4948 
   4949 		/*
   4950 		 * Zero part of the page which we are not
   4951 		 * going to read from the disk.
        		 * NOTE(review): pp->p_prev is presumably the last page of
        		 * the kluster list returned above -- confirm.
   4952 		 */
   4953 		xlen = io_len & PAGEOFFSET;
   4954 		if (xlen != 0)
   4955 			pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
   4956 
        		/* Build a synchronous read request for the kluster. */
   4957 		bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ);
   4958 		bp->b_edev = ip->i_dev;
   4959 		bp->b_dev = cmpdev(ip->i_dev);
   4960 		bp->b_blkno = bn;
   4961 		bp->b_un.b_addr = (caddr_t)0;
   4962 		bp->b_file = ip->i_vnode;
   4963 		bp->b_offset = off;
   4964 
        		/*
        		 * Hand the buf to the log, the snapshot driver, or directly
        		 * to the block device, in that order of preference.
        		 */
   4965 		if (ufsvfsp->vfs_log) {
   4966 			lufs_read_strategy(ufsvfsp->vfs_log, bp);
   4967 		} else if (ufsvfsp->vfs_snapshot) {
   4968 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
   4969 		} else {
   4970 			ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
   4971 			ub.ub_getpages.value.ul++;
   4972 			(void) bdev_strategy(bp);
   4973 			lwp_stat_update(LWP_STAT_INBLK, 1);
   4974 		}
   4975 
        		/*
        		 * Record the page-rounded end of this transfer; this is the
        		 * offset ufs_getpage_ra() starts its read ahead from.
        		 */
   4976 		ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK);
   4977 
   4978 		/*
   4979 		 * If the file access is sequential, initiate read ahead
   4980 		 * of the next cluster.
        		 * The read ahead is started before we wait for our own read.
   4981 		 */
   4982 		if (seq && ip->i_nextrio < ip->i_size)
   4983 			(void) ufs_getpage_ra(vp, off, seg, addr);
   4984 		err = biowait(bp);
   4985 		pageio_done(bp);
   4986 
   4987 		if (err) {
   4988 			pvn_read_done(pp, B_ERROR);
   4989 			return (err);
   4990 		}
   4991 	}
   4992 
   4993 	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
   4994 	return (0);
   4995 }
   4996 
   4997 /*
   4998  * Read ahead a cluster from the disk. Returns the length in bytes.
         *
         * The read ahead starts at ip->i_nextrio, not at off. off and addr
         * describe the page the caller is working on: addr is shifted by
         * (i_nextrio - off) to get the address matching the read-ahead offset.
         *
         * The read is issued with B_ASYNC and is not waited for here.
         *
         * Returns 0 when no read ahead was started: the inode has no ufsvfs,
         * directio is in effect, the shifted address lies at or past the end
         * of seg, bmap_read() failed or reported a hole/fallocated block, or
         * pvn_read_kluster() returned no page. Otherwise returns io_len (a
         * size_t) converted to int.
   4999  */
   5000 static int
   5001 ufs_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr)
   5002 {
   5003 	struct inode	*ip = VTOI(vp);
   5004 	page_t		*pp;
   5005 	u_offset_t	io_off = ip->i_nextrio;
   5006 	ufsvfs_t	*ufsvfsp;
   5007 	caddr_t		addr2 = addr + (io_off - off);
   5008 	struct buf	*bp;
   5009 	daddr_t		bn;
   5010 	size_t		io_len;
   5011 	int		err;
   5012 	int		contig;
   5013 	int		xlen;
   5014 	int		bsize = ip->i_fs->fs_bsize;
   5015 
   5016 	/*
   5017 	 * If the directio advisory is in effect on this file,
   5018 	 * then do not do buffered read ahead. Read ahead makes
   5019 	 * it more difficult on threads using directio as they
   5020 	 * will be forced to flush the pages from this vnode.
   5021 	 */
   5022 	if ((ufsvfsp = ip->i_ufsvfs) == NULL)
   5023 		return (0);
   5024 	if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio)
   5025 		return (0);
   5026 
   5027 	/*
   5028 	 * Is this test needed?
        	 * It skips the read ahead when the read-ahead address is not
        	 * below the end of the segment.
   5029 	 */
   5030 	if (addr2 >= seg->s_base + seg->s_size)
   5031 		return (0);
   5032 
   5033 	contig = 0;
   5034 	err = bmap_read(ip, io_off, &bn, &contig);
   5035 	/*
   5036 	 * If it's a UFS_HOLE or a fallocated block, do not perform
   5037 	 * any read aheads since there probably is nothing to read ahead
   5038 	 */
   5039 	if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn))
   5040 		return (0);
   5041 
   5042 	/*
   5043 	 * Limit the transfer size to bsize if this is the 2nd block.
   5044 	 */
   5045 	if (io_off == (u_offset_t)bsize)
   5046 		contig = MIN(contig, bsize);
   5047 
        	/*
        	 * io_off is both the requested offset and the output argument
        	 * that receives the start of the kluster actually built.
        	 */
   5048 	if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off,
   5049 	    &io_len, io_off, contig, 1)) == NULL)
   5050 		return (0);
   5051 
   5052 	/*
   5053 	 * Zero part of page which we are not going to read from disk
   5054 	 */
   5055 	if ((xlen = (io_len & PAGEOFFSET)) > 0)
   5056 		pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
   5057 
        	/* Advance the read-ahead mark to the page-rounded end of this read. */
   5058 	ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK;
   5059 
   5060 	bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC);
   5061 	bp->b_edev = ip->i_dev;
   5062 	bp->b_dev = cmpdev(ip->i_dev);
   5063 	bp->b_blkno = bn;
   5064 	bp->b_un.b_addr = (caddr_t)0;
   5065 	bp->b_file = ip->i_vnode;
        	/*
        	 * NOTE(review): b_offset is set to the caller's off, while the
        	 * transfer itself starts at io_off -- confirm this is intended.
        	 */
   5066 	bp->b_offset = off;
   5067 
        	/*
        	 * Hand the buf to the log, the snapshot driver, or directly to
        	 * the block device, in that order of preference.
        	 */
   5068 	if (ufsvfsp->vfs_log) {
   5069 		lufs_read_strategy(ufsvfsp->vfs_log, bp);
   5070 	} else if (ufsvfsp->vfs_snapshot) {
   5071 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
   5072 	} else {
   5073 		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
   5074 		ub.ub_getras.value.ul++;
   5075 		(void) bdev_strategy(bp);
   5076 		lwp_stat_update(LWP_STAT_INBLK, 1);
   5077 	}
   5078 
   5079 	return (io_len);
   5080 }
   5081 
        /*
         * Nonzero enables the delayed clustering of B_ASYNC requests done in
         * ufs_putpage(); when zero every request goes straight to ufs_putpages().
         */
   5082 int	ufs_delay = 1;
   5083 /*
   5084  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC}
   5085  *
   5086  * LMXXX - the inode really ought to contain a pointer to one of these
   5087  * async args.  Stuff gunk in there and just hand the whole mess off.
   5088  * This would replace i_delaylen, i_delayoff.
         *
         * For a B_ASYNC request with a nonzero len and no flags other than
         * B_ASYNC, B_DONTNEED and B_FREE (and with ufs_delay set), the range is
         * not written immediately. It is remembered in i_delayoff/i_delaylen
         * under i_tlock:
         *   - nothing remembered yet: remember this range and return;
         *   - remembered range is full (>= CLUSTSZ) or not contiguous with this
         *     one: remember this range and push the old one via ufs_putpages();
         *   - otherwise: extend the remembered range by len and return.
         * Every other request is passed to ufs_putpages() unchanged.
         *
         * Returns the result of ufs_fault() if v_count is 0, ENOSYS if the vnode
         * has VNOMAP set, EIO if the inode has no ufsvfs, 0 when the request was
         * only remembered, and the result of ufs_putpages() otherwise.
   5089  */
   5090 /*ARGSUSED*/
   5091 static int
   5092 ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
   5093 	struct cred *cr, caller_context_t *ct)
   5094 {
   5095 	struct inode *ip = VTOI(vp);
   5096 	int err = 0;
   5097 
   5098 	if (vp->v_count == 0) {
   5099 		return (ufs_fault(vp, "ufs_putpage: bad v_count == 0"));
   5100 	}
   5101 
   5102 	/*
   5103 	 * XXX - Why should this check be made here?
   5104 	 */
   5105 	if (vp->v_flag & VNOMAP) {
   5106 		err = ENOSYS;
   5107 		goto errout;
   5108 	}
   5109 
   5110 	if (ip->i_ufsvfs == NULL) {
   5111 		err = EIO;
   5112 		goto errout;
   5113 	}
   5114 
   5115 	if (flags & B_ASYNC) {
   5116 		if (ufs_delay && len &&
   5117 		    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
   5118 			mutex_enter(&ip->i_tlock);
   5119 			/*
   5120 			 * If nobody stalled, start a new cluster.
   5121 			 */
   5122 			if (ip->i_delaylen == 0) {
   5123 				ip->i_delayoff = off;
   5124 				ip->i_delaylen = len;
   5125 				mutex_exit(&ip->i_tlock);
   5126 				goto errout;
   5127 			}
   5128 			/*
   5129 			 * If we have a full cluster or they are not contig,
   5130 			 * then push last cluster and start over.
   5131 			 */
   5132 			if (ip->i_delaylen >= CLUSTSZ(ip) ||
   5133 			    ip->i_delayoff + ip->i_delaylen != off) {
   5134 				u_offset_t doff;
   5135 				size_t dlen;
   5136 
        				/*
        				 * Snapshot the old range and install the
        				 * new one before dropping i_tlock; the push
        				 * itself runs without the mutex held.
        				 */
   5137 				doff = ip->i_delayoff;
   5138 				dlen = ip->i_delaylen;
   5139 				ip->i_delayoff = off;
   5140 				ip->i_delaylen = len;
   5141 				mutex_exit(&ip->i_tlock);
   5142 				err = ufs_putpages(vp, doff, dlen,
   5143 				    flags, cr);
   5144 				/* LMXXX - flags are new val, not old */
   5145 				goto errout;
   5146 			}
   5147 			/*
   5148 			 * There is something there, it's not full, and
   5149 			 * it is contig.
   5150 			 */
   5151 			ip->i_delaylen += len;
   5152 			mutex_exit(&ip->i_tlock);
   5153 			goto errout;
   5154 		}
   5155 		/*
   5156 		 * Must have weird flags or we are not clustering.
   5157 		 */
   5158 	}
   5159 
   5160 	err = ufs_putpages(vp, off, len, flags, cr);
   5161 
   5162 errout:
   5163 	return (err);
   5164 }
   5165 
   5166 /*
   5167  * If len == 0, do from off to EOF.
   5168  *
   5169  * The normal cases should be len == 0 & off == 0 (entire vp list),
   5170  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
   5171  * (from pageout).
         *
         * Unless the current thread already owns i_contents (rw_owner()), the
         * lock is taken as RW_READER here and dropped before returning.
         * Dirty pages are written through ufs_putapage(), either via
         * pvn_vplist_dirty() (len == 0) or by walking [off, eoff) page by page.
         *
         * Returns the result of ufs_fault() if v_count is 0; 0 if a B_ASYNC
         * request is bounced because another thread is in wrip(), or if the
         * vnode has no cached data; otherwise the error (0 on success) from
         * pvn_vplist_dirty()/ufs_putapage().
   5172  */
   5173 /*ARGSUSED*/
   5174 static int
   5175 ufs_putpages(
   5176 	struct vnode *vp,
   5177 	offset_t off,
   5178 	size_t len,
   5179 	int flags,
   5180 	struct cred *cr)
   5181 {
   5182 	u_offset_t io_off;
   5183 	u_offset_t eoff;
   5184 	struct inode *ip = VTOI(vp);
   5185 	page_t *pp;
   5186 	size_t io_len;
   5187 	int err = 0;
   5188 	int dolock;
   5189 
   5190 	if (vp->v_count == 0)
   5191 		return (ufs_fault(vp, "ufs_putpages: v_count == 0"));
   5192 	/*
   5193 	 * Acquire the readers/writer inode lock before locking
   5194 	 * any pages in this inode.
   5195 	 * The inode lock is held during i/o.
   5196 	 */
        	/*
        	 * A "to EOF" request supersedes any range remembered by
        	 * ufs_putpage(), so forget it.
        	 */
   5197 	if (len == 0) {
   5198 		mutex_enter(&ip->i_tlock);
   5199 		ip->i_delayoff = ip->i_delaylen = 0;
   5200 		mutex_exit(&ip->i_tlock);
   5201 	}
        	/* Only take i_contents if this thread does not already own it. */
   5202 	dolock = (rw_owner(&ip->i_contents) != curthread);
   5203 	if (dolock) {
   5204 		/*
   5205 		 * Must synchronize this thread and any possible thread
   5206 		 * operating in the window of vulnerability in wrip().
   5207 		 * It is dangerous to allow both a thread doing a putpage
   5208 		 * and a thread writing, so serialize them.  The exception
   5209 		 * is when the thread in wrip() does something which causes
   5210 		 * a putpage operation.  Then, the thread must be allowed
   5211 		 * to continue.  It may encounter a bmap_read problem in
   5212 		 * ufs_putapage, but that is handled in ufs_putapage.
   5213 		 * Allow async writers to proceed, we don't want to block
   5214 		 * the pageout daemon.
   5215 		 */
   5216 		if (ip->i_writer == curthread)
   5217 			rw_enter(&ip->i_contents, RW_READER);
   5218 		else {
   5219 			for (;;) {
   5220 				rw_enter(&ip->i_contents, RW_READER);
   5221 				mutex_enter(&ip->i_tlock);
   5222 				/*
   5223 				 * If there is no thread in the critical
   5224 				 * section of wrip(), then proceed.
   5225 				 * Otherwise, wait until there isn't one.
   5226 				 */
   5227 				if (ip->i_writer == NULL) {
   5228 					mutex_exit(&ip->i_tlock);
   5229 					break;
   5230 				}
        				/*
        				 * Drop i_contents before sleeping or
        				 * bouncing; it is re-taken at the top of
        				 * the loop.
        				 */
   5231 				rw_exit(&ip->i_contents);
   5232 				/*
   5233 				 * Bounce async writers when we have a writer
   5234 				 * working on this file so we don't deadlock
   5235 				 * the pageout daemon.
   5236 				 */
   5237 				if (flags & B_ASYNC) {
   5238 					mutex_exit(&ip->i_tlock);
   5239 					return (0);
   5240 				}
   5241 				cv_wait(&ip->i_wrcv, &ip->i_tlock);
   5242 				mutex_exit(&ip->i_tlock);
   5243 			}
   5244 		}
   5245 	}
   5246 
        	/* Nothing cached for this vnode: nothing to write. */
   5247 	if (!vn_has_cached_data(vp)) {
   5248 		if (dolock)
   5249 			rw_exit(&ip->i_contents);
   5250 		return (0);
   5251 	}
   5252 
   5253 	if (len == 0) {
   5254 		/*
   5255 		 * Search the entire vp list for pages >= off.
   5256 		 */
   5257 		err = pvn_vplist_dirty(vp, (u_offset_t)off, ufs_putapage,
   5258 		    flags, cr);
   5259 	} else {
   5260 		/*
   5261 		 * Loop over all offsets in the range looking for
   5262 		 * pages to deal with.
        		 * The end of the range is clipped to the block-rounded
        		 * file size unless that rounds to 0.
   5263 		 */
   5264 		if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0)
   5265 			eoff = MIN(off + len, eoff);
   5266 		else
   5267 			eoff = off + len;
   5268 
   5269 		for (io_off = off; io_off < eoff; io_off += io_len) {
   5270 			/*
   5271 			 * If we are not invalidating, synchronously
   5272 			 * freeing or writing pages, use the routine
   5273 			 * page_lookup_nowait() to prevent reclaiming
   5274 			 * them from the free list.
   5275 			 */
   5276 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
   5277 				pp = page_lookup(vp, io_off,
   5278 				    (flags & (B_INVAL | B_FREE)) ?
   5279 				    SE_EXCL : SE_SHARED);
   5280 			} else {
   5281 				pp = page_lookup_nowait(vp, io_off,
   5282 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
   5283 			}
   5284 
        			/*
        			 * No page, or pvn_getdirty() returned 0 for it:
        			 * step over this one page.
        			 */
   5285 			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
   5286 				io_len = PAGESIZE;
   5287 			else {
   5288 				u_offset_t *io_offp = &io_off;
   5289 
   5290 				err = ufs_putapage(vp, pp, io_offp, &io_len,
   5291 				    flags, cr);
   5292 				if (err != 0)
   5293 					break;
   5294 				/*
   5295 				 * "io_off" and "io_len" are returned as
   5296 				 * the range of pages we actually wrote.
   5297 				 * This allows us to skip ahead more quickly
   5298 				 * since several pages may've been dealt
   5299 				 * with by this iteration of the loop.
   5300 				 */
   5301 			}
   5302 		}
   5303 	}
   5304 	if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
   5305 		/*
   5306 		 * We have just sync'ed back all the pages on
   5307 		 * the inode, turn off the IMODTIME flag.
   5308 		 */
   5309 		mutex_enter(&ip->i_tlock);
   5310 		ip->i_flag &= ~IMODTIME;
   5311 		mutex_exit(&ip->i_tlock);
   5312 	}
   5313 	if (dolock)
   5314 		rw_exit(&ip->i_contents);
   5315 	return (err);
   5316 }
   5317 
        /*
         * Completion routine installed in b_iodone by ufs_putapage() for page
         * writes. It undoes the i_writes accounting done when the write was
         * issued: b_bcount is subtracted from ip->i_writes under i_tlock. If
         * i_writes was at or above ufs_LW and the subtraction brings it to
         * ufs_LW or below, and ufs_WRITES is set, every thread waiting on i_wrcv
         * is woken. Finally b_iodone is cleared and the buf is passed on to
         * iodone().
         */
   5318 static void
   5319 ufs_iodone(buf_t *bp)
   5320 {
   5321 	struct inode *ip;
   5322 
   5323 	ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
   5324 
        	/* Cleared before the buf is handed to iodone() below. */
   5325 	bp->b_iodone = NULL;
   5326 
   5327 	ip = VTOI(bp->b_pages->p_vnode);
   5328 
   5329 	mutex_enter(&ip->i_tlock);
   5330 	if (ip->i_writes >= ufs_LW) {
   5331 		if ((ip->i_writes -= bp->b_bcount) <= ufs_LW)
   5332 			if (ufs_WRITES)
   5333 				cv_broadcast(&ip->i_wrcv); /* wake all up */
   5334 	} else {
   5335 		ip->i_writes -= bp->b_bcount;
   5336 	}
   5337 
   5338 	mutex_exit(&ip->i_tlock);
   5339 	iodone(bp);
   5340 }
   5341 
   5342 /*
   5343  * Write out a single page, possibly klustering adjacent
   5344  * dirty pages.  The inode lock must be held.
   5345  *
   5346  * LMXXX - bsize < pagesize not done.
         *
         * Arguments:
         *	vp	vnode the page belongs to
         *	pp	the dirty page to write
         *	offp	if not NULL, receives the starting offset of the range
         *		that was dealt with
         *	lenp	if not NULL, receives the length of that range
         *	flags	B_* flags for the write; B_ASYNC selects an
         *		asynchronous write that is not waited for
         *	cr	credentials (not referenced in this function)
         *
         * When the function fails before a kluster has been built, *offp and
         * *lenp describe just the page passed in (pp->p_offset, PAGESIZE).
         * Neither is written when the inode has no ufsvfs.
         *
         * Returns 0 on success, EIO if the inode has no ufsvfs, otherwise the
         * error from bmap_read(), bmap_set_bn() or biowait(), or EIO/EAGAIN or
         * the result of ufs_fault() when the page maps to a UFS hole.
   5347  */
   5348 /*ARGSUSED*/
   5349 int
   5350 ufs_putapage(
   5351 	struct vnode *vp,
   5352 	page_t *pp,
   5353 	u_offset_t *offp,
   5354 	size_t *lenp,		/* return values */
   5355 	int flags,
   5356 	struct cred *cr)
   5357 {
        	/*
        	 * io_off/io_len are stored through offp/lenp at "out". They start
        	 * out describing the single page passed in, so that the error
        	 * paths taken before pvn_write_kluster() fills them in do not
        	 * hand uninitialized stack contents back to the caller.
        	 */
   5358 	u_offset_t io_off = pp->p_offset;
   5359 	u_offset_t off;
   5360 	struct inode *ip = VTOI(vp);
   5361 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
   5362 	struct fs *fs;
   5363 	struct buf *bp;
   5364 	size_t io_len = PAGESIZE;
   5365 	daddr_t bn;
   5366 	int err;
   5367 	int contig;
   5368 	int dotrans;
   5369 
   5370 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
   5371 
   5372 	if (ufsvfsp == NULL) {
   5373 		err = EIO;
   5374 		goto out_trace;
   5375 	}
   5376 
   5377 	fs = ip->i_fs;
   5378 	ASSERT(fs->fs_ronly == 0);
   5379 
   5380 	/*
   5381 	 * If the modified time on the inode has not already been
   5382 	 * set elsewhere (e.g. for write/setattr) we set the time now.
   5383 	 * This gives us approximate modified times for mmap'ed files
   5384 	 * which are modified via stores in the user address space.
   5385 	 */
   5386 	if ((ip->i_flag & IMODTIME) == 0) {
   5387 		mutex_enter(&ip->i_tlock);
   5388 		ip->i_flag |= IUPD;
   5389 		ip->i_seq++;
   5390 		ITIMES_NOLOCK(ip);
   5391 		mutex_exit(&ip->i_tlock);
   5392 	}
   5393 
   5394 	/*
   5395 	 * Align the request to a block boundary (for old file systems),
   5396 	 * and go ask bmap() how contiguous things are for this file.
   5397 	 */
   5398 	off = pp->p_offset & (offset_t)fs->fs_bmask;	/* block align it */
   5399 	contig = 0;
   5400 	err = bmap_read(ip, off, &bn, &contig);
   5401 	if (err)
   5402 		goto out;
   5403 	if (bn == UFS_HOLE) {			/* putpage never allocates */
   5404 		/*
   5405 		 * logging device is in error mode; simply return EIO
   5406 		 */
   5407 		if (TRANS_ISERROR(ufsvfsp)) {
   5408 			err = EIO;
   5409 			goto out;
   5410 		}
   5411 		/*
   5412 		 * Oops, the thread in the window in wrip() did some
   5413 		 * sort of operation which caused a putpage in the bad
   5414 		 * range.  In this case, just return an error which will
   5415 		 * cause the software modified bit on the page to set
   5416 		 * and the page will get written out again later.
   5417 		 */
   5418 		if (ip->i_writer == curthread) {
   5419 			err = EIO;
   5420 			goto out;
   5421 		}
   5422 		/*
   5423 		 * If the pager is trying to push a page in the bad range
   5424 		 * just tell him to try again later when things are better.
   5425 		 */
   5426 		if (flags & B_ASYNC) {
   5427 			err = EAGAIN;
   5428 			goto out;
   5429 		}
   5430 		err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE");
   5431 		goto out;
   5432 	}
   5433 
   5434 	/*
   5435 	 * If it is an fallocate'd block, reverse the negativity since
   5436 	 * we are now writing to it
   5437 	 */
   5438 	if (ISFALLOCBLK(ip, bn)) {
   5439 		err = bmap_set_bn(vp, off, dbtofsb(fs, -bn));
   5440 		if (err)
   5441 			goto out;
   5442 
   5443 		bn = -bn;
   5444 	}
   5445 
   5446 	/*
   5447 	 * Take the length (of contiguous bytes) passed back from bmap()
   5448 	 * and _try_ and get a set of pages covering that extent.
   5449 	 */
   5450 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags);
   5451 
   5452 	/*
   5453 	 * May have run out of memory and not clustered backwards.
   5454 	 * off		p_offset
   5455 	 * [  pp - 1  ][   pp   ]
   5456 	 * [	block		]
   5457 	 * We told bmap off, so we have to adjust the bn accordingly.
   5458 	 */
   5459 	if (io_off > off) {
   5460 		bn += btod(io_off - off);
   5461 		contig -= (io_off - off);
   5462 	}
   5463 
   5464 	/*
   5465 	 * bmap was careful to tell us the right size so use that.
   5466 	 * There might be unallocated frags at the end.
   5467 	 * LMXXX - bzero the end of the page?  We must be writing after EOF.
   5468 	 */
   5469 	if (io_len > contig) {
   5470 		ASSERT(io_len - contig < fs->fs_bsize);
   5471 		io_len -= (io_len - contig);
   5472 	}
   5473 
   5474 	/*
   5475 	 * Handle the case where we are writing the last page after EOF.
   5476 	 *
   5477 	 * XXX - just a patch for i-mt3.
   5478 	 */
   5479 	if (io_len == 0) {
   5480 		ASSERT(pp->p_offset >=
   5481 		    (u_offset_t)(roundup(ip->i_size, PAGESIZE)));
   5482 		io_len = PAGESIZE;
   5483 	}
   5484 
   5485 	bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags);
   5486 
   5487 	ULOCKFS_SET_MOD(ITOUL(ip));
   5488 
   5489 	bp->b_edev = ip->i_dev;
   5490 	bp->b_dev = cmpdev(ip->i_dev);
   5491 	bp->b_blkno = bn;
   5492 	bp->b_un.b_addr = (caddr_t)0;
   5493 	bp->b_file = ip->i_vnode;
   5494 
   5495 	/*
   5496 	 * File contents of shadow or quota inodes are metadata, and updates
   5497 	 * to these need to be put into a logging transaction. All direct
   5498 	 * callers in UFS do that, but fsflush can come here _before_ the
   5499 	 * normal codepath. An example would be updating ACL information, for
   5500 	 * which the normal codepath would be:
   5501 	 *	ufs_si_store()
   5502 	 *	ufs_rdwri()
   5503 	 *	wrip()
   5504 	 *	segmap_release()
   5505 	 *	VOP_PUTPAGE()
   5506 	 * Here, fsflush can pick up the dirty page before segmap_release()
   5507 	 * forces it out. If that happens, there's no transaction.
   5508 	 * We therefore need to test whether a transaction exists, and if not
   5509 	 * create one - for fsflush.
   5510 	 */
   5511 	dotrans =
   5512 	    (((ip->i_mode & IFMT) == IFSHAD || ufsvfsp->vfs_qinod == ip) &&
   5513 	    ((curthread->t_flag & T_DONTBLOCK) == 0) &&
   5514 	    (TRANS_ISTRANS(ufsvfsp)));
   5515 
   5516 	if (dotrans) {
   5517 		curthread->t_flag |= T_DONTBLOCK;
   5518 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
   5519 	}
   5520 	if (TRANS_ISTRANS(ufsvfsp)) {
   5521 		if ((ip->i_mode & IFMT) == IFSHAD) {
   5522 			TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD);
   5523 		} else if (ufsvfsp->vfs_qinod == ip) {
   5524 			TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR,
   5525 			    0, 0);
   5526 		}
   5527 	}
   5528 	if (dotrans) {
   5529 		TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
   5530 		curthread->t_flag &= ~T_DONTBLOCK;
   5531 	}
   5532 
   5533 	/* write throttle */
   5534 
        	/*
        	 * Account for the bytes in flight; ufs_iodone() subtracts them
        	 * again when the write completes.
        	 */
   5535 	ASSERT(bp->b_iodone == NULL);
   5536 	bp->b_iodone = (int (*)())ufs_iodone;
   5537 	mutex_enter(&ip->i_tlock);
   5538 	ip->i_writes += bp->b_bcount;
   5539 	mutex_exit(&ip->i_tlock);
   5540 
        	/*
        	 * Both branches issue the write the same way (log, snapshot or
        	 * block device); they differ in the statistic that is bumped and
        	 * in that only the synchronous branch waits for completion and
        	 * finishes the pages itself.
        	 */
   5541 	if (bp->b_flags & B_ASYNC) {
   5542 		if (ufsvfsp->vfs_log) {
   5543 			lufs_write_strategy(ufsvfsp->vfs_log, bp);
   5544 		} else if (ufsvfsp->vfs_snapshot) {
   5545 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
   5546 		} else {
   5547 			ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
   5548 			ub.ub_putasyncs.value.ul++;
   5549 			(void) bdev_strategy(bp);
   5550 			lwp_stat_update(LWP_STAT_OUBLK, 1);
   5551 		}
   5552 	} else {
   5553 		if (ufsvfsp->vfs_log) {
   5554 			lufs_write_strategy(ufsvfsp->vfs_log, bp);
   5555 		} else if (ufsvfsp->vfs_snapshot) {
   5556 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
   5557 		} else {
   5558 			ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
   5559 			ub.ub_putsyncs.value.ul++;
   5560 			(void) bdev_strategy(bp);
   5561 			lwp_stat_update(LWP_STAT_OUBLK, 1);
   5562 		}
   5563 		err = biowait(bp);
   5564 		pageio_done(bp);
   5565 		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
   5566 	}
   5567 
        	/*
        	 * The pages have been handed off (or already finished above), so
        	 * make sure the error path below does not finish them again.
        	 */
   5568 	pp = NULL;
   5569 
   5570 out:
   5571 	if (err != 0 && pp != NULL)
   5572 		pvn_write_done(pp, B_ERROR | B_WRITE | flags);
   5573 
   5574 	if (offp)
   5575 		*offp = io_off;
   5576 	if (lenp)
   5577 		*lenp = io_len;
   5578 out_trace:
   5579 	return (err);
   5580 }
   5581 
        /*
         * Retry counters maintained by ufs_map():
         * ufs_map_alock_retry_cnt is bumped each time AS_LOCK_TRYENTER() on
         * a_lock fails; ufs_map_lockfs_retry_cnt is bumped each time
         * ufs_lockfs_trybegin() fails.
         */
   5582 uint64_t ufs_map_alock_retry_cnt;
   5583 uint64_t ufs_map_lockfs_retry_cnt;
   5584 
        /*
         * Map [off, off + len) of a regular file into the address space as by
         * creating a segvn segment with as_map_locked().
         *
         * The address is chosen by choose_addr() starting from the caller's
         * hint in *addrp; the hint is restored on every retry. a_lock is taken
         * by polling (never blocking), and the lockfs protocol is entered with
         * ufs_lockfs_trybegin(). If that fails with anything but EIO, a_lock and
         * the range lock are dropped, the thread waits for the file system
         * lock state to change, and the whole sequence is retried.
         *
         * Returns ENOSYS if the vnode has VNOMAP set, ENXIO if off or off + len
         * is negative, ENODEV if the vnode is not VREG, EAGAIN if the file has
         * mandatory locks, EIO if ufs_lockfs_trybegin() reports it, EINTR if
         * the wait for the lockfs state is abandoned, an error from
         * choose_addr(), or otherwise the result of as_map_locked().
         */
   5585 /* ARGSUSED */
   5586 static int
   5587 ufs_map(struct vnode *vp,
   5588 	offset_t off,
   5589 	struct as *as,
   5590 	caddr_t *addrp,
   5591 	size_t len,
   5592 	uchar_t prot,
   5593 	uchar_t maxprot,
   5594 	uint_t flags,
   5595 	struct cred *cr,
   5596 	caller_context_t *ct)
   5597 {
   5598 	struct segvn_crargs vn_a;
   5599 	struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
   5600 	struct ulockfs *ulp;
   5601 	int error, sig;
   5602 	k_sigset_t smask;
        	/* Caller's address hint, saved so each retry starts from it. */
   5603 	caddr_t hint = *addrp;
   5604 
   5605 	if (vp->v_flag & VNOMAP) {
   5606 		error = ENOSYS;
   5607 		goto out;
   5608 	}
   5609 
   5610 	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) {
   5611 		error = ENXIO;
   5612 		goto out;
   5613 	}
   5614 
   5615 	if (vp->v_type != VREG) {
   5616 		error = ENODEV;
   5617 		goto out;
   5618 	}
   5619 
   5620 retry_map:
   5621 	*addrp = hint;
   5622 	/*
   5623 	 * If file is being locked, disallow mapping.
   5624 	 */
   5625 	if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) {
   5626 		error = EAGAIN;
   5627 		goto out;
   5628 	}
   5629 
   5630 	as_rangelock(as);
   5631 	/*
   5632 	 * Note that if we are retrying (because ufs_lockfs_trybegin failed in
   5633 	 * the previous attempt), some other thread could have grabbed
   5634 	 * the same VA range if MAP_FIXED is set. In that case, choose_addr
   5635 	 * would unmap the valid VA range, that is ok.
   5636 	 */
   5637 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
   5638 	if (error != 0) {
   5639 		as_rangeunlock(as);
   5640 		goto out;
   5641 	}
   5642 
   5643 	/*
   5644 	 * a_lock has to be acquired before entering the lockfs protocol
   5645 	 * because that is the order in which pagefault works. Also we cannot
   5646 	 * block on a_lock here because this waiting writer will prevent
   5647 	 * further readers like ufs_read from progressing and could cause
   5648 	 * deadlock between ufs_read/ufs_map/pagefault when a quiesce is
   5649 	 * pending.
   5650 	 */
   5651 	while (!AS_LOCK_TRYENTER(as, &as->a_lock, RW_WRITER)) {
   5652 		ufs_map_alock_retry_cnt++;
   5653 		delay(RETRY_LOCK_DELAY);
   5654 	}
   5655 
   5656 	/*
   5657 	 * We can't hold as->a_lock and wait for lockfs to succeed because
   5658 	 * the proc tools might hang on a_lock, so call ufs_lockfs_trybegin()
   5659 	 * instead.
   5660 	 */
   5661 	if (error = ufs_lockfs_trybegin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK)) {
   5662 		/*
   5663 		 * ufs_lockfs_trybegin() did not succeed. It is safer to give up
   5664 		 * as->a_lock and wait for ulp->ul_fs_lock status to change.
   5665 		 */
   5666 		ufs_map_lockfs_retry_cnt++;
   5667 		AS_LOCK_EXIT(as, &as->a_lock);
   5668 		as_rangeunlock(as);
   5669 		if (error == EIO)
   5670 			goto out;
   5671 
        		/*
        		 * Wait until no lock covered by ULOCKFS_MAP_MASK is set.
        		 * The wait is uninterruptible for ULOCKFS_IS_SLOCK() or
        		 * when vfs_nointr is set; otherwise it can end in EINTR.
        		 */
   5672 		mutex_enter(&ulp->ul_lock);
   5673 		while (ulp->ul_fs_lock & ULOCKFS_MAP_MASK) {
   5674 			if (ULOCKFS_IS_SLOCK(ulp) || ufsvfsp->vfs_nointr) {
   5675 				cv_wait(&ulp->ul_cv, &ulp->ul_lock);
   5676 			} else {
   5677 				sigintr(&smask, 1);
   5678 				sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
   5679 				sigunintr(&smask);
   5680 				if (((ulp->ul_fs_lock & ULOCKFS_MAP_MASK) &&
   5681 				    !sig) || ufsvfsp->vfs_dontblock) {
   5682 					mutex_exit(&ulp->ul_lock);
   5683 					return (EINTR);
   5684 				}
   5685 			}
   5686 		}
   5687 		mutex_exit(&ulp->ul_lock);
   5688 		goto retry_map;
   5689 	}
   5690 
        	/* Describe the segvn segment to create for this mapping. */
   5691 	vn_a.vp = vp;
   5692 	vn_a.offset = (u_offset_t)off;
   5693 	vn_a.type = flags & MAP_TYPE;
   5694 	vn_a.prot = prot;
   5695 	vn_a.maxprot = maxprot;
   5696 	vn_a.cred = cr;
   5697 	vn_a.amp = NULL;
   5698 	vn_a.flags = flags & ~MAP_TYPE;
   5699 	vn_a.szc = 0;
   5700 	vn_a.lgrp_mem_policy_flags = 0;
   5701 
        	/*
        	 * NOTE(review): a_lock is not released explicitly after this
        	 * call; presumably as_map_locked() drops it -- confirm.
        	 */
   5702 	error = as_map_locked(as, *addrp, len, segvn_create, &vn_a);
   5703 	if (ulp)
   5704 		ufs_lockfs_end(ulp);
   5705 	as_rangeunlock(as);
   5706 out:
   5707 	return (error);
   5708 }
   5709 
   5710 /* ARGSUSED */
   5711 static int
   5712 ufs_addmap(struct vnode *vp,
   5713 	offset_t off,
   5714 	struct as *as,
   5715 	caddr_t addr,
   5716 	size_t	len,
   5717 	uchar_t  prot,
   5718 	uchar_t  maxprot,
   5719 	uint_t    flags,
   5720 	struct cred *cr,
   5721 	caller_context_t *ct)
   5722 {
   5723 	struct inode *ip = VTOI(vp);
   5724 
   5725 	if (vp->v_flag & VNOMAP) {
   5726 		return (ENOSYS);
   5727 	}
   5728 
   5729 	mutex_enter(&ip->i_tlock);
   5730 	ip->i_mapcnt += btopr(len);
   5731 	mutex_exit(&ip->i_tlock);
   5732 	return (0);
   5733 }
   5734 
   5735 /*ARGSUSED*/
   5736 static int
   5737 ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
   5738 	size_t len, uint_t prot,  uint_t maxprot,  uint_t flags,
   5739 	struct cred *cr, caller_context_t *ct)
   5740 {
   5741 	struct inode *ip = VTOI(vp);
   5742 
   5743 	if (vp->v_flag & VNOMAP) {
   5744 		return (ENOSYS);
   5745 	}
   5746 
   5747 	mutex_enter(&ip->i_tlock);
   5748 	ip->i_mapcnt -= btopr(len); 	/* Count released mappings */
   5749 	ASSERT(ip->i_mapcnt >= 0);
   5750 	mutex_exit(&ip->i_tlock);
   5751 	return (0);
   5752 }
   5753 /*
   5754  * Return the answer requested to poll() for non-device files
   5755  */
   5756 struct pollhead ufs_pollhd;
   5757 
   5758 /* ARGSUSED */
   5759 int
   5760 ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp,
   5761 	caller_context_t *ct)
   5762 {
   5763 	struct ufsvfs	*ufsvfsp;
   5764 
   5765 	*revp = 0;
   5766 	ufsvfsp = VTOI(vp)->i_ufsvfs;
   5767 
   5768 	if (!ufsvfsp) {
   5769 		*revp = POLLHUP;
   5770 		goto out;
   5771 	}
   5772 
   5773 	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs) ||
   5774 	    ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) {
   5775 		*revp |= POLLERR;
   5776 
   5777 	} else {
   5778 		if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly &&
   5779 		    !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
   5780 			*revp |= POLLOUT;
   5781 
   5782 		if ((ev & POLLWRBAND) && !ufsvfsp->vfs_fs->fs_ronly &&
   5783 		    !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
   5784 			*revp |= POLLWRBAND;
   5785 
   5786 		if (ev & POLLIN)
   5787 			*revp |= POLLIN;
   5788 
   5789 		if (ev & POLLRDNORM)
   5790 			*revp |= POLLRDNORM;
   5791 
   5792 		if (ev & POLLRDBAND)
   5793 			*revp |= POLLRDBAND;
   5794 	}
   5795 
   5796 	if ((ev & POLLPRI) && (*revp & (POLLERR|POLLHUP)))
   5797 		*revp |= POLLPRI;
   5798 out:
   5799 	*phpp = !any && !*revp ? &ufs_pollhd : (struct pollhead *)NULL;
   5800 
   5801 	return (0);
   5802 }
   5803 
   5804 /* ARGSUSED */
   5805 static int
   5806 ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr,
   5807 	caller_context_t *ct)
   5808 {
   5809 	struct ufsvfs	*ufsvfsp = VTOI(vp)->i_ufsvfs;
   5810 	struct ulockfs	*ulp = NULL;
   5811 	struct inode 	*sip = NULL;
   5812 	int		error;
   5813 	struct inode 	*ip = VTOI(vp);
   5814 	int		issync;
   5815 
   5816 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK);
   5817 	if (error)
   5818 		return (error);
   5819 
   5820 	switch (cmd) {
   5821 		/*
   5822 		 * Have to handle _PC_NAME_MAX here, because the normal way
   5823 		 * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()]
   5824 		 * results in a lock ordering reversal between
   5825 		 * ufs_lockfs_{begin,end}() and
   5826 		 * ufs_thread_{suspend,continue}().
   5827 		 *
   5828 		 * Keep in sync with ufs_statvfs().
   5829 		 */
   5830 	case _PC_NAME_MAX:
   5831 		*valp = MAXNAMLEN;
   5832 		break;
   5833 
   5834 	case _PC_FILESIZEBITS:
   5835 		if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
   5836 			*valp = UFS_FILESIZE_BITS;
   5837 		else
   5838 			*valp = 32;
   5839 		break;
   5840 
   5841 	case _PC_XATTR_EXISTS:
   5842 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
   5843 
   5844 			error =
   5845 			    ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR, cr);
   5846 			if (error ==  0 && sip != NULL) {
   5847 				/* Start transaction */
   5848 				if (ulp) {
   5849 					TRANS_BEGIN_CSYNC(ufsvfsp, issync,
   5850 					    TOP_RMDIR, TOP_RMDIR_SIZE);
   5851 				}
   5852 				/*
   5853 				 * Is directory empty
   5854 				 */
   5855 				rw_enter(&sip->i_rwlock, RW_WRITER);
   5856 				rw_enter(&sip->i_contents, RW_WRITER);
   5857 				if (ufs_xattrdirempty(sip,
   5858 				    sip->i_number, CRED())) {
   5859 					rw_enter(&ip->i_contents, RW_WRITER);
   5860 					ufs_unhook_shadow(ip, sip);
   5861 					rw_exit(&ip->i_contents);
   5862 
   5863 					*valp = 0;
   5864 
   5865 				} else
   5866 					*valp = 1;
   5867 				rw_exit(&sip->i_contents);
   5868 				rw_exit(&sip->i_rwlock);
   5869 				if (ulp) {
   5870 					TRANS_END_CSYNC(ufsvfsp, error, issync,
   5871 					    TOP_RMDIR, TOP_RMDIR_SIZE);
   5872 				}
   5873 				VN_RELE(ITOV(sip));
   5874 			} else if (error == ENOENT) {
   5875 				*valp = 0;
   5876 				error = 0;
   5877 			}
   5878 		} else {
   5879 			error = fs_pathconf(vp, cmd, valp, cr, ct);
   5880 		}
   5881 		break;
   5882 
   5883 	case _PC_ACL_ENABLED:
   5884 		*valp = _ACL_ACLENT_ENABLED;
   5885 		break;
   5886 
   5887 	case _PC_MIN_HOLE_SIZE:
   5888 		*valp = (ulong_t)ip->i_fs->fs_bsize;
   5889 		break;
   5890 
   5891 	case _PC_SATTR_ENABLED:
   5892 	case _PC_SATTR_EXISTS:
   5893 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
   5894 		    (vp->v_type == VREG || vp->v_type == VDIR);
   5895 		break;
   5896 
   5897 	case _PC_TIMESTAMP_RESOLUTION:
   5898 		/*
   5899 		 * UFS keeps only microsecond timestamp resolution.
   5900 		 * This is historical and will probably never change.
   5901 		 */
   5902 		*valp = 1000L;
   5903 		break;
   5904 
   5905 	default:
   5906 		error = fs_pathconf(vp, cmd, valp, cr, ct);
   5907 		break;
   5908 	}
   5909 
   5910 	if (ulp != NULL) {
   5911 		ufs_lockfs_end(ulp);
   5912 	}
   5913 	return (error);
   5914 }
   5915 
   5916 int ufs_pageio_writes, ufs_pageio_reads;
   5917 
   5918 /*ARGSUSED*/
   5919 static int
   5920 ufs_pageio(struct vnode *vp, page_t *pp, u_offset_t io_off, size_t io_len,
   5921 	int flags, struct cred *cr, caller_context_t *ct)
   5922 {
   5923 	struct inode *ip = VTOI(vp);
   5924 	struct ufsvfs *ufsvfsp;
   5925 	page_t *npp = NULL, *opp = NULL, *cpp = pp;
   5926 	struct buf *bp;
   5927 	daddr_t bn;
   5928 	size_t done_len = 0, cur_len = 0;
   5929 	int err = 0;
   5930 	int contig = 0;
   5931 	int dolock;
   5932 	int vmpss = 0;
   5933 	struct ulockfs *ulp;
   5934 
   5935 	if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp &&
   5936 	    vp->v_mpssdata != NULL) {
   5937 		vmpss = 1;
   5938 	}
   5939 
   5940 	dolock = (rw_owner(&ip->i_contents) != curthread);
   5941 	/*
   5942 	 * We need a better check.  Ideally, we would use another
   5943 	 * vnodeops so that hlocked and forcibly unmounted file
   5944 	 * systems would return EIO where appropriate and w/o the
   5945 	 * need for these checks.
   5946 	 */
   5947 	if ((ufsvfsp = ip->i_ufsvfs) == NULL)
   5948 		return (EIO);
   5949 
   5950 	/*
   5951 	 * For vmpss (pp can be NULL) case respect the quiesce protocol.
   5952 	 * ul_lock must be taken before locking pages so we can't use it here
   5953 	 * if pp is non NULL because segvn already locked pages
   5954 	 * SE_EXCL. Instead we rely on the fact that a forced umount or
   5955 	 * applying a filesystem lock via ufs_fiolfs() will block in the
   5956 	 * implicit call to ufs_flush() until we unlock the pages after the
   5957 	 * return to segvn. Other ufs_quiesce() callers keep ufs_quiesce_pend
   5958 	 * above 0 until they are done. We have to be careful not to increment
   5959 	 * ul_vnops_cnt here after forceful unmount hlocks the file system.
   5960 	 *
   5961 	 * If pp is NULL use ul_lock to make sure we don't increment
   5962 	 * ul_vnops_cnt after forceful unmount hlocks the file system.
   5963 	 */
   5964 	if (vmpss || pp == NULL) {
   5965 		ulp = &ufsvfsp->vfs_ulockfs;
   5966 		if (pp == NULL)
   5967 			mutex_enter(&ulp->ul_lock);
   5968 		if (ulp->ul_fs_lock & ULOCKFS_GETREAD_MASK) {
   5969 			if (pp == NULL) {
   5970 				mutex_exit(&ulp->ul_lock);
   5971 			}
   5972 			return (vmpss ? EIO : EINVAL);
   5973 		}
   5974 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
   5975 		if (pp == NULL)
   5976 			mutex_exit(&ulp->ul_lock);
   5977 		if (ufs_quiesce_pend) {
   5978 			if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
   5979 				cv_broadcast(&ulp->ul_cv);
   5980 			return (vmpss ? EIO : EINVAL);
   5981 		}
   5982 	}
   5983 
   5984 	if (dolock) {
   5985 		/*
   5986 		 * segvn may call VOP_PAGEIO() instead of VOP_GETPAGE() to
   5987 		 * handle a fault against a segment that maps vnode pages with
   5988 		 * large mappings.  Segvn creates pages and holds them locked
   5989 		 * SE_EXCL during VOP_PAGEIO() call. In this case we have to
   5990 		 * use rw_tryenter() to avoid a potential deadlock since in
   5991 		 * lock order i_contents needs to be taken first.
   5992 		 * Segvn will retry via VOP_GETPAGE() if VOP_PAGEIO() fails.
   5993 		 */
   5994 		if (!vmpss) {
   5995 			rw_enter(&ip->i_contents, RW_READER);
   5996 		} else if (!rw_tryenter(&ip->i_contents, RW_READER)) {
   5997 			if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
   5998 				cv_broadcast(&ulp->ul_cv);
   5999 			return (EDEADLK);
   6000 		}
   6001 	}
   6002 
   6003 	/*
   6004 	 * Return an error to segvn because the pagefault request is beyond
   6005 	 * PAGESIZE rounded EOF.
   6006 	 */
   6007 	if (vmpss && btopr(io_off + io_len) > btopr(ip->i_size)) {
   6008 		if (dolock)
   6009 			rw_exit(&ip->i_contents);
   6010 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
   6011 			cv_broadcast(&ulp->ul_cv);
   6012 		return (EFAULT);
   6013 	}
   6014 
   6015 	if (pp == NULL) {
   6016 		if (bmap_has_holes(ip)) {
   6017 			err = ENOSYS;
   6018 		} else {
   6019 			err = EINVAL;
   6020 		}
   6021 		if (dolock)
   6022 			rw_exit(&ip->i_contents);
   6023 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
   6024 			cv_broadcast(&ulp->ul_cv);
   6025 		return (err);
   6026 	}
   6027 
   6028 	/*
   6029 	 * Break the io request into chunks, one for each contiguous
   6030 	 * stretch of disk blocks in the target file.
   6031 	 */
   6032 	while (done_len < io_len) {
   6033 		ASSERT(cpp);
   6034 		contig = 0;
   6035 		if (err = bmap_read(ip, (u_offset_t)(io_off + done_len),
   6036 		    &bn, &contig))
   6037 			break;
   6038 
   6039 		if (bn == UFS_HOLE) {	/* No holey swapfiles */
   6040 			if (vmpss) {
   6041 				err = EFAULT;
   6042 				break;
   6043 			}
   6044 			err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE");
   6045 			break;
   6046 		}
   6047 
   6048 		cur_len = MIN(io_len - done_len, contig);
   6049 		/*
   6050 		 * Zero out a page beyond EOF, when the last block of
   6051 		 * a file is a UFS fragment so that ufs_pageio() can be used
   6052 		 * instead of ufs_getpage() to handle faults against
   6053 		 * segvn segments that use large pages.
   6054 		 */
   6055 		page_list_break(&cpp, &npp, btopr(cur_len));
   6056 		if ((flags & B_READ) && (cur_len & PAGEOFFSET)) {
   6057 			size_t xlen = cur_len & PAGEOFFSET;
   6058 			pagezero(cpp->p_prev, xlen, PAGESIZE - xlen);
   6059 		}
   6060 
   6061 		bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
   6062 		ASSERT(bp != NULL);
   6063 
   6064 		bp->b_edev = ip->i_dev;
   6065 		bp->b_dev = cmpdev(ip->i_dev);
   6066 		bp->b_blkno = bn;
   6067 		bp->b_un.b_addr = (caddr_t)0;
   6068 		bp->b_file = ip->i_vnode;
   6069 
   6070 		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
   6071 		ub.ub_pageios.value.ul++;
   6072 		if (ufsvfsp->vfs_snapshot)
   6073 			fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp);
   6074 		else
   6075 			(void) bdev_strategy(bp);
   6076 
   6077 		if (flags & B_READ)
   6078 			ufs_pageio_reads++;
   6079 		else
   6080 			ufs_pageio_writes++;
   6081 		if (flags & B_READ)
   6082 			lwp_stat_update(LWP_STAT_INBLK, 1);
   6083 		else
   6084 			lwp_stat_update(LWP_STAT_OUBLK, 1);
   6085 		/*
   6086 		 * If the request is not B_ASYNC, wait for i/o to complete
   6087 		 * and re-assemble the page list to return to the caller.
   6088 		 * If it is B_ASYNC we leave the page list in pieces and
   6089 		 * cleanup() will dispose of them.
   6090 		 */
   6091 		if ((flags & B_ASYNC) == 0) {
   6092 			err = biowait(bp);
   6093 			pageio_done(bp);
   6094 			if (err)
   6095 				break;
   6096 			page_list_concat(&opp, &cpp);
   6097 		}
   6098 		cpp = npp;
   6099 		npp = NULL;
   6100 		if (flags & B_READ)
   6101 			cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t);
   6102 		done_len += cur_len;
   6103 	}
   6104 	ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len));
   6105 	if (err) {
   6106 		if (flags & B_ASYNC) {
   6107 			/* Cleanup unprocessed parts of list */
   6108 			page_list_concat(&cpp, &npp);
   6109 			if (flags & B_READ)
   6110 				pvn_read_done(cpp, B_ERROR);
   6111 			else
   6112 				pvn_write_done(cpp, B_ERROR);
   6113 		} else {
   6114 			/* Re-assemble list and let caller clean up */
   6115 			page_list_concat(&opp, &cpp);
   6116 			page_list_concat(&opp, &npp);
   6117 		}
   6118 	}
   6119 
   6120 	if (vmpss && !(ip->i_flag & IACC) && !ULOCKFS_IS_NOIACC(ulp) &&
   6121 	    ufsvfsp->vfs_fs->fs_ronly == 0 && !ufsvfsp->vfs_noatime) {
   6122 		mutex_enter(&ip->i_tlock);
   6123 		ip->i_flag |= IACC;
   6124 		ITIMES_NOLOCK(ip);
   6125 		mutex_exit(&ip->i_tlock);
   6126 	}
   6127 
   6128 	if (dolock)
   6129 		rw_exit(&ip->i_contents);
   6130 	if (vmpss && !atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
   6131 		cv_broadcast(&ulp->ul_cv);
   6132 	return (err);
   6133 }
   6134 
   6135 /*
   6136  * Called when the kernel is in a frozen state to dump data
   6137  * directly to the device. It uses a private dump data structure,
   6138  * set up by dump_ctl, to locate the correct disk block to which to dump.
   6139  */
   6140 /*ARGSUSED*/
   6141 static int
   6142 ufs_dump(vnode_t *vp, caddr_t addr, offset_t ldbn, offset_t dblks,
   6143     caller_context_t *ct)
   6144 {
   6145 	u_offset_t	file_size;
   6146 	struct inode    *ip = VTOI(vp);
   6147 	struct fs	*fs = ip->i_fs;
   6148 	daddr_t		dbn, lfsbn;
   6149 	int		disk_blks = fs->fs_bsize >> DEV_BSHIFT;
   6150 	int		error = 0;
   6151 	int		ndbs, nfsbs;
   6152 
   6153 	/*
   6154 	 * forced unmount case
   6155 	 */
   6156 	if (ip->i_ufsvfs == NULL)
   6157 		return (EIO);
   6158 	/*
   6159 	 * Validate the inode that it has not been modified since
   6160 	 * the dump structure is allocated.
   6161 	 */
   6162 	mutex_enter(&ip->i_tlock);
   6163 	if ((dump_info == NULL) ||
   6164 	    (dump_info->ip != ip) ||
   6165 	    (dump_info->time.tv_sec != ip->i_mtime.tv_sec) ||
   6166 	    (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) {
   6167 		mutex_exit(&ip->i_tlock);
   6168 		return (-1);
   6169 	}
   6170 	mutex_exit(&ip->i_tlock);
   6171 
   6172 	/*
   6173 	 * See that the file has room for this write
   6174 	 */
   6175 	UFS_GET_ISIZE(&file_size, ip);
   6176 
   6177 	if (ldbtob(ldbn + dblks) > file_size)
   6178 		return (ENOSPC);
   6179 
   6180 	/*
   6181 	 * Find the physical disk block numbers from the dump
   6182 	 * private data structure directly and write out the data
   6183 	 * in contiguous block lumps
   6184 	 */
   6185 	while (dblks > 0 && !error) {
   6186 		lfsbn = (daddr_t)lblkno(fs, ldbtob(ldbn));
   6187 		dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks;
   6188 		nfsbs = 1;
   6189 		ndbs = disk_blks - ldbn % disk_blks;
   6190 		while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn +
   6191 		    nfsbs]) == dbn + ndbs) {
   6192 			nfsbs++;
   6193 			ndbs += disk_blks;
   6194 		}
   6195 		if (ndbs > dblks)
   6196 			ndbs = dblks;
   6197 		error = bdev_dump(ip->i_dev, addr, dbn, ndbs);
   6198 		addr += ldbtob((offset_t)ndbs);
   6199 		dblks -= ndbs;
   6200 		ldbn += ndbs;
   6201 	}
   6202 	return (error);
   6203 
   6204 }
   6205 
   6206 /*
   6207  * Prepare the file system before and after the dump operation.
   6208  *
   6209  * action = DUMP_ALLOC:
   6210  * Preparation before dump, allocate dump private data structure
   6211  * to hold all the direct and indirect block info for dump.
   6212  *
   6213  * action = DUMP_FREE:
   6214  * Clean up after dump, deallocate the dump private data structure.
   6215  *
   6216  * action = DUMP_SCAN:
   6217  * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space;
   6218  * if found, the starting file-relative DEV_BSIZE lbn is written
   6219  * to *bklp; that lbn is intended for use with VOP_DUMP()
   6220  */
   6221 /*ARGSUSED*/
   6222 static int
   6223 ufs_dumpctl(vnode_t *vp, int action, offset_t *blkp, caller_context_t *ct)
   6224 {
   6225 	struct inode	*ip = VTOI(vp);
   6226 	ufsvfs_t	*ufsvfsp = ip->i_ufsvfs;
   6227 	struct fs	*fs;
   6228 	daddr32_t	*dblk, *storeblk;
   6229 	daddr32_t	*nextblk, *endblk;
   6230 	struct buf	*bp;
   6231 	int		i, entry, entries;
   6232 	int		n, ncontig;
   6233 
   6234 	/*
   6235 	 * check for forced unmount
   6236 	 */
   6237 	if (ufsvfsp == NULL)
   6238 		return (EIO);
   6239 
   6240 	if (action == DUMP_ALLOC) {
   6241 		/*
   6242 		 * alloc and record dump_info
   6243 		 */
   6244 		if (dump_info != NULL)
   6245 			return (EINVAL);
   6246 
   6247 		ASSERT(vp->v_type == VREG);
   6248 		fs = ufsvfsp->vfs_fs;
   6249 
   6250 		rw_enter(&ip->i_contents, RW_READER);
   6251 
   6252 		if (bmap_has_holes(ip)) {
   6253 			rw_exit(&ip->i_contents);
   6254 			return (EFAULT);
   6255 		}
   6256 
   6257 		/*
   6258 		 * calculate and allocate space needed according to i_size
   6259 		 */
   6260 		entries = (int)lblkno(fs, blkroundup(fs, ip->i_size));
   6261 		dump_info = kmem_alloc(sizeof (struct dump) +
   6262 		    (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP);
   6263 		if (dump_info == NULL) {
   6264 			rw_exit(&ip->i_contents);
   6265 			return (ENOMEM);
   6266 		}
   6267 
   6268 		/* Start saving the info */
   6269 		dump_info->fsbs = entries;
   6270 		dump_info->ip = ip;
   6271 		storeblk = &dump_info->dblk[0];
   6272 
   6273 		/* Direct Blocks */
   6274 		for (entry = 0; entry < NDADDR && entry < entries; entry++)
   6275 			*storeblk++ = ip->i_db[entry];
   6276 
   6277 		/* Indirect Blocks */
   6278 		for (i = 0; i < NIADDR; i++) {
   6279 			int error = 0;
   6280 
   6281 			bp = UFS_BREAD(ufsvfsp,
   6282 			    ip->i_dev, fsbtodb(fs, ip->i_ib[i]), fs->fs_bsize);
   6283 			if (bp->b_flags & B_ERROR)
   6284 				error = EIO;
   6285 			else {
   6286 				dblk = bp->b_un.b_daddr;
   6287 				if ((storeblk = save_dblks(ip, ufsvfsp,
   6288 				    storeblk, dblk, i, entries)) == NULL)
   6289 					error = EIO;
   6290 			}
   6291 
   6292 			brelse(bp);
   6293 
   6294 			if (error != 0) {
   6295 				kmem_free(dump_info, sizeof (struct dump) +
   6296 				    (entries - 1) * sizeof (daddr32_t));
   6297 				rw_exit(&ip->i_contents);
   6298 				dump_info = NULL;
   6299 				return (error);
   6300 			}
   6301 		}
   6302 		/* and time stamp the information */
   6303 		mutex_enter(&ip->i_tlock);
   6304 		dump_info->time = ip->i_mtime;
   6305 		mutex_exit(&ip->i_tlock);
   6306 
   6307 		rw_exit(&ip->i_contents);
   6308 	} else if (action == DUMP_FREE) {
   6309 		/*
   6310 		 * free dump_info
   6311 		 */
   6312 		if (dump_info == NULL)
   6313 			return (EINVAL);
   6314 		entries = dump_info->fsbs - 1;
   6315 		kmem_free(dump_info, sizeof (struct dump) +
   6316 		    entries * sizeof (daddr32_t));
   6317 		dump_info = NULL;
   6318 	} else if (action == DUMP_SCAN) {
   6319 		/*
   6320 		 * scan dump_info
   6321 		 */
   6322 		if (dump_info == NULL)
   6323 			return (EINVAL);
   6324 
   6325 		dblk = dump_info->dblk;
   6326 		nextblk = dblk + 1;
   6327 		endblk = dblk + dump_info->fsbs - 1;
   6328 		fs = ufsvfsp->vfs_fs;
   6329 		ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT);
   6330 
   6331 		/*
   6332 		 * scan dblk[] entries; contig fs space is found when:
   6333 		 * ((current blkno + frags per block) == next blkno)
   6334 		 */
   6335 		n = 0;
   6336 		while (n < ncontig && dblk < endblk) {
   6337 			if ((*dblk + fs->fs_frag) == *nextblk)
   6338 				n++;
   6339 			else
   6340 				n = 0;
   6341 			dblk++;
   6342 			nextblk++;
   6343 		}
   6344 
   6345 		/*
   6346 		 * index is where size bytes of contig space begins;
   6347 		 * conversion from index to the file's DEV_BSIZE lbn
   6348 		 * is equivalent to:  (index * fs_bsize) / DEV_BSIZE
   6349 		 */
   6350 		if (n == ncontig) {
   6351 			i = (dblk - dump_info->dblk) - ncontig;
   6352 			*blkp = i << (fs->fs_bshift - DEV_BSHIFT);
   6353 		} else
   6354 			return (EFAULT);
   6355 	}
   6356 	return (0);
   6357 }
   6358 
   6359 /*
   6360  * Recursive helper function for ufs_dumpctl().  It follows the indirect file
   6361  * system  blocks until it reaches the the disk block addresses, which are
   6362  * then stored into the given buffer, storeblk.
   6363  */
   6364 static daddr32_t *
   6365 save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp,  daddr32_t *storeblk,
   6366     daddr32_t *dblk, int level, int entries)
   6367 {
   6368 	struct fs	*fs = ufsvfsp->vfs_fs;
   6369 	struct buf	*bp;
   6370 	int		i;
   6371 
   6372 	if (level == 0) {
   6373 		for (i = 0; i < NINDIR(fs); i++) {
   6374 			if (storeblk - dump_info->dblk >= entries)
   6375 				break;
   6376 			*storeblk++ = dblk[i];
   6377 		}
   6378 		return (storeblk);
   6379 	}
   6380 	for (i = 0; i < NINDIR(fs); i++) {
   6381 		if (storeblk - dump_info->dblk >= entries)
   6382 			break;
   6383 		bp = UFS_BREAD(ufsvfsp,
   6384 		    ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize);
   6385 		if (bp->b_flags & B_ERROR) {
   6386 			brelse(bp);
   6387 			return (NULL);
   6388 		}
   6389 		storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr,
   6390 		    level - 1, entries);
   6391 		brelse(bp);
   6392 
   6393 		if (storeblk == NULL)
   6394 			return (NULL);
   6395 	}
   6396 	return (storeblk);
   6397 }
   6398 
   6399 /* ARGSUSED */
   6400 static int
   6401 ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag,
   6402 	struct cred *cr, caller_context_t *ct)
   6403 {
   6404 	struct inode	*ip = VTOI(vp);
   6405 	struct ulockfs	*ulp;
   6406 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
   6407 	ulong_t		vsa_mask = vsap->vsa_mask;
   6408 	int		err = EINVAL;
   6409 
   6410 	vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
   6411 
   6412 	/*
   6413 	 * Only grab locks if needed - they're not needed to check vsa_mask
   6414 	 * or if the mask contains no acl flags.
   6415 	 */
   6416 	if (vsa_mask != 0) {
   6417 		if (err = ufs_lockfs_begin(ufsvfsp, &ulp,
   6418 		    ULOCKFS_GETATTR_MASK))
   6419 			return (err);
   6420 
   6421 		rw_enter(&ip->i_contents, RW_READER);
   6422 		err = ufs_acl_get(ip, vsap, flag, cr);
   6423 		rw_exit(&ip->i_contents);
   6424 
   6425 		if (ulp)
   6426 			ufs_lockfs_end(ulp);
   6427 	}
   6428 	return (err);
   6429 }
   6430 
   6431 /* ARGSUSED */
   6432 static int
   6433 ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr,
   6434 	caller_context_t *ct)
   6435 {
   6436 	struct inode	*ip = VTOI(vp);
   6437 	struct ulockfs	*ulp = NULL;
   6438 	struct ufsvfs	*ufsvfsp = VTOI(vp)->i_ufsvfs;
   6439 	ulong_t		vsa_mask = vsap->vsa_mask;
   6440 	int		err;
   6441 	int		haverwlock = 1;
   6442 	int		trans_size;
   6443 	int		donetrans = 0;
   6444 	int		retry = 1;
   6445 
   6446 	ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
   6447 
   6448 	/* Abort now if the request is either empty or invalid. */
   6449 	vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
   6450 	if ((vsa_mask == 0) ||
   6451 	    ((vsap->vsa_aclentp == NULL) &&
   6452 	    (vsap->vsa_dfaclentp == NULL))) {
   6453 		err = EINVAL;
   6454 		goto out;
   6455 	}
   6456 
   6457 	/*
   6458 	 * Following convention, if this is a directory then we acquire the
   6459 	 * inode's i_rwlock after starting a UFS logging transaction;
   6460 	 * otherwise, we acquire it beforehand. Since we were called (and
   6461 	 * must therefore return) with the lock held, we will have to drop it,
   6462 	 * and later reacquire it, if operating on a directory.
   6463 	 */
   6464 	if (vp->v_type == VDIR) {
   6465 		rw_exit(&ip->i_rwlock);
   6466 		haverwlock = 0;
   6467 	} else {
   6468 		/* Upgrade the lock if required. */
   6469 		if (!rw_write_held(&ip->i_rwlock)) {
   6470 			rw_exit(&ip->i_rwlock);
   6471 			rw_enter(&ip->i_rwlock, RW_WRITER);
   6472 		}
   6473 	}
   6474 
   6475 again:
   6476 	ASSERT(!(vp->v_type == VDIR && haverwlock));
   6477 	if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) {
   6478 		ulp = NULL;
   6479 		retry = 0;
   6480 		goto out;
   6481 	}
   6482 
   6483 	/*
   6484 	 * Check that the file system supports this operation. Note that
   6485 	 * ufs_lockfs_begin() will have checked that the file system had
   6486 	 * not been forcibly unmounted.
   6487 	 */
   6488 	if (ufsvfsp->vfs_fs->fs_ronly) {
   6489 		err = EROFS;
   6490 		goto out;
   6491 	}
   6492 	if (ufsvfsp->vfs_nosetsec) {
   6493 		err = ENOSYS;
   6494 		goto out;
   6495 	}
   6496 
   6497 	if (ulp) {
   6498 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR,
   6499 		    trans_size = TOP_SETSECATTR_SIZE(VTOI(vp)));
   6500 		donetrans = 1;
   6501 	}
   6502 
   6503 	if (vp->v_type == VDIR) {
   6504 		rw_enter(&ip->i_rwlock, RW_WRITER);
   6505 		haverwlock = 1;
   6506 	}
   6507 
   6508 	ASSERT(haverwlock);
   6509 
   6510 	/* Do the actual work. */
   6511 	rw_enter(&ip->i_contents, RW_WRITER);
   6512 	/*
   6513 	 * Suppress out of inodes messages if we will retry.
   6514 	 */
   6515 	if (retry)
   6516 		ip->i_flag |= IQUIET;
   6517 	err = ufs_acl_set(ip, vsap, flag, cr);
   6518 	ip->i_flag &= ~IQUIET;
   6519 	rw_exit(&ip->i_contents);
   6520 
   6521 out:
   6522 	if (ulp) {
   6523 		if (donetrans) {
   6524 			/*
   6525 			 * top_end_async() can eventually call
   6526 			 * top_end_sync(), which can block. We must
   6527 			 * therefore observe the lock-ordering protocol
   6528 			 * here as well.
   6529 			 */
   6530 			if (vp->v_type == VDIR) {
   6531 				rw_exit(&ip->i_rwlock);
   6532 				haverwlock = 0;
   6533 			}
   6534 			TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size);
   6535 		}
   6536 		ufs_lockfs_end(ulp);
   6537 	}
   6538 	/*
   6539 	 * If no inodes available, try scaring a logically-
   6540 	 * free one out of the delete queue to someplace
   6541 	 * that we can find it.
   6542 	 */
   6543 	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
   6544 		ufs_delete_drain_wait(ufsvfsp, 1);
   6545 		retry = 0;
   6546 		if (vp->v_type == VDIR && haverwlock) {
   6547 			rw_exit(&ip->i_rwlock);
   6548 			haverwlock = 0;
   6549 		}
   6550 		goto again;
   6551 	}
   6552 	/*
   6553 	 * If we need to reacquire the lock then it is safe to do so
   6554 	 * as a reader. This is because ufs_rwunlock(), which will be
   6555 	 * called by our caller after we return, does not differentiate
   6556 	 * between shared and exclusive locks.
   6557 	 */
   6558 	if (!haverwlock) {
   6559 		ASSERT(vp->v_type == VDIR);
   6560 		rw_enter(&ip->i_rwlock, RW_READER);
   6561 	}
   6562 
   6563 	return (err);
   6564 }
   6565