Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #include <sys/types.h>
     40 #include <sys/t_lock.h>
     41 #include <sys/param.h>
     42 #include <sys/systm.h>
     43 #include <sys/uio.h>
     44 #include <sys/bitmap.h>
     45 #include <sys/signal.h>
     46 #include <sys/cred.h>
     47 #include <sys/user.h>
     48 #include <sys/vfs.h>
     49 #include <sys/stat.h>
     50 #include <sys/vnode.h>
     51 #include <sys/buf.h>
     52 #include <sys/proc.h>
     53 #include <sys/disp.h>
     54 #include <sys/dnlc.h>
     55 #include <sys/mode.h>
     56 #include <sys/cmn_err.h>
     57 #include <sys/kstat.h>
     58 #include <sys/acl.h>
     59 #include <sys/var.h>
     60 #include <sys/fs/ufs_inode.h>
     61 #include <sys/fs/ufs_fs.h>
     62 #include <sys/fs/ufs_trans.h>
     63 #include <sys/fs/ufs_acl.h>
     64 #include <sys/fs/ufs_bio.h>
     65 #include <sys/fs/ufs_quota.h>
     66 #include <sys/fs/ufs_log.h>
     67 #include <vm/hat.h>
     68 #include <vm/as.h>
     69 #include <vm/pvn.h>
     70 #include <vm/seg.h>
     71 #include <sys/swap.h>
     72 #include <sys/cpuvar.h>
     73 #include <sys/sysmacros.h>
     74 #include <sys/errno.h>
     75 #include <sys/kmem.h>
     76 #include <sys/debug.h>
     77 #include <fs/fs_subr.h>
     78 #include <sys/policy.h>
     79 
     80 struct kmem_cache *inode_cache;		/* kmem cache of struct inode; created in ihinit() */
     81 
     82 /*
         * UFS inode cache statistics, exported as the named kstat "inode_cache"
         * that ufs_iinit() creates.  Not protected by any lock.
         */
     83 struct	instats ins = {
     84 	{ "size",		KSTAT_DATA_ULONG },
     85 	{ "maxsize",		KSTAT_DATA_ULONG },
     86 	{ "hits",		KSTAT_DATA_ULONG },
     87 	{ "misses",		KSTAT_DATA_ULONG },
     88 	{ "kmem allocs",	KSTAT_DATA_ULONG },
     89 	{ "kmem frees",		KSTAT_DATA_ULONG },
     90 	{ "maxsize reached",	KSTAT_DATA_ULONG },
     91 	{ "puts at frontlist",	KSTAT_DATA_ULONG },
     92 	{ "puts at backlist",	KSTAT_DATA_ULONG },
     93 	{ "queues to free",	KSTAT_DATA_ULONG },
     94 	{ "scans",		KSTAT_DATA_ULONG },
     95 	{ "thread idles",	KSTAT_DATA_ULONG },
     96 	{ "lookup idles",	KSTAT_DATA_ULONG },
     97 	{ "vget idles",		KSTAT_DATA_ULONG },
     98 	{ "cache allocs",	KSTAT_DATA_ULONG },
     99 	{ "cache frees",	KSTAT_DATA_ULONG },
    100 	{ "pushes at close",	KSTAT_DATA_ULONG }
    101 };
    102 
    103 /* kstat handle for the statistics above; set by ufs_iinit() */
    104 static kstat_t		*ufs_inode_kstat = NULL;
    105 
    106 union ihead *ihead;	/* inode hash table (LRU cache, Chris Maltby); inohsz heads */
    107 kmutex_t *ih_lock;	/* per-bucket mutexes protecting the inode hash chains */
    108 static int ino_hashlen = 4;	/* desired average hash chain length */
    109 int inohsz;		/* number of buckets in the hash table */
    110 
    111 kmutex_t	ufs_scan_lock;	/* stop racing multiple ufs_scan_inodes() */
    112 kmutex_t	ufs_iuniqtime_lock; /* protect iuniqtime */
    113 kmutex_t	ufsvfs_mutex;	/* NOTE(review): presumably guards the lists below - confirm */
    114 struct ufsvfs	*oldufsvfslist, *ufsvfslist;
    115 
    116 /*
    117  * Time to wait after ufsvfsp->vfs_iotstamp before declaring that no
    118  * I/Os are going on.  Computed in ufs_iinit() as v.v_autoup * hz * 2.
    119  */
    120 clock_t	ufs_iowait;
    121 
    122 /*
    123  * The threads that process idle inodes and free (deleted) inodes
    124  * have high water marks that are set in ufs_iinit().
    125  * These values can be no less than the minimum shown below.
    126  */
    127 int	ufs_idle_max;	/* # of allowable idle inodes */
    128 ulong_t	ufs_inode_max;	/* upper bound that ufs_iinit() applies to ufs_ninode */
    129 #define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */
    130 
    131 /*
    132  * Tunables for ufs write throttling.
    133  * These are validated in ufs_iinit() since improper settings
    134  * can lead to filesystem hangs.  ufs_HW must be greater than ufs_LW;
         * otherwise both are reset to the defaults below.
    135  */
    136 #define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
    137 #define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
    138 int	ufs_HW = UFS_HW_DEFAULT;
    139 int	ufs_LW = UFS_LW_DEFAULT;
    140 
    141 static void ihinit(void);
    142 extern int hash2ints(int, int);
    143 
        /* common worker behind ufs_iget() and ufs_iget_alloced() */
    144 static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
    145     struct cred *, int);
    146 
    147 /* ARGSUSED */
    148 static int
    149 ufs_inode_kstat_update(kstat_t *ksp, int rw)
    150 {
    151 	if (rw == KSTAT_WRITE)
    152 		return (EACCES);
    153 
    154 	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
    155 	    "slab_alloc");
    156 	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
    157 	    "slab_free");
    158 	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
    159 	    "alloc");
    160 	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
    161 	    "free");
    162 	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
    163 	    "buf_inuse");
    164 	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
    165 	    "buf_max");
    166 	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;
    167 
    168 	return (0);
    169 }
    170 
    171 void
    172 ufs_iinit(void)
    173 {
    174 	/*
    175 	 * Validate that ufs_HW > ufs_LW.
    176 	 * The default values for these two tunables have been increased.
    177 	 * There is now a range of values for ufs_HW that used to be
    178 	 * legal on previous Solaris versions but no longer is now.
    179 	 * Upgrading a machine which has an /etc/system setting for ufs_HW
    180 	 * from that range can lead to filesystem hangs unless the values
    181 	 * are checked here.
    182 	 */
    183 	if (ufs_HW <= ufs_LW) {
    184 		cmn_err(CE_WARN,
    185 		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
    186 		    ufs_HW, ufs_LW);
    187 		ufs_LW = UFS_LW_DEFAULT;
    188 		ufs_HW = UFS_HW_DEFAULT;
    189 		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
    190 		    ufs_HW, ufs_LW);
    191 	}
    192 
    193 	/*
    194 	 * Adjust the tunable `ufs_ninode' to a reasonable value
    195 	 */
    196 	if (ufs_ninode <= 0)
    197 		ufs_ninode = ncsize;
    198 	if (ufs_inode_max == 0)
    199 		ufs_inode_max =
    200 		    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
    201 	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
    202 		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
    203 		    ufs_inode_max);
    204 		ufs_ninode = ufs_inode_max;
    205 	}
    206 	/*
    207 	 * Wait till third call of ufs_update to declare that no I/Os are
    208 	 * going on. This allows deferred access times to be flushed to disk.
    209 	 */
    210 	ufs_iowait = v.v_autoup * hz * 2;
    211 
    212 	/*
    213 	 * idle thread runs when 25% of ufs_ninode entries are on the queue
    214 	 */
    215 	if (ufs_idle_max == 0)
    216 		ufs_idle_max = ufs_ninode >> 2;
    217 	if (ufs_idle_max < UFS_IDLE_MAX)
    218 		ufs_idle_max = UFS_IDLE_MAX;
    219 	if (ufs_idle_max > ufs_ninode)
    220 		ufs_idle_max = ufs_ninode;
    221 	/*
    222 	 * This is really a misnomer, it is ufs_queue_init
    223 	 */
    224 	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
    225 	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);
    226 
    227 	/*
    228 	 * global hlock thread
    229 	 */
    230 	ufs_thread_init(&ufs_hlock, 1);
    231 	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);
    232 
    233 	ihinit();
    234 	qtinit();
    235 	ins.in_maxsize.value.ul = ufs_ninode;
    236 	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
    237 	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
    238 	    KSTAT_FLAG_VIRTUAL)) != NULL) {
    239 		ufs_inode_kstat->ks_data = (void *)&ins;
    240 		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
    241 		kstat_install(ufs_inode_kstat);
    242 	}
    243 	ufsfx_init();		/* fix-on-panic initialization */
    244 	si_cache_init();
    245 	ufs_directio_init();
    246 	lufs_init();
    247 	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
    248 }
    249 
    250 /* ARGSUSED */
    251 static int
    252 ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
    253 {
    254 	struct inode *ip = buf;
    255 	struct vnode *vp;
    256 
    257 	vp = ip->i_vnode = vn_alloc(kmflags);
    258 	if (vp == NULL) {
    259 		return (-1);
    260 	}
    261 	vn_setops(vp, ufs_vnodeops);
    262 	vp->v_data = ip;
    263 
    264 	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
    265 	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
    266 	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
    267 	dnlc_dir_init(&ip->i_danchor);
    268 
    269 	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);
    270 
    271 	return (0);
    272 }
    273 
    274 /* ARGSUSED */
    275 static void
    276 ufs_inode_cache_destructor(void *buf, void *cdrarg)
    277 {
    278 	struct inode *ip = buf;
    279 	struct vnode *vp;
    280 
    281 	vp = ITOV(ip);
    282 
    283 	rw_destroy(&ip->i_rwlock);
    284 	rw_destroy(&ip->i_contents);
    285 	mutex_destroy(&ip->i_tlock);
    286 	if (vp->v_type == VDIR) {
    287 		dnlc_dir_fini(&ip->i_danchor);
    288 	}
    289 
    290 	cv_destroy(&ip->i_wrcv);
    291 
    292 	vn_free(vp);
    293 }
    294 
    295 /*
    296  * Initialize hash links for inodes
    297  * and build inode free list.
    298  */
    299 void
    300 ihinit(void)
    301 {
    302 	int i;
    303 	union	ihead *ih = ihead;
    304 
    305 	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);
    306 
    307 	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
    308 	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
    309 	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);
    310 
    311 	for (i = 0, ih = ihead; i < inohsz; i++,  ih++) {
    312 		ih->ih_head[0] = ih;
    313 		ih->ih_head[1] = ih;
    314 		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
    315 	}
    316 	inode_cache = kmem_cache_create("ufs_inode_cache",
    317 	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
    318 	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
    319 	    NULL, NULL, 0);
    320 }
    321 
    322 /*
    323  * Free an inode structure
    324  */
    325 void
    326 ufs_free_inode(struct inode *ip)
    327 {
    328 	vn_invalid(ITOV(ip));
    329 	kmem_cache_free(inode_cache, ip);
    330 }
    331 
    332 /*
    333  * Allocate an inode structure
    334  */
    335 struct inode *
    336 ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
    337 {
    338 	struct inode *ip;
    339 	vnode_t *vp;
    340 
    341 	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
    342 	/*
    343 	 * at this point we have a newly allocated inode
    344 	 */
    345 	ip->i_freef = ip;
    346 	ip->i_freeb = ip;
    347 	ip->i_flag = IREF;
    348 	ip->i_seq = 0xFF;	/* Unique initial value */
    349 	ip->i_dev = ufsvfsp->vfs_dev;
    350 	ip->i_ufsvfs = ufsvfsp;
    351 	ip->i_devvp = ufsvfsp->vfs_devvp;
    352 	ip->i_number = ino;
    353 	ip->i_diroff = 0;
    354 	ip->i_nextr = 0;
    355 	ip->i_map = NULL;
    356 	ip->i_rdev = 0;
    357 	ip->i_writes = 0;
    358 	ip->i_mode = 0;
    359 	ip->i_delaylen = 0;
    360 	ip->i_delayoff = 0;
    361 	ip->i_nextrio = 0;
    362 	ip->i_ufs_acl = NULL;
    363 	ip->i_cflags = 0;
    364 	ip->i_mapcnt = 0;
    365 	ip->i_dquot = NULL;
    366 	ip->i_cachedir = CD_ENABLED;
    367 	ip->i_writer = NULL;
    368 
    369 	/*
    370 	 * the vnode for this inode was allocated by the constructor
    371 	 */
    372 	vp = ITOV(ip);
    373 	vn_reinit(vp);
    374 	if (ino == (ino_t)UFSROOTINO)
    375 		vp->v_flag = VROOT;
    376 	vp->v_vfsp = ufsvfsp->vfs_vfs;
    377 	vn_exists(vp);
    378 	return (ip);
    379 }
    380 
    381 /*
    382  * Look up an inode by device, inumber.  If it is in core (in the
    383  * inode structure), honor the locking protocol.  If it is not in
    384  * core, read it in from the specified device after freeing any pages.
    385  * In all cases, a pointer to a VN_HELD inode structure is returned.
    386  */
    387 int
    388 ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
    389 {
    390 	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
    391 }
    392 
    393 /*
    394  * A version of ufs_iget which returns only allocated, linked inodes.
    395  * This is appropriate for any callers who do not expect a free inode.
    396  */
    397 int
    398 ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    399     struct cred *cr)
    400 {
    401 	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
    402 }
    403 
    404 /*
    405  * Set vnode attributes based on v_type, this should be called whenever
    406  * an inode's i_mode is changed.
    407  */
    408 void
    409 ufs_reset_vnode(vnode_t *vp)
    410 {
    411 	/*
    412 	 * an old DBE hack
    413 	 */
    414 	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
    415 		vp->v_flag |= VSWAPLIKE;
    416 	else
    417 		vp->v_flag &= ~VSWAPLIKE;
    418 
    419 	/*
    420 	 * if not swap like and it's just a regular file, we want
    421 	 * to maintain the vnode's pages sorted by clean/modified
    422 	 * for faster sync'ing to disk
    423 	 */
    424 	if (vp->v_type == VREG)
    425 		vp->v_flag |= VMODSORT;
    426 	else
    427 		vp->v_flag &= ~VMODSORT;
    428 
    429 	/*
    430 	 * Is this an attribute hidden dir?
    431 	 */
    432 	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
    433 		vp->v_flag |= V_XATTRDIR;
    434 	else
    435 		vp->v_flag &= ~V_XATTRDIR;
    436 }
    437 
    438 /*
    439  * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
    440  * flag is used to distinguish the two; when true, we validate that the inode
    441  * being retrieved looks like a linked and allocated inode.
         *
         * The hash chain for 'ino' is searched first.  On a hit the inode is
         * held, taken off the idle list if it was there, and returned through
         * *ipp.  On a miss a new inode is allocated, hashed while its i_contents
         * lock is held as writer, and filled in from the on-disk dinode.
         *
         * Returns 0 with *ipp set on success.  On failure *ipp is not written:
         * the new inode is marked ISTALE and released, and the error from the
         * dinode read or from ufs_si_load() is returned, or EIO when 'validate'
         * rejects the inode.
    442  */
    443 /* ARGSUSED */
    444 static int
    445 ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    446     struct cred *cr, int validate)
    447 {
    448 	struct inode *ip, *sp;
    449 	union ihead *ih;
    450 	kmutex_t *ihm;
    451 	struct buf *bp;
    452 	struct dinode *dp;
    453 	struct vnode *vp;
    454 	extern vfs_t EIO_vfs;
    455 	int error;
    456 	int ftype;	/* XXX - Remove later on */
    457 	dev_t vfs_dev;
    458 	struct ufsvfs *ufsvfsp;
    459 	struct fs *fs;
    460 	int hno;
    461 	daddr_t bno;
    462 	ulong_t ioff;
    463 
    464 	CPU_STATS_ADD_K(sys, ufsiget, 1);
    465 
    466 	/*
    467 	 * Lookup inode in cache.  The bucket and its lock are selected by
         	 * the inode number alone; the device is compared per chain entry.
    468 	 */
    469 	vfs_dev = vfsp->vfs_dev;
    470 	hno = INOHASH(ino);
    471 	ih = &ihead[hno];
    472 	ihm = &ih_lock[hno];
    473 
    474 again:
    475 	mutex_enter(ihm);
    476 	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
    477 		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
    478 		    (ip->i_flag & ISTALE))
    479 			continue;
    480 
    481 		/*
    482 		 * Found the interesting inode; hold it and drop the cache lock
    483 		 */
    484 		vp = ITOV(ip);	/* for locknest */
    485 		VN_HOLD(vp);
    486 		mutex_exit(ihm);
    487 		rw_enter(&ip->i_contents, RW_READER);
    488 
    489 		/*
    490 		 * if necessary, remove from idle list; IREF clear means the
         		 * inode is on an idle list (see ufs_iinactive)
    491 		 */
    492 		if ((ip->i_flag & IREF) == 0) {
    493 			if (ufs_rmidle(ip))
    494 				VN_RELE(vp);
    495 		}
    496 
    497 		/*
    498 		 * The inode may have been marked stale while we waited for
         		 * i_contents (the miss path below sets ISTALE with the lock
         		 * held as writer when the read from disk fails); if so,
         		 * drop our hold and redo the lookup.
    499 		 */
    500 		if (ip->i_flag & ISTALE) {
    501 			rw_exit(&ip->i_contents);
    502 			VN_RELE(vp);
    503 			goto again;
    504 		}
    505 
    506 		ins.in_hits.value.ul++;
    507 		*ipp = ip;
    508 
    509 		/*
    510 		 * Reset the vnode's attribute flags
    511 		 */
    512 		mutex_enter(&vp->v_lock);
    513 		ufs_reset_vnode(vp);
    514 		mutex_exit(&vp->v_lock);
    515 
    516 		rw_exit(&ip->i_contents);
    517 
    518 		return (0);
    519 	}
    520 	mutex_exit(ihm);
    521 
    522 	/*
    523 	 * Inode was not in cache.
    524 	 *
    525 	 * Allocate a new entry
    526 	 */
    527 	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
    528 	fs = ufsvfsp->vfs_fs;
    529 
    530 	ip = ufs_alloc_inode(ufsvfsp, ino);
    531 	vp = ITOV(ip);
    532 
        	/*
        	 * bno is the device block that holds the dinode, ioff the byte
        	 * offset of the dinode within that file system block, and i_doff
        	 * the resulting byte offset of the dinode on the device.
        	 */
    533 	bno = fsbtodb(fs, itod(fs, ino));
    534 	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
    535 	ip->i_doff = (offset_t)ioff + ldbtob(bno);
    536 
    537 	/*
    538 	 * put a place holder in the cache (if not already there).
         	 * ihm was not held during the allocation, so another thread may
         	 * have hashed the same inode meanwhile; in that case discard
         	 * ours and redo the lookup.
    539 	 */
    540 	mutex_enter(ihm);
    541 	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
    542 		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
    543 		    ((sp->i_flag & ISTALE) == 0)) {
    544 			mutex_exit(ihm);
    545 			ufs_free_inode(ip);
    546 			goto again;
    547 		}
    548 	/*
    549 	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
    550 	 * here, but if we do, then shadow inode allocations panic the
    551 	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
    552 	 * and the ufs_iget() parameters don't tell us what we are getting
    553 	 * so we have no way of knowing this is a ufs_iget() call from
    554 	 * a ufs_ialloc() call for a shadow inode.
    555 	 */
        	/*
        	 * i_contents is taken as writer before the inode becomes visible
        	 * in the hash, so a lookup that finds it blocks in rw_enter()
        	 * above until the inode is filled in or marked ISTALE.
        	 */
    556 	rw_enter(&ip->i_contents, RW_WRITER);
    557 	insque(ip, ih);
    558 	mutex_exit(ihm);
    559 	/*
    560 	 * read the dinode
    561 	 */
    562 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);
    563 
    564 	/*
    565 	 * Check I/O errors
    566 	 */
    567 	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
    568 	if (error) {
    569 		brelse(bp);
    570 		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
    571 		rw_exit(&ip->i_contents);
        		/* NOTE(review): EIO_vfs is defined elsewhere; presumably a vfs whose operations fail with EIO - confirm */
    572 		vp->v_vfsp = &EIO_vfs;
    573 		VN_RELE(vp);
    574 		return (error);
    575 	}
    576 	/*
    577 	 * initialize the inode's dinode
    578 	 */
    579 	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
    580 	ip->i_ic = dp->di_ic;			/* structure assignment */
    581 	brelse(bp);
    582 
    583 	/*
    584 	 * Maintain compatibility with Solaris 1.x UFS: the short uid/gid
         	 * fields are authoritative unless they hold the UID_LONG/GID_LONG
         	 * marker.
    585 	 */
    586 	if (ip->i_suid != UID_LONG)
    587 		ip->i_uid = ip->i_suid;
    588 	if (ip->i_sgid != GID_LONG)
    589 		ip->i_gid = ip->i_sgid;
    590 
    591 	ftype = ip->i_mode & IFMT;
    592 	if (ftype == IFBLK || ftype == IFCHR) {
    593 		dev_t dv;
    594 		uint_t top16 = ip->i_ordev & 0xffff0000u;
    595 
        		/*
        		 * The top 16 bits of i_ordev select how it is expanded:
        		 * all zeroes or all ones use expdev(), anything else
        		 * expldev().
        		 */
    596 		if (top16 == 0 || top16 == 0xffff0000u)
    597 			dv = expdev(ip->i_ordev);
    598 		else
    599 			dv = expldev(ip->i_ordev);
    600 		vp->v_rdev = ip->i_rdev = dv;
    601 	}
    602 
    603 	/*
    604 	 * if our caller only expects allocated inodes, verify that
    605 	 * this inode looks good; throw it out if it's bad.
         	 * "Bad" means no file type or a link count of zero or less.
    606 	 */
    607 	if (validate) {
    608 		if ((ftype == 0) || (ip->i_nlink <= 0)) {
    609 			ip->i_flag |= ISTALE;
    610 			rw_exit(&ip->i_contents);
    611 			vp->v_vfsp = &EIO_vfs;
    612 			VN_RELE(vp);
    613 			cmn_err(CE_NOTE,
    614 			    "%s: unexpected free inode %d, run fsck(1M)%s",
    615 			    fs->fs_fsmnt, (int)ino,
    616 			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
    617 			return (EIO);
    618 		}
    619 	}
    620 
    621 	/*
    622 	 * Finish initializing the vnode, special handling for shadow inodes
    623 	 * because IFTOVT() will produce a v_type of VNON which is not what we
    624 	 * want, set v_type to VREG explicitly in that case.
    625 	 */
    626 	if (ftype == IFSHAD) {
    627 		vp->v_type = VREG;
    628 	} else {
    629 		vp->v_type = IFTOVT((mode_t)ip->i_mode);
    630 	}
    631 
    632 	ufs_reset_vnode(vp);
    633 
    634 	/*
    635 	 * read the shadow; on failure the inode is marked stale and
         	 * released, exactly as for a failed dinode read
    636 	 */
    637 	if (ftype != 0 && ip->i_shadow != 0) {
    638 		if ((error = ufs_si_load(ip, cr)) != 0) {
    639 			ip->i_flag |= ISTALE;
    640 			ip->i_ufs_acl = NULL;
    641 			rw_exit(&ip->i_contents);
    642 			vp->v_vfsp = &EIO_vfs;
    643 			VN_RELE(vp);
    644 			return (error);
    645 		}
    646 	}
    647 
    648 	/*
    649 	 * Only attach quota information if the inode has a type and if
    650 	 * that type is neither a shadow inode nor an attribute directory.
    651 	 */
    652 	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
    653 	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
    654 		ip->i_dquot = getinoquota(ip);
    655 	}
    656 	TRANS_MATA_IGET(ufsvfsp, ip);
    657 	*ipp = ip;
    658 	rw_exit(&ip->i_contents);
    659 
    660 	return (0);
    661 }
    662 
    663 /*
    664  * Vnode is no longer referenced, write the inode out
    665  * and if necessary, truncate and deallocate the file.
         *
         * What happens to the inode depends on its state:
         *  - if someone else holds the vnode, only our hold is dropped;
         *  - if i_ufsvfs is NULL (umount case) the inode is freed right away;
         *  - if the inode is not stale, the file system is writable, and the
         *    inode has a mode but no links, it is deleted: nothing is done while
         *    ULOCKFS_IS_NOIDEL() holds, ufs_delete() is called directly when
         *    TRANS_ISTRANS() is false, and otherwise the inode is queued on the
         *    file system's delete queue;
         *  - in every other case the inode is put on the idle queue.
    666  */
    667 void
    668 ufs_iinactive(struct inode *ip)
    669 {
    670 	int		front;
    671 	struct inode	*iq;
    672 	struct inode	*hip;
    673 	struct ufs_q	*uq;
    674 	struct vnode	*vp = ITOV(ip);
    675 	struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        	/* only an address is computed here; it is used on the delete-queue path */
    676 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
    677 
    678 	/*
    679 	 * Because the vnode type might have been changed,
    680 	 * the dnlc_dir_purge must be called unconditionally.
    681 	 */
    682 	dnlc_dir_purge(&ip->i_danchor);
    683 
    684 	/*
    685 	 * Get exclusive access to inode data.
    686 	 */
    687 	rw_enter(&ip->i_contents, RW_WRITER);
    688 	ASSERT(ip->i_flag & IREF);
    689 
    690 	/*
    691 	 * Make sure no one reclaimed the inode before we put it on
    692 	 * the freelist or destroy it. We keep our 'hold' on the vnode
    693 	 * from vn_rele until we are ready to do something with the inode.
    694 	 *
    695 	 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
    696 	 * operation via an async putpage, so we must make sure
    697 	 * we don't free/destroy the inode more than once. ufs_iget
    698 	 * may also put a VN_HOLD on the inode before it grabs
    699 	 * the i_contents lock. This is done so we don't free
    700 	 * an inode that a thread is waiting on.
    701 	 */
    702 	mutex_enter(&vp->v_lock);
    703 
    704 	if (vp->v_count > 1) {
    705 		vp->v_count--;  /* release our hold from vn_rele */
    706 		mutex_exit(&vp->v_lock);
    707 		rw_exit(&ip->i_contents);
    708 		return;
    709 	}
    710 	mutex_exit(&vp->v_lock);
    711 
    712 	/*
    713 	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
    714 	 * and clean.  It can be safely destroyed (cyf).
    715 	 */
    716 	if (ip->i_ufsvfs == NULL) {
    717 		rw_exit(&ip->i_contents);
    718 		ufs_si_del(ip);
    719 		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
    720 		ufs_free_inode(ip);
    721 		return;
    722 	}
    723 
    724 	/*
    725 	 * queue idle inode to appropriate thread. Will check v_count == 1
    726 	 * prior to putting this on the appropriate queue.
    727 	 * Stale inodes will be unhashed and freed by the ufs idle thread
    728 	 * in ufs_idle_free()
    729 	 */
        	/* front is always 1 here, so a deleted inode becomes the queue head */
    730 	front = 1;
    731 	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
    732 	    ip->i_mode && ip->i_nlink <= 0) {
    733 		/*
    734 		 * Mark the i_flag to indicate that inode is being deleted.
    735 		 * This flag will be cleared when the deletion is complete.
    736 		 * This prevents nfs from sneaking in via ufs_vget() while
    737 		 * the delete is in progress (bugid 1242481).
    738 		 */
    739 		ip->i_flag |= IDEL;
    740 
    741 		/*
    742 		 * NOIDEL means that deletes are not allowed at this time;
    743 		 * whoever resets NOIDEL will also send this inode back
    744 		 * through ufs_iinactive.  IREF remains set.
    745 		 */
    746 		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
    747 			mutex_enter(&vp->v_lock);
    748 			vp->v_count--;
    749 			mutex_exit(&vp->v_lock);
    750 			rw_exit(&ip->i_contents);
    751 			return;
    752 		}
        		/*
        		 * When TRANS_ISTRANS() is false the delete is done right
        		 * here; i_contents is dropped before ufs_delete() is called.
        		 */
    753 		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
    754 			rw_exit(&ip->i_contents);
    755 			ufs_delete(ip->i_ufsvfs, ip, 0);
    756 			return;
    757 		}
    758 
    759 		/* queue to delete thread; IREF remains set */
    760 		ins.in_qfree.value.ul++;
    761 		uq = &ip->i_ufsvfs->vfs_delete;
    762 
    763 		mutex_enter(&uq->uq_mutex);
    764 
    765 		/*
         		 * add to q: a circular list linked through i_freef/i_freeb.
         		 * The inode is linked in just before the current head and
         		 * then made the head itself.
         		 */
    766 		if ((iq = uq->uq_ihead) != 0) {
    767 			ip->i_freef = iq;
    768 			ip->i_freeb = iq->i_freeb;
    769 			iq->i_freeb->i_freef = ip;
    770 			iq->i_freeb = ip;
    771 			if (front)
    772 				uq->uq_ihead = ip;
    773 		} else {
    774 			uq->uq_ihead = ip;
    775 			ip->i_freef = ip;
    776 			ip->i_freeb = ip;
    777 		}
    778 
        		/* account for the space this pending delete will give back */
    779 		delq_info->delq_unreclaimed_files += 1;
    780 		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
    781 	} else {
    782 		/*
    783 		 * queue to idle thread
    784 		 *  Check the v_count == 1 again.
    785 		 *
    786 		 */
    787 		mutex_enter(&vp->v_lock);
    788 		if (vp->v_count > 1) {
    789 			vp->v_count--;  /* release our hold from vn_rele */
    790 			mutex_exit(&vp->v_lock);
    791 			rw_exit(&ip->i_contents);
    792 			return;
    793 		}
    794 		mutex_exit(&vp->v_lock);
    795 		uq = &ufs_idle_q;
    796 
    797 		/*
    798 		 * useful iff it has pages or is a fastsymlink; otherwise junk
         		 * (this decides which of the two idle lists is used below)
    799 		 */
    800 		mutex_enter(&uq->uq_mutex);
    801 
    802 		/* clear IREF means `on idle list' */
    803 		ip->i_flag &= ~(IREF | IDIRECTIO);
    804 
    805 		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
    806 			ins.in_frback.value.ul++;
    807 			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
    808 			ufs_nuseful_iq++;
    809 		} else {
    810 			ins.in_frfront.value.ul++;
    811 			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
    812 			ip->i_flag |= IJUNKIQ;
    813 			ufs_njunk_iq++;
    814 		}
        		/* link the inode in just before the list head, i.e. at the tail */
    815 		ip->i_freef = hip;
    816 		ip->i_freeb = hip->i_freeb;
    817 		hip->i_freeb->i_freef = ip;
    818 		hip->i_freeb = ip;
    819 	}
    820 
    821 	/*
         	 * wakeup thread(s) if q is overfull; the broadcast is sent only
         	 * when the entry count reaches uq_lowat exactly
         	 */
    822 	if (++uq->uq_ne == uq->uq_lowat)
    823 		cv_broadcast(&uq->uq_cv);
    824 
    825 	/* all done, release the q and inode */
    826 	mutex_exit(&uq->uq_mutex);
    827 	rw_exit(&ip->i_contents);
    828 }
    829 
    830 /*
    831  * Check accessed and update flags on an inode structure.
    832  * If any are on, update the inode with the (unique) current time.
    833  * If waitfor is given, insure I/O order so wait for write to complete.
    834  */
    835 void
    836 ufs_iupdat(struct inode *ip, int waitfor)
    837 {
    838 	struct buf	*bp;
    839 	struct fs	*fp;
    840 	struct dinode	*dp;
    841 	struct ufsvfs	*ufsvfsp 	= ip->i_ufsvfs;
    842 	int 		i;
    843 	int		do_trans_times;
    844 	ushort_t	flag;
    845 	o_uid_t		suid;
    846 	o_gid_t		sgid;
    847 
    848 	/*
    849 	 * This function is now safe to be called with either the reader
    850 	 * or writer i_contents lock.
    851 	 */
    852 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
    853 
    854 	/*
    855 	 * Return if file system has been forcibly umounted.
    856 	 */
    857 	if (ufsvfsp == NULL)
    858 		return;
    859 
    860 	flag = ip->i_flag;	/* Atomic read */
    861 	/*
    862 	 * We better not update the disk inode from a stale inode.
    863 	 */
    864 	if (flag & ISTALE)
    865 		return;
    866 
    867 	fp = ip->i_fs;
    868 
    869 	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
    870 		if (fp->fs_ronly) {
    871 			mutex_enter(&ip->i_tlock);
    872 			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
    873 			mutex_exit(&ip->i_tlock);
    874 			return;
    875 		}
    876 		/*
    877 		 * fs is active while metadata is being written
    878 		 */
    879 		mutex_enter(&ufsvfsp->vfs_lock);
    880 		ufs_notclean(ufsvfsp);
    881 		/*
    882 		 * get the dinode
    883 		 */
    884 		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
    885 		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
    886 		    (int)fp->fs_bsize);
    887 		if (bp->b_flags & B_ERROR) {
    888 			mutex_enter(&ip->i_tlock);
    889 			ip->i_flag &=
    890 			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
    891 			mutex_exit(&ip->i_tlock);
    892 			brelse(bp);
    893 			return;
    894 		}
    895 		/*
    896 		 * munge inode fields
    897 		 */
    898 		mutex_enter(&ip->i_tlock);
    899 		ITIMES_NOLOCK(ip);
    900 		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
    901 		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
    902 		mutex_exit(&ip->i_tlock);
    903 
    904 		/*
    905 		 * For reads and concurrent re-writes, no deltas were
    906 		 * entered for the access time changes - do it now.
    907 		 */
    908 		if (do_trans_times) {
    909 			TRANS_INODE_TIMES(ufsvfsp, ip);
    910 		}
    911 
    912 		/*
    913 		 * For SunOS 5.0->5.4, these lines below read:
    914 		 *
    915 		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
    916 		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
    917 		 *
    918 		 * where MAXUID was set to 60002.  This was incorrect -
    919 		 * the uids should have been constrained to what fitted into
    920 		 * a 16-bit word.
    921 		 *
    922 		 * This means that files from 4.x filesystems that have an
    923 		 * i_suid field larger than 60002 will have that field
    924 		 * changed to 65535.
    925 		 *
    926 		 * Security note: 4.x UFS could never create a i_suid of
    927 		 * UID_LONG since that would've corresponded to -1.
    928 		 */
    929 		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
    930 		    UID_LONG : ip->i_uid;
    931 		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
    932 		    GID_LONG : ip->i_gid;
    933 
    934 		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
    935 			ip->i_suid = suid;
    936 			ip->i_sgid = sgid;
    937 			TRANS_INODE(ufsvfsp, ip);
    938 		}
    939 
    940 		if ((ip->i_mode & IFMT) == IFBLK ||
    941 		    (ip->i_mode & IFMT) == IFCHR) {
    942 			dev_t d = ip->i_rdev;
    943 			dev32_t dev32;
    944 
    945 			/*
    946 			 * load first direct block only if special device
    947 			 */
    948 			if (!cmpldev(&dev32, d)) {
    949 				/*
    950 				 * We panic here because there's "no way"
    951 				 * we should have been able to create a large
    952 				 * inode with a large dev_t.  Earlier layers
    953 				 * should've caught this.
    954 				 */
    955 				panic("ip %p: i_rdev too big", (void *)ip);
    956 			}
    957 
    958 			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
    959 				ip->i_ordev = dev32;	/* can't use old fmt. */
    960 			} else {
    961 				ip->i_ordev = cmpdev(d);
    962 			}
    963 		}
    964 
    965 		/*
    966 		 * copy inode to dinode (zero fastsymlnk in dinode)
    967 		 */
    968 		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
    969 		dp->di_ic = ip->i_ic;	/* structure assignment */
    970 		if (flag & IFASTSYMLNK) {
    971 			for (i = 1; i < NDADDR; i++)
    972 				dp->di_db[i] = 0;
    973 			for (i = 0; i < NIADDR; i++)
    974 				dp->di_ib[i] = 0;
    975 		}
    976 		if (TRANS_ISTRANS(ufsvfsp)) {
    977 			/*
    978 			 * Pass only a sector size buffer containing
    979 			 * the inode, otherwise when the buffer is copied
    980 			 * into a cached roll buffer then too much memory
    981 			 * gets consumed if 8KB inode buffers are passed.
    982 			 */
    983 			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
    984 			    sizeof (struct dinode),
    985 			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
    986 			    DEV_BSIZE);
    987 
    988 			brelse(bp);
    989 		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
    990 			UFS_BRWRITE(ufsvfsp, bp);
    991 
    992 			/*
    993 			 * Synchronous write has guaranteed that inode
    994 			 * has been written on disk so clear the flag
    995 			 */
    996 			mutex_enter(&ip->i_tlock);
    997 			ip->i_flag &= ~IBDWRITE;
    998 			mutex_exit(&ip->i_tlock);
    999 		} else {
   1000 			bdrwrite(bp);
   1001 
   1002 			/*
   1003 			 * This write hasn't guaranteed that inode has been
   1004 			 * written on the disk.
   1005 			 * Since, all updat flags on inode are cleared, we must
   1006 			 * remember the condition in case inode is to be updated
   1007 			 * synchronously later (e.g.- fsync()/fdatasync())
   1008 			 * and inode has not been modified yet.
   1009 			 */
   1010 			mutex_enter(&ip->i_tlock);
   1011 			ip->i_flag |= IBDWRITE;
   1012 			mutex_exit(&ip->i_tlock);
   1013 		}
   1014 	} else {
   1015 		/*
   1016 		 * In case previous inode update was done asynchronously
   1017 		 * (IBDWRITE) and this inode update request wants guaranteed
   1018 		 * (synchronous) disk update, flush the inode.
   1019 		 */
   1020 		if (waitfor && (flag & IBDWRITE)) {
   1021 			blkflush(ip->i_dev,
   1022 			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
   1023 			mutex_enter(&ip->i_tlock);
   1024 			ip->i_flag &= ~IBDWRITE;
   1025 			mutex_exit(&ip->i_tlock);
   1026 		}
   1027 	}
   1028 }
   1029 
/*
 * Indirection levels.  These values index the per-level arrays used by
 * the truncation code below (the inode's i_ib[] and ufs_itrunc()'s
 * lastiblock[]) and are passed as the "level" argument of indirtrunc().
 */
#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */
   1033 
   1034 /*
   1035  * Release blocks associated with the inode ip and
   1036  * stored in the indirect block bn.  Blocks are free'd
   1037  * in LIFO order up to (but not including) lastbn.  If
   1038  * level is greater than SINGLE, the block is an indirect
   1039  * block and recursive calls to indirtrunc must be used to
   1040  * cleanse other indirect blocks.
   1041  *
   1042  * N.B.: triple indirect blocks are untested.
   1043  */
   1044 static long
   1045 indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
   1046 {
   1047 	int i;
   1048 	struct buf *bp, *copy;
   1049 	daddr32_t *bap;
   1050 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
   1051 	struct fs *fs = ufsvfsp->vfs_fs;
   1052 	daddr_t nb, last;
   1053 	long factor;
   1054 	int blocksreleased = 0, nblocks;
   1055 
   1056 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
   1057 	/*
   1058 	 * Calculate index in current block of last
   1059 	 * block to be kept.  -1 indicates the entire
   1060 	 * block so we need not calculate the index.
   1061 	 */
   1062 	factor = 1;
   1063 	for (i = SINGLE; i < level; i++)
   1064 		factor *= NINDIR(fs);
   1065 	last = lastbn;
   1066 	if (lastbn > 0)
   1067 		last /= factor;
   1068 	nblocks = btodb(fs->fs_bsize);
   1069 	/*
   1070 	 * Get buffer of block pointers, zero those
   1071 	 * entries corresponding to blocks to be free'd,
   1072 	 * and update on disk copy first.
   1073 	 * *Unless* the root pointer has been synchronously
   1074 	 * written to disk.  If nothing points to this
   1075 	 * indirect block then don't bother zero'ing and
   1076 	 * writing it.
   1077 	 */
   1078 	bp = UFS_BREAD(ufsvfsp,
   1079 	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
   1080 	if (bp->b_flags & B_ERROR) {
   1081 		brelse(bp);
   1082 		return (0);
   1083 	}
   1084 	bap = bp->b_un.b_daddr;
   1085 	if ((flags & I_CHEAP) == 0) {
   1086 		uint_t	zb;
   1087 
   1088 		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));
   1089 
   1090 		if (zb) {
   1091 			/*
   1092 			 * push any data into the log before we zero it
   1093 			 */
   1094 			if (bp->b_flags & B_DELWRI)
   1095 				TRANS_LOG(ufsvfsp, (caddr_t)bap,
   1096 				    ldbtob(bp->b_blkno), bp->b_bcount,
   1097 				    bp->b_un.b_addr, bp->b_bcount);
   1098 			copy = ngeteblk(fs->fs_bsize);
   1099 			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
   1100 			    (uint_t)fs->fs_bsize);
   1101 			bzero((caddr_t)&bap[last + 1], zb);
   1102 
   1103 			TRANS_BUF(ufsvfsp,
   1104 			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
   1105 			    zb, bp, DT_ABZERO);
   1106 
   1107 			UFS_BRWRITE(ufsvfsp, bp);
   1108 			bp = copy, bap = bp->b_un.b_daddr;
   1109 		}
   1110 	} else {
   1111 		/* make sure write retries are also cleared */
   1112 		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
   1113 		bp->b_flags |= B_STALE | B_AGE;
   1114 	}
   1115 
   1116 	/*
   1117 	 * Recursively free totally unused blocks.
   1118 	 */
   1119 	flags |= I_CHEAP;
   1120 	for (i = NINDIR(fs) - 1; i > last; i--) {
   1121 		nb = bap[i];
   1122 		if (nb == 0)
   1123 			continue;
   1124 		if (level > SINGLE) {
   1125 			blocksreleased +=
   1126 			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
   1127 			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
   1128 		} else
   1129 			free(ip, nb, (off_t)fs->fs_bsize, flags);
   1130 		blocksreleased += nblocks;
   1131 	}
   1132 	flags &= ~I_CHEAP;
   1133 
   1134 	/*
   1135 	 * Recursively free last partial block.
   1136 	 */
   1137 	if (level > SINGLE && lastbn >= 0) {
   1138 		last = lastbn % factor;
   1139 		nb = bap[i];
   1140 		if (nb != 0)
   1141 			blocksreleased +=
   1142 			    indirtrunc(ip, nb, last, level - 1, flags);
   1143 	}
   1144 	brelse(bp);
   1145 	return (blocksreleased);
   1146 }
   1147 
   1148 /*
   1149  * Truncate the inode ip to at most length size.
   1150  * Free affected disk blocks -- the blocks of the
   1151  * file are removed in reverse order.
   1152  *
   1153  * N.B.: triple indirect blocks are untested.
   1154  */
   1155 static int i_genrand = 1234;
   1156 int
   1157 ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
   1158 {
   1159 	struct fs *fs = oip->i_fs;
   1160 	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
   1161 	struct inode *ip;
   1162 	daddr_t lastblock;
   1163 	off_t bsize;
   1164 	int boff;
   1165 	daddr_t bn, lastiblock[NIADDR];
   1166 	int level;
   1167 	long nblocks, blocksreleased = 0;
   1168 	int i;
   1169 	ushort_t mode;
   1170 	struct inode tip;
   1171 	int err;
   1172 	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
   1173 	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);
   1174 
   1175 	/*
   1176 	 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
   1177 	 * other uses need the reader lock. opendq() holds the writer lock.
   1178 	 */
   1179 	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
   1180 	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
   1181 	ASSERT(RW_WRITE_HELD(&oip->i_contents));
   1182 	/*
   1183 	 * We only allow truncation of regular files and directories
   1184 	 * to arbitrary lengths here.  In addition, we allow symbolic
   1185 	 * links to be truncated only to zero length.  Other inode
   1186 	 * types cannot have their length set here.  Disk blocks are
   1187 	 * being dealt with - especially device inodes where
   1188 	 * ip->i_ordev is actually being stored in ip->i_db[0]!
   1189 	 */
   1190 	TRANS_INODE(ufsvfsp, oip);
   1191 	mode = oip->i_mode & IFMT;
   1192 	if (flags & I_FREE) {
   1193 		i_genrand *= 16843009;  /* turns into shift and adds */
   1194 		i_genrand++;
   1195 		oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
   1196 		oip->i_flag |= ICHG |IUPD;
   1197 		oip->i_seq++;
   1198 		if (length == oip->i_size)
   1199 			return (0);
   1200 		flags |= I_CHEAP;
   1201 	}
   1202 	if (mode == IFIFO)
   1203 		return (0);
   1204 	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
   1205 	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
   1206 		return (EINVAL);
   1207 	if (length > maxoffset)
   1208 		return (EFBIG);
   1209 	if ((mode == IFDIR) || (mode == IFATTRDIR))
   1210 		flags |= I_DIR;
   1211 	if (mode == IFSHAD)
   1212 		flags |= I_SHAD;
   1213 	if (oip == ufsvfsp->vfs_qinod)
   1214 		flags |= I_QUOTA;
   1215 	if (length == oip->i_size) {
   1216 		/* update ctime and mtime to please POSIX tests */
   1217 		oip->i_flag |= ICHG |IUPD;
   1218 		oip->i_seq++;
   1219 		if (length == 0) {
   1220 			/* nothing to cache so clear the flag */
   1221 			oip->i_flag &= ~IFASTSYMLNK;
   1222 		}
   1223 		return (0);
   1224 	}
   1225 	/* wipe out fast symlink till next access */
   1226 	if (oip->i_flag & IFASTSYMLNK) {
   1227 		int j;
   1228 
   1229 		ASSERT(ITOV(oip)->v_type == VLNK);
   1230 
   1231 		oip->i_flag &= ~IFASTSYMLNK;
   1232 
   1233 		for (j = 1; j < NDADDR; j++)
   1234 			oip->i_db[j] = 0;
   1235 		for (j = 0; j < NIADDR; j++)
   1236 			oip->i_ib[j] = 0;
   1237 	}
   1238 
   1239 	boff = (int)blkoff(fs, length);
   1240 
   1241 	if (length > oip->i_size) {
   1242 		/*
   1243 		 * Trunc up case.  BMAPALLOC will insure that the right blocks
   1244 		 * are allocated.  This includes extending the old frag to a
   1245 		 * full block (if needed) in addition to doing any work
   1246 		 * needed for allocating the last block.
   1247 		 */
   1248 		if (boff == 0)
   1249 			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
   1250 		else
   1251 			err = BMAPALLOC(oip, length - 1, boff, cr);
   1252 
   1253 		if (err == 0) {
   1254 			/*
   1255 			 * Save old size and set inode's size now
   1256 			 * so that we don't cause too much of the
   1257 			 * file to be zero'd and pushed.
   1258 			 */
   1259 			u_offset_t osize = oip->i_size;
   1260 			oip->i_size  = length;
   1261 			/*
   1262 			 * Make sure we zero out the remaining bytes of
   1263 			 * the page in case a mmap scribbled on it. We
   1264 			 * can't prevent a mmap from writing beyond EOF
   1265 			 * on the last page of a file.
   1266 			 *
   1267 			 */
   1268 			if ((boff = (int)blkoff(fs, osize)) != 0) {
   1269 				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
   1270 				    fs->fs_bsize : fragroundup(fs, boff);
   1271 				pvn_vpzero(ITOV(oip), osize,
   1272 				    (size_t)(bsize - boff));
   1273 			}
   1274 			oip->i_flag |= ICHG|IATTCHG;
   1275 			oip->i_seq++;
   1276 			ITIMES_NOLOCK(oip);
   1277 			/*
   1278 			 * MAXOFF32_T is old 2GB size limit. If
   1279 			 * this operation caused a large file to be
   1280 			 * created, turn on the superblock flag
   1281 			 * and update the superblock, if the flag
   1282 			 * is not already on.
   1283 			 */
   1284 			if ((length > (u_offset_t)MAXOFF32_T) &&
   1285 			    !(fs->fs_flags & FSLARGEFILES)) {
   1286 				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
   1287 				mutex_enter(&ufsvfsp->vfs_lock);
   1288 				fs->fs_flags |= FSLARGEFILES;
   1289 				ufs_sbwrite(ufsvfsp);
   1290 				mutex_exit(&ufsvfsp->vfs_lock);
   1291 			}
   1292 		}
   1293 
   1294 		return (err);
   1295 	}
   1296 
   1297 	/*
   1298 	 * Update the pages of the file.  If the file is not being
   1299 	 * truncated to a block boundary, the contents of the
   1300 	 * pages following the end of the file must be zero'ed
   1301 	 * in case it ever become accessible again because
   1302 	 * of subsequent file growth.
   1303 	 */
   1304 	if (boff == 0) {
   1305 		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
   1306 		    B_INVAL | B_TRUNC, CRED());
   1307 	} else {
   1308 		/*
   1309 		 * Make sure that the last block is properly allocated.
   1310 		 * We only really have to do this if the last block is
   1311 		 * actually allocated since ufs_bmap will now handle the case
   1312 		 * of an fragment which has no block allocated.  Just to
   1313 		 * be sure, we do it now independent of current allocation.
   1314 		 */
   1315 		err = BMAPALLOC(oip, length - 1, boff, cr);
   1316 		if (err)
   1317 			return (err);
   1318 
   1319 		/*
   1320 		 * BMAPALLOC will call bmap_write which defers i_seq
   1321 		 * processing.  If the timestamps were changed, update
   1322 		 * i_seq before rdip drops i_contents or syncs the inode.
   1323 		 */
   1324 		if (oip->i_flag & (ICHG|IUPD))
   1325 			oip->i_seq++;
   1326 
   1327 		/*
   1328 		 * BugId 4069932
   1329 		 * Make sure that the relevant partial page appears in
   1330 		 * the v_pages list, so that pvn_vpzero() will do its
   1331 		 * job.  Since doing this correctly requires everything
   1332 		 * in rdip() except for the uiomove(), it's easier and
   1333 		 * safer to do the uiomove() rather than duplicate the
   1334 		 * rest of rdip() here.
   1335 		 *
   1336 		 * To get here, we know that length indicates a byte
   1337 		 * that is not the first byte of a block.  (length - 1)
   1338 		 * is the last actual byte known to exist.  Deduction
   1339 		 * shows it is in the same block as byte (length).
   1340 		 * Thus, this rdip() invocation should always succeed
   1341 		 * except in the face of i/o errors, and give us the
   1342 		 * block we care about.
   1343 		 *
   1344 		 * rdip() makes the same locking assertions and
   1345 		 * assumptions as we do.  We do not acquire any locks
   1346 		 * before calling it, so we have not changed the locking
   1347 		 * situation.  Finally, there do not appear to be any
   1348 		 * paths whereby rdip() ends up invoking us again.
   1349 		 * Thus, infinite recursion is avoided.
   1350 		 */
   1351 		{
   1352 			uio_t uio;
   1353 			iovec_t iov[1];
   1354 			char buffer;
   1355 
   1356 			uio.uio_iov = iov;
   1357 			uio.uio_iovcnt = 1;
   1358 			uio.uio_loffset = length - 1;
   1359 			uio.uio_resid = 1;
   1360 			uio.uio_segflg = UIO_SYSSPACE;
   1361 			uio.uio_extflg = UIO_COPY_CACHED;
   1362 
   1363 			iov[0].iov_base = &buffer;
   1364 			iov[0].iov_len = 1;
   1365 
   1366 			err = rdip(oip, &uio, UIO_READ, NULL);
   1367 			if (err)
   1368 				return (err);
   1369 		}
   1370 
   1371 		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
   1372 		    fs->fs_bsize : fragroundup(fs, boff);
   1373 		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
   1374 		/*
   1375 		 * Ensure full fs block is marked as dirty.
   1376 		 */
   1377 		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
   1378 		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
   1379 	}
   1380 
   1381 	/*
   1382 	 * Calculate index into inode's block list of
   1383 	 * last direct and indirect blocks (if any)
   1384 	 * which we want to keep.  Lastblock is -1 when
   1385 	 * the file is truncated to 0.
   1386 	 */
   1387 	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
   1388 	lastiblock[SINGLE] = lastblock - NDADDR;
   1389 	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
   1390 	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
   1391 	nblocks = btodb(fs->fs_bsize);
   1392 
   1393 	/*
   1394 	 * Update file and block pointers
   1395 	 * on disk before we start freeing blocks.
   1396 	 * If we crash before free'ing blocks below,
   1397 	 * the blocks will be returned to the free list.
   1398 	 * lastiblock values are also normalized to -1
   1399 	 * for calls to indirtrunc below.
   1400 	 */
   1401 	tip = *oip;			/* structure copy */
   1402 	ip = &tip;
   1403 
   1404 	for (level = TRIPLE; level >= SINGLE; level--)
   1405 		if (lastiblock[level] < 0) {
   1406 			oip->i_ib[level] = 0;
   1407 			lastiblock[level] = -1;
   1408 		}
   1409 	for (i = NDADDR - 1; i > lastblock; i--) {
   1410 		oip->i_db[i] = 0;
   1411 		flags |= I_CHEAP;
   1412 	}
   1413 	oip->i_size = length;
   1414 	oip->i_flag |= ICHG|IUPD|IATTCHG;
   1415 	oip->i_seq++;
   1416 	if (!TRANS_ISTRANS(ufsvfsp))
   1417 		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */
   1418 
   1419 	/*
   1420 	 * Indirect blocks first.
   1421 	 */
   1422 	for (level = TRIPLE; level >= SINGLE; level--) {
   1423 		bn = ip->i_ib[level];
   1424 		if (bn != 0) {
   1425 			blocksreleased +=
   1426 			    indirtrunc(ip, bn, lastiblock[level], level, flags);
   1427 			if (lastiblock[level] < 0) {
   1428 				ip->i_ib[level] = 0;
   1429 				free(ip, bn, (off_t)fs->fs_bsize,
   1430 				    flags | I_IBLK);
   1431 				blocksreleased += nblocks;
   1432 			}
   1433 		}
   1434 		if (lastiblock[level] >= 0)
   1435 			goto done;
   1436 	}
   1437 
   1438 	/*
   1439 	 * All whole direct blocks or frags.
   1440 	 */
   1441 	for (i = NDADDR - 1; i > lastblock; i--) {
   1442 		bn = ip->i_db[i];
   1443 		if (bn == 0)
   1444 			continue;
   1445 		ip->i_db[i] = 0;
   1446 		bsize = (off_t)blksize(fs, ip, i);
   1447 		free(ip, bn, bsize, flags);
   1448 		blocksreleased += btodb(bsize);
   1449 	}
   1450 	if (lastblock < 0)
   1451 		goto done;
   1452 
   1453 	/*
   1454 	 * Finally, look for a change in size of the
   1455 	 * last direct block; release any frags.
   1456 	 */
   1457 	bn = ip->i_db[lastblock];
   1458 	if (bn != 0) {
   1459 		off_t oldspace, newspace;
   1460 
   1461 		/*
   1462 		 * Calculate amount of space we're giving
   1463 		 * back as old block size minus new block size.
   1464 		 */
   1465 		oldspace = blksize(fs, ip, lastblock);
   1466 		UFS_SET_ISIZE(length, ip);
   1467 		newspace = blksize(fs, ip, lastblock);
   1468 		if (newspace == 0) {
   1469 			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
   1470 			return (err);
   1471 		}
   1472 		if (oldspace - newspace > 0) {
   1473 			/*
   1474 			 * Block number of space to be free'd is
   1475 			 * the old block # plus the number of frags
   1476 			 * required for the storage we're keeping.
   1477 			 */
   1478 			bn += numfrags(fs, newspace);
   1479 			free(ip, bn, oldspace - newspace, flags);
   1480 			blocksreleased += btodb(oldspace - newspace);
   1481 		}
   1482 	}
   1483 done:
   1484 /* BEGIN PARANOIA */
   1485 	for (level = SINGLE; level <= TRIPLE; level++)
   1486 		if (ip->i_ib[level] != oip->i_ib[level]) {
   1487 			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
   1488 			return (err);
   1489 		}
   1490 
   1491 	for (i = 0; i < NDADDR; i++)
   1492 		if (ip->i_db[i] != oip->i_db[i]) {
   1493 			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
   1494 			return (err);
   1495 		}
   1496 /* END PARANOIA */
   1497 	oip->i_blocks -= blocksreleased;
   1498 
   1499 	if (oip->i_blocks < 0) {		/* sanity */
   1500 		cmn_err(CE_NOTE,
   1501 		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
   1502 		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
   1503 		    (int)oip->i_blocks);
   1504 		oip->i_blocks = 0;
   1505 	}
   1506 	oip->i_flag |= ICHG|IATTCHG;
   1507 	oip->i_seq++;
   1508 	/* blocksreleased is >= zero, so this can not fail */
   1509 	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
   1510 	    (size_t *)NULL);
   1511 	return (0);
   1512 }
   1513 
   1514 /*
   1515  * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
   1516  * In the case of WRITE, the read-only status of the file system
   1517  * is checked.  Depending on the calling user, the appropriate
   1518  * mode bits are selected; privileges to override missing permission
   1519  * bits are checked through secpolicy_vnode_access().
   1520  * The i_contens lock must be held as reader here to prevent racing with
   1521  * the acl subsystem removing/setting/changing acls on this inode.
   1522  * The caller is responsible for indicating whether or not the i_contents
   1523  * lock needs to be acquired here or if already held.
   1524  */
   1525 int
   1526 ufs_iaccess(struct inode  *ip, int mode, struct cred *cr, int dolock)
   1527 {
   1528 	int shift = 0;
   1529 	int ret = 0;
   1530 
   1531 	if (dolock)
   1532 		rw_enter(&ip->i_contents, RW_READER);
   1533 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
   1534 
   1535 	if (mode & IWRITE) {
   1536 		/*
   1537 		 * Disallow write attempts on read-only
   1538 		 * file systems, unless the file is a block
   1539 		 * or character device or a FIFO.
   1540 		 */
   1541 		if (ip->i_fs->fs_ronly != 0) {
   1542 			if ((ip->i_mode & IFMT) != IFCHR &&
   1543 			    (ip->i_mode & IFMT) != IFBLK &&
   1544 			    (ip->i_mode & IFMT) != IFIFO) {
   1545 				ret = EROFS;
   1546 				goto out;
   1547 			}
   1548 		}
   1549 	}
   1550 	/*
   1551 	 * If there is an acl, check the acl and return.
   1552 	 */
   1553 	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
   1554 		ret = ufs_acl_access(ip, mode, cr);
   1555 		goto out;
   1556 	}
   1557 
   1558 	/*
   1559 	 * Access check is based on only one of owner, group, public.
   1560 	 * If not owner, then check group.
   1561 	 * If not a member of the group, then check public access.
   1562 	 */
   1563 	if (crgetuid(cr) != ip->i_uid) {
   1564 		shift += 3;
   1565 		if (!groupmember((uid_t)ip->i_gid, cr))
   1566 			shift += 3;
   1567 	}
   1568 
   1569 	mode &= ~(ip->i_mode << shift);
   1570 
   1571 	if (mode == 0)
   1572 		goto out;
   1573 
   1574 	/* test missing privilege bits */
   1575 	ret = secpolicy_vnode_access(cr, ITOV(ip), ip->i_uid, mode);
   1576 out:
   1577 	if (dolock)
   1578 		rw_exit(&ip->i_contents);
   1579 	return (ret);
   1580 }
   1581 
   1582 /*
   1583  * if necessary, remove an inode from the free list
   1584  *	i_contents is held except at unmount
   1585  *
   1586  * Return 1 if the inode is taken off of the ufs_idle_q,
   1587  * and the caller is expected to call VN_RELE.
   1588  *
   1589  * Return 0 otherwise.
   1590  */
   1591 int
   1592 ufs_rmidle(struct inode *ip)
   1593 {
   1594 	int rval = 0;
   1595 
   1596 	mutex_enter(&ip->i_tlock);
   1597 	if ((ip->i_flag & IREF) == 0) {
   1598 		mutex_enter(&ufs_idle_q.uq_mutex);
   1599 		ip->i_freef->i_freeb = ip->i_freeb;
   1600 		ip->i_freeb->i_freef = ip->i_freef;
   1601 		ip->i_freef = ip;
   1602 		ip->i_freeb = ip;
   1603 		ip->i_flag |= IREF;
   1604 		ufs_idle_q.uq_ne--;
   1605 		if (ip->i_flag & IJUNKIQ) {
   1606 			ufs_njunk_iq--;
   1607 			ip->i_flag &= ~IJUNKIQ;
   1608 		} else {
   1609 			ufs_nuseful_iq--;
   1610 		}
   1611 		mutex_exit(&ufs_idle_q.uq_mutex);
   1612 		rval = 1;
   1613 	}
   1614 	mutex_exit(&ip->i_tlock);
   1615 	return (rval);
   1616 }
   1617 
   1618 /*
   1619  * scan the hash of inodes and call func with the inode locked
   1620  */
   1621 int
   1622 ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
   1623 		struct ufsvfs *ufsvfsp)
   1624 {
   1625 	struct inode		*ip;		/* current inode */
   1626 	struct inode		*lip = NULL;	/* last/previous inode */
   1627 	union ihead		*ih;		/* current hash chain */
   1628 	int			error, i;
   1629 	int			saverror = 0;
   1630 	int			lip_held;	/* lip needs a VN_RELE() */
   1631 
   1632 	/*
   1633 	 * If ufsvfsp is NULL, then our caller should be holding
   1634 	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
   1635 	 * ufs_update().  Otherwise, to avoid false-positives in
   1636 	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
   1637 	 * those inodes that are in the file system our caller cares
   1638 	 * about.
   1639 	 *
   1640 	 * We know that ip is a valid inode in the hash chain (and thus
   1641 	 * we can trust i_ufsvfs) because the inode we chained from
   1642 	 * (lip) is still in the hash chain.  This is true because either:
   1643 	 *
   1644 	 * 1. We did not drop the hash chain lock since the last
   1645 	 *    iteration (because we were not interested in the last inode),
   1646 	 * or
   1647 	 * 2. We maintained a hold on the last inode while we
   1648 	 *    we were processing it, so it could not be removed
   1649 	 *    from the hash chain.
   1650 	 *
   1651 	 * The whole reason we're dropping and re-grabbing the chain
   1652 	 * lock on every inode is so that we don't present a major
   1653 	 * choke point on throughput, particularly when we've been
   1654 	 * called on behalf of fsflush.
   1655 	 */
   1656 
   1657 	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
   1658 		mutex_enter(&ih_lock[i]);
   1659 		for (ip = ih->ih_chain[0], lip_held = 0;
   1660 		    ip != (struct inode *)ih;
   1661 		    ip = lip->i_forw) {
   1662 
   1663 			ins.in_scan.value.ul++;
   1664 
   1665 			/*
   1666 			 * Undo the previous iteration's VN_HOLD(), but
   1667 			 * only if one was done.
   1668 			 */
   1669 			if (lip_held)
   1670 				VN_RELE(ITOV(lip));
   1671 
   1672 			lip = ip;
   1673 			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
   1674 				/*
   1675 				 * We're not processing all inodes, and
   1676 				 * this inode is not in the filesystem of
   1677 				 * interest, so skip it.  No need to do a
   1678 				 * VN_HOLD() since we're not dropping the
   1679 				 * hash chain lock until after we've
   1680 				 * done the i_forw traversal above.
   1681 				 */
   1682 				lip_held = 0;
   1683 				continue;
   1684 			}
   1685 			VN_HOLD(ITOV(ip));
   1686 			lip_held = 1;
   1687 			mutex_exit(&ih_lock[i]);
   1688 
   1689 			/*
   1690 			 * Acquire the contents lock as writer to make
   1691 			 * sure that the inode has been initialized in
   1692 			 * the cache or removed from the idle list by
   1693 			 * ufs_iget().  This works because ufs_iget()
   1694 			 * acquires the contents lock before putting
   1695 			 * the inode into the cache.  If we can lock
   1696 			 * it, then he's done with it.
   1697 			 */
   1698 
   1699 			if (rwtry) {
   1700 				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
   1701 					mutex_enter(&ih_lock[i]);
   1702 					continue;
   1703 				}
   1704 			} else {
   1705 				rw_enter(&ip->i_contents, RW_WRITER);
   1706 			}
   1707 
   1708 			rw_exit(&ip->i_contents);
   1709 
   1710 			/*
   1711 			 * ISTALE means the inode couldn't be read
   1712 			 *
   1713 			 * We don't have to hold the i_contents lock
   1714 			 * for this check for a couple of
   1715 			 * reasons. First, if ISTALE is set then the
   1716 			 * flag cannot be cleared until the inode is
   1717 			 * removed from the cache and that cannot
   1718 			 * happen until after we VN_RELE() it.
   1719 			 * Second, if ISTALE is not set, then the
   1720 			 * inode is in the cache and does not need to
   1721 			 * be read from disk so ISTALE cannot be set
   1722 			 * while we are not looking.
   1723 			 */
   1724 			if ((ip->i_flag & ISTALE) == 0) {
   1725 				if ((error = (*func)(ip, arg)) != 0)
   1726 					saverror = error;
   1727 			}
   1728 
   1729 			mutex_enter(&ih_lock[i]);
   1730 		}
   1731 		if (lip_held)
   1732 			VN_RELE(ITOV(lip));
   1733 		mutex_exit(&ih_lock[i]);
   1734 	}
   1735 	return (saverror);
   1736 }
   1737 
   1738 /*
   1739  * Mark inode with the current time, plus a unique increment.
   1740  *
   1741  * Since we only keep 32-bit time on disk, if UFS is still alive
   1742  * beyond 2038, filesystem times will simply stick at the last
   1743  * possible second of 32-bit time. Not ideal, but probably better
   1744  * than going into the remote past, or confusing applications with
   1745  * negative time.
   1746  */
   1747 void
   1748 ufs_imark(struct inode *ip)
   1749 {
   1750 	timestruc_t now;
   1751 	int32_t usec, nsec;
   1752 
   1753 	/*
   1754 	 * The update of i_seq may have been deferred, increase i_seq here
   1755 	 * to make sure it is in sync with the timestamps.
   1756 	 */
   1757 	if (ip->i_flag & ISEQ) {
   1758 		ASSERT(ip->i_flag & (IUPD|ICHG));
   1759 		ip->i_seq++;
   1760 		ip->i_flag &= ~ISEQ;
   1761 	}
   1762 
   1763 	gethrestime(&now);
   1764 
   1765 	/*
   1766 	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
   1767 	 * in common/os/timers.c for a full description.
   1768 	 */
   1769 	nsec = now.tv_nsec;
   1770 	usec = nsec + (nsec >> 2);
   1771 	usec = nsec + (usec >> 1);
   1772 	usec = nsec + (usec >> 2);
   1773 	usec = nsec + (usec >> 4);
   1774 	usec = nsec - (usec >> 3);
   1775 	usec = nsec + (usec >> 2);
   1776 	usec = nsec + (usec >> 3);
   1777 	usec = nsec + (usec >> 4);
   1778 	usec = nsec + (usec >> 1);
   1779 	usec = nsec + (usec >> 6);
   1780 	usec = usec >> 10;
   1781 
   1782 	mutex_enter(&ufs_iuniqtime_lock);
   1783 	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
   1784 	    usec > iuniqtime.tv_usec) {
   1785 		if (now.tv_sec < TIME32_MAX) {
   1786 			iuniqtime.tv_sec = (time32_t)now.tv_sec;
   1787 			iuniqtime.tv_usec = usec;
   1788 		}
   1789 	} else {
   1790 		if (iuniqtime.tv_sec < TIME32_MAX) {
   1791 			iuniqtime.tv_usec++;
   1792 			/* Check for usec overflow */
   1793 			if (iuniqtime.tv_usec >= MICROSEC) {
   1794 				iuniqtime.tv_sec++;
   1795 				iuniqtime.tv_usec = 0;
   1796 			}
   1797 		}
   1798 	}
   1799 
   1800 	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
   1801 		ip->i_atime = iuniqtime;
   1802 	}
   1803 	if (ip->i_flag & IUPD) {
   1804 		ip->i_mtime = iuniqtime;
   1805 		ip->i_flag |= IMODTIME;
   1806 	}
   1807 	if (ip->i_flag & ICHG) {
   1808 		ip->i_diroff = 0;
   1809 		ip->i_ctime = iuniqtime;
   1810 	}
   1811 	mutex_exit(&ufs_iuniqtime_lock);
   1812 }
   1813 
   1814 /*
   1815  * Update timestamps in inode.
   1816  */
   1817 void
   1818 ufs_itimes_nolock(struct inode *ip)
   1819 {
   1820 
   1821 	/*
   1822 	 * if noatime is set and the inode access time is the only field that
   1823 	 * must be changed, exit immediately.
   1824 	 */
   1825 	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
   1826 	    (ip->i_ufsvfs->vfs_noatime)) {
   1827 		return;
   1828 	}
   1829 
   1830 	if (ip->i_flag & (IUPD|IACC|ICHG)) {
   1831 		if (ip->i_flag & ICHG)
   1832 			ip->i_flag |= IMOD;
   1833 		else
   1834 			ip->i_flag |= IMODACC;
   1835 		ufs_imark(ip);
   1836 		ip->i_flag &= ~(IACC|IUPD|ICHG);
   1837 	}
   1838 }
   1839