Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #include <sys/types.h>
     40 #include <sys/t_lock.h>
     41 #include <sys/param.h>
     42 #include <sys/time.h>
     43 #include <sys/fs/ufs_fs.h>
     44 #include <sys/cmn_err.h>
     45 
     46 #ifdef _KERNEL
     47 
     48 #include <sys/systm.h>
     49 #include <sys/sysmacros.h>
     50 #include <sys/buf.h>
     51 #include <sys/conf.h>
     52 #include <sys/user.h>
     53 #include <sys/var.h>
     54 #include <sys/vfs.h>
     55 #include <sys/vnode.h>
     56 #include <sys/proc.h>
     57 #include <sys/debug.h>
     58 #include <sys/fssnap_if.h>
     59 #include <sys/fs/ufs_inode.h>
     60 #include <sys/fs/ufs_trans.h>
     61 #include <sys/fs/ufs_panic.h>
     62 #include <sys/fs/ufs_bio.h>
     63 #include <sys/fs/ufs_log.h>
     64 #include <sys/kmem.h>
     65 #include <sys/policy.h>
     66 #include <vm/hat.h>
     67 #include <vm/as.h>
     68 #include <vm/seg.h>
     69 #include <vm/pvn.h>
     70 #include <vm/seg_map.h>
     71 #include <sys/swap.h>
     72 #include <vm/seg_kmem.h>
     73 
     74 #else  /* _KERNEL */
     75 
     76 #define	ASSERT(x)		/* don't use asserts for fsck et al */
     77 
     78 #endif  /* _KERNEL */
     79 
     80 #ifdef _KERNEL
     81 
     82 /*
     83  * Used to verify that a given entry on the ufs_instances list (see below)
     84  * still refers to a mounted file system.
     85  *
     86  * XXX:	This is a crock that substitutes for proper locking to coordinate
     87  *	updates to and uses of the entries in ufs_instances.
     88  */
struct check_node {
	struct vfs *vfsp;	/* vfs recorded when ufs_update built the entry */
	struct ufsvfs *ufsvfs;	/* ufs instance recorded together with vfsp */
	dev_t vfs_dev;		/* copy of vfsp->vfs_dev taken at that time */
};

/*
 * Per the commentary at its call site in ufs_update: returns the vfs, with
 * the vfs lock held, when the entry still refers to a mounted file system;
 * ufs_update treats a NULL return as "skip this entry".
 */
static vfs_t *still_mounted(struct check_node *);
     96 
     97 /*
     98  * All ufs file system instances are linked together into a list starting at
     99  * ufs_instances.  The list is updated as part of mount and unmount.  It's
    100  * consulted in ufs_update, to allow syncing out all ufs file system instances
    101  * in a batch.
    102  *
    103  * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
    104  * manipulated in ufs_funmount_cleanup.  (A given ufs instance is always on
    105  * exactly one of these lists except while it's being allocated or
    106  * deallocated.)
    107  */
/* Head of the ufs instance list; entries are chained through vfs_next. */
struct ufsvfs	*ufs_instances;
extern kmutex_t		ufsvfs_mutex;	/* XXX: move this to ufs_inode.h? */
    110 
    111 /*
    112  * ufsvfs list manipulation routines
    113  */
    114 
    115 /*
    116  * Link ufsp in at the head of the list of ufs_instances.
    117  */
    118 void
    119 ufs_vfs_add(struct ufsvfs *ufsp)
    120 {
    121 	mutex_enter(&ufsvfs_mutex);
    122 	ufsp->vfs_next = ufs_instances;
    123 	ufs_instances = ufsp;
    124 	mutex_exit(&ufsvfs_mutex);
    125 }
    126 
    127 /*
    128  * Remove ufsp from the list of ufs_instances.
    129  *
    130  * Does no error checking; ufsp is assumed to actually be on the list.
    131  */
    132 void
    133 ufs_vfs_remove(struct ufsvfs *ufsp)
    134 {
    135 	struct ufsvfs	**delpt = &ufs_instances;
    136 
    137 	mutex_enter(&ufsvfs_mutex);
    138 	for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
    139 		if (*delpt == ufsp) {
    140 			*delpt = ufsp->vfs_next;
    141 			ufsp->vfs_next = NULL;
    142 			break;
    143 		}
    144 	}
    145 	mutex_exit(&ufsvfs_mutex);
    146 }
    147 
    148 /*
    149  * Clean up state resulting from a forcible unmount that couldn't be handled
    150  * directly during the unmount.  (See commentary in the unmount code for more
    151  * info.)
    152  */
    153 static void
    154 ufs_funmount_cleanup()
    155 {
    156 	struct ufsvfs		*ufsvfsp;
    157 	extern struct ufsvfs	*oldufsvfslist, *ufsvfslist;
    158 
    159 	/*
    160 	 * Assumption: it's now safe to blow away the entries on
    161 	 * oldufsvfslist.
    162 	 */
    163 	mutex_enter(&ufsvfs_mutex);
    164 	while ((ufsvfsp = oldufsvfslist) != NULL) {
    165 		oldufsvfslist = ufsvfsp->vfs_next;
    166 
    167 		mutex_destroy(&ufsvfsp->vfs_lock);
    168 		kmem_free(ufsvfsp, sizeof (struct ufsvfs));
    169 	}
    170 	/*
    171 	 * Rotate more recent unmount entries into place in preparation for
    172 	 * the next time around.
    173 	 */
    174 	oldufsvfslist = ufsvfslist;
    175 	ufsvfslist = NULL;
    176 	mutex_exit(&ufsvfs_mutex);
    177 }
    178 
    179 
    180 /*
    181  * ufs_update performs the ufs part of `sync'.  It goes through the disk
    182  * queues to initiate sandbagged IO; goes through the inodes to write
    183  * modified nodes; and it goes through the mount table to initiate
    184  * the writing of the modified super blocks.
    185  */
extern time_t	time;
/* Value of `time' captured by ufs_update after its superblock pass. */
time_t		ufs_sync_time;
/*
 * Number of seconds past ufs_sync_time during which ufs_sync_inode keeps
 * pushing access-time-only inode updates on a logging file system.
 */
time_t		ufs_sync_time_secs = 1;

/* Held by ufs_update around its call to ufs_scan_inodes. */
extern kmutex_t	ufs_scan_lock;
    191 
    192 void
    193 ufs_update(int flag)
    194 {
    195 	struct vfs *vfsp;
    196 	struct fs *fs;
    197 	struct ufsvfs *ufsp;
    198 	struct ufsvfs *ufsnext;
    199 	struct ufsvfs *update_list = NULL;
    200 	int check_cnt = 0;
    201 	size_t check_size;
    202 	struct check_node *check_list, *ptr;
    203 	int cheap = flag & SYNC_ATTR;
    204 
    205 	/*
    206 	 * This is a hack.  A design flaw in the forced unmount protocol
    207 	 * could allow a thread to attempt to use a kmem_freed ufsvfs
    208 	 * structure in ufs_lockfs_begin/ufs_check_lockfs.  This window
    209 	 * is difficult to hit, even during the lockfs stress tests.
    210 	 * So the hacky fix is to wait awhile before kmem_free'ing the
    211 	 * ufsvfs structures for forcibly unmounted file systems.  `Awhile'
    212 	 * is defined as every other call from fsflush (~60 seconds).
    213 	 */
    214 	if (cheap)
    215 		ufs_funmount_cleanup();
    216 
    217 	/*
    218 	 * Examine all ufsvfs structures and add those that we can lock to the
    219 	 * update list.  This is so that we don't hold the list lock for a
    220 	 * long time.  If vfs_lock fails for a file system instance, then skip
    221 	 * it because somebody is doing a unmount on it.
    222 	 */
    223 	mutex_enter(&ufsvfs_mutex);
    224 	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
    225 		vfsp = ufsp->vfs_vfs;
    226 		if (vfs_lock(vfsp) != 0)
    227 			continue;
    228 		ufsp->vfs_wnext = update_list;
    229 		update_list = ufsp;
    230 		check_cnt++;
    231 	}
    232 	mutex_exit(&ufsvfs_mutex);
    233 
    234 	if (update_list == NULL)
    235 		return;
    236 
    237 	check_size = sizeof (struct check_node) * check_cnt;
    238 	check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);
    239 
    240 	/*
    241 	 * Write back modified superblocks.
    242 	 * Consistency check that the superblock of
    243 	 * each file system is still in the buffer cache.
    244 	 *
    245 	 * Note that the update_list traversal is done without the protection
    246 	 * of an overall list lock, so it's necessary to rely on the fact that
    247 	 * each entry of the list is vfs_locked when moving from one entry to
    248 	 * the next.  This works because a concurrent attempt to add an entry
    249 	 * to another thread's update_list won't find it, since it'll already
    250 	 * be locked.
    251 	 */
    252 	check_cnt = 0;
    253 	for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
    254 		/*
    255 		 * Need to grab the next ptr before we unlock this one so
    256 		 * another thread doesn't grab it and change it before we move
    257 		 * on to the next vfs.  (Once we unlock it, it's ok if another
    258 		 * thread finds it to add it to its own update_list; we don't
    259 		 * attempt to refer to it through our list any more.)
    260 		 */
    261 		ufsnext = ufsp->vfs_wnext;
    262 		vfsp = ufsp->vfs_vfs;
    263 
    264 		/*
    265 		 * Seems like this can't happen, so perhaps it should become
    266 		 * an ASSERT(vfsp->vfs_data != NULL).
    267 		 */
    268 		if (!vfsp->vfs_data) {
    269 			vfs_unlock(vfsp);
    270 			continue;
    271 		}
    272 
    273 		fs = ufsp->vfs_fs;
    274 
    275 		/*
    276 		 * don't update a locked superblock during a panic; it
    277 		 * may be in an inconsistent state
    278 		 */
    279 		if (panicstr) {
    280 			if (!mutex_tryenter(&ufsp->vfs_lock)) {
    281 				vfs_unlock(vfsp);
    282 				continue;
    283 			}
    284 		} else
    285 			mutex_enter(&ufsp->vfs_lock);
    286 		/*
    287 		 * Build up the STABLE check list, so we can unlock the vfs
    288 		 * until we do the actual checking.
    289 		 */
    290 		if (check_list != NULL) {
    291 			if ((fs->fs_ronly == 0) &&
    292 			    (fs->fs_clean != FSBAD) &&
    293 			    (fs->fs_clean != FSSUSPEND)) {
    294 				ptr->vfsp = vfsp;
    295 				ptr->ufsvfs = ufsp;
    296 				ptr->vfs_dev = vfsp->vfs_dev;
    297 				ptr++;
    298 				check_cnt++;
    299 			}
    300 		}
    301 
    302 		/*
    303 		 * superblock is not modified
    304 		 */
    305 		if (fs->fs_fmod == 0) {
    306 			mutex_exit(&ufsp->vfs_lock);
    307 			vfs_unlock(vfsp);
    308 			continue;
    309 		}
    310 		if (fs->fs_ronly != 0) {
    311 			mutex_exit(&ufsp->vfs_lock);
    312 			vfs_unlock(vfsp);
    313 			(void) ufs_fault(ufsp->vfs_root,
    314 			    "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
    315 			/*
    316 			 * XXX:	Why is this a return instead of a continue?
    317 			 *	This may be an attempt to replace a panic with
    318 			 *	something less drastic, but there's cleanup we
    319 			 *	should be doing that's not being done (e.g.,
    320 			 *	unlocking the remaining entries on the list).
    321 			 */
    322 			return;
    323 		}
    324 		fs->fs_fmod = 0;
    325 		mutex_exit(&ufsp->vfs_lock);
    326 		TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
    327 		vfs_unlock(vfsp);
    328 	}
    329 
    330 	ufs_sync_time = time;
    331 
    332 	/*
    333 	 * Avoid racing with ufs_unmount() and ufs_sync().
    334 	 */
    335 	mutex_enter(&ufs_scan_lock);
    336 
    337 	(void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
    338 	    NULL);
    339 
    340 	mutex_exit(&ufs_scan_lock);
    341 
    342 	/*
    343 	 * Force stale buffer cache information to be flushed,
    344 	 * for all devices.  This should cause any remaining control
    345 	 * information (e.g., cg and inode info) to be flushed back.
    346 	 */
    347 	bflush((dev_t)NODEV);
    348 
    349 	if (check_list == NULL)
    350 		return;
    351 
    352 	/*
    353 	 * For each UFS filesystem in the STABLE check_list, update
    354 	 * the clean flag if warranted.
    355 	 */
    356 	for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
    357 		int	error;
    358 
    359 		/*
    360 		 * still_mounted() returns with vfsp and the vfs_reflock
    361 		 * held if ptr refers to a vfs that is still mounted.
    362 		 */
    363 		if ((vfsp = still_mounted(ptr)) == NULL)
    364 			continue;
    365 		ufs_checkclean(vfsp);
    366 		/*
    367 		 * commit any outstanding async transactions
    368 		 */
    369 		ufsp = (struct ufsvfs *)vfsp->vfs_data;
    370 		curthread->t_flag |= T_DONTBLOCK;
    371 		TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
    372 		    error);
    373 		if (!error) {
    374 			TRANS_END_SYNC(ufsp, error, TOP_COMMIT_UPDATE,
    375 			    TOP_COMMIT_SIZE);
    376 		}
    377 		curthread->t_flag &= ~T_DONTBLOCK;
    378 
    379 		vfs_unlock(vfsp);
    380 	}
    381 
    382 	kmem_free(check_list, check_size);
    383 }
    384 
    385 int
    386 ufs_sync_inode(struct inode *ip, void *arg)
    387 {
    388 	int cheap = (int)(uintptr_t)arg;
    389 	struct ufsvfs *ufsvfsp;
    390 	uint_t flag = ip->i_flag;
    391 
    392 	if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
    393 		return (0);
    394 
    395 	/*
    396 	 * if we are panic'ing; then don't update the inode if this
    397 	 * file system is FSSTABLE.  Otherwise, we would have to
    398 	 * force the superblock to FSACTIVE and the superblock
    399 	 * may not be in a good state.  Also, if the inode is
    400 	 * IREF'ed then it may be in an inconsistent state.  Don't
    401 	 * push it.  Finally, don't push the inode if the fs is
    402 	 * logging; the transaction will be discarded at boot.
    403 	 */
    404 	if (panicstr) {
    405 
    406 		if (flag & IREF)
    407 			return (0);
    408 
    409 		if (ip->i_ufsvfs == NULL ||
    410 		    (ip->i_fs->fs_clean == FSSTABLE ||
    411 		    ip->i_fs->fs_clean == FSLOG))
    412 				return (0);
    413 	}
    414 
    415 	ufsvfsp = ip->i_ufsvfs;
    416 
    417 	/*
    418 	 * Limit access time only updates
    419 	 */
    420 	if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
    421 		/*
    422 		 * if file system has deferred access time turned on and there
    423 		 * was no IO recently, don't bother flushing it. It will be
    424 		 * flushed when I/Os start again.
    425 		 */
    426 		if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
    427 		    (ufsvfsp->vfs_iotstamp + ufs_iowait < ddi_get_lbolt()))
    428 			return (0);
    429 		/*
    430 		 * an app issueing a sync() can take forever on a trans device
    431 		 * when NetWorker or find is running because all of the
    432 		 * directorys' access times have to be updated. So, we limit
    433 		 * the time we spend updating access times per sync.
    434 		 */
    435 		if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
    436 		    ufs_sync_time_secs) < time))
    437 			return (0);
    438 	}
    439 
    440 	/*
    441 	 * if we are running on behalf of the flush thread or this is
    442 	 * a swap file, then simply do a delay update of the inode.
    443 	 * Otherwise, push the pages and then do a delayed inode update.
    444 	 */
    445 	if (cheap || IS_SWAPVP(ITOV(ip))) {
    446 		TRANS_IUPDAT(ip, 0);
    447 	} else {
    448 		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
    449 	}
    450 	return (0);
    451 }
    452 
    453 /*
    454  * Flush all the pages associated with an inode using the given 'flags',
    455  * then force inode information to be written back using the given 'waitfor'.
    456  */
    457 int
    458 ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
    459 {
    460 	int	error;
    461 	struct vnode *vp = ITOV(ip);
    462 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    463 	int dotrans = 0;
    464 
    465 	/*
    466 	 * Return if file system has been forcibly umounted.
    467 	 */
    468 	if (ufsvfsp == NULL)
    469 		return (EIO);
    470 	/*
    471 	 * don't need to VOP_PUTPAGE if there are no pages
    472 	 */
    473 	if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
    474 		error = 0;
    475 	} else {
    476 		/*
    477 		 * if the inode we're working on is a shadow inode
    478 		 * or quota inode we need to make sure that the
    479 		 * ufs_putpage call is inside a transaction as this
    480 		 * could include meta data changes.
    481 		 */
    482 		if ((ip->i_mode & IFMT) == IFSHAD ||
    483 		    ufsvfsp->vfs_qinod == ip) {
    484 			dotrans = 1;
    485 			curthread->t_flag |= T_DONTBLOCK;
    486 			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
    487 			    TOP_PUTPAGE_SIZE(ip));
    488 		}
    489 		error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
    490 		    flags, CRED(), NULL);
    491 		if (dotrans) {
    492 			TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
    493 			    TOP_PUTPAGE_SIZE(ip));
    494 			curthread->t_flag &= ~T_DONTBLOCK;
    495 			dotrans = 0;
    496 		}
    497 	}
    498 	if (panicstr && TRANS_ISTRANS(ufsvfsp))
    499 		goto out;
    500 	/*
    501 	 * waitfor represents two things -
    502 	 * 1. whether data sync or file sync.
    503 	 * 2. if file sync then ufs_iupdat should 'waitfor' disk i/o or not.
    504 	 */
    505 	if (waitfor == I_DSYNC) {
    506 		/*
    507 		 * If data sync, only IATTCHG (size/block change) requires
    508 		 * inode update, fdatasync()/FDSYNC implementation.
    509 		 */
    510 		if (ip->i_flag & (IBDWRITE|IATTCHG)) {
    511 			/*
    512 			 * Enter a transaction to provide mutual exclusion
    513 			 * with deltamap_push and avoid a race where
    514 			 * the inode flush could get dropped.
    515 			 */
    516 			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
    517 				dotrans = 1;
    518 				curthread->t_flag |= T_DONTBLOCK;
    519 				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
    520 				    TOP_SYNCIP_SIZE);
    521 			}
    522 			rw_enter(&ip->i_contents, RW_READER);
    523 			mutex_enter(&ip->i_tlock);
    524 			ip->i_flag &= ~IMODTIME;
    525 			mutex_exit(&ip->i_tlock);
    526 			ufs_iupdat(ip, 1);
    527 			rw_exit(&ip->i_contents);
    528 			if (dotrans) {
    529 				TRANS_END_ASYNC(ufsvfsp, topid,
    530 				    TOP_SYNCIP_SIZE);
    531 				curthread->t_flag &= ~T_DONTBLOCK;
    532 			}
    533 		}
    534 	} else {
    535 		/* For file sync, any inode change requires inode update */
    536 		if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
    537 			/*
    538 			 * Enter a transaction to provide mutual exclusion
    539 			 * with deltamap_push and avoid a race where
    540 			 * the inode flush could get dropped.
    541 			 */
    542 			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
    543 				dotrans = 1;
    544 				curthread->t_flag |= T_DONTBLOCK;
    545 				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
    546 				    TOP_SYNCIP_SIZE);
    547 			}
    548 			rw_enter(&ip->i_contents, RW_READER);
    549 			mutex_enter(&ip->i_tlock);
    550 			ip->i_flag &= ~IMODTIME;
    551 			mutex_exit(&ip->i_tlock);
    552 			ufs_iupdat(ip, waitfor);
    553 			rw_exit(&ip->i_contents);
    554 			if (dotrans) {
    555 				TRANS_END_ASYNC(ufsvfsp, topid,
    556 				    TOP_SYNCIP_SIZE);
    557 				curthread->t_flag &= ~T_DONTBLOCK;
    558 			}
    559 		}
    560 	}
    561 
    562 out:
    563 	return (error);
    564 }
    565 /*
    566  * Flush all indirect blocks related to an inode.
    567  * Supports triple indirect blocks also.
    568  */
    569 int
    570 ufs_sync_indir(struct inode *ip)
    571 {
    572 	int i;
    573 	daddr_t blkno;
    574 	daddr_t lbn;	/* logical blkno of last blk in file */
    575 	daddr_t clbn;	/* current logical blk */
    576 	daddr32_t *bap;
    577 	struct fs *fs;
    578 	struct buf *bp;
    579 	int bsize;
    580 	struct ufsvfs *ufsvfsp;
    581 	int j;
    582 	daddr_t indirect_blkno;
    583 	daddr32_t *indirect_bap;
    584 	struct buf *indirect_bp;
    585 
    586 	ufsvfsp = ip->i_ufsvfs;
    587 	/*
    588 	 * unnecessary when logging; allocation blocks are kept up-to-date
    589 	 */
    590 	if (TRANS_ISTRANS(ufsvfsp))
    591 		return (0);
    592 
    593 	fs = ufsvfsp->vfs_fs;
    594 	bsize = fs->fs_bsize;
    595 	lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
    596 	if (lbn < NDADDR)
    597 		return (0);	/* No indirect blocks used */
    598 	if (lbn < NDADDR + NINDIR(fs)) {
    599 		/* File has one indirect block. */
    600 		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
    601 		return (0);
    602 	}
    603 
    604 	/* Write out all the first level indirect blocks */
    605 	for (i = 0; i <= NIADDR; i++) {
    606 		if ((blkno = ip->i_ib[i]) == 0)
    607 			continue;
    608 		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
    609 	}
    610 	/* Write out second level of indirect blocks */
    611 	if ((blkno = ip->i_ib[1]) == 0)
    612 		return (0);
    613 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
    614 	if (bp->b_flags & B_ERROR) {
    615 		brelse(bp);
    616 		return (EIO);
    617 	}
    618 	bap = bp->b_un.b_daddr;
    619 	clbn = NDADDR + NINDIR(fs);
    620 	for (i = 0; i < NINDIR(fs); i++) {
    621 		if (clbn > lbn)
    622 			break;
    623 		clbn += NINDIR(fs);
    624 		if ((blkno = bap[i]) == 0)
    625 			continue;
    626 		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
    627 	}
    628 
    629 	brelse(bp);
    630 	/* write out third level indirect blocks */
    631 
    632 	if ((blkno = ip->i_ib[2]) == 0)
    633 		return (0);
    634 
    635 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
    636 	if (bp->b_flags & B_ERROR) {
    637 		brelse(bp);
    638 		return (EIO);
    639 	}
    640 	bap = bp->b_un.b_daddr;
    641 	clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));
    642 
    643 	for (i = 0; i < NINDIR(fs); i++) {
    644 		if (clbn > lbn)
    645 			break;
    646 		if ((indirect_blkno = bap[i]) == 0)
    647 			continue;
    648 		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
    649 		indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
    650 		    (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
    651 		if (indirect_bp->b_flags & B_ERROR) {
    652 			brelse(indirect_bp);
    653 			brelse(bp);
    654 			return (EIO);
    655 		}
    656 		indirect_bap = indirect_bp->b_un.b_daddr;
    657 		for (j = 0; j < NINDIR(fs); j++) {
    658 			if (clbn > lbn)
    659 				break;
    660 			clbn += NINDIR(fs);
    661 			if ((blkno = indirect_bap[j]) == 0)
    662 				continue;
    663 			blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
    664 		}
    665 		brelse(indirect_bp);
    666 	}
    667 	brelse(bp);
    668 
    669 	return (0);
    670 }
    671 
    672 /*
    673  * Flush all indirect blocks related to an offset of a file.
    674  * read/write in sync mode may have to flush indirect blocks.
    675  */
    676 int
    677 ufs_indirblk_sync(struct inode *ip, offset_t off)
    678 {
    679 	daddr_t	lbn;
    680 	struct	fs *fs;
    681 	struct	buf *bp;
    682 	int	i, j, shft;
    683 	daddr_t	ob, nb, tbn;
    684 	daddr32_t *bap;
    685 	int	nindirshift, nindiroffset;
    686 	struct ufsvfs *ufsvfsp;
    687 
    688 	ufsvfsp = ip->i_ufsvfs;
    689 	/*
    690 	 * unnecessary when logging; allocation blocks are kept up-to-date
    691 	 */
    692 	if (TRANS_ISTRANS(ufsvfsp))
    693 		return (0);
    694 
    695 	fs = ufsvfsp->vfs_fs;
    696 
    697 	lbn = (daddr_t)lblkno(fs, off);
    698 	if (lbn < 0)
    699 		return (EFBIG);
    700 
    701 	/* The first NDADDR are direct so nothing to do */
    702 	if (lbn < NDADDR)
    703 		return (0);
    704 
    705 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
    706 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
    707 
    708 	/* Determine level of indirect blocks */
    709 	shft = 0;
    710 	tbn = lbn - NDADDR;
    711 	for (j = NIADDR; j > 0; j--) {
    712 		longlong_t	sh;
    713 
    714 		shft += nindirshift;
    715 		sh = 1LL << shft;
    716 		if (tbn < sh)
    717 			break;
    718 		tbn -= (daddr_t)sh;
    719 	}
    720 
    721 	if (j == 0)
    722 		return (EFBIG);
    723 
    724 	if ((nb = ip->i_ib[NIADDR - j]) == 0)
    725 			return (0);		/* UFS Hole */
    726 
    727 	/* Flush first level indirect block */
    728 	blkflush(ip->i_dev, fsbtodb(fs, nb));
    729 
    730 	/* Fetch through next levels */
    731 	for (; j < NIADDR; j++) {
    732 		ob = nb;
    733 		bp = UFS_BREAD(ufsvfsp,
    734 		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
    735 		if (bp->b_flags & B_ERROR) {
    736 			brelse(bp);
    737 			return (EIO);
    738 		}
    739 		bap = bp->b_un.b_daddr;
    740 		shft -= nindirshift;		/* sh / nindir */
    741 		i = (tbn >> shft) & nindiroffset; /* (tbn /sh) & nindir */
    742 		nb = bap[i];
    743 		brelse(bp);
    744 		if (nb == 0) {
    745 			return (0); 		/* UFS hole */
    746 		}
    747 		blkflush(ip->i_dev, fsbtodb(fs, nb));
    748 	}
    749 	return (0);
    750 }
    751 
    752 #ifdef DEBUG
    753 
/*
 * The bad block checking routines, ufs_indir_badblock() and ufs_badblock(),
 * are very expensive.  Profiling has shown 6-7% of our time being spent in
 * ufs_badblock, and another 1-2% in ufs_indir_badblock.  They are only
 * called via ASSERTs (from debug kernels).  In addition, experience shows
 * that no failures have been found in recent years.  So the checks are off
 * by default; set the following tunable to a non-zero value to enable them.
 */
int ufs_badblock_checks = 0;
    763 
    764 /*
    765  * Check that a given indirect block contains blocks in range
    766  */
    767 int
    768 ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
    769 {
    770 	int i;
    771 	int err = 0;
    772 
    773 	if (ufs_badblock_checks) {
    774 		for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
    775 			if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
    776 				break;
    777 	}
    778 	return (err);
    779 }
    780 
    781 /*
    782  * Check that a specified block number is in range.
    783  */
    784 int
    785 ufs_badblock(struct inode *ip, daddr_t bn)
    786 {
    787 	long	c;
    788 	daddr_t	sum;
    789 
    790 	if (!ufs_badblock_checks)
    791 		return (0);
    792 	ASSERT(bn);
    793 	if (bn <= 0 || bn > ip->i_fs->fs_size)
    794 		return (bn);
    795 
    796 	sum = 0;
    797 	c = dtog(ip->i_fs, bn);
    798 	if (c == 0) {
    799 		sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
    800 	}
    801 	/*
    802 	 * if block no. is below this cylinder group,
    803 	 * within the space reserved for superblock, inodes, (summary data)
    804 	 * or if it is above this cylinder group
    805 	 * then its invalid
    806 	 * It's hard to see how we'd be outside this cyl, but let's be careful.
    807 	 */
    808 	if ((bn < cgbase(ip->i_fs, c)) ||
    809 	    (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c)+sum) ||
    810 	    (bn >= (unsigned)cgbase(ip->i_fs, c+1)))
    811 		return (bn);
    812 
    813 	return (0);	/* not a bad block */
    814 }
    815 
    816 #endif /* DEBUG */
    817 
/*
 * When i_rwlock is write-locked or has a writer pending, the inode is
 * about to change in a way that will mark the filesystem as active, so
 * there is no point in marking the filesystem as stable now.
 * Also, to ensure filesystem consistency during directory operations,
 * the filesystem cannot be marked as stable while the i_rwlock of a
 * directory inode is write-locked.
 */
    826 
    827 /*
    828  * Check for busy inodes for this filesystem.
    829  * NOTE: Needs better way to do this expensive operation in the future.
    830  */
    831 static void
    832 ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
    833 {
    834 	union  ihead	*ih;
    835 	struct inode	*ip;
    836 	int		i;
    837 	int		isnottrans	= !TRANS_ISTRANS(ufsvfsp);
    838 	int		isbusy		= *isbusyp;
    839 	int		isreclaim	= *isreclaimp;
    840 
    841 	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
    842 		mutex_enter(&ih_lock[i]);
    843 		for (ip = ih->ih_chain[0];
    844 		    ip != (struct inode *)ih;
    845 		    ip = ip->i_forw) {
    846 			/*
    847 			 * if inode is busy/modified/deleted, filesystem is busy
    848 			 */
    849 			if (ip->i_ufsvfs != ufsvfsp)
    850 				continue;
    851 			if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
    852 			    (RW_ISWRITER(&ip->i_rwlock)))
    853 				isbusy = 1;
    854 			if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
    855 				isreclaim = 1;
    856 			if (isbusy && (isreclaim || isnottrans))
    857 				break;
    858 		}
    859 		mutex_exit(&ih_lock[i]);
    860 		if (isbusy && (isreclaim || isnottrans))
    861 			break;
    862 	}
    863 	*isbusyp = isbusy;
    864 	*isreclaimp = isreclaim;
    865 }
    866 
    867 /*
    868  * As part of the ufs 'sync' operation, this routine is called to mark
    869  * the filesystem as STABLE if there is no modified metadata in memory.
    870  */
    871 void
    872 ufs_checkclean(struct vfs *vfsp)
    873 {
    874 	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
    875 	struct fs	*fs		= ufsvfsp->vfs_fs;
    876 	int		isbusy;
    877 	int		isreclaim;
    878 	int		updatesb;
    879 
    880 	ASSERT(vfs_lock_held(vfsp));
    881 
    882 	/*
    883 	 * filesystem is stable or cleanflag processing is disabled; do nothing
    884 	 *	no transitions when panic'ing
    885 	 */
    886 	if (fs->fs_ronly ||
    887 	    fs->fs_clean == FSBAD ||
    888 	    fs->fs_clean == FSSUSPEND ||
    889 	    fs->fs_clean == FSSTABLE ||
    890 	    panicstr)
    891 		return;
    892 
    893 	/*
    894 	 * if logging and nothing to reclaim; do nothing
    895 	 */
    896 	if ((fs->fs_clean == FSLOG) &&
    897 	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
    898 	    (fs->fs_reclaim & FS_RECLAIMING)))
    899 		return;
    900 
    901 	/*
    902 	 * FS_CHECKCLEAN is reset if the file system goes dirty
    903 	 * FS_CHECKRECLAIM is reset if a file gets deleted
    904 	 */
    905 	mutex_enter(&ufsvfsp->vfs_lock);
    906 	fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
    907 	mutex_exit(&ufsvfsp->vfs_lock);
    908 
    909 	updatesb = 0;
    910 
    911 	/*
    912 	 * if logging or buffers are busy; do nothing
    913 	 */
    914 	isbusy = isreclaim = 0;
    915 	if ((fs->fs_clean == FSLOG) ||
    916 	    (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
    917 		isbusy = 1;
    918 
    919 	/*
    920 	 * isreclaim == TRUE means can't change the state of fs_reclaim
    921 	 */
    922 	isreclaim =
    923 	    ((fs->fs_clean == FSLOG) &&
    924 	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
    925 	    (fs->fs_reclaim & FS_RECLAIMING)));
    926 
    927 	/*
    928 	 * if fs is busy or can't change the state of fs_reclaim; do nothing
    929 	 */
    930 	if (isbusy && isreclaim)
    931 		return;
    932 
    933 	/*
    934 	 * look for busy or deleted inodes; (deleted == needs reclaim)
    935 	 */
    936 	ufs_icheck(ufsvfsp, &isbusy, &isreclaim);
    937 
    938 	mutex_enter(&ufsvfsp->vfs_lock);
    939 
    940 	/*
    941 	 * IF POSSIBLE, RESET RECLAIM
    942 	 */
    943 	/*
    944 	 * the reclaim thread is not running
    945 	 */
    946 	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
    947 		/*
    948 		 * no files were deleted during the scan
    949 		 */
    950 		if (fs->fs_reclaim & FS_CHECKRECLAIM)
    951 			/*
    952 			 * no deleted files were found in the inode cache
    953 			 */
    954 			if ((isreclaim == 0) && (fs->fs_reclaim & FS_RECLAIM)) {
    955 				fs->fs_reclaim &= ~FS_RECLAIM;
    956 				updatesb = 1;
    957 			}
    958 	/*
    959 	 * IF POSSIBLE, SET STABLE
    960 	 */
    961 	/*
    962 	 * not logging
    963 	 */
    964 	if (fs->fs_clean != FSLOG)
    965 		/*
    966 		 * file system has not gone dirty since the scan began
    967 		 */
    968 		if (fs->fs_reclaim & FS_CHECKCLEAN)
    969 			/*
    970 			 * nothing dirty was found in the buffer or inode cache
    971 			 */
    972 			if ((isbusy == 0) && (isreclaim == 0) &&
    973 			    (fs->fs_clean != FSSTABLE)) {
    974 				fs->fs_clean = FSSTABLE;
    975 				updatesb = 1;
    976 			}
    977 
    978 	mutex_exit(&ufsvfsp->vfs_lock);
    979 	if (updatesb) {
    980 		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
    981 	}
    982 }
    983 
    984 /*
    985  * called whenever an unlink occurs
    986  */
    987 void
    988 ufs_setreclaim(struct inode *ip)
    989 {
    990 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
    991 	struct fs	*fs		= ufsvfsp->vfs_fs;
    992 
    993 	if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
    994 		return;
    995 
    996 	/*
    997 	 * reclaim-needed bit is already set or we need to tell
    998 	 * ufs_checkclean that a file has been deleted
    999 	 */
   1000 	if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
   1001 		return;
   1002 
   1003 	mutex_enter(&ufsvfsp->vfs_lock);
   1004 	/*
   1005 	 * inform ufs_checkclean that the file system has gone dirty
   1006 	 */
   1007 	fs->fs_reclaim &= ~FS_CHECKRECLAIM;
   1008 
   1009 	/*
   1010 	 * set the reclaim-needed bit
   1011 	 */
   1012 	if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
   1013 		fs->fs_reclaim |= FS_RECLAIM;
   1014 		ufs_sbwrite(ufsvfsp);
   1015 	}
   1016 	mutex_exit(&ufsvfsp->vfs_lock);
   1017 }
   1018 
   1019 /*
   1020  * Before any modified metadata written back to the disk, this routine
   1021  * is called to mark the filesystem as ACTIVE.
   1022  */
   1023 void
   1024 ufs_notclean(struct ufsvfs *ufsvfsp)
   1025 {
   1026 	struct fs *fs = ufsvfsp->vfs_fs;
   1027 
   1028 	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
   1029 	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
   1030 
   1031 	/*
   1032 	 * inform ufs_checkclean that the file system has gone dirty
   1033 	 */
   1034 	fs->fs_reclaim &= ~FS_CHECKCLEAN;
   1035 
   1036 	/*
   1037 	 * ignore if active or bad or suspended or readonly or logging
   1038 	 */
   1039 	if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
   1040 	    (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
   1041 	    (fs->fs_ronly)) {
   1042 		mutex_exit(&ufsvfsp->vfs_lock);
   1043 		return;
   1044 	}
   1045 	fs->fs_clean = FSACTIVE;
   1046 	/*
   1047 	 * write superblock synchronously
   1048 	 */
   1049 	ufs_sbwrite(ufsvfsp);
   1050 	mutex_exit(&ufsvfsp->vfs_lock);
   1051 }
   1052 
   1053 /*
   1054  * ufs specific fbwrite()
   1055  */
   1056 int
   1057 ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
   1058 {
   1059 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
   1060 
   1061 	if (TRANS_ISTRANS(ufsvfsp))
   1062 		return (fbwrite(fbp));
   1063 	mutex_enter(&ufsvfsp->vfs_lock);
   1064 	ufs_notclean(ufsvfsp);
   1065 	return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
   1066 }
   1067 
   1068 /*
   1069  * ufs specific fbiwrite()
   1070  */
   1071 int
   1072 ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
   1073 {
   1074 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
   1075 	o_mode_t	ifmt		= ip->i_mode & IFMT;
   1076 	buf_t		*bp;
   1077 	int		error;
   1078 
   1079 	mutex_enter(&ufsvfsp->vfs_lock);
   1080 	ufs_notclean(ufsvfsp);
   1081 	if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
   1082 	    (ip->i_ufsvfs->vfs_qinod == ip)) {
   1083 		TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
   1084 		    fbp->fb_count, DT_FBI, 0, 0);
   1085 	}
   1086 	/*
   1087 	 * Inlined version of fbiwrite()
   1088 	 */
   1089 	bp = pageio_setup((struct page *)NULL, fbp->fb_count,
   1090 	    ip->i_devvp, B_WRITE);
   1091 	bp->b_flags &= ~B_PAGEIO;
   1092 	bp->b_un.b_addr = fbp->fb_addr;
   1093 
   1094 	bp->b_blkno = bn * btod(bsize);
   1095 	bp->b_dev = cmpdev(ip->i_dev);	/* store in old dev format */
   1096 	bp->b_edev = ip->i_dev;
   1097 	bp->b_proc = NULL;			/* i.e. the kernel */
   1098 	bp->b_file = ip->i_vnode;
   1099 	bp->b_offset = -1;
   1100 
   1101 	if (ufsvfsp->vfs_log) {
   1102 		lufs_write_strategy(ufsvfsp->vfs_log, bp);
   1103 	} else if (ufsvfsp->vfs_snapshot) {
   1104 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
   1105 	} else {
   1106 		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
   1107 		ub.ub_fbiwrites.value.ul++;
   1108 		(void) bdev_strategy(bp);
   1109 		lwp_stat_update(LWP_STAT_OUBLK, 1);
   1110 	}
   1111 	error = biowait(bp);
   1112 	pageio_done(bp);
   1113 	fbrelse(fbp, S_OTHER);
   1114 	return (error);
   1115 }
   1116 
   1117 /*
   1118  * Write the ufs superblock only.
   1119  */
   1120 void
   1121 ufs_sbwrite(struct ufsvfs *ufsvfsp)
   1122 {
   1123 	char sav_fs_fmod;
   1124 	struct fs *fs = ufsvfsp->vfs_fs;
   1125 	struct buf *bp = ufsvfsp->vfs_bufp;
   1126 
   1127 	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
   1128 
   1129 	/*
   1130 	 * for ulockfs processing, limit the superblock writes
   1131 	 */
   1132 	if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
   1133 	    (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
   1134 		/* try again later */
   1135 		fs->fs_fmod = 1;
   1136 		return;
   1137 	}
   1138 
   1139 	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
   1140 	/*
   1141 	 * update superblock timestamp and fs_clean checksum
   1142 	 * if marked FSBAD, we always want an erroneous
   1143 	 * checksum to force repair
   1144 	 */
   1145 	fs->fs_time = gethrestime_sec();
   1146 	fs->fs_state = (fs->fs_clean != FSBAD) ?
   1147 	    FSOKAY - fs->fs_time : -(FSOKAY - fs->fs_time);
   1148 	switch (fs->fs_clean) {
   1149 	case FSCLEAN:
   1150 	case FSSTABLE:
   1151 		fs->fs_reclaim &= ~FS_RECLAIM;
   1152 		break;
   1153 	case FSACTIVE:
   1154 	case FSSUSPEND:
   1155 	case FSBAD:
   1156 	case FSLOG:
   1157 		break;
   1158 	default:
   1159 		fs->fs_clean = FSACTIVE;
   1160 		break;
   1161 	}
   1162 	/*
   1163 	 * reset incore only bits
   1164 	 */
   1165 	fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);
   1166 
   1167 	/*
   1168 	 * delta the whole superblock
   1169 	 */
   1170 	TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
   1171 	    DT_SB, NULL, 0);
   1172 	/*
   1173 	 * retain the incore state of fs_fmod; set the ondisk state to 0
   1174 	 */
   1175 	sav_fs_fmod = fs->fs_fmod;
   1176 	fs->fs_fmod = 0;
   1177 
   1178 	/*
   1179 	 * Don't release the buffer after written to the disk
   1180 	 */
   1181 	UFS_BWRITE2(ufsvfsp, bp);
   1182 	fs->fs_fmod = sav_fs_fmod;	/* reset fs_fmod's incore state */
   1183 }
   1184 
   1185 /*
   1186  * Returns vfs pointer if vfs still being mounted. vfs lock is held.
   1187  * Otherwise, returns NULL.
   1188  *
   1189  * For our purposes, "still mounted" means that the file system still appears
   1190  * on the list of UFS file system instances.
   1191  */
   1192 static vfs_t *
   1193 still_mounted(struct check_node *checkp)
   1194 {
   1195 	struct vfs	*vfsp;
   1196 	struct ufsvfs	*ufsp;
   1197 
   1198 	mutex_enter(&ufsvfs_mutex);
   1199 	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
   1200 		if (ufsp != checkp->ufsvfs)
   1201 			continue;
   1202 		/*
   1203 		 * Tentative match:  verify it and try to lock.  (It's not at
   1204 		 * all clear how the verification could fail, given that we've
   1205 		 * gotten this far.  We would have had to reallocate the
   1206 		 * ufsvfs struct at hand for a new incarnation; is that really
   1207 		 * possible in the interval from constructing the check_node
   1208 		 * to here?)
   1209 		 */
   1210 		vfsp = ufsp->vfs_vfs;
   1211 		if (vfsp != checkp->vfsp)
   1212 			continue;
   1213 		if (vfsp->vfs_dev != checkp->vfs_dev)
   1214 			continue;
   1215 		if (vfs_lock(vfsp) != 0)
   1216 			continue;
   1217 
   1218 		mutex_exit(&ufsvfs_mutex);
   1219 		return (vfsp);
   1220 	}
   1221 	mutex_exit(&ufsvfs_mutex);
   1222 	return (NULL);
   1223 }
   1224 
/*
 * I/O completion routine installed as b_iodone on the privately managed
 * buffers used for the summary-information reads and writes in this file.
 * It posts the buffer's b_io semaphore, releasing the thread that waits
 * for the transfer with sema_p(&bp->b_io).  Always returns 0.
 */
int
ufs_si_io_done(struct buf *bp)
{
	sema_v(&bp->b_io);
	return (0);
}
   1231 
   1232 #define	SI_BUFSZ roundup(sizeof (struct cg), DEV_BSIZE)
   1233 #define	NSIBUF 32
   1234 
   1235 /*
   1236  * ufs_construct_si()
   1237  * Read each cylinder group in turn and construct the summary information
   1238  */
   1239 static int
   1240 ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
   1241 {
   1242 	buf_t *bps, *bp;
   1243 	char *bufs;
   1244 	struct csum *sip = fs->fs_u.fs_csp;
   1245 	struct cg *cgp;
   1246 	int i, ncg;
   1247 	int error = 0, cg = 0;
   1248 
   1249 	bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
   1250 	bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);
   1251 
   1252 	/*
   1253 	 * Initialise the buffer headers
   1254 	 */
   1255 	for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
   1256 		bioinit(bp);
   1257 		bp->b_iodone = ufs_si_io_done;
   1258 		bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
   1259 		bp->b_flags = B_READ;
   1260 		bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
   1261 		bp->b_edev = dev;
   1262 	}
   1263 
   1264 	/*
   1265 	 * Repeat while there are cylinder groups left to read.
   1266 	 */
   1267 	do {
   1268 		/*
   1269 		 * Issue upto NSIBUF asynchronous reads
   1270 		 */
   1271 		ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
   1272 		for (bp = bps, i = 0; i < ncg; i++, bp++) {
   1273 			bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
   1274 			if (ufsvfsp->vfs_log) {
   1275 				lufs_read_strategy(ufsvfsp->vfs_log, bp);
   1276 			} else {
   1277 				(void) bdev_strategy(bp);
   1278 			}
   1279 		}
   1280 
   1281 		/*
   1282 		 * wait for each read to finish;
   1283 		 * check for errors and copy the csum info
   1284 		 */
   1285 		for (bp = bps, i = 0; i < ncg; i++, bp++) {
   1286 			sema_p(&bp->b_io);
   1287 			if (!error) {
   1288 				cgp = bp->b_un.b_cg;
   1289 				sip[cg + i] = cgp->cg_cs;
   1290 				error = geterror(bp);
   1291 			}
   1292 		}
   1293 		if (error) {
   1294 			goto err;
   1295 		}
   1296 		cg += ncg;
   1297 	} while (cg < fs->fs_ncg);
   1298 
   1299 err:
   1300 	kmem_free(bps, NSIBUF * sizeof (buf_t));
   1301 	kmem_free(bufs, NSIBUF * SI_BUFSZ);
   1302 	return (error);
   1303 }
   1304 
   1305 /*
   1306  * ufs_getsummaryinfo
   1307  */
   1308 int
   1309 ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
   1310 {
   1311 	int		i;		/* `for' loop counter */
   1312 	ssize_t		size;		/* bytes of summary info to read */
   1313 	daddr_t		frags;		/* frags of summary info to read */
   1314 	caddr_t		sip;		/* summary info */
   1315 	struct buf	*tp;		/* tmp buf */
   1316 
   1317 	/*
   1318 	 * maintain metadata map for trans device (debug only)
   1319 	 */
   1320 	TRANS_MATA_SI(ufsvfsp, fs);
   1321 
   1322 	/*
   1323 	 * Compute #frags and allocate space for summary info
   1324 	 */
   1325 	frags = howmany(fs->fs_cssize, fs->fs_fsize);
   1326 	sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
   1327 	fs->fs_u.fs_csp = (struct csum *)sip;
   1328 
   1329 	if (fs->fs_si == FS_SI_BAD) {
   1330 		/*
   1331 		 * The summary information is unknown, read it in from
   1332 		 * the cylinder groups.
   1333 		 */
   1334 		if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
   1335 		    ufsvfsp->vfs_log->un_logmap) {
   1336 			logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
   1337 		}
   1338 		bzero(sip, (size_t)fs->fs_cssize);
   1339 		if (ufs_construct_si(dev, fs, ufsvfsp)) {
   1340 			kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
   1341 			fs->fs_u.fs_csp = NULL;
   1342 			return (EIO);
   1343 		}
   1344 	} else {
   1345 		/* Read summary info a fs block at a time */
   1346 		size = fs->fs_bsize;
   1347 		for (i = 0; i < frags; i += fs->fs_frag) {
   1348 			if (i + fs->fs_frag > frags)
   1349 				/*
   1350 				 * This happens only the last iteration, so
   1351 				 * don't worry about size being reset
   1352 				 */
   1353 				size = (frags - i) * fs->fs_fsize;
   1354 			tp = UFS_BREAD(ufsvfsp, dev,
   1355 			    (daddr_t)fsbtodb(fs, fs->fs_csaddr+i), size);
   1356 			tp->b_flags |= B_STALE | B_AGE;
   1357 			if (tp->b_flags & B_ERROR) {
   1358 				kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
   1359 				fs->fs_u.fs_csp = NULL;
   1360 				brelse(tp);
   1361 				return (EIO);
   1362 			}
   1363 			bcopy(tp->b_un.b_addr, sip, size);
   1364 			sip += size;
   1365 			brelse(tp);
   1366 		}
   1367 	}
   1368 	bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
   1369 	for (i = 0; i < fs->fs_ncg; ++i) {
   1370 		fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
   1371 		fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
   1372 		fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
   1373 		fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
   1374 	}
   1375 	return (0);
   1376 }
   1377 
   1378 /*
   1379  * ufs_putsummaryinfo() stores all the cylinder group summary information
   1380  * This is only used when logging, but the file system may not
   1381  * be logging at the time, eg a read-only mount to flush the log
   1382  * may push the summary info out.
   1383  */
   1384 int
   1385 ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
   1386 {
   1387 	struct buf	b, *bp;		/* tmp buf */
   1388 	caddr_t		sip;		/* summary info */
   1389 	ssize_t		size;		/* bytes of summary info to write */
   1390 	daddr_t		frags;		/* frags of summary info to write */
   1391 	int		i;		/* `for' loop counter */
   1392 	int		error;		/* error */
   1393 
   1394 	if (TRANS_ISERROR(ufsvfsp)) {
   1395 		return (EIO);
   1396 	}
   1397 
   1398 	if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
   1399 		return (0);
   1400 	}
   1401 
   1402 	bp = &b;
   1403 	bioinit(bp);
   1404 	bp->b_iodone = ufs_si_io_done;
   1405 	bp->b_bufsize = size = fs->fs_bsize;
   1406 	bp->b_flags = B_WRITE;
   1407 	bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
   1408 	bp->b_edev = dev;
   1409 	frags = howmany(fs->fs_cssize, fs->fs_fsize);
   1410 	sip = (caddr_t)fs->fs_u.fs_csp;
   1411 
   1412 	/* Write summary info one fs block at a time */
   1413 	for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
   1414 		if (i + fs->fs_frag > frags) {
   1415 			/*
   1416 			 * This happens only the last iteration, so
   1417 			 * don't worry about size being reset
   1418 			 */
   1419 			size = (frags - i) * fs->fs_fsize;
   1420 		}
   1421 		bcopy(sip, bp->b_un.b_addr, size);
   1422 		bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr+i);
   1423 		bp->b_bcount = size;
   1424 		(void) bdev_strategy(bp);
   1425 		sema_p(&bp->b_io); /* wait for write to complete */
   1426 		error = geterror(bp);
   1427 		sip += size;
   1428 	}
   1429 	kmem_free(bp->b_un.b_addr, fs->fs_bsize);
   1430 	if (!error) {
   1431 		fs->fs_si = FS_SI_OK;
   1432 	}
   1433 	return (error);
   1434 }
   1435 
   1436 /*
   1437  * Decide whether it is okay to remove within a sticky directory.
   1438  * Two conditions need to be met:  write access to the directory
   1439  * is needed.  In sticky directories, write access is not sufficient;
   1440  * you can remove entries from a directory only if you own the directory,
   1441  * if you are privileged, if you own the entry or if the entry is
   1442  * a plain file and you have write access to that file.
   1443  * Function returns 0 if remove access is granted.
   1444  * Note, the caller is responsible for holding the i_contents lock
   1445  * at least as reader on the inquired inode 'ip'.
   1446  */
   1447 int
   1448 ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
   1449 {
   1450 	uid_t uid;
   1451 
   1452 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
   1453 
   1454 	if ((dp->i_mode & ISVTX) &&
   1455 	    (uid = crgetuid(cr)) != dp->i_uid &&
   1456 	    uid != ip->i_uid &&
   1457 	    ((ip->i_mode & IFMT) != IFREG ||
   1458 	    ufs_iaccess(ip, IWRITE, cr, 0) != 0))
   1459 		return (secpolicy_vnode_remove(cr));
   1460 
   1461 	return (0);
   1462 }
   1463 #endif	/* _KERNEL */
   1464 
   1465 extern	int around[9];
   1466 extern	int inside[9];
   1467 extern	uchar_t *fragtbl[];
   1468 
   1469 /*
   1470  * Update the frsum fields to reflect addition or deletion
   1471  * of some frags.
   1472  */
   1473 void
   1474 fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
   1475 {
   1476 	int inblk;
   1477 	int field, subfield;
   1478 	int siz, pos;
   1479 
   1480 	/*
   1481 	 * ufsvfsp->vfs_lock is held when calling this.
   1482 	 */
   1483 	inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
   1484 	fragmap <<= 1;
   1485 	for (siz = 1; siz < fs->fs_frag; siz++) {
   1486 		if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
   1487 			continue;
   1488 		field = around[siz];
   1489 		subfield = inside[siz];
   1490 		for (pos = siz; pos <= fs->fs_frag; pos++) {
   1491 			if ((fragmap & field) == subfield) {
   1492 				fraglist[siz] += cnt;
   1493 				ASSERT(fraglist[siz] >= 0);
   1494 				pos += siz;
   1495 				field <<= siz;
   1496 				subfield <<= siz;
   1497 			}
   1498 			field <<= 1;
   1499 			subfield <<= 1;
   1500 		}
   1501 	}
   1502 }
   1503 
   1504 /*
   1505  * Block operations
   1506  */
   1507 
   1508 /*
   1509  * Check if a block is available
   1510  */
   1511 int
   1512 isblock(struct fs *fs, uchar_t *cp, daddr_t h)
   1513 {
   1514 	uchar_t mask;
   1515 
   1516 	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
   1517 	    fs->fs_frag == 1);
   1518 	/*
   1519 	 * ufsvfsp->vfs_lock is held when calling this.
   1520 	 */
   1521 	switch ((int)fs->fs_frag) {
   1522 	case 8:
   1523 		return (cp[h] == 0xff);
   1524 	case 4:
   1525 		mask = 0x0f << ((h & 0x1) << 2);
   1526 		return ((cp[h >> 1] & mask) == mask);
   1527 	case 2:
   1528 		mask = 0x03 << ((h & 0x3) << 1);
   1529 		return ((cp[h >> 2] & mask) == mask);
   1530 	case 1:
   1531 		mask = 0x01 << (h & 0x7);
   1532 		return ((cp[h >> 3] & mask) == mask);
   1533 	default:
   1534 #ifndef _KERNEL
   1535 		cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
   1536 		    fs->fs_frag);
   1537 #endif /* _KERNEL */
   1538 		return (0);
   1539 	}
   1540 }
   1541 
   1542 /*
   1543  * Take a block out of the map
   1544  */
   1545 void
   1546 clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
   1547 {
   1548 	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
   1549 	    fs->fs_frag == 1);
   1550 	/*
   1551 	 * ufsvfsp->vfs_lock is held when calling this.
   1552 	 */
   1553 	switch ((int)fs->fs_frag) {
   1554 	case 8:
   1555 		cp[h] = 0;
   1556 		return;
   1557 	case 4:
   1558 		cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
   1559 		return;
   1560 	case 2:
   1561 		cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
   1562 		return;
   1563 	case 1:
   1564 		cp[h >> 3] &= ~(0x01 << (h & 0x7));
   1565 		return;
   1566 	default:
   1567 #ifndef _KERNEL
   1568 		cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
   1569 		    fs->fs_frag);
   1570 #endif /* _KERNEL */
   1571 		return;
   1572 	}
   1573 }
   1574 
   1575 /*
   1576  * Is block allocated?
   1577  */
   1578 int
   1579 isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
   1580 {
   1581 	uchar_t	mask;
   1582 	int	frag;
   1583 	/*
   1584 	 * ufsvfsp->vfs_lock is held when calling this.
   1585 	 */
   1586 	frag = fs->fs_frag;
   1587 	ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
   1588 	switch (frag) {
   1589 	case 8:
   1590 		return (cp[h] == 0);
   1591 	case 4:
   1592 		mask = ~(0x0f << ((h & 0x1) << 2));
   1593 		return (cp[h >> 1] == (cp[h >> 1] & mask));
   1594 	case 2:
   1595 		mask =	~(0x03 << ((h & 0x3) << 1));
   1596 		return (cp[h >> 2] == (cp[h >> 2] & mask));
   1597 	case 1:
   1598 		mask = ~(0x01 << (h & 0x7));
   1599 		return (cp[h >> 3] == (cp[h >> 3] & mask));
   1600 	default:
   1601 #ifndef _KERNEL
   1602 		cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
   1603 		    fs->fs_frag);
   1604 #endif /* _KERNEL */
   1605 		break;
   1606 	}
   1607 	return (0);
   1608 }
   1609 
   1610 /*
   1611  * Put a block into the map
   1612  */
   1613 void
   1614 setblock(struct fs *fs, uchar_t *cp, daddr_t h)
   1615 {
   1616 	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
   1617 	    fs->fs_frag == 1);
   1618 	/*
   1619 	 * ufsvfsp->vfs_lock is held when calling this.
   1620 	 */
   1621 	switch ((int)fs->fs_frag) {
   1622 	case 8:
   1623 		cp[h] = 0xff;
   1624 		return;
   1625 	case 4:
   1626 		cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
   1627 		return;
   1628 	case 2:
   1629 		cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
   1630 		return;
   1631 	case 1:
   1632 		cp[h >> 3] |= (0x01 << (h & 0x7));
   1633 		return;
   1634 	default:
   1635 #ifndef _KERNEL
   1636 		cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
   1637 		    fs->fs_frag);
   1638 #endif /* _KERNEL */
   1639 		return;
   1640 	}
   1641 }
   1642 
   1643 int
   1644 skpc(char c, uint_t len, char *cp)
   1645 {
   1646 	if (len == 0)
   1647 		return (0);
   1648 	while (*cp++ == c && --len)
   1649 		;
   1650 	return (len);
   1651 }
   1652