Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
     27 /* All Rights Reserved */
     28 
     29 /*
     30  * Portions of this source code were derived from Berkeley 4.3 BSD
     31  * under license from the Regents of the University of California.
     32  */
     33 
     34 #include <sys/sysmacros.h>
     35 #include <sys/param.h>
     36 #include <sys/types.h>
     37 #include <sys/systm.h>
     38 #include <sys/t_lock.h>
     39 #include <sys/uio.h>
     40 #include <sys/kmem.h>
     41 #include <sys/thread.h>
     42 #include <sys/vfs.h>
     43 #include <sys/errno.h>
     44 #include <sys/buf.h>
     45 #include <sys/vnode.h>
     46 #include <sys/fs/ufs_trans.h>
     47 #include <sys/fs/ufs_inode.h>
     48 #include <sys/fs/ufs_fs.h>
     49 #include <sys/fs/ufs_fsdir.h>
     50 #include <sys/fs/ufs_quota.h>
     51 #include <sys/fs/ufs_panic.h>
     52 #include <sys/fs/ufs_bio.h>
     53 #include <sys/fs/ufs_log.h>
     54 #include <sys/cmn_err.h>
     55 #include <sys/file.h>
     56 #include <sys/debug.h>
     57 
     58 
/* Held by ufs_trans_hlock() while it walks the ufs_instances list. */
extern kmutex_t ufsvfs_mutex;
/* First ufsvfs on the list; entries are chained through vfs_next. */
extern struct ufsvfs *ufs_instances;
     61 
     62 /*
     63  * hlock any file systems w/errored logs
     64  */
     65 int
     66 ufs_trans_hlock()
     67 {
     68 	struct ufsvfs	*ufsvfsp;
     69 	struct lockfs	lockfs;
     70 	int		error;
     71 	int		retry	= 0;
     72 
     73 	/*
     74 	 * find fs's that paniced or have errored logging devices
     75 	 */
     76 	mutex_enter(&ufsvfs_mutex);
     77 	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
     78 		/*
     79 		 * not mounted; continue
     80 		 */
     81 		if ((ufsvfsp->vfs_vfs == NULL) ||
     82 		    (ufsvfsp->vfs_validfs == UT_UNMOUNTED))
     83 			continue;
     84 		/*
     85 		 * disallow unmounts (hlock occurs below)
     86 		 */
     87 		if (TRANS_ISERROR(ufsvfsp))
     88 			ufsvfsp->vfs_validfs = UT_HLOCKING;
     89 	}
     90 	mutex_exit(&ufsvfs_mutex);
     91 
     92 	/*
     93 	 * hlock the fs's that paniced or have errored logging devices
     94 	 */
     95 again:
     96 	mutex_enter(&ufsvfs_mutex);
     97 	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
     98 		if (ufsvfsp->vfs_validfs == UT_HLOCKING)
     99 			break;
    100 	mutex_exit(&ufsvfs_mutex);
    101 	if (ufsvfsp == NULL)
    102 		return (retry);
    103 	/*
    104 	 * hlock the file system
    105 	 */
    106 	(void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
    107 	if (!LOCKFS_IS_ELOCK(&lockfs)) {
    108 		lockfs.lf_lock = LOCKFS_HLOCK;
    109 		lockfs.lf_flags = 0;
    110 		lockfs.lf_comlen = 0;
    111 		lockfs.lf_comment = NULL;
    112 		error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
    113 		/*
    114 		 * retry after awhile; another app currently doing lockfs
    115 		 */
    116 		if (error == EBUSY || error == EINVAL)
    117 			retry = 1;
    118 	} else {
    119 		if (ufsfx_get_failure_qlen() > 0) {
    120 			if (mutex_tryenter(&ufs_fix.uq_mutex)) {
    121 				ufs_fix.uq_lowat = ufs_fix.uq_ne;
    122 				cv_broadcast(&ufs_fix.uq_cv);
    123 				mutex_exit(&ufs_fix.uq_mutex);
    124 			}
    125 		}
    126 		retry = 1;
    127 	}
    128 
    129 	/*
    130 	 * allow unmounts
    131 	 */
    132 	ufsvfsp->vfs_validfs = UT_MOUNTED;
    133 	goto again;
    134 }
    135 
/*
 * Signal the ufs_hlock queue: with uq_mutex held, set uq_ne to uq_lowat
 * and wake every thread waiting on uq_cv.
 * NOTE(review): presumably this is what kicks the thread that ends up
 * running ufs_trans_hlock() -- confirm against the ufs_hlock consumer.
 */
/*ARGSUSED*/
void
ufs_trans_onerror()
{
	mutex_enter(&ufs_hlock.uq_mutex);
	ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
	cv_broadcast(&ufs_hlock.uq_cv);
	mutex_exit(&ufs_hlock.uq_mutex);
}
    145 
    146 void
    147 ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
    148 {
    149 	if (curthread->t_flag & T_DONTBLOCK) {
    150 		sbupdate(vfsp);
    151 		return;
    152 	} else {
    153 
    154 		if (panicstr && TRANS_ISTRANS(ufsvfsp))
    155 			return;
    156 
    157 		curthread->t_flag |= T_DONTBLOCK;
    158 		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
    159 		sbupdate(vfsp);
    160 		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
    161 		curthread->t_flag &= ~T_DONTBLOCK;
    162 	}
    163 }
    164 
    165 void
    166 ufs_trans_iupdat(struct inode *ip, int waitfor)
    167 {
    168 	struct ufsvfs	*ufsvfsp;
    169 
    170 	if (curthread->t_flag & T_DONTBLOCK) {
    171 		rw_enter(&ip->i_contents, RW_READER);
    172 		ufs_iupdat(ip, waitfor);
    173 		rw_exit(&ip->i_contents);
    174 		return;
    175 	} else {
    176 		ufsvfsp = ip->i_ufsvfs;
    177 
    178 		if (panicstr && TRANS_ISTRANS(ufsvfsp))
    179 			return;
    180 
    181 		curthread->t_flag |= T_DONTBLOCK;
    182 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
    183 		rw_enter(&ip->i_contents, RW_READER);
    184 		ufs_iupdat(ip, waitfor);
    185 		rw_exit(&ip->i_contents);
    186 		TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
    187 		curthread->t_flag &= ~T_DONTBLOCK;
    188 	}
    189 }
    190 
    191 void
    192 ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
    193 {
    194 	if (curthread->t_flag & T_DONTBLOCK) {
    195 		mutex_enter(&ufsvfsp->vfs_lock);
    196 		ufs_sbwrite(ufsvfsp);
    197 		mutex_exit(&ufsvfsp->vfs_lock);
    198 		return;
    199 	} else {
    200 
    201 		if (panicstr && TRANS_ISTRANS(ufsvfsp))
    202 			return;
    203 
    204 		curthread->t_flag |= T_DONTBLOCK;
    205 		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
    206 		mutex_enter(&ufsvfsp->vfs_lock);
    207 		ufs_sbwrite(ufsvfsp);
    208 		mutex_exit(&ufsvfsp->vfs_lock);
    209 		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
    210 		curthread->t_flag &= ~T_DONTBLOCK;
    211 	}
    212 }
    213 
    214 /*ARGSUSED*/
    215 int
    216 ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
    217 {
    218 	struct fs	*fs;
    219 
    220 	fs = ufsvfsp->vfs_fs;
    221 	mutex_enter(&ufsvfsp->vfs_lock);
    222 	TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
    223 	    ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
    224 	    (caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
    225 	mutex_exit(&ufsvfsp->vfs_lock);
    226 	return (0);
    227 }
    228 
    229 /*ARGSUSED*/
    230 int
    231 ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
    232 {
    233 	struct buf	*bp;
    234 
    235 	bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
    236 	if (bp == NULL)
    237 		return (ENOENT);
    238 
    239 	if (bp->b_flags & B_DELWRI) {
    240 		/*
    241 		 * Do not use brwrite() here since the buffer is already
    242 		 * marked for retry or not by the code that called
    243 		 * TRANS_BUF().
    244 		 */
    245 		UFS_BWRITE(ufsvfsp, bp);
    246 		return (0);
    247 	}
    248 	/*
    249 	 * If we did not find the real buf for this block above then
    250 	 * clear the dev so the buf won't be found by mistake
    251 	 * for this block later.  We had to allocate at least a 1 byte
    252 	 * buffer to keep brelse happy.
    253 	 */
    254 	if (bp->b_bufsize == 1) {
    255 		bp->b_dev = (o_dev_t)NODEV;
    256 		bp->b_edev = NODEV;
    257 		bp->b_flags = 0;
    258 	}
    259 	brelse(bp);
    260 	return (ENOENT);
    261 }
    262 
    263 /*ARGSUSED*/
    264 int
    265 ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
    266 {
    267 	int		error;
    268 	struct inode	*ip;
    269 
    270 	/*
    271 	 * Grab the quota lock (if the file system has not been forcibly
    272 	 * unmounted).
    273 	 */
    274 	if (ufsvfsp)
    275 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
    276 
    277 	error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);
    278 
    279 	if (ufsvfsp)
    280 		rw_exit(&ufsvfsp->vfs_dqrwlock);
    281 	if (error)
    282 		return (ENOENT);
    283 
    284 	if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
    285 		rw_enter(&ip->i_contents, RW_READER);
    286 		ufs_iupdat(ip, 1);
    287 		rw_exit(&ip->i_contents);
    288 		VN_RELE(ITOV(ip));
    289 		return (0);
    290 	}
    291 	VN_RELE(ITOV(ip));
    292 	return (ENOENT);
    293 }
    294 
    295 #ifdef DEBUG
    296 /*
    297  *	These routines maintain the metadata map (matamap)
    298  */
    299 
    300 /*
    301  * update the metadata map at mount
    302  */
    303 static int
    304 ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
    305 {
    306 	/*
    307 	 * wrong file system; keep looking
    308 	 */
    309 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
    310 		return (0);
    311 
    312 	/*
    313 	 * load the metadata map
    314 	 */
    315 	rw_enter(&ip->i_contents, RW_WRITER);
    316 	ufs_trans_mata_iget(ip);
    317 	rw_exit(&ip->i_contents);
    318 	return (0);
    319 }
    320 
    321 void
    322 ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
    323 {
    324 	struct fs	*fs	= ufsvfsp->vfs_fs;
    325 	ino_t		ino;
    326 	int		i;
    327 
    328 	/*
    329 	 * put static metadata into matamap
    330 	 *	superblock
    331 	 *	cylinder groups
    332 	 *	inode groups
    333 	 *	existing inodes
    334 	 */
    335 	TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);
    336 
    337 	for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
    338 		TRANS_MATAADD(ufsvfsp,
    339 		    ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
    340 		TRANS_MATAADD(ufsvfsp,
    341 		    ldbtob(fsbtodb(fs, itod(fs, ino))),
    342 		    fs->fs_ipg * sizeof (struct dinode));
    343 	}
    344 	(void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
    345 }
    346 
/*
 * Clear the metadata map at umount; simply hands ufsvfsp to
 * top_mataclr().
 */
void
ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
{
	top_mataclr(ufsvfsp);
}
    355 
/*
 * Add the summary info area to the matamap (it may be extended during
 * a growfs test): fs_cssize bytes at the location derived from
 * fs_csaddr of the given superblock.
 */
void
ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
{
	TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
	    fs->fs_cssize);
}
    365 
    366 /*
    367  * scan an allocation block (either inode or true block)
    368  */
    369 static void
    370 ufs_trans_mata_direct(
    371 	struct inode *ip,
    372 	daddr_t *fragsp,
    373 	daddr32_t *blkp,
    374 	unsigned int nblk)
    375 {
    376 	int		i;
    377 	daddr_t		frag;
    378 	ulong_t		nb;
    379 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
    380 	struct fs	*fs		= ufsvfsp->vfs_fs;
    381 
    382 	for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
    383 		if ((frag = *blkp) != 0) {
    384 			if (*fragsp > fs->fs_frag) {
    385 				nb = fs->fs_bsize;
    386 				*fragsp -= fs->fs_frag;
    387 			} else {
    388 				nb = *fragsp * fs->fs_fsize;
    389 				*fragsp = 0;
    390 			}
    391 			TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
    392 		}
    393 }
    394 
    395 /*
    396  * scan an indirect allocation block (either inode or true block)
    397  */
    398 static void
    399 ufs_trans_mata_indir(
    400 	struct inode *ip,
    401 	daddr_t *fragsp,
    402 	daddr_t frag,
    403 	int level)
    404 {
    405 	struct ufsvfs *ufsvfsp	= ip->i_ufsvfs;
    406 	struct fs *fs = ufsvfsp->vfs_fs;
    407 	int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
    408 	int i;
    409 	struct buf *bp;
    410 	daddr32_t *blkp;
    411 	o_mode_t ifmt = ip->i_mode & IFMT;
    412 
    413 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
    414 	if (bp->b_flags & B_ERROR) {
    415 		brelse(bp);
    416 		return;
    417 	}
    418 	blkp = bp->b_un.b_daddr;
    419 
    420 	if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
    421 	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
    422 		ufs_trans_mata_direct(ip, fragsp, blkp, ne);
    423 
    424 	if (level)
    425 		for (i = 0; i < ne && *fragsp; ++i, ++blkp)
    426 			ufs_trans_mata_indir(ip, fragsp, *blkp, level-1);
    427 	brelse(bp);
    428 }
    429 
    430 /*
    431  * put appropriate metadata into matamap for this inode
    432  */
    433 void
    434 ufs_trans_mata_iget(struct inode *ip)
    435 {
    436 	int		i;
    437 	daddr_t		frags	= dbtofsb(ip->i_fs, ip->i_blocks);
    438 	o_mode_t	ifmt 	= ip->i_mode & IFMT;
    439 
    440 	if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
    441 	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
    442 		ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);
    443 
    444 	if (frags)
    445 		ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);
    446 
    447 	for (i = 0; i < NIADDR && frags; ++i)
    448 		if (ip->i_ib[i])
    449 			ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
    450 }
    451 
/*
 * Freeing possible metadata (block of user data): remove the nb bytes
 * at offset mof from the matamap via top_matadel().
 */
void
ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
{
	top_matadel(ufsvfsp, mof, nb);

}
    461 
    462 /*
    463  * allocating metadata
    464  */
    465 void
    466 ufs_trans_mata_alloc(
    467 	struct ufsvfs *ufsvfsp,
    468 	struct inode *ip,
    469 	daddr_t frag,
    470 	ulong_t nb,
    471 	int indir)
    472 {
    473 	struct fs	*fs	= ufsvfsp->vfs_fs;
    474 	o_mode_t	ifmt 	= ip->i_mode & IFMT;
    475 
    476 	if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
    477 	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
    478 		TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
    479 }
    480 
    481 #endif /* DEBUG */
    482 
    483 /*
    484  * ufs_trans_dir is used to declare a directory delta
    485  */
    486 int
    487 ufs_trans_dir(struct inode *ip, off_t offset)
    488 {
    489 	daddr_t	bn;
    490 	int	contig = 0, error;
    491 
    492 	ASSERT(ip);
    493 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
    494 	error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
    495 	if (error || (bn == UFS_HOLE)) {
    496 		cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
    497 		    " number error = %d bn = %d\n", error, (int)bn);
    498 		if (error == 0)	/* treat UFS_HOLE as an I/O error */
    499 			error = EIO;
    500 		return (error);
    501 	}
    502 	TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
    503 	return (error);
    504 }
    505 
    506 /*ARGSUSED*/
    507 int
    508 ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
    509 {
    510 	/*
    511 	 * Lock the quota subsystem (ufsvfsp can be NULL
    512 	 * if the DQ_ERROR is set).
    513 	 */
    514 	if (ufsvfsp)
    515 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
    516 	mutex_enter(&dqp->dq_lock);
    517 
    518 	/*
    519 	 * If this transaction has been cancelled by closedq_scan_inode(),
    520 	 * then bail out now.  We don't call dqput() in this case because
    521 	 * it has already been done.
    522 	 */
    523 	if ((dqp->dq_flags & DQ_TRANS) == 0) {
    524 		mutex_exit(&dqp->dq_lock);
    525 		if (ufsvfsp)
    526 			rw_exit(&ufsvfsp->vfs_dqrwlock);
    527 		return (0);
    528 	}
    529 
    530 	if (dqp->dq_flags & DQ_ERROR) {
    531 		/*
    532 		 * Paranoia to make sure that there is at least one
    533 		 * reference to the dquot struct.  We are done with
    534 		 * the dquot (due to an error) so clear logging
    535 		 * specific markers.
    536 		 */
    537 		ASSERT(dqp->dq_cnt >= 1);
    538 		dqp->dq_flags &= ~DQ_TRANS;
    539 		dqput(dqp);
    540 		mutex_exit(&dqp->dq_lock);
    541 		if (ufsvfsp)
    542 			rw_exit(&ufsvfsp->vfs_dqrwlock);
    543 		return (1);
    544 	}
    545 
    546 	if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
    547 		ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
    548 		TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
    549 		    dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
    550 		/*
    551 		 * Paranoia to make sure that there is at least one
    552 		 * reference to the dquot struct.  Clear the
    553 		 * modification flag because the operation is now in
    554 		 * the log.  Also clear the logging specific markers
    555 		 * that were set in ufs_trans_quota().
    556 		 */
    557 		ASSERT(dqp->dq_cnt >= 1);
    558 		dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
    559 		dqput(dqp);
    560 	}
    561 
    562 	/*
    563 	 * At this point, the logging specific flag should be clear,
    564 	 * but add paranoia just in case something has gone wrong.
    565 	 */
    566 	ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
    567 	mutex_exit(&dqp->dq_lock);
    568 	if (ufsvfsp)
    569 		rw_exit(&ufsvfsp->vfs_dqrwlock);
    570 	return (0);
    571 }
    572 
    573 /*
    574  * ufs_trans_quota take in a uid, allocates the disk space, placing the
    575  * quota record into the metamap, then declares the delta.
    576  */
    577 /*ARGSUSED*/
    578 void
    579 ufs_trans_quota(struct dquot *dqp)
    580 {
    581 
    582 	struct inode	*qip = dqp->dq_ufsvfsp->vfs_qinod;
    583 
    584 	ASSERT(qip);
    585 	ASSERT(MUTEX_HELD(&dqp->dq_lock));
    586 	ASSERT(dqp->dq_flags & DQ_MOD);
    587 	ASSERT(dqp->dq_mof != 0);
    588 	ASSERT(dqp->dq_mof != UFS_HOLE);
    589 
    590 	/*
    591 	 * Mark this dquot to indicate that we are starting a logging
    592 	 * file system operation for this dquot.  Also increment the
    593 	 * reference count so that the dquot does not get reused while
    594 	 * it is on the mapentry_t list.  DQ_TRANS is cleared and the
    595 	 * reference count is decremented by ufs_trans_push_quota.
    596 	 *
    597 	 * If the file system is force-unmounted while there is a
    598 	 * pending quota transaction, then closedq_scan_inode() will
    599 	 * clear the DQ_TRANS flag and decrement the reference count.
    600 	 *
    601 	 * Since deltamap_add() drops multiple transactions to the
    602 	 * same dq_mof and ufs_trans_push_quota() won't get called,
    603 	 * we use DQ_TRANS to prevent repeat transactions from
    604 	 * incrementing the reference count (or calling TRANS_DELTA()).
    605 	 */
    606 	if ((dqp->dq_flags & DQ_TRANS) == 0) {
    607 		dqp->dq_flags |= DQ_TRANS;
    608 		dqp->dq_cnt++;
    609 		TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
    610 		    DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
    611 	}
    612 }
    613 
/*
 * Release a dquot with dqrele() inside its own async TOP_QUOTA
 * transaction, holding vfs_dqrwlock as reader around the release.
 *
 * NOTE(review): T_DONTBLOCK is set and later cleared unconditionally
 * here, whereas ufs_trans_sbupdate(), ufs_trans_iupdat() and
 * ufs_trans_sbwrite() first test whether it is already set.  This
 * presumably relies on callers never entering with T_DONTBLOCK set --
 * confirm against the callers.
 */
void
ufs_trans_dqrele(struct dquot *dqp)
{
	struct ufsvfs	*ufsvfsp = dqp->dq_ufsvfsp;

	curthread->t_flag |= T_DONTBLOCK;
	TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	dqrele(dqp);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	curthread->t_flag &= ~T_DONTBLOCK;
}
    627 
/*
 * Tunables for the log reservation code below.
 *
 * ufs_trans_max_resv	largest reservation (bytes) accepted for a single
 *			transaction; bigger trunc/write requests are split
 * ufs_trans_avgbfree	when non-zero, ufs_log_amt() uses 1 in place of
 *			vfs_avgbfree + 1
 * ufs_trans_max_resid	size of the first write chunk tried by
 *			ufs_trans_write_resv()
 */
int ufs_trans_max_resv = TOP_MAX_RESV;	/* will be adjusted for testing */
long ufs_trans_avgbfree = 0;		/* will be adjusted for testing */
#define	TRANS_MAX_WRITE	(1024 * 1024)
size_t ufs_trans_max_resid = TRANS_MAX_WRITE;
    632 
    633 /*
    634  * Calculate the log reservation for the given write or truncate
    635  */
    636 static ulong_t
    637 ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
    638 {
    639 	long		ncg, last2blk;
    640 	long		niblk		= 0;
    641 	u_offset_t	writeend, offblk;
    642 	int		resv;
    643 	daddr_t		nblk, maxfblk;
    644 	long		avgbfree;
    645 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
    646 	struct fs	*fs		= ufsvfsp->vfs_fs;
    647 	long		fni		= NINDIR(fs);
    648 	int		bsize		= fs->fs_bsize;
    649 
    650 	/*
    651 	 * Assume that the request will fit in 1 or 2 cg's,
    652 	 * resv is the amount of log space to reserve (in bytes).
    653 	 */
    654 	resv = SIZECG(ip) * 2 + INODESIZE + 1024;
    655 
    656 	/*
    657 	 * get max position of write in fs blocks
    658 	 */
    659 	writeend = offset + resid;
    660 	maxfblk = lblkno(fs, writeend);
    661 	offblk = lblkno(fs, offset);
    662 	/*
    663 	 * request size in fs blocks
    664 	 */
    665 	nblk = lblkno(fs, blkroundup(fs, resid));
    666 	/*
    667 	 * Adjust for sparse files
    668 	 */
    669 	if (trunc)
    670 		nblk = MIN(nblk, ip->i_blocks);
    671 
    672 	/*
    673 	 * Adjust avgbfree (for testing)
    674 	 */
    675 	avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;
    676 
    677 	/*
    678 	 * Calculate maximum number of blocks of triple indirect
    679 	 * pointers to write.
    680 	 */
    681 	last2blk = NDADDR + fni + fni * fni;
    682 	if (maxfblk > last2blk) {
    683 		long nl2ptr;
    684 		long n3blk;
    685 
    686 		if (offblk > last2blk)
    687 			n3blk = maxfblk - offblk;
    688 		else
    689 			n3blk = maxfblk - last2blk;
    690 		niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
    691 		nl2ptr = roundup(niblk, fni) / fni + 1;
    692 		niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
    693 		maxfblk -= n3blk;
    694 	}
    695 	/*
    696 	 * calculate maximum number of blocks of double indirect
    697 	 * pointers to write.
    698 	 */
    699 	if (maxfblk > NDADDR + fni) {
    700 		long n2blk;
    701 
    702 		if (offblk > NDADDR + fni)
    703 			n2blk = maxfblk - offblk;
    704 		else
    705 			n2blk = maxfblk - NDADDR + fni;
    706 		niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
    707 		maxfblk -= n2blk;
    708 	}
    709 	/*
    710 	 * Add in indirect pointer block write
    711 	 */
    712 	if (maxfblk > NDADDR) {
    713 		niblk += 1;
    714 	}
    715 	/*
    716 	 * Calculate deltas for indirect pointer writes
    717 	 */
    718 	resv += niblk * (fs->fs_bsize + sizeof (struct delta));
    719 	/*
    720 	 * maximum number of cg's needed for request
    721 	 */
    722 	ncg = nblk / avgbfree;
    723 	if (ncg > fs->fs_ncg)
    724 		ncg = fs->fs_ncg;
    725 
    726 	/*
    727 	 * maximum amount of log space needed for request
    728 	 */
    729 	if (ncg > 2)
    730 		resv += (ncg - 2) * SIZECG(ip);
    731 
    732 	return (resv);
    733 }
    734 
    735 /*
    736  * Calculate the amount of log space that needs to be reserved for this
    737  * trunc request.  If the amount of log space is too large, then
    738  * calculate the the size that the requests needs to be split into.
    739  */
    740 void
    741 ufs_trans_trunc_resv(
    742 	struct inode *ip,
    743 	u_offset_t length,
    744 	int *resvp,
    745 	u_offset_t *residp)
    746 {
    747 	ulong_t		resv;
    748 	u_offset_t	size, offset, resid;
    749 	int		nchunks, flag;
    750 
    751 	/*
    752 	 *    *resvp is the amount of log space to reserve (in bytes).
    753 	 *    when nonzero, *residp is the number of bytes to truncate.
    754 	 */
    755 	*residp = 0;
    756 
    757 	if (length < ip->i_size) {
    758 		size = ip->i_size - length;
    759 	} else {
    760 		resv = SIZECG(ip) * 2 + INODESIZE + 1024;
    761 		/*
    762 		 * truncate up, doesn't really use much space,
    763 		 * the default above should be sufficient.
    764 		 */
    765 		goto done;
    766 	}
    767 
    768 	offset = length;
    769 	resid = size;
    770 	nchunks = 1;
    771 	flag = 0;
    772 
    773 	/*
    774 	 * If this request takes too much log space, it will be split into
    775 	 * "nchunks". If this split is not enough, linearly increment the
    776 	 * nchunks in the next iteration.
    777 	 */
    778 	for (; (resv = ufs_log_amt(ip, offset, resid, 1)) > ufs_trans_max_resv;
    779 	    offset = length + (nchunks - 1) * resid) {
    780 		if (!flag) {
    781 			nchunks = roundup(resv, ufs_trans_max_resv) /
    782 			    ufs_trans_max_resv;
    783 			flag = 1;
    784 		} else {
    785 			nchunks++;
    786 		}
    787 		resid = size / nchunks;
    788 	}
    789 
    790 	if (nchunks > 1) {
    791 		*residp = resid;
    792 	}
    793 done:
    794 	*resvp = resv;
    795 }
    796 
    797 int
    798 ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
    799 {
    800 	int 		err, issync, resv;
    801 	u_offset_t	resid;
    802 	int		do_block	= 0;
    803 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
    804 	struct fs	*fs		= ufsvfsp->vfs_fs;
    805 
    806 	/*
    807 	 * Not logging; just do the trunc
    808 	 */
    809 	if (!TRANS_ISTRANS(ufsvfsp)) {
    810 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
    811 		rw_enter(&ip->i_contents, RW_WRITER);
    812 		err = ufs_itrunc(ip, length, flags, cr);
    813 		rw_exit(&ip->i_contents);
    814 		rw_exit(&ufsvfsp->vfs_dqrwlock);
    815 		return (err);
    816 	}
    817 
    818 	/*
    819 	 * within the lockfs protocol but *not* part of a transaction
    820 	 */
    821 	do_block = curthread->t_flag & T_DONTBLOCK;
    822 	curthread->t_flag |= T_DONTBLOCK;
    823 
    824 	/*
    825 	 * Trunc the file (in pieces, if necessary)
    826 	 */
    827 again:
    828 	ufs_trans_trunc_resv(ip, length, &resv, &resid);
    829 	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
    830 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
    831 	rw_enter(&ip->i_contents, RW_WRITER);
    832 	if (resid) {
    833 		/*
    834 		 * resid is only set if we have to truncate in chunks
    835 		 */
    836 		ASSERT(length + resid < ip->i_size);
    837 
    838 		/*
    839 		 * Partially trunc file down to desired size (length).
    840 		 * Only retain I_FREE on the last partial trunc.
    841 		 * Round up size to a block boundary, to ensure the truncate
    842 		 * doesn't have to allocate blocks. This is done both for
    843 		 * performance and to fix a bug where if the block can't be
    844 		 * allocated then the inode delete fails, but the inode
    845 		 * is still freed with attached blocks and non-zero size
    846 		 * (bug 4348738).
    847 		 */
    848 		err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
    849 		    flags & ~I_FREE, cr);
    850 		ASSERT(ip->i_size != length);
    851 	} else
    852 		err = ufs_itrunc(ip, length, flags, cr);
    853 	if (!do_block)
    854 		curthread->t_flag &= ~T_DONTBLOCK;
    855 	rw_exit(&ip->i_contents);
    856 	rw_exit(&ufsvfsp->vfs_dqrwlock);
    857 	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);
    858 
    859 	if ((err == 0) && resid) {
    860 		ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
    861 		goto again;
    862 	}
    863 	return (err);
    864 }
    865 
    866 /*
    867  * Calculate the amount of log space that needs to be reserved for this
    868  * write request.  If the amount of log space is too large, then
    869  * calculate the size that the requests needs to be split into.
    870  * First try fixed chunks of size ufs_trans_max_resid. If that
    871  * is too big, iterate down to the largest size that will fit.
    872  * Pagein the pages in the first chunk here, so that the pagein is
    873  * avoided later when the transaction is open.
    874  */
    875 void
    876 ufs_trans_write_resv(
    877 	struct inode *ip,
    878 	struct uio *uio,
    879 	int *resvp,
    880 	int *residp)
    881 {
    882 	ulong_t		resv;
    883 	offset_t	offset;
    884 	ssize_t		resid;
    885 	int		nchunks;
    886 
    887 	*residp = 0;
    888 	offset = uio->uio_offset;
    889 	resid = MIN(uio->uio_resid, ufs_trans_max_resid);
    890 	resv = ufs_log_amt(ip, offset, resid, 0);
    891 	if (resv <= ufs_trans_max_resv) {
    892 		uio_prefaultpages(resid, uio);
    893 		if (resid != uio->uio_resid)
    894 			*residp = resid;
    895 		*resvp = resv;
    896 		return;
    897 	}
    898 
    899 	resid = uio->uio_resid;
    900 	nchunks = 1;
    901 	for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
    902 	    offset = uio->uio_offset + (nchunks - 1) * resid) {
    903 		nchunks++;
    904 		resid = uio->uio_resid / nchunks;
    905 	}
    906 	uio_prefaultpages(resid, uio);
    907 	/*
    908 	 * If this request takes too much log space, it will be split
    909 	 */
    910 	if (nchunks > 1)
    911 		*residp = resid;
    912 	*resvp = resv;
    913 }
    914 
    915 /*
    916  * Issue write request.
    917  *
    918  * Split a large request into smaller chunks.
    919  */
    920 int
    921 ufs_trans_write(
    922 	struct inode *ip,
    923 	struct uio *uio,
    924 	int ioflag,
    925 	cred_t *cr,
    926 	int resv,
    927 	long resid)
    928 {
    929 	long		realresid;
    930 	int		err;
    931 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
    932 
    933 	/*
    934 	 * since the write is too big and would "HOG THE LOG" it needs to
    935 	 * be broken up and done in pieces.  NOTE, the caller will
    936 	 * issue the EOT after the request has been completed
    937 	 */
    938 	realresid = uio->uio_resid;
    939 
    940 again:
    941 	/*
    942 	 * Perform partial request (uiomove will update uio for us)
    943 	 *	Request is split up into "resid" size chunks until
    944 	 *	"realresid" bytes have been transferred.
    945 	 */
    946 	uio->uio_resid = MIN(resid, realresid);
    947 	realresid -= uio->uio_resid;
    948 	err = wrip(ip, uio, ioflag, cr);
    949 
    950 	/*
    951 	 * Error or request is done; caller issues final EOT
    952 	 */
    953 	if (err || uio->uio_resid || (realresid == 0)) {
    954 		uio->uio_resid += realresid;
    955 		return (err);
    956 	}
    957 
    958 	/*
    959 	 * Generate EOT for this part of the request
    960 	 */
    961 	rw_exit(&ip->i_contents);
    962 	rw_exit(&ufsvfsp->vfs_dqrwlock);
    963 	if (ioflag & (FSYNC|FDSYNC)) {
    964 		TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
    965 	} else {
    966 		TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
    967 	}
    968 
    969 	/*
    970 	 * Make sure the input buffer is resident before starting
    971 	 * the next transaction.
    972 	 */
    973 	uio_prefaultpages(MIN(resid, realresid), uio);
    974 
    975 	/*
    976 	 * Generate BOT for next part of the request
    977 	 */
    978 	if (ioflag & (FSYNC|FDSYNC)) {
    979 		int error;
    980 		TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
    981 		ASSERT(!error);
    982 	} else {
    983 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
    984 	}
    985 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
    986 	rw_enter(&ip->i_contents, RW_WRITER);
    987 	/*
    988 	 * Error during EOT (probably device error while writing commit rec)
    989 	 */
    990 	if (err)
    991 		return (err);
    992 	goto again;
    993 }
    994