Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 
     40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     41 
     42 #include <sys/types.h>
     43 #include <sys/t_lock.h>
     44 #include <sys/param.h>
     45 #include <sys/systm.h>
     46 #include <sys/signal.h>
     47 #include <sys/user.h>
     48 #include <sys/vnode.h>
     49 #include <sys/buf.h>
     50 #include <sys/disp.h>
     51 #include <sys/proc.h>
     52 #include <sys/conf.h>
     53 #include <sys/fs/ufs_inode.h>
     54 #include <sys/fs/ufs_fs.h>
     55 #include <sys/fs/ufs_quota.h>
     56 #include <sys/fs/ufs_trans.h>
     57 #include <sys/fs/ufs_bio.h>
     58 #include <vm/seg.h>
     59 #include <sys/errno.h>
     60 #include <sys/sysmacros.h>
     61 #include <sys/vfs.h>
     62 #include <sys/debug.h>
     63 #include <sys/kmem.h>
     64 #include <sys/cmn_err.h>
     65 
     66 /*
     67  * This structure is used to track blocks as we allocate them, so that
     68  * we can free them if we encounter an error during allocation.  We
     69  * keep track of five pieces of information for each allocated block:
     70  *   - The number of the newly allocated block
     71  *   - The size of the block (lets us deal with fragments if we want)
     72  *   - The number of the block containing a pointer to it; or whether
     73  *     the pointer is in the inode
     74  *   - The offset within the block (or inode) containing a pointer to it.
     75  *   - A flag indicating the usage of the block.  (Logging needs to know
     76  *     this to avoid overwriting a data block if it was previously used
     77  *     for metadata.)
     78  */
     79 
     80 enum ufs_owner_type {
        	/* Says where the pointer to a tracked (newly allocated) block is. */
     81 	ufs_no_owner,		/* Allocated; no pointer to it stored yet */
     82 	ufs_inode_direct,	/* Pointer is in inode's direct block table */
     83 	ufs_inode_indirect,	/* Pointer is in inode's indirect table, i_ib[] */
     84 	ufs_indirect_block	/* Pointer is an entry in an indirect block */
     85 };
     86 
     87 struct ufs_allocated_block {
        	/* One undo-table entry; bmap_write fills one per block it allocates. */
     88 	daddr_t this_block;	    /* Block number returned by alloc() */
     89 	off_t block_size;	    /* Size of this block, in bytes */
     90 	enum ufs_owner_type owner;  /* Who points to this block? */
     91 	daddr_t owner_block;	    /* Indirect block holding the pointer; */
        				    /* set only for ufs_indirect_block */
     92 	uint_t owner_offset;	    /* Index of the pointer: i_ib[] slot or */
        				    /* entry number in the indirect block */
     93 	int usage_flags;	    /* Usage flags, as expected by free(): */
        				    /* metaflag, plus I_IBLK if indirect */
     94 };
     95 
     96 
     97 static int findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
     98 		int maxtrans);
        /*
         * findextent() is used by DOEXTENT, which shifts its return value left
         * by fs_bshift to obtain a byte length; so the result is presumably a
         * count of file system blocks (definition not shown here -- confirm).
         */
     99 
        /*
         * ufs_undo_allocation() is called on bmap_write's error paths with the
         * undo table, the number of entries filled in, and the number of sectors
         * already added to the inode's block count.
         */
    100 static void ufs_undo_allocation(inode_t *ip, int block_count,
    101 	struct ufs_allocated_block table[], int inode_sector_adjust);
    102 
    103 /*
    104  * Find the extent and the matching block number.
    105  *
    106  * bsize > PAGESIZE
    107  *	boff indicates that we want a page in the middle
    108  *	min expression is supposed to make sure no extra page[s] after EOF
    109  * PAGESIZE >= bsize
    110  *	we assume that a page is a multiple of bsize, i.e.,
    111  *	boff always == 0
    112  *
    113  * We always return a length that is suitable for a disk transfer.
         *
         * Results:
         *	*(bnp) is set to UFS_HOLE when *(tblp) is zero, or when nothing is
         *	left of the extent after the frag clamp and after subtracting boff.
         *	In that case this macro itself does not store to *(lenp).
         *	Otherwise *(bnp) = fsbtodb(fs, *(tblp)) + btodb(boff) and *(lenp)
         *	is the remaining extent length in bytes.
         *
         *	When chkfrag is non-zero, the length is first clamped to
         *	fragroundup(fs, size) - (lbn << fs_bshift).
         *
         * Usage notes:
         *	Every argument is parenthesized in the expansion, so expressions
         *	may be passed.  fs, bnp and boff are still evaluated more than
         *	once: do not pass expressions with side effects.  The expansion
         *	declares the locals dp, _chkfrag, len and tmp, so arguments must
         *	not refer to caller variables with those names.
    114  */
    115 #define	DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
    116 	register daddr32_t *dp = (tblp);				\
    117 	register int _chkfrag = (chkfrag); /* for lint. sigh */		\
    118 									\
    119 	if (*dp == 0) {							\
    120 		*(bnp) = UFS_HOLE;					\
    121 	} else {							\
    122 		register int len;					\
    123 									\
    124 		len = findextent((fs), dp, (int)(n), (lenp), (maxtrans)) << \
    125 			(fs)->fs_bshift;				\
    126 		if (_chkfrag) {						\
    127 			register u_offset_t tmp;			\
    128 									\
    129 			tmp = fragroundup((fs), (size)) -		\
    130 			    (((u_offset_t)(lbn)) << (fs)->fs_bshift);	\
    131 			len = (int)MIN(tmp, len);			\
    132 		}							\
    133 		len -= (boff);						\
    134 		if (len <= 0) {						\
    135 			*(bnp) = UFS_HOLE;				\
    136 		} else {						\
    137 			*(bnp) = fsbtodb((fs), *dp) + btodb(boff);	\
    138 			*(lenp) = len;					\
    139 		}							\
    140 	}								\
    141 }
    142 
    143 /*
    144  * The maximum supported file size is actually somewhat less than 1
    145  * terabyte.  This is because the total number of blocks used for the
    146  * file and its metadata must fit into the ic_blocks field of the
    147  * inode, which is a signed 32-bit quantity.  The metadata allocated
    148  * for a file (that is, the single, double, and triple indirect blocks
    149  * used to reference the file blocks) is actually quite small,
    150  * but just to make sure, we check for overflow in the
    151  * ic_blocks field for all files whose total block count is
    152  * within 1 GB of a terabyte.  VERYLARGEFILESIZE below is the number of
    153  * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
    154  * in a gigabyte (2^21).  We only check for overflow in the ic_blocks
    155  * field if the number of blocks currently allocated to the file is
    156  * greater than or equal to VERYLARGEFILESIZE.
    157  *
    158  * Note that file "size" is not the same as file "length".  A
    159  * file's "size" is the number of blocks allocated to it.  A file's
    160  * "length" is the maximum offset in the file.  A UFS FILE can have a
    161  * length of a terabyte, but the size is limited to somewhat less than
    162  * a terabyte, as described above.
    163  */
    164 #define	VERYLARGEFILESIZE	0x7FE00000	/* 2^31 - 2^21 sectors */
    165 
    166 /*
    167  * bmap{read,write} define the structure of file system storage by mapping
    168  * a logical offset in a file to a physical block number on the device.
    169  * It should be called with a locked inode when allocation is to be
    170  * done (bmap_write).  Note this strangeness: bmap_write is always called from
    171  * getpage(), not putpage(), since getpage() is where all the allocation
    172  * is done.
    173  *
    174  * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
    175  *
    176  * NOTICE: the block number returned is the disk block number, not the
    177  * file system block number.  All the worries about block offsets and
    178  * page/block sizes are hidden inside of bmap.  Well, not quite,
    179  * unfortunately.  It's impossible to find one place to hide all this
    180  * mess.  There are 3 cases:
    181  *
    182  * PAGESIZE < bsize
    183  *	In this case, the {get,put}page routines will attempt to align to
    184  *	a file system block boundary (XXX - maybe this is a mistake?).  Since
    185  *	the kluster routines may be out of memory, we don't always get all
    186  *	the pages we wanted.  If we called bmap first, to find out how much
    187  *	to kluster, we handed in the block aligned offset.  If we didn't get
    188  *	all the pages, we have to chop off the amount we didn't get from the
    189  *	amount handed back by bmap.
    190  *
    191  * PAGESIZE == bsize
    192  *	Life is quite pleasant here, no extra work needed, mainly because we
    193  *	(probably?) won't kluster backwards, just forwards.
    194  *
    195  * PAGESIZE > bsize
    196  *	This one has a different set of problems, specifically, we may have to
    197  *	do N reads to fill one page.  Let us hope that Sun will stay with small
    198  *	pages.
    199  *
    200  * Returns 0 on success, or a non-zero errno if an error occurs.
         *
         * bmap_read(ip, off, bnp, lenp):
         *	ip	inode to map; ip->i_contents must be held (asserted)
         *	off	byte offset within the file
         *	bnp	out: disk block number that maps off, or UFS_HOLE
         *	lenp	out: extent length in bytes; the DOEXTENT step stores it
         *		only when *bnp is not UFS_HOLE
         *
         *	Errors: EFBIG when off maps to a negative logical block or lies
         *	beyond what NIADDR levels of indirection can address; EIO when
         *	an indirect block cannot be read.
    201  *
    202  * TODO
    203  *	LMXXX - add a bmap cache.  This could be a couple of extents in the
    204  *	inode.  Two is nice for PAGESIZE > bsize.
    205  */
    206 
    207 int
    208 bmap_read(struct inode *ip, u_offset_t off, daddr_t *bnp, int *lenp)
    209 {
    210 	daddr_t lbn;			/* logical block containing off */
    211 	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
    212 	struct	fs *fs = ufsvfsp->vfs_fs;
    213 	struct	buf *bp;		/* buffer holding an indirect block */
    214 	int	i, j, boff;		/* boff: offset of off in its block */
    215 	int	shft;			/* we maintain sh = 1 << shft */
    216 	daddr_t	ob, nb, tbn;		/* tbn: lbn relative to its level */
    217 	daddr32_t *bap;			/* block pointers inside bp */
    218 	int	nindirshift, nindiroffset;
    219 
    220 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
    221 	lbn = (daddr_t)lblkno(fs, off);
    222 	boff = (int)blkoff(fs, off);
    223 	if (lbn < 0)
    224 		return (EFBIG);
    225 
    226 	/*
    227 	 * The first NDADDR blocks are direct blocks.
        	 * The extent search is limited to the remaining direct pointers
        	 * (NDADDR - lbn), and the length is clamped against i_size
        	 * (chkfrag == 1).
    228 	 */
    229 	if (lbn < NDADDR) {
    230 		DOEXTENT(fs, lbn, boff, bnp, lenp,
    231 		    ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
    232 		    ufsvfsp->vfs_iotransz);
    233 		return (0);
    234 	}
    235 
    236 	nindirshift = ufsvfsp->vfs_nindirshift;
    237 	nindiroffset = ufsvfsp->vfs_nindiroffset;
    238 	/*
    239 	 * Determine how many levels of indirection.
        	 * On exit from the loop, NIADDR - j is the i_ib[] slot to start
        	 * from and tbn is the block index relative to the first block
        	 * reachable through that slot.  j == 0 means lbn is out of range.
    240 	 */
    241 	shft = 0;				/* sh = 1 */
    242 	tbn = lbn - NDADDR;
    243 	for (j = NIADDR; j > 0; j--) {
    244 		longlong_t	sh;
    245 
    246 		shft += nindirshift;		/* sh *= nindir */
    247 		sh = 1LL << shft;
    248 		if (tbn < sh)
    249 			break;
    250 		tbn -= sh;
    251 	}
    252 	if (j == 0)
    253 		return (EFBIG);
    254 
    255 	/*
    256 	 * Fetch the first indirect block.
        	 * A zero pointer means nothing is allocated under this slot;
        	 * report a hole (*lenp is not written on this path).
    257 	 */
    258 	nb = ip->i_ib[NIADDR - j];
    259 	if (nb == 0) {
    260 		*bnp = UFS_HOLE;
    261 		return (0);
    262 	}
    263 
    264 	/*
    265 	 * Fetch through the indirect blocks.
        	 * NOTE(review): a read error is reported as EIO here, while
        	 * bmap_write returns geterror(bp) for the same kind of failure.
    266 	 */
    267 	for (; j <= NIADDR; j++) {
    268 		ob = nb;
    269 		bp = UFS_BREAD(ufsvfsp,
    270 		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
    271 		if (bp->b_flags & B_ERROR) {
    272 			brelse(bp);
    273 			return (EIO);
    274 		}
    275 		bap = bp->b_un.b_daddr;
    276 
    277 		ASSERT(!ufs_indir_badblock(ip, bap));
    278 
    279 		shft -= nindirshift;		/* sh /= nindir */
    280 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
    281 		nb = bap[i];
    282 		if (nb == 0) {
        			/* hole below this level; *lenp is not written */
    283 			*bnp = UFS_HOLE;
    284 			brelse(bp);
    285 			return (0);
    286 		}
        		/*
        		 * Keep the last-level buffer held: bap[] is still needed
        		 * by DOEXTENT below and is released after it.
        		 */
    287 		if (j != NIADDR)
    288 			brelse(bp);
    289 	}
        	/*
        	 * The extent search is limited to the entries left in this
        	 * indirect block and to the blocks up to the one holding the last
        	 * byte of the file; no frag clamp is applied (chkfrag == 0).
        	 */
    290 	DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
    291 	    MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
    292 	    0, ufsvfsp->vfs_iotransz);
    293 	brelse(bp);
    294 	return (0);
    295 }
    296 
    297 /*
    298  * See bmap_read for general notes.
    299  *
    300  * The block must be at least size bytes and will be extended or
    301  * allocated as needed.  If alloc_type is of type BI_ALLOC_ONLY, then bmap
    302  * will not create any in-core pages that correspond to the new disk allocation.
    303  * If alloc_type is of BI_FALLOCATE, blocks will be stored as (-1) * block addr
    304  * and security is maintained b/c upon reading a negative block number pages
    305  * are zeroed. For all other allocation types (BI_NORMAL) the in-core pages will
    306  * be created and initialized as needed.
         *
         * In the code below the negation for BI_FALLOCATE is applied only to
         * data-block pointers stored in an indirect block; direct blocks
         * allocated with BI_FALLOCATE are zeroed instead.
         *
         * Arguments:
         *	ip		inode; ip->i_contents must be held as writer (asserted)
         *	off		byte offset within the file
         *	size		number of bytes the block at off must be able to hold
         *	alloc_type	see above; forced to BI_NORMAL for directories,
         *			attribute directories, the quota inode, and when
         *			ISYNC is set in i_flag
         *	allocblk	if not NULL, set to 0 on entry and then to the number
         *			of the last block allocated or reallocated
         *	cr		credentials handed to alloc(), realloccg(), chkdq()
    307  *
    308  * Returns 0 on success, or a non-zero errno if an error occurs.
         * EFBIG is returned when off is out of range or when the inode's block
         * count would exceed INT_MAX; other errors come from fbread(), alloc(),
         * realloccg() or from a failed buffer read/write (geterror()).
    309  */
    310 int
    311 bmap_write(struct inode	*ip, u_offset_t	off, int size,
    312     enum bi_type alloc_type, daddr_t *allocblk, struct cred *cr)
    313 {
    314 	struct	fs *fs;
    315 	struct	buf *bp;		/* buffer holding an indirect block */
    316 	int	i;			/* entry index within indirect block */
    317 	struct	buf *nbp;		/* buffer for a new indirect block */
    318 	int	j;			/* indirection level counter */
    319 	int	shft;				/* we maintain sh = 1 << shft */
    320 	daddr_t	ob, nb, pref, lbn, llbn, tbn;
    321 	daddr32_t *bap;			/* block pointers inside bp */
    322 	struct	vnode *vp = ITOV(ip);
    323 	long	bsize = VBSIZE(vp);	/* size of a full block */
    324 	long	osize, nsize;		/* old and new size of a block/frag */
    325 	int	issync, metaflag, isdirquota;
    326 	int	err;
    327 	dev_t	dev;
    328 	struct	fbuf *fbp;
    329 	int	nindirshift;
    330 	int	nindiroffset;
    331 	struct	ufsvfs	*ufsvfsp;
    332 	int	added_sectors;		/* sectors added to this inode */
    333 	int	alloced_blocks;		/* fs blocks newly allocated */
    334 	struct  ufs_allocated_block undo_table[NIADDR+1];
    335 	int	verylargefile = 0;	/* set: check i_blocks for overflow */
    336 
    337 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
    338 
    339 	if (allocblk)
    340 		*allocblk = 0;
    341 
    342 	ufsvfsp = ip->i_ufsvfs;
    343 	fs = ufsvfsp->vfs_bufp->b_un.b_fs;
    344 	lbn = (daddr_t)lblkno(fs, off);
    345 	if (lbn < 0)
    346 		return (EFBIG);
    347 	if (ip->i_blocks >= VERYLARGEFILESIZE)
    348 		verylargefile = 1;
        	/* llbn: logical block holding the file's last byte (0 if empty) */
    349 	llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
        	/*
        	 * metaflag is handed to free() and recorded in the undo table.
        	 * isdirquota is non-zero for directories, attribute directories
        	 * and the quota inode; shadow inodes set metaflag only.
        	 */
    350 	metaflag = isdirquota = 0;
    351 	if (((ip->i_mode & IFMT) == IFDIR) ||
    352 	    ((ip->i_mode & IFMT) == IFATTRDIR))
    353 		isdirquota = metaflag = I_DIR;
    354 	else if ((ip->i_mode & IFMT) == IFSHAD)
    355 		metaflag = I_SHAD;
    356 	else if (ip->i_ufsvfs->vfs_qinod == ip)
    357 		isdirquota = metaflag = I_QUOTA;
    358 
    359 	issync = ((ip->i_flag & ISYNC) != 0);
    360 
    361 	if (isdirquota || issync) {
    362 		alloc_type = BI_NORMAL;	/* make sure */
    363 	}
    364 
    365 	/*
    366 	 * If the next write will extend the file into a new block,
    367 	 * and the file is currently composed of a fragment
    368 	 * this fragment has to be extended to be a full block.
    369 	 */
    370 	if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
    371 		osize = blksize(fs, ip, llbn);
    372 		if (osize < bsize && osize > 0) {
    373 			/*
    374 			 * Check to see if doing this will make the file too
    375 			 * big.  Only check if we are dealing with a very
    376 			 * large file.
    377 			 */
    378 			if (verylargefile == 1) {
    379 				if (((unsigned)ip->i_blocks +
    380 				    btodb(bsize - osize)) > INT_MAX) {
    381 					return (EFBIG);
    382 				}
    383 			}
    384 			/*
    385 			 * Make sure we have all needed pages setup correctly.
    386 			 *
    387 			 * We pass S_OTHER to fbread here because we want
    388 			 * an exclusive lock on the page in question
    389 			 * (see ufs_getpage). I/O to the old block location
    390 			 * may still be in progress and we are about to free
    391 			 * the old block. We don't want anyone else to get
    392 			 * a hold of the old block once we free it until
    393 			 * the I/O is complete.
    394 			 */
    395 			err =
    396 			    fbread(ITOV(ip), ((offset_t)llbn << fs->fs_bshift),
    397 			    (uint_t)bsize, S_OTHER, &fbp);
    398 			if (err)
    399 				return (err);
    400 			pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
    401 			err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
    402 			    &nb, cr);
    403 			if (err) {
    404 				if (fbp)
    405 					fbrelse(fbp, S_OTHER);
    406 				return (err);
    407 			}
    408 			ASSERT(!ufs_badblock(ip, nb));
    409 
    410 			/*
    411 			 * Update the inode before releasing the
    412 			 * lock on the page. If we released the page
    413 			 * lock first, the data could be written to its
    414 			 * old address and then destroyed.
    415 			 */
    416 			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
    417 			ip->i_db[llbn] = nb;
    418 			UFS_SET_ISIZE(((u_offset_t)(llbn + 1)) << fs->fs_bshift,
    419 			    ip);
    420 			ip->i_blocks += btodb(bsize - osize);
    421 			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
    422 			TRANS_INODE(ufsvfsp, ip);
    423 			ip->i_flag |= IUPD | ICHG | IATTCHG;
    424 
    425 			/* Caller is responsible for updating i_seq */
    426 			/*
    427 			 * Don't check metaflag here, directories won't do this
    428 			 *
    429 			 */
    430 			if (issync) {
    431 				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
    432 			} else {
    433 				ASSERT(fbp);
    434 				fbrelse(fbp, S_WRITE);
    435 			}
    436 
        			/* the block was moved: free the old fragment(s) */
    437 			if (nb != ob) {
    438 				(void) free(ip, ob, (off_t)osize, metaflag);
    439 			}
    440 		}
    441 	}
    442 
    443 	/*
    444 	 * The first NDADDR blocks are direct blocks.
        	 * Work is needed only if the block is unallocated or the file
        	 * currently ends inside it (it may then be a fragment that has
        	 * to grow).
    445 	 */
    446 	if (lbn < NDADDR) {
    447 		nb = ip->i_db[lbn];
    448 		if (nb == 0 ||
    449 		    ip->i_size < ((u_offset_t)(lbn + 1)) << fs->fs_bshift) {
    450 			if (nb != 0) {
    451 				/* consider need to reallocate a frag */
    452 				osize = fragroundup(fs, blkoff(fs, ip->i_size));
    453 				nsize = fragroundup(fs, size);
        				/* existing allocation is already big enough */
    454 				if (nsize <= osize)
    455 					goto gotit;
    456 				/*
    457 				 * Check to see if doing this will make the
    458 				 * file too big.  Only check if we are dealing
    459 				 * with a very large file.
    460 				 */
    461 				if (verylargefile == 1) {
    462 					if (((unsigned)ip->i_blocks +
    463 					    btodb(nsize - osize)) > INT_MAX) {
    464 						return (EFBIG);
    465 					}
    466 				}
    467 				/*
    468 				 * need to re-allocate a block or frag
    469 				 */
    470 				ob = nb;
    471 				pref = blkpref(ip, lbn, (int)lbn,
    472 				    &ip->i_db[0]);
    473 				err = realloccg(ip, ob, pref, (int)osize,
    474 				    (int)nsize, &nb, cr);
    475 				if (err)
    476 					return (err);
    477 				if (allocblk)
    478 					*allocblk = nb;
    479 				ASSERT(!ufs_badblock(ip, nb));
    480 
    481 			} else {
    482 				/*
    483 				 * need to allocate a block or frag
        				 * (a fragment only if the file ends
        				 * inside this block, else a full block)
    484 				 */
    485 				osize = 0;
    486 				if (ip->i_size <
    487 				    ((u_offset_t)(lbn + 1)) << fs->fs_bshift)
    488 					nsize = fragroundup(fs, size);
    489 				else
    490 					nsize = bsize;
    491 				/*
    492 				 * Check to see if doing this will make the
    493 				 * file too big.  Only check if we are dealing
    494 				 * with a very large file.
    495 				 */
    496 				if (verylargefile == 1) {
    497 					if (((unsigned)ip->i_blocks +
    498 					    btodb(nsize - osize)) > INT_MAX) {
    499 						return (EFBIG);
    500 					}
    501 				}
    502 				pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
    503 				err = alloc(ip, pref, (int)nsize, &nb, cr);
    504 				if (err)
    505 					return (err);
    506 				if (allocblk)
    507 					*allocblk = nb;
    508 				ASSERT(!ufs_badblock(ip, nb));
        				/* ob == nb: nothing old to free below */
    509 				ob = nb;
    510 			}
    511 
    512 			/*
    513 			 * Read old/create new zero pages
    514 			 */
    515 			fbp = NULL;
    516 			if (osize == 0) {
    517 				/*
    518 				 * mmap S_WRITE faults always enter here
    519 				 */
    520 				/*
    521 				 * We zero it if its also BI_FALLOCATE, but
    522 				 * only for direct blocks!
    523 				 */
    524 				if (alloc_type == BI_NORMAL ||
    525 				    alloc_type == BI_FALLOCATE ||
    526 				    P2ROUNDUP_TYPED(size,
    527 				    PAGESIZE, u_offset_t) < nsize) {
    528 					/* fbzero doesn't cause a pagefault */
    529 					fbzero(ITOV(ip),
    530 					    ((offset_t)lbn << fs->fs_bshift),
    531 					    (uint_t)nsize, &fbp);
    532 				}
    533 			} else {
    534 				err = fbread(vp,
    535 				    ((offset_t)lbn << fs->fs_bshift),
    536 				    (uint_t)nsize, S_OTHER, &fbp);
    537 				if (err) {
        					/*
        					 * Give back what realloccg() just
        					 * added: the whole new block if the
        					 * data was moved, otherwise only the
        					 * extension.  The chkdq() call with
        					 * a negative count presumably returns
        					 * the matching quota charge.
        					 */
    538 					if (nb != ob) {
    539 						(void) free(ip, nb,
    540 						    (off_t)nsize, metaflag);
    541 					} else {
    542 						(void) free(ip,
    543 						    ob + numfrags(fs, osize),
    544 						    (off_t)(nsize - osize),
    545 						    metaflag);
    546 					}
    547 					ASSERT(nsize >= osize);
    548 					(void) chkdq(ip,
    549 					    -(long)btodb(nsize - osize),
    550 					    0, cr, (char **)NULL,
    551 					    (size_t *)NULL);
    552 					return (err);
    553 				}
    554 			}
    555 			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
    556 			ip->i_db[lbn] = nb;
    557 			ip->i_blocks += btodb(nsize - osize);
    558 			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
    559 			TRANS_INODE(ufsvfsp, ip);
    560 			ip->i_flag |= IUPD | ICHG | IATTCHG;
    561 
    562 			/* Caller is responsible for updating i_seq */
    563 
    564 			/*
    565 			 * Write directory and shadow blocks synchronously so
    566 			 * that they never appear with garbage in them on the
    567 			 * disk.
    568 			 *
        			 * NOTE(review): the test below uses isdirquota, which
        			 * is set for directories, attribute directories and
        			 * the quota inode, but not for shadow inodes.
    569 			 */
    570 			if (isdirquota && (ip->i_size ||
    571 			    TRANS_ISTRANS(ufsvfsp))) {
    572 			/*
    573 			 * XXX may not be necessary with harpy trans
    574 			 * bug id 1130055
    575 			 */
    576 				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
    577 			} else if (fbp) {
    578 				fbrelse(fbp, S_WRITE);
    579 			}
    580 
        			/* the block was moved: free the old fragment(s) */
    581 			if (nb != ob)
    582 				(void) free(ip, ob, (off_t)osize, metaflag);
    583 		}
    584 gotit:
    585 		return (0);
    586 	}
    587 
    588 	added_sectors = alloced_blocks = 0;	/* No blocks alloced yet */
    589 
    590 	/*
    591 	 * Determine how many levels of indirection.
        	 * On exit from the loop, NIADDR - j is the i_ib[] slot to start
        	 * from and tbn is the block index relative to the first block
        	 * reachable through that slot.  j == 0 means lbn is out of range.
    592 	 */
    593 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
    594 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
    595 	pref = 0;
    596 	shft = 0;				/* sh = 1 */
    597 	tbn = lbn - NDADDR;
    598 	for (j = NIADDR; j > 0; j--) {
    599 		longlong_t	sh;
    600 
    601 		shft += nindirshift;		/* sh *= nindir */
    602 		sh = 1LL << shft;
    603 		if (tbn < sh)
    604 			break;
    605 		tbn -= sh;
    606 	}
    607 
    608 	if (j == 0)
    609 		return (EFBIG);
    610 
    611 	/*
    612 	 * Fetch the first indirect block.
    613 	 */
    614 	dev = ip->i_dev;
    615 	nb = ip->i_ib[NIADDR - j];
    616 	if (nb == 0) {
    617 		/*
    618 		 * Check to see if doing this will make the
    619 		 * file too big.  Only check if we are dealing
    620 		 * with a very large file.
    621 		 */
    622 		if (verylargefile == 1) {
    623 			if (((unsigned)ip->i_blocks + btodb(bsize))
    624 			    > INT_MAX) {
    625 				return (EFBIG);
    626 			}
    627 		}
    628 		/*
    629 		 * Need to allocate an indirect block.
    630 		 */
    631 		pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
    632 		err = alloc(ip, pref, (int)bsize, &nb, cr);
    633 		if (err)
    634 			return (err);
    635 		TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
    636 		ASSERT(!ufs_badblock(ip, nb));
    637 
    638 		/*
    639 		 * Keep track of this allocation so we can undo it if we
    640 		 * get an error later.
    641 		 */
    642 
        		/* undo_table[] has NIADDR + 1 entries */
    643 		ASSERT(alloced_blocks <= NIADDR);
    644 
    645 		undo_table[alloced_blocks].this_block = nb;
    646 		undo_table[alloced_blocks].block_size = bsize;
    647 		undo_table[alloced_blocks].owner = ufs_no_owner;
    648 		undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;
    649 
    650 		alloced_blocks++;
    651 
    652 		/*
    653 		 * Write zero block synchronously so that
    654 		 * indirect blocks never point at garbage.
    655 		 */
    656 		bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);
    657 
    658 		clrbuf(bp);
    659 		/* XXX Maybe special-case this? */
    660 		TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
    661 		UFS_BWRITE2(ufsvfsp, bp);
    662 		if (bp->b_flags & B_ERROR) {
    663 			err = geterror(bp);
    664 			brelse(bp);
    665 			ufs_undo_allocation(ip, alloced_blocks,
    666 			    undo_table, added_sectors);
    667 			return (err);
    668 		}
    669 		brelse(bp);
    670 
    671 		ip->i_ib[NIADDR - j] = nb;
    672 		added_sectors += btodb(bsize);
    673 		ip->i_blocks += btodb(bsize);
    674 		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
    675 		TRANS_INODE(ufsvfsp, ip);
    676 		ip->i_flag |= IUPD | ICHG | IATTCHG;
    677 		/* Caller is responsible for updating i_seq */
    678 
    679 		/*
    680 		 * Update the 'undo table' now that we've linked this block
    681 		 * to an inode.
    682 		 */
    683 
    684 		undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
    685 		undo_table[alloced_blocks-1].owner_offset = NIADDR - j;
    686 
    687 		/*
    688 		 * In the ISYNC case, wrip will notice that the block
    689 		 * count on the inode has changed and will be sure to
    690 		 * ufs_iupdat the inode at the end of wrip.
    691 		 */
    692 	}
    693 
    694 	/*
    695 	 * Fetch through the indirect blocks.
        	 * At each level, read indirect block ob and look up entry i.
        	 * A zero entry is filled in by allocating either the next
        	 * indirect block (j < NIADDR) or the data block (j == NIADDR).
    696 	 */
    697 	for (; j <= NIADDR; j++) {
    698 		ob = nb;
    699 		bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);
    700 
    701 		if (bp->b_flags & B_ERROR) {
    702 			err = geterror(bp);
    703 			brelse(bp);
    704 			/*
    705 			 * Return any partial allocations.
    706 			 *
    707 			 * It is possible that we have not yet made any
    708 			 * allocations at this point (if this is the first
    709 			 * pass through the loop and we didn't have to
    710 			 * allocate the first indirect block, above).
    711 			 * In this case, alloced_blocks and added_sectors will
    712 			 * be zero, and ufs_undo_allocation will do nothing.
    713 			 */
    714 			ufs_undo_allocation(ip, alloced_blocks,
    715 			    undo_table, added_sectors);
    716 			return (err);
    717 		}
    718 		bap = bp->b_un.b_daddr;
    719 		shft -= nindirshift;		/* sh /= nindir */
    720 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
    721 		nb = bap[i];
    722 
    723 		if (nb == 0) {
    724 			/*
    725 			 * Check to see if doing this will make the
    726 			 * file too big.  Only check if we are dealing
    727 			 * with a very large file.
    728 			 */
    729 			if (verylargefile == 1) {
    730 				if (((unsigned)ip->i_blocks + btodb(bsize))
    731 				    > INT_MAX) {
    732 					brelse(bp);
    733 					ufs_undo_allocation(ip, alloced_blocks,
    734 					    undo_table, added_sectors);
    735 					return (EFBIG);
    736 				}
    737 			}
        			/*
        			 * pref is still 0 unless an allocation has
        			 * already been made during this call.
        			 */
    738 			if (pref == 0) {
    739 				if (j < NIADDR) {
    740 					/* Indirect block */
    741 					pref = blkpref(ip, lbn, 0,
    742 					    (daddr32_t *)0);
    743 				} else {
    744 					/* Data block */
    745 					pref = blkpref(ip, lbn, i, &bap[0]);
    746 				}
    747 			}
    748 
    749 			/*
    750 			 * release "bp" buf to avoid deadlock (re-bread later)
    751 			 */
    752 			brelse(bp);
    753 
    754 			err = alloc(ip, pref, (int)bsize, &nb, cr);
    755 			if (err) {
    756 				/*
    757 				 * Return any partial allocations.
    758 				 */
    759 				ufs_undo_allocation(ip, alloced_blocks,
    760 				    undo_table, added_sectors);
    761 				return (err);
    762 			}
    763 
    764 			ASSERT(!ufs_badblock(ip, nb));
    765 			ASSERT(alloced_blocks <= NIADDR);
    766 
    767 			if (allocblk)
    768 				*allocblk = nb;
    769 
        			/* record the block; its owner is filled in below */
    770 			undo_table[alloced_blocks].this_block = nb;
    771 			undo_table[alloced_blocks].block_size = bsize;
    772 			undo_table[alloced_blocks].owner = ufs_no_owner;
    773 			undo_table[alloced_blocks].usage_flags = metaflag |
    774 			    ((j < NIADDR) ? I_IBLK : 0);
    775 
    776 			alloced_blocks++;
    777 
    778 			if (j < NIADDR) {
    779 				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
    780 				/*
    781 				 * Write synchronously so indirect
    782 				 * blocks never point at garbage.
    783 				 */
    784 				nbp = UFS_GETBLK(
    785 				    ufsvfsp, dev, fsbtodb(fs, nb), bsize);
    786 
    787 				clrbuf(nbp);
    788 				/* XXX Maybe special-case this? */
    789 				TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
    790 				UFS_BWRITE2(ufsvfsp, nbp);
    791 				if (nbp->b_flags & B_ERROR) {
    792 					err = geterror(nbp);
    793 					brelse(nbp);
    794 					/*
    795 					 * Return any partial
    796 					 * allocations.
    797 					 */
    798 					ufs_undo_allocation(ip,
    799 					    alloced_blocks,
    800 					    undo_table, added_sectors);
    801 					return (err);
    802 				}
    803 				brelse(nbp);
    804 			} else if (alloc_type == BI_NORMAL ||
    805 			    P2ROUNDUP_TYPED(size,
    806 			    PAGESIZE, u_offset_t) < bsize) {
    807 				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
    808 				fbzero(ITOV(ip),
    809 				    ((offset_t)lbn << fs->fs_bshift),
    810 				    (uint_t)bsize, &fbp);
    811 
    812 				/*
    813 				 * Cases which we need to do a synchronous
    814 				 * write of the zeroed data pages:
    815 				 *
    816 				 * 1) If we are writing a directory then we
    817 				 * want to write synchronously so blocks in
    818 				 * directories never contain garbage.
    819 				 *
    820 				 * 2) If we are filling in a hole and the
    821 				 * indirect block is going to be synchronously
    822 				 * written back below we need to make sure
    823 				 * that the zeroes are written here before
    824 				 * the indirect block is updated so that if
    825 				 * we crash before the real data is pushed
    826 				 * we will not end up with random data in
    827 				 * the middle of the file.
    828 				 *
    829 				 * 3) If the size of the request rounded up
    830 				 * to the system page size is smaller than
    831 				 * the file system block size, we want to
    832 				 * write out all the pages now so that
    833 				 * they are not aborted before they actually
    834 				 * make it to ufs_putpage since the length
    835 				 * of the inode will not include the pages.
    836 				 */
    837 
    838 				if (isdirquota || (issync &&
    839 				    lbn < llbn))
    840 					(void) ufs_fbiwrite(fbp, ip, nb,
    841 					    fs->fs_fsize);
    842 				else
    843 					fbrelse(fbp, S_WRITE);
    844 			}
    845 
    846 			/*
    847 			 * re-acquire "bp" buf
    848 			 */
    849 			bp = UFS_BREAD(ufsvfsp,
    850 			    ip->i_dev, fsbtodb(fs, ob), bsize);
    851 			if (bp->b_flags & B_ERROR) {
    852 				err = geterror(bp);
    853 				brelse(bp);
    854 				/*
    855 				 * Return any partial allocations.
    856 				 */
    857 				ufs_undo_allocation(ip,
    858 				    alloced_blocks,
    859 				    undo_table, added_sectors);
    860 				return (err);
    861 			}
    862 			bap = bp->b_un.b_daddr;
    863 			bap[i] = nb;
    864 
    865 			/*
    866 			 * The magic explained: j will be equal to NIADDR
    867 			 * when we are at the lowest level, this is where the
    868 			 * array entries point directly to data blocks. Since
    869 			 * we will be 'fallocate'ing we will go ahead and negate
    870 			 * the addresses.
    871 			 */
    872 			if (alloc_type == BI_FALLOCATE && j == NIADDR)
    873 				bap[i] = -bap[i];
    874 
    875 			TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
    876 			added_sectors += btodb(bsize);
    877 			ip->i_blocks += btodb(bsize);
    878 			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
    879 			TRANS_INODE(ufsvfsp, ip);
    880 			ip->i_flag |= IUPD | ICHG | IATTCHG;
    881 
    882 			/* Caller is responsible for updating i_seq */
    883 
        			/* the pointer now lives in entry i of block ob */
    884 			undo_table[alloced_blocks-1].owner =
    885 			    ufs_indirect_block;
    886 			undo_table[alloced_blocks-1].owner_block = ob;
    887 			undo_table[alloced_blocks-1].owner_offset = i;
    888 
        			/*
        			 * With ISYNC the updated indirect block is written
        			 * now and checked for errors; otherwise it is handed
        			 * to bdrwrite().
        			 */
    889 			if (issync) {
    890 				UFS_BWRITE2(ufsvfsp, bp);
    891 				if (bp->b_flags & B_ERROR) {
    892 					err = geterror(bp);
    893 					brelse(bp);
    894 					/*
    895 					 * Return any partial
    896 					 * allocations.
    897 					 */
    898 					ufs_undo_allocation(ip,
    899 					    alloced_blocks,
    900 					    undo_table, added_sectors);
    901 					return (err);
    902 				}
    903 				brelse(bp);
    904 			} else {
    905 				bdrwrite(bp);
    906 			}
    907 		} else {
        			/* entry already allocated; go down one level */
    908 			brelse(bp);
    909 		}
    910 	}
    911 	return (0);
    912 }
    913 
    914 /*
    915  * Return 1 if inode has unmapped blocks (UFS holes).
    916  */
    917 int
    918 bmap_has_holes(struct inode *ip)
    919 {
    920 	struct fs *fs = ip->i_fs;
    921 	uint_t	dblks; 			/* # of data blocks */
    922 	uint_t	mblks;			/* # of data + metadata blocks */
    923 	int	nindirshift;
    924 	int	nindiroffset;
    925 	uint_t	cnt;
    926 	int	n, j, shft;
    927 	uint_t nindirblks;
    928 
    929 	int	fsbshift = fs->fs_bshift;
    930 	int	fsboffset = (1 << fsbshift) - 1;
    931 
    932 	dblks = (ip->i_size + fsboffset) >> fsbshift;
    933 	mblks = (ldbtob((u_offset_t)ip->i_blocks) + fsboffset) >> fsbshift;
    934 
    935 	/*
    936 	 * File has only direct blocks.
    937 	 */
    938 	if (dblks <= NDADDR)
    939 		return (mblks < dblks);
    940 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
    941 
    942 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
    943 	nindirblks = nindiroffset + 1;
    944 
    945 	dblks -= NDADDR;
    946 	shft = 0;
    947 	/*
    948 	 * Determine how many levels of indirection.
    949 	 */
    950 	for (j = NIADDR; j > 0; j--) {
    951 		longlong_t	sh;
    952 
    953 		shft += nindirshift;	/* sh *= nindir */
    954 		sh = 1LL << shft;
    955 		if (dblks <= sh)
    956 			break;
    957 		dblks -= sh;
    958 	}
    959 	/* LINTED: warning: logical expression always true: op "||" */
    960 	ASSERT(NIADDR <= 3);
    961 	ASSERT(j <= NIADDR);
    962 	if (j == NIADDR)	/* single level indirection */
    963 		cnt = NDADDR + 1 + dblks;
    964 	else if (j == NIADDR-1) /* double indirection */
    965 		cnt = NDADDR + 1 + nindirblks +
    966 		    1 + (dblks + nindiroffset)/nindirblks + dblks;
    967 	else if (j == NIADDR-2) { /* triple indirection */
    968 		n = (dblks + nindiroffset)/nindirblks;
    969 		cnt = NDADDR + 1 + nindirblks +
    970 		    1 + nindirblks + nindirblks*nindirblks +
    971 		    1 + (n + nindiroffset)/nindirblks + n + dblks;
    972 	}
    973 
    974 	return (mblks < cnt);
    975 }
    976 
    977 /*
    978  * find some contig blocks starting at *sbp and going for min(n, max_contig)
    979  * return the number of blocks (not frags) found.
    980  * The array passed in must be at least [0..n-1].
    981  */
    982 static int
    983 findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
    984 {
    985 	register daddr_t bn, nextbn;
    986 	register daddr32_t *bp;
    987 	register int diff;
    988 	int maxtransblk;
    989 
    990 	if (n <= 0)
    991 		return (0);
    992 	bn = *sbp;
    993 	if (bn == 0)
    994 		return (0);
    995 
    996 	diff = fs->fs_frag;
    997 	if (*lenp) {
    998 		n = MIN(n, lblkno(fs, *lenp));
    999 	} else {
   1000 		/*
   1001 		 * If the user has set the value for maxcontig lower than
   1002 		 * the drive transfer size, then assume they want this
   1003 		 * to be the maximum value for the size of the data transfer.
   1004 		 */
   1005 		maxtransblk = maxtransfer >> DEV_BSHIFT;
   1006 		if (fs->fs_maxcontig < maxtransblk) {
   1007 			n = MIN(n, fs->fs_maxcontig);
   1008 		} else {
   1009 			n = MIN(n, maxtransblk);
   1010 		}
   1011 	}
   1012 	bp = sbp;
   1013 	while (--n > 0) {
   1014 		nextbn = *(bp + 1);
   1015 		if (nextbn == 0 || bn + diff != nextbn)
   1016 			break;
   1017 		bn = nextbn;
   1018 		bp++;
   1019 	}
   1020 	return ((int)(bp - sbp) + 1);
   1021 }
   1022 
   1023 /*
   1024  * Free any blocks which had been successfully allocated.  Always called
   1025  * as a result of an error, so we don't bother returning an error code
   1026  * from here.
   1027  *
   1028  * If block_count and inode_sector_adjust are both zero, we'll do nothing.
   1029  * Thus it is safe to call this as part of error handling, whether or not
   1030  * any blocks have been allocated.
   1031  *
   1032  * The ufs_inode_direct case is currently unused.
   1033  */
   1034 
   1035 static void
   1036 ufs_undo_allocation(
   1037 	inode_t *ip,
   1038 	int block_count,
   1039 	struct ufs_allocated_block table[],
   1040 	int inode_sector_adjust)
   1041 {
   1042 	int i;
   1043 	int inode_changed;
   1044 	int error_updating_pointers;
   1045 	struct ufsvfs *ufsvfsp;
   1046 
   1047 	inode_changed = 0;
   1048 	error_updating_pointers = 0;
   1049 
   1050 	ufsvfsp = ip->i_ufsvfs;
   1051 
   1052 	/*
   1053 	 * Update pointers on disk before freeing blocks.  If we fail,
   1054 	 * some blocks may remain busy; but they will be reclaimed by
   1055 	 * an fsck.  (This is better than letting a block wind up with
   1056 	 * two owners if we successfully freed it but could not remove
   1057 	 * the pointer to it.)
   1058 	 */
   1059 
   1060 	for (i = 0; i < block_count; i++) {
   1061 		switch (table[i].owner) {
   1062 		case ufs_no_owner:
   1063 			/* Nothing to do here, nobody points to us */
   1064 			break;
   1065 		case ufs_inode_direct:
   1066 			ASSERT(table[i].owner_offset < NDADDR);
   1067 			ip->i_db[table[i].owner_offset] = 0;
   1068 			inode_changed = 1;
   1069 			break;
   1070 		case ufs_inode_indirect:
   1071 			ASSERT(table[i].owner_offset < NIADDR);
   1072 			ip->i_ib[table[i].owner_offset] = 0;
   1073 			inode_changed = 1;
   1074 			break;
   1075 		case ufs_indirect_block: {
   1076 			buf_t *bp;
   1077 			daddr32_t *block_data;
   1078 
   1079 			/* Read/modify/log/write. */
   1080 
   1081 			ASSERT(table[i].owner_offset <
   1082 			    (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));
   1083 
   1084 			bp = UFS_BREAD(ufsvfsp, ip->i_dev,
   1085 			    fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
   1086 			    VBSIZE(ITOV(ip)));
   1087 
   1088 			if (bp->b_flags & B_ERROR) {
   1089 				/* Couldn't read this block; give up. */
   1090 				error_updating_pointers = 1;
   1091 				brelse(bp);
   1092 				break;		/* out of SWITCH */
   1093 			}
   1094 
   1095 			block_data = bp->b_un.b_daddr;
   1096 			block_data[table[i].owner_offset] = 0;
   1097 
   1098 			/* Write a log entry which includes the zero. */
   1099 			/* It might be possible to optimize this by using */
   1100 			/* TRANS_BUF directly and zeroing only the four */
   1101 			/* bytes involved, but an attempt to do that led */
   1102 			/* to panics in the logging code.  The attempt was */
   1103 			/* TRANS_BUF(ufsvfsp,				  */
   1104 			/*    table[i].owner_offset * sizeof (daddr32_t), */
   1105 			/*    sizeof (daddr32_t),			  */
   1106 			/*    bp,					  */
   1107 			/*    DT_ABZERO);				  */
   1108 
   1109 			TRANS_BUF_ITEM_128(ufsvfsp,
   1110 			    block_data[table[i].owner_offset],
   1111 			    block_data, bp, DT_AB);
   1112 
   1113 			/* Now we can write the buffer itself. */
   1114 
   1115 			UFS_BWRITE2(ufsvfsp, bp);
   1116 
   1117 			if (bp->b_flags & B_ERROR) {
   1118 				error_updating_pointers = 1;
   1119 			}
   1120 
   1121 			brelse(bp);
   1122 			break;
   1123 		}
   1124 		default:
   1125 			(void) ufs_fault(ITOV(ip),
   1126 			    "ufs_undo_allocation failure\n");
   1127 			break;
   1128 		}
   1129 	}
   1130 
   1131 	/*
   1132 	 * If the inode changed, or if we need to update its block count,
   1133 	 * then do that now.  We update the inode synchronously on disk
   1134 	 * to ensure that it won't transiently point at a block we've
   1135 	 * freed (only necessary if we're not logging).
   1136 	 *
   1137 	 * NOTE: Currently ufs_iupdat() does not check for errors.  When
   1138 	 * it is fixed, we should verify that we successfully updated the
   1139 	 * inode before freeing blocks below.
   1140 	 */
   1141 
   1142 	if (inode_changed || (inode_sector_adjust != 0)) {
   1143 		ip->i_blocks -= inode_sector_adjust;
   1144 		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
   1145 		TRANS_INODE(ufsvfsp, ip);
   1146 		ip->i_flag |= IUPD | ICHG | IATTCHG;
   1147 		ip->i_seq++;
   1148 		if (!TRANS_ISTRANS(ufsvfsp))
   1149 			ufs_iupdat(ip, I_SYNC);
   1150 	}
   1151 
   1152 	/*
   1153 	 * Now we go through and actually free the blocks, but only if we
   1154 	 * successfully removed the pointers to them.
   1155 	 */
   1156 
   1157 	if (!error_updating_pointers) {
   1158 		for (i = 0; i < block_count; i++) {
   1159 			free(ip, table[i].this_block, table[i].block_size,
   1160 			    table[i].usage_flags);
   1161 		}
   1162 	}
   1163 }
   1164 
   1165 /*
   1166  * Find the next hole or data block in file starting at *off
   1167  * Return found offset in *off, which can be less than the
   1168  * starting offset if not block aligned.
   1169  * This code is based on bmap_read().
   1170  * Errors: ENXIO for end of file
   1171  *         EIO for block read error.
   1172  */
   1173 int
   1174 bmap_find(struct inode *ip, boolean_t hole, u_offset_t *off)
   1175 {
   1176 	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
   1177 	struct fs *fs = ufsvfsp->vfs_fs;
   1178 	buf_t *bp[NIADDR];
   1179 	int i, j;
   1180 	int shft;			/* we maintain sh = 1 << shft */
   1181 	int nindirshift, nindiroffset;
   1182 	daddr_t	ob, nb, tbn, lbn, skip;
   1183 	daddr32_t *bap;
   1184 	u_offset_t isz = (offset_t)ip->i_size;
   1185 	int32_t bs = fs->fs_bsize; /* file system block size */
   1186 	int32_t nindir = fs->fs_nindir;
   1187 	dev_t dev;
   1188 	int error = 0;
   1189 	daddr_t limits[NIADDR];
   1190 
   1191 	ASSERT(*off < isz);
   1192 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
   1193 	lbn = (daddr_t)lblkno(fs, *off);
   1194 	ASSERT(lbn >= 0);
   1195 
   1196 	for (i = 0; i < NIADDR; i++)
   1197 		bp[i] = NULL;
   1198 
   1199 	/*
   1200 	 * The first NDADDR blocks are direct blocks.
   1201 	 */
   1202 	if (lbn < NDADDR) {
   1203 		for (; lbn < NDADDR; lbn++) {
   1204 			if ((hole && (ip->i_db[lbn] == 0)) ||
   1205 			    (!hole && (ip->i_db[lbn] != 0))) {
   1206 				goto out;
   1207 			}
   1208 		}
   1209 		if ((u_offset_t)lbn << fs->fs_bshift >= isz)
   1210 			goto out;
   1211 	}
   1212 
   1213 	nindir = fs->fs_nindir;
   1214 	nindirshift = ufsvfsp->vfs_nindirshift;
   1215 	nindiroffset = ufsvfsp->vfs_nindiroffset;
   1216 	dev = ip->i_dev;
   1217 
   1218 	/* Set up limits array */
   1219 	for (limits[0] = NDADDR, j = 1; j  < NIADDR; j++)
   1220 		limits[j] = limits[j-1] + (1ULL << (nindirshift * j));
   1221 
   1222 loop:
   1223 	/*
   1224 	 * Determine how many levels of indirection.
   1225 	 */
   1226 	shft = 0;				/* sh = 1 */
   1227 	tbn = lbn - NDADDR;
   1228 	for (j = NIADDR; j > 0; j--) {
   1229 		longlong_t sh;
   1230 
   1231 		shft += nindirshift;		/* sh *= nindir */
   1232 		sh = 1LL << shft;
   1233 		if (tbn < sh)
   1234 			break;
   1235 		tbn -= sh;
   1236 	}
   1237 	if (j == 0) {
   1238 		/* must have passed end of file */
   1239 		ASSERT(((u_offset_t)lbn << fs->fs_bshift) >= isz);
   1240 		goto out;
   1241 	}
   1242 
   1243 	/*
   1244 	 * Fetch the first indirect block.
   1245 	 */
   1246 	nb = ip->i_ib[NIADDR - j];
   1247 	if (nb == 0) {
   1248 		if (hole) {
   1249 			lbn = limits[NIADDR - j];
   1250 			goto out;
   1251 		} else {
   1252 			lbn = limits[NIADDR - j + 1];
   1253 			if ((u_offset_t)lbn << fs->fs_bshift >= isz)
   1254 				goto out;
   1255 			goto loop;
   1256 		}
   1257 	}
   1258 
   1259 	/*
   1260 	 * Fetch through the indirect blocks.
   1261 	 */
   1262 	for (; ((j <= NIADDR) && (nb != 0)); j++) {
   1263 		ob = nb;
   1264 		/*
   1265 		 * if there's a different block at this level then release
   1266 		 * the old one and in with the new.
   1267 		 */
   1268 		if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
   1269 			if (bp[j-1] != NULL)
   1270 				brelse(bp[j-1]);
   1271 			bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
   1272 			if (bp[j-1]->b_flags & B_ERROR) {
   1273 				error = EIO;
   1274 				goto out;
   1275 			}
   1276 		}
   1277 		bap = bp[j-1]->b_un.b_daddr;
   1278 
   1279 		shft -= nindirshift;		/* sh / nindir */
   1280 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
   1281 		nb = bap[i];
   1282 		skip = 1LL << (nindirshift * (NIADDR - j));
   1283 	}
   1284 
   1285 	/*
   1286 	 * Scan through the blocks in this array.
   1287 	 */
   1288 	for (; i < nindir; i++, lbn += skip) {
   1289 		if (hole && (bap[i] == 0))
   1290 			goto out;
   1291 		if (!hole && (bap[i] != 0)) {
   1292 			if (skip == 1) {
   1293 				/* we're at the lowest level */
   1294 				goto out;
   1295 			} else {
   1296 				goto loop;
   1297 			}
   1298 		}
   1299 	}
   1300 	if (((u_offset_t)lbn << fs->fs_bshift) < isz)
   1301 		goto loop;
   1302 out:
   1303 	for (i = 0; i < NIADDR; i++) {
   1304 		if (bp[i])
   1305 			brelse(bp[i]);
   1306 	}
   1307 	if (error == 0) {
   1308 		if (((u_offset_t)lbn << fs->fs_bshift) >= isz) {
   1309 			error = ENXIO;
   1310 		} else {
   1311 			/* success */
   1312 			*off = (u_offset_t)lbn << fs->fs_bshift;
   1313 		}
   1314 	}
   1315 	return (error);
   1316 }
   1317 
   1318 /*
   1319  * Set a particular offset in the inode list to be a certain block.
   1320  * User is responsible for calling TRANS* functions
   1321  */
   1322 int
   1323 bmap_set_bn(struct vnode *vp, u_offset_t off, daddr32_t bn)
   1324 {
   1325 	daddr_t lbn;
   1326 	struct inode *ip;
   1327 	ufsvfs_t *ufsvfsp;
   1328 	struct	fs *fs;
   1329 	struct	buf *bp;
   1330 	int	i, j;
   1331 	int	shft;			/* we maintain sh = 1 << shft */
   1332 	int err;
   1333 	daddr_t	ob, nb, tbn;
   1334 	daddr32_t *bap;
   1335 	int	nindirshift, nindiroffset;
   1336 
   1337 	ip = VTOI(vp);
   1338 	ufsvfsp = ip->i_ufsvfs;
   1339 	fs = ufsvfsp->vfs_fs;
   1340 	lbn = (daddr_t)lblkno(fs, off);
   1341 
   1342 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
   1343 
   1344 	if (lbn < 0)
   1345 		return (EFBIG);
   1346 
   1347 	/*
   1348 	 * Take care of direct block assignment
   1349 	 */
   1350 	if (lbn < NDADDR) {
   1351 		ip->i_db[lbn] = bn;
   1352 		return (0);
   1353 	}
   1354 
   1355 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
   1356 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
   1357 	/*
   1358 	 * Determine how many levels of indirection.
   1359 	 */
   1360 	shft = 0;				/* sh = 1 */
   1361 	tbn = lbn - NDADDR;
   1362 	for (j = NIADDR; j > 0; j--) {
   1363 		longlong_t	sh;
   1364 
   1365 		shft += nindirshift;		/* sh *= nindir */
   1366 		sh = 1LL << shft;
   1367 		if (tbn < sh)
   1368 			break;
   1369 		tbn -= sh;
   1370 	}
   1371 	if (j == 0)
   1372 		return (EFBIG);
   1373 
   1374 	/*
   1375 	 * Fetch the first indirect block.
   1376 	 */
   1377 	nb = ip->i_ib[NIADDR - j];
   1378 	if (nb == 0) {
   1379 		err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
   1380 		return (err);
   1381 	}
   1382 
   1383 	/*
   1384 	 * Fetch through the indirect blocks.
   1385 	 */
   1386 	for (; j <= NIADDR; j++) {
   1387 		ob = nb;
   1388 		bp = UFS_BREAD(ufsvfsp,
   1389 		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
   1390 		if (bp->b_flags & B_ERROR) {
   1391 			err = geterror(bp);
   1392 			brelse(bp);
   1393 			return (err);
   1394 		}
   1395 		bap = bp->b_un.b_daddr;
   1396 
   1397 		ASSERT(!ufs_indir_badblock(ip, bap));
   1398 
   1399 		shft -= nindirshift;		/* sh / nindir */
   1400 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
   1401 
   1402 		nb = bap[i];
   1403 		if (nb == 0) {
   1404 			err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
   1405 			return (err);
   1406 		}
   1407 
   1408 		if (j == NIADDR) {
   1409 			bap[i] = bn;
   1410 			bdrwrite(bp);
   1411 			return (0);
   1412 		}
   1413 
   1414 		brelse(bp);
   1415 	}
   1416 	return (0);
   1417 }
   1418