Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #include <sys/condvar_impl.h>
     40 #include <sys/types.h>
     41 #include <sys/t_lock.h>
     42 #include <sys/debug.h>
     43 #include <sys/param.h>
     44 #include <sys/systm.h>
     45 #include <sys/signal.h>
     46 #include <sys/cred.h>
     47 #include <sys/proc.h>
     48 #include <sys/disp.h>
     49 #include <sys/user.h>
     50 #include <sys/buf.h>
     51 #include <sys/vfs.h>
     52 #include <sys/vnode.h>
     53 #include <sys/acl.h>
     54 #include <sys/fs/ufs_fs.h>
     55 #include <sys/fs/ufs_inode.h>
     56 #include <sys/fs/ufs_acl.h>
     57 #include <sys/fs/ufs_bio.h>
     58 #include <sys/fs/ufs_quota.h>
     59 #include <sys/kmem.h>
     60 #include <sys/fs/ufs_trans.h>
     61 #include <sys/fs/ufs_panic.h>
     62 #include <sys/errno.h>
     63 #include <sys/time.h>
     64 #include <sys/sysmacros.h>
     65 #include <sys/file.h>
     66 #include <sys/fcntl.h>
     67 #include <sys/flock.h>
     68 #include <fs/fs_subr.h>
     69 #include <sys/cmn_err.h>
     70 #include <sys/policy.h>
     71 #include <sys/fs/ufs_log.h>
     72 
     73 static ino_t	hashalloc();
     74 static daddr_t	fragextend();
     75 static daddr_t	alloccg();
     76 static daddr_t	alloccgblk();
     77 static ino_t	ialloccg();
     78 static daddr_t	mapsearch();
     79 static int	findlogstartcg();
     80 
     81 extern int	inside[], around[];
     82 extern uchar_t	*fragtbl[];
     83 void delay();
     84 
     85 /*
     86  * Allocate a block in the file system.
     87  *
     88  * The size of the requested block is given, which must be some
     89  * multiple of fs_fsize and <= fs_bsize.
     90  * A preference may be optionally specified. If a preference is given
     91  * the following hierarchy is used to allocate a block:
     92  *   1) allocate the requested block.
     93  *   2) allocate a rotationally optimal block in the same cylinder.
     94  *   3) allocate a block in the same cylinder group.
     95  *   4) quadratically rehash into other cylinder groups, until an
     96  *	available block is located.
     97  * If no block preference is given the following hierarchy is used
     98  * to allocate a block:
     99  *   1) allocate a block in the cylinder group that contains the
    100  *	inode for the file.
    101  *   2) quadratically rehash into other cylinder groups, until an
    102  *	available block is located.
    103  */
    104 int
    105 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr)
    106 {
    107 	struct fs *fs;
    108 	struct ufsvfs *ufsvfsp;
    109 	daddr_t bno;
    110 	int cg;
    111 	int err;
    112 	char *errmsg = NULL;
    113 	size_t len;
    114 	clock_t	now;
    115 
    116 	ufsvfsp = ip->i_ufsvfs;
    117 	fs = ufsvfsp->vfs_fs;
    118 	if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
    119 		err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx,"
    120 		    " bsize = %d, size = %d, fs = %s\n",
    121 		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
    122 		return (err);
    123 	}
    124 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
    125 		goto nospace;
    126 	if (freespace(fs, ufsvfsp) <= 0 &&
    127 	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
    128 		goto nospace;
    129 	err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len);
    130 	/* Note that may not have err, but may have errmsg */
    131 	if (errmsg != NULL) {
    132 		uprintf(errmsg);
    133 		kmem_free(errmsg, len);
    134 		errmsg = NULL;
    135 	}
    136 	if (err)
    137 		return (err);
    138 	if (bpref >= fs->fs_size)
    139 		bpref = 0;
    140 	if (bpref == 0)
    141 		cg = (int)itog(fs, ip->i_number);
    142 	else
    143 		cg = dtog(fs, bpref);
    144 
    145 	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size,
    146 	    (ulong_t (*)())alloccg);
    147 	if (bno > 0) {
    148 		*bnp = bno;
    149 		return (0);
    150 	}
    151 
    152 	/*
    153 	 * hashalloc() failed because some other thread grabbed
    154 	 * the last block so unwind the quota operation.  We can
    155 	 * ignore the return because subtractions don't fail and
    156 	 * size is guaranteed to be >= zero by our caller.
    157 	 */
    158 	(void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL,
    159 	    (size_t *)NULL);
    160 
    161 nospace:
    162 	now = ddi_get_lbolt();
    163 	mutex_enter(&ufsvfsp->vfs_lock);
    164 	if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
    165 	    (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
    166 		ufsvfsp->vfs_lastwhinetime = now;
    167 		cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt);
    168 	}
    169 	mutex_exit(&ufsvfsp->vfs_lock);
    170 	return (ENOSPC);
    171 }
    172 
    173 /*
    174  * Reallocate a fragment to a bigger size
    175  *
    176  * The number and size of the old block is given, and a preference
    177  * and new size is also specified.  The allocator attempts to extend
    178  * the original block.  Failing that, the regular block allocator is
    179  * invoked to get an appropriate block.
    180  */
    181 int
    182 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize,
    183     int nsize, daddr_t *bnp, cred_t *cr)
    184 {
    185 	daddr_t bno;
    186 	struct fs *fs;
    187 	struct ufsvfs *ufsvfsp;
    188 	int cg, request;
    189 	int err;
    190 	char *errmsg = NULL;
    191 	size_t len;
    192 	clock_t	now;
    193 
    194 	ufsvfsp = ip->i_ufsvfs;
    195 	fs = ufsvfsp->vfs_fs;
    196 	if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
    197 	    (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
    198 		err = ufs_fault(ITOV(ip),
    199 		    "realloccg: bad size, dev=0x%lx, bsize=%d, "
    200 		    "osize=%d, nsize=%d, fs=%s\n",
    201 		    ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
    202 		return (err);
    203 	}
    204 	if (freespace(fs, ufsvfsp) <= 0 &&
    205 	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
    206 		goto nospace;
    207 	if (bprev == 0) {
    208 		err = ufs_fault(ITOV(ip),
    209 		    "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
    210 		    " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev,
    211 		    fs->fs_fsmnt);
    212 		return (err);
    213 	}
    214 	err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len);
    215 	/* Note that may not have err, but may have errmsg */
    216 	if (errmsg != NULL) {
    217 		uprintf(errmsg);
    218 		kmem_free(errmsg, len);
    219 		errmsg = NULL;
    220 	}
    221 	if (err)
    222 		return (err);
    223 	cg = dtog(fs, bprev);
    224 	bno = fragextend(ip, cg, (long)bprev, osize, nsize);
    225 	if (bno != 0) {
    226 		*bnp = bno;
    227 		return (0);
    228 	}
    229 	if (bpref >= fs->fs_size)
    230 		bpref = 0;
    231 
    232 	/*
    233 	 * When optimizing for time we allocate a full block and
    234 	 * then only use the upper portion for this request. When
    235 	 * this file grows again it will grow into the unused portion
    236 	 * of the block (See fragextend() above).  This saves time
    237 	 * because an extra disk write would be needed if the frags
    238 	 * following the current allocation were not free. The extra
    239 	 * disk write is needed to move the data from its current
    240 	 * location into the newly allocated position.
    241 	 *
    242 	 * When optimizing for space we allocate a run of frags
    243 	 * that is just the right size for this request.
    244 	 */
    245 	request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize;
    246 	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request,
    247 	    (ulong_t (*)())alloccg);
    248 	if (bno > 0) {
    249 		*bnp = bno;
    250 		if (nsize < request)
    251 			(void) free(ip, bno + numfrags(fs, nsize),
    252 			    (off_t)(request - nsize), I_NOCANCEL);
    253 		return (0);
    254 	}
    255 
    256 	/*
    257 	 * hashalloc() failed because some other thread grabbed
    258 	 * the last block so unwind the quota operation.  We can
    259 	 * ignore the return because subtractions don't fail, and
    260 	 * our caller guarantees nsize >= osize.
    261 	 */
    262 	(void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL,
    263 	    (size_t *)NULL);
    264 
    265 nospace:
    266 	now = ddi_get_lbolt();
    267 	mutex_enter(&ufsvfsp->vfs_lock);
    268 	if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
    269 	    (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
    270 		ufsvfsp->vfs_lastwhinetime = now;
    271 		cmn_err(CE_NOTE,
    272 		    "realloccg %s: file system full", fs->fs_fsmnt);
    273 	}
    274 	mutex_exit(&ufsvfsp->vfs_lock);
    275 	return (ENOSPC);
    276 }
    277 
    278 /*
    279  * Allocate an inode in the file system.
    280  *
    281  * A preference may be optionally specified. If a preference is given
    282  * the following hierarchy is used to allocate an inode:
    283  *   1) allocate the requested inode.
    284  *   2) allocate an inode in the same cylinder group.
    285  *   3) quadratically rehash into other cylinder groups, until an
    286  *	available inode is located.
    287  * If no inode preference is given the following hierarchy is used
    288  * to allocate an inode:
    289  *   1) allocate an inode in cylinder group 0.
    290  *   2) quadratically rehash into other cylinder groups, until an
    291  *	available inode is located.
    292  */
    293 int
    294 ufs_ialloc(struct inode *pip,
    295     ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr)
    296 {
    297 	struct inode *ip;
    298 	struct fs *fs;
    299 	int cg;
    300 	ino_t ino;
    301 	int err;
    302 	int nifree;
    303 	struct ufsvfs *ufsvfsp = pip->i_ufsvfs;
    304 	char *errmsg = NULL;
    305 	size_t len;
    306 
    307 	ASSERT(RW_WRITE_HELD(&pip->i_rwlock));
    308 	fs = pip->i_fs;
    309 loop:
    310 	nifree = fs->fs_cstotal.cs_nifree;
    311 
    312 	if (nifree == 0)
    313 		goto noinodes;
    314 	/*
    315 	 * Shadow inodes don't count against a user's inode allocation.
    316 	 * They are an implementation method and not a resource.
    317 	 */
    318 	if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
    319 		err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data,
    320 		    /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0,
    321 		    cr, &errmsg, &len);
    322 		/*
    323 		 * As we haven't acquired any locks yet, dump the message
    324 		 * now.
    325 		 */
    326 		if (errmsg != NULL) {
    327 			uprintf(errmsg);
    328 			kmem_free(errmsg, len);
    329 			errmsg = NULL;
    330 		}
    331 		if (err)
    332 			return (err);
    333 	}
    334 
    335 	if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg))
    336 		ipref = 0;
    337 	cg = (int)itog(fs, ipref);
    338 	ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode,
    339 	    (ulong_t (*)())ialloccg);
    340 	if (ino == 0) {
    341 		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
    342 			/*
    343 			 * We can safely ignore the return from chkiq()
    344 			 * because deallocations can only fail if we
    345 			 * can't get the user's quota info record off
    346 			 * the disk due to an I/O error.  In that case,
    347 			 * the quota subsystem is already messed up.
    348 			 */
    349 			(void) chkiq(ufsvfsp, /* change */ -1,
    350 			    (struct inode *)NULL, crgetuid(cr), 0, cr,
    351 			    (char **)NULL, (size_t *)NULL);
    352 		}
    353 		goto noinodes;
    354 	}
    355 	err = ufs_iget(pip->i_vfs, ino, ipp, cr);
    356 	if (err) {
    357 		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
    358 			/*
    359 			 * See above comment about why it is safe to ignore an
    360 			 * error return here.
    361 			 */
    362 			(void) chkiq(ufsvfsp, /* change */ -1,
    363 			    (struct inode *)NULL, crgetuid(cr), 0, cr,
    364 			    (char **)NULL, (size_t *)NULL);
    365 		}
    366 		ufs_ifree(pip, ino, 0);
    367 		return (err);
    368 	}
    369 	ip = *ipp;
    370 	ASSERT(!ip->i_ufs_acl);
    371 	ASSERT(!ip->i_dquot);
    372 	rw_enter(&ip->i_contents, RW_WRITER);
    373 
    374 	/*
    375 	 * Check if we really got a free inode, if not then complain
    376 	 * and mark the inode ISTALE so that it will be freed by the
    377 	 * ufs idle thread eventually and will not be sent to ufs_delete().
    378 	 */
    379 	if (ip->i_mode || (ip->i_nlink > 0)) {
    380 		ip->i_flag |= ISTALE;
    381 		rw_exit(&ip->i_contents);
    382 		VN_RELE(ITOV(ip));
    383 		cmn_err(CE_WARN,
    384 		    "%s: unexpected allocated inode %d, run fsck(1M)%s",
    385 		    fs->fs_fsmnt, (int)ino,
    386 		    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
    387 		goto loop;
    388 	}
    389 
    390 	/*
    391 	 * Check the inode has no size or data blocks.
    392 	 * This could have happened if the truncation failed when
    393 	 * deleting the inode. It used to be possible for this to occur
    394 	 * if a block allocation failed when iteratively truncating a
    395 	 * large file using logging and with a full file system.
    396 	 * This was fixed with bug fix 4348738. However, truncation may
    397 	 * still fail on an IO error. So in all cases for safety and
    398 	 * security we clear out the size; the blocks allocated; and
    399 	 * pointers to the blocks. This will ultimately cause a fsck
    400 	 * error of un-accounted for blocks, but its a fairly benign error,
    401 	 * and possibly the correct thing to do anyway as accesssing those
    402 	 * blocks agains may lead to more IO errors.
    403 	 */
    404 	if (ip->i_size || ip->i_blocks) {
    405 		int i;
    406 
    407 		if (ip->i_size) {
    408 			cmn_err(CE_WARN,
    409 			    "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
    410 			    fs->fs_fsmnt, (int)ino, ip->i_size,
    411 			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
    412 		}
    413 		/*
    414 		 * Clear any garbage left behind.
    415 		 */
    416 		ip->i_size = (u_offset_t)0;
    417 		ip->i_blocks = 0;
    418 		for (i = 0; i < NDADDR; i++)
    419 			ip->i_db[i] = 0;
    420 		for (i = 0; i < NIADDR; i++)
    421 			ip->i_ib[i] = 0;
    422 	}
    423 
    424 	/*
    425 	 * Initialize the link count
    426 	 */
    427 	ip->i_nlink = 0;
    428 
    429 	/*
    430 	 * Clear the old flags
    431 	 */
    432 	ip->i_flag &= IREF;
    433 
    434 	/*
    435 	 * Access times are not really defined if the fs is mounted
    436 	 * with 'noatime'. But it can cause nfs clients to fail
    437 	 * open() if the atime is not a legal value. Set a legal value
    438 	 * here when the inode is allocated.
    439 	 */
    440 	if (ufsvfsp->vfs_noatime) {
    441 		mutex_enter(&ufs_iuniqtime_lock);
    442 		ip->i_atime = iuniqtime;
    443 		mutex_exit(&ufs_iuniqtime_lock);
    444 	}
    445 	rw_exit(&ip->i_contents);
    446 	return (0);
    447 noinodes:
    448 	if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
    449 		cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
    450 	return (ENOSPC);
    451 }
    452 
    453 /*
    454  * Find a cylinder group to place a directory.
    455  * Returns an inumber within the selected cylinder group.
    456  * Note, the vfs_lock is not needed as we don't require exact cg summary info.
    457  *
    458  * If the switch ufs_close_dirs is set, then the policy is to use
    459  * the current cg if it has more than 25% free inodes and more
    460  * than 25% free blocks. Otherwise the cgs are searched from
    461  * the beginning and the first cg with the same criteria is
    462  * used. If that is also null then we revert to the old algorithm.
    463  * This tends to cluster files at the beginning of the disk
    464  * until the disk gets full.
    465  *
    466  * Otherwise if ufs_close_dirs is not set then the original policy is
    467  * used which is to select from among those cylinder groups with
    468  * above the average number of free inodes, the one with the smallest
    469  * number of directories.
    470  */
    471 
    472 int ufs_close_dirs = 1;	/* allocate directories close as possible */
    473 
    474 ino_t
    475 dirpref(inode_t *dp)
    476 {
    477 	int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
    478 	struct fs *fs = dp->i_fs;
    479 
    480 	cg = itog(fs, dp->i_number);
    481 	mininode = fs->fs_ipg >> 2;
    482 	minbpg = fs->fs_maxbpg >> 2;
    483 	if (ufs_close_dirs &&
    484 	    (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
    485 	    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
    486 		return (dp->i_number);
    487 	}
    488 
    489 	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
    490 	minndir = fs->fs_ipg;
    491 	mincg = 0;
    492 	for (cg = 0; cg < fs->fs_ncg; cg++) {
    493 		ifree = fs->fs_cs(fs, cg).cs_nifree;
    494 		if (ufs_close_dirs &&
    495 		    (ifree > mininode) &&
    496 		    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
    497 			return ((ino_t)(fs->fs_ipg * cg));
    498 		}
    499 		if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
    500 		    (ifree >= avgifree)) {
    501 			mincg = cg;
    502 			minndir = fs->fs_cs(fs, cg).cs_ndir;
    503 		}
    504 	}
    505 	return ((ino_t)(fs->fs_ipg * mincg));
    506 }
    507 
    508 /*
    509  * Select the desired position for the next block in a file.  The file is
    510  * logically divided into sections. The first section is composed of the
    511  * direct blocks. Each additional section contains fs_maxbpg blocks.
    512  *
    513  * If no blocks have been allocated in the first section, the policy is to
    514  * request a block in the same cylinder group as the inode that describes
    515  * the file. If no blocks have been allocated in any other section, the
    516  * policy is to place the section in a cylinder group with a greater than
    517  * average number of free blocks.  An appropriate cylinder group is found
    518  * by using a rotor that sweeps the cylinder groups. When a new group of
    519  * blocks is needed, the sweep begins in the cylinder group following the
    520  * cylinder group from which the previous allocation was made. The sweep
    521  * continues until a cylinder group with greater than the average number
    522  * of free blocks is found. If the allocation is for the first block in an
    523  * indirect block, the information on the previous allocation is unavailable;
    524  * here a best guess is made based upon the logical block number being
    525  * allocated.
    526  *
    527  * If a section is already partially allocated, the policy is to
    528  * contiguously allocate fs_maxcontig blocks.  The end of one of these
    529  * contiguous blocks and the beginning of the next is physically separated
    530  * so that the disk head will be in transit between them for at least
    531  * fs_rotdelay milliseconds.  This is to allow time for the processor to
    532  * schedule another I/O transfer.
    533  */
    534 daddr_t
    535 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap)
    536 {
    537 	struct fs *fs;
    538 	struct ufsvfs *ufsvfsp;
    539 	int cg;
    540 	int avgbfree, startcg;
    541 	daddr_t nextblk;
    542 
    543 	ufsvfsp = ip->i_ufsvfs;
    544 	fs = ip->i_fs;
    545 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
    546 		if (lbn < NDADDR) {
    547 			cg = itog(fs, ip->i_number);
    548 			return (fs->fs_fpg * cg + fs->fs_frag);
    549 		}
    550 		/*
    551 		 * Find a cylinder with greater than average
    552 		 * number of unused data blocks.
    553 		 */
    554 		if (indx == 0 || bap[indx - 1] == 0)
    555 			startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg;
    556 		else
    557 			startcg = dtog(fs, bap[indx - 1]) + 1;
    558 		startcg %= fs->fs_ncg;
    559 
    560 		mutex_enter(&ufsvfsp->vfs_lock);
    561 		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
    562 		/*
    563 		 * used for computing log space for writes/truncs
    564 		 */
    565 		ufsvfsp->vfs_avgbfree = avgbfree;
    566 		for (cg = startcg; cg < fs->fs_ncg; cg++)
    567 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
    568 				fs->fs_cgrotor = cg;
    569 				mutex_exit(&ufsvfsp->vfs_lock);
    570 				return (fs->fs_fpg * cg + fs->fs_frag);
    571 			}
    572 		for (cg = 0; cg <= startcg; cg++)
    573 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
    574 				fs->fs_cgrotor = cg;
    575 				mutex_exit(&ufsvfsp->vfs_lock);
    576 				return (fs->fs_fpg * cg + fs->fs_frag);
    577 			}
    578 		mutex_exit(&ufsvfsp->vfs_lock);
    579 		return (NULL);
    580 	}
    581 	/*
    582 	 * One or more previous blocks have been laid out. If less
    583 	 * than fs_maxcontig previous blocks are contiguous, the
    584 	 * next block is requested contiguously, otherwise it is
    585 	 * requested rotationally delayed by fs_rotdelay milliseconds.
    586 	 */
    587 
    588 	nextblk = bap[indx - 1];
    589 	/*
    590 	 * Provision for fallocate to return positive
    591 	 * blk preference based on last allocation
    592 	 */
    593 	if (nextblk < 0 && nextblk != UFS_HOLE) {
    594 		nextblk = (-bap[indx - 1]) + fs->fs_frag;
    595 	} else {
    596 		nextblk = bap[indx - 1] + fs->fs_frag;
    597 	}
    598 
    599 	if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] +
    600 	    blkstofrags(fs, fs->fs_maxcontig) != nextblk) {
    601 		return (nextblk);
    602 	}
    603 	if (fs->fs_rotdelay != 0)
    604 		/*
    605 		 * Here we convert ms of delay to frags as:
    606 		 * (frags) = (ms) * (rev/sec) * (sect/rev) /
    607 		 * 	((sect/frag) * (ms/sec))
    608 		 * then round up to the next block.
    609 		 */
    610 		nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
    611 		    (NSPF(fs) * 1000), fs->fs_frag);
    612 	return (nextblk);
    613 }
    614 
    615 /*
    616  * Free a block or fragment.
    617  *
    618  * The specified block or fragment is placed back in the
    619  * free map. If a fragment is deallocated, a possible
    620  * block reassembly is checked.
    621  */
    622 void
    623 free(struct inode *ip, daddr_t bno, off_t size, int flags)
    624 {
    625 	struct fs *fs = ip->i_fs;
    626 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    627 	struct ufs_q *delq = &ufsvfsp->vfs_delete;
    628 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
    629 	struct cg *cgp;
    630 	struct buf *bp;
    631 	int cg, bmap, bbase;
    632 	int i;
    633 	uchar_t *blksfree;
    634 	int *blktot;
    635 	short *blks;
    636 	daddr_t blkno, cylno, rpos;
    637 
    638 	/*
    639 	 * fallocate'd files will have negative block address.
    640 	 * So negate it again to get original block address.
    641 	 */
    642 	if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) {
    643 		bno = -bno;
    644 	}
    645 
    646 	if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) {
    647 		(void) ufs_fault(ITOV(ip),
    648 		    "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
    649 		    "fs = %s\n", ip->i_dev, fs->fs_bsize,
    650 		    (int)size, fs->fs_fsmnt);
    651 		return;
    652 	}
    653 	cg = dtog(fs, bno);
    654 	ASSERT(!ufs_badblock(ip, bno));
    655 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
    656 	    (int)fs->fs_cgsize);
    657 
    658 	cgp = bp->b_un.b_cg;
    659 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
    660 		brelse(bp);
    661 		return;
    662 	}
    663 
    664 	if (!(flags & I_NOCANCEL))
    665 		TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags);
    666 	if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) {
    667 		TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size);
    668 	}
    669 	blksfree = cg_blksfree(cgp);
    670 	blktot = cg_blktot(cgp);
    671 	mutex_enter(&ufsvfsp->vfs_lock);
    672 	cgp->cg_time = gethrestime_sec();
    673 	bno = dtogd(fs, bno);
    674 	if (size == fs->fs_bsize) {
    675 		blkno = fragstoblks(fs, bno);
    676 		cylno = cbtocylno(fs, bno);
    677 		rpos = cbtorpos(ufsvfsp, bno);
    678 		blks = cg_blks(ufsvfsp, cgp, cylno);
    679 		if (!isclrblock(fs, blksfree, blkno)) {
    680 			mutex_exit(&ufsvfsp->vfs_lock);
    681 			brelse(bp);
    682 			(void) ufs_fault(ITOV(ip), "free: freeing free block, "
    683 			    "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
    684 			    ip->i_dev, bno, ip->i_number, fs->fs_fsmnt);
    685 			return;
    686 		}
    687 		setblock(fs, blksfree, blkno);
    688 		blks[rpos]++;
    689 		blktot[cylno]++;
    690 		cgp->cg_cs.cs_nbfree++;		/* Log below */
    691 		fs->fs_cstotal.cs_nbfree++;
    692 		fs->fs_cs(fs, cg).cs_nbfree++;
    693 		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
    694 			mutex_enter(&delq->uq_mutex);
    695 			delq_info->delq_unreclaimed_blocks -=
    696 			    btodb(fs->fs_bsize);
    697 			mutex_exit(&delq->uq_mutex);
    698 		}
    699 	} else {
    700 		bbase = bno - fragnum(fs, bno);
    701 		/*
    702 		 * Decrement the counts associated with the old frags
    703 		 */
    704 		bmap = blkmap(fs, blksfree, bbase);
    705 		fragacct(fs, bmap, cgp->cg_frsum, -1);
    706 		/*
    707 		 * Deallocate the fragment
    708 		 */
    709 		for (i = 0; i < numfrags(fs, size); i++) {
    710 			if (isset(blksfree, bno + i)) {
    711 				brelse(bp);
    712 				mutex_exit(&ufsvfsp->vfs_lock);
    713 				(void) ufs_fault(ITOV(ip),
    714 				    "free: freeing free frag, "
    715 				    "dev:0x%lx, blk:%ld, cg:%d, "
    716 				    "ino:%lu, fs:%s",
    717 				    ip->i_dev,
    718 				    bno + i,
    719 				    cgp->cg_cgx,
    720 				    ip->i_number,
    721 				    fs->fs_fsmnt);
    722 				return;
    723 			}
    724 			setbit(blksfree, bno + i);
    725 		}
    726 		cgp->cg_cs.cs_nffree += i;
    727 		fs->fs_cstotal.cs_nffree += i;
    728 		fs->fs_cs(fs, cg).cs_nffree += i;
    729 		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
    730 			mutex_enter(&delq->uq_mutex);
    731 			delq_info->delq_unreclaimed_blocks -=
    732 			    btodb(i * fs->fs_fsize);
    733 			mutex_exit(&delq->uq_mutex);
    734 		}
    735 		/*
    736 		 * Add back in counts associated with the new frags
    737 		 */
    738 		bmap = blkmap(fs, blksfree, bbase);
    739 		fragacct(fs, bmap, cgp->cg_frsum, 1);
    740 		/*
    741 		 * If a complete block has been reassembled, account for it
    742 		 */
    743 		blkno = fragstoblks(fs, bbase);
    744 		if (isblock(fs, blksfree, blkno)) {
    745 			cylno = cbtocylno(fs, bbase);
    746 			rpos = cbtorpos(ufsvfsp, bbase);
    747 			blks = cg_blks(ufsvfsp, cgp, cylno);
    748 			blks[rpos]++;
    749 			blktot[cylno]++;
    750 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
    751 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
    752 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
    753 			cgp->cg_cs.cs_nbfree++;
    754 			fs->fs_cstotal.cs_nbfree++;
    755 			fs->fs_cs(fs, cg).cs_nbfree++;
    756 		}
    757 	}
    758 	fs->fs_fmod = 1;
    759 	ufs_notclean(ufsvfsp);
    760 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
    761 	TRANS_SI(ufsvfsp, fs, cg);
    762 	bdrwrite(bp);
    763 }
    764 
    765 /*
    766  * Free an inode.
    767  *
    768  * The specified inode is placed back in the free map.
    769  */
    770 void
    771 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode)
    772 {
    773 	struct fs *fs = ip->i_fs;
    774 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    775 	struct cg *cgp;
    776 	struct buf *bp;
    777 	unsigned int inot;
    778 	int cg;
    779 	char *iused;
    780 
    781 	if (ip->i_number == ino && ip->i_mode != 0) {
    782 		(void) ufs_fault(ITOV(ip),
    783 		    "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
    784 		    "fs = %s\n",
    785 		    ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt);
    786 		return;
    787 	}
    788 	if (ino >= fs->fs_ipg * fs->fs_ncg) {
    789 		(void) ufs_fault(ITOV(ip),
    790 		    "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
    791 		    (int)ip->i_dev, (int)ino, fs->fs_fsmnt);
    792 		return;
    793 	}
    794 	cg = (int)itog(fs, ino);
    795 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
    796 	    (int)fs->fs_cgsize);
    797 
    798 	cgp = bp->b_un.b_cg;
    799 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
    800 		brelse(bp);
    801 		return;
    802 	}
    803 	mutex_enter(&ufsvfsp->vfs_lock);
    804 	cgp->cg_time = gethrestime_sec();
    805 	iused = cg_inosused(cgp);
    806 	inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg);
    807 	if (isclr(iused, inot)) {
    808 		mutex_exit(&ufsvfsp->vfs_lock);
    809 		brelse(bp);
    810 		(void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, "
    811 		    "mode: (imode) %o, (omode) %o, ino:%d, "
    812 		    "fs:%s",
    813 		    ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
    814 		return;
    815 	}
    816 	clrbit(iused, inot);
    817 
    818 	if (inot < (ulong_t)cgp->cg_irotor)
    819 		cgp->cg_irotor = inot;
    820 	cgp->cg_cs.cs_nifree++;
    821 	fs->fs_cstotal.cs_nifree++;
    822 	fs->fs_cs(fs, cg).cs_nifree++;
    823 	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
    824 		cgp->cg_cs.cs_ndir--;
    825 		fs->fs_cstotal.cs_ndir--;
    826 		fs->fs_cs(fs, cg).cs_ndir--;
    827 	}
    828 	fs->fs_fmod = 1;
    829 	ufs_notclean(ufsvfsp);
    830 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
    831 	TRANS_SI(ufsvfsp, fs, cg);
    832 	bdrwrite(bp);
    833 }
    834 
    835 /*
    836  * Implement the cylinder overflow algorithm.
    837  *
    838  * The policy implemented by this algorithm is:
    839  *   1) allocate the block in its requested cylinder group.
    840  *   2) quadratically rehash on the cylinder group number.
    841  *   3) brute force search for a free block.
    842  * The size parameter means size for data blocks, mode for inodes.
    843  */
    844 static ino_t
    845 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)())
    846 {
    847 	struct fs *fs;
    848 	int i;
    849 	long result;
    850 	int icg = cg;
    851 
    852 	fs = ip->i_fs;
    853 	/*
    854 	 * 1: preferred cylinder group
    855 	 */
    856 	result = (*allocator)(ip, cg, pref, size);
    857 	if (result)
    858 		return (result);
    859 	/*
    860 	 * 2: quadratic rehash
    861 	 */
    862 	for (i = 1; i < fs->fs_ncg; i *= 2) {
    863 		cg += i;
    864 		if (cg >= fs->fs_ncg)
    865 			cg -= fs->fs_ncg;
    866 		result = (*allocator)(ip, cg, 0, size);
    867 		if (result)
    868 			return (result);
    869 	}
    870 	/*
    871 	 * 3: brute force search
    872 	 * Note that we start at i == 2, since 0 was checked initially,
    873 	 * and 1 is always checked in the quadratic rehash.
    874 	 */
    875 	cg = (icg + 2) % fs->fs_ncg;
    876 	for (i = 2; i < fs->fs_ncg; i++) {
    877 		result = (*allocator)(ip, cg, 0, size);
    878 		if (result)
    879 			return (result);
    880 		cg++;
    881 		if (cg == fs->fs_ncg)
    882 			cg = 0;
    883 	}
    884 	return (NULL);
    885 }
    886 
    887 /*
    888  * Determine whether a fragment can be extended.
    889  *
    890  * Check to see if the necessary fragments are available, and
    891  * if they are, allocate them.
    892  */
    893 static daddr_t
    894 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize)
    895 {
    896 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    897 	struct fs *fs = ip->i_fs;
    898 	struct buf *bp;
    899 	struct cg *cgp;
    900 	uchar_t *blksfree;
    901 	long bno;
    902 	int frags, bbase;
    903 	int i, j;
    904 
    905 	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
    906 		return (NULL);
    907 	frags = numfrags(fs, nsize);
    908 	bbase = (int)fragnum(fs, bprev);
    909 	if (bbase > fragnum(fs, (bprev + frags - 1))) {
    910 		/* cannot extend across a block boundary */
    911 		return (NULL);
    912 	}
    913 
    914 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
    915 	    (int)fs->fs_cgsize);
    916 	cgp = bp->b_un.b_cg;
    917 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
    918 		brelse(bp);
    919 		return (NULL);
    920 	}
    921 
    922 	blksfree = cg_blksfree(cgp);
    923 	mutex_enter(&ufsvfsp->vfs_lock);
    924 	bno = dtogd(fs, bprev);
    925 	for (i = numfrags(fs, osize); i < frags; i++) {
    926 		if (isclr(blksfree, bno + i)) {
    927 			mutex_exit(&ufsvfsp->vfs_lock);
    928 			brelse(bp);
    929 			return (NULL);
    930 		}
    931 		if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)),
    932 		    fs->fs_fsize))) {
    933 			mutex_exit(&ufsvfsp->vfs_lock);
    934 			brelse(bp);
    935 			return (NULL);
    936 		}
    937 	}
    938 
    939 	cgp->cg_time = gethrestime_sec();
    940 	/*
    941 	 * The current fragment can be extended,
    942 	 * deduct the count on fragment being extended into
    943 	 * increase the count on the remaining fragment (if any)
    944 	 * allocate the extended piece.
    945 	 */
    946 	for (i = frags; i < fs->fs_frag - bbase; i++)
    947 		if (isclr(blksfree, bno + i))
    948 			break;
    949 	j = i - numfrags(fs, osize);
    950 	cgp->cg_frsum[j]--;
    951 	ASSERT(cgp->cg_frsum[j] >= 0);
    952 	if (i != frags)
    953 		cgp->cg_frsum[i - frags]++;
    954 	for (i = numfrags(fs, osize); i < frags; i++) {
    955 		clrbit(blksfree, bno + i);
    956 		cgp->cg_cs.cs_nffree--;
    957 		fs->fs_cs(fs, cg).cs_nffree--;
    958 		fs->fs_cstotal.cs_nffree--;
    959 	}
    960 	fs->fs_fmod = 1;
    961 	ufs_notclean(ufsvfsp);
    962 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
    963 	TRANS_SI(ufsvfsp, fs, cg);
    964 	bdrwrite(bp);
    965 	return ((daddr_t)bprev);
    966 }
    967 
    968 /*
    969  * Determine whether a block can be allocated.
    970  *
    971  * Check to see if a block of the apprpriate size
    972  * is available, and if it is, allocate it.
    973  */
    974 static daddr_t
    975 alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
    976 {
    977 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    978 	struct fs *fs = ip->i_fs;
    979 	struct buf *bp;
    980 	struct cg *cgp;
    981 	uchar_t *blksfree;
    982 	int bno, frags;
    983 	int allocsiz;
    984 	int i;
    985 
    986 	/*
    987 	 * Searching for space could be time expensive so do some
    988 	 * up front checking to verify that there is actually space
    989 	 * available (free blocks or free frags).
    990 	 */
    991 	if (fs->fs_cs(fs, cg).cs_nbfree == 0) {
    992 		if (size == fs->fs_bsize)
    993 			return (0);
    994 
    995 		/*
    996 		 * If there are not enough free frags then return.
    997 		 */
    998 		if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size))
    999 			return (0);
   1000 	}
   1001 
   1002 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
   1003 	    (int)fs->fs_cgsize);
   1004 
   1005 	cgp = bp->b_un.b_cg;
   1006 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
   1007 	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
   1008 		brelse(bp);
   1009 		return (0);
   1010 	}
   1011 	blksfree = cg_blksfree(cgp);
   1012 	mutex_enter(&ufsvfsp->vfs_lock);
   1013 	cgp->cg_time = gethrestime_sec();
   1014 	if (size == fs->fs_bsize) {
   1015 		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
   1016 			goto errout;
   1017 		fs->fs_fmod = 1;
   1018 		ufs_notclean(ufsvfsp);
   1019 		TRANS_SI(ufsvfsp, fs, cg);
   1020 		bdrwrite(bp);
   1021 		return (bno);
   1022 	}
   1023 	/*
   1024 	 * Check fragment bitmap to see if any fragments are already available.
   1025 	 * mapsearch() may fail because the fragment that fits this request
   1026 	 * might still be on the cancel list and not available for re-use yet.
   1027 	 * Look for a bigger sized fragment to allocate first before we have
   1028 	 * to give up and fragment a whole new block eventually.
   1029 	 */
   1030 	frags = numfrags(fs, size);
   1031 	allocsiz = frags;
   1032 next_size:
   1033 	for (; allocsiz < fs->fs_frag; allocsiz++)
   1034 		if (cgp->cg_frsum[allocsiz] != 0)
   1035 			break;
   1036 
   1037 	if (allocsiz != fs->fs_frag) {
   1038 		bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
   1039 		if (bno < 0 && allocsiz < (fs->fs_frag - 1)) {
   1040 			allocsiz++;
   1041 			goto next_size;
   1042 		}
   1043 	}
   1044 
   1045 	if (allocsiz == fs->fs_frag || bno < 0) {
   1046 		/*
   1047 		 * No fragments were available, so a block
   1048 		 * will be allocated and hacked up.
   1049 		 */
   1050 		if (cgp->cg_cs.cs_nbfree == 0)
   1051 			goto errout;
   1052 		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
   1053 			goto errout;
   1054 		bpref = dtogd(fs, bno);
   1055 		for (i = frags; i < fs->fs_frag; i++)
   1056 			setbit(blksfree, bpref + i);
   1057 		i = fs->fs_frag - frags;
   1058 		cgp->cg_cs.cs_nffree += i;
   1059 		fs->fs_cstotal.cs_nffree += i;
   1060 		fs->fs_cs(fs, cg).cs_nffree += i;
   1061 		cgp->cg_frsum[i]++;
   1062 		fs->fs_fmod = 1;
   1063 		ufs_notclean(ufsvfsp);
   1064 		TRANS_SI(ufsvfsp, fs, cg);
   1065 		bdrwrite(bp);
   1066 		return (bno);
   1067 	}
   1068 
   1069 	for (i = 0; i < frags; i++)
   1070 		clrbit(blksfree, bno + i);
   1071 	cgp->cg_cs.cs_nffree -= frags;
   1072 	fs->fs_cstotal.cs_nffree -= frags;
   1073 	fs->fs_cs(fs, cg).cs_nffree -= frags;
   1074 	cgp->cg_frsum[allocsiz]--;
   1075 	ASSERT(cgp->cg_frsum[allocsiz] >= 0);
   1076 	if (frags != allocsiz) {
   1077 		cgp->cg_frsum[allocsiz - frags]++;
   1078 	}
   1079 	fs->fs_fmod = 1;
   1080 	ufs_notclean(ufsvfsp);
   1081 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
   1082 	TRANS_SI(ufsvfsp, fs, cg);
   1083 	bdrwrite(bp);
   1084 	return (cg * fs->fs_fpg + bno);
   1085 errout:
   1086 	mutex_exit(&ufsvfsp->vfs_lock);
   1087 	brelse(bp);
   1088 	return (0);
   1089 }
   1090 
   1091 /*
   1092  * Allocate a block in a cylinder group.
   1093  *
   1094  * This algorithm implements the following policy:
   1095  *   1) allocate the requested block.
   1096  *   2) allocate a rotationally optimal block in the same cylinder.
   1097  *   3) allocate the next available block on the block rotor for the
   1098  *	specified cylinder group.
   1099  * Note that this routine only allocates fs_bsize blocks; these
   1100  * blocks may be fragmented by the routine that allocates them.
   1101  */
   1102 static daddr_t
   1103 alloccgblk(
   1104 	struct ufsvfs *ufsvfsp,
   1105 	struct cg *cgp,
   1106 	daddr_t bpref,
   1107 	struct buf *bp)
   1108 {
   1109 	daddr_t bno;
   1110 	int cylno, pos, delta, rotbl_size;
   1111 	short *cylbp;
   1112 	int i;
   1113 	struct fs *fs;
   1114 	uchar_t *blksfree;
   1115 	daddr_t blkno, rpos, frag;
   1116 	short *blks;
   1117 	int32_t *blktot;
   1118 
   1119 	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
   1120 	fs = ufsvfsp->vfs_fs;
   1121 	blksfree = cg_blksfree(cgp);
   1122 	if (bpref == 0) {
   1123 		bpref = cgp->cg_rotor;
   1124 		goto norot;
   1125 	}
   1126 	bpref = blknum(fs, bpref);
   1127 	bpref = dtogd(fs, bpref);
   1128 	/*
   1129 	 * If the requested block is available, use it.
   1130 	 */
   1131 	if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) {
   1132 		bno = bpref;
   1133 		goto gotit;
   1134 	}
   1135 	/*
   1136 	 * Check for a block available on the same cylinder.
   1137 	 */
   1138 	cylno = cbtocylno(fs, bpref);
   1139 	if (cg_blktot(cgp)[cylno] == 0)
   1140 		goto norot;
   1141 	if (fs->fs_cpc == 0) {
   1142 		/*
   1143 		 * Block layout info is not available, so just
   1144 		 * have to take any block in this cylinder.
   1145 		 */
   1146 		bpref = howmany(fs->fs_spc * cylno, NSPF(fs));
   1147 		goto norot;
   1148 	}
   1149 	/*
   1150 	 * Check the summary information to see if a block is
   1151 	 * available in the requested cylinder starting at the
   1152 	 * requested rotational position and proceeding around.
   1153 	 */
   1154 	cylbp = cg_blks(ufsvfsp, cgp, cylno);
   1155 	pos = cbtorpos(ufsvfsp, bpref);
   1156 	for (i = pos; i < ufsvfsp->vfs_nrpos; i++)
   1157 		if (cylbp[i] > 0)
   1158 			break;
   1159 	if (i == ufsvfsp->vfs_nrpos)
   1160 		for (i = 0; i < pos; i++)
   1161 			if (cylbp[i] > 0)
   1162 				break;
   1163 	if (cylbp[i] > 0) {
   1164 		/*
   1165 		 * Found a rotational position, now find the actual
   1166 		 * block.  A "panic" if none is actually there.
   1167 		 */
   1168 
   1169 		/*
   1170 		 * Up to this point, "pos" has referred to the rotational
   1171 		 * position of the desired block.  From now on, it holds
   1172 		 * the offset of the current cylinder within a cylinder
   1173 		 * cycle.  (A cylinder cycle refers to a set of cylinders
   1174 		 * which are described by a single rotational table; the
   1175 		 * size of the cycle is fs_cpc.)
   1176 		 *
   1177 		 * bno is set to the block number of the first block within
   1178 		 * the current cylinder cycle.
   1179 		 */
   1180 
   1181 		pos = cylno % fs->fs_cpc;
   1182 		bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
   1183 
   1184 		/*
   1185 		 * The blocks within a cylinder are grouped into equivalence
   1186 		 * classes according to their "rotational position."  There
   1187 		 * are two tables used to determine these classes.
   1188 		 *
   1189 		 * The positional offset table (fs_postbl) has an entry for
   1190 		 * each rotational position of each cylinder in a cylinder
   1191 		 * cycle.  This entry contains the relative block number
   1192 		 * (counting from the start of the cylinder cycle) of the
   1193 		 * first block in the equivalence class for that position
   1194 		 * and that cylinder.  Positions for which no blocks exist
   1195 		 * are indicated by a -1.
   1196 		 *
   1197 		 * The rotational delta table (fs_rotbl) has an entry for
   1198 		 * each block in a cylinder cycle.  This entry contains
   1199 		 * the offset from that block to the next block in the
   1200 		 * same equivalence class.  The last block in the class
   1201 		 * is indicated by a zero in the table.
   1202 		 *
   1203 		 * The following code, then, walks through all of the blocks
   1204 		 * in the cylinder (cylno) which we're allocating within
   1205 		 * which are in the equivalence class for the rotational
   1206 		 * position (i) which we're allocating within.
   1207 		 */
   1208 
   1209 		if (fs_postbl(ufsvfsp, pos)[i] == -1) {
   1210 			(void) ufs_fault(ufsvfsp->vfs_root,
   1211 			    "alloccgblk: cyl groups corrupted, pos = %d, "
   1212 			    "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
   1213 			return (0);
   1214 		}
   1215 
   1216 		/*
   1217 		 * There is one entry in the rotational table for each block
   1218 		 * in the cylinder cycle.  These are whole blocks, not frags.
   1219 		 */
   1220 
   1221 		rotbl_size = (fs->fs_cpc * fs->fs_spc) >>
   1222 		    (fs->fs_fragshift + fs->fs_fsbtodb);
   1223 
   1224 		/*
   1225 		 * As we start, "i" is the rotational position within which
   1226 		 * we're searching.  After the next line, it will be a block
   1227 		 * number (relative to the start of the cylinder cycle)
   1228 		 * within the equivalence class of that rotational position.
   1229 		 */
   1230 
   1231 		i = fs_postbl(ufsvfsp, pos)[i];
   1232 
   1233 		for (;;) {
   1234 			if (isblock(fs, blksfree, (daddr_t)(bno + i))) {
   1235 				bno = blkstofrags(fs, (bno + i));
   1236 				goto gotit;
   1237 			}
   1238 			delta = fs_rotbl(fs)[i];
   1239 			if (delta <= 0 ||		/* End of chain, or */
   1240 			    delta + i > rotbl_size)	/* end of table? */
   1241 				break;			/* If so, panic. */
   1242 			i += delta;
   1243 		}
   1244 		(void) ufs_fault(ufsvfsp->vfs_root,
   1245 		    "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
   1246 		    "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno);
   1247 		return (0);
   1248 	}
   1249 norot:
   1250 	/*
   1251 	 * No blocks in the requested cylinder, so take
   1252 	 * next available one in this cylinder group.
   1253 	 */
   1254 	bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag);
   1255 	if (bno < 0)
   1256 		return (0);
   1257 	cgp->cg_rotor = bno;
   1258 gotit:
   1259 	blkno = fragstoblks(fs, bno);
   1260 	frag = (cgp->cg_cgx * fs->fs_fpg) + bno;
   1261 	if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize))
   1262 		goto norot;
   1263 	clrblock(fs, blksfree, (long)blkno);
   1264 	/*
   1265 	 * the other cg/sb/si fields are TRANS'ed by the caller
   1266 	 */
   1267 	cgp->cg_cs.cs_nbfree--;
   1268 	fs->fs_cstotal.cs_nbfree--;
   1269 	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
   1270 	cylno = cbtocylno(fs, bno);
   1271 	blks = cg_blks(ufsvfsp, cgp, cylno);
   1272 	rpos = cbtorpos(ufsvfsp, bno);
   1273 	blktot = cg_blktot(cgp);
   1274 	blks[rpos]--;
   1275 	blktot[cylno]--;
   1276 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
   1277 	fs->fs_fmod = 1;
   1278 	return (frag);
   1279 }
   1280 
   1281 /*
   1282  * Determine whether an inode can be allocated.
   1283  *
   1284  * Check to see if an inode is available, and if it is,
   1285  * allocate it using the following policy:
   1286  *   1) allocate the requested inode.
   1287  *   2) allocate the next available inode after the requested
   1288  *	inode in the specified cylinder group.
   1289  */
   1290 static ino_t
   1291 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
   1292 {
   1293 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
   1294 	struct fs *fs = ip->i_fs;
   1295 	struct cg *cgp;
   1296 	struct buf *bp;
   1297 	int start, len, loc, map, i;
   1298 	char *iused;
   1299 
   1300 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
   1301 		return (0);
   1302 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
   1303 	    (int)fs->fs_cgsize);
   1304 
   1305 	cgp = bp->b_un.b_cg;
   1306 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
   1307 	    cgp->cg_cs.cs_nifree == 0) {
   1308 		brelse(bp);
   1309 		return (0);
   1310 	}
   1311 	iused = cg_inosused(cgp);
   1312 	mutex_enter(&ufsvfsp->vfs_lock);
   1313 	/*
   1314 	 * While we are waiting for the mutex, someone may have taken
   1315 	 * the last available inode.  Need to recheck.
   1316 	 */
   1317 	if (cgp->cg_cs.cs_nifree == 0) {
   1318 		mutex_exit(&ufsvfsp->vfs_lock);
   1319 		brelse(bp);
   1320 		return (0);
   1321 	}
   1322 
   1323 	cgp->cg_time = gethrestime_sec();
   1324 	if (ipref) {
   1325 		ipref %= fs->fs_ipg;
   1326 		if (isclr(iused, ipref))
   1327 			goto gotit;
   1328 	}
   1329 	start = cgp->cg_irotor / NBBY;
   1330 	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
   1331 	loc = skpc(0xff, (uint_t)len, &iused[start]);
   1332 	if (loc == 0) {
   1333 		len = start + 1;
   1334 		start = 0;
   1335 		loc = skpc(0xff, (uint_t)len, &iused[0]);
   1336 		if (loc == 0) {
   1337 			mutex_exit(&ufsvfsp->vfs_lock);
   1338 			(void) ufs_fault(ITOV(ip),
   1339 			    "ialloccg: map corrupted, cg = %d, irotor = %d, "
   1340 			    "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
   1341 			return (0);
   1342 		}
   1343 	}
   1344 	i = start + len - loc;
   1345 	map = iused[i];
   1346 	ipref = i * NBBY;
   1347 	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
   1348 		if ((map & i) == 0) {
   1349 			cgp->cg_irotor = ipref;
   1350 			goto gotit;
   1351 		}
   1352 	}
   1353 
   1354 	mutex_exit(&ufsvfsp->vfs_lock);
   1355 	(void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s",
   1356 	    fs->fs_fsmnt);
   1357 	return (0);
   1358 gotit:
   1359 	setbit(iused, ipref);
   1360 	cgp->cg_cs.cs_nifree--;
   1361 	fs->fs_cstotal.cs_nifree--;
   1362 	fs->fs_cs(fs, cg).cs_nifree--;
   1363 	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
   1364 		cgp->cg_cs.cs_ndir++;
   1365 		fs->fs_cstotal.cs_ndir++;
   1366 		fs->fs_cs(fs, cg).cs_ndir++;
   1367 	}
   1368 	fs->fs_fmod = 1;
   1369 	ufs_notclean(ufsvfsp);
   1370 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
   1371 	TRANS_SI(ufsvfsp, fs, cg);
   1372 	bdrwrite(bp);
   1373 	return (cg * fs->fs_ipg + ipref);
   1374 }
   1375 
   1376 /*
   1377  * Find a block of the specified size in the specified cylinder group.
   1378  *
   1379  * It is a panic if a request is made to find a block if none are
   1380  * available.
   1381  */
   1382 static daddr_t
   1383 mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref,
   1384 	int allocsiz)
   1385 {
   1386 	struct fs *fs	= ufsvfsp->vfs_fs;
   1387 	daddr_t bno, cfrag;
   1388 	int start, len, loc, i, last, first, secondtime;
   1389 	int blk, field, subfield, pos;
   1390 	int gotit;
   1391 
   1392 	/*
   1393 	 * ufsvfs->vfs_lock is held when calling this.
   1394 	 */
   1395 	/*
   1396 	 * Find the fragment by searching through the
   1397 	 * free block map for an appropriate bit pattern.
   1398 	 */
   1399 	if (bpref)
   1400 		start = dtogd(fs, bpref) / NBBY;
   1401 	else
   1402 		start = cgp->cg_frotor / NBBY;
   1403 	/*
   1404 	 * the following loop performs two scans -- the first scan
   1405 	 * searches the bottom half of the array for a match and the
   1406 	 * second scan searches the top half of the array.  The loops
   1407 	 * have been merged just to make things difficult.
   1408 	 */
   1409 	first = start;
   1410 	last = howmany(fs->fs_fpg, NBBY);
   1411 	secondtime = 0;
   1412 	cfrag = cgp->cg_cgx * fs->fs_fpg;
   1413 	while (first < last) {
   1414 		len = last - first;
   1415 		/*
   1416 		 * search the array for a match
   1417 		 */
   1418 		loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first],
   1419 		    (uchar_t *)fragtbl[fs->fs_frag],
   1420 		    (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
   1421 		/*
   1422 		 * match found
   1423 		 */
   1424 		if (loc) {
   1425 			bno = (last - loc) * NBBY;
   1426 
   1427 			/*
   1428 			 * Found the byte in the map, sift
   1429 			 * through the bits to find the selected frag
   1430 			 */
   1431 			cgp->cg_frotor = bno;
   1432 			gotit = 0;
   1433 			for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
   1434 				blk = blkmap(fs, cg_blksfree(cgp), bno);
   1435 				blk <<= 1;
   1436 				field = around[allocsiz];
   1437 				subfield = inside[allocsiz];
   1438 				for (pos = 0;
   1439 				    pos <= fs->fs_frag - allocsiz;
   1440 				    pos++) {
   1441 					if ((blk & field) == subfield) {
   1442 						gotit++;
   1443 						break;
   1444 					}
   1445 					field <<= 1;
   1446 					subfield <<= 1;
   1447 				}
   1448 				if (gotit)
   1449 					break;
   1450 			}
   1451 			bno += pos;
   1452 
   1453 			/*
   1454 			 * success if block is *not* being converted from
   1455 			 * metadata into userdata (harpy).  If so, ignore.
   1456 			 */
   1457 			if (!TRANS_ISCANCEL(ufsvfsp,
   1458 			    ldbtob(fsbtodb(fs, (cfrag+bno))),
   1459 			    allocsiz * fs->fs_fsize))
   1460 				return (bno);
   1461 
   1462 			/*
   1463 			 * keep looking -- this block is being converted
   1464 			 */
   1465 			first = (last - loc) + 1;
   1466 			loc = 0;
   1467 			if (first < last)
   1468 				continue;
   1469 		}
   1470 		/*
   1471 		 * no usable matches in bottom half -- now search the top half
   1472 		 */
   1473 		if (secondtime)
   1474 			/*
   1475 			 * no usable matches in top half -- all done
   1476 			 */
   1477 			break;
   1478 		secondtime = 1;
   1479 		last = start + 1;
   1480 		first = 0;
   1481 	}
   1482 	/*
   1483 	 * no usable matches
   1484 	 */
   1485 	return ((daddr_t)-1);
   1486 }
   1487 
   1488 #define	UFSNADDR (NDADDR + NIADDR)	/* NADDR applies to (obsolete) S5FS */
   1489 #define	IB(i)	(NDADDR + (i))	/* index of i'th indirect block ptr */
   1490 #define	SINGLE	0		/* single indirect block ptr */
   1491 #define	DOUBLE	1		/* double indirect block ptr */
   1492 #define	TRIPLE	2		/* triple indirect block ptr */
   1493 
   1494 /*
   1495  * Acquire a write lock, and keep trying till we get it
   1496  */
   1497 static int
   1498 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf)
   1499 {
   1500 	int err = 0;
   1501 
   1502 lockagain:
   1503 	do {
   1504 		err = ufs_fiolfss(vp, lf);
   1505 		if (err)
   1506 			return (err);
   1507 	} while (!LOCKFS_IS_ULOCK(lf));
   1508 
   1509 	lf->lf_lock = LOCKFS_WLOCK;
   1510 	lf->lf_flags = 0;
   1511 	lf->lf_comment = NULL;
   1512 	err = ufs__fiolfs(vp, lf, 1, 0);
   1513 
   1514 	if (err == EBUSY || err == EINVAL)
   1515 		goto lockagain;
   1516 
   1517 	return (err);
   1518 }
   1519 
   1520 /*
   1521  * Release the write lock
   1522  */
   1523 static int
   1524 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf)
   1525 {
   1526 	int err = 0;
   1527 
   1528 	lf->lf_lock = LOCKFS_ULOCK;
   1529 	lf->lf_flags = 0;
   1530 	err = ufs__fiolfs(vp, lf, 1, 0);
   1531 	return (err);
   1532 }
   1533 
   1534 struct allocsp_undo {
   1535 	daddr_t offset;
   1536 	daddr_t blk;
   1537 	struct allocsp_undo *next;
   1538 };
   1539 
   1540 /*
   1541  * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
   1542  * file system. For direct blocks, the blocks are allocated from the offset
   1543  * requested to the block boundary, then any full blocks are allocated,
   1544  * and finally any remainder.
   1545  * For indirect blocks the blocks are not initialized and are
   1546  * only marked as allocated. These addresses are then stored as negative
   1547  * block numbers in the inode to imply special handling. UFS has been modified
   1548  * where necessary to understand this new notion.
   1549  * Successfully fallocated files will have IFALLOCATE cflag set in the inode.
   1550  */
   1551 int
   1552 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
   1553 {
   1554 	struct lockfs lf;
   1555 	int berr, err, resv, issync;
   1556 	off_t istart, len; /* istart, special for idb */
   1557 	struct inode *ip;
   1558 	struct fs *fs;
   1559 	struct ufsvfs *ufsvfsp;
   1560 	u_offset_t resid, i, uoff;
   1561 	daddr32_t db_undo[NDADDR];	/* old direct blocks */
   1562 	struct allocsp_undo *ib_undo = NULL;	/* ib undo */
   1563 	struct allocsp_undo *undo = NULL;
   1564 	u_offset_t osz;			/* old file size */
   1565 	int chunkblks = 0;		/* # of blocks in 1 allocation */
   1566 	int cnt = 0;
   1567 	daddr_t allocblk;
   1568 	daddr_t totblks = 0;
   1569 	struct ulockfs	*ulp;
   1570 	size_t done_len;
   1571 	int nbytes, offsetn;
   1572 
   1573 
   1574 	ASSERT(vp->v_type == VREG);
   1575 
   1576 	ip = VTOI(vp);
   1577 	fs = ip->i_fs;
   1578 	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
   1579 		err = EIO;
   1580 		goto out_allocsp;
   1581 	}
   1582 
   1583 	istart = blkroundup(fs, (lp->l_start));
   1584 	len = blkroundup(fs, (lp->l_len));
   1585 	chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
   1586 	ulp = &ufsvfsp->vfs_ulockfs;
   1587 
   1588 	if (lp->l_start < 0 || lp->l_len <= 0)
   1589 		return (EINVAL);
   1590 
   1591 	/* Quickly check to make sure we have space before we proceed */
   1592 	if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
   1593 		if (TRANS_ISTRANS(ufsvfsp)) {
   1594 			ufs_delete_drain_wait(ufsvfsp, 1);
   1595 			if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
   1596 				return (ENOSPC);
   1597 		} else
   1598 			return (ENOSPC);
   1599 	}
   1600 
   1601 	/*
   1602 	 * We will keep i_rwlock locked as WRITER through out the function
   1603 	 * since we don't want anyone else reading or writing to the inode
   1604 	 * while we are in the middle of fallocating the file.
   1605 	 */
   1606 	rw_enter(&ip->i_rwlock, RW_WRITER);
   1607 
   1608 	/* Back up the direct block list, used for undo later if necessary */
   1609 	rw_enter(&ip->i_contents, RW_READER);
   1610 	for (i = 0; i < NDADDR; i++)
   1611 		db_undo[i] = ip->i_db[i];
   1612 	osz = ip->i_size;
   1613 	rw_exit(&ip->i_contents);
   1614 
   1615 	/* Write lock the file system */
   1616 	if (err = allocsp_wlockfs(vp, &lf))
   1617 		goto exit;
   1618 
   1619 	/*
   1620 	 * Allocate any direct blocks now.
   1621 	 * Blocks are allocated from the offset requested to the block
   1622 	 * boundary, then any full blocks are allocated, and finally any
   1623 	 * remainder.
   1624 	 */
   1625 	if (lblkno(fs, lp->l_start) < NDADDR) {
   1626 		ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize),
   1627 		    &resv, &resid);
   1628 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
   1629 
   1630 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   1631 		rw_enter(&ip->i_contents, RW_WRITER);
   1632 
   1633 		done_len = 0;
   1634 		while ((done_len < lp->l_len) &&
   1635 		    (lblkno(fs, lp->l_start + done_len) < NDADDR)) {
   1636 			uoff = (offset_t)(lp->l_start + done_len);
   1637 			offsetn = (int)blkoff(fs, uoff);
   1638 			nbytes = (int)MIN(fs->fs_bsize - offsetn,
   1639 			    lp->l_len - done_len);
   1640 
   1641 			berr = bmap_write(ip, uoff, offsetn + nbytes,
   1642 			    BI_FALLOCATE, &allocblk, cr);
   1643 			/* Yikes error, quit */
   1644 			if (berr) {
   1645 				TRANS_INODE(ufsvfsp, ip);
   1646 				rw_exit(&ip->i_contents);
   1647 				rw_exit(&ufsvfsp->vfs_dqrwlock);
   1648 				TRANS_END_CSYNC(ufsvfsp, err, issync,
   1649 				    TOP_ALLOCSP, resv);
   1650 				err = allocsp_unlockfs(vp, &lf);
   1651 				goto exit;
   1652 			}
   1653 
   1654 			if (allocblk) {
   1655 				totblks++;
   1656 				if ((uoff + nbytes) > ip->i_size)
   1657 					ip->i_size = (uoff + nbytes);
   1658 			}
   1659 			done_len += nbytes;
   1660 		}
   1661 
   1662 		TRANS_INODE(ufsvfsp, ip);
   1663 		rw_exit(&ip->i_contents);
   1664 		rw_exit(&ufsvfsp->vfs_dqrwlock);
   1665 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
   1666 
   1667 		/* start offset for indirect allocation */
   1668 		istart =  (uoff + nbytes);
   1669 	}
   1670 
   1671 	/* Break the transactions into vfs_iotransz units */
   1672 	ufs_trans_trunc_resv(ip, ip->i_size +
   1673 	    blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid);
   1674 	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
   1675 
   1676 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   1677 	rw_enter(&ip->i_contents, RW_WRITER);
   1678 
   1679 	/* Now go about fallocating necessary indirect blocks */
   1680 	for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) {
   1681 		berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
   1682 		    &allocblk, cr);
   1683 		if (berr) {
   1684 			TRANS_INODE(ufsvfsp, ip);
   1685 			rw_exit(&ip->i_contents);
   1686 			rw_exit(&ufsvfsp->vfs_dqrwlock);
   1687 			TRANS_END_CSYNC(ufsvfsp, err, issync,
   1688 			    TOP_ALLOCSP, resv);
   1689 			err = allocsp_unlockfs(vp, &lf);
   1690 			goto exit;
   1691 		}
   1692 
   1693 		/* Update the blk counter only if new block was added */
   1694 		if (allocblk) {
   1695 			/* Save undo information */
   1696 			undo = kmem_alloc(sizeof (struct allocsp_undo),
   1697 			    KM_SLEEP);
   1698 			undo->offset = i;
   1699 			undo->blk = allocblk;
   1700 			undo->next = ib_undo;
   1701 			ib_undo = undo;
   1702 			totblks++;
   1703 
   1704 			if (i >= ip->i_size)
   1705 				ip->i_size += fs->fs_bsize;
   1706 		}
   1707 		cnt++;
   1708 
   1709 		/* Being a good UFS citizen, let others get a share */
   1710 		if (cnt == chunkblks) {
   1711 			/*
   1712 			 * If there are waiters or the fs is hard locked,
   1713 			 * error locked, or read-only error locked,
   1714 			 * quit with EIO
   1715 			 */
   1716 			if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
   1717 			    ULOCKFS_IS_ROELOCK(ulp)) {
   1718 				ip->i_cflags |= IFALLOCATE;
   1719 				TRANS_INODE(ufsvfsp, ip);
   1720 				rw_exit(&ip->i_contents);
   1721 				rw_exit(&ufsvfsp->vfs_dqrwlock);
   1722 
   1723 				TRANS_END_CSYNC(ufsvfsp, err, issync,
   1724 				    TOP_ALLOCSP, resv);
   1725 				rw_exit(&ip->i_rwlock);
   1726 				(void) allocsp_unlockfs(vp, &lf);
   1727 				return (EIO);
   1728 			}
   1729 
   1730 			TRANS_INODE(ufsvfsp, ip);
   1731 			rw_exit(&ip->i_contents);
   1732 			rw_exit(&ufsvfsp->vfs_dqrwlock);
   1733 
   1734 			/* End the current transaction */
   1735 			TRANS_END_CSYNC(ufsvfsp, err, issync,
   1736 			    TOP_ALLOCSP, resv);
   1737 
   1738 			if (CV_HAS_WAITERS(&ulp->ul_cv)) {
   1739 				/* Release the write lock */
   1740 				if (err = allocsp_unlockfs(vp, &lf))
   1741 					goto exit;
   1742 
   1743 				/* Wake up others waiting to do operations */
   1744 				mutex_enter(&ulp->ul_lock);
   1745 				cv_broadcast(&ulp->ul_cv);
   1746 				mutex_exit(&ulp->ul_lock);
   1747 
   1748 				/* Grab the write lock again */
   1749 				if (err = allocsp_wlockfs(vp, &lf))
   1750 					goto exit;
   1751 			} /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
   1752 
   1753 			/* Reserve more space in log for this file */
   1754 			ufs_trans_trunc_resv(ip,
   1755 			    ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz),
   1756 			    &resv, &resid);
   1757 			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
   1758 
   1759 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   1760 			rw_enter(&ip->i_contents, RW_WRITER);
   1761 
   1762 			cnt = 0;	/* reset cnt b/c of new transaction */
   1763 		}
   1764 	}
   1765 
   1766 	if (!err && !berr)
   1767 		ip->i_cflags |= IFALLOCATE;
   1768 
   1769 	/* If the file has grown then correct the file size */
   1770 	if (osz < (lp->l_start + lp->l_len))
   1771 		ip->i_size = (lp->l_start + lp->l_len);
   1772 
   1773 	/* Release locks, end log transaction and unlock fs */
   1774 	TRANS_INODE(ufsvfsp, ip);
   1775 	rw_exit(&ip->i_contents);
   1776 	rw_exit(&ufsvfsp->vfs_dqrwlock);
   1777 
   1778 	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
   1779 	err = allocsp_unlockfs(vp, &lf);
   1780 
   1781 	/*
   1782 	 * @ exit label, we should no longer be holding the fs write lock, and
   1783 	 * all logging transactions should have been ended. We still hold
   1784 	 * ip->i_rwlock.
   1785 	 */
   1786 exit:
   1787 	/*
   1788 	 * File has grown larger than 2GB. Set flag
   1789 	 * in superblock to indicate this, if it
   1790 	 * is not already set.
   1791 	 */
   1792 	if ((ip->i_size > MAXOFF32_T) &&
   1793 	    !(fs->fs_flags & FSLARGEFILES)) {
   1794 		ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
   1795 		mutex_enter(&ufsvfsp->vfs_lock);
   1796 		fs->fs_flags |= FSLARGEFILES;
   1797 		ufs_sbwrite(ufsvfsp);
   1798 		mutex_exit(&ufsvfsp->vfs_lock);
   1799 	}
   1800 
   1801 	/*
   1802 	 * Since we couldn't allocate completely, we will undo the allocations.
   1803 	 */
   1804 	if (berr) {
   1805 		ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid);
   1806 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
   1807 
   1808 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
   1809 		rw_enter(&ip->i_contents, RW_WRITER);
   1810 
   1811 		/* Direct blocks */
   1812 		for (i = 0; i < NDADDR; i++) {
   1813 			/*
   1814 			 * Only free the block if they are not same, and
   1815 			 * the old one isn't zero (the fragment was
   1816 			 * re-allocated).
   1817 			 */
   1818 			if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) {
   1819 				free(ip, ip->i_db[i], fs->fs_bsize, 0);
   1820 				ip->i_db[i] = 0;
   1821 			}
   1822 		}
   1823 
   1824 		/* Undo the indirect blocks */
   1825 		while (ib_undo != NULL) {
   1826 			undo = ib_undo;
   1827 			err = bmap_set_bn(vp, undo->offset, 0);
   1828 			if (err)
   1829 				cmn_err(CE_PANIC, "ufs_allocsp(): failed to "
   1830 				    "undo allocation of block %ld",
   1831 				    undo->offset);
   1832 			free(ip, undo->blk, fs->fs_bsize, I_IBLK);
   1833 			ib_undo = undo->next;
   1834 			kmem_free(undo, sizeof (struct allocsp_undo));
   1835 		}
   1836 
   1837 		ip->i_size = osz;
   1838 		TRANS_INODE(ufsvfsp, ip);
   1839 
   1840 		rw_exit(&ip->i_contents);
   1841 		rw_exit(&ufsvfsp->vfs_dqrwlock);
   1842 
   1843 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
   1844 
   1845 		rw_exit(&ip->i_rwlock);
   1846 		return (berr);
   1847 	}
   1848 
   1849 	/*
   1850 	 * Don't forget to free the undo chain :)
   1851 	 */
   1852 	while (ib_undo != NULL) {
   1853 		undo = ib_undo;
   1854 		ib_undo = undo->next;
   1855 		kmem_free(undo, sizeof (struct allocsp_undo));
   1856 	}
   1857 
   1858 	rw_exit(&ip->i_rwlock);
   1859 
   1860 out_allocsp:
   1861 	return (err);
   1862 }
   1863 
   1864 /*
   1865  * Free storage space associated with the specified inode.  The portion
   1866  * to be freed is specified by lp->l_start and lp->l_len (already
   1867  * normalized to a "whence" of 0).
   1868  *
   1869  * This is an experimental facility whose continued existence is not
   1870  * guaranteed.  Currently, we only support the special case
   1871  * of l_len == 0, meaning free to end of file.
   1872  *
   1873  * Blocks are freed in reverse order.  This FILO algorithm will tend to
   1874  * maintain a contiguous free list much longer than FIFO.
   1875  * See also ufs_itrunc() in ufs_inode.c.
   1876  *
   1877  * Bug: unused bytes in the last retained block are not cleared.
   1878  * This may result in a "hole" in the file that does not read as zeroes.
   1879  */
   1880 /* ARGSUSED */
   1881 int
   1882 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr)
   1883 {
   1884 	int i;
   1885 	struct inode *ip = VTOI(vp);
   1886 	int error;
   1887 
   1888 	ASSERT(vp->v_type == VREG);
   1889 	ASSERT(lp->l_start >= 0);	/* checked by convoff */
   1890 
   1891 	if (lp->l_len != 0)
   1892 		return (EINVAL);
   1893 
   1894 	rw_enter(&ip->i_contents, RW_READER);
   1895 	if (ip->i_size == (u_offset_t)lp->l_start) {
   1896 		rw_exit(&ip->i_contents);
   1897 		return (0);
   1898 	}
   1899 
   1900 	/*
   1901 	 * Check if there is any active mandatory lock on the
   1902 	 * range that will be truncated/expanded.
   1903 	 */
   1904 	if (MANDLOCK(vp, ip->i_mode)) {
   1905 		offset_t save_start;
   1906 
   1907 		save_start = lp->l_start;
   1908 
   1909 		if (ip->i_size < lp->l_start) {
   1910 			/*
   1911 			 * "Truncate up" case: need to make sure there
   1912 			 * is no lock beyond current end-of-file. To
   1913 			 * do so, we need to set l_start to the size
   1914 			 * of the file temporarily.
   1915 			 */
   1916 			lp->l_start = ip->i_size;
   1917 		}
   1918 		lp->l_type = F_WRLCK;
   1919 		lp->l_sysid = 0;
   1920 		lp->l_pid = ttoproc(curthread)->p_pid;
   1921 		i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
   1922 		rw_exit(&ip->i_contents);
   1923 		if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
   1924 		    lp->l_type != F_UNLCK) {
   1925 			return (i ? i : EAGAIN);
   1926 		}
   1927 		rw_enter(&ip->i_contents, RW_READER);
   1928 
   1929 		lp->l_start = save_start;
   1930 	}
   1931 
   1932 	/*
   1933 	 * Make sure a write isn't in progress (allocating blocks)
   1934 	 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
   1935 	 * truncate while it was allocating blocks).
   1936 	 * Grab the locks in the right order.
   1937 	 */
   1938 	rw_exit(&ip->i_contents);
   1939 	rw_enter(&ip->i_rwlock, RW_WRITER);
   1940 	error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr);
   1941 	rw_exit(&ip->i_rwlock);
   1942 	return (error);
   1943 }
   1944 
   1945 /*
   1946  * Find a cg with as close to nb contiguous bytes as possible
   1947  *	THIS MAY TAKE MANY DISK READS!
   1948  *
   1949  * Implemented in an attempt to allocate contiguous blocks for
   1950  * writing the ufs log file to, minimizing future disk head seeking
   1951  */
   1952 daddr_t
   1953 contigpref(ufsvfs_t *ufsvfsp, size_t nb, size_t minb)
   1954 {
   1955 	struct fs	*fs	= ufsvfsp->vfs_fs;
   1956 	daddr_t		nblk	= lblkno(fs, blkroundup(fs, nb));
   1957 	daddr_t		minblk	= lblkno(fs, blkroundup(fs, minb));
   1958 	daddr_t		savebno, curbno, cgbno;
   1959 	int		cg, cgblks, savecg, savenblk, curnblk, startcg;
   1960 	uchar_t		*blksfree;
   1961 	buf_t		*bp;
   1962 	struct cg	*cgp;
   1963 
   1964 	savenblk = 0;
   1965 	savecg = 0;
   1966 	savebno = 0;
   1967 
   1968 	if ((startcg = findlogstartcg(fs, nblk, minblk)) == -1)
   1969 		cg = 0;	/* Nothing suitable found */
   1970 	else
   1971 		cg = startcg;
   1972 
   1973 	for (; cg < fs->fs_ncg; ++cg) {
   1974 		/*
   1975 		 * find the largest contiguous range in this cg
   1976 		 */
   1977 		bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev,
   1978 		    (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
   1979 		    (int)fs->fs_cgsize);
   1980 		cgp = bp->b_un.b_cg;
   1981 		if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
   1982 			brelse(bp);
   1983 			continue;
   1984 		}
   1985 		blksfree = cg_blksfree(cgp);	    /* free array */
   1986 		cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */
   1987 		cgbno = 0;
   1988 		while (cgbno < cgblks && savenblk < nblk) {
   1989 			/* find a free block */
   1990 			for (; cgbno < cgblks; ++cgbno) {
   1991 				if (isblock(fs, blksfree, cgbno)) {
   1992 					if (startcg != -1) {
   1993 						brelse(bp);
   1994 						savecg = startcg;
   1995 						savebno = cgbno;
   1996 						goto done;
   1997 					} else
   1998 						break;
   1999 				}
   2000 			}
   2001 			curbno = cgbno;
   2002 			/* count the number of free blocks */
   2003 			for (curnblk = 0; cgbno < cgblks; ++cgbno) {
   2004 				if (!isblock(fs, blksfree, cgbno))
   2005 					break;
   2006 				if (++curnblk >= nblk)
   2007 					break;
   2008 			}
   2009 			if (curnblk > savenblk) {
   2010 				savecg = cg;
   2011 				savenblk = curnblk;
   2012 				savebno = curbno;
   2013 			}
   2014 		}
   2015 		brelse(bp);
   2016 		if (savenblk >= nblk)
   2017 			break;
   2018 	}
   2019 
   2020 done:
   2021 
   2022 	/* convert block offset in cg to frag offset in cg */
   2023 	savebno = blkstofrags(fs, savebno);
   2024 
   2025 	/* convert frag offset in cg to frag offset in fs */
   2026 	savebno += (savecg * fs->fs_fpg);
   2027 
   2028 	return (savebno);
   2029 }
   2030 
   2031 /*
   2032  * The object of this routine is to find a start point for the UFS log.
   2033  * Ideally the space should be allocated from the smallest possible number
   2034  * of contiguous cylinder groups. This is found by using a sliding window
   2035  * technique. The smallest window of contiguous cylinder groups, which is
   2036  * still able to accommodate the target, is found by moving the window
   2037  * through the cylinder groups in a single pass. The end of the window is
   2038  * advanced until the space is accommodated, then the start is advanced until
   2039  * it no longer fits, the end is then advanced again and so on until the
   2040  * final cylinder group is reached. The first suitable instance is recorded
   2041  * and its starting cg number is returned.
   2042  *
   2043  * If we are not able to find a minimum amount of space, represented by
   2044  * minblk, or to do so uses more than the available extents, then return -1.
   2045  */
   2046 
   2047 int
   2048 findlogstartcg(struct fs *fs, daddr_t requested, daddr_t minblk)
   2049 {
   2050 	int	 ncgs;		 /* number of cylinder groups */
   2051 	daddr_t target;		 /* amount of space sought */
   2052 	int	 cwidth, ctotal; /* current window width and total */
   2053 	int	 bwidth, btotal; /* best window width and total so far */
   2054 	int	 s;	/* index of the first element in the current window */
   2055 	int	 e;	/* index of the first element + the width */
   2056 			/*  (i.e. 1 + index of last element) */
   2057 	int	 bs; /* index of the first element in the best window so far */
   2058 	int	 header, max_extents;
   2059 
   2060 	target = requested;
   2061 	ncgs = fs->fs_ncg;
   2062 
   2063 	header = sizeof (extent_block_t) - sizeof (extent_t);
   2064 	max_extents = ((fs->fs_bsize)-header) / sizeof (extent_t);
   2065 	cwidth = ctotal = 0;
   2066 	btotal = -1;
   2067 	bwidth = ncgs;
   2068 	s = e = 0;
   2069 	while (e < ncgs) {
   2070 	/* Advance the end of the window until it accommodates the target. */
   2071 		while (ctotal < target && e < ncgs) {
   2072 			ctotal += fs->fs_cs(fs, e).cs_nbfree;
   2073 			e++;
   2074 		}
   2075 
   2076 		/*
   2077 		 * Advance the start of the window until it no longer
   2078 		 * accommodates the target.
   2079 		 */
   2080 		while (ctotal >= target && s < e) {
   2081 			/* See if this is the smallest window so far. */
   2082 			cwidth = e - s;
   2083 			if (cwidth <= bwidth) {
   2084 				if (cwidth == bwidth && ctotal <= btotal)
   2085 					goto more;
   2086 				bwidth = cwidth;
   2087 				btotal = ctotal;
   2088 				bs = s;
   2089 			}
   2090 more:
   2091 			ctotal -= fs->fs_cs(fs, s).cs_nbfree;
   2092 			s++;
   2093 		}
   2094 	}
   2095 
   2096 	/*
   2097 	 * If we cannot allocate the minimum required or we use too many
   2098 	 * extents to do so, return -1.
   2099 	 */
   2100 	if (btotal < minblk || bwidth > max_extents)
   2101 		bs = -1;
   2102 
   2103 	return (bs);
   2104 }
   2105