Home | History | Annotate | Download | only in fsck
      1 /*
      2  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
      3  * Use is subject to license terms.
      4  */
      5 
      6 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
      7 /*	  All Rights Reserved  	*/
      8 
      9 /*
     10  * Copyright (c) 1980, 1986, 1990 The Regents of the University of California.
     11  * All rights reserved.
     12  *
     13  * Redistribution and use in source and binary forms are permitted
     14  * provided that: (1) source distributions retain this entire copyright
     15  * notice and comment, and (2) distributions including binaries display
     16  * the following acknowledgement:  ``This product includes software
     17  * developed by the University of California, Berkeley and its contributors''
     18  * in the documentation or other materials provided with the distribution
     19  * and in all advertising materials mentioning features or use of this
     20  * software. Neither the name of the University nor the names of its
     21  * contributors may be used to endorse or promote products derived
     22  * from this software without specific prior written permission.
     23  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
     24  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
     25  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
     26  */
     27 
     28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     29 
     30 #include <stdio.h>
     31 #include <string.h>
     32 #include <stdlib.h>
     33 #include <unistd.h>
     34 #include <time.h>
     35 #include <limits.h>
     36 #include <sys/param.h>
     37 #include <sys/types.h>
     38 #include <sys/sysmacros.h>
     39 #include <sys/mntent.h>
     40 #include <sys/vnode.h>
     41 #include <sys/fs/ufs_inode.h>
     42 #include <sys/fs/ufs_fs.h>
     43 #define	_KERNEL
     44 #include <sys/fs/ufs_fsdir.h>
     45 #undef _KERNEL
     46 #include <pwd.h>
     47 #include "fsck.h"
     48 
     49 static int get_indir_offsets(int, daddr_t, int *, int *);
     50 static int clearanentry(struct inodesc *);
     51 static void pdinode(struct dinode *);
     52 static void inoflush(void);
     53 static void mark_delayed_inodes(fsck_ino_t, daddr32_t);
     54 static int iblock(struct inodesc *, int, u_offset_t, enum cki_action);
     55 static struct inoinfo *search_cache(struct inoinfo *, fsck_ino_t);
     56 static int ckinode_common(struct dinode *, struct inodesc *, enum cki_action);
     57 static int lookup_dotdot_ino(fsck_ino_t);
     58 
     59 /*
     60  * ckinode() essentially traverses the blocklist of the provided
     61  * inode.  For each block either the caller-supplied callback (id_func
     62  * in the provided struct inodesc) or dirscan() is invoked.  Which is
     63  * chosen is controlled by what type of traversal was requested
     64  * (id_type) - if it was for an ADDR or ACL, use the callback,
     65  * otherwise it is assumed to be DATA (i.e., a directory) whose
     66  * contents need to be scanned.
     67  *
     68  * Note that a directory inode can get passed in with a type of ADDR;
     69  * the type field is orthogonal to the IFMT value.  This is so that
     70  * the file aspects (no duplicate blocks, etc) of a directory can be
     71  * verified just like is done for any other file, or the actual
     72  * contents can be scanned so that connectivity and such can be
     73  * investigated.
     74  *
     75  * The traversal is controlled by flags in the return value of
     76  * dirscan() or the callback.  Five flags are defined, STOP, SKIP,
     77  * KEEPON, ALTERED, and FOUND.  Their semantics are:
     78  *
     79  *     STOP -    no further processing of this inode is desired/possible/
     80  *               feasible/etc.  This can mean that whatever the scan
     81  *               was searching for was found, or a serious
     82  *               inconsistency was encountered, or anything else
     83  *               appropriate.
     84  *
     85  *     SKIP -    something that made it impossible to continue was
     86  *               encountered, and the caller should go on to the next
     87  *               inode.  This is more for i/o failures than for
     88  *               logical inconsistencies.  Nothing actually looks for
     89  *               this.
     90  *
     91  *     KEEPON -  no more blocks of this inode need to be scanned, but
     92  *               nothing's wrong, so keep on going with the next
     93  *               inode.  It is similar to STOP, except that
     94  *               ckinode()'s caller will typically advance to the next
     95  *               inode for KEEPON, whereas it ceases scanning through
     96  *               the inodes completely for STOP.
     97  *
     98  *     ALTERED - a change was made to the inode.  If the caller sees
     99  *               this set, it should make sure to flush out the
    100  *               changes.  Note that any data blocks read in by the
    101  *               function need to be marked dirty by it directly;
    102  *               flushing of those will happen automatically later.
    103  *
    104  *     FOUND -   whatever was being searched for was located.
    105  *               Typically combined with STOP to avoid wasting time
    106  *               doing additional looking.
    107  *
    108  * During a traversal, some state needs to be carried around.  At the
    109  * least, the callback functions need to know what inode they're
    110  * working on, which logical block, and whether or not fixing problems
    111  * when they're encountered is desired.  Rather than try to guess what
    112  * else might be needed (and thus end up passing way more arguments
    113  * than is reasonable), all the possibilities have been bundled in
    114  * struct inodesc.  About half of the fields are specific to directory
    115  * traversals, and the rest are pretty much generic to any traversal.
    116  *
    117  * The general fields are:
    118  *
    119  *     id_fix        What to do when an error is found.  Generally, this
    120  *                   is set to DONTKNOW before a traversal.  If a
    121  *                   problem is encountered, it is changed to either FIX
    122  *                   or NOFIX by the dofix() query function.  If id_fix
    123  *                   has already been set to FIX when dofix() is called, then
    124  *                   it includes the ALTERED flag (see above) in its return
    125  *                   value; the net effect is that the inode's buffer
    126  *                   will get marked dirty and written to disk at some
    127  *                   point.  If id_fix is DONTKNOW, then dofix() will
    128  *                   query the user.  If it is NOFIX, then dofix()
    129  *                   essentially does nothing.  A few routines set NOFIX
    130  *                   as the initial value, as they are performing a best-
    131  *                   effort informational task, rather than an actual
    132  *                   repair operation.
    133  *
    134  *     id_func       This is the function that will be called for every
    135  *                   logical block in the file (assuming id_type is not
    136  *                   DATA).  The logical block may represent a hole, so
    137  *                   the callback needs to be prepared to handle that
    138  *                   case.  Its return value is a combination of the flags
    139  *                   described above (SKIP, ALTERED, etc).
    140  *
    141  *     id_number     The inode number whose block list or data is being
    142  *                   scanned.
    143  *
    144  *     id_parent     When id_type is DATA, this is the inode number for
    145  *                   the parent of id_number.  Otherwise, it is
    146  *                   available for use as an extra parameter or return
    147  *                   value between the callback and ckinode()'s caller.
    148  *                   Which, if either, of those is left completely up to
    149  *                   the two routines involved, so nothing can generally
    150  *                   be assumed about the id_parent value for non-DATA
    151  *                   traversals.
    152  *
    153  *     id_lbn        This is the current logical block (not fragment)
    154  *                   number being visited by the traversal.
    155  *
    156  *     id_blkno      This is the physical block corresponding to id_lbn.
    157  *
    158  *     id_numfrags   This defines how large a block is being processed in
    159  *                   this particular invocation of the callback.
    160  *                   Usually, it will be the same as sblock.fs_frag.
    161  *                   However, if a direct block is being processed and
    162  *                   it is less than a full filesystem block,
    163  *                   id_numfrags will indicate just how many fragments
    164  *                   (starting from id_lbn) are actually part of the
    165  *                   file.
    166  *
    167  *     id_truncto    The pass 4 callback is used in several places to
    168  *                   free the blocks of a file (the `FILE HAS PROBLEM
    169  *                   FOO; CLEAR?' scenario).  This has been generalized
    170  *                   to allow truncating a file to a particular length
    171  *                   rather than always completely discarding it.  If
    172  *                   id_truncto is -1, then the entire file is released,
 *                   otherwise it is the logical block number to truncate
    174  *                   to.  This generalized interface was motivated by a
    175  *                   desire to be able to discard everything after a
    176  *                   hole in a directory, rather than the entire
    177  *                   directory.
    178  *
    179  *     id_type       Selects the type of traversal.  DATA for dirscan(),
    180  *                   ADDR or ACL for using the provided callback.
    181  *
    182  * There are several more fields used just for dirscan() traversals:
    183  *
    184  *     id_filesize   The number of bytes in the overall directory left to
    185  *                   process.
    186  *
    187  *     id_loc        Byte position within the directory block.  Should always
    188  *                   point to the start of a directory entry.
    189  *
    190  *     id_entryno    Which logical directory entry is being processed (0
    191  *                   is `.', 1 is `..', 2 and on are normal entries).
    192  *                   This field is primarily used to enable special
    193  *                   checks when looking at the first two entries.
    194  *
    195  *                   The exception (there's always an exception in fsck)
    196  *                   is that in pass 1, it tracks how many fragments are
    197  *                   being used by a particular inode.
    198  *
    199  *     id_firsthole  The first logical block number that was found to
    200  *                   be zero.  As directories are not supposed to have
    201  *                   holes, this marks where a directory should be
    202  *                   truncated down to.  A value of -1 indicates that
    203  *                   no holes were found.
    204  *
    205  *     id_dirp       A pointer to the in-memory copy of the current
    206  *                   directory entry (as identified by id_loc).
    207  *
    208  *     id_name       This is a directory entry name to either create
    209  *                   (callback is mkentry) or locate (callback is
    210  *                   chgino, findino, or findname).
    211  */
    212 int
    213 ckinode(struct dinode *dp, struct inodesc *idesc, enum cki_action action)
    214 {
    215 	struct inodesc cleardesc;
    216 	mode_t	mode;
    217 
    218 	if (idesc->id_filesize == 0)
    219 		idesc->id_filesize = (offset_t)dp->di_size;
    220 
    221 	/*
    222 	 * Our caller should be filtering out completely-free inodes
    223 	 * (mode == zero), so we'll work on the assumption that what
    224 	 * we're given has some basic validity.
    225 	 *
    226 	 * The kernel is inconsistent about MAXPATHLEN including the
    227 	 * trailing \0, so allow the more-generous length for symlinks.
    228 	 */
    229 	mode = dp->di_mode & IFMT;
    230 	if (mode == IFBLK || mode == IFCHR)
    231 		return (KEEPON);
    232 	if (mode == IFLNK && dp->di_size > MAXPATHLEN) {
    233 		pwarn("I=%d  Symlink longer than supported maximum",
    234 		    idesc->id_number);
    235 		init_inodesc(&cleardesc);
    236 		cleardesc.id_type = ADDR;
    237 		cleardesc.id_number = idesc->id_number;
    238 		cleardesc.id_fix = DONTKNOW;
    239 		clri(&cleardesc, "BAD", CLRI_VERBOSE, CLRI_NOP_CORRUPT);
    240 		return (STOP);
    241 	}
    242 	return (ckinode_common(dp, idesc, action));
    243 }
    244 
    245 /*
    246  * This was split out from ckinode() to allow it to be used
    247  * without having to pass in kludge flags to suppress the
    248  * wrong-for-deletion initialization and irrelevant checks.
    249  * This feature is no longer needed, but is being kept in case
    250  * the need comes back.
    251  */
    252 static int
    253 ckinode_common(struct dinode *dp, struct inodesc *idesc,
    254 	enum cki_action action)
    255 {
    256 	offset_t offset;
    257 	struct dinode dino;
    258 	daddr_t ndb;
    259 	int indir_data_blks, last_indir_blk;
    260 	int ret, i, frags;
    261 
    262 	(void) memmove(&dino, dp, sizeof (struct dinode));
    263 	ndb = howmany(dino.di_size, (u_offset_t)sblock.fs_bsize);
    264 
    265 	for (i = 0; i < NDADDR; i++) {
    266 		idesc->id_lbn++;
    267 		offset = blkoff(&sblock, dino.di_size);
    268 		if ((--ndb == 0) && (offset != 0)) {
    269 			idesc->id_numfrags =
    270 			    numfrags(&sblock, fragroundup(&sblock, offset));
    271 		} else {
    272 			idesc->id_numfrags = sblock.fs_frag;
    273 		}
    274 		if (dino.di_db[i] == 0) {
    275 			if ((ndb > 0) && (idesc->id_firsthole < 0)) {
    276 				idesc->id_firsthole = i;
    277 			}
    278 			continue;
    279 		}
    280 		idesc->id_blkno = dino.di_db[i];
    281 		if (idesc->id_type == ADDR || idesc->id_type == ACL)
    282 			ret = (*idesc->id_func)(idesc);
    283 		else
    284 			ret = dirscan(idesc);
    285 
    286 		/*
    287 		 * Need to clear the entry, now that we're done with
    288 		 * it.  We depend on freeblk() ignoring a request to
    289 		 * free already-free fragments to handle the problem of
    290 		 * a partial block.
    291 		 */
    292 		if ((action == CKI_TRUNCATE) &&
    293 		    (idesc->id_truncto >= 0) &&
    294 		    (idesc->id_lbn >= idesc->id_truncto)) {
    295 			dp = ginode(idesc->id_number);
    296 			/*
    297 			 * The (int) cast is safe, in that if di_size won't
    298 			 * fit, it'll be a multiple of any legal fs_frag,
    299 			 * thus giving a zero result.  That value, in turn
    300 			 * means we're doing an entire block.
    301 			 */
    302 			frags = howmany((int)dp->di_size, sblock.fs_fsize) %
    303 			    sblock.fs_frag;
    304 			if (frags == 0)
    305 				frags = sblock.fs_frag;
    306 			freeblk(idesc->id_number, dp->di_db[i],
    307 			    frags);
    308 			dp = ginode(idesc->id_number);
    309 			dp->di_db[i] = 0;
    310 			inodirty();
    311 			ret |= ALTERED;
    312 		}
    313 
    314 		if (ret & STOP)
    315 			return (ret);
    316 	}
    317 
    318 #ifdef lint
    319 	/*
    320 	 * Cure a lint complaint of ``possible use before set''.
    321 	 * Apparently it can't quite figure out the switch statement.
    322 	 */
    323 	indir_data_blks = 0;
    324 #endif
    325 	/*
    326 	 * indir_data_blks contains the number of data blocks in all
    327 	 * the previous levels for this iteration.  E.g., for the
    328 	 * single indirect case (i = 0, di_ib[i] != 0), NDADDR's worth
    329 	 * of blocks have already been covered by the direct blocks
    330 	 * (di_db[]).  At the triple indirect level (i = NIADDR - 1),
    331 	 * it is all of the number of data blocks that were covered
    332 	 * by the second indirect, single indirect, and direct block
    333 	 * levels.
    334 	 */
    335 	idesc->id_numfrags = sblock.fs_frag;
    336 	ndb = howmany(dino.di_size, (u_offset_t)sblock.fs_bsize);
    337 	for (i = 0; i < NIADDR; i++) {
    338 		(void) get_indir_offsets(i, ndb, &indir_data_blks,
    339 		    &last_indir_blk);
    340 		if (dino.di_ib[i] != 0) {
    341 			/*
    342 			 * We'll only clear di_ib[i] if the first entry (and
    343 			 * therefore all of them) is to be cleared, since we
    344 			 * only go through this code on the first entry of
    345 			 * each level of indirection.  The +1 is to account
    346 			 * for the fact that we don't modify id_lbn until
    347 			 * we actually start processing on a data block.
    348 			 */
    349 			idesc->id_blkno = dino.di_ib[i];
    350 			ret = iblock(idesc, i + 1,
    351 			    (u_offset_t)howmany(dino.di_size,
    352 			    (u_offset_t)sblock.fs_bsize) - indir_data_blks,
    353 			    action);
    354 			if ((action == CKI_TRUNCATE) &&
    355 			    (idesc->id_truncto <= indir_data_blks) &&
    356 			    ((idesc->id_lbn + 1) >= indir_data_blks) &&
    357 			    ((idesc->id_lbn + 1) <= last_indir_blk)) {
    358 				dp = ginode(idesc->id_number);
    359 				if (dp->di_ib[i] != 0) {
    360 					freeblk(idesc->id_number, dp->di_ib[i],
    361 					    sblock.fs_frag);
    362 				}
    363 			}
    364 			if (ret & STOP)
    365 				return (ret);
    366 		} else {
    367 			/*
    368 			 * Need to know which of the file's logical blocks
    369 			 * reside in the missing indirect block.  However, the
    370 			 * precise location is only needed for truncating
    371 			 * directories, and level-of-indirection precision is
    372 			 * sufficient for that.
    373 			 */
    374 			if ((indir_data_blks < ndb) &&
    375 			    (idesc->id_firsthole < 0)) {
    376 				idesc->id_firsthole = indir_data_blks;
    377 			}
    378 		}
    379 	}
    380 	return (KEEPON);
    381 }
    382 
    383 static int
    384 get_indir_offsets(int ilevel_wanted, daddr_t ndb, int *data_blks,
    385 	int *last_blk)
    386 {
    387 	int ndb_ilevel = -1;
    388 	int ilevel;
    389 	int dblks, lblk;
    390 
    391 	for (ilevel = 0; ilevel < NIADDR; ilevel++) {
    392 		switch (ilevel) {
    393 		case 0:	/* SINGLE */
    394 			dblks = NDADDR;
    395 			lblk = dblks + NINDIR(&sblock) - 1;
    396 			break;
    397 		case 1:	/* DOUBLE */
    398 			dblks = NDADDR + NINDIR(&sblock);
    399 			lblk = dblks + (NINDIR(&sblock) * NINDIR(&sblock)) - 1;
    400 			break;
    401 		case 2:	/* TRIPLE */
    402 			dblks = NDADDR + NINDIR(&sblock) +
    403 			    (NINDIR(&sblock) * NINDIR(&sblock));
    404 			lblk = dblks + (NINDIR(&sblock) * NINDIR(&sblock) *
    405 			    NINDIR(&sblock)) - 1;
    406 			break;
    407 		default:
    408 			exitstat = EXERRFATAL;
    409 			/*
    410 			 * Translate from zero-based array to
    411 			 * one-based human-style counting.
    412 			 */
    413 			errexit("panic: indirection level %d not 1, 2, or 3",
    414 			    ilevel + 1);
    415 			/* NOTREACHED */
    416 		}
    417 
    418 		if (dblks < ndb && ndb <= lblk)
    419 			ndb_ilevel = ilevel;
    420 
    421 		if (ilevel == ilevel_wanted) {
    422 			if (data_blks != NULL)
    423 				*data_blks = dblks;
    424 			if (last_blk != NULL)
    425 				*last_blk = lblk;
    426 		}
    427 	}
    428 
    429 	return (ndb_ilevel);
    430 }
    431 
    432 static int
    433 iblock(struct inodesc *idesc, int ilevel, u_offset_t iblks,
    434 	enum cki_action action)
    435 {
    436 	struct bufarea *bp;
    437 	int i, n;
    438 	int (*func)(struct inodesc *) = NULL;
    439 	u_offset_t fsbperindirb;
    440 	daddr32_t last_lbn;
    441 	int nif;
    442 	char buf[BUFSIZ];
    443 
    444 	n = KEEPON;
    445 
    446 	switch (idesc->id_type) {
    447 	case ADDR:
    448 		func = idesc->id_func;
    449 		if (((n = (*func)(idesc)) & KEEPON) == 0)
    450 				return (n);
    451 		break;
    452 	case ACL:
    453 		func = idesc->id_func;
    454 		break;
    455 	case DATA:
    456 		func = dirscan;
    457 		break;
    458 	default:
    459 		errexit("unknown inodesc type %d in iblock()", idesc->id_type);
    460 		/* NOTREACHED */
    461 	}
    462 	if (chkrange(idesc->id_blkno, idesc->id_numfrags)) {
    463 		return ((idesc->id_type == ACL) ? STOP : SKIP);
    464 	}
    465 
    466 	bp = getdatablk(idesc->id_blkno, (size_t)sblock.fs_bsize);
    467 	if (bp->b_errs != 0) {
    468 		brelse(bp);
    469 		return (SKIP);
    470 	}
    471 
    472 	ilevel--;
    473 	/*
    474 	 * Trivia note: the BSD fsck has the number of bytes remaining
    475 	 * as the third argument to iblock(), so the equivalent of
    476 	 * fsbperindirb starts at fs_bsize instead of one.  We're
    477 	 * working in units of filesystem blocks here, not bytes or
    478 	 * fragments.
    479 	 */
    480 	for (fsbperindirb = 1, i = 0; i < ilevel; i++) {
    481 		fsbperindirb *= (u_offset_t)NINDIR(&sblock);
    482 	}
    483 	/*
    484 	 * nif indicates the next "free" pointer (as an array index) in this
    485 	 * indirect block, based on counting the blocks remaining in the
    486 	 * file after subtracting all previously processed blocks.
    487 	 * This figure is based on the size field of the inode.
    488 	 *
    489 	 * Note that in normal operation, nif may initially be calculated
    490 	 * as larger than the number of pointers in this block (as when
    491 	 * there are more indirect blocks following); if that is
    492 	 * the case, nif is limited to the max number of pointers per
    493 	 * indirect block.
    494 	 *
    495 	 * Also note that if an inode is inconsistent (has more blocks
    496 	 * allocated to it than the size field would indicate), the sweep
    497 	 * through any indirect blocks directly pointed at by the inode
    498 	 * continues. Since the block offset of any data blocks referenced
    499 	 * by these indirect blocks is greater than the size of the file,
    500 	 * the index nif may be computed as a negative value.
    501 	 * In this case, we reset nif to indicate that all pointers in
    502 	 * this retrieval block should be zeroed and the resulting
    503 	 * unreferenced data and/or retrieval blocks will be recovered
    504 	 * through garbage collection later.
    505 	 */
    506 	nif = (offset_t)howmany(iblks, fsbperindirb);
    507 	if (nif > NINDIR(&sblock))
    508 		nif = NINDIR(&sblock);
    509 	else if (nif < 0)
    510 		nif = 0;
    511 	/*
    512 	 * first pass: all "free" retrieval pointers (from [nif] thru
    513 	 * 	the end of the indirect block) should be zero. (This
    514 	 *	assertion does not hold for directories, which may be
    515 	 *	truncated without releasing their allocated space)
    516 	 */
    517 	if (nif < NINDIR(&sblock) && (idesc->id_func == pass1check ||
    518 	    idesc->id_func == pass3bcheck)) {
    519 		for (i = nif; i < NINDIR(&sblock); i++) {
    520 			if (bp->b_un.b_indir[i] == 0)
    521 				continue;
    522 			(void) sprintf(buf, "PARTIALLY TRUNCATED INODE I=%lu",
    523 			    (ulong_t)idesc->id_number);
    524 			if (preen) {
    525 				pfatal(buf);
    526 			} else if (dofix(idesc, buf)) {
    527 				freeblk(idesc->id_number,
    528 				    bp->b_un.b_indir[i],
    529 				    sblock.fs_frag);
    530 				bp->b_un.b_indir[i] = 0;
    531 				dirty(bp);
    532 			}
    533 		}
    534 		flush(fswritefd, bp);
    535 	}
    536 	/*
    537 	 * second pass: all retrieval pointers referring to blocks within
    538 	 *	a valid range [0..filesize] (both indirect and data blocks)
    539 	 *	are examined in the same manner as ckinode() checks the
    540 	 *	direct blocks in the inode.  Sweep through from
    541 	 *	the first pointer in this retrieval block to [nif-1].
    542 	 */
    543 	last_lbn = howmany(idesc->id_filesize, sblock.fs_bsize);
    544 	for (i = 0; i < nif; i++) {
    545 		if (ilevel == 0)
    546 			idesc->id_lbn++;
    547 		if (bp->b_un.b_indir[i] != 0) {
    548 			idesc->id_blkno = bp->b_un.b_indir[i];
    549 			if (ilevel > 0) {
    550 				n = iblock(idesc, ilevel, iblks, action);
    551 				/*
    552 				 * Each iteration decreases "remaining block
    553 				 * count" by the number of blocks accessible
    554 				 * by a pointer at this indirect block level.
    555 				 */
    556 				iblks -= fsbperindirb;
    557 			} else {
    558 				/*
    559 				 * If we're truncating, func will discard
    560 				 * the data block for us.
    561 				 */
    562 				n = (*func)(idesc);
    563 			}
    564 
    565 			if ((action == CKI_TRUNCATE) &&
    566 			    (idesc->id_truncto >= 0) &&
    567 			    (idesc->id_lbn >= idesc->id_truncto)) {
    568 				freeblk(idesc->id_number,  bp->b_un.b_indir[i],
    569 				    sblock.fs_frag);
    570 			}
    571 
    572 			/*
    573 			 * Note that truncation never gets STOP back
    574 			 * under normal circumstances.  Abnormal would
    575 			 * be a bad acl short-circuit in iblock() or
    576 			 * an out-of-range failure in pass4check().
    577 			 * We still want to keep going when truncating
    578 			 * under those circumstances, since the whole
    579 			 * point of truncating is to get rid of all
    580 			 * that.
    581 			 */
    582 			if ((n & STOP) && (action != CKI_TRUNCATE)) {
    583 				brelse(bp);
    584 				return (n);
    585 			}
    586 		} else {
    587 			if ((idesc->id_lbn < last_lbn) &&
    588 			    (idesc->id_firsthole < 0)) {
    589 				idesc->id_firsthole = idesc->id_lbn;
    590 			}
    591 			if (idesc->id_type == DATA) {
    592 				/*
    593 				 * No point in continuing in the indirect
    594 				 * blocks of a directory, since they'll just
    595 				 * get freed anyway.
    596 				 */
    597 				brelse(bp);
    598 				return ((n & ~KEEPON) | STOP);
    599 			}
    600 		}
    601 	}
    602 
    603 	brelse(bp);
    604 	return (KEEPON);
    605 }
    606 
    607 /*
    608  * Check that a block is a legal block number.
    609  * Return 0 if in range, 1 if out of range.
    610  */
    611 int
    612 chkrange(daddr32_t blk, int cnt)
    613 {
    614 	int c;
    615 
    616 	if (cnt <= 0 || blk <= 0 || ((unsigned)blk >= (unsigned)maxfsblock) ||
    617 	    ((cnt - 1) > (maxfsblock - blk))) {
    618 		if (debug)
    619 			(void) printf(
    620 			    "Bad fragment range: should be 1 <= %d..%d < %d\n",
    621 			    blk, blk + cnt, maxfsblock);
    622 		return (1);
    623 	}
    624 	if ((cnt > sblock.fs_frag) ||
    625 	    ((fragnum(&sblock, blk) + cnt) > sblock.fs_frag)) {
    626 		if (debug)
    627 			(void) printf("Bad fragment size: size %d\n", cnt);
    628 		return (1);
    629 	}
    630 	c = dtog(&sblock, blk);
    631 	if (blk < cgdmin(&sblock, c)) {
    632 		if ((unsigned)(blk + cnt) > (unsigned)cgsblock(&sblock, c)) {
    633 			if (debug)
    634 				(void) printf(
    635 	    "Bad fragment position: %d..%d spans start of cg metadata\n",
    636 				    blk, blk + cnt);
    637 			return (1);
    638 		}
    639 	} else {
    640 		if ((unsigned)(blk + cnt) > (unsigned)cgbase(&sblock, c+1)) {
    641 			if (debug)
    642 				(void) printf(
    643 				    "Bad frag pos: %d..%d crosses end of cg\n",
    644 				    blk, blk + cnt);
    645 			return (1);
    646 		}
    647 	}
    648 	return (0);
    649 }
    650 
    651 /*
    652  * General purpose interface for reading inodes.
    653  */
    654 
    655 /*
    656  * Note that any call to ginode() can potentially invalidate any
    657  * dinode pointers previously acquired from it.  To avoid pain,
    658  * make sure to always call inodirty() immediately after modifying
    659  * an inode, if there's any chance of ginode() being called after
    660  * that.  Also, always call ginode() right before you need to access
    661  * an inode, so that there won't be any surprises from functions
    662  * called between the previous ginode() invocation and the dinode
    663  * use.
    664  *
    665  * Despite all that, we aren't doing the amount of i/o that's implied,
    666  * as we use the buffer cache that getdatablk() and friends maintain.
    667  */
    668 static fsck_ino_t startinum = -1;
    669 
    670 struct dinode *
    671 ginode(fsck_ino_t inum)
    672 {
    673 	daddr32_t iblk;
    674 	struct dinode *dp;
    675 
    676 	if (inum < UFSROOTINO || inum > maxino) {
    677 		errexit("bad inode number %d to ginode\n", inum);
    678 	}
    679 	if (startinum == -1 ||
    680 	    pbp == NULL ||
    681 	    inum < startinum ||
    682 	    inum >= (fsck_ino_t)(startinum + (fsck_ino_t)INOPB(&sblock))) {
    683 		iblk = itod(&sblock, inum);
    684 		if (pbp != NULL) {
    685 			brelse(pbp);
    686 		}
    687 		/*
    688 		 * We don't check for errors here, because we can't
    689 		 * tell our caller about it, and the zeros that will
    690 		 * be in the buffer are just as good as anything we
    691 		 * could fake.
    692 		 */
    693 		pbp = getdatablk(iblk, (size_t)sblock.fs_bsize);
    694 		startinum =
    695 		    (fsck_ino_t)((inum / INOPB(&sblock)) * INOPB(&sblock));
    696 	}
    697 	dp = &pbp->b_un.b_dinode[inum % INOPB(&sblock)];
    698 	if (dp->di_suid != UID_LONG)
    699 		dp->di_uid = dp->di_suid;
    700 	if (dp->di_sgid != GID_LONG)
    701 		dp->di_gid = dp->di_sgid;
    702 	return (dp);
    703 }
    704 
    705 /*
    706  * Special purpose version of ginode used to optimize first pass
    707  * over all the inodes in numerical order.  It bypasses the buffer
    708  * system used by ginode(), etc in favour of reading the bulk of a
    709  * cg's inodes at one time.
    710  */
    711 static fsck_ino_t nextino, lastinum;
    712 static int64_t readcnt, readpercg, fullcnt, inobufsize;
    713 static int64_t partialcnt, partialsize;
    714 static size_t lastsize;
    715 static struct dinode *inodebuf;
    716 static diskaddr_t currentdblk;
    717 static struct dinode *currentinode;
    718 
    719 struct dinode *
    720 getnextinode(fsck_ino_t inum)
    721 {
    722 	size_t size;
    723 	diskaddr_t dblk;
    724 	static struct dinode *dp;
    725 
    726 	if (inum != nextino++ || inum > maxino)
    727 		errexit("bad inode number %d to nextinode\n", inum);
    728 
    729 	/*
    730 	 * Will always go into the if() the first time we're called,
    731 	 * so dp will always be valid.
    732 	 */
    733 	if (inum >= lastinum) {
    734 		readcnt++;
    735 		dblk = fsbtodb(&sblock, itod(&sblock, lastinum));
    736 		currentdblk = dblk;
    737 		if (readcnt % readpercg == 0) {
    738 			if (partialsize > SIZE_MAX)
    739 				errexit(
    740 				    "Internal error: partialsize overflow");
    741 			size = (size_t)partialsize;
    742 			lastinum += partialcnt;
    743 		} else {
    744 			if (inobufsize > SIZE_MAX)
    745 				errexit("Internal error: inobufsize overflow");
    746 			size = (size_t)inobufsize;
    747 			lastinum += fullcnt;
    748 		}
    749 		/*
    750 		 * If fsck_bread() returns an error, it will already have
    751 		 * zeroed out the buffer, so we do not need to do so here.
    752 		 */
    753 		(void) fsck_bread(fsreadfd, (caddr_t)inodebuf, dblk, size);
    754 		lastsize = size;
    755 		dp = inodebuf;
    756 	}
    757 	currentinode = dp;
    758 	return (dp++);
    759 }
    760 
    761 /*
    762  * Reread the current getnext() buffer.  This allows for changing inodes
    763  * other than the current one via ginode()/inodirty()/inoflush().
    764  *
    765  * Just reuses all the interesting variables that getnextinode() set up
    766  * last time it was called.  This shouldn't get called often, so we don't
    767  * try to figure out if the caller's actually touched an inode in the
    768  * range we have cached.  There could have been an arbitrary number of
    769  * them, after all.
    770  */
    771 struct dinode *
    772 getnextrefresh(void)
    773 {
    774 	if (inodebuf == NULL) {
    775 		return (NULL);
    776 	}
    777 
    778 	inoflush();
    779 	(void) fsck_bread(fsreadfd, (caddr_t)inodebuf, currentdblk, lastsize);
    780 	return (currentinode);
    781 }
    782 
    783 void
    784 resetinodebuf(void)
    785 {
    786 	startinum = 0;
    787 	nextino = 0;
    788 	lastinum = 0;
    789 	readcnt = 0;
    790 	inobufsize = blkroundup(&sblock, INOBUFSIZE);
    791 	fullcnt = inobufsize / sizeof (struct dinode);
    792 	readpercg = sblock.fs_ipg / fullcnt;
    793 	partialcnt = sblock.fs_ipg % fullcnt;
    794 	partialsize = partialcnt * sizeof (struct dinode);
    795 	if (partialcnt != 0) {
    796 		readpercg++;
    797 	} else {
    798 		partialcnt = fullcnt;
    799 		partialsize = inobufsize;
    800 	}
    801 	if (inodebuf == NULL &&
    802 	    (inodebuf = (struct dinode *)malloc((unsigned)inobufsize)) == NULL)
    803 		errexit("Cannot allocate space for inode buffer\n");
    804 	while (nextino < UFSROOTINO)
    805 		(void) getnextinode(nextino);
    806 }
    807 
    808 void
    809 freeinodebuf(void)
    810 {
    811 	if (inodebuf != NULL) {
    812 		free((void *)inodebuf);
    813 	}
    814 	inodebuf = NULL;
    815 }
    816 
    817 /*
    818  * Routines to maintain information about directory inodes.
    819  * This is built during the first pass and used during the
    820  * second and third passes.
    821  *
    822  * Enter inodes into the cache.
    823  */
    824 void
    825 cacheino(struct dinode *dp, fsck_ino_t inum)
    826 {
    827 	struct inoinfo *inp;
    828 	struct inoinfo **inpp;
    829 	uint_t blks;
    830 
    831 	blks = NDADDR + NIADDR;
    832 	inp = (struct inoinfo *)
    833 	    malloc(sizeof (*inp) + (blks - 1) * sizeof (daddr32_t));
    834 	if (inp == NULL)
    835 		errexit("Cannot increase directory list\n");
    836 	init_inoinfo(inp, dp, inum); /* doesn't touch i_nextlist or i_number */
    837 	inpp = &inphead[inum % numdirs];
    838 	inp->i_nextlist = *inpp;
    839 	*inpp = inp;
    840 	inp->i_number = inum;
    841 	if (inplast == listmax) {
    842 		listmax += 100;
    843 		inpsort = (struct inoinfo **)realloc((void *)inpsort,
    844 		    (unsigned)listmax * sizeof (struct inoinfo *));
    845 		if (inpsort == NULL)
    846 			errexit("cannot increase directory list");
    847 	}
    848 	inpsort[inplast++] = inp;
    849 }
    850 
    851 /*
    852  * Look up an inode cache structure.
    853  */
    854 struct inoinfo *
    855 getinoinfo(fsck_ino_t inum)
    856 {
    857 	struct inoinfo *inp;
    858 
    859 	inp = search_cache(inphead[inum % numdirs], inum);
    860 	return (inp);
    861 }
    862 
    863 /*
    864  * Determine whether inode is in cache.
    865  */
    866 int
    867 inocached(fsck_ino_t inum)
    868 {
    869 	return (search_cache(inphead[inum % numdirs], inum) != NULL);
    870 }
    871 
    872 /*
    873  * Clean up all the inode cache structure.
    874  */
    875 void
    876 inocleanup(void)
    877 {
    878 	struct inoinfo **inpp;
    879 
    880 	if (inphead == NULL)
    881 		return;
    882 	for (inpp = &inpsort[inplast - 1]; inpp >= inpsort; inpp--) {
    883 		free((void *)(*inpp));
    884 	}
    885 	free((void *)inphead);
    886 	free((void *)inpsort);
    887 	inphead = inpsort = NULL;
    888 }
    889 
    890 /*
    891  * Routines to maintain information about acl inodes.
    892  * This is built during the first pass and used during the
    893  * second and third passes.
    894  *
    895  * Enter acl inodes into the cache.
    896  */
    897 void
    898 cacheacl(struct dinode *dp, fsck_ino_t inum)
    899 {
    900 	struct inoinfo *aclp;
    901 	struct inoinfo **aclpp;
    902 	uint_t blks;
    903 
    904 	blks = NDADDR + NIADDR;
    905 	aclp = (struct inoinfo *)
    906 	    malloc(sizeof (*aclp) + (blks - 1) * sizeof (daddr32_t));
    907 	if (aclp == NULL)
    908 		return;
    909 	aclpp = &aclphead[inum % numacls];
    910 	aclp->i_nextlist = *aclpp;
    911 	*aclpp = aclp;
    912 	aclp->i_number = inum;
    913 	aclp->i_isize = (offset_t)dp->di_size;
    914 	aclp->i_blkssize = (size_t)(blks * sizeof (daddr32_t));
    915 	(void) memmove(&aclp->i_blks[0], &dp->di_db[0], aclp->i_blkssize);
    916 	if (aclplast == aclmax) {
    917 		aclmax += 100;
    918 		aclpsort = (struct inoinfo **)realloc((char *)aclpsort,
    919 		    (unsigned)aclmax * sizeof (struct inoinfo *));
    920 		if (aclpsort == NULL)
    921 			errexit("cannot increase acl list");
    922 	}
    923 	aclpsort[aclplast++] = aclp;
    924 }
    925 
    926 
    927 /*
    928  * Generic cache search function.
    929  * ROOT is the first entry in a hash chain (the caller is expected
    930  * to have done the initial bucket lookup).  KEY is what's being
    931  * searched for.
    932  *
    933  * Returns a pointer to the entry if it is found, NULL otherwise.
    934  */
    935 static struct inoinfo *
    936 search_cache(struct inoinfo *element, fsck_ino_t key)
    937 {
    938 	while (element != NULL) {
    939 		if (element->i_number == key)
    940 			break;
    941 		element = element->i_nextlist;
    942 	}
    943 
    944 	return (element);
    945 }
    946 
    947 void
    948 inodirty(void)
    949 {
    950 	dirty(pbp);
    951 }
    952 
    953 static void
    954 inoflush(void)
    955 {
    956 	if (pbp != NULL)
    957 		flush(fswritefd, pbp);
    958 }
    959 
    960 /*
    961  * Interactive wrapper for freeino(), for those times when we're
    962  * not sure if we should throw something away.
    963  */
    964 void
    965 clri(struct inodesc *idesc, char *type, int verbose, int corrupting)
    966 {
    967 	int need_parent;
    968 	struct dinode *dp;
    969 
    970 	if (statemap[idesc->id_number] == USTATE)
    971 		return;
    972 
    973 	dp = ginode(idesc->id_number);
    974 	if (verbose == CLRI_VERBOSE) {
    975 		pwarn("%s %s", type, file_id(idesc->id_number, dp->di_mode));
    976 		pinode(idesc->id_number);
    977 	}
    978 	if (preen || (reply("CLEAR") == 1)) {
    979 		need_parent = (corrupting == CLRI_NOP_OK) ?
    980 		    TI_NOPARENT : TI_PARENT;
    981 		freeino(idesc->id_number, need_parent);
    982 		if (preen)
    983 			(void) printf(" (CLEARED)\n");
    984 		remove_orphan_dir(idesc->id_number);
    985 	} else if (corrupting == CLRI_NOP_CORRUPT) {
    986 		iscorrupt = 1;
    987 	}
    988 	(void) printf("\n");
    989 }
    990 
    991 /*
    992  * Find the directory entry for the inode noted in id_parent (which is
    993  * not necessarily the parent of anything, we're just using a convenient
    994  * field.
    995  */
    996 int
    997 findname(struct inodesc *idesc)
    998 {
    999 	struct direct *dirp = idesc->id_dirp;
   1000 
   1001 	if (dirp->d_ino != idesc->id_parent)
   1002 		return (KEEPON);
   1003 	(void) memmove(idesc->id_name, dirp->d_name,
   1004 	    MIN(dirp->d_namlen, MAXNAMLEN) + 1);
   1005 	return (STOP|FOUND);
   1006 }
   1007 
   1008 /*
   1009  * Find the inode number associated with the given name.
   1010  */
   1011 int
   1012 findino(struct inodesc *idesc)
   1013 {
   1014 	struct direct *dirp = idesc->id_dirp;
   1015 
   1016 	if (dirp->d_ino == 0)
   1017 		return (KEEPON);
   1018 	if (strcmp(dirp->d_name, idesc->id_name) == 0 &&
   1019 	    dirp->d_ino >= UFSROOTINO && dirp->d_ino <= maxino) {
   1020 		idesc->id_parent = dirp->d_ino;
   1021 		return (STOP|FOUND);
   1022 	}
   1023 	return (KEEPON);
   1024 }
   1025 
   1026 int
   1027 cleardirentry(fsck_ino_t parentdir, fsck_ino_t target)
   1028 {
   1029 	struct inodesc idesc;
   1030 	struct dinode *dp;
   1031 
   1032 	dp = ginode(parentdir);
   1033 	init_inodesc(&idesc);
   1034 	idesc.id_func = clearanentry;
   1035 	idesc.id_parent = target;
   1036 	idesc.id_type = DATA;
   1037 	idesc.id_fix = NOFIX;
   1038 	return (ckinode(dp, &idesc, CKI_TRAVERSE));
   1039 }
   1040 
   1041 static int
   1042 clearanentry(struct inodesc *idesc)
   1043 {
   1044 	struct direct *dirp = idesc->id_dirp;
   1045 
   1046 	if (dirp->d_ino != idesc->id_parent || idesc->id_entryno < 2) {
   1047 		idesc->id_entryno++;
   1048 		return (KEEPON);
   1049 	}
   1050 	dirp->d_ino = 0;
   1051 	return (STOP|FOUND|ALTERED);
   1052 }
   1053 
   1054 void
   1055 pinode(fsck_ino_t ino)
   1056 {
   1057 	struct dinode *dp;
   1058 
   1059 	(void) printf(" I=%lu ", (ulong_t)ino);
   1060 	if (ino < UFSROOTINO || ino > maxino)
   1061 		return;
   1062 	dp = ginode(ino);
   1063 	pdinode(dp);
   1064 }
   1065 
   1066 static void
   1067 pdinode(struct dinode *dp)
   1068 {
   1069 	char *p;
   1070 	struct passwd *pw;
   1071 	time_t t;
   1072 
   1073 	(void) printf(" OWNER=");
   1074 	if ((pw = getpwuid((int)dp->di_uid)) != 0)
   1075 		(void) printf("%s ", pw->pw_name);
   1076 	else
   1077 		(void) printf("%lu ", (ulong_t)dp->di_uid);
   1078 	(void) printf("MODE=%o\n", dp->di_mode);
   1079 	if (preen)
   1080 		(void) printf("%s: ", devname);
   1081 	(void) printf("SIZE=%lld ", (longlong_t)dp->di_size);
   1082 
   1083 	/* ctime() ignores LOCALE, so this is safe */
   1084 	t = (time_t)dp->di_mtime;
   1085 	p = ctime(&t);
   1086 	(void) printf("MTIME=%12.12s %4.4s ", p + 4, p + 20);
   1087 }
   1088 
   1089 void
   1090 blkerror(fsck_ino_t ino, char *type, daddr32_t blk, daddr32_t lbn)
   1091 {
   1092 	pfatal("FRAGMENT %d %s I=%u LFN %d", blk, type, ino, lbn);
   1093 	(void) printf("\n");
   1094 
   1095 	switch (statemap[ino] & ~INDELAYD) {
   1096 
   1097 	case FSTATE:
   1098 	case FZLINK:
   1099 		statemap[ino] = FCLEAR;
   1100 		return;
   1101 
   1102 	case DFOUND:
   1103 	case DSTATE:
   1104 	case DZLINK:
   1105 		statemap[ino] = DCLEAR;
   1106 		add_orphan_dir(ino);
   1107 		return;
   1108 
   1109 	case SSTATE:
   1110 		statemap[ino] = SCLEAR;
   1111 		return;
   1112 
   1113 	case FCLEAR:
   1114 	case DCLEAR:
   1115 	case SCLEAR:
   1116 		return;
   1117 
   1118 	default:
   1119 		errexit("BAD STATE 0x%x TO BLKERR\n", statemap[ino]);
   1120 		/* NOTREACHED */
   1121 	}
   1122 }
   1123 
   1124 /*
   1125  * allocate an unused inode
   1126  */
   1127 fsck_ino_t
   1128 allocino(fsck_ino_t request, int type)
   1129 {
   1130 	fsck_ino_t ino;
   1131 	struct dinode *dp;
   1132 	struct cg *cgp = &cgrp;
   1133 	int cg;
   1134 	time_t t;
   1135 	caddr_t err;
   1136 
   1137 	if (debug && (request != 0) && (request != UFSROOTINO))
   1138 		errexit("assertion failed: allocino() asked for "
   1139 		    "inode %d instead of 0 or %d",
   1140 		    (int)request, (int)UFSROOTINO);
   1141 
   1142 	/*
   1143 	 * We know that we're only going to get requests for UFSROOTINO
   1144 	 * or 0.  If UFSROOTINO is wanted, then it better be available
   1145 	 * because our caller is trying to recreate the root directory.
   1146 	 * If we're asked for 0, then which one we return doesn't matter.
   1147 	 * We know that inodes 0 and 1 are never valid to return, so we
   1148 	 * the start at the lowest-legal inode number.
   1149 	 *
   1150 	 * If we got a request for UFSROOTINO, then request != 0, and
   1151 	 * this pair of conditionals is the only place that treats
   1152 	 * UFSROOTINO specially.
   1153 	 */
   1154 	if (request == 0)
   1155 		request = UFSROOTINO;
   1156 	else if (statemap[request] != USTATE)
   1157 		return (0);
   1158 
   1159 	/*
   1160 	 * Doesn't do wrapping, since we know we started at
   1161 	 * the smallest inode.
   1162 	 */
   1163 	for (ino = request; ino < maxino; ino++)
   1164 		if (statemap[ino] == USTATE)
   1165 			break;
   1166 	if (ino == maxino)
   1167 		return (0);
   1168 
   1169 	/*
   1170 	 * In pass5, we'll calculate the bitmaps and counts all again from
   1171 	 * scratch and do a comparison, but for that to work the cg has
   1172 	 * to know what in-memory changes we've made to it.  If we have
   1173 	 * trouble reading the cg, cg_sanity() should kick it out so
   1174 	 * we can skip explicit i/o error checking here.
   1175 	 */
   1176 	cg = itog(&sblock, ino);
   1177 	(void) getblk(&cgblk, cgtod(&sblock, cg), (size_t)sblock.fs_cgsize);
   1178 	err = cg_sanity(cgp, cg);
   1179 	if (err != NULL) {
   1180 		pfatal("CG %d: %s\n", cg, err);
   1181 		free((void *)err);
   1182 		if (reply("REPAIR") == 0)
   1183 			errexit("Program terminated.");
   1184 		fix_cg(cgp, cg);
   1185 	}
   1186 	setbit(cg_inosused(cgp), ino % sblock.fs_ipg);
   1187 	cgp->cg_cs.cs_nifree--;
   1188 	cgdirty();
   1189 
   1190 	if (lastino < ino)
   1191 		lastino = ino;
   1192 
   1193 	/*
   1194 	 * Don't currently support IFATTRDIR or any of the other
   1195 	 * types, as they aren't needed.
   1196 	 */
   1197 	switch (type & IFMT) {
   1198 	case IFDIR:
   1199 		statemap[ino] = DSTATE;
   1200 		cgp->cg_cs.cs_ndir++;
   1201 		break;
   1202 	case IFREG:
   1203 	case IFLNK:
   1204 		statemap[ino] = FSTATE;
   1205 		break;
   1206 	default:
   1207 		/*
   1208 		 * Pretend nothing ever happened.  This clears the
   1209 		 * dirty flag, among other things.
   1210 		 */
   1211 		initbarea(&cgblk);
   1212 		if (debug)
   1213 			(void) printf("allocino: unknown type 0%o\n",
   1214 			    type & IFMT);
   1215 		return (0);
   1216 	}
   1217 
   1218 	/*
   1219 	 * We're allocating what should be a completely-unused inode,
   1220 	 * so make sure we don't inherit anything from any previous
   1221 	 * incarnations.
   1222 	 */
   1223 	dp = ginode(ino);
   1224 	(void) memset((void *)dp, 0, sizeof (struct dinode));
   1225 	dp->di_db[0] = allocblk(1);
   1226 	if (dp->di_db[0] == 0) {
   1227 		statemap[ino] = USTATE;
   1228 		return (0);
   1229 	}
   1230 	dp->di_mode = (mode_t)type;
   1231 	(void) time(&t);
   1232 	dp->di_atime = (time32_t)t;
   1233 	dp->di_ctime = dp->di_atime;
   1234 	dp->di_mtime = dp->di_ctime;
   1235 	dp->di_size = (u_offset_t)sblock.fs_fsize;
   1236 	dp->di_blocks = btodb(sblock.fs_fsize);
   1237 	n_files++;
   1238 	inodirty();
   1239 	return (ino);
   1240 }
   1241 
   1242 /*
   1243  * Release some or all of the blocks of an inode.
   1244  * Only truncates down.  Assumes new_length is appropriately aligned
   1245  * to a block boundary (or a directory block boundary, if it's a
   1246  * directory).
   1247  *
   1248  * If this is a directory, discard all of its contents first, so
   1249  * we don't create a bunch of orphans that would need another fsck
   1250  * run to clean up.
   1251  *
   1252  * Even if truncating to zero length, the inode remains allocated.
   1253  */
   1254 void
   1255 truncino(fsck_ino_t ino, offset_t new_length, int update)
   1256 {
   1257 	struct inodesc idesc;
   1258 	struct inoinfo *iip;
   1259 	struct dinode *dp;
   1260 	fsck_ino_t parent;
   1261 	mode_t mode;
   1262 	caddr_t message;
   1263 	int isdir;
   1264 	int ilevel, dblk;
   1265 
   1266 	dp = ginode(ino);
   1267 	mode = (dp->di_mode & IFMT);
   1268 	isdir = (mode == IFDIR) || (mode == IFATTRDIR);
   1269 
   1270 	if (isdir) {
   1271 		/*
   1272 		 * Go with the parent we found by chasing references,
   1273 		 * if we've gotten that far.  Otherwise, use what the
   1274 		 * directory itself claims.  If there's no ``..'' entry
   1275 		 * in it, give up trying to get the link counts right.
   1276 		 */
   1277 		if (update == TI_NOPARENT) {
   1278 			parent = -1;
   1279 		} else {
   1280 			iip = getinoinfo(ino);
   1281 			if (iip != NULL) {
   1282 				parent = iip->i_parent;
   1283 			} else {
   1284 				parent = lookup_dotdot_ino(ino);
   1285 				if (parent != 0) {
   1286 					/*
   1287 					 * Make sure that the claimed
   1288 					 * parent actually has a
   1289 					 * reference to us.
   1290 					 */
   1291 					dp = ginode(parent);
   1292 					idesc.id_name = lfname;
   1293 					idesc.id_type = DATA;
   1294 					idesc.id_func = findino;
   1295 					idesc.id_number = ino;
   1296 					idesc.id_fix = DONTKNOW;
   1297 					if ((ckinode(dp, &idesc,
   1298 					    CKI_TRAVERSE) & FOUND) == 0)
   1299 						parent = 0;
   1300 				}
   1301 			}
   1302 		}
   1303 
   1304 		mark_delayed_inodes(ino, numfrags(&sblock, new_length));
   1305 		if (parent > 0) {
   1306 			dp = ginode(parent);
   1307 			LINK_RANGE(message, dp->di_nlink, -1);
   1308 			if (message != NULL) {
   1309 				LINK_CLEAR(message, parent, dp->di_mode,
   1310 				    &idesc);
   1311 				if (statemap[parent] == USTATE)
   1312 					goto no_parent_update;
   1313 			}
   1314 			TRACK_LNCNTP(parent, lncntp[parent]--);
   1315 		} else if ((mode == IFDIR) && (parent == 0)) {
   1316 			/*
   1317 			 * Currently don't have a good way to
   1318 			 * handle this, so throw up our hands.
   1319 			 * However, we know that we can still
   1320 			 * do some good if we continue, so
   1321 			 * don't actually exit yet.
   1322 			 *
   1323 			 * We don't do it for attrdirs,
   1324 			 * because there aren't link counts
   1325 			 * between them and their parents.
   1326 			 */
   1327 			pwarn("Could not determine former parent of "
   1328 			    "inode %d, link counts are possibly\n"
   1329 			    "incorrect.  Please rerun fsck(1M) to "
   1330 			    "correct this.\n",
   1331 			    ino);
   1332 			iscorrupt = 1;
   1333 		}
   1334 		/*
   1335 		 * ...else if it's a directory with parent == -1, then
   1336 		 * we've not gotten far enough to know connectivity,
   1337 		 * and it'll get handled automatically later.
   1338 		 */
   1339 	}
   1340 
   1341 no_parent_update:
   1342 	init_inodesc(&idesc);
   1343 	idesc.id_type = ADDR;
   1344 	idesc.id_func = pass4check;
   1345 	idesc.id_number = ino;
   1346 	idesc.id_fix = DONTKNOW;
   1347 	idesc.id_truncto = howmany(new_length, sblock.fs_bsize);
   1348 	dp = ginode(ino);
   1349 	if (ckinode(dp, &idesc, CKI_TRUNCATE) & ALTERED)
   1350 		inodirty();
   1351 
   1352 	/*
   1353 	 * This has to be done after ckinode(), so that all of
   1354 	 * the fragments get visited.  Note that we assume we're
   1355 	 * always truncating to a block boundary, rather than a
   1356 	 * fragment boundary.
   1357 	 */
   1358 	dp = ginode(ino);
   1359 	dp->di_size = new_length;
   1360 
   1361 	/*
   1362 	 * Clear now-obsolete pointers.
   1363 	 */
   1364 	for (dblk = idesc.id_truncto + 1; dblk < NDADDR; dblk++) {
   1365 		dp->di_db[dblk] = 0;
   1366 	}
   1367 
   1368 	ilevel = get_indir_offsets(-1, idesc.id_truncto, NULL, NULL);
   1369 	for (ilevel++; ilevel < NIADDR; ilevel++) {
   1370 		dp->di_ib[ilevel] = 0;
   1371 	}
   1372 
   1373 	inodirty();
   1374 }
   1375 
   1376 /*
   1377  * Release an inode's resources, then release the inode itself.
   1378  */
   1379 void
   1380 freeino(fsck_ino_t ino, int update_parent)
   1381 {
   1382 	int cg;
   1383 	struct dinode *dp;
   1384 	struct cg *cgp;
   1385 
   1386 	n_files--;
   1387 	dp = ginode(ino);
   1388 	/*
   1389 	 * We need to make sure that the file is really a large file.
   1390 	 * Everything bigger than UFS_MAXOFFSET_T is treated as a file with
   1391 	 * negative size, which shall be cleared. (see verify_inode() in
   1392 	 * pass1.c)
   1393 	 */
   1394 	if (dp->di_size > (u_offset_t)MAXOFF_T &&
   1395 	    dp->di_size <= (u_offset_t)UFS_MAXOFFSET_T &&
   1396 	    ftypeok(dp) &&
   1397 	    (dp->di_mode & IFMT) != IFBLK &&
   1398 	    (dp->di_mode & IFMT) != IFCHR) {
   1399 		largefile_count--;
   1400 	}
   1401 	truncino(ino, 0, update_parent);
   1402 
   1403 	dp = ginode(ino);
   1404 	if ((dp->di_mode & IFMT) == IFATTRDIR) {
   1405 		clearshadow(ino, &attrclientinfo);
   1406 		dp = ginode(ino);
   1407 	}
   1408 
   1409 	clearinode(dp);
   1410 	inodirty();
   1411 	statemap[ino] = USTATE;
   1412 
   1413 	/*
   1414 	 * Keep the disk in sync with us so that pass5 doesn't get
   1415 	 * upset about spurious inconsistencies.
   1416 	 */
   1417 	cg = itog(&sblock, ino);
   1418 	(void) getblk(&cgblk, (diskaddr_t)cgtod(&sblock, cg),
   1419 	    (size_t)sblock.fs_cgsize);
   1420 	cgp = cgblk.b_un.b_cg;
   1421 	clrbit(cg_inosused(cgp), ino % sblock.fs_ipg);
   1422 	cgp->cg_cs.cs_nifree += 1;
   1423 	cgdirty();
   1424 	sblock.fs_cstotal.cs_nifree += 1;
   1425 	sbdirty();
   1426 }
   1427 
   1428 void
   1429 init_inoinfo(struct inoinfo *inp, struct dinode *dp, fsck_ino_t inum)
   1430 {
   1431 	inp->i_parent = ((inum == UFSROOTINO) ? UFSROOTINO : (fsck_ino_t)0);
   1432 	inp->i_dotdot = (fsck_ino_t)0;
   1433 	inp->i_isize = (offset_t)dp->di_size;
   1434 	inp->i_blkssize = (NDADDR + NIADDR) * sizeof (daddr32_t);
   1435 	inp->i_extattr = dp->di_oeftflag;
   1436 	(void) memmove((void *)&inp->i_blks[0], (void *)&dp->di_db[0],
   1437 	    inp->i_blkssize);
   1438 }
   1439 
   1440 /*
   1441  * Return the inode number in the ".." entry of the provided
   1442  * directory inode.
   1443  */
   1444 static int
   1445 lookup_dotdot_ino(fsck_ino_t ino)
   1446 {
   1447 	struct inodesc idesc;
   1448 
   1449 	init_inodesc(&idesc);
   1450 	idesc.id_type = DATA;
   1451 	idesc.id_func = findino;
   1452 	idesc.id_name = "..";
   1453 	idesc.id_number = ino;
   1454 	idesc.id_fix = NOFIX;
   1455 
   1456 	if ((ckinode(ginode(ino), &idesc, CKI_TRAVERSE) & FOUND) != 0) {
   1457 		return (idesc.id_parent);
   1458 	}
   1459 
   1460 	return (0);
   1461 }
   1462 
   1463 /*
   1464  * Convenience wrapper around ckinode(findino()).
   1465  */
   1466 int
   1467 lookup_named_ino(fsck_ino_t dir, caddr_t name)
   1468 {
   1469 	struct inodesc idesc;
   1470 
   1471 	init_inodesc(&idesc);
   1472 	idesc.id_type = DATA;
   1473 	idesc.id_func = findino;
   1474 	idesc.id_name = name;
   1475 	idesc.id_number = dir;
   1476 	idesc.id_fix = NOFIX;
   1477 
   1478 	if ((ckinode(ginode(dir), &idesc, CKI_TRAVERSE) & FOUND) != 0) {
   1479 		return (idesc.id_parent);
   1480 	}
   1481 
   1482 	return (0);
   1483 }
   1484 
   1485 /*
   1486  * Marks inodes that are being orphaned and might need to be reconnected
   1487  * by pass4().  The inode we're traversing is the directory whose
   1488  * contents will be reconnected later.  id_parent is the lfn at which
   1489  * to start looking at said contents.
   1490  */
   1491 static int
   1492 mark_a_delayed_inode(struct inodesc *idesc)
   1493 {
   1494 	struct direct *dirp = idesc->id_dirp;
   1495 
   1496 	if (idesc->id_lbn < idesc->id_parent) {
   1497 		return (KEEPON);
   1498 	}
   1499 
   1500 	if (dirp->d_ino != 0 &&
   1501 	    strcmp(dirp->d_name, ".") != 0 &&
   1502 	    strcmp(dirp->d_name, "..") != 0) {
   1503 		statemap[dirp->d_ino] &= ~INFOUND;
   1504 		statemap[dirp->d_ino] |= INDELAYD;
   1505 	}
   1506 
   1507 	return (KEEPON);
   1508 }
   1509 
   1510 static void
   1511 mark_delayed_inodes(fsck_ino_t ino, daddr32_t first_lfn)
   1512 {
   1513 	struct dinode *dp;
   1514 	struct inodesc idelayed;
   1515 
   1516 	init_inodesc(&idelayed);
   1517 	idelayed.id_number = ino;
   1518 	idelayed.id_type = DATA;
   1519 	idelayed.id_fix = NOFIX;
   1520 	idelayed.id_func = mark_a_delayed_inode;
   1521 	idelayed.id_parent = first_lfn;
   1522 	idelayed.id_entryno = 2;
   1523 
   1524 	dp = ginode(ino);
   1525 	(void) ckinode(dp, &idelayed, CKI_TRAVERSE);
   1526 }
   1527 
   1528 /*
   1529  * Clear the i_oeftflag/extended attribute pointer from INO.
   1530  */
   1531 void
   1532 clearattrref(fsck_ino_t ino)
   1533 {
   1534 	struct dinode *dp;
   1535 
   1536 	dp = ginode(ino);
   1537 	if (debug) {
   1538 		if (dp->di_oeftflag == 0)
   1539 			(void) printf("clearattref: no attr to clear on %d\n",
   1540 			    ino);
   1541 	}
   1542 
   1543 	dp->di_oeftflag = 0;
   1544 	inodirty();
   1545 }
   1546