Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #include <sys/types.h>
     40 #include <sys/t_lock.h>
     41 #include <sys/sysmacros.h>
     42 #include <sys/conf.h>
     43 #include <sys/cpuvar.h>
     44 #include <sys/errno.h>
     45 #include <sys/debug.h>
     46 #include <sys/buf.h>
     47 #include <sys/var.h>
     48 #include <sys/vnode.h>
     49 #include <sys/bitmap.h>
     50 #include <sys/cmn_err.h>
     51 #include <sys/kmem.h>
     52 #include <sys/vmem.h>
     53 #include <sys/atomic.h>
     54 #include <vm/seg_kmem.h>
     55 #include <vm/page.h>
     56 #include <vm/pvn.h>
     57 #include <sys/vtrace.h>
     58 #include <sys/tnf_probe.h>
     59 #include <sys/fs/ufs_inode.h>
     60 #include <sys/fs/ufs_bio.h>
     61 #include <sys/fs/ufs_log.h>
     62 #include <sys/systm.h>
     63 #include <sys/vfs.h>
     64 #include <sys/sdt.h>
     65 
/*
 * Locks
 *
 * blist_lock is held by bflush() while it threads buffers onto a b_list
 * chain and while it updates the flush/inval state variables below.
 * bfree_lock is held by brelse() and getblk_common() around updates to
 * the global bfreelist header.
 */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

/*
 * hbuf[] and dwbuf[] are indexed by the same bio_bhash() value; the
 * delayed-write bucket dwbuf[i] is manipulated under hbuf[i].b_lock
 * (see brelse() and bflush()).
 */
struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int 	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

/*
 * Map a (device, block number) pair to a bucket index: the hash2ints()
 * result masked with v.v_hmask.  Note that bn is narrowed to int before
 * it is hashed.
 */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
/*
 * Terminator for the singly linked b_list chain that bflush() builds.
 * It is deliberately not NULL: bflush() uses b_list == NULL to mean
 * "this buffer is not on any chain".
 */
#define	EMPTY_LIST	((struct buf *)-1)

/*
 * bio_mem_cv is broadcast from brelse() when bfreelist has B_WANTED set.
 * bio_flushinval_cv is waited on (under blist_lock) in bflush() while
 * another flush or invalidate is in progress.
 */
static kcondvar_t	bio_mem_cv; 	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
     87 
/*
 * Statistics on the buffer cache
 *
 * Every entry is a named kstat counter of type KSTAT_DATA_UINT32.
 * getblk_common() bumps the bio_lookup, bio_hit, bio_bufbusy and
 * bio_bufdup members through value.ui32.  struct biostats is declared
 * elsewhere; presumably its members appear in the same order as the
 * names below -- confirm against the structure definition before
 * adding or reordering entries.
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 *
 * biostats viewed as a flat array of kstat_named_t, together with the
 * number of elements in that array.
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));
    106 
/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 *
 * Every entry is declared as a KSTAT_DATA_UINT32 counter.  Within this
 * part of the file ub_breads is bumped by bread_common() and ub_bwrites
 * by bwrite_common(), in both cases only on the UFS path that uses
 * neither logging nor snapshots.  The remaining counters are presumably
 * maintained by the UFS code -- not visible here.
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};
    121 
/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 *        to ufs routines don't get plugged into bio.c calls so
 *        we initialize it when setting up the "lufsops" table
 *        in "lufs.c:_init()"
 *
 * Both hooks start out NULL and are checked for NULL before every call
 * in bread_common() and bwrite_common().  The first argument passed is
 * ufsvfsp->vfs_log for the logging hook and &ufsvfsp->vfs_snapshot for
 * the snapshot hook; the second is the buffer to do I/O on.
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);
    133 
    134 
/*
 * Private routines
 *
 * Forward declarations of the file-local (static) helpers, so that they
 * can be called before their definitions.
 */
static struct buf	*bio_getfreeblk(long);
static void 		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void 		bio_pageio_done(struct buf *);
static int 		bio_incore(dev_t, daddr_t);
    143 
/*
 * Buffer cache constants
 *
 * NOTE(review): BIO_BUF_PERCENT and BIO_MAX_PERCENT evaluate to 50 and 5
 * respectively, i.e. they are the reciprocals of the percentages named
 * in their comments.  Presumably they are used as divisors of the
 * physical memory size -- confirm against the sizing code, which is not
 * in this part of the file.
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
/* Size of BIO_MIN_HDR maximum-sized buffers; the /1024 suggests KB units */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() (bit values, so presumably combinable) */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

/* Tunables defined outside this file */
extern	int bufhwm;		/* User tunable - high water mark for mem  */
extern	int bufhwm_pct;		/* ditto - given in % of physmem  */
    161 
    162 /*
    163  * The following routines allocate and free
    164  * buffers with various side effects.  In general the
    165  * arguments to an allocate routine are a device and
    166  * a block number, and the value is a pointer to
 * the buffer header; the buffer returned is locked with a
    168  * binary semaphore so that no one else can touch it. If the block was
    169  * already in core, no I/O need be done; if it is
    170  * already locked, the process waits until it becomes free.
    171  * The following routines allocate a buffer:
    172  *	getblk
    173  *	bread/BREAD
    174  *	breada
    175  * Eventually the buffer must be released, possibly with the
    176  * side effect of writing it out, by using one of
    177  *	bwrite/BWRITE/brwrite
    178  *	bdwrite/bdrwrite
    179  *	bawrite
    180  *	brelse
    181  *
    182  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
    183  * Instead, a binary semaphore, b_sem is used to gain exclusive access to
    184  * a buffer and a binary semaphore, b_io is used for I/O synchronization.
    185  * B_DONE is still used to denote a buffer with I/O complete on it.
    186  *
 * The bfreelist.b_bcount field is computed every time fsflush runs. It
 * should not be used where a very accurate count of the free buffers is
 * needed.
    190  */
    191 
    192 /*
    193  * Read in (if necessary) the block and return a buffer pointer.
    194  *
    195  * This interface is provided for binary compatibility.  Using
    196  * BREAD() directly avoids the extra function call overhead invoked
    197  * by calling this routine.
    198  */
    199 struct buf *
    200 bread(dev_t dev, daddr_t blkno, long bsize)
    201 {
    202 	return (BREAD(dev, blkno, bsize));
    203 }
    204 
    205 /*
    206  * Common code for reading a buffer with various options
    207  *
    208  * Read in (if necessary) the block and return a buffer pointer.
    209  */
    210 struct buf *
    211 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
    212 {
    213 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
    214 	struct buf *bp;
    215 	klwp_t *lwp = ttolwp(curthread);
    216 
    217 	CPU_STATS_ADD_K(sys, lread, 1);
    218 	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
    219 	if (bp->b_flags & B_DONE)
    220 		return (bp);
    221 	bp->b_flags |= B_READ;
    222 	ASSERT(bp->b_bcount == bsize);
    223 	if (ufsvfsp == NULL) {					/* !ufs */
    224 		(void) bdev_strategy(bp);
    225 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
    226 							/* ufs && logging */
    227 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
    228 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
    229 							/* ufs && snapshots */
    230 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
    231 	} else {
    232 		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
    233 		ub.ub_breads.value.ul++;		/* ufs && !logging */
    234 		(void) bdev_strategy(bp);
    235 	}
    236 	if (lwp != NULL)
    237 		lwp->lwp_ru.inblock++;
    238 	CPU_STATS_ADD_K(sys, bread, 1);
    239 	(void) biowait(bp);
    240 	return (bp);
    241 }
    242 
    243 /*
    244  * Read in the block, like bread, but also start I/O on the
    245  * read-ahead block (which is not allocated to the caller).
    246  */
    247 struct buf *
    248 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
    249 {
    250 	struct buf *bp, *rabp;
    251 	klwp_t *lwp = ttolwp(curthread);
    252 
    253 	bp = NULL;
    254 	if (!bio_incore(dev, blkno)) {
    255 		CPU_STATS_ADD_K(sys, lread, 1);
    256 		bp = GETBLK(dev, blkno, bsize);
    257 		if ((bp->b_flags & B_DONE) == 0) {
    258 			bp->b_flags |= B_READ;
    259 			bp->b_bcount = bsize;
    260 			(void) bdev_strategy(bp);
    261 			if (lwp != NULL)
    262 				lwp->lwp_ru.inblock++;
    263 			CPU_STATS_ADD_K(sys, bread, 1);
    264 		}
    265 	}
    266 	if (rablkno && bfreelist.b_bcount > 1 &&
    267 	    !bio_incore(dev, rablkno)) {
    268 		rabp = GETBLK(dev, rablkno, bsize);
    269 		if (rabp->b_flags & B_DONE)
    270 			brelse(rabp);
    271 		else {
    272 			rabp->b_flags |= B_READ|B_ASYNC;
    273 			rabp->b_bcount = bsize;
    274 			(void) bdev_strategy(rabp);
    275 			if (lwp != NULL)
    276 				lwp->lwp_ru.inblock++;
    277 			CPU_STATS_ADD_K(sys, bread, 1);
    278 		}
    279 	}
    280 	if (bp == NULL)
    281 		return (BREAD(dev, blkno, bsize));
    282 	(void) biowait(bp);
    283 	return (bp);
    284 }
    285 
    286 /*
    287  * Common code for writing a buffer with various options.
    288  *
    289  * force_wait  - wait for write completion regardless of B_ASYNC flag
    290  * do_relse    - release the buffer when we are done
    291  * clear_flags - flags to clear from the buffer
    292  */
    293 void
    294 bwrite_common(void *arg, struct buf *bp, int force_wait,
    295 				int do_relse, int clear_flags)
    296 {
    297 	register int do_wait;
    298 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
    299 	int flag;
    300 	klwp_t *lwp = ttolwp(curthread);
    301 	struct cpu *cpup;
    302 
    303 	ASSERT(SEMA_HELD(&bp->b_sem));
    304 	flag = bp->b_flags;
    305 	bp->b_flags &= ~clear_flags;
    306 	if (lwp != NULL)
    307 		lwp->lwp_ru.oublock++;
    308 	CPU_STATS_ENTER_K();
    309 	cpup = CPU;		/* get pointer AFTER preemption is disabled */
    310 	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
    311 	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
    312 	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
    313 	if (do_wait == 0)
    314 		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
    315 	CPU_STATS_EXIT_K();
    316 	if (ufsvfsp == NULL) {
    317 		(void) bdev_strategy(bp);
    318 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
    319 							/* ufs && logging */
    320 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
    321 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
    322 							/* ufs && snapshots */
    323 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
    324 	} else {
    325 		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
    326 		(void) bdev_strategy(bp);
    327 	}
    328 	if (do_wait) {
    329 		(void) biowait(bp);
    330 		if (do_relse) {
    331 			brelse(bp);
    332 		}
    333 	}
    334 }
    335 
/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 *
 * bp is handed unchanged to the BWRITE() macro, which is defined outside
 * this file; all of the behaviour described above comes from that macro.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}
    348 
/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 *
 * bp is handed unchanged to the BWRITE2() macro, which is defined outside
 * this file; all of the behaviour described above comes from that macro.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}
    360 
    361 /*
    362  * Release the buffer, marking it so that if it is grabbed
    363  * for another purpose it will be written out before being
    364  * given up (e.g. when writing a partial block where it is
    365  * assumed that another write for the same block will soon follow).
    366  * Also save the time that the block is first marked as delayed
    367  * so that it will be written in a reasonable time.
    368  */
    369 void
    370 bdwrite(struct buf *bp)
    371 {
    372 	ASSERT(SEMA_HELD(&bp->b_sem));
    373 	CPU_STATS_ADD_K(sys, lwrite, 1);
    374 	if ((bp->b_flags & B_DELWRI) == 0)
    375 		bp->b_start = ddi_get_lbolt();
    376 	/*
    377 	 * B_DONE allows others to use the buffer, B_DELWRI causes the
    378 	 * buffer to be written before being reused, and setting b_resid
    379 	 * to zero says the buffer is complete.
    380 	 */
    381 	bp->b_flags |= B_DELWRI | B_DONE;
    382 	bp->b_resid = 0;
    383 	brelse(bp);
    384 }
    385 
    386 /*
    387  * Release the buffer, start I/O on it, but don't wait for completion.
    388  */
    389 void
    390 bawrite(struct buf *bp)
    391 {
    392 	ASSERT(SEMA_HELD(&bp->b_sem));
    393 
    394 	/* Use bfreelist.b_bcount as a weird-ass heuristic */
    395 	if (bfreelist.b_bcount > 4)
    396 		bp->b_flags |= B_ASYNC;
    397 	BWRITE(bp);
    398 }
    399 
/*
 * Release the buffer, with no I/O implied.
 *
 * The caller must hold bp->b_sem.  Unless the buffer is B_NOCACHE (in
 * which case it is destroyed outright), it is linked onto either the
 * free list of its hash bucket or, if B_DELWRI is set, the matching
 * delayed-write list, and b_sem is released as the very last step.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/*
			 * Don't add to the freelist. Destroy it now.
			 * Both the data area and the header are freed, so
			 * b_sem is destroyed rather than released.
			 */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		/* The error has been dealt with; forget it. */
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put it on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp   = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		/* B_AGE buffers are linked in right after the list head */
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		/* everything else is linked in just before the list head */
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	/*
	 * B_WANTED on bfreelist is first tested without bfree_lock and
	 * then tested again with the lock held before it is cleared and
	 * the waiters on bio_mem_cv are woken.
	 */
	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}
    515 
    516 /*
    517  * Return a count of the number of B_BUSY buffers in the system
    518  * Can only be used as a good estimate.  If 'cleanit' is set,
    519  * try to flush all bufs.
    520  */
    521 int
    522 bio_busy(int cleanit)
    523 {
    524 	struct buf *bp, *dp;
    525 	int busy = 0;
    526 	int i;
    527 	kmutex_t *hmp;
    528 
    529 	for (i = 0; i < v.v_hbuf; i++) {
    530 		vfs_syncprogress();
    531 		dp = (struct buf *)&hbuf[i];
    532 		hmp = &hbuf[i].b_lock;
    533 
    534 		mutex_enter(hmp);
    535 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
    536 			if (bp->b_flags & B_BUSY)
    537 				busy++;
    538 		}
    539 		mutex_exit(hmp);
    540 	}
    541 
    542 	if (cleanit && busy != 0) {
    543 		bflush(NODEV);
    544 	}
    545 
    546 	return (busy);
    547 }
    548 
    549 /*
    550  * this interface is provided for binary compatibility.
    551  *
    552  * Assign a buffer for the given block.  If the appropriate
    553  * block is already associated, return it; otherwise search
    554  * for the oldest non-busy buffer and reassign it.
    555  */
    556 struct buf *
    557 getblk(dev_t dev, daddr_t blkno, long bsize)
    558 {
    559 	return (getblk_common(/* ufsvfsp */ NULL, dev,
    560 	    blkno, bsize, /* errflg */ 0));
    561 }
    562 
    563 /*
    564  * Assign a buffer for the given block.  If the appropriate
    565  * block is already associated, return it; otherwise search
    566  * for the oldest non-busy buffer and reassign it.
    567  */
    568 struct buf *
    569 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
    570 {
    571 	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
    572 	struct buf *bp;
    573 	struct buf *dp;
    574 	struct buf *nbp = NULL;
    575 	struct buf *errbp;
    576 	uint_t		index;
    577 	kmutex_t	*hmp;
    578 	struct	hbuf	*hp;
    579 
    580 	if (getmajor(dev) >= devcnt)
    581 		cmn_err(CE_PANIC, "blkdev");
    582 
    583 	biostats.bio_lookup.value.ui32++;
    584 
    585 	index = bio_bhash(dev, blkno);
    586 	hp    = &hbuf[index];
    587 	dp    = (struct buf *)hp;
    588 	hmp   = &hp->b_lock;
    589 
    590 	mutex_enter(hmp);
    591 loop:
    592 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
    593 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
    594 		    (bp->b_flags & B_STALE))
    595 			continue;
    596 		/*
    597 		 * Avoid holding the hash lock in the event that
    598 		 * the buffer is locked by someone. Since the hash chain
    599 		 * may change when we drop the hash lock
    600 		 * we have to start at the beginning of the chain if the
    601 		 * buffer identity/contents aren't valid.
    602 		 */
    603 		if (!sema_tryp(&bp->b_sem)) {
    604 			biostats.bio_bufbusy.value.ui32++;
    605 			mutex_exit(hmp);
    606 			/*
    607 			 * OK, we are dealing with a busy buffer.
    608 			 * In the case that we are panicking and we
    609 			 * got called from bread(), we have some chance
    610 			 * for error recovery. So better bail out from
    611 			 * here since sema_p() won't block. If we got
    612 			 * called directly from ufs routines, there is
    613 			 * no way to report an error yet.
    614 			 */
    615 			if (panicstr && errflg)
    616 				goto errout;
    617 			/*
    618 			 * For the following line of code to work
    619 			 * correctly never kmem_free the buffer "header".
    620 			 */
    621 			sema_p(&bp->b_sem);
    622 			if (bp->b_blkno != blkno || bp->b_edev != dev ||
    623 			    (bp->b_flags & B_STALE)) {
    624 				sema_v(&bp->b_sem);
    625 				mutex_enter(hmp);
    626 				goto loop;	/* start over */
    627 			}
    628 			mutex_enter(hmp);
    629 		}
    630 		/* Found */
    631 		biostats.bio_hit.value.ui32++;
    632 		bp->b_flags &= ~B_AGE;
    633 
    634 		/*
    635 		 * Yank it off the free/delayed write lists
    636 		 */
    637 		hp->b_length--;
    638 		notavail(bp);
    639 		mutex_exit(hmp);
    640 
    641 		ASSERT((bp->b_flags & B_NOCACHE) == NULL);
    642 
    643 		if (nbp == NULL) {
    644 			/*
    645 			 * Make the common path short.
    646 			 */
    647 			ASSERT(SEMA_HELD(&bp->b_sem));
    648 			return (bp);
    649 		}
    650 
    651 		biostats.bio_bufdup.value.ui32++;
    652 
    653 		/*
    654 		 * The buffer must have entered during the lock upgrade
    655 		 * so free the new buffer we allocated and return the
    656 		 * found buffer.
    657 		 */
    658 		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
    659 		nbp->b_un.b_addr = NULL;
    660 
    661 		/*
    662 		 * Account for the memory
    663 		 */
    664 		mutex_enter(&bfree_lock);
    665 		bfreelist.b_bufsize += nbp->b_bufsize;
    666 		mutex_exit(&bfree_lock);
    667 
    668 		/*
    669 		 * Destroy buf identity, and place on avail list
    670 		 */
    671 		nbp->b_dev = (o_dev_t)NODEV;
    672 		nbp->b_edev = NODEV;
    673 		nbp->b_flags = 0;
    674 		nbp->b_file = NULL;
    675 		nbp->b_offset = -1;
    676 
    677 		sema_v(&nbp->b_sem);
    678 		bio_bhdr_free(nbp);
    679 
    680 		ASSERT(SEMA_HELD(&bp->b_sem));
    681 		return (bp);
    682 	}
    683 
    684 	/*
    685 	 * bio_getfreeblk may block so check the hash chain again.
    686 	 */
    687 	if (nbp == NULL) {
    688 		mutex_exit(hmp);
    689 		nbp = bio_getfreeblk(bsize);
    690 		mutex_enter(hmp);
    691 		goto loop;
    692 	}
    693 
    694 	/*
    695 	 * New buffer. Assign nbp and stick it on the hash.
    696 	 */
    697 	nbp->b_flags = B_BUSY;
    698 	nbp->b_edev = dev;
    699 	nbp->b_dev = (o_dev_t)cmpdev(dev);
    700 	nbp->b_blkno = blkno;
    701 	nbp->b_iodone = NULL;
    702 	nbp->b_bcount = bsize;
    703 	/*
    704 	 * If we are given a ufsvfsp and the vfs_root field is NULL
    705 	 * then this must be I/O for a superblock.  A superblock's
    706 	 * buffer is set up in mountfs() and there is no root vnode
    707 	 * at that point.
    708 	 */
    709 	if (ufsvfsp && ufsvfsp->vfs_root) {
    710 		nbp->b_vp = ufsvfsp->vfs_root;
    711 	} else {
    712 		nbp->b_vp = NULL;
    713 	}
    714 
    715 	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
    716 
    717 	binshash(nbp, dp);
    718 	mutex_exit(hmp);
    719 
    720 	ASSERT(SEMA_HELD(&nbp->b_sem));
    721 
    722 	return (nbp);
    723 
    724 
    725 	/*
    726 	 * Come here in case of an internal error. At this point we couldn't
    727 	 * get a buffer, but he have to return one. Hence we allocate some
    728 	 * kind of error reply buffer on the fly. This buffer is marked as
    729 	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
    730 	 *	- B_ERROR will indicate error to the caller.
    731 	 *	- B_DONE will prevent us from reading the buffer from
    732 	 *	  the device.
    733 	 *	- B_NOCACHE will cause that this buffer gets free'd in
    734 	 *	  brelse().
    735 	 */
    736 
    737 errout:
    738 	errbp = geteblk();
    739 	sema_p(&errbp->b_sem);
    740 	errbp->b_flags &= ~B_BUSY;
    741 	errbp->b_flags |= (B_ERROR | B_DONE);
    742 	return (errbp);
    743 }
    744 
    745 /*
    746  * Get an empty block, not assigned to any particular device.
    747  * Returns a locked buffer that is not on any hash or free list.
    748  */
    749 struct buf *
    750 ngeteblk(long bsize)
    751 {
    752 	struct buf *bp;
    753 
    754 	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
    755 	bioinit(bp);
    756 	bp->av_forw = bp->av_back = NULL;
    757 	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
    758 	bp->b_bufsize = bsize;
    759 	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
    760 	bp->b_dev = (o_dev_t)NODEV;
    761 	bp->b_edev = NODEV;
    762 	bp->b_lblkno = 0;
    763 	bp->b_bcount = bsize;
    764 	bp->b_iodone = NULL;
    765 	return (bp);
    766 }
    767 
    768 /*
    769  * Interface of geteblk() is kept intact to maintain driver compatibility.
    770  * Use ngeteblk() to allocate block size other than 1 KB.
    771  */
    772 struct buf *
    773 geteblk(void)
    774 {
    775 	return (ngeteblk((long)1024));
    776 }
    777 
    778 /*
    779  * Return a buffer w/o sleeping
    780  */
    781 struct buf *
    782 trygetblk(dev_t dev, daddr_t blkno)
    783 {
    784 	struct buf	*bp;
    785 	struct buf	*dp;
    786 	struct hbuf	*hp;
    787 	kmutex_t	*hmp;
    788 	uint_t		index;
    789 
    790 	index = bio_bhash(dev, blkno);
    791 	hp = &hbuf[index];
    792 	hmp = &hp->b_lock;
    793 
    794 	if (!mutex_tryenter(hmp))
    795 		return (NULL);
    796 
    797 	dp = (struct buf *)hp;
    798 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
    799 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
    800 		    (bp->b_flags & B_STALE))
    801 			continue;
    802 		/*
    803 		 * Get access to a valid buffer without sleeping
    804 		 */
    805 		if (sema_tryp(&bp->b_sem)) {
    806 			if (bp->b_flags & B_DONE) {
    807 				hp->b_length--;
    808 				notavail(bp);
    809 				mutex_exit(hmp);
    810 				return (bp);
    811 			} else {
    812 				sema_v(&bp->b_sem);
    813 				break;
    814 			}
    815 		}
    816 		break;
    817 	}
    818 	mutex_exit(hmp);
    819 	return (NULL);
    820 }
    821 
    822 /*
    823  * Wait for I/O completion on the buffer; return errors
    824  * to the user.
    825  */
    826 int
    827 iowait(struct buf *bp)
    828 {
    829 	ASSERT(SEMA_HELD(&bp->b_sem));
    830 	return (biowait(bp));
    831 }
    832 
/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 *
 * This is a thin wrapper: after asserting that bp->b_sem is held it
 * passes the buffer to biodone(), which does the work described above.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	(void) biodone(bp);
}
    843 
    844 /*
    845  * Zero the core associated with a buffer.
    846  */
    847 void
    848 clrbuf(struct buf *bp)
    849 {
    850 	ASSERT(SEMA_HELD(&bp->b_sem));
    851 	bzero(bp->b_un.b_addr, bp->b_bcount);
    852 	bp->b_resid = 0;
    853 }
    854 
    855 
/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 *
 * Works in two passes.  The first pass, under blist_lock and each hash
 * lock in turn, chains every candidate from the delayed-write lists
 * through b_list.  The second pass, with no hash lock held, takes each
 * buffer's b_sem, re-checks it, and starts an asynchronous write for
 * those that still have B_DELWRI set.  Only one flush or invalidate
 * runs at a time; others wait on bio_flushinval_cv.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffer for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock. So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				/*
				 * b_list == NULL means the buffer is not
				 * on a chain yet; push it on the front.
				 */
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		vfs_syncprogress();
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		/*
		 * The buffer may have been given a new identity while we
		 * waited for b_sem.  While panicking, busy buffers are
		 * skipped as well.
		 */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			/* take the buffer off its avail list */
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			/* someone else already wrote it */
			sema_v(&bp->b_sem);
		}
		/* advance, and mark this buffer as off the chain */
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	/* Let the next flush or invalidate proceed. */
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}
    951 
    952 /*
    953  * Ensure that a specified block is up-to-date on disk.
    954  */
    955 void
    956 blkflush(dev_t dev, daddr_t blkno)
    957 {
    958 	struct buf *bp, *dp;
    959 	struct hbuf *hp;
    960 	struct buf *sbp = NULL;
    961 	uint_t index;
    962 	kmutex_t *hmp;
    963 
    964 	index = bio_bhash(dev, blkno);
    965 	hp    = &hbuf[index];
    966 	dp    = (struct buf *)hp;
    967 	hmp   = &hp->b_lock;
    968 
    969 	/*
    970 	 * Identify the buffer in the cache belonging to
    971 	 * this device and blkno (if any).
    972 	 */
    973 	mutex_enter(hmp);
    974 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
    975 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
    976 		    (bp->b_flags & B_STALE))
    977 			continue;
    978 		sbp = bp;
    979 		break;
    980 	}
    981 	mutex_exit(hmp);
    982 	if (sbp == NULL)
    983 		return;
    984 	/*
    985 	 * Now check the buffer we have identified and
    986 	 * make sure it still belongs to the device and is B_DELWRI
    987 	 */
    988 	sema_p(&sbp->b_sem);
    989 	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
    990 	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
    991 		mutex_enter(hmp);
    992 		hp->b_length--;
    993 		notavail(sbp);
    994 		mutex_exit(hmp);
    995 		/*
    996 		 * XXX - There is nothing to guarantee a synchronous
    997 		 * write here if the B_ASYNC flag is set.  This needs
    998 		 * some investigation.
    999 		 */
   1000 		if (sbp->b_vp == NULL) {		/* !ufs */
   1001 			BWRITE(sbp);	/* synchronous write */
   1002 		} else {				/* ufs */
   1003 			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
   1004 		}
   1005 	} else {
   1006 		sema_v(&sbp->b_sem);
   1007 	}
   1008 }
   1009 
   1010 /*
   1011  * Same as binval, except can force-invalidate delayed-write buffers
   1012  * (which are not be already flushed because of device errors).  Also
   1013  * makes sure that the retry write flag is cleared.
   1014  */
   1015 int
   1016 bfinval(dev_t dev, int force)
   1017 {
   1018 	struct buf *dp;
   1019 	struct buf *bp;
   1020 	struct buf *binval_list = EMPTY_LIST;
   1021 	int i, error = 0;
   1022 	kmutex_t *hmp;
   1023 	uint_t index;
   1024 	struct buf **backp;
   1025 
   1026 	mutex_enter(&blist_lock);
   1027 	/*
   1028 	 * Wait for any flushes ahead of us to finish, it's ok to
   1029 	 * do invalidates in parallel.
   1030 	 */
   1031 	while (bio_doingflush) {
   1032 		bio_flinv_cv_wanted = 1;
   1033 		cv_wait(&bio_flushinval_cv, &blist_lock);
   1034 	}
   1035 	bio_doinginval++;
   1036 
   1037 	/* Gather bp's */
   1038 	for (i = 0; i < v.v_hbuf; i++) {
   1039 		dp = (struct buf *)&hbuf[i];
   1040 		hmp = &hbuf[i].b_lock;
   1041 
   1042 		mutex_enter(hmp);
   1043 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
   1044 			if (bp->b_edev == dev) {
   1045 				if (bp->b_list == NULL) {
   1046 					bp->b_list = binval_list;
   1047 					binval_list = bp;
   1048 				}
   1049 			}
   1050 		}
   1051 		mutex_exit(hmp);
   1052 	}
   1053 	mutex_exit(&blist_lock);
   1054 
   1055 	/* Invalidate all bp's found */
   1056 	while (binval_list != EMPTY_LIST) {
   1057 		bp = binval_list;
   1058 
   1059 		sema_p(&bp->b_sem);
   1060 		if (bp->b_edev == dev) {
   1061 			if (force && (bp->b_flags & B_DELWRI)) {
   1062 				/* clear B_DELWRI, move to non-dw freelist */
   1063 				index = bio_bhash(bp->b_edev, bp->b_blkno);
   1064 				hmp = &hbuf[index].b_lock;
   1065 				dp = (struct buf *)&hbuf[index];
   1066 				mutex_enter(hmp);
   1067 
   1068 				/* remove from delayed write freelist */
   1069 				notavail(bp);
   1070 
   1071 				/* add to B_AGE side of non-dw freelist */
   1072 				backp = &dp->av_forw;
   1073 				(*backp)->av_back = bp;
   1074 				bp->av_forw = *backp;
   1075 				*backp = bp;
   1076 				bp->av_back = dp;
   1077 
   1078 				/*
   1079 				 * make sure write retries and busy are cleared
   1080 				 */
   1081 				bp->b_flags &=
   1082 				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
   1083 				mutex_exit(hmp);
   1084 			}
   1085 			if ((bp->b_flags & B_DELWRI) == 0)
   1086 				bp->b_flags |= B_STALE|B_AGE;
   1087 			else
   1088 				error = EIO;
   1089 		}
   1090 		sema_v(&bp->b_sem);
   1091 		binval_list = bp->b_list;
   1092 		bp->b_list = NULL;
   1093 	}
   1094 	mutex_enter(&blist_lock);
   1095 	bio_doinginval--;
   1096 	if (bio_flinv_cv_wanted) {
   1097 		cv_broadcast(&bio_flushinval_cv);
   1098 		bio_flinv_cv_wanted = 0;
   1099 	}
   1100 	mutex_exit(&blist_lock);
   1101 	return (error);
   1102 }
   1103 
   1104 /*
   1105  * If possible, invalidate blocks for a dev on demand
   1106  */
   1107 void
   1108 binval(dev_t dev)
   1109 {
   1110 	(void) bfinval(dev, 0);
   1111 }
   1112 
   1113 /*
   1114  * Initialize the buffer I/O system by freeing
   1115  * all buffers and setting all device hash buffer lists to empty.
   1116  */
   1117 void
   1118 binit(void)
   1119 {
   1120 	struct buf *bp;
   1121 	unsigned int i, pct;
   1122 	ulong_t	bio_max_hwm, bio_default_hwm;
   1123 
   1124 	/*
   1125 	 * Maximum/Default values for bufhwm are set to the smallest of:
   1126 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
   1127 	 *	- 1/4 of kernel virtual memory
   1128 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
   1129 	 * Additionally, in order to allow simple tuning by percentage of
   1130 	 * physical memory, bufhwm_pct is used to calculate the default if
   1131 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
   1132 	 *
   1133 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
   1134 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
   1135 	 */
   1136 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
   1137 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
   1138 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
   1139 
   1140 	pct = BIO_BUF_PERCENT;
   1141 	if (bufhwm_pct != 0 &&
   1142 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
   1143 		pct = BIO_BUF_PERCENT;
   1144 		/*
   1145 		 * Invalid user specified value, emit a warning.
   1146 		 */
   1147 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
   1148 		    range(1..%d). Using %d as default.",
   1149 		    bufhwm_pct,
   1150 		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
   1151 	}
   1152 
   1153 	bio_default_hwm = MIN(physmem / pct,
   1154 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
   1155 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
   1156 
   1157 	if ((v.v_bufhwm = bufhwm) == 0)
   1158 		v.v_bufhwm = bio_default_hwm;
   1159 
   1160 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
   1161 		v.v_bufhwm = (int)bio_max_hwm;
   1162 		/*
   1163 		 * Invalid user specified value, emit a warning.
   1164 		 */
   1165 		cmn_err(CE_WARN,
   1166 		    "binit: bufhwm(%d) out \
   1167 		    of range(%d..%lu). Using %lu as default",
   1168 		    bufhwm,
   1169 		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
   1170 	}
   1171 
   1172 	/*
   1173 	 * Determine the number of hash buckets. Default is to
   1174 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
   1175 	 * Round up number to the next power of 2.
   1176 	 */
   1177 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
   1178 	    BIO_HASHLEN);
   1179 	v.v_hmask = v.v_hbuf - 1;
   1180 	v.v_buf = BIO_BHDR_POOL;
   1181 
   1182 	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
   1183 
   1184 	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
   1185 
   1186 	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
   1187 	bp = &bfreelist;
   1188 	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
   1189 
   1190 	for (i = 0; i < v.v_hbuf; i++) {
   1191 		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
   1192 		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
   1193 
   1194 		/*
   1195 		 * Initialize the delayed write buffer list.
   1196 		 */
   1197 		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
   1198 		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
   1199 	}
   1200 }
   1201 
   1202 /*
   1203  * Wait for I/O completion on the buffer; return error code.
   1204  * If bp was for synchronous I/O, bp is invalid and associated
   1205  * resources are freed on return.
   1206  */
   1207 int
   1208 biowait(struct buf *bp)
   1209 {
   1210 	int error = 0;
   1211 	struct cpu *cpup;
   1212 
   1213 	ASSERT(SEMA_HELD(&bp->b_sem));
   1214 
   1215 	cpup = CPU;
   1216 	atomic_add_64(&cpup->cpu_stats.sys.iowait, 1);
   1217 	DTRACE_IO1(wait__start, struct buf *, bp);
   1218 
   1219 	/*
   1220 	 * In case of panic, busy wait for completion
   1221 	 */
   1222 	if (panicstr) {
   1223 		while ((bp->b_flags & B_DONE) == 0)
   1224 			drv_usecwait(10);
   1225 	} else
   1226 		sema_p(&bp->b_io);
   1227 
   1228 	DTRACE_IO1(wait__done, struct buf *, bp);
   1229 	atomic_add_64(&cpup->cpu_stats.sys.iowait, -1);
   1230 
   1231 	error = geterror(bp);
   1232 	if ((bp->b_flags & B_ASYNC) == 0) {
   1233 		if (bp->b_flags & B_REMAPPED)
   1234 			bp_mapout(bp);
   1235 	}
   1236 	return (error);
   1237 }
   1238 
   1239 static void
   1240 biodone_tnf_probe(struct buf *bp)
   1241 {
   1242 	/* Kernel probe */
   1243 	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
   1244 	    tnf_device,		device,		bp->b_edev,
   1245 	    tnf_diskaddr,	block,		bp->b_lblkno,
   1246 	    tnf_opaque,		buf,		bp);
   1247 }
   1248 
   1249 /*
   1250  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
   1251  * and wake up anyone waiting for it.
   1252  */
   1253 void
   1254 biodone(struct buf *bp)
   1255 {
   1256 	if (bp->b_flags & B_STARTED) {
   1257 		DTRACE_IO1(done, struct buf *, bp);
   1258 		bp->b_flags &= ~B_STARTED;
   1259 	}
   1260 
   1261 	/*
   1262 	 * Call the TNF probe here instead of the inline code
   1263 	 * to force our compiler to use the tail call optimization.
   1264 	 */
   1265 	biodone_tnf_probe(bp);
   1266 
   1267 	if (bp->b_iodone != NULL) {
   1268 		(*(bp->b_iodone))(bp);
   1269 		return;
   1270 	}
   1271 	ASSERT((bp->b_flags & B_DONE) == 0);
   1272 	ASSERT(SEMA_HELD(&bp->b_sem));
   1273 	bp->b_flags |= B_DONE;
   1274 	if (bp->b_flags & B_ASYNC) {
   1275 		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
   1276 			bio_pageio_done(bp);
   1277 		else
   1278 			brelse(bp);	/* release bp to freelist */
   1279 	} else {
   1280 		sema_v(&bp->b_io);
   1281 	}
   1282 }
   1283 
   1284 /*
   1285  * Pick up the device's error number and pass it to the user;
   1286  * if there is an error but the number is 0 set a generalized code.
   1287  */
   1288 int
   1289 geterror(struct buf *bp)
   1290 {
   1291 	int error = 0;
   1292 
   1293 	ASSERT(SEMA_HELD(&bp->b_sem));
   1294 	if (bp->b_flags & B_ERROR) {
   1295 		error = bp->b_error;
   1296 		if (!error)
   1297 			error = EIO;
   1298 	}
   1299 	return (error);
   1300 }
   1301 
   1302 /*
   1303  * Support for pageio buffers.
   1304  *
   1305  * This stuff should be generalized to provide a generalized bp
   1306  * header facility that can be used for things other than pageio.
   1307  */
   1308 
   1309 /*
   1310  * Allocate and initialize a buf struct for use with pageio.
   1311  */
   1312 struct buf *
   1313 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
   1314 {
   1315 	struct buf *bp;
   1316 	struct cpu *cpup;
   1317 
   1318 	if (flags & B_READ) {
   1319 		CPU_STATS_ENTER_K();
   1320 		cpup = CPU;	/* get pointer AFTER preemption is disabled */
   1321 		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
   1322 		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
   1323 		if ((flags & B_ASYNC) == 0) {
   1324 			klwp_t *lwp = ttolwp(curthread);
   1325 			if (lwp != NULL)
   1326 				lwp->lwp_ru.majflt++;
   1327 			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
   1328 			/* Kernel probe */
   1329 			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
   1330 			    tnf_opaque,		vnode,		pp->p_vnode,
   1331 			    tnf_offset,		offset,		pp->p_offset);
   1332 		}
   1333 		/*
   1334 		 * Update statistics for pages being paged in
   1335 		 */
   1336 		if (pp != NULL && pp->p_vnode != NULL) {
   1337 			if (IS_SWAPFSVP(pp->p_vnode)) {
   1338 				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
   1339 			} else {
   1340 				if (pp->p_vnode->v_flag & VVMEXEC) {
   1341 					CPU_STATS_ADDQ(cpup, vm, execpgin,
   1342 					    btopr(len));
   1343 				} else {
   1344 					CPU_STATS_ADDQ(cpup, vm, fspgin,
   1345 					    btopr(len));
   1346 				}
   1347 			}
   1348 		}
   1349 		CPU_STATS_EXIT_K();
   1350 		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
   1351 		    "page_ws_in:pp %p", pp);
   1352 		/* Kernel probe */
   1353 		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
   1354 		    tnf_opaque,	vnode,	pp->p_vnode,
   1355 		    tnf_offset,	offset,	pp->p_offset,
   1356 		    tnf_size,	size,	len);
   1357 	}
   1358 
   1359 	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
   1360 	bp->b_bcount = len;
   1361 	bp->b_bufsize = len;
   1362 	bp->b_pages = pp;
   1363 	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
   1364 	bp->b_offset = -1;
   1365 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
   1366 
   1367 	/* Initialize bp->b_sem in "locked" state */
   1368 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
   1369 
   1370 	VN_HOLD(vp);
   1371 	bp->b_vp = vp;
   1372 	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
   1373 
   1374 	/*
   1375 	 * Caller sets dev & blkno and can adjust
   1376 	 * b_addr for page offset and can use bp_mapin
   1377 	 * to make pages kernel addressable.
   1378 	 */
   1379 	return (bp);
   1380 }
   1381 
   1382 void
   1383 pageio_done(struct buf *bp)
   1384 {
   1385 	ASSERT(SEMA_HELD(&bp->b_sem));
   1386 	if (bp->b_flags & B_REMAPPED)
   1387 		bp_mapout(bp);
   1388 	VN_RELE(bp->b_vp);
   1389 	bp->b_vp = NULL;
   1390 	ASSERT((bp->b_flags & B_NOCACHE) != 0);
   1391 
   1392 	/* A sema_v(bp->b_sem) is implied if we are destroying it */
   1393 	sema_destroy(&bp->b_sem);
   1394 	sema_destroy(&bp->b_io);
   1395 	kmem_free(bp, sizeof (struct buf));
   1396 }
   1397 
   1398 /*
   1399  * Check to see whether the buffers, except the one pointed by sbp,
   1400  * associated with the device are busy.
   1401  * NOTE: This expensive operation shall be improved together with ufs_icheck().
   1402  */
   1403 int
   1404 bcheck(dev_t dev, struct buf *sbp)
   1405 {
   1406 	struct buf	*bp;
   1407 	struct buf	*dp;
   1408 	int i;
   1409 	kmutex_t *hmp;
   1410 
   1411 	/*
   1412 	 * check for busy bufs for this filesystem
   1413 	 */
   1414 	for (i = 0; i < v.v_hbuf; i++) {
   1415 		dp = (struct buf *)&hbuf[i];
   1416 		hmp = &hbuf[i].b_lock;
   1417 
   1418 		mutex_enter(hmp);
   1419 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
   1420 			/*
   1421 			 * if buf is busy or dirty, then filesystem is busy
   1422 			 */
   1423 			if ((bp->b_edev == dev) &&
   1424 			    ((bp->b_flags & B_STALE) == 0) &&
   1425 			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
   1426 			    (bp != sbp)) {
   1427 				mutex_exit(hmp);
   1428 				return (1);
   1429 			}
   1430 		}
   1431 		mutex_exit(hmp);
   1432 	}
   1433 	return (0);
   1434 }
   1435 
   1436 /*
   1437  * Hash two 32 bit entities.
   1438  */
   1439 int
   1440 hash2ints(int x, int y)
   1441 {
   1442 	int hash = 0;
   1443 
   1444 	hash = x - 1;
   1445 	hash = ((hash * 7) + (x >> 8)) - 1;
   1446 	hash = ((hash * 7) + (x >> 16)) - 1;
   1447 	hash = ((hash * 7) + (x >> 24)) - 1;
   1448 	hash = ((hash * 7) + y) - 1;
   1449 	hash = ((hash * 7) + (y >> 8)) - 1;
   1450 	hash = ((hash * 7) + (y >> 16)) - 1;
   1451 	hash = ((hash * 7) + (y >> 24)) - 1;
   1452 
   1453 	return (hash);
   1454 }
   1455 
   1456 
   1457 /*
   1458  * Return a new buffer struct.
   1459  *	Create a new buffer if we haven't gone over our high water
   1460  *	mark for memory, otherwise try to get one off the freelist.
   1461  *
   1462  * Returns a locked buf that has no id and is not on any hash or free
   1463  * list.
   1464  */
   1465 static struct buf *
   1466 bio_getfreeblk(long bsize)
   1467 {
   1468 	struct buf *bp, *dp;
   1469 	struct hbuf *hp;
   1470 	kmutex_t	*hmp;
   1471 	uint_t		start, end;
   1472 
   1473 	/*
   1474 	 * mutex_enter(&bfree_lock);
   1475 	 * bfreelist.b_bufsize represents the amount of memory
   1476 	 * mutex_exit(&bfree_lock); protect ref to bfreelist
   1477 	 * we are allowed to allocate in the cache before we hit our hwm.
   1478 	 */
   1479 	bio_mem_get(bsize);	/* Account for our memory request */
   1480 
   1481 again:
   1482 	bp = bio_bhdr_alloc();	/* Get a buf hdr */
   1483 	sema_p(&bp->b_sem);	/* Should never fail */
   1484 
   1485 	ASSERT(bp->b_un.b_addr == NULL);
   1486 	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
   1487 	if (bp->b_un.b_addr != NULL) {
   1488 		/*
   1489 		 * Make the common path short
   1490 		 */
   1491 		bp->b_bufsize = bsize;
   1492 		ASSERT(SEMA_HELD(&bp->b_sem));
   1493 		return (bp);
   1494 	} else {
   1495 		struct buf *save;
   1496 
   1497 		save = bp;	/* Save bp we allocated */
   1498 		start = end = lastindex;
   1499 
   1500 		biostats.bio_bufwant.value.ui32++;
   1501 
   1502 		/*
   1503 		 * Memory isn't available from the system now. Scan
   1504 		 * the hash buckets till enough space is found.
   1505 		 */
   1506 		do {
   1507 			hp = &hbuf[start];
   1508 			hmp = &hp->b_lock;
   1509 			dp = (struct buf *)hp;
   1510 
   1511 			mutex_enter(hmp);
   1512 			bp = dp->av_forw;
   1513 
   1514 			while (bp != dp) {
   1515 
   1516 				ASSERT(bp != NULL);
   1517 
   1518 				if (!sema_tryp(&bp->b_sem)) {
   1519 					bp = bp->av_forw;
   1520 					continue;
   1521 				}
   1522 
   1523 				/*
   1524 				 * Since we are going down the freelist
   1525 				 * associated with this hash bucket the
   1526 				 * B_DELWRI flag should not be set.
   1527 				 */
   1528 				ASSERT(!(bp->b_flags & B_DELWRI));
   1529 
   1530 				if (bp->b_bufsize == bsize) {
   1531 					hp->b_length--;
   1532 					notavail(bp);
   1533 					bremhash(bp);
   1534 					mutex_exit(hmp);
   1535 
   1536 					/*
   1537 					 * Didn't kmem_alloc any more, so don't
   1538 					 * count it twice.
   1539 					 */
   1540 					mutex_enter(&bfree_lock);
   1541 					bfreelist.b_bufsize += bsize;
   1542 					mutex_exit(&bfree_lock);
   1543 
   1544 					/*
   1545 					 * Update the lastindex value.
   1546 					 */
   1547 					lastindex = start;
   1548 
   1549 					/*
   1550 					 * Put our saved bp back on the list
   1551 					 */
   1552 					sema_v(&save->b_sem);
   1553 					bio_bhdr_free(save);
   1554 					ASSERT(SEMA_HELD(&bp->b_sem));
   1555 					return (bp);
   1556 				}
   1557 				sema_v(&bp->b_sem);
   1558 				bp = bp->av_forw;
   1559 			}
   1560 			mutex_exit(hmp);
   1561 			start = ((start + 1) % v.v_hbuf);
   1562 		} while (start != end);
   1563 
   1564 		biostats.bio_bufwait.value.ui32++;
   1565 		bp = save;		/* Use original bp */
   1566 		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
   1567 	}
   1568 
   1569 	bp->b_bufsize = bsize;
   1570 	ASSERT(SEMA_HELD(&bp->b_sem));
   1571 	return (bp);
   1572 }
   1573 
   1574 /*
   1575  * Allocate a buffer header. If none currently available, allocate
   1576  * a new pool.
   1577  */
   1578 static struct buf *
   1579 bio_bhdr_alloc(void)
   1580 {
   1581 	struct buf *dp, *sdp;
   1582 	struct buf *bp;
   1583 	int i;
   1584 
   1585 	for (;;) {
   1586 		mutex_enter(&bhdr_lock);
   1587 		if (bhdrlist != NULL) {
   1588 			bp = bhdrlist;
   1589 			bhdrlist = bp->av_forw;
   1590 			mutex_exit(&bhdr_lock);
   1591 			bp->av_forw = NULL;
   1592 			return (bp);
   1593 		}
   1594 		mutex_exit(&bhdr_lock);
   1595 
   1596 		/*
   1597 		 * Need to allocate a new pool. If the system is currently
   1598 		 * out of memory, then try freeing things on the freelist.
   1599 		 */
   1600 		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
   1601 		if (dp == NULL) {
   1602 			/*
   1603 			 * System can't give us a pool of headers, try
   1604 			 * recycling from the free lists.
   1605 			 */
   1606 			bio_recycle(BIO_HEADER, 0);
   1607 		} else {
   1608 			sdp = dp;
   1609 			for (i = 0; i < v.v_buf; i++, dp++) {
   1610 				/*
   1611 				 * The next two lines are needed since NODEV
   1612 				 * is -1 and not NULL
   1613 				 */
   1614 				dp->b_dev = (o_dev_t)NODEV;
   1615 				dp->b_edev = NODEV;
   1616 				dp->av_forw = dp + 1;
   1617 				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
   1618 				    NULL);
   1619 				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
   1620 				    NULL);
   1621 				dp->b_offset = -1;
   1622 			}
   1623 			mutex_enter(&bhdr_lock);
   1624 			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
   1625 			bhdrlist = sdp;
   1626 			nbuf += v.v_buf;
   1627 			bp = bhdrlist;
   1628 			bhdrlist = bp->av_forw;
   1629 			mutex_exit(&bhdr_lock);
   1630 
   1631 			bp->av_forw = NULL;
   1632 			return (bp);
   1633 		}
   1634 	}
   1635 }
   1636 
   1637 static  void
   1638 bio_bhdr_free(struct buf *bp)
   1639 {
   1640 	ASSERT(bp->b_back == NULL);
   1641 	ASSERT(bp->b_forw == NULL);
   1642 	ASSERT(bp->av_back == NULL);
   1643 	ASSERT(bp->av_forw == NULL);
   1644 	ASSERT(bp->b_un.b_addr == NULL);
   1645 	ASSERT(bp->b_dev == (o_dev_t)NODEV);
   1646 	ASSERT(bp->b_edev == NODEV);
   1647 	ASSERT(bp->b_flags == 0);
   1648 
   1649 	mutex_enter(&bhdr_lock);
   1650 	bp->av_forw = bhdrlist;
   1651 	bhdrlist = bp;
   1652 	mutex_exit(&bhdr_lock);
   1653 }
   1654 
   1655 /*
   1656  * If we haven't gone over the high water mark, it's o.k. to
   1657  * allocate more buffer space, otherwise recycle buffers
   1658  * from the freelist until enough memory is free for a bsize request.
   1659  *
   1660  * We account for this memory, even though
   1661  * we don't allocate it here.
   1662  */
   1663 static void
   1664 bio_mem_get(long bsize)
   1665 {
   1666 	mutex_enter(&bfree_lock);
   1667 	if (bfreelist.b_bufsize > bsize) {
   1668 		bfreelist.b_bufsize -= bsize;
   1669 		mutex_exit(&bfree_lock);
   1670 		return;
   1671 	}
   1672 	mutex_exit(&bfree_lock);
   1673 	bio_recycle(BIO_MEM, bsize);
   1674 }
   1675 
   1676 /*
   1677  * flush a list of delayed write buffers.
   1678  * (currently used only by bio_recycle below.)
   1679  */
   1680 static void
   1681 bio_flushlist(struct buf *delwri_list)
   1682 {
   1683 	struct buf *bp;
   1684 
   1685 	while (delwri_list != EMPTY_LIST) {
   1686 		bp = delwri_list;
   1687 		bp->b_flags |= B_AGE | B_ASYNC;
   1688 		if (bp->b_vp == NULL) {		/* !ufs */
   1689 			BWRITE(bp);
   1690 		} else {			/* ufs */
   1691 			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
   1692 		}
   1693 		delwri_list = bp->b_list;
   1694 		bp->b_list = NULL;
   1695 	}
   1696 }
   1697 
   1698 /*
   1699  * Start recycling buffers on the freelist for one of 2 reasons:
   1700  *	- we need a buffer header
   1701  *	- we need to free up memory
   1702  * Once started we continue to recycle buffers until the B_AGE
   1703  * buffers are gone.
   1704  */
   1705 static void
   1706 bio_recycle(int want, long bsize)
   1707 {
   1708 	struct buf *bp, *dp, *dwp, *nbp;
   1709 	struct hbuf *hp;
   1710 	int	found = 0;
   1711 	kmutex_t	*hmp;
   1712 	int		start, end;
   1713 	struct buf *delwri_list = EMPTY_LIST;
   1714 
   1715 	/*
   1716 	 * Recycle buffers.
   1717 	 */
   1718 top:
   1719 	start = end = lastindex;
   1720 	do {
   1721 		hp = &hbuf[start];
   1722 		hmp = &hp->b_lock;
   1723 		dp = (struct buf *)hp;
   1724 
   1725 		mutex_enter(hmp);
   1726 		bp = dp->av_forw;
   1727 
   1728 		while (bp != dp) {
   1729 
   1730 			ASSERT(bp != NULL);
   1731 
   1732 			if (!sema_tryp(&bp->b_sem)) {
   1733 				bp = bp->av_forw;
   1734 				continue;
   1735 			}
   1736 			/*
   1737 			 * Do we really want to nuke all of the B_AGE stuff??
   1738 			 */
   1739 			if ((bp->b_flags & B_AGE) == 0 && found) {
   1740 				sema_v(&bp->b_sem);
   1741 				mutex_exit(hmp);
   1742 				lastindex = start;
   1743 				return;	/* All done */
   1744 			}
   1745 
   1746 			ASSERT(MUTEX_HELD(&hp->b_lock));
   1747 			ASSERT(!(bp->b_flags & B_DELWRI));
   1748 			hp->b_length--;
   1749 			notavail(bp);
   1750 
   1751 			/*
   1752 			 * Remove bhdr from cache, free up memory,
   1753 			 * and add the hdr to the freelist.
   1754 			 */
   1755 			bremhash(bp);
   1756 			mutex_exit(hmp);
   1757 
   1758 			if (bp->b_bufsize) {
   1759 				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
   1760 				bp->b_un.b_addr = NULL;
   1761 				mutex_enter(&bfree_lock);
   1762 				bfreelist.b_bufsize += bp->b_bufsize;
   1763 				mutex_exit(&bfree_lock);
   1764 			}
   1765 
   1766 			bp->b_dev = (o_dev_t)NODEV;
   1767 			bp->b_edev = NODEV;
   1768 			bp->b_flags = 0;
   1769 			sema_v(&bp->b_sem);
   1770 			bio_bhdr_free(bp);
   1771 			if (want == BIO_HEADER) {
   1772 				found = 1;
   1773 			} else {
   1774 				ASSERT(want == BIO_MEM);
   1775 				if (!found && bfreelist.b_bufsize >= bsize) {
   1776 					/* Account for the memory we want */
   1777 					mutex_enter(&bfree_lock);
   1778 					if (bfreelist.b_bufsize >= bsize) {
   1779 						bfreelist.b_bufsize -= bsize;
   1780 						found = 1;
   1781 					}
   1782 					mutex_exit(&bfree_lock);
   1783 				}
   1784 			}
   1785 
   1786 			/*
   1787 			 * Since we dropped hmp start from the
   1788 			 * begining.
   1789 			 */
   1790 			mutex_enter(hmp);
   1791 			bp = dp->av_forw;
   1792 		}
   1793 		mutex_exit(hmp);
   1794 
   1795 		/*
   1796 		 * Look at the delayed write list.
   1797 		 * First gather into a private list, then write them.
   1798 		 */
   1799 		dwp = (struct buf *)&dwbuf[start];
   1800 		mutex_enter(&blist_lock);
   1801 		bio_doingflush++;
   1802 		mutex_enter(hmp);
   1803 		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
   1804 
   1805 			ASSERT(bp != NULL);
   1806 			nbp = bp->av_forw;
   1807 
   1808 			if (!sema_tryp(&bp->b_sem))
   1809 				continue;
   1810 			ASSERT(bp->b_flags & B_DELWRI);
   1811 			/*
   1812 			 * Do we really want to nuke all of the B_AGE stuff??
   1813 			 */
   1814 
   1815 			if ((bp->b_flags & B_AGE) == 0 && found) {
   1816 				sema_v(&bp->b_sem);
   1817 				mutex_exit(hmp);
   1818 				lastindex = start;
   1819 				mutex_exit(&blist_lock);
   1820 				bio_flushlist(delwri_list);
   1821 				mutex_enter(&blist_lock);
   1822 				bio_doingflush--;
   1823 				if (bio_flinv_cv_wanted) {
   1824 					bio_flinv_cv_wanted = 0;
   1825 					cv_broadcast(&bio_flushinval_cv);
   1826 				}
   1827 				mutex_exit(&blist_lock);
   1828 				return; /* All done */
   1829 			}
   1830 
   1831 			/*
   1832 			 * If the buffer is already on a flush or
   1833 			 * invalidate list then just skip it.
   1834 			 */
   1835 			if (bp->b_list != NULL) {
   1836 				sema_v(&bp->b_sem);
   1837 				continue;
   1838 			}
   1839 			/*
   1840 			 * We are still on the same bucket.
   1841 			 */
   1842 			hp->b_length--;
   1843 			notavail(bp);
   1844 			bp->b_list = delwri_list;
   1845 			delwri_list = bp;
   1846 		}
   1847 		mutex_exit(hmp);
   1848 		mutex_exit(&blist_lock);
   1849 		bio_flushlist(delwri_list);
   1850 		delwri_list = EMPTY_LIST;
   1851 		mutex_enter(&blist_lock);
   1852 		bio_doingflush--;
   1853 		if (bio_flinv_cv_wanted) {
   1854 			bio_flinv_cv_wanted = 0;
   1855 			cv_broadcast(&bio_flushinval_cv);
   1856 		}
   1857 		mutex_exit(&blist_lock);
   1858 		start = (start + 1) % v.v_hbuf;
   1859 
   1860 	} while (start != end);
   1861 
   1862 	if (found)
   1863 		return;
   1864 
   1865 	/*
   1866 	 * Free lists exhausted and we haven't satisfied the request.
   1867 	 * Wait here for more entries to be added to freelist.
   1868 	 * Because this might have just happened, make it timed.
   1869 	 */
   1870 	mutex_enter(&bfree_lock);
   1871 	bfreelist.b_flags |= B_WANTED;
   1872 	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
   1873 	mutex_exit(&bfree_lock);
   1874 	goto top;
   1875 }
   1876 
   1877 /*
   1878  * See if the block is associated with some buffer
   1879  * (mainly to avoid getting hung up on a wait in breada).
   1880  */
   1881 static int
   1882 bio_incore(dev_t dev, daddr_t blkno)
   1883 {
   1884 	struct buf *bp;
   1885 	struct buf *dp;
   1886 	uint_t index;
   1887 	kmutex_t *hmp;
   1888 
   1889 	index = bio_bhash(dev, blkno);
   1890 	dp = (struct buf *)&hbuf[index];
   1891 	hmp = &hbuf[index].b_lock;
   1892 
   1893 	mutex_enter(hmp);
   1894 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
   1895 		if (bp->b_blkno == blkno && bp->b_edev == dev &&
   1896 		    (bp->b_flags & B_STALE) == 0) {
   1897 			mutex_exit(hmp);
   1898 			return (1);
   1899 		}
   1900 	}
   1901 	mutex_exit(hmp);
   1902 	return (0);
   1903 }
   1904 
   1905 static void
   1906 bio_pageio_done(struct buf *bp)
   1907 {
   1908 	if (bp->b_flags & B_PAGEIO) {
   1909 
   1910 		if (bp->b_flags & B_REMAPPED)
   1911 			bp_mapout(bp);
   1912 
   1913 		if (bp->b_flags & B_READ)
   1914 			pvn_read_done(bp->b_pages, bp->b_flags);
   1915 		else
   1916 			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
   1917 		pageio_done(bp);
   1918 	} else {
   1919 		ASSERT(bp->b_flags & B_REMAPPED);
   1920 		bp_mapout(bp);
   1921 		brelse(bp);
   1922 	}
   1923 }
   1924 
   1925 /*
   1926  * bioerror(9F) - indicate error in buffer header
   1927  * If 'error' is zero, remove the error indication.
   1928  */
   1929 void
   1930 bioerror(struct buf *bp, int error)
   1931 {
   1932 	ASSERT(bp != NULL);
   1933 	ASSERT(error >= 0);
   1934 	ASSERT(SEMA_HELD(&bp->b_sem));
   1935 
   1936 	if (error != 0) {
   1937 		bp->b_flags |= B_ERROR;
   1938 	} else {
   1939 		bp->b_flags &= ~B_ERROR;
   1940 	}
   1941 	bp->b_error = error;
   1942 }
   1943 
   1944 /*
   1945  * bioreset(9F) - reuse a private buffer header after I/O is complete
   1946  */
   1947 void
   1948 bioreset(struct buf *bp)
   1949 {
   1950 	ASSERT(bp != NULL);
   1951 
   1952 	biofini(bp);
   1953 	bioinit(bp);
   1954 }
   1955 
   1956 /*
   1957  * biosize(9F) - return size of a buffer header
   1958  */
   1959 size_t
   1960 biosize(void)
   1961 {
   1962 	return (sizeof (struct buf));
   1963 }
   1964 
   1965 /*
   1966  * biomodified(9F) - check if buffer is modified
   1967  */
   1968 int
   1969 biomodified(struct buf *bp)
   1970 {
   1971 	int npf;
   1972 	int ppattr;
   1973 	struct page *pp;
   1974 
   1975 	ASSERT(bp != NULL);
   1976 
   1977 	if ((bp->b_flags & B_PAGEIO) == 0) {
   1978 		return (-1);
   1979 	}
   1980 	pp = bp->b_pages;
   1981 	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
   1982 
   1983 	while (npf > 0) {
   1984 		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
   1985 		    HAT_SYNC_STOPON_MOD);
   1986 		if (ppattr & P_MOD)
   1987 			return (1);
   1988 		pp = pp->p_next;
   1989 		npf--;
   1990 	}
   1991 
   1992 	return (0);
   1993 }
   1994 
   1995 /*
   1996  * bioinit(9F) - initialize a buffer structure
   1997  */
   1998 void
   1999 bioinit(struct buf *bp)
   2000 {
   2001 	bzero(bp, sizeof (struct buf));
   2002 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
   2003 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
   2004 	bp->b_offset = -1;
   2005 }
   2006 
   2007 /*
   2008  * biofini(9F) - uninitialize a buffer structure
   2009  */
   2010 void
   2011 biofini(struct buf *bp)
   2012 {
   2013 	sema_destroy(&bp->b_io);
   2014 	sema_destroy(&bp->b_sem);
   2015 }
   2016 
   2017 /*
   2018  * bioclone(9F) - clone a buffer
   2019  */
   2020 struct buf *
   2021 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
   2022     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
   2023 {
   2024 	struct buf *bufp;
   2025 
   2026 	ASSERT(bp);
   2027 	if (bp_mem == NULL) {
   2028 		bufp = kmem_alloc(sizeof (struct buf), sleep);
   2029 		if (bufp == NULL) {
   2030 			return (NULL);
   2031 		}
   2032 		bioinit(bufp);
   2033 	} else {
   2034 		bufp = bp_mem;
   2035 		bioreset(bufp);
   2036 	}
   2037 
   2038 #define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
   2039 	B_ABRWRITE)
   2040 
   2041 	/*
   2042 	 * The cloned buffer does not inherit the B_REMAPPED flag.
   2043 	 */
   2044 	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
   2045 	bufp->b_bcount = len;
   2046 	bufp->b_blkno = blkno;
   2047 	bufp->b_iodone = iodone;
   2048 	bufp->b_proc = bp->b_proc;
   2049 	bufp->b_edev = dev;
   2050 	bufp->b_file = bp->b_file;
   2051 	bufp->b_offset = bp->b_offset;
   2052 
   2053 	if (bp->b_flags & B_SHADOW) {
   2054 		ASSERT(bp->b_shadow);
   2055 		ASSERT(bp->b_flags & B_PHYS);
   2056 
   2057 		bufp->b_shadow = bp->b_shadow +
   2058 		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
   2059 		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
   2060 		if (bp->b_flags & B_REMAPPED)
   2061 			bufp->b_proc = NULL;
   2062 	} else {
   2063 		if (bp->b_flags & B_PAGEIO) {
   2064 			struct page *pp;
   2065 			off_t o;
   2066 			int i;
   2067 
   2068 			pp = bp->b_pages;
   2069 			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
   2070 			for (i = btop(o); i > 0; i--) {
   2071 				pp = pp->p_next;
   2072 			}
   2073 			bufp->b_pages = pp;
   2074 			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
   2075 		} else {
   2076 			bufp->b_un.b_addr =
   2077 			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
   2078 			if (bp->b_flags & B_REMAPPED)
   2079 				bufp->b_proc = NULL;
   2080 		}
   2081 	}
   2082 	return (bufp);
   2083 }
   2084