Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/systm.h>
     27 #include <sys/types.h>
     28 #include <sys/vnode.h>
     29 #include <sys/errno.h>
     30 #include <sys/sysmacros.h>
     31 #include <sys/debug.h>
     32 #include <sys/kmem.h>
     33 #include <sys/conf.h>
     34 #include <sys/proc.h>
     35 #include <sys/cmn_err.h>
     36 #include <sys/fssnap_if.h>
     37 #include <sys/fs/ufs_inode.h>
     38 #include <sys/fs/ufs_filio.h>
     39 #include <sys/fs/ufs_log.h>
     40 #include <sys/fs/ufs_bio.h>
     41 #include <sys/atomic.h>
     42 
     43 extern int		maxphys;
     44 extern uint_t		bypass_snapshot_throttle_key;
     45 
     46 extern struct kmem_cache	*lufs_sv;
     47 extern struct kmem_cache	*lufs_bp;
     48 
     49 static void
     50 makebusy(ml_unit_t *ul, buf_t *bp)
     51 {
     52 	sema_p(&bp->b_sem);
     53 	if ((bp->b_flags & B_ERROR) == 0)
     54 		return;
     55 	if (bp->b_flags & B_READ)
     56 		ldl_seterror(ul, "Error reading ufs log");
     57 	else
     58 		ldl_seterror(ul, "Error writing ufs log");
     59 }
     60 
     61 static int
     62 logdone(buf_t *bp)
     63 {
     64 	bp->b_flags |= B_DONE;
     65 
     66 	if (bp->b_flags & B_WRITE)
     67 		sema_v(&bp->b_sem);
     68 	else
     69 		/* wakeup the thread waiting on this buf */
     70 		sema_v(&bp->b_io);
     71 	return (0);
     72 }
     73 
     74 static int
     75 ldl_strategy_done(buf_t *cb)
     76 {
     77 	lufs_save_t	*sv;
     78 	lufs_buf_t	*lbp;
     79 	buf_t		*bp;
     80 
     81 	ASSERT(SEMA_HELD(&cb->b_sem));
     82 	ASSERT((cb->b_flags & B_DONE) == 0);
     83 
     84 	/*
     85 	 * Compute address of the ``save'' struct
     86 	 */
     87 	lbp = (lufs_buf_t *)cb;
     88 	sv = (lufs_save_t *)lbp->lb_ptr;
     89 
     90 	if (cb->b_flags & B_ERROR)
     91 		sv->sv_error = 1;
     92 
     93 	/*
     94 	 * If this is the last request, release the resources and
     95 	 * ``done'' the original buffer header.
     96 	 */
     97 	if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
     98 		kmem_cache_free(lufs_bp, lbp);
     99 		return (1);
    100 	}
    101 	/* Propagate any errors back to the original buffer header */
    102 	bp = sv->sv_bp;
    103 	if (sv->sv_error)
    104 		bp->b_flags |= B_ERROR;
    105 	kmem_cache_free(lufs_bp, lbp);
    106 	kmem_cache_free(lufs_sv, sv);
    107 
    108 	biodone(bp);
    109 	return (0);
    110 }
    111 
    112 /*
    113  * Map the log logical block number to a physical disk block number
    114  */
    115 static int
    116 map_frag(
    117 	ml_unit_t	*ul,
    118 	daddr_t		lblkno,
    119 	size_t		bcount,
    120 	daddr_t		*pblkno,
    121 	size_t		*pbcount)
    122 {
    123 	ic_extent_t	*ext = ul->un_ebp->ic_extents;
    124 	uint32_t	e = ul->un_ebp->ic_nextents;
    125 	uint32_t	s = 0;
    126 	uint32_t	i = e >> 1;
    127 	uint32_t	lasti = i;
    128 	uint32_t	bno_off;
    129 
    130 again:
    131 	if (ext[i].ic_lbno <= lblkno) {
    132 		if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
    133 			/* FOUND IT */
    134 			bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
    135 			*pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
    136 			*pblkno = ext[i].ic_pbno + bno_off;
    137 			return (0);
    138 		} else
    139 			s = i;
    140 	} else
    141 		e = i;
    142 	i = s + ((e - s) >> 1);
    143 
    144 	if (i == lasti) {
    145 		*pbcount = bcount;
    146 		return (ENOENT);
    147 	}
    148 	lasti = i;
    149 
    150 	goto again;
    151 }
    152 
    153 /*
    154  * The log is a set of extents (which typically will be only one, but
    155  * may be more if the disk was close to full when the log was created)
    156  * and hence the logical offsets into the log
    157  * have to be translated into their real device locations before
    158  * calling the device's strategy routine. The translation may result
    159  * in several IO requests if this request spans extents.
    160  */
    161 void
    162 ldl_strategy(ml_unit_t *ul, buf_t *pb)
    163 {
    164 	lufs_save_t	*sv;
    165 	lufs_buf_t	*lbp;
    166 	buf_t		*cb;
    167 	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
    168 	daddr_t		lblkno, pblkno;
    169 	size_t		nb_left, pbcount;
    170 	off_t		offset;
    171 	dev_t		dev	= ul->un_dev;
    172 	int		error;
    173 	int		read = pb->b_flags & B_READ;
    174 
    175 	/*
    176 	 * Allocate and initialise the save stucture,
    177 	 */
    178 	sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
    179 	sv->sv_error = 0;
    180 	sv->sv_bp = pb;
    181 	nb_left = pb->b_bcount;
    182 	sv->sv_nb_left = nb_left;
    183 
    184 	lblkno = pb->b_blkno;
    185 	offset = 0;
    186 
    187 	do {
    188 		error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);
    189 
    190 		lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
    191 		bioinit(&lbp->lb_buf);
    192 		lbp->lb_ptr = sv;
    193 
    194 		cb = bioclone(pb, offset, pbcount, dev,
    195 		    pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);
    196 
    197 		offset += pbcount;
    198 		lblkno += btodb(pbcount);
    199 		nb_left -= pbcount;
    200 
    201 		if (error) {
    202 			cb->b_flags |= B_ERROR;
    203 			cb->b_resid = cb->b_bcount;
    204 			biodone(cb);
    205 		} else {
    206 			if (read) {
    207 				logstats.ls_ldlreads.value.ui64++;
    208 				ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
    209 				lwp_stat_update(LWP_STAT_INBLK, 1);
    210 			} else {
    211 				logstats.ls_ldlwrites.value.ui64++;
    212 				lwp_stat_update(LWP_STAT_OUBLK, 1);
    213 			}
    214 
    215 			/*
    216 			 * write through the snapshot driver if necessary
    217 			 * We do not want this write to be throttled because
    218 			 * we are holding the un_log mutex here. If we
    219 			 * are throttled in fssnap_translate, the fssnap_taskq
    220 			 * thread which can wake us up can get blocked on
    221 			 * the un_log mutex resulting in a deadlock.
    222 			 */
    223 			if (ufsvfsp->vfs_snapshot) {
    224 				(void) tsd_set(bypass_snapshot_throttle_key,
    225 				    (void *)1);
    226 				fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);
    227 
    228 				(void) tsd_set(bypass_snapshot_throttle_key,
    229 				    (void *)0);
    230 			} else {
    231 				(void) bdev_strategy(cb);
    232 			}
    233 		}
    234 
    235 	} while (nb_left);
    236 }
    237 
    238 static void
    239 writelog(ml_unit_t *ul, buf_t *bp)
    240 {
    241 	ASSERT(SEMA_HELD(&bp->b_sem));
    242 
    243 	/*
    244 	 * This is really an B_ASYNC write but we want Presto to
    245 	 * cache this write.  The iodone routine, logdone, processes
    246 	 * the buf correctly.
    247 	 */
    248 	bp->b_flags = B_WRITE;
    249 	bp->b_edev = ul->un_dev;
    250 	bp->b_iodone = logdone;
    251 
    252 	/*
    253 	 * return EIO for every IO if in hard error state
    254 	 */
    255 	if (ul->un_flags & LDL_ERROR) {
    256 		bp->b_flags |= B_ERROR;
    257 		bp->b_error = EIO;
    258 		biodone(bp);
    259 		return;
    260 	}
    261 
    262 	ldl_strategy(ul, bp);
    263 }
    264 
    265 static void
    266 readlog(ml_unit_t *ul, buf_t *bp)
    267 {
    268 	ASSERT(SEMA_HELD(&bp->b_sem));
    269 	ASSERT(bp->b_bcount);
    270 
    271 	bp->b_flags = B_READ;
    272 	bp->b_edev = ul->un_dev;
    273 	bp->b_iodone = logdone;
    274 
    275 	/* all IO returns errors when in error state */
    276 	if (ul->un_flags & LDL_ERROR) {
    277 		bp->b_flags |= B_ERROR;
    278 		bp->b_error = EIO;
    279 		biodone(bp);
    280 		(void) trans_wait(bp);
    281 		return;
    282 	}
    283 
    284 	ldl_strategy(ul, bp);
    285 
    286 	if (trans_wait(bp))
    287 		ldl_seterror(ul, "Error reading ufs log");
    288 }
    289 
    290 /*
    291  * NOTE: writers are single threaded thru the log layer.
    292  * This means we can safely reference and change the cb and bp fields
    293  * that ldl_read does not reference w/o holding the cb_rwlock or
    294  * the bp makebusy lock.
    295  */
    296 static void
    297 push_dirty_bp(ml_unit_t *ul, buf_t *bp)
    298 {
    299 	buf_t		*newbp;
    300 	cirbuf_t	*cb		= &ul->un_wrbuf;
    301 
    302 	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
    303 	ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);
    304 
    305 	/*
    306 	 * async write the buf
    307 	 */
    308 	writelog(ul, bp);
    309 
    310 	/*
    311 	 * no longer filling any buf
    312 	 */
    313 	cb->cb_dirty = NULL;
    314 
    315 	/*
    316 	 * no extra buffer space; all done
    317 	 */
    318 	if (bp->b_bcount == bp->b_bufsize)
    319 		return;
    320 
    321 	/*
    322 	 * give extra buffer space to a new bp
    323 	 * 	try to take buf off of free list
    324 	 */
    325 	if ((newbp = cb->cb_free) != NULL) {
    326 		cb->cb_free = newbp->b_forw;
    327 	} else {
    328 		newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
    329 		sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
    330 		sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
    331 	}
    332 	newbp->b_flags = 0;
    333 	newbp->b_bcount = 0;
    334 	newbp->b_file = NULL;
    335 	newbp->b_offset = -1;
    336 	newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
    337 	newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
    338 	bp->b_bufsize = bp->b_bcount;
    339 
    340 	/*
    341 	 * lock out readers and put new buf at LRU position
    342 	 */
    343 	rw_enter(&cb->cb_rwlock, RW_WRITER);
    344 	newbp->b_forw = bp->b_forw;
    345 	newbp->b_back = bp;
    346 	bp->b_forw->b_back = newbp;
    347 	bp->b_forw = newbp;
    348 	rw_exit(&cb->cb_rwlock);
    349 }
    350 
    351 static void
    352 inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
    353 {
    354 	buf_t		*bp;
    355 	off_t		elof	= lof + nb;
    356 	off_t		buflof;
    357 	off_t		bufelof;
    358 
    359 	/*
    360 	 * discard all bufs that overlap the range (lof, lof + nb)
    361 	 */
    362 	rw_enter(&cb->cb_rwlock, RW_WRITER);
    363 	bp = cb->cb_bp;
    364 	do {
    365 		if (bp == cb->cb_dirty || bp->b_bcount == 0) {
    366 			bp = bp->b_forw;
    367 			continue;
    368 		}
    369 		buflof = dbtob(bp->b_blkno);
    370 		bufelof = buflof + bp->b_bcount;
    371 		if ((buflof < lof && bufelof <= lof) ||
    372 		    (buflof >= elof && bufelof > elof)) {
    373 			bp = bp->b_forw;
    374 			continue;
    375 		}
    376 		makebusy(ul, bp);
    377 		bp->b_flags = 0;
    378 		bp->b_bcount = 0;
    379 		sema_v(&bp->b_sem);
    380 		bp = bp->b_forw;
    381 	} while (bp != cb->cb_bp);
    382 	rw_exit(&cb->cb_rwlock);
    383 }
    384 
    385 /*
    386  * NOTE: writers are single threaded thru the log layer.
    387  * This means we can safely reference and change the cb and bp fields
    388  * that ldl_read does not reference w/o holding the cb_rwlock or
    389  * the bp makebusy lock.
    390  */
    391 static buf_t *
    392 get_write_bp(ml_unit_t *ul)
    393 {
    394 	cirbuf_t	*cb = &ul->un_wrbuf;
    395 	buf_t		*bp;
    396 
    397 	/*
    398 	 * cb_dirty is the buffer we are currently filling; if any
    399 	 */
    400 	if ((bp = cb->cb_dirty) != NULL) {
    401 		makebusy(ul, bp);
    402 		return (bp);
    403 	}
    404 	/*
    405 	 * discard any bp that overlaps the current tail since we are
    406 	 * about to overwrite it.
    407 	 */
    408 	inval_range(ul, cb, ul->un_tail_lof, 1);
    409 
    410 	/*
    411 	 * steal LRU buf
    412 	 */
    413 	rw_enter(&cb->cb_rwlock, RW_WRITER);
    414 	bp = cb->cb_bp->b_forw;
    415 	makebusy(ul, bp);
    416 
    417 	cb->cb_dirty = bp;
    418 	cb->cb_bp = bp;
    419 
    420 	bp->b_flags = 0;
    421 	bp->b_bcount = 0;
    422 	bp->b_blkno = btodb(ul->un_tail_lof);
    423 	ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
    424 	rw_exit(&cb->cb_rwlock);
    425 
    426 	/*
    427 	 * NOTE:
    428 	 *	1. un_tail_lof never addresses >= un_eol_lof
    429 	 *	2. b_blkno + btodb(b_bufsize) may > un_eol_lof
    430 	 *		this case is handled in storebuf
    431 	 */
    432 	return (bp);
    433 }
    434 
    435 void
    436 alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
    437 {
    438 	int	i;
    439 	buf_t	*bp;
    440 
    441 	/*
    442 	 * Clear previous allocation
    443 	 */
    444 	if (cb->cb_nb)
    445 		free_cirbuf(cb);
    446 
    447 	bzero(cb, sizeof (*cb));
    448 	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
    449 
    450 	rw_enter(&cb->cb_rwlock, RW_WRITER);
    451 
    452 	/*
    453 	 * preallocate 3 bp's and put them on the free list.
    454 	 */
    455 	for (i = 0; i < 3; ++i) {
    456 		bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
    457 		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
    458 		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
    459 		bp->b_offset = -1;
    460 		bp->b_forw = cb->cb_free;
    461 		cb->cb_free = bp;
    462 	}
    463 
    464 	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
    465 	cb->cb_nb = bufsize;
    466 
    467 	/*
    468 	 * first bp claims entire write buffer
    469 	 */
    470 	bp = cb->cb_free;
    471 	cb->cb_free = bp->b_forw;
    472 
    473 	bp->b_forw = bp;
    474 	bp->b_back = bp;
    475 	cb->cb_bp = bp;
    476 	bp->b_un.b_addr = cb->cb_va;
    477 	bp->b_bufsize = cb->cb_nb;
    478 
    479 	rw_exit(&cb->cb_rwlock);
    480 }
    481 
    482 void
    483 alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
    484 {
    485 	caddr_t	va;
    486 	size_t	nb;
    487 	buf_t	*bp;
    488 
    489 	/*
    490 	 * Clear previous allocation
    491 	 */
    492 	if (cb->cb_nb)
    493 		free_cirbuf(cb);
    494 
    495 	bzero(cb, sizeof (*cb));
    496 	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
    497 
    498 	rw_enter(&cb->cb_rwlock, RW_WRITER);
    499 
    500 	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
    501 	cb->cb_nb = bufsize;
    502 
    503 	/*
    504 	 * preallocate N bufs that are hard-sized to blksize
    505 	 *	in other words, the read buffer pool is a linked list
    506 	 *	of statically sized bufs.
    507 	 */
    508 	va = cb->cb_va;
    509 	while ((nb = bufsize) != 0) {
    510 		if (nb > blksize)
    511 			nb = blksize;
    512 		bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
    513 		bzero(bp, sizeof (buf_t));
    514 		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
    515 		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
    516 		bp->b_un.b_addr = va;
    517 		bp->b_bufsize = nb;
    518 		if (cb->cb_bp) {
    519 			bp->b_forw = cb->cb_bp->b_forw;
    520 			bp->b_back = cb->cb_bp;
    521 			cb->cb_bp->b_forw->b_back = bp;
    522 			cb->cb_bp->b_forw = bp;
    523 		} else
    524 			bp->b_forw = bp->b_back = bp;
    525 		cb->cb_bp = bp;
    526 		bufsize -= nb;
    527 		va += nb;
    528 	}
    529 
    530 	rw_exit(&cb->cb_rwlock);
    531 }
    532 
    533 void
    534 free_cirbuf(cirbuf_t *cb)
    535 {
    536 	buf_t	*bp;
    537 
    538 	if (cb->cb_nb == 0)
    539 		return;
    540 
    541 	rw_enter(&cb->cb_rwlock, RW_WRITER);
    542 	ASSERT(cb->cb_dirty == NULL);
    543 
    544 	/*
    545 	 * free the active bufs
    546 	 */
    547 	while ((bp = cb->cb_bp) != NULL) {
    548 		if (bp == bp->b_forw)
    549 			cb->cb_bp = NULL;
    550 		else
    551 			cb->cb_bp = bp->b_forw;
    552 		bp->b_back->b_forw = bp->b_forw;
    553 		bp->b_forw->b_back = bp->b_back;
    554 		sema_destroy(&bp->b_sem);
    555 		sema_destroy(&bp->b_io);
    556 		kmem_free(bp, sizeof (buf_t));
    557 	}
    558 
    559 	/*
    560 	 * free the free bufs
    561 	 */
    562 	while ((bp = cb->cb_free) != NULL) {
    563 		cb->cb_free = bp->b_forw;
    564 		sema_destroy(&bp->b_sem);
    565 		sema_destroy(&bp->b_io);
    566 		kmem_free(bp, sizeof (buf_t));
    567 	}
    568 	kmem_free(cb->cb_va, cb->cb_nb);
    569 	cb->cb_va = NULL;
    570 	cb->cb_nb = 0;
    571 	rw_exit(&cb->cb_rwlock);
    572 	rw_destroy(&cb->cb_rwlock);
    573 }
    574 
    575 static int
    576 within_range(off_t lof, daddr_t blkno, ulong_t bcount)
    577 {
    578 	off_t	blof	= dbtob(blkno);
    579 
    580 	return ((lof >= blof) && (lof < (blof + bcount)));
    581 }
    582 
    583 static buf_t *
    584 find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
    585 {
    586 	buf_t *bp;
    587 
    588 	/*
    589 	 * find a buf that contains the offset lof
    590 	 */
    591 	rw_enter(&cb->cb_rwlock, RW_READER);
    592 	bp = cb->cb_bp;
    593 	do {
    594 		if (bp->b_bcount &&
    595 		    within_range(lof, bp->b_blkno, bp->b_bcount)) {
    596 			makebusy(ul, bp);
    597 			rw_exit(&cb->cb_rwlock);
    598 			return (bp);
    599 		}
    600 		bp = bp->b_forw;
    601 	} while (bp != cb->cb_bp);
    602 	rw_exit(&cb->cb_rwlock);
    603 
    604 	return (NULL);
    605 }
    606 
    607 static off_t
    608 find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
    609 {
    610 	buf_t	*bp, *bpend;
    611 	off_t	rlof;
    612 
    613 	/*
    614 	 * we mustn't:
    615 	 *	o read past eol
    616 	 *	o read past the tail
    617 	 *	o read data that may be being written.
    618 	 */
    619 	rw_enter(&cb->cb_rwlock, RW_READER);
    620 	bpend = bp = cb->cb_bp->b_forw;
    621 	rlof = ul->un_tail_lof;
    622 	do {
    623 		if (bp->b_bcount) {
    624 			rlof = dbtob(bp->b_blkno);
    625 			break;
    626 		}
    627 		bp = bp->b_forw;
    628 	} while (bp != bpend);
    629 	rw_exit(&cb->cb_rwlock);
    630 
    631 	if (lof <= rlof)
    632 		/* lof is prior to the range represented by the write buf */
    633 		return (rlof);
    634 	else
    635 		/* lof follows the range represented by the write buf */
    636 		return ((off_t)ul->un_eol_lof);
    637 }
    638 
    639 static buf_t *
    640 get_read_bp(ml_unit_t *ul, off_t lof)
    641 {
    642 	cirbuf_t	*cb;
    643 	buf_t		*bp;
    644 	off_t		rlof;
    645 
    646 	/*
    647 	 * retrieve as much data as possible from the incore buffers
    648 	 */
    649 	if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
    650 		logstats.ls_lreadsinmem.value.ui64++;
    651 		return (bp);
    652 	}
    653 	if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
    654 		logstats.ls_lreadsinmem.value.ui64++;
    655 		return (bp);
    656 	}
    657 
    658 	/*
    659 	 * steal the LRU buf
    660 	 */
    661 	cb = &ul->un_rdbuf;
    662 	rw_enter(&cb->cb_rwlock, RW_WRITER);
    663 	bp = cb->cb_bp->b_forw;
    664 	makebusy(ul, bp);
    665 	bp->b_flags = 0;
    666 	bp->b_bcount = 0;
    667 	cb->cb_bp = bp;
    668 	rw_exit(&cb->cb_rwlock);
    669 
    670 	/*
    671 	 * don't read past the tail or the end-of-log
    672 	 */
    673 	bp->b_blkno = btodb(lof);
    674 	lof = dbtob(bp->b_blkno);
    675 	rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
    676 	bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
    677 	readlog(ul, bp);
    678 	return (bp);
    679 }
    680 
    681 /*
    682  * NOTE: writers are single threaded thru the log layer.
    683  * This means we can safely reference and change the cb and bp fields
    684  * that ldl_read does not reference w/o holding the cb_rwlock or
    685  * the bp makebusy lock.
    686  */
    687 static int
    688 extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
    689 {
    690 	buf_t	*bpforw	= bp->b_forw;
    691 
    692 	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
    693 
    694 	/*
    695 	 * there is no `next' bp; do nothing
    696 	 */
    697 	if (bpforw == bp)
    698 		return (0);
    699 
    700 	/*
    701 	 * buffer space is not adjacent; do nothing
    702 	 */
    703 	if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
    704 		return (0);
    705 
    706 	/*
    707 	 * locking protocol requires giving up any bp locks before
    708 	 * acquiring cb_rwlock.  This is okay because we hold
    709 	 * un_log_mutex.
    710 	 */
    711 	sema_v(&bp->b_sem);
    712 
    713 	/*
    714 	 * lock out ldl_read
    715 	 */
    716 	rw_enter(&cb->cb_rwlock, RW_WRITER);
    717 
    718 	/*
    719 	 * wait for current IO to finish w/next bp; if necessary
    720 	 */
    721 	makebusy(ul, bpforw);
    722 
    723 	/*
    724 	 * free the next bp and steal its space
    725 	 */
    726 	bp->b_forw = bpforw->b_forw;
    727 	bpforw->b_forw->b_back = bp;
    728 	bp->b_bufsize += bpforw->b_bufsize;
    729 	sema_v(&bpforw->b_sem);
    730 	bpforw->b_forw = cb->cb_free;
    731 	cb->cb_free = bpforw;
    732 	makebusy(ul, bp);
    733 	rw_exit(&cb->cb_rwlock);
    734 
    735 	return (1);
    736 }
    737 
    738 static size_t
    739 storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
    740 {
    741 	size_t		copy_nb;
    742 	size_t		nb_in_sec;
    743 	sect_trailer_t	*st;
    744 	size_t		nb_left = nb;
    745 	cirbuf_t	*cb	= &ul->un_wrbuf;
    746 
    747 again:
    748 	nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
    749 	copy_nb = MIN(nb_left, nb_in_sec);
    750 
    751 	ASSERT(copy_nb);
    752 
    753 	bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
    754 	bp->b_bcount += copy_nb;
    755 	va += copy_nb;
    756 	nb_left -= copy_nb;
    757 	ul->un_tail_lof += copy_nb;
    758 
    759 	if ((nb_in_sec -= copy_nb) == 0) {
    760 		st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);
    761 
    762 		st->st_tid = ul->un_logmap->mtm_tid;
    763 		st->st_ident = ul->un_tail_ident++;
    764 		bp->b_bcount += sizeof (sect_trailer_t);
    765 		ul->un_tail_lof += sizeof (sect_trailer_t);
    766 		/*
    767 		 * log wrapped; async write this bp
    768 		 */
    769 		if (ul->un_tail_lof == ul->un_eol_lof) {
    770 			ul->un_tail_lof = ul->un_bol_lof;
    771 			push_dirty_bp(ul, bp);
    772 			return (nb - nb_left);
    773 		}
    774 		/*
    775 		 * out of bp space; get more or async write buf
    776 		 */
    777 		if (bp->b_bcount == bp->b_bufsize) {
    778 			if (!extend_write_bp(ul, cb, bp)) {
    779 				push_dirty_bp(ul, bp);
    780 				return (nb - nb_left);
    781 			}
    782 		}
    783 	}
    784 	if (nb_left)
    785 		goto again;
    786 
    787 	sema_v(&bp->b_sem);
    788 	return (nb);
    789 }
    790 
    791 static void
    792 fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
    793 {
    794 	offset_t	src_mof	= me->me_mof;
    795 	size_t		src_nb	= me->me_nb;
    796 
    797 	if (src_mof > dst_mof) {
    798 		ASSERT(src_mof < (dst_mof + dst_nb));
    799 		dst_va += (src_mof - dst_mof);
    800 		dst_nb -= (src_mof - dst_mof);
    801 	} else {
    802 		ASSERT(dst_mof < (src_mof + src_nb));
    803 		src_nb -= (dst_mof - src_mof);
    804 	}
    805 
    806 	src_nb = MIN(src_nb, dst_nb);
    807 	ASSERT(src_nb);
    808 	bzero(dst_va, src_nb);
    809 }
    810 
    811 /*
    812  * dst_va == NULL means don't copy anything
    813  */
    814 static ulong_t
    815 fetchbuf(
    816 	ml_unit_t *ul,
    817 	buf_t *bp,
    818 	caddr_t dst_va,
    819 	size_t dst_nb,
    820 	off_t *dst_lofp)
    821 {
    822 	caddr_t	copy_va;
    823 	size_t	copy_nb;
    824 	size_t	nb_sec;
    825 	off_t	dst_lof		= *dst_lofp;
    826 	ulong_t	sav_dst_nb	= dst_nb;
    827 	ulong_t	src_nb		= bp->b_bcount;
    828 	off_t	src_lof		= dbtob(bp->b_blkno);
    829 	off_t	src_elof	= src_lof + src_nb;
    830 	caddr_t	src_va		= bp->b_un.b_addr;
    831 
    832 	/*
    833 	 * copy from bp to dst_va
    834 	 */
    835 	while (dst_nb) {
    836 		/*
    837 		 * compute address within bp
    838 		 */
    839 		copy_va = src_va + (dst_lof - src_lof);
    840 
    841 		/*
    842 		 * adjust copy size to amount of data in bp
    843 		 */
    844 		copy_nb = MIN(dst_nb, src_elof - dst_lof);
    845 
    846 		/*
    847 		 * adjust copy size to amount of data in sector
    848 		 */
    849 		nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
    850 		copy_nb = MIN(copy_nb, nb_sec);
    851 
    852 		/*
    853 		 * dst_va == NULL means don't do copy (see logseek())
    854 		 */
    855 		if (dst_va) {
    856 			bcopy(copy_va, dst_va, copy_nb);
    857 			dst_va += copy_nb;
    858 		}
    859 		dst_lof += copy_nb;
    860 		dst_nb -= copy_nb;
    861 		nb_sec -= copy_nb;
    862 
    863 		/*
    864 		 * advance over sector trailer
    865 		 */
    866 		if (nb_sec == 0)
    867 			dst_lof += sizeof (sect_trailer_t);
    868 
    869 		/*
    870 		 * exhausted buffer
    871 		 *	return current lof for next read
    872 		 */
    873 		if (dst_lof == src_elof) {
    874 			sema_v(&bp->b_sem);
    875 			if (dst_lof == ul->un_eol_lof)
    876 				dst_lof = ul->un_bol_lof;
    877 			*dst_lofp = dst_lof;
    878 			return (sav_dst_nb - dst_nb);
    879 		}
    880 	}
    881 
    882 	/*
    883 	 * copy complete - return current lof
    884 	 */
    885 	sema_v(&bp->b_sem);
    886 	*dst_lofp = dst_lof;
    887 	return (sav_dst_nb);
    888 }
    889 
    890 void
    891 ldl_round_commit(ml_unit_t *ul)
    892 {
    893 	int		wrapped;
    894 	buf_t		*bp;
    895 	sect_trailer_t	*st;
    896 	size_t		bcount;
    897 	cirbuf_t	*cb	= &ul->un_wrbuf;
    898 
    899 	/*
    900 	 * if nothing to write; then do nothing
    901 	 */
    902 	if ((bp = cb->cb_dirty) == NULL)
    903 		return;
    904 	makebusy(ul, bp);
    905 
    906 	/*
    907 	 * round up to sector boundary and set new tail
    908 	 *	don't readjust st_ident if buf is already rounded
    909 	 */
    910 	bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
    911 	if (bcount == bp->b_bcount) {
    912 		sema_v(&bp->b_sem);
    913 		return;
    914 	}
    915 	bp->b_bcount = bcount;
    916 	ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
    917 	wrapped = 0;
    918 	if (ul->un_tail_lof == ul->un_eol_lof) {
    919 		ul->un_tail_lof = ul->un_bol_lof;
    920 		++wrapped;
    921 	}
    922 	ASSERT(ul->un_tail_lof != ul->un_head_lof);
    923 
    924 	/*
    925 	 * fix up the sector trailer
    926 	 */
    927 	/* LINTED */
    928 	st = (sect_trailer_t *)
    929 	    ((bp->b_un.b_addr + bcount) - sizeof (*st));
    930 	st->st_tid = ul->un_logmap->mtm_tid;
    931 	st->st_ident = ul->un_tail_ident++;
    932 
    933 	/*
    934 	 * if tail wrapped or we have exhausted this buffer
    935 	 *	async write the buffer
    936 	 */
    937 	if (wrapped || bcount == bp->b_bufsize)
    938 		push_dirty_bp(ul, bp);
    939 	else
    940 		sema_v(&bp->b_sem);
    941 }
    942 
    943 void
    944 ldl_push_commit(ml_unit_t *ul)
    945 {
    946 	buf_t		*bp;
    947 	cirbuf_t	*cb	= &ul->un_wrbuf;
    948 
    949 	/*
    950 	 * if nothing to write; then do nothing
    951 	 */
    952 	if ((bp = cb->cb_dirty) == NULL)
    953 		return;
    954 	makebusy(ul, bp);
    955 	push_dirty_bp(ul, bp);
    956 }
    957 
    958 int
    959 ldl_need_commit(ml_unit_t *ul)
    960 {
    961 	return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)));
    962 }
    963 
    964 int
    965 ldl_has_space(ml_unit_t *ul, mapentry_t *me)
    966 {
    967 	off_t	nfb;
    968 	off_t	nb;
    969 
    970 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
    971 
    972 	/*
    973 	 * Add up the size used by the deltas
    974 	 * round nb up to a sector length plus an extra sector
    975 	 *	w/o the extra sector we couldn't distinguish
    976 	 *	a full log (head == tail) from an empty log (head == tail)
    977 	 */
    978 	for (nb = DEV_BSIZE; me; me = me->me_hash) {
    979 		nb += sizeof (struct delta);
    980 		if (me->me_dt != DT_CANCEL)
    981 			nb += me->me_nb;
    982 	}
    983 	nb = P2ROUNDUP(nb, DEV_BSIZE);
    984 
    985 	if (ul->un_head_lof <= ul->un_tail_lof)
    986 		nfb = (ul->un_head_lof - ul->un_bol_lof) +
    987 		    (ul->un_eol_lof - ul->un_tail_lof);
    988 	else
    989 		nfb = ul->un_head_lof - ul->un_tail_lof;
    990 
    991 	return (nb < nfb);
    992 }
    993 
    994 void
    995 ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
    996 {
    997 	buf_t		*bp;
    998 	caddr_t		va;
    999 	size_t		nb;
   1000 	size_t		actual;
   1001 
   1002 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
   1003 
   1004 	/* Write the delta */
   1005 
   1006 	nb = sizeof (struct delta);
   1007 	va = (caddr_t)&me->me_delta;
   1008 	bp = get_write_bp(ul);
   1009 
   1010 	while (nb) {
   1011 		if (ul->un_flags & LDL_ERROR) {
   1012 			sema_v(&bp->b_sem);
   1013 			return;
   1014 		}
   1015 		actual = storebuf(ul, bp, va, nb);
   1016 		ASSERT(actual);
   1017 		va += actual;
   1018 		nb -= actual;
   1019 		if (nb)
   1020 			bp = get_write_bp(ul);
   1021 	}
   1022 
   1023 	/* If a commit, cancel, or 0's; we're almost done */
   1024 	switch (me->me_dt) {
   1025 		case DT_COMMIT:
   1026 		case DT_CANCEL:
   1027 		case DT_ABZERO:
   1028 			/* roll needs to know where the next delta will go */
   1029 			me->me_lof = ul->un_tail_lof;
   1030 			return;
   1031 		default:
   1032 			break;
   1033 	}
   1034 
   1035 	/* Now write the data */
   1036 
   1037 	ASSERT(me->me_nb != 0);
   1038 
   1039 	nb = me->me_nb;
   1040 	va = (me->me_mof - bufmof) + bufp;
   1041 	bp = get_write_bp(ul);
   1042 
   1043 	/* Save where we will put the data */
   1044 	me->me_lof = ul->un_tail_lof;
   1045 
   1046 	while (nb) {
   1047 		if (ul->un_flags & LDL_ERROR) {
   1048 			sema_v(&bp->b_sem);
   1049 			return;
   1050 		}
   1051 		actual = storebuf(ul, bp, va, nb);
   1052 		ASSERT(actual);
   1053 		va += actual;
   1054 		nb -= actual;
   1055 		if (nb)
   1056 			bp = get_write_bp(ul);
   1057 	}
   1058 }
   1059 
   1060 void
   1061 ldl_waito(ml_unit_t *ul)
   1062 {
   1063 	buf_t		*bp;
   1064 	cirbuf_t	*cb	= &ul->un_wrbuf;
   1065 
   1066 	rw_enter(&cb->cb_rwlock, RW_WRITER);
   1067 	/*
   1068 	 * wait on them
   1069 	 */
   1070 	bp = cb->cb_bp;
   1071 	do {
   1072 		if ((bp->b_flags & B_DONE) == 0) {
   1073 			makebusy(ul, bp);
   1074 			sema_v(&bp->b_sem);
   1075 		}
   1076 		bp = bp->b_forw;
   1077 	} while (bp != cb->cb_bp);
   1078 	rw_exit(&cb->cb_rwlock);
   1079 }
   1080 
   1081 /*
   1082  * seek nb bytes from location lof
   1083  */
   1084 static int
   1085 logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
   1086 {
   1087 	buf_t	*bp;
   1088 	ulong_t	actual;
   1089 
   1090 	while (nb) {
   1091 		bp = get_read_bp(ul, lof);
   1092 		if (bp->b_flags & B_ERROR) {
   1093 			sema_v(&bp->b_sem);
   1094 			return (EIO);
   1095 		}
   1096 		actual = fetchbuf(ul, bp, NULL, nb, &lof);
   1097 		ASSERT(actual);
   1098 		nb -= actual;
   1099 	}
   1100 	*lofp = lof;
   1101 	ASSERT(nb == 0);
   1102 	return (0);
   1103 }
   1104 
   1105 int
   1106 ldl_read(
   1107 	ml_unit_t *ul,		/* Log unit */
   1108 	caddr_t va,		/* address of buffer to read into */
   1109 	offset_t mof,		/* mof of buffer */
   1110 	off_t nb,		/* length of buffer */
   1111 	mapentry_t *me)		/* Map entry list */
   1112 {
   1113 	buf_t	*bp;
   1114 	crb_t   *crb;
   1115 	caddr_t	rva;			/* address to read into */
   1116 	size_t	rnb;			/* # of bytes to read */
   1117 	off_t	lof;			/* log device offset to read from */
   1118 	off_t   skip;
   1119 	ulong_t	actual;
   1120 	int	error;
   1121 	caddr_t	eva	= va + nb;	/* end of buffer */
   1122 
   1123 	for (; me; me = me->me_agenext) {
   1124 		ASSERT(me->me_dt != DT_CANCEL);
   1125 
   1126 		/*
   1127 		 * check for an cached roll buffer
   1128 		 */
   1129 		crb = me->me_crb;
   1130 		if (crb) {
   1131 			if (mof > crb->c_mof) {
   1132 				/*
   1133 				 * This mapentry overlaps with the beginning of
   1134 				 * the supplied buffer
   1135 				 */
   1136 				skip = mof - crb->c_mof;
   1137 				bcopy(crb->c_buf + skip, va,
   1138 				    MIN(nb, crb->c_nb - skip));
   1139 			} else {
   1140 				/*
   1141 				 * This mapentry starts at or after
   1142 				 * the supplied buffer.
   1143 				 */
   1144 				skip = crb->c_mof - mof;
   1145 				bcopy(crb->c_buf, va + skip,
   1146 				    MIN(crb->c_nb, nb - skip));
   1147 			}
   1148 			logstats.ls_lreadsinmem.value.ui64++;
   1149 			continue;
   1150 		}
   1151 
   1152 		/*
   1153 		 * check for a delta full of zeroes - there's no log data
   1154 		 */
   1155 		if (me->me_dt == DT_ABZERO) {
   1156 			fetchzeroes(va, mof, nb, me);
   1157 			continue;
   1158 		}
   1159 
   1160 		if (mof > me->me_mof) {
   1161 			rnb = (size_t)(mof - me->me_mof);
   1162 			error = logseek(ul, me->me_lof, rnb, &lof);
   1163 			if (error)
   1164 				return (EIO);
   1165 			rva = va;
   1166 			rnb = me->me_nb - rnb;
   1167 			rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
   1168 		} else {
   1169 			lof = me->me_lof;
   1170 			rva = (me->me_mof - mof) + va;
   1171 			rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
   1172 		}
   1173 
   1174 		while (rnb) {
   1175 			bp = get_read_bp(ul, lof);
   1176 			if (bp->b_flags & B_ERROR) {
   1177 				sema_v(&bp->b_sem);
   1178 				return (EIO);
   1179 			}
   1180 			ASSERT(((me->me_flags & ME_ROLL) == 0) ||
   1181 			    (bp != ul->un_wrbuf.cb_dirty));
   1182 			actual = fetchbuf(ul, bp, rva, rnb, &lof);
   1183 			ASSERT(actual);
   1184 			rva += actual;
   1185 			rnb -= actual;
   1186 		}
   1187 	}
   1188 	return (0);
   1189 }
   1190 
   1191 void
   1192 ldl_savestate(ml_unit_t *ul)
   1193 {
   1194 	int		error;
   1195 	buf_t		*bp	= ul->un_bp;
   1196 	ml_odunit_t	*ud	= (void *)bp->b_un.b_addr;
   1197 	ml_odunit_t	*ud2	= (void *)(bp->b_un.b_addr + DEV_BSIZE);
   1198 
   1199 #if	DEBUG
   1200 	/*
   1201 	 * Scan test is running; don't update intermediate state
   1202 	 */
   1203 	if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
   1204 		return;
   1205 #endif	/* DEBUG */
   1206 
   1207 	mutex_enter(&ul->un_state_mutex);
   1208 	bcopy(&ul->un_ondisk, ud, sizeof (*ud));
   1209 	ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
   1210 	bcopy(ud, ud2, sizeof (*ud));
   1211 
   1212 	/* If a snapshot is enabled write through the shapshot driver. */
   1213 	if (ul->un_ufsvfs->vfs_snapshot)
   1214 		UFS_BWRITE2(ul->un_ufsvfs, bp);
   1215 	else
   1216 		BWRITE2(bp);
   1217 	logstats.ls_ldlwrites.value.ui64++;
   1218 	error = bp->b_flags & B_ERROR;
   1219 	mutex_exit(&ul->un_state_mutex);
   1220 	if (error)
   1221 		ldl_seterror(ul, "Error writing ufs log state");
   1222 }
   1223 
   1224 /*
   1225  * The head will be set to (new_lof - header) since ldl_sethead is
   1226  * called with the new_lof of the data portion of a delta.
   1227  */
   1228 void
   1229 ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
   1230 {
   1231 	off_t		nb;
   1232 	off_t		new_lof;
   1233 	uint32_t	new_ident;
   1234 	daddr_t		beg_blkno;
   1235 	daddr_t		end_blkno;
   1236 
   1237 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
   1238 
   1239 	if (data_lof == -1) {
   1240 		/* log is empty */
   1241 		new_ident = lufs_hd_genid(ul);
   1242 		new_lof = ul->un_tail_lof;
   1243 
   1244 	} else {
   1245 		/* compute header's lof */
   1246 		new_ident = ul->un_head_ident;
   1247 		new_lof = data_lof - sizeof (struct delta);
   1248 
   1249 		/* whoops, header spans sectors; subtract out sector trailer */
   1250 		if (btodb(new_lof) != btodb(data_lof))
   1251 			new_lof -= sizeof (sect_trailer_t);
   1252 
   1253 		/* whoops, header wrapped the log; go to last sector */
   1254 		if (new_lof < ul->un_bol_lof) {
   1255 			/* sector offset */
   1256 			new_lof -= dbtob(btodb(new_lof));
   1257 			/* add to last sector's lof */
   1258 			new_lof += (ul->un_eol_lof - DEV_BSIZE);
   1259 		}
   1260 		ul->un_head_tid = tid;
   1261 	}
   1262 
   1263 	/*
   1264 	 * check for nop
   1265 	 */
   1266 	if (new_lof == ul->un_head_lof)
   1267 		return;
   1268 
   1269 	/*
   1270 	 * invalidate the affected bufs and calculate new ident
   1271 	 */
   1272 	if (new_lof > ul->un_head_lof) {
   1273 		nb = new_lof - ul->un_head_lof;
   1274 		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
   1275 		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
   1276 
   1277 		end_blkno = btodb(new_lof);
   1278 		beg_blkno = btodb(ul->un_head_lof);
   1279 		new_ident += (end_blkno - beg_blkno);
   1280 	} else {
   1281 		nb = ul->un_eol_lof - ul->un_head_lof;
   1282 		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
   1283 		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
   1284 
   1285 		end_blkno = btodb(ul->un_eol_lof);
   1286 		beg_blkno = btodb(ul->un_head_lof);
   1287 		new_ident += (end_blkno - beg_blkno);
   1288 
   1289 		nb = new_lof - ul->un_bol_lof;
   1290 		inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
   1291 		inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);
   1292 
   1293 		end_blkno = btodb(new_lof);
   1294 		beg_blkno = btodb(ul->un_bol_lof);
   1295 		new_ident += (end_blkno - beg_blkno);
   1296 	}
   1297 	/*
   1298 	 * don't update the head if there has been an error
   1299 	 */
   1300 	if (ul->un_flags & LDL_ERROR)
   1301 		return;
   1302 
   1303 	/* Fix up the head and ident */
   1304 	ASSERT(new_lof >= ul->un_bol_lof);
   1305 	ul->un_head_lof = new_lof;
   1306 	ul->un_head_ident = new_ident;
   1307 	if (data_lof == -1) {
   1308 		ul->un_tail_ident = ul->un_head_ident;
   1309 	}
   1310 
   1311 
   1312 	/* Commit to the database */
   1313 	ldl_savestate(ul);
   1314 
   1315 	ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
   1316 	    ldl_sethead_debug(ul));
   1317 }
   1318 
   1319 /*
   1320  * The tail will be set to the sector following lof+nb
   1321  *	lof + nb == size of the last delta + commit record
   1322  *	this function is called once after the log scan has completed.
   1323  */
   1324 void
   1325 ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
   1326 {
   1327 	off_t		new_lof;
   1328 	uint32_t	new_ident;
   1329 	daddr_t		beg_blkno;
   1330 	daddr_t		end_blkno;
   1331 
   1332 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
   1333 
   1334 	if (lof == -1) {
   1335 		ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
   1336 		ul->un_head_lof = ul->un_tail_lof;
   1337 		ul->un_head_ident = lufs_hd_genid(ul);
   1338 		ul->un_tail_ident = ul->un_head_ident;
   1339 
   1340 		/* Commit to the database */
   1341 		ldl_savestate(ul);
   1342 
   1343 		return;
   1344 	}
   1345 
   1346 	/*
   1347 	 * new_lof is the offset of the sector following the last commit
   1348 	 */
   1349 	(void) logseek(ul, lof, nb, &new_lof);
   1350 	ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));
   1351 
   1352 	/*
   1353 	 * calculate new ident
   1354 	 */
   1355 	if (new_lof > ul->un_head_lof) {
   1356 		end_blkno = btodb(new_lof);
   1357 		beg_blkno = btodb(ul->un_head_lof);
   1358 		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
   1359 	} else {
   1360 		end_blkno = btodb(ul->un_eol_lof);
   1361 		beg_blkno = btodb(ul->un_head_lof);
   1362 		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
   1363 
   1364 		end_blkno = btodb(new_lof);
   1365 		beg_blkno = btodb(ul->un_bol_lof);
   1366 		new_ident += (end_blkno - beg_blkno);
   1367 	}
   1368 
   1369 	/* Fix up the tail and ident */
   1370 	ul->un_tail_lof = new_lof;
   1371 	ul->un_tail_ident = new_ident;
   1372 
   1373 	/* Commit to the database */
   1374 	ldl_savestate(ul);
   1375 }
   1376 
   1377 /*
   1378  * LOGSCAN STUFF
   1379  */
   1380 static int
   1381 ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
   1382 {
   1383 	ulong_t		ident;
   1384 	size_t		nblk, i;
   1385 	sect_trailer_t	*st;
   1386 
   1387 	/*
   1388 	 * compute ident for first sector in the buffer
   1389 	 */
   1390 	ident = ul->un_head_ident;
   1391 	if (bp->b_blkno >= btodb(ul->un_head_lof)) {
   1392 		ident += (bp->b_blkno - btodb(ul->un_head_lof));
   1393 	} else {
   1394 		ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
   1395 		ident += (bp->b_blkno - btodb(ul->un_bol_lof));
   1396 	}
   1397 	/*
   1398 	 * truncate the buffer down to the last valid sector
   1399 	 */
   1400 	nblk = btodb(bp->b_bcount);
   1401 	bp->b_bcount = 0;
   1402 	/* LINTED */
   1403 	st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
   1404 	for (i = 0; i < nblk; ++i) {
   1405 		if (st->st_ident != ident)
   1406 			break;
   1407 
   1408 		/* remember last valid tid for ldl_logscan_error() */
   1409 		ul->un_tid = st->st_tid;
   1410 
   1411 		/* LINTED */
   1412 		st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
   1413 		++ident;
   1414 		bp->b_bcount += DEV_BSIZE;
   1415 	}
   1416 	/*
   1417 	 * make sure that lof is still within range
   1418 	 */
   1419 	return (within_range(lof, bp->b_blkno, bp->b_bcount));
   1420 }
   1421 
   1422 ulong_t
   1423 ldl_logscan_nbcommit(off_t lof)
   1424 {
   1425 	/*
   1426 	 * lof is the offset following the commit header.  However,
   1427 	 * if the commit header fell on the end-of-sector, then lof
   1428 	 * has already been advanced to the beginning of the next
   1429 	 * sector.  So do nothing.  Otherwise, return the remaining
   1430 	 * bytes in the sector.
   1431 	 */
   1432 	if ((lof & (DEV_BSIZE - 1)) == 0)
   1433 		return (0);
   1434 	return (NB_LEFT_IN_SECTOR(lof));
   1435 }
   1436 
   1437 int
   1438 ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
   1439 {
   1440 	buf_t	*bp;
   1441 	ulong_t	actual;
   1442 
   1443 	ASSERT(ul->un_head_lof != ul->un_tail_lof);
   1444 
   1445 	/*
   1446 	 * Check the log data doesn't go out of bounds
   1447 	 */
   1448 	if (ul->un_head_lof < ul->un_tail_lof) {
   1449 		if (!WITHIN(*lofp, nb, ul->un_head_lof,
   1450 		    (ul->un_tail_lof - ul->un_head_lof))) {
   1451 			return (EIO);
   1452 		}
   1453 	} else {
   1454 		if (OVERLAP(*lofp, nb, ul->un_tail_lof,
   1455 		    (ul->un_head_lof - ul->un_tail_lof))) {
   1456 			return (EIO);
   1457 		}
   1458 	}
   1459 
   1460 	while (nb) {
   1461 		bp = get_read_bp(ul, *lofp);
   1462 		if (bp->b_flags & B_ERROR) {
   1463 			sema_v(&bp->b_sem);
   1464 			return (EIO);
   1465 		}
   1466 		/*
   1467 		 * out-of-seq idents means partial transaction
   1468 		 *	panic, non-corrupting powerfail, ...
   1469 		 */
   1470 		if (!ldl_logscan_ident(ul, bp, *lofp)) {
   1471 			sema_v(&bp->b_sem);
   1472 			return (EIO);
   1473 		}
   1474 		/*
   1475 		 * copy the header into the caller's buf
   1476 		 */
   1477 		actual = fetchbuf(ul, bp, va, nb, lofp);
   1478 		if (va)
   1479 			va += actual;
   1480 		nb -= actual;
   1481 	}
   1482 	return (0);
   1483 }
   1484 
   1485 void
   1486 ldl_logscan_begin(ml_unit_t *ul)
   1487 {
   1488 	size_t	bufsize;
   1489 
   1490 	ASSERT(ul->un_wrbuf.cb_dirty == NULL);
   1491 
   1492 	/*
   1493 	 * logscan has begun
   1494 	 */
   1495 	ul->un_flags |= LDL_SCAN;
   1496 
   1497 	/*
   1498 	 * reset the circular bufs
   1499 	 */
   1500 	bufsize = ldl_bufsize(ul);
   1501 	alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
   1502 	alloc_wrbuf(&ul->un_wrbuf, bufsize);
   1503 
   1504 	/*
   1505 	 * set the tail to reflect a full log
   1506 	 */
   1507 	ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;
   1508 
   1509 	if (ul->un_tail_lof < ul->un_bol_lof)
   1510 		ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
   1511 	if (ul->un_tail_lof >= ul->un_eol_lof)
   1512 		ul->un_tail_lof = ul->un_bol_lof;
   1513 
   1514 	/*
   1515 	 * un_tid is used during error processing; it is initialized to
   1516 	 * the tid of the delta at un_head_lof;
   1517 	 */
   1518 	ul->un_tid = ul->un_head_tid;
   1519 }
   1520 
   1521 void
   1522 ldl_logscan_end(ml_unit_t *ul)
   1523 {
   1524 	size_t	bufsize;
   1525 
   1526 	/*
   1527 	 * reset the circular bufs
   1528 	 */
   1529 	bufsize = ldl_bufsize(ul);
   1530 	alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
   1531 	alloc_wrbuf(&ul->un_wrbuf, bufsize);
   1532 
   1533 	/*
   1534 	 * Done w/scan
   1535 	 */
   1536 	ul->un_flags &= ~LDL_SCAN;
   1537 }
   1538 
   1539 int
   1540 ldl_need_roll(ml_unit_t *ul)
   1541 {
   1542 	off_t	busybytes;
   1543 	off_t	head;
   1544 	off_t	tail;
   1545 	off_t	bol;
   1546 	off_t	eol;
   1547 	off_t	nb;
   1548 
   1549 	/*
   1550 	 * snapshot the log state
   1551 	 */
   1552 	head = ul->un_head_lof;
   1553 	tail = ul->un_tail_lof;
   1554 	bol = ul->un_bol_lof;
   1555 	eol = ul->un_eol_lof;
   1556 	nb = ul->un_logsize;
   1557 
   1558 	/*
   1559 	 * compute number of busy (inuse) bytes
   1560 	 */
   1561 	if (head <= tail)
   1562 		busybytes = tail - head;
   1563 	else
   1564 		busybytes = (eol - head) + (tail - bol);
   1565 
   1566 	/*
   1567 	 * return TRUE if > 75% full
   1568 	 */
   1569 	return (busybytes > (nb - (nb >> 2)));
   1570 }
   1571 
   1572 void
   1573 ldl_seterror(ml_unit_t *ul, char *why)
   1574 {
   1575 	/*
   1576 	 * already in error state; do nothing
   1577 	 */
   1578 	if (ul->un_flags & LDL_ERROR)
   1579 		return;
   1580 
   1581 	ul->un_flags |= LDL_ERROR;	/* incore */
   1582 	ul->un_badlog = 1;		/* ondisk (cleared by fsck) */
   1583 
   1584 	/*
   1585 	 * Commit to state sectors
   1586 	 */
   1587 	uniqtime(&ul->un_timestamp);
   1588 	ldl_savestate(ul);
   1589 
   1590 	/* Pretty print */
   1591 	cmn_err(CE_WARN, "%s", why);
   1592 	cmn_err(CE_WARN, "ufs log for %s changed state to Error",
   1593 	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);
   1594 	cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
   1595 	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);
   1596 
   1597 	/*
   1598 	 * If we aren't in the middle of scan (aka snarf); tell ufs
   1599 	 * to hard lock itself.
   1600 	 */
   1601 	if ((ul->un_flags & LDL_SCAN) == 0)
   1602 		ufs_trans_onerror();
   1603 }
   1604 
   1605 size_t
   1606 ldl_bufsize(ml_unit_t *ul)
   1607 {
   1608 	size_t		bufsize;
   1609 	extern uint32_t	ldl_minbufsize;
   1610 
   1611 	/*
   1612 	 * initial guess is the maxtransfer value for this log device
   1613 	 * 	increase if too small
   1614 	 * 	decrease if too large
   1615 	 */
   1616 	bufsize = dbtob(btod(ul->un_maxtransfer));
   1617 	if (bufsize < ldl_minbufsize)
   1618 		bufsize = ldl_minbufsize;
   1619 	if (bufsize > maxphys)
   1620 		bufsize = maxphys;
   1621 	if (bufsize > ul->un_maxtransfer)
   1622 		bufsize = ul->un_maxtransfer;
   1623 	return (bufsize);
   1624 }
   1625