Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/systm.h>
     27 #include <sys/types.h>
     28 #include <sys/vnode.h>
     29 #include <sys/errno.h>
     30 #include <sys/sysmacros.h>
     31 #include <sys/debug.h>
     32 #include <sys/kmem.h>
     33 #include <sys/conf.h>
     34 #include <sys/proc.h>
     35 #include <sys/cmn_err.h>
     36 #include <sys/fssnap_if.h>
     37 #include <sys/fs/ufs_inode.h>
     38 #include <sys/fs/ufs_filio.h>
     39 #include <sys/fs/ufs_log.h>
     40 #include <sys/fs/ufs_bio.h>
     41 #include <sys/inttypes.h>
     42 #include <sys/callb.h>
     43 #include <sys/tnf_probe.h>
     44 
     45 /*
     46  * Kernel threads for logging
     47  * Currently only one for rolling the log (one per log).
     48  */
     49 
/*
 * Default, minimum and maximum number of roll buffers.  These are the
 * initial values of the lufs_*_roll_bufs tunables below; trans_roll()
 * allocates one MAPBLOCKSIZE data buffer and one rollbuf_t per buffer.
 */
#define	LUFS_DEFAULT_NUM_ROLL_BUFS 16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS 4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS 64

/*
 * Macros
 *
 * logmap_need_roll() is true once the logmap's mtm_nme count exceeds the
 * logmap_maxnme tunable.
 * ldl_empty() is true when the log unit's head and tail log offsets are
 * equal.
 */
#define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 *
 * lufs_num_roll_bufs	requested number of roll buffers; log_roll_buffers()
 *			clamps it using lufs_min_roll_bufs and
 *			lufs_max_roll_bufs.
 * logmap_maxnme	threshold used by logmap_need_roll().
 * trans_roll_tics	timeout, in clock ticks, for the roll thread's idle
 *			wait in trans_roll_wait(); when 0, trans_roll() sets
 *			it to 5 * hz on startup.
 * trans_roll_new_delta	counter, incremented by log_roll_read() each time a
 *			new delta hits a MAPBLOCK that already has a buffer
 *			set up in the current pass.
 * lrr_wait		counter, incremented by log_roll_read() each time it
 *			has to wait on the logmap rwlock because the deltas
 *			for the first buffer were in use.
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;
/*
 * Key for thread specific data for the roll thread to
 * bypass snapshot throttling
 * (trans_roll() sets the value for this key to 1 for its own thread)
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 *
 * NOTE(review): none of these three symbols is referenced by the code
 * visible in this file; confirm whether they are still needed.
 */
extern kmutex_t		ml_scan;
extern kcondvar_t	ml_scan_cv;
extern int		maxphys;
     82 
     83 static void
     84 trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
     85 {
     86 	mutex_enter(&logmap->mtm_mutex);
     87 	logmap->mtm_ref = 0;
     88 	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
     89 		cv_broadcast(&logmap->mtm_from_roll_cv);
     90 	}
     91 	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
     92 	CALLB_CPR_SAFE_BEGIN(cprinfop);
     93 	(void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
     94 	    trans_roll_tics, TR_CLOCK_TICK);
     95 	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
     96 	logmap->mtm_flags |= MTM_ROLLING;
     97 	mutex_exit(&logmap->mtm_mutex);
     98 }
     99 
    100 /*
    101  * returns the number of 8K buffers to use for rolling the log
    102  */
    103 static uint32_t
    104 log_roll_buffers()
    105 {
    106 	/*
    107 	 * sanity validate the tunable lufs_num_roll_bufs
    108 	 */
    109 	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
    110 		return (lufs_min_roll_bufs);
    111 	}
    112 	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
    113 		return (lufs_max_roll_bufs);
    114 	}
    115 	return (lufs_num_roll_bufs);
    116 }
    117 
/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 *
 * Parameters:
 *	ul        - log unit being rolled
 *	rbs       - array of roll buffer headers, filled from index 0
 *	nmblk     - upper bound on the number of rbs entries to set up
 *	roll_bufs - data area; entry i uses roll_bufs + (i << MAPBLOCKSHIFT)
 *	retnbuf   - out: number of rbs entries set up; written only when
 *		    0 is returned
 *
 * returns;
 * 	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 * (mtm_rwlock is held as reader from here until the function
	 * returns, except for the brief writer hand-off below)
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		/* round the master offset to its MAPBLOCK boundary */
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to a set up buffer
		 *
		 * When no match is found this loop also leaves rbp
		 * pointing at rbs[nbuf], the next unused header, which
		 * the set-up code below relies on.
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 *
				 * The writer lock is dropped as soon as it
				 * is obtained; acquiring it is only a way
				 * of waiting.  lrr_wait counts these waits.
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		/*
		 * Only count this header when B_INVAL is clear; otherwise
		 * the same rbs slot is reused for the next MAPBLOCK.
		 * NOTE(review): B_INVAL is not set in this function;
		 * presumably top_read_roll() sets it - confirm.
		 */
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 *
	 * A failure does not end the loop: every buffer's rb_age is still
	 * handed back through logmap_list_put_roll().
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 * (into the same buffer, for this MAPBLOCK's range)
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		/* an error recorded on the log unit also fails this pass */
		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}
    249 
    250 /*
    251  * Write out a cached roll buffer
    252  */
    253 void
    254 log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
    255 {
    256 	crb_t *crb = rbp->rb_crb;
    257 	buf_t *bp = &rbp->rb_bh;
    258 
    259 	bp->b_blkno = lbtodb(crb->c_mof);
    260 	bp->b_un.b_addr = crb->c_buf;
    261 	bp->b_bcount = crb->c_nb;
    262 	bp->b_bufsize = crb->c_nb;
    263 	ASSERT((crb->c_nb & DEV_BMASK) == 0);
    264 	bp->b_flags = B_WRITE;
    265 	logstats.ls_rwrites.value.ui64++;
    266 
    267 	/* if snapshots are enabled, call it */
    268 	if (ufsvfsp->vfs_snapshot) {
    269 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
    270 	} else {
    271 		(void) bdev_strategy(bp);
    272 	}
    273 }
    274 
    275 /*
    276  * Write out a set of non cached roll buffers
    277  */
    278 void
    279 log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
    280 {
    281 	buf_t		*bp = &rbp->rb_bh;
    282 	buf_t		*bp2;
    283 	rbsecmap_t	secmap = rbp->rb_secmap;
    284 	int		j, k;
    285 
    286 	ASSERT(secmap);
    287 	ASSERT((bp->b_flags & B_INVAL) == 0);
    288 
    289 	do { /* for each contiguous block of sectors */
    290 		/* find start of next sector to write */
    291 		for (j = 0; j < 16; ++j) {
    292 			if (secmap & UINT16_C(1))
    293 				break;
    294 			secmap >>= 1;
    295 		}
    296 		bp->b_un.b_addr += (j << DEV_BSHIFT);
    297 		bp->b_blkno += j;
    298 
    299 		/* calculate number of sectors */
    300 		secmap >>= 1;
    301 		j++;
    302 		for (k = 1; j < 16; ++j) {
    303 			if ((secmap & UINT16_C(1)) == 0)
    304 				break;
    305 			secmap >>= 1;
    306 			k++;
    307 		}
    308 		bp->b_bcount = k << DEV_BSHIFT;
    309 		bp->b_flags = B_WRITE;
    310 		logstats.ls_rwrites.value.ui64++;
    311 
    312 		/* if snapshots are enabled, call it */
    313 		if (ufsvfsp->vfs_snapshot)
    314 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
    315 		else
    316 			(void) bdev_strategy(bp);
    317 		if (secmap) {
    318 			/*
    319 			 * Allocate another buf_t to handle
    320 			 * the next write in this MAPBLOCK
    321 			 * Chain them via b_list.
    322 			 */
    323 			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
    324 			bp->b_list = bp2;
    325 			bioinit(bp2);
    326 			bp2->b_iodone = trans_not_done;
    327 			bp2->b_bufsize = MAPBLOCKSIZE;
    328 			bp2->b_edev = bp->b_edev;
    329 			bp2->b_un.b_addr =
    330 			    bp->b_un.b_addr + bp->b_bcount;
    331 			bp2->b_blkno = bp->b_blkno + k;
    332 			bp = bp2;
    333 		}
    334 	} while (secmap);
    335 }
    336 
    337 /*
    338  * Asynchronously roll the deltas, using the sector map
    339  * in each rollbuf_t.
    340  */
    341 int
    342 log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
    343 {
    344 
    345 	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
    346 	rollbuf_t	*rbp;
    347 	buf_t		*bp, *bp2;
    348 	rollbuf_t	*head, *prev, *rbp2;
    349 
    350 	/*
    351 	 * Order the buffers by blkno
    352 	 */
    353 	ASSERT(nbuf > 0);
    354 #ifdef lint
    355 	prev = rbs;
    356 #endif
    357 	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
    358 		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
    359 			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
    360 				if (rbp2 == head) {
    361 					rbp->rb_next = head;
    362 					head = rbp;
    363 				} else {
    364 					prev->rb_next = rbp;
    365 					rbp->rb_next = rbp2;
    366 				}
    367 				break;
    368 			}
    369 		}
    370 		if (rbp2 == NULL) {
    371 			prev->rb_next = rbp;
    372 			rbp->rb_next = NULL;
    373 		}
    374 	}
    375 
    376 	/*
    377 	 * issue the in-order writes
    378 	 */
    379 	for (rbp = head; rbp; rbp = rbp2) {
    380 		if (rbp->rb_crb) {
    381 			log_roll_write_crb(ufsvfsp, rbp);
    382 		} else {
    383 			log_roll_write_bufs(ufsvfsp, rbp);
    384 		}
    385 		/* null out the rb_next link for next set of rolling */
    386 		rbp2 = rbp->rb_next;
    387 		rbp->rb_next = NULL;
    388 	}
    389 
    390 	/*
    391 	 * wait for all the writes to finish
    392 	 */
    393 	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
    394 		bp = &rbp->rb_bh;
    395 		if (trans_not_wait(bp)) {
    396 			ldl_seterror(ul,
    397 			    "Error writing master during ufs log roll");
    398 		}
    399 
    400 		/*
    401 		 * Now wait for all the "cloned" buffer writes (if any)
    402 		 * and free those headers
    403 		 */
    404 		bp2 = bp->b_list;
    405 		bp->b_list = NULL;
    406 		while (bp2) {
    407 			if (trans_not_wait(bp2)) {
    408 				ldl_seterror(ul,
    409 				    "Error writing master during ufs log roll");
    410 			}
    411 			bp = bp2;
    412 			bp2 = bp2->b_list;
    413 			kmem_free(bp, sizeof (buf_t));
    414 		}
    415 	}
    416 
    417 	if (ul->un_flags & LDL_ERROR)
    418 		return (1);
    419 	return (0);
    420 }
    421 
/*
 * Body of the roll thread for one log unit.
 *
 * After one-time setup (CPR registration, snapshot-throttle bypass, roll
 * buffer allocation) the function loops through the "again" label:
 *   - exit the thread if the log unit is in error or MTM_ROLL_EXIT is set;
 *   - decide whether there is a reason to roll (forced roll, logmap or
 *     log getting full, or an idle non-empty log);
 *   - if not, sleep in trans_roll_wait() and start over;
 *   - otherwise read (log_roll_read), write (log_roll_write), remove the
 *     rolled deltas from the logmap and try to advance the log head.
 *
 * This function does not return; it leaves through thread_exit().
 *
 *	ul - log unit to roll
 */
void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */

	(void) tsd_set(bypass_snapshot_throttle_key, (void*)1);

	/*
	 * setup some roll parameters
	 * (a zero trans_roll_tics tunable is replaced by 5 * hz ticks)
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 * (nmblk data buffers of MAPBLOCKSIZE bytes, one rollbuf_t each)
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	/* set once a forced roll has been started, cleared when it ends */
	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * exit on demand
	 *
	 * Frees the roll buffers, clears the roll-related flags and wakes
	 * anything waiting on mtm_from_roll_cv.  There is no mutex_exit()
	 * on this path; presumably CALLB_CPR_EXIT() releases mtm_mutex -
	 * confirm against sys/callb.h.
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 *	don't roll except in FORCEROLL situations
	 *
	 * Note the inner "if" is the whole body of the outer one; the
	 * ASSERT that follows runs regardless of MT_SCAN.
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wakeup any
	 * waiters.
	 * (either branch releases mtm_mutex)
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something; then do it
	 * (the checks from here to "rollsomething" are made without
	 * mtm_mutex held)
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if its getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 *
	 * Return 1 (nothing to roll) sleeps before retrying; return 2
	 * (error) retries at once.  On 0, nbuf holds the number of rbs
	 * entries that were set up.
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1: trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2: goto again;
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 * (on failure the deltas are left in the logmap)
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 * (b_blkno is rounded back to its MAPBLOCK start before use)
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space; if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}
    599