Home | History | Annotate | Download | only in zfs
      1    789    ahrens /*
      2    789    ahrens  * CDDL HEADER START
      3    789    ahrens  *
      4    789    ahrens  * The contents of this file are subject to the terms of the
      5   1472    perrin  * Common Development and Distribution License (the "License").
      6   1472    perrin  * You may not use this file except in compliance with the License.
      7    789    ahrens  *
      8    789    ahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9    789    ahrens  * or http://www.opensolaris.org/os/licensing.
     10    789    ahrens  * See the License for the specific language governing permissions
     11    789    ahrens  * and limitations under the License.
     12    789    ahrens  *
     13    789    ahrens  * When distributing Covered Code, include this CDDL HEADER in each
     14    789    ahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15    789    ahrens  * If applicable, add the following below this CDDL HEADER, with the
     16    789    ahrens  * fields enclosed by brackets "[]" replaced with your own identifying
     17    789    ahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
     18    789    ahrens  *
     19    789    ahrens  * CDDL HEADER END
     20    789    ahrens  */
     21    789    ahrens /*
     22   8746   Matthew  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23    789    ahrens  * Use is subject to license terms.
     24    789    ahrens  */
     25    789    ahrens 
     26    789    ahrens #include <sys/zfs_context.h>
     27    789    ahrens #include <sys/spa.h>
     28    789    ahrens #include <sys/dmu.h>
     29    789    ahrens #include <sys/zap.h>
     30    789    ahrens #include <sys/arc.h>
     31    789    ahrens #include <sys/stat.h>
     32    789    ahrens #include <sys/resource.h>
     33    789    ahrens #include <sys/zil.h>
     34    789    ahrens #include <sys/zil_impl.h>
     35    789    ahrens #include <sys/dsl_dataset.h>
     36    789    ahrens #include <sys/vdev.h>
     37   3668   gw25295 #include <sys/dmu_tx.h>
     38    789    ahrens 
/*
 * The zfs intent log (ZIL) saves transaction records of system calls
 * that change the file system in memory with enough information
 * to be able to replay them. These are stored in memory until
 * either the DMU transaction group (txg) commits them to the stable pool
 * and they can be discarded, or they are flushed to the stable log
 * (also in the pool) due to an fsync, O_DSYNC or other synchronous
 * requirement. In the event of a panic or power failure, those log
 * records (transactions) are replayed.
 *
 * There is one ZIL per file system. Its on-disk (pool) format consists
 * of 3 parts:
 *
 * 	- ZIL header
 * 	- ZIL blocks
 * 	- ZIL records
 *
 * A log record holds a system call transaction. Log blocks can
 * hold many log records and the blocks are chained together.
 * Each ZIL block contains a block pointer (blkptr_t) to the next
 * ZIL block in the chain. The ZIL header points to the first
 * block in the chain. Note there is not a fixed place in the pool
 * to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available. Figure X shows the ZIL structure:
 */
     64    789    ahrens 
     65    789    ahrens /*
     66   2986  ek110237  * This global ZIL switch affects all pools
     67    789    ahrens  */
     68    789    ahrens int zil_disable = 0;	/* disable intent logging */
     69   2986  ek110237 
     70   2986  ek110237 /*
     71   2986  ek110237  * Tunable parameter for debugging or performance analysis.  Setting
     72   2986  ek110237  * zfs_nocacheflush will cause corruption on power loss if a volatile
     73   2986  ek110237  * out-of-order write cache is enabled.
     74   2986  ek110237  */
     75   2986  ek110237 boolean_t zfs_nocacheflush = B_FALSE;
     76    789    ahrens 
     77    789    ahrens static kmem_cache_t *zil_lwb_cache;
     78  10685    George 
     79  10685    George static boolean_t zil_empty(zilog_t *zilog);
     80    789    ahrens 
     81    789    ahrens static int
     82  10922      Jeff zil_bp_compare(const void *x1, const void *x2)
     83    789    ahrens {
     84  10922      Jeff 	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
     85  10922      Jeff 	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
     86    789    ahrens 
     87    789    ahrens 	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
     88    789    ahrens 		return (-1);
     89    789    ahrens 	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
     90    789    ahrens 		return (1);
     91    789    ahrens 
     92    789    ahrens 	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
     93    789    ahrens 		return (-1);
     94    789    ahrens 	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
     95    789    ahrens 		return (1);
     96    789    ahrens 
     97    789    ahrens 	return (0);
     98    789    ahrens }
     99    789    ahrens 
    100    789    ahrens static void
    101  10922      Jeff zil_bp_tree_init(zilog_t *zilog)
    102    789    ahrens {
    103  10922      Jeff 	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
    104  10922      Jeff 	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
    105    789    ahrens }
    106    789    ahrens 
    107    789    ahrens static void
    108  10922      Jeff zil_bp_tree_fini(zilog_t *zilog)
    109    789    ahrens {
    110  10922      Jeff 	avl_tree_t *t = &zilog->zl_bp_tree;
    111  10922      Jeff 	zil_bp_node_t *zn;
    112    789    ahrens 	void *cookie = NULL;
    113    789    ahrens 
    114    789    ahrens 	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
    115  10922      Jeff 		kmem_free(zn, sizeof (zil_bp_node_t));
    116    789    ahrens 
    117    789    ahrens 	avl_destroy(t);
    118    789    ahrens }
    119    789    ahrens 
    120  10922      Jeff int
    121  10922      Jeff zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
    122    789    ahrens {
    123  10922      Jeff 	avl_tree_t *t = &zilog->zl_bp_tree;
    124  10922      Jeff 	const dva_t *dva = BP_IDENTITY(bp);
    125  10922      Jeff 	zil_bp_node_t *zn;
    126    789    ahrens 	avl_index_t where;
    127    789    ahrens 
    128    789    ahrens 	if (avl_find(t, dva, &where) != NULL)
    129    789    ahrens 		return (EEXIST);
    130    789    ahrens 
    131  10922      Jeff 	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
    132    789    ahrens 	zn->zn_dva = *dva;
    133    789    ahrens 	avl_insert(t, zn, where);
    134    789    ahrens 
    135    789    ahrens 	return (0);
    136    789    ahrens }
    137    789    ahrens 
    138   1807   bonwick static zil_header_t *
    139   1807   bonwick zil_header_in_syncing_context(zilog_t *zilog)
    140   1807   bonwick {
    141   1807   bonwick 	return ((zil_header_t *)zilog->zl_header);
    142   1807   bonwick }
    143   1807   bonwick 
    144   1807   bonwick static void
    145   1807   bonwick zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
    146   1807   bonwick {
    147   1807   bonwick 	zio_cksum_t *zc = &bp->blk_cksum;
    148   1807   bonwick 
    149   1807   bonwick 	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
    150   1807   bonwick 	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
    151   1807   bonwick 	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
    152   1807   bonwick 	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
    153   1807   bonwick }
    154   1807   bonwick 
    155    789    ahrens /*
    156  10922      Jeff  * Read a log block and make sure it's valid.
    157    789    ahrens  */
    158    789    ahrens static int
    159  10922      Jeff zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst)
    160    789    ahrens {
    161  10922      Jeff 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
    162  10922      Jeff 	uint32_t aflags = ARC_WAIT;
    163  10922      Jeff 	arc_buf_t *abuf = NULL;
    164   1544  eschrock 	zbookmark_t zb;
    165    789    ahrens 	int error;
    166   1544  eschrock 
    167  10922      Jeff 	if (zilog->zl_header->zh_claim_txg == 0)
    168  10922      Jeff 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
    169    789    ahrens 
    170  10922      Jeff 	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
    171  10922      Jeff 		zio_flags |= ZIO_FLAG_SPECULATIVE;
    172   1807   bonwick 
    173  10922      Jeff 	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
    174  10922      Jeff 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
    175  10922      Jeff 
    176  10922      Jeff 	error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
    177  10922      Jeff 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
    178   1807   bonwick 
    179   1807   bonwick 	if (error == 0) {
    180  10922      Jeff 		char *data = abuf->b_data;
    181  10922      Jeff 		uint64_t size = BP_GET_LSIZE(bp);
    182  10922      Jeff 		zil_trailer_t *ztp = (zil_trailer_t *)(data + size) - 1;
    183   1807   bonwick 		zio_cksum_t cksum = bp->blk_cksum;
    184  10922      Jeff 
    185  10922      Jeff 		bcopy(data, dst, size);
    186  10922      Jeff 		*nbp = ztp->zit_next_blk;
    187   1807   bonwick 
    188   1807   bonwick 		/*
    189   7522      Neil 		 * Validate the checksummed log block.
    190   7522      Neil 		 *
    191   1807   bonwick 		 * Sequence numbers should be... sequential.  The checksum
    192   1807   bonwick 		 * verifier for the next block should be bp's checksum plus 1.
    193   7522      Neil 		 *
    194   7522      Neil 		 * Also check the log chain linkage and size used.
    195   1807   bonwick 		 */
    196   1807   bonwick 		cksum.zc_word[ZIL_ZC_SEQ]++;
    197   1807   bonwick 
    198   7522      Neil 		if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
    199   7522      Neil 		    sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
    200  10922      Jeff 		    (ztp->zit_nused > (size - sizeof (zil_trailer_t))))
    201   7522      Neil 			error = ECKSUM;
    202   1807   bonwick 
    203  10922      Jeff 		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
    204    789    ahrens 	}
    205    789    ahrens 
    206  10922      Jeff 	return (error);
    207  10922      Jeff }
    208  10922      Jeff 
    209  10922      Jeff /*
    210  10922      Jeff  * Read a TX_WRITE log data block.
    211  10922      Jeff  */
    212  10922      Jeff static int
    213  10922      Jeff zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
    214  10922      Jeff {
    215  10922      Jeff 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
    216  10922      Jeff 	const blkptr_t *bp = &lr->lr_blkptr;
    217  10922      Jeff 	uint32_t aflags = ARC_WAIT;
    218  10922      Jeff 	arc_buf_t *abuf = NULL;
    219  10922      Jeff 	zbookmark_t zb;
    220  10922      Jeff 	int error;
    221  10922      Jeff 
    222  10922      Jeff 	if (BP_IS_HOLE(bp)) {
    223  10922      Jeff 		if (wbuf != NULL)
    224  10922      Jeff 			bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
    225  10922      Jeff 		return (0);
    226  10922      Jeff 	}
    227  10922      Jeff 
    228  10922      Jeff 	if (zilog->zl_header->zh_claim_txg == 0)
    229  10922      Jeff 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
    230  10922      Jeff 
    231  10922      Jeff 	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
    232  10922      Jeff 	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
    233  10922      Jeff 
    234  10922      Jeff 	error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
    235  10922      Jeff 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
    236  10922      Jeff 
    237  10922      Jeff 	if (error == 0) {
    238  10922      Jeff 		if (wbuf != NULL)
    239  10922      Jeff 			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
    240  10922      Jeff 		(void) arc_buf_remove_ref(abuf, &abuf);
    241  10922      Jeff 	}
    242    789    ahrens 
    243   1807   bonwick 	return (error);
    244    789    ahrens }
    245    789    ahrens 
    246    789    ahrens /*
    247    789    ahrens  * Parse the intent log, and call parse_func for each valid record within.
    248    789    ahrens  */
    249  10922      Jeff int
    250    789    ahrens zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    251    789    ahrens     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
    252    789    ahrens {
    253   1807   bonwick 	const zil_header_t *zh = zilog->zl_header;
    254  10922      Jeff 	boolean_t claimed = !!zh->zh_claim_txg;
    255  10922      Jeff 	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
    256  10922      Jeff 	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
    257  10922      Jeff 	uint64_t max_blk_seq = 0;
    258  10922      Jeff 	uint64_t max_lr_seq = 0;
    259  10922      Jeff 	uint64_t blk_count = 0;
    260  10922      Jeff 	uint64_t lr_count = 0;
    261  10922      Jeff 	blkptr_t blk, next_blk;
    262    789    ahrens 	char *lrbuf, *lrp;
    263  10922      Jeff 	int error = 0;
    264    789    ahrens 
    265  10922      Jeff 	/*
    266  10922      Jeff 	 * Old logs didn't record the maximum zh_claim_lr_seq.
    267  10922      Jeff 	 */
    268  10922      Jeff 	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
    269  10922      Jeff 		claim_lr_seq = UINT64_MAX;
    270    789    ahrens 
    271    789    ahrens 	/*
    272    789    ahrens 	 * Starting at the block pointed to by zh_log we read the log chain.
    273    789    ahrens 	 * For each block in the chain we strongly check that block to
    274    789    ahrens 	 * ensure its validity.  We stop when an invalid block is found.
    275    789    ahrens 	 * For each block pointer in the chain we call parse_blk_func().
    276    789    ahrens 	 * For each record in each valid block we call parse_lr_func().
    277   1807   bonwick 	 * If the log has been claimed, stop if we encounter a sequence
    278   1807   bonwick 	 * number greater than the highest claimed sequence number.
    279    789    ahrens 	 */
    280  10922      Jeff 	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
    281  10922      Jeff 	zil_bp_tree_init(zilog);
    282   1807   bonwick 
    283  10922      Jeff 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
    284  10922      Jeff 		zil_trailer_t *ztp =
    285  10922      Jeff 		    (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
    286  10922      Jeff 		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
    287  10922      Jeff 		int reclen;
    288  10922      Jeff 
    289  10922      Jeff 		if (blk_seq > claim_blk_seq)
    290  10922      Jeff 			break;
    291  10922      Jeff 		if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
    292  10922      Jeff 			break;
    293  10922      Jeff 		ASSERT(max_blk_seq < blk_seq);
    294  10922      Jeff 		max_blk_seq = blk_seq;
    295  10922      Jeff 		blk_count++;
    296  10922      Jeff 
    297  10922      Jeff 		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
    298   1807   bonwick 			break;
    299   1807   bonwick 
    300  10922      Jeff 		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf);
    301    789    ahrens 		if (error)
    302    789    ahrens 			break;
    303    789    ahrens 
    304    789    ahrens 		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
    305    789    ahrens 			lr_t *lr = (lr_t *)lrp;
    306    789    ahrens 			reclen = lr->lrc_reclen;
    307    789    ahrens 			ASSERT3U(reclen, >=, sizeof (lr_t));
    308  10922      Jeff 			if (lr->lrc_seq > claim_lr_seq)
    309  10922      Jeff 				goto done;
    310  10922      Jeff 			if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
    311  10922      Jeff 				goto done;
    312  10922      Jeff 			ASSERT(max_lr_seq < lr->lrc_seq);
    313  10922      Jeff 			max_lr_seq = lr->lrc_seq;
    314  10922      Jeff 			lr_count++;
    315    789    ahrens 		}
    316    789    ahrens 	}
    317  10922      Jeff done:
    318  10922      Jeff 	zilog->zl_parse_error = error;
    319  10922      Jeff 	zilog->zl_parse_blk_seq = max_blk_seq;
    320  10922      Jeff 	zilog->zl_parse_lr_seq = max_lr_seq;
    321  10922      Jeff 	zilog->zl_parse_blk_count = blk_count;
    322  10922      Jeff 	zilog->zl_parse_lr_count = lr_count;
    323   1807   bonwick 
    324  10922      Jeff 	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
    325  10922      Jeff 	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
    326  10922      Jeff 
    327  10922      Jeff 	zil_bp_tree_fini(zilog);
    328  10922      Jeff 	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
    329  10922      Jeff 
    330  10922      Jeff 	return (error);
    331  10922      Jeff }
    332  10922      Jeff 
    333  10922      Jeff static int
    334  10922      Jeff zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
    335  10922      Jeff {
    336  10922      Jeff 	/*
    337  10922      Jeff 	 * Claim log block if not already committed and not already claimed.
    338  10922      Jeff 	 * If tx == NULL, just verify that the block is claimable.
    339  10922      Jeff 	 */
    340  10922      Jeff 	if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
    341  10922      Jeff 		return (0);
    342  10922      Jeff 
    343  10922      Jeff 	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
    344  10922      Jeff 	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
    345  10922      Jeff 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
    346  10922      Jeff }
    347  10922      Jeff 
    348  10922      Jeff static int
    349  10922      Jeff zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
    350  10922      Jeff {
    351  10922      Jeff 	lr_write_t *lr = (lr_write_t *)lrc;
    352  10922      Jeff 	int error;
    353  10922      Jeff 
    354  10922      Jeff 	if (lrc->lrc_txtype != TX_WRITE)
    355  10922      Jeff 		return (0);
    356  10922      Jeff 
    357  10922      Jeff 	/*
    358  10922      Jeff 	 * If the block is not readable, don't claim it.  This can happen
    359  10922      Jeff 	 * in normal operation when a log block is written to disk before
    360  10922      Jeff 	 * some of the dmu_sync() blocks it points to.  In this case, the
    361  10922      Jeff 	 * transaction cannot have been committed to anyone (we would have
    362  10922      Jeff 	 * waited for all writes to be stable first), so it is semantically
    363  10922      Jeff 	 * correct to declare this the end of the log.
    364  10922      Jeff 	 */
    365  10922      Jeff 	if (lr->lr_blkptr.blk_birth >= first_txg &&
    366  10922      Jeff 	    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
    367  10922      Jeff 		return (error);
    368  10922      Jeff 
    369  10922      Jeff 	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
    370    789    ahrens }
    371    789    ahrens 
    372    789    ahrens /* ARGSUSED */
    373  10922      Jeff static int
    374  10922      Jeff zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
    375    789    ahrens {
    376  10922      Jeff 	zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
    377    789    ahrens 
    378  10922      Jeff 	return (0);
    379    789    ahrens }
    380    789    ahrens 
    381  10922      Jeff static int
    382    789    ahrens zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
    383    789    ahrens {
    384  10922      Jeff 	lr_write_t *lr = (lr_write_t *)lrc;
    385  10922      Jeff 	blkptr_t *bp = &lr->lr_blkptr;
    386  10922      Jeff 
    387    789    ahrens 	/*
    388    789    ahrens 	 * If we previously claimed it, we need to free it.
    389    789    ahrens 	 */
    390  10922      Jeff 	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
    391  10922      Jeff 	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
    392  10922      Jeff 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
    393  10922      Jeff 
    394  10922      Jeff 	return (0);
    395    789    ahrens }
    396    789    ahrens 
    397    789    ahrens /*
    398    789    ahrens  * Create an on-disk intent log.
    399    789    ahrens  */
    400    789    ahrens static void
    401    789    ahrens zil_create(zilog_t *zilog)
    402    789    ahrens {
    403   1807   bonwick 	const zil_header_t *zh = zilog->zl_header;
    404    789    ahrens 	lwb_t *lwb;
    405   1807   bonwick 	uint64_t txg = 0;
    406   1807   bonwick 	dmu_tx_t *tx = NULL;
    407    789    ahrens 	blkptr_t blk;
    408   1807   bonwick 	int error = 0;
    409    789    ahrens 
    410    789    ahrens 	/*
    411   1807   bonwick 	 * Wait for any previous destroy to complete.
    412    789    ahrens 	 */
    413   1807   bonwick 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
    414   1807   bonwick 
    415   1807   bonwick 	ASSERT(zh->zh_claim_txg == 0);
    416   1807   bonwick 	ASSERT(zh->zh_replay_seq == 0);
    417   1807   bonwick 
    418   1807   bonwick 	blk = zh->zh_log;
    419    789    ahrens 
    420    789    ahrens 	/*
    421   8109      Neil 	 * If we don't already have an initial log block or we have one
    422   8109      Neil 	 * but it's the wrong endianness then allocate one.
    423    789    ahrens 	 */
    424   8109      Neil 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
    425   1807   bonwick 		tx = dmu_tx_create(zilog->zl_os);
    426  10922      Jeff 		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
    427   1807   bonwick 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
    428   1807   bonwick 		txg = dmu_tx_get_txg(tx);
    429   8109      Neil 
    430   8109      Neil 		if (!BP_IS_HOLE(&blk)) {
    431  10922      Jeff 			zio_free_zil(zilog->zl_spa, txg, &blk);
    432   8109      Neil 			BP_ZERO(&blk);
    433   8109      Neil 		}
    434   1807   bonwick 
    435  10922      Jeff 		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
    436  10922      Jeff 		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
    437   1807   bonwick 
    438   1807   bonwick 		if (error == 0)
    439   1807   bonwick 			zil_init_log_chain(zilog, &blk);
    440   1362    perrin 	}
    441   1807   bonwick 
    442   1807   bonwick 	/*
    443   1807   bonwick 	 * Allocate a log write buffer (lwb) for the first log block.
    444   1807   bonwick 	 */
    445    789    ahrens 	if (error == 0) {
    446    789    ahrens 		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
    447    789    ahrens 		lwb->lwb_zilog = zilog;
    448    789    ahrens 		lwb->lwb_blk = blk;
    449    789    ahrens 		lwb->lwb_nused = 0;
    450    789    ahrens 		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
    451    789    ahrens 		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
    452    789    ahrens 		lwb->lwb_max_txg = txg;
    453   2237    maybee 		lwb->lwb_zio = NULL;
    454  10922      Jeff 		lwb->lwb_tx = NULL;
    455   2237    maybee 
    456    789    ahrens 		mutex_enter(&zilog->zl_lock);
    457    789    ahrens 		list_insert_tail(&zilog->zl_lwb_list, lwb);
    458    789    ahrens 		mutex_exit(&zilog->zl_lock);
    459    789    ahrens 	}
    460    789    ahrens 
    461   1807   bonwick 	/*
    462   1807   bonwick 	 * If we just allocated the first log block, commit our transaction
    463   1807   bonwick 	 * and wait for zil_sync() to stuff the block poiner into zh_log.
    464   1807   bonwick 	 * (zh is part of the MOS, so we cannot modify it in open context.)
    465   1807   bonwick 	 */
    466   1807   bonwick 	if (tx != NULL) {
    467   1807   bonwick 		dmu_tx_commit(tx);
    468   1362    perrin 		txg_wait_synced(zilog->zl_dmu_pool, txg);
    469   1807   bonwick 	}
    470   1807   bonwick 
    471   1807   bonwick 	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
    472    789    ahrens }
    473    789    ahrens 
    474    789    ahrens /*
    475    789    ahrens  * In one tx, free all log blocks and clear the log header.
    476   1807   bonwick  * If keep_first is set, then we're replaying a log with no content.
    477   1807   bonwick  * We want to keep the first block, however, so that the first
    478   1807   bonwick  * synchronous transaction doesn't require a txg_wait_synced()
    479   1807   bonwick  * in zil_create().  We don't need to txg_wait_synced() here either
    480   1807   bonwick  * when keep_first is set, because both zil_create() and zil_destroy()
    481   1807   bonwick  * will wait for any in-progress destroys to complete.
    482    789    ahrens  */
    483    789    ahrens void
    484   1807   bonwick zil_destroy(zilog_t *zilog, boolean_t keep_first)
    485    789    ahrens {
    486   1807   bonwick 	const zil_header_t *zh = zilog->zl_header;
    487   1807   bonwick 	lwb_t *lwb;
    488    789    ahrens 	dmu_tx_t *tx;
    489    789    ahrens 	uint64_t txg;
    490    789    ahrens 
    491   1807   bonwick 	/*
    492   1807   bonwick 	 * Wait for any previous destroy to complete.
    493   1807   bonwick 	 */
    494   1807   bonwick 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
    495    789    ahrens 
    496  10922      Jeff 	zilog->zl_old_header = *zh;		/* debugging aid */
    497  10922      Jeff 
    498   1807   bonwick 	if (BP_IS_HOLE(&zh->zh_log))
    499    789    ahrens 		return;
    500    789    ahrens 
    501    789    ahrens 	tx = dmu_tx_create(zilog->zl_os);
    502  10922      Jeff 	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
    503    789    ahrens 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
    504    789    ahrens 	txg = dmu_tx_get_txg(tx);
    505    789    ahrens 
    506   1807   bonwick 	mutex_enter(&zilog->zl_lock);
    507   5223    perrin 
    508   1807   bonwick 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
    509    789    ahrens 	zilog->zl_destroy_txg = txg;
    510  10922      Jeff 	zilog->zl_keep_first = keep_first;
    511   1807   bonwick 
    512   1807   bonwick 	if (!list_is_empty(&zilog->zl_lwb_list)) {
    513   1807   bonwick 		ASSERT(zh->zh_claim_txg == 0);
    514  10922      Jeff 		ASSERT(!keep_first);
    515   1807   bonwick 		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
    516   1807   bonwick 			list_remove(&zilog->zl_lwb_list, lwb);
    517   1807   bonwick 			if (lwb->lwb_buf != NULL)
    518   1807   bonwick 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
    519  10922      Jeff 			zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
    520   1807   bonwick 			kmem_cache_free(zil_lwb_cache, lwb);
    521   1807   bonwick 		}
    522  10922      Jeff 	} else if (!keep_first) {
    523  10922      Jeff 		(void) zil_parse(zilog, zil_free_log_block,
    524  10922      Jeff 		    zil_free_log_record, tx, zh->zh_claim_txg);
    525   1807   bonwick 	}
    526   2638    perrin 	mutex_exit(&zilog->zl_lock);
    527    789    ahrens 
    528    789    ahrens 	dmu_tx_commit(tx);
    529   8989      Neil }
    530   8989      Neil 
    531   2199    ahrens int
    532    789    ahrens zil_claim(char *osname, void *txarg)
    533    789    ahrens {
    534    789    ahrens 	dmu_tx_t *tx = txarg;
    535    789    ahrens 	uint64_t first_txg = dmu_tx_get_txg(tx);
    536    789    ahrens 	zilog_t *zilog;
    537    789    ahrens 	zil_header_t *zh;
    538    789    ahrens 	objset_t *os;
    539    789    ahrens 	int error;
    540    789    ahrens 
    541  10298   Matthew 	error = dmu_objset_hold(osname, FTAG, &os);
    542    789    ahrens 	if (error) {
    543   7294    perrin 		cmn_err(CE_WARN, "can't open objset for %s", osname);
    544   2199    ahrens 		return (0);
    545    789    ahrens 	}
    546    789    ahrens 
    547    789    ahrens 	zilog = dmu_objset_zil(os);
    548   1807   bonwick 	zh = zil_header_in_syncing_context(zilog);
    549   8989      Neil 
    550  10922      Jeff 	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
    551   9701    George 		if (!BP_IS_HOLE(&zh->zh_log))
    552  10922      Jeff 			zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
    553   9701    George 		BP_ZERO(&zh->zh_log);
    554   9701    George 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
    555  10921       Tim 		dmu_objset_rele(os, FTAG);
    556  10921       Tim 		return (0);
    557   9701    George 	}
    558    789    ahrens 
    559    789    ahrens 	/*
    560   1807   bonwick 	 * Claim all log blocks if we haven't already done so, and remember
    561   1807   bonwick 	 * the highest claimed sequence number.  This ensures that if we can
    562   1807   bonwick 	 * read only part of the log now (e.g. due to a missing device),
    563   1807   bonwick 	 * but we can read the entire log later, we will not try to replay
    564   1807   bonwick 	 * or destroy beyond the last block we successfully claimed.
    565    789    ahrens 	 */
    566    789    ahrens 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
    567    789    ahrens 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
    568  10922      Jeff 		(void) zil_parse(zilog, zil_claim_log_block,
    569  10922      Jeff 		    zil_claim_log_record, tx, first_txg);
    570    789    ahrens 		zh->zh_claim_txg = first_txg;
    571  10922      Jeff 		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
    572  10922      Jeff 		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
    573  10922      Jeff 		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
    574  10922      Jeff 			zh->zh_flags |= ZIL_REPLAY_NEEDED;
    575  10922      Jeff 		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
    576    789    ahrens 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
    577    789    ahrens 	}
    578   1807   bonwick 
    579    789    ahrens 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
    580  10298   Matthew 	dmu_objset_rele(os, FTAG);
    581   7294    perrin 	return (0);
    582   7294    perrin }
    583   7294    perrin 
    584   7294    perrin /*
    585   7294    perrin  * Check the log by walking the log chain.
    586   7294    perrin  * Checksum errors are ok as they indicate the end of the chain.
    587   7294    perrin  * Any other error (no device or read failure) returns an error.
    588   7294    perrin  */
    589   7294    perrin int
    590  10922      Jeff zil_check_log_chain(char *osname, void *tx)
    591   7294    perrin {
    592   7294    perrin 	zilog_t *zilog;
    593   7294    perrin 	objset_t *os;
    594   7294    perrin 	int error;
    595  10922      Jeff 
    596  10922      Jeff 	ASSERT(tx == NULL);
    597   7294    perrin 
    598  10298   Matthew 	error = dmu_objset_hold(osname, FTAG, &os);
    599   7294    perrin 	if (error) {
    600   7294    perrin 		cmn_err(CE_WARN, "can't open objset for %s", osname);
    601   7294    perrin 		return (0);
    602   7294    perrin 	}
    603   7294    perrin 
    604   7294    perrin 	zilog = dmu_objset_zil(os);
    605   7294    perrin 
    606  10922      Jeff 	/*
    607  10922      Jeff 	 * Because tx == NULL, zil_claim_log_block() will not actually claim
    608  10922      Jeff 	 * any blocks, but just determine whether it is possible to do so.
    609  10922      Jeff 	 * In addition to checking the log chain, zil_claim_log_block()
    610  10922      Jeff 	 * will invoke zio_claim() with a done func of spa_claim_notify(),
    611  10922      Jeff 	 * which will update spa_max_claim_txg.  See spa_load() for details.
    612  10922      Jeff 	 */
    613  10922      Jeff 	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
    614  10922      Jeff 	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
    615  10922      Jeff 
    616  10298   Matthew 	dmu_objset_rele(os, FTAG);
    617  10922      Jeff 
    618  10922      Jeff 	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
    619    789    ahrens }
    620    789    ahrens 
    621   5688   bonwick static int
    622   5688   bonwick zil_vdev_compare(const void *x1, const void *x2)
    623   5688   bonwick {
    624   5875    perrin 	uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
    625   5875    perrin 	uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
    626   5688   bonwick 
    627   5688   bonwick 	if (v1 < v2)
    628   5688   bonwick 		return (-1);
    629   5688   bonwick 	if (v1 > v2)
    630   5688   bonwick 		return (1);
    631   5688   bonwick 
    632   5688   bonwick 	return (0);
    633   5688   bonwick }
    634   5688   bonwick 
    635    789    ahrens void
    636  10922      Jeff zil_add_block(zilog_t *zilog, const blkptr_t *bp)
    637    789    ahrens {
    638   5688   bonwick 	avl_tree_t *t = &zilog->zl_vdev_tree;
    639   5688   bonwick 	avl_index_t where;
    640   5688   bonwick 	zil_vdev_node_t *zv, zvsearch;
    641   5688   bonwick 	int ndvas = BP_GET_NDVAS(bp);
    642   5688   bonwick 	int i;
    643    789    ahrens 
    644   2986  ek110237 	if (zfs_nocacheflush)
    645    789    ahrens 		return;
    646    789    ahrens 
    647   5688   bonwick 	ASSERT(zilog->zl_writer);
    648   5688   bonwick 
    649   5688   bonwick 	/*
    650   5688   bonwick 	 * Even though we're zl_writer, we still need a lock because the
    651   5688   bonwick 	 * zl_get_data() callbacks may have dmu_sync() done callbacks
    652   5688   bonwick 	 * that will run concurrently.
    653   5688   bonwick 	 */
    654   5688   bonwick 	mutex_enter(&zilog->zl_vdev_lock);
    655   5688   bonwick 	for (i = 0; i < ndvas; i++) {
    656   5688   bonwick 		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
    657   5688   bonwick 		if (avl_find(t, &zvsearch, &where) == NULL) {
    658   5688   bonwick 			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
    659   5688   bonwick 			zv->zv_vdev = zvsearch.zv_vdev;
    660   5688   bonwick 			avl_insert(t, zv, where);
    661   3063    perrin 		}
    662   3063    perrin 	}
    663   5688   bonwick 	mutex_exit(&zilog->zl_vdev_lock);
    664   3063    perrin }
    665   3063    perrin 
    666    789    ahrens void
    667   2638    perrin zil_flush_vdevs(zilog_t *zilog)
    668    789    ahrens {
    669   3063    perrin 	spa_t *spa = zilog->zl_spa;
    670   5688   bonwick 	avl_tree_t *t = &zilog->zl_vdev_tree;
    671   5688   bonwick 	void *cookie = NULL;
    672   5688   bonwick 	zil_vdev_node_t *zv;
    673   5688   bonwick 	zio_t *zio;
    674    789    ahrens 
    675   3063    perrin 	ASSERT(zilog->zl_writer);
    676    789    ahrens 
    677   5688   bonwick 	/*
    678   5688   bonwick 	 * We don't need zl_vdev_lock here because we're the zl_writer,
    679   5688   bonwick 	 * and all zl_get_data() callbacks are done.
    680   5688   bonwick 	 */
    681   5688   bonwick 	if (avl_numnodes(t) == 0)
    682   5688   bonwick 		return;
    683   5688   bonwick 
    684   7754      Jeff 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
    685   5688   bonwick 
    686   7754      Jeff 	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
    687   5688   bonwick 
    688   5688   bonwick 	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
    689   5688   bonwick 		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
    690   5688   bonwick 		if (vd != NULL)
    691   5688   bonwick 			zio_flush(zio, vd);
    692   5688   bonwick 		kmem_free(zv, sizeof (*zv));
    693   3063    perrin 	}
    694    789    ahrens 
    695    789    ahrens 	/*
    696    789    ahrens 	 * Wait for all the flushes to complete.  Not all devices actually
    697    789    ahrens 	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
    698    789    ahrens 	 */
    699   5688   bonwick 	(void) zio_wait(zio);
    700   5688   bonwick 
    701   7754      Jeff 	spa_config_exit(spa, SCL_STATE, FTAG);
    702    789    ahrens }
    703    789    ahrens 
    704    789    ahrens /*
    705    789    ahrens  * Function called when a log block write completes
    706    789    ahrens  */
    707    789    ahrens static void
    708    789    ahrens zil_lwb_write_done(zio_t *zio)
    709    789    ahrens {
    710    789    ahrens 	lwb_t *lwb = zio->io_private;
    711    789    ahrens 	zilog_t *zilog = lwb->lwb_zilog;
    712  10922      Jeff 	dmu_tx_t *tx = lwb->lwb_tx;
    713   7754      Jeff 
    714   7754      Jeff 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
    715   7754      Jeff 	ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG);
    716   7754      Jeff 	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
    717   7754      Jeff 	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
    718   7754      Jeff 	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
    719   7754      Jeff 	ASSERT(!BP_IS_GANG(zio->io_bp));
    720   7754      Jeff 	ASSERT(!BP_IS_HOLE(zio->io_bp));
    721   7754      Jeff 	ASSERT(zio->io_bp->blk_fill == 0);
    722    789    ahrens 
    723    789    ahrens 	/*
    724   9493      Neil 	 * Ensure the lwb buffer pointer is cleared before releasing
    725   9493      Neil 	 * the txg. If we have had an allocation failure and
    726   9493      Neil 	 * the txg is waiting to sync then we want want zil_sync()
    727   9493      Neil 	 * to remove the lwb so that it's not picked up as the next new
    728   9493      Neil 	 * one in zil_commit_writer(). zil_sync() will only remove
    729   9493      Neil 	 * the lwb if lwb_buf is null.
    730    789    ahrens 	 */
    731    789    ahrens 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
    732    789    ahrens 	mutex_enter(&zilog->zl_lock);
    733    789    ahrens 	lwb->lwb_buf = NULL;
    734  10922      Jeff 	lwb->lwb_tx = NULL;
    735  10922      Jeff 	mutex_exit(&zilog->zl_lock);
    736   9493      Neil 
    737   9493      Neil 	/*
    738   9493      Neil 	 * Now that we've written this log block, we have a stable pointer
    739   9493      Neil 	 * to the next block in the chain, so it's OK to let the txg in
    740  10922      Jeff 	 * which we allocated the next block sync.
    741   9493      Neil 	 */
    742  10922      Jeff 	dmu_tx_commit(tx);
    743    789    ahrens }
    744    789    ahrens 
    745    789    ahrens /*
    746   2237    maybee  * Initialize the io for a log block.
    747   2237    maybee  */
    748   2237    maybee static void
    749   2237    maybee zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
    750   2237    maybee {
    751   2237    maybee 	zbookmark_t zb;
    752   2237    maybee 
    753  10922      Jeff 	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
    754  10922      Jeff 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
    755  10922      Jeff 	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
    756   2237    maybee 
    757   2638    perrin 	if (zilog->zl_root_zio == NULL) {
    758   2638    perrin 		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
    759   2638    perrin 		    ZIO_FLAG_CANFAIL);
    760   2638    perrin 	}
    761   3063    perrin 	if (lwb->lwb_zio == NULL) {
    762   3063    perrin 		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
    763   9701    George 		    0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz,
    764   9701    George 		    zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
    765  10685    George 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
    766   3063    perrin 	}
    767   2237    maybee }
    768   2237    maybee 
    769   2237    maybee /*
    770  10879      Neil  * Use the slog as long as the logbias is 'latency' and the current commit size
    771  10879      Neil  * is less than the limit or the total list size is less than 2X the limit.
    772  10879      Neil  * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
    773  10879      Neil  */
    774  10879      Neil uint64_t zil_slog_limit = 1024 * 1024;
    775  10879      Neil #define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
    776  10879      Neil 	(((zilog)->zl_cur_used < zil_slog_limit) || \
    777  10879      Neil 	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
    778  10879      Neil 
    779  10879      Neil /*
    780    789    ahrens  * Start a log block write and advance to the next log block.
    781    789    ahrens  * Calls are serialized.
    782    789    ahrens  */
    783    789    ahrens static lwb_t *
    784    789    ahrens zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
    785    789    ahrens {
    786    789    ahrens 	lwb_t *nlwb;
    787    789    ahrens 	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
    788   1807   bonwick 	spa_t *spa = zilog->zl_spa;
    789   1807   bonwick 	blkptr_t *bp = &ztp->zit_next_blk;
    790  10922      Jeff 	dmu_tx_t *tx;
    791    789    ahrens 	uint64_t txg;
    792    789    ahrens 	uint64_t zil_blksz;
    793    789    ahrens 	int error;
    794    789    ahrens 
    795    789    ahrens 	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
    796    789    ahrens 
    797    789    ahrens 	/*
    798    789    ahrens 	 * Allocate the next block and save its address in this block
    799    789    ahrens 	 * before writing it in order to establish the log chain.
    800    789    ahrens 	 * Note that if the allocation of nlwb synced before we wrote
    801    789    ahrens 	 * the block that points at it (lwb), we'd leak it if we crashed.
    802  10922      Jeff 	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
    803  10922      Jeff 	 * We dirty the dataset to ensure that zil_sync() will be called
    804  10922      Jeff 	 * to clean up in the event of allocation failure or I/O failure.
    805    789    ahrens 	 */
    806  10922      Jeff 	tx = dmu_tx_create(zilog->zl_os);
    807  10922      Jeff 	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
    808  10922      Jeff 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
    809  10922      Jeff 	txg = dmu_tx_get_txg(tx);
    810  10922      Jeff 
    811  10922      Jeff 	lwb->lwb_tx = tx;
    812    789    ahrens 
    813    789    ahrens 	/*
    814   1141    perrin 	 * Pick a ZIL blocksize. We request a size that is the
    815   1141    perrin 	 * maximum of the previous used size, the current used size and
    816   1141    perrin 	 * the amount waiting in the queue.
    817    789    ahrens 	 */
    818   2237    maybee 	zil_blksz = MAX(zilog->zl_prev_used,
    819   2237    maybee 	    zilog->zl_cur_used + sizeof (*ztp));
    820   1141    perrin 	zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
    821   1842    perrin 	zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
    822   1141    perrin 	if (zil_blksz > ZIL_MAX_BLKSZ)
    823   1141    perrin 		zil_blksz = ZIL_MAX_BLKSZ;
    824    789    ahrens 
    825   3063    perrin 	BP_ZERO(bp);
    826   3063    perrin 	/* pass the old blkptr in order to spread log blocks across devs */
    827  10922      Jeff 	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
    828  10879      Neil 	    USE_SLOG(zilog));
    829    789    ahrens 	if (error) {
    830   1544  eschrock 		/*
    831  10922      Jeff 		 * Since we've just experienced an allocation failure,
    832   3668   gw25295 		 * terminate the current lwb and send it on its way.
    833   3668   gw25295 		 */
    834   3668   gw25295 		ztp->zit_pad = 0;
    835   3668   gw25295 		ztp->zit_nused = lwb->lwb_nused;
    836   3668   gw25295 		ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
    837   3668   gw25295 		zio_nowait(lwb->lwb_zio);
    838   3668   gw25295 
    839   3668   gw25295 		/*
    840   1544  eschrock 		 * By returning NULL the caller will call tx_wait_synced()
    841   1544  eschrock 		 */
    842    789    ahrens 		return (NULL);
    843    789    ahrens 	}
    844    789    ahrens 
    845   1807   bonwick 	ASSERT3U(bp->blk_birth, ==, txg);
    846   1544  eschrock 	ztp->zit_pad = 0;
    847    789    ahrens 	ztp->zit_nused = lwb->lwb_nused;
    848    789    ahrens 	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
    849   1807   bonwick 	bp->blk_cksum = lwb->lwb_blk.blk_cksum;
    850   1807   bonwick 	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
    851    789    ahrens 
    852    789    ahrens 	/*
    853    789    ahrens 	 * Allocate a new log write buffer (lwb).
    854    789    ahrens 	 */
    855    789    ahrens 	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
    856    789    ahrens 	nlwb->lwb_zilog = zilog;
    857   1807   bonwick 	nlwb->lwb_blk = *bp;
    858    789    ahrens 	nlwb->lwb_nused = 0;
    859    789    ahrens 	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
    860    789    ahrens 	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
    861    789    ahrens 	nlwb->lwb_max_txg = txg;
    862   2237    maybee 	nlwb->lwb_zio = NULL;
    863  10922      Jeff 	nlwb->lwb_tx = NULL;
    864    789    ahrens 
    865    789    ahrens 	/*
    866   3063    perrin 	 * Put new lwb at the end of the log chain
    867    789    ahrens 	 */
    868    789    ahrens 	mutex_enter(&zilog->zl_lock);
    869    789    ahrens 	list_insert_tail(&zilog->zl_lwb_list, nlwb);
    870   3063    perrin 	mutex_exit(&zilog->zl_lock);
    871   3063    perrin 
    872   5688   bonwick 	/* Record the block for later vdev flushing */
    873   5688   bonwick 	zil_add_block(zilog, &lwb->lwb_blk);
    874    789    ahrens 
    875    789    ahrens 	/*
    876   2237    maybee 	 * kick off the write for the old log block
    877    789    ahrens 	 */
    878   3063    perrin 	ASSERT(lwb->lwb_zio);
    879   2237    maybee 	zio_nowait(lwb->lwb_zio);
    880    789    ahrens 
    881    789    ahrens 	return (nlwb);
    882    789    ahrens }
    883    789    ahrens 
    884    789    ahrens static lwb_t *
    885    789    ahrens zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
    886    789    ahrens {
    887    789    ahrens 	lr_t *lrc = &itx->itx_lr; /* common log record */
    888  10922      Jeff 	lr_write_t *lrw = (lr_write_t *)lrc;
    889  10922      Jeff 	char *lr_buf;
    890    789    ahrens 	uint64_t txg = lrc->lrc_txg;
    891    789    ahrens 	uint64_t reclen = lrc->lrc_reclen;
    892  10922      Jeff 	uint64_t dlen = 0;
    893    789    ahrens 
    894    789    ahrens 	if (lwb == NULL)
    895    789    ahrens 		return (NULL);
    896  10922      Jeff 
    897    789    ahrens 	ASSERT(lwb->lwb_buf != NULL);
    898    789    ahrens 
    899   2237    maybee 	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
    900   2237    maybee 		dlen = P2ROUNDUP_TYPED(
    901  10922      Jeff 		    lrw->lr_length, sizeof (uint64_t), uint64_t);
    902   1669    perrin 
    903   1669    perrin 	zilog->zl_cur_used += (reclen + dlen);
    904   1669    perrin 
    905   3063    perrin 	zil_lwb_write_init(zilog, lwb);
    906   3063    perrin 
    907   1669    perrin 	/*
    908   1669    perrin 	 * If this record won't fit in the current log block, start a new one.
    909   1669    perrin 	 */
    910   1669    perrin 	if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
    911   1669    perrin 		lwb = zil_lwb_write_start(zilog, lwb);
    912   2237    maybee 		if (lwb == NULL)
    913   1669    perrin 			return (NULL);
    914   3063    perrin 		zil_lwb_write_init(zilog, lwb);
    915   1669    perrin 		ASSERT(lwb->lwb_nused == 0);
    916   1669    perrin 		if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
    917   1669    perrin 			txg_wait_synced(zilog->zl_dmu_pool, txg);
    918    789    ahrens 			return (lwb);
    919    789    ahrens 		}
    920    789    ahrens 	}
    921   1141    perrin 
    922  10922      Jeff 	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
    923  10922      Jeff 	bcopy(lrc, lr_buf, reclen);
    924  10922      Jeff 	lrc = (lr_t *)lr_buf;
    925  10922      Jeff 	lrw = (lr_write_t *)lrc;
    926   2237    maybee 
    927   2237    maybee 	/*
    928   2237    maybee 	 * If it's a write, fetch the data or get its blkptr as appropriate.
    929   2237    maybee 	 */
    930   2237    maybee 	if (lrc->lrc_txtype == TX_WRITE) {
    931   2237    maybee 		if (txg > spa_freeze_txg(zilog->zl_spa))
    932   2237    maybee 			txg_wait_synced(zilog->zl_dmu_pool, txg);
    933   2237    maybee 		if (itx->itx_wr_state != WR_COPIED) {
    934   2237    maybee 			char *dbuf;
    935   2237    maybee 			int error;
    936   2237    maybee 
    937   2237    maybee 			if (dlen) {
    938   2237    maybee 				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
    939  10922      Jeff 				dbuf = lr_buf + reclen;
    940  10922      Jeff 				lrw->lr_common.lrc_reclen += dlen;
    941   2237    maybee 			} else {
    942   2237    maybee 				ASSERT(itx->itx_wr_state == WR_INDIRECT);
    943   2237    maybee 				dbuf = NULL;
    944   2237    maybee 			}
    945   2237    maybee 			error = zilog->zl_get_data(
    946  10922      Jeff 			    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
    947  10209      Mark 			if (error == EIO) {
    948  10209      Mark 				txg_wait_synced(zilog->zl_dmu_pool, txg);
    949  10209      Mark 				return (lwb);
    950  10209      Mark 			}
    951   2237    maybee 			if (error) {
    952   2237    maybee 				ASSERT(error == ENOENT || error == EEXIST ||
    953   2237    maybee 				    error == EALREADY);
    954   2237    maybee 				return (lwb);
    955   2237    maybee 			}
    956   2237    maybee 		}
    957   1669    perrin 	}
    958   2237    maybee 
    959  10922      Jeff 	/*
    960  10922      Jeff 	 * We're actually making an entry, so update lrc_seq to be the
    961  10922      Jeff 	 * log record sequence number.  Note that this is generally not
    962  10922      Jeff 	 * equal to the itx sequence number because not all transactions
    963  10922      Jeff 	 * are synchronous, and sometimes spa_sync() gets there first.
    964  10922      Jeff 	 */
    965  10922      Jeff 	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
    966   2237    maybee 	lwb->lwb_nused += reclen + dlen;
    967    789    ahrens 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
    968    789    ahrens 	ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
    969    789    ahrens 	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
    970    789    ahrens 
    971    789    ahrens 	return (lwb);
    972    789    ahrens }
    973    789    ahrens 
    974    789    ahrens itx_t *
    975   5331       amw zil_itx_create(uint64_t txtype, size_t lrsize)
    976    789    ahrens {
    977    789    ahrens 	itx_t *itx;
    978    789    ahrens 
    979   1842    perrin 	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
    980    789    ahrens 
    981    789    ahrens 	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
    982    789    ahrens 	itx->itx_lr.lrc_txtype = txtype;
    983    789    ahrens 	itx->itx_lr.lrc_reclen = lrsize;
    984   6101    perrin 	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
    985    789    ahrens 	itx->itx_lr.lrc_seq = 0;	/* defensive */
    986    789    ahrens 
    987    789    ahrens 	return (itx);
    988    789    ahrens }
    989    789    ahrens 
    990  10922      Jeff void
    991  10922      Jeff zil_itx_destroy(itx_t *itx)
    992  10922      Jeff {
    993  10922      Jeff 	kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
    994  10922      Jeff }
    995  10922      Jeff 
    996    789    ahrens uint64_t
    997    789    ahrens zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
    998    789    ahrens {
    999    789    ahrens 	uint64_t seq;
   1000    789    ahrens 
   1001    789    ahrens 	ASSERT(itx->itx_lr.lrc_seq == 0);
   1002  10922      Jeff 	ASSERT(!zilog->zl_replay);
   1003    789    ahrens 
   1004    789    ahrens 	mutex_enter(&zilog->zl_lock);
   1005    789    ahrens 	list_insert_tail(&zilog->zl_itx_list, itx);
   1006   6101    perrin 	zilog->zl_itx_list_sz += itx->itx_sod;
   1007    789    ahrens 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
   1008    789    ahrens 	itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
   1009    789    ahrens 	mutex_exit(&zilog->zl_lock);
   1010    789    ahrens 
   1011    789    ahrens 	return (seq);
   1012    789    ahrens }
   1013    789    ahrens 
   1014    789    ahrens /*
   1015    789    ahrens  * Free up all in-memory intent log transactions that have now been synced.
   1016    789    ahrens  */
   1017    789    ahrens static void
   1018    789    ahrens zil_itx_clean(zilog_t *zilog)
   1019    789    ahrens {
   1020    789    ahrens 	uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
   1021    789    ahrens 	uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
   1022   3778  johansen 	list_t clean_list;
   1023    789    ahrens 	itx_t *itx;
   1024   3778  johansen 
   1025   3778  johansen 	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
   1026    789    ahrens 
   1027    789    ahrens 	mutex_enter(&zilog->zl_lock);
   1028   2638    perrin 	/* wait for a log writer to finish walking list */
   1029   2638    perrin 	while (zilog->zl_writer) {
   1030   2638    perrin 		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
   1031   2638    perrin 	}
   1032   3778  johansen 
   1033   3778  johansen 	/*
   1034   3778  johansen 	 * Move the sync'd log transactions to a separate list so we can call
   1035   3778  johansen 	 * kmem_free without holding the zl_lock.
   1036   3778  johansen 	 *
   1037   3778  johansen 	 * There is no need to set zl_writer as we don't drop zl_lock here
   1038   3778  johansen 	 */
   1039    789    ahrens 	while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
   1040    789    ahrens 	    itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
   1041    789    ahrens 		list_remove(&zilog->zl_itx_list, itx);
   1042   6101    perrin 		zilog->zl_itx_list_sz -= itx->itx_sod;
   1043   3778  johansen 		list_insert_tail(&clean_list, itx);
   1044   3778  johansen 	}
   1045   3778  johansen 	cv_broadcast(&zilog->zl_cv_writer);
   1046   3778  johansen 	mutex_exit(&zilog->zl_lock);
   1047   3778  johansen 
   1048   3778  johansen 	/* destroy sync'd log transactions */
   1049   3778  johansen 	while ((itx = list_head(&clean_list)) != NULL) {
   1050   3778  johansen 		list_remove(&clean_list, itx);
   1051  10922      Jeff 		zil_itx_destroy(itx);
   1052    789    ahrens 	}
   1053   3778  johansen 	list_destroy(&clean_list);
   1054    789    ahrens }
   1055    789    ahrens 
   1056   2638    perrin /*
   1057   3063    perrin  * If there are any in-memory intent log transactions which have now been
   1058   3063    perrin  * synced then start up a taskq to free them.
   1059   2638    perrin  */
   1060    789    ahrens void
   1061    789    ahrens zil_clean(zilog_t *zilog)
   1062    789    ahrens {
   1063   3063    perrin 	itx_t *itx;
   1064   3063    perrin 
   1065    789    ahrens 	mutex_enter(&zilog->zl_lock);
   1066   3063    perrin 	itx = list_head(&zilog->zl_itx_list);
   1067   3063    perrin 	if ((itx != NULL) &&
   1068   3063    perrin 	    (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
   1069    789    ahrens 		(void) taskq_dispatch(zilog->zl_clean_taskq,
   1070  10879      Neil 		    (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP);
   1071   3063    perrin 	}
   1072    789    ahrens 	mutex_exit(&zilog->zl_lock);
   1073    789    ahrens }
   1074    789    ahrens 
   1075   7754      Jeff static void
   1076   2638    perrin zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
   1077    789    ahrens {
   1078    789    ahrens 	uint64_t txg;
   1079   3063    perrin 	uint64_t commit_seq = 0;
   1080  10922      Jeff 	itx_t *itx, *itx_next;
   1081    789    ahrens 	lwb_t *lwb;
   1082    789    ahrens 	spa_t *spa;
   1083  10922      Jeff 	int error = 0;
   1084    789    ahrens 
   1085   2638    perrin 	zilog->zl_writer = B_TRUE;
   1086   7754      Jeff 	ASSERT(zilog->zl_root_zio == NULL);
   1087    789    ahrens 	spa = zilog->zl_spa;
   1088    789    ahrens 
   1089    789    ahrens 	if (zilog->zl_suspend) {
   1090    789    ahrens 		lwb = NULL;
   1091    789    ahrens 	} else {
   1092    789    ahrens 		lwb = list_tail(&zilog->zl_lwb_list);
   1093    789    ahrens 		if (lwb == NULL) {
   1094   2638    perrin 			/*
   1095   2638    perrin 			 * Return if there's nothing to flush before we
   1096   2638    perrin 			 * dirty the fs by calling zil_create()
   1097   2638    perrin 			 */
   1098   2638    perrin 			if (list_is_empty(&zilog->zl_itx_list)) {
   1099   2638    perrin 				zilog->zl_writer = B_FALSE;
   1100   2638    perrin 				return;
   1101   2638    perrin 			}
   1102    789    ahrens 			mutex_exit(&zilog->zl_lock);
   1103    789    ahrens 			zil_create(zilog);
   1104    789    ahrens 			mutex_enter(&zilog->zl_lock);
   1105    789    ahrens 			lwb = list_tail(&zilog->zl_lwb_list);
   1106    789    ahrens 		}
   1107    789    ahrens 	}
   1108    789    ahrens 
   1109   3063    perrin 	/* Loop through in-memory log transactions filling log blocks. */
   1110   2638    perrin 	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
   1111  10922      Jeff 
   1112  10922      Jeff 	for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) {
   1113   2638    perrin 		/*
   1114  10922      Jeff 		 * Save the next pointer.  Even though we drop zl_lock below,
   1115  10922      Jeff 		 * all threads that can remove itx list entries (other writers
   1116  10922      Jeff 		 * and zil_itx_clean()) can't do so until they have zl_writer.
   1117   2638    perrin 		 */
   1118  10922      Jeff 		itx_next = list_next(&zilog->zl_itx_list, itx);
   1119  10922      Jeff 
   1120  10922      Jeff 		/*
   1121  10922      Jeff 		 * Determine whether to push this itx.
   1122  10922      Jeff 		 * Push all transactions related to specified foid and
   1123  10922      Jeff 		 * all other transactions except those that can be logged
   1124  10922      Jeff 		 * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL)
   1125  10922      Jeff 		 * for all other files.
   1126  10922      Jeff 		 *
   1127  10922      Jeff 		 * If foid == 0 (meaning "push all foids") or
   1128  10922      Jeff 		 * itx->itx_sync is set (meaning O_[D]SYNC), push regardless.
   1129  10922      Jeff 		 */
   1130  10922      Jeff 		if (foid != 0 && !itx->itx_sync &&
   1131  10922      Jeff 		    TX_OOO(itx->itx_lr.lrc_txtype) &&
   1132  10922      Jeff 		    ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid)
   1133  10922      Jeff 			continue; /* skip this record */
   1134    789    ahrens 
   1135    789    ahrens 		if ((itx->itx_lr.lrc_seq > seq) &&
   1136   2638    perrin 		    ((lwb == NULL) || (lwb->lwb_nused == 0) ||
   1137  10922      Jeff 		    (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb))))
   1138    789    ahrens 			break;
   1139    789    ahrens 
   1140    789    ahrens 		list_remove(&zilog->zl_itx_list, itx);
   1141   6101    perrin 		zilog->zl_itx_list_sz -= itx->itx_sod;
   1142  10922      Jeff 
   1143   3063    perrin 		mutex_exit(&zilog->zl_lock);
   1144  10922      Jeff 
   1145    789    ahrens 		txg = itx->itx_lr.lrc_txg;
   1146    789    ahrens 		ASSERT(txg);
   1147    789    ahrens 
   1148    789    ahrens 		if (txg > spa_last_synced_txg(spa) ||
   1149    789    ahrens 		    txg > spa_freeze_txg(spa))
   1150    789    ahrens 			lwb = zil_lwb_commit(zilog, itx, lwb);
   1151  10922      Jeff 
   1152  10922      Jeff 		zil_itx_destroy(itx);
   1153  10922      Jeff 
   1154    789    ahrens 		mutex_enter(&zilog->zl_lock);
   1155    789    ahrens 	}
   1156   2638    perrin 	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
   1157   3063    perrin 	/* determine commit sequence number */
   1158   3063    perrin 	itx = list_head(&zilog->zl_itx_list);
   1159   3063    perrin 	if (itx)
   1160  10922      Jeff 		commit_seq = itx->itx_lr.lrc_seq - 1;
   1161   3063    perrin 	else
   1162   3063    perrin 		commit_seq = zilog->zl_itx_seq;
   1163    789    ahrens 	mutex_exit(&zilog->zl_lock);
   1164    789    ahrens 
   1165    789    ahrens 	/* write the last block out */
   1166   3063    perrin 	if (lwb != NULL && lwb->lwb_zio != NULL)
   1167    789    ahrens 		lwb = zil_lwb_write_start(zilog, lwb);
   1168    789    ahrens 
   1169   1141    perrin 	zilog->zl_prev_used = zilog->zl_cur_used;
   1170   1141    perrin 	zilog->zl_cur_used = 0;
   1171   1141    perrin 
   1172   2638    perrin 	/*
   1173   2638    perrin 	 * Wait if necessary for the log blocks to be on stable storage.
   1174   2638    perrin 	 */
   1175   2638    perrin 	if (zilog->zl_root_zio) {
   1176   2638    perrin 		DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
   1177  10922      Jeff 		error = zio_wait(zilog->zl_root_zio);
   1178   7754      Jeff 		zilog->zl_root_zio = NULL;
   1179   2638    perrin 		DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
   1180   5688   bonwick 		zil_flush_vdevs(zilog);
   1181    789    ahrens 	}
   1182   1141    perrin 
   1183  10922      Jeff 	if (error || lwb == NULL)
   1184    789    ahrens 		txg_wait_synced(zilog->zl_dmu_pool, 0);
   1185   3063    perrin 
   1186   3063    perrin 	mutex_enter(&zilog->zl_lock);
   1187   1141    perrin 	zilog->zl_writer = B_FALSE;
   1188   3063    perrin 
   1189   3063    perrin 	ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
   1190   3063    perrin 	zilog->zl_commit_seq = commit_seq;
   1191  10922      Jeff 
   1192  10922      Jeff 	/*
   1193  10922      Jeff 	 * Remember the highest committed log sequence number for ztest.
   1194  10922      Jeff 	 * We only update this value when all the log writes succeeded,
   1195  10922      Jeff 	 * because ztest wants to ASSERT that it got the whole log chain.
   1196  10922      Jeff 	 */
   1197  10922      Jeff 	if (error == 0 && lwb != NULL)
   1198  10922      Jeff 		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
   1199   2638    perrin }
   1200   2638    perrin 
   1201   2638    perrin /*
   1202   2638    perrin  * Push zfs transactions to stable storage up to the supplied sequence number.
   1203   2638    perrin  * If foid is 0 push out all transactions, otherwise push only those
   1204   2638    perrin  * for that file or might have been used to create that file.
   1205   2638    perrin  */
   1206   2638    perrin void
   1207   2638    perrin zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
   1208   2638    perrin {
   1209   2638    perrin 	if (zilog == NULL || seq == 0)
   1210   2638    perrin 		return;
   1211   2638    perrin 
   1212   2638    perrin 	mutex_enter(&zilog->zl_lock);
   1213   2638    perrin 
   1214   2638    perrin 	seq = MIN(seq, zilog->zl_itx_seq);	/* cap seq at largest itx seq */
   1215   2638    perrin 
   1216   3063    perrin 	while (zilog->zl_writer) {
   1217   2638    perrin 		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
   1218  10922      Jeff 		if (seq <= zilog->zl_commit_seq) {
   1219   3063    perrin 			mutex_exit(&zilog->zl_lock);
   1220   3063    perrin 			return;
   1221   3063    perrin 		}
   1222   3063    perrin 	}
   1223   2638    perrin 	zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
   1224   3063    perrin 	/* wake up others waiting on the commit */
   1225   3063    perrin 	cv_broadcast(&zilog->zl_cv_writer);
   1226   3063    perrin 	mutex_exit(&zilog->zl_lock);
   1227    789    ahrens }
   1228    789    ahrens 
   1229    789    ahrens /*
   1230  10922      Jeff  * Report whether all transactions are committed.
   1231  10922      Jeff  */
   1232  10922      Jeff static boolean_t
   1233  10922      Jeff zil_is_committed(zilog_t *zilog)
   1234  10922      Jeff {
   1235  10922      Jeff 	lwb_t *lwb;
   1236  10922      Jeff 	boolean_t committed;
   1237  10922      Jeff 
   1238  10922      Jeff 	mutex_enter(&zilog->zl_lock);
   1239  10922      Jeff 
   1240  10922      Jeff 	while (zilog->zl_writer)
   1241  10922      Jeff 		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
   1242  10922      Jeff 
   1243  10922      Jeff 	if (!list_is_empty(&zilog->zl_itx_list))
   1244  10922      Jeff 		committed = B_FALSE;		/* unpushed transactions */
   1245  10922      Jeff 	else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL)
   1246  10922      Jeff 		committed = B_TRUE;		/* intent log never used */
   1247  10922      Jeff 	else if (list_next(&zilog->zl_lwb_list, lwb) != NULL)
   1248  10922      Jeff 		committed = B_FALSE;		/* zil_sync() not done yet */
   1249  10922      Jeff 	else
   1250  10922      Jeff 		committed = B_TRUE;		/* everything synced */
   1251  10922      Jeff 
   1252  10922      Jeff 	mutex_exit(&zilog->zl_lock);
   1253  10922      Jeff 	return (committed);
   1254  10922      Jeff }
   1255  10922      Jeff 
   1256  10922      Jeff /*
   1257    789    ahrens  * Called in syncing context to free committed log blocks and update log header.
   1258    789    ahrens  */
   1259    789    ahrens void
   1260    789    ahrens zil_sync(zilog_t *zilog, dmu_tx_t *tx)
   1261    789    ahrens {
   1262   1807   bonwick 	zil_header_t *zh = zil_header_in_syncing_context(zilog);
   1263    789    ahrens 	uint64_t txg = dmu_tx_get_txg(tx);
   1264    789    ahrens 	spa_t *spa = zilog->zl_spa;
   1265  10922      Jeff 	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
   1266    789    ahrens 	lwb_t *lwb;
   1267    789    ahrens 
   1268   9396   Matthew 	/*
   1269   9396   Matthew 	 * We don't zero out zl_destroy_txg, so make sure we don't try
   1270   9396   Matthew 	 * to destroy it twice.
   1271   9396   Matthew 	 */
   1272   9396   Matthew 	if (spa_sync_pass(spa) != 1)
   1273   9396   Matthew 		return;
   1274   9396   Matthew 
   1275   1807   bonwick 	mutex_enter(&zilog->zl_lock);
   1276   1807   bonwick 
   1277    789    ahrens 	ASSERT(zilog->zl_stop_sync == 0);
   1278    789    ahrens 
   1279  10922      Jeff 	if (*replayed_seq != 0) {
   1280  10922      Jeff 		ASSERT(zh->zh_replay_seq < *replayed_seq);
   1281  10922      Jeff 		zh->zh_replay_seq = *replayed_seq;
   1282  10922      Jeff 		*replayed_seq = 0;
   1283  10922      Jeff 	}
   1284    789    ahrens 
   1285    789    ahrens 	if (zilog->zl_destroy_txg == txg) {
   1286   1807   bonwick 		blkptr_t blk = zh->zh_log;
   1287   1807   bonwick 
   1288   1807   bonwick 		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
   1289   1807   bonwick 
   1290   1807   bonwick 		bzero(zh, sizeof (zil_header_t));
   1291   8227      Neil 		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
   1292   1807   bonwick 
   1293   1807   bonwick 		if (zilog->zl_keep_first) {
   1294   1807   bonwick 			/*
   1295   1807   bonwick 			 * If this block was part of log chain that couldn't
   1296   1807   bonwick 			 * be claimed because a device was missing during
   1297   1807   bonwick 			 * zil_claim(), but that device later returns,
   1298   1807   bonwick 			 * then this block could erroneously appear valid.
   1299   1807   bonwick 			 * To guard against this, assign a new GUID to the new
   1300   1807   bonwick 			 * log chain so it doesn't matter what blk points to.
   1301   1807   bonwick 			 */
   1302   1807   bonwick 			zil_init_log_chain(zilog, &blk);
   1303   1807   bonwick 			zh->zh_log = blk;
   1304   1807   bonwick 		}
   1305    789    ahrens 	}
   1306    789    ahrens 
   1307   9701    George 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
   1308   2638    perrin 		zh->zh_log = lwb->lwb_blk;
   1309    789    ahrens 		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
   1310    789    ahrens 			break;
   1311    789    ahrens 		list_remove(&zilog->zl_lwb_list, lwb);
   1312  10922      Jeff 		zio_free_zil(spa, txg, &lwb->lwb_blk);
   1313    789    ahrens 		kmem_cache_free(zil_lwb_cache, lwb);
   1314   3668   gw25295 
   1315   3668   gw25295 		/*
   1316   3668   gw25295 		 * If we don't have anything left in the lwb list then
   1317   3668   gw25295 		 * we've had an allocation failure and we need to zero
   1318   3668   gw25295 		 * out the zil_header blkptr so that we don't end
   1319   3668   gw25295 		 * up freeing the same block twice.
   1320   3668   gw25295 		 */
   1321   3668   gw25295 		if (list_head(&zilog->zl_lwb_list) == NULL)
   1322   3668   gw25295 			BP_ZERO(&zh->zh_log);
   1323    789    ahrens 	}
   1324    789    ahrens 	mutex_exit(&zilog->zl_lock);
   1325    789    ahrens }
   1326    789    ahrens 
   1327    789    ahrens void
   1328    789    ahrens zil_init(void)
   1329    789    ahrens {
   1330    789    ahrens 	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
   1331   2856  nd150628 	    sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
   1332    789    ahrens }
   1333    789    ahrens 
   1334    789    ahrens void
   1335    789    ahrens zil_fini(void)
   1336    789    ahrens {
   1337    789    ahrens 	kmem_cache_destroy(zil_lwb_cache);
   1338    789    ahrens }
   1339    789    ahrens 
   1340  10310      Neil void
   1341  10310      Neil zil_set_logbias(zilog_t *zilog, uint64_t logbias)
   1342  10310      Neil {
   1343  10310      Neil 	zilog->zl_logbias = logbias;
   1344  10310      Neil }
   1345  10310      Neil 
   1346    789    ahrens zilog_t *
   1347    789    ahrens zil_alloc(objset_t *os, zil_header_t *zh_phys)
   1348    789    ahrens {
   1349    789    ahrens 	zilog_t *zilog;
   1350    789    ahrens 
   1351    789    ahrens 	zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
   1352    789    ahrens 
   1353    789    ahrens 	zilog->zl_header = zh_phys;
   1354    789    ahrens 	zilog->zl_os = os;
   1355    789    ahrens 	zilog->zl_spa = dmu_objset_spa(os);
   1356    789    ahrens 	zilog->zl_dmu_pool = dmu_objset_pool(os);
   1357   1807   bonwick 	zilog->zl_destroy_txg = TXG_INITIAL - 1;
   1358  10310      Neil 	zilog->zl_logbias = dmu_objset_logbias(os);
   1359   2856  nd150628 
   1360   2856  nd150628 	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
   1361    789    ahrens 
   1362    789    ahrens 	list_create(&zilog->zl_itx_list, sizeof (itx_t),
   1363    789    ahrens 	    offsetof(itx_t, itx_node));
   1364    789    ahrens 
   1365    789    ahrens 	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
   1366    789    ahrens 	    offsetof(lwb_t, lwb_node));
   1367    789    ahrens 
   1368   5688   bonwick 	mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
   1369   5688   bonwick 
   1370   5688   bonwick 	avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
   1371   5688   bonwick 	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
   1372    789    ahrens 
   1373   5913    perrin 	cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
   1374   5913    perrin 	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
   1375   5913    perrin 
   1376    789    ahrens 	return (zilog);
   1377    789    ahrens }
   1378    789    ahrens 
   1379    789    ahrens void
   1380    789    ahrens zil_free(zilog_t *zilog)
   1381    789    ahrens {
   1382    789    ahrens 	lwb_t *lwb;
   1383    789    ahrens 
   1384    789    ahrens 	zilog->zl_stop_sync = 1;
   1385    789    ahrens 
   1386    789    ahrens 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
   1387    789    ahrens 		list_remove(&zilog->zl_lwb_list, lwb);
   1388    789    ahrens 		if (lwb->lwb_buf != NULL)
   1389    789    ahrens 			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
   1390    789    ahrens 		kmem_cache_free(zil_lwb_cache, lwb);
   1391    789    ahrens 	}
   1392    789    ahrens 	list_destroy(&zilog->zl_lwb_list);
   1393    789    ahrens 
   1394   5688   bonwick 	avl_destroy(&zilog->zl_vdev_tree);
   1395   5688   bonwick 	mutex_destroy(&zilog->zl_vdev_lock);
   1396    789    ahrens 
   1397    789    ahrens 	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
   1398    789    ahrens 	list_destroy(&zilog->zl_itx_list);
   1399   2856  nd150628 	mutex_destroy(&zilog->zl_lock);
   1400   5913    perrin 
   1401   5913    perrin 	cv_destroy(&zilog->zl_cv_writer);
   1402   5913    perrin 	cv_destroy(&zilog->zl_cv_suspend);
   1403    789    ahrens 
   1404    789    ahrens 	kmem_free(zilog, sizeof (zilog_t));
   1405    789    ahrens }
   1406    789    ahrens 
   1407    789    ahrens /*
   1408    789    ahrens  * Open an intent log.
   1409    789    ahrens  */
   1410    789    ahrens zilog_t *
   1411    789    ahrens zil_open(objset_t *os, zil_get_data_t *get_data)
   1412    789    ahrens {
   1413    789    ahrens 	zilog_t *zilog = dmu_objset_zil(os);
   1414    789    ahrens 
   1415    789    ahrens 	zilog->zl_get_data = get_data;
   1416    789    ahrens 	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
   1417    789    ahrens 	    2, 2, TASKQ_PREPOPULATE);
   1418    789    ahrens 
   1419    789    ahrens 	return (zilog);
   1420    789    ahrens }
   1421    789    ahrens 
   1422    789    ahrens /*
   1423    789    ahrens  * Close an intent log.
   1424    789    ahrens  */
   1425    789    ahrens void
   1426    789    ahrens zil_close(zilog_t *zilog)
   1427    789    ahrens {
   1428   1807   bonwick 	/*
   1429   1807   bonwick 	 * If the log isn't already committed, mark the objset dirty
   1430   1807   bonwick 	 * (so zil_sync() will be called) and wait for that txg to sync.
   1431   1807   bonwick 	 */
   1432   1807   bonwick 	if (!zil_is_committed(zilog)) {
   1433   1807   bonwick 		uint64_t txg;
   1434   1807   bonwick 		dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
   1435  10922      Jeff 		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
   1436   1807   bonwick 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
   1437   1807   bonwick 		txg = dmu_tx_get_txg(tx);
   1438   1807   bonwick 		dmu_tx_commit(tx);
   1439   1807   bonwick 		txg_wait_synced(zilog->zl_dmu_pool, txg);
   1440   1807   bonwick 	}
   1441   1807   bonwick 
   1442    789    ahrens 	taskq_destroy(zilog->zl_clean_taskq);
   1443    789    ahrens 	zilog->zl_clean_taskq = NULL;
   1444    789    ahrens 	zilog->zl_get_data = NULL;
   1445    789    ahrens 
   1446    789    ahrens 	zil_itx_clean(zilog);
   1447    789    ahrens 	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
   1448    789    ahrens }
   1449    789    ahrens 
   1450    789    ahrens /*
   1451    789    ahrens  * Suspend an intent log.  While in suspended mode, we still honor
   1452    789    ahrens  * synchronous semantics, but we rely on txg_wait_synced() to do it.
   1453    789    ahrens  * We suspend the log briefly when taking a snapshot so that the snapshot
   1454    789    ahrens  * contains all the data it's supposed to, and has an empty intent log.
   1455    789    ahrens  */
   1456    789    ahrens int
   1457    789    ahrens zil_suspend(zilog_t *zilog)
   1458    789    ahrens {
   1459   1807   bonwick 	const zil_header_t *zh = zilog->zl_header;
   1460    789    ahrens 
   1461    789    ahrens 	mutex_enter(&zilog->zl_lock);
   1462   8989      Neil 	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
   1463    789    ahrens 		mutex_exit(&zilog->zl_lock);
   1464    789    ahrens 		return (EBUSY);
   1465    789    ahrens 	}
   1466   1807   bonwick 	if (zilog->zl_suspend++ != 0) {
   1467   1807   bonwick 		/*
   1468   1807   bonwick 		 * Someone else already began a suspend.
   1469   1807   bonwick 		 * Just wait for them to finish.
   1470   1807   bonwick 		 */
   1471   1807   bonwick 		while (zilog->zl_suspending)
   1472   1807   bonwick 			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
   1473   1807   bonwick 		mutex_exit(&zilog->zl_lock);
   1474   1807   bonwick 		return (0);
   1475   1807   bonwick 	}
   1476   1807   bonwick 	zilog->zl_suspending = B_TRUE;
   1477    789    ahrens 	mutex_exit(&zilog->zl_lock);
   1478    789    ahrens 
   1479   2638    perrin 	zil_commit(zilog, UINT64_MAX, 0);
   1480    789    ahrens 
   1481   2638    perrin 	/*
   1482   2638    perrin 	 * Wait for any in-flight log writes to complete.
   1483   2638    perrin 	 */
   1484    789    ahrens 	mutex_enter(&zilog->zl_lock);
   1485   2638    perrin 	while (zilog->zl_writer)
   1486   2638    perrin 		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
   1487    789    ahrens 	mutex_exit(&zilog->zl_lock);
   1488    789    ahrens 
   1489   1807   bonwick 	zil_destroy(zilog, B_FALSE);
   1490   1807   bonwick 
   1491   1807   bonwick 	mutex_enter(&zilog->zl_lock);
   1492   1807   bonwick 	zilog->zl_suspending = B_FALSE;
   1493   1807   bonwick 	cv_broadcast(&zilog->zl_cv_suspend);
   1494   1807   bonwick 	mutex_exit(&zilog->zl_lock);
   1495    789    ahrens 
   1496    789    ahrens 	return (0);
   1497    789    ahrens }
   1498    789    ahrens 
   1499    789    ahrens void
   1500    789    ahrens zil_resume(zilog_t *zilog)
   1501    789    ahrens {
   1502    789    ahrens 	mutex_enter(&zilog->zl_lock);
   1503    789    ahrens 	ASSERT(zilog->zl_suspend != 0);
   1504    789    ahrens 	zilog->zl_suspend--;
   1505    789    ahrens 	mutex_exit(&zilog->zl_lock);
   1506    789    ahrens }
   1507    789    ahrens 
   1508    789    ahrens typedef struct zil_replay_arg {
   1509    789    ahrens 	zil_replay_func_t **zr_replay;
   1510    789    ahrens 	void		*zr_arg;
   1511    789    ahrens 	boolean_t	zr_byteswap;
   1512  10922      Jeff 	char		*zr_lr;
   1513    789    ahrens } zil_replay_arg_t;
   1514    789    ahrens 
   1515  10922      Jeff static int
   1516  10922      Jeff zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
   1517  10922      Jeff {
   1518  10922      Jeff 	char name[MAXNAMELEN];
   1519  10922      Jeff 
   1520  10922      Jeff 	zilog->zl_replaying_seq--;	/* didn't actually replay this one */
   1521  10922      Jeff 
   1522  10922      Jeff 	dmu_objset_name(zilog->zl_os, name);
   1523  10922      Jeff 
   1524  10922      Jeff 	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
   1525  10922      Jeff 	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
   1526  10922      Jeff 	    (u_longlong_t)lr->lrc_seq,
   1527  10922      Jeff 	    (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
   1528  10922      Jeff 	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
   1529  10922      Jeff 
   1530  10922      Jeff 	return (error);
   1531  10922      Jeff }
   1532  10922      Jeff 
   1533  10922      Jeff static int
   1534    789    ahrens zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
   1535    789    ahrens {
   1536    789    ahrens 	zil_replay_arg_t *zr = zra;
   1537   1807   bonwick 	const zil_header_t *zh = zilog->zl_header;
   1538    789    ahrens 	uint64_t reclen = lr->lrc_reclen;
   1539    789    ahrens 	uint64_t txtype = lr->lrc_txtype;
   1540  10922      Jeff 	int error = 0;
   1541    789    ahrens 
   1542  10922      Jeff 	zilog->zl_replaying_seq = lr->lrc_seq;
   1543  10922      Jeff 
   1544  10922      Jeff 	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
   1545  10922      Jeff 		return (0);
   1546    789    ahrens 
   1547    789    ahrens 	if (lr->lrc_txg < claim_txg)		/* already committed */
   1548  10922      Jeff 		return (0);
   1549    789    ahrens 
   1550   5331       amw 	/* Strip case-insensitive bit, still present in log record */
   1551   5331       amw 	txtype &= ~TX_CI;
   1552   8227      Neil 
   1553  10922      Jeff 	if (txtype == 0 || txtype >= TX_MAX_TYPE)
   1554  10922      Jeff 		return (zil_replay_error(zilog, lr, EINVAL));
   1555  10922      Jeff 
   1556  10922      Jeff 	/*
   1557  10922      Jeff 	 * If this record type can be logged out of order, the object
   1558  10922      Jeff 	 * (lr_foid) may no longer exist.  That's legitimate, not an error.
   1559  10922      Jeff 	 */
   1560  10922      Jeff 	if (TX_OOO(txtype)) {
   1561  10922      Jeff 		error = dmu_object_info(zilog->zl_os,
   1562  10922      Jeff 		    ((lr_ooo_t *)lr)->lr_foid, NULL);
   1563  10922      Jeff 		if (error == ENOENT || error == EEXIST)
   1564  10922      Jeff 			return (0);
   1565   8227      Neil 	}
   1566   5331       amw 
   1567    789    ahrens 	/*
   1568    789    ahrens 	 * Make a copy of the data so we can revise and extend it.
   1569    789    ahrens 	 */
   1570  10922      Jeff 	bcopy(lr, zr->zr_lr, reclen);
   1571  10922      Jeff 
   1572  10922      Jeff 	/*
   1573  10922      Jeff 	 * If this is a TX_WRITE with a blkptr, suck in the data.
   1574  10922      Jeff 	 */
   1575  10922      Jeff 	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
   1576  10922      Jeff 		error = zil_read_log_data(zilog, (lr_write_t *)lr,
   1577  10922      Jeff 		    zr->zr_lr + reclen);
   1578  10922      Jeff 		if (error)
   1579  10922      Jeff 			return (zil_replay_error(zilog, lr, error));
   1580  10922      Jeff 	}
   1581    789    ahrens 
   1582    789    ahrens 	/*
   1583    789    ahrens 	 * The log block containing this lr may have been byteswapped
   1584    789    ahrens 	 * so that we can easily examine common fields like lrc_txtype.
   1585  10922      Jeff 	 * However, the log is a mix of different record types, and only the
   1586    789    ahrens 	 * replay vectors know how to byteswap their records.  Therefore, if
   1587    789    ahrens 	 * the lr was byteswapped, undo it before invoking the replay vector.
   1588    789    ahrens 	 */
   1589    789    ahrens 	if (zr->zr_byteswap)
   1590  10922      Jeff 		byteswap_uint64_array(zr->zr_lr, reclen);
   1591    789    ahrens 
   1592    789    ahrens 	/*
   1593   8227      Neil 	 * We must now do two things atomically: replay this log record,
   1594   8227      Neil 	 * and update the log header sequence number to reflect the fact that
   1595   8227      Neil 	 * we did so. At the end of each replay function the sequence number
   1596   8227      Neil 	 * is updated if we are in replay mode.
   1597   7904      Neil 	 */
   1598  10922      Jeff 	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
   1599  10922      Jeff 	if (error) {
   1600   3063    perrin 		/*
   1601   3063    perrin 		 * The DMU's dnode layer doesn't see removes until the txg
   1602   3063    perrin 		 * commits, so a subsequent claim can spuriously fail with
   1603   8227      Neil 		 * EEXIST. So if we receive any error we try syncing out
   1604  10922      Jeff 		 * any removes then retry the transaction.  Note that we
   1605  10922      Jeff 		 * specify B_FALSE for byteswap now, so we don't do it twice.
   1606   3063    perrin 		 */
   1607  10922      Jeff 		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
   1608  10922      Jeff 		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
   1609  10922      Jeff 		if (error)
   1610  10922      Jeff 			return (zil_replay_error(zilog, lr, error));
   1611    789    ahrens 	}
   1612  10922      Jeff 	return (0);
   1613   3063    perrin }
   1614    789    ahrens 
   1615   3063    perrin /* ARGSUSED */
   1616  10922      Jeff static int
   1617   3063    perrin zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
   1618   3063    perrin {
   1619   3063    perrin 	zilog->zl_replay_blks++;
   1620  10922      Jeff 
   1621  10922      Jeff 	return (0);
   1622    789    ahrens }
   1623    789    ahrens 
   1624    789    ahrens /*
   1625   1362    perrin  * If this dataset has a non-empty intent log, replay it and destroy it.
   1626    789    ahrens  */
   1627    789    ahrens void
   1628   8227      Neil zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
   1629    789    ahrens {
   1630    789    ahrens 	zilog_t *zilog = dmu_objset_zil(os);
   1631   1807   bonwick 	const zil_header_t *zh = zilog->zl_header;
   1632   1807   bonwick 	zil_replay_arg_t zr;
   1633   1362    perrin 
   1634   8989      Neil 	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
   1635   1807   bonwick 		zil_destroy(zilog, B_TRUE);
   1636   1362    perrin 		return;
   1637   1362    perrin 	}
   1638    789    ahrens 
   1639    789    ahrens 	zr.zr_replay = replay_func;
   1640    789    ahrens 	zr.zr_arg = arg;
   1641   1807   bonwick 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
   1642  10922      Jeff 	zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
   1643    789    ahrens 
   1644    789    ahrens 	/*
   1645    789    ahrens 	 * Wait for in-progress removes to sync before starting replay.
   1646    789    ahrens 	 */
   1647    789    ahrens 	txg_wait_synced(zilog->zl_dmu_pool, 0);
   1648    789    ahrens 
   1649   8227      Neil 	zilog->zl_replay = B_TRUE;
   1650  11066    rafael 	zilog->zl_replay_time = ddi_get_lbolt();
   1651   3063    perrin 	ASSERT(zilog->zl_replay_blks == 0);
   1652   3063    perrin 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
   1653   1807   bonwick 	    zh->zh_claim_txg);
   1654  10922      Jeff 	kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
   1655    789    ahrens 
   1656   1807   bonwick 	zil_destroy(zilog, B_FALSE);
   1657   5712    ahrens 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
   1658   8227      Neil 	zilog->zl_replay = B_FALSE;
   1659    789    ahrens }
   1660   1646    perrin 
   1661  10922      Jeff boolean_t
   1662  10922      Jeff zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
   1663   1646    perrin {
   1664  10922      Jeff 	if (zilog == NULL)
   1665  10922      Jeff 		return (B_TRUE);
   1666   1646    perrin 
   1667  10922      Jeff 	if (zilog->zl_replay) {
   1668  10922      Jeff 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
   1669  10922      Jeff 		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
   1670  10922      Jeff 		    zilog->zl_replaying_seq;
   1671  10922      Jeff 		return (B_TRUE);
   1672   2638    perrin 	}
   1673   2638    perrin 
   1674  10922      Jeff 	return (B_FALSE);
   1675   1646    perrin }
   1676   9701    George 
   1677   9701    George /* ARGSUSED */
   1678   9701    George int
   1679   9701    George zil_vdev_offline(char *osname, void *arg)
   1680   9701    George {
   1681   9701    George 	objset_t *os;
   1682   9701    George 	zilog_t *zilog;
   1683   9701    George 	int error;
   1684   9701    George 
   1685  10298   Matthew 	error = dmu_objset_hold(osname, FTAG, &os);
   1686   9701    George 	if (error)
   1687   9701    George 		return (error);
   1688   9701    George 
   1689   9701    George 	zilog = dmu_objset_zil(os);
   1690   9701    George 	if (zil_suspend(zilog) != 0)
   1691   9701    George 		error = EEXIST;
   1692   9701    George 	else
   1693   9701    George 		zil_resume(zilog);
   1694  10298   Matthew 	dmu_objset_rele(os, FTAG);
   1695   9701    George 	return (error);
   1696   9701    George }
   1697