Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/zfs_context.h>
     29 #include <sys/dmu_objset.h>
     30 #include <sys/dmu_traverse.h>
     31 #include <sys/dsl_dataset.h>
     32 #include <sys/dsl_dir.h>
     33 #include <sys/dsl_pool.h>
     34 #include <sys/dnode.h>
     35 #include <sys/spa.h>
     36 #include <sys/zio.h>
     37 #include <sys/dmu_impl.h>
     38 #include <sys/zvol.h>
     39 
     40 #define	BP_SPAN_SHIFT(level, width)	((level) * (width))
     41 
     42 #define	BP_EQUAL(b1, b2)				\
     43 	(DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) &&	\
     44 	(b1)->blk_birth == (b2)->blk_birth)
     45 
     46 /*
     47  * Compare two bookmarks.
     48  *
     49  * For ADVANCE_PRE, the visitation order is:
     50  *
     51  *	objset 0, 1, 2, ..., ZB_MAXOBJSET.
     52  *	object 0, 1, 2, ..., ZB_MAXOBJECT.
     53  *	blkoff 0, 1, 2, ...
     54  *	level ZB_MAXLEVEL, ..., 2, 1, 0.
     55  *
     56  * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
     57  * ordering vector is:
     58  *
     59  *	< objset, object, blkoff, -level >
     60  *
     61  * For ADVANCE_POST, the starting offsets aren't sequential but ending
     62  * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
     63  * The visitation order is:
     64  *
     65  *	objset 1, 2, ..., ZB_MAXOBJSET, 0.
     66  *	object 1, 2, ..., ZB_MAXOBJECT, 0.
     67  *	blkoff 1, 2, ...
     68  *	level 0, 1, 2, ..., ZB_MAXLEVEL.
     69  *
     70  * and thus a valid ordering vector is:
     71  *
     72  *	< objset - 1, object - 1, blkoff, level >
     73  *
     74  * Both orderings can be expressed as:
     75  *
     76  *	< objset + bias, object + bias, blkoff, level ^ bias >
     77  *
     78  * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
     79  * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
     80  *
     81  * Special case: an objset's osphys is represented as level -1 of object 0.
     82  * It is always either the very first or very last block we visit in an objset.
     83  * Therefore, if either bookmark's level is -1, level alone determines order.
     84  */
     85 static int
     86 compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
     87     int advance)
     88 {
     89 	int bias = (advance & ADVANCE_PRE) ? 0 : -1;
     90 	uint64_t sblkoff, eblkoff;
     91 	int slevel, elevel, wshift;
     92 
     93 	if (szb->zb_objset + bias < ezb->zb_objset + bias)
     94 		return (-1);
     95 
     96 	if (szb->zb_objset + bias > ezb->zb_objset + bias)
     97 		return (1);
     98 
     99 	slevel = szb->zb_level;
    100 	elevel = ezb->zb_level;
    101 
    102 	if ((slevel | elevel) < 0)
    103 		return ((slevel ^ bias) - (elevel ^ bias));
    104 
    105 	if (szb->zb_object + bias < ezb->zb_object + bias)
    106 		return (-1);
    107 
    108 	if (szb->zb_object + bias > ezb->zb_object + bias)
    109 		return (1);
    110 
    111 	if (dnp == NULL)
    112 		return (0);
    113 
    114 	wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
    115 
    116 	sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
    117 	eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
    118 
    119 	if (sblkoff < eblkoff)
    120 		return (-1);
    121 
    122 	if (sblkoff > eblkoff)
    123 		return (1);
    124 
    125 	return ((elevel ^ bias) - (slevel ^ bias));
    126 }
    127 
    128 #define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
    129 {							\
    130 	(zb)->zb_objset = objset;			\
    131 	(zb)->zb_object = object;			\
    132 	(zb)->zb_level = level;				\
    133 	(zb)->zb_blkid = blkid;				\
    134 }
    135 
    136 #define	SET_BOOKMARK_LB(zb, level, blkid)		\
    137 {							\
    138 	(zb)->zb_level = level;				\
    139 	(zb)->zb_blkid = blkid;				\
    140 }
    141 
    142 static int
    143 advance_objset(zseg_t *zseg, uint64_t objset, int advance)
    144 {
    145 	zbookmark_t *zb = &zseg->seg_start;
    146 
    147 	if (advance & ADVANCE_PRE) {
    148 		if (objset >= ZB_MAXOBJSET)
    149 			return (ERANGE);
    150 		SET_BOOKMARK(zb, objset, 0, -1, 0);
    151 	} else {
    152 		if (objset >= ZB_MAXOBJSET)
    153 			objset = 0;
    154 		SET_BOOKMARK(zb, objset, 1, 0, 0);
    155 	}
    156 
    157 	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
    158 		return (ERANGE);
    159 
    160 	return (EAGAIN);
    161 }
    162 
    163 static int
    164 advance_object(zseg_t *zseg, uint64_t object, int advance)
    165 {
    166 	zbookmark_t *zb = &zseg->seg_start;
    167 
    168 	if (advance & ADVANCE_PRE) {
    169 		if (object >= ZB_MAXOBJECT) {
    170 			SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
    171 		} else {
    172 			SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
    173 		}
    174 	} else {
    175 		if (zb->zb_object == 0) {
    176 			SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
    177 		} else {
    178 			if (object >= ZB_MAXOBJECT)
    179 				object = 0;
    180 			SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
    181 		}
    182 	}
    183 
    184 	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
    185 		return (ERANGE);
    186 
    187 	return (EAGAIN);
    188 }
    189 
    190 static int
    191 advance_from_osphys(zseg_t *zseg, int advance)
    192 {
    193 	zbookmark_t *zb = &zseg->seg_start;
    194 
    195 	ASSERT(zb->zb_object == 0);
    196 	ASSERT(zb->zb_level == -1);
    197 	ASSERT(zb->zb_blkid == 0);
    198 
    199 	if (advance & ADVANCE_PRE) {
    200 		SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
    201 	} else {
    202 		if (zb->zb_objset == 0)
    203 			return (ERANGE);
    204 		SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
    205 	}
    206 
    207 	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
    208 		return (ERANGE);
    209 
    210 	return (EAGAIN);
    211 }
    212 
    213 static int
    214 advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
    215 {
    216 	zbookmark_t *zb = &zseg->seg_start;
    217 	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
    218 	int maxlevel = dnp->dn_nlevels - 1;
    219 	int level = zb->zb_level;
    220 	uint64_t blkid = zb->zb_blkid;
    221 
    222 	if (advance & ADVANCE_PRE) {
    223 		if (level > 0 && rc == 0) {
    224 			level--;
    225 			blkid <<= wshift;
    226 		} else {
    227 			blkid++;
    228 
    229 			if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
    230 			    dnp->dn_maxblkid)
    231 				return (ERANGE);
    232 
    233 			while (level < maxlevel) {
    234 				if (P2PHASE(blkid, 1ULL << wshift))
    235 					break;
    236 				blkid >>= wshift;
    237 				level++;
    238 			}
    239 		}
    240 	} else {
    241 		if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
    242 			blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
    243 			level = 0;
    244 		} else {
    245 			blkid >>= wshift;
    246 			level++;
    247 		}
    248 
    249 		while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
    250 		    dnp->dn_maxblkid) {
    251 			if (level == maxlevel)
    252 				return (ERANGE);
    253 			blkid >>= wshift;
    254 			level++;
    255 		}
    256 	}
    257 	SET_BOOKMARK_LB(zb, level, blkid);
    258 
    259 	if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
    260 		return (ERANGE);
    261 
    262 	return (EAGAIN);
    263 }
    264 
    265 /*
    266  * The traverse_callback function will call the function specified in th_func.
    267  * In the event of an error the callee, specified by th_func, must return
    268  * one of the following errors:
    269  *
    270  *	EINTR		- Indicates that the callee wants the traversal to
    271  *			  abort immediately.
    272  * 	ERESTART	- The callee has acknowledged the error and would
    273  *			  like to continue.
    274  */
    275 static int
    276 traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
    277 {
    278 	/*
    279 	 * Before we issue the callback, prune against maxtxg.
    280 	 *
    281 	 * We prune against mintxg before we get here because it's a big win.
    282 	 * If a given block was born in txg 37, then we know that the entire
    283 	 * subtree below that block must have been born in txg 37 or earlier.
    284 	 * We can therefore lop off huge branches of the tree as we go.
    285 	 *
    286 	 * There's no corresponding optimization for maxtxg because knowing
    287 	 * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
    288 	 * children.  In fact, the copy-on-write design of ZFS ensures that
    289 	 * top-level blocks will pretty much always be new.
    290 	 *
    291 	 * Therefore, in the name of simplicity we don't prune against
    292 	 * maxtxg until the last possible moment -- that being right now.
    293 	 */
    294 	if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
    295 		return (0);
    296 
    297 	/*
    298 	 * Debugging: verify that the order we visit things agrees with the
    299 	 * order defined by compare_bookmark().  We don't check this for
    300 	 * log blocks because there's no defined ordering for them; they're
    301 	 * always visited (or not) as part of visiting the objset_phys_t.
    302 	 */
    303 	if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
    304 		zbookmark_t *zb = &bc->bc_bookmark;
    305 		zbookmark_t *szb = &zseg->seg_start;
    306 		zbookmark_t *ezb = &zseg->seg_end;
    307 		zbookmark_t *lzb = &th->th_lastcb;
    308 		dnode_phys_t *dnp = bc->bc_dnode;
    309 
    310 		ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
    311 		ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
    312 		ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
    313 		    lzb->zb_level == ZB_NO_LEVEL);
    314 		*lzb = *zb;
    315 	}
    316 
    317 	th->th_callbacks++;
    318 	return (th->th_func(bc, th->th_spa, th->th_arg));
    319 }
    320 
    321 static int
    322 traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
    323 	dnode_phys_t *dnp)
    324 {
    325 	zbookmark_t *zb = &bc->bc_bookmark;
    326 	int error;
    327 
    328 	th->th_hits++;
    329 
    330 	bc->bc_dnode = dnp;
    331 	bc->bc_errno = 0;
    332 
    333 	if (BP_EQUAL(&bc->bc_blkptr, bp))
    334 		return (0);
    335 
    336 	bc->bc_blkptr = *bp;
    337 
    338 	if (bc->bc_data == NULL)
    339 		return (0);
    340 
    341 	if (BP_IS_HOLE(bp)) {
    342 		ASSERT(th->th_advance & ADVANCE_HOLES);
    343 		return (0);
    344 	}
    345 
    346 	if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
    347 		error = EIO;
    348 	} else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
    349 		error = 0;
    350 		th->th_arc_hits++;
    351 	} else {
    352 		error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
    353 		    BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
    354 		    th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
    355 
    356 		if (BP_SHOULD_BYTESWAP(bp) && error == 0)
    357 			(zb->zb_level > 0 ? byteswap_uint64_array :
    358 			    dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
    359 			    BP_GET_LSIZE(bp));
    360 		th->th_reads++;
    361 	}
    362 
    363 	if (error) {
    364 		bc->bc_errno = error;
    365 		error = traverse_callback(th, NULL, bc);
    366 		ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
    367 		bc->bc_blkptr.blk_birth = -1ULL;
    368 	}
    369 
    370 	dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
    371 	    bc - &th->th_cache[0][0], error,
    372 	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
    373 
    374 	return (error);
    375 }
    376 
    377 static int
    378 find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
    379 {
    380 	zbookmark_t *zb = &zseg->seg_start;
    381 	traverse_blk_cache_t *bc;
    382 	blkptr_t *bp = dnp->dn_blkptr;
    383 	int i, first, level;
    384 	int nbp = dnp->dn_nblkptr;
    385 	int minlevel = zb->zb_level;
    386 	int maxlevel = dnp->dn_nlevels - 1;
    387 	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
    388 	int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
    389 	uint64_t blkid = zb->zb_blkid >> bp_shift;
    390 	int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
    391 	int rc;
    392 
    393 	if (minlevel > maxlevel || blkid >= nbp)
    394 		return (ERANGE);
    395 
    396 	for (level = maxlevel; level >= minlevel; level--) {
    397 		first = P2PHASE(blkid, 1ULL << wshift);
    398 
    399 		for (i = first; i < nbp; i++)
    400 			if (bp[i].blk_birth > zseg->seg_mintxg ||
    401 			    BP_IS_HOLE(&bp[i]) && do_holes)
    402 				break;
    403 
    404 		if (i != first) {
    405 			i--;
    406 			SET_BOOKMARK_LB(zb, level, blkid + (i - first));
    407 			return (ENOTBLK);
    408 		}
    409 
    410 		bc = &th->th_cache[depth][level];
    411 
    412 		SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
    413 		    level, blkid);
    414 
    415 		if (rc = traverse_read(th, bc, bp + i, dnp)) {
    416 			if (rc != EAGAIN) {
    417 				SET_BOOKMARK_LB(zb, level, blkid);
    418 			}
    419 			return (rc);
    420 		}
    421 
    422 		if (BP_IS_HOLE(&bp[i])) {
    423 			SET_BOOKMARK_LB(zb, level, blkid);
    424 			th->th_lastcb.zb_level = ZB_NO_LEVEL;
    425 			return (0);
    426 		}
    427 
    428 		nbp = 1 << wshift;
    429 		bp = bc->bc_data;
    430 		bp_shift -= wshift;
    431 		blkid = zb->zb_blkid >> bp_shift;
    432 	}
    433 
    434 	return (0);
    435 }
    436 
    437 static int
    438 get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
    439     uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
    440 {
    441 	zseg_t zseg;
    442 	zbookmark_t *zb = &zseg.seg_start;
    443 	uint64_t object = *objectp;
    444 	int i, rc;
    445 
    446 	SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
    447 	SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
    448 
    449 	zseg.seg_mintxg = txg;
    450 	zseg.seg_maxtxg = -1ULL;
    451 
    452 	for (;;) {
    453 		rc = find_block(th, &zseg, mdn, depth);
    454 
    455 		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
    456 			break;
    457 
    458 		if (rc == 0 && zb->zb_level == 0) {
    459 			dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
    460 			for (i = 0; i < DNODES_PER_BLOCK; i++) {
    461 				object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
    462 				if (object >= *objectp &&
    463 				    dnp[i].dn_type != DMU_OT_NONE &&
    464 				    (type == -1 || dnp[i].dn_type == type)) {
    465 					*objectp = object;
    466 					*dnpp = &dnp[i];
    467 					return (0);
    468 				}
    469 			}
    470 		}
    471 
    472 		rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
    473 
    474 		if (rc == ERANGE)
    475 			break;
    476 	}
    477 
    478 	if (rc == ERANGE)
    479 		*objectp = ZB_MAXOBJECT;
    480 
    481 	return (rc);
    482 }
    483 
    484 /* ARGSUSED */
    485 static void
    486 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
    487 {
    488 	traverse_handle_t *th = arg;
    489 	traverse_blk_cache_t *bc = &th->th_zil_cache;
    490 	zbookmark_t *zb = &bc->bc_bookmark;
    491 	zseg_t *zseg = list_head(&th->th_seglist);
    492 
    493 	if (bp->blk_birth <= zseg->seg_mintxg)
    494 		return;
    495 
    496 	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
    497 		zb->zb_object = 0;
    498 		zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
    499 		bc->bc_blkptr = *bp;
    500 		(void) traverse_callback(th, zseg, bc);
    501 	}
    502 }
    503 
    504 /* ARGSUSED */
    505 static void
    506 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
    507 {
    508 	traverse_handle_t *th = arg;
    509 	traverse_blk_cache_t *bc = &th->th_zil_cache;
    510 	zbookmark_t *zb = &bc->bc_bookmark;
    511 	zseg_t *zseg = list_head(&th->th_seglist);
    512 
    513 	if (lrc->lrc_txtype == TX_WRITE) {
    514 		lr_write_t *lr = (lr_write_t *)lrc;
    515 		blkptr_t *bp = &lr->lr_blkptr;
    516 
    517 		if (bp->blk_birth <= zseg->seg_mintxg)
    518 			return;
    519 
    520 		if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
    521 			zb->zb_object = lr->lr_foid;
    522 			zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
    523 			bc->bc_blkptr = *bp;
    524 			(void) traverse_callback(th, zseg, bc);
    525 		}
    526 	}
    527 }
    528 
    529 static void
    530 traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
    531 {
    532 	spa_t *spa = th->th_spa;
    533 	dsl_pool_t *dp = spa_get_dsl(spa);
    534 	objset_phys_t *osphys = bc->bc_data;
    535 	zil_header_t *zh = &osphys->os_zil_header;
    536 	uint64_t claim_txg = zh->zh_claim_txg;
    537 	zilog_t *zilog;
    538 
    539 	ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
    540 	ASSERT(bc->bc_bookmark.zb_level == -1);
    541 
    542 	/*
    543 	 * We only want to visit blocks that have been claimed but not yet
    544 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
    545 	 */
    546 	if (claim_txg == 0 && (spa_mode & FWRITE))
    547 		return;
    548 
    549 	th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
    550 
    551 	zilog = zil_alloc(dp->dp_meta_objset, zh);
    552 
    553 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
    554 	    claim_txg);
    555 
    556 	zil_free(zilog);
    557 }
    558 
    559 static int
    560 traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
    561 {
    562 	zbookmark_t *zb = &zseg->seg_start;
    563 	traverse_blk_cache_t *bc;
    564 	dnode_phys_t *dn, *dn_tmp;
    565 	int worklimit = 100;
    566 	int rc;
    567 
    568 	dprintf("<%llu, %llu, %d, %llx>\n",
    569 	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
    570 
    571 	bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
    572 	dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
    573 
    574 	SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
    575 
    576 	rc = traverse_read(th, bc, mosbp, dn);
    577 
    578 	if (rc)		/* If we get ERESTART, we've got nowhere left to go */
    579 		return (rc == ERESTART ? EINTR : rc);
    580 
    581 	ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
    582 
    583 	if (zb->zb_objset != 0) {
    584 		uint64_t objset = zb->zb_objset;
    585 		dsl_dataset_phys_t *dsp;
    586 
    587 		rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
    588 		    DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
    589 
    590 		if (objset != zb->zb_objset)
    591 			rc = advance_objset(zseg, objset, th->th_advance);
    592 
    593 		if (rc != 0)
    594 			return (rc);
    595 
    596 		dsp = DN_BONUS(dn_tmp);
    597 
    598 		bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
    599 		dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
    600 
    601 		SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
    602 
    603 		/*
    604 		 * If we're traversing an open snapshot, we know that it
    605 		 * can't be deleted (because it's open) and it can't change
    606 		 * (because it's a snapshot).  Therefore, once we've gotten
    607 		 * from the uberblock down to the snapshot's objset_phys_t,
    608 		 * we no longer need to synchronize with spa_sync(); we're
    609 		 * traversing a completely static block tree from here on.
    610 		 */
    611 		if (th->th_advance & ADVANCE_NOLOCK) {
    612 			ASSERT(th->th_locked);
    613 			rw_exit(spa_traverse_rwlock(th->th_spa));
    614 			th->th_locked = 0;
    615 		}
    616 
    617 		if (BP_IS_HOLE(&dsp->ds_bp))
    618 			rc = ERESTART;
    619 		else
    620 			rc = traverse_read(th, bc, &dsp->ds_bp, dn);
    621 
    622 		if (rc != 0) {
    623 			if (rc == ERESTART)
    624 				rc = advance_objset(zseg, zb->zb_objset + 1,
    625 				    th->th_advance);
    626 			return (rc);
    627 		}
    628 
    629 		if (th->th_advance & ADVANCE_PRUNE)
    630 			zseg->seg_mintxg =
    631 			    MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
    632 	}
    633 
    634 	if (zb->zb_level == -1) {
    635 		ASSERT(zb->zb_object == 0);
    636 		ASSERT(zb->zb_blkid == 0);
    637 		ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
    638 
    639 		if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
    640 			rc = traverse_callback(th, zseg, bc);
    641 			if (rc) {
    642 				ASSERT(rc == EINTR);
    643 				return (rc);
    644 			}
    645 			if ((th->th_advance & ADVANCE_ZIL) &&
    646 			    zb->zb_objset != 0)
    647 				traverse_zil(th, bc);
    648 		}
    649 
    650 		return (advance_from_osphys(zseg, th->th_advance));
    651 	}
    652 
    653 	if (zb->zb_object != 0) {
    654 		uint64_t object = zb->zb_object;
    655 
    656 		rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
    657 		    zseg->seg_mintxg, -1, ZB_MDN_CACHE);
    658 
    659 		if (object != zb->zb_object)
    660 			rc = advance_object(zseg, object, th->th_advance);
    661 
    662 		if (rc != 0)
    663 			return (rc);
    664 
    665 		dn = dn_tmp;
    666 	}
    667 
    668 	if (zb->zb_level == ZB_MAXLEVEL)
    669 		zb->zb_level = dn->dn_nlevels - 1;
    670 
    671 	for (;;) {
    672 		rc = find_block(th, zseg, dn, ZB_DN_CACHE);
    673 
    674 		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
    675 			break;
    676 
    677 		if (rc == 0) {
    678 			bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
    679 			ASSERT(bc->bc_dnode == dn);
    680 			ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
    681 			rc = traverse_callback(th, zseg, bc);
    682 			if (rc) {
    683 				ASSERT(rc == EINTR);
    684 				return (rc);
    685 			}
    686 			if (BP_IS_HOLE(&bc->bc_blkptr)) {
    687 				ASSERT(th->th_advance & ADVANCE_HOLES);
    688 				rc = ENOTBLK;
    689 			}
    690 		}
    691 
    692 		rc = advance_block(zseg, dn, rc, th->th_advance);
    693 
    694 		if (rc == ERANGE)
    695 			break;
    696 
    697 		/*
    698 		 * Give spa_sync() a chance to run.
    699 		 */
    700 		if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
    701 			th->th_syncs++;
    702 			return (EAGAIN);
    703 		}
    704 
    705 		if (--worklimit == 0)
    706 			return (EAGAIN);
    707 	}
    708 
    709 	if (rc == ERANGE)
    710 		rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
    711 
    712 	return (rc);
    713 }
    714 
    715 /*
    716  * It is the caller's responsibility to ensure that the dsl_dataset_t
    717  * doesn't go away during traversal.
    718  */
    719 int
    720 traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
    721     blkptr_cb_t func, void *arg)
    722 {
    723 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
    724 	traverse_handle_t *th;
    725 	int err;
    726 
    727 	th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
    728 
    729 	traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
    730 
    731 	while ((err = traverse_more(th)) == EAGAIN)
    732 		continue;
    733 
    734 	traverse_fini(th);
    735 	return (err);
    736 }
    737 
    738 int
    739 traverse_zvol(objset_t *os, int advance,  blkptr_cb_t func, void *arg)
    740 {
    741 	spa_t *spa = dmu_objset_spa(os);
    742 	traverse_handle_t *th;
    743 	int err;
    744 
    745 	th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL);
    746 
    747 	traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ);
    748 
    749 	while ((err = traverse_more(th)) == EAGAIN)
    750 		continue;
    751 
    752 	traverse_fini(th);
    753 	return (err);
    754 }
    755 
    756 int
    757 traverse_more(traverse_handle_t *th)
    758 {
    759 	zseg_t *zseg = list_head(&th->th_seglist);
    760 	uint64_t save_txg;	/* XXX won't be necessary with real itinerary */
    761 	krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
    762 	blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
    763 	int rc;
    764 
    765 	if (zseg == NULL)
    766 		return (0);
    767 
    768 	th->th_restarts++;
    769 
    770 	save_txg = zseg->seg_mintxg;
    771 
    772 	rw_enter(rw, RW_READER);
    773 	th->th_locked = 1;
    774 
    775 	rc = traverse_segment(th, zseg, mosbp);
    776 	ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
    777 
    778 	if (th->th_locked)
    779 		rw_exit(rw);
    780 	th->th_locked = 0;
    781 
    782 	zseg->seg_mintxg = save_txg;
    783 
    784 	if (rc == ERANGE) {
    785 		list_remove(&th->th_seglist, zseg);
    786 		kmem_free(zseg, sizeof (*zseg));
    787 		return (EAGAIN);
    788 	}
    789 
    790 	return (rc);
    791 }
    792 
    793 /*
    794  * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
    795  * are not included.  The blocks covered by this segment will all have
    796  * mintxg < birth < maxtxg.
    797  */
    798 static void
    799 traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    800     uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
    801     uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
    802 {
    803 	zseg_t *zseg;
    804 
    805 	zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
    806 
    807 	zseg->seg_mintxg = mintxg;
    808 	zseg->seg_maxtxg = maxtxg;
    809 
    810 	zseg->seg_start.zb_objset = sobjset;
    811 	zseg->seg_start.zb_object = sobject;
    812 	zseg->seg_start.zb_level = slevel;
    813 	zseg->seg_start.zb_blkid = sblkid;
    814 
    815 	zseg->seg_end.zb_objset = eobjset;
    816 	zseg->seg_end.zb_object = eobject;
    817 	zseg->seg_end.zb_level = elevel;
    818 	zseg->seg_end.zb_blkid = eblkid;
    819 
    820 	list_insert_tail(&th->th_seglist, zseg);
    821 }
    822 
    823 void
    824 traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    825     uint64_t objset, uint64_t object)
    826 {
    827 	if (th->th_advance & ADVANCE_PRE)
    828 		traverse_add_segment(th, mintxg, maxtxg,
    829 		    objset, object, ZB_MAXLEVEL, 0,
    830 		    objset, object, 0, ZB_MAXBLKID);
    831 	else
    832 		traverse_add_segment(th, mintxg, maxtxg,
    833 		    objset, object, 0, 0,
    834 		    objset, object, 0, ZB_MAXBLKID);
    835 }
    836 
    837 void
    838 traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    839     uint64_t objset)
    840 {
    841 	if (th->th_advance & ADVANCE_PRE)
    842 		traverse_add_segment(th, mintxg, maxtxg,
    843 		    objset, 0, -1, 0,
    844 		    objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
    845 	else
    846 		traverse_add_segment(th, mintxg, maxtxg,
    847 		    objset, 1, 0, 0,
    848 		    objset, 0, -1, 0);
    849 }
    850 
    851 void
    852 traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
    853 {
    854 	if (th->th_advance & ADVANCE_PRE)
    855 		traverse_add_segment(th, mintxg, maxtxg,
    856 		    0, 0, -1, 0,
    857 		    ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
    858 	else
    859 		traverse_add_segment(th, mintxg, maxtxg,
    860 		    1, 1, 0, 0,
    861 		    0, 0, -1, 0);
    862 }
    863 
    864 traverse_handle_t *
    865 traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
    866     int zio_flags)
    867 {
    868 	traverse_handle_t *th;
    869 	int d, l;
    870 
    871 	th = kmem_zalloc(sizeof (*th), KM_SLEEP);
    872 
    873 	th->th_spa = spa;
    874 	th->th_func = func;
    875 	th->th_arg = arg;
    876 	th->th_advance = advance;
    877 	th->th_lastcb.zb_level = ZB_NO_LEVEL;
    878 	th->th_noread.zb_level = ZB_NO_LEVEL;
    879 	th->th_zio_flags = zio_flags;
    880 
    881 	list_create(&th->th_seglist, sizeof (zseg_t),
    882 	    offsetof(zseg_t, seg_node));
    883 
    884 	for (d = 0; d < ZB_DEPTH; d++) {
    885 		for (l = 0; l < ZB_MAXLEVEL; l++) {
    886 			if ((advance & ADVANCE_DATA) ||
    887 			    l != 0 || d != ZB_DN_CACHE)
    888 				th->th_cache[d][l].bc_data =
    889 				    zio_buf_alloc(SPA_MAXBLOCKSIZE);
    890 		}
    891 	}
    892 
    893 	return (th);
    894 }
    895 
    896 void
    897 traverse_fini(traverse_handle_t *th)
    898 {
    899 	int d, l;
    900 	zseg_t *zseg;
    901 
    902 	for (d = 0; d < ZB_DEPTH; d++)
    903 		for (l = 0; l < ZB_MAXLEVEL; l++)
    904 			if (th->th_cache[d][l].bc_data != NULL)
    905 				zio_buf_free(th->th_cache[d][l].bc_data,
    906 				    SPA_MAXBLOCKSIZE);
    907 
    908 	while ((zseg = list_head(&th->th_seglist)) != NULL) {
    909 		list_remove(&th->th_seglist, zseg);
    910 		kmem_free(zseg, sizeof (*zseg));
    911 	}
    912 
    913 	list_destroy(&th->th_seglist);
    914 
    915 	dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
    916 	    th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
    917 	    th->th_syncs, th->th_restarts);
    918 
    919 	kmem_free(th, sizeof (*th));
    920 }
    921