Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/dsl_pool.h>
     27 #include <sys/dsl_dataset.h>
     28 #include <sys/dsl_prop.h>
     29 #include <sys/dsl_dir.h>
     30 #include <sys/dsl_synctask.h>
     31 #include <sys/dnode.h>
     32 #include <sys/dmu_tx.h>
     33 #include <sys/dmu_objset.h>
     34 #include <sys/arc.h>
     35 #include <sys/zap.h>
     36 #include <sys/zio.h>
     37 #include <sys/zfs_context.h>
     38 #include <sys/fs/zfs.h>
     39 #include <sys/zfs_znode.h>
     40 #include <sys/spa_impl.h>
     41 #include <sys/vdev_impl.h>
     42 #include <sys/zil_impl.h>
     43 #include <sys/zio_checksum.h>
     44 #include <sys/ddt.h>
     45 
/*
 * Signature shared by every scrub callback: it receives the pool, the
 * block pointer being visited and the bookmark that identifies the block.
 */
typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);

/* Forward declarations for functions that are used before they are defined. */
static scrub_cb_t dsl_pool_scrub_clean_cb;
static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
    uint64_t objset, uint64_t object);

/* Tunables read by scrub_pause() to decide how long to work per txg. */
int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */

/* Defined elsewhere; scrub_pause() uses it as the upper bound on work time. */
extern int zfs_txg_timeout;

/*
 * Callback table indexed by dp->dp_scrub_func.  The first slot is NULL
 * (presumably the SCRUB_FUNC_NONE slot -- confirm against enum scrub_func);
 * the second slot is the scrub-clean callback.
 */
static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
	NULL,
	dsl_pool_scrub_clean_cb
};
     63 
     64 /* ARGSUSED */
     65 static void
     66 dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
     67 {
     68 	dsl_pool_t *dp = arg1;
     69 	enum scrub_func *funcp = arg2;
     70 	dmu_object_type_t ot = 0;
     71 	boolean_t complete = B_FALSE;
     72 
     73 	dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
     74 
     75 	ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
     76 	ASSERT(*funcp > SCRUB_FUNC_NONE);
     77 	ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
     78 
     79 	dp->dp_scrub_min_txg = 0;
     80 	dp->dp_scrub_max_txg = tx->tx_txg;
     81 
     82 	if (*funcp == SCRUB_FUNC_CLEAN) {
     83 		vdev_t *rvd = dp->dp_spa->spa_root_vdev;
     84 
     85 		/* rewrite all disk labels */
     86 		vdev_config_dirty(rvd);
     87 
     88 		if (vdev_resilver_needed(rvd,
     89 		    &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
     90 			spa_event_notify(dp->dp_spa, NULL,
     91 			    ESC_ZFS_RESILVER_START);
     92 			dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
     93 			    tx->tx_txg);
     94 		} else {
     95 			spa_event_notify(dp->dp_spa, NULL,
     96 			    ESC_ZFS_SCRUB_START);
     97 		}
     98 
     99 		/* zero out the scrub stats in all vdev_stat_t's */
    100 		vdev_scrub_stat_update(rvd,
    101 		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
    102 		    POOL_SCRUB_EVERYTHING, B_FALSE);
    103 
    104 		dp->dp_spa->spa_scrub_started = B_TRUE;
    105 	}
    106 
    107 	/* back to the generic stuff */
    108 
    109 	if (dp->dp_blkstats == NULL) {
    110 		dp->dp_blkstats =
    111 		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
    112 	}
    113 	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
    114 
    115 	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
    116 		ot = DMU_OT_ZAP_OTHER;
    117 
    118 	dp->dp_scrub_func = *funcp;
    119 	dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
    120 	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
    121 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
    122 	dp->dp_scrub_restart = B_FALSE;
    123 	dp->dp_scrub_ditto = B_FALSE;
    124 	dp->dp_spa->spa_scrub_errors = 0;
    125 
    126 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    127 	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
    128 	    &dp->dp_scrub_func, tx));
    129 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    130 	    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
    131 	    &dp->dp_scrub_queue_obj, tx));
    132 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    133 	    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
    134 	    &dp->dp_scrub_min_txg, tx));
    135 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    136 	    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
    137 	    &dp->dp_scrub_max_txg, tx));
    138 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    139 	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
    140 	    &dp->dp_scrub_bookmark, tx));
    141 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    142 	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
    143 	    &dp->dp_spa->spa_scrub_errors, tx));
    144 
    145 	spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
    146 	    "func=%u mintxg=%llu maxtxg=%llu",
    147 	    *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
    148 }
    149 
    150 int
    151 dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
    152 {
    153 	return (dsl_sync_task_do(dp, NULL,
    154 	    dsl_pool_scrub_setup_sync, dp, &func, 0));
    155 }
    156 
    157 /* ARGSUSED */
    158 static void
    159 dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    160 {
    161 	dsl_pool_t *dp = arg1;
    162 	boolean_t *completep = arg2;
    163 
    164 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    165 		return;
    166 
    167 	mutex_enter(&dp->dp_scrub_cancel_lock);
    168 
    169 	if (dp->dp_scrub_restart) {
    170 		dp->dp_scrub_restart = B_FALSE;
    171 		*completep = B_FALSE;
    172 	}
    173 
    174 	/* XXX this is scrub-clean specific */
    175 	mutex_enter(&dp->dp_spa->spa_scrub_lock);
    176 	while (dp->dp_spa->spa_scrub_inflight > 0) {
    177 		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
    178 		    &dp->dp_spa->spa_scrub_lock);
    179 	}
    180 	mutex_exit(&dp->dp_spa->spa_scrub_lock);
    181 	dp->dp_spa->spa_scrub_started = B_FALSE;
    182 	dp->dp_spa->spa_scrub_active = B_FALSE;
    183 
    184 	dp->dp_scrub_func = SCRUB_FUNC_NONE;
    185 	VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
    186 	    dp->dp_scrub_queue_obj, tx));
    187 	dp->dp_scrub_queue_obj = 0;
    188 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
    189 
    190 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    191 	    DMU_POOL_SCRUB_QUEUE, tx));
    192 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    193 	    DMU_POOL_SCRUB_MIN_TXG, tx));
    194 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    195 	    DMU_POOL_SCRUB_MAX_TXG, tx));
    196 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    197 	    DMU_POOL_SCRUB_BOOKMARK, tx));
    198 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    199 	    DMU_POOL_SCRUB_FUNC, tx));
    200 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    201 	    DMU_POOL_SCRUB_ERRORS, tx));
    202 
    203 	spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
    204 	    "complete=%u", *completep);
    205 
    206 	/* below is scrub-clean specific */
    207 	vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
    208 	    *completep);
    209 	/*
    210 	 * If the scrub/resilver completed, update all DTLs to reflect this.
    211 	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
    212 	 */
    213 	vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
    214 	    *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
    215 	if (*completep)
    216 		spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
    217 		    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
    218 	spa_errlog_rotate(dp->dp_spa);
    219 
    220 	/*
    221 	 * We may have finished replacing a device.
    222 	 * Let the async thread assess this and handle the detach.
    223 	 */
    224 	spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
    225 
    226 	dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
    227 	mutex_exit(&dp->dp_scrub_cancel_lock);
    228 }
    229 
    230 int
    231 dsl_pool_scrub_cancel(dsl_pool_t *dp)
    232 {
    233 	boolean_t complete = B_FALSE;
    234 
    235 	return (dsl_sync_task_do(dp, NULL,
    236 	    dsl_pool_scrub_cancel_sync, dp, &complete, 3));
    237 }
    238 
    239 void
    240 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
    241 {
    242 	/*
    243 	 * This function will be used by bp-rewrite wad to intercept frees.
    244 	 */
    245 	zio_free(dp->dp_spa, txg, bpp);
    246 }
    247 
    248 static boolean_t
    249 bookmark_is_zero(const zbookmark_t *zb)
    250 {
    251 	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
    252 	    zb->zb_level == 0 && zb->zb_blkid == 0);
    253 }
    254 
    255 /* dnp is the dnode for zb1->zb_object */
    256 static boolean_t
    257 bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
    258     const zbookmark_t *zb2)
    259 {
    260 	uint64_t zb1nextL0, zb2thisobj;
    261 
    262 	ASSERT(zb1->zb_objset == zb2->zb_objset);
    263 	ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT);
    264 	ASSERT(zb2->zb_level == 0);
    265 
    266 	/*
    267 	 * A bookmark in the deadlist is considered to be after
    268 	 * everything else.
    269 	 */
    270 	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
    271 		return (B_TRUE);
    272 
    273 	/* The objset_phys_t isn't before anything. */
    274 	if (dnp == NULL)
    275 		return (B_FALSE);
    276 
    277 	zb1nextL0 = (zb1->zb_blkid + 1) <<
    278 	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
    279 
    280 	zb2thisobj = zb2->zb_object ? zb2->zb_object :
    281 	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
    282 
    283 	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
    284 		uint64_t nextobj = zb1nextL0 *
    285 		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
    286 		return (nextobj <= zb2thisobj);
    287 	}
    288 
    289 	if (zb1->zb_object < zb2thisobj)
    290 		return (B_TRUE);
    291 	if (zb1->zb_object > zb2thisobj)
    292 		return (B_FALSE);
    293 	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
    294 		return (B_FALSE);
    295 	return (zb1nextL0 <= zb2->zb_blkid);
    296 }
    297 
    298 static boolean_t
    299 scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
    300 {
    301 	int elapsed_ticks;
    302 	int mintime;
    303 
    304 	if (dp->dp_scrub_pausing)
    305 		return (B_TRUE); /* we're already pausing */
    306 
    307 	if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
    308 		return (B_FALSE); /* we're resuming */
    309 
    310 	/* We only know how to resume from level-0 blocks. */
    311 	if (zb->zb_level != 0)
    312 		return (B_FALSE);
    313 
    314 	mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time :
    315 	    zfs_scrub_min_time;
    316 	elapsed_ticks = lbolt64 - dp->dp_scrub_start_time;
    317 	if (elapsed_ticks > hz * zfs_txg_timeout ||
    318 	    (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) {
    319 		dprintf("pausing at %llx/%llx/%llx/%llx\n",
    320 		    (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object,
    321 		    (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid);
    322 		dp->dp_scrub_pausing = B_TRUE;
    323 		dp->dp_scrub_bookmark = *zb;
    324 		return (B_TRUE);
    325 	}
    326 	return (B_FALSE);
    327 }
    328 
/*
 * Argument bundle handed to the zil_parse() callbacks
 * (traverse_zil_block() and traverse_zil_record()).  traverse_zil()
 * fills it with a positional initializer, so member order matters.
 */
typedef struct zil_traverse_arg {
	dsl_pool_t	*zta_dp;	/* pool being scrubbed */
	zil_header_t	*zta_zh;	/* header of the ZIL being parsed */
} zil_traverse_arg_t;
    333 
    334 /* ARGSUSED */
    335 static int
    336 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
    337 {
    338 	zil_traverse_arg_t *zta = arg;
    339 	dsl_pool_t *dp = zta->zta_dp;
    340 	zil_header_t *zh = zta->zta_zh;
    341 	zbookmark_t zb;
    342 
    343 	if (bp->blk_birth <= dp->dp_scrub_min_txg)
    344 		return (0);
    345 
    346 	/*
    347 	 * One block ("stubby") can be allocated a long time ago; we
    348 	 * want to visit that one because it has been allocated
    349 	 * (on-disk) even if it hasn't been claimed (even though for
    350 	 * plain scrub there's nothing to do to it).
    351 	 */
    352 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
    353 		return (0);
    354 
    355 	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
    356 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
    357 
    358 	VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
    359 	return (0);
    360 }
    361 
    362 /* ARGSUSED */
    363 static int
    364 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
    365 {
    366 	if (lrc->lrc_txtype == TX_WRITE) {
    367 		zil_traverse_arg_t *zta = arg;
    368 		dsl_pool_t *dp = zta->zta_dp;
    369 		zil_header_t *zh = zta->zta_zh;
    370 		lr_write_t *lr = (lr_write_t *)lrc;
    371 		blkptr_t *bp = &lr->lr_blkptr;
    372 		zbookmark_t zb;
    373 
    374 		if (bp->blk_birth <= dp->dp_scrub_min_txg)
    375 			return (0);
    376 
    377 		/*
    378 		 * birth can be < claim_txg if this record's txg is
    379 		 * already txg sync'ed (but this log block contains
    380 		 * other records that are not synced)
    381 		 */
    382 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
    383 			return (0);
    384 
    385 		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
    386 		    lr->lr_foid, ZB_ZIL_LEVEL,
    387 		    lr->lr_offset / BP_GET_LSIZE(bp));
    388 
    389 		VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
    390 	}
    391 	return (0);
    392 }
    393 
    394 static void
    395 traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
    396 {
    397 	uint64_t claim_txg = zh->zh_claim_txg;
    398 	zil_traverse_arg_t zta = { dp, zh };
    399 	zilog_t *zilog;
    400 
    401 	/*
    402 	 * We only want to visit blocks that have been claimed but not yet
    403 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
    404 	 */
    405 	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
    406 		return;
    407 
    408 	zilog = zil_alloc(dp->dp_meta_objset, zh);
    409 
    410 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
    411 	    claim_txg);
    412 
    413 	zil_free(zilog);
    414 }
    415 
    416 static void
    417 scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
    418     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
    419 {
    420 	int err;
    421 	arc_buf_t *buf = NULL;
    422 
    423 	if (bp->blk_birth <= dp->dp_scrub_min_txg)
    424 		return;
    425 
    426 	if (scrub_pause(dp, zb))
    427 		return;
    428 
    429 	if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
    430 		/*
    431 		 * If we already visited this bp & everything below (in
    432 		 * a prior txg), don't bother doing it again.
    433 		 */
    434 		if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
    435 			return;
    436 
    437 		/*
    438 		 * If we found the block we're trying to resume from, or
    439 		 * we went past it to a different object, zero it out to
    440 		 * indicate that it's OK to start checking for pausing
    441 		 * again.
    442 		 */
    443 		if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
    444 		    zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
    445 			dprintf("resuming at %llx/%llx/%llx/%llx\n",
    446 			    (longlong_t)zb->zb_objset,
    447 			    (longlong_t)zb->zb_object,
    448 			    (longlong_t)zb->zb_level,
    449 			    (longlong_t)zb->zb_blkid);
    450 			bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
    451 		}
    452 	}
    453 
    454 	if (BP_GET_LEVEL(bp) > 0) {
    455 		uint32_t flags = ARC_WAIT;
    456 		int i;
    457 		blkptr_t *cbp;
    458 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
    459 
    460 		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
    461 		    arc_getbuf_func, &buf,
    462 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    463 		if (err) {
    464 			mutex_enter(&dp->dp_spa->spa_scrub_lock);
    465 			dp->dp_spa->spa_scrub_errors++;
    466 			mutex_exit(&dp->dp_spa->spa_scrub_lock);
    467 			return;
    468 		}
    469 		cbp = buf->b_data;
    470 
    471 		for (i = 0; i < epb; i++, cbp++) {
    472 			zbookmark_t czb;
    473 
    474 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
    475 			    zb->zb_level - 1,
    476 			    zb->zb_blkid * epb + i);
    477 			scrub_visitbp(dp, dnp, buf, cbp, &czb);
    478 		}
    479 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
    480 		uint32_t flags = ARC_WAIT;
    481 		dnode_phys_t *child_dnp;
    482 		int i;
    483 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
    484 
    485 		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
    486 		    arc_getbuf_func, &buf,
    487 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    488 		if (err) {
    489 			mutex_enter(&dp->dp_spa->spa_scrub_lock);
    490 			dp->dp_spa->spa_scrub_errors++;
    491 			mutex_exit(&dp->dp_spa->spa_scrub_lock);
    492 			return;
    493 		}
    494 		child_dnp = buf->b_data;
    495 
    496 		for (i = 0; i < epb; i++, child_dnp++) {
    497 			scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset,
    498 			    zb->zb_blkid * epb + i);
    499 		}
    500 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
    501 		uint32_t flags = ARC_WAIT;
    502 		objset_phys_t *osp;
    503 
    504 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
    505 		    arc_getbuf_func, &buf,
    506 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    507 		if (err) {
    508 			mutex_enter(&dp->dp_spa->spa_scrub_lock);
    509 			dp->dp_spa->spa_scrub_errors++;
    510 			mutex_exit(&dp->dp_spa->spa_scrub_lock);
    511 			return;
    512 		}
    513 
    514 		osp = buf->b_data;
    515 
    516 		traverse_zil(dp, &osp->os_zil_header);
    517 
    518 		scrub_visitdnode(dp, &osp->os_meta_dnode,
    519 		    buf, zb->zb_objset, DMU_META_DNODE_OBJECT);
    520 		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
    521 			scrub_visitdnode(dp, &osp->os_userused_dnode,
    522 			    buf, zb->zb_objset, DMU_USERUSED_OBJECT);
    523 			scrub_visitdnode(dp, &osp->os_groupused_dnode,
    524 			    buf, zb->zb_objset, DMU_GROUPUSED_OBJECT);
    525 		}
    526 	}
    527 
    528 	(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
    529 	if (buf)
    530 		(void) arc_buf_remove_ref(buf, &buf);
    531 }
    532 
    533 static void
    534 scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
    535     uint64_t objset, uint64_t object)
    536 {
    537 	int j;
    538 
    539 	for (j = 0; j < dnp->dn_nblkptr; j++) {
    540 		zbookmark_t czb;
    541 
    542 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
    543 		scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
    544 	}
    545 
    546 }
    547 
    548 static void
    549 scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
    550 {
    551 	zbookmark_t zb;
    552 
    553 	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
    554 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
    555 	scrub_visitbp(dp, NULL, NULL, bp, &zb);
    556 }
    557 
    558 void
    559 dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
    560 {
    561 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
    562 
    563 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    564 		return;
    565 
    566 	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
    567 		SET_BOOKMARK(&dp->dp_scrub_bookmark, ZB_DESTROYED_OBJSET,
    568 		    0, 0, 0);
    569 	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    570 	    ds->ds_object, tx) != 0) {
    571 		return;
    572 	}
    573 
    574 	if (ds->ds_phys->ds_next_snap_obj != 0) {
    575 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    576 		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
    577 	}
    578 	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
    579 }
    580 
    581 void
    582 dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
    583 {
    584 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
    585 
    586 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    587 		return;
    588 
    589 	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
    590 
    591 	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
    592 		dp->dp_scrub_bookmark.zb_objset =
    593 		    ds->ds_phys->ds_prev_snap_obj;
    594 	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    595 	    ds->ds_object, tx) == 0) {
    596 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    597 		    ds->ds_phys->ds_prev_snap_obj, tx) == 0);
    598 	}
    599 }
    600 
    601 void
    602 dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
    603 {
    604 	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
    605 
    606 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    607 		return;
    608 
    609 	if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
    610 		dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
    611 	} else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
    612 		dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
    613 	}
    614 
    615 	if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    616 	    ds1->ds_object, tx) == 0) {
    617 		int err = zap_add_int(dp->dp_meta_objset,
    618 		    dp->dp_scrub_queue_obj, ds2->ds_object, tx);
    619 		VERIFY(err == 0 || err == EEXIST);
    620 		if (err == EEXIST) {
    621 			/* Both were there to begin with */
    622 			VERIFY(0 == zap_add_int(dp->dp_meta_objset,
    623 			    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
    624 		}
    625 	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    626 	    ds2->ds_object, tx) == 0) {
    627 		VERIFY(0 == zap_add_int(dp->dp_meta_objset,
    628 		    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
    629 	}
    630 }
    631 
/*
 * Argument passed to enqueue_clones_cb() through dmu_objset_find_spa().
 */
struct enqueue_clones_arg {
	dmu_tx_t *tx;		/* transaction used for the queue update */
	uint64_t originobj;	/* object number of the origin dataset */
};
    636 
    637 /* ARGSUSED */
    638 static int
    639 enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
    640 {
    641 	struct enqueue_clones_arg *eca = arg;
    642 	dsl_dataset_t *ds;
    643 	int err;
    644 	dsl_pool_t *dp;
    645 
    646 	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
    647 	if (err)
    648 		return (err);
    649 	dp = ds->ds_dir->dd_pool;
    650 
    651 	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
    652 		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
    653 			dsl_dataset_t *prev;
    654 			err = dsl_dataset_hold_obj(dp,
    655 			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
    656 
    657 			dsl_dataset_rele(ds, FTAG);
    658 			if (err)
    659 				return (err);
    660 			ds = prev;
    661 		}
    662 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    663 		    ds->ds_object, eca->tx) == 0);
    664 	}
    665 	dsl_dataset_rele(ds, FTAG);
    666 	return (0);
    667 }
    668 
    669 static void
    670 scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
    671 {
    672 	dsl_dataset_t *ds;
    673 	uint64_t min_txg_save;
    674 
    675 	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
    676 
    677 	/*
    678 	 * Iterate over the bps in this ds.
    679 	 */
    680 	min_txg_save = dp->dp_scrub_min_txg;
    681 	dp->dp_scrub_min_txg =
    682 	    MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
    683 	scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
    684 	dp->dp_scrub_min_txg = min_txg_save;
    685 
    686 	if (dp->dp_scrub_pausing)
    687 		goto out;
    688 
    689 	/*
    690 	 * Add descendent datasets to work queue.
    691 	 */
    692 	if (ds->ds_phys->ds_next_snap_obj != 0) {
    693 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    694 		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
    695 	}
    696 	if (ds->ds_phys->ds_num_children > 1) {
    697 		boolean_t usenext = B_FALSE;
    698 		if (ds->ds_phys->ds_next_clones_obj != 0) {
    699 			uint64_t count;
    700 			/*
    701 			 * A bug in a previous version of the code could
    702 			 * cause upgrade_clones_cb() to not set
    703 			 * ds_next_snap_obj when it should, leading to a
    704 			 * missing entry.  Therefore we can only use the
    705 			 * next_clones_obj when its count is correct.
    706 			 */
    707 			int err = zap_count(dp->dp_meta_objset,
    708 			    ds->ds_phys->ds_next_clones_obj, &count);
    709 			if (err == 0 &&
    710 			    count == ds->ds_phys->ds_num_children - 1)
    711 				usenext = B_TRUE;
    712 		}
    713 
    714 		if (usenext) {
    715 			VERIFY(zap_join(dp->dp_meta_objset,
    716 			    ds->ds_phys->ds_next_clones_obj,
    717 			    dp->dp_scrub_queue_obj, tx) == 0);
    718 		} else {
    719 			struct enqueue_clones_arg eca;
    720 			eca.tx = tx;
    721 			eca.originobj = ds->ds_object;
    722 
    723 			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
    724 			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
    725 		}
    726 	}
    727 
    728 out:
    729 	dsl_dataset_rele(ds, FTAG);
    730 }
    731 
    732 /* ARGSUSED */
    733 static int
    734 enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
    735 {
    736 	dmu_tx_t *tx = arg;
    737 	dsl_dataset_t *ds;
    738 	int err;
    739 	dsl_pool_t *dp;
    740 
    741 	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
    742 	if (err)
    743 		return (err);
    744 
    745 	dp = ds->ds_dir->dd_pool;
    746 
    747 	while (ds->ds_phys->ds_prev_snap_obj != 0) {
    748 		dsl_dataset_t *prev;
    749 		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
    750 		    FTAG, &prev);
    751 		if (err) {
    752 			dsl_dataset_rele(ds, FTAG);
    753 			return (err);
    754 		}
    755 
    756 		/*
    757 		 * If this is a clone, we don't need to worry about it for now.
    758 		 */
    759 		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
    760 			dsl_dataset_rele(ds, FTAG);
    761 			dsl_dataset_rele(prev, FTAG);
    762 			return (0);
    763 		}
    764 		dsl_dataset_rele(ds, FTAG);
    765 		ds = prev;
    766 	}
    767 
    768 	VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    769 	    ds->ds_object, tx) == 0);
    770 	dsl_dataset_rele(ds, FTAG);
    771 	return (0);
    772 }
    773 
    774 static void
    775 dsl_pool_scrub_ddt(dsl_pool_t *dp, enum zio_checksum c, enum ddt_type type,
    776     enum ddt_class class)
    777 {
    778 	ddt_t *ddt = ddt_select_by_checksum(dp->dp_spa, c);
    779 	ddt_entry_t dde;
    780 	blkptr_t blk;
    781 	zbookmark_t zb = { 0 };
    782 	uint64_t walk = 0;
    783 	int error;
    784 
    785 	if (!ddt_object_exists(ddt, type, class))
    786 		return;
    787 
    788 	while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) {
    789 		int p = DDT_PHYS_DITTO;
    790 		ddt_bp_create(ddt, &dde.dde_key, &dde.dde_phys[p], &blk);
    791 		scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
    792 	}
    793 	ASSERT(error == ENOENT);
    794 }
    795 
    796 static void
    797 dsl_pool_scrub_ditto(dsl_pool_t *dp)
    798 {
    799 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
    800 		for (enum ddt_type type = 0; type < DDT_TYPES; type++)
    801 			dsl_pool_scrub_ddt(dp, c, type, DDT_CLASS_DITTO);
    802 }
    803 
    804 void
    805 dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
    806 {
    807 	spa_t *spa = dp->dp_spa;
    808 	zap_cursor_t zc;
    809 	zap_attribute_t za;
    810 	boolean_t complete = B_TRUE;
    811 
    812 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    813 		return;
    814 
    815 	/*
    816 	 * If the pool is not loaded, or is trying to unload, leave it alone.
    817 	 */
    818 	if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
    819 		return;
    820 
    821 	if (dp->dp_scrub_restart) {
    822 		enum scrub_func func = dp->dp_scrub_func;
    823 		dp->dp_scrub_restart = B_FALSE;
    824 		dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
    825 	}
    826 
    827 	if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
    828 		/*
    829 		 * We must have resumed after rebooting; reset the vdev
    830 		 * stats to know that we're doing a scrub (although it
    831 		 * will think we're just starting now).
    832 		 */
    833 		vdev_scrub_stat_update(spa->spa_root_vdev,
    834 		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
    835 		    POOL_SCRUB_EVERYTHING, B_FALSE);
    836 	}
    837 
    838 	dp->dp_scrub_pausing = B_FALSE;
    839 	dp->dp_scrub_start_time = lbolt64;
    840 	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
    841 	spa->spa_scrub_active = B_TRUE;
    842 
    843 	if (!dp->dp_scrub_ditto) {
    844 		dsl_pool_scrub_ditto(dp);
    845 		dp->dp_scrub_ditto = B_TRUE;
    846 	}
    847 
    848 	if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
    849 		/* First do the MOS & ORIGIN */
    850 		scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
    851 		if (dp->dp_scrub_pausing)
    852 			goto out;
    853 
    854 		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
    855 			VERIFY(0 == dmu_objset_find_spa(spa,
    856 			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
    857 		} else {
    858 			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
    859 		}
    860 		ASSERT(!dp->dp_scrub_pausing);
    861 	} else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) {
    862 		/*
    863 		 * If we were paused, continue from here.  Note if the ds
    864 		 * we were paused on was destroyed, the zb_objset will be
    865 		 * ZB_DESTROYED_OBJSET, so we will skip this and find a new
    866 		 * objset below.
    867 		 */
    868 		scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
    869 		if (dp->dp_scrub_pausing)
    870 			goto out;
    871 	}
    872 
    873 	/*
    874 	 * In case we were paused right at the end of the ds, zero the
    875 	 * bookmark so we don't think that we're still trying to resume.
    876 	 */
    877 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
    878 
    879 	/* keep pulling things out of the zap-object-as-queue */
    880 	while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
    881 	    zap_cursor_retrieve(&zc, &za) == 0) {
    882 		VERIFY(0 == zap_remove(dp->dp_meta_objset,
    883 		    dp->dp_scrub_queue_obj, za.za_name, tx));
    884 		scrub_visitds(dp, za.za_first_integer, tx);
    885 		if (dp->dp_scrub_pausing)
    886 			break;
    887 		zap_cursor_fini(&zc);
    888 	}
    889 	zap_cursor_fini(&zc);
    890 	if (dp->dp_scrub_pausing)
    891 		goto out;
    892 
    893 	/* done. */
    894 
    895 	dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
    896 	return;
    897 out:
    898 	VERIFY(0 == zap_update(dp->dp_meta_objset,
    899 	    DMU_POOL_DIRECTORY_OBJECT,
    900 	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
    901 	    &dp->dp_scrub_bookmark, tx));
    902 	VERIFY(0 == zap_update(dp->dp_meta_objset,
    903 	    DMU_POOL_DIRECTORY_OBJECT,
    904 	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
    905 	    &spa->spa_scrub_errors, tx));
    906 
    907 	/* XXX this is scrub-clean specific */
    908 	mutex_enter(&spa->spa_scrub_lock);
    909 	while (spa->spa_scrub_inflight > 0)
    910 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
    911 	mutex_exit(&spa->spa_scrub_lock);
    912 }
    913 
    914 void
    915 dsl_pool_scrub_restart(dsl_pool_t *dp)
    916 {
    917 	mutex_enter(&dp->dp_scrub_cancel_lock);
    918 	dp->dp_scrub_restart = B_TRUE;
    919 	mutex_exit(&dp->dp_scrub_cancel_lock);
    920 }
    921 
    922 /*
    923  * scrub consumers
    924  */
    925 
    926 static void
    927 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
    928 {
    929 	int i;
    930 
    931 	/*
    932 	 * If we resume after a reboot, zab will be NULL; don't record
    933 	 * incomplete stats in that case.
    934 	 */
    935 	if (zab == NULL)
    936 		return;
    937 
    938 	for (i = 0; i < 4; i++) {
    939 		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
    940 		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
    941 		zfs_blkstat_t *zb = &zab->zab_type[l][t];
    942 		int equal;
    943 
    944 		zb->zb_count++;
    945 		zb->zb_asize += BP_GET_ASIZE(bp);
    946 		zb->zb_lsize += BP_GET_LSIZE(bp);
    947 		zb->zb_psize += BP_GET_PSIZE(bp);
    948 		zb->zb_gangs += BP_COUNT_GANG(bp);
    949 
    950 		switch (BP_GET_NDVAS(bp)) {
    951 		case 2:
    952 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
    953 			    DVA_GET_VDEV(&bp->blk_dva[1]))
    954 				zb->zb_ditto_2_of_2_samevdev++;
    955 			break;
    956 		case 3:
    957 			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
    958 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
    959 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
    960 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
    961 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
    962 			    DVA_GET_VDEV(&bp->blk_dva[2]));
    963 			if (equal == 1)
    964 				zb->zb_ditto_2_of_3_samevdev++;
    965 			else if (equal == 3)
    966 				zb->zb_ditto_3_of_3_samevdev++;
    967 			break;
    968 		}
    969 	}
    970 }
    971 
    972 static void
    973 dsl_pool_scrub_clean_done(zio_t *zio)
    974 {
    975 	spa_t *spa = zio->io_spa;
    976 
    977 	zio_data_buf_free(zio->io_data, zio->io_size);
    978 
    979 	mutex_enter(&spa->spa_scrub_lock);
    980 	spa->spa_scrub_inflight--;
    981 	cv_broadcast(&spa->spa_scrub_io_cv);
    982 
    983 	if (zio->io_error && (zio->io_error != ECKSUM ||
    984 	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)))
    985 		spa->spa_scrub_errors++;
    986 	mutex_exit(&spa->spa_scrub_lock);
    987 }
    988 
    989 static int
    990 dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
    991     const blkptr_t *bp, const zbookmark_t *zb)
    992 {
    993 	size_t size = BP_GET_PSIZE(bp);
    994 	spa_t *spa = dp->dp_spa;
    995 	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
    996 	boolean_t needs_io;
    997 	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
    998 	int zio_priority;
    999 
   1000 	if (phys_birth <= dp->dp_scrub_min_txg ||
   1001 	    phys_birth >= dp->dp_scrub_max_txg)
   1002 		return (0);
   1003 
   1004 	count_block(dp->dp_blkstats, bp);
   1005 
   1006 	if (dp->dp_scrub_isresilver == 0) {
   1007 		/* It's a scrub */
   1008 		zio_flags |= ZIO_FLAG_SCRUB;
   1009 		zio_priority = ZIO_PRIORITY_SCRUB;
   1010 		needs_io = B_TRUE;
   1011 	} else {
   1012 		/* It's a resilver */
   1013 		zio_flags |= ZIO_FLAG_RESILVER;
   1014 		zio_priority = ZIO_PRIORITY_RESILVER;
   1015 		needs_io = B_FALSE;
   1016 	}
   1017 
   1018 	/* If it's an intent log block, failure is expected. */
   1019 	if (zb->zb_level == ZB_ZIL_LEVEL)
   1020 		zio_flags |= ZIO_FLAG_SPECULATIVE;
   1021 
   1022 	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
   1023 		vdev_t *vd = vdev_lookup_top(spa,
   1024 		    DVA_GET_VDEV(&bp->blk_dva[d]));
   1025 
   1026 		/*
   1027 		 * Keep track of how much data we've examined so that
   1028 		 * zpool(1M) status can make useful progress reports.
   1029 		 */
   1030 		mutex_enter(&vd->vdev_stat_lock);
   1031 		vd->vdev_stat.vs_scrub_examined +=
   1032 		    DVA_GET_ASIZE(&bp->blk_dva[d]);
   1033 		mutex_exit(&vd->vdev_stat_lock);
   1034 
   1035 		/* if it's a resilver, this may not be in the target range */
   1036 		if (!needs_io) {
   1037 			if (DVA_GET_GANG(&bp->blk_dva[d])) {
   1038 				/*
   1039 				 * Gang members may be spread across multiple
   1040 				 * vdevs, so the best estimate we have is the
   1041 				 * scrub range, which has already been checked.
   1042 				 * XXX -- it would be better to change our
   1043 				 * allocation policy to ensure that all
   1044 				 * gang members reside on the same vdev.
   1045 				 */
   1046 				needs_io = B_TRUE;
   1047 			} else {
   1048 				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
   1049 				    phys_birth, 1);
   1050 			}
   1051 		}
   1052 	}
   1053 
   1054 	if (needs_io && !zfs_no_scrub_io) {
   1055 		void *data = zio_data_buf_alloc(size);
   1056 
   1057 		mutex_enter(&spa->spa_scrub_lock);
   1058 		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
   1059 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
   1060 		spa->spa_scrub_inflight++;
   1061 		mutex_exit(&spa->spa_scrub_lock);
   1062 
   1063 		zio_nowait(zio_read(NULL, spa, bp, data, size,
   1064 		    dsl_pool_scrub_clean_done, NULL, zio_priority,
   1065 		    zio_flags, zb));
   1066 	}
   1067 
   1068 	/* do not relocate this block */
   1069 	return (0);
   1070 }
   1071 
   1072 int
   1073 dsl_pool_scrub_clean(dsl_pool_t *dp)
   1074 {
   1075 	spa_t *spa = dp->dp_spa;
   1076 
   1077 	/*
   1078 	 * Purge all vdev caches and probe all devices.  We do this here
   1079 	 * rather than in sync context because this requires a writer lock
   1080 	 * on the spa_config lock, which we can't do from sync context.  The
   1081 	 * spa_scrub_reopen flag indicates that vdev_open() should not
   1082 	 * attempt to start another scrub.
   1083 	 */
   1084 	spa_vdev_state_enter(spa, SCL_NONE);
   1085 	spa->spa_scrub_reopen = B_TRUE;
   1086 	vdev_reopen(spa->spa_root_vdev);
   1087 	spa->spa_scrub_reopen = B_FALSE;
   1088 	(void) spa_vdev_state_exit(spa, NULL, 0);
   1089 
   1090 	return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
   1091 }
   1092