Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/dsl_pool.h>
     27 #include <sys/dsl_dataset.h>
     28 #include <sys/dsl_prop.h>
     29 #include <sys/dsl_dir.h>
     30 #include <sys/dsl_synctask.h>
     31 #include <sys/dnode.h>
     32 #include <sys/dmu_tx.h>
     33 #include <sys/dmu_objset.h>
     34 #include <sys/arc.h>
     35 #include <sys/zap.h>
     36 #include <sys/zio.h>
     37 #include <sys/zfs_context.h>
     38 #include <sys/fs/zfs.h>
     39 #include <sys/zfs_znode.h>
     40 #include <sys/spa_impl.h>
     41 #include <sys/vdev_impl.h>
     42 #include <sys/zil_impl.h>
     43 #include <sys/zio_checksum.h>
     44 #include <sys/ddt.h>
     45 
/*
 * Signature of a per-block scrub callback: it is handed the pool, the
 * block pointer being visited and the bookmark describing where that
 * block was found.
 */
typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);

/* Forward declarations for functions that are used before they are defined. */
static scrub_cb_t dsl_pool_scrub_clean_cb;
static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
    uint64_t objset, uint64_t object);

/* Tunables. */
int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
/* copied into dp_scrub_ddt_class_max each time a scrub is set up */
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;

/* Defined elsewhere; scrub_pause() compares it with the elapsed seconds. */
extern int zfs_txg_timeout;

/*
 * Callback table indexed by dp->dp_scrub_func.  The first slot is empty;
 * presumably it corresponds to SCRUB_FUNC_NONE (confirm against the
 * declaration of enum scrub_func).
 */
static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
	NULL,
	dsl_pool_scrub_clean_cb
};
     65 
     66 /* ARGSUSED */
     67 static void
     68 dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
     69 {
     70 	dsl_pool_t *dp = arg1;
     71 	enum scrub_func *funcp = arg2;
     72 	dmu_object_type_t ot = 0;
     73 	boolean_t complete = B_FALSE;
     74 
     75 	dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
     76 
     77 	ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
     78 	ASSERT(*funcp > SCRUB_FUNC_NONE);
     79 	ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
     80 
     81 	dp->dp_scrub_min_txg = 0;
     82 	dp->dp_scrub_max_txg = tx->tx_txg;
     83 	dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max;
     84 
     85 	if (*funcp == SCRUB_FUNC_CLEAN) {
     86 		vdev_t *rvd = dp->dp_spa->spa_root_vdev;
     87 
     88 		/* rewrite all disk labels */
     89 		vdev_config_dirty(rvd);
     90 
     91 		if (vdev_resilver_needed(rvd,
     92 		    &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
     93 			spa_event_notify(dp->dp_spa, NULL,
     94 			    ESC_ZFS_RESILVER_START);
     95 			dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
     96 			    tx->tx_txg);
     97 		} else {
     98 			spa_event_notify(dp->dp_spa, NULL,
     99 			    ESC_ZFS_SCRUB_START);
    100 		}
    101 
    102 		/* zero out the scrub stats in all vdev_stat_t's */
    103 		vdev_scrub_stat_update(rvd,
    104 		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
    105 		    POOL_SCRUB_EVERYTHING, B_FALSE);
    106 
    107 		/*
    108 		 * If this is an incremental scrub, limit the DDT scrub phase
    109 		 * to just the auto-ditto class (for correctness); the rest
    110 		 * of the scrub should go faster using top-down pruning.
    111 		 */
    112 		if (dp->dp_scrub_min_txg > TXG_INITIAL)
    113 			dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO;
    114 
    115 		dp->dp_spa->spa_scrub_started = B_TRUE;
    116 	}
    117 
    118 	/* back to the generic stuff */
    119 
    120 	if (dp->dp_blkstats == NULL) {
    121 		dp->dp_blkstats =
    122 		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
    123 	}
    124 	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
    125 
    126 	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
    127 		ot = DMU_OT_ZAP_OTHER;
    128 
    129 	dp->dp_scrub_func = *funcp;
    130 	dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
    131 	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
    132 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
    133 	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
    134 	dp->dp_scrub_restart = B_FALSE;
    135 	dp->dp_spa->spa_scrub_errors = 0;
    136 
    137 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    138 	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
    139 	    &dp->dp_scrub_func, tx));
    140 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    141 	    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
    142 	    &dp->dp_scrub_queue_obj, tx));
    143 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    144 	    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
    145 	    &dp->dp_scrub_min_txg, tx));
    146 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    147 	    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
    148 	    &dp->dp_scrub_max_txg, tx));
    149 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    150 	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
    151 	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
    152 	    &dp->dp_scrub_bookmark, tx));
    153 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    154 	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
    155 	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
    156 	    &dp->dp_scrub_ddt_bookmark, tx));
    157 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    158 	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
    159 	    &dp->dp_scrub_ddt_class_max, tx));
    160 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    161 	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
    162 	    &dp->dp_spa->spa_scrub_errors, tx));
    163 
    164 	spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
    165 	    "func=%u mintxg=%llu maxtxg=%llu",
    166 	    *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
    167 }
    168 
    169 int
    170 dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
    171 {
    172 	return (dsl_sync_task_do(dp, NULL,
    173 	    dsl_pool_scrub_setup_sync, dp, &func, 0));
    174 }
    175 
    176 /* ARGSUSED */
    177 static void
    178 dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    179 {
    180 	dsl_pool_t *dp = arg1;
    181 	boolean_t *completep = arg2;
    182 
    183 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    184 		return;
    185 
    186 	mutex_enter(&dp->dp_scrub_cancel_lock);
    187 
    188 	if (dp->dp_scrub_restart) {
    189 		dp->dp_scrub_restart = B_FALSE;
    190 		*completep = B_FALSE;
    191 	}
    192 
    193 	/* XXX this is scrub-clean specific */
    194 	mutex_enter(&dp->dp_spa->spa_scrub_lock);
    195 	while (dp->dp_spa->spa_scrub_inflight > 0) {
    196 		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
    197 		    &dp->dp_spa->spa_scrub_lock);
    198 	}
    199 	mutex_exit(&dp->dp_spa->spa_scrub_lock);
    200 	dp->dp_spa->spa_scrub_started = B_FALSE;
    201 	dp->dp_spa->spa_scrub_active = B_FALSE;
    202 
    203 	dp->dp_scrub_func = SCRUB_FUNC_NONE;
    204 	VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
    205 	    dp->dp_scrub_queue_obj, tx));
    206 	dp->dp_scrub_queue_obj = 0;
    207 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
    208 	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
    209 
    210 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    211 	    DMU_POOL_SCRUB_QUEUE, tx));
    212 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    213 	    DMU_POOL_SCRUB_MIN_TXG, tx));
    214 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    215 	    DMU_POOL_SCRUB_MAX_TXG, tx));
    216 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    217 	    DMU_POOL_SCRUB_BOOKMARK, tx));
    218 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    219 	    DMU_POOL_SCRUB_FUNC, tx));
    220 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    221 	    DMU_POOL_SCRUB_ERRORS, tx));
    222 
    223 	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    224 	    DMU_POOL_SCRUB_DDT_BOOKMARK, tx);
    225 	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    226 	    DMU_POOL_SCRUB_DDT_CLASS_MAX, tx);
    227 
    228 	spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
    229 	    "complete=%u", *completep);
    230 
    231 	/* below is scrub-clean specific */
    232 	vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
    233 	    *completep);
    234 	/*
    235 	 * If the scrub/resilver completed, update all DTLs to reflect this.
    236 	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
    237 	 */
    238 	vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
    239 	    *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
    240 	if (*completep)
    241 		spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
    242 		    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
    243 	spa_errlog_rotate(dp->dp_spa);
    244 
    245 	/*
    246 	 * We may have finished replacing a device.
    247 	 * Let the async thread assess this and handle the detach.
    248 	 */
    249 	spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
    250 
    251 	dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
    252 	mutex_exit(&dp->dp_scrub_cancel_lock);
    253 }
    254 
    255 int
    256 dsl_pool_scrub_cancel(dsl_pool_t *dp)
    257 {
    258 	boolean_t complete = B_FALSE;
    259 
    260 	return (dsl_sync_task_do(dp, NULL,
    261 	    dsl_pool_scrub_cancel_sync, dp, &complete, 3));
    262 }
    263 
    264 void
    265 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
    266 {
    267 	/*
    268 	 * This function will be used by bp-rewrite wad to intercept frees.
    269 	 */
    270 	zio_free(dp->dp_spa, txg, bpp);
    271 }
    272 
    273 static boolean_t
    274 bookmark_is_zero(const zbookmark_t *zb)
    275 {
    276 	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
    277 	    zb->zb_level == 0 && zb->zb_blkid == 0);
    278 }
    279 
    280 /* dnp is the dnode for zb1->zb_object */
    281 static boolean_t
    282 bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
    283     const zbookmark_t *zb2)
    284 {
    285 	uint64_t zb1nextL0, zb2thisobj;
    286 
    287 	ASSERT(zb1->zb_objset == zb2->zb_objset);
    288 	ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT);
    289 	ASSERT(zb2->zb_level == 0);
    290 
    291 	/*
    292 	 * A bookmark in the deadlist is considered to be after
    293 	 * everything else.
    294 	 */
    295 	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
    296 		return (B_TRUE);
    297 
    298 	/* The objset_phys_t isn't before anything. */
    299 	if (dnp == NULL)
    300 		return (B_FALSE);
    301 
    302 	zb1nextL0 = (zb1->zb_blkid + 1) <<
    303 	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
    304 
    305 	zb2thisobj = zb2->zb_object ? zb2->zb_object :
    306 	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
    307 
    308 	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
    309 		uint64_t nextobj = zb1nextL0 *
    310 		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
    311 		return (nextobj <= zb2thisobj);
    312 	}
    313 
    314 	if (zb1->zb_object < zb2thisobj)
    315 		return (B_TRUE);
    316 	if (zb1->zb_object > zb2thisobj)
    317 		return (B_FALSE);
    318 	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
    319 		return (B_FALSE);
    320 	return (zb1nextL0 <= zb2->zb_blkid);
    321 }
    322 
    323 static boolean_t
    324 scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb)
    325 {
    326 	uint64_t elapsed_nanosecs;
    327 	int mintime;
    328 
    329 	if (dp->dp_scrub_pausing)
    330 		return (B_TRUE); /* we're already pausing */
    331 
    332 	if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
    333 		return (B_FALSE); /* we're resuming */
    334 
    335 	/* We only know how to resume from level-0 blocks. */
    336 	if (zb != NULL && zb->zb_level != 0)
    337 		return (B_FALSE);
    338 
    339 	mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time_ms :
    340 	    zfs_scrub_min_time_ms;
    341 	elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time;
    342 	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
    343 	    (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) {
    344 		if (zb) {
    345 			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
    346 			    (longlong_t)zb->zb_objset,
    347 			    (longlong_t)zb->zb_object,
    348 			    (longlong_t)zb->zb_level,
    349 			    (longlong_t)zb->zb_blkid);
    350 			dp->dp_scrub_bookmark = *zb;
    351 		}
    352 		if (ddb) {
    353 			dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
    354 			    (longlong_t)ddb->ddb_class,
    355 			    (longlong_t)ddb->ddb_type,
    356 			    (longlong_t)ddb->ddb_checksum,
    357 			    (longlong_t)ddb->ddb_cursor);
    358 			ASSERT(&dp->dp_scrub_ddt_bookmark == ddb);
    359 		}
    360 		dp->dp_scrub_pausing = B_TRUE;
    361 		return (B_TRUE);
    362 	}
    363 	return (B_FALSE);
    364 }
    365 
/*
 * Argument bundle that traverse_zil() hands to zil_parse(); the
 * traverse_zil_block() and traverse_zil_record() callbacks unpack it.
 */
typedef struct zil_traverse_arg {
	dsl_pool_t	*zta_dp;	/* pool being scrubbed */
	zil_header_t	*zta_zh;	/* header of the ZIL being walked */
} zil_traverse_arg_t;
    370 
    371 /* ARGSUSED */
    372 static int
    373 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
    374 {
    375 	zil_traverse_arg_t *zta = arg;
    376 	dsl_pool_t *dp = zta->zta_dp;
    377 	zil_header_t *zh = zta->zta_zh;
    378 	zbookmark_t zb;
    379 
    380 	if (bp->blk_birth <= dp->dp_scrub_min_txg)
    381 		return (0);
    382 
    383 	/*
    384 	 * One block ("stubby") can be allocated a long time ago; we
    385 	 * want to visit that one because it has been allocated
    386 	 * (on-disk) even if it hasn't been claimed (even though for
    387 	 * plain scrub there's nothing to do to it).
    388 	 */
    389 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
    390 		return (0);
    391 
    392 	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
    393 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
    394 
    395 	VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
    396 	return (0);
    397 }
    398 
    399 /* ARGSUSED */
    400 static int
    401 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
    402 {
    403 	if (lrc->lrc_txtype == TX_WRITE) {
    404 		zil_traverse_arg_t *zta = arg;
    405 		dsl_pool_t *dp = zta->zta_dp;
    406 		zil_header_t *zh = zta->zta_zh;
    407 		lr_write_t *lr = (lr_write_t *)lrc;
    408 		blkptr_t *bp = &lr->lr_blkptr;
    409 		zbookmark_t zb;
    410 
    411 		if (bp->blk_birth <= dp->dp_scrub_min_txg)
    412 			return (0);
    413 
    414 		/*
    415 		 * birth can be < claim_txg if this record's txg is
    416 		 * already txg sync'ed (but this log block contains
    417 		 * other records that are not synced)
    418 		 */
    419 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
    420 			return (0);
    421 
    422 		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
    423 		    lr->lr_foid, ZB_ZIL_LEVEL,
    424 		    lr->lr_offset / BP_GET_LSIZE(bp));
    425 
    426 		VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
    427 	}
    428 	return (0);
    429 }
    430 
    431 static void
    432 traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
    433 {
    434 	uint64_t claim_txg = zh->zh_claim_txg;
    435 	zil_traverse_arg_t zta = { dp, zh };
    436 	zilog_t *zilog;
    437 
    438 	/*
    439 	 * We only want to visit blocks that have been claimed but not yet
    440 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
    441 	 */
    442 	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
    443 		return;
    444 
    445 	zilog = zil_alloc(dp->dp_meta_objset, zh);
    446 
    447 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
    448 	    claim_txg);
    449 
    450 	zil_free(zilog);
    451 }
    452 
    453 static void
    454 scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset,
    455     uint64_t object, uint64_t blkid)
    456 {
    457 	zbookmark_t czb;
    458 	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
    459 
    460 	if (zfs_no_scrub_prefetch)
    461 		return;
    462 
    463 	if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg ||
    464 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
    465 		return;
    466 
    467 	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
    468 
    469 	(void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp,
    470 	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
    471 	    &flags, &czb);
    472 }
    473 
    474 static void
    475 scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
    476     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
    477 {
    478 	int err;
    479 	arc_buf_t *buf = NULL;
    480 
    481 	if (bp->blk_birth <= dp->dp_scrub_min_txg)
    482 		return;
    483 
    484 	if (scrub_pause(dp, zb, NULL))
    485 		return;
    486 
    487 	if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
    488 		/*
    489 		 * If we already visited this bp & everything below (in
    490 		 * a prior txg), don't bother doing it again.
    491 		 */
    492 		if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
    493 			return;
    494 
    495 		/*
    496 		 * If we found the block we're trying to resume from, or
    497 		 * we went past it to a different object, zero it out to
    498 		 * indicate that it's OK to start checking for pausing
    499 		 * again.
    500 		 */
    501 		if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
    502 		    zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
    503 			dprintf("resuming at %llx/%llx/%llx/%llx\n",
    504 			    (longlong_t)zb->zb_objset,
    505 			    (longlong_t)zb->zb_object,
    506 			    (longlong_t)zb->zb_level,
    507 			    (longlong_t)zb->zb_blkid);
    508 			bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
    509 		}
    510 	}
    511 
    512 	/*
    513 	 * If dsl_pool_scrub_ddt() has aready scrubbed this block,
    514 	 * don't scrub it again.
    515 	 */
    516 	if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp))
    517 		(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
    518 
    519 	if (BP_GET_LEVEL(bp) > 0) {
    520 		uint32_t flags = ARC_WAIT;
    521 		int i;
    522 		blkptr_t *cbp;
    523 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
    524 
    525 		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
    526 		    arc_getbuf_func, &buf,
    527 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    528 		if (err) {
    529 			mutex_enter(&dp->dp_spa->spa_scrub_lock);
    530 			dp->dp_spa->spa_scrub_errors++;
    531 			mutex_exit(&dp->dp_spa->spa_scrub_lock);
    532 			return;
    533 		}
    534 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
    535 			scrub_prefetch(dp, buf, cbp, zb->zb_objset,
    536 			    zb->zb_object, zb->zb_blkid * epb + i);
    537 		}
    538 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
    539 			zbookmark_t czb;
    540 
    541 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
    542 			    zb->zb_level - 1,
    543 			    zb->zb_blkid * epb + i);
    544 			scrub_visitbp(dp, dnp, buf, cbp, &czb);
    545 		}
    546 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
    547 		uint32_t flags = ARC_WAIT;
    548 		dnode_phys_t *cdnp;
    549 		int i, j;
    550 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
    551 
    552 		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
    553 		    arc_getbuf_func, &buf,
    554 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    555 		if (err) {
    556 			mutex_enter(&dp->dp_spa->spa_scrub_lock);
    557 			dp->dp_spa->spa_scrub_errors++;
    558 			mutex_exit(&dp->dp_spa->spa_scrub_lock);
    559 			return;
    560 		}
    561 		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
    562 			for (j = 0; j < cdnp->dn_nblkptr; j++) {
    563 				blkptr_t *cbp = &cdnp->dn_blkptr[j];
    564 				scrub_prefetch(dp, buf, cbp, zb->zb_objset,
    565 				    zb->zb_blkid * epb + i, j);
    566 			}
    567 		}
    568 		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
    569 			scrub_visitdnode(dp, cdnp, buf, zb->zb_objset,
    570 			    zb->zb_blkid * epb + i);
    571 		}
    572 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
    573 		uint32_t flags = ARC_WAIT;
    574 		objset_phys_t *osp;
    575 
    576 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
    577 		    arc_getbuf_func, &buf,
    578 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    579 		if (err) {
    580 			mutex_enter(&dp->dp_spa->spa_scrub_lock);
    581 			dp->dp_spa->spa_scrub_errors++;
    582 			mutex_exit(&dp->dp_spa->spa_scrub_lock);
    583 			return;
    584 		}
    585 
    586 		osp = buf->b_data;
    587 
    588 		traverse_zil(dp, &osp->os_zil_header);
    589 
    590 		scrub_visitdnode(dp, &osp->os_meta_dnode,
    591 		    buf, zb->zb_objset, DMU_META_DNODE_OBJECT);
    592 		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
    593 			scrub_visitdnode(dp, &osp->os_userused_dnode,
    594 			    buf, zb->zb_objset, DMU_USERUSED_OBJECT);
    595 			scrub_visitdnode(dp, &osp->os_groupused_dnode,
    596 			    buf, zb->zb_objset, DMU_GROUPUSED_OBJECT);
    597 		}
    598 	}
    599 
    600 	if (buf)
    601 		(void) arc_buf_remove_ref(buf, &buf);
    602 }
    603 
    604 static void
    605 scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
    606     uint64_t objset, uint64_t object)
    607 {
    608 	int j;
    609 
    610 	for (j = 0; j < dnp->dn_nblkptr; j++) {
    611 		zbookmark_t czb;
    612 
    613 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
    614 		scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
    615 	}
    616 }
    617 
    618 static void
    619 scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
    620 {
    621 	zbookmark_t zb;
    622 
    623 	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
    624 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
    625 	scrub_visitbp(dp, NULL, NULL, bp, &zb);
    626 }
    627 
    628 void
    629 dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
    630 {
    631 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
    632 
    633 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    634 		return;
    635 
    636 	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
    637 		SET_BOOKMARK(&dp->dp_scrub_bookmark,
    638 		    ZB_DESTROYED_OBJSET, 0, 0, 0);
    639 	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    640 	    ds->ds_object, tx) != 0) {
    641 		return;
    642 	}
    643 
    644 	if (ds->ds_phys->ds_next_snap_obj != 0) {
    645 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    646 		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
    647 	}
    648 	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
    649 }
    650 
    651 void
    652 dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
    653 {
    654 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
    655 
    656 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    657 		return;
    658 
    659 	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
    660 
    661 	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
    662 		dp->dp_scrub_bookmark.zb_objset =
    663 		    ds->ds_phys->ds_prev_snap_obj;
    664 	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    665 	    ds->ds_object, tx) == 0) {
    666 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    667 		    ds->ds_phys->ds_prev_snap_obj, tx) == 0);
    668 	}
    669 }
    670 
    671 void
    672 dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
    673 {
    674 	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
    675 
    676 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    677 		return;
    678 
    679 	if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
    680 		dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
    681 	} else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
    682 		dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
    683 	}
    684 
    685 	if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    686 	    ds1->ds_object, tx) == 0) {
    687 		int err = zap_add_int(dp->dp_meta_objset,
    688 		    dp->dp_scrub_queue_obj, ds2->ds_object, tx);
    689 		VERIFY(err == 0 || err == EEXIST);
    690 		if (err == EEXIST) {
    691 			/* Both were there to begin with */
    692 			VERIFY(0 == zap_add_int(dp->dp_meta_objset,
    693 			    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
    694 		}
    695 	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    696 	    ds2->ds_object, tx) == 0) {
    697 		VERIFY(0 == zap_add_int(dp->dp_meta_objset,
    698 		    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
    699 	}
    700 }
    701 
/*
 * Argument passed to enqueue_clones_cb() through dmu_objset_find_spa().
 */
struct enqueue_clones_arg {
	dmu_tx_t *tx;		/* transaction used for the queue updates */
	uint64_t originobj;	/* object number of the origin dataset */
};
    706 
    707 /* ARGSUSED */
    708 static int
    709 enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
    710 {
    711 	struct enqueue_clones_arg *eca = arg;
    712 	dsl_dataset_t *ds;
    713 	int err;
    714 	dsl_pool_t *dp;
    715 
    716 	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
    717 	if (err)
    718 		return (err);
    719 	dp = ds->ds_dir->dd_pool;
    720 
    721 	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
    722 		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
    723 			dsl_dataset_t *prev;
    724 			err = dsl_dataset_hold_obj(dp,
    725 			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
    726 
    727 			dsl_dataset_rele(ds, FTAG);
    728 			if (err)
    729 				return (err);
    730 			ds = prev;
    731 		}
    732 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    733 		    ds->ds_object, eca->tx) == 0);
    734 	}
    735 	dsl_dataset_rele(ds, FTAG);
    736 	return (0);
    737 }
    738 
    739 static void
    740 scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
    741 {
    742 	dsl_dataset_t *ds;
    743 	uint64_t min_txg_save;
    744 
    745 	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
    746 
    747 	/*
    748 	 * Iterate over the bps in this ds.
    749 	 */
    750 	min_txg_save = dp->dp_scrub_min_txg;
    751 	dp->dp_scrub_min_txg =
    752 	    MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
    753 	scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
    754 	dp->dp_scrub_min_txg = min_txg_save;
    755 
    756 	if (dp->dp_scrub_pausing)
    757 		goto out;
    758 
    759 	/*
    760 	 * Add descendent datasets to work queue.
    761 	 */
    762 	if (ds->ds_phys->ds_next_snap_obj != 0) {
    763 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    764 		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
    765 	}
    766 	if (ds->ds_phys->ds_num_children > 1) {
    767 		boolean_t usenext = B_FALSE;
    768 		if (ds->ds_phys->ds_next_clones_obj != 0) {
    769 			uint64_t count;
    770 			/*
    771 			 * A bug in a previous version of the code could
    772 			 * cause upgrade_clones_cb() to not set
    773 			 * ds_next_snap_obj when it should, leading to a
    774 			 * missing entry.  Therefore we can only use the
    775 			 * next_clones_obj when its count is correct.
    776 			 */
    777 			int err = zap_count(dp->dp_meta_objset,
    778 			    ds->ds_phys->ds_next_clones_obj, &count);
    779 			if (err == 0 &&
    780 			    count == ds->ds_phys->ds_num_children - 1)
    781 				usenext = B_TRUE;
    782 		}
    783 
    784 		if (usenext) {
    785 			VERIFY(zap_join(dp->dp_meta_objset,
    786 			    ds->ds_phys->ds_next_clones_obj,
    787 			    dp->dp_scrub_queue_obj, tx) == 0);
    788 		} else {
    789 			struct enqueue_clones_arg eca;
    790 			eca.tx = tx;
    791 			eca.originobj = ds->ds_object;
    792 
    793 			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
    794 			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
    795 		}
    796 	}
    797 
    798 out:
    799 	dsl_dataset_rele(ds, FTAG);
    800 }
    801 
    802 /* ARGSUSED */
    803 static int
    804 enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
    805 {
    806 	dmu_tx_t *tx = arg;
    807 	dsl_dataset_t *ds;
    808 	int err;
    809 	dsl_pool_t *dp;
    810 
    811 	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
    812 	if (err)
    813 		return (err);
    814 
    815 	dp = ds->ds_dir->dd_pool;
    816 
    817 	while (ds->ds_phys->ds_prev_snap_obj != 0) {
    818 		dsl_dataset_t *prev;
    819 		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
    820 		    FTAG, &prev);
    821 		if (err) {
    822 			dsl_dataset_rele(ds, FTAG);
    823 			return (err);
    824 		}
    825 
    826 		/*
    827 		 * If this is a clone, we don't need to worry about it for now.
    828 		 */
    829 		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
    830 			dsl_dataset_rele(ds, FTAG);
    831 			dsl_dataset_rele(prev, FTAG);
    832 			return (0);
    833 		}
    834 		dsl_dataset_rele(ds, FTAG);
    835 		ds = prev;
    836 	}
    837 
    838 	VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    839 	    ds->ds_object, tx) == 0);
    840 	dsl_dataset_rele(ds, FTAG);
    841 	return (0);
    842 }
    843 
    844 /*
    845  * Scrub/dedup interaction.
    846  *
    847  * If there are N references to a deduped block, we don't want to scrub it
    848  * N times -- ideally, we should scrub it exactly once.
    849  *
    850  * To prevent excess scrubbing, the scrub begins by walking the DDT
    851  * to find all blocks with refcnt > 1, and scrubs each of these once.
    852  * Then the top-down scrub begins, only visiting blocks with refcnt == 1.
    853  *
    854  * There would be nothing more to say if a block's refcnt couldn't change
    855  * during a scrub, but of course it can.  There are two cases to consider.
    856  *
    857  * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
    858  * when visited during the top-down scrub phase, it will be scrubbed twice.
    859  * This negates our scrub optimization, but is otherwise harmless.
    860  *
    861  * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
    862  * on each visit during the top-down scrub phase, it will never be scrubbed.
    863  * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
    864  * reference count changes; if it transitions from refcnt == 1 to refcnt > 1
    865  * while a scrub is in progress, it scrubs the block right then.
    866  *
    867  * The code does not actually use the refcnt directly, but rather uses the
    868  * dde's replication class (enum ddt_class), which serves the same purpose.
    869  */
    870 static void
    871 dsl_pool_scrub_ddt(dsl_pool_t *dp)
    872 {
    873 	ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark;
    874 	ddt_entry_t dde;
    875 	int error;
    876 
    877 	while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) {
    878 		if (ddb->ddb_class > dp->dp_scrub_ddt_class_max)
    879 			return;
    880 		dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde);
    881 		if (scrub_pause(dp, NULL, ddb))
    882 			return;
    883 	}
    884 	ASSERT(error == ENOENT);
    885 	ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max);
    886 }
    887 
    888 void
    889 dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
    890     const ddt_entry_t *dde)
    891 {
    892 	const ddt_key_t *ddk = &dde->dde_key;
    893 	const ddt_phys_t *ddp = dde->dde_phys;
    894 	blkptr_t blk;
    895 	zbookmark_t zb = { 0 };
    896 
    897 	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
    898 		if (ddp->ddp_phys_birth == 0)
    899 			continue;
    900 		ddt_bp_create(checksum, ddk, ddp, &blk);
    901 		scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
    902 	}
    903 }
    904 
    905 void
    906 dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
    907 {
    908 	spa_t *spa = dp->dp_spa;
    909 	zap_cursor_t zc;
    910 	zap_attribute_t za;
    911 	boolean_t complete = B_TRUE;
    912 
    913 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    914 		return;
    915 
    916 	/*
    917 	 * If the pool is not loaded, or is trying to unload, leave it alone.
    918 	 */
    919 	if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa))
    920 		return;
    921 
    922 	if (dp->dp_scrub_restart) {
    923 		enum scrub_func func = dp->dp_scrub_func;
    924 		dp->dp_scrub_restart = B_FALSE;
    925 		dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
    926 	}
    927 
    928 	if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
    929 		/*
    930 		 * We must have resumed after rebooting; reset the vdev
    931 		 * stats to know that we're doing a scrub (although it
    932 		 * will think we're just starting now).
    933 		 */
    934 		vdev_scrub_stat_update(spa->spa_root_vdev,
    935 		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
    936 		    POOL_SCRUB_EVERYTHING, B_FALSE);
    937 	}
    938 
    939 	dp->dp_scrub_pausing = B_FALSE;
    940 	dp->dp_scrub_start_time = gethrtime();
    941 	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
    942 	spa->spa_scrub_active = B_TRUE;
    943 
    944 	if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) {
    945 		dsl_pool_scrub_ddt(dp);
    946 		if (dp->dp_scrub_pausing)
    947 			goto out;
    948 	}
    949 
    950 	if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
    951 		/* First do the MOS & ORIGIN */
    952 		scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
    953 		if (dp->dp_scrub_pausing)
    954 			goto out;
    955 
    956 		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
    957 			VERIFY(0 == dmu_objset_find_spa(spa,
    958 			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
    959 		} else {
    960 			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
    961 		}
    962 		ASSERT(!dp->dp_scrub_pausing);
    963 	} else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) {
    964 		/*
    965 		 * If we were paused, continue from here.  Note if the ds
    966 		 * we were paused on was destroyed, the zb_objset will be
    967 		 * ZB_DESTROYED_OBJSET, so we will skip this and find a new
    968 		 * objset below.
    969 		 */
    970 		scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
    971 		if (dp->dp_scrub_pausing)
    972 			goto out;
    973 	}
    974 
    975 	/*
    976 	 * In case we were paused right at the end of the ds, zero the
    977 	 * bookmark so we don't think that we're still trying to resume.
    978 	 */
    979 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
    980 
    981 	/* keep pulling things out of the zap-object-as-queue */
    982 	while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
    983 	    zap_cursor_retrieve(&zc, &za) == 0) {
    984 		VERIFY(0 == zap_remove(dp->dp_meta_objset,
    985 		    dp->dp_scrub_queue_obj, za.za_name, tx));
    986 		scrub_visitds(dp, za.za_first_integer, tx);
    987 		if (dp->dp_scrub_pausing)
    988 			break;
    989 		zap_cursor_fini(&zc);
    990 	}
    991 	zap_cursor_fini(&zc);
    992 	if (dp->dp_scrub_pausing)
    993 		goto out;
    994 
    995 	/* done. */
    996 
    997 	dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
    998 	return;
    999 out:
   1000 	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
   1001 	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
   1002 	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
   1003 	    &dp->dp_scrub_bookmark, tx));
   1004 	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
   1005 	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
   1006 	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
   1007 	    &dp->dp_scrub_ddt_bookmark, tx));
   1008 	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
   1009 	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
   1010 	    &dp->dp_scrub_ddt_class_max, tx));
   1011 	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
   1012 	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
   1013 	    &spa->spa_scrub_errors, tx));
   1014 
   1015 	/* XXX this is scrub-clean specific */
   1016 	mutex_enter(&spa->spa_scrub_lock);
   1017 	while (spa->spa_scrub_inflight > 0)
   1018 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
   1019 	mutex_exit(&spa->spa_scrub_lock);
   1020 }
   1021 
   1022 void
   1023 dsl_pool_scrub_restart(dsl_pool_t *dp)
   1024 {
   1025 	mutex_enter(&dp->dp_scrub_cancel_lock);
   1026 	dp->dp_scrub_restart = B_TRUE;
   1027 	mutex_exit(&dp->dp_scrub_cancel_lock);
   1028 }
   1029 
   1030 /*
   1031  * scrub consumers
   1032  */
   1033 
   1034 static void
   1035 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
   1036 {
   1037 	int i;
   1038 
   1039 	/*
   1040 	 * If we resume after a reboot, zab will be NULL; don't record
   1041 	 * incomplete stats in that case.
   1042 	 */
   1043 	if (zab == NULL)
   1044 		return;
   1045 
   1046 	for (i = 0; i < 4; i++) {
   1047 		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
   1048 		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
   1049 		zfs_blkstat_t *zb = &zab->zab_type[l][t];
   1050 		int equal;
   1051 
   1052 		zb->zb_count++;
   1053 		zb->zb_asize += BP_GET_ASIZE(bp);
   1054 		zb->zb_lsize += BP_GET_LSIZE(bp);
   1055 		zb->zb_psize += BP_GET_PSIZE(bp);
   1056 		zb->zb_gangs += BP_COUNT_GANG(bp);
   1057 
   1058 		switch (BP_GET_NDVAS(bp)) {
   1059 		case 2:
   1060 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
   1061 			    DVA_GET_VDEV(&bp->blk_dva[1]))
   1062 				zb->zb_ditto_2_of_2_samevdev++;
   1063 			break;
   1064 		case 3:
   1065 			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
   1066 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
   1067 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
   1068 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
   1069 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
   1070 			    DVA_GET_VDEV(&bp->blk_dva[2]));
   1071 			if (equal == 1)
   1072 				zb->zb_ditto_2_of_3_samevdev++;
   1073 			else if (equal == 3)
   1074 				zb->zb_ditto_3_of_3_samevdev++;
   1075 			break;
   1076 		}
   1077 	}
   1078 }
   1079 
   1080 static void
   1081 dsl_pool_scrub_clean_done(zio_t *zio)
   1082 {
   1083 	spa_t *spa = zio->io_spa;
   1084 
   1085 	zio_data_buf_free(zio->io_data, zio->io_size);
   1086 
   1087 	mutex_enter(&spa->spa_scrub_lock);
   1088 	spa->spa_scrub_inflight--;
   1089 	cv_broadcast(&spa->spa_scrub_io_cv);
   1090 
   1091 	if (zio->io_error && (zio->io_error != ECKSUM ||
   1092 	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)))
   1093 		spa->spa_scrub_errors++;
   1094 	mutex_exit(&spa->spa_scrub_lock);
   1095 }
   1096 
   1097 static int
   1098 dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
   1099     const blkptr_t *bp, const zbookmark_t *zb)
   1100 {
   1101 	size_t size = BP_GET_PSIZE(bp);
   1102 	spa_t *spa = dp->dp_spa;
   1103 	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
   1104 	boolean_t needs_io;
   1105 	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
   1106 	int zio_priority;
   1107 
   1108 	if (phys_birth <= dp->dp_scrub_min_txg ||
   1109 	    phys_birth >= dp->dp_scrub_max_txg)
   1110 		return (0);
   1111 
   1112 	count_block(dp->dp_blkstats, bp);
   1113 
   1114 	if (dp->dp_scrub_isresilver == 0) {
   1115 		/* It's a scrub */
   1116 		zio_flags |= ZIO_FLAG_SCRUB;
   1117 		zio_priority = ZIO_PRIORITY_SCRUB;
   1118 		needs_io = B_TRUE;
   1119 	} else {
   1120 		/* It's a resilver */
   1121 		zio_flags |= ZIO_FLAG_RESILVER;
   1122 		zio_priority = ZIO_PRIORITY_RESILVER;
   1123 		needs_io = B_FALSE;
   1124 	}
   1125 
   1126 	/* If it's an intent log block, failure is expected. */
   1127 	if (zb->zb_level == ZB_ZIL_LEVEL)
   1128 		zio_flags |= ZIO_FLAG_SPECULATIVE;
   1129 
   1130 	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
   1131 		vdev_t *vd = vdev_lookup_top(spa,
   1132 		    DVA_GET_VDEV(&bp->blk_dva[d]));
   1133 
   1134 		/*
   1135 		 * Keep track of how much data we've examined so that
   1136 		 * zpool(1M) status can make useful progress reports.
   1137 		 */
   1138 		mutex_enter(&vd->vdev_stat_lock);
   1139 		vd->vdev_stat.vs_scrub_examined +=
   1140 		    DVA_GET_ASIZE(&bp->blk_dva[d]);
   1141 		mutex_exit(&vd->vdev_stat_lock);
   1142 
   1143 		/* if it's a resilver, this may not be in the target range */
   1144 		if (!needs_io) {
   1145 			if (DVA_GET_GANG(&bp->blk_dva[d])) {
   1146 				/*
   1147 				 * Gang members may be spread across multiple
   1148 				 * vdevs, so the best estimate we have is the
   1149 				 * scrub range, which has already been checked.
   1150 				 * XXX -- it would be better to change our
   1151 				 * allocation policy to ensure that all
   1152 				 * gang members reside on the same vdev.
   1153 				 */
   1154 				needs_io = B_TRUE;
   1155 			} else {
   1156 				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
   1157 				    phys_birth, 1);
   1158 			}
   1159 		}
   1160 	}
   1161 
   1162 	if (needs_io && !zfs_no_scrub_io) {
   1163 		void *data = zio_data_buf_alloc(size);
   1164 
   1165 		mutex_enter(&spa->spa_scrub_lock);
   1166 		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
   1167 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
   1168 		spa->spa_scrub_inflight++;
   1169 		mutex_exit(&spa->spa_scrub_lock);
   1170 
   1171 		zio_nowait(zio_read(NULL, spa, bp, data, size,
   1172 		    dsl_pool_scrub_clean_done, NULL, zio_priority,
   1173 		    zio_flags, zb));
   1174 	}
   1175 
   1176 	/* do not relocate this block */
   1177 	return (0);
   1178 }
   1179 
   1180 int
   1181 dsl_pool_scrub_clean(dsl_pool_t *dp)
   1182 {
   1183 	spa_t *spa = dp->dp_spa;
   1184 
   1185 	/*
   1186 	 * Purge all vdev caches and probe all devices.  We do this here
   1187 	 * rather than in sync context because this requires a writer lock
   1188 	 * on the spa_config lock, which we can't do from sync context.  The
   1189 	 * spa_scrub_reopen flag indicates that vdev_open() should not
   1190 	 * attempt to start another scrub.
   1191 	 */
   1192 	spa_vdev_state_enter(spa, SCL_NONE);
   1193 	spa->spa_scrub_reopen = B_TRUE;
   1194 	vdev_reopen(spa->spa_root_vdev);
   1195 	spa->spa_scrub_reopen = B_FALSE;
   1196 	(void) spa_vdev_state_exit(spa, NULL, 0);
   1197 
   1198 	return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
   1199 }
   1200