Home | History | Annotate | Download | only in zfs
      1   7046   ahrens /*
      2   7046   ahrens  * CDDL HEADER START
      3   7046   ahrens  *
      4   7046   ahrens  * The contents of this file are subject to the terms of the
      5   7046   ahrens  * Common Development and Distribution License (the "License").
      6   7046   ahrens  * You may not use this file except in compliance with the License.
      7   7046   ahrens  *
      8   7046   ahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9   7046   ahrens  * or http://www.opensolaris.org/os/licensing.
     10   7046   ahrens  * See the License for the specific language governing permissions
     11   7046   ahrens  * and limitations under the License.
     12   7046   ahrens  *
     13   7046   ahrens  * When distributing Covered Code, include this CDDL HEADER in each
     14   7046   ahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15   7046   ahrens  * If applicable, add the following below this CDDL HEADER, with the
     16   7046   ahrens  * fields enclosed by brackets "[]" replaced with your own identifying
     17   7046   ahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
     18   7046   ahrens  *
     19   7046   ahrens  * CDDL HEADER END
     20   7046   ahrens  */
     21   7046   ahrens /*
     22   8525     Eric  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23   7046   ahrens  * Use is subject to license terms.
     24   7046   ahrens  */
     25   7046   ahrens 
     26   7046   ahrens #include <sys/dsl_pool.h>
     27   7046   ahrens #include <sys/dsl_dataset.h>
     28   7046   ahrens #include <sys/dsl_prop.h>
     29   7046   ahrens #include <sys/dsl_dir.h>
     30   7046   ahrens #include <sys/dsl_synctask.h>
     31   7046   ahrens #include <sys/dnode.h>
     32   7046   ahrens #include <sys/dmu_tx.h>
     33   7046   ahrens #include <sys/dmu_objset.h>
     34   7046   ahrens #include <sys/arc.h>
     35   7046   ahrens #include <sys/zap.h>
     36   7046   ahrens #include <sys/zio.h>
     37   7046   ahrens #include <sys/zfs_context.h>
     38   7046   ahrens #include <sys/fs/zfs.h>
     39   7046   ahrens #include <sys/zfs_znode.h>
     40   7046   ahrens #include <sys/spa_impl.h>
     41   7046   ahrens #include <sys/vdev_impl.h>
     42   7160   ahrens #include <sys/zil_impl.h>
     43  10922     Jeff #include <sys/zio_checksum.h>
     44  10922     Jeff #include <sys/ddt.h>
     45   7046   ahrens 
     46   7046   ahrens typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
                        /*
                         * scrub_cb_t, above, is the signature of a per-block scrub callback.
                         * Callers in this file pass the pool, the block pointer being visited
                         * and the bookmark that identifies that block.
                         */
     47   7046   ahrens 
     48   7046   ahrens static scrub_cb_t dsl_pool_scrub_clean_cb;
     49   7046   ahrens static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
     50   9396  Matthew static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
     51   9396  Matthew     uint64_t objset, uint64_t object);
     52   7046   ahrens 
                        /* Tunables. */
     53  11182      Lin int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
     54  11182      Lin int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
     55   7046   ahrens boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
     56  11147   George boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
                        /* Copied into dp_scrub_ddt_class_max each time a scrub is set up. */
     57  11125     Jeff enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
     58   7046   ahrens 
                        /* Defined elsewhere; scrub_pause() compares elapsed seconds against it. */
     59   7046   ahrens extern int zfs_txg_timeout;
     60   7046   ahrens 
                        /*
                         * Table of per-block scrub callbacks, indexed by dp->dp_scrub_func.
                         * Slot 0 is NULL and slot 1 is the scrub-clean callback; presumably
                         * these line up with SCRUB_FUNC_NONE and SCRUB_FUNC_CLEAN -- confirm
                         * against the enum scrub_func definition.
                         */
     61   7046   ahrens static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
     62   7046   ahrens 	NULL,
     63   7046   ahrens 	dsl_pool_scrub_clean_cb
     64   7046   ahrens };
     65   7046   ahrens 
     66   7046   ahrens /* ARGSUSED */
     67   7046   ahrens static void
     68   7046   ahrens dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
     69   7046   ahrens {
                        	/*
                        	 * Sync-task callback that starts a new scrub or resilver.
                        	 *
                        	 * arg1 is the dsl_pool_t; arg2 points at the requested
                        	 * enum scrub_func.  Any scrub already in progress is cancelled
                        	 * first (reported as not complete).  The in-core scrub state is
                        	 * then initialized, a queue ZAP object is created, each piece of
                        	 * state is added to the pool directory object, and the start is
                        	 * recorded in the pool history.
                        	 */
     70   7046   ahrens 	dsl_pool_t *dp = arg1;
     71   7046   ahrens 	enum scrub_func *funcp = arg2;
     72   7046   ahrens 	dmu_object_type_t ot = 0;
     73   7046   ahrens 	boolean_t complete = B_FALSE;
     74   7046   ahrens 
     75   7046   ahrens 	dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
     76   7046   ahrens 
     77   7046   ahrens 	ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
     78   7046   ahrens 	ASSERT(*funcp > SCRUB_FUNC_NONE);
     79   7046   ahrens 	ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
     80   7046   ahrens 
                        	/*
                        	 * Default range is everything born up to this txg.  In the
                        	 * scrub-clean case vdev_resilver_needed() is handed both bounds
                        	 * and presumably narrows them when a resilver is required.
                        	 */
     81   7046   ahrens 	dp->dp_scrub_min_txg = 0;
     82   7046   ahrens 	dp->dp_scrub_max_txg = tx->tx_txg;
     83  11125     Jeff 	dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max;
     84   7046   ahrens 
     85   7046   ahrens 	if (*funcp == SCRUB_FUNC_CLEAN) {
     86   7046   ahrens 		vdev_t *rvd = dp->dp_spa->spa_root_vdev;
     87   7046   ahrens 
     88   7046   ahrens 		/* rewrite all disk labels */
     89   7046   ahrens 		vdev_config_dirty(rvd);
     90   7046   ahrens 
     91   7046   ahrens 		if (vdev_resilver_needed(rvd,
     92   7046   ahrens 		    &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
     93   7046   ahrens 			spa_event_notify(dp->dp_spa, NULL,
     94   7046   ahrens 			    ESC_ZFS_RESILVER_START);
                        			/* never scan past the txg we are syncing in */
     95   7046   ahrens 			dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
     96   7046   ahrens 			    tx->tx_txg);
     97   8525     Eric 		} else {
     98   8525     Eric 			spa_event_notify(dp->dp_spa, NULL,
     99   8525     Eric 			    ESC_ZFS_SCRUB_START);
    100   7046   ahrens 		}
    101   7046   ahrens 
    102   7046   ahrens 		/* zero out the scrub stats in all vdev_stat_t's */
    103   7046   ahrens 		vdev_scrub_stat_update(rvd,
    104   7046   ahrens 		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
    105   7046   ahrens 		    POOL_SCRUB_EVERYTHING, B_FALSE);
    106   7046   ahrens 
    107  11125     Jeff 		/*
    108  11125     Jeff 		 * If this is an incremental scrub, limit the DDT scrub phase
    109  11125     Jeff 		 * to just the auto-ditto class (for correctness); the rest
    110  11125     Jeff 		 * of the scrub should go faster using top-down pruning.
    111  11125     Jeff 		 */
    112  11125     Jeff 		if (dp->dp_scrub_min_txg > TXG_INITIAL)
    113  11125     Jeff 			dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO;
    114  11125     Jeff 
    115   7046   ahrens 		dp->dp_spa->spa_scrub_started = B_TRUE;
    116   7046   ahrens 	}
    117   7046   ahrens 
    118   7046   ahrens 	/* back to the generic stuff */
    119   7837  Matthew 
                        	/* The block-stats buffer is allocated once and re-zeroed per scrub. */
    120   7837  Matthew 	if (dp->dp_blkstats == NULL) {
    121   7837  Matthew 		dp->dp_blkstats =
    122   7837  Matthew 		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
    123   7837  Matthew 	}
    124   7837  Matthew 	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
    125   7046   ahrens 
                        	/*
                        	 * Pools older than SPA_VERSION_DSL_SCRUB get the queue created
                        	 * as DMU_OT_ZAP_OTHER rather than DMU_OT_SCRUB_QUEUE.
                        	 */
    126   7046   ahrens 	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
    127   7046   ahrens 		ot = DMU_OT_ZAP_OTHER;
    128   7046   ahrens 
    129   7046   ahrens 	dp->dp_scrub_func = *funcp;
    130   7046   ahrens 	dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
    131   7046   ahrens 	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
    132   7046   ahrens 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
    133  11125     Jeff 	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
    134   7046   ahrens 	dp->dp_scrub_restart = B_FALSE;
    135   7160   ahrens 	dp->dp_spa->spa_scrub_errors = 0;
    136   7046   ahrens 
                        	/*
                        	 * Add every piece of scrub state to the pool directory object.
                        	 * Both bookmarks are stored as arrays of uint64_t.
                        	 */
    137   7046   ahrens 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    138   7046   ahrens 	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
    139   7046   ahrens 	    &dp->dp_scrub_func, tx));
    140   7046   ahrens 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    141   7046   ahrens 	    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
    142   7046   ahrens 	    &dp->dp_scrub_queue_obj, tx));
    143   7046   ahrens 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    144   7046   ahrens 	    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
    145   7046   ahrens 	    &dp->dp_scrub_min_txg, tx));
    146   7046   ahrens 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    147   7046   ahrens 	    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
    148   7046   ahrens 	    &dp->dp_scrub_max_txg, tx));
    149   7046   ahrens 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    150  11125     Jeff 	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
    151  11125     Jeff 	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
    152   7046   ahrens 	    &dp->dp_scrub_bookmark, tx));
    153  11125     Jeff 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    154  11125     Jeff 	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
    155  11125     Jeff 	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
    156  11125     Jeff 	    &dp->dp_scrub_ddt_bookmark, tx));
                        	/*
                        	 * NOTE(review): this entry is written as one uint64_t; confirm
                        	 * that dp_scrub_ddt_class_max is declared 64 bits wide.
                        	 */
    157  11125     Jeff 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    158  11125     Jeff 	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
    159  11125     Jeff 	    &dp->dp_scrub_ddt_class_max, tx));
    160   7046   ahrens 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    161   7046   ahrens 	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
    162   7046   ahrens 	    &dp->dp_spa->spa_scrub_errors, tx));
    163   7046   ahrens 
    164   7046   ahrens 	spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
    165   7046   ahrens 	    "func=%u mintxg=%llu maxtxg=%llu",
    166   7046   ahrens 	    *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
    167   7046   ahrens }
    168   7046   ahrens 
    169   7046   ahrens int
    170   7046   ahrens dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
    171   7046   ahrens {
                        	/*
                        	 * Start a scrub of the given type by running
                        	 * dsl_pool_scrub_setup_sync() as a sync task.  Returns whatever
                        	 * dsl_sync_task_do() returns.  The address of the by-value
                        	 * parameter is handed to the task, so this presumably relies on
                        	 * dsl_sync_task_do() not returning before the task has run --
                        	 * confirm.
                        	 */
    172   7046   ahrens 	return (dsl_sync_task_do(dp, NULL,
    173   7046   ahrens 	    dsl_pool_scrub_setup_sync, dp, &func, 0));
    174   7046   ahrens }
    175   7046   ahrens 
    176   7046   ahrens /* ARGSUSED */
    177   7046   ahrens static void
    178   7046   ahrens dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    179   7046   ahrens {
                        	/*
                        	 * Sync-task callback that tears down the current scrub, whether
                        	 * it finished or is being cancelled.
                        	 *
                        	 * arg1 is the dsl_pool_t; arg2 points at a boolean_t saying
                        	 * whether the scrub ran to completion.  Does nothing if no scrub
                        	 * is active.  Otherwise, under dp_scrub_cancel_lock, it waits for
                        	 * in-flight scrub i/o to drain, frees the queue object, removes
                        	 * the scrub entries from the pool directory object, logs the
                        	 * result, and updates vdev scrub stats and DTLs.
                        	 */
    180   7046   ahrens 	dsl_pool_t *dp = arg1;
    181   7046   ahrens 	boolean_t *completep = arg2;
    182   7046   ahrens 
    183   7046   ahrens 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    184   7046   ahrens 		return;
    185   7046   ahrens 
    186   7046   ahrens 	mutex_enter(&dp->dp_scrub_cancel_lock);
    187   7046   ahrens 
                        	/* A pending restart request means this pass is not complete. */
    188   7046   ahrens 	if (dp->dp_scrub_restart) {
    189   7046   ahrens 		dp->dp_scrub_restart = B_FALSE;
    190   7046   ahrens 		*completep = B_FALSE;
    191   7046   ahrens 	}
    192   7046   ahrens 
    193   7046   ahrens 	/* XXX this is scrub-clean specific */
    194   7046   ahrens 	mutex_enter(&dp->dp_spa->spa_scrub_lock);
    195   7046   ahrens 	while (dp->dp_spa->spa_scrub_inflight > 0) {
    196   7046   ahrens 		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
    197   7046   ahrens 		    &dp->dp_spa->spa_scrub_lock);
    198   7046   ahrens 	}
    199   7046   ahrens 	mutex_exit(&dp->dp_spa->spa_scrub_lock);
    200   7046   ahrens 	dp->dp_spa->spa_scrub_started = B_FALSE;
    201   7160   ahrens 	dp->dp_spa->spa_scrub_active = B_FALSE;
    202   7046   ahrens 
    203   7046   ahrens 	dp->dp_scrub_func = SCRUB_FUNC_NONE;
    204   7046   ahrens 	VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
    205   7046   ahrens 	    dp->dp_scrub_queue_obj, tx));
    206   7046   ahrens 	dp->dp_scrub_queue_obj = 0;
    207   7046   ahrens 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
    208  11125     Jeff 	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
    209   7046   ahrens 
    210   7046   ahrens 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    211   7046   ahrens 	    DMU_POOL_SCRUB_QUEUE, tx));
    212   7046   ahrens 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    213   7046   ahrens 	    DMU_POOL_SCRUB_MIN_TXG, tx));
    214   7046   ahrens 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    215   7046   ahrens 	    DMU_POOL_SCRUB_MAX_TXG, tx));
    216   7046   ahrens 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    217   7046   ahrens 	    DMU_POOL_SCRUB_BOOKMARK, tx));
    218   7046   ahrens 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    219   7046   ahrens 	    DMU_POOL_SCRUB_FUNC, tx));
    220   7046   ahrens 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    221   7046   ahrens 	    DMU_POOL_SCRUB_ERRORS, tx));
    222  11125     Jeff 
                        	/*
                        	 * Unlike the entries above, failure to remove the DDT entries is
                        	 * deliberately ignored; presumably a scrub started by older
                        	 * software would not have created them -- confirm.
                        	 */
    223  11125     Jeff 	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    224  11125     Jeff 	    DMU_POOL_SCRUB_DDT_BOOKMARK, tx);
    225  11125     Jeff 	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    226  11125     Jeff 	    DMU_POOL_SCRUB_DDT_CLASS_MAX, tx);
    227   7046   ahrens 
    228   7046   ahrens 	spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
    229   7046   ahrens 	    "complete=%u", *completep);
    230   7046   ahrens 
    231   7046   ahrens 	/* below is scrub-clean specific */
    232   7046   ahrens 	vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
    233   7046   ahrens 	    *completep);
    234   7046   ahrens 	/*
    235   7046   ahrens 	 * If the scrub/resilver completed, update all DTLs to reflect this.
    236   7046   ahrens 	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
    237   7046   ahrens 	 */
    238   7046   ahrens 	vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
    239   7046   ahrens 	    *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
                        	/* A non-zero min txg distinguishes a resilver from a full scrub. */
    240   8525     Eric 	if (*completep)
    241   8525     Eric 		spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
    242   8525     Eric 		    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
    243   7046   ahrens 	spa_errlog_rotate(dp->dp_spa);
    244   7046   ahrens 
    245   7046   ahrens 	/*
    246   7046   ahrens 	 * We may have finished replacing a device.
    247   7046   ahrens 	 * Let the async thread assess this and handle the detach.
    248   7046   ahrens 	 */
    249   7046   ahrens 	spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
    250   7046   ahrens 
                        	/*
                        	 * The txg range is cleared last because the DTL reassessment and
                        	 * the finish event above still read it.
                        	 */
    251   7046   ahrens 	dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
    252   7046   ahrens 	mutex_exit(&dp->dp_scrub_cancel_lock);
    253   7046   ahrens }
    254   7046   ahrens 
    255   7046   ahrens int
    256   7046   ahrens dsl_pool_scrub_cancel(dsl_pool_t *dp)
    257   7046   ahrens {
                        	/*
                        	 * Cancel any scrub in progress by running
                        	 * dsl_pool_scrub_cancel_sync() as a sync task, reporting the scrub
                        	 * as not complete.  Returns whatever dsl_sync_task_do() returns.
                        	 * NOTE(review): the meaning of the trailing 3 is not visible
                        	 * here; see the dsl_sync_task_do() prototype.
                        	 */
    258   7046   ahrens 	boolean_t complete = B_FALSE;
    259   7046   ahrens 
    260   7046   ahrens 	return (dsl_sync_task_do(dp, NULL,
    261   7046   ahrens 	    dsl_pool_scrub_cancel_sync, dp, &complete, 3));
    262   7046   ahrens }
    263   7046   ahrens 
    264  10922     Jeff void
    265  10922     Jeff dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
    266   7046   ahrens {
                        	/*
                        	 * Free the block described by bpp in the given txg.  Today this
                        	 * is a straight pass-through to zio_free() on the pool's spa.
                        	 */
    267   7046   ahrens 	/*
    268   7046   ahrens 	 * This function will be used by bp-rewrite wad to intercept frees.
    269   7046   ahrens 	 */
    270  10922     Jeff 	zio_free(dp->dp_spa, txg, bpp);
    271   7046   ahrens }
    272   7046   ahrens 
    273   7046   ahrens static boolean_t
    274   7046   ahrens bookmark_is_zero(const zbookmark_t *zb)
    275   7046   ahrens {
                        	/*
                        	 * Return B_TRUE when all four bookmark fields are zero.  Callers
                        	 * in this file treat an all-zero dp_scrub_bookmark as "not
                        	 * resuming from a saved position".
                        	 */
    276   7046   ahrens 	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
    277   7046   ahrens 	    zb->zb_level == 0 && zb->zb_blkid == 0);
    278   7046   ahrens }
    279   7046   ahrens 
    280   7046   ahrens /* dnp is the dnode for zb1->zb_object */
    281   7046   ahrens static boolean_t
    282   7046   ahrens bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
    283   7046   ahrens     const zbookmark_t *zb2)
    284   7046   ahrens {
                        	/*
                        	 * Return B_TRUE if the block named by zb1, together with
                        	 * everything beneath it, lies entirely before the level-0
                        	 * bookmark zb2.  Both bookmarks must be in the same objset.
                        	 * scrub_visitbp() uses this to skip subtrees already visited
                        	 * before a pause.
                        	 *
                        	 * zb1nextL0:  first level-0 block id past the range zb1 covers.
                        	 * zb2thisobj: object number zb2 refers to.  When zb2->zb_object
                        	 *             is 0 its block id is scaled by the shift
                        	 *             difference into an object number (presumably
                        	 *             the first dnode held in that meta-dnode block).
                        	 */
    285   7182   ahrens 	uint64_t zb1nextL0, zb2thisobj;
    286   7046   ahrens 
    287   7046   ahrens 	ASSERT(zb1->zb_objset == zb2->zb_objset);
    288  10922     Jeff 	ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT);
    289   7046   ahrens 	ASSERT(zb2->zb_level == 0);
    290   7046   ahrens 
    291   7046   ahrens 	/*
    292   7046   ahrens 	 * A bookmark in the deadlist is considered to be after
    293   7046   ahrens 	 * everything else.
    294   7046   ahrens 	 */
    295  10922     Jeff 	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
    296   7046   ahrens 		return (B_TRUE);
    297   7046   ahrens 
    298   7046   ahrens 	/* The objset_phys_t isn't before anything. */
    299   7046   ahrens 	if (dnp == NULL)
    300   7046   ahrens 		return (B_FALSE);
    301   7046   ahrens 
                        	/*
                        	 * Each extra level multiplies the span by the number of block
                        	 * pointers per indirect block, 2^(dn_indblkshift -
                        	 * SPA_BLKPTRSHIFT).
                        	 */
    302   7182   ahrens 	zb1nextL0 = (zb1->zb_blkid + 1) <<
    303   7046   ahrens 	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
    304   7046   ahrens 
    305   7182   ahrens 	zb2thisobj = zb2->zb_object ? zb2->zb_object :
    306   7182   ahrens 	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
    307   7182   ahrens 
                        	/*
                        	 * zb1 is a block of the meta-dnode: convert the end of its span
                        	 * from blocks to bytes to an object number, then compare that
                        	 * with zb2's object.
                        	 */
    308  10922     Jeff 	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
    309   7182   ahrens 		uint64_t nextobj = zb1nextL0 *
    310   7046   ahrens 		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
    311   7182   ahrens 		return (nextobj <= zb2thisobj);
    312   7046   ahrens 	}
    313   7046   ahrens 
    314   7182   ahrens 	if (zb1->zb_object < zb2thisobj)
    315   7046   ahrens 		return (B_TRUE);
    316   7182   ahrens 	if (zb1->zb_object > zb2thisobj)
    317   7046   ahrens 		return (B_FALSE);
                        	/*
                        	 * Same object number, but zb2 names a meta-dnode block rather
                        	 * than a block of that object, so zb1 is not before it.
                        	 */
    318  10922     Jeff 	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
    319   7182   ahrens 		return (B_FALSE);
    320   7182   ahrens 	return (zb1nextL0 <= zb2->zb_blkid);
    321   7046   ahrens }
    322   7046   ahrens 
    323   7046   ahrens static boolean_t
    324  11125     Jeff scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb)
    325   7046   ahrens {
                        	/*
                        	 * Decide whether the scrub should stop working for this txg.
                        	 *
                        	 * zb (may be NULL) is the position in the block traversal; ddb
                        	 * (may be NULL) is the position in the DDT traversal.  Returns
                        	 * B_TRUE if a pause is already in progress or is started here.
                        	 * When a pause starts, zb is saved in dp_scrub_bookmark and
                        	 * dp_scrub_pausing is set.  Nothing is copied for ddb: it is
                        	 * only asserted to be the pool's own dp_scrub_ddt_bookmark.
                        	 *
                        	 * A pause starts when the time since dp_scrub_start_time exceeds
                        	 * zfs_txg_timeout, or exceeds the scrub/resilver minimum while
                        	 * txg_sync_waiting() is true.  Unit conversions assume NANOSEC
                        	 * and MICROSEC are 10^9 and 10^6 -- confirm.
                        	 */
    326  11137      Lin 	uint64_t elapsed_nanosecs;
    327   7160   ahrens 	int mintime;
    328   7046   ahrens 
    329   7046   ahrens 	if (dp->dp_scrub_pausing)
    330   7046   ahrens 		return (B_TRUE); /* we're already pausing */
    331   7046   ahrens 
    332   7046   ahrens 	if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
    333   7046   ahrens 		return (B_FALSE); /* we're resuming */
    334   7046   ahrens 
    335   7182   ahrens 	/* We only know how to resume from level-0 blocks. */
    336  11125     Jeff 	if (zb != NULL && zb->zb_level != 0)
    337   7046   ahrens 		return (B_FALSE);
    338   7046   ahrens 
    339  11182      Lin 	mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time_ms :
    340  11182      Lin 	    zfs_scrub_min_time_ms;
    341  11137      Lin 	elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time;
    342  11137      Lin 	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
    343  11137      Lin 	    (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) {
    344  11125     Jeff 		if (zb) {
    345  11125     Jeff 			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
    346  11125     Jeff 			    (longlong_t)zb->zb_objset,
    347  11125     Jeff 			    (longlong_t)zb->zb_object,
    348  11125     Jeff 			    (longlong_t)zb->zb_level,
    349  11125     Jeff 			    (longlong_t)zb->zb_blkid);
    350  11125     Jeff 			dp->dp_scrub_bookmark = *zb;
    351  11125     Jeff 		}
    352  11125     Jeff 		if (ddb) {
    353  11125     Jeff 			dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
    354  11125     Jeff 			    (longlong_t)ddb->ddb_class,
    355  11125     Jeff 			    (longlong_t)ddb->ddb_type,
    356  11125     Jeff 			    (longlong_t)ddb->ddb_checksum,
    357  11125     Jeff 			    (longlong_t)ddb->ddb_cursor);
    358  11125     Jeff 			ASSERT(&dp->dp_scrub_ddt_bookmark == ddb);
    359  11125     Jeff 		}
    360   7046   ahrens 		dp->dp_scrub_pausing = B_TRUE;
    361   7046   ahrens 		return (B_TRUE);
    362   7046   ahrens 	}
    363   7046   ahrens 	return (B_FALSE);
    364   7046   ahrens }
    365   7046   ahrens 
    366   7754     Jeff typedef struct zil_traverse_arg {
                        	/* Context handed by traverse_zil() to the zil_parse() callbacks. */
    367   7754     Jeff 	dsl_pool_t	*zta_dp;	/* pool being scrubbed */
    368   7754     Jeff 	zil_header_t	*zta_zh;	/* header of the log being walked */
    369   7754     Jeff } zil_traverse_arg_t;
    370   7754     Jeff 
    371   7046   ahrens /* ARGSUSED */
    372  10922     Jeff static int
    373   7046   ahrens traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
    374   7046   ahrens {
                        	/*
                        	 * Log-block callback passed to zil_parse() by traverse_zil().
                        	 * arg is a zil_traverse_arg_t.  Blocks outside the scrub's txg
                        	 * range or not eligible (see below) are skipped.  Otherwise the
                        	 * block is handed to the active scrub callback under a ZIL
                        	 * bookmark, and that callback must succeed.  Always returns 0.
                        	 */
    375   7754     Jeff 	zil_traverse_arg_t *zta = arg;
    376   7754     Jeff 	dsl_pool_t *dp = zta->zta_dp;
    377   7754     Jeff 	zil_header_t *zh = zta->zta_zh;
    378   7754     Jeff 	zbookmark_t zb;
    379   7046   ahrens 
    380   7046   ahrens 	if (bp->blk_birth <= dp->dp_scrub_min_txg)
    381  10922     Jeff 		return (0);
    382   7046   ahrens 
    383   8746  Matthew 	/*
    384   9701   George 	 * One block ("stubby") can be allocated a long time ago; we
    385   8746  Matthew 	 * want to visit that one because it has been allocated
    386   8746  Matthew 	 * (on-disk) even if it hasn't been claimed (even though for
    387   8746  Matthew 	 * plain scrub there's nothing to do to it).
    388   8746  Matthew 	 */
    389   7754     Jeff 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
    390  10922     Jeff 		return (0);
    391   7754     Jeff 
                        	/* Objset id comes from the header's checksum words, blkid from the block's sequence word. */
    392  10922     Jeff 	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
    393  10922     Jeff 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
    394  10922     Jeff 
    395   7754     Jeff 	VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
    396  10922     Jeff 	return (0);
    397   7046   ahrens }
    398   7046   ahrens 
    399   7046   ahrens /* ARGSUSED */
    400  10922     Jeff static int
    401   7046   ahrens traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
    402   7046   ahrens {
                        	/*
                        	 * Log-record callback passed to zil_parse() by traverse_zil().
                        	 * Only TX_WRITE records are examined.  The block pointer embedded
                        	 * in the record is handed to the active scrub callback unless it
                        	 * falls outside the scrub's txg range, the log is unclaimed, or
                        	 * the block was born before claim_txg.  Always returns 0.
                        	 */
    403   7046   ahrens 	if (lrc->lrc_txtype == TX_WRITE) {
    404   7754     Jeff 		zil_traverse_arg_t *zta = arg;
    405   7754     Jeff 		dsl_pool_t *dp = zta->zta_dp;
    406   7754     Jeff 		zil_header_t *zh = zta->zta_zh;
    407   7046   ahrens 		lr_write_t *lr = (lr_write_t *)lrc;
    408   7046   ahrens 		blkptr_t *bp = &lr->lr_blkptr;
    409   7754     Jeff 		zbookmark_t zb;
    410   7046   ahrens 
    411   7046   ahrens 		if (bp->blk_birth <= dp->dp_scrub_min_txg)
    412  10922     Jeff 			return (0);
    413   7046   ahrens 
    414   8746  Matthew 		/*
    415   8746  Matthew 		 * birth can be < claim_txg if this record's txg is
    416   8746  Matthew 		 * already txg sync'ed (but this log block contains
    417   8746  Matthew 		 * other records that are not synced)
    418   8746  Matthew 		 */
    419   7754     Jeff 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
    420  10922     Jeff 			return (0);
    421   7754     Jeff 
                        		/* Bookmark: the written object, with the block id derived from the write offset. */
    422  10922     Jeff 		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
    423  10922     Jeff 		    lr->lr_foid, ZB_ZIL_LEVEL,
    424  10922     Jeff 		    lr->lr_offset / BP_GET_LSIZE(bp));
    425  10922     Jeff 
    426   7754     Jeff 		VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
    427   7046   ahrens 	}
    428  10922     Jeff 	return (0);
    429   7046   ahrens }
    430   7046   ahrens 
    431   7046   ahrens static void
    432   7046   ahrens traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
    433   7046   ahrens {
                        	/*
                        	 * Scrub the intent log described by zh.  A temporary zilog is
                        	 * allocated, walked with zil_parse() using traverse_zil_block()
                        	 * and traverse_zil_record(), and then freed.  The result of
                        	 * zil_parse() is deliberately ignored.
                        	 */
    434   7046   ahrens 	uint64_t claim_txg = zh->zh_claim_txg;
    435   7754     Jeff 	zil_traverse_arg_t zta = { dp, zh };
    436   7046   ahrens 	zilog_t *zilog;
    437   7046   ahrens 
    438   7046   ahrens 	/*
    439   7046   ahrens 	 * We only want to visit blocks that have been claimed but not yet
    440   7046   ahrens 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
    441   7046   ahrens 	 */
    442   8241     Jeff 	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
    443   7046   ahrens 		return;
    444   7046   ahrens 
                        	/*
                        	 * NOTE(review): the zilog is built on the pool's meta objset,
                        	 * not on the objset that owns zh; presumably it serves only as a
                        	 * parsing handle -- confirm.
                        	 */
    445   7046   ahrens 	zilog = zil_alloc(dp->dp_meta_objset, zh);
    446   7046   ahrens 
    447   7754     Jeff 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
    448   7046   ahrens 	    claim_txg);
    449   7046   ahrens 
    450   7046   ahrens 	zil_free(zilog);
    451   7046   ahrens }
    452   7046   ahrens 
    453   7046   ahrens static void
    454  11147   George scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset,
    455  11147   George     uint64_t object, uint64_t blkid)
    456  11147   George {
                        	/*
                        	 * Issue a non-blocking ARC prefetch read for the child block bp
                        	 * (buf is the parent's buffer; objset/object/blkid form its
                        	 * bookmark).  Nothing is issued when prefetch is disabled, for
                        	 * holes, for blocks born at or before dp_scrub_min_txg, or for
                        	 * level-0 blocks that are not dnode blocks.  The read is attached
                        	 * to dp_scrub_prefetch_zio_root and its result is ignored.
                        	 */
    457  11147   George 	zbookmark_t czb;
    458  11147   George 	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
    459  11147   George 
    460  11147   George 	if (zfs_no_scrub_prefetch)
    461  11147   George 		return;
    462  11147   George 
    463  11147   George 	if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg ||
    464  11147   George 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
    465  11147   George 		return;
    466  11147   George 
    467  11147   George 	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
    468  11147   George 
    469  11147   George 	(void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp,
    470  11147   George 	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
    471  11147   George 	    &flags, &czb);
    472  11147   George }
    473  11147   George 
    474  11147   George static void
    475   7046   ahrens scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
    476   7046   ahrens     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
    477   7046   ahrens {
                        	/*
                        	 * Visit one block pointer and, recursively, everything below it.
                        	 *
                        	 * dnp is the dnode that owns bp (NULL for an objset root), pbuf
                        	 * is the ARC buffer of the parent block, and zb is bp's bookmark.
                        	 * Blocks born at or before dp_scrub_min_txg are skipped, as is
                        	 * everything once scrub_pause() says to stop.  While resuming,
                        	 * subtrees that lie before the saved bookmark are skipped too.
                        	 *
                        	 * The active scrub callback is applied to bp itself, then the
                        	 * block is read and its children visited: block pointers of an
                        	 * indirect block, dnodes of a dnode block, or the ZIL and
                        	 * embedded dnodes of an objset block.  A failed read bumps
                        	 * spa_scrub_errors and abandons the subtree.
                        	 */
    478   7046   ahrens 	int err;
    479   7046   ahrens 	arc_buf_t *buf = NULL;
    480   7046   ahrens 
    481   7046   ahrens 	if (bp->blk_birth <= dp->dp_scrub_min_txg)
    482   7046   ahrens 		return;
    483   7046   ahrens 
    484  11125     Jeff 	if (scrub_pause(dp, zb, NULL))
    485   7046   ahrens 		return;
    486   7046   ahrens 
    487   7046   ahrens 	if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
    488   7046   ahrens 		/*
    489   7046   ahrens 		 * If we already visited this bp & everything below (in
    490   7046   ahrens 		 * a prior txg), don't bother doing it again.
    491   7046   ahrens 		 */
    492   7046   ahrens 		if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
    493   7046   ahrens 			return;
    494   7046   ahrens 
    495   7046   ahrens 		/*
    496   7046   ahrens 		 * If we found the block we're trying to resume from, or
    497   7046   ahrens 		 * we went past it to a different object, zero it out to
    498   7046   ahrens 		 * indicate that it's OK to start checking for pausing
    499   7046   ahrens 		 * again.
    500   7046   ahrens 		 */
    501   7046   ahrens 		if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
    502   7046   ahrens 		    zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
    503   7046   ahrens 			dprintf("resuming at %llx/%llx/%llx/%llx\n",
    504   7046   ahrens 			    (longlong_t)zb->zb_objset,
    505   7046   ahrens 			    (longlong_t)zb->zb_object,
    506   7046   ahrens 			    (longlong_t)zb->zb_level,
    507   7046   ahrens 			    (longlong_t)zb->zb_blkid);
    508   7046   ahrens 			bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
    509   7046   ahrens 		}
    510   7046   ahrens 	}
    511   7046   ahrens 
    512  11147   George 	/*
    513  11147   George 	 * If dsl_pool_scrub_ddt() has already scrubbed this block,
    514  11147   George 	 * don't scrub it again.
    515  11147   George 	 */
    516  11147   George 	if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp))
    517  11147   George 		(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
    518  11147   George 
    519   7046   ahrens 	if (BP_GET_LEVEL(bp) > 0) {
                        		/* Indirect block: its payload is an array of epb block pointers. */
    520   7046   ahrens 		uint32_t flags = ARC_WAIT;
    521   7046   ahrens 		int i;
    522   7046   ahrens 		blkptr_t *cbp;
    523   7046   ahrens 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
    524   7046   ahrens 
    525   7046   ahrens 		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
    526   7046   ahrens 		    arc_getbuf_func, &buf,
    527   7046   ahrens 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    528   7046   ahrens 		if (err) {
    529   7046   ahrens 			mutex_enter(&dp->dp_spa->spa_scrub_lock);
    530   7046   ahrens 			dp->dp_spa->spa_scrub_errors++;
    531   7046   ahrens 			mutex_exit(&dp->dp_spa->spa_scrub_lock);
    532   7046   ahrens 			return;
    533   7046   ahrens 		}
                        		/* First pass: start prefetch of every child before visiting any. */
    534  11147   George 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
    535  11147   George 			scrub_prefetch(dp, buf, cbp, zb->zb_objset,
    536  11147   George 			    zb->zb_object, zb->zb_blkid * epb + i);
    537  11147   George 		}
                        		/* Second pass: recurse into each child, one level down. */
    538  11147   George 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
    539   7046   ahrens 			zbookmark_t czb;
    540   7046   ahrens 
    541   7046   ahrens 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
    542   7046   ahrens 			    zb->zb_level - 1,
    543   7046   ahrens 			    zb->zb_blkid * epb + i);
    544   7046   ahrens 			scrub_visitbp(dp, dnp, buf, cbp, &czb);
    545   7046   ahrens 		}
    546   7046   ahrens 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
                        		/* Level-0 dnode block: its payload is an array of epb dnodes. */
    547   7046   ahrens 		uint32_t flags = ARC_WAIT;
    548  11147   George 		dnode_phys_t *cdnp;
    549  11147   George 		int i, j;
    550   7046   ahrens 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
    551   7046   ahrens 
    552   7046   ahrens 		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
    553   7046   ahrens 		    arc_getbuf_func, &buf,
    554   7046   ahrens 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    555   7046   ahrens 		if (err) {
    556   7046   ahrens 			mutex_enter(&dp->dp_spa->spa_scrub_lock);
    557   7046   ahrens 			dp->dp_spa->spa_scrub_errors++;
    558   7046   ahrens 			mutex_exit(&dp->dp_spa->spa_scrub_lock);
    559   7046   ahrens 			return;
    560   7046   ahrens 		}
                        		/* Prefetch every block pointer of every dnode in this block. */
    561  11147   George 		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
    562  11147   George 			for (j = 0; j < cdnp->dn_nblkptr; j++) {
    563  11147   George 				blkptr_t *cbp = &cdnp->dn_blkptr[j];
    564  11147   George 				scrub_prefetch(dp, buf, cbp, zb->zb_objset,
    565  11147   George 				    zb->zb_blkid * epb + i, j);
    566  11147   George 			}
    567  11147   George 		}
                        		/* The i'th dnode of this block is object number blkid * epb + i. */
    568  11147   George 		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
    569  11147   George 			scrub_visitdnode(dp, cdnp, buf, zb->zb_objset,
    570   9396  Matthew 			    zb->zb_blkid * epb + i);
    571   7046   ahrens 		}
    572   7046   ahrens 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
                        		/* Objset block: read without a parent buffer (pbuf is unused here). */
    573   7046   ahrens 		uint32_t flags = ARC_WAIT;
    574   7046   ahrens 		objset_phys_t *osp;
    575   7046   ahrens 
    576   7046   ahrens 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
    577   7046   ahrens 		    arc_getbuf_func, &buf,
    578   7046   ahrens 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    579   7046   ahrens 		if (err) {
    580   7046   ahrens 			mutex_enter(&dp->dp_spa->spa_scrub_lock);
    581   7046   ahrens 			dp->dp_spa->spa_scrub_errors++;
    582   7046   ahrens 			mutex_exit(&dp->dp_spa->spa_scrub_lock);
    583   7046   ahrens 			return;
    584   7046   ahrens 		}
    585   7046   ahrens 
    586   7046   ahrens 		osp = buf->b_data;
    587   7046   ahrens 
    588   7046   ahrens 		traverse_zil(dp, &osp->os_zil_header);
    589   7046   ahrens 
    590   9396  Matthew 		scrub_visitdnode(dp, &osp->os_meta_dnode,
    591  10922     Jeff 		    buf, zb->zb_objset, DMU_META_DNODE_OBJECT);
                        		/*
                        		 * The user/group accounting dnodes are visited only when
                        		 * the buffer is large enough to hold a full objset_phys_t;
                        		 * presumably smaller blocks predate those fields.
                        		 */
    592   9396  Matthew 		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
    593   9396  Matthew 			scrub_visitdnode(dp, &osp->os_userused_dnode,
    594  10922     Jeff 			    buf, zb->zb_objset, DMU_USERUSED_OBJECT);
    595   9396  Matthew 			scrub_visitdnode(dp, &osp->os_groupused_dnode,
    596  10922     Jeff 			    buf, zb->zb_objset, DMU_GROUPUSED_OBJECT);
    597   7046   ahrens 		}
    598   7046   ahrens 	}
    599  11125     Jeff 
                        	/* Drop the reference on the block read above, if one was read. */
    600   7046   ahrens 	if (buf)
    601   7046   ahrens 		(void) arc_buf_remove_ref(buf, &buf);
    602   9396  Matthew }
    603   9396  Matthew 
    604   9396  Matthew static void
    605   9396  Matthew scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
    606   9396  Matthew     uint64_t objset, uint64_t object)
    607   9396  Matthew {
    608   9396  Matthew 	int j;
    609   9396  Matthew 
    610   9396  Matthew 	for (j = 0; j < dnp->dn_nblkptr; j++) {
    611   9396  Matthew 		zbookmark_t czb;
    612   9396  Matthew 
    613   9396  Matthew 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
    614   9396  Matthew 		scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
    615   9396  Matthew 	}
    616   7046   ahrens }
    617   7046   ahrens 
    618   7046   ahrens static void
    619   7046   ahrens scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
    620   7046   ahrens {
    621   7046   ahrens 	zbookmark_t zb;
    622   7046   ahrens 
    623  10922     Jeff 	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
    624  10922     Jeff 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
    625   7046   ahrens 	scrub_visitbp(dp, NULL, NULL, bp, &zb);
    626   7046   ahrens }
    627   7046   ahrens 
    628   7046   ahrens void
    629   7046   ahrens dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
    630   7046   ahrens {
    631   7046   ahrens 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
    632   7046   ahrens 
    633   7046   ahrens 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    634   7046   ahrens 		return;
    635   7046   ahrens 
    636   7046   ahrens 	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
    637  11125     Jeff 		SET_BOOKMARK(&dp->dp_scrub_bookmark,
    638  11125     Jeff 		    ZB_DESTROYED_OBJSET, 0, 0, 0);
    639   7046   ahrens 	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    640   7046   ahrens 	    ds->ds_object, tx) != 0) {
    641   7046   ahrens 		return;
    642   7046   ahrens 	}
    643   7046   ahrens 
    644   7046   ahrens 	if (ds->ds_phys->ds_next_snap_obj != 0) {
    645   7046   ahrens 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    646   7046   ahrens 		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
    647   7046   ahrens 	}
    648   7046   ahrens 	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
    649   7046   ahrens }
    650   7046   ahrens 
    651   7046   ahrens void
    652   7046   ahrens dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
    653   7046   ahrens {
    654   7046   ahrens 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
    655   7046   ahrens 
    656   7046   ahrens 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    657   7046   ahrens 		return;
    658   7046   ahrens 
    659   7046   ahrens 	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
    660   7046   ahrens 
    661   7046   ahrens 	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
    662   7046   ahrens 		dp->dp_scrub_bookmark.zb_objset =
    663   7046   ahrens 		    ds->ds_phys->ds_prev_snap_obj;
    664   7046   ahrens 	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    665   7046   ahrens 	    ds->ds_object, tx) == 0) {
    666   7046   ahrens 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    667   7046   ahrens 		    ds->ds_phys->ds_prev_snap_obj, tx) == 0);
    668   7046   ahrens 	}
    669   7046   ahrens }
    670   7046   ahrens 
    671   7837  Matthew void
    672   7837  Matthew dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
    673   7837  Matthew {
    674   7837  Matthew 	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
    675   7837  Matthew 
    676   7837  Matthew 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    677   7837  Matthew 		return;
    678   7837  Matthew 
    679   7837  Matthew 	if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
    680   7837  Matthew 		dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
    681   7837  Matthew 	} else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
    682   7837  Matthew 		dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
    683   7837  Matthew 	}
    684   7837  Matthew 
    685   7837  Matthew 	if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    686   7837  Matthew 	    ds1->ds_object, tx) == 0) {
    687   7837  Matthew 		int err = zap_add_int(dp->dp_meta_objset,
    688   7837  Matthew 		    dp->dp_scrub_queue_obj, ds2->ds_object, tx);
    689   7837  Matthew 		VERIFY(err == 0 || err == EEXIST);
    690   7837  Matthew 		if (err == EEXIST) {
    691   7837  Matthew 			/* Both were there to begin with */
    692   7837  Matthew 			VERIFY(0 == zap_add_int(dp->dp_meta_objset,
    693   7837  Matthew 			    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
    694   7837  Matthew 		}
    695   7837  Matthew 	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    696   7837  Matthew 	    ds2->ds_object, tx) == 0) {
    697   7837  Matthew 		VERIFY(0 == zap_add_int(dp->dp_meta_objset,
    698   7837  Matthew 		    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
    699   7837  Matthew 	}
    700   7837  Matthew }
    701   7837  Matthew 
    702   7046   ahrens struct enqueue_clones_arg {
    703   7046   ahrens 	dmu_tx_t *tx;
    704   7046   ahrens 	uint64_t originobj;
    705   7046   ahrens };
    706   7046   ahrens 
    707   7046   ahrens /* ARGSUSED */
    708   7046   ahrens static int
    709   7046   ahrens enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
    710   7046   ahrens {
    711   7046   ahrens 	struct enqueue_clones_arg *eca = arg;
    712   7046   ahrens 	dsl_dataset_t *ds;
    713   7046   ahrens 	int err;
    714   7046   ahrens 	dsl_pool_t *dp;
    715   7046   ahrens 
    716   7046   ahrens 	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
    717   7046   ahrens 	if (err)
    718   7046   ahrens 		return (err);
    719   7046   ahrens 	dp = ds->ds_dir->dd_pool;
    720   7046   ahrens 
    721   7046   ahrens 	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
    722   7046   ahrens 		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
    723   7046   ahrens 			dsl_dataset_t *prev;
    724   7046   ahrens 			err = dsl_dataset_hold_obj(dp,
    725   7046   ahrens 			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
    726   7046   ahrens 
    727   7046   ahrens 			dsl_dataset_rele(ds, FTAG);
    728   7046   ahrens 			if (err)
    729   7046   ahrens 				return (err);
    730   7046   ahrens 			ds = prev;
    731   7046   ahrens 		}
    732   7046   ahrens 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    733   7046   ahrens 		    ds->ds_object, eca->tx) == 0);
    734   7046   ahrens 	}
    735   7046   ahrens 	dsl_dataset_rele(ds, FTAG);
    736   7046   ahrens 	return (0);
    737   7046   ahrens }
    738   7046   ahrens 
    739   7046   ahrens static void
    740   7046   ahrens scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
    741   7046   ahrens {
    742   7046   ahrens 	dsl_dataset_t *ds;
    743   7046   ahrens 	uint64_t min_txg_save;
    744   7046   ahrens 
    745   7046   ahrens 	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
    746   7046   ahrens 
    747   7046   ahrens 	/*
    748   7046   ahrens 	 * Iterate over the bps in this ds.
    749   7046   ahrens 	 */
    750   7046   ahrens 	min_txg_save = dp->dp_scrub_min_txg;
    751   7046   ahrens 	dp->dp_scrub_min_txg =
    752   7046   ahrens 	    MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
    753   7046   ahrens 	scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
    754   7046   ahrens 	dp->dp_scrub_min_txg = min_txg_save;
    755   7046   ahrens 
    756   7046   ahrens 	if (dp->dp_scrub_pausing)
    757   7046   ahrens 		goto out;
    758   7046   ahrens 
    759   7046   ahrens 	/*
    760   7046   ahrens 	 * Add descendent datasets to work queue.
    761   7046   ahrens 	 */
    762   7046   ahrens 	if (ds->ds_phys->ds_next_snap_obj != 0) {
    763   7046   ahrens 		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    764   7046   ahrens 		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
    765   7046   ahrens 	}
    766   7046   ahrens 	if (ds->ds_phys->ds_num_children > 1) {
    767  10801  Matthew 		boolean_t usenext = B_FALSE;
    768  10801  Matthew 		if (ds->ds_phys->ds_next_clones_obj != 0) {
    769  10801  Matthew 			uint64_t count;
    770  10801  Matthew 			/*
    771  10801  Matthew 			 * A bug in a previous version of the code could
    772  10801  Matthew 			 * cause upgrade_clones_cb() to not set
    773  10801  Matthew 			 * ds_next_snap_obj when it should, leading to a
    774  10801  Matthew 			 * missing entry.  Therefore we can only use the
    775  10801  Matthew 			 * next_clones_obj when its count is correct.
    776  10801  Matthew 			 */
    777  10801  Matthew 			int err = zap_count(dp->dp_meta_objset,
    778  10801  Matthew 			    ds->ds_phys->ds_next_clones_obj, &count);
    779  10801  Matthew 			if (err == 0 &&
    780  10801  Matthew 			    count == ds->ds_phys->ds_num_children - 1)
    781  10801  Matthew 				usenext = B_TRUE;
    782  10801  Matthew 		}
    783  10801  Matthew 
    784  10801  Matthew 		if (usenext) {
    785  10801  Matthew 			VERIFY(zap_join(dp->dp_meta_objset,
    786  10801  Matthew 			    ds->ds_phys->ds_next_clones_obj,
    787  10801  Matthew 			    dp->dp_scrub_queue_obj, tx) == 0);
    788  10801  Matthew 		} else {
    789   7046   ahrens 			struct enqueue_clones_arg eca;
    790   7046   ahrens 			eca.tx = tx;
    791   7046   ahrens 			eca.originobj = ds->ds_object;
    792   7046   ahrens 
    793   7046   ahrens 			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
    794   7046   ahrens 			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
    795   7046   ahrens 		}
    796   7046   ahrens 	}
    797   7046   ahrens 
    798   7046   ahrens out:
    799   7046   ahrens 	dsl_dataset_rele(ds, FTAG);
    800   7046   ahrens }
    801   7046   ahrens 
    802   7046   ahrens /* ARGSUSED */
    803   7046   ahrens static int
    804   7046   ahrens enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
    805   7046   ahrens {
    806   7046   ahrens 	dmu_tx_t *tx = arg;
    807   7046   ahrens 	dsl_dataset_t *ds;
    808   7046   ahrens 	int err;
    809   7046   ahrens 	dsl_pool_t *dp;
    810   7046   ahrens 
    811   7046   ahrens 	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
    812   7046   ahrens 	if (err)
    813   7046   ahrens 		return (err);
    814   7046   ahrens 
    815   7046   ahrens 	dp = ds->ds_dir->dd_pool;
    816   7046   ahrens 
    817   7046   ahrens 	while (ds->ds_phys->ds_prev_snap_obj != 0) {
    818   7046   ahrens 		dsl_dataset_t *prev;
    819   7046   ahrens 		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
    820   7046   ahrens 		    FTAG, &prev);
    821   7046   ahrens 		if (err) {
    822   7046   ahrens 			dsl_dataset_rele(ds, FTAG);
    823   7046   ahrens 			return (err);
    824   7046   ahrens 		}
    825   7046   ahrens 
    826   7046   ahrens 		/*
    827   7046   ahrens 		 * If this is a clone, we don't need to worry about it for now.
    828   7046   ahrens 		 */
    829   7046   ahrens 		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
    830   7046   ahrens 			dsl_dataset_rele(ds, FTAG);
    831   7046   ahrens 			dsl_dataset_rele(prev, FTAG);
    832   7046   ahrens 			return (0);
    833   7046   ahrens 		}
    834   7046   ahrens 		dsl_dataset_rele(ds, FTAG);
    835   7046   ahrens 		ds = prev;
    836   7046   ahrens 	}
    837   7046   ahrens 
    838   7046   ahrens 	VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
    839   7046   ahrens 	    ds->ds_object, tx) == 0);
    840   7046   ahrens 	dsl_dataset_rele(ds, FTAG);
    841   7046   ahrens 	return (0);
    842   7046   ahrens }
    843   7046   ahrens 
    844  11125     Jeff /*
    845  11125     Jeff  * Scrub/dedup interaction.
    846  11125     Jeff  *
    847  11125     Jeff  * If there are N references to a deduped block, we don't want to scrub it
    848  11125     Jeff  * N times -- ideally, we should scrub it exactly once.
    849  11125     Jeff  *
    850  11125     Jeff  * To prevent excess scrubbing, the scrub begins by walking the DDT
    851  11125     Jeff  * to find all blocks with refcnt > 1, and scrubs each of these once.
    852  11125     Jeff  * Then the top-down scrub begins, only visiting blocks with refcnt == 1.
    853  11125     Jeff  *
    854  11125     Jeff  * There would be nothing more to say if a block's refcnt couldn't change
    855  11125     Jeff  * during a scrub, but of course it can.  There are two cases to consider.
    856  11125     Jeff  *
    857  11125     Jeff  * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
    858  11125     Jeff  * when visited during the top-down scrub phase, it will be scrubbed twice.
    859  11125     Jeff  * This negates our scrub optimization, but is otherwise harmless.
    860  11125     Jeff  *
    861  11125     Jeff  * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
    862  11125     Jeff  * on each visit during the top-down scrub phase, it will never be scrubbed.
    863  11125     Jeff  * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
    864  11125     Jeff  * reference count changes; if it transitions from refcnt == 1 to refcnt > 1
    865  11125     Jeff  * while a scrub is in progress, it scrubs the block right then.
    866  11125     Jeff  *
    867  11125     Jeff  * The code does not actually use the refcnt directly, but rather uses the
    868  11125     Jeff  * dde's replication class (enum ddt_class), which serves the same purpose.
    869  11125     Jeff  */
    870  10922     Jeff static void
    871  11125     Jeff dsl_pool_scrub_ddt(dsl_pool_t *dp)
    872  10922     Jeff {
    873  11125     Jeff 	ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark;
    874  10922     Jeff 	ddt_entry_t dde;
    875  11125     Jeff 	int error;
    876  11125     Jeff 
    877  11125     Jeff 	while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) {
    878  11125     Jeff 		if (ddb->ddb_class > dp->dp_scrub_ddt_class_max)
    879  11125     Jeff 			return;
    880  11125     Jeff 		dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde);
    881  11125     Jeff 		if (scrub_pause(dp, NULL, ddb))
    882  11125     Jeff 			return;
    883  11125     Jeff 	}
    884  11125     Jeff 	ASSERT(error == ENOENT);
    885  11125     Jeff 	ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max);
    886  11125     Jeff }
    887  11125     Jeff 
    888  11125     Jeff void
    889  11125     Jeff dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
    890  11125     Jeff     const ddt_entry_t *dde)
    891  11125     Jeff {
    892  11125     Jeff 	const ddt_key_t *ddk = &dde->dde_key;
    893  11125     Jeff 	const ddt_phys_t *ddp = dde->dde_phys;
    894  10922     Jeff 	blkptr_t blk;
    895  10922     Jeff 	zbookmark_t zb = { 0 };
    896  10922     Jeff 
    897  11125     Jeff 	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
    898  11125     Jeff 		if (ddp->ddp_phys_birth == 0)
    899  11125     Jeff 			continue;
    900  11125     Jeff 		ddt_bp_create(checksum, ddk, ddp, &blk);
    901  10922     Jeff 		scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
    902  10922     Jeff 	}
    903  10922     Jeff }
    904  10922     Jeff 
    905   7046   ahrens void
    906   7046   ahrens dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
    907   7046   ahrens {
    908   8241     Jeff 	spa_t *spa = dp->dp_spa;
    909   7046   ahrens 	zap_cursor_t zc;
    910   7046   ahrens 	zap_attribute_t za;
    911   7046   ahrens 	boolean_t complete = B_TRUE;
    912   7046   ahrens 
    913   7046   ahrens 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
    914   7046   ahrens 		return;
    915   7046   ahrens 
    916   8241     Jeff 	/*
    917   8241     Jeff 	 * If the pool is not loaded, or is trying to unload, leave it alone.
    918   8241     Jeff 	 */
    919  11147   George 	if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa))
    920   7046   ahrens 		return;
    921   7046   ahrens 
    922   7046   ahrens 	if (dp->dp_scrub_restart) {
    923   7046   ahrens 		enum scrub_func func = dp->dp_scrub_func;
    924   7046   ahrens 		dp->dp_scrub_restart = B_FALSE;
    925   7046   ahrens 		dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
    926   7046   ahrens 	}
    927   7046   ahrens 
    928   8241     Jeff 	if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
    929   7046   ahrens 		/*
    930   7046   ahrens 		 * We must have resumed after rebooting; reset the vdev
    931   7046   ahrens 		 * stats to know that we're doing a scrub (although it
    932   7046   ahrens 		 * will think we're just starting now).
    933   7046   ahrens 		 */
    934   8241     Jeff 		vdev_scrub_stat_update(spa->spa_root_vdev,
    935   7046   ahrens 		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
    936   7046   ahrens 		    POOL_SCRUB_EVERYTHING, B_FALSE);
    937   7046   ahrens 	}
    938   7046   ahrens 
    939   7046   ahrens 	dp->dp_scrub_pausing = B_FALSE;
    940  11137      Lin 	dp->dp_scrub_start_time = gethrtime();
    941   7046   ahrens 	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
    942   8241     Jeff 	spa->spa_scrub_active = B_TRUE;
    943   7046   ahrens 
    944  11125     Jeff 	if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) {
    945  11125     Jeff 		dsl_pool_scrub_ddt(dp);
    946  11125     Jeff 		if (dp->dp_scrub_pausing)
    947  11125     Jeff 			goto out;
    948  10922     Jeff 	}
    949  10922     Jeff 
    950  10922     Jeff 	if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
    951   7046   ahrens 		/* First do the MOS & ORIGIN */
    952   7046   ahrens 		scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
    953   7046   ahrens 		if (dp->dp_scrub_pausing)
    954   7046   ahrens 			goto out;
    955   7046   ahrens 
    956   8241     Jeff 		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
    957   8241     Jeff 			VERIFY(0 == dmu_objset_find_spa(spa,
    958   7046   ahrens 			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
    959   7046   ahrens 		} else {
    960   7046   ahrens 			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
    961   7046   ahrens 		}
    962   7046   ahrens 		ASSERT(!dp->dp_scrub_pausing);
    963  10922     Jeff 	} else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) {
    964   7046   ahrens 		/*
    965  10922     Jeff 		 * If we were paused, continue from here.  Note if the ds
    966  10922     Jeff 		 * we were paused on was destroyed, the zb_objset will be
    967  10922     Jeff 		 * ZB_DESTROYED_OBJSET, so we will skip this and find a new
    968  10922     Jeff 		 * objset below.
    969   7046   ahrens 		 */
    970   7046   ahrens 		scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
    971   7046   ahrens 		if (dp->dp_scrub_pausing)
    972   7046   ahrens 			goto out;
    973   7046   ahrens 	}
    974   7046   ahrens 
    975   7046   ahrens 	/*
    976   7046   ahrens 	 * In case we were paused right at the end of the ds, zero the
    977   7046   ahrens 	 * bookmark so we don't think that we're still trying to resume.
    978   7046   ahrens 	 */
    979   7046   ahrens 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
    980   7046   ahrens 
    981   7046   ahrens 	/* keep pulling things out of the zap-object-as-queue */
    982   7046   ahrens 	while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
    983   7046   ahrens 	    zap_cursor_retrieve(&zc, &za) == 0) {
    984   7046   ahrens 		VERIFY(0 == zap_remove(dp->dp_meta_objset,
    985   7046   ahrens 		    dp->dp_scrub_queue_obj, za.za_name, tx));
    986   7046   ahrens 		scrub_visitds(dp, za.za_first_integer, tx);
    987   7046   ahrens 		if (dp->dp_scrub_pausing)
    988   7046   ahrens 			break;
    989   7046   ahrens 		zap_cursor_fini(&zc);
    990   7046   ahrens 	}
    991   7046   ahrens 	zap_cursor_fini(&zc);
    992   7046   ahrens 	if (dp->dp_scrub_pausing)
    993   7046   ahrens 		goto out;
    994   7046   ahrens 
    995   7046   ahrens 	/* done. */
    996   7046   ahrens 
    997   7046   ahrens 	dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
    998   7046   ahrens 	return;
    999   7046   ahrens out:
   1000  11125     Jeff 	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
   1001  11125     Jeff 	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
   1002  11125     Jeff 	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
   1003   7046   ahrens 	    &dp->dp_scrub_bookmark, tx));
   1004  11125     Jeff 	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
   1005  11125     Jeff 	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
   1006  11125     Jeff 	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
   1007  11125     Jeff 	    &dp->dp_scrub_ddt_bookmark, tx));
   1008  11125     Jeff 	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
   1009  11125     Jeff 	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
   1010  11125     Jeff 	    &dp->dp_scrub_ddt_class_max, tx));
   1011  11125     Jeff 	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
   1012   7046   ahrens 	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
   1013   8241     Jeff 	    &spa->spa_scrub_errors, tx));
   1014   7046   ahrens 
   1015   7046   ahrens 	/* XXX this is scrub-clean specific */
   1016   8241     Jeff 	mutex_enter(&spa->spa_scrub_lock);
   1017   8241     Jeff 	while (spa->spa_scrub_inflight > 0)
   1018   8241     Jeff 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
   1019   8241     Jeff 	mutex_exit(&spa->spa_scrub_lock);
   1020   7046   ahrens }
   1021   7046   ahrens 
   1022   7046   ahrens void
   1023   7046   ahrens dsl_pool_scrub_restart(dsl_pool_t *dp)
   1024   7046   ahrens {
   1025   7046   ahrens 	mutex_enter(&dp->dp_scrub_cancel_lock);
   1026   7046   ahrens 	dp->dp_scrub_restart = B_TRUE;
   1027   7046   ahrens 	mutex_exit(&dp->dp_scrub_cancel_lock);
   1028   7046   ahrens }
   1029   7046   ahrens 
   1030   7046   ahrens /*
   1031   7046   ahrens  * scrub consumers
   1032   7046   ahrens  */
   1033   7046   ahrens 
   1034   7046   ahrens static void
   1035   7837  Matthew count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
   1036   7837  Matthew {
   1037   7837  Matthew 	int i;
   1038   7837  Matthew 
   1039   7837  Matthew 	/*
   1040   7837  Matthew 	 * If we resume after a reboot, zab will be NULL; don't record
   1041   7837  Matthew 	 * incomplete stats in that case.
   1042   7837  Matthew 	 */
   1043   7837  Matthew 	if (zab == NULL)
   1044   7837  Matthew 		return;
   1045   7837  Matthew 
   1046   7837  Matthew 	for (i = 0; i < 4; i++) {
   1047   7837  Matthew 		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
   1048   7837  Matthew 		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
   1049   7837  Matthew 		zfs_blkstat_t *zb = &zab->zab_type[l][t];
   1050   7837  Matthew 		int equal;
   1051   7837  Matthew 
   1052   7837  Matthew 		zb->zb_count++;
   1053   7837  Matthew 		zb->zb_asize += BP_GET_ASIZE(bp);
   1054   7837  Matthew 		zb->zb_lsize += BP_GET_LSIZE(bp);
   1055   7837  Matthew 		zb->zb_psize += BP_GET_PSIZE(bp);
   1056   7837  Matthew 		zb->zb_gangs += BP_COUNT_GANG(bp);
   1057   7837  Matthew 
   1058   7837  Matthew 		switch (BP_GET_NDVAS(bp)) {
   1059   7837  Matthew 		case 2:
   1060   7837  Matthew 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
   1061   7837  Matthew 			    DVA_GET_VDEV(&bp->blk_dva[1]))
   1062   7837  Matthew 				zb->zb_ditto_2_of_2_samevdev++;
   1063   7837  Matthew 			break;
   1064   7837  Matthew 		case 3:
   1065   7837  Matthew 			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
   1066   7837  Matthew 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
   1067   7837  Matthew 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
   1068   7837  Matthew 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
   1069   7837  Matthew 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
   1070   7837  Matthew 			    DVA_GET_VDEV(&bp->blk_dva[2]));
   1071   7837  Matthew 			if (equal == 1)
   1072   7837  Matthew 				zb->zb_ditto_2_of_3_samevdev++;
   1073   7837  Matthew 			else if (equal == 3)
   1074   7837  Matthew 				zb->zb_ditto_3_of_3_samevdev++;
   1075   7837  Matthew 			break;
   1076   7837  Matthew 		}
   1077   7837  Matthew 	}
   1078   7837  Matthew }
   1079   7837  Matthew 
   1080   7837  Matthew static void
   1081   7046   ahrens dsl_pool_scrub_clean_done(zio_t *zio)
   1082   7046   ahrens {
   1083   7046   ahrens 	spa_t *spa = zio->io_spa;
   1084   7046   ahrens 
   1085   7046   ahrens 	zio_data_buf_free(zio->io_data, zio->io_size);
   1086   7046   ahrens 
   1087   7046   ahrens 	mutex_enter(&spa->spa_scrub_lock);
   1088   7046   ahrens 	spa->spa_scrub_inflight--;
   1089   7046   ahrens 	cv_broadcast(&spa->spa_scrub_io_cv);
   1090   7046   ahrens 
   1091   7754     Jeff 	if (zio->io_error && (zio->io_error != ECKSUM ||
   1092   7754     Jeff 	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)))
   1093   7046   ahrens 		spa->spa_scrub_errors++;
   1094   7046   ahrens 	mutex_exit(&spa->spa_scrub_lock);
   1095   7046   ahrens }
   1096   7046   ahrens 
   1097   7046   ahrens static int
   1098   7046   ahrens dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
   1099   7046   ahrens     const blkptr_t *bp, const zbookmark_t *zb)
   1100   7046   ahrens {
   1101   8274     Jeff 	size_t size = BP_GET_PSIZE(bp);
   1102   7046   ahrens 	spa_t *spa = dp->dp_spa;
   1103  10922     Jeff 	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
   1104   7046   ahrens 	boolean_t needs_io;
   1105   8274     Jeff 	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
   1106   7046   ahrens 	int zio_priority;
   1107   8241     Jeff 
   1108  10922     Jeff 	if (phys_birth <= dp->dp_scrub_min_txg ||
   1109  10922     Jeff 	    phys_birth >= dp->dp_scrub_max_txg)
   1110   8241     Jeff 		return (0);
   1111   7837  Matthew 
   1112   7837  Matthew 	count_block(dp->dp_blkstats, bp);
   1113   7046   ahrens 
   1114   7046   ahrens 	if (dp->dp_scrub_isresilver == 0) {
   1115   7046   ahrens 		/* It's a scrub */
   1116   7046   ahrens 		zio_flags |= ZIO_FLAG_SCRUB;
   1117   7046   ahrens 		zio_priority = ZIO_PRIORITY_SCRUB;
   1118   7046   ahrens 		needs_io = B_TRUE;
   1119   7046   ahrens 	} else {
   1120   7046   ahrens 		/* It's a resilver */
   1121   7046   ahrens 		zio_flags |= ZIO_FLAG_RESILVER;
   1122   7046   ahrens 		zio_priority = ZIO_PRIORITY_RESILVER;
   1123   7046   ahrens 		needs_io = B_FALSE;
   1124   7046   ahrens 	}
   1125   7046   ahrens 
   1126   7160   ahrens 	/* If it's an intent log block, failure is expected. */
   1127  10922     Jeff 	if (zb->zb_level == ZB_ZIL_LEVEL)
   1128   7160   ahrens 		zio_flags |= ZIO_FLAG_SPECULATIVE;
   1129   7160   ahrens 
   1130   8241     Jeff 	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
   1131   7046   ahrens 		vdev_t *vd = vdev_lookup_top(spa,
   1132   7046   ahrens 		    DVA_GET_VDEV(&bp->blk_dva[d]));
   1133   7046   ahrens 
   1134   7046   ahrens 		/*
   1135   7046   ahrens 		 * Keep track of how much data we've examined so that
   1136   7046   ahrens 		 * zpool(1M) status can make useful progress reports.
   1137   7046   ahrens 		 */
   1138   7046   ahrens 		mutex_enter(&vd->vdev_stat_lock);
   1139   7046   ahrens 		vd->vdev_stat.vs_scrub_examined +=
   1140   7046   ahrens 		    DVA_GET_ASIZE(&bp->blk_dva[d]);
   1141   7046   ahrens 		mutex_exit(&vd->vdev_stat_lock);
   1142   7046   ahrens 
   1143   7046   ahrens 		/* if it's a resilver, this may not be in the target range */
   1144   7046   ahrens 		if (!needs_io) {
   1145   7046   ahrens 			if (DVA_GET_GANG(&bp->blk_dva[d])) {
   1146   7046   ahrens 				/*
   1147   7046   ahrens 				 * Gang members may be spread across multiple
   1148   8241     Jeff 				 * vdevs, so the best estimate we have is the
   1149   8241     Jeff 				 * scrub range, which has already been checked.
   1150   7046   ahrens 				 * XXX -- it would be better to change our
   1151   8241     Jeff 				 * allocation policy to ensure that all
   1152   8241     Jeff 				 * gang members reside on the same vdev.
   1153   7046   ahrens 				 */
   1154   8241     Jeff 				needs_io = B_TRUE;
   1155   8241     Jeff 			} else {
   1156   8241     Jeff 				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
   1157  10922     Jeff 				    phys_birth, 1);
   1158   7046   ahrens 			}
   1159   7046   ahrens 		}
   1160   7046   ahrens 	}
   1161   7046   ahrens 
   1162   7046   ahrens 	if (needs_io && !zfs_no_scrub_io) {
   1163   7046   ahrens 		void *data = zio_data_buf_alloc(size);
   1164   7046   ahrens 
   1165   7046   ahrens 		mutex_enter(&spa->spa_scrub_lock);
   1166   7046   ahrens 		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
   1167   7046   ahrens 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
   1168   7046   ahrens 		spa->spa_scrub_inflight++;
   1169   7046   ahrens 		mutex_exit(&spa->spa_scrub_lock);
   1170   7046   ahrens 
   1171   7046   ahrens 		zio_nowait(zio_read(NULL, spa, bp, data, size,
   1172   7046   ahrens 		    dsl_pool_scrub_clean_done, NULL, zio_priority,
   1173   7046   ahrens 		    zio_flags, zb));
   1174   7046   ahrens 	}
   1175   7046   ahrens 
   1176   7046   ahrens 	/* do not relocate this block */
   1177   7046   ahrens 	return (0);
   1178   7046   ahrens }
   1179   7046   ahrens 
   1180   7046   ahrens int
   1181   7046   ahrens dsl_pool_scrub_clean(dsl_pool_t *dp)
   1182   7046   ahrens {
   1183   9997   George 	spa_t *spa = dp->dp_spa;
   1184   9997   George 
   1185   7046   ahrens 	/*
   1186  10850   George 	 * Purge all vdev caches and probe all devices.  We do this here
   1187  10850   George 	 * rather than in sync context because this requires a writer lock
   1188  10850   George 	 * on the spa_config lock, which we can't do from sync context.  The
   1189   7046   ahrens 	 * spa_scrub_reopen flag indicates that vdev_open() should not
   1190   7046   ahrens 	 * attempt to start another scrub.
   1191   7046   ahrens 	 */
   1192  10685   George 	spa_vdev_state_enter(spa, SCL_NONE);
   1193   9997   George 	spa->spa_scrub_reopen = B_TRUE;
   1194   9997   George 	vdev_reopen(spa->spa_root_vdev);
   1195   9997   George 	spa->spa_scrub_reopen = B_FALSE;
   1196   9997   George 	(void) spa_vdev_state_exit(spa, NULL, 0);
   1197   7046   ahrens 
   1198   7046   ahrens 	return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
   1199   7046   ahrens }
   1200