1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/dsl_pool.h> 27 #include <sys/dsl_dataset.h> 28 #include <sys/dsl_prop.h> 29 #include <sys/dsl_dir.h> 30 #include <sys/dsl_synctask.h> 31 #include <sys/dnode.h> 32 #include <sys/dmu_tx.h> 33 #include <sys/dmu_objset.h> 34 #include <sys/arc.h> 35 #include <sys/zap.h> 36 #include <sys/zio.h> 37 #include <sys/zfs_context.h> 38 #include <sys/fs/zfs.h> 39 #include <sys/zfs_znode.h> 40 #include <sys/spa_impl.h> 41 #include <sys/vdev_impl.h> 42 #include <sys/zil_impl.h> 43 #include <sys/zio_checksum.h> 44 #include <sys/ddt.h> 45 46 typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); 47 48 static scrub_cb_t dsl_pool_scrub_clean_cb; 49 static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; 50 static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, 51 uint64_t objset, uint64_t object); 52 53 int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ 54 int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ 55 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ 56 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; 57 58 extern int zfs_txg_timeout; 59 60 static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { 61 NULL, 62 dsl_pool_scrub_clean_cb 63 }; 64 65 /* ARGSUSED */ 66 static void 67 dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 68 { 69 dsl_pool_t *dp = arg1; 70 enum scrub_func *funcp = arg2; 71 dmu_object_type_t ot = 0; 72 boolean_t complete = B_FALSE; 73 74 dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx); 75 76 ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE); 77 ASSERT(*funcp > SCRUB_FUNC_NONE); 78 ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS); 79 80 dp->dp_scrub_min_txg = 0; 81 dp->dp_scrub_max_txg = tx->tx_txg; 82 dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max; 83 84 if (*funcp == SCRUB_FUNC_CLEAN) { 85 vdev_t *rvd = dp->dp_spa->spa_root_vdev; 86 87 /* rewrite all disk labels */ 88 vdev_config_dirty(rvd); 89 90 if (vdev_resilver_needed(rvd, 91 &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { 92 spa_event_notify(dp->dp_spa, NULL, 93 ESC_ZFS_RESILVER_START); 94 dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, 95 tx->tx_txg); 96 } else { 97 spa_event_notify(dp->dp_spa, NULL, 98 ESC_ZFS_SCRUB_START); 99 } 100 101 /* zero out the scrub stats in all vdev_stat_t's */ 102 vdev_scrub_stat_update(rvd, 103 dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : 104 POOL_SCRUB_EVERYTHING, B_FALSE); 105 106 /* 107 * If this is an incremental scrub, limit the DDT scrub phase 108 * to just the auto-ditto class (for correctness); the rest 109 * of the scrub should go faster using top-down pruning. 110 */ 111 if (dp->dp_scrub_min_txg > TXG_INITIAL) 112 dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO; 113 114 dp->dp_spa->spa_scrub_started = B_TRUE; 115 } 116 117 /* back to the generic stuff */ 118 119 if (dp->dp_blkstats == NULL) { 120 dp->dp_blkstats = 121 kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); 122 } 123 bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); 124 125 if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) 126 ot = DMU_OT_ZAP_OTHER; 127 128 dp->dp_scrub_func = *funcp; 129 dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, 130 ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); 131 bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); 132 bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); 133 dp->dp_scrub_restart = B_FALSE; 134 dp->dp_spa->spa_scrub_errors = 0; 135 136 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 137 DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, 138 &dp->dp_scrub_func, tx)); 139 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 140 DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, 141 &dp->dp_scrub_queue_obj, tx)); 142 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 143 DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, 144 &dp->dp_scrub_min_txg, tx)); 145 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 146 DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, 147 &dp->dp_scrub_max_txg, tx)); 148 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 149 DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 150 sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), 151 &dp->dp_scrub_bookmark, tx)); 152 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 153 DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), 154 sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), 155 &dp->dp_scrub_ddt_bookmark, tx)); 156 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 157 DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, 158 &dp->dp_scrub_ddt_class_max, tx)); 159 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 160 DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, 161 &dp->dp_spa->spa_scrub_errors, tx)); 162 163 spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr, 164 "func=%u mintxg=%llu maxtxg=%llu", 165 *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg); 166 } 167 168 int 169 dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) 170 { 171 return (dsl_sync_task_do(dp, NULL, 172 dsl_pool_scrub_setup_sync, dp, &func, 0)); 173 } 174 175 /* ARGSUSED */ 176 static void 177 dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 178 { 179 dsl_pool_t *dp = arg1; 180 boolean_t *completep = arg2; 181 182 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 183 return; 184 185 mutex_enter(&dp->dp_scrub_cancel_lock); 186 187 if (dp->dp_scrub_restart) { 188 dp->dp_scrub_restart = B_FALSE; 189 *completep = B_FALSE; 190 } 191 192 /* XXX this is scrub-clean specific */ 193 mutex_enter(&dp->dp_spa->spa_scrub_lock); 194 while (dp->dp_spa->spa_scrub_inflight > 0) { 195 cv_wait(&dp->dp_spa->spa_scrub_io_cv, 196 &dp->dp_spa->spa_scrub_lock); 197 } 198 mutex_exit(&dp->dp_spa->spa_scrub_lock); 199 dp->dp_spa->spa_scrub_started = B_FALSE; 200 dp->dp_spa->spa_scrub_active = B_FALSE; 201 202 dp->dp_scrub_func = SCRUB_FUNC_NONE; 203 VERIFY(0 == dmu_object_free(dp->dp_meta_objset, 204 dp->dp_scrub_queue_obj, tx)); 205 dp->dp_scrub_queue_obj = 0; 206 bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); 207 bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); 208 209 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 210 DMU_POOL_SCRUB_QUEUE, tx)); 211 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 212 DMU_POOL_SCRUB_MIN_TXG, tx)); 213 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 214 DMU_POOL_SCRUB_MAX_TXG, tx)); 215 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 216 DMU_POOL_SCRUB_BOOKMARK, tx)); 217 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 218 DMU_POOL_SCRUB_FUNC, tx)); 219 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 220 DMU_POOL_SCRUB_ERRORS, tx)); 221 222 (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 223 DMU_POOL_SCRUB_DDT_BOOKMARK, tx); 224 (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 225 DMU_POOL_SCRUB_DDT_CLASS_MAX, tx); 226 227 spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, 228 "complete=%u", *completep); 229 230 /* below is scrub-clean specific */ 231 vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE, 232 *completep); 233 /* 234 * If the scrub/resilver completed, update all DTLs to reflect this. 235 * Whether it succeeded or not, vacate all temporary scrub DTLs. 236 */ 237 vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, 238 *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); 239 if (*completep) 240 spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? 241 ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); 242 spa_errlog_rotate(dp->dp_spa); 243 244 /* 245 * We may have finished replacing a device. 246 * Let the async thread assess this and handle the detach. 247 */ 248 spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE); 249 250 dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0; 251 mutex_exit(&dp->dp_scrub_cancel_lock); 252 } 253 254 int 255 dsl_pool_scrub_cancel(dsl_pool_t *dp) 256 { 257 boolean_t complete = B_FALSE; 258 259 return (dsl_sync_task_do(dp, NULL, 260 dsl_pool_scrub_cancel_sync, dp, &complete, 3)); 261 } 262 263 void 264 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) 265 { 266 /* 267 * This function will be used by bp-rewrite wad to intercept frees. 268 */ 269 zio_free(dp->dp_spa, txg, bpp); 270 } 271 272 static boolean_t 273 bookmark_is_zero(const zbookmark_t *zb) 274 { 275 return (zb->zb_objset == 0 && zb->zb_object == 0 && 276 zb->zb_level == 0 && zb->zb_blkid == 0); 277 } 278 279 /* dnp is the dnode for zb1->zb_object */ 280 static boolean_t 281 bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, 282 const zbookmark_t *zb2) 283 { 284 uint64_t zb1nextL0, zb2thisobj; 285 286 ASSERT(zb1->zb_objset == zb2->zb_objset); 287 ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT); 288 ASSERT(zb2->zb_level == 0); 289 290 /* 291 * A bookmark in the deadlist is considered to be after 292 * everything else. 293 */ 294 if (zb2->zb_object == DMU_DEADLIST_OBJECT) 295 return (B_TRUE); 296 297 /* The objset_phys_t isn't before anything. */ 298 if (dnp == NULL) 299 return (B_FALSE); 300 301 zb1nextL0 = (zb1->zb_blkid + 1) << 302 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 303 304 zb2thisobj = zb2->zb_object ? zb2->zb_object : 305 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 306 307 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 308 uint64_t nextobj = zb1nextL0 * 309 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 310 return (nextobj <= zb2thisobj); 311 } 312 313 if (zb1->zb_object < zb2thisobj) 314 return (B_TRUE); 315 if (zb1->zb_object > zb2thisobj) 316 return (B_FALSE); 317 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 318 return (B_FALSE); 319 return (zb1nextL0 <= zb2->zb_blkid); 320 } 321 322 static boolean_t 323 scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb) 324 { 325 int elapsed_ticks; 326 int mintime; 327 328 if (dp->dp_scrub_pausing) 329 return (B_TRUE); /* we're already pausing */ 330 331 if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) 332 return (B_FALSE); /* we're resuming */ 333 334 /* We only know how to resume from level-0 blocks. */ 335 if (zb != NULL && zb->zb_level != 0) 336 return (B_FALSE); 337 338 mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time : 339 zfs_scrub_min_time; 340 elapsed_ticks = ddi_get_lbolt64() - dp->dp_scrub_start_time; 341 if (elapsed_ticks > hz * zfs_txg_timeout || 342 (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) { 343 if (zb) { 344 dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", 345 (longlong_t)zb->zb_objset, 346 (longlong_t)zb->zb_object, 347 (longlong_t)zb->zb_level, 348 (longlong_t)zb->zb_blkid); 349 dp->dp_scrub_bookmark = *zb; 350 } 351 if (ddb) { 352 dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", 353 (longlong_t)ddb->ddb_class, 354 (longlong_t)ddb->ddb_type, 355 (longlong_t)ddb->ddb_checksum, 356 (longlong_t)ddb->ddb_cursor); 357 ASSERT(&dp->dp_scrub_ddt_bookmark == ddb); 358 } 359 dp->dp_scrub_pausing = B_TRUE; 360 return (B_TRUE); 361 } 362 return (B_FALSE); 363 } 364 365 typedef struct zil_traverse_arg { 366 dsl_pool_t *zta_dp; 367 zil_header_t *zta_zh; 368 } zil_traverse_arg_t; 369 370 /* ARGSUSED */ 371 static int 372 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 373 { 374 zil_traverse_arg_t *zta = arg; 375 dsl_pool_t *dp = zta->zta_dp; 376 zil_header_t *zh = zta->zta_zh; 377 zbookmark_t zb; 378 379 if (bp->blk_birth <= dp->dp_scrub_min_txg) 380 return (0); 381 382 /* 383 * One block ("stubby") can be allocated a long time ago; we 384 * want to visit that one because it has been allocated 385 * (on-disk) even if it hasn't been claimed (even though for 386 * plain scrub there's nothing to do to it). 387 */ 388 if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) 389 return (0); 390 391 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 392 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 393 394 VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); 395 return (0); 396 } 397 398 /* ARGSUSED */ 399 static int 400 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) 401 { 402 if (lrc->lrc_txtype == TX_WRITE) { 403 zil_traverse_arg_t *zta = arg; 404 dsl_pool_t *dp = zta->zta_dp; 405 zil_header_t *zh = zta->zta_zh; 406 lr_write_t *lr = (lr_write_t *)lrc; 407 blkptr_t *bp = &lr->lr_blkptr; 408 zbookmark_t zb; 409 410 if (bp->blk_birth <= dp->dp_scrub_min_txg) 411 return (0); 412 413 /* 414 * birth can be < claim_txg if this record's txg is 415 * already txg sync'ed (but this log block contains 416 * other records that are not synced) 417 */ 418 if (claim_txg == 0 || bp->blk_birth < claim_txg) 419 return (0); 420 421 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 422 lr->lr_foid, ZB_ZIL_LEVEL, 423 lr->lr_offset / BP_GET_LSIZE(bp)); 424 425 VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); 426 } 427 return (0); 428 } 429 430 static void 431 traverse_zil(dsl_pool_t *dp, zil_header_t *zh) 432 { 433 uint64_t claim_txg = zh->zh_claim_txg; 434 zil_traverse_arg_t zta = { dp, zh }; 435 zilog_t *zilog; 436 437 /* 438 * We only want to visit blocks that have been claimed but not yet 439 * replayed (or, in read-only mode, blocks that *would* be claimed). 440 */ 441 if (claim_txg == 0 && spa_writeable(dp->dp_spa)) 442 return; 443 444 zilog = zil_alloc(dp->dp_meta_objset, zh); 445 446 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta, 447 claim_txg); 448 449 zil_free(zilog); 450 } 451 452 static void 453 scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, 454 arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) 455 { 456 int err; 457 arc_buf_t *buf = NULL; 458 459 if (bp->blk_birth <= dp->dp_scrub_min_txg) 460 return; 461 462 if (scrub_pause(dp, zb, NULL)) 463 return; 464 465 if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { 466 /* 467 * If we already visited this bp & everything below (in 468 * a prior txg), don't bother doing it again. 469 */ 470 if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark)) 471 return; 472 473 /* 474 * If we found the block we're trying to resume from, or 475 * we went past it to a different object, zero it out to 476 * indicate that it's OK to start checking for pausing 477 * again. 478 */ 479 if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || 480 zb->zb_object > dp->dp_scrub_bookmark.zb_object) { 481 dprintf("resuming at %llx/%llx/%llx/%llx\n", 482 (longlong_t)zb->zb_objset, 483 (longlong_t)zb->zb_object, 484 (longlong_t)zb->zb_level, 485 (longlong_t)zb->zb_blkid); 486 bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); 487 } 488 } 489 490 if (BP_GET_LEVEL(bp) > 0) { 491 uint32_t flags = ARC_WAIT; 492 int i; 493 blkptr_t *cbp; 494 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 495 496 err = arc_read(NULL, dp->dp_spa, bp, pbuf, 497 arc_getbuf_func, &buf, 498 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 499 if (err) { 500 mutex_enter(&dp->dp_spa->spa_scrub_lock); 501 dp->dp_spa->spa_scrub_errors++; 502 mutex_exit(&dp->dp_spa->spa_scrub_lock); 503 return; 504 } 505 cbp = buf->b_data; 506 507 for (i = 0; i < epb; i++, cbp++) { 508 zbookmark_t czb; 509 510 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 511 zb->zb_level - 1, 512 zb->zb_blkid * epb + i); 513 scrub_visitbp(dp, dnp, buf, cbp, &czb); 514 } 515 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 516 uint32_t flags = ARC_WAIT; 517 dnode_phys_t *child_dnp; 518 int i; 519 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 520 521 err = arc_read(NULL, dp->dp_spa, bp, pbuf, 522 arc_getbuf_func, &buf, 523 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 524 if (err) { 525 mutex_enter(&dp->dp_spa->spa_scrub_lock); 526 dp->dp_spa->spa_scrub_errors++; 527 mutex_exit(&dp->dp_spa->spa_scrub_lock); 528 return; 529 } 530 child_dnp = buf->b_data; 531 532 for (i = 0; i < epb; i++, child_dnp++) { 533 scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset, 534 zb->zb_blkid * epb + i); 535 } 536 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 537 uint32_t flags = ARC_WAIT; 538 objset_phys_t *osp; 539 540 err = arc_read_nolock(NULL, dp->dp_spa, bp, 541 arc_getbuf_func, &buf, 542 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 543 if (err) { 544 mutex_enter(&dp->dp_spa->spa_scrub_lock); 545 dp->dp_spa->spa_scrub_errors++; 546 mutex_exit(&dp->dp_spa->spa_scrub_lock); 547 return; 548 } 549 550 osp = buf->b_data; 551 552 traverse_zil(dp, &osp->os_zil_header); 553 554 scrub_visitdnode(dp, &osp->os_meta_dnode, 555 buf, zb->zb_objset, DMU_META_DNODE_OBJECT); 556 if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { 557 scrub_visitdnode(dp, &osp->os_userused_dnode, 558 buf, zb->zb_objset, DMU_USERUSED_OBJECT); 559 scrub_visitdnode(dp, &osp->os_groupused_dnode, 560 buf, zb->zb_objset, DMU_GROUPUSED_OBJECT); 561 } 562 } 563 564 /* 565 * If dsl_pool_scrub_ddt() has aready scrubbed this block, 566 * don't scrub it again. 567 */ 568 if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp)) 569 (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); 570 571 if (buf) 572 (void) arc_buf_remove_ref(buf, &buf); 573 } 574 575 static void 576 scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, 577 uint64_t objset, uint64_t object) 578 { 579 int j; 580 581 for (j = 0; j < dnp->dn_nblkptr; j++) { 582 zbookmark_t czb; 583 584 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); 585 scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); 586 } 587 } 588 589 static void 590 scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) 591 { 592 zbookmark_t zb; 593 594 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 595 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 596 scrub_visitbp(dp, NULL, NULL, bp, &zb); 597 } 598 599 void 600 dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) 601 { 602 dsl_pool_t *dp = ds->ds_dir->dd_pool; 603 604 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 605 return; 606 607 if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { 608 SET_BOOKMARK(&dp->dp_scrub_bookmark, 609 ZB_DESTROYED_OBJSET, 0, 0, 0); 610 } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 611 ds->ds_object, tx) != 0) { 612 return; 613 } 614 615 if (ds->ds_phys->ds_next_snap_obj != 0) { 616 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 617 ds->ds_phys->ds_next_snap_obj, tx) == 0); 618 } 619 ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); 620 } 621 622 void 623 dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) 624 { 625 dsl_pool_t *dp = ds->ds_dir->dd_pool; 626 627 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 628 return; 629 630 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); 631 632 if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { 633 dp->dp_scrub_bookmark.zb_objset = 634 ds->ds_phys->ds_prev_snap_obj; 635 } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 636 ds->ds_object, tx) == 0) { 637 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 638 ds->ds_phys->ds_prev_snap_obj, tx) == 0); 639 } 640 } 641 642 void 643 dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) 644 { 645 dsl_pool_t *dp = ds1->ds_dir->dd_pool; 646 647 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 648 return; 649 650 if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) { 651 dp->dp_scrub_bookmark.zb_objset = ds2->ds_object; 652 } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) { 653 dp->dp_scrub_bookmark.zb_objset = ds1->ds_object; 654 } 655 656 if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 657 ds1->ds_object, tx) == 0) { 658 int err = zap_add_int(dp->dp_meta_objset, 659 dp->dp_scrub_queue_obj, ds2->ds_object, tx); 660 VERIFY(err == 0 || err == EEXIST); 661 if (err == EEXIST) { 662 /* Both were there to begin with */ 663 VERIFY(0 == zap_add_int(dp->dp_meta_objset, 664 dp->dp_scrub_queue_obj, ds1->ds_object, tx)); 665 } 666 } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 667 ds2->ds_object, tx) == 0) { 668 VERIFY(0 == zap_add_int(dp->dp_meta_objset, 669 dp->dp_scrub_queue_obj, ds1->ds_object, tx)); 670 } 671 } 672 673 struct enqueue_clones_arg { 674 dmu_tx_t *tx; 675 uint64_t originobj; 676 }; 677 678 /* ARGSUSED */ 679 static int 680 enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) 681 { 682 struct enqueue_clones_arg *eca = arg; 683 dsl_dataset_t *ds; 684 int err; 685 dsl_pool_t *dp; 686 687 err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); 688 if (err) 689 return (err); 690 dp = ds->ds_dir->dd_pool; 691 692 if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { 693 while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { 694 dsl_dataset_t *prev; 695 err = dsl_dataset_hold_obj(dp, 696 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); 697 698 dsl_dataset_rele(ds, FTAG); 699 if (err) 700 return (err); 701 ds = prev; 702 } 703 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 704 ds->ds_object, eca->tx) == 0); 705 } 706 dsl_dataset_rele(ds, FTAG); 707 return (0); 708 } 709 710 static void 711 scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) 712 { 713 dsl_dataset_t *ds; 714 uint64_t min_txg_save; 715 716 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 717 718 /* 719 * Iterate over the bps in this ds. 720 */ 721 min_txg_save = dp->dp_scrub_min_txg; 722 dp->dp_scrub_min_txg = 723 MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); 724 scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); 725 dp->dp_scrub_min_txg = min_txg_save; 726 727 if (dp->dp_scrub_pausing) 728 goto out; 729 730 /* 731 * Add descendent datasets to work queue. 732 */ 733 if (ds->ds_phys->ds_next_snap_obj != 0) { 734 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 735 ds->ds_phys->ds_next_snap_obj, tx) == 0); 736 } 737 if (ds->ds_phys->ds_num_children > 1) { 738 boolean_t usenext = B_FALSE; 739 if (ds->ds_phys->ds_next_clones_obj != 0) { 740 uint64_t count; 741 /* 742 * A bug in a previous version of the code could 743 * cause upgrade_clones_cb() to not set 744 * ds_next_snap_obj when it should, leading to a 745 * missing entry. Therefore we can only use the 746 * next_clones_obj when its count is correct. 747 */ 748 int err = zap_count(dp->dp_meta_objset, 749 ds->ds_phys->ds_next_clones_obj, &count); 750 if (err == 0 && 751 count == ds->ds_phys->ds_num_children - 1) 752 usenext = B_TRUE; 753 } 754 755 if (usenext) { 756 VERIFY(zap_join(dp->dp_meta_objset, 757 ds->ds_phys->ds_next_clones_obj, 758 dp->dp_scrub_queue_obj, tx) == 0); 759 } else { 760 struct enqueue_clones_arg eca; 761 eca.tx = tx; 762 eca.originobj = ds->ds_object; 763 764 (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, 765 NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); 766 } 767 } 768 769 out: 770 dsl_dataset_rele(ds, FTAG); 771 } 772 773 /* ARGSUSED */ 774 static int 775 enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) 776 { 777 dmu_tx_t *tx = arg; 778 dsl_dataset_t *ds; 779 int err; 780 dsl_pool_t *dp; 781 782 err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); 783 if (err) 784 return (err); 785 786 dp = ds->ds_dir->dd_pool; 787 788 while (ds->ds_phys->ds_prev_snap_obj != 0) { 789 dsl_dataset_t *prev; 790 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 791 FTAG, &prev); 792 if (err) { 793 dsl_dataset_rele(ds, FTAG); 794 return (err); 795 } 796 797 /* 798 * If this is a clone, we don't need to worry about it for now. 799 */ 800 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { 801 dsl_dataset_rele(ds, FTAG); 802 dsl_dataset_rele(prev, FTAG); 803 return (0); 804 } 805 dsl_dataset_rele(ds, FTAG); 806 ds = prev; 807 } 808 809 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 810 ds->ds_object, tx) == 0); 811 dsl_dataset_rele(ds, FTAG); 812 return (0); 813 } 814 815 /* 816 * Scrub/dedup interaction. 817 * 818 * If there are N references to a deduped block, we don't want to scrub it 819 * N times -- ideally, we should scrub it exactly once. 820 * 821 * To prevent excess scrubbing, the scrub begins by walking the DDT 822 * to find all blocks with refcnt > 1, and scrubs each of these once. 823 * Then the top-down scrub begins, only visiting blocks with refcnt == 1. 824 * 825 * There would be nothing more to say if a block's refcnt couldn't change 826 * during a scrub, but of course it can. There are two cases to consider. 827 * 828 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 829 * when visited during the top-down scrub phase, it will be scrubbed twice. 830 * This negates our scrub optimization, but is otherwise harmless. 831 * 832 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 833 * on each visit during the top-down scrub phase, it will never be scrubbed. 834 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's 835 * reference count changes; if it transitions from refcnt == 1 to refcnt > 1 836 * while a scrub is in progress, it scrubs the block right then. 837 * 838 * The code does not actually use the refcnt directly, but rather uses the 839 * dde's replication class (enum ddt_class), which serves the same purpose. 840 */ 841 static void 842 dsl_pool_scrub_ddt(dsl_pool_t *dp) 843 { 844 ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark; 845 ddt_entry_t dde; 846 int error; 847 848 while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) { 849 if (ddb->ddb_class > dp->dp_scrub_ddt_class_max) 850 return; 851 dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde); 852 if (scrub_pause(dp, NULL, ddb)) 853 return; 854 } 855 ASSERT(error == ENOENT); 856 ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max); 857 } 858 859 void 860 dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, 861 const ddt_entry_t *dde) 862 { 863 const ddt_key_t *ddk = &dde->dde_key; 864 const ddt_phys_t *ddp = dde->dde_phys; 865 blkptr_t blk; 866 zbookmark_t zb = { 0 }; 867 868 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 869 if (ddp->ddp_phys_birth == 0) 870 continue; 871 ddt_bp_create(checksum, ddk, ddp, &blk); 872 scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); 873 } 874 } 875 876 void 877 dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) 878 { 879 spa_t *spa = dp->dp_spa; 880 zap_cursor_t zc; 881 zap_attribute_t za; 882 boolean_t complete = B_TRUE; 883 884 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 885 return; 886 887 /* 888 * If the pool is not loaded, or is trying to unload, leave it alone. 889 */ 890 if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa)) 891 return; 892 893 if (dp->dp_scrub_restart) { 894 enum scrub_func func = dp->dp_scrub_func; 895 dp->dp_scrub_restart = B_FALSE; 896 dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); 897 } 898 899 if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { 900 /* 901 * We must have resumed after rebooting; reset the vdev 902 * stats to know that we're doing a scrub (although it 903 * will think we're just starting now). 904 */ 905 vdev_scrub_stat_update(spa->spa_root_vdev, 906 dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : 907 POOL_SCRUB_EVERYTHING, B_FALSE); 908 } 909 910 dp->dp_scrub_pausing = B_FALSE; 911 dp->dp_scrub_start_time = ddi_get_lbolt64(); 912 dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); 913 spa->spa_scrub_active = B_TRUE; 914 915 if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) { 916 dsl_pool_scrub_ddt(dp); 917 if (dp->dp_scrub_pausing) 918 goto out; 919 } 920 921 if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) { 922 /* First do the MOS & ORIGIN */ 923 scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); 924 if (dp->dp_scrub_pausing) 925 goto out; 926 927 if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { 928 VERIFY(0 == dmu_objset_find_spa(spa, 929 NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); 930 } else { 931 scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); 932 } 933 ASSERT(!dp->dp_scrub_pausing); 934 } else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { 935 /* 936 * If we were paused, continue from here. Note if the ds 937 * we were paused on was destroyed, the zb_objset will be 938 * ZB_DESTROYED_OBJSET, so we will skip this and find a new 939 * objset below. 940 */ 941 scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); 942 if (dp->dp_scrub_pausing) 943 goto out; 944 } 945 946 /* 947 * In case we were paused right at the end of the ds, zero the 948 * bookmark so we don't think that we're still trying to resume. 949 */ 950 bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); 951 952 /* keep pulling things out of the zap-object-as-queue */ 953 while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), 954 zap_cursor_retrieve(&zc, &za) == 0) { 955 VERIFY(0 == zap_remove(dp->dp_meta_objset, 956 dp->dp_scrub_queue_obj, za.za_name, tx)); 957 scrub_visitds(dp, za.za_first_integer, tx); 958 if (dp->dp_scrub_pausing) 959 break; 960 zap_cursor_fini(&zc); 961 } 962 zap_cursor_fini(&zc); 963 if (dp->dp_scrub_pausing) 964 goto out; 965 966 /* done. */ 967 968 dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); 969 return; 970 out: 971 VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 972 DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 973 sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), 974 &dp->dp_scrub_bookmark, tx)); 975 VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 976 DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), 977 sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), 978 &dp->dp_scrub_ddt_bookmark, tx)); 979 VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 980 DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, 981 &dp->dp_scrub_ddt_class_max, tx)); 982 VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 983 DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, 984 &spa->spa_scrub_errors, tx)); 985 986 /* XXX this is scrub-clean specific */ 987 mutex_enter(&spa->spa_scrub_lock); 988 while (spa->spa_scrub_inflight > 0) 989 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 990 mutex_exit(&spa->spa_scrub_lock); 991 } 992 993 void 994 dsl_pool_scrub_restart(dsl_pool_t *dp) 995 { 996 mutex_enter(&dp->dp_scrub_cancel_lock); 997 dp->dp_scrub_restart = B_TRUE; 998 mutex_exit(&dp->dp_scrub_cancel_lock); 999 } 1000 1001 /* 1002 * scrub consumers 1003 */ 1004 1005 static void 1006 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) 1007 { 1008 int i; 1009 1010 /* 1011 * If we resume after a reboot, zab will be NULL; don't record 1012 * incomplete stats in that case. 1013 */ 1014 if (zab == NULL) 1015 return; 1016 1017 for (i = 0; i < 4; i++) { 1018 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; 1019 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; 1020 zfs_blkstat_t *zb = &zab->zab_type[l][t]; 1021 int equal; 1022 1023 zb->zb_count++; 1024 zb->zb_asize += BP_GET_ASIZE(bp); 1025 zb->zb_lsize += BP_GET_LSIZE(bp); 1026 zb->zb_psize += BP_GET_PSIZE(bp); 1027 zb->zb_gangs += BP_COUNT_GANG(bp); 1028 1029 switch (BP_GET_NDVAS(bp)) { 1030 case 2: 1031 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 1032 DVA_GET_VDEV(&bp->blk_dva[1])) 1033 zb->zb_ditto_2_of_2_samevdev++; 1034 break; 1035 case 3: 1036 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == 1037 DVA_GET_VDEV(&bp->blk_dva[1])) + 1038 (DVA_GET_VDEV(&bp->blk_dva[0]) == 1039 DVA_GET_VDEV(&bp->blk_dva[2])) + 1040 (DVA_GET_VDEV(&bp->blk_dva[1]) == 1041 DVA_GET_VDEV(&bp->blk_dva[2])); 1042 if (equal == 1) 1043 zb->zb_ditto_2_of_3_samevdev++; 1044 else if (equal == 3) 1045 zb->zb_ditto_3_of_3_samevdev++; 1046 break; 1047 } 1048 } 1049 } 1050 1051 static void 1052 dsl_pool_scrub_clean_done(zio_t *zio) 1053 { 1054 spa_t *spa = zio->io_spa; 1055 1056 zio_data_buf_free(zio->io_data, zio->io_size); 1057 1058 mutex_enter(&spa->spa_scrub_lock); 1059 spa->spa_scrub_inflight--; 1060 cv_broadcast(&spa->spa_scrub_io_cv); 1061 1062 if (zio->io_error && (zio->io_error != ECKSUM || 1063 !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) 1064 spa->spa_scrub_errors++; 1065 mutex_exit(&spa->spa_scrub_lock); 1066 } 1067 1068 static int 1069 dsl_pool_scrub_clean_cb(dsl_pool_t *dp, 1070 const blkptr_t *bp, const zbookmark_t *zb) 1071 { 1072 size_t size = BP_GET_PSIZE(bp); 1073 spa_t *spa = dp->dp_spa; 1074 uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); 1075 boolean_t needs_io; 1076 int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; 1077 int zio_priority; 1078 1079 if (phys_birth <= dp->dp_scrub_min_txg || 1080 phys_birth >= dp->dp_scrub_max_txg) 1081 return (0); 1082 1083 count_block(dp->dp_blkstats, bp); 1084 1085 if (dp->dp_scrub_isresilver == 0) { 1086 /* It's a scrub */ 1087 zio_flags |= ZIO_FLAG_SCRUB; 1088 zio_priority = ZIO_PRIORITY_SCRUB; 1089 needs_io = B_TRUE; 1090 } else { 1091 /* It's a resilver */ 1092 zio_flags |= ZIO_FLAG_RESILVER; 1093 zio_priority = ZIO_PRIORITY_RESILVER; 1094 needs_io = B_FALSE; 1095 } 1096 1097 /* If it's an intent log block, failure is expected. */ 1098 if (zb->zb_level == ZB_ZIL_LEVEL) 1099 zio_flags |= ZIO_FLAG_SPECULATIVE; 1100 1101 for (int d = 0; d < BP_GET_NDVAS(bp); d++) { 1102 vdev_t *vd = vdev_lookup_top(spa, 1103 DVA_GET_VDEV(&bp->blk_dva[d])); 1104 1105 /* 1106 * Keep track of how much data we've examined so that 1107 * zpool(1M) status can make useful progress reports. 1108 */ 1109 mutex_enter(&vd->vdev_stat_lock); 1110 vd->vdev_stat.vs_scrub_examined += 1111 DVA_GET_ASIZE(&bp->blk_dva[d]); 1112 mutex_exit(&vd->vdev_stat_lock); 1113 1114 /* if it's a resilver, this may not be in the target range */ 1115 if (!needs_io) { 1116 if (DVA_GET_GANG(&bp->blk_dva[d])) { 1117 /* 1118 * Gang members may be spread across multiple 1119 * vdevs, so the best estimate we have is the 1120 * scrub range, which has already been checked. 1121 * XXX -- it would be better to change our 1122 * allocation policy to ensure that all 1123 * gang members reside on the same vdev. 1124 */ 1125 needs_io = B_TRUE; 1126 } else { 1127 needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, 1128 phys_birth, 1); 1129 } 1130 } 1131 } 1132 1133 if (needs_io && !zfs_no_scrub_io) { 1134 void *data = zio_data_buf_alloc(size); 1135 1136 mutex_enter(&spa->spa_scrub_lock); 1137 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) 1138 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1139 spa->spa_scrub_inflight++; 1140 mutex_exit(&spa->spa_scrub_lock); 1141 1142 zio_nowait(zio_read(NULL, spa, bp, data, size, 1143 dsl_pool_scrub_clean_done, NULL, zio_priority, 1144 zio_flags, zb)); 1145 } 1146 1147 /* do not relocate this block */ 1148 return (0); 1149 } 1150 1151 int 1152 dsl_pool_scrub_clean(dsl_pool_t *dp) 1153 { 1154 spa_t *spa = dp->dp_spa; 1155 1156 /* 1157 * Purge all vdev caches and probe all devices. We do this here 1158 * rather than in sync context because this requires a writer lock 1159 * on the spa_config lock, which we can't do from sync context. The 1160 * spa_scrub_reopen flag indicates that vdev_open() should not 1161 * attempt to start another scrub. 1162 */ 1163 spa_vdev_state_enter(spa, SCL_NONE); 1164 spa->spa_scrub_reopen = B_TRUE; 1165 vdev_reopen(spa->spa_root_vdev); 1166 spa->spa_scrub_reopen = B_FALSE; 1167 (void) spa_vdev_state_exit(spa, NULL, 0); 1168 1169 return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); 1170 } 1171