1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/dsl_pool.h> 27 #include <sys/dsl_dataset.h> 28 #include <sys/dsl_prop.h> 29 #include <sys/dsl_dir.h> 30 #include <sys/dsl_synctask.h> 31 #include <sys/dnode.h> 32 #include <sys/dmu_tx.h> 33 #include <sys/dmu_objset.h> 34 #include <sys/arc.h> 35 #include <sys/zap.h> 36 #include <sys/zio.h> 37 #include <sys/zfs_context.h> 38 #include <sys/fs/zfs.h> 39 #include <sys/zfs_znode.h> 40 #include <sys/spa_impl.h> 41 #include <sys/vdev_impl.h> 42 #include <sys/zil_impl.h> 43 #include <sys/zio_checksum.h> 44 #include <sys/ddt.h> 45 46 typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); 47 48 static scrub_cb_t dsl_pool_scrub_clean_cb; 49 static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; 50 static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, 51 uint64_t objset, uint64_t object); 52 53 int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ 54 int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ 55 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ 56 boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ 57 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; 58 59 extern int zfs_txg_timeout; 60 61 static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { 62 NULL, 63 dsl_pool_scrub_clean_cb 64 }; 65 66 /* ARGSUSED */ 67 static void 68 dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 69 { 70 dsl_pool_t *dp = arg1; 71 enum scrub_func *funcp = arg2; 72 dmu_object_type_t ot = 0; 73 boolean_t complete = B_FALSE; 74 75 dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx); 76 77 ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE); 78 ASSERT(*funcp > SCRUB_FUNC_NONE); 79 ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS); 80 81 dp->dp_scrub_min_txg = 0; 82 dp->dp_scrub_max_txg = tx->tx_txg; 83 dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max; 84 85 if (*funcp == SCRUB_FUNC_CLEAN) { 86 vdev_t *rvd = dp->dp_spa->spa_root_vdev; 87 88 /* rewrite all disk labels */ 89 vdev_config_dirty(rvd); 90 91 if (vdev_resilver_needed(rvd, 92 &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { 93 spa_event_notify(dp->dp_spa, NULL, 94 ESC_ZFS_RESILVER_START); 95 dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, 96 tx->tx_txg); 97 } else { 98 spa_event_notify(dp->dp_spa, NULL, 99 ESC_ZFS_SCRUB_START); 100 } 101 102 /* zero out the scrub stats in all vdev_stat_t's */ 103 vdev_scrub_stat_update(rvd, 104 dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : 105 POOL_SCRUB_EVERYTHING, B_FALSE); 106 107 /* 108 * If this is an incremental scrub, limit the DDT scrub phase 109 * to just the auto-ditto class (for correctness); the rest 110 * of the scrub should go faster using top-down pruning. 111 */ 112 if (dp->dp_scrub_min_txg > TXG_INITIAL) 113 dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO; 114 115 dp->dp_spa->spa_scrub_started = B_TRUE; 116 } 117 118 /* back to the generic stuff */ 119 120 if (dp->dp_blkstats == NULL) { 121 dp->dp_blkstats = 122 kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); 123 } 124 bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); 125 126 if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) 127 ot = DMU_OT_ZAP_OTHER; 128 129 dp->dp_scrub_func = *funcp; 130 dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, 131 ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); 132 bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); 133 bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); 134 dp->dp_scrub_restart = B_FALSE; 135 dp->dp_spa->spa_scrub_errors = 0; 136 137 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 138 DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, 139 &dp->dp_scrub_func, tx)); 140 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 141 DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, 142 &dp->dp_scrub_queue_obj, tx)); 143 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 144 DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, 145 &dp->dp_scrub_min_txg, tx)); 146 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 147 DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, 148 &dp->dp_scrub_max_txg, tx)); 149 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 150 DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 151 sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), 152 &dp->dp_scrub_bookmark, tx)); 153 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 154 DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), 155 sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), 156 &dp->dp_scrub_ddt_bookmark, tx)); 157 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 158 DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, 159 &dp->dp_scrub_ddt_class_max, tx)); 160 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 161 DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, 162 &dp->dp_spa->spa_scrub_errors, tx)); 163 164 spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr, 165 "func=%u mintxg=%llu maxtxg=%llu", 166 *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg); 167 } 168 169 int 170 dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) 171 { 172 return (dsl_sync_task_do(dp, NULL, 173 dsl_pool_scrub_setup_sync, dp, &func, 0)); 174 } 175 176 /* ARGSUSED */ 177 static void 178 dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 179 { 180 dsl_pool_t *dp = arg1; 181 boolean_t *completep = arg2; 182 183 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 184 return; 185 186 mutex_enter(&dp->dp_scrub_cancel_lock); 187 188 if (dp->dp_scrub_restart) { 189 dp->dp_scrub_restart = B_FALSE; 190 *completep = B_FALSE; 191 } 192 193 /* XXX this is scrub-clean specific */ 194 mutex_enter(&dp->dp_spa->spa_scrub_lock); 195 while (dp->dp_spa->spa_scrub_inflight > 0) { 196 cv_wait(&dp->dp_spa->spa_scrub_io_cv, 197 &dp->dp_spa->spa_scrub_lock); 198 } 199 mutex_exit(&dp->dp_spa->spa_scrub_lock); 200 dp->dp_spa->spa_scrub_started = B_FALSE; 201 dp->dp_spa->spa_scrub_active = B_FALSE; 202 203 dp->dp_scrub_func = SCRUB_FUNC_NONE; 204 VERIFY(0 == dmu_object_free(dp->dp_meta_objset, 205 dp->dp_scrub_queue_obj, tx)); 206 dp->dp_scrub_queue_obj = 0; 207 bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); 208 bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); 209 210 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 211 DMU_POOL_SCRUB_QUEUE, tx)); 212 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 213 DMU_POOL_SCRUB_MIN_TXG, tx)); 214 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 215 DMU_POOL_SCRUB_MAX_TXG, tx)); 216 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 217 DMU_POOL_SCRUB_BOOKMARK, tx)); 218 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 219 DMU_POOL_SCRUB_FUNC, tx)); 220 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 221 DMU_POOL_SCRUB_ERRORS, tx)); 222 223 (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 224 DMU_POOL_SCRUB_DDT_BOOKMARK, tx); 225 (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 226 DMU_POOL_SCRUB_DDT_CLASS_MAX, tx); 227 228 spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, 229 "complete=%u", *completep); 230 231 /* below is scrub-clean specific */ 232 vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE, 233 *completep); 234 /* 235 * If the scrub/resilver completed, update all DTLs to reflect this. 236 * Whether it succeeded or not, vacate all temporary scrub DTLs. 237 */ 238 vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, 239 *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); 240 if (*completep) 241 spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? 242 ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); 243 spa_errlog_rotate(dp->dp_spa); 244 245 /* 246 * We may have finished replacing a device. 247 * Let the async thread assess this and handle the detach. 248 */ 249 spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE); 250 251 dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0; 252 mutex_exit(&dp->dp_scrub_cancel_lock); 253 } 254 255 int 256 dsl_pool_scrub_cancel(dsl_pool_t *dp) 257 { 258 boolean_t complete = B_FALSE; 259 260 return (dsl_sync_task_do(dp, NULL, 261 dsl_pool_scrub_cancel_sync, dp, &complete, 3)); 262 } 263 264 void 265 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) 266 { 267 /* 268 * This function will be used by bp-rewrite wad to intercept frees. 269 */ 270 zio_free(dp->dp_spa, txg, bpp); 271 } 272 273 static boolean_t 274 bookmark_is_zero(const zbookmark_t *zb) 275 { 276 return (zb->zb_objset == 0 && zb->zb_object == 0 && 277 zb->zb_level == 0 && zb->zb_blkid == 0); 278 } 279 280 /* dnp is the dnode for zb1->zb_object */ 281 static boolean_t 282 bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, 283 const zbookmark_t *zb2) 284 { 285 uint64_t zb1nextL0, zb2thisobj; 286 287 ASSERT(zb1->zb_objset == zb2->zb_objset); 288 ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT); 289 ASSERT(zb2->zb_level == 0); 290 291 /* 292 * A bookmark in the deadlist is considered to be after 293 * everything else. 294 */ 295 if (zb2->zb_object == DMU_DEADLIST_OBJECT) 296 return (B_TRUE); 297 298 /* The objset_phys_t isn't before anything. */ 299 if (dnp == NULL) 300 return (B_FALSE); 301 302 zb1nextL0 = (zb1->zb_blkid + 1) << 303 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 304 305 zb2thisobj = zb2->zb_object ? zb2->zb_object : 306 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 307 308 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 309 uint64_t nextobj = zb1nextL0 * 310 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 311 return (nextobj <= zb2thisobj); 312 } 313 314 if (zb1->zb_object < zb2thisobj) 315 return (B_TRUE); 316 if (zb1->zb_object > zb2thisobj) 317 return (B_FALSE); 318 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 319 return (B_FALSE); 320 return (zb1nextL0 <= zb2->zb_blkid); 321 } 322 323 static boolean_t 324 scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb) 325 { 326 uint64_t elapsed_nanosecs; 327 int mintime; 328 329 if (dp->dp_scrub_pausing) 330 return (B_TRUE); /* we're already pausing */ 331 332 if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) 333 return (B_FALSE); /* we're resuming */ 334 335 /* We only know how to resume from level-0 blocks. */ 336 if (zb != NULL && zb->zb_level != 0) 337 return (B_FALSE); 338 339 mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time_ms : 340 zfs_scrub_min_time_ms; 341 elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time; 342 if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || 343 (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) { 344 if (zb) { 345 dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", 346 (longlong_t)zb->zb_objset, 347 (longlong_t)zb->zb_object, 348 (longlong_t)zb->zb_level, 349 (longlong_t)zb->zb_blkid); 350 dp->dp_scrub_bookmark = *zb; 351 } 352 if (ddb) { 353 dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", 354 (longlong_t)ddb->ddb_class, 355 (longlong_t)ddb->ddb_type, 356 (longlong_t)ddb->ddb_checksum, 357 (longlong_t)ddb->ddb_cursor); 358 ASSERT(&dp->dp_scrub_ddt_bookmark == ddb); 359 } 360 dp->dp_scrub_pausing = B_TRUE; 361 return (B_TRUE); 362 } 363 return (B_FALSE); 364 } 365 366 typedef struct zil_traverse_arg { 367 dsl_pool_t *zta_dp; 368 zil_header_t *zta_zh; 369 } zil_traverse_arg_t; 370 371 /* ARGSUSED */ 372 static int 373 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 374 { 375 zil_traverse_arg_t *zta = arg; 376 dsl_pool_t *dp = zta->zta_dp; 377 zil_header_t *zh = zta->zta_zh; 378 zbookmark_t zb; 379 380 if (bp->blk_birth <= dp->dp_scrub_min_txg) 381 return (0); 382 383 /* 384 * One block ("stubby") can be allocated a long time ago; we 385 * want to visit that one because it has been allocated 386 * (on-disk) even if it hasn't been claimed (even though for 387 * plain scrub there's nothing to do to it). 388 */ 389 if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) 390 return (0); 391 392 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 393 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 394 395 VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); 396 return (0); 397 } 398 399 /* ARGSUSED */ 400 static int 401 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) 402 { 403 if (lrc->lrc_txtype == TX_WRITE) { 404 zil_traverse_arg_t *zta = arg; 405 dsl_pool_t *dp = zta->zta_dp; 406 zil_header_t *zh = zta->zta_zh; 407 lr_write_t *lr = (lr_write_t *)lrc; 408 blkptr_t *bp = &lr->lr_blkptr; 409 zbookmark_t zb; 410 411 if (bp->blk_birth <= dp->dp_scrub_min_txg) 412 return (0); 413 414 /* 415 * birth can be < claim_txg if this record's txg is 416 * already txg sync'ed (but this log block contains 417 * other records that are not synced) 418 */ 419 if (claim_txg == 0 || bp->blk_birth < claim_txg) 420 return (0); 421 422 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 423 lr->lr_foid, ZB_ZIL_LEVEL, 424 lr->lr_offset / BP_GET_LSIZE(bp)); 425 426 VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); 427 } 428 return (0); 429 } 430 431 static void 432 traverse_zil(dsl_pool_t *dp, zil_header_t *zh) 433 { 434 uint64_t claim_txg = zh->zh_claim_txg; 435 zil_traverse_arg_t zta = { dp, zh }; 436 zilog_t *zilog; 437 438 /* 439 * We only want to visit blocks that have been claimed but not yet 440 * replayed (or, in read-only mode, blocks that *would* be claimed). 441 */ 442 if (claim_txg == 0 && spa_writeable(dp->dp_spa)) 443 return; 444 445 zilog = zil_alloc(dp->dp_meta_objset, zh); 446 447 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta, 448 claim_txg); 449 450 zil_free(zilog); 451 } 452 453 static void 454 scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, 455 uint64_t object, uint64_t blkid) 456 { 457 zbookmark_t czb; 458 uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; 459 460 if (zfs_no_scrub_prefetch) 461 return; 462 463 if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg || 464 (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) 465 return; 466 467 SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); 468 469 (void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp, 470 buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 471 &flags, &czb); 472 } 473 474 static void 475 scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, 476 arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) 477 { 478 int err; 479 arc_buf_t *buf = NULL; 480 481 if (bp->blk_birth <= dp->dp_scrub_min_txg) 482 return; 483 484 if (scrub_pause(dp, zb, NULL)) 485 return; 486 487 if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { 488 /* 489 * If we already visited this bp & everything below (in 490 * a prior txg), don't bother doing it again. 491 */ 492 if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark)) 493 return; 494 495 /* 496 * If we found the block we're trying to resume from, or 497 * we went past it to a different object, zero it out to 498 * indicate that it's OK to start checking for pausing 499 * again. 500 */ 501 if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || 502 zb->zb_object > dp->dp_scrub_bookmark.zb_object) { 503 dprintf("resuming at %llx/%llx/%llx/%llx\n", 504 (longlong_t)zb->zb_objset, 505 (longlong_t)zb->zb_object, 506 (longlong_t)zb->zb_level, 507 (longlong_t)zb->zb_blkid); 508 bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); 509 } 510 } 511 512 /* 513 * If dsl_pool_scrub_ddt() has aready scrubbed this block, 514 * don't scrub it again. 515 */ 516 if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp)) 517 (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); 518 519 if (BP_GET_LEVEL(bp) > 0) { 520 uint32_t flags = ARC_WAIT; 521 int i; 522 blkptr_t *cbp; 523 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 524 525 err = arc_read(NULL, dp->dp_spa, bp, pbuf, 526 arc_getbuf_func, &buf, 527 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 528 if (err) { 529 mutex_enter(&dp->dp_spa->spa_scrub_lock); 530 dp->dp_spa->spa_scrub_errors++; 531 mutex_exit(&dp->dp_spa->spa_scrub_lock); 532 return; 533 } 534 for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { 535 scrub_prefetch(dp, buf, cbp, zb->zb_objset, 536 zb->zb_object, zb->zb_blkid * epb + i); 537 } 538 for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { 539 zbookmark_t czb; 540 541 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 542 zb->zb_level - 1, 543 zb->zb_blkid * epb + i); 544 scrub_visitbp(dp, dnp, buf, cbp, &czb); 545 } 546 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 547 uint32_t flags = ARC_WAIT; 548 dnode_phys_t *cdnp; 549 int i, j; 550 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 551 552 err = arc_read(NULL, dp->dp_spa, bp, pbuf, 553 arc_getbuf_func, &buf, 554 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 555 if (err) { 556 mutex_enter(&dp->dp_spa->spa_scrub_lock); 557 dp->dp_spa->spa_scrub_errors++; 558 mutex_exit(&dp->dp_spa->spa_scrub_lock); 559 return; 560 } 561 for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { 562 for (j = 0; j < cdnp->dn_nblkptr; j++) { 563 blkptr_t *cbp = &cdnp->dn_blkptr[j]; 564 scrub_prefetch(dp, buf, cbp, zb->zb_objset, 565 zb->zb_blkid * epb + i, j); 566 } 567 } 568 for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { 569 scrub_visitdnode(dp, cdnp, buf, zb->zb_objset, 570 zb->zb_blkid * epb + i); 571 } 572 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 573 uint32_t flags = ARC_WAIT; 574 objset_phys_t *osp; 575 576 err = arc_read_nolock(NULL, dp->dp_spa, bp, 577 arc_getbuf_func, &buf, 578 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 579 if (err) { 580 mutex_enter(&dp->dp_spa->spa_scrub_lock); 581 dp->dp_spa->spa_scrub_errors++; 582 mutex_exit(&dp->dp_spa->spa_scrub_lock); 583 return; 584 } 585 586 osp = buf->b_data; 587 588 traverse_zil(dp, &osp->os_zil_header); 589 590 scrub_visitdnode(dp, &osp->os_meta_dnode, 591 buf, zb->zb_objset, DMU_META_DNODE_OBJECT); 592 if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { 593 scrub_visitdnode(dp, &osp->os_userused_dnode, 594 buf, zb->zb_objset, DMU_USERUSED_OBJECT); 595 scrub_visitdnode(dp, &osp->os_groupused_dnode, 596 buf, zb->zb_objset, DMU_GROUPUSED_OBJECT); 597 } 598 } 599 600 if (buf) 601 (void) arc_buf_remove_ref(buf, &buf); 602 } 603 604 static void 605 scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, 606 uint64_t objset, uint64_t object) 607 { 608 int j; 609 610 for (j = 0; j < dnp->dn_nblkptr; j++) { 611 zbookmark_t czb; 612 613 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); 614 scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); 615 } 616 } 617 618 static void 619 scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) 620 { 621 zbookmark_t zb; 622 623 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 624 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 625 scrub_visitbp(dp, NULL, NULL, bp, &zb); 626 } 627 628 void 629 dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) 630 { 631 dsl_pool_t *dp = ds->ds_dir->dd_pool; 632 633 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 634 return; 635 636 if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { 637 SET_BOOKMARK(&dp->dp_scrub_bookmark, 638 ZB_DESTROYED_OBJSET, 0, 0, 0); 639 } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 640 ds->ds_object, tx) != 0) { 641 return; 642 } 643 644 if (ds->ds_phys->ds_next_snap_obj != 0) { 645 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 646 ds->ds_phys->ds_next_snap_obj, tx) == 0); 647 } 648 ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); 649 } 650 651 void 652 dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) 653 { 654 dsl_pool_t *dp = ds->ds_dir->dd_pool; 655 656 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 657 return; 658 659 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); 660 661 if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { 662 dp->dp_scrub_bookmark.zb_objset = 663 ds->ds_phys->ds_prev_snap_obj; 664 } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 665 ds->ds_object, tx) == 0) { 666 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 667 ds->ds_phys->ds_prev_snap_obj, tx) == 0); 668 } 669 } 670 671 void 672 dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) 673 { 674 dsl_pool_t *dp = ds1->ds_dir->dd_pool; 675 676 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 677 return; 678 679 if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) { 680 dp->dp_scrub_bookmark.zb_objset = ds2->ds_object; 681 } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) { 682 dp->dp_scrub_bookmark.zb_objset = ds1->ds_object; 683 } 684 685 if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 686 ds1->ds_object, tx) == 0) { 687 int err = zap_add_int(dp->dp_meta_objset, 688 dp->dp_scrub_queue_obj, ds2->ds_object, tx); 689 VERIFY(err == 0 || err == EEXIST); 690 if (err == EEXIST) { 691 /* Both were there to begin with */ 692 VERIFY(0 == zap_add_int(dp->dp_meta_objset, 693 dp->dp_scrub_queue_obj, ds1->ds_object, tx)); 694 } 695 } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 696 ds2->ds_object, tx) == 0) { 697 VERIFY(0 == zap_add_int(dp->dp_meta_objset, 698 dp->dp_scrub_queue_obj, ds1->ds_object, tx)); 699 } 700 } 701 702 struct enqueue_clones_arg { 703 dmu_tx_t *tx; 704 uint64_t originobj; 705 }; 706 707 /* ARGSUSED */ 708 static int 709 enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) 710 { 711 struct enqueue_clones_arg *eca = arg; 712 dsl_dataset_t *ds; 713 int err; 714 dsl_pool_t *dp; 715 716 err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); 717 if (err) 718 return (err); 719 dp = ds->ds_dir->dd_pool; 720 721 if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { 722 while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { 723 dsl_dataset_t *prev; 724 err = dsl_dataset_hold_obj(dp, 725 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); 726 727 dsl_dataset_rele(ds, FTAG); 728 if (err) 729 return (err); 730 ds = prev; 731 } 732 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 733 ds->ds_object, eca->tx) == 0); 734 } 735 dsl_dataset_rele(ds, FTAG); 736 return (0); 737 } 738 739 static void 740 scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) 741 { 742 dsl_dataset_t *ds; 743 uint64_t min_txg_save; 744 745 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 746 747 /* 748 * Iterate over the bps in this ds. 749 */ 750 min_txg_save = dp->dp_scrub_min_txg; 751 dp->dp_scrub_min_txg = 752 MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); 753 scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); 754 dp->dp_scrub_min_txg = min_txg_save; 755 756 if (dp->dp_scrub_pausing) 757 goto out; 758 759 /* 760 * Add descendent datasets to work queue. 761 */ 762 if (ds->ds_phys->ds_next_snap_obj != 0) { 763 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 764 ds->ds_phys->ds_next_snap_obj, tx) == 0); 765 } 766 if (ds->ds_phys->ds_num_children > 1) { 767 boolean_t usenext = B_FALSE; 768 if (ds->ds_phys->ds_next_clones_obj != 0) { 769 uint64_t count; 770 /* 771 * A bug in a previous version of the code could 772 * cause upgrade_clones_cb() to not set 773 * ds_next_snap_obj when it should, leading to a 774 * missing entry. Therefore we can only use the 775 * next_clones_obj when its count is correct. 776 */ 777 int err = zap_count(dp->dp_meta_objset, 778 ds->ds_phys->ds_next_clones_obj, &count); 779 if (err == 0 && 780 count == ds->ds_phys->ds_num_children - 1) 781 usenext = B_TRUE; 782 } 783 784 if (usenext) { 785 VERIFY(zap_join(dp->dp_meta_objset, 786 ds->ds_phys->ds_next_clones_obj, 787 dp->dp_scrub_queue_obj, tx) == 0); 788 } else { 789 struct enqueue_clones_arg eca; 790 eca.tx = tx; 791 eca.originobj = ds->ds_object; 792 793 (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, 794 NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); 795 } 796 } 797 798 out: 799 dsl_dataset_rele(ds, FTAG); 800 } 801 802 /* ARGSUSED */ 803 static int 804 enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) 805 { 806 dmu_tx_t *tx = arg; 807 dsl_dataset_t *ds; 808 int err; 809 dsl_pool_t *dp; 810 811 err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); 812 if (err) 813 return (err); 814 815 dp = ds->ds_dir->dd_pool; 816 817 while (ds->ds_phys->ds_prev_snap_obj != 0) { 818 dsl_dataset_t *prev; 819 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 820 FTAG, &prev); 821 if (err) { 822 dsl_dataset_rele(ds, FTAG); 823 return (err); 824 } 825 826 /* 827 * If this is a clone, we don't need to worry about it for now. 828 */ 829 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { 830 dsl_dataset_rele(ds, FTAG); 831 dsl_dataset_rele(prev, FTAG); 832 return (0); 833 } 834 dsl_dataset_rele(ds, FTAG); 835 ds = prev; 836 } 837 838 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 839 ds->ds_object, tx) == 0); 840 dsl_dataset_rele(ds, FTAG); 841 return (0); 842 } 843 844 /* 845 * Scrub/dedup interaction. 846 * 847 * If there are N references to a deduped block, we don't want to scrub it 848 * N times -- ideally, we should scrub it exactly once. 849 * 850 * To prevent excess scrubbing, the scrub begins by walking the DDT 851 * to find all blocks with refcnt > 1, and scrubs each of these once. 852 * Then the top-down scrub begins, only visiting blocks with refcnt == 1. 853 * 854 * There would be nothing more to say if a block's refcnt couldn't change 855 * during a scrub, but of course it can. There are two cases to consider. 856 * 857 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 858 * when visited during the top-down scrub phase, it will be scrubbed twice. 859 * This negates our scrub optimization, but is otherwise harmless. 860 * 861 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 862 * on each visit during the top-down scrub phase, it will never be scrubbed. 863 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's 864 * reference count changes; if it transitions from refcnt == 1 to refcnt > 1 865 * while a scrub is in progress, it scrubs the block right then. 866 * 867 * The code does not actually use the refcnt directly, but rather uses the 868 * dde's replication class (enum ddt_class), which serves the same purpose. 869 */ 870 static void 871 dsl_pool_scrub_ddt(dsl_pool_t *dp) 872 { 873 ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark; 874 ddt_entry_t dde; 875 int error; 876 877 while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) { 878 if (ddb->ddb_class > dp->dp_scrub_ddt_class_max) 879 return; 880 dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde); 881 if (scrub_pause(dp, NULL, ddb)) 882 return; 883 } 884 ASSERT(error == ENOENT); 885 ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max); 886 } 887 888 void 889 dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, 890 const ddt_entry_t *dde) 891 { 892 const ddt_key_t *ddk = &dde->dde_key; 893 const ddt_phys_t *ddp = dde->dde_phys; 894 blkptr_t blk; 895 zbookmark_t zb = { 0 }; 896 897 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 898 if (ddp->ddp_phys_birth == 0) 899 continue; 900 ddt_bp_create(checksum, ddk, ddp, &blk); 901 scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); 902 } 903 } 904 905 void 906 dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) 907 { 908 spa_t *spa = dp->dp_spa; 909 zap_cursor_t zc; 910 zap_attribute_t za; 911 boolean_t complete = B_TRUE; 912 913 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 914 return; 915 916 /* 917 * If the pool is not loaded, or is trying to unload, leave it alone. 918 */ 919 if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa)) 920 return; 921 922 if (dp->dp_scrub_restart) { 923 enum scrub_func func = dp->dp_scrub_func; 924 dp->dp_scrub_restart = B_FALSE; 925 dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); 926 } 927 928 if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { 929 /* 930 * We must have resumed after rebooting; reset the vdev 931 * stats to know that we're doing a scrub (although it 932 * will think we're just starting now). 933 */ 934 vdev_scrub_stat_update(spa->spa_root_vdev, 935 dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : 936 POOL_SCRUB_EVERYTHING, B_FALSE); 937 } 938 939 dp->dp_scrub_pausing = B_FALSE; 940 dp->dp_scrub_start_time = gethrtime(); 941 dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); 942 spa->spa_scrub_active = B_TRUE; 943 944 if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) { 945 dsl_pool_scrub_ddt(dp); 946 if (dp->dp_scrub_pausing) 947 goto out; 948 } 949 950 if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) { 951 /* First do the MOS & ORIGIN */ 952 scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); 953 if (dp->dp_scrub_pausing) 954 goto out; 955 956 if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { 957 VERIFY(0 == dmu_objset_find_spa(spa, 958 NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); 959 } else { 960 scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); 961 } 962 ASSERT(!dp->dp_scrub_pausing); 963 } else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { 964 /* 965 * If we were paused, continue from here. Note if the ds 966 * we were paused on was destroyed, the zb_objset will be 967 * ZB_DESTROYED_OBJSET, so we will skip this and find a new 968 * objset below. 969 */ 970 scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); 971 if (dp->dp_scrub_pausing) 972 goto out; 973 } 974 975 /* 976 * In case we were paused right at the end of the ds, zero the 977 * bookmark so we don't think that we're still trying to resume. 978 */ 979 bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); 980 981 /* keep pulling things out of the zap-object-as-queue */ 982 while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), 983 zap_cursor_retrieve(&zc, &za) == 0) { 984 VERIFY(0 == zap_remove(dp->dp_meta_objset, 985 dp->dp_scrub_queue_obj, za.za_name, tx)); 986 scrub_visitds(dp, za.za_first_integer, tx); 987 if (dp->dp_scrub_pausing) 988 break; 989 zap_cursor_fini(&zc); 990 } 991 zap_cursor_fini(&zc); 992 if (dp->dp_scrub_pausing) 993 goto out; 994 995 /* done. */ 996 997 dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); 998 return; 999 out: 1000 VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1001 DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 1002 sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), 1003 &dp->dp_scrub_bookmark, tx)); 1004 VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1005 DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), 1006 sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), 1007 &dp->dp_scrub_ddt_bookmark, tx)); 1008 VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1009 DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, 1010 &dp->dp_scrub_ddt_class_max, tx)); 1011 VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1012 DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, 1013 &spa->spa_scrub_errors, tx)); 1014 1015 /* XXX this is scrub-clean specific */ 1016 mutex_enter(&spa->spa_scrub_lock); 1017 while (spa->spa_scrub_inflight > 0) 1018 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1019 mutex_exit(&spa->spa_scrub_lock); 1020 } 1021 1022 void 1023 dsl_pool_scrub_restart(dsl_pool_t *dp) 1024 { 1025 mutex_enter(&dp->dp_scrub_cancel_lock); 1026 dp->dp_scrub_restart = B_TRUE; 1027 mutex_exit(&dp->dp_scrub_cancel_lock); 1028 } 1029 1030 /* 1031 * scrub consumers 1032 */ 1033 1034 static void 1035 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) 1036 { 1037 int i; 1038 1039 /* 1040 * If we resume after a reboot, zab will be NULL; don't record 1041 * incomplete stats in that case. 1042 */ 1043 if (zab == NULL) 1044 return; 1045 1046 for (i = 0; i < 4; i++) { 1047 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; 1048 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; 1049 zfs_blkstat_t *zb = &zab->zab_type[l][t]; 1050 int equal; 1051 1052 zb->zb_count++; 1053 zb->zb_asize += BP_GET_ASIZE(bp); 1054 zb->zb_lsize += BP_GET_LSIZE(bp); 1055 zb->zb_psize += BP_GET_PSIZE(bp); 1056 zb->zb_gangs += BP_COUNT_GANG(bp); 1057 1058 switch (BP_GET_NDVAS(bp)) { 1059 case 2: 1060 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 1061 DVA_GET_VDEV(&bp->blk_dva[1])) 1062 zb->zb_ditto_2_of_2_samevdev++; 1063 break; 1064 case 3: 1065 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == 1066 DVA_GET_VDEV(&bp->blk_dva[1])) + 1067 (DVA_GET_VDEV(&bp->blk_dva[0]) == 1068 DVA_GET_VDEV(&bp->blk_dva[2])) + 1069 (DVA_GET_VDEV(&bp->blk_dva[1]) == 1070 DVA_GET_VDEV(&bp->blk_dva[2])); 1071 if (equal == 1) 1072 zb->zb_ditto_2_of_3_samevdev++; 1073 else if (equal == 3) 1074 zb->zb_ditto_3_of_3_samevdev++; 1075 break; 1076 } 1077 } 1078 } 1079 1080 static void 1081 dsl_pool_scrub_clean_done(zio_t *zio) 1082 { 1083 spa_t *spa = zio->io_spa; 1084 1085 zio_data_buf_free(zio->io_data, zio->io_size); 1086 1087 mutex_enter(&spa->spa_scrub_lock); 1088 spa->spa_scrub_inflight--; 1089 cv_broadcast(&spa->spa_scrub_io_cv); 1090 1091 if (zio->io_error && (zio->io_error != ECKSUM || 1092 !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) 1093 spa->spa_scrub_errors++; 1094 mutex_exit(&spa->spa_scrub_lock); 1095 } 1096 1097 static int 1098 dsl_pool_scrub_clean_cb(dsl_pool_t *dp, 1099 const blkptr_t *bp, const zbookmark_t *zb) 1100 { 1101 size_t size = BP_GET_PSIZE(bp); 1102 spa_t *spa = dp->dp_spa; 1103 uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); 1104 boolean_t needs_io; 1105 int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; 1106 int zio_priority; 1107 1108 if (phys_birth <= dp->dp_scrub_min_txg || 1109 phys_birth >= dp->dp_scrub_max_txg) 1110 return (0); 1111 1112 count_block(dp->dp_blkstats, bp); 1113 1114 if (dp->dp_scrub_isresilver == 0) { 1115 /* It's a scrub */ 1116 zio_flags |= ZIO_FLAG_SCRUB; 1117 zio_priority = ZIO_PRIORITY_SCRUB; 1118 needs_io = B_TRUE; 1119 } else { 1120 /* It's a resilver */ 1121 zio_flags |= ZIO_FLAG_RESILVER; 1122 zio_priority = ZIO_PRIORITY_RESILVER; 1123 needs_io = B_FALSE; 1124 } 1125 1126 /* If it's an intent log block, failure is expected. */ 1127 if (zb->zb_level == ZB_ZIL_LEVEL) 1128 zio_flags |= ZIO_FLAG_SPECULATIVE; 1129 1130 for (int d = 0; d < BP_GET_NDVAS(bp); d++) { 1131 vdev_t *vd = vdev_lookup_top(spa, 1132 DVA_GET_VDEV(&bp->blk_dva[d])); 1133 1134 /* 1135 * Keep track of how much data we've examined so that 1136 * zpool(1M) status can make useful progress reports. 1137 */ 1138 mutex_enter(&vd->vdev_stat_lock); 1139 vd->vdev_stat.vs_scrub_examined += 1140 DVA_GET_ASIZE(&bp->blk_dva[d]); 1141 mutex_exit(&vd->vdev_stat_lock); 1142 1143 /* if it's a resilver, this may not be in the target range */ 1144 if (!needs_io) { 1145 if (DVA_GET_GANG(&bp->blk_dva[d])) { 1146 /* 1147 * Gang members may be spread across multiple 1148 * vdevs, so the best estimate we have is the 1149 * scrub range, which has already been checked. 1150 * XXX -- it would be better to change our 1151 * allocation policy to ensure that all 1152 * gang members reside on the same vdev. 1153 */ 1154 needs_io = B_TRUE; 1155 } else { 1156 needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, 1157 phys_birth, 1); 1158 } 1159 } 1160 } 1161 1162 if (needs_io && !zfs_no_scrub_io) { 1163 void *data = zio_data_buf_alloc(size); 1164 1165 mutex_enter(&spa->spa_scrub_lock); 1166 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) 1167 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1168 spa->spa_scrub_inflight++; 1169 mutex_exit(&spa->spa_scrub_lock); 1170 1171 zio_nowait(zio_read(NULL, spa, bp, data, size, 1172 dsl_pool_scrub_clean_done, NULL, zio_priority, 1173 zio_flags, zb)); 1174 } 1175 1176 /* do not relocate this block */ 1177 return (0); 1178 } 1179 1180 int 1181 dsl_pool_scrub_clean(dsl_pool_t *dp) 1182 { 1183 spa_t *spa = dp->dp_spa; 1184 1185 /* 1186 * Purge all vdev caches and probe all devices. We do this here 1187 * rather than in sync context because this requires a writer lock 1188 * on the spa_config lock, which we can't do from sync context. The 1189 * spa_scrub_reopen flag indicates that vdev_open() should not 1190 * attempt to start another scrub. 1191 */ 1192 spa_vdev_state_enter(spa, SCL_NONE); 1193 spa->spa_scrub_reopen = B_TRUE; 1194 vdev_reopen(spa->spa_root_vdev); 1195 spa->spa_scrub_reopen = B_FALSE; 1196 (void) spa_vdev_state_exit(spa, NULL, 0); 1197 1198 return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); 1199 } 1200