1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/dsl_pool.h> 27 #include <sys/dsl_dataset.h> 28 #include <sys/dsl_prop.h> 29 #include <sys/dsl_dir.h> 30 #include <sys/dsl_synctask.h> 31 #include <sys/dnode.h> 32 #include <sys/dmu_tx.h> 33 #include <sys/dmu_objset.h> 34 #include <sys/arc.h> 35 #include <sys/zap.h> 36 #include <sys/zio.h> 37 #include <sys/zfs_context.h> 38 #include <sys/fs/zfs.h> 39 #include <sys/zfs_znode.h> 40 #include <sys/spa_impl.h> 41 #include <sys/vdev_impl.h> 42 #include <sys/zil_impl.h> 43 #include <sys/zio_checksum.h> 44 #include <sys/ddt.h> 45 46 typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); 47 48 static scrub_cb_t dsl_pool_scrub_clean_cb; 49 static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; 50 static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, 51 uint64_t objset, uint64_t object); 52 53 int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ 54 int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ 55 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ 56 57 extern int zfs_txg_timeout; 58 59 static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { 60 NULL, 61 dsl_pool_scrub_clean_cb 62 }; 63 64 /* ARGSUSED */ 65 static void 66 dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 67 { 68 dsl_pool_t *dp = arg1; 69 enum scrub_func *funcp = arg2; 70 dmu_object_type_t ot = 0; 71 boolean_t complete = B_FALSE; 72 73 dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx); 74 75 ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE); 76 ASSERT(*funcp > SCRUB_FUNC_NONE); 77 ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS); 78 79 dp->dp_scrub_min_txg = 0; 80 dp->dp_scrub_max_txg = tx->tx_txg; 81 82 if (*funcp == SCRUB_FUNC_CLEAN) { 83 vdev_t *rvd = dp->dp_spa->spa_root_vdev; 84 85 /* rewrite all disk labels */ 86 vdev_config_dirty(rvd); 87 88 if (vdev_resilver_needed(rvd, 89 &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { 90 spa_event_notify(dp->dp_spa, NULL, 91 ESC_ZFS_RESILVER_START); 92 dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, 93 tx->tx_txg); 94 } else { 95 spa_event_notify(dp->dp_spa, NULL, 96 ESC_ZFS_SCRUB_START); 97 } 98 99 /* zero out the scrub stats in all vdev_stat_t's */ 100 vdev_scrub_stat_update(rvd, 101 dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : 102 POOL_SCRUB_EVERYTHING, B_FALSE); 103 104 dp->dp_spa->spa_scrub_started = B_TRUE; 105 } 106 107 /* back to the generic stuff */ 108 109 if (dp->dp_blkstats == NULL) { 110 dp->dp_blkstats = 111 kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); 112 } 113 bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); 114 115 if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) 116 ot = DMU_OT_ZAP_OTHER; 117 118 dp->dp_scrub_func = *funcp; 119 dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, 120 ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); 121 bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); 122 dp->dp_scrub_restart = B_FALSE; 123 dp->dp_scrub_ditto = B_FALSE; 124 dp->dp_spa->spa_scrub_errors = 0; 125 126 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 127 DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, 128 &dp->dp_scrub_func, tx)); 129 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 130 DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, 131 &dp->dp_scrub_queue_obj, tx)); 132 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 133 DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, 134 &dp->dp_scrub_min_txg, tx)); 135 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 136 DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, 137 &dp->dp_scrub_max_txg, tx)); 138 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 139 DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, 140 &dp->dp_scrub_bookmark, tx)); 141 VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 142 DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, 143 &dp->dp_spa->spa_scrub_errors, tx)); 144 145 spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr, 146 "func=%u mintxg=%llu maxtxg=%llu", 147 *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg); 148 } 149 150 int 151 dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) 152 { 153 return (dsl_sync_task_do(dp, NULL, 154 dsl_pool_scrub_setup_sync, dp, &func, 0)); 155 } 156 157 /* ARGSUSED */ 158 static void 159 dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 160 { 161 dsl_pool_t *dp = arg1; 162 boolean_t *completep = arg2; 163 164 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 165 return; 166 167 mutex_enter(&dp->dp_scrub_cancel_lock); 168 169 if (dp->dp_scrub_restart) { 170 dp->dp_scrub_restart = B_FALSE; 171 *completep = B_FALSE; 172 } 173 174 /* XXX this is scrub-clean specific */ 175 mutex_enter(&dp->dp_spa->spa_scrub_lock); 176 while (dp->dp_spa->spa_scrub_inflight > 0) { 177 cv_wait(&dp->dp_spa->spa_scrub_io_cv, 178 &dp->dp_spa->spa_scrub_lock); 179 } 180 mutex_exit(&dp->dp_spa->spa_scrub_lock); 181 dp->dp_spa->spa_scrub_started = B_FALSE; 182 dp->dp_spa->spa_scrub_active = B_FALSE; 183 184 dp->dp_scrub_func = SCRUB_FUNC_NONE; 185 VERIFY(0 == dmu_object_free(dp->dp_meta_objset, 186 dp->dp_scrub_queue_obj, tx)); 187 dp->dp_scrub_queue_obj = 0; 188 bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); 189 190 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 191 DMU_POOL_SCRUB_QUEUE, tx)); 192 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 193 DMU_POOL_SCRUB_MIN_TXG, tx)); 194 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 195 DMU_POOL_SCRUB_MAX_TXG, tx)); 196 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 197 DMU_POOL_SCRUB_BOOKMARK, tx)); 198 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 199 DMU_POOL_SCRUB_FUNC, tx)); 200 VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 201 DMU_POOL_SCRUB_ERRORS, tx)); 202 203 spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, 204 "complete=%u", *completep); 205 206 /* below is scrub-clean specific */ 207 vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE, 208 *completep); 209 /* 210 * If the scrub/resilver completed, update all DTLs to reflect this. 211 * Whether it succeeded or not, vacate all temporary scrub DTLs. 212 */ 213 vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, 214 *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); 215 if (*completep) 216 spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? 217 ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); 218 spa_errlog_rotate(dp->dp_spa); 219 220 /* 221 * We may have finished replacing a device. 222 * Let the async thread assess this and handle the detach. 223 */ 224 spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE); 225 226 dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0; 227 mutex_exit(&dp->dp_scrub_cancel_lock); 228 } 229 230 int 231 dsl_pool_scrub_cancel(dsl_pool_t *dp) 232 { 233 boolean_t complete = B_FALSE; 234 235 return (dsl_sync_task_do(dp, NULL, 236 dsl_pool_scrub_cancel_sync, dp, &complete, 3)); 237 } 238 239 void 240 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) 241 { 242 /* 243 * This function will be used by bp-rewrite wad to intercept frees. 244 */ 245 zio_free(dp->dp_spa, txg, bpp); 246 } 247 248 static boolean_t 249 bookmark_is_zero(const zbookmark_t *zb) 250 { 251 return (zb->zb_objset == 0 && zb->zb_object == 0 && 252 zb->zb_level == 0 && zb->zb_blkid == 0); 253 } 254 255 /* dnp is the dnode for zb1->zb_object */ 256 static boolean_t 257 bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, 258 const zbookmark_t *zb2) 259 { 260 uint64_t zb1nextL0, zb2thisobj; 261 262 ASSERT(zb1->zb_objset == zb2->zb_objset); 263 ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT); 264 ASSERT(zb2->zb_level == 0); 265 266 /* 267 * A bookmark in the deadlist is considered to be after 268 * everything else. 269 */ 270 if (zb2->zb_object == DMU_DEADLIST_OBJECT) 271 return (B_TRUE); 272 273 /* The objset_phys_t isn't before anything. */ 274 if (dnp == NULL) 275 return (B_FALSE); 276 277 zb1nextL0 = (zb1->zb_blkid + 1) << 278 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 279 280 zb2thisobj = zb2->zb_object ? zb2->zb_object : 281 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 282 283 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 284 uint64_t nextobj = zb1nextL0 * 285 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 286 return (nextobj <= zb2thisobj); 287 } 288 289 if (zb1->zb_object < zb2thisobj) 290 return (B_TRUE); 291 if (zb1->zb_object > zb2thisobj) 292 return (B_FALSE); 293 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 294 return (B_FALSE); 295 return (zb1nextL0 <= zb2->zb_blkid); 296 } 297 298 static boolean_t 299 scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) 300 { 301 int elapsed_ticks; 302 int mintime; 303 304 if (dp->dp_scrub_pausing) 305 return (B_TRUE); /* we're already pausing */ 306 307 if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) 308 return (B_FALSE); /* we're resuming */ 309 310 /* We only know how to resume from level-0 blocks. */ 311 if (zb->zb_level != 0) 312 return (B_FALSE); 313 314 mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time : 315 zfs_scrub_min_time; 316 elapsed_ticks = lbolt64 - dp->dp_scrub_start_time; 317 if (elapsed_ticks > hz * zfs_txg_timeout || 318 (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) { 319 dprintf("pausing at %llx/%llx/%llx/%llx\n", 320 (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, 321 (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); 322 dp->dp_scrub_pausing = B_TRUE; 323 dp->dp_scrub_bookmark = *zb; 324 return (B_TRUE); 325 } 326 return (B_FALSE); 327 } 328 329 typedef struct zil_traverse_arg { 330 dsl_pool_t *zta_dp; 331 zil_header_t *zta_zh; 332 } zil_traverse_arg_t; 333 334 /* ARGSUSED */ 335 static int 336 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 337 { 338 zil_traverse_arg_t *zta = arg; 339 dsl_pool_t *dp = zta->zta_dp; 340 zil_header_t *zh = zta->zta_zh; 341 zbookmark_t zb; 342 343 if (bp->blk_birth <= dp->dp_scrub_min_txg) 344 return (0); 345 346 /* 347 * One block ("stubby") can be allocated a long time ago; we 348 * want to visit that one because it has been allocated 349 * (on-disk) even if it hasn't been claimed (even though for 350 * plain scrub there's nothing to do to it). 351 */ 352 if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) 353 return (0); 354 355 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 356 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 357 358 VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); 359 return (0); 360 } 361 362 /* ARGSUSED */ 363 static int 364 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) 365 { 366 if (lrc->lrc_txtype == TX_WRITE) { 367 zil_traverse_arg_t *zta = arg; 368 dsl_pool_t *dp = zta->zta_dp; 369 zil_header_t *zh = zta->zta_zh; 370 lr_write_t *lr = (lr_write_t *)lrc; 371 blkptr_t *bp = &lr->lr_blkptr; 372 zbookmark_t zb; 373 374 if (bp->blk_birth <= dp->dp_scrub_min_txg) 375 return (0); 376 377 /* 378 * birth can be < claim_txg if this record's txg is 379 * already txg sync'ed (but this log block contains 380 * other records that are not synced) 381 */ 382 if (claim_txg == 0 || bp->blk_birth < claim_txg) 383 return (0); 384 385 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 386 lr->lr_foid, ZB_ZIL_LEVEL, 387 lr->lr_offset / BP_GET_LSIZE(bp)); 388 389 VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); 390 } 391 return (0); 392 } 393 394 static void 395 traverse_zil(dsl_pool_t *dp, zil_header_t *zh) 396 { 397 uint64_t claim_txg = zh->zh_claim_txg; 398 zil_traverse_arg_t zta = { dp, zh }; 399 zilog_t *zilog; 400 401 /* 402 * We only want to visit blocks that have been claimed but not yet 403 * replayed (or, in read-only mode, blocks that *would* be claimed). 404 */ 405 if (claim_txg == 0 && spa_writeable(dp->dp_spa)) 406 return; 407 408 zilog = zil_alloc(dp->dp_meta_objset, zh); 409 410 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta, 411 claim_txg); 412 413 zil_free(zilog); 414 } 415 416 static void 417 scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, 418 arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) 419 { 420 int err; 421 arc_buf_t *buf = NULL; 422 423 if (bp->blk_birth <= dp->dp_scrub_min_txg) 424 return; 425 426 if (scrub_pause(dp, zb)) 427 return; 428 429 if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { 430 /* 431 * If we already visited this bp & everything below (in 432 * a prior txg), don't bother doing it again. 433 */ 434 if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark)) 435 return; 436 437 /* 438 * If we found the block we're trying to resume from, or 439 * we went past it to a different object, zero it out to 440 * indicate that it's OK to start checking for pausing 441 * again. 442 */ 443 if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || 444 zb->zb_object > dp->dp_scrub_bookmark.zb_object) { 445 dprintf("resuming at %llx/%llx/%llx/%llx\n", 446 (longlong_t)zb->zb_objset, 447 (longlong_t)zb->zb_object, 448 (longlong_t)zb->zb_level, 449 (longlong_t)zb->zb_blkid); 450 bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); 451 } 452 } 453 454 if (BP_GET_LEVEL(bp) > 0) { 455 uint32_t flags = ARC_WAIT; 456 int i; 457 blkptr_t *cbp; 458 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 459 460 err = arc_read(NULL, dp->dp_spa, bp, pbuf, 461 arc_getbuf_func, &buf, 462 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 463 if (err) { 464 mutex_enter(&dp->dp_spa->spa_scrub_lock); 465 dp->dp_spa->spa_scrub_errors++; 466 mutex_exit(&dp->dp_spa->spa_scrub_lock); 467 return; 468 } 469 cbp = buf->b_data; 470 471 for (i = 0; i < epb; i++, cbp++) { 472 zbookmark_t czb; 473 474 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 475 zb->zb_level - 1, 476 zb->zb_blkid * epb + i); 477 scrub_visitbp(dp, dnp, buf, cbp, &czb); 478 } 479 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 480 uint32_t flags = ARC_WAIT; 481 dnode_phys_t *child_dnp; 482 int i; 483 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 484 485 err = arc_read(NULL, dp->dp_spa, bp, pbuf, 486 arc_getbuf_func, &buf, 487 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 488 if (err) { 489 mutex_enter(&dp->dp_spa->spa_scrub_lock); 490 dp->dp_spa->spa_scrub_errors++; 491 mutex_exit(&dp->dp_spa->spa_scrub_lock); 492 return; 493 } 494 child_dnp = buf->b_data; 495 496 for (i = 0; i < epb; i++, child_dnp++) { 497 scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset, 498 zb->zb_blkid * epb + i); 499 } 500 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 501 uint32_t flags = ARC_WAIT; 502 objset_phys_t *osp; 503 504 err = arc_read_nolock(NULL, dp->dp_spa, bp, 505 arc_getbuf_func, &buf, 506 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 507 if (err) { 508 mutex_enter(&dp->dp_spa->spa_scrub_lock); 509 dp->dp_spa->spa_scrub_errors++; 510 mutex_exit(&dp->dp_spa->spa_scrub_lock); 511 return; 512 } 513 514 osp = buf->b_data; 515 516 traverse_zil(dp, &osp->os_zil_header); 517 518 scrub_visitdnode(dp, &osp->os_meta_dnode, 519 buf, zb->zb_objset, DMU_META_DNODE_OBJECT); 520 if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { 521 scrub_visitdnode(dp, &osp->os_userused_dnode, 522 buf, zb->zb_objset, DMU_USERUSED_OBJECT); 523 scrub_visitdnode(dp, &osp->os_groupused_dnode, 524 buf, zb->zb_objset, DMU_GROUPUSED_OBJECT); 525 } 526 } 527 528 (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); 529 if (buf) 530 (void) arc_buf_remove_ref(buf, &buf); 531 } 532 533 static void 534 scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, 535 uint64_t objset, uint64_t object) 536 { 537 int j; 538 539 for (j = 0; j < dnp->dn_nblkptr; j++) { 540 zbookmark_t czb; 541 542 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); 543 scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); 544 } 545 546 } 547 548 static void 549 scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) 550 { 551 zbookmark_t zb; 552 553 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 554 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 555 scrub_visitbp(dp, NULL, NULL, bp, &zb); 556 } 557 558 void 559 dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) 560 { 561 dsl_pool_t *dp = ds->ds_dir->dd_pool; 562 563 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 564 return; 565 566 if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { 567 SET_BOOKMARK(&dp->dp_scrub_bookmark, ZB_DESTROYED_OBJSET, 568 0, 0, 0); 569 } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 570 ds->ds_object, tx) != 0) { 571 return; 572 } 573 574 if (ds->ds_phys->ds_next_snap_obj != 0) { 575 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 576 ds->ds_phys->ds_next_snap_obj, tx) == 0); 577 } 578 ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); 579 } 580 581 void 582 dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) 583 { 584 dsl_pool_t *dp = ds->ds_dir->dd_pool; 585 586 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 587 return; 588 589 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); 590 591 if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { 592 dp->dp_scrub_bookmark.zb_objset = 593 ds->ds_phys->ds_prev_snap_obj; 594 } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 595 ds->ds_object, tx) == 0) { 596 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 597 ds->ds_phys->ds_prev_snap_obj, tx) == 0); 598 } 599 } 600 601 void 602 dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) 603 { 604 dsl_pool_t *dp = ds1->ds_dir->dd_pool; 605 606 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 607 return; 608 609 if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) { 610 dp->dp_scrub_bookmark.zb_objset = ds2->ds_object; 611 } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) { 612 dp->dp_scrub_bookmark.zb_objset = ds1->ds_object; 613 } 614 615 if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 616 ds1->ds_object, tx) == 0) { 617 int err = zap_add_int(dp->dp_meta_objset, 618 dp->dp_scrub_queue_obj, ds2->ds_object, tx); 619 VERIFY(err == 0 || err == EEXIST); 620 if (err == EEXIST) { 621 /* Both were there to begin with */ 622 VERIFY(0 == zap_add_int(dp->dp_meta_objset, 623 dp->dp_scrub_queue_obj, ds1->ds_object, tx)); 624 } 625 } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 626 ds2->ds_object, tx) == 0) { 627 VERIFY(0 == zap_add_int(dp->dp_meta_objset, 628 dp->dp_scrub_queue_obj, ds1->ds_object, tx)); 629 } 630 } 631 632 struct enqueue_clones_arg { 633 dmu_tx_t *tx; 634 uint64_t originobj; 635 }; 636 637 /* ARGSUSED */ 638 static int 639 enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) 640 { 641 struct enqueue_clones_arg *eca = arg; 642 dsl_dataset_t *ds; 643 int err; 644 dsl_pool_t *dp; 645 646 err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); 647 if (err) 648 return (err); 649 dp = ds->ds_dir->dd_pool; 650 651 if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { 652 while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { 653 dsl_dataset_t *prev; 654 err = dsl_dataset_hold_obj(dp, 655 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); 656 657 dsl_dataset_rele(ds, FTAG); 658 if (err) 659 return (err); 660 ds = prev; 661 } 662 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 663 ds->ds_object, eca->tx) == 0); 664 } 665 dsl_dataset_rele(ds, FTAG); 666 return (0); 667 } 668 669 static void 670 scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) 671 { 672 dsl_dataset_t *ds; 673 uint64_t min_txg_save; 674 675 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 676 677 /* 678 * Iterate over the bps in this ds. 679 */ 680 min_txg_save = dp->dp_scrub_min_txg; 681 dp->dp_scrub_min_txg = 682 MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); 683 scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); 684 dp->dp_scrub_min_txg = min_txg_save; 685 686 if (dp->dp_scrub_pausing) 687 goto out; 688 689 /* 690 * Add descendent datasets to work queue. 691 */ 692 if (ds->ds_phys->ds_next_snap_obj != 0) { 693 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 694 ds->ds_phys->ds_next_snap_obj, tx) == 0); 695 } 696 if (ds->ds_phys->ds_num_children > 1) { 697 boolean_t usenext = B_FALSE; 698 if (ds->ds_phys->ds_next_clones_obj != 0) { 699 uint64_t count; 700 /* 701 * A bug in a previous version of the code could 702 * cause upgrade_clones_cb() to not set 703 * ds_next_snap_obj when it should, leading to a 704 * missing entry. Therefore we can only use the 705 * next_clones_obj when its count is correct. 706 */ 707 int err = zap_count(dp->dp_meta_objset, 708 ds->ds_phys->ds_next_clones_obj, &count); 709 if (err == 0 && 710 count == ds->ds_phys->ds_num_children - 1) 711 usenext = B_TRUE; 712 } 713 714 if (usenext) { 715 VERIFY(zap_join(dp->dp_meta_objset, 716 ds->ds_phys->ds_next_clones_obj, 717 dp->dp_scrub_queue_obj, tx) == 0); 718 } else { 719 struct enqueue_clones_arg eca; 720 eca.tx = tx; 721 eca.originobj = ds->ds_object; 722 723 (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, 724 NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); 725 } 726 } 727 728 out: 729 dsl_dataset_rele(ds, FTAG); 730 } 731 732 /* ARGSUSED */ 733 static int 734 enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) 735 { 736 dmu_tx_t *tx = arg; 737 dsl_dataset_t *ds; 738 int err; 739 dsl_pool_t *dp; 740 741 err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); 742 if (err) 743 return (err); 744 745 dp = ds->ds_dir->dd_pool; 746 747 while (ds->ds_phys->ds_prev_snap_obj != 0) { 748 dsl_dataset_t *prev; 749 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 750 FTAG, &prev); 751 if (err) { 752 dsl_dataset_rele(ds, FTAG); 753 return (err); 754 } 755 756 /* 757 * If this is a clone, we don't need to worry about it for now. 758 */ 759 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { 760 dsl_dataset_rele(ds, FTAG); 761 dsl_dataset_rele(prev, FTAG); 762 return (0); 763 } 764 dsl_dataset_rele(ds, FTAG); 765 ds = prev; 766 } 767 768 VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, 769 ds->ds_object, tx) == 0); 770 dsl_dataset_rele(ds, FTAG); 771 return (0); 772 } 773 774 static void 775 dsl_pool_scrub_ddt(dsl_pool_t *dp, enum zio_checksum c, enum ddt_type type, 776 enum ddt_class class) 777 { 778 ddt_t *ddt = ddt_select_by_checksum(dp->dp_spa, c); 779 ddt_entry_t dde; 780 blkptr_t blk; 781 zbookmark_t zb = { 0 }; 782 uint64_t walk = 0; 783 int error; 784 785 if (!ddt_object_exists(ddt, type, class)) 786 return; 787 788 while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) { 789 int p = DDT_PHYS_DITTO; 790 ddt_bp_create(ddt, &dde.dde_key, &dde.dde_phys[p], &blk); 791 scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); 792 } 793 ASSERT(error == ENOENT); 794 } 795 796 static void 797 dsl_pool_scrub_ditto(dsl_pool_t *dp) 798 { 799 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) 800 for (enum ddt_type type = 0; type < DDT_TYPES; type++) 801 dsl_pool_scrub_ddt(dp, c, type, DDT_CLASS_DITTO); 802 } 803 804 void 805 dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) 806 { 807 spa_t *spa = dp->dp_spa; 808 zap_cursor_t zc; 809 zap_attribute_t za; 810 boolean_t complete = B_TRUE; 811 812 if (dp->dp_scrub_func == SCRUB_FUNC_NONE) 813 return; 814 815 /* 816 * If the pool is not loaded, or is trying to unload, leave it alone. 817 */ 818 if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa)) 819 return; 820 821 if (dp->dp_scrub_restart) { 822 enum scrub_func func = dp->dp_scrub_func; 823 dp->dp_scrub_restart = B_FALSE; 824 dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); 825 } 826 827 if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { 828 /* 829 * We must have resumed after rebooting; reset the vdev 830 * stats to know that we're doing a scrub (although it 831 * will think we're just starting now). 832 */ 833 vdev_scrub_stat_update(spa->spa_root_vdev, 834 dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : 835 POOL_SCRUB_EVERYTHING, B_FALSE); 836 } 837 838 dp->dp_scrub_pausing = B_FALSE; 839 dp->dp_scrub_start_time = lbolt64; 840 dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); 841 spa->spa_scrub_active = B_TRUE; 842 843 if (!dp->dp_scrub_ditto) { 844 dsl_pool_scrub_ditto(dp); 845 dp->dp_scrub_ditto = B_TRUE; 846 } 847 848 if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) { 849 /* First do the MOS & ORIGIN */ 850 scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); 851 if (dp->dp_scrub_pausing) 852 goto out; 853 854 if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { 855 VERIFY(0 == dmu_objset_find_spa(spa, 856 NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); 857 } else { 858 scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); 859 } 860 ASSERT(!dp->dp_scrub_pausing); 861 } else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { 862 /* 863 * If we were paused, continue from here. Note if the ds 864 * we were paused on was destroyed, the zb_objset will be 865 * ZB_DESTROYED_OBJSET, so we will skip this and find a new 866 * objset below. 867 */ 868 scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); 869 if (dp->dp_scrub_pausing) 870 goto out; 871 } 872 873 /* 874 * In case we were paused right at the end of the ds, zero the 875 * bookmark so we don't think that we're still trying to resume. 876 */ 877 bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); 878 879 /* keep pulling things out of the zap-object-as-queue */ 880 while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), 881 zap_cursor_retrieve(&zc, &za) == 0) { 882 VERIFY(0 == zap_remove(dp->dp_meta_objset, 883 dp->dp_scrub_queue_obj, za.za_name, tx)); 884 scrub_visitds(dp, za.za_first_integer, tx); 885 if (dp->dp_scrub_pausing) 886 break; 887 zap_cursor_fini(&zc); 888 } 889 zap_cursor_fini(&zc); 890 if (dp->dp_scrub_pausing) 891 goto out; 892 893 /* done. */ 894 895 dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); 896 return; 897 out: 898 VERIFY(0 == zap_update(dp->dp_meta_objset, 899 DMU_POOL_DIRECTORY_OBJECT, 900 DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, 901 &dp->dp_scrub_bookmark, tx)); 902 VERIFY(0 == zap_update(dp->dp_meta_objset, 903 DMU_POOL_DIRECTORY_OBJECT, 904 DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, 905 &spa->spa_scrub_errors, tx)); 906 907 /* XXX this is scrub-clean specific */ 908 mutex_enter(&spa->spa_scrub_lock); 909 while (spa->spa_scrub_inflight > 0) 910 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 911 mutex_exit(&spa->spa_scrub_lock); 912 } 913 914 void 915 dsl_pool_scrub_restart(dsl_pool_t *dp) 916 { 917 mutex_enter(&dp->dp_scrub_cancel_lock); 918 dp->dp_scrub_restart = B_TRUE; 919 mutex_exit(&dp->dp_scrub_cancel_lock); 920 } 921 922 /* 923 * scrub consumers 924 */ 925 926 static void 927 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) 928 { 929 int i; 930 931 /* 932 * If we resume after a reboot, zab will be NULL; don't record 933 * incomplete stats in that case. 934 */ 935 if (zab == NULL) 936 return; 937 938 for (i = 0; i < 4; i++) { 939 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; 940 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; 941 zfs_blkstat_t *zb = &zab->zab_type[l][t]; 942 int equal; 943 944 zb->zb_count++; 945 zb->zb_asize += BP_GET_ASIZE(bp); 946 zb->zb_lsize += BP_GET_LSIZE(bp); 947 zb->zb_psize += BP_GET_PSIZE(bp); 948 zb->zb_gangs += BP_COUNT_GANG(bp); 949 950 switch (BP_GET_NDVAS(bp)) { 951 case 2: 952 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 953 DVA_GET_VDEV(&bp->blk_dva[1])) 954 zb->zb_ditto_2_of_2_samevdev++; 955 break; 956 case 3: 957 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == 958 DVA_GET_VDEV(&bp->blk_dva[1])) + 959 (DVA_GET_VDEV(&bp->blk_dva[0]) == 960 DVA_GET_VDEV(&bp->blk_dva[2])) + 961 (DVA_GET_VDEV(&bp->blk_dva[1]) == 962 DVA_GET_VDEV(&bp->blk_dva[2])); 963 if (equal == 1) 964 zb->zb_ditto_2_of_3_samevdev++; 965 else if (equal == 3) 966 zb->zb_ditto_3_of_3_samevdev++; 967 break; 968 } 969 } 970 } 971 972 static void 973 dsl_pool_scrub_clean_done(zio_t *zio) 974 { 975 spa_t *spa = zio->io_spa; 976 977 zio_data_buf_free(zio->io_data, zio->io_size); 978 979 mutex_enter(&spa->spa_scrub_lock); 980 spa->spa_scrub_inflight--; 981 cv_broadcast(&spa->spa_scrub_io_cv); 982 983 if (zio->io_error && (zio->io_error != ECKSUM || 984 !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) 985 spa->spa_scrub_errors++; 986 mutex_exit(&spa->spa_scrub_lock); 987 } 988 989 static int 990 dsl_pool_scrub_clean_cb(dsl_pool_t *dp, 991 const blkptr_t *bp, const zbookmark_t *zb) 992 { 993 size_t size = BP_GET_PSIZE(bp); 994 spa_t *spa = dp->dp_spa; 995 uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); 996 boolean_t needs_io; 997 int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; 998 int zio_priority; 999 1000 if (phys_birth <= dp->dp_scrub_min_txg || 1001 phys_birth >= dp->dp_scrub_max_txg) 1002 return (0); 1003 1004 count_block(dp->dp_blkstats, bp); 1005 1006 if (dp->dp_scrub_isresilver == 0) { 1007 /* It's a scrub */ 1008 zio_flags |= ZIO_FLAG_SCRUB; 1009 zio_priority = ZIO_PRIORITY_SCRUB; 1010 needs_io = B_TRUE; 1011 } else { 1012 /* It's a resilver */ 1013 zio_flags |= ZIO_FLAG_RESILVER; 1014 zio_priority = ZIO_PRIORITY_RESILVER; 1015 needs_io = B_FALSE; 1016 } 1017 1018 /* If it's an intent log block, failure is expected. */ 1019 if (zb->zb_level == ZB_ZIL_LEVEL) 1020 zio_flags |= ZIO_FLAG_SPECULATIVE; 1021 1022 for (int d = 0; d < BP_GET_NDVAS(bp); d++) { 1023 vdev_t *vd = vdev_lookup_top(spa, 1024 DVA_GET_VDEV(&bp->blk_dva[d])); 1025 1026 /* 1027 * Keep track of how much data we've examined so that 1028 * zpool(1M) status can make useful progress reports. 1029 */ 1030 mutex_enter(&vd->vdev_stat_lock); 1031 vd->vdev_stat.vs_scrub_examined += 1032 DVA_GET_ASIZE(&bp->blk_dva[d]); 1033 mutex_exit(&vd->vdev_stat_lock); 1034 1035 /* if it's a resilver, this may not be in the target range */ 1036 if (!needs_io) { 1037 if (DVA_GET_GANG(&bp->blk_dva[d])) { 1038 /* 1039 * Gang members may be spread across multiple 1040 * vdevs, so the best estimate we have is the 1041 * scrub range, which has already been checked. 1042 * XXX -- it would be better to change our 1043 * allocation policy to ensure that all 1044 * gang members reside on the same vdev. 1045 */ 1046 needs_io = B_TRUE; 1047 } else { 1048 needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, 1049 phys_birth, 1); 1050 } 1051 } 1052 } 1053 1054 if (needs_io && !zfs_no_scrub_io) { 1055 void *data = zio_data_buf_alloc(size); 1056 1057 mutex_enter(&spa->spa_scrub_lock); 1058 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) 1059 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1060 spa->spa_scrub_inflight++; 1061 mutex_exit(&spa->spa_scrub_lock); 1062 1063 zio_nowait(zio_read(NULL, spa, bp, data, size, 1064 dsl_pool_scrub_clean_done, NULL, zio_priority, 1065 zio_flags, zb)); 1066 } 1067 1068 /* do not relocate this block */ 1069 return (0); 1070 } 1071 1072 int 1073 dsl_pool_scrub_clean(dsl_pool_t *dp) 1074 { 1075 spa_t *spa = dp->dp_spa; 1076 1077 /* 1078 * Purge all vdev caches and probe all devices. We do this here 1079 * rather than in sync context because this requires a writer lock 1080 * on the spa_config lock, which we can't do from sync context. The 1081 * spa_scrub_reopen flag indicates that vdev_open() should not 1082 * attempt to start another scrub. 1083 */ 1084 spa_vdev_state_enter(spa, SCL_NONE); 1085 spa->spa_scrub_reopen = B_TRUE; 1086 vdev_reopen(spa->spa_root_vdev); 1087 spa->spa_scrub_reopen = B_FALSE; 1088 (void) spa_vdev_state_exit(spa, NULL, 0); 1089 1090 return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); 1091 } 1092