/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
24 789 ahrens */ 25 789 ahrens 26 789 ahrens #include <sys/zfs_context.h> 27 1544 eschrock #include <sys/fm/fs/zfs.h> 28 789 ahrens #include <sys/spa.h> 29 789 ahrens #include <sys/txg.h> 30 789 ahrens #include <sys/spa_impl.h> 31 789 ahrens #include <sys/vdev_impl.h> 32 789 ahrens #include <sys/zio_impl.h> 33 789 ahrens #include <sys/zio_compress.h> 34 789 ahrens #include <sys/zio_checksum.h> 35 10922 Jeff #include <sys/dmu_objset.h> 36 10922 Jeff #include <sys/arc.h> 37 10922 Jeff #include <sys/ddt.h> 38 789 ahrens 39 789 ahrens /* 40 789 ahrens * ========================================================================== 41 789 ahrens * I/O priority table 42 789 ahrens * ========================================================================== 43 789 ahrens */ 44 789 ahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 45 789 ahrens 0, /* ZIO_PRIORITY_NOW */ 46 789 ahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 47 789 ahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 48 11146 George 0, /* ZIO_PRIORITY_LOG_WRITE */ 49 11146 George 1, /* ZIO_PRIORITY_CACHE_FILL */ 50 11146 George 1, /* ZIO_PRIORITY_AGG */ 51 11146 George 4, /* ZIO_PRIORITY_FREE */ 52 11146 George 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 53 789 ahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 54 789 ahrens 10, /* ZIO_PRIORITY_RESILVER */ 55 789 ahrens 20, /* ZIO_PRIORITY_SCRUB */ 56 789 ahrens }; 57 789 ahrens 58 789 ahrens /* 59 789 ahrens * ========================================================================== 60 789 ahrens * I/O type descriptions 61 789 ahrens * ========================================================================== 62 789 ahrens */ 63 789 ahrens char *zio_type_name[ZIO_TYPES] = { 64 11146 George "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 65 11146 George "zio_ioctl" 66 11146 George }; 67 3668 gw25295 68 789 ahrens /* 69 789 ahrens * ========================================================================== 70 789 ahrens * I/O kmem caches 71 789 ahrens * 
========================================================================== 72 789 ahrens */ 73 4055 eschrock kmem_cache_t *zio_cache; 74 8632 Bill kmem_cache_t *zio_link_cache; 75 789 ahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 76 3290 johansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 77 3290 johansen 78 3290 johansen #ifdef _KERNEL 79 3290 johansen extern vmem_t *zio_alloc_arena; 80 3290 johansen #endif 81 5329 gw25295 82 5329 gw25295 /* 83 7754 Jeff * An allocating zio is one that either currently has the DVA allocate 84 7754 Jeff * stage set or will have it later in its lifetime. 85 5329 gw25295 */ 86 10922 Jeff #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 87 11173 Jonathan 88 11173 Jonathan boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 89 10922 Jeff 90 10922 Jeff #ifdef ZFS_DEBUG 91 10922 Jeff int zio_buf_debug_limit = 16384; 92 10922 Jeff #else 93 10922 Jeff int zio_buf_debug_limit = 0; 94 10922 Jeff #endif 95 789 ahrens 96 789 ahrens void 97 789 ahrens zio_init(void) 98 789 ahrens { 99 789 ahrens size_t c; 100 3290 johansen vmem_t *data_alloc_arena = NULL; 101 3290 johansen 102 3290 johansen #ifdef _KERNEL 103 3290 johansen data_alloc_arena = zio_alloc_arena; 104 3290 johansen #endif 105 8632 Bill zio_cache = kmem_cache_create("zio_cache", 106 8632 Bill sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 107 8632 Bill zio_link_cache = kmem_cache_create("zio_link_cache", 108 8632 Bill sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 109 789 ahrens 110 789 ahrens /* 111 789 ahrens * For small buffers, we want a cache for each multiple of 112 789 ahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 113 789 ahrens * for each quarter-power of 2. For large buffers, we want 114 789 ahrens * a cache for each multiple of PAGESIZE. 
115 789 ahrens */ 116 789 ahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 117 789 ahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 118 789 ahrens size_t p2 = size; 119 789 ahrens size_t align = 0; 120 789 ahrens 121 789 ahrens while (p2 & (p2 - 1)) 122 789 ahrens p2 &= p2 - 1; 123 789 ahrens 124 789 ahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 125 789 ahrens align = SPA_MINBLOCKSIZE; 126 789 ahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 127 789 ahrens align = PAGESIZE; 128 789 ahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 129 789 ahrens align = p2 >> 2; 130 789 ahrens } 131 789 ahrens 132 789 ahrens if (align != 0) { 133 3290 johansen char name[36]; 134 2856 nd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 135 789 ahrens zio_buf_cache[c] = kmem_cache_create(name, size, 136 10922 Jeff align, NULL, NULL, NULL, NULL, NULL, 137 10922 Jeff size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 138 3290 johansen 139 3290 johansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 140 3290 johansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 141 3290 johansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 142 10922 Jeff size > zio_buf_debug_limit ? 
KMC_NODEBUG : 0); 143 789 ahrens } 144 789 ahrens } 145 789 ahrens 146 789 ahrens while (--c != 0) { 147 789 ahrens ASSERT(zio_buf_cache[c] != NULL); 148 789 ahrens if (zio_buf_cache[c - 1] == NULL) 149 789 ahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 150 3290 johansen 151 3290 johansen ASSERT(zio_data_buf_cache[c] != NULL); 152 3290 johansen if (zio_data_buf_cache[c - 1] == NULL) 153 3290 johansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 154 789 ahrens } 155 5329 gw25295 156 1544 eschrock zio_inject_init(); 157 789 ahrens } 158 789 ahrens 159 789 ahrens void 160 789 ahrens zio_fini(void) 161 789 ahrens { 162 789 ahrens size_t c; 163 789 ahrens kmem_cache_t *last_cache = NULL; 164 3290 johansen kmem_cache_t *last_data_cache = NULL; 165 789 ahrens 166 789 ahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 167 789 ahrens if (zio_buf_cache[c] != last_cache) { 168 789 ahrens last_cache = zio_buf_cache[c]; 169 789 ahrens kmem_cache_destroy(zio_buf_cache[c]); 170 789 ahrens } 171 789 ahrens zio_buf_cache[c] = NULL; 172 3290 johansen 173 3290 johansen if (zio_data_buf_cache[c] != last_data_cache) { 174 3290 johansen last_data_cache = zio_data_buf_cache[c]; 175 3290 johansen kmem_cache_destroy(zio_data_buf_cache[c]); 176 3290 johansen } 177 3290 johansen zio_data_buf_cache[c] = NULL; 178 789 ahrens } 179 4055 eschrock 180 8632 Bill kmem_cache_destroy(zio_link_cache); 181 4055 eschrock kmem_cache_destroy(zio_cache); 182 1544 eschrock 183 1544 eschrock zio_inject_fini(); 184 789 ahrens } 185 789 ahrens 186 789 ahrens /* 187 789 ahrens * ========================================================================== 188 789 ahrens * Allocate and free I/O buffers 189 789 ahrens * ========================================================================== 190 789 ahrens */ 191 3290 johansen 192 3290 johansen /* 193 3290 johansen * Use zio_buf_alloc to allocate ZFS metadata. 
This data will appear in a 194 3290 johansen * crashdump if the kernel panics, so use it judiciously. Obviously, it's 195 3290 johansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 196 3290 johansen * excess / transient data in-core during a crashdump. 197 3290 johansen */ 198 789 ahrens void * 199 789 ahrens zio_buf_alloc(size_t size) 200 789 ahrens { 201 789 ahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 202 789 ahrens 203 789 ahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 204 789 ahrens 205 6245 maybee return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 206 3290 johansen } 207 3290 johansen 208 3290 johansen /* 209 3290 johansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 210 3290 johansen * crashdump if the kernel panics. This exists so that we will limit the amount 211 3290 johansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 212 3290 johansen * of kernel heap dumped to disk when the kernel panics) 213 3290 johansen */ 214 3290 johansen void * 215 3290 johansen zio_data_buf_alloc(size_t size) 216 3290 johansen { 217 3290 johansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 218 3290 johansen 219 3290 johansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 220 3290 johansen 221 6245 maybee return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 222 789 ahrens } 223 789 ahrens 224 789 ahrens void 225 789 ahrens zio_buf_free(void *buf, size_t size) 226 789 ahrens { 227 789 ahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 228 789 ahrens 229 789 ahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 230 789 ahrens 231 789 ahrens kmem_cache_free(zio_buf_cache[c], buf); 232 789 ahrens } 233 789 ahrens 234 3290 johansen void 235 3290 johansen zio_data_buf_free(void *buf, size_t size) 236 3290 johansen { 237 3290 johansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 238 3290 johansen 239 3290 johansen ASSERT(c < SPA_MAXBLOCKSIZE >> 
SPA_MINBLOCKSHIFT); 240 3290 johansen 241 3290 johansen kmem_cache_free(zio_data_buf_cache[c], buf); 242 3290 johansen } 243 3463 ahrens 244 789 ahrens /* 245 789 ahrens * ========================================================================== 246 789 ahrens * Push and pop I/O transform buffers 247 789 ahrens * ========================================================================== 248 789 ahrens */ 249 789 ahrens static void 250 7754 Jeff zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 251 7754 Jeff zio_transform_func_t *transform) 252 789 ahrens { 253 789 ahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 254 789 ahrens 255 7754 Jeff zt->zt_orig_data = zio->io_data; 256 7754 Jeff zt->zt_orig_size = zio->io_size; 257 789 ahrens zt->zt_bufsize = bufsize; 258 7754 Jeff zt->zt_transform = transform; 259 789 ahrens 260 789 ahrens zt->zt_next = zio->io_transform_stack; 261 789 ahrens zio->io_transform_stack = zt; 262 789 ahrens 263 789 ahrens zio->io_data = data; 264 789 ahrens zio->io_size = size; 265 789 ahrens } 266 789 ahrens 267 789 ahrens static void 268 7754 Jeff zio_pop_transforms(zio_t *zio) 269 789 ahrens { 270 7754 Jeff zio_transform_t *zt; 271 789 ahrens 272 7754 Jeff while ((zt = zio->io_transform_stack) != NULL) { 273 7754 Jeff if (zt->zt_transform != NULL) 274 7754 Jeff zt->zt_transform(zio, 275 7754 Jeff zt->zt_orig_data, zt->zt_orig_size); 276 789 ahrens 277 10922 Jeff if (zt->zt_bufsize != 0) 278 10922 Jeff zio_buf_free(zio->io_data, zt->zt_bufsize); 279 789 ahrens 280 7754 Jeff zio->io_data = zt->zt_orig_data; 281 7754 Jeff zio->io_size = zt->zt_orig_size; 282 7754 Jeff zio->io_transform_stack = zt->zt_next; 283 789 ahrens 284 7754 Jeff kmem_free(zt, sizeof (zio_transform_t)); 285 789 ahrens } 286 789 ahrens } 287 789 ahrens 288 789 ahrens /* 289 789 ahrens * ========================================================================== 290 7754 Jeff * I/O transform callbacks for subblocks and 
decompression 291 7754 Jeff * ========================================================================== 292 7754 Jeff */ 293 7754 Jeff static void 294 7754 Jeff zio_subblock(zio_t *zio, void *data, uint64_t size) 295 7754 Jeff { 296 7754 Jeff ASSERT(zio->io_size > size); 297 7754 Jeff 298 7754 Jeff if (zio->io_type == ZIO_TYPE_READ) 299 7754 Jeff bcopy(zio->io_data, data, size); 300 7754 Jeff } 301 7754 Jeff 302 7754 Jeff static void 303 7754 Jeff zio_decompress(zio_t *zio, void *data, uint64_t size) 304 7754 Jeff { 305 7754 Jeff if (zio->io_error == 0 && 306 7754 Jeff zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 307 10922 Jeff zio->io_data, data, zio->io_size, size) != 0) 308 7754 Jeff zio->io_error = EIO; 309 7754 Jeff } 310 7754 Jeff 311 7754 Jeff /* 312 7754 Jeff * ========================================================================== 313 7754 Jeff * I/O parent/child relationships and pipeline interlocks 314 7754 Jeff * ========================================================================== 315 7754 Jeff */ 316 8632 Bill /* 317 8632 Bill * NOTE - Callers to zio_walk_parents() and zio_walk_children must 318 8632 Bill * continue calling these functions until they return NULL. 319 8632 Bill * Otherwise, the next caller will pick up the list walk in 320 8632 Bill * some indeterminate state. (Otherwise every caller would 321 8632 Bill * have to pass in a cookie to keep the state represented by 322 8632 Bill * io_walk_link, which gets annoying.) 323 8632 Bill */ 324 8632 Bill zio_t * 325 8632 Bill zio_walk_parents(zio_t *cio) 326 8632 Bill { 327 8632 Bill zio_link_t *zl = cio->io_walk_link; 328 8632 Bill list_t *pl = &cio->io_parent_list; 329 7754 Jeff 330 8632 Bill zl = (zl == NULL) ? 
list_head(pl) : list_next(pl, zl); 331 8632 Bill cio->io_walk_link = zl; 332 8632 Bill 333 8632 Bill if (zl == NULL) 334 8632 Bill return (NULL); 335 8632 Bill 336 8632 Bill ASSERT(zl->zl_child == cio); 337 8632 Bill return (zl->zl_parent); 338 8632 Bill } 339 8632 Bill 340 8632 Bill zio_t * 341 8632 Bill zio_walk_children(zio_t *pio) 342 7754 Jeff { 343 8632 Bill zio_link_t *zl = pio->io_walk_link; 344 8632 Bill list_t *cl = &pio->io_child_list; 345 8632 Bill 346 8632 Bill zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 347 8632 Bill pio->io_walk_link = zl; 348 8632 Bill 349 8632 Bill if (zl == NULL) 350 8632 Bill return (NULL); 351 8632 Bill 352 8632 Bill ASSERT(zl->zl_parent == pio); 353 8632 Bill return (zl->zl_child); 354 8632 Bill } 355 8632 Bill 356 8632 Bill zio_t * 357 8632 Bill zio_unique_parent(zio_t *cio) 358 8632 Bill { 359 8632 Bill zio_t *pio = zio_walk_parents(cio); 360 8632 Bill 361 8632 Bill VERIFY(zio_walk_parents(cio) == NULL); 362 8632 Bill return (pio); 363 8632 Bill } 364 8632 Bill 365 8632 Bill void 366 8632 Bill zio_add_child(zio_t *pio, zio_t *cio) 367 8632 Bill { 368 8632 Bill zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 369 8632 Bill 370 8632 Bill /* 371 8632 Bill * Logical I/Os can have logical, gang, or vdev children. 372 8632 Bill * Gang I/Os can have gang or vdev children. 373 8632 Bill * Vdev I/Os can only have vdev children. 374 8632 Bill * The following ASSERT captures all of these constraints. 
375 8632 Bill */ 376 8632 Bill ASSERT(cio->io_child_type <= pio->io_child_type); 377 8632 Bill 378 8632 Bill zl->zl_parent = pio; 379 8632 Bill zl->zl_child = cio; 380 8632 Bill 381 8632 Bill mutex_enter(&cio->io_lock); 382 7754 Jeff mutex_enter(&pio->io_lock); 383 8632 Bill 384 8632 Bill ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 385 8632 Bill 386 8632 Bill for (int w = 0; w < ZIO_WAIT_TYPES; w++) 387 8632 Bill pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 388 8632 Bill 389 8632 Bill list_insert_head(&pio->io_child_list, zl); 390 8632 Bill list_insert_head(&cio->io_parent_list, zl); 391 8632 Bill 392 10922 Jeff pio->io_child_count++; 393 10922 Jeff cio->io_parent_count++; 394 10922 Jeff 395 7754 Jeff mutex_exit(&pio->io_lock); 396 8632 Bill mutex_exit(&cio->io_lock); 397 7754 Jeff } 398 7754 Jeff 399 7754 Jeff static void 400 8632 Bill zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 401 7754 Jeff { 402 8632 Bill ASSERT(zl->zl_parent == pio); 403 8632 Bill ASSERT(zl->zl_child == cio); 404 7754 Jeff 405 8632 Bill mutex_enter(&cio->io_lock); 406 8632 Bill mutex_enter(&pio->io_lock); 407 7754 Jeff 408 8632 Bill list_remove(&pio->io_child_list, zl); 409 8632 Bill list_remove(&cio->io_parent_list, zl); 410 8632 Bill 411 10922 Jeff pio->io_child_count--; 412 10922 Jeff cio->io_parent_count--; 413 10922 Jeff 414 7754 Jeff mutex_exit(&pio->io_lock); 415 8632 Bill mutex_exit(&cio->io_lock); 416 8632 Bill 417 8632 Bill kmem_cache_free(zio_link_cache, zl); 418 7754 Jeff } 419 7754 Jeff 420 7754 Jeff static boolean_t 421 7754 Jeff zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 422 7754 Jeff { 423 7754 Jeff uint64_t *countp = &zio->io_children[child][wait]; 424 7754 Jeff boolean_t waiting = B_FALSE; 425 7754 Jeff 426 7754 Jeff mutex_enter(&zio->io_lock); 427 7754 Jeff ASSERT(zio->io_stall == NULL); 428 7754 Jeff if (*countp != 0) { 429 10922 Jeff zio->io_stage >>= 1; 430 7754 Jeff zio->io_stall = countp; 431 7754 Jeff 
waiting = B_TRUE; 432 7754 Jeff } 433 7754 Jeff mutex_exit(&zio->io_lock); 434 7754 Jeff 435 7754 Jeff return (waiting); 436 7754 Jeff } 437 7754 Jeff 438 7754 Jeff static void 439 7754 Jeff zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 440 7754 Jeff { 441 7754 Jeff uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 442 7754 Jeff int *errorp = &pio->io_child_error[zio->io_child_type]; 443 7754 Jeff 444 7754 Jeff mutex_enter(&pio->io_lock); 445 7754 Jeff if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 446 7754 Jeff *errorp = zio_worst_error(*errorp, zio->io_error); 447 7754 Jeff pio->io_reexecute |= zio->io_reexecute; 448 7754 Jeff ASSERT3U(*countp, >, 0); 449 7754 Jeff if (--*countp == 0 && pio->io_stall == countp) { 450 7754 Jeff pio->io_stall = NULL; 451 7754 Jeff mutex_exit(&pio->io_lock); 452 7754 Jeff zio_execute(pio); 453 7754 Jeff } else { 454 7754 Jeff mutex_exit(&pio->io_lock); 455 7754 Jeff } 456 7754 Jeff } 457 7754 Jeff 458 7754 Jeff static void 459 7754 Jeff zio_inherit_child_errors(zio_t *zio, enum zio_child c) 460 7754 Jeff { 461 7754 Jeff if (zio->io_child_error[c] != 0 && zio->io_error == 0) 462 7754 Jeff zio->io_error = zio->io_child_error[c]; 463 7754 Jeff } 464 7754 Jeff 465 7754 Jeff /* 466 7754 Jeff * ========================================================================== 467 7754 Jeff * Create the various types of I/O (read, write, free, etc) 468 789 ahrens * ========================================================================== 469 789 ahrens */ 470 789 ahrens static zio_t * 471 10922 Jeff zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 472 789 ahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 473 10922 Jeff zio_type_t type, int priority, enum zio_flag flags, 474 10922 Jeff vdev_t *vd, uint64_t offset, const zbookmark_t *zb, 475 10922 Jeff enum zio_stage stage, enum zio_stage pipeline) 476 789 ahrens { 477 789 ahrens zio_t *zio; 478 789 
ahrens 479 789 ahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 480 789 ahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 481 7754 Jeff ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 482 789 ahrens 483 7754 Jeff ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 484 7754 Jeff ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 485 7754 Jeff ASSERT(vd || stage == ZIO_STAGE_OPEN); 486 7046 ahrens 487 4055 eschrock zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 488 4055 eschrock bzero(zio, sizeof (zio_t)); 489 7754 Jeff 490 7754 Jeff mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 491 7754 Jeff cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 492 7754 Jeff 493 8632 Bill list_create(&zio->io_parent_list, sizeof (zio_link_t), 494 8632 Bill offsetof(zio_link_t, zl_parent_node)); 495 8632 Bill list_create(&zio->io_child_list, sizeof (zio_link_t), 496 8632 Bill offsetof(zio_link_t, zl_child_node)); 497 8632 Bill 498 7754 Jeff if (vd != NULL) 499 7754 Jeff zio->io_child_type = ZIO_CHILD_VDEV; 500 7754 Jeff else if (flags & ZIO_FLAG_GANG_CHILD) 501 7754 Jeff zio->io_child_type = ZIO_CHILD_GANG; 502 10922 Jeff else if (flags & ZIO_FLAG_DDT_CHILD) 503 10922 Jeff zio->io_child_type = ZIO_CHILD_DDT; 504 7754 Jeff else 505 7754 Jeff zio->io_child_type = ZIO_CHILD_LOGICAL; 506 7754 Jeff 507 789 ahrens if (bp != NULL) { 508 10922 Jeff zio->io_bp = (blkptr_t *)bp; 509 789 ahrens zio->io_bp_copy = *bp; 510 789 ahrens zio->io_bp_orig = *bp; 511 10922 Jeff if (type != ZIO_TYPE_WRITE || 512 10922 Jeff zio->io_child_type == ZIO_CHILD_DDT) 513 7754 Jeff zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 514 9443 Bill if (zio->io_child_type == ZIO_CHILD_LOGICAL) 515 7754 Jeff zio->io_logical = zio; 516 9443 Bill if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 517 9443 Bill pipeline |= ZIO_GANG_STAGES; 518 789 ahrens } 519 7754 Jeff 520 7754 Jeff zio->io_spa = spa; 521 7754 Jeff zio->io_txg = txg; 522 789 ahrens zio->io_done = done; 523 789 ahrens zio->io_private 
= private; 524 789 ahrens zio->io_type = type; 525 789 ahrens zio->io_priority = priority; 526 7754 Jeff zio->io_vd = vd; 527 7754 Jeff zio->io_offset = offset; 528 10922 Jeff zio->io_orig_data = zio->io_data = data; 529 10922 Jeff zio->io_orig_size = zio->io_size = size; 530 7754 Jeff zio->io_orig_flags = zio->io_flags = flags; 531 7754 Jeff zio->io_orig_stage = zio->io_stage = stage; 532 7754 Jeff zio->io_orig_pipeline = zio->io_pipeline = pipeline; 533 789 ahrens 534 8632 Bill zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 535 8632 Bill zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 536 8632 Bill 537 7754 Jeff if (zb != NULL) 538 7754 Jeff zio->io_bookmark = *zb; 539 7754 Jeff 540 7754 Jeff if (pio != NULL) { 541 7754 Jeff if (zio->io_logical == NULL) 542 1544 eschrock zio->io_logical = pio->io_logical; 543 9443 Bill if (zio->io_child_type == ZIO_CHILD_GANG) 544 9443 Bill zio->io_gang_leader = pio->io_gang_leader; 545 7754 Jeff zio_add_child(pio, zio); 546 789 ahrens } 547 7046 ahrens 548 789 ahrens return (zio); 549 5329 gw25295 } 550 5329 gw25295 551 5329 gw25295 static void 552 7754 Jeff zio_destroy(zio_t *zio) 553 5329 gw25295 { 554 8632 Bill list_destroy(&zio->io_parent_list); 555 8632 Bill list_destroy(&zio->io_child_list); 556 7754 Jeff mutex_destroy(&zio->io_lock); 557 7754 Jeff cv_destroy(&zio->io_cv); 558 7754 Jeff kmem_cache_free(zio_cache, zio); 559 789 ahrens } 560 789 ahrens 561 789 ahrens zio_t * 562 8632 Bill zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 563 10922 Jeff void *private, enum zio_flag flags) 564 789 ahrens { 565 789 ahrens zio_t *zio; 566 789 ahrens 567 789 ahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 568 8632 Bill ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 569 7754 Jeff ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 570 789 ahrens 571 789 ahrens return (zio); 572 789 ahrens } 573 789 ahrens 574 789 ahrens zio_t * 575 10922 Jeff zio_root(spa_t *spa, 
zio_done_func_t *done, void *private, enum zio_flag flags) 576 789 ahrens { 577 8632 Bill return (zio_null(NULL, spa, NULL, done, private, flags)); 578 789 ahrens } 579 789 ahrens 580 789 ahrens zio_t * 581 7754 Jeff zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 582 7754 Jeff void *data, uint64_t size, zio_done_func_t *done, void *private, 583 10922 Jeff int priority, enum zio_flag flags, const zbookmark_t *zb) 584 789 ahrens { 585 789 ahrens zio_t *zio; 586 789 ahrens 587 10922 Jeff zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 588 7046 ahrens data, size, done, private, 589 7754 Jeff ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 590 10922 Jeff ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 591 10922 Jeff ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 592 789 ahrens 593 789 ahrens return (zio); 594 7872 Tim } 595 7872 Tim 596 789 ahrens zio_t * 597 7754 Jeff zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 598 10922 Jeff void *data, uint64_t size, const zio_prop_t *zp, 599 7754 Jeff zio_done_func_t *ready, zio_done_func_t *done, void *private, 600 10922 Jeff int priority, enum zio_flag flags, const zbookmark_t *zb) 601 789 ahrens { 602 789 ahrens zio_t *zio; 603 789 ahrens 604 7754 Jeff ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 605 7754 Jeff zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 606 7754 Jeff zp->zp_compress >= ZIO_COMPRESS_OFF && 607 7754 Jeff zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 608 7754 Jeff zp->zp_type < DMU_OT_NUMTYPES && 609 7754 Jeff zp->zp_level < 32 && 610 10922 Jeff zp->zp_copies > 0 && 611 10922 Jeff zp->zp_copies <= spa_max_replication(spa) && 612 10922 Jeff zp->zp_dedup <= 1 && 613 10922 Jeff zp->zp_dedup_verify <= 1); 614 789 ahrens 615 789 ahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 616 7754 Jeff ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 617 10922 Jeff ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
618 10922 Jeff ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 619 3547 maybee 620 3547 maybee zio->io_ready = ready; 621 7754 Jeff zio->io_prop = *zp; 622 789 ahrens 623 789 ahrens return (zio); 624 789 ahrens } 625 789 ahrens 626 789 ahrens zio_t * 627 7754 Jeff zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 628 7754 Jeff uint64_t size, zio_done_func_t *done, void *private, int priority, 629 10922 Jeff enum zio_flag flags, zbookmark_t *zb) 630 789 ahrens { 631 789 ahrens zio_t *zio; 632 789 ahrens 633 7181 perrin zio = zio_create(pio, spa, txg, bp, data, size, done, private, 634 7754 Jeff ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 635 7754 Jeff ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 636 789 ahrens 637 789 ahrens return (zio); 638 789 ahrens } 639 789 ahrens 640 10922 Jeff void 641 10922 Jeff zio_write_override(zio_t *zio, blkptr_t *bp, int copies) 642 10922 Jeff { 643 10922 Jeff ASSERT(zio->io_type == ZIO_TYPE_WRITE); 644 10922 Jeff ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 645 10922 Jeff ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 646 10922 Jeff ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 647 10922 Jeff 648 10922 Jeff zio->io_prop.zp_copies = copies; 649 10922 Jeff zio->io_bp_override = bp; 650 10922 Jeff } 651 10922 Jeff 652 10922 Jeff void 653 10922 Jeff zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 654 10922 Jeff { 655 10922 Jeff bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp); 656 10922 Jeff } 657 10922 Jeff 658 789 ahrens zio_t * 659 10922 Jeff zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 660 10922 Jeff enum zio_flag flags) 661 789 ahrens { 662 789 ahrens zio_t *zio; 663 789 ahrens 664 789 ahrens ASSERT(!BP_IS_HOLE(bp)); 665 10922 Jeff ASSERT(spa_syncing_txg(spa) == txg); 666 10922 Jeff ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); 667 789 ahrens 668 7754 Jeff zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 669 10922 Jeff NULL, NULL, 
ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, 670 7754 Jeff NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 671 789 ahrens 672 789 ahrens return (zio); 673 789 ahrens } 674 789 ahrens 675 789 ahrens zio_t * 676 10922 Jeff zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 677 10922 Jeff zio_done_func_t *done, void *private, enum zio_flag flags) 678 789 ahrens { 679 789 ahrens zio_t *zio; 680 789 ahrens 681 789 ahrens /* 682 789 ahrens * A claim is an allocation of a specific block. Claims are needed 683 789 ahrens * to support immediate writes in the intent log. The issue is that 684 789 ahrens * immediate writes contain committed data, but in a txg that was 685 789 ahrens * *not* committed. Upon opening the pool after an unclean shutdown, 686 789 ahrens * the intent log claims all blocks that contain immediate write data 687 789 ahrens * so that the SPA knows they're in use. 688 789 ahrens * 689 789 ahrens * All claims *must* be resolved in the first txg -- before the SPA 690 789 ahrens * starts allocating blocks -- so that nothing is allocated twice. 691 10922 Jeff * If txg == 0 we just verify that the block is claimable. 
692 789 ahrens */ 693 789 ahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 694 10922 Jeff ASSERT(txg == spa_first_txg(spa) || txg == 0); 695 10922 Jeff ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 696 789 ahrens 697 7754 Jeff zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 698 7754 Jeff done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 699 7754 Jeff NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 700 789 ahrens 701 789 ahrens return (zio); 702 789 ahrens } 703 789 ahrens 704 789 ahrens zio_t * 705 789 ahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 706 10922 Jeff zio_done_func_t *done, void *private, int priority, enum zio_flag flags) 707 789 ahrens { 708 789 ahrens zio_t *zio; 709 789 ahrens int c; 710 789 ahrens 711 789 ahrens if (vd->vdev_children == 0) { 712 789 ahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 713 7754 Jeff ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, 714 789 ahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 715 789 ahrens 716 789 ahrens zio->io_cmd = cmd; 717 789 ahrens } else { 718 8632 Bill zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 719 789 ahrens 720 789 ahrens for (c = 0; c < vd->vdev_children; c++) 721 789 ahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 722 789 ahrens done, private, priority, flags)); 723 789 ahrens } 724 789 ahrens 725 789 ahrens return (zio); 726 789 ahrens } 727 789 ahrens 728 789 ahrens zio_t * 729 789 ahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 730 789 ahrens void *data, int checksum, zio_done_func_t *done, void *private, 731 10922 Jeff int priority, enum zio_flag flags, boolean_t labels) 732 789 ahrens { 733 789 ahrens zio_t *zio; 734 789 ahrens 735 7754 Jeff ASSERT(vd->vdev_children == 0); 736 7754 Jeff ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 737 7754 Jeff offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 738 7754 Jeff ASSERT3U(offset + size, 
<=, vd->vdev_psize); 739 5329 gw25295 740 7754 Jeff zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 741 7754 Jeff ZIO_TYPE_READ, priority, flags, vd, offset, NULL, 742 789 ahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 743 789 ahrens 744 7754 Jeff zio->io_prop.zp_checksum = checksum; 745 789 ahrens 746 789 ahrens return (zio); 747 789 ahrens } 748 789 ahrens 749 789 ahrens zio_t * 750 789 ahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 751 789 ahrens void *data, int checksum, zio_done_func_t *done, void *private, 752 10922 Jeff int priority, enum zio_flag flags, boolean_t labels) 753 789 ahrens { 754 789 ahrens zio_t *zio; 755 5329 gw25295 756 7754 Jeff ASSERT(vd->vdev_children == 0); 757 7754 Jeff ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 758 7754 Jeff offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 759 7754 Jeff ASSERT3U(offset + size, <=, vd->vdev_psize); 760 789 ahrens 761 7754 Jeff zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 762 7754 Jeff ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, 763 789 ahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 764 789 ahrens 765 7754 Jeff zio->io_prop.zp_checksum = checksum; 766 789 ahrens 767 789 ahrens if (zio_checksum_table[checksum].ci_zbt) { 768 789 ahrens /* 769 789 ahrens * zbt checksums are necessarily destructive -- they modify 770 7754 Jeff * the end of the write buffer to hold the verifier/checksum. 771 789 ahrens * Therefore, we must make a local copy in case the data is 772 7754 Jeff * being written to multiple places in parallel. 773 789 ahrens */ 774 7754 Jeff void *wbuf = zio_buf_alloc(size); 775 789 ahrens bcopy(data, wbuf, size); 776 7754 Jeff zio_push_transform(zio, wbuf, size, size, NULL); 777 789 ahrens } 778 789 ahrens 779 789 ahrens return (zio); 780 789 ahrens } 781 789 ahrens 782 789 ahrens /* 783 7754 Jeff * Create a child I/O to do some work for us. 
 */
/*
 * The child is created against 'vd', which is asserted to be a direct
 * child of pio's vdev (or of the pool's root vdev when pio has no vdev).
 * It takes its spa, txg and bookmark from 'pio', and is created with
 * ZIO_STAGE_VDEV_IO_START >> 1 as its stage argument to zio_create().
 * For a vdev with no children, 'offset' is advanced by
 * VDEV_LABEL_START_SIZE before the zio is created.
 *
 * Side effect: for a read with a non-NULL 'bp', checksum verification is
 * added to the child's pipeline and removed from pio->io_pipeline.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not. This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	/* Only childless vdevs have their offset shifted past the labels. */
	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	return (zio);
}

/*
 * Create a parentless I/O against leaf vdev 'vd' (asserted via
 * vdev_op_leaf). The zio has no block pointer, uses txg 0, and always
 * carries ZIO_FLAG_CANFAIL and ZIO_FLAG_DONT_RETRY in addition to the
 * caller's 'flags'. Unlike zio_vdev_child_io(), 'offset' is passed to
 * zio_create() unchanged.
 */
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

/*
 * Issue a DKIOCFLUSHWRITECACHE ioctl to 'vd' as a child of 'zio' without
 * waiting for it. The ioctl has no done callback, runs at
 * ZIO_PRIORITY_NOW, and is flagged CANFAIL | DONT_PROPAGATE | DONT_RETRY.
 */
void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

/*
 * Prepare a read described by zio->io_bp:
 *  - for a compressed bp on a logical, non-RAW zio, allocate a psize buffer
 *    and push it as a transform with zio_decompress as the callback;
 *  - set ZIO_FLAG_DONT_CACHE for level-0 blocks of non-metadata types and
 *    for DMU_OT_DDT_ZAP blocks;
 *  - switch a logical read of a dedup bp to ZIO_DDT_READ_PIPELINE.
 * Always returns ZIO_PIPELINE_CONTINUE.
 */
static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Prepare a write: wait for children, then (for allocating writes only)
 * decide on compression, fill in the bp's properties and select the
 * pipeline the zio will follow.
 *
 * Returns ZIO_PIPELINE_STOP while gang or logical children have not yet
 * reached the ready state, otherwise ZIO_PIPELINE_CONTINUE.
 *
 * Pipeline chosen for allocating writes:
 *  - io_bp_override set and usable: ZIO_INTERLOCK_PIPELINE, plus
 *    ZIO_STAGE_DDT_WRITE when dedup applies and the checksums match;
 *  - psize == 0 after compression: ZIO_INTERLOCK_PIPELINE;
 *  - same-txg rewrite with unchanged psize after SYNC_PASS_REWRITE:
 *    ZIO_REWRITE_PIPELINE (keeping the existing gang stages);
 *  - otherwise ZIO_WRITE_PIPELINE, or ZIO_DDT_WRITE_PIPELINE when
 *    zp_dedup is set.
 */
static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	/* Nothing further to prepare for writes that do not allocate. */
	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		/* Adopt the caller-supplied bp instead of allocating. */
		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		/*
		 * The override's checksum differs from the one this write
		 * requires: drop the override and fall through to the
		 * normal write path with a zeroed bp.
		 */
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync(). For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks. But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer. Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass > SYNC_PASS_DONT_COMPRESS)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			/*
			 * A result of 0 or lsize means the compressed buffer
			 * is not used: release it and record the block as
			 * uncompressed. psize keeps the returned value.
			 */
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster. Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
	    pass > SYNC_PASS_REWRITE) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		/* Nothing to write: leave the (zeroed) bp untouched. */
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Prepare a free. Only logical zios are affected: a dedup bp switches the
 * zio to ZIO_DDT_FREE_PIPELINE, any other bp is passed to arc_free().
 * Always returns ZIO_PIPELINE_CONTINUE.
 */
static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
		else
			arc_free(zio->io_spa, bp);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 *
========================================================================== 1024 7754 Jeff * Execute the I/O pipeline 1025 7754 Jeff * ========================================================================== 1026 7754 Jeff */ 1027 7754 Jeff 1028 7754 Jeff static void 1029 11173 Jonathan zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) 1030 7754 Jeff { 1031 11146 George spa_t *spa = zio->io_spa; 1032 7754 Jeff zio_type_t t = zio->io_type; 1033 11173 Jonathan int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0); 1034 7754 Jeff 1035 7754 Jeff /* 1036 9722 George * If we're a config writer or a probe, the normal issue and 1037 9722 George * interrupt threads may all be blocked waiting for the config lock. 1038 9722 George * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1039 7754 Jeff */ 1040 9722 George if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1041 7754 Jeff t = ZIO_TYPE_NULL; 1042 7754 Jeff 1043 7754 Jeff /* 1044 7754 Jeff * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1045 7754 Jeff */ 1046 7754 Jeff if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1047 7754 Jeff t = ZIO_TYPE_NULL; 1048 7754 Jeff 1049 11146 George /* 1050 11146 George * If this is a high priority I/O, then use the high priority taskq. 
1051 11146 George */ 1052 11146 George if (zio->io_priority == ZIO_PRIORITY_NOW && 1053 11146 George spa->spa_zio_taskq[t][q + 1] != NULL) 1054 11146 George q++; 1055 11146 George 1056 11146 George ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1057 11146 George (void) taskq_dispatch(spa->spa_zio_taskq[t][q], 1058 11173 Jonathan (task_func_t *)zio_execute, zio, flags); 1059 7754 Jeff } 1060 7754 Jeff 1061 7754 Jeff static boolean_t 1062 7754 Jeff zio_taskq_member(zio_t *zio, enum zio_taskq_type q) 1063 7754 Jeff { 1064 7754 Jeff kthread_t *executor = zio->io_executor; 1065 7754 Jeff spa_t *spa = zio->io_spa; 1066 7754 Jeff 1067 7754 Jeff for (zio_type_t t = 0; t < ZIO_TYPES; t++) 1068 7754 Jeff if (taskq_member(spa->spa_zio_taskq[t][q], executor)) 1069 7754 Jeff return (B_TRUE); 1070 7754 Jeff 1071 7754 Jeff return (B_FALSE); 1072 7754 Jeff } 1073 7754 Jeff 1074 7754 Jeff static int 1075 7754 Jeff zio_issue_async(zio_t *zio) 1076 7754 Jeff { 1077 11173 Jonathan zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1078 7754 Jeff 1079 7754 Jeff return (ZIO_PIPELINE_STOP); 1080 7754 Jeff } 1081 7754 Jeff 1082 7754 Jeff void 1083 7754 Jeff zio_interrupt(zio_t *zio) 1084 7754 Jeff { 1085 11173 Jonathan zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1086 7754 Jeff } 1087 7754 Jeff 1088 7754 Jeff /* 1089 7754 Jeff * Execute the I/O pipeline until one of the following occurs: 1090 7754 Jeff * (1) the I/O completes; (2) the pipeline stalls waiting for 1091 7754 Jeff * dependent child I/Os; (3) the I/O issues, so we're waiting 1092 7754 Jeff * for an I/O completion interrupt; (4) the I/O is delegated by 1093 7754 Jeff * vdev-level caching or aggregation; (5) the I/O is deferred 1094 7754 Jeff * due to vdev-level queueing; (6) the I/O is handed off to 1095 7754 Jeff * another thread. In all cases, the pipeline stops whenever 1096 7754 Jeff * there's no CPU work; it never burns a thread in cv_wait(). 
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	/* Record the thread now driving this zio. */
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		/*
		 * Stages are single bits; shift left until we reach the
		 * next stage that is present in this zio's pipeline.
		 */
		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			/*
			 * io_stage has not been advanced yet, so the issue
			 * thread will recompute and run this same stage.
			 */
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		/* The stage table is indexed by the stage's bit position. */
		rv = zio_pipeline[highbit(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */

/*
 * Execute 'zio' and block until it has finished.
 *
 * The zio must still be at ZIO_STAGE_OPEN with no executor. The calling
 * thread is recorded in io_waiter, the pipeline is run with zio_execute(),
 * and the caller then sleeps on io_cv (under io_lock) until io_executor
 * has been cleared. The zio's io_error is returned; the zio is passed to
 * zio_destroy() before returning, so the caller must not use it again.
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	/* Capture the result before the zio is torn down. */
	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

/*
 * Start executing 'zio' without waiting for it. The zio must not have an
 * executor yet.
 */
void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_zio_root "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

/*
 * Reset 'pio' to the state it was created with and run it again, after
 * first doing the same (recursively) for each of its children.
 *
 * The reset restores io_flags, io_stage and io_pipeline from their io_orig_*
 * copies, and clears io_reexecute, io_error, io_state[] and
 * io_child_error[]. For allocating I/O the bp is zeroed as well.
 * 'pio' must be a logical zio whose original stage was ZIO_STAGE_OPEN and
 * which has no gang leader or gang tree.
 */
static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them. The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		/* Fetch the successor before reexecuting 'cio'. */
		cio_next = zio_walk_children(pio);
		/*
		 * Bump pio's outstanding-children count for this child's
		 * type once per wait type before the child runs again.
		 */
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

/*
 * Mark the pool as suspended because of an uncorrectable I/O failure.
 *
 * Panics if the pool's failure mode is ZIO_FAILURE_MODE_PANIC. Otherwise
 * an FM_EREPORT_ZFS_IO_FAILURE ereport is posted and, under
 * spa_suspend_lock, the suspend root zio is created if it does not yet
 * exist, spa_suspended is set, and 'zio' (when non-NULL) is attached as a
 * child of the suspend root. A non-NULL 'zio' must be a logical,
 * parentless, non-godfather zio that has reached ZIO_STAGE_DONE.
 */
void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * Clear the pool's suspended state, wake everyone sleeping on
 * spa_suspend_cv, and reexecute the I/O collected under the suspend root.
 *
 * Returns 0 when there was no suspend root; otherwise returns the result
 * of zio_wait() on the suspend root after it has been reexecuted.
 */
int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	/* Detach the root so a later suspend starts with a fresh one. */
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

/*
 * Block the caller until spa_suspended(spa) is no longer true, sleeping
 * on spa_suspend_cv under spa_suspend_lock.
 */
void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block. When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
 * an indirect block: it's an array of block pointers.
It consumes 1327 7754 Jeff * only one sector and hence is allocatable regardless of fragmentation. 1328 7754 Jeff * The gang header's bps point to its gang members, which hold the data. 1329 7754 Jeff * 1330 7754 Jeff * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1331 7754 Jeff * as the verifier to ensure uniqueness of the SHA256 checksum. 1332 7754 Jeff * Critically, the gang block bp's blk_cksum is the checksum of the data, 1333 7754 Jeff * not the gang header. This ensures that data block signatures (needed for 1334 7754 Jeff * deduplication) are independent of how the block is physically stored. 1335 7754 Jeff * 1336 7754 Jeff * Gang blocks can be nested: a gang member may itself be a gang block. 1337 7754 Jeff * Thus every gang block is a tree in which root and all interior nodes are 1338 7754 Jeff * gang headers, and the leaves are normal blocks that contain user data. 1339 7754 Jeff * The root of the gang tree is called the gang leader. 1340 7754 Jeff * 1341 7754 Jeff * To perform any operation (read, rewrite, free, claim) on a gang block, 1342 7754 Jeff * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1343 7754 Jeff * in the io_gang_tree field of the original logical i/o by recursively 1344 7754 Jeff * reading the gang leader and all gang headers below it. This yields 1345 7754 Jeff * an in-core tree containing the contents of every gang header and the 1346 7754 Jeff * bps for every constituent of the gang block. 1347 7754 Jeff * 1348 7754 Jeff * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1349 7754 Jeff * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1350 7754 Jeff * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1351 7754 Jeff * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 
1352 7754 Jeff * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1353 7754 Jeff * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1354 7754 Jeff * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1355 7754 Jeff * of the gang header plus zio_checksum_compute() of the data to update the 1356 7754 Jeff * gang header's blk_cksum as described above. 1357 7754 Jeff * 1358 7754 Jeff * The two-phase assemble/issue model solves the problem of partial failure -- 1359 7754 Jeff * what if you'd freed part of a gang block but then couldn't read the 1360 7754 Jeff * gang header for another part? Assembling the entire gang tree first 1361 7754 Jeff * ensures that all the necessary gang header I/O has succeeded before 1362 7754 Jeff * starting the actual work of free, claim, or write. Once the gang tree 1363 7754 Jeff * is assembled, free and claim are in-memory operations that cannot fail. 1364 7754 Jeff * 1365 7754 Jeff * In the event that a gang write fails, zio_dva_unallocate() walks the 1366 7754 Jeff * gang tree to immediately free (i.e. insert back into the space map) 1367 7754 Jeff * everything we've allocated. This ensures that we don't get ENOSPC 1368 7754 Jeff * errors during repeated suspend/resume cycles due to a flaky device. 1369 7754 Jeff * 1370 7754 Jeff * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1371 7754 Jeff * the gang tree, we won't modify the block, so we can safely defer the free 1372 7754 Jeff * (knowing that the block is still intact). If we *can* assemble the gang 1373 7754 Jeff * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1374 7754 Jeff * each constituent bp and we can allocate a new block on the next sync pass. 1375 7754 Jeff * 1376 7754 Jeff * In all cases, the gang tree allows complete recovery from partial failure. 
 * ==========================================================================
 */

/*
 * Gang issue callback for reads. For a gang header (gn != NULL) nothing
 * is issued and 'pio' itself is returned. For a gang member a child read
 * of the member's psize bytes into 'data' is created under 'pio', with no
 * done callback and with pio's priority and bookmark.
 */
static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

/*
 * Gang issue callback for writes. A gang header (gn != NULL) is rewritten
 * from gn->gn_gbh (SPA_GANGBLOCKSIZE bytes); a gang member is rewritten
 * from 'data' (the member's psize bytes). Returns the new child zio,
 * which has not been started.
 */
zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here. The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}

/*
 * Gang issue callback for frees: creates a zio_free_sync() child of 'pio'
 * for 'bp' in pio's txg. 'gn' and 'data' are unused.
 */
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/*
 * Gang issue callback for claims: creates a zio_claim() child of 'pio'
 * for 'bp' in pio's txg, with no done callback. 'gn' and 'data' are
 * unused.
 */
/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

/*
 * Per-type gang issue callbacks, indexed by the gang leader's io_type
 * (see zio_gang_tree_issue()). The first and last entries are NULL.
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

/*
 * Allocate a zeroed gang node together with a SPA_GANGBLOCKSIZE buffer
 * for its gang header, store the node in *gnpp (which must be NULL on
 * entry) and return it.
 */
static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

/*
 * Free a single gang node and its gang header buffer, and set *gnpp to
 * NULL. All of the node's children must already be NULL.
 */
static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

/*
 * Recursively free the gang tree rooted at *gnpp, children first, leaving
 * *gnpp NULL. A NULL tree is a no-op.
 */
static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

/*
 * Allocate a gang node at *gnpp and start an asynchronous read of the gang
 * header that 'bp' points to into the node's buffer, as a child of the
 * gang leader 'gio'. zio_gang_tree_assemble_done() runs as the read's
 * done callback with the new node as its private data.
 */
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

/*
 * Done callback for a gang header read. On error nothing more is done.
 * Otherwise the header is byteswapped if the bp calls for it, and
 * assembly recurses into every block pointer in the header that is itself
 * a gang block.
 */
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}

/*
 * Walk the gang tree rooted at 'gn' and invoke the gang leader's per-type
 * callback on 'bp' and then, for a gang header, on each non-hole block
 * pointer beneath it. 'data' points at the portion of the leader's buffer
 * that corresponds to 'bp'; it is advanced by each constituent's psize as
 * the header's block pointers are visited. The zio returned by the
 * callback is started with zio_nowait() unless it is 'pio' itself.
 */
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			/* Children are issued under the zio just created. */
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	/* At the root, the walk must have consumed exactly io_size bytes. */
	if (gn == gio->io_gang_tree)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

/*
 * Pipeline stage: make 'zio' its own gang leader and start assembling the
 * gang tree for its bp into zio->io_gang_tree. Always returns
 * ZIO_PIPELINE_CONTINUE.
 */
static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

1603 7754 Jeff zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1604 5530 bonwick 1605 5530 bonwick return (ZIO_PIPELINE_CONTINUE); 1606 789 ahrens } 1607 789 ahrens 1608 789 ahrens static void 1609 7754 Jeff zio_write_gang_member_ready(zio_t *zio) 1610 789 ahrens { 1611 8632 Bill zio_t *pio = zio_unique_parent(zio); 1612 9443 Bill zio_t *gio = zio->io_gang_leader; 1613 1775 billm dva_t *cdva = zio->io_bp->blk_dva; 1614 1775 billm dva_t *pdva = pio->io_bp->blk_dva; 1615 789 ahrens uint64_t asize; 1616 789 ahrens 1617 7754 Jeff if (BP_IS_HOLE(zio->io_bp)) 1618 7754 Jeff return; 1619 7754 Jeff 1620 7754 Jeff ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1621 7754 Jeff 1622 7754 Jeff ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1623 10922 Jeff ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 1624 10922 Jeff ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1625 10922 Jeff ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1626 1775 billm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1627 789 ahrens 1628 789 ahrens mutex_enter(&pio->io_lock); 1629 7754 Jeff for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1630 1775 billm ASSERT(DVA_GET_GANG(&pdva[d])); 1631 1775 billm asize = DVA_GET_ASIZE(&pdva[d]); 1632 1775 billm asize += DVA_GET_ASIZE(&cdva[d]); 1633 1775 billm DVA_SET_ASIZE(&pdva[d], asize); 1634 1775 billm } 1635 789 ahrens mutex_exit(&pio->io_lock); 1636 789 ahrens } 1637 789 ahrens 1638 5329 gw25295 static int 1639 7754 Jeff zio_write_gang_block(zio_t *pio) 1640 789 ahrens { 1641 7754 Jeff spa_t *spa = pio->io_spa; 1642 7754 Jeff blkptr_t *bp = pio->io_bp; 1643 9443 Bill zio_t *gio = pio->io_gang_leader; 1644 7754 Jeff zio_t *zio; 1645 7754 Jeff zio_gang_node_t *gn, **gnpp; 1646 789 ahrens zio_gbh_phys_t *gbh; 1647 7754 Jeff uint64_t txg = pio->io_txg; 1648 7754 Jeff uint64_t resid = pio->io_size; 1649 7754 Jeff uint64_t lsize; 1650 10922 Jeff int copies = gio->io_prop.zp_copies; 1651 10922 Jeff int gbh_copies 
= MIN(copies + 1, spa_max_replication(spa)); 1652 7754 Jeff zio_prop_t zp; 1653 789 ahrens int error; 1654 789 ahrens 1655 10922 Jeff error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 1656 10922 Jeff bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1657 7754 Jeff METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1658 5530 bonwick if (error) { 1659 7754 Jeff pio->io_error = error; 1660 5530 bonwick return (ZIO_PIPELINE_CONTINUE); 1661 5530 bonwick } 1662 789 ahrens 1663 9443 Bill if (pio == gio) { 1664 9443 Bill gnpp = &gio->io_gang_tree; 1665 7754 Jeff } else { 1666 7754 Jeff gnpp = pio->io_private; 1667 7754 Jeff ASSERT(pio->io_ready == zio_write_gang_member_ready); 1668 789 ahrens } 1669 789 ahrens 1670 7754 Jeff gn = zio_gang_node_alloc(gnpp); 1671 7754 Jeff gbh = gn->gn_gbh; 1672 7754 Jeff bzero(gbh, SPA_GANGBLOCKSIZE); 1673 5530 bonwick 1674 1775 billm /* 1675 7754 Jeff * Create the gang header. 1676 1775 billm */ 1677 7754 Jeff zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1678 7754 Jeff pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1679 7754 Jeff 1680 7754 Jeff /* 1681 7754 Jeff * Create and nowait the gang children. 
1682 7754 Jeff */ 1683 7754 Jeff for (int g = 0; resid != 0; resid -= lsize, g++) { 1684 7754 Jeff lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1685 7754 Jeff SPA_MINBLOCKSIZE); 1686 7754 Jeff ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1687 7754 Jeff 1688 9443 Bill zp.zp_checksum = gio->io_prop.zp_checksum; 1689 7754 Jeff zp.zp_compress = ZIO_COMPRESS_OFF; 1690 7754 Jeff zp.zp_type = DMU_OT_NONE; 1691 7754 Jeff zp.zp_level = 0; 1692 10922 Jeff zp.zp_copies = gio->io_prop.zp_copies; 1693 10922 Jeff zp.zp_dedup = 0; 1694 10922 Jeff zp.zp_dedup_verify = 0; 1695 7754 Jeff 1696 7754 Jeff zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1697 7754 Jeff (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1698 7754 Jeff zio_write_gang_member_ready, NULL, &gn->gn_child[g], 1699 7754 Jeff pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1700 7754 Jeff &pio->io_bookmark)); 1701 7754 Jeff } 1702 7754 Jeff 1703 7754 Jeff /* 1704 7754 Jeff * Set pio's pipeline to just wait for zio to finish. 
1705 7754 Jeff */ 1706 7754 Jeff pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1707 7754 Jeff 1708 7754 Jeff zio_nowait(zio); 1709 7754 Jeff 1710 7754 Jeff return (ZIO_PIPELINE_CONTINUE); 1711 789 ahrens } 1712 789 ahrens 1713 789 ahrens /* 1714 789 ahrens * ========================================================================== 1715 10922 Jeff * Dedup 1716 10922 Jeff * ========================================================================== 1717 10922 Jeff */ 1718 10922 Jeff static void 1719 10922 Jeff zio_ddt_child_read_done(zio_t *zio) 1720 10922 Jeff { 1721 10922 Jeff blkptr_t *bp = zio->io_bp; 1722 10922 Jeff ddt_entry_t *dde = zio->io_private; 1723 10922 Jeff ddt_phys_t *ddp; 1724 10922 Jeff zio_t *pio = zio_unique_parent(zio); 1725 10922 Jeff 1726 10922 Jeff mutex_enter(&pio->io_lock); 1727 10922 Jeff ddp = ddt_phys_select(dde, bp); 1728 10922 Jeff if (zio->io_error == 0) 1729 10922 Jeff ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 1730 10922 Jeff if (zio->io_error == 0 && dde->dde_repair_data == NULL) 1731 10922 Jeff dde->dde_repair_data = zio->io_data; 1732 10922 Jeff else 1733 10922 Jeff zio_buf_free(zio->io_data, zio->io_size); 1734 10922 Jeff mutex_exit(&pio->io_lock); 1735 10922 Jeff } 1736 10922 Jeff 1737 10922 Jeff static int 1738 10922 Jeff zio_ddt_read_start(zio_t *zio) 1739 10922 Jeff { 1740 10922 Jeff blkptr_t *bp = zio->io_bp; 1741 10922 Jeff 1742 10922 Jeff ASSERT(BP_GET_DEDUP(bp)); 1743 10922 Jeff ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1744 10922 Jeff ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1745 10922 Jeff 1746 10922 Jeff if (zio->io_child_error[ZIO_CHILD_DDT]) { 1747 10922 Jeff ddt_t *ddt = ddt_select(zio->io_spa, bp); 1748 10922 Jeff ddt_entry_t *dde = ddt_repair_start(ddt, bp); 1749 10922 Jeff ddt_phys_t *ddp = dde->dde_phys; 1750 10922 Jeff ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 1751 10922 Jeff blkptr_t blk; 1752 10922 Jeff 1753 10922 Jeff ASSERT(zio->io_vsd == NULL); 1754 10922 Jeff zio->io_vsd = dde; 
1755 10922 Jeff 1756 10922 Jeff if (ddp_self == NULL) 1757 10922 Jeff return (ZIO_PIPELINE_CONTINUE); 1758 10922 Jeff 1759 10922 Jeff for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1760 10922 Jeff if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 1761 10922 Jeff continue; 1762 11125 Jeff ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 1763 11125 Jeff &blk); 1764 10922 Jeff zio_nowait(zio_read(zio, zio->io_spa, &blk, 1765 10922 Jeff zio_buf_alloc(zio->io_size), zio->io_size, 1766 10922 Jeff zio_ddt_child_read_done, dde, zio->io_priority, 1767 10922 Jeff ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 1768 10922 Jeff &zio->io_bookmark)); 1769 10922 Jeff } 1770 10922 Jeff return (ZIO_PIPELINE_CONTINUE); 1771 10922 Jeff } 1772 10922 Jeff 1773 10922 Jeff zio_nowait(zio_read(zio, zio->io_spa, bp, 1774 10922 Jeff zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 1775 10922 Jeff ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 1776 10922 Jeff 1777 10922 Jeff return (ZIO_PIPELINE_CONTINUE); 1778 10922 Jeff } 1779 10922 Jeff 1780 10922 Jeff static int 1781 10922 Jeff zio_ddt_read_done(zio_t *zio) 1782 10922 Jeff { 1783 10922 Jeff blkptr_t *bp = zio->io_bp; 1784 10922 Jeff 1785 10922 Jeff if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 1786 10922 Jeff return (ZIO_PIPELINE_STOP); 1787 10922 Jeff 1788 10922 Jeff ASSERT(BP_GET_DEDUP(bp)); 1789 10922 Jeff ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1790 10922 Jeff ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1791 10922 Jeff 1792 10922 Jeff if (zio->io_child_error[ZIO_CHILD_DDT]) { 1793 10922 Jeff ddt_t *ddt = ddt_select(zio->io_spa, bp); 1794 10922 Jeff ddt_entry_t *dde = zio->io_vsd; 1795 10922 Jeff if (ddt == NULL) { 1796 11147 George ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 1797 10922 Jeff return (ZIO_PIPELINE_CONTINUE); 1798 10922 Jeff } 1799 10922 Jeff if (dde == NULL) { 1800 10922 Jeff zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 1801 11173 Jonathan zio_taskq_dispatch(zio, 
ZIO_TASKQ_ISSUE, B_FALSE); 1802 10922 Jeff return (ZIO_PIPELINE_STOP); 1803 10922 Jeff } 1804 10922 Jeff if (dde->dde_repair_data != NULL) { 1805 10922 Jeff bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 1806 10922 Jeff zio->io_child_error[ZIO_CHILD_DDT] = 0; 1807 10922 Jeff } 1808 10922 Jeff ddt_repair_done(ddt, dde); 1809 10922 Jeff zio->io_vsd = NULL; 1810 10922 Jeff } 1811 10922 Jeff 1812 10922 Jeff ASSERT(zio->io_vsd == NULL); 1813 10922 Jeff 1814 10922 Jeff return (ZIO_PIPELINE_CONTINUE); 1815 10922 Jeff } 1816 10922 Jeff 1817 10922 Jeff static boolean_t 1818 10922 Jeff zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 1819 10922 Jeff { 1820 10922 Jeff spa_t *spa = zio->io_spa; 1821 10922 Jeff 1822 10922 Jeff /* 1823 10922 Jeff * Note: we compare the original data, not the transformed data, 1824 10922 Jeff * because when zio->io_bp is an override bp, we will not have 1825 10922 Jeff * pushed the I/O transforms. That's an important optimization 1826 10922 Jeff * because otherwise we'd compress/encrypt all dmu_sync() data twice. 
1827 10922 Jeff */ 1828 10922 Jeff for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 1829 10922 Jeff zio_t *lio = dde->dde_lead_zio[p]; 1830 10922 Jeff 1831 10922 Jeff if (lio != NULL) { 1832 10922 Jeff return (lio->io_orig_size != zio->io_orig_size || 1833 10922 Jeff bcmp(zio->io_orig_data, lio->io_orig_data, 1834 10922 Jeff zio->io_orig_size) != 0); 1835 10922 Jeff } 1836 10922 Jeff } 1837 10922 Jeff 1838 10922 Jeff for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 1839 10922 Jeff ddt_phys_t *ddp = &dde->dde_phys[p]; 1840 10922 Jeff 1841 10922 Jeff if (ddp->ddp_phys_birth != 0) { 1842 10922 Jeff arc_buf_t *abuf = NULL; 1843 10922 Jeff uint32_t aflags = ARC_WAIT; 1844 10922 Jeff blkptr_t blk = *zio->io_bp; 1845 10922 Jeff int error; 1846 10922 Jeff 1847 10922 Jeff ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 1848 10922 Jeff 1849 10922 Jeff ddt_exit(ddt); 1850 10922 Jeff 1851 10922 Jeff error = arc_read_nolock(NULL, spa, &blk, 1852 10922 Jeff arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 1853 10922 Jeff ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1854 10922 Jeff &aflags, &zio->io_bookmark); 1855 10922 Jeff 1856 10922 Jeff if (error == 0) { 1857 10922 Jeff if (arc_buf_size(abuf) != zio->io_orig_size || 1858 10922 Jeff bcmp(abuf->b_data, zio->io_orig_data, 1859 10922 Jeff zio->io_orig_size) != 0) 1860 10922 Jeff error = EEXIST; 1861 10922 Jeff VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); 1862 10922 Jeff } 1863 10922 Jeff 1864 10922 Jeff ddt_enter(ddt); 1865 10922 Jeff return (error != 0); 1866 10922 Jeff } 1867 10922 Jeff } 1868 10922 Jeff 1869 10922 Jeff return (B_FALSE); 1870 10922 Jeff } 1871 10922 Jeff 1872 10922 Jeff static void 1873 10922 Jeff zio_ddt_child_write_ready(zio_t *zio) 1874 10922 Jeff { 1875 10922 Jeff int p = zio->io_prop.zp_copies; 1876 10922 Jeff ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 1877 10922 Jeff ddt_entry_t *dde = zio->io_private; 1878 10922 Jeff ddt_phys_t *ddp = &dde->dde_phys[p]; 1879 10922 Jeff zio_t *pio; 
1880 10922 Jeff 1881 10922 Jeff if (zio->io_error) 1882 10922 Jeff return; 1883 10922 Jeff 1884 10922 Jeff ddt_enter(ddt); 1885 10922 Jeff 1886 10922 Jeff ASSERT(dde->dde_lead_zio[p] == zio); 1887 10922 Jeff 1888 10922 Jeff ddt_phys_fill(ddp, zio->io_bp); 1889 10922 Jeff 1890 10922 Jeff while ((pio = zio_walk_parents(zio)) != NULL) 1891 10922 Jeff ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 1892 10922 Jeff 1893 10922 Jeff ddt_exit(ddt); 1894 10922 Jeff } 1895 10922 Jeff 1896 10922 Jeff static void 1897 10922 Jeff zio_ddt_child_write_done(zio_t *zio) 1898 10922 Jeff { 1899 10922 Jeff int p = zio->io_prop.zp_copies; 1900 10922 Jeff ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 1901 10922 Jeff ddt_entry_t *dde = zio->io_private; 1902 10922 Jeff ddt_phys_t *ddp = &dde->dde_phys[p]; 1903 10922 Jeff 1904 10922 Jeff ddt_enter(ddt); 1905 10922 Jeff 1906 10922 Jeff ASSERT(ddp->ddp_refcnt == 0); 1907 10922 Jeff ASSERT(dde->dde_lead_zio[p] == zio); 1908 10922 Jeff dde->dde_lead_zio[p] = NULL; 1909 10922 Jeff 1910 10922 Jeff if (zio->io_error == 0) { 1911 10922 Jeff while (zio_walk_parents(zio) != NULL) 1912 10922 Jeff ddt_phys_addref(ddp); 1913 10922 Jeff } else { 1914 10922 Jeff ddt_phys_clear(ddp); 1915 10922 Jeff } 1916 10922 Jeff 1917 10922 Jeff ddt_exit(ddt); 1918 10922 Jeff } 1919 10922 Jeff 1920 10922 Jeff static void 1921 10922 Jeff zio_ddt_ditto_write_done(zio_t *zio) 1922 10922 Jeff { 1923 10922 Jeff int p = DDT_PHYS_DITTO; 1924 10922 Jeff zio_prop_t *zp = &zio->io_prop; 1925 10922 Jeff blkptr_t *bp = zio->io_bp; 1926 10922 Jeff ddt_t *ddt = ddt_select(zio->io_spa, bp); 1927 10922 Jeff ddt_entry_t *dde = zio->io_private; 1928 10922 Jeff ddt_phys_t *ddp = &dde->dde_phys[p]; 1929 10922 Jeff ddt_key_t *ddk = &dde->dde_key; 1930 10922 Jeff 1931 10922 Jeff ddt_enter(ddt); 1932 10922 Jeff 1933 10922 Jeff ASSERT(ddp->ddp_refcnt == 0); 1934 10922 Jeff ASSERT(dde->dde_lead_zio[p] == zio); 1935 10922 Jeff dde->dde_lead_zio[p] = NULL; 1936 10922 Jeff 1937 10922 Jeff if 
(zio->io_error == 0) { 1938 10922 Jeff ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 1939 10922 Jeff ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 1940 10922 Jeff ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 1941 10922 Jeff if (ddp->ddp_phys_birth != 0) 1942 10922 Jeff ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 1943 10922 Jeff ddt_phys_fill(ddp, bp); 1944 10922 Jeff } 1945 10922 Jeff 1946 10922 Jeff ddt_exit(ddt); 1947 10922 Jeff } 1948 10922 Jeff 1949 10922 Jeff static int 1950 10922 Jeff zio_ddt_write(zio_t *zio) 1951 10922 Jeff { 1952 10922 Jeff spa_t *spa = zio->io_spa; 1953 10922 Jeff blkptr_t *bp = zio->io_bp; 1954 10922 Jeff uint64_t txg = zio->io_txg; 1955 10922 Jeff zio_prop_t *zp = &zio->io_prop; 1956 10922 Jeff int p = zp->zp_copies; 1957 10922 Jeff int ditto_copies; 1958 10922 Jeff zio_t *cio = NULL; 1959 10922 Jeff zio_t *dio = NULL; 1960 10922 Jeff ddt_t *ddt = ddt_select(spa, bp); 1961 10922 Jeff ddt_entry_t *dde; 1962 10922 Jeff ddt_phys_t *ddp; 1963 10922 Jeff 1964 10922 Jeff ASSERT(BP_GET_DEDUP(bp)); 1965 10922 Jeff ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 1966 10922 Jeff ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 1967 10922 Jeff 1968 10922 Jeff ddt_enter(ddt); 1969 10922 Jeff dde = ddt_lookup(ddt, bp, B_TRUE); 1970 10922 Jeff ddp = &dde->dde_phys[p]; 1971 10922 Jeff 1972 10922 Jeff if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 1973 10922 Jeff /* 1974 10922 Jeff * If we're using a weak checksum, upgrade to a strong checksum 1975 10922 Jeff * and try again. If we're already using a strong checksum, 1976 10922 Jeff * we can't resolve it, so just convert to an ordinary write. 1977 10922 Jeff * (And automatically e-mail a paper to Nature?) 
1978 10922 Jeff */ 1979 10922 Jeff if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 1980 10922 Jeff zp->zp_checksum = spa_dedup_checksum(spa); 1981 10922 Jeff zio_pop_transforms(zio); 1982 10922 Jeff zio->io_stage = ZIO_STAGE_OPEN; 1983 10922 Jeff BP_ZERO(bp); 1984 10922 Jeff } else { 1985 10922 Jeff zp->zp_dedup = 0; 1986 10922 Jeff } 1987 10922 Jeff zio->io_pipeline = ZIO_WRITE_PIPELINE; 1988 10922 Jeff ddt_exit(ddt); 1989 10922 Jeff return (ZIO_PIPELINE_CONTINUE); 1990 10922 Jeff } 1991 10922 Jeff 1992 10922 Jeff ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 1993 10922 Jeff ASSERT(ditto_copies < SPA_DVAS_PER_BP); 1994 10922 Jeff 1995 10922 Jeff if (ditto_copies > ddt_ditto_copies_present(dde) && 1996 10922 Jeff dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 1997 10922 Jeff zio_prop_t czp = *zp; 1998 10922 Jeff 1999 10922 Jeff czp.zp_copies = ditto_copies; 2000 10922 Jeff 2001 10922 Jeff /* 2002 10922 Jeff * If we arrived here with an override bp, we won't have run 2003 10922 Jeff * the transform stack, so we won't have the data we need to 2004 10922 Jeff * generate a child i/o. So, toss the override bp and restart. 2005 10922 Jeff * This is safe, because using the override bp is just an 2006 10922 Jeff * optimization; and it's rare, so the cost doesn't matter. 
2007 10922 Jeff */ 2008 10922 Jeff if (zio->io_bp_override) { 2009 10922 Jeff zio_pop_transforms(zio); 2010 10922 Jeff zio->io_stage = ZIO_STAGE_OPEN; 2011 10922 Jeff zio->io_pipeline = ZIO_WRITE_PIPELINE; 2012 10922 Jeff zio->io_bp_override = NULL; 2013 10922 Jeff BP_ZERO(bp); 2014 10922 Jeff ddt_exit(ddt); 2015 10922 Jeff return (ZIO_PIPELINE_CONTINUE); 2016 10922 Jeff } 2017 10922 Jeff 2018 10922 Jeff dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2019 10922 Jeff zio->io_orig_size, &czp, NULL, 2020 10922 Jeff zio_ddt_ditto_write_done, dde, zio->io_priority, 2021 10922 Jeff ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2022 10922 Jeff 2023 10922 Jeff zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2024 10922 Jeff dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2025 10922 Jeff } 2026 10922 Jeff 2027 10922 Jeff if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2028 10922 Jeff if (ddp->ddp_phys_birth != 0) 2029 10922 Jeff ddt_bp_fill(ddp, bp, txg); 2030 10922 Jeff if (dde->dde_lead_zio[p] != NULL) 2031 10922 Jeff zio_add_child(zio, dde->dde_lead_zio[p]); 2032 10922 Jeff else 2033 10922 Jeff ddt_phys_addref(ddp); 2034 10922 Jeff } else if (zio->io_bp_override) { 2035 10922 Jeff ASSERT(bp->blk_birth == txg); 2036 10922 Jeff ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2037 10922 Jeff ddt_phys_fill(ddp, bp); 2038 10922 Jeff ddt_phys_addref(ddp); 2039 10922 Jeff } else { 2040 10922 Jeff cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2041 10922 Jeff zio->io_orig_size, zp, zio_ddt_child_write_ready, 2042 10922 Jeff zio_ddt_child_write_done, dde, zio->io_priority, 2043 10922 Jeff ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2044 10922 Jeff 2045 10922 Jeff zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2046 10922 Jeff dde->dde_lead_zio[p] = cio; 2047 10922 Jeff } 2048 10922 Jeff 2049 10922 Jeff ddt_exit(ddt); 2050 10922 Jeff 2051 10922 Jeff if (cio) 2052 10922 Jeff zio_nowait(cio); 2053 10922 Jeff if (dio) 2054 10922 Jeff 
zio_nowait(dio); 2055 10922 Jeff 2056 10922 Jeff return (ZIO_PIPELINE_CONTINUE); 2057 10922 Jeff } 2058 10922 Jeff 2059 10922 Jeff static int 2060 10922 Jeff zio_ddt_free(zio_t *zio) 2061 10922 Jeff { 2062 10922 Jeff spa_t *spa = zio->io_spa; 2063 10922 Jeff blkptr_t *bp = zio->io_bp; 2064 10922 Jeff ddt_t *ddt = ddt_select(spa, bp); 2065 10922 Jeff ddt_entry_t *dde; 2066 10922 Jeff ddt_phys_t *ddp; 2067 10922 Jeff 2068 10922 Jeff ASSERT(BP_GET_DEDUP(bp)); 2069 10922 Jeff ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2070 10922 Jeff 2071 10922 Jeff ddt_enter(ddt); 2072 10922 Jeff dde = ddt_lookup(ddt, bp, B_TRUE); 2073 10922 Jeff ddp = ddt_phys_select(dde, bp); 2074 10922 Jeff ddt_phys_decref(ddp); 2075 10922 Jeff ddt_exit(ddt); 2076 10922 Jeff 2077 10922 Jeff return (ZIO_PIPELINE_CONTINUE); 2078 10922 Jeff } 2079 10922 Jeff 2080 10922 Jeff /* 2081 10922 Jeff * ========================================================================== 2082 789 ahrens * Allocate and free blocks 2083 789 ahrens * ========================================================================== 2084 789 ahrens */ 2085 5530 bonwick static int 2086 789 ahrens zio_dva_allocate(zio_t *zio) 2087 789 ahrens { 2088 4527 perrin spa_t *spa = zio->io_spa; 2089 10922 Jeff metaslab_class_t *mc = spa_normal_class(spa); 2090 789 ahrens blkptr_t *bp = zio->io_bp; 2091 789 ahrens int error; 2092 9443 Bill 2093 9443 Bill if (zio->io_gang_leader == NULL) { 2094 9443 Bill ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2095 9443 Bill zio->io_gang_leader = zio; 2096 9443 Bill } 2097 789 ahrens 2098 789 ahrens ASSERT(BP_IS_HOLE(bp)); 2099 1775 billm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 2100 10922 Jeff ASSERT3U(zio->io_prop.zp_copies, >, 0); 2101 10922 Jeff ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2102 789 ahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2103 789 ahrens 2104 7754 Jeff error = metaslab_alloc(spa, mc, zio->io_size, bp, 2105 10922 Jeff zio->io_prop.zp_copies, zio->io_txg, 
NULL, 0); 2106 789 ahrens 2107 7754 Jeff if (error) { 2108 7754 Jeff if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2109 7754 Jeff return (zio_write_gang_block(zio)); 2110 789 ahrens zio->io_error = error; 2111 789 ahrens } 2112 5530 bonwick 2113 5530 bonwick return (ZIO_PIPELINE_CONTINUE); 2114 789 ahrens } 2115 789 ahrens 2116 5530 bonwick static int 2117 789 ahrens zio_dva_free(zio_t *zio) 2118 789 ahrens { 2119 7754 Jeff metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2120 789 ahrens 2121 5530 bonwick return (ZIO_PIPELINE_CONTINUE); 2122 789 ahrens } 2123 789 ahrens 2124 5530 bonwick static int 2125 789 ahrens zio_dva_claim(zio_t *zio) 2126 789 ahrens { 2127 7754 Jeff int error; 2128 7754 Jeff 2129 7754 Jeff error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2130 7754 Jeff if (error) 2131 7754 Jeff zio->io_error = error; 2132 789 ahrens 2133 5530 bonwick return (ZIO_PIPELINE_CONTINUE); 2134 7754 Jeff } 2135 7754 Jeff 2136 7754 Jeff /* 2137 7754 Jeff * Undo an allocation. This is used by zio_done() when an I/O fails 2138 7754 Jeff * and we want to give back the block we just allocated. 2139 7754 Jeff * This handles both normal blocks and gang blocks. 2140 7754 Jeff */ 2141 7754 Jeff static void 2142 7754 Jeff zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2143 7754 Jeff { 2144 7754 Jeff ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2145 10922 Jeff ASSERT(zio->io_bp_override == NULL); 2146 7754 Jeff 2147 7754 Jeff if (!BP_IS_HOLE(bp)) 2148 10922 Jeff metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2149 7754 Jeff 2150 7754 Jeff if (gn != NULL) { 2151 7754 Jeff for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2152 7754 Jeff zio_dva_unallocate(zio, gn->gn_child[g], 2153 7754 Jeff &gn->gn_gbh->zg_blkptr[g]); 2154 7754 Jeff } 2155 7754 Jeff } 2156 7754 Jeff } 2157 7754 Jeff 2158 7754 Jeff /* 2159 7754 Jeff * Try to allocate an intent log block. Return 0 on success, errno on failure. 
2160 7754 Jeff */ 2161 7754 Jeff int 2162 10922 Jeff zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2163 10922 Jeff uint64_t size, boolean_t use_slog) 2164 7754 Jeff { 2165 10310 Neil int error = 1; 2166 7754 Jeff 2167 10922 Jeff ASSERT(txg > spa_syncing_txg(spa)); 2168 10922 Jeff 2169 10879 Neil if (use_slog) 2170 10922 Jeff error = metaslab_alloc(spa, spa_log_class(spa), size, 2171 10310 Neil new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 2172 7754 Jeff 2173 7754 Jeff if (error) 2174 10922 Jeff error = metaslab_alloc(spa, spa_normal_class(spa), size, 2175 7754 Jeff new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 2176 7754 Jeff 2177 7754 Jeff if (error == 0) { 2178 7754 Jeff BP_SET_LSIZE(new_bp, size); 2179 7754 Jeff BP_SET_PSIZE(new_bp, size); 2180 7754 Jeff BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2181 7754 Jeff BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 2182 7754 Jeff BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2183 7754 Jeff BP_SET_LEVEL(new_bp, 0); 2184 10922 Jeff BP_SET_DEDUP(new_bp, 0); 2185 7754 Jeff BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2186 7754 Jeff } 2187 7754 Jeff 2188 7754 Jeff return (error); 2189 7754 Jeff } 2190 7754 Jeff 2191 7754 Jeff /* 2192 10922 Jeff * Free an intent log block. 
2193 7754 Jeff */ 2194 7754 Jeff void 2195 10922 Jeff zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2196 7754 Jeff { 2197 10922 Jeff ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2198 7754 Jeff ASSERT(!BP_IS_GANG(bp)); 2199 7754 Jeff 2200 10922 Jeff zio_free(spa, txg, bp); 2201 789 ahrens } 2202 789 ahrens 2203 789 ahrens /* 2204 789 ahrens * ========================================================================== 2205 789 ahrens * Read and write to physical devices 2206 789 ahrens * ========================================================================== 2207 789 ahrens */ 2208 5530 bonwick static int 2209 1775 billm zio_vdev_io_start(zio_t *zio) 2210 789 ahrens { 2211 789 ahrens vdev_t *vd = zio->io_vd; 2212 1775 billm uint64_t align; 2213 5329 gw25295 spa_t *spa = zio->io_spa; 2214 5329 gw25295 2215 7754 Jeff ASSERT(zio->io_error == 0); 2216 7754 Jeff ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2217 789 ahrens 2218 7754 Jeff if (vd == NULL) { 2219 7754 Jeff if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2220 7754 Jeff spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2221 7754 Jeff 2222 7754 Jeff /* 2223 7754 Jeff * The mirror_ops handle multiple DVAs in a single BP. 
2224 7754 Jeff */ 2225 5530 bonwick return (vdev_mirror_ops.vdev_op_io_start(zio)); 2226 7754 Jeff } 2227 1775 billm 2228 7754 Jeff align = 1ULL << vd->vdev_top->vdev_ashift; 2229 1732 bonwick 2230 1732 bonwick if (P2PHASE(zio->io_size, align) != 0) { 2231 1732 bonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 2232 1732 bonwick char *abuf = zio_buf_alloc(asize); 2233 7754 Jeff ASSERT(vd == vd->vdev_top); 2234 1732 bonwick if (zio->io_type == ZIO_TYPE_WRITE) { 2235 1732 bonwick bcopy(zio->io_data, abuf, zio->io_size); 2236 1732 bonwick bzero(abuf + zio->io_size, asize - zio->io_size); 2237 1732 bonwick } 2238 7754 Jeff zio_push_transform(zio, abuf, asize, asize, zio_subblock); 2239 789 ahrens } 2240 789 ahrens 2241 1732 bonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 2242 1732 bonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 2243 8241 Jeff ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 2244 8241 Jeff 2245 8241 Jeff /* 2246 8241 Jeff * If this is a repair I/O, and there's no self-healing involved -- 2247 8241 Jeff * that is, we're just resilvering what we expect to resilver -- 2248 8241 Jeff * then don't do the I/O unless zio's txg is actually in vd's DTL. 2249 8241 Jeff * This prevents spurious resilvering with nested replication. 2250 8241 Jeff * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2251 8241 Jeff * A is out of date, we'll read from C+D, then use the data to 2252 8241 Jeff * resilver A+B -- but we don't actually want to resilver B, just A. 2253 8241 Jeff * The top-level mirror has no way to know this, so instead we just 2254 8241 Jeff * discard unnecessary repairs as we work our way down the vdev tree. 2255 8241 Jeff * The same logic applies to any form of nested replication: 2256 8241 Jeff * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
2257 8241 Jeff */ 2258 8241 Jeff if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2259 8241 Jeff !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2260 8241 Jeff zio->io_txg != 0 && /* not a delegated i/o */ 2261 8241 Jeff !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2262 8241 Jeff ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2263 8241 Jeff zio_vdev_io_bypass(zio); 2264 8241 Jeff return (ZIO_PIPELINE_CONTINUE); 2265 8241 Jeff } 2266 7754 Jeff 2267 7754 Jeff if (vd->vdev_ops->vdev_op_leaf && 2268 7754 Jeff (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2269 7754 Jeff 2270 7754 Jeff if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 2271 8632 Bill return (ZIO_PIPELINE_CONTINUE); 2272 7754 Jeff 2273 7754 Jeff if ((zio = vdev_queue_io(zio)) == NULL) 2274 7754 Jeff return (ZIO_PIPELINE_STOP); 2275 7754 Jeff 2276 7754 Jeff if (!vdev_accessible(vd, zio)) { 2277 7754 Jeff zio->io_error = ENXIO; 2278 7754 Jeff zio_interrupt(zio); 2279 7754 Jeff return (ZIO_PIPELINE_STOP); 2280 7754 Jeff } 2281 7754 Jeff } 2282 789 ahrens 2283 5530 bonwick return (vd->vdev_ops->vdev_op_io_start(zio)); 2284 789 ahrens } 2285 789 ahrens 2286 5530 bonwick static int 2287 789 ahrens zio_vdev_io_done(zio_t *zio) 2288 789 ahrens { 2289 7754 Jeff vdev_t *vd = zio->io_vd; 2290 7754 Jeff vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 2291 7754 Jeff boolean_t unexpected_error = B_FALSE; 2292 5530 bonwick 2293 7754 Jeff if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2294 7754 Jeff return (ZIO_PIPELINE_STOP); 2295 789 ahrens 2296 7754 Jeff ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2297 789 ahrens 2298 7754 Jeff if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 2299 7754 Jeff 2300 7754 Jeff vdev_queue_io_done(zio); 2301 7754 Jeff 2302 7754 Jeff if (zio->io_type == ZIO_TYPE_WRITE) 2303 7754 Jeff vdev_cache_write(zio); 2304 7754 Jeff 2305 7754 Jeff if (zio_injection_enabled && zio->io_error == 0) 2306 9725 Eric zio->io_error = zio_handle_device_injection(vd, 2307 9725 Eric zio, EIO); 2308 7754 Jeff 2309 7754 Jeff if (zio_injection_enabled && zio->io_error == 0) 2310 7754 Jeff zio->io_error = zio_handle_label_injection(zio, EIO); 2311 7754 Jeff 2312 7754 Jeff if (zio->io_error) { 2313 7754 Jeff if (!vdev_accessible(vd, zio)) { 2314 7754 Jeff zio->io_error = ENXIO; 2315 7754 Jeff } else { 2316 7754 Jeff unexpected_error = B_TRUE; 2317 7754 Jeff } 2318 7754 Jeff } 2319 6976 eschrock } 2320 789 ahrens 2321 7754 Jeff ops->vdev_op_io_done(zio); 2322 7754 Jeff 2323 7754 Jeff if (unexpected_error) 2324 8632 Bill VERIFY(vdev_probe(vd, zio) == NULL); 2325 7754 Jeff 2326 7754 Jeff return (ZIO_PIPELINE_CONTINUE); 2327 789 ahrens } 2328 789 ahrens 2329 10614 Jonathan /* 2330 10614 Jonathan * For non-raidz ZIOs, we can just copy aside the bad data read from the 2331 10614 Jonathan * disk, and use that to finish the checksum ereport later. 
2332 10614 Jonathan */ 2333 10614 Jonathan static void 2334 10614 Jonathan zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2335 10614 Jonathan const void *good_buf) 2336 10614 Jonathan { 2337 10614 Jonathan /* no processing needed */ 2338 10614 Jonathan zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2339 10614 Jonathan } 2340 10614 Jonathan 2341 10614 Jonathan /*ARGSUSED*/ 2342 10614 Jonathan void 2343 10614 Jonathan zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2344 10614 Jonathan { 2345 10614 Jonathan void *buf = zio_buf_alloc(zio->io_size); 2346 10614 Jonathan 2347 10614 Jonathan bcopy(zio->io_data, buf, zio->io_size); 2348 10614 Jonathan 2349 10614 Jonathan zcr->zcr_cbinfo = zio->io_size; 2350 10614 Jonathan zcr->zcr_cbdata = buf; 2351 10614 Jonathan zcr->zcr_finish = zio_vsd_default_cksum_finish; 2352 10614 Jonathan zcr->zcr_free = zio_buf_free; 2353 10614 Jonathan } 2354 10614 Jonathan 2355 5530 bonwick static int 2356 789 ahrens zio_vdev_io_assess(zio_t *zio) 2357 789 ahrens { 2358 789 ahrens vdev_t *vd = zio->io_vd; 2359 789 ahrens 2360 7754 Jeff if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2361 7754 Jeff return (ZIO_PIPELINE_STOP); 2362 1732 bonwick 2363 7754 Jeff if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2364 7754 Jeff spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2365 7754 Jeff 2366 7754 Jeff if (zio->io_vsd != NULL) { 2367 10614 Jonathan zio->io_vsd_ops->vsd_free(zio); 2368 7754 Jeff zio->io_vsd = NULL; 2369 1732 bonwick } 2370 789 ahrens 2371 7754 Jeff if (zio_injection_enabled && zio->io_error == 0) 2372 1544 eschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 2373 789 ahrens 2374 789 ahrens /* 2375 789 ahrens * If the I/O failed, determine whether we should attempt to retry it. 2376 11173 Jonathan * 2377 11173 Jonathan * On retry, we cut in line in the issue queue, since we don't want 2378 11173 Jonathan * compression/checksumming/etc. 
work to prevent our (cheap) IO reissue. 2379 789 ahrens */ 2380 7754 Jeff if (zio->io_error && vd == NULL && 2381 7754 Jeff !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2382 7754 Jeff ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2383 7754 Jeff ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2384 7754 Jeff zio->io_error = 0; 2385 7754 Jeff zio->io_flags |= ZIO_FLAG_IO_RETRY | 2386 7754 Jeff ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2387 10922 Jeff zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2388 11173 Jonathan zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2389 11173 Jonathan zio_requeue_io_start_cut_in_line); 2390 7754 Jeff return (ZIO_PIPELINE_STOP); 2391 7754 Jeff } 2392 789 ahrens 2393 7754 Jeff /* 2394 7754 Jeff * If we got an error on a leaf device, convert it to ENXIO 2395 7754 Jeff * if the device is not accessible at all. 2396 7754 Jeff */ 2397 7754 Jeff if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2398 7754 Jeff !vdev_accessible(vd, zio)) 2399 7754 Jeff zio->io_error = ENXIO; 2400 789 ahrens 2401 7754 Jeff /* 2402 7754 Jeff * If we can't write to an interior vdev (mirror or RAID-Z), 2403 7754 Jeff * set vdev_cant_write so that we stop trying to allocate from it. 
2404 7754 Jeff */ 2405 7754 Jeff if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2406 7754 Jeff vd != NULL && !vd->vdev_ops->vdev_op_leaf) 2407 7754 Jeff vd->vdev_cant_write = B_TRUE; 2408 7754 Jeff 2409 7754 Jeff if (zio->io_error) 2410 7754 Jeff zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2411 1544 eschrock 2412 5530 bonwick return (ZIO_PIPELINE_CONTINUE); 2413 789 ahrens } 2414 789 ahrens 2415 789 ahrens void 2416 789 ahrens zio_vdev_io_reissue(zio_t *zio) 2417 789 ahrens { 2418 789 ahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2419 789 ahrens ASSERT(zio->io_error == 0); 2420 789 ahrens 2421 10922 Jeff zio->io_stage >>= 1; 2422 789 ahrens } 2423 789 ahrens 2424 789 ahrens void 2425 789 ahrens zio_vdev_io_redone(zio_t *zio) 2426 789 ahrens { 2427 789 ahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2428 789 ahrens 2429 10922 Jeff zio->io_stage >>= 1; 2430 789 ahrens } 2431 789 ahrens 2432 789 ahrens void 2433 789 ahrens zio_vdev_io_bypass(zio_t *zio) 2434 789 ahrens { 2435 789 ahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2436 789 ahrens ASSERT(zio->io_error == 0); 2437 789 ahrens 2438 789 ahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2439 10922 Jeff zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2440 789 ahrens } 2441 789 ahrens 2442 789 ahrens /* 2443 789 ahrens * ========================================================================== 2444 789 ahrens * Generate and verify checksums 2445 789 ahrens * ========================================================================== 2446 789 ahrens */ 2447 5530 bonwick static int 2448 789 ahrens zio_checksum_generate(zio_t *zio) 2449 789 ahrens { 2450 789 ahrens blkptr_t *bp = zio->io_bp; 2451 7754 Jeff enum zio_checksum checksum; 2452 789 ahrens 2453 7754 Jeff if (bp == NULL) { 2454 7754 Jeff /* 2455 7754 Jeff * This is zio_write_phys(). 2456 7754 Jeff * We're either generating a label checksum, or none at all. 
2457 7754 Jeff */ 2458 7754 Jeff checksum = zio->io_prop.zp_checksum; 2459 789 ahrens 2460 7754 Jeff if (checksum == ZIO_CHECKSUM_OFF) 2461 7754 Jeff return (ZIO_PIPELINE_CONTINUE); 2462 789 ahrens 2463 7754 Jeff ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2464 7754 Jeff } else { 2465 7754 Jeff if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2466 7754 Jeff ASSERT(!IO_IS_ALLOCATING(zio)); 2467 7754 Jeff checksum = ZIO_CHECKSUM_GANG_HEADER; 2468 7754 Jeff } else { 2469 7754 Jeff checksum = BP_GET_CHECKSUM(bp); 2470 7754 Jeff } 2471 7754 Jeff } 2472 789 ahrens 2473 7754 Jeff zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2474 789 ahrens 2475 5530 bonwick return (ZIO_PIPELINE_CONTINUE); 2476 789 ahrens } 2477 789 ahrens 2478 5530 bonwick static int 2479 789 ahrens zio_checksum_verify(zio_t *zio) 2480 789 ahrens { 2481 10614 Jonathan zio_bad_cksum_t info; 2482 7754 Jeff blkptr_t *bp = zio->io_bp; 2483 7754 Jeff int error; 2484 10922 Jeff 2485 10922 Jeff ASSERT(zio->io_vd != NULL); 2486 7754 Jeff 2487 7754 Jeff if (bp == NULL) { 2488 7754 Jeff /* 2489 7754 Jeff * This is zio_read_phys(). 2490 7754 Jeff * We're either verifying a label checksum, or nothing at all. 
2491 7754 Jeff */ 2492 7754 Jeff if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2493 7754 Jeff return (ZIO_PIPELINE_CONTINUE); 2494 7754 Jeff 2495 7754 Jeff ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2496 7754 Jeff } 2497 7754 Jeff 2498 10614 Jonathan if ((error = zio_checksum_error(zio, &info)) != 0) { 2499 7754 Jeff zio->io_error = error; 2500 7754 Jeff if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2501 10614 Jonathan zfs_ereport_start_checksum(zio->io_spa, 2502 10614 Jonathan zio->io_vd, zio, zio->io_offset, 2503 10614 Jonathan zio->io_size, NULL, &info); 2504 7754 Jeff } 2505 789 ahrens } 2506 789 ahrens 2507 5530 bonwick return (ZIO_PIPELINE_CONTINUE); 2508 789 ahrens } 2509 789 ahrens 2510 789 ahrens /* 2511 789 ahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 2512 789 ahrens */ 2513 789 ahrens void 2514 789 ahrens zio_checksum_verified(zio_t *zio) 2515 789 ahrens { 2516 10922 Jeff zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2517 789 ahrens } 2518 789 ahrens 2519 789 ahrens /* 2520 7754 Jeff * ========================================================================== 2521 7754 Jeff * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2522 7754 Jeff * An error of 0 indictes success. ENXIO indicates whole-device failure, 2523 7754 Jeff * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 2524 7754 Jeff * indicate errors that are specific to one I/O, and most likely permanent. 2525 7754 Jeff * Any other error is presumed to be worse because we weren't expecting it. 
2526 7754 Jeff * ========================================================================== 2527 789 ahrens */ 2528 7754 Jeff int 2529 7754 Jeff zio_worst_error(int e1, int e2) 2530 789 ahrens { 2531 7754 Jeff static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2532 7754 Jeff int r1, r2; 2533 1775 billm 2534 7754 Jeff for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2535 7754 Jeff if (e1 == zio_error_rank[r1]) 2536 7754 Jeff break; 2537 7754 Jeff 2538 7754 Jeff for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2539 7754 Jeff if (e2 == zio_error_rank[r2]) 2540 7754 Jeff break; 2541 7754 Jeff 2542 7754 Jeff return (r1 > r2 ? e1 : e2); 2543 789 ahrens } 2544 789 ahrens 2545 789 ahrens /* 2546 789 ahrens * ========================================================================== 2547 7754 Jeff * I/O completion 2548 789 ahrens * ========================================================================== 2549 789 ahrens */ 2550 7754 Jeff static int 2551 7754 Jeff zio_ready(zio_t *zio) 2552 7754 Jeff { 2553 7754 Jeff blkptr_t *bp = zio->io_bp; 2554 8632 Bill zio_t *pio, *pio_next; 2555 789 ahrens 2556 10922 Jeff if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2557 10922 Jeff zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2558 9443 Bill return (ZIO_PIPELINE_STOP); 2559 9443 Bill 2560 7754 Jeff if (zio->io_ready) { 2561 7754 Jeff ASSERT(IO_IS_ALLOCATING(zio)); 2562 7754 Jeff ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2563 7754 Jeff ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2564 7754 Jeff 2565 7754 Jeff zio->io_ready(zio); 2566 7754 Jeff } 2567 7754 Jeff 2568 7754 Jeff if (bp != NULL && bp != &zio->io_bp_copy) 2569 7754 Jeff zio->io_bp_copy = *bp; 2570 7754 Jeff 2571 7754 Jeff if (zio->io_error) 2572 7754 Jeff zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2573 7754 Jeff 2574 8632 Bill mutex_enter(&zio->io_lock); 2575 8632 Bill zio->io_state[ZIO_WAIT_READY] = 1; 2576 8632 Bill pio = 
zio_walk_parents(zio); 2577 8632 Bill mutex_exit(&zio->io_lock); 2578 8632 Bill 2579 8632 Bill /* 2580 8632 Bill * As we notify zio's parents, new parents could be added. 2581 8632 Bill * New parents go to the head of zio's io_parent_list, however, 2582 8632 Bill * so we will (correctly) not notify them. The remainder of zio's 2583 8632 Bill * io_parent_list, from 'pio_next' onward, cannot change because 2584 8632 Bill * all parents must wait for us to be done before they can be done. 2585 8632 Bill */ 2586 8632 Bill for (; pio != NULL; pio = pio_next) { 2587 8632 Bill pio_next = zio_walk_parents(zio); 2588 7754 Jeff zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2589 8632 Bill } 2590 7754 Jeff 2591 10922 Jeff if (zio->io_flags & ZIO_FLAG_NODATA) { 2592 10922 Jeff if (BP_IS_GANG(bp)) { 2593 10922 Jeff zio->io_flags &= ~ZIO_FLAG_NODATA; 2594 10922 Jeff } else { 2595 10922 Jeff ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2596 10922 Jeff zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2597 10922 Jeff } 2598 10922 Jeff } 2599 10922 Jeff 2600 11026 Tim if (zio_injection_enabled && 2601 11026 Tim zio->io_spa->spa_syncing_txg == zio->io_txg) 2602 11026 Tim zio_handle_ignored_writes(zio); 2603 11026 Tim 2604 7754 Jeff return (ZIO_PIPELINE_CONTINUE); 2605 7754 Jeff } 2606 7754 Jeff 2607 7754 Jeff static int 2608 7754 Jeff zio_done(zio_t *zio) 2609 7754 Jeff { 2610 7754 Jeff spa_t *spa = zio->io_spa; 2611 7754 Jeff zio_t *lio = zio->io_logical; 2612 7754 Jeff blkptr_t *bp = zio->io_bp; 2613 7754 Jeff vdev_t *vd = zio->io_vd; 2614 7754 Jeff uint64_t psize = zio->io_size; 2615 8632 Bill zio_t *pio, *pio_next; 2616 7754 Jeff 2617 7754 Jeff /* 2618 9443 Bill * If our children haven't all completed, 2619 7754 Jeff * wait for them and then repeat this pipeline stage. 
2620 7754 Jeff */ 2621 7754 Jeff if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2622 7754 Jeff zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2623 10922 Jeff zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2624 7754 Jeff zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2625 7754 Jeff return (ZIO_PIPELINE_STOP); 2626 7754 Jeff 2627 7754 Jeff for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2628 7754 Jeff for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2629 7754 Jeff ASSERT(zio->io_children[c][w] == 0); 2630 7754 Jeff 2631 7754 Jeff if (bp != NULL) { 2632 7754 Jeff ASSERT(bp->blk_pad[0] == 0); 2633 7754 Jeff ASSERT(bp->blk_pad[1] == 0); 2634 7754 Jeff ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2635 8632 Bill (bp == zio_unique_parent(zio)->io_bp)); 2636 7754 Jeff if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2637 10922 Jeff zio->io_bp_override == NULL && 2638 7754 Jeff !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2639 7754 Jeff ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2640 10922 Jeff ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2641 7754 Jeff ASSERT(BP_COUNT_GANG(bp) == 0 || 2642 7754 Jeff (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2643 7754 Jeff } 2644 7754 Jeff } 2645 7754 Jeff 2646 7754 Jeff /* 2647 10922 Jeff * If there were child vdev/gang/ddt errors, they apply to us now. 2648 7754 Jeff */ 2649 7754 Jeff zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 2650 7754 Jeff zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 2651 10922 Jeff zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 2652 10922 Jeff 2653 10922 Jeff /* 2654 10922 Jeff * If the I/O on the transformed data was successful, generate any 2655 10922 Jeff * checksum reports now while we still have the transformed data. 
2656 10922 Jeff */ 2657 10922 Jeff if (zio->io_error == 0) { 2658 10922 Jeff while (zio->io_cksum_report != NULL) { 2659 10922 Jeff zio_cksum_report_t *zcr = zio->io_cksum_report; 2660 10922 Jeff uint64_t align = zcr->zcr_align; 2661 10922 Jeff uint64_t asize = P2ROUNDUP(psize, align); 2662 10922 Jeff char *abuf = zio->io_data; 2663 10922 Jeff 2664 10922 Jeff if (asize != psize) { 2665 10922 Jeff abuf = zio_buf_alloc(asize); 2666 10922 Jeff bcopy(zio->io_data, abuf, psize); 2667 10922 Jeff bzero(abuf + psize, asize - psize); 2668 10922 Jeff } 2669 10922 Jeff 2670 10922 Jeff zio->io_cksum_report = zcr->zcr_next; 2671 10922 Jeff zcr->zcr_next = NULL; 2672 10922 Jeff zcr->zcr_finish(zcr, abuf); 2673 10922 Jeff zfs_ereport_free_checksum(zcr); 2674 10922 Jeff 2675 10922 Jeff if (asize != psize) 2676 10922 Jeff zio_buf_free(abuf, asize); 2677 10922 Jeff } 2678 10922 Jeff } 2679 7754 Jeff 2680 7754 Jeff zio_pop_transforms(zio); /* note: may set zio->io_error */ 2681 7754 Jeff 2682 7754 Jeff vdev_stat_update(zio, psize); 2683 7754 Jeff 2684 7754 Jeff if (zio->io_error) { 2685 7754 Jeff /* 2686 7754 Jeff * If this I/O is attached to a particular vdev, 2687 7754 Jeff * generate an error message describing the I/O failure 2688 7754 Jeff * at the block level. We ignore these errors if the 2689 7754 Jeff * device is currently unavailable. 2690 7754 Jeff */ 2691 7754 Jeff if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 2692 7754 Jeff zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 2693 7754 Jeff 2694 10685 George if ((zio->io_error == EIO || !(zio->io_flags & 2695 10685 George (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 2696 10685 George zio == lio) { 2697 7754 Jeff /* 2698 7754 Jeff * For logical I/O requests, tell the SPA to log the 2699 7754 Jeff * error and generate a logical data ereport. 
2700 7754 Jeff */ 2701 7754 Jeff spa_log_error(spa, zio); 2702 7754 Jeff zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 2703 7754 Jeff 0, 0); 2704 7754 Jeff } 2705 7754 Jeff } 2706 7754 Jeff 2707 7754 Jeff if (zio->io_error && zio == lio) { 2708 7754 Jeff /* 2709 7754 Jeff * Determine whether zio should be reexecuted. This will 2710 7754 Jeff * propagate all the way to the root via zio_notify_parent(). 2711 7754 Jeff */ 2712 7754 Jeff ASSERT(vd == NULL && bp != NULL); 2713 10922 Jeff ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2714 7754 Jeff 2715 10922 Jeff if (IO_IS_ALLOCATING(zio) && 2716 10922 Jeff !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 2717 7754 Jeff if (zio->io_error != ENOSPC) 2718 7754 Jeff zio->io_reexecute |= ZIO_REEXECUTE_NOW; 2719 7754 Jeff else 2720 7754 Jeff zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2721 10922 Jeff } 2722 7754 Jeff 2723 7754 Jeff if ((zio->io_type == ZIO_TYPE_READ || 2724 7754 Jeff zio->io_type == ZIO_TYPE_FREE) && 2725 7754 Jeff zio->io_error == ENXIO && 2726 11147 George spa_load_state(spa) == SPA_LOAD_NONE && 2727 7754 Jeff spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 2728 7754 Jeff zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2729 7754 Jeff 2730 7754 Jeff if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 2731 7754 Jeff zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2732 10614 Jonathan 2733 10614 Jonathan /* 2734 10614 Jonathan * Here is a possibly good place to attempt to do 2735 10614 Jonathan * either combinatorial reconstruction or error correction 2736 10614 Jonathan * based on checksums. It also might be a good place 2737 10614 Jonathan * to send out preliminary ereports before we suspend 2738 10614 Jonathan * processing. 2739 10614 Jonathan */ 2740 7754 Jeff } 2741 7754 Jeff 2742 7754 Jeff /* 2743 7754 Jeff * If there were logical child errors, they apply to us now. 
2744 7754 Jeff * We defer this until now to avoid conflating logical child 2745 7754 Jeff * errors with errors that happened to the zio itself when 2746 7754 Jeff * updating vdev stats and reporting FMA events above. 2747 7754 Jeff */ 2748 7754 Jeff zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 2749 7754 Jeff 2750 10922 Jeff if ((zio->io_error || zio->io_reexecute) && 2751 10922 Jeff IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 2752 10922 Jeff !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) 2753 9443 Bill zio_dva_unallocate(zio, zio->io_gang_tree, bp); 2754 9443 Bill 2755 9443 Bill zio_gang_tree_free(&zio->io_gang_tree); 2756 9443 Bill 2757 9470 George /* 2758 9470 George * Godfather I/Os should never suspend. 2759 9470 George */ 2760 9470 George if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 2761 9470 George (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 2762 9470 George zio->io_reexecute = 0; 2763 9470 George 2764 9470 George if (zio->io_reexecute) { 2765 7754 Jeff /* 2766 7754 Jeff * This is a logical I/O that wants to reexecute. 2767 7754 Jeff * 2768 7754 Jeff * Reexecute is top-down. When an i/o fails, if it's not 2769 7754 Jeff * the root, it simply notifies its parent and sticks around. 2770 7754 Jeff * The parent, seeing that it still has children in zio_done(), 2771 7754 Jeff * does the same. This percolates all the way up to the root. 2772 7754 Jeff * The root i/o will reexecute or suspend the entire tree. 2773 7754 Jeff * 2774 7754 Jeff * This approach ensures that zio_reexecute() honors 2775 7754 Jeff * all the original i/o dependency relationships, e.g. 2776 7754 Jeff * parents not executing until children are ready. 
2777 7754 Jeff */ 2778 7754 Jeff ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2779 7754 Jeff 2780 9443 Bill zio->io_gang_leader = NULL; 2781 7754 Jeff 2782 8632 Bill mutex_enter(&zio->io_lock); 2783 8632 Bill zio->io_state[ZIO_WAIT_DONE] = 1; 2784 8632 Bill mutex_exit(&zio->io_lock); 2785 9234 George 2786 9234 George /* 2787 9234 George * "The Godfather" I/O monitors its children but is 2788 9234 George * not a true parent to them. It will track them through 2789 9234 George * the pipeline but severs its ties whenever they get into 2790 9234 George * trouble (e.g. suspended). This allows "The Godfather" 2791 9234 George * I/O to return status without blocking. 2792 9234 George */ 2793 9234 George for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 2794 9234 George zio_link_t *zl = zio->io_walk_link; 2795 9234 George pio_next = zio_walk_parents(zio); 2796 9234 George 2797 9234 George if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 2798 9234 George (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 2799 9234 George zio_remove_child(pio, zio, zl); 2800 9234 George zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2801 9234 George } 2802 9234 George } 2803 8632 Bill 2804 8632 Bill if ((pio = zio_unique_parent(zio)) != NULL) { 2805 7754 Jeff /* 2806 7754 Jeff * We're not a root i/o, so there's nothing to do 2807 7754 Jeff * but notify our parent. Don't propagate errors 2808 7754 Jeff * upward since we haven't permanently failed yet. 2809 7754 Jeff */ 2810 9470 George ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 2811 7754 Jeff zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 2812 7754 Jeff zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2813 7754 Jeff } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 2814 7754 Jeff /* 2815 7754 Jeff * We'd fail again if we reexecuted now, so suspend 2816 7754 Jeff * until conditions improve (e.g. device comes online). 
2817 7754 Jeff */ 2818 7754 Jeff zio_suspend(spa, zio); 2819 7754 Jeff } else { 2820 7754 Jeff /* 2821 7754 Jeff * Reexecution is potentially a huge amount of work. 2822 7754 Jeff * Hand it off to the otherwise-unused claim taskq. 2823 7754 Jeff */ 2824 7754 Jeff (void) taskq_dispatch( 2825 7754 Jeff spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 2826 7754 Jeff (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 2827 7754 Jeff } 2828 7754 Jeff return (ZIO_PIPELINE_STOP); 2829 7754 Jeff } 2830 7754 Jeff 2831 10922 Jeff ASSERT(zio->io_child_count == 0); 2832 9470 George ASSERT(zio->io_reexecute == 0); 2833 7754 Jeff ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 2834 7754 Jeff 2835 10922 Jeff /* 2836 10922 Jeff * Report any checksum errors, since the I/O is complete. 2837 10922 Jeff */ 2838 10614 Jonathan while (zio->io_cksum_report != NULL) { 2839 10922 Jeff zio_cksum_report_t *zcr = zio->io_cksum_report; 2840 10922 Jeff zio->io_cksum_report = zcr->zcr_next; 2841 10922 Jeff zcr->zcr_next = NULL; 2842 10922 Jeff zcr->zcr_finish(zcr, NULL); 2843 10922 Jeff zfs_ereport_free_checksum(zcr); 2844 10614 Jonathan } 2845 10614 Jonathan 2846 8632 Bill /* 2847 8632 Bill * It is the responsibility of the done callback to ensure that this 2848 8632 Bill * particular zio is no longer discoverable for adoption, and as 2849 8632 Bill * such, cannot acquire any new parents. 
2850 8632 Bill */ 2851 7754 Jeff if (zio->io_done) 2852 7754 Jeff zio->io_done(zio); 2853 7754 Jeff 2854 8632 Bill mutex_enter(&zio->io_lock); 2855 8632 Bill zio->io_state[ZIO_WAIT_DONE] = 1; 2856 8632 Bill mutex_exit(&zio->io_lock); 2857 7754 Jeff 2858 8632 Bill for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 2859 8632 Bill zio_link_t *zl = zio->io_walk_link; 2860 8632 Bill pio_next = zio_walk_parents(zio); 2861 8632 Bill zio_remove_child(pio, zio, zl); 2862 7754 Jeff zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2863 7754 Jeff } 2864 7754 Jeff 2865 7754 Jeff if (zio->io_waiter != NULL) { 2866 7754 Jeff mutex_enter(&zio->io_lock); 2867 7754 Jeff zio->io_executor = NULL; 2868 7754 Jeff cv_broadcast(&zio->io_cv); 2869 7754 Jeff mutex_exit(&zio->io_lock); 2870 7754 Jeff } else { 2871 7754 Jeff zio_destroy(zio); 2872 7754 Jeff } 2873 7754 Jeff 2874 7754 Jeff return (ZIO_PIPELINE_STOP); 2875 7754 Jeff } 2876 7754 Jeff 2877 7754 Jeff /* 2878 7754 Jeff * ========================================================================== 2879 7754 Jeff * I/O pipeline definition 2880 7754 Jeff * ========================================================================== 2881 7754 Jeff */ 2882 10922 Jeff static zio_pipe_stage_t *zio_pipeline[] = { 2883 5530 bonwick NULL, 2884 10922 Jeff zio_read_bp_init, 2885 10922 Jeff zio_free_bp_init, 2886 5530 bonwick zio_issue_async, 2887 7754 Jeff zio_write_bp_init, 2888 789 ahrens zio_checksum_generate, 2889 10922 Jeff zio_ddt_read_start, 2890 10922 Jeff zio_ddt_read_done, 2891 10922 Jeff zio_ddt_write, 2892 10922 Jeff zio_ddt_free, 2893 7754 Jeff zio_gang_assemble, 2894 7754 Jeff zio_gang_issue, 2895 789 ahrens zio_dva_allocate, 2896 789 ahrens zio_dva_free, 2897 789 ahrens zio_dva_claim, 2898 789 ahrens zio_ready, 2899 789 ahrens zio_vdev_io_start, 2900 789 ahrens zio_vdev_io_done, 2901 789 ahrens zio_vdev_io_assess, 2902 789 ahrens zio_checksum_verify, 2903 7754 Jeff zio_done 2904 789 ahrens }; 2905