1 789 ahrens /* 2 789 ahrens * CDDL HEADER START 3 789 ahrens * 4 789 ahrens * The contents of this file are subject to the terms of the 5 1485 lling * Common Development and Distribution License (the "License"). 6 1485 lling * You may not use this file except in compliance with the License. 7 789 ahrens * 8 789 ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 789 ahrens * or http://www.opensolaris.org/os/licensing. 10 789 ahrens * See the License for the specific language governing permissions 11 789 ahrens * and limitations under the License. 12 789 ahrens * 13 789 ahrens * When distributing Covered Code, include this CDDL HEADER in each 14 789 ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 789 ahrens * If applicable, add the following below this CDDL HEADER, with the 16 789 ahrens * fields enclosed by brackets "[]" replaced with your own identifying 17 789 ahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18 789 ahrens * 19 789 ahrens * CDDL HEADER END 20 789 ahrens */ 21 2082 eschrock 22 789 ahrens /* 23 8632 Bill * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 789 ahrens * Use is subject to license terms. 25 789 ahrens */ 26 789 ahrens 27 789 ahrens #include <sys/zfs_context.h> 28 1544 eschrock #include <sys/fm/fs/zfs.h> 29 789 ahrens #include <sys/spa.h> 30 789 ahrens #include <sys/spa_impl.h> 31 789 ahrens #include <sys/dmu.h> 32 789 ahrens #include <sys/dmu_tx.h> 33 789 ahrens #include <sys/vdev_impl.h> 34 789 ahrens #include <sys/uberblock_impl.h> 35 789 ahrens #include <sys/metaslab.h> 36 789 ahrens #include <sys/metaslab_impl.h> 37 789 ahrens #include <sys/space_map.h> 38 789 ahrens #include <sys/zio.h> 39 789 ahrens #include <sys/zap.h> 40 789 ahrens #include <sys/fs/zfs.h> 41 6643 eschrock #include <sys/arc.h> 42 9701 George #include <sys/zil.h> 43 789 ahrens 44 789 ahrens /* 45 789 ahrens * Virtual device management. 
46 789 ahrens */ 47 789 ahrens 48 789 ahrens static vdev_ops_t *vdev_ops_table[] = { 49 789 ahrens &vdev_root_ops, 50 789 ahrens &vdev_raidz_ops, 51 789 ahrens &vdev_mirror_ops, 52 789 ahrens &vdev_replacing_ops, 53 2082 eschrock &vdev_spare_ops, 54 789 ahrens &vdev_disk_ops, 55 789 ahrens &vdev_file_ops, 56 789 ahrens &vdev_missing_ops, 57 10594 George &vdev_hole_ops, 58 789 ahrens NULL 59 789 ahrens }; 60 3697 mishra 61 7046 ahrens /* maximum scrub/resilver I/O queue per leaf vdev */ 62 7046 ahrens int zfs_scrub_limit = 10; 63 789 ahrens 64 789 ahrens /* 65 789 ahrens * Given a vdev type, return the appropriate ops vector. 66 789 ahrens */ 67 789 ahrens static vdev_ops_t * 68 789 ahrens vdev_getops(const char *type) 69 789 ahrens { 70 789 ahrens vdev_ops_t *ops, **opspp; 71 789 ahrens 72 789 ahrens for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 73 789 ahrens if (strcmp(ops->vdev_op_type, type) == 0) 74 789 ahrens break; 75 789 ahrens 76 789 ahrens return (ops); 77 789 ahrens } 78 789 ahrens 79 789 ahrens /* 80 789 ahrens * Default asize function: return the MAX of psize with the asize of 81 789 ahrens * all children. This is what's used by anything other than RAID-Z. 82 789 ahrens */ 83 789 ahrens uint64_t 84 789 ahrens vdev_default_asize(vdev_t *vd, uint64_t psize) 85 789 ahrens { 86 1732 bonwick uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 87 789 ahrens uint64_t csize; 88 789 ahrens 89 9816 George for (int c = 0; c < vd->vdev_children; c++) { 90 789 ahrens csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 91 789 ahrens asize = MAX(asize, csize); 92 789 ahrens } 93 789 ahrens 94 789 ahrens return (asize); 95 1175 lling } 96 1175 lling 97 1175 lling /* 98 9816 George * Get the minimum allocatable size. We define the allocatable size as 99 9816 George * the vdev's asize rounded to the nearest metaslab. 
This allows us to 100 9816 George * replace or attach devices which don't have the same physical size but 101 9816 George * can still satisfy the same number of allocations. 102 1175 lling */ 103 1175 lling uint64_t 104 9816 George vdev_get_min_asize(vdev_t *vd) 105 1175 lling { 106 9816 George vdev_t *pvd = vd->vdev_parent; 107 1175 lling 108 1175 lling /* 109 9816 George * The our parent is NULL (inactive spare or cache) or is the root, 110 9816 George * just return our own asize. 111 1175 lling */ 112 9816 George if (pvd == NULL) 113 9816 George return (vd->vdev_asize); 114 1175 lling 115 9816 George /* 116 9816 George * The top-level vdev just returns the allocatable size rounded 117 9816 George * to the nearest metaslab. 118 9816 George */ 119 9816 George if (vd == vd->vdev_top) 120 9816 George return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); 121 1175 lling 122 9816 George /* 123 9816 George * The allocatable space for a raidz vdev is N * sizeof(smallest child), 124 9816 George * so each child must provide at least 1/Nth of its asize. 
125 9816 George */ 126 9816 George if (pvd->vdev_ops == &vdev_raidz_ops) 127 9816 George return (pvd->vdev_min_asize / pvd->vdev_children); 128 1175 lling 129 9816 George return (pvd->vdev_min_asize); 130 9816 George } 131 9816 George 132 9816 George void 133 9816 George vdev_set_min_asize(vdev_t *vd) 134 9816 George { 135 9816 George vd->vdev_min_asize = vdev_get_min_asize(vd); 136 9816 George 137 9816 George for (int c = 0; c < vd->vdev_children; c++) 138 9816 George vdev_set_min_asize(vd->vdev_child[c]); 139 789 ahrens } 140 789 ahrens 141 789 ahrens vdev_t * 142 789 ahrens vdev_lookup_top(spa_t *spa, uint64_t vdev) 143 789 ahrens { 144 789 ahrens vdev_t *rvd = spa->spa_root_vdev; 145 5530 bonwick 146 7754 Jeff ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 147 789 ahrens 148 7046 ahrens if (vdev < rvd->vdev_children) { 149 7046 ahrens ASSERT(rvd->vdev_child[vdev] != NULL); 150 789 ahrens return (rvd->vdev_child[vdev]); 151 7046 ahrens } 152 789 ahrens 153 789 ahrens return (NULL); 154 789 ahrens } 155 789 ahrens 156 789 ahrens vdev_t * 157 789 ahrens vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 158 789 ahrens { 159 789 ahrens vdev_t *mvd; 160 789 ahrens 161 1585 bonwick if (vd->vdev_guid == guid) 162 789 ahrens return (vd); 163 789 ahrens 164 9816 George for (int c = 0; c < vd->vdev_children; c++) 165 789 ahrens if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 166 789 ahrens NULL) 167 789 ahrens return (mvd); 168 789 ahrens 169 789 ahrens return (NULL); 170 789 ahrens } 171 789 ahrens 172 789 ahrens void 173 789 ahrens vdev_add_child(vdev_t *pvd, vdev_t *cvd) 174 789 ahrens { 175 789 ahrens size_t oldsize, newsize; 176 789 ahrens uint64_t id = cvd->vdev_id; 177 789 ahrens vdev_t **newchild; 178 789 ahrens 179 7754 Jeff ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 180 789 ahrens ASSERT(cvd->vdev_parent == NULL); 181 789 ahrens 182 789 ahrens cvd->vdev_parent = pvd; 183 789 ahrens 184 789 ahrens if (pvd == NULL) 185 
789 ahrens return; 186 789 ahrens 187 789 ahrens ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 188 789 ahrens 189 789 ahrens oldsize = pvd->vdev_children * sizeof (vdev_t *); 190 789 ahrens pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 191 789 ahrens newsize = pvd->vdev_children * sizeof (vdev_t *); 192 789 ahrens 193 789 ahrens newchild = kmem_zalloc(newsize, KM_SLEEP); 194 789 ahrens if (pvd->vdev_child != NULL) { 195 789 ahrens bcopy(pvd->vdev_child, newchild, oldsize); 196 789 ahrens kmem_free(pvd->vdev_child, oldsize); 197 789 ahrens } 198 789 ahrens 199 789 ahrens pvd->vdev_child = newchild; 200 789 ahrens pvd->vdev_child[id] = cvd; 201 789 ahrens 202 789 ahrens cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 203 789 ahrens ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 204 789 ahrens 205 789 ahrens /* 206 789 ahrens * Walk up all ancestors to update guid sum. 207 789 ahrens */ 208 789 ahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 209 789 ahrens pvd->vdev_guid_sum += cvd->vdev_guid_sum; 210 3697 mishra 211 3697 mishra if (cvd->vdev_ops->vdev_op_leaf) 212 3697 mishra cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; 213 789 ahrens } 214 789 ahrens 215 789 ahrens void 216 789 ahrens vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 217 789 ahrens { 218 789 ahrens int c; 219 789 ahrens uint_t id = cvd->vdev_id; 220 789 ahrens 221 789 ahrens ASSERT(cvd->vdev_parent == pvd); 222 789 ahrens 223 789 ahrens if (pvd == NULL) 224 789 ahrens return; 225 789 ahrens 226 789 ahrens ASSERT(id < pvd->vdev_children); 227 789 ahrens ASSERT(pvd->vdev_child[id] == cvd); 228 789 ahrens 229 789 ahrens pvd->vdev_child[id] = NULL; 230 789 ahrens cvd->vdev_parent = NULL; 231 789 ahrens 232 789 ahrens for (c = 0; c < pvd->vdev_children; c++) 233 789 ahrens if (pvd->vdev_child[c]) 234 789 ahrens break; 235 789 ahrens 236 789 ahrens if (c == pvd->vdev_children) { 237 789 ahrens kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 238 789 
ahrens pvd->vdev_child = NULL; 239 789 ahrens pvd->vdev_children = 0; 240 789 ahrens } 241 789 ahrens 242 789 ahrens /* 243 789 ahrens * Walk up all ancestors to update guid sum. 244 789 ahrens */ 245 789 ahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 246 789 ahrens pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 247 3697 mishra 248 3697 mishra if (cvd->vdev_ops->vdev_op_leaf) 249 3697 mishra cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; 250 789 ahrens } 251 789 ahrens 252 789 ahrens /* 253 789 ahrens * Remove any holes in the child array. 254 789 ahrens */ 255 789 ahrens void 256 789 ahrens vdev_compact_children(vdev_t *pvd) 257 789 ahrens { 258 789 ahrens vdev_t **newchild, *cvd; 259 789 ahrens int oldc = pvd->vdev_children; 260 9816 George int newc; 261 789 ahrens 262 7754 Jeff ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 263 789 ahrens 264 9816 George for (int c = newc = 0; c < oldc; c++) 265 789 ahrens if (pvd->vdev_child[c]) 266 789 ahrens newc++; 267 789 ahrens 268 789 ahrens newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 269 789 ahrens 270 9816 George for (int c = newc = 0; c < oldc; c++) { 271 789 ahrens if ((cvd = pvd->vdev_child[c]) != NULL) { 272 789 ahrens newchild[newc] = cvd; 273 789 ahrens cvd->vdev_id = newc++; 274 789 ahrens } 275 789 ahrens } 276 789 ahrens 277 789 ahrens kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 278 789 ahrens pvd->vdev_child = newchild; 279 789 ahrens pvd->vdev_children = newc; 280 789 ahrens } 281 789 ahrens 282 789 ahrens /* 283 789 ahrens * Allocate and minimally initialize a vdev_t. 
284 789 ahrens */ 285 10594 George vdev_t * 286 789 ahrens vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 287 789 ahrens { 288 789 ahrens vdev_t *vd; 289 789 ahrens 290 1585 bonwick vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 291 789 ahrens 292 1585 bonwick if (spa->spa_root_vdev == NULL) { 293 1585 bonwick ASSERT(ops == &vdev_root_ops); 294 1585 bonwick spa->spa_root_vdev = vd; 295 1585 bonwick } 296 1585 bonwick 297 10594 George if (guid == 0 && ops != &vdev_hole_ops) { 298 1585 bonwick if (spa->spa_root_vdev == vd) { 299 1585 bonwick /* 300 1585 bonwick * The root vdev's guid will also be the pool guid, 301 1585 bonwick * which must be unique among all pools. 302 1585 bonwick */ 303 1585 bonwick while (guid == 0 || spa_guid_exists(guid, 0)) 304 1585 bonwick guid = spa_get_random(-1ULL); 305 1585 bonwick } else { 306 1585 bonwick /* 307 1585 bonwick * Any other vdev's guid must be unique within the pool. 308 1585 bonwick */ 309 1585 bonwick while (guid == 0 || 310 1585 bonwick spa_guid_exists(spa_guid(spa), guid)) 311 1585 bonwick guid = spa_get_random(-1ULL); 312 1585 bonwick } 313 1585 bonwick ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 314 1585 bonwick } 315 789 ahrens 316 789 ahrens vd->vdev_spa = spa; 317 789 ahrens vd->vdev_id = id; 318 789 ahrens vd->vdev_guid = guid; 319 789 ahrens vd->vdev_guid_sum = guid; 320 789 ahrens vd->vdev_ops = ops; 321 789 ahrens vd->vdev_state = VDEV_STATE_CLOSED; 322 10594 George vd->vdev_ishole = (ops == &vdev_hole_ops); 323 789 ahrens 324 789 ahrens mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 325 2856 nd150628 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 326 7754 Jeff mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); 327 8241 Jeff for (int t = 0; t < DTL_TYPES; t++) { 328 8241 Jeff space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0, 329 8241 Jeff &vd->vdev_dtl_lock); 330 8241 Jeff } 331 789 ahrens txg_list_create(&vd->vdev_ms_list, 332 789 ahrens 
offsetof(struct metaslab, ms_txg_node)); 333 789 ahrens txg_list_create(&vd->vdev_dtl_list, 334 789 ahrens offsetof(struct vdev, vdev_dtl_node)); 335 789 ahrens vd->vdev_stat.vs_timestamp = gethrtime(); 336 4451 eschrock vdev_queue_init(vd); 337 4451 eschrock vdev_cache_init(vd); 338 789 ahrens 339 789 ahrens return (vd); 340 789 ahrens } 341 789 ahrens 342 789 ahrens /* 343 789 ahrens * Allocate a new vdev. The 'alloctype' is used to control whether we are 344 789 ahrens * creating a new vdev or loading an existing one - the behavior is slightly 345 789 ahrens * different for each case. 346 789 ahrens */ 347 2082 eschrock int 348 2082 eschrock vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 349 2082 eschrock int alloctype) 350 789 ahrens { 351 789 ahrens vdev_ops_t *ops; 352 789 ahrens char *type; 353 4527 perrin uint64_t guid = 0, islog, nparity; 354 789 ahrens vdev_t *vd; 355 789 ahrens 356 7754 Jeff ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 357 789 ahrens 358 789 ahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 359 2082 eschrock return (EINVAL); 360 789 ahrens 361 789 ahrens if ((ops = vdev_getops(type)) == NULL) 362 2082 eschrock return (EINVAL); 363 789 ahrens 364 789 ahrens /* 365 789 ahrens * If this is a load, get the vdev guid from the nvlist. 366 789 ahrens * Otherwise, vdev_alloc_common() will generate one for us. 
367 789 ahrens */ 368 789 ahrens if (alloctype == VDEV_ALLOC_LOAD) { 369 789 ahrens uint64_t label_id; 370 789 ahrens 371 789 ahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 372 789 ahrens label_id != id) 373 2082 eschrock return (EINVAL); 374 789 ahrens 375 789 ahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 376 2082 eschrock return (EINVAL); 377 2082 eschrock } else if (alloctype == VDEV_ALLOC_SPARE) { 378 2082 eschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 379 2082 eschrock return (EINVAL); 380 5450 brendan } else if (alloctype == VDEV_ALLOC_L2CACHE) { 381 5450 brendan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 382 5450 brendan return (EINVAL); 383 9790 Lin } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { 384 9790 Lin if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 385 9790 Lin return (EINVAL); 386 789 ahrens } 387 2082 eschrock 388 2082 eschrock /* 389 2082 eschrock * The first allocated vdev must be of type 'root'. 390 2082 eschrock */ 391 2082 eschrock if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 392 2082 eschrock return (EINVAL); 393 789 ahrens 394 4527 perrin /* 395 4527 perrin * Determine whether we're a log vdev. 396 4527 perrin */ 397 4527 perrin islog = 0; 398 4527 perrin (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 399 5094 lling if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 400 4527 perrin return (ENOTSUP); 401 4527 perrin 402 10594 George if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) 403 10594 George return (ENOTSUP); 404 10594 George 405 4527 perrin /* 406 4527 perrin * Set the nparity property for RAID-Z vdevs. 
407 4527 perrin */ 408 4527 perrin nparity = -1ULL; 409 4527 perrin if (ops == &vdev_raidz_ops) { 410 4527 perrin if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 411 4527 perrin &nparity) == 0) { 412 10922 Jeff if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 413 4527 perrin return (EINVAL); 414 4527 perrin /* 415 10105 adam * Previous versions could only support 1 or 2 parity 416 10105 adam * device. 417 4527 perrin */ 418 10105 adam if (nparity > 1 && 419 10105 adam spa_version(spa) < SPA_VERSION_RAIDZ2) 420 10105 adam return (ENOTSUP); 421 10105 adam if (nparity > 2 && 422 10105 adam spa_version(spa) < SPA_VERSION_RAIDZ3) 423 4527 perrin return (ENOTSUP); 424 4527 perrin } else { 425 4527 perrin /* 426 4527 perrin * We require the parity to be specified for SPAs that 427 4527 perrin * support multiple parity levels. 428 4527 perrin */ 429 10105 adam if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 430 4527 perrin return (EINVAL); 431 4527 perrin /* 432 4527 perrin * Otherwise, we default to 1 parity device for RAID-Z. 
433 4527 perrin */ 434 4527 perrin nparity = 1; 435 4527 perrin } 436 4527 perrin } else { 437 4527 perrin nparity = 0; 438 4527 perrin } 439 4527 perrin ASSERT(nparity != -1ULL); 440 4527 perrin 441 789 ahrens vd = vdev_alloc_common(spa, id, guid, ops); 442 4527 perrin 443 4527 perrin vd->vdev_islog = islog; 444 4527 perrin vd->vdev_nparity = nparity; 445 789 ahrens 446 789 ahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 447 789 ahrens vd->vdev_path = spa_strdup(vd->vdev_path); 448 789 ahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 449 789 ahrens vd->vdev_devid = spa_strdup(vd->vdev_devid); 450 4451 eschrock if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 451 4451 eschrock &vd->vdev_physpath) == 0) 452 4451 eschrock vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 453 9425 Eric if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) 454 9425 Eric vd->vdev_fru = spa_strdup(vd->vdev_fru); 455 1171 eschrock 456 1171 eschrock /* 457 1171 eschrock * Set the whole_disk property. If it's not specified, leave the value 458 1171 eschrock * as -1. 459 1171 eschrock */ 460 1171 eschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 461 1171 eschrock &vd->vdev_wholedisk) != 0) 462 1171 eschrock vd->vdev_wholedisk = -1ULL; 463 1544 eschrock 464 1544 eschrock /* 465 1544 eschrock * Look for the 'not present' flag. This will only be set if the device 466 1544 eschrock * was not present at the time of import. 467 1544 eschrock */ 468 9425 Eric (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 469 9425 Eric &vd->vdev_not_present); 470 789 ahrens 471 789 ahrens /* 472 1732 bonwick * Get the alignment requirement. 473 1732 bonwick */ 474 1732 bonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 475 10594 George 476 10594 George /* 477 10594 George * Retrieve the vdev creation time. 
478 10594 George */ 479 10594 George (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 480 10594 George &vd->vdev_crtxg); 481 1732 bonwick 482 1732 bonwick /* 483 789 ahrens * If we're a top-level vdev, try to load the allocation parameters. 484 789 ahrens */ 485 789 ahrens if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { 486 789 ahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 487 789 ahrens &vd->vdev_ms_array); 488 789 ahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 489 789 ahrens &vd->vdev_ms_shift); 490 789 ahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 491 789 ahrens &vd->vdev_asize); 492 789 ahrens } 493 789 ahrens 494 10974 Jeff if (parent && !parent->vdev_parent) { 495 10974 Jeff ASSERT(alloctype == VDEV_ALLOC_LOAD || 496 10982 George alloctype == VDEV_ALLOC_ADD || 497 10982 George alloctype == VDEV_ALLOC_ROOTPOOL); 498 10974 Jeff vd->vdev_mg = metaslab_group_create(islog ? 499 10974 Jeff spa_log_class(spa) : spa_normal_class(spa), vd); 500 10974 Jeff } 501 10974 Jeff 502 789 ahrens /* 503 4451 eschrock * If we're a leaf vdev, try to load the DTL object and other state. 
504 789 ahrens */ 505 6643 eschrock if (vd->vdev_ops->vdev_op_leaf && 506 9790 Lin (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || 507 9790 Lin alloctype == VDEV_ALLOC_ROOTPOOL)) { 508 6643 eschrock if (alloctype == VDEV_ALLOC_LOAD) { 509 6643 eschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 510 8241 Jeff &vd->vdev_dtl_smo.smo_object); 511 6643 eschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 512 6643 eschrock &vd->vdev_unspare); 513 6643 eschrock } 514 9790 Lin 515 9790 Lin if (alloctype == VDEV_ALLOC_ROOTPOOL) { 516 9790 Lin uint64_t spare = 0; 517 9790 Lin 518 9790 Lin if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 519 9790 Lin &spare) == 0 && spare) 520 9790 Lin spa_spare_add(vd); 521 9790 Lin } 522 9790 Lin 523 1732 bonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 524 1732 bonwick &vd->vdev_offline); 525 6643 eschrock 526 4451 eschrock /* 527 4451 eschrock * When importing a pool, we want to ignore the persistent fault 528 4451 eschrock * state, as the diagnosis made on another system may not be 529 10817 Eric * valid in the current context. Local vdevs will 530 10817 Eric * remain in the faulted state. 
531 4451 eschrock */ 532 11147 George if (spa_load_state(spa) == SPA_LOAD_OPEN) { 533 4451 eschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 534 4451 eschrock &vd->vdev_faulted); 535 4451 eschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 536 4451 eschrock &vd->vdev_degraded); 537 4451 eschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 538 4451 eschrock &vd->vdev_removed); 539 10817 Eric 540 10817 Eric if (vd->vdev_faulted || vd->vdev_degraded) { 541 10817 Eric char *aux; 542 10817 Eric 543 10817 Eric vd->vdev_label_aux = 544 10817 Eric VDEV_AUX_ERR_EXCEEDED; 545 10817 Eric if (nvlist_lookup_string(nv, 546 10817 Eric ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && 547 10817 Eric strcmp(aux, "external") == 0) 548 10817 Eric vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 549 10817 Eric } 550 4451 eschrock } 551 789 ahrens } 552 789 ahrens 553 789 ahrens /* 554 789 ahrens * Add ourselves to the parent's list of children. 555 789 ahrens */ 556 789 ahrens vdev_add_child(parent, vd); 557 789 ahrens 558 2082 eschrock *vdp = vd; 559 2082 eschrock 560 2082 eschrock return (0); 561 789 ahrens } 562 789 ahrens 563 789 ahrens void 564 789 ahrens vdev_free(vdev_t *vd) 565 789 ahrens { 566 4451 eschrock spa_t *spa = vd->vdev_spa; 567 789 ahrens 568 789 ahrens /* 569 789 ahrens * vdev_free() implies closing the vdev first. This is simpler than 570 789 ahrens * trying to ensure complicated semantics for all callers. 571 789 ahrens */ 572 789 ahrens vdev_close(vd); 573 4451 eschrock 574 7754 Jeff ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 575 10922 Jeff ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 576 789 ahrens 577 789 ahrens /* 578 789 ahrens * Free all children. 
579 789 ahrens */ 580 9816 George for (int c = 0; c < vd->vdev_children; c++) 581 789 ahrens vdev_free(vd->vdev_child[c]); 582 789 ahrens 583 789 ahrens ASSERT(vd->vdev_child == NULL); 584 789 ahrens ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 585 789 ahrens 586 789 ahrens /* 587 789 ahrens * Discard allocation state. 588 789 ahrens */ 589 10974 Jeff if (vd->vdev_mg != NULL) { 590 789 ahrens vdev_metaslab_fini(vd); 591 10974 Jeff metaslab_group_destroy(vd->vdev_mg); 592 10974 Jeff } 593 789 ahrens 594 789 ahrens ASSERT3U(vd->vdev_stat.vs_space, ==, 0); 595 2082 eschrock ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); 596 789 ahrens ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 597 789 ahrens 598 789 ahrens /* 599 789 ahrens * Remove this vdev from its parent's child list. 600 789 ahrens */ 601 789 ahrens vdev_remove_child(vd->vdev_parent, vd); 602 789 ahrens 603 789 ahrens ASSERT(vd->vdev_parent == NULL); 604 789 ahrens 605 4451 eschrock /* 606 4451 eschrock * Clean up vdev structure. 607 4451 eschrock */ 608 4451 eschrock vdev_queue_fini(vd); 609 4451 eschrock vdev_cache_fini(vd); 610 4451 eschrock 611 4451 eschrock if (vd->vdev_path) 612 4451 eschrock spa_strfree(vd->vdev_path); 613 4451 eschrock if (vd->vdev_devid) 614 4451 eschrock spa_strfree(vd->vdev_devid); 615 4451 eschrock if (vd->vdev_physpath) 616 4451 eschrock spa_strfree(vd->vdev_physpath); 617 9425 Eric if (vd->vdev_fru) 618 9425 Eric spa_strfree(vd->vdev_fru); 619 4451 eschrock 620 4451 eschrock if (vd->vdev_isspare) 621 4451 eschrock spa_spare_remove(vd); 622 5450 brendan if (vd->vdev_isl2cache) 623 5450 brendan spa_l2cache_remove(vd); 624 4451 eschrock 625 4451 eschrock txg_list_destroy(&vd->vdev_ms_list); 626 4451 eschrock txg_list_destroy(&vd->vdev_dtl_list); 627 8241 Jeff 628 4451 eschrock mutex_enter(&vd->vdev_dtl_lock); 629 8241 Jeff for (int t = 0; t < DTL_TYPES; t++) { 630 8241 Jeff space_map_unload(&vd->vdev_dtl[t]); 631 8241 Jeff space_map_destroy(&vd->vdev_dtl[t]); 632 8241 Jeff } 633 4451 eschrock 
mutex_exit(&vd->vdev_dtl_lock); 634 8241 Jeff 635 4451 eschrock mutex_destroy(&vd->vdev_dtl_lock); 636 4451 eschrock mutex_destroy(&vd->vdev_stat_lock); 637 7754 Jeff mutex_destroy(&vd->vdev_probe_lock); 638 4451 eschrock 639 4451 eschrock if (vd == spa->spa_root_vdev) 640 4451 eschrock spa->spa_root_vdev = NULL; 641 4451 eschrock 642 4451 eschrock kmem_free(vd, sizeof (vdev_t)); 643 789 ahrens } 644 789 ahrens 645 789 ahrens /* 646 789 ahrens * Transfer top-level vdev state from svd to tvd. 647 789 ahrens */ 648 789 ahrens static void 649 789 ahrens vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 650 789 ahrens { 651 789 ahrens spa_t *spa = svd->vdev_spa; 652 789 ahrens metaslab_t *msp; 653 789 ahrens vdev_t *vd; 654 789 ahrens int t; 655 789 ahrens 656 789 ahrens ASSERT(tvd == tvd->vdev_top); 657 789 ahrens 658 789 ahrens tvd->vdev_ms_array = svd->vdev_ms_array; 659 789 ahrens tvd->vdev_ms_shift = svd->vdev_ms_shift; 660 789 ahrens tvd->vdev_ms_count = svd->vdev_ms_count; 661 789 ahrens 662 789 ahrens svd->vdev_ms_array = 0; 663 789 ahrens svd->vdev_ms_shift = 0; 664 789 ahrens svd->vdev_ms_count = 0; 665 789 ahrens 666 789 ahrens tvd->vdev_mg = svd->vdev_mg; 667 789 ahrens tvd->vdev_ms = svd->vdev_ms; 668 789 ahrens 669 789 ahrens svd->vdev_mg = NULL; 670 789 ahrens svd->vdev_ms = NULL; 671 1732 bonwick 672 1732 bonwick if (tvd->vdev_mg != NULL) 673 1732 bonwick tvd->vdev_mg->mg_vd = tvd; 674 789 ahrens 675 789 ahrens tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 676 789 ahrens tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 677 2082 eschrock tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 678 789 ahrens 679 789 ahrens svd->vdev_stat.vs_alloc = 0; 680 789 ahrens svd->vdev_stat.vs_space = 0; 681 2082 eschrock svd->vdev_stat.vs_dspace = 0; 682 789 ahrens 683 789 ahrens for (t = 0; t < TXG_SIZE; t++) { 684 789 ahrens while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 685 789 ahrens (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 686 789 ahrens 
while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 687 789 ahrens (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 688 789 ahrens if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 689 789 ahrens (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 690 789 ahrens } 691 789 ahrens 692 7754 Jeff if (list_link_active(&svd->vdev_config_dirty_node)) { 693 789 ahrens vdev_config_clean(svd); 694 789 ahrens vdev_config_dirty(tvd); 695 7754 Jeff } 696 7754 Jeff 697 7754 Jeff if (list_link_active(&svd->vdev_state_dirty_node)) { 698 7754 Jeff vdev_state_clean(svd); 699 7754 Jeff vdev_state_dirty(tvd); 700 789 ahrens } 701 2082 eschrock 702 2082 eschrock tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 703 2082 eschrock svd->vdev_deflate_ratio = 0; 704 4527 perrin 705 4527 perrin tvd->vdev_islog = svd->vdev_islog; 706 4527 perrin svd->vdev_islog = 0; 707 789 ahrens } 708 789 ahrens 709 789 ahrens static void 710 789 ahrens vdev_top_update(vdev_t *tvd, vdev_t *vd) 711 789 ahrens { 712 789 ahrens if (vd == NULL) 713 789 ahrens return; 714 789 ahrens 715 789 ahrens vd->vdev_top = tvd; 716 789 ahrens 717 9816 George for (int c = 0; c < vd->vdev_children; c++) 718 789 ahrens vdev_top_update(tvd, vd->vdev_child[c]); 719 789 ahrens } 720 789 ahrens 721 789 ahrens /* 722 789 ahrens * Add a mirror/replacing vdev above an existing vdev. 
723 789 ahrens */ 724 789 ahrens vdev_t * 725 789 ahrens vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 726 789 ahrens { 727 789 ahrens spa_t *spa = cvd->vdev_spa; 728 789 ahrens vdev_t *pvd = cvd->vdev_parent; 729 789 ahrens vdev_t *mvd; 730 789 ahrens 731 7754 Jeff ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 732 789 ahrens 733 789 ahrens mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 734 1732 bonwick 735 1732 bonwick mvd->vdev_asize = cvd->vdev_asize; 736 9816 George mvd->vdev_min_asize = cvd->vdev_min_asize; 737 1732 bonwick mvd->vdev_ashift = cvd->vdev_ashift; 738 1732 bonwick mvd->vdev_state = cvd->vdev_state; 739 10594 George mvd->vdev_crtxg = cvd->vdev_crtxg; 740 1732 bonwick 741 789 ahrens vdev_remove_child(pvd, cvd); 742 789 ahrens vdev_add_child(pvd, mvd); 743 789 ahrens cvd->vdev_id = mvd->vdev_children; 744 789 ahrens vdev_add_child(mvd, cvd); 745 789 ahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 746 789 ahrens 747 789 ahrens if (mvd == mvd->vdev_top) 748 789 ahrens vdev_top_transfer(cvd, mvd); 749 789 ahrens 750 789 ahrens return (mvd); 751 789 ahrens } 752 789 ahrens 753 789 ahrens /* 754 789 ahrens * Remove a 1-way mirror/replacing vdev from the tree. 
755 789 ahrens */ 756 789 ahrens void 757 789 ahrens vdev_remove_parent(vdev_t *cvd) 758 789 ahrens { 759 789 ahrens vdev_t *mvd = cvd->vdev_parent; 760 789 ahrens vdev_t *pvd = mvd->vdev_parent; 761 789 ahrens 762 7754 Jeff ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 763 789 ahrens 764 789 ahrens ASSERT(mvd->vdev_children == 1); 765 789 ahrens ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 766 2082 eschrock mvd->vdev_ops == &vdev_replacing_ops || 767 2082 eschrock mvd->vdev_ops == &vdev_spare_ops); 768 1732 bonwick cvd->vdev_ashift = mvd->vdev_ashift; 769 789 ahrens 770 789 ahrens vdev_remove_child(mvd, cvd); 771 789 ahrens vdev_remove_child(pvd, mvd); 772 8241 Jeff 773 7754 Jeff /* 774 7754 Jeff * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. 775 7754 Jeff * Otherwise, we could have detached an offline device, and when we 776 7754 Jeff * go to import the pool we'll think we have two top-level vdevs, 777 7754 Jeff * instead of a different version of the same top-level vdev. 
778 7754 Jeff */ 779 8241 Jeff if (mvd->vdev_top == mvd) { 780 8241 Jeff uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; 781 8241 Jeff cvd->vdev_guid += guid_delta; 782 8241 Jeff cvd->vdev_guid_sum += guid_delta; 783 8241 Jeff } 784 789 ahrens cvd->vdev_id = mvd->vdev_id; 785 789 ahrens vdev_add_child(pvd, cvd); 786 789 ahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 787 789 ahrens 788 789 ahrens if (cvd == cvd->vdev_top) 789 789 ahrens vdev_top_transfer(mvd, cvd); 790 789 ahrens 791 789 ahrens ASSERT(mvd->vdev_children == 0); 792 789 ahrens vdev_free(mvd); 793 789 ahrens } 794 789 ahrens 795 1544 eschrock int 796 789 ahrens vdev_metaslab_init(vdev_t *vd, uint64_t txg) 797 789 ahrens { 798 789 ahrens spa_t *spa = vd->vdev_spa; 799 1732 bonwick objset_t *mos = spa->spa_meta_objset; 800 1732 bonwick uint64_t m; 801 789 ahrens uint64_t oldc = vd->vdev_ms_count; 802 789 ahrens uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 803 1732 bonwick metaslab_t **mspp; 804 1732 bonwick int error; 805 10974 Jeff 806 10974 Jeff ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 807 1585 bonwick 808 10594 George /* 809 10594 George * This vdev is not being allocated from yet or is a hole. 810 10594 George */ 811 10594 George if (vd->vdev_ms_shift == 0) 812 1585 bonwick return (0); 813 10594 George 814 10594 George ASSERT(!vd->vdev_ishole); 815 789 ahrens 816 9701 George /* 817 9701 George * Compute the raidz-deflation ratio. Note, we hard-code 818 9701 George * in 128k (1 << 17) because it is the current "typical" blocksize. 819 9701 George * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, 820 9701 George * or we will inconsistently account for existing bp's. 
821 9701 George */ 822 9701 George vd->vdev_deflate_ratio = (1 << 17) / 823 9701 George (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); 824 9701 George 825 789 ahrens ASSERT(oldc <= newc); 826 1732 bonwick 827 1732 bonwick mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 828 1732 bonwick 829 1732 bonwick if (oldc != 0) { 830 1732 bonwick bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); 831 1732 bonwick kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 832 1732 bonwick } 833 1732 bonwick 834 1732 bonwick vd->vdev_ms = mspp; 835 789 ahrens vd->vdev_ms_count = newc; 836 789 ahrens 837 1732 bonwick for (m = oldc; m < newc; m++) { 838 1732 bonwick space_map_obj_t smo = { 0, 0, 0 }; 839 789 ahrens if (txg == 0) { 840 1732 bonwick uint64_t object = 0; 841 1732 bonwick error = dmu_read(mos, vd->vdev_ms_array, 842 9512 Neil m * sizeof (uint64_t), sizeof (uint64_t), &object, 843 9512 Neil DMU_READ_PREFETCH); 844 1732 bonwick if (error) 845 1732 bonwick return (error); 846 1732 bonwick if (object != 0) { 847 1732 bonwick dmu_buf_t *db; 848 1732 bonwick error = dmu_bonus_hold(mos, object, FTAG, &db); 849 1732 bonwick if (error) 850 1732 bonwick return (error); 851 4944 maybee ASSERT3U(db->db_size, >=, sizeof (smo)); 852 4944 maybee bcopy(db->db_data, &smo, sizeof (smo)); 853 1732 bonwick ASSERT3U(smo.smo_object, ==, object); 854 1544 eschrock dmu_buf_rele(db, FTAG); 855 789 ahrens } 856 789 ahrens } 857 1732 bonwick vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, 858 1732 bonwick m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); 859 789 ahrens } 860 789 ahrens 861 10974 Jeff if (txg == 0) 862 10974 Jeff spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); 863 10974 Jeff 864 10974 Jeff if (oldc == 0) 865 10974 Jeff metaslab_group_activate(vd->vdev_mg); 866 10974 Jeff 867 10974 Jeff if (txg == 0) 868 10974 Jeff spa_config_exit(spa, SCL_ALLOC, FTAG); 869 10974 Jeff 870 1544 eschrock return (0); 871 789 ahrens } 872 789 ahrens 873 789 ahrens void 874 789 ahrens 
vdev_metaslab_fini(vdev_t *vd) 875 789 ahrens { 876 789 ahrens uint64_t m; 877 789 ahrens uint64_t count = vd->vdev_ms_count; 878 789 ahrens 879 789 ahrens if (vd->vdev_ms != NULL) { 880 10974 Jeff metaslab_group_passivate(vd->vdev_mg); 881 789 ahrens for (m = 0; m < count; m++) 882 1732 bonwick if (vd->vdev_ms[m] != NULL) 883 1732 bonwick metaslab_fini(vd->vdev_ms[m]); 884 789 ahrens kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 885 789 ahrens vd->vdev_ms = NULL; 886 789 ahrens } 887 789 ahrens } 888 789 ahrens 889 7754 Jeff typedef struct vdev_probe_stats { 890 7754 Jeff boolean_t vps_readable; 891 7754 Jeff boolean_t vps_writeable; 892 7754 Jeff int vps_flags; 893 7754 Jeff } vdev_probe_stats_t; 894 7754 Jeff 895 7754 Jeff static void 896 7754 Jeff vdev_probe_done(zio_t *zio) 897 5329 gw25295 { 898 8241 Jeff spa_t *spa = zio->io_spa; 899 8632 Bill vdev_t *vd = zio->io_vd; 900 7754 Jeff vdev_probe_stats_t *vps = zio->io_private; 901 8632 Bill 902 8632 Bill ASSERT(vd->vdev_probe_zio != NULL); 903 5329 gw25295 904 7754 Jeff if (zio->io_type == ZIO_TYPE_READ) { 905 7754 Jeff if (zio->io_error == 0) 906 7754 Jeff vps->vps_readable = 1; 907 8241 Jeff if (zio->io_error == 0 && spa_writeable(spa)) { 908 8632 Bill zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, 909 7754 Jeff zio->io_offset, zio->io_size, zio->io_data, 910 7754 Jeff ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 911 7754 Jeff ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); 912 7754 Jeff } else { 913 7754 Jeff zio_buf_free(zio->io_data, zio->io_size); 914 7754 Jeff } 915 7754 Jeff } else if (zio->io_type == ZIO_TYPE_WRITE) { 916 7754 Jeff if (zio->io_error == 0) 917 7754 Jeff vps->vps_writeable = 1; 918 7754 Jeff zio_buf_free(zio->io_data, zio->io_size); 919 7754 Jeff } else if (zio->io_type == ZIO_TYPE_NULL) { 920 8632 Bill zio_t *pio; 921 5329 gw25295 922 7754 Jeff vd->vdev_cant_read |= !vps->vps_readable; 923 7754 Jeff vd->vdev_cant_write |= !vps->vps_writeable; 924 7754 Jeff 925 7754 Jeff if 
(vdev_readable(vd) && 926 8241 Jeff (vdev_writeable(vd) || !spa_writeable(spa))) { 927 7754 Jeff zio->io_error = 0; 928 7754 Jeff } else { 929 7754 Jeff ASSERT(zio->io_error != 0); 930 7754 Jeff zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 931 8241 Jeff spa, vd, NULL, 0, 0); 932 7754 Jeff zio->io_error = ENXIO; 933 7754 Jeff } 934 8632 Bill 935 8632 Bill mutex_enter(&vd->vdev_probe_lock); 936 8632 Bill ASSERT(vd->vdev_probe_zio == zio); 937 8632 Bill vd->vdev_probe_zio = NULL; 938 8632 Bill mutex_exit(&vd->vdev_probe_lock); 939 8632 Bill 940 8632 Bill while ((pio = zio_walk_parents(zio)) != NULL) 941 8632 Bill if (!vdev_accessible(vd, pio)) 942 8632 Bill pio->io_error = ENXIO; 943 8632 Bill 944 7754 Jeff kmem_free(vps, sizeof (*vps)); 945 7754 Jeff } 946 7754 Jeff } 947 7754 Jeff 948 7754 Jeff /* 949 7754 Jeff * Determine whether this device is accessible by reading and writing 950 7754 Jeff * to several known locations: the pad regions of each vdev label 951 7754 Jeff * but the first (which we leave alone in case it contains a VTOC). 952 7754 Jeff */ 953 7754 Jeff zio_t * 954 8632 Bill vdev_probe(vdev_t *vd, zio_t *zio) 955 7754 Jeff { 956 7754 Jeff spa_t *spa = vd->vdev_spa; 957 8632 Bill vdev_probe_stats_t *vps = NULL; 958 8632 Bill zio_t *pio; 959 7754 Jeff 960 7754 Jeff ASSERT(vd->vdev_ops->vdev_op_leaf); 961 7754 Jeff 962 8632 Bill /* 963 8632 Bill * Don't probe the probe. 964 8632 Bill */ 965 8632 Bill if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) 966 8632 Bill return (NULL); 967 7754 Jeff 968 8632 Bill /* 969 8632 Bill * To prevent 'probe storms' when a device fails, we create 970 8632 Bill * just one probe i/o at a time. All zios that want to probe 971 8632 Bill * this vdev will become parents of the probe io. 
972 8632 Bill */ 973 8632 Bill mutex_enter(&vd->vdev_probe_lock); 974 8632 Bill 975 8632 Bill if ((pio = vd->vdev_probe_zio) == NULL) { 976 8632 Bill vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); 977 8632 Bill 978 8632 Bill vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | 979 8632 Bill ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | 980 9725 Eric ZIO_FLAG_TRYHARD; 981 8632 Bill 982 8632 Bill if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { 983 8632 Bill /* 984 8632 Bill * vdev_cant_read and vdev_cant_write can only 985 8632 Bill * transition from TRUE to FALSE when we have the 986 8632 Bill * SCL_ZIO lock as writer; otherwise they can only 987 8632 Bill * transition from FALSE to TRUE. This ensures that 988 8632 Bill * any zio looking at these values can assume that 989 8632 Bill * failures persist for the life of the I/O. That's 990 8632 Bill * important because when a device has intermittent 991 8632 Bill * connectivity problems, we want to ensure that 992 8632 Bill * they're ascribed to the device (ENXIO) and not 993 8632 Bill * the zio (EIO). 994 8632 Bill * 995 8632 Bill * Since we hold SCL_ZIO as writer here, clear both 996 8632 Bill * values so the probe can reevaluate from first 997 8632 Bill * principles. 
998 8632 Bill */ 999 8632 Bill vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; 1000 8632 Bill vd->vdev_cant_read = B_FALSE; 1001 8632 Bill vd->vdev_cant_write = B_FALSE; 1002 8632 Bill } 1003 8632 Bill 1004 8632 Bill vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, 1005 8632 Bill vdev_probe_done, vps, 1006 8632 Bill vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); 1007 8632 Bill 1008 8632 Bill if (zio != NULL) { 1009 8632 Bill vd->vdev_probe_wanted = B_TRUE; 1010 8632 Bill spa_async_request(spa, SPA_ASYNC_PROBE); 1011 8632 Bill } 1012 8632 Bill } 1013 8632 Bill 1014 8632 Bill if (zio != NULL) 1015 8632 Bill zio_add_child(zio, pio); 1016 8632 Bill 1017 8632 Bill mutex_exit(&vd->vdev_probe_lock); 1018 8632 Bill 1019 8632 Bill if (vps == NULL) { 1020 8632 Bill ASSERT(zio != NULL); 1021 8632 Bill return (NULL); 1022 8632 Bill } 1023 7754 Jeff 1024 7754 Jeff for (int l = 1; l < VDEV_LABELS; l++) { 1025 8632 Bill zio_nowait(zio_read_phys(pio, vd, 1026 7754 Jeff vdev_label_offset(vd->vdev_psize, l, 1027 9056 Lin offsetof(vdev_label_t, vl_pad2)), 1028 9056 Lin VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), 1029 7754 Jeff ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1030 7754 Jeff ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); 1031 7754 Jeff } 1032 7754 Jeff 1033 8632 Bill if (zio == NULL) 1034 8632 Bill return (pio); 1035 8632 Bill 1036 8632 Bill zio_nowait(pio); 1037 8632 Bill return (NULL); 1038 5329 gw25295 } 1039 5329 gw25295 1040 9846 Eric static void 1041 9846 Eric vdev_open_child(void *arg) 1042 9846 Eric { 1043 9846 Eric vdev_t *vd = arg; 1044 9846 Eric 1045 9846 Eric vd->vdev_open_thread = curthread; 1046 9846 Eric vd->vdev_open_error = vdev_open(vd); 1047 9846 Eric vd->vdev_open_thread = NULL; 1048 9846 Eric } 1049 9846 Eric 1050 10588 Eric boolean_t 1051 10588 Eric vdev_uses_zvols(vdev_t *vd) 1052 10588 Eric { 1053 10588 Eric if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, 1054 10588 Eric strlen(ZVOL_DIR)) == 0) 1055 10588 Eric return (B_TRUE); 1056 10588 Eric for (int 
c = 0; c < vd->vdev_children; c++) 1057 10588 Eric if (vdev_uses_zvols(vd->vdev_child[c])) 1058 10588 Eric return (B_TRUE); 1059 10588 Eric return (B_FALSE); 1060 10588 Eric } 1061 10588 Eric 1062 9846 Eric void 1063 9846 Eric vdev_open_children(vdev_t *vd) 1064 9846 Eric { 1065 9846 Eric taskq_t *tq; 1066 9846 Eric int children = vd->vdev_children; 1067 9846 Eric 1068 10588 Eric /* 1069 10588 Eric * in order to handle pools on top of zvols, do the opens 1070 10588 Eric * in a single thread so that the same thread holds the 1071 10588 Eric * spa_namespace_lock 1072 10588 Eric */ 1073 10588 Eric if (vdev_uses_zvols(vd)) { 1074 10588 Eric for (int c = 0; c < children; c++) 1075 10588 Eric vd->vdev_child[c]->vdev_open_error = 1076 10588 Eric vdev_open(vd->vdev_child[c]); 1077 10588 Eric return; 1078 10588 Eric } 1079 9846 Eric tq = taskq_create("vdev_open", children, minclsyspri, 1080 9846 Eric children, children, TASKQ_PREPOPULATE); 1081 9846 Eric 1082 9846 Eric for (int c = 0; c < children; c++) 1083 9846 Eric VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], 1084 9846 Eric TQ_SLEEP) != NULL); 1085 9846 Eric 1086 9846 Eric taskq_destroy(tq); 1087 9846 Eric } 1088 9846 Eric 1089 789 ahrens /* 1090 789 ahrens * Prepare a virtual device for access. 
 *
 * On failure the vdev's state is updated via vdev_set_state() and an errno
 * is returned: ENXIO if the vdev is faulted or offline, the error from the
 * ops vector's open routine (or from device error injection), EOVERFLOW if
 * the device is too small, EINVAL if the allocatable size has shrunk or the
 * alignment requirement has grown, or the error from the leaf probe.
 * Returns 0 on success.
 */
int
vdev_open(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	int error;
	uint64_t osize = 0;
	uint64_t asize, psize;
	uint64_t ashift = 0;

	/*
	 * Either we are the thread dispatched by vdev_open_children() for
	 * this vdev, or we hold all of the state locks as writer.
	 */
	ASSERT(vd->vdev_open_thread == curthread ||
	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	/* Start from a clean slate before attempting the open. */
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	vd->vdev_cant_read = B_FALSE;
	vd->vdev_cant_write = B_FALSE;
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	/*
	 * If this vdev is not removed, check its fault status.  If it's
	 * faulted, bail out of the open.
	 */
	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (ENXIO);
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (ENXIO);
	}

	/* Type-specific open; reports the device size and alignment shift. */
	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);

	/*
	 * Reset the vdev_reopening flag so that we actually close
	 * the vdev on error.
	 */
	vd->vdev_reopening = B_FALSE;
	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, NULL, ENXIO);

	if (error) {
		/*
		 * Drop the 'removed' flag unless the open routine reported
		 * VDEV_AUX_OPEN_FAILED.
		 */
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    vd->vdev_stat.vs_aux);
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	/*
	 * Recheck the faulted flag now that we have confirmed that
	 * the vdev is accessible.  If we're faulted, bail.
	 */
	if (vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (ENXIO);
	}

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
	}

	/*
	 * For hole or missing vdevs we just return success.
	 */
	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
		return (0);

	/* An interior vdev with any unhealthy child is itself degraded. */
	for (int c = 0; c < vd->vdev_children; c++) {
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}
	}

	/* Round the reported size down to a whole number of labels. */
	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		/* Leaf: the labels come out of the physical size. */
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
	} else {
		/*
		 * Interior: osize is taken as the allocatable size as-is.
		 * The size check is skipped for a vdev with no parent.
		 */
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = 0;
		asize = osize;
	}

	vd->vdev_psize = psize;

	/*
	 * Make sure the allocatable size hasn't shrunk.
	 */
	if (asize < vd->vdev_min_asize) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LABEL);
		return (EINVAL);
	}

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For testing purposes, a higher ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_top->vdev_ashift) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (EINVAL);
		}
	}

	/*
	 * If all children are healthy and the asize has increased,
	 * then we've experienced dynamic LUN growth.  If automatic
	 * expansion is enabled then use the additional space.
	 */
	if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
	    (vd->vdev_expanding || spa->spa_autoexpand))
		vd->vdev_asize = asize;

	vdev_set_min_asize(vd);

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_IO_FAILURE);
		return (error);
	}

	/*
	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
	 * resilver.  But don't do this if we are doing a reopen for a scrub,
	 * since this would just restart the scrub we are already doing.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
	    vdev_resilver_needed(vd, NULL, NULL))
		spa_async_request(spa, SPA_ASYNC_RESILVER);

	return (0);
}

/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents.  This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	nvlist_t *label;
	uint64_t guid, top_guid;
	uint64_t state;

	/* Validate the whole subtree first; any failure is fatal. */
	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c]) != 0)
			return (EBADF);

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation.  Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {

		if ((label = vdev_label_read_config(vd)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (0);
		}

		/* The label must belong to this pool. */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
		    &guid) != 0 || guid != spa_guid(spa)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		/*
		 * If this vdev just became a top-level vdev because its
		 * sibling was detached, it will have adopted the parent's
		 * vdev guid -- but the label may or may not be on disk yet.
		 * Fortunately, either version of the label will have the
		 * same top guid, so if we're a top-level vdev, we can
		 * safely compare to that instead.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &guid) != 0 ||
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
		    &top_guid) != 0 ||
		    (vd->vdev_guid != guid &&
		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		/* Everything needed has been copied out of the label. */
		nvlist_free(label);

		/*
		 * If spa->spa_load_verbatim is true, no need to check the
		 * state of the pool.
		 */
		if (!spa->spa_load_verbatim &&
		    spa_load_state(spa) == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)
			return (EBADF);

		/*
		 * If we were able to open and validate a vdev that was
		 * previously marked permanently unavailable, clear that state
		 * now.
		 */
		if (vd->vdev_not_present)
			vd->vdev_not_present = 0;
	}

	return (0);
}

/*
 * Close a virtual device.
1366 789 ahrens */ 1367 789 ahrens void 1368 789 ahrens vdev_close(vdev_t *vd) 1369 789 ahrens { 1370 8241 Jeff spa_t *spa = vd->vdev_spa; 1371 10850 George vdev_t *pvd = vd->vdev_parent; 1372 8241 Jeff 1373 8241 Jeff ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1374 10850 George 1375 10850 George if (pvd != NULL && pvd->vdev_reopening) 1376 10850 George vd->vdev_reopening = pvd->vdev_reopening; 1377 8241 Jeff 1378 789 ahrens vd->vdev_ops->vdev_op_close(vd); 1379 789 ahrens 1380 4451 eschrock vdev_cache_purge(vd); 1381 1986 eschrock 1382 1986 eschrock /* 1383 9816 George * We record the previous state before we close it, so that if we are 1384 1986 eschrock * doing a reopen(), we don't generate FMA ereports if we notice that 1385 1986 eschrock * it's still faulted. 1386 1986 eschrock */ 1387 1986 eschrock vd->vdev_prevstate = vd->vdev_state; 1388 789 ahrens 1389 789 ahrens if (vd->vdev_offline) 1390 789 ahrens vd->vdev_state = VDEV_STATE_OFFLINE; 1391 789 ahrens else 1392 789 ahrens vd->vdev_state = VDEV_STATE_CLOSED; 1393 1544 eschrock vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1394 789 ahrens } 1395 789 ahrens 1396 10850 George /* 1397 10850 George * Reopen all interior vdevs and any unopened leaves. We don't actually 1398 10850 George * reopen leaf vdevs which had previously been opened as they might deadlock 1399 10850 George * on the spa_config_lock. Instead we only obtain the leaf's physical size. 1400 10850 George * If the leaf has never been opened then open it, as usual. 
1401 10850 George */ 1402 789 ahrens void 1403 1544 eschrock vdev_reopen(vdev_t *vd) 1404 789 ahrens { 1405 1544 eschrock spa_t *spa = vd->vdev_spa; 1406 789 ahrens 1407 7754 Jeff ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1408 789 ahrens 1409 10850 George vd->vdev_reopening = B_TRUE; 1410 789 ahrens vdev_close(vd); 1411 789 ahrens (void) vdev_open(vd); 1412 789 ahrens 1413 789 ahrens /* 1414 3377 eschrock * Call vdev_validate() here to make sure we have the same device. 1415 3377 eschrock * Otherwise, a device with an invalid label could be successfully 1416 3377 eschrock * opened in response to vdev_reopen(). 1417 3377 eschrock */ 1418 6643 eschrock if (vd->vdev_aux) { 1419 6643 eschrock (void) vdev_validate_aux(vd); 1420 7754 Jeff if (vdev_readable(vd) && vdev_writeable(vd) && 1421 9425 Eric vd->vdev_aux == &spa->spa_l2cache && 1422 9816 George !l2arc_vdev_present(vd)) 1423 9816 George l2arc_add_vdev(spa, vd); 1424 6643 eschrock } else { 1425 6643 eschrock (void) vdev_validate(vd); 1426 6643 eschrock } 1427 3377 eschrock 1428 3377 eschrock /* 1429 4451 eschrock * Reassess parent vdev's health. 1430 789 ahrens */ 1431 4451 eschrock vdev_propagate_state(vd); 1432 789 ahrens } 1433 789 ahrens 1434 789 ahrens int 1435 2082 eschrock vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) 1436 789 ahrens { 1437 789 ahrens int error; 1438 789 ahrens 1439 789 ahrens /* 1440 789 ahrens * Normally, partial opens (e.g. of a mirror) are allowed. 1441 789 ahrens * For a create, however, we want to fail the request if 1442 789 ahrens * there are any components we can't open. 1443 789 ahrens */ 1444 789 ahrens error = vdev_open(vd); 1445 789 ahrens 1446 789 ahrens if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { 1447 789 ahrens vdev_close(vd); 1448 789 ahrens return (error ? error : ENXIO); 1449 789 ahrens } 1450 789 ahrens 1451 789 ahrens /* 1452 789 ahrens * Recursively initialize all labels. 
1453 789 ahrens */ 1454 3377 eschrock if ((error = vdev_label_init(vd, txg, isreplacing ? 1455 3377 eschrock VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1456 789 ahrens vdev_close(vd); 1457 789 ahrens return (error); 1458 789 ahrens } 1459 789 ahrens 1460 789 ahrens return (0); 1461 789 ahrens } 1462 789 ahrens 1463 1585 bonwick void 1464 9816 George vdev_metaslab_set_size(vdev_t *vd) 1465 789 ahrens { 1466 789 ahrens /* 1467 789 ahrens * Aim for roughly 200 metaslabs per vdev. 1468 789 ahrens */ 1469 789 ahrens vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); 1470 789 ahrens vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1471 789 ahrens } 1472 789 ahrens 1473 789 ahrens void 1474 1732 bonwick vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) 1475 789 ahrens { 1476 1732 bonwick ASSERT(vd == vd->vdev_top); 1477 10594 George ASSERT(!vd->vdev_ishole); 1478 1732 bonwick ASSERT(ISP2(flags)); 1479 789 ahrens 1480 1732 bonwick if (flags & VDD_METASLAB) 1481 1732 bonwick (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 1482 1732 bonwick 1483 1732 bonwick if (flags & VDD_DTL) 1484 1732 bonwick (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 1485 1732 bonwick 1486 1732 bonwick (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 1487 789 ahrens } 1488 789 ahrens 1489 8241 Jeff /* 1490 8241 Jeff * DTLs. 1491 8241 Jeff * 1492 8241 Jeff * A vdev's DTL (dirty time log) is the set of transaction groups for which 1493 8241 Jeff * the vdev has less than perfect replication. There are three kinds of DTL: 1494 8241 Jeff * 1495 8241 Jeff * DTL_MISSING: txgs for which the vdev has no valid copies of the data 1496 8241 Jeff * 1497 8241 Jeff * DTL_PARTIAL: txgs for which data is available, but not fully replicated 1498 8241 Jeff * 1499 8241 Jeff * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon 1500 8241 Jeff * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of 1501 8241 Jeff * txgs that was scrubbed. 
 *
 *	DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	    persistent errors or just some device being offline.
 *	    Unlike the other three, the DTL_OUTAGE map is not generally
 *	    maintained; it's only computed when needed, typically to
 *	    determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in more than 'maxfaults' children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
1526 8241 Jeff */ 1527 789 ahrens void 1528 8241 Jeff vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1529 789 ahrens { 1530 8241 Jeff space_map_t *sm = &vd->vdev_dtl[t]; 1531 8241 Jeff 1532 8241 Jeff ASSERT(t < DTL_TYPES); 1533 8241 Jeff ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1534 8241 Jeff 1535 789 ahrens mutex_enter(sm->sm_lock); 1536 789 ahrens if (!space_map_contains(sm, txg, size)) 1537 789 ahrens space_map_add(sm, txg, size); 1538 789 ahrens mutex_exit(sm->sm_lock); 1539 789 ahrens } 1540 789 ahrens 1541 8241 Jeff boolean_t 1542 8241 Jeff vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1543 789 ahrens { 1544 8241 Jeff space_map_t *sm = &vd->vdev_dtl[t]; 1545 8241 Jeff boolean_t dirty = B_FALSE; 1546 789 ahrens 1547 8241 Jeff ASSERT(t < DTL_TYPES); 1548 8241 Jeff ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1549 789 ahrens 1550 789 ahrens mutex_enter(sm->sm_lock); 1551 8241 Jeff if (sm->sm_space != 0) 1552 8241 Jeff dirty = space_map_contains(sm, txg, size); 1553 789 ahrens mutex_exit(sm->sm_lock); 1554 789 ahrens 1555 789 ahrens return (dirty); 1556 8241 Jeff } 1557 8241 Jeff 1558 8241 Jeff boolean_t 1559 8241 Jeff vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) 1560 8241 Jeff { 1561 8241 Jeff space_map_t *sm = &vd->vdev_dtl[t]; 1562 8241 Jeff boolean_t empty; 1563 8241 Jeff 1564 8241 Jeff mutex_enter(sm->sm_lock); 1565 8241 Jeff empty = (sm->sm_space == 0); 1566 8241 Jeff mutex_exit(sm->sm_lock); 1567 8241 Jeff 1568 8241 Jeff return (empty); 1569 789 ahrens } 1570 789 ahrens 1571 789 ahrens /* 1572 789 ahrens * Reassess DTLs after a config change or scrub completion. 
 *
 * Children are reassessed first.  A leaf then rebuilds its DTL_PARTIAL and
 * DTL_OUTAGE maps from DTL_MISSING (after folding in scrub results when
 * scrub_txg is non-zero); an interior vdev regenerates its maps from those
 * of its children.  The root vdev, holes and aux vdevs have no DTLs.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dtl_reassess(vd->vdev_child[c], txg,
		    scrub_txg, scrub_done);

	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
			/* XXX should check scrub_done? */
			/*
			 * We completed a scrub up to scrub_txg.  If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl.  Otherwise, leave the
			 * dtl as-is if there was an error.
			 *
			 * There's a little trick here: to excise the beginning
			 * of the DTL_MISSING map, we put it into a reference
			 * tree and then add a segment with refcnt -1 that
			 * covers the range [0, scrub_txg).  This means
			 * that each txg in that range has refcnt -1 or 0.
			 * We then add DTL_SCRUB with a refcnt of 2, so that
			 * entries in the range [0, scrub_txg) will have a
			 * positive refcnt -- either 1 or 2.  We then convert
			 * the reference tree into the new DTL_MISSING map.
			 */
			space_map_ref_create(&reftree);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_SCRUB], 2);
			space_map_ref_generate_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_destroy(&reftree);
		}
		/* For a leaf, DTL_PARTIAL is simply a copy of DTL_MISSING. */
		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		/*
		 * Recompute DTL_OUTAGE: everything if the device can't be
		 * read at all, otherwise the same txgs as DTL_MISSING.
		 */
		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
		if (!vdev_readable(vd))
			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
		else
			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
		mutex_exit(&vd->vdev_dtl_lock);

		/* Queue this leaf's DTL on its top-level vdev for 'txg'. */
		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	/*
	 * Interior vdev: each map is the set of txgs present in at least
	 * 'minref' of the children's corresponding maps.
	 */
	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		/* account for child's outage in parent's missing map */
		int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
		if (t == DTL_SCRUB)
			continue;			/* leaf vdevs only */
		if (t == DTL_PARTIAL)
			minref = 1;			/* i.e. non-zero */
		else if (vd->vdev_nparity != 0)
			minref = vd->vdev_nparity + 1;	/* RAID-Z */
		else
			minref = vd->vdev_children;	/* any kind of mirror */
		space_map_ref_create(&reftree);
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			mutex_enter(&cvd->vdev_dtl_lock);
			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
			mutex_exit(&cvd->vdev_dtl_lock);
		}
		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
		space_map_ref_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
}

/*
 * Load a leaf vdev's DTL_MISSING map from the space map object recorded in
 * vdev_dtl_smo.  Returns 0 if there is no such object or the load succeeds,
 * otherwise the error from dmu_bonus_hold() or space_map_load().
 */
static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl_smo;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	/* No DTL object has been recorded for this vdev. */
	if (smo->smo_object == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
		return (error);

	/* Refresh the in-core smo from the object's bonus buffer. */
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(db->db_data, smo, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
	    NULL, SM_ALLOC, smo, mos);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}

void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
1699 789 ahrens spa_t *spa = vd->vdev_spa; 1700 8241 Jeff space_map_obj_t *smo = &vd->vdev_dtl_smo; 1701 8241 Jeff space_map_t *sm = &vd->vdev_dtl[DTL_MISSING]; 1702 1732 bonwick objset_t *mos = spa->spa_meta_objset; 1703 789 ahrens space_map_t smsync; 1704 789 ahrens kmutex_t smlock; 1705 789 ahrens dmu_buf_t *db; 1706 789 ahrens dmu_tx_t *tx; 1707 10594 George 1708 10594 George ASSERT(!vd->vdev_ishole); 1709 789 ahrens 1710 789 ahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1711 789 ahrens 1712 789 ahrens if (vd->vdev_detached) { 1713 789 ahrens if (smo->smo_object != 0) { 1714 1732 bonwick int err = dmu_object_free(mos, smo->smo_object, tx); 1715 789 ahrens ASSERT3U(err, ==, 0); 1716 789 ahrens smo->smo_object = 0; 1717 789 ahrens } 1718 789 ahrens dmu_tx_commit(tx); 1719 789 ahrens return; 1720 789 ahrens } 1721 789 ahrens 1722 789 ahrens if (smo->smo_object == 0) { 1723 789 ahrens ASSERT(smo->smo_objsize == 0); 1724 789 ahrens ASSERT(smo->smo_alloc == 0); 1725 1732 bonwick smo->smo_object = dmu_object_alloc(mos, 1726 789 ahrens DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, 1727 789 ahrens DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); 1728 789 ahrens ASSERT(smo->smo_object != 0); 1729 789 ahrens vdev_config_dirty(vd->vdev_top); 1730 789 ahrens } 1731 789 ahrens 1732 789 ahrens mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); 1733 789 ahrens 1734 789 ahrens space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, 1735 789 ahrens &smlock); 1736 789 ahrens 1737 789 ahrens mutex_enter(&smlock); 1738 789 ahrens 1739 789 ahrens mutex_enter(&vd->vdev_dtl_lock); 1740 1732 bonwick space_map_walk(sm, space_map_add, &smsync); 1741 789 ahrens mutex_exit(&vd->vdev_dtl_lock); 1742 789 ahrens 1743 1732 bonwick space_map_truncate(smo, mos, tx); 1744 1732 bonwick space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); 1745 789 ahrens 1746 789 ahrens space_map_destroy(&smsync); 1747 789 ahrens 1748 789 ahrens mutex_exit(&smlock); 1749 789 ahrens 
mutex_destroy(&smlock); 1750 789 ahrens 1751 1732 bonwick VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); 1752 789 ahrens dmu_buf_will_dirty(db, tx); 1753 4944 maybee ASSERT3U(db->db_size, >=, sizeof (*smo)); 1754 4944 maybee bcopy(smo, db->db_data, sizeof (*smo)); 1755 1544 eschrock dmu_buf_rele(db, FTAG); 1756 789 ahrens 1757 789 ahrens dmu_tx_commit(tx); 1758 7046 ahrens } 1759 7046 ahrens 1760 7046 ahrens /* 1761 8241 Jeff * Determine whether the specified vdev can be offlined/detached/removed 1762 8241 Jeff * without losing data. 1763 8241 Jeff */ 1764 8241 Jeff boolean_t 1765 8241 Jeff vdev_dtl_required(vdev_t *vd) 1766 8241 Jeff { 1767 8241 Jeff spa_t *spa = vd->vdev_spa; 1768 8241 Jeff vdev_t *tvd = vd->vdev_top; 1769 8241 Jeff uint8_t cant_read = vd->vdev_cant_read; 1770 8241 Jeff boolean_t required; 1771 8241 Jeff 1772 8241 Jeff ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1773 8241 Jeff 1774 8241 Jeff if (vd == spa->spa_root_vdev || vd == tvd) 1775 8241 Jeff return (B_TRUE); 1776 8241 Jeff 1777 8241 Jeff /* 1778 8241 Jeff * Temporarily mark the device as unreadable, and then determine 1779 8241 Jeff * whether this results in any DTL outages in the top-level vdev. 1780 8241 Jeff * If not, we can safely offline/detach/remove the device. 1781 8241 Jeff */ 1782 8241 Jeff vd->vdev_cant_read = B_TRUE; 1783 8241 Jeff vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 1784 8241 Jeff required = !vdev_dtl_empty(tvd, DTL_OUTAGE); 1785 8241 Jeff vd->vdev_cant_read = cant_read; 1786 8241 Jeff vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 1787 8241 Jeff 1788 8241 Jeff return (required); 1789 8241 Jeff } 1790 8241 Jeff 1791 8241 Jeff /* 1792 7046 ahrens * Determine if resilver is needed, and if so the txg range. 
1793 7046 ahrens */ 1794 7046 ahrens boolean_t 1795 7046 ahrens vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) 1796 7046 ahrens { 1797 7046 ahrens boolean_t needed = B_FALSE; 1798 7046 ahrens uint64_t thismin = UINT64_MAX; 1799 7046 ahrens uint64_t thismax = 0; 1800 7046 ahrens 1801 7046 ahrens if (vd->vdev_children == 0) { 1802 7046 ahrens mutex_enter(&vd->vdev_dtl_lock); 1803 8241 Jeff if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 && 1804 8241 Jeff vdev_writeable(vd)) { 1805 7046 ahrens space_seg_t *ss; 1806 7046 ahrens 1807 8241 Jeff ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); 1808 7046 ahrens thismin = ss->ss_start - 1; 1809 8241 Jeff ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); 1810 7046 ahrens thismax = ss->ss_end; 1811 7046 ahrens needed = B_TRUE; 1812 7046 ahrens } 1813 7046 ahrens mutex_exit(&vd->vdev_dtl_lock); 1814 7046 ahrens } else { 1815 8241 Jeff for (int c = 0; c < vd->vdev_children; c++) { 1816 7046 ahrens vdev_t *cvd = vd->vdev_child[c]; 1817 7046 ahrens uint64_t cmin, cmax; 1818 7046 ahrens 1819 7046 ahrens if (vdev_resilver_needed(cvd, &cmin, &cmax)) { 1820 7046 ahrens thismin = MIN(thismin, cmin); 1821 7046 ahrens thismax = MAX(thismax, cmax); 1822 7046 ahrens needed = B_TRUE; 1823 7046 ahrens } 1824 7046 ahrens } 1825 7046 ahrens } 1826 7046 ahrens 1827 7046 ahrens if (needed && minp) { 1828 7046 ahrens *minp = thismin; 1829 7046 ahrens *maxp = thismax; 1830 7046 ahrens } 1831 7046 ahrens return (needed); 1832 789 ahrens } 1833 789 ahrens 1834 1986 eschrock void 1835 1544 eschrock vdev_load(vdev_t *vd) 1836 789 ahrens { 1837 789 ahrens /* 1838 789 ahrens * Recursively load all children. 1839 789 ahrens */ 1840 8241 Jeff for (int c = 0; c < vd->vdev_children; c++) 1841 1986 eschrock vdev_load(vd->vdev_child[c]); 1842 789 ahrens 1843 789 ahrens /* 1844 1585 bonwick * If this is a top-level vdev, initialize its metaslabs. 
1845 789 ahrens */ 1846 10594 George if (vd == vd->vdev_top && !vd->vdev_ishole && 1847 1986 eschrock (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || 1848 1986 eschrock vdev_metaslab_init(vd, 0) != 0)) 1849 1986 eschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1850 1986 eschrock VDEV_AUX_CORRUPT_DATA); 1851 789 ahrens 1852 789 ahrens /* 1853 789 ahrens * If this is a leaf vdev, load its DTL. 1854 789 ahrens */ 1855 1986 eschrock if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) 1856 1986 eschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1857 1986 eschrock VDEV_AUX_CORRUPT_DATA); 1858 2082 eschrock } 1859 2082 eschrock 1860 2082 eschrock /* 1861 5450 brendan * The special vdev case is used for hot spares and l2cache devices. Its 1862 5450 brendan * sole purpose it to set the vdev state for the associated vdev. To do this, 1863 5450 brendan * we make sure that we can open the underlying device, then try to read the 1864 5450 brendan * label, and make sure that the label is sane and that it hasn't been 1865 5450 brendan * repurposed to another pool. 
1866 2082 eschrock */ 1867 2082 eschrock int 1868 5450 brendan vdev_validate_aux(vdev_t *vd) 1869 2082 eschrock { 1870 2082 eschrock nvlist_t *label; 1871 2082 eschrock uint64_t guid, version; 1872 2082 eschrock uint64_t state; 1873 2082 eschrock 1874 7754 Jeff if (!vdev_readable(vd)) 1875 6643 eschrock return (0); 1876 6643 eschrock 1877 2082 eschrock if ((label = vdev_label_read_config(vd)) == NULL) { 1878 2082 eschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1879 2082 eschrock VDEV_AUX_CORRUPT_DATA); 1880 2082 eschrock return (-1); 1881 2082 eschrock } 1882 2082 eschrock 1883 2082 eschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 1884 4577 ahrens version > SPA_VERSION || 1885 2082 eschrock nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 1886 2082 eschrock guid != vd->vdev_guid || 1887 2082 eschrock nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 1888 2082 eschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1889 2082 eschrock VDEV_AUX_CORRUPT_DATA); 1890 2082 eschrock nvlist_free(label); 1891 2082 eschrock return (-1); 1892 2082 eschrock } 1893 2082 eschrock 1894 2082 eschrock /* 1895 2082 eschrock * We don't actually check the pool state here. If it's in fact in 1896 2082 eschrock * use by another pool, we update this fact on the fly when requested. 
1897 2082 eschrock */ 1898 2082 eschrock nvlist_free(label); 1899 2082 eschrock return (0); 1900 789 ahrens } 1901 789 ahrens 1902 789 ahrens void 1903 10594 George vdev_remove(vdev_t *vd, uint64_t txg) 1904 10594 George { 1905 10594 George spa_t *spa = vd->vdev_spa; 1906 10594 George objset_t *mos = spa->spa_meta_objset; 1907 10594 George dmu_tx_t *tx; 1908 10594 George 1909 10594 George tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1910 10594 George 1911 10594 George if (vd->vdev_dtl_smo.smo_object) { 1912 10594 George ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0); 1913 10594 George (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx); 1914 10594 George vd->vdev_dtl_smo.smo_object = 0; 1915 10594 George } 1916 10594 George 1917 10594 George if (vd->vdev_ms != NULL) { 1918 10594 George for (int m = 0; m < vd->vdev_ms_count; m++) { 1919 10594 George metaslab_t *msp = vd->vdev_ms[m]; 1920 10594 George 1921 10594 George if (msp == NULL || msp->ms_smo.smo_object == 0) 1922 10594 George continue; 1923 10594 George 1924 10594 George ASSERT3U(msp->ms_smo.smo_alloc, ==, 0); 1925 10594 George (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx); 1926 10594 George msp->ms_smo.smo_object = 0; 1927 10594 George } 1928 10594 George } 1929 10594 George 1930 10594 George if (vd->vdev_ms_array) { 1931 10594 George (void) dmu_object_free(mos, vd->vdev_ms_array, tx); 1932 10594 George vd->vdev_ms_array = 0; 1933 10594 George vd->vdev_ms_shift = 0; 1934 10594 George } 1935 10594 George dmu_tx_commit(tx); 1936 10594 George } 1937 10594 George 1938 10594 George void 1939 789 ahrens vdev_sync_done(vdev_t *vd, uint64_t txg) 1940 789 ahrens { 1941 789 ahrens metaslab_t *msp; 1942 11146 George boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 1943 10594 George 1944 10594 George ASSERT(!vd->vdev_ishole); 1945 789 ahrens 1946 789 ahrens while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 1947 789 ahrens metaslab_sync_done(msp, txg); 1948 
11146 George 1949 11146 George if (reassess) 1950 11146 George metaslab_sync_reassess(vd->vdev_mg); 1951 789 ahrens } 1952 789 ahrens 1953 789 ahrens void 1954 789 ahrens vdev_sync(vdev_t *vd, uint64_t txg) 1955 789 ahrens { 1956 789 ahrens spa_t *spa = vd->vdev_spa; 1957 789 ahrens vdev_t *lvd; 1958 789 ahrens metaslab_t *msp; 1959 1732 bonwick dmu_tx_t *tx; 1960 789 ahrens 1961 10594 George ASSERT(!vd->vdev_ishole); 1962 10594 George 1963 1732 bonwick if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { 1964 1732 bonwick ASSERT(vd == vd->vdev_top); 1965 1732 bonwick tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1966 1732 bonwick vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 1967 1732 bonwick DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 1968 1732 bonwick ASSERT(vd->vdev_ms_array != 0); 1969 1732 bonwick vdev_config_dirty(vd); 1970 1732 bonwick dmu_tx_commit(tx); 1971 1732 bonwick } 1972 10594 George 1973 10594 George if (vd->vdev_removing) 1974 10594 George vdev_remove(vd, txg); 1975 789 ahrens 1976 1732 bonwick while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 1977 789 ahrens metaslab_sync(msp, txg); 1978 1732 bonwick (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 1979 1732 bonwick } 1980 789 ahrens 1981 789 ahrens while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 1982 789 ahrens vdev_dtl_sync(lvd, txg); 1983 789 ahrens 1984 789 ahrens (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 1985 789 ahrens } 1986 789 ahrens 1987 789 ahrens uint64_t 1988 789 ahrens vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 1989 789 ahrens { 1990 789 ahrens return (vd->vdev_ops->vdev_op_asize(vd, psize)); 1991 789 ahrens } 1992 789 ahrens 1993 4451 eschrock /* 1994 4451 eschrock * Mark the given vdev faulted. A faulted vdev behaves as if the device could 1995 4451 eschrock * not be opened, and no I/O is attempted. 
1996 4451 eschrock */ 1997 789 ahrens int 1998 10817 Eric vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 1999 4451 eschrock { 2000 6643 eschrock vdev_t *vd; 2001 4451 eschrock 2002 10685 George spa_vdev_state_enter(spa, SCL_NONE); 2003 4451 eschrock 2004 6643 eschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2005 7754 Jeff return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2006 7754 Jeff 2007 4451 eschrock if (!vd->vdev_ops->vdev_op_leaf) 2008 7754 Jeff return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2009 4451 eschrock 2010 4451 eschrock /* 2011 10817 Eric * We don't directly use the aux state here, but if we do a 2012 10817 Eric * vdev_reopen(), we need this value to be present to remember why we 2013 10817 Eric * were faulted. 2014 10817 Eric */ 2015 10817 Eric vd->vdev_label_aux = aux; 2016 10817 Eric 2017 10817 Eric /* 2018 4451 eschrock * Faulted state takes precedence over degraded. 2019 4451 eschrock */ 2020 4451 eschrock vd->vdev_faulted = 1ULL; 2021 4451 eschrock vd->vdev_degraded = 0ULL; 2022 10817 Eric vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 2023 4451 eschrock 2024 4451 eschrock /* 2025 8123 David * If marking the vdev as faulted cause the top-level vdev to become 2026 4451 eschrock * unavailable, then back off and simply mark the vdev as degraded 2027 4451 eschrock * instead. 2028 4451 eschrock */ 2029 10685 George if (vdev_is_dead(vd->vdev_top) && !vd->vdev_islog && 2030 10685 George vd->vdev_aux == NULL) { 2031 4451 eschrock vd->vdev_degraded = 1ULL; 2032 4451 eschrock vd->vdev_faulted = 0ULL; 2033 4451 eschrock 2034 4451 eschrock /* 2035 4451 eschrock * If we reopen the device and it's not dead, only then do we 2036 4451 eschrock * mark it degraded. 
2037 4451 eschrock */ 2038 4451 eschrock vdev_reopen(vd); 2039 4451 eschrock 2040 10817 Eric if (vdev_readable(vd)) 2041 10817 Eric vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 2042 4451 eschrock } 2043 4451 eschrock 2044 7754 Jeff return (spa_vdev_state_exit(spa, vd, 0)); 2045 4451 eschrock } 2046 4451 eschrock 2047 4451 eschrock /* 2048 4451 eschrock * Mark the given vdev degraded. A degraded vdev is purely an indication to the 2049 4451 eschrock * user that something is wrong. The vdev continues to operate as normal as far 2050 4451 eschrock * as I/O is concerned. 2051 4451 eschrock */ 2052 4451 eschrock int 2053 10817 Eric vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2054 4451 eschrock { 2055 6643 eschrock vdev_t *vd; 2056 5329 gw25295 2057 10685 George spa_vdev_state_enter(spa, SCL_NONE); 2058 4451 eschrock 2059 6643 eschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2060 7754 Jeff return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2061 7754 Jeff 2062 4451 eschrock if (!vd->vdev_ops->vdev_op_leaf) 2063 7754 Jeff return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2064 4451 eschrock 2065 4451 eschrock /* 2066 4451 eschrock * If the vdev is already faulted, then don't do anything. 2067 4451 eschrock */ 2068 7754 Jeff if (vd->vdev_faulted || vd->vdev_degraded) 2069 7754 Jeff return (spa_vdev_state_exit(spa, NULL, 0)); 2070 4451 eschrock 2071 4451 eschrock vd->vdev_degraded = 1ULL; 2072 4451 eschrock if (!vdev_is_dead(vd)) 2073 4451 eschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 2074 10817 Eric aux); 2075 4451 eschrock 2076 7754 Jeff return (spa_vdev_state_exit(spa, vd, 0)); 2077 4451 eschrock } 2078 4451 eschrock 2079 4451 eschrock /* 2080 4451 eschrock * Online the given vdev. If 'unspare' is set, it implies two things. First, 2081 4451 eschrock * any attached spare device should be detached when the device finishes 2082 4451 eschrock * resilvering. 
Second, the online should be treated like a 'test' online case, 2083 4451 eschrock * so no FMA events are generated if the device fails to open. 2084 4451 eschrock */ 2085 4451 eschrock int 2086 7754 Jeff vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) 2087 789 ahrens { 2088 9816 George vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; 2089 5329 gw25295 2090 10685 George spa_vdev_state_enter(spa, SCL_NONE); 2091 789 ahrens 2092 6643 eschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2093 7754 Jeff return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2094 1585 bonwick 2095 1585 bonwick if (!vd->vdev_ops->vdev_op_leaf) 2096 7754 Jeff return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2097 789 ahrens 2098 9816 George tvd = vd->vdev_top; 2099 789 ahrens vd->vdev_offline = B_FALSE; 2100 1485 lling vd->vdev_tmpoffline = B_FALSE; 2101 7754 Jeff vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 2102 7754 Jeff vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 2103 9816 George 2104 9816 George /* XXX - L2ARC 1.0 does not support expansion */ 2105 9816 George if (!vd->vdev_aux) { 2106 9816 George for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2107 9816 George pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); 2108 9816 George } 2109 9816 George 2110 9816 George vdev_reopen(tvd); 2111 4451 eschrock vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 2112 9816 George 2113 9816 George if (!vd->vdev_aux) { 2114 9816 George for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2115 9816 George pvd->vdev_expanding = B_FALSE; 2116 9816 George } 2117 4451 eschrock 2118 4451 eschrock if (newstate) 2119 4451 eschrock *newstate = vd->vdev_state; 2120 4451 eschrock if ((flags & ZFS_ONLINE_UNSPARE) && 2121 4451 eschrock !vdev_is_dead(vd) && vd->vdev_parent && 2122 4451 eschrock vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2123 4451 eschrock vd->vdev_parent->vdev_child[0] == vd) 2124 4451 eschrock vd->vdev_unspare = B_TRUE; 
2125 1485 lling 2126 9816 George if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { 2127 9816 George 2128 9816 George /* XXX - L2ARC 1.0 does not support expansion */ 2129 9816 George if (vd->vdev_aux) 2130 9816 George return (spa_vdev_state_exit(spa, vd, ENOTSUP)); 2131 9816 George spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2132 9816 George } 2133 8241 Jeff return (spa_vdev_state_exit(spa, vd, 0)); 2134 789 ahrens } 2135 789 ahrens 2136 789 ahrens int 2137 10685 George vdev_offline_log(spa_t *spa) 2138 10685 George { 2139 10685 George int error = 0; 2140 10685 George 2141 10685 George if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 2142 10685 George NULL, DS_FIND_CHILDREN)) == 0) { 2143 10685 George 2144 10685 George /* 2145 10685 George * We successfully offlined the log device, sync out the 2146 10685 George * current txg so that the "stubby" block can be removed 2147 10685 George * by zil_sync(). 2148 10685 George */ 2149 10685 George txg_wait_synced(spa->spa_dsl_pool, 0); 2150 10685 George } 2151 10685 George return (error); 2152 10685 George } 2153 10685 George 2154 10974 Jeff static int 2155 10974 Jeff vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) 2156 789 ahrens { 2157 9701 George vdev_t *vd, *tvd; 2158 10685 George int error = 0; 2159 10685 George uint64_t generation; 2160 10685 George metaslab_group_t *mg; 2161 5329 gw25295 2162 10685 George top: 2163 10685 George spa_vdev_state_enter(spa, SCL_ALLOC); 2164 789 ahrens 2165 6643 eschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2166 7754 Jeff return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2167 1585 bonwick 2168 1585 bonwick if (!vd->vdev_ops->vdev_op_leaf) 2169 7754 Jeff return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2170 1485 lling 2171 9701 George tvd = vd->vdev_top; 2172 10685 George mg = tvd->vdev_mg; 2173 10685 George generation = spa->spa_config_generation + 1; 2174 9701 George 2175 1732 bonwick /* 2176 1732 bonwick * If the device 
isn't already offline, try to offline it. 2177 1732 bonwick */ 2178 1732 bonwick if (!vd->vdev_offline) { 2179 1732 bonwick /* 2180 8241 Jeff * If this device has the only valid copy of some data, 2181 9701 George * don't allow it to be offlined. Log devices are always 2182 9701 George * expendable. 2183 1732 bonwick */ 2184 9701 George if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2185 9701 George vdev_dtl_required(vd)) 2186 7754 Jeff return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2187 10685 George 2188 10685 George /* 2189 10922 Jeff * If the top-level is a slog and it has had allocations 2190 10922 Jeff * then proceed. We check that the vdev's metaslab group 2191 10922 Jeff * is not NULL since it's possible that we may have just 2192 10922 Jeff * added this vdev but not yet initialized its metaslabs. 2193 10685 George */ 2194 10685 George if (tvd->vdev_islog && mg != NULL) { 2195 10685 George /* 2196 10685 George * Prevent any future allocations. 2197 10685 George */ 2198 10974 Jeff metaslab_group_passivate(mg); 2199 10685 George (void) spa_vdev_state_exit(spa, vd, 0); 2200 10685 George 2201 10685 George error = vdev_offline_log(spa); 2202 10685 George 2203 10685 George spa_vdev_state_enter(spa, SCL_ALLOC); 2204 10685 George 2205 10685 George /* 2206 10685 George * Check to see if the config has changed. 2207 10685 George */ 2208 10685 George if (error || generation != spa->spa_config_generation) { 2209 10974 Jeff metaslab_group_activate(mg); 2210 10685 George if (error) 2211 10685 George return (spa_vdev_state_exit(spa, 2212 10685 George vd, error)); 2213 10685 George (void) spa_vdev_state_exit(spa, vd, 0); 2214 10685 George goto top; 2215 10685 George } 2216 10685 George ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0); 2217 10685 George } 2218 789 ahrens 2219 1732 bonwick /* 2220 1732 bonwick * Offline this device and reopen its top-level vdev. 2221 9701 George * If the top-level vdev is a log device then just offline 2222 9701 George * it. 
Otherwise, if this action results in the top-level 2223 9701 George * vdev becoming unusable, undo it and fail the request. 2224 1732 bonwick */ 2225 1732 bonwick vd->vdev_offline = B_TRUE; 2226 9701 George vdev_reopen(tvd); 2227 9701 George 2228 9701 George if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2229 9701 George vdev_is_dead(tvd)) { 2230 1732 bonwick vd->vdev_offline = B_FALSE; 2231 9701 George vdev_reopen(tvd); 2232 7754 Jeff return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2233 1732 bonwick } 2234 10685 George 2235 10685 George /* 2236 10685 George * Add the device back into the metaslab rotor so that 2237 10685 George * once we online the device it's open for business. 2238 10685 George */ 2239 10685 George if (tvd->vdev_islog && mg != NULL) 2240 10974 Jeff metaslab_group_activate(mg); 2241 789 ahrens } 2242 789 ahrens 2243 7754 Jeff vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 2244 1732 bonwick 2245 10685 George return (spa_vdev_state_exit(spa, vd, 0)); 2246 10974 Jeff } 2247 10974 Jeff 2248 10974 Jeff int 2249 10974 Jeff vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 2250 10974 Jeff { 2251 10974 Jeff int error; 2252 10974 Jeff 2253 10974 Jeff mutex_enter(&spa->spa_vdev_top_lock); 2254 10974 Jeff error = vdev_offline_locked(spa, guid, flags); 2255 10974 Jeff mutex_exit(&spa->spa_vdev_top_lock); 2256 10974 Jeff 2257 10974 Jeff return (error); 2258 789 ahrens } 2259 789 ahrens 2260 1544 eschrock /* 2261 1544 eschrock * Clear the error counts associated with this vdev. Unlike vdev_online() and 2262 1544 eschrock * vdev_offline(), we assume the spa config is locked. We also clear all 2263 1544 eschrock * children. If 'vd' is NULL, then the user wants to clear all vdevs. 
2264 1544 eschrock */ 2265 1544 eschrock void 2266 7754 Jeff vdev_clear(spa_t *spa, vdev_t *vd) 2267 789 ahrens { 2268 7754 Jeff vdev_t *rvd = spa->spa_root_vdev; 2269 7754 Jeff 2270 7754 Jeff ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2271 789 ahrens 2272 1544 eschrock if (vd == NULL) 2273 7754 Jeff vd = rvd; 2274 789 ahrens 2275 1544 eschrock vd->vdev_stat.vs_read_errors = 0; 2276 1544 eschrock vd->vdev_stat.vs_write_errors = 0; 2277 1544 eschrock vd->vdev_stat.vs_checksum_errors = 0; 2278 789 ahrens 2279 7754 Jeff for (int c = 0; c < vd->vdev_children; c++) 2280 7754 Jeff vdev_clear(spa, vd->vdev_child[c]); 2281 4451 eschrock 2282 4451 eschrock /* 2283 6959 ek110237 * If we're in the FAULTED state or have experienced failed I/O, then 2284 6959 ek110237 * clear the persistent state and attempt to reopen the device. We 2285 6959 ek110237 * also mark the vdev config dirty, so that the new faulted state is 2286 6959 ek110237 * written out to disk. 2287 4451 eschrock */ 2288 7754 Jeff if (vd->vdev_faulted || vd->vdev_degraded || 2289 7754 Jeff !vdev_readable(vd) || !vdev_writeable(vd)) { 2290 6959 ek110237 2291 10830 Eric /* 2292 10830 Eric * When reopening in reponse to a clear event, it may be due to 2293 10830 Eric * a fmadm repair request. In this case, if the device is 2294 10830 Eric * still broken, we want to still post the ereport again. 
2295 10830 Eric */ 2296 10830 Eric vd->vdev_forcefault = B_TRUE; 2297 10830 Eric 2298 4451 eschrock vd->vdev_faulted = vd->vdev_degraded = 0; 2299 7754 Jeff vd->vdev_cant_read = B_FALSE; 2300 7754 Jeff vd->vdev_cant_write = B_FALSE; 2301 7754 Jeff 2302 4451 eschrock vdev_reopen(vd); 2303 10830 Eric 2304 10830 Eric vd->vdev_forcefault = B_FALSE; 2305 4451 eschrock 2306 7754 Jeff if (vd != rvd) 2307 7754 Jeff vdev_state_dirty(vd->vdev_top); 2308 7754 Jeff 2309 7754 Jeff if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) 2310 4808 ek110237 spa_async_request(spa, SPA_ASYNC_RESILVER); 2311 4451 eschrock 2312 4451 eschrock spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); 2313 4451 eschrock } 2314 10830 Eric 2315 10830 Eric /* 2316 10830 Eric * When clearing a FMA-diagnosed fault, we always want to 2317 10830 Eric * unspare the device, as we assume that the original spare was 2318 10830 Eric * done in response to the FMA fault. 2319 10830 Eric */ 2320 10830 Eric if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && 2321 10830 Eric vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2322 10830 Eric vd->vdev_parent->vdev_child[0] == vd) 2323 10830 Eric vd->vdev_unspare = B_TRUE; 2324 5329 gw25295 } 2325 5329 gw25295 2326 7754 Jeff boolean_t 2327 7754 Jeff vdev_is_dead(vdev_t *vd) 2328 7754 Jeff { 2329 10594 George /* 2330 10594 George * Holes and missing devices are always considered "dead". 2331 10594 George * This simplifies the code since we don't have to check for 2332 10594 George * these types of devices in the various code paths. 2333 10594 George * Instead we rely on the fact that we skip over dead devices 2334 10594 George * before issuing I/O to them. 
2335 10594 George */ 2336 10594 George return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || 2337 10594 George vd->vdev_ops == &vdev_missing_ops); 2338 7754 Jeff } 2339 7754 Jeff 2340 7754 Jeff boolean_t 2341 5329 gw25295 vdev_readable(vdev_t *vd) 2342 5329 gw25295 { 2343 7754 Jeff return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 2344 5329 gw25295 } 2345 5329 gw25295 2346 7754 Jeff boolean_t 2347 5329 gw25295 vdev_writeable(vdev_t *vd) 2348 5329 gw25295 { 2349 7754 Jeff return (!vdev_is_dead(vd) && !vd->vdev_cant_write); 2350 789 ahrens } 2351 789 ahrens 2352 7754 Jeff boolean_t 2353 7980 George vdev_allocatable(vdev_t *vd) 2354 7980 George { 2355 8241 Jeff uint64_t state = vd->vdev_state; 2356 8241 Jeff 2357 7980 George /* 2358 8241 Jeff * We currently allow allocations from vdevs which may be in the 2359 7980 George * process of reopening (i.e. VDEV_STATE_CLOSED). If the device 2360 7980 George * fails to reopen then we'll catch it later when we're holding 2361 8241 Jeff * the proper locks. Note that we have to get the vdev state 2362 8241 Jeff * in a local variable because although it changes atomically, 2363 8241 Jeff * we're asking two separate questions about it. 
2364 7980 George */ 2365 8241 Jeff return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 2366 10594 George !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing); 2367 7980 George } 2368 7980 George 2369 7980 George boolean_t 2370 7754 Jeff vdev_accessible(vdev_t *vd, zio_t *zio) 2371 789 ahrens { 2372 7754 Jeff ASSERT(zio->io_vd == vd); 2373 789 ahrens 2374 7754 Jeff if (vdev_is_dead(vd) || vd->vdev_remove_wanted) 2375 7754 Jeff return (B_FALSE); 2376 789 ahrens 2377 7754 Jeff if (zio->io_type == ZIO_TYPE_READ) 2378 7754 Jeff return (!vd->vdev_cant_read); 2379 789 ahrens 2380 7754 Jeff if (zio->io_type == ZIO_TYPE_WRITE) 2381 7754 Jeff return (!vd->vdev_cant_write); 2382 789 ahrens 2383 7754 Jeff return (B_TRUE); 2384 789 ahrens } 2385 789 ahrens 2386 789 ahrens /* 2387 789 ahrens * Get statistics for the given vdev. 2388 789 ahrens */ 2389 789 ahrens void 2390 789 ahrens vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 2391 789 ahrens { 2392 789 ahrens vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 2393 789 ahrens 2394 789 ahrens mutex_enter(&vd->vdev_stat_lock); 2395 789 ahrens bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 2396 7046 ahrens vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; 2397 789 ahrens vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 2398 789 ahrens vs->vs_state = vd->vdev_state; 2399 9816 George vs->vs_rsize = vdev_get_min_asize(vd); 2400 9816 George if (vd->vdev_ops->vdev_op_leaf) 2401 9816 George vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 2402 789 ahrens mutex_exit(&vd->vdev_stat_lock); 2403 789 ahrens 2404 789 ahrens /* 2405 789 ahrens * If we're getting stats on the root vdev, aggregate the I/O counts 2406 789 ahrens * over all top-level vdevs (i.e. the direct children of the root). 
2407 789 ahrens */ 2408 789 ahrens if (vd == rvd) { 2409 7754 Jeff for (int c = 0; c < rvd->vdev_children; c++) { 2410 789 ahrens vdev_t *cvd = rvd->vdev_child[c]; 2411 789 ahrens vdev_stat_t *cvs = &cvd->vdev_stat; 2412 789 ahrens 2413 789 ahrens mutex_enter(&vd->vdev_stat_lock); 2414 7754 Jeff for (int t = 0; t < ZIO_TYPES; t++) { 2415 789 ahrens vs->vs_ops[t] += cvs->vs_ops[t]; 2416 789 ahrens vs->vs_bytes[t] += cvs->vs_bytes[t]; 2417 789 ahrens } 2418 789 ahrens vs->vs_scrub_examined += cvs->vs_scrub_examined; 2419 789 ahrens mutex_exit(&vd->vdev_stat_lock); 2420 789 ahrens } 2421 789 ahrens } 2422 789 ahrens } 2423 789 ahrens 2424 789 ahrens void 2425 5450 brendan vdev_clear_stats(vdev_t *vd) 2426 5450 brendan { 2427 5450 brendan mutex_enter(&vd->vdev_stat_lock); 2428 5450 brendan vd->vdev_stat.vs_space = 0; 2429 5450 brendan vd->vdev_stat.vs_dspace = 0; 2430 5450 brendan vd->vdev_stat.vs_alloc = 0; 2431 5450 brendan mutex_exit(&vd->vdev_stat_lock); 2432 5450 brendan } 2433 5450 brendan 2434 5450 brendan void 2435 7754 Jeff vdev_stat_update(zio_t *zio, uint64_t psize) 2436 789 ahrens { 2437 8241 Jeff spa_t *spa = zio->io_spa; 2438 8241 Jeff vdev_t *rvd = spa->spa_root_vdev; 2439 7754 Jeff vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; 2440 789 ahrens vdev_t *pvd; 2441 789 ahrens uint64_t txg = zio->io_txg; 2442 789 ahrens vdev_stat_t *vs = &vd->vdev_stat; 2443 789 ahrens zio_type_t type = zio->io_type; 2444 789 ahrens int flags = zio->io_flags; 2445 789 ahrens 2446 7754 Jeff /* 2447 7754 Jeff * If this i/o is a gang leader, it didn't do any actual work. 2448 7754 Jeff */ 2449 7754 Jeff if (zio->io_gang_tree) 2450 7754 Jeff return; 2451 7754 Jeff 2452 789 ahrens if (zio->io_error == 0) { 2453 7754 Jeff /* 2454 7754 Jeff * If this is a root i/o, don't count it -- we've already 2455 7754 Jeff * counted the top-level vdevs, and vdev_get_stats() will 2456 7754 Jeff * aggregate them when asked. 
This reduces contention on 2457 7754 Jeff * the root vdev_stat_lock and implicitly handles blocks 2458 7754 Jeff * that compress away to holes, for which there is no i/o. 2459 7754 Jeff * (Holes never create vdev children, so all the counters 2460 7754 Jeff * remain zero, which is what we want.) 2461 7754 Jeff * 2462 7754 Jeff * Note: this only applies to successful i/o (io_error == 0) 2463 7754 Jeff * because unlike i/o counts, errors are not additive. 2464 7754 Jeff * When reading a ditto block, for example, failure of 2465 7754 Jeff * one top-level vdev does not imply a root-level error. 2466 7754 Jeff */ 2467 7754 Jeff if (vd == rvd) 2468 7754 Jeff return; 2469 7754 Jeff 2470 7754 Jeff ASSERT(vd == zio->io_vd); 2471 8241 Jeff 2472 8241 Jeff if (flags & ZIO_FLAG_IO_BYPASS) 2473 8241 Jeff return; 2474 8241 Jeff 2475 8241 Jeff mutex_enter(&vd->vdev_stat_lock); 2476 8241 Jeff 2477 7754 Jeff if (flags & ZIO_FLAG_IO_REPAIR) { 2478 1807 bonwick if (flags & ZIO_FLAG_SCRUB_THREAD) 2479 7754 Jeff vs->vs_scrub_repaired += psize; 2480 8241 Jeff if (flags & ZIO_FLAG_SELF_HEAL) 2481 7754 Jeff vs->vs_self_healed += psize; 2482 789 ahrens } 2483 8241 Jeff 2484 8241 Jeff vs->vs_ops[type]++; 2485 8241 Jeff vs->vs_bytes[type] += psize; 2486 8241 Jeff 2487 8241 Jeff mutex_exit(&vd->vdev_stat_lock); 2488 789 ahrens return; 2489 789 ahrens } 2490 789 ahrens 2491 789 ahrens if (flags & ZIO_FLAG_SPECULATIVE) 2492 789 ahrens return; 2493 789 ahrens 2494 9725 Eric /* 2495 9725 Eric * If this is an I/O error that is going to be retried, then ignore the 2496 9725 Eric * error. Otherwise, the user may interpret B_FAILFAST I/O errors as 2497 9725 Eric * hard errors, when in reality they can happen for any number of 2498 9725 Eric * innocuous reasons (bus resets, MPxIO link failure, etc). 
2499 9725 Eric */ 2500 9725 Eric if (zio->io_error == EIO && 2501 9725 Eric !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 2502 10685 George return; 2503 10685 George 2504 10685 George /* 2505 10685 George * Intent logs writes won't propagate their error to the root 2506 10685 George * I/O so don't mark these types of failures as pool-level 2507 10685 George * errors. 2508 10685 George */ 2509 10685 George if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 2510 9725 Eric return; 2511 9725 Eric 2512 7754 Jeff mutex_enter(&vd->vdev_stat_lock); 2513 9230 George if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { 2514 7754 Jeff if (zio->io_error == ECKSUM) 2515 7754 Jeff vs->vs_checksum_errors++; 2516 7754 Jeff else 2517 7754 Jeff vs->vs_read_errors++; 2518 789 ahrens } 2519 9230 George if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) 2520 7754 Jeff vs->vs_write_errors++; 2521 7754 Jeff mutex_exit(&vd->vdev_stat_lock); 2522 789 ahrens 2523 8241 Jeff if (type == ZIO_TYPE_WRITE && txg != 0 && 2524 8241 Jeff (!(flags & ZIO_FLAG_IO_REPAIR) || 2525 10922 Jeff (flags & ZIO_FLAG_SCRUB_THREAD) || 2526 10922 Jeff spa->spa_claiming)) { 2527 8241 Jeff /* 2528 10922 Jeff * This is either a normal write (not a repair), or it's 2529 10922 Jeff * a repair induced by the scrub thread, or it's a repair 2530 10922 Jeff * made by zil_claim() during spa_load() in the first txg. 2531 10922 Jeff * In the normal case, we commit the DTL change in the same 2532 10922 Jeff * txg as the block was born. In the scrub-induced repair 2533 10922 Jeff * case, we know that scrubs run in first-pass syncing context, 2534 10922 Jeff * so we commit the DTL change in spa_syncing_txg(spa). 2535 10922 Jeff * In the zil_claim() case, we commit in spa_first_txg(spa). 
2536 8241 Jeff * 2537 8241 Jeff * We currently do not make DTL entries for failed spontaneous 2538 8241 Jeff * self-healing writes triggered by normal (non-scrubbing) 2539 8241 Jeff * reads, because we have no transactional context in which to 2540 8241 Jeff * do so -- and it's not clear that it'd be desirable anyway. 2541 8241 Jeff */ 2542 8241 Jeff if (vd->vdev_ops->vdev_op_leaf) { 2543 8241 Jeff uint64_t commit_txg = txg; 2544 8241 Jeff if (flags & ZIO_FLAG_SCRUB_THREAD) { 2545 8241 Jeff ASSERT(flags & ZIO_FLAG_IO_REPAIR); 2546 8241 Jeff ASSERT(spa_sync_pass(spa) == 1); 2547 8241 Jeff vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); 2548 10922 Jeff commit_txg = spa_syncing_txg(spa); 2549 10922 Jeff } else if (spa->spa_claiming) { 2550 10922 Jeff ASSERT(flags & ZIO_FLAG_IO_REPAIR); 2551 10922 Jeff commit_txg = spa_first_txg(spa); 2552 8241 Jeff } 2553 10922 Jeff ASSERT(commit_txg >= spa_syncing_txg(spa)); 2554 8241 Jeff if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) 2555 8241 Jeff return; 2556 8241 Jeff for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2557 8241 Jeff vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); 2558 8241 Jeff vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); 2559 789 ahrens } 2560 8241 Jeff if (vd != rvd) 2561 8241 Jeff vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); 2562 789 ahrens } 2563 789 ahrens } 2564 789 ahrens 2565 789 ahrens void 2566 789 ahrens vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) 2567 789 ahrens { 2568 789 ahrens vdev_stat_t *vs = &vd->vdev_stat; 2569 789 ahrens 2570 9816 George for (int c = 0; c < vd->vdev_children; c++) 2571 789 ahrens vdev_scrub_stat_update(vd->vdev_child[c], type, complete); 2572 789 ahrens 2573 789 ahrens mutex_enter(&vd->vdev_stat_lock); 2574 789 ahrens 2575 789 ahrens if (type == POOL_SCRUB_NONE) { 2576 789 ahrens /* 2577 789 ahrens * Update completion and end time. Leave everything else alone 2578 789 ahrens * so we can report what happened during the previous scrub. 
2579 789 ahrens */ 2580 789 ahrens vs->vs_scrub_complete = complete; 2581 789 ahrens vs->vs_scrub_end = gethrestime_sec(); 2582 789 ahrens } else { 2583 789 ahrens vs->vs_scrub_type = type; 2584 789 ahrens vs->vs_scrub_complete = 0; 2585 789 ahrens vs->vs_scrub_examined = 0; 2586 789 ahrens vs->vs_scrub_repaired = 0; 2587 789 ahrens vs->vs_scrub_start = gethrestime_sec(); 2588 789 ahrens vs->vs_scrub_end = 0; 2589 789 ahrens } 2590 789 ahrens 2591 789 ahrens mutex_exit(&vd->vdev_stat_lock); 2592 789 ahrens } 2593 789 ahrens 2594 789 ahrens /* 2595 10922 Jeff * Update the in-core space usage stats for this vdev, its metaslab class, 2596 10922 Jeff * and the root vdev. 2597 789 ahrens */ 2598 789 ahrens void 2599 10922 Jeff vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, 2600 10922 Jeff int64_t space_delta) 2601 789 ahrens { 2602 4527 perrin int64_t dspace_delta = space_delta; 2603 4527 perrin spa_t *spa = vd->vdev_spa; 2604 4527 perrin vdev_t *rvd = spa->spa_root_vdev; 2605 10922 Jeff metaslab_group_t *mg = vd->vdev_mg; 2606 10922 Jeff metaslab_class_t *mc = mg ? mg->mg_class : NULL; 2607 4527 perrin 2608 789 ahrens ASSERT(vd == vd->vdev_top); 2609 789 ahrens 2610 4527 perrin /* 2611 4527 perrin * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 2612 4527 perrin * factor. We must calculate this here and not at the root vdev 2613 4527 perrin * because the root vdev's psize-to-asize is simply the max of its 2614 4527 perrin * childrens', thus not accurate enough for us. 
2615 4527 perrin */ 2616 4527 perrin ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 2617 9701 George ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); 2618 4527 perrin dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 2619 4527 perrin vd->vdev_deflate_ratio; 2620 2082 eschrock 2621 4527 perrin mutex_enter(&vd->vdev_stat_lock); 2622 10922 Jeff vd->vdev_stat.vs_alloc += alloc_delta; 2623 4527 perrin vd->vdev_stat.vs_space += space_delta; 2624 4527 perrin vd->vdev_stat.vs_dspace += dspace_delta; 2625 4527 perrin mutex_exit(&vd->vdev_stat_lock); 2626 4527 perrin 2627 10922 Jeff if (mc == spa_normal_class(spa)) { 2628 10922 Jeff mutex_enter(&rvd->vdev_stat_lock); 2629 10922 Jeff rvd->vdev_stat.vs_alloc += alloc_delta; 2630 10922 Jeff rvd->vdev_stat.vs_space += space_delta; 2631 10922 Jeff rvd->vdev_stat.vs_dspace += dspace_delta; 2632 10922 Jeff mutex_exit(&rvd->vdev_stat_lock); 2633 10922 Jeff } 2634 10922 Jeff 2635 10922 Jeff if (mc != NULL) { 2636 5450 brendan ASSERT(rvd == vd->vdev_parent); 2637 5450 brendan ASSERT(vd->vdev_ms_count != 0); 2638 4527 perrin 2639 10922 Jeff metaslab_class_space_update(mc, 2640 10922 Jeff alloc_delta, defer_delta, space_delta, dspace_delta); 2641 5450 brendan } 2642 789 ahrens } 2643 789 ahrens 2644 789 ahrens /* 2645 789 ahrens * Mark a top-level vdev's config as dirty, placing it on the dirty list 2646 789 ahrens * so that it will be written out next time the vdev configuration is synced. 2647 789 ahrens * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 2648 789 ahrens */ 2649 789 ahrens void 2650 789 ahrens vdev_config_dirty(vdev_t *vd) 2651 789 ahrens { 2652 789 ahrens spa_t *spa = vd->vdev_spa; 2653 789 ahrens vdev_t *rvd = spa->spa_root_vdev; 2654 789 ahrens int c; 2655 789 ahrens 2656 1601 bonwick /* 2657 9425 Eric * If this is an aux vdev (as with l2cache and spare devices), then we 2658 9425 Eric * update the vdev config manually and set the sync flag. 
2659 6643 eschrock */ 2660 6643 eschrock if (vd->vdev_aux != NULL) { 2661 6643 eschrock spa_aux_vdev_t *sav = vd->vdev_aux; 2662 6643 eschrock nvlist_t **aux; 2663 6643 eschrock uint_t naux; 2664 6643 eschrock 2665 6643 eschrock for (c = 0; c < sav->sav_count; c++) { 2666 6643 eschrock if (sav->sav_vdevs[c] == vd) 2667 6643 eschrock break; 2668 6643 eschrock } 2669 6643 eschrock 2670 7754 Jeff if (c == sav->sav_count) { 2671 7754 Jeff /* 2672 7754 Jeff * We're being removed. There's nothing more to do. 2673 7754 Jeff */ 2674 7754 Jeff ASSERT(sav->sav_sync == B_TRUE); 2675 7754 Jeff return; 2676 7754 Jeff } 2677 7754 Jeff 2678 6643 eschrock sav->sav_sync = B_TRUE; 2679 6643 eschrock 2680 9425 Eric if (nvlist_lookup_nvlist_array(sav->sav_config, 2681 9425 Eric ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { 2682 9425 Eric VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 2683 9425 Eric ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); 2684 9425 Eric } 2685 6643 eschrock 2686 6643 eschrock ASSERT(c < naux); 2687 6643 eschrock 2688 6643 eschrock /* 2689 6643 eschrock * Setting the nvlist in the middle if the array is a little 2690 6643 eschrock * sketchy, but it will work. 2691 6643 eschrock */ 2692 6643 eschrock nvlist_free(aux[c]); 2693 6643 eschrock aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); 2694 6643 eschrock 2695 6643 eschrock return; 2696 6643 eschrock } 2697 6643 eschrock 2698 6643 eschrock /* 2699 7754 Jeff * The dirty list is protected by the SCL_CONFIG lock. The caller 2700 7754 Jeff * must either hold SCL_CONFIG as writer, or must be the sync thread 2701 7754 Jeff * (which holds SCL_CONFIG as reader). There's only one sync thread, 2702 1601 bonwick * so this is sufficient to ensure mutual exclusion. 
2703 1601 bonwick */ 2704 7754 Jeff ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 2705 7754 Jeff (dsl_pool_sync_context(spa_get_dsl(spa)) && 2706 7754 Jeff spa_config_held(spa, SCL_CONFIG, RW_READER))); 2707 1601 bonwick 2708 789 ahrens if (vd == rvd) { 2709 789 ahrens for (c = 0; c < rvd->vdev_children; c++) 2710 789 ahrens vdev_config_dirty(rvd->vdev_child[c]); 2711 789 ahrens } else { 2712 789 ahrens ASSERT(vd == vd->vdev_top); 2713 789 ahrens 2714 10594 George if (!list_link_active(&vd->vdev_config_dirty_node) && 2715 10594 George !vd->vdev_ishole) 2716 7754 Jeff list_insert_head(&spa->spa_config_dirty_list, vd); 2717 789 ahrens } 2718 789 ahrens } 2719 789 ahrens 2720 789 ahrens void 2721 789 ahrens vdev_config_clean(vdev_t *vd) 2722 789 ahrens { 2723 1601 bonwick spa_t *spa = vd->vdev_spa; 2724 1601 bonwick 2725 7754 Jeff ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 2726 7754 Jeff (dsl_pool_sync_context(spa_get_dsl(spa)) && 2727 7754 Jeff spa_config_held(spa, SCL_CONFIG, RW_READER))); 2728 1601 bonwick 2729 7754 Jeff ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 2730 7754 Jeff list_remove(&spa->spa_config_dirty_list, vd); 2731 7754 Jeff } 2732 7754 Jeff 2733 7754 Jeff /* 2734 7754 Jeff * Mark a top-level vdev's state as dirty, so that the next pass of 2735 7754 Jeff * spa_sync() can convert this into vdev_config_dirty(). We distinguish 2736 7754 Jeff * the state changes from larger config changes because they require 2737 7754 Jeff * much less locking, and are often needed for administrative actions. 2738 7754 Jeff */ 2739 7754 Jeff void 2740 7754 Jeff vdev_state_dirty(vdev_t *vd) 2741 7754 Jeff { 2742 7754 Jeff spa_t *spa = vd->vdev_spa; 2743 7754 Jeff 2744 7754 Jeff ASSERT(vd == vd->vdev_top); 2745 7754 Jeff 2746 7754 Jeff /* 2747 7754 Jeff * The state list is protected by the SCL_STATE lock. 
The caller 2748 7754 Jeff * must either hold SCL_STATE as writer, or must be the sync thread 2749 7754 Jeff * (which holds SCL_STATE as reader). There's only one sync thread, 2750 7754 Jeff * so this is sufficient to ensure mutual exclusion. 2751 7754 Jeff */ 2752 7754 Jeff ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 2753 7754 Jeff (dsl_pool_sync_context(spa_get_dsl(spa)) && 2754 7754 Jeff spa_config_held(spa, SCL_STATE, RW_READER))); 2755 7754 Jeff 2756 10922 Jeff if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) 2757 7754 Jeff list_insert_head(&spa->spa_state_dirty_list, vd); 2758 7754 Jeff } 2759 7754 Jeff 2760 7754 Jeff void 2761 7754 Jeff vdev_state_clean(vdev_t *vd) 2762 7754 Jeff { 2763 7754 Jeff spa_t *spa = vd->vdev_spa; 2764 7754 Jeff 2765 7754 Jeff ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 2766 7754 Jeff (dsl_pool_sync_context(spa_get_dsl(spa)) && 2767 7754 Jeff spa_config_held(spa, SCL_STATE, RW_READER))); 2768 7754 Jeff 2769 7754 Jeff ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 2770 7754 Jeff list_remove(&spa->spa_state_dirty_list, vd); 2771 789 ahrens } 2772 789 ahrens 2773 6523 ek110237 /* 2774 6523 ek110237 * Propagate vdev state up from children to parent. 2775 6523 ek110237 */ 2776 1775 billm void 2777 1775 billm vdev_propagate_state(vdev_t *vd) 2778 1775 billm { 2779 8241 Jeff spa_t *spa = vd->vdev_spa; 2780 8241 Jeff vdev_t *rvd = spa->spa_root_vdev; 2781 1775 billm int degraded = 0, faulted = 0; 2782 1775 billm int corrupted = 0; 2783 1775 billm vdev_t *child; 2784 1775 billm 2785 4451 eschrock if (vd->vdev_children > 0) { 2786 9816 George for (int c = 0; c < vd->vdev_children; c++) { 2787 4451 eschrock child = vd->vdev_child[c]; 2788 10594 George 2789 10594 George /* 2790 10594 George * Don't factor holes into the decision. 
2791 10594 George */ 2792 10594 George if (child->vdev_ishole) 2793 10594 George continue; 2794 6976 eschrock 2795 7754 Jeff if (!vdev_readable(child) || 2796 8241 Jeff (!vdev_writeable(child) && spa_writeable(spa))) { 2797 6976 eschrock /* 2798 6976 eschrock * Root special: if there is a top-level log 2799 6976 eschrock * device, treat the root vdev as if it were 2800 6976 eschrock * degraded. 2801 6976 eschrock */ 2802 6976 eschrock if (child->vdev_islog && vd == rvd) 2803 6976 eschrock degraded++; 2804 6976 eschrock else 2805 6976 eschrock faulted++; 2806 6976 eschrock } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 2807 4451 eschrock degraded++; 2808 6976 eschrock } 2809 1775 billm 2810 4451 eschrock if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 2811 4451 eschrock corrupted++; 2812 4451 eschrock } 2813 4451 eschrock 2814 4451 eschrock vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 2815 4451 eschrock 2816 4451 eschrock /* 2817 7754 Jeff * Root special: if there is a top-level vdev that cannot be 2818 4451 eschrock * opened due to corrupted metadata, then propagate the root 2819 4451 eschrock * vdev's aux state as 'corrupt' rather than 'insufficient 2820 4451 eschrock * replicas'. 2821 4451 eschrock */ 2822 4451 eschrock if (corrupted && vd == rvd && 2823 4451 eschrock rvd->vdev_state == VDEV_STATE_CANT_OPEN) 2824 4451 eschrock vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 2825 4451 eschrock VDEV_AUX_CORRUPT_DATA); 2826 1775 billm } 2827 1775 billm 2828 6976 eschrock if (vd->vdev_parent) 2829 4451 eschrock vdev_propagate_state(vd->vdev_parent); 2830 1775 billm } 2831 1775 billm 2832 789 ahrens /* 2833 1544 eschrock * Set a vdev's state. If this is during an open, we don't update the parent 2834 1544 eschrock * state, because we're in the process of opening children depth-first. 2835 1544 eschrock * Otherwise, we propagate the change to the parent. 
2836 1544 eschrock * 2837 1544 eschrock * If this routine places a device in a faulted state, an appropriate ereport is 2838 1544 eschrock * generated. 2839 789 ahrens */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t old_state;
	boolean_t stay_removed;

	/* Same state as before: only the aux reason is refreshed. */
	if (state == vd->vdev_state) {
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	old_state = vd->vdev_state;
	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * Whenever the new state is anything but an open state, close the
	 * underlying device; otherwise accessible but invalid devices would
	 * stay open forever.  vdev_close() itself is not used because it
	 * implies some extra checks (offline, etc) that are not wanted
	 * here.  This is limited to leaf devices, because otherwise closing
	 * the device will affect other children.
	 */
	if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	/*
	 * A vdev that has been brought back into service is reported to
	 * fmd so that it can gracefully repair any outstanding cases due
	 * to a missing device.  This is done in all cases, even those that
	 * probably don't correlate to a repaired fault: it is sure to catch
	 * everything, and the zfs-retire agent sorts it out.  A transient
	 * state is OK too, as the retire agent double-checks the state of
	 * the vdev before repairing it.
	 */
	if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
	    vd->vdev_prevstate != state)
		zfs_post_state_change(spa, vd);

	/*
	 * The device was previously marked removed and someone attempted
	 * to reopen it.  If that failed due to a nonexistent device, the
	 * device is kept in the REMOVED state.  The same applies to the
	 * special test-online case (vdev_checkremove), which only attempts
	 * to online the device and shouldn't generate an FMA fault.
	 */
	stay_removed = vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove);

	if (stay_removed) {
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		boolean_t post_ereport;

		/*
		 * A leaf that fails to open during an import is marked
		 * "not available", which signifies that it was never there
		 * to begin with.  Failure to open such a device is not
		 * considered an error.
		 */
		if (spa_load_state(spa) == SPA_LOAD_IMPORT &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Decide whether to post an ereport.  A 'prevstate' other
		 * than VDEV_STATE_UNKNOWN indicates that this is part of a
		 * vdev_reopen(); in that case no ereport is wanted if the
		 * device was already in the CANT_OPEN state beforehand.
		 *
		 * If 'checkremove' is set, this is an attempt to online the
		 * device in response to an insertion event for a faulted or
		 * offline device that wasn't in the removed state.  No
		 * ereport is posted then, because the device is about to be
		 * replaced, or onlined with vdev_forcefault, which will
		 * generate the fault itself.
		 */
		post_ereport =
		    (vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev;

		if (post_ereport) {
			const char *ereport_class;

			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				ereport_class =
				    FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				ereport_class =
				    FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				ereport_class =
				    FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				ereport_class =
				    FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				ereport_class =
				    FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				ereport_class =
				    FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			case VDEV_AUX_IO_FAILURE:
				ereport_class = FM_EREPORT_ZFS_IO_FAILURE;
				break;
			default:
				ereport_class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
				break;
			}

			zfs_ereport_post(ereport_class, spa, vd, NULL,
			    old_state, 0);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	/* During an open the parent is left alone; see the header comment. */
	if (!isopen && vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

2966 7042 gw25295 /* 2967 7042 gw25295 * Check the vdev configuration to ensure that it's capable of supporting 2968 7042 gw25295 * a root pool. Currently, we do not support RAID-Z or partial configuration. 2969 7042 gw25295 * In addition, only a single top-level vdev is allowed and none of the leaves 2970 7042 gw25295 * can be wholedisks.
2971 7042 gw25295 */
boolean_t
vdev_is_bootable(vdev_t *vd)
{
	if (vd->vdev_ops->vdev_op_leaf) {
		/* A leaf with vdev_wholedisk == 1 is rejected. */
		if (vd->vdev_wholedisk == 1)
			return (B_FALSE);
	} else {
		char *kind = vd->vdev_ops->vdev_op_type;

		/* The root may have at most one child (top-level vdev). */
		if (strcmp(kind, VDEV_TYPE_ROOT) == 0 &&
		    vd->vdev_children > 1)
			return (B_FALSE);

		/* RAID-Z and missing vdevs are rejected wherever they occur. */
		if (strcmp(kind, VDEV_TYPE_RAIDZ) == 0 ||
		    strcmp(kind, VDEV_TYPE_MISSING) == 0)
			return (B_FALSE);
	}

	/* Every vdev beneath this one has to pass the same checks. */
	for (int c = 0; c < vd->vdev_children; c++) {
		if (!vdev_is_bootable(vd->vdev_child[c]))
			return (B_FALSE);
	}

	return (B_TRUE);
}

2996 10594 George /* 2997 10594 George * Load the state from the original vdev tree (ovd) which 2998 10594 George * we've retrieved from the MOS config object. If the original 2999 10594 George * vdev was offline then we transfer that state to the device 3000 10594 George * in the current vdev tree (nvd). 3001 10594 George */ 3002 9701 George void 3003 10594 George vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3004 9701 George { 3005 10594 George spa_t *spa = nvd->vdev_spa; 3006 9701 George 3007 10594 George ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3008 10594 George