1 789 ahrens /* 2 789 ahrens * CDDL HEADER START 3 789 ahrens * 4 789 ahrens * The contents of this file are subject to the terms of the 5 1544 eschrock * Common Development and Distribution License (the "License"). 6 1544 eschrock * You may not use this file except in compliance with the License. 7 789 ahrens * 8 789 ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 789 ahrens * or http://www.opensolaris.org/os/licensing. 10 789 ahrens * See the License for the specific language governing permissions 11 789 ahrens * and limitations under the License. 12 789 ahrens * 13 789 ahrens * When distributing Covered Code, include this CDDL HEADER in each 14 789 ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 789 ahrens * If applicable, add the following below this CDDL HEADER, with the 16 789 ahrens * fields enclosed by brackets "[]" replaced with your own identifying 17 789 ahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18 789 ahrens * 19 789 ahrens * CDDL HEADER END 20 789 ahrens */ 21 789 ahrens /* 22 9480 George * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 789 ahrens * Use is subject to license terms. 24 789 ahrens */ 25 789 ahrens 26 789 ahrens #include <sys/zfs_context.h> 27 789 ahrens #include <sys/dmu.h> 28 789 ahrens #include <sys/dmu_tx.h> 29 789 ahrens #include <sys/space_map.h> 30 789 ahrens #include <sys/metaslab_impl.h> 31 789 ahrens #include <sys/vdev_impl.h> 32 789 ahrens #include <sys/zio.h> 33 2391 maybee 34 2391 maybee uint64_t metaslab_aliquot = 512ULL << 10; 35 5530 bonwick uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 36 10922 Jeff 37 10922 Jeff /* 38 10922 Jeff * Metaslab debugging: when set, keeps all space maps in core to verify frees. 39 10922 Jeff */ 40 10922 Jeff static int metaslab_debug = 0; 41 789 ahrens 42 789 ahrens /* 43 9480 George * Minimum size which forces the dynamic allocator to change 44 11146 George * it's allocation strategy. Once the space map cannot satisfy 45 9480 George * an allocation of this size then it switches to using more 46 9480 George * aggressive strategy (i.e search by size rather than offset). 47 9480 George */ 48 9480 George uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; 49 9480 George 50 9480 George /* 51 9480 George * The minimum free space, in percent, which must be available 52 9480 George * in a space map to continue allocations in a first-fit fashion. 53 9480 George * Once the space_map's free space drops below this level we dynamically 54 9480 George * switch to using best-fit allocations. 55 9480 George */ 56 11146 George int metaslab_df_free_pct = 4; 57 11146 George 58 11146 George /* 59 11146 George * A metaslab is considered "free" if it contains a contiguous 60 11146 George * segment which is greater than metaslab_min_alloc_size. 61 11146 George */ 62 11146 George uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 63 11146 George 64 11146 George /* 65 11146 George * Max number of space_maps to prefetch. 66 11146 George */ 67 11146 George int metaslab_prefetch_limit = SPA_DVAS_PER_BP; 68 11146 George 69 11146 George /* 70 11146 George * Percentage bonus multiplier for metaslabs that are in the bonus area. 71 11146 George */ 72 11146 George int metaslab_smo_bonus_pct = 150; 73 9480 George 74 9480 George /* 75 789 ahrens * ========================================================================== 76 789 ahrens * Metaslab classes 77 789 ahrens * ========================================================================== 78 789 ahrens */ 79 789 ahrens metaslab_class_t * 80 10594 George metaslab_class_create(spa_t *spa, space_map_ops_t *ops) 81 789 ahrens { 82 789 ahrens metaslab_class_t *mc; 83 789 ahrens 84 789 ahrens mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 85 789 ahrens 86 10594 George mc->mc_spa = spa; 87 789 ahrens mc->mc_rotor = NULL; 88 9480 George mc->mc_ops = ops; 89 789 ahrens 90 789 ahrens return (mc); 91 789 ahrens } 92 789 ahrens 93 789 ahrens void 94 789 ahrens metaslab_class_destroy(metaslab_class_t *mc) 95 789 ahrens { 96 10974 Jeff ASSERT(mc->mc_rotor == NULL); 97 10974 Jeff ASSERT(mc->mc_alloc == 0); 98 10974 Jeff ASSERT(mc->mc_deferred == 0); 99 10974 Jeff ASSERT(mc->mc_space == 0); 100 10974 Jeff ASSERT(mc->mc_dspace == 0); 101 789 ahrens 102 789 ahrens kmem_free(mc, sizeof (metaslab_class_t)); 103 10594 George } 104 10594 George 105 10594 George int 106 10594 George metaslab_class_validate(metaslab_class_t *mc) 107 10594 George { 108 10594 George metaslab_group_t *mg; 109 10594 George vdev_t *vd; 110 10594 George 111 10594 George /* 112 10594 George * Must hold one of the spa_config locks. 113 10594 George */ 114 10594 George ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 115 10594 George spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 116 10594 George 117 10594 George if ((mg = mc->mc_rotor) == NULL) 118 10594 George return (0); 119 10594 George 120 10594 George do { 121 10594 George vd = mg->mg_vd; 122 10594 George ASSERT(vd->vdev_mg != NULL); 123 10594 George ASSERT3P(vd->vdev_top, ==, vd); 124 10594 George ASSERT3P(mg->mg_class, ==, mc); 125 10594 George ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 126 10594 George } while ((mg = mg->mg_next) != mc->mc_rotor); 127 10594 George 128 10594 George return (0); 129 10922 Jeff } 130 10922 Jeff 131 10922 Jeff void 132 10922 Jeff metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 133 10922 Jeff int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 134 10922 Jeff { 135 10922 Jeff atomic_add_64(&mc->mc_alloc, alloc_delta); 136 10922 Jeff atomic_add_64(&mc->mc_deferred, defer_delta); 137 10922 Jeff atomic_add_64(&mc->mc_space, space_delta); 138 10922 Jeff atomic_add_64(&mc->mc_dspace, dspace_delta); 139 10922 Jeff } 140 10922 Jeff 141 10922 Jeff uint64_t 142 10922 Jeff metaslab_class_get_alloc(metaslab_class_t *mc) 143 10922 Jeff { 144 10922 Jeff return (mc->mc_alloc); 145 10922 Jeff } 146 10922 Jeff 147 10922 Jeff uint64_t 148 10922 Jeff metaslab_class_get_deferred(metaslab_class_t *mc) 149 10922 Jeff { 150 10922 Jeff return (mc->mc_deferred); 151 10922 Jeff } 152 10922 Jeff 153 10922 Jeff uint64_t 154 10922 Jeff metaslab_class_get_space(metaslab_class_t *mc) 155 10922 Jeff { 156 10922 Jeff return (mc->mc_space); 157 10922 Jeff } 158 10922 Jeff 159 10922 Jeff uint64_t 160 10922 Jeff metaslab_class_get_dspace(metaslab_class_t *mc) 161 10922 Jeff { 162 10922 Jeff return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 163 789 ahrens } 164 789 ahrens 165 789 ahrens /* 166 789 ahrens * ========================================================================== 167 789 ahrens * Metaslab groups 168 789 ahrens * ========================================================================== 169 789 ahrens */ 170 789 ahrens static int 171 789 ahrens metaslab_compare(const void *x1, const void *x2) 172 789 ahrens { 173 789 ahrens const metaslab_t *m1 = x1; 174 789 ahrens const metaslab_t *m2 = x2; 175 789 ahrens 176 789 ahrens if (m1->ms_weight < m2->ms_weight) 177 789 ahrens return (1); 178 789 ahrens if (m1->ms_weight > m2->ms_weight) 179 789 ahrens return (-1); 180 789 ahrens 181 789 ahrens /* 182 789 ahrens * If the weights are identical, use the offset to force uniqueness. 183 789 ahrens */ 184 789 ahrens if (m1->ms_map.sm_start < m2->ms_map.sm_start) 185 789 ahrens return (-1); 186 789 ahrens if (m1->ms_map.sm_start > m2->ms_map.sm_start) 187 789 ahrens return (1); 188 789 ahrens 189 789 ahrens ASSERT3P(m1, ==, m2); 190 789 ahrens 191 789 ahrens return (0); 192 789 ahrens } 193 789 ahrens 194 789 ahrens metaslab_group_t * 195 789 ahrens metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 196 789 ahrens { 197 789 ahrens metaslab_group_t *mg; 198 789 ahrens 199 789 ahrens mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 200 789 ahrens mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 201 789 ahrens avl_create(&mg->mg_metaslab_tree, metaslab_compare, 202 789 ahrens sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 203 789 ahrens mg->mg_vd = vd; 204 10974 Jeff mg->mg_class = mc; 205 10974 Jeff mg->mg_activation_count = 0; 206 789 ahrens 207 789 ahrens return (mg); 208 789 ahrens } 209 789 ahrens 210 789 ahrens void 211 789 ahrens metaslab_group_destroy(metaslab_group_t *mg) 212 789 ahrens { 213 10974 Jeff ASSERT(mg->mg_prev == NULL); 214 10974 Jeff ASSERT(mg->mg_next == NULL); 215 11026 Tim /* 216 11026 Tim * We may have gone below zero with the activation count 217 11026 Tim * either because we never activated in the first place or 218 11026 Tim * because we're done, and possibly removing the vdev. 219 11026 Tim */ 220 11026 Tim ASSERT(mg->mg_activation_count <= 0); 221 10974 Jeff 222 789 ahrens avl_destroy(&mg->mg_metaslab_tree); 223 789 ahrens mutex_destroy(&mg->mg_lock); 224 789 ahrens kmem_free(mg, sizeof (metaslab_group_t)); 225 10974 Jeff } 226 10974 Jeff 227 10974 Jeff void 228 10974 Jeff metaslab_group_activate(metaslab_group_t *mg) 229 10974 Jeff { 230 10974 Jeff metaslab_class_t *mc = mg->mg_class; 231 10974 Jeff metaslab_group_t *mgprev, *mgnext; 232 10974 Jeff 233 10974 Jeff ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 234 10974 Jeff 235 10974 Jeff ASSERT(mc->mc_rotor != mg); 236 10974 Jeff ASSERT(mg->mg_prev == NULL); 237 10974 Jeff ASSERT(mg->mg_next == NULL); 238 10974 Jeff ASSERT(mg->mg_activation_count <= 0); 239 10974 Jeff 240 10974 Jeff if (++mg->mg_activation_count <= 0) 241 10974 Jeff return; 242 10974 Jeff 243 10974 Jeff mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 244 10974 Jeff 245 10974 Jeff if ((mgprev = mc->mc_rotor) == NULL) { 246 10974 Jeff mg->mg_prev = mg; 247 10974 Jeff mg->mg_next = mg; 248 10974 Jeff } else { 249 10974 Jeff mgnext = mgprev->mg_next; 250 10974 Jeff mg->mg_prev = mgprev; 251 10974 Jeff mg->mg_next = mgnext; 252 10974 Jeff mgprev->mg_next = mg; 253 10974 Jeff mgnext->mg_prev = mg; 254 10974 Jeff } 255 10974 Jeff mc->mc_rotor = mg; 256 10974 Jeff } 257 10974 Jeff 258 10974 Jeff void 259 10974 Jeff metaslab_group_passivate(metaslab_group_t *mg) 260 10974 Jeff { 261 10974 Jeff metaslab_class_t *mc = mg->mg_class; 262 10974 Jeff metaslab_group_t *mgprev, *mgnext; 263 10974 Jeff 264 10974 Jeff ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 265 10974 Jeff 266 10974 Jeff if (--mg->mg_activation_count != 0) { 267 10974 Jeff ASSERT(mc->mc_rotor != mg); 268 10974 Jeff ASSERT(mg->mg_prev == NULL); 269 10974 Jeff ASSERT(mg->mg_next == NULL); 270 10974 Jeff ASSERT(mg->mg_activation_count < 0); 271 10974 Jeff return; 272 10974 Jeff } 273 10974 Jeff 274 10974 Jeff mgprev = mg->mg_prev; 275 10974 Jeff mgnext = mg->mg_next; 276 10974 Jeff 277 10974 Jeff if (mg == mgnext) { 278 10974 Jeff mc->mc_rotor = NULL; 279 10974 Jeff } else { 280 10974 Jeff mc->mc_rotor = mgnext; 281 10974 Jeff mgprev->mg_next = mgnext; 282 10974 Jeff mgnext->mg_prev = mgprev; 283 10974 Jeff } 284 10974 Jeff 285 10974 Jeff mg->mg_prev = NULL; 286 10974 Jeff mg->mg_next = NULL; 287 789 ahrens } 288 789 ahrens 289 1732 bonwick static void 290 1732 bonwick metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 291 789 ahrens { 292 789 ahrens mutex_enter(&mg->mg_lock); 293 789 ahrens ASSERT(msp->ms_group == NULL); 294 789 ahrens msp->ms_group = mg; 295 1732 bonwick msp->ms_weight = 0; 296 789 ahrens avl_add(&mg->mg_metaslab_tree, msp); 297 789 ahrens mutex_exit(&mg->mg_lock); 298 789 ahrens } 299 789 ahrens 300 1732 bonwick static void 301 789 ahrens metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 302 789 ahrens { 303 789 ahrens mutex_enter(&mg->mg_lock); 304 789 ahrens ASSERT(msp->ms_group == mg); 305 789 ahrens avl_remove(&mg->mg_metaslab_tree, msp); 306 789 ahrens msp->ms_group = NULL; 307 789 ahrens mutex_exit(&mg->mg_lock); 308 789 ahrens } 309 789 ahrens 310 1732 bonwick static void 311 789 ahrens metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 312 789 ahrens { 313 2459 ahrens /* 314 2459 ahrens * Although in principle the weight can be any value, in 315 2459 ahrens * practice we do not use values in the range [1, 510]. 316 2459 ahrens */ 317 2459 ahrens ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); 318 1732 bonwick ASSERT(MUTEX_HELD(&msp->ms_lock)); 319 1732 bonwick 320 789 ahrens mutex_enter(&mg->mg_lock); 321 789 ahrens ASSERT(msp->ms_group == mg); 322 789 ahrens avl_remove(&mg->mg_metaslab_tree, msp); 323 789 ahrens msp->ms_weight = weight; 324 789 ahrens avl_add(&mg->mg_metaslab_tree, msp); 325 789 ahrens mutex_exit(&mg->mg_lock); 326 789 ahrens } 327 789 ahrens 328 789 ahrens /* 329 11146 George * ========================================================================== 330 11146 George * Common allocator routines 331 11146 George * ========================================================================== 332 11146 George */ 333 11146 George static int 334 11146 George metaslab_segsize_compare(const void *x1, const void *x2) 335 11146 George { 336 11146 George const space_seg_t *s1 = x1; 337 11146 George const space_seg_t *s2 = x2; 338 11146 George uint64_t ss_size1 = s1->ss_end - s1->ss_start; 339 11146 George uint64_t ss_size2 = s2->ss_end - s2->ss_start; 340 11146 George 341 11146 George if (ss_size1 < ss_size2) 342 11146 George return (-1); 343 11146 George if (ss_size1 > ss_size2) 344 11146 George return (1); 345 11146 George 346 11146 George if (s1->ss_start < s2->ss_start) 347 11146 George return (-1); 348 11146 George if (s1->ss_start > s2->ss_start) 349 11146 George return (1); 350 11146 George 351 11146 George return (0); 352 11146 George } 353 11146 George 354 11146 George /* 355 9480 George * This is a helper function that can be used by the allocator to find 356 9480 George * a suitable block to allocate. This will search the specified AVL 357 9480 George * tree looking for a block that matches the specified criteria. 358 789 ahrens */ 359 9480 George static uint64_t 360 9480 George metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 361 9480 George uint64_t align) 362 789 ahrens { 363 789 ahrens space_seg_t *ss, ssearch; 364 789 ahrens avl_index_t where; 365 789 ahrens 366 789 ahrens ssearch.ss_start = *cursor; 367 789 ahrens ssearch.ss_end = *cursor + size; 368 789 ahrens 369 789 ahrens ss = avl_find(t, &ssearch, &where); 370 789 ahrens if (ss == NULL) 371 789 ahrens ss = avl_nearest(t, where, AVL_AFTER); 372 789 ahrens 373 789 ahrens while (ss != NULL) { 374 789 ahrens uint64_t offset = P2ROUNDUP(ss->ss_start, align); 375 789 ahrens 376 789 ahrens if (offset + size <= ss->ss_end) { 377 789 ahrens *cursor = offset + size; 378 789 ahrens return (offset); 379 789 ahrens } 380 789 ahrens ss = AVL_NEXT(t, ss); 381 789 ahrens } 382 789 ahrens 383 1732 bonwick /* 384 1732 bonwick * If we know we've searched the whole map (*cursor == 0), give up. 385 1732 bonwick * Otherwise, reset the cursor to the beginning and try again. 386 1732 bonwick */ 387 1732 bonwick if (*cursor == 0) 388 1732 bonwick return (-1ULL); 389 1732 bonwick 390 1732 bonwick *cursor = 0; 391 9480 George return (metaslab_block_picker(t, cursor, size, align)); 392 9480 George } 393 9480 George 394 9480 George static void 395 11146 George metaslab_pp_load(space_map_t *sm) 396 9480 George { 397 9480 George space_seg_t *ss; 398 9480 George 399 9480 George ASSERT(sm->sm_ppd == NULL); 400 9480 George sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); 401 9480 George 402 9480 George sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 403 11146 George avl_create(sm->sm_pp_root, metaslab_segsize_compare, 404 9480 George sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); 405 9480 George 406 9480 George for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) 407 9480 George avl_add(sm->sm_pp_root, ss); 408 9480 George } 409 9480 George 410 9480 George static void 411 11146 George metaslab_pp_unload(space_map_t *sm) 412 9480 George { 413 9480 George void *cookie = NULL; 414 9480 George 415 9480 George kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); 416 9480 George sm->sm_ppd = NULL; 417 9480 George 418 9480 George while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { 419 9480 George /* tear down the tree */ 420 9480 George } 421 9480 George 422 9480 George avl_destroy(sm->sm_pp_root); 423 9480 George kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); 424 9480 George sm->sm_pp_root = NULL; 425 9480 George } 426 9480 George 427 11146 George /* ARGSUSED */ 428 11146 George static void 429 11146 George metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) 430 11146 George { 431 11146 George /* No need to update cursor */ 432 11146 George } 433 11146 George 434 11146 George /* ARGSUSED */ 435 11146 George static void 436 11146 George metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) 437 11146 George { 438 11146 George /* No need to update cursor */ 439 11146 George } 440 11146 George 441 11146 George /* 442 11146 George * Return the maximum contiguous segment within the metaslab. 443 11146 George */ 444 11146 George uint64_t 445 11146 George metaslab_pp_maxsize(space_map_t *sm) 446 11146 George { 447 11146 George avl_tree_t *t = sm->sm_pp_root; 448 11146 George space_seg_t *ss; 449 11146 George 450 11146 George if (t == NULL || (ss = avl_last(t)) == NULL) 451 11146 George return (0ULL); 452 11146 George 453 11146 George return (ss->ss_end - ss->ss_start); 454 11146 George } 455 11146 George 456 11146 George /* 457 11146 George * ========================================================================== 458 11146 George * The first-fit block allocator 459 11146 George * ========================================================================== 460 11146 George */ 461 11146 George static uint64_t 462 11146 George metaslab_ff_alloc(space_map_t *sm, uint64_t size) 463 11146 George { 464 11146 George avl_tree_t *t = &sm->sm_root; 465 11146 George uint64_t align = size & -size; 466 11146 George uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; 467 11146 George 468 11146 George return (metaslab_block_picker(t, cursor, size, align)); 469 11146 George } 470 11146 George 471 11146 George /* ARGSUSED */ 472 11146 George boolean_t 473 11146 George metaslab_ff_fragmented(space_map_t *sm) 474 11146 George { 475 11146 George return (B_TRUE); 476 11146 George } 477 11146 George 478 11146 George static space_map_ops_t metaslab_ff_ops = { 479 11146 George metaslab_pp_load, 480 11146 George metaslab_pp_unload, 481 11146 George metaslab_ff_alloc, 482 11146 George metaslab_pp_claim, 483 11146 George metaslab_pp_free, 484 11146 George metaslab_pp_maxsize, 485 11146 George metaslab_ff_fragmented 486 11146 George }; 487 11146 George 488 11146 George /* 489 11146 George * ========================================================================== 490 11146 George * Dynamic block allocator - 491 11146 George * Uses the first fit allocation scheme until space get low and then 492 11146 George * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 493 11146 George * and metaslab_df_free_pct to determine when to switch the allocation scheme. 494 11146 George * ========================================================================== 495 11146 George */ 496 9480 George static uint64_t 497 9480 George metaslab_df_alloc(space_map_t *sm, uint64_t size) 498 9480 George { 499 9480 George avl_tree_t *t = &sm->sm_root; 500 9480 George uint64_t align = size & -size; 501 9480 George uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; 502 11146 George uint64_t max_size = metaslab_pp_maxsize(sm); 503 9480 George int free_pct = sm->sm_space * 100 / sm->sm_size; 504 9480 George 505 9480 George ASSERT(MUTEX_HELD(sm->sm_lock)); 506 9480 George ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 507 9480 George 508 9480 George if (max_size < size) 509 9480 George return (-1ULL); 510 9480 George 511 9480 George /* 512 9480 George * If we're running low on space switch to using the size 513 9480 George * sorted AVL tree (best-fit). 514 9480 George */ 515 9480 George if (max_size < metaslab_df_alloc_threshold || 516 9480 George free_pct < metaslab_df_free_pct) { 517 9480 George t = sm->sm_pp_root; 518 9480 George *cursor = 0; 519 9480 George } 520 9480 George 521 9480 George return (metaslab_block_picker(t, cursor, size, 1ULL)); 522 9480 George } 523 9480 George 524 11146 George static boolean_t 525 11146 George metaslab_df_fragmented(space_map_t *sm) 526 9480 George { 527 11146 George uint64_t max_size = metaslab_pp_maxsize(sm); 528 11146 George int free_pct = sm->sm_space * 100 / sm->sm_size; 529 9480 George 530 11146 George if (max_size >= metaslab_df_alloc_threshold && 531 11146 George free_pct >= metaslab_df_free_pct) 532 11146 George return (B_FALSE); 533 11146 George 534 11146 George return (B_TRUE); 535 9480 George } 536 9480 George 537 9480 George static space_map_ops_t metaslab_df_ops = { 538 11146 George metaslab_pp_load, 539 11146 George metaslab_pp_unload, 540 9480 George metaslab_df_alloc, 541 11146 George metaslab_pp_claim, 542 11146 George metaslab_pp_free, 543 11146 George metaslab_pp_maxsize, 544 11146 George metaslab_df_fragmented 545 11146 George }; 546 11146 George 547 11146 George /* 548 11146 George * ========================================================================== 549 11146 George * Other experimental allocators 550 11146 George * ========================================================================== 551 11146 George */ 552 11146 George static uint64_t 553 11146 George metaslab_cdf_alloc(space_map_t *sm, uint64_t size) 554 11146 George { 555 11146 George avl_tree_t *t = &sm->sm_root; 556 11146 George uint64_t *cursor = (uint64_t *)sm->sm_ppd; 557 11146 George uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; 558 11146 George uint64_t max_size = metaslab_pp_maxsize(sm); 559 11146 George uint64_t rsize = size; 560 11146 George uint64_t offset = 0; 561 11146 George 562 11146 George ASSERT(MUTEX_HELD(sm->sm_lock)); 563 11146 George ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 564 11146 George 565 11146 George if (max_size < size) 566 11146 George return (-1ULL); 567 11146 George 568 11146 George ASSERT3U(*extent_end, >=, *cursor); 569 11146 George 570 11146 George /* 571 11146 George * If we're running low on space switch to using the size 572 11146 George * sorted AVL tree (best-fit). 573 11146 George */ 574 11146 George if ((*cursor + size) > *extent_end) { 575 11146 George 576 11146 George t = sm->sm_pp_root; 577 11146 George *cursor = *extent_end = 0; 578 11146 George 579 11146 George if (max_size > 2 * SPA_MAXBLOCKSIZE) 580 11146 George rsize = MIN(metaslab_min_alloc_size, max_size); 581 11146 George offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); 582 11146 George if (offset != -1) 583 11146 George *cursor = offset + size; 584 11146 George } else { 585 11146 George offset = metaslab_block_picker(t, cursor, rsize, 1ULL); 586 11146 George } 587 11146 George ASSERT3U(*cursor, <=, *extent_end); 588 11146 George return (offset); 589 11146 George } 590 11146 George 591 11146 George static boolean_t 592 11146 George metaslab_cdf_fragmented(space_map_t *sm) 593 11146 George { 594 11146 George uint64_t max_size = metaslab_pp_maxsize(sm); 595 11146 George 596 11146 George if (max_size > (metaslab_min_alloc_size * 10)) 597 11146 George return (B_FALSE); 598 11146 George return (B_TRUE); 599 11146 George } 600 11146 George 601 11146 George static space_map_ops_t metaslab_cdf_ops = { 602 11146 George metaslab_pp_load, 603 11146 George metaslab_pp_unload, 604 11146 George metaslab_cdf_alloc, 605 11146 George metaslab_pp_claim, 606 11146 George metaslab_pp_free, 607 11146 George metaslab_pp_maxsize, 608 11146 George metaslab_cdf_fragmented 609 11146 George }; 610 11146 George 611 11146 George static uint64_t 612 11146 George metaslab_ndf_alloc(space_map_t *sm, uint64_t size) 613 11146 George { 614 11146 George avl_tree_t *t = &sm->sm_root; 615 11146 George avl_index_t where; 616 11146 George space_seg_t *ss, ssearch; 617 11146 George uint64_t *cursor = (uint64_t *)sm->sm_ppd; 618 11146 George uint64_t max_size = metaslab_pp_maxsize(sm); 619 11146 George 620 11146 George ASSERT(MUTEX_HELD(sm->sm_lock)); 621 11146 George ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 622 11146 George 623 11146 George if (max_size < size) 624 11146 George return (-1ULL); 625 11146 George 626 11146 George ssearch.ss_start = *cursor; 627 11146 George ssearch.ss_end = *cursor + size; 628 11146 George 629 11146 George ss = avl_find(t, &ssearch, &where); 630 11146 George if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { 631 11146 George t = sm->sm_pp_root; 632 11146 George 633 11146 George if (max_size > 2 * SPA_MAXBLOCKSIZE) 634 11146 George size = MIN(metaslab_min_alloc_size, max_size); 635 11146 George 636 11146 George ssearch.ss_start = 0; 637 11146 George ssearch.ss_end = size; 638 11146 George ss = avl_find(t, &ssearch, &where); 639 11146 George if (ss == NULL) 640 11146 George ss = avl_nearest(t, where, AVL_AFTER); 641 11146 George ASSERT(ss != NULL); 642 11146 George } 643 11146 George 644 11146 George if (ss != NULL) { 645 11146 George if (ss->ss_start + size <= ss->ss_end) { 646 11146 George *cursor = ss->ss_start + size; 647 11146 George return (ss->ss_start); 648 11146 George } 649 11146 George } 650 11146 George return (-1ULL); 651 11146 George } 652 11146 George 653 11146 George static boolean_t 654 11146 George metaslab_ndf_fragmented(space_map_t *sm) 655 11146 George { 656 11146 George uint64_t max_size = metaslab_pp_maxsize(sm); 657 11146 George 658 11146 George if (max_size > (metaslab_min_alloc_size * 10)) 659 11146 George return (B_FALSE); 660 11146 George return (B_TRUE); 661 11146 George } 662 11146 George 663 11146 George 664 11146 George static space_map_ops_t metaslab_ndf_ops = { 665 11146 George metaslab_pp_load, 666 11146 George metaslab_pp_unload, 667 11146 George metaslab_ndf_alloc, 668 11146 George metaslab_pp_claim, 669 11146 George metaslab_pp_free, 670 11146 George metaslab_pp_maxsize, 671 11146 George metaslab_ndf_fragmented 672 9480 George }; 673 9480 George 674 9480 George space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 675 1732 bonwick 676 1732 bonwick /* 677 1732 bonwick * ========================================================================== 678 1732 bonwick * Metaslabs 679 1732 bonwick * ========================================================================== 680 1732 bonwick */ 681 1732 bonwick metaslab_t * 682 1732 bonwick metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, 683 1732 bonwick uint64_t start, uint64_t size, uint64_t txg) 684 1732 bonwick { 685 1732 bonwick vdev_t *vd = mg->mg_vd; 686 1732 bonwick metaslab_t *msp; 687 1732 bonwick 688 1732 bonwick msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 689 2856 nd150628 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); 690 1732 bonwick 691 1732 bonwick msp->ms_smo_syncing = *smo; 692 1732 bonwick 693 1732 bonwick /* 694 1732 bonwick * We create the main space map here, but we don't create the 695 1732 bonwick * allocmaps and freemaps until metaslab_sync_done(). This serves 696 1732 bonwick * two purposes: it allows metaslab_sync_done() to detect the 697 1732 bonwick * addition of new space; and for debugging, it ensures that we'd 698 1732 bonwick * data fault on any attempt to use this metaslab before it's ready. 699 1732 bonwick */ 700 1732 bonwick space_map_create(&msp->ms_map, start, size, 701 1732 bonwick vd->vdev_ashift, &msp->ms_lock); 702 1732 bonwick 703 1732 bonwick metaslab_group_add(mg, msp); 704 1732 bonwick 705 10922 Jeff if (metaslab_debug && smo->smo_object != 0) { 706 10922 Jeff mutex_enter(&msp->ms_lock); 707 10922 Jeff VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, 708 10922 Jeff SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); 709 10922 Jeff mutex_exit(&msp->ms_lock); 710 10922 Jeff } 711 10922 Jeff 712 1732 bonwick /* 713 1732 bonwick * If we're opening an existing pool (txg == 0) or creating 714 1732 bonwick * a new one (txg == TXG_INITIAL), all space is available now. 715 1732 bonwick * If we're adding space to an existing pool, the new space 716 1732 bonwick * does not become available until after this txg has synced. 717 1732 bonwick */ 718 1732 bonwick if (txg <= TXG_INITIAL) 719 1732 bonwick metaslab_sync_done(msp, 0); 720 1732 bonwick 721 1732 bonwick if (txg != 0) { 722 1732 bonwick vdev_dirty(vd, 0, NULL, txg); 723 10921 Tim vdev_dirty(vd, VDD_METASLAB, msp, txg); 724 789 ahrens } 725 789 ahrens 726 1732 bonwick return (msp); 727 789 ahrens } 728 789 ahrens 729 1732 bonwick void 730 1732 bonwick metaslab_fini(metaslab_t *msp) 731 1732 bonwick { 732 1732 bonwick metaslab_group_t *mg = msp->ms_group; 733 1732 bonwick 734 10922 Jeff vdev_space_update(mg->mg_vd, 735 10922 Jeff -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); 736 1732 bonwick 737 1732 bonwick metaslab_group_remove(mg, msp); 738 1732 bonwick 739 1732 bonwick mutex_enter(&msp->ms_lock); 740 1732 bonwick 741 1732 bonwick space_map_unload(&msp->ms_map); 742 1732 bonwick space_map_destroy(&msp->ms_map); 743 1732 bonwick 744 10921 Tim for (int t = 0; t < TXG_SIZE; t++) { 745 1732 bonwick space_map_destroy(&msp->ms_allocmap[t]); 746 1732 bonwick space_map_destroy(&msp->ms_freemap[t]); 747 1732 bonwick } 748 10921 Tim 749 10921 Tim for (int t = 0; t < TXG_DEFER_SIZE; t++) 750 10921 Tim space_map_destroy(&msp->ms_defermap[t]); 751 10921 Tim 752 10921 Tim ASSERT3S(msp->ms_deferspace, ==, 0); 753 1732 bonwick 754 1732 bonwick mutex_exit(&msp->ms_lock); 755 2856 nd150628 mutex_destroy(&msp->ms_lock); 756 1732 bonwick 757 1732 bonwick kmem_free(msp, sizeof (metaslab_t)); 758 1732 bonwick } 759 1732 bonwick 760 1775 billm #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 761 1775 billm #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 762 1775 billm #define METASLAB_ACTIVE_MASK \ 763 1775 billm (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 764 1732 bonwick 765 789 ahrens static uint64_t 766 1732 bonwick metaslab_weight(metaslab_t *msp) 767 789 ahrens { 768 1775 billm metaslab_group_t *mg = msp->ms_group; 769 789 ahrens space_map_t *sm = &msp->ms_map; 770 1732 bonwick space_map_obj_t *smo = &msp->ms_smo; 771 1775 billm vdev_t *vd = mg->mg_vd; 772 1732 bonwick uint64_t weight, space; 773 789 ahrens 774 789 ahrens ASSERT(MUTEX_HELD(&msp->ms_lock)); 775 789 ahrens 776 1732 bonwick /* 777 1732 bonwick * The baseline weight is the metaslab's free space. 778 1732 bonwick */ 779 1732 bonwick space = sm->sm_size - smo->smo_alloc; 780 1732 bonwick weight = space; 781 1732 bonwick 782 1732 bonwick /* 783 1732 bonwick * Modern disks have uniform bit density and constant angular velocity. 784 1732 bonwick * Therefore, the outer recording zones are faster (higher bandwidth) 785 1732 bonwick * than the inner zones by the ratio of outer to inner track diameter, 786 1732 bonwick * which is typically around 2:1. We account for this by assigning 787 1732 bonwick * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 788 1732 bonwick * In effect, this means that we'll select the metaslab with the most 789 1732 bonwick * free bandwidth rather than simply the one with the most free space. 790 1732 bonwick */ 791 1732 bonwick weight = 2 * weight - 792 1732 bonwick ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; 793 1732 bonwick ASSERT(weight >= space && weight <= 2 * space); 794 1732 bonwick 795 1732 bonwick /* 796 11146 George * For locality, assign higher weight to metaslabs which have 797 11146 George * a lower offset than what we've already activated. 798 1732 bonwick */ 799 11146 George if (sm->sm_start <= mg->mg_bonus_area) 800 11146 George weight *= (metaslab_smo_bonus_pct / 100); 801 1775 billm ASSERT(weight >= space && 802 11146 George weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); 803 11146 George 804 11146 George if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { 805 11146 George /* 806 11146 George * If this metaslab is one we're actively using, adjust its 807 11146 George * weight to make it preferable to any inactive metaslab so 808 11146 George * we'll polish it off. 809 11146 George */ 810 11146 George weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 811 11146 George } 812 11146 George return (weight); 813 11146 George } 814 11146 George 815 11146 George static void 816 11146 George metaslab_prefetch(metaslab_group_t *mg) 817 11146 George { 818 11146 George spa_t *spa = mg->mg_vd->vdev_spa; 819 11146 George metaslab_t *msp; 820 11146 George avl_tree_t *t = &mg->mg_metaslab_tree; 821 11146 George int m; 822 11146 George 823 11146 George mutex_enter(&mg->mg_lock); 824 1732 bonwick 825 1732 bonwick /* 826 11146 George * Prefetch the next potential metaslabs 827 1732 bonwick */ 828 11146 George for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { 829 11146 George space_map_t *sm = &msp->ms_map; 830 11146 George space_map_obj_t *smo = &msp->ms_smo; 831 1732 bonwick 832 11146 George /* If we have reached our prefetch limit then we're done */ 833 11146 George if (m >= metaslab_prefetch_limit) 834 11146 George break; 835 11146 George 836 11146 George if (!sm->sm_loaded && smo->smo_object != 0) { 837 11146 George mutex_exit(&mg->mg_lock); 838 11146 George dmu_prefetch(spa_meta_objset(spa), smo->smo_object, 839 11146 George 0ULL, smo->smo_objsize); 840 11146 George mutex_enter(&mg->mg_lock); 841 11146 George } 842 11146 George } 843 11146 George mutex_exit(&mg->mg_lock); 844 1732 bonwick } 845 1732 bonwick 846 1732 bonwick static int 847 9480 George metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) 848 1732 bonwick { 849 11146 George metaslab_group_t *mg = msp->ms_group; 850 1732 bonwick space_map_t *sm = &msp->ms_map; 851 9480 George space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; 852 1732 bonwick 853 1732 bonwick ASSERT(MUTEX_HELD(&msp->ms_lock)); 854 1732 bonwick 855 1775 billm if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 856 10921 Tim space_map_load_wait(sm); 857 10921 Tim if (!sm->sm_loaded) { 858 10921 Tim int error = space_map_load(sm, sm_ops, SM_FREE, 859 10921 Tim &msp->ms_smo, 860 10922 Jeff spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); 861 11146 George if (error) { 862 10921 Tim metaslab_group_sort(msp->ms_group, msp, 0); 863 10921 Tim return (error); 864 10921 Tim } 865 10921 Tim for (int t = 0; t < TXG_DEFER_SIZE; t++) 866 10921 Tim space_map_walk(&msp->ms_defermap[t], 867 10921 Tim space_map_claim, sm); 868 11146 George 869 11146 George } 870 11146 George 871 11146 George /* 872 11146 George * Track the bonus area as we activate new metaslabs. 873 11146 George */ 874 11146 George if (sm->sm_start > mg->mg_bonus_area) { 875 11146 George mutex_enter(&mg->mg_lock); 876 11146 George mg->mg_bonus_area = sm->sm_start; 877 11146 George mutex_exit(&mg->mg_lock); 878 1732 bonwick } 879 9480 George 880 9480 George /* 881 9480 George * If we were able to load the map then make sure 882 9480 George * that this map is still able to satisfy our request. 883 9480 George */ 884 9480 George if (msp->ms_weight < size) 885 9480 George return (ENOSPC); 886 9480 George 887 1732 bonwick metaslab_group_sort(msp->ms_group, msp, 888 1775 billm msp->ms_weight | activation_weight); 889 789 ahrens } 890 1732 bonwick ASSERT(sm->sm_loaded); 891 1775 billm ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 892 1732 bonwick 893 1732 bonwick return (0); 894 1732 bonwick } 895 1732 bonwick 896 1732 bonwick static void 897 1732 bonwick metaslab_passivate(metaslab_t *msp, uint64_t size) 898 1732 bonwick { 899 2459 ahrens /* 900 2459 ahrens * If size < SPA_MINBLOCKSIZE, then we will not allocate from 901 2459 ahrens * this metaslab again. In that case, it had better be empty, 902 2459 ahrens * or we would be leaving space on the table. 903 2459 ahrens */ 904 2459 ahrens ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); 905 1775 billm metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 906 1775 billm ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 907 1732 bonwick } 908 1732 bonwick 909 1732 bonwick /* 910 1732 bonwick * Write a metaslab to disk in the context of the specified transaction group. 911 1732 bonwick */ 912 1732 bonwick void 913 1732 bonwick metaslab_sync(metaslab_t *msp, uint64_t txg) 914 1732 bonwick { 915 1732 bonwick vdev_t *vd = msp->ms_group->mg_vd; 916 1732 bonwick spa_t *spa = vd->vdev_spa; 917 10922 Jeff objset_t *mos = spa_meta_objset(spa); 918 1732 bonwick space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; 919 1732 bonwick space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; 920 1732 bonwick space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; 921 1732 bonwick space_map_t *sm = &msp->ms_map; 922 1732 bonwick space_map_obj_t *smo = &msp->ms_smo_syncing; 923 1732 bonwick dmu_buf_t *db; 924 1732 bonwick dmu_tx_t *tx; 925 1732 bonwick 926 10594 George ASSERT(!vd->vdev_ishole); 927 10594 George 928 10921 Tim if (allocmap->sm_space == 0 && freemap->sm_space == 0) 929 10921 Tim return; 930 1732 bonwick 931 1732 bonwick /* 932 1732 bonwick * The only state that can actually be changing concurrently with 933 1732 bonwick * metaslab_sync() is the metaslab's ms_map. No other thread can 934 1732 bonwick * be modifying this txg's allocmap, freemap, freed_map, or smo. 935 1732 bonwick * Therefore, we only hold ms_lock to satify space_map ASSERTs. 936 1732 bonwick * We drop it whenever we call into the DMU, because the DMU 937 1732 bonwick * can call down to us (e.g. via zio_free()) at any time. 938 1732 bonwick */ 939 10921 Tim 940 10921 Tim tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 941 1732 bonwick 942 1732 bonwick if (smo->smo_object == 0) { 943 1732 bonwick ASSERT(smo->smo_objsize == 0); 944 1732 bonwick ASSERT(smo->smo_alloc == 0); 945 1732 bonwick smo->smo_object = dmu_object_alloc(mos, 946 1732 bonwick DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, 947 1732 bonwick DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); 948 1732 bonwick ASSERT(smo->smo_object != 0); 949 1732 bonwick dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 950 1732 bonwick (sm->sm_start >> vd->vdev_ms_shift), 951 1732 bonwick sizeof (uint64_t), &smo->smo_object, tx); 952 1732 bonwick } 953 10921 Tim 954 10921 Tim mutex_enter(&msp->ms_lock); 955 1732 bonwick 956 1732 bonwick space_map_walk(freemap, space_map_add, freed_map); 957 1732 bonwick 958 1732 bonwick if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= 959 1732 bonwick 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { 960 1732 bonwick /* 961 1732 bonwick * The in-core space map representation is twice as compact 962 1732 bonwick * as the on-disk one, so it's time to condense the latter 963 1732 bonwick * by generating a pure allocmap from first principles. 964 1732 bonwick * 965 1732 bonwick * This metaslab is 100% allocated, 966 1732 bonwick * minus the content of the in-core map (sm), 967 1732 bonwick * minus what's been freed this txg (freed_map), 968 10921 Tim * minus deferred frees (ms_defermap[]), 969 1732 bonwick * minus allocations from txgs in the future 970 1732 bonwick * (because they haven't been committed yet). 971 1732 bonwick */ 972 1732 bonwick space_map_vacate(allocmap, NULL, NULL); 973 1732 bonwick space_map_vacate(freemap, NULL, NULL); 974 1732 bonwick 975 1732 bonwick space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); 976 1732 bonwick 977 1732 bonwick space_map_walk(sm, space_map_remove, allocmap); 978 1732 bonwick space_map_walk(freed_map, space_map_remove, allocmap); 979 1732 bonwick 980 10921 Tim for (int t = 0; t < TXG_DEFER_SIZE; t++) 981 10921 Tim space_map_walk(&msp->ms_defermap[t], 982 10921 Tim space_map_remove, allocmap); 983 10921 Tim 984 10921 Tim for (int t = 1; t < TXG_CONCURRENT_STATES; t++) 985 1732 bonwick space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], 986 1732 bonwick space_map_remove, allocmap); 987 1732 bonwick 988 1732 bonwick mutex_exit(&msp->ms_lock); 989 1732 bonwick space_map_truncate(smo, mos, tx); 990 1732 bonwick mutex_enter(&msp->ms_lock); 991 1732 bonwick } 992 1732 bonwick 993 1732 bonwick space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); 994 1732 bonwick space_map_sync(freemap, SM_FREE, smo, mos, tx); 995 1732 bonwick 996 1732 bonwick mutex_exit(&msp->ms_lock); 997 1732 bonwick 998 1732 bonwick VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); 999 1732 bonwick dmu_buf_will_dirty(db, tx); 1000 4944 maybee ASSERT3U(db->db_size, >=, sizeof (*smo)); 1001 4944 maybee bcopy(smo, db->db_data, sizeof (*smo)); 1002 1732 bonwick dmu_buf_rele(db, FTAG); 1003 1732 bonwick 1004 1732 bonwick dmu_tx_commit(tx); 1005 1732 bonwick } 1006 1732 bonwick 1007 1732 bonwick /* 1008 1732 bonwick * Called after a transaction group has completely synced to mark 1009 1732 bonwick * all of the metaslab's free space as usable. 1010 1732 bonwick */ 1011 1732 bonwick void 1012 1732 bonwick metaslab_sync_done(metaslab_t *msp, uint64_t txg) 1013 1732 bonwick { 1014 1732 bonwick space_map_obj_t *smo = &msp->ms_smo; 1015 1732 bonwick space_map_obj_t *smosync = &msp->ms_smo_syncing; 1016 1732 bonwick space_map_t *sm = &msp->ms_map; 1017 1732 bonwick space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; 1018 10921 Tim space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; 1019 1732 bonwick metaslab_group_t *mg = msp->ms_group; 1020 1732 bonwick vdev_t *vd = mg->mg_vd; 1021 10921 Tim int64_t alloc_delta, defer_delta; 1022 10594 George 1023 10594 George ASSERT(!vd->vdev_ishole); 1024 1732 bonwick 1025 1732 bonwick mutex_enter(&msp->ms_lock); 1026 1732 bonwick 1027 1732 bonwick /* 1028 1732 bonwick * If this metaslab is just becoming available, initialize its 1029 1732 bonwick * allocmaps and freemaps and add its capacity to the vdev. 1030 1732 bonwick */ 1031 1732 bonwick if (freed_map->sm_size == 0) { 1032 10921 Tim for (int t = 0; t < TXG_SIZE; t++) { 1033 1732 bonwick space_map_create(&msp->ms_allocmap[t], sm->sm_start, 1034 1732 bonwick sm->sm_size, sm->sm_shift, sm->sm_lock); 1035 1732 bonwick space_map_create(&msp->ms_freemap[t], sm->sm_start, 1036 1732 bonwick sm->sm_size, sm->sm_shift, sm->sm_lock); 1037 1732 bonwick } 1038 10921 Tim 1039 10921 Tim for (int t = 0; t < TXG_DEFER_SIZE; t++) 1040 10921 Tim space_map_create(&msp->ms_defermap[t], sm->sm_start, 1041 10921 Tim sm->sm_size, sm->sm_shift, sm->sm_lock); 1042 10921 Tim 1043 10922 Jeff vdev_space_update(vd, 0, 0, sm->sm_size); 1044 1732 bonwick } 1045 1732 bonwick 1046 10921 Tim alloc_delta = smosync->smo_alloc - smo->smo_alloc; 1047 10921 Tim defer_delta = freed_map->sm_space - defer_map->sm_space; 1048 10921 Tim 1049 10922 Jeff vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 1050 1732 bonwick 1051 1732 bonwick ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); 1052 1732 bonwick ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); 1053 1732 bonwick 1054 1732 bonwick /* 1055 1732 bonwick * If there's a space_map_load() in progress, wait for it to complete 1056 1732 bonwick * so that we have a consistent view of the in-core space map. 1057 10921 Tim * Then, add defer_map (oldest deferred frees) to this map and 1058 10921 Tim * transfer freed_map (this txg's frees) to defer_map. 1059 1732 bonwick */ 1060 1732 bonwick space_map_load_wait(sm); 1061 10921 Tim space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); 1062 10921 Tim space_map_vacate(freed_map, space_map_add, defer_map); 1063 1732 bonwick 1064 1732 bonwick *smo = *smosync; 1065 10921 Tim 1066 10921 Tim msp->ms_deferspace += defer_delta; 1067 10921 Tim ASSERT3S(msp->ms_deferspace, >=, 0); 1068 10921 Tim ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); 1069 10921 Tim if (msp->ms_deferspace != 0) { 1070 10921 Tim /* 1071 10921 Tim * Keep syncing this metaslab until all deferred frees 1072 10921 Tim * are back in circulation. 1073 10921 Tim */ 1074 10921 Tim vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1075 10921 Tim } 1076 1732 bonwick 1077 1732 bonwick /* 1078 1732 bonwick * If the map is loaded but no longer active, evict it as soon as all 1079 1732 bonwick * future allocations have synced. (If we unloaded it now and then 1080 1732 bonwick * loaded a moment later, the map wouldn't reflect those allocations.) 1081 1732 bonwick */ 1082 1775 billm if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1083 1732 bonwick int evictable = 1; 1084 1732 bonwick 1085 10921 Tim for (int t = 1; t < TXG_CONCURRENT_STATES; t++) 1086 1732 bonwick if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) 1087 1732 bonwick evictable = 0; 1088 1732 bonwick 1089 10922 Jeff if (evictable && !metaslab_debug) 1090 1732 bonwick space_map_unload(sm); 1091 1732 bonwick } 1092 1732 bonwick 1093 1732 bonwick metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1094 1732 bonwick 1095 1732 bonwick mutex_exit(&msp->ms_lock); 1096 789 ahrens } 1097 789 ahrens 1098 11146 George void 1099 11146 George metaslab_sync_reassess(metaslab_group_t *mg) 1100 11146 George { 1101 11146 George vdev_t *vd = mg->mg_vd; 1102 11146 George 1103 11146 George /* 1104 11146 George * Re-evaluate all metaslabs which have lower offsets than the 1105 11146 George * bonus area. 1106 11146 George */ 1107 11146 George for (int m = 0; m < vd->vdev_ms_count; m++) { 1108 11146 George metaslab_t *msp = vd->vdev_ms[m]; 1109 11146 George 1110 11146 George if (msp->ms_map.sm_start > mg->mg_bonus_area) 1111 11146 George break; 1112 11146 George 1113 11146 George mutex_enter(&msp->ms_lock); 1114 11146 George metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1115 11146 George mutex_exit(&msp->ms_lock); 1116 11146 George } 1117 11146 George 1118 11146 George /* 1119 11146 George * Prefetch the next potential metaslabs 1120 11146 George */ 1121 11146 George metaslab_prefetch(mg); 1122 11146 George } 1123 11146 George 1124 1775 billm static uint64_t 1125 1775 billm metaslab_distance(metaslab_t *msp, dva_t *dva) 1126 1775 billm { 1127 1775 billm uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 1128 1775 billm uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 1129 1775 billm uint64_t start = msp->ms_map.sm_start >> ms_shift; 1130 1775 billm 1131 1775 billm if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 1132 1775 billm return (1ULL << 63); 1133 1775 billm 1134 1775 billm if (offset < start) 1135 1775 billm return ((start - offset) << ms_shift); 1136 1775 billm if (offset > start) 1137 1775 billm return ((offset - start) << ms_shift); 1138 1775 billm return (0); 1139 1775 billm } 1140 1775 billm 1141 1775 billm static uint64_t 1142 1775 billm metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, 1143 1775 billm uint64_t min_distance, dva_t *dva, int d) 1144 789 ahrens { 1145 1732 bonwick metaslab_t *msp = NULL; 1146 1732 bonwick uint64_t offset = -1ULL; 1147 1775 billm avl_tree_t *t = &mg->mg_metaslab_tree; 1148 1775 billm uint64_t activation_weight; 1149 1775 billm uint64_t target_distance; 1150 1775 billm int i; 1151 1775 billm 1152 1775 billm activation_weight = METASLAB_WEIGHT_PRIMARY; 1153 9480 George for (i = 0; i < d; i++) { 1154 9480 George if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 1155 1775 billm activation_weight = METASLAB_WEIGHT_SECONDARY; 1156 9480 George break; 1157 9480 George } 1158 9480 George } 1159 789 ahrens 1160 1732 bonwick for (;;) { 1161 9480 George boolean_t was_active; 1162 9480 George 1163 1732 bonwick mutex_enter(&mg->mg_lock); 1164 1775 billm for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 1165 1775 billm if (msp->ms_weight < size) { 1166 1775 billm mutex_exit(&mg->mg_lock); 1167 1775 billm return (-1ULL); 1168 1775 billm } 1169 1775 billm 1170 9480 George was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 1171 1775 billm if (activation_weight == METASLAB_WEIGHT_PRIMARY) 1172 1775 billm break; 1173 1775 billm 1174 1775 billm target_distance = min_distance + 1175 1775 billm (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); 1176 1775 billm 1177 1775 billm for (i = 0; i < d; i++) 1178 1775 billm if (metaslab_distance(msp, &dva[i]) < 1179 1775 billm target_distance) 1180 1775 billm break; 1181 1775 billm if (i == d) 1182 1775 billm break; 1183 1732 bonwick } 1184 1732 bonwick mutex_exit(&mg->mg_lock); 1185 1775 billm if (msp == NULL) 1186 1775 billm return (-1ULL); 1187 789 ahrens 1188 1732 bonwick mutex_enter(&msp->ms_lock); 1189 789 ahrens 1190 3848 gw25295 /* 1191 3848 gw25295 * Ensure that the metaslab we have selected is still 1192 3848 gw25295 * capable of handling our request. It's possible that 1193 3848 gw25295 * another thread may have changed the weight while we 1194 3848 gw25295 * were blocked on the metaslab lock. 1195 3848 gw25295 */ 1196 9480 George if (msp->ms_weight < size || (was_active && 1197 9480 George !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 1198 9480 George activation_weight == METASLAB_WEIGHT_PRIMARY)) { 1199 3848 gw25295 mutex_exit(&msp->ms_lock); 1200 3848 gw25295 continue; 1201 3848 gw25295 } 1202 3848 gw25295 1203 1775 billm if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 1204 1775 billm activation_weight == METASLAB_WEIGHT_PRIMARY) { 1205 1775 billm metaslab_passivate(msp, 1206 2459 ahrens msp->ms_weight & ~METASLAB_ACTIVE_MASK); 1207 1775 billm mutex_exit(&msp->ms_lock); 1208 1775 billm continue; 1209 1775 billm } 1210 1775 billm 1211 9480 George if (metaslab_activate(msp, activation_weight, size) != 0) { 1212 789 ahrens mutex_exit(&msp->ms_lock); 1213 789 ahrens continue; 1214 789 ahrens } 1215 1732 bonwick 1216 1732 bonwick if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) 1217 1732 bonwick break; 1218 1732 bonwick 1219 11146 George metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); 1220 1732 bonwick 1221 789 ahrens mutex_exit(&msp->ms_lock); 1222 789 ahrens } 1223 789 ahrens 1224 1732 bonwick if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) 1225 1732 bonwick vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 1226 1732 bonwick 1227 1732 bonwick space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); 1228 1732 bonwick 1229 1732 bonwick mutex_exit(&msp->ms_lock); 1230 1732 bonwick 1231 1775 billm return (offset); 1232 789 ahrens } 1233 789 ahrens 1234 789 ahrens /* 1235 789 ahrens * Allocate a block for the specified i/o. 1236 789 ahrens */ 1237 1775 billm static int 1238 4527 perrin metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 1239 7754 Jeff dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 1240 789 ahrens { 1241 789 ahrens metaslab_group_t *mg, *rotor; 1242 789 ahrens vdev_t *vd; 1243 1775 billm int dshift = 3; 1244 1775 billm int all_zero; 1245 8241 Jeff int zio_lock = B_FALSE; 1246 8241 Jeff boolean_t allocatable; 1247 789 ahrens uint64_t offset = -1ULL; 1248 789 ahrens uint64_t asize; 1249 1775 billm uint64_t distance; 1250 1807 bonwick 1251 1807 bonwick ASSERT(!DVA_IS_VALID(&dva[d])); 1252 789 ahrens 1253 789 ahrens /* 1254 5530 bonwick * For testing, make some blocks above a certain size be gang blocks. 1255 5530 bonwick */ 1256 11066 rafael if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 1257 5530 bonwick return (ENOSPC); 1258 5530 bonwick 1259 5530 bonwick /* 1260 789 ahrens * Start at the rotor and loop through all mgs until we find something. 1261 10922 Jeff * Note that there's no locking on mc_rotor or mc_aliquot because 1262 789 ahrens * nothing actually breaks if we miss a few updates -- we just won't 1263 789 ahrens * allocate quite as evenly. It all balances out over time. 1264 1775 billm * 1265 3063 perrin * If we are doing ditto or log blocks, try to spread them across 1266 3063 perrin * consecutive vdevs. If we're forced to reuse a vdev before we've 1267 3063 perrin * allocated all of our ditto blocks, then try and spread them out on 1268 3063 perrin * that vdev as much as possible. If it turns out to not be possible, 1269 1775 billm * gradually lower our standards until anything becomes acceptable. 1270 1775 billm * Also, allocating on consecutive vdevs (as opposed to random vdevs) 1271 1775 billm * gives us hope of containing our fault domains to something we're 1272 1775 billm * able to reason about. Otherwise, any two top-level vdev failures 1273 1775 billm * will guarantee the loss of data. With consecutive allocation, 1274 1775 billm * only two adjacent top-level vdev failures will result in data loss. 1275 1775 billm * 1276 1775 billm * If we are doing gang blocks (hintdva is non-NULL), try to keep 1277 1775 billm * ourselves on the same vdev as our gang block header. That 1278 1775 billm * way, we can hope for locality in vdev_cache, plus it makes our 1279 1775 billm * fault domains something tractable. 1280 789 ahrens */ 1281 1775 billm if (hintdva) { 1282 1775 billm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 1283 10594 George 1284 10594 George /* 1285 10594 George * It's possible the vdev we're using as the hint no 1286 10594 George * longer exists (i.e. removed). Consult the rotor when 1287 10594 George * all else fails. 1288 10594 George */ 1289 10974 Jeff if (vd != NULL) { 1290 3063 perrin mg = vd->vdev_mg; 1291 10594 George 1292 10594 George if (flags & METASLAB_HINTBP_AVOID && 1293 10594 George mg->mg_next != NULL) 1294 10594 George mg = mg->mg_next; 1295 10594 George } else { 1296 10594 George mg = mc->mc_rotor; 1297 10594 George } 1298 1775 billm } else if (d != 0) { 1299 1775 billm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 1300 1775 billm mg = vd->vdev_mg->mg_next; 1301 1775 billm } else { 1302 1775 billm mg = mc->mc_rotor; 1303 1775 billm } 1304 4527 perrin 1305 4527 perrin /* 1306 10974 Jeff * If the hint put us into the wrong metaslab class, or into a 1307 10974 Jeff * metaslab group that has been passivated, just follow the rotor. 1308 4527 perrin */ 1309 10974 Jeff if (mg->mg_class != mc || mg->mg_activation_count <= 0) 1310 4527 perrin mg = mc->mc_rotor; 1311 4527 perrin 1312 1775 billm rotor = mg; 1313 1775 billm top: 1314 1775 billm all_zero = B_TRUE; 1315 789 ahrens do { 1316 10974 Jeff ASSERT(mg->mg_activation_count == 1); 1317 10974 Jeff 1318 789 ahrens vd = mg->mg_vd; 1319 8241 Jeff 1320 5329 gw25295 /* 1321 7754 Jeff * Don't allocate from faulted devices. 1322 5329 gw25295 */ 1323 8241 Jeff if (zio_lock) { 1324 8241 Jeff spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 1325 8241 Jeff allocatable = vdev_allocatable(vd); 1326 8241 Jeff spa_config_exit(spa, SCL_ZIO, FTAG); 1327 8241 Jeff } else { 1328 8241 Jeff allocatable = vdev_allocatable(vd); 1329 8241 Jeff } 1330 8241 Jeff if (!allocatable) 1331 5329 gw25295 goto next; 1332 8241 Jeff 1333 5329 gw25295 /* 1334 5329 gw25295 * Avoid writing single-copy data to a failing vdev 1335 5329 gw25295 */ 1336 5329 gw25295 if ((vd->vdev_stat.vs_write_errors > 0 || 1337 5329 gw25295 vd->vdev_state < VDEV_STATE_HEALTHY) && 1338 5329 gw25295 d == 0 && dshift == 3) { 1339 5329 gw25295 all_zero = B_FALSE; 1340 5329 gw25295 goto next; 1341 5329 gw25295 } 1342 4527 perrin 1343 4527 perrin ASSERT(mg->mg_class == mc); 1344 1775 billm 1345 1775 billm distance = vd->vdev_asize >> dshift; 1346 1775 billm if (distance <= (1ULL << vd->vdev_ms_shift)) 1347 1775 billm distance = 0; 1348 1775 billm else 1349 1775 billm all_zero = B_FALSE; 1350 1775 billm 1351 789 ahrens asize = vdev_psize_to_asize(vd, psize); 1352 789 ahrens ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 1353 789 ahrens 1354 1775 billm offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); 1355 1775 billm if (offset != -1ULL) { 1356 789 ahrens /* 1357 789 ahrens * If we've just selected this metaslab group, 1358 789 ahrens * figure out whether the corresponding vdev is 1359 789 ahrens * over- or under-used relative to the pool, 1360 789 ahrens * and set an allocation bias to even it out. 1361 789 ahrens */ 1362 10922 Jeff if (mc->mc_aliquot == 0) { 1363 789 ahrens vdev_stat_t *vs = &vd->vdev_stat; 1364 10922 Jeff int64_t vu, cu; 1365 789 ahrens 1366 789 ahrens /* 1367 789 ahrens * Determine percent used in units of 0..1024. 1368 789 ahrens * (This is just to avoid floating point.) 1369 789 ahrens */ 1370 789 ahrens vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); 1371 10922 Jeff cu = (mc->mc_alloc << 10) / (mc->mc_space + 1); 1372 789 ahrens 1373 789 ahrens /* 1374 789 ahrens * Bias by at most +/- 25% of the aliquot. 1375 789 ahrens */ 1376 10922 Jeff mg->mg_bias = ((cu - vu) * 1377 789 ahrens (int64_t)mg->mg_aliquot) / (1024 * 4); 1378 789 ahrens } 1379 789 ahrens 1380 10922 Jeff if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 1381 789 ahrens mg->mg_aliquot + mg->mg_bias) { 1382 789 ahrens mc->mc_rotor = mg->mg_next; 1383 10922 Jeff mc->mc_aliquot = 0; 1384 789 ahrens } 1385 789 ahrens 1386 1775 billm DVA_SET_VDEV(&dva[d], vd->vdev_id); 1387 1775 billm DVA_SET_OFFSET(&dva[d], offset); 1388 7754 Jeff DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 1389 1775 billm DVA_SET_ASIZE(&dva[d], asize); 1390 789 ahrens 1391 789 ahrens return (0); 1392 789 ahrens } 1393 5329 gw25295 next: 1394 789 ahrens mc->mc_rotor = mg->mg_next; 1395 10922 Jeff mc->mc_aliquot = 0; 1396 789 ahrens } while ((mg = mg->mg_next) != rotor); 1397 789 ahrens 1398 1775 billm if (!all_zero) { 1399 1775 billm dshift++; 1400 1775 billm ASSERT(dshift < 64); 1401 8241 Jeff goto top; 1402 8241 Jeff } 1403 8241 Jeff 1404 9480 George if (!allocatable && !zio_lock) { 1405 8241 Jeff dshift = 3; 1406 8241 Jeff zio_lock = B_TRUE; 1407 1775 billm goto top; 1408 1775 billm } 1409 1775 billm 1410 1775 billm bzero(&dva[d], sizeof (dva_t)); 1411 789 ahrens 1412 789 ahrens return (ENOSPC); 1413 1775 billm } 1414 1775 billm 1415 789 ahrens /* 1416 789 ahrens * Free the block represented by DVA in the context of the specified 1417 789 ahrens * transaction group. 1418 789 ahrens */ 1419 1807 bonwick static void 1420 1807 bonwick metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 1421 789 ahrens { 1422 789 ahrens uint64_t vdev = DVA_GET_VDEV(dva); 1423 789 ahrens uint64_t offset = DVA_GET_OFFSET(dva); 1424 789 ahrens uint64_t size = DVA_GET_ASIZE(dva); 1425 789 ahrens vdev_t *vd; 1426 789 ahrens metaslab_t *msp; 1427 789 ahrens 1428 1807 bonwick ASSERT(DVA_IS_VALID(dva)); 1429 1807 bonwick 1430 789 ahrens if (txg > spa_freeze_txg(spa)) 1431 789 ahrens return; 1432 789 ahrens 1433 1807 bonwick if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 1434 1807 bonwick (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 1435 1807 bonwick cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 1436 1807 bonwick (u_longlong_t)vdev, (u_longlong_t)offset); 1437 789 ahrens ASSERT(0); 1438 789 ahrens return; 1439 789 ahrens } 1440 789 ahrens 1441 789 ahrens msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 1442 789 ahrens 1443 789 ahrens if (DVA_GET_GANG(dva)) 1444 789 ahrens size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 1445 789 ahrens 1446 789 ahrens mutex_enter(&msp->ms_lock); 1447 789 ahrens 1448 1732 bonwick if (now) { 1449 1732 bonwick space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], 1450 1732 bonwick offset, size); 1451 1732 bonwick space_map_free(&msp->ms_map, offset, size); 1452 1732 bonwick } else { 1453 1732 bonwick if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) 1454 1732 bonwick vdev_dirty(vd, VDD_METASLAB, msp, txg); 1455 1732 bonwick space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); 1456 789 ahrens } 1457 789 ahrens 1458 789 ahrens mutex_exit(&msp->ms_lock); 1459 789 ahrens } 1460 1807 bonwick 1461 1807 bonwick /* 1462 1807 bonwick * Intent log support: upon opening the pool after a crash, notify the SPA 1463 1807 bonwick * of blocks that the intent log has allocated for immediate write, but 1464 1807 bonwick * which are still considered free by the SPA because the last transaction 1465 1807 bonwick * group didn't commit yet. 1466 1807 bonwick */ 1467 1807 bonwick static int 1468 1807 bonwick metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 1469 1807 bonwick { 1470 1807 bonwick uint64_t vdev = DVA_GET_VDEV(dva); 1471 1807 bonwick uint64_t offset = DVA_GET_OFFSET(dva); 1472 1807 bonwick uint64_t size = DVA_GET_ASIZE(dva); 1473 1807 bonwick vdev_t *vd; 1474 1807 bonwick metaslab_t *msp; 1475 10922 Jeff int error = 0; 1476 1807 bonwick 1477 1807 bonwick ASSERT(DVA_IS_VALID(dva)); 1478 1807 bonwick 1479 1807 bonwick if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 1480 1807 bonwick (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 1481 1807 bonwick return (ENXIO); 1482 1807 bonwick 1483 1807 bonwick msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 1484 1807 bonwick 1485 1807 bonwick if (DVA_GET_GANG(dva)) 1486 1807 bonwick size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 1487 1807 bonwick 1488 1807 bonwick mutex_enter(&msp->ms_lock); 1489 1807 bonwick 1490 10922 Jeff if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) 1491 10922 Jeff error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); 1492 10922 Jeff 1493 10922 Jeff if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) 1494 10922 Jeff error = ENOENT; 1495 10922 Jeff 1496 7754 Jeff if (error || txg == 0) { /* txg == 0 indicates dry run */ 1497 1807 bonwick mutex_exit(&msp->ms_lock); 1498 1807 bonwick return (error); 1499 1807 bonwick } 1500 1807 bonwick 1501 7754 Jeff space_map_claim(&msp->ms_map, offset, size); 1502 1807 bonwick 1503 8241 Jeff if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 1504 7754 Jeff if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) 1505 7754 Jeff vdev_dirty(vd, VDD_METASLAB, msp, txg); 1506 7754 Jeff space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); 1507 7754 Jeff } 1508 1807 bonwick 1509 1807 bonwick mutex_exit(&msp->ms_lock); 1510 1807 bonwick 1511 1807 bonwick return (0); 1512 1807 bonwick } 1513 1807 bonwick 1514 1807 bonwick int 1515 4527 perrin metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 1516 7754 Jeff int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 1517 1807 bonwick { 1518 1807 bonwick dva_t *dva = bp->blk_dva; 1519 1807 bonwick dva_t *hintdva = hintbp->blk_dva; 1520 1807 bonwick int error = 0; 1521 1807 bonwick 1522 7754 Jeff ASSERT(bp->blk_birth == 0); 1523 10922 Jeff ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 1524 7754 Jeff 1525 7754 Jeff spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 1526 7754 Jeff 1527 7754 Jeff if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 1528 7754 Jeff spa_config_exit(spa, SCL_ALLOC, FTAG); 1529 4527 perrin return (ENOSPC); 1530 7754 Jeff } 1531 4527 perrin 1532 1807 bonwick ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 1533 1807 bonwick ASSERT(BP_GET_NDVAS(bp) == 0); 1534 1807 bonwick ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 1535 1807 bonwick 1536 7754 Jeff for (int d = 0; d < ndvas; d++) { 1537 4527 perrin error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 1538 7754 Jeff txg, flags); 1539 1807 bonwick if (error) { 1540 1807 bonwick for (d--; d >= 0; d--) { 1541 1807 bonwick metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 1542 1807 bonwick bzero(&dva[d], sizeof (dva_t)); 1543 1807 bonwick } 1544 7754 Jeff spa_config_exit(spa, SCL_ALLOC, FTAG); 1545 1807 bonwick return (error); 1546 1807 bonwick } 1547 1807 bonwick } 1548 1807 bonwick ASSERT(error == 0); 1549 1807 bonwick ASSERT(BP_GET_NDVAS(bp) == ndvas); 1550 7754 Jeff 1551 7754 Jeff spa_config_exit(spa, SCL_ALLOC, FTAG); 1552 7754 Jeff 1553 10922 Jeff BP_SET_BIRTH(bp, txg, txg); 1554 1807 bonwick 1555 1807 bonwick return (0); 1556 1807 bonwick } 1557 1807 bonwick 1558 1807 bonwick void 1559 1807 bonwick metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 1560 1807 bonwick { 1561 1807 bonwick const dva_t *dva = bp->blk_dva; 1562 1807 bonwick int ndvas = BP_GET_NDVAS(bp); 1563 1807 bonwick 1564 1807 bonwick ASSERT(!BP_IS_HOLE(bp)); 1565 10922 Jeff ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 1566 1807 bonwick 1567 7754 Jeff spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 1568 7754 Jeff 1569 7754 Jeff for (int d = 0; d < ndvas; d++) 1570 1807 bonwick metaslab_free_dva(spa, &dva[d], txg, now); 1571 7754 Jeff 1572 7754 Jeff spa_config_exit(spa, SCL_FREE, FTAG); 1573 1807 bonwick } 1574 1807 bonwick 1575 1807 bonwick int 1576 1807 bonwick metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 1577 1807 bonwick { 1578 1807 bonwick const dva_t *dva = bp->blk_dva; 1579 1807 bonwick int ndvas = BP_GET_NDVAS(bp); 1580 7754 Jeff int error = 0; 1581 1807 bonwick 1582 1807 bonwick ASSERT(!BP_IS_HOLE(bp)); 1583 1807 bonwick 1584 7754 Jeff if (txg != 0) { 1585 7754 Jeff /* 1586 7754 Jeff * First do a dry run to make sure all DVAs are claimable, 1587 7754 Jeff * so we don't have to unwind from partial failures below. 1588 7754 Jeff */ 1589 7754 Jeff if ((error = metaslab_claim(spa, bp, 0)) != 0) 1590 7754 Jeff return (error); 1591 7754 Jeff } 1592 7754 Jeff 1593 7754 Jeff spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 1594 7754 Jeff 1595 7754 Jeff for (int d = 0; d < ndvas; d++) 1596 1807 bonwick if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 1597 7754 Jeff break; 1598 1807 bonwick 1599 7754 Jeff spa_config_exit(spa, SCL_ALLOC, FTAG); 1600 7754 Jeff 1601 7754 Jeff ASSERT(error == 0 || txg == 0); 1602 7754 Jeff 1603 7754 Jeff return (error); 1604 1807 bonwick } 1605