1 789 ahrens /* 2 789 ahrens * CDDL HEADER START 3 789 ahrens * 4 789 ahrens * The contents of this file are subject to the terms of the 5 1544 eschrock * Common Development and Distribution License (the "License"). 6 1544 eschrock * You may not use this file except in compliance with the License. 7 789 ahrens * 8 789 ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 789 ahrens * or http://www.opensolaris.org/os/licensing. 10 789 ahrens * See the License for the specific language governing permissions 11 789 ahrens * and limitations under the License. 12 789 ahrens * 13 789 ahrens * When distributing Covered Code, include this CDDL HEADER in each 14 789 ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 789 ahrens * If applicable, add the following below this CDDL HEADER, with the 16 789 ahrens * fields enclosed by brackets "[]" replaced with your own identifying 17 789 ahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18 789 ahrens * 19 789 ahrens * CDDL HEADER END 20 789 ahrens */ 21 789 ahrens /* 22 9008 Lin * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 789 ahrens * Use is subject to license terms. 24 789 ahrens */ 25 789 ahrens 26 789 ahrens #include <sys/dsl_pool.h> 27 789 ahrens #include <sys/dsl_dataset.h> 28 789 ahrens #include <sys/dsl_dir.h> 29 2199 ahrens #include <sys/dsl_synctask.h> 30 789 ahrens #include <sys/dmu_tx.h> 31 789 ahrens #include <sys/dmu_objset.h> 32 789 ahrens #include <sys/arc.h> 33 789 ahrens #include <sys/zap.h> 34 3547 maybee #include <sys/zio.h> 35 789 ahrens #include <sys/zfs_context.h> 36 789 ahrens #include <sys/fs/zfs.h> 37 7046 ahrens #include <sys/zfs_znode.h> 38 7046 ahrens #include <sys/spa_impl.h> 39 6245 maybee 40 6245 maybee int zfs_no_write_throttle = 0; 41 7468 Mark int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ 42 11182 Lin int zfs_txg_synctime_ms = 5000; /* target millisecs to sync a txg */ 43 7468 Mark 44 7468 Mark uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ 45 7468 Mark uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ 46 7468 Mark uint64_t zfs_write_limit_inflated = 0; 47 6245 maybee uint64_t zfs_write_limit_override = 0; 48 789 ahrens 49 7468 Mark kmutex_t zfs_write_limit_lock; 50 7468 Mark 51 7468 Mark static pgcnt_t old_physmem = 0; 52 7046 ahrens 53 1544 eschrock static int 54 7046 ahrens dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) 55 789 ahrens { 56 789 ahrens uint64_t obj; 57 789 ahrens int err; 58 789 ahrens 59 789 ahrens err = zap_lookup(dp->dp_meta_objset, 60 789 ahrens dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, 61 7046 ahrens name, sizeof (obj), 1, &obj); 62 1544 eschrock if (err) 63 1544 eschrock return (err); 64 789 ahrens 65 7046 ahrens return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); 66 789 ahrens } 67 789 ahrens 68 789 ahrens static dsl_pool_t * 69 789 ahrens dsl_pool_open_impl(spa_t *spa, uint64_t txg) 70 789 ahrens { 71 789 ahrens dsl_pool_t *dp; 72 789 ahrens blkptr_t *bp = spa_get_rootblkptr(spa); 73 789 ahrens 74 789 ahrens dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); 75 789 ahrens dp->dp_spa = spa; 76 789 ahrens dp->dp_meta_rootbp = *bp; 77 2856 nd150628 rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); 78 6245 maybee dp->dp_write_limit = zfs_write_limit_min; 79 789 ahrens txg_init(dp, txg); 80 789 ahrens 81 789 ahrens txg_list_create(&dp->dp_dirty_datasets, 82 789 ahrens offsetof(dsl_dataset_t, ds_dirty_link)); 83 789 ahrens txg_list_create(&dp->dp_dirty_dirs, 84 789 ahrens offsetof(dsl_dir_t, dd_dirty_link)); 85 2199 ahrens txg_list_create(&dp->dp_sync_tasks, 86 2199 ahrens offsetof(dsl_sync_task_group_t, dstg_node)); 87 5367 ahrens list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t), 88 789 ahrens offsetof(dsl_dataset_t, ds_synced_link)); 89 6245 maybee 90 6245 maybee mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); 91 7046 ahrens mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); 92 9321 Neil 93 9321 Neil dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, 94 9321 Neil 1, 4, 0); 95 789 ahrens 96 789 ahrens return (dp); 97 789 ahrens } 98 789 ahrens 99 1544 eschrock int 100 1544 eschrock dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) 101 789 ahrens { 102 789 ahrens int err; 103 789 ahrens dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); 104 7046 ahrens dsl_dir_t *dd; 105 7046 ahrens dsl_dataset_t *ds; 106 789 ahrens 107 7046 ahrens rw_enter(&dp->dp_config_rwlock, RW_WRITER); 108 10298 Matthew err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, 109 10298 Matthew &dp->dp_meta_objset); 110 1544 eschrock if (err) 111 1544 eschrock goto out; 112 1544 eschrock 113 789 ahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 114 789 ahrens DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, 115 789 ahrens &dp->dp_root_dir_obj); 116 1544 eschrock if (err) 117 1544 eschrock goto out; 118 789 ahrens 119 1544 eschrock err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, 120 1544 eschrock NULL, dp, &dp->dp_root_dir); 121 1544 eschrock if (err) 122 1544 eschrock goto out; 123 1544 eschrock 124 7046 ahrens err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); 125 1544 eschrock if (err) 126 1544 eschrock goto out; 127 7046 ahrens 128 7046 ahrens if (spa_version(spa) >= SPA_VERSION_ORIGIN) { 129 7046 ahrens err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); 130 7046 ahrens if (err) 131 7046 ahrens goto out; 132 7046 ahrens err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, 133 7046 ahrens FTAG, &ds); 134 9008 Lin if (err == 0) { 135 9008 Lin err = dsl_dataset_hold_obj(dp, 136 9008 Lin ds->ds_phys->ds_prev_snap_obj, dp, 137 9008 Lin &dp->dp_origin_snap); 138 9008 Lin dsl_dataset_rele(ds, FTAG); 139 9008 Lin } 140 9008 Lin dsl_dir_close(dd, dp); 141 7046 ahrens if (err) 142 7046 ahrens goto out; 143 7046 ahrens } 144 10342 chris 145 10342 chris err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 146 10342 chris DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, 147 10342 chris &dp->dp_tmp_userrefs_obj); 148 10342 chris if (err == ENOENT) 149 10342 chris err = 0; 150 10342 chris if (err) 151 10342 chris goto out; 152 7046 ahrens 153 7046 ahrens /* get scrub status */ 154 7046 ahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 155 7046 ahrens DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, 156 7046 ahrens &dp->dp_scrub_func); 157 7046 ahrens if (err == 0) { 158 7046 ahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 159 7046 ahrens DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, 160 7046 ahrens &dp->dp_scrub_queue_obj); 161 7046 ahrens if (err) 162 7046 ahrens goto out; 163 7046 ahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 164 7046 ahrens DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, 165 7046 ahrens &dp->dp_scrub_min_txg); 166 7046 ahrens if (err) 167 7046 ahrens goto out; 168 7046 ahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 169 7046 ahrens DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, 170 7046 ahrens &dp->dp_scrub_max_txg); 171 7046 ahrens if (err) 172 7046 ahrens goto out; 173 7046 ahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 174 11125 Jeff DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 175 11125 Jeff sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), 176 7046 ahrens &dp->dp_scrub_bookmark); 177 7046 ahrens if (err) 178 11125 Jeff goto out; 179 11125 Jeff err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 180 11125 Jeff DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), 181 11125 Jeff sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), 182 11125 Jeff &dp->dp_scrub_ddt_bookmark); 183 11125 Jeff if (err && err != ENOENT) 184 11125 Jeff goto out; 185 11125 Jeff err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 186 11125 Jeff DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, 187 11125 Jeff &dp->dp_scrub_ddt_class_max); 188 11125 Jeff if (err && err != ENOENT) 189 7046 ahrens goto out; 190 7046 ahrens err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 191 7046 ahrens DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, 192 7046 ahrens &spa->spa_scrub_errors); 193 7046 ahrens if (err) 194 7046 ahrens goto out; 195 7046 ahrens if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { 196 7046 ahrens /* 197 7046 ahrens * A new-type scrub was in progress on an old 198 7046 ahrens * pool. Restart from the beginning, since the 199 7046 ahrens * old software may have changed the pool in the 200 7046 ahrens * meantime. 201 7046 ahrens */ 202 7046 ahrens dsl_pool_scrub_restart(dp); 203 7046 ahrens } 204 7046 ahrens } else { 205 7046 ahrens /* 206 7046 ahrens * It's OK if there is no scrub in progress (and if 207 7046 ahrens * there was an I/O error, ignore it). 208 7046 ahrens */ 209 7046 ahrens err = 0; 210 7046 ahrens } 211 1544 eschrock 212 1544 eschrock out: 213 789 ahrens rw_exit(&dp->dp_config_rwlock); 214 1544 eschrock if (err) 215 1544 eschrock dsl_pool_close(dp); 216 1544 eschrock else 217 1544 eschrock *dpp = dp; 218 789 ahrens 219 1544 eschrock return (err); 220 789 ahrens } 221 789 ahrens 222 789 ahrens void 223 789 ahrens dsl_pool_close(dsl_pool_t *dp) 224 789 ahrens { 225 7046 ahrens /* drop our references from dsl_pool_open() */ 226 7046 ahrens 227 7046 ahrens /* 228 7046 ahrens * Since we held the origin_snap from "syncing" context (which 229 7046 ahrens * includes pool-opening context), it actually only got a "ref" 230 7046 ahrens * and not a hold, so just drop that here. 231 7046 ahrens */ 232 7046 ahrens if (dp->dp_origin_snap) 233 7046 ahrens dsl_dataset_drop_ref(dp->dp_origin_snap, dp); 234 1544 eschrock if (dp->dp_mos_dir) 235 1544 eschrock dsl_dir_close(dp->dp_mos_dir, dp); 236 1544 eschrock if (dp->dp_root_dir) 237 1544 eschrock dsl_dir_close(dp->dp_root_dir, dp); 238 789 ahrens 239 789 ahrens /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ 240 1544 eschrock if (dp->dp_meta_objset) 241 10298 Matthew dmu_objset_evict(dp->dp_meta_objset); 242 789 ahrens 243 789 ahrens txg_list_destroy(&dp->dp_dirty_datasets); 244 789 ahrens txg_list_destroy(&dp->dp_dirty_dirs); 245 5367 ahrens list_destroy(&dp->dp_synced_datasets); 246 789 ahrens 247 5642 maybee arc_flush(dp->dp_spa); 248 789 ahrens txg_fini(dp); 249 2856 nd150628 rw_destroy(&dp->dp_config_rwlock); 250 6245 maybee mutex_destroy(&dp->dp_lock); 251 7046 ahrens mutex_destroy(&dp->dp_scrub_cancel_lock); 252 9321 Neil taskq_destroy(dp->dp_vnrele_taskq); 253 7837 Matthew if (dp->dp_blkstats) 254 7837 Matthew kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); 255 789 ahrens kmem_free(dp, sizeof (dsl_pool_t)); 256 789 ahrens } 257 789 ahrens 258 789 ahrens dsl_pool_t * 259 7184 timh dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) 260 789 ahrens { 261 789 ahrens int err; 262 789 ahrens dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); 263 789 ahrens dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 264 10298 Matthew objset_t *os; 265 7046 ahrens dsl_dataset_t *ds; 266 7046 ahrens uint64_t dsobj; 267 7046 ahrens 268 7046 ahrens /* create and open the MOS (meta-objset) */ 269 10298 Matthew dp->dp_meta_objset = dmu_objset_create_impl(spa, 270 10298 Matthew NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); 271 789 ahrens 272 789 ahrens /* create the pool directory */ 273 789 ahrens err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 274 789 ahrens DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); 275 789 ahrens ASSERT3U(err, ==, 0); 276 789 ahrens 277 789 ahrens /* create and open the root dir */ 278 7046 ahrens dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); 279 1544 eschrock VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, 280 1544 eschrock NULL, dp, &dp->dp_root_dir)); 281 789 ahrens 282 789 ahrens /* create and open the meta-objset dir */ 283 7046 ahrens (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); 284 7046 ahrens VERIFY(0 == dsl_pool_open_special_dir(dp, 285 7046 ahrens MOS_DIR_NAME, &dp->dp_mos_dir)); 286 7046 ahrens 287 7046 ahrens if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) 288 7046 ahrens dsl_pool_create_origin(dp, tx); 289 7046 ahrens 290 7046 ahrens /* create the root dataset */ 291 7046 ahrens dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); 292 7046 ahrens 293 7046 ahrens /* create the root objset */ 294 7046 ahrens VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 295 10298 Matthew os = dmu_objset_create_impl(dp->dp_spa, ds, 296 7046 ahrens dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); 297 7046 ahrens #ifdef _KERNEL 298 10298 Matthew zfs_create_fs(os, kcred, zplprops, tx); 299 7046 ahrens #endif 300 7046 ahrens dsl_dataset_rele(ds, FTAG); 301 789 ahrens 302 789 ahrens dmu_tx_commit(tx); 303 789 ahrens 304 789 ahrens return (dp); 305 789 ahrens } 306 789 ahrens 307 789 ahrens void 308 789 ahrens dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) 309 789 ahrens { 310 3547 maybee zio_t *zio; 311 789 ahrens dmu_tx_t *tx; 312 3547 maybee dsl_dir_t *dd; 313 3547 maybee dsl_dataset_t *ds; 314 3547 maybee dsl_sync_task_group_t *dstg; 315 10298 Matthew objset_t *mos = dp->dp_meta_objset; 316 7468 Mark hrtime_t start, write_time; 317 7468 Mark uint64_t data_written; 318 3547 maybee int err; 319 789 ahrens 320 789 ahrens tx = dmu_tx_create_assigned(dp, txg); 321 789 ahrens 322 7468 Mark dp->dp_read_overhead = 0; 323 9366 Mark start = gethrtime(); 324 9396 Matthew 325 3547 maybee zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 326 3547 maybee while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { 327 9396 Matthew /* 328 9396 Matthew * We must not sync any non-MOS datasets twice, because 329 9396 Matthew * we may have taken a snapshot of them. However, we 330 9396 Matthew * may sync newly-created datasets on pass 2. 331 9396 Matthew */ 332 9396 Matthew ASSERT(!list_link_active(&ds->ds_synced_link)); 333 9396 Matthew list_insert_tail(&dp->dp_synced_datasets, ds); 334 3547 maybee dsl_dataset_sync(ds, zio, tx); 335 3547 maybee } 336 7468 Mark DTRACE_PROBE(pool_sync__1setup); 337 9396 Matthew err = zio_wait(zio); 338 7468 Mark 339 7468 Mark write_time = gethrtime() - start; 340 3547 maybee ASSERT(err == 0); 341 7468 Mark DTRACE_PROBE(pool_sync__2rootzio); 342 789 ahrens 343 9396 Matthew for (ds = list_head(&dp->dp_synced_datasets); ds; 344 9396 Matthew ds = list_next(&dp->dp_synced_datasets, ds)) 345 10298 Matthew dmu_objset_do_userquota_callbacks(ds->ds_objset, tx); 346 9396 Matthew 347 9396 Matthew /* 348 9396 Matthew * Sync the datasets again to push out the changes due to 349 9396 Matthew * userquota updates. This must be done before we process the 350 9396 Matthew * sync tasks, because that could cause a snapshot of a dataset 351 9396 Matthew * whose ds_bp will be rewritten when we do this 2nd sync. 352 9396 Matthew */ 353 9396 Matthew zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 354 9396 Matthew while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { 355 9396 Matthew ASSERT(list_link_active(&ds->ds_synced_link)); 356 9396 Matthew dmu_buf_rele(ds->ds_dbuf, ds); 357 9396 Matthew dsl_dataset_sync(ds, zio, tx); 358 9396 Matthew } 359 9396 Matthew err = zio_wait(zio); 360 9396 Matthew 361 10922 Jeff /* 362 10922 Jeff * If anything was added to a deadlist during a zio done callback, 363 10922 Jeff * it had to be put on the deferred queue. Enqueue it for real now. 364 10922 Jeff */ 365 10922 Jeff for (ds = list_head(&dp->dp_synced_datasets); ds; 366 10922 Jeff ds = list_next(&dp->dp_synced_datasets, ds)) 367 10922 Jeff bplist_sync(&ds->ds_deadlist, 368 10922 Jeff bplist_enqueue_cb, &ds->ds_deadlist, tx); 369 10922 Jeff 370 9396 Matthew while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) { 371 9396 Matthew /* 372 9396 Matthew * No more sync tasks should have been added while we 373 9396 Matthew * were syncing. 374 9396 Matthew */ 375 9396 Matthew ASSERT(spa_sync_pass(dp->dp_spa) == 1); 376 3547 maybee dsl_sync_task_group_sync(dstg, tx); 377 9396 Matthew } 378 7468 Mark DTRACE_PROBE(pool_sync__3task); 379 7468 Mark 380 7468 Mark start = gethrtime(); 381 3547 maybee while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) 382 3547 maybee dsl_dir_sync(dd, tx); 383 7468 Mark write_time += gethrtime() - start; 384 7046 ahrens 385 11147 George if (spa_sync_pass(dp->dp_spa) == 1) { 386 11147 George dp->dp_scrub_prefetch_zio_root = zio_root(dp->dp_spa, NULL, 387 11147 George NULL, ZIO_FLAG_CANFAIL); 388 7046 ahrens dsl_pool_scrub_sync(dp, tx); 389 11147 George (void) zio_wait(dp->dp_scrub_prefetch_zio_root); 390 11147 George } 391 789 ahrens 392 7468 Mark start = gethrtime(); 393 10298 Matthew if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || 394 10298 Matthew list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { 395 3547 maybee zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 396 10298 Matthew dmu_objset_sync(mos, zio, tx); 397 3547 maybee err = zio_wait(zio); 398 3547 maybee ASSERT(err == 0); 399 789 ahrens dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); 400 789 ahrens spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); 401 789 ahrens } 402 7468 Mark write_time += gethrtime() - start; 403 7468 Mark DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, 404 7468 Mark hrtime_t, dp->dp_read_overhead); 405 7468 Mark write_time -= dp->dp_read_overhead; 406 789 ahrens 407 789 ahrens dmu_tx_commit(tx); 408 7468 Mark 409 7468 Mark data_written = dp->dp_space_towrite[txg & TXG_MASK]; 410 7468 Mark dp->dp_space_towrite[txg & TXG_MASK] = 0; 411 7468 Mark ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); 412 7468 Mark 413 7468 Mark /* 414 7468 Mark * If the write limit max has not been explicitly set, set it 415 7468 Mark * to a fraction of available physical memory (default 1/8th). 416 7468 Mark * Note that we must inflate the limit because the spa 417 7468 Mark * inflates write sizes to account for data replication. 418 7468 Mark * Check this each sync phase to catch changing memory size. 419 7468 Mark */ 420 7468 Mark if (physmem != old_physmem && zfs_write_limit_shift) { 421 7468 Mark mutex_enter(&zfs_write_limit_lock); 422 7468 Mark old_physmem = physmem; 423 7468 Mark zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 424 7468 Mark zfs_write_limit_inflated = MAX(zfs_write_limit_min, 425 7468 Mark spa_get_asize(dp->dp_spa, zfs_write_limit_max)); 426 7468 Mark mutex_exit(&zfs_write_limit_lock); 427 7468 Mark } 428 7468 Mark 429 7468 Mark /* 430 7468 Mark * Attempt to keep the sync time consistent by adjusting the 431 7468 Mark * amount of write traffic allowed into each transaction group. 432 7468 Mark * Weight the throughput calculation towards the current value: 433 7468 Mark * thru = 3/4 old_thru + 1/4 new_thru 434 7468 Mark */ 435 7468 Mark ASSERT(zfs_write_limit_min > 0); 436 7468 Mark if (data_written > zfs_write_limit_min / 8 && write_time > 0) { 437 7468 Mark uint64_t throughput = (data_written * NANOSEC) / write_time; 438 7468 Mark if (dp->dp_throughput) 439 7468 Mark dp->dp_throughput = throughput / 4 + 440 7468 Mark 3 * dp->dp_throughput / 4; 441 7468 Mark else 442 7468 Mark dp->dp_throughput = throughput; 443 7468 Mark dp->dp_write_limit = MIN(zfs_write_limit_inflated, 444 7468 Mark MAX(zfs_write_limit_min, 445 11182 Lin dp->dp_throughput * zfs_txg_synctime_ms / MILLISEC)); 446 7468 Mark } 447 789 ahrens } 448 789 ahrens 449 789 ahrens void 450 10922 Jeff dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) 451 789 ahrens { 452 789 ahrens dsl_dataset_t *ds; 453 10922 Jeff objset_t *os; 454 789 ahrens 455 5367 ahrens while (ds = list_head(&dp->dp_synced_datasets)) { 456 5367 ahrens list_remove(&dp->dp_synced_datasets, ds); 457 10922 Jeff os = ds->ds_objset; 458 10922 Jeff zil_clean(os->os_zil); 459 10922 Jeff ASSERT(!dmu_objset_is_dirty(os, txg)); 460 3897 maybee dmu_buf_rele(ds->ds_dbuf, ds); 461 789 ahrens } 462 10922 Jeff ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); 463 789 ahrens } 464 789 ahrens 465 3547 maybee /* 466 3547 maybee * TRUE if the current thread is the tx_sync_thread or if we 467 3547 maybee * are being called from SPA context during pool initialization. 468 3547 maybee */ 469 789 ahrens int 470 789 ahrens dsl_pool_sync_context(dsl_pool_t *dp) 471 789 ahrens { 472 789 ahrens return (curthread == dp->dp_tx.tx_sync_thread || 473 3547 maybee spa_get_dsl(dp->dp_spa) == NULL); 474 789 ahrens } 475 789 ahrens 476 789 ahrens uint64_t 477 789 ahrens dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) 478 789 ahrens { 479 789 ahrens uint64_t space, resv; 480 789 ahrens 481 789 ahrens /* 482 1775 billm * Reserve about 1.6% (1/64), or at least 32MB, for allocation 483 789 ahrens * efficiency. 484 789 ahrens * XXX The intent log is not accounted for, so it must fit 485 789 ahrens * within this slop. 486 789 ahrens * 487 789 ahrens * If we're trying to assess whether it's OK to do a free, 488 789 ahrens * cut the reservation in half to allow forward progress 489 789 ahrens * (e.g. make it possible to rm(1) files from a full pool). 490 789 ahrens */ 491 10956 George space = spa_get_dspace(dp->dp_spa); 492 1775 billm resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); 493 789 ahrens if (netfree) 494 789 ahrens resv >>= 1; 495 789 ahrens 496 789 ahrens return (space - resv); 497 789 ahrens } 498 6245 maybee 499 6245 maybee int 500 6245 maybee dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) 501 6245 maybee { 502 6245 maybee uint64_t reserved = 0; 503 6245 maybee uint64_t write_limit = (zfs_write_limit_override ? 504 6245 maybee zfs_write_limit_override : dp->dp_write_limit); 505 6245 maybee 506 6245 maybee if (zfs_no_write_throttle) { 507 6643 eschrock atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], 508 6643 eschrock space); 509 6245 maybee return (0); 510 6245 maybee } 511 6245 maybee 512 6245 maybee /* 513 6245 maybee * Check to see if we have exceeded the maximum allowed IO for 514 6245 maybee * this transaction group. We can do this without locks since 515 6245 maybee * a little slop here is ok. Note that we do the reserved check 516 6245 maybee * with only half the requested reserve: this is because the 517 6245 maybee * reserve requests are worst-case, and we really don't want to 518 6245 maybee * throttle based off of worst-case estimates. 519 6245 maybee */ 520 6245 maybee if (write_limit > 0) { 521 6245 maybee reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] 522 6245 maybee + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; 523 6245 maybee 524 6245 maybee if (reserved && reserved > write_limit) 525 6245 maybee return (ERESTART); 526 6245 maybee } 527 6245 maybee 528 6245 maybee atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); 529 6245 maybee 530 6245 maybee /* 531 6245 maybee * If this transaction group is over 7/8ths capacity, delay 532 6245 maybee * the caller 1 clock tick. This will slow down the "fill" 533 6245 maybee * rate until the sync process can catch up with us. 534 6245 maybee */ 535 6740 gw25295 if (reserved && reserved > (write_limit - (write_limit >> 3))) 536 6245 maybee txg_delay(dp, tx->tx_txg, 1); 537 6245 maybee 538 6245 maybee return (0); 539 6245 maybee } 540 6245 maybee 541 6245 maybee void 542 6245 maybee dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) 543 6245 maybee { 544 6245 maybee ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); 545 6245 maybee atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); 546 6245 maybee } 547 6245 maybee 548 6245 maybee void 549 6245 maybee dsl_pool_memory_pressure(dsl_pool_t *dp) 550 6245 maybee { 551 6245 maybee uint64_t space_inuse = 0; 552 6245 maybee int i; 553 6245 maybee 554 6245 maybee if (dp->dp_write_limit == zfs_write_limit_min) 555 6245 maybee return; 556 6245 maybee 557 6245 maybee for (i = 0; i < TXG_SIZE; i++) { 558 6245 maybee space_inuse += dp->dp_space_towrite[i]; 559 6245 maybee space_inuse += dp->dp_tempreserved[i]; 560 6245 maybee } 561 6245 maybee dp->dp_write_limit = MAX(zfs_write_limit_min, 562 6245 maybee MIN(dp->dp_write_limit, space_inuse / 4)); 563 6245 maybee } 564 6245 maybee 565 6245 maybee void 566 6245 maybee dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) 567 6245 maybee { 568 6245 maybee if (space > 0) { 569 6245 maybee mutex_enter(&dp->dp_lock); 570 6245 maybee dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; 571 6245 maybee mutex_exit(&dp->dp_lock); 572 6245 maybee } 573 6245 maybee } 574 7046 ahrens 575 7046 ahrens /* ARGSUSED */ 576 7046 ahrens static int 577 7046 ahrens upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) 578 7046 ahrens { 579 7046 ahrens dmu_tx_t *tx = arg; 580 7046 ahrens dsl_dataset_t *ds, *prev = NULL; 581 7046 ahrens int err; 582 7046 ahrens dsl_pool_t *dp = spa_get_dsl(spa); 583 7046 ahrens 584 7046 ahrens err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); 585 7046 ahrens if (err) 586 7046 ahrens return (err); 587 7046 ahrens 588 7046 ahrens while (ds->ds_phys->ds_prev_snap_obj != 0) { 589 7046 ahrens err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 590 7046 ahrens FTAG, &prev); 591 7046 ahrens if (err) { 592 7046 ahrens dsl_dataset_rele(ds, FTAG); 593 7046 ahrens return (err); 594 7046 ahrens } 595 7046 ahrens 596 7046 ahrens if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) 597 7046 ahrens break; 598 7046 ahrens dsl_dataset_rele(ds, FTAG); 599 7046 ahrens ds = prev; 600 7046 ahrens prev = NULL; 601 7046 ahrens } 602 7046 ahrens 603 7046 ahrens if (prev == NULL) { 604 7046 ahrens prev = dp->dp_origin_snap; 605 7046 ahrens 606 7046 ahrens /* 607 7046 ahrens * The $ORIGIN can't have any data, or the accounting 608 7046 ahrens * will be wrong. 609 7046 ahrens */ 610 7046 ahrens ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); 611 7046 ahrens 612 7046 ahrens /* The origin doesn't get attached to itself */ 613 7046 ahrens if (ds->ds_object == prev->ds_object) { 614 7046 ahrens dsl_dataset_rele(ds, FTAG); 615 7046 ahrens return (0); 616 7046 ahrens } 617 7046 ahrens 618 7046 ahrens dmu_buf_will_dirty(ds->ds_dbuf, tx); 619 7046 ahrens ds->ds_phys->ds_prev_snap_obj = prev->ds_object; 620 7046 ahrens ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg; 621 7046 ahrens 622 7046 ahrens dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 623 7046 ahrens ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object; 624 7046 ahrens 625 7046 ahrens dmu_buf_will_dirty(prev->ds_dbuf, tx); 626 7046 ahrens prev->ds_phys->ds_num_children++; 627 7046 ahrens 628 7046 ahrens if (ds->ds_phys->ds_next_snap_obj == 0) { 629 7046 ahrens ASSERT(ds->ds_prev == NULL); 630 7046 ahrens VERIFY(0 == dsl_dataset_hold_obj(dp, 631 7046 ahrens ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); 632 7046 ahrens } 633 7046 ahrens } 634 7046 ahrens 635 7046 ahrens ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); 636 7046 ahrens ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); 637 7046 ahrens 638 7046 ahrens if (prev->ds_phys->ds_next_clones_obj == 0) { 639 10801 Matthew dmu_buf_will_dirty(prev->ds_dbuf, tx); 640 7046 ahrens prev->ds_phys->ds_next_clones_obj = 641 7046 ahrens zap_create(dp->dp_meta_objset, 642 7046 ahrens DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); 643 7046 ahrens } 644 7046 ahrens VERIFY(0 == zap_add_int(dp->dp_meta_objset, 645 7046 ahrens prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); 646 7046 ahrens 647 7046 ahrens dsl_dataset_rele(ds, FTAG); 648 7046 ahrens if (prev != dp->dp_origin_snap) 649 7046 ahrens dsl_dataset_rele(prev, FTAG); 650 7046 ahrens return (0); 651 7046 ahrens } 652 7046 ahrens 653 7046 ahrens void 654 7046 ahrens dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) 655 7046 ahrens { 656 7046 ahrens ASSERT(dmu_tx_is_syncing(tx)); 657 7046 ahrens ASSERT(dp->dp_origin_snap != NULL); 658 7046 ahrens 659 10801 Matthew VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, 660 10801 Matthew tx, DS_FIND_CHILDREN)); 661 7046 ahrens } 662 7046 ahrens 663 7046 ahrens void 664 7046 ahrens dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) 665 7046 ahrens { 666 7046 ahrens uint64_t dsobj; 667 7046 ahrens dsl_dataset_t *ds; 668 7046 ahrens 669 7046 ahrens ASSERT(dmu_tx_is_syncing(tx)); 670 7046 ahrens ASSERT(dp->dp_origin_snap == NULL); 671 7046 ahrens 672 7046 ahrens /* create the origin dir, ds, & snap-ds */ 673 7046 ahrens rw_enter(&dp->dp_config_rwlock, RW_WRITER); 674 7046 ahrens dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, 675 7046 ahrens NULL, 0, kcred, tx); 676 7046 ahrens VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 677 7046 ahrens dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx); 678 7046 ahrens VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 679 7046 ahrens dp, &dp->dp_origin_snap)); 680 7046 ahrens dsl_dataset_rele(ds, FTAG); 681 7046 ahrens rw_exit(&dp->dp_config_rwlock); 682 7046 ahrens } 683 9321 Neil 684 9321 Neil taskq_t * 685 9321 Neil dsl_pool_vnrele_taskq(dsl_pool_t *dp) 686 9321 Neil { 687 9321 Neil return (dp->dp_vnrele_taskq); 688 9321 Neil } 689 10342 chris 690 10342 chris /* 691 10342 chris * Walk through the pool-wide zap object of temporary snapshot user holds 692 10342 chris * and release them. 693 10342 chris */ 694 10342 chris void 695 10342 chris dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) 696 10342 chris { 697 10342 chris zap_attribute_t za; 698 10342 chris zap_cursor_t zc; 699 10342 chris objset_t *mos = dp->dp_meta_objset; 700 10342 chris uint64_t zapobj = dp->dp_tmp_userrefs_obj; 701 10342 chris 702 10342 chris if (zapobj == 0) 703 10342 chris return; 704 10342 chris ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 705 10342 chris 706 10342 chris for (zap_cursor_init(&zc, mos, zapobj); 707 10342 chris zap_cursor_retrieve(&zc, &za) == 0; 708 10342 chris zap_cursor_advance(&zc)) { 709 10342 chris char *htag; 710 10342 chris uint64_t dsobj; 711 10342 chris 712 10342 chris htag = strchr(za.za_name, '-'); 713 10342 chris *htag = '\0'; 714 10342 chris ++htag; 715 10342 chris dsobj = strtonum(za.za_name, NULL); 716 10342 chris (void) dsl_dataset_user_release_tmp(dp, dsobj, htag); 717 10342 chris } 718 10342 chris zap_cursor_fini(&zc); 719 10342 chris } 720 10342 chris 721 10342 chris /* 722 10342 chris * Create the pool-wide zap object for storing temporary snapshot holds. 723 10342 chris */ 724 10342 chris void 725 10342 chris dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) 726 10342 chris { 727 10342 chris objset_t *mos = dp->dp_meta_objset; 728 10342 chris 729 10342 chris ASSERT(dp->dp_tmp_userrefs_obj == 0); 730 10342 chris ASSERT(dmu_tx_is_syncing(tx)); 731 10342 chris 732 10342 chris dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, 733 10342 chris DMU_OT_NONE, 0, tx); 734 10342 chris 735 10342 chris VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, 736 10342 chris sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0); 737 10342 chris } 738 10342 chris 739 10342 chris static int 740 10342 chris dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, 741 10951 Chris const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) 742 10342 chris { 743 10342 chris objset_t *mos = dp->dp_meta_objset; 744 10342 chris uint64_t zapobj = dp->dp_tmp_userrefs_obj; 745 10342 chris char *name; 746 10342 chris int error; 747 10342 chris 748 10342 chris ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 749 10342 chris ASSERT(dmu_tx_is_syncing(tx)); 750 10342 chris 751 10342 chris /* 752 10342 chris * If the pool was created prior to SPA_VERSION_USERREFS, the 753 10342 chris * zap object for temporary holds might not exist yet. 754 10342 chris */ 755 10342 chris if (zapobj == 0) { 756 10342 chris if (holding) { 757 10342 chris dsl_pool_user_hold_create_obj(dp, tx); 758 10342 chris zapobj = dp->dp_tmp_userrefs_obj; 759 10342 chris } else { 760 10342 chris return (ENOENT); 761 10342 chris } 762 10342 chris } 763 10342 chris 764 10342 chris name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); 765 10342 chris if (holding) 766 10951 Chris error = zap_add(mos, zapobj, name, 8, 1, now, tx); 767 10342 chris else 768 10342 chris error = zap_remove(mos, zapobj, name, tx); 769 10342 chris strfree(name); 770 10342 chris 771 10342 chris return (error); 772 10342 chris } 773 10342 chris 774 10342 chris /* 775 10342 chris * Add a temporary hold for the given dataset object and tag. 776 10342 chris */ 777 10342 chris int 778 10342 chris dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 779 10951 Chris uint64_t *now, dmu_tx_t *tx) 780 10342 chris { 781 10951 Chris return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); 782 10342 chris } 783 10342 chris 784 10342 chris /* 785 10342 chris * Release a temporary hold for the given dataset object and tag. 786 10342 chris */ 787 10342 chris int 788 10342 chris dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 789 10342 chris dmu_tx_t *tx) 790 10342 chris { 791 10342 chris return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, 792 10342 chris tx, B_FALSE)); 793 10342 chris } 794