1 789 ahrens /* 2 789 ahrens * CDDL HEADER START 3 789 ahrens * 4 789 ahrens * The contents of this file are subject to the terms of the 5 1491 ahrens * Common Development and Distribution License (the "License"). 6 1491 ahrens * You may not use this file except in compliance with the License. 7 789 ahrens * 8 789 ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 789 ahrens * or http://www.opensolaris.org/os/licensing. 10 789 ahrens * See the License for the specific language governing permissions 11 789 ahrens * and limitations under the License. 12 789 ahrens * 13 789 ahrens * When distributing Covered Code, include this CDDL HEADER in each 14 789 ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 789 ahrens * If applicable, add the following below this CDDL HEADER, with the 16 789 ahrens * fields enclosed by brackets "[]" replaced with your own identifying 17 789 ahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18 789 ahrens * 19 789 ahrens * CDDL HEADER END 20 789 ahrens */ 21 789 ahrens /* 22 8768 Mark * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 789 ahrens * Use is subject to license terms. 
24 789 ahrens */ 25 789 ahrens 26 789 ahrens #include <sys/dmu.h> 27 789 ahrens #include <sys/dmu_impl.h> 28 789 ahrens #include <sys/dbuf.h> 29 789 ahrens #include <sys/dmu_tx.h> 30 789 ahrens #include <sys/dmu_objset.h> 31 789 ahrens #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ 32 789 ahrens #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ 33 789 ahrens #include <sys/dsl_pool.h> 34 2113 ahrens #include <sys/zap_impl.h> /* for fzap_default_block_shift */ 35 789 ahrens #include <sys/spa.h> 36 789 ahrens #include <sys/zfs_context.h> 37 789 ahrens 38 1544 eschrock typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, 39 1544 eschrock uint64_t arg1, uint64_t arg2); 40 1544 eschrock 41 789 ahrens 42 789 ahrens dmu_tx_t * 43 2199 ahrens dmu_tx_create_dd(dsl_dir_t *dd) 44 789 ahrens { 45 789 ahrens dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); 46 789 ahrens tx->tx_dir = dd; 47 789 ahrens if (dd) 48 789 ahrens tx->tx_pool = dd->dd_pool; 49 789 ahrens list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), 50 2113 ahrens offsetof(dmu_tx_hold_t, txh_node)); 51 10612 Ricardo list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), 52 10612 Ricardo offsetof(dmu_tx_callback_t, dcb_node)); 53 2113 ahrens #ifdef ZFS_DEBUG 54 789 ahrens refcount_create(&tx->tx_space_written); 55 789 ahrens refcount_create(&tx->tx_space_freed); 56 2113 ahrens #endif 57 789 ahrens return (tx); 58 789 ahrens } 59 789 ahrens 60 789 ahrens dmu_tx_t * 61 789 ahrens dmu_tx_create(objset_t *os) 62 789 ahrens { 63 10298 Matthew dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); 64 789 ahrens tx->tx_objset = os; 65 10298 Matthew tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); 66 789 ahrens return (tx); 67 789 ahrens } 68 789 ahrens 69 789 ahrens dmu_tx_t * 70 789 ahrens dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) 71 789 ahrens { 72 2199 ahrens dmu_tx_t *tx = dmu_tx_create_dd(NULL); 73 789 ahrens 74 789 ahrens 
ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); 75 789 ahrens tx->tx_pool = dp; 76 789 ahrens tx->tx_txg = txg; 77 789 ahrens tx->tx_anyobj = TRUE; 78 789 ahrens 79 789 ahrens return (tx); 80 789 ahrens } 81 789 ahrens 82 789 ahrens int 83 789 ahrens dmu_tx_is_syncing(dmu_tx_t *tx) 84 789 ahrens { 85 789 ahrens return (tx->tx_anyobj); 86 789 ahrens } 87 789 ahrens 88 789 ahrens int 89 789 ahrens dmu_tx_private_ok(dmu_tx_t *tx) 90 789 ahrens { 91 1544 eschrock return (tx->tx_anyobj); 92 789 ahrens } 93 789 ahrens 94 2113 ahrens static dmu_tx_hold_t * 95 789 ahrens dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, 96 2113 ahrens enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) 97 789 ahrens { 98 2113 ahrens dmu_tx_hold_t *txh; 99 789 ahrens dnode_t *dn = NULL; 100 1544 eschrock int err; 101 789 ahrens 102 789 ahrens if (object != DMU_NEW_OBJECT) { 103 10298 Matthew err = dnode_hold(os, object, tx, &dn); 104 1544 eschrock if (err) { 105 1544 eschrock tx->tx_err = err; 106 2113 ahrens return (NULL); 107 1544 eschrock } 108 789 ahrens 109 1544 eschrock if (err == 0 && tx->tx_txg != 0) { 110 789 ahrens mutex_enter(&dn->dn_mtx); 111 789 ahrens /* 112 789 ahrens * dn->dn_assigned_txg == tx->tx_txg doesn't pose a 113 789 ahrens * problem, but there's no way for it to happen (for 114 789 ahrens * now, at least). 
115 789 ahrens */ 116 789 ahrens ASSERT(dn->dn_assigned_txg == 0); 117 789 ahrens dn->dn_assigned_txg = tx->tx_txg; 118 789 ahrens (void) refcount_add(&dn->dn_tx_holds, tx); 119 789 ahrens mutex_exit(&dn->dn_mtx); 120 789 ahrens } 121 789 ahrens } 122 789 ahrens 123 2113 ahrens txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); 124 2113 ahrens txh->txh_tx = tx; 125 2113 ahrens txh->txh_dnode = dn; 126 2113 ahrens #ifdef ZFS_DEBUG 127 2113 ahrens txh->txh_type = type; 128 2113 ahrens txh->txh_arg1 = arg1; 129 2113 ahrens txh->txh_arg2 = arg2; 130 2113 ahrens #endif 131 2113 ahrens list_insert_tail(&tx->tx_holds, txh); 132 1544 eschrock 133 2113 ahrens return (txh); 134 789 ahrens } 135 789 ahrens 136 789 ahrens void 137 789 ahrens dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) 138 789 ahrens { 139 789 ahrens /* 140 789 ahrens * If we're syncing, they can manipulate any object anyhow, and 141 789 ahrens * the hold on the dnode_t can cause problems. 142 789 ahrens */ 143 789 ahrens if (!dmu_tx_is_syncing(tx)) { 144 2113 ahrens (void) dmu_tx_hold_object_impl(tx, os, 145 2113 ahrens object, THT_NEWOBJECT, 0, 0); 146 789 ahrens } 147 789 ahrens } 148 789 ahrens 149 1544 eschrock static int 150 1544 eschrock dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) 151 1544 eschrock { 152 1544 eschrock int err; 153 1544 eschrock dmu_buf_impl_t *db; 154 1544 eschrock 155 1544 eschrock rw_enter(&dn->dn_struct_rwlock, RW_READER); 156 1544 eschrock db = dbuf_hold_level(dn, level, blkid, FTAG); 157 1544 eschrock rw_exit(&dn->dn_struct_rwlock); 158 1544 eschrock if (db == NULL) 159 1544 eschrock return (EIO); 160 6245 maybee err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); 161 1544 eschrock dbuf_rele(db, FTAG); 162 1544 eschrock return (err); 163 1544 eschrock } 164 1544 eschrock 165 8768 Mark static void 166 10922 Jeff dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, 167 10922 Jeff int level, uint64_t blkid, 
boolean_t freeable, uint64_t *history) 168 8768 Mark { 169 10922 Jeff objset_t *os = dn->dn_objset; 170 10922 Jeff dsl_dataset_t *ds = os->os_dsl_dataset; 171 10922 Jeff int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 172 10922 Jeff dmu_buf_impl_t *parent = NULL; 173 10922 Jeff blkptr_t *bp = NULL; 174 10922 Jeff uint64_t space; 175 8768 Mark 176 10922 Jeff if (level >= dn->dn_nlevels || history[level] == blkid) 177 8768 Mark return; 178 8768 Mark 179 10922 Jeff history[level] = blkid; 180 8768 Mark 181 10922 Jeff space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift); 182 10922 Jeff 183 10922 Jeff if (db == NULL || db == dn->dn_dbuf) { 184 10922 Jeff ASSERT(level != 0); 185 10922 Jeff db = NULL; 186 10922 Jeff } else { 187 10922 Jeff ASSERT(db->db_dnode == dn); 188 10922 Jeff ASSERT(db->db_level == level); 189 10922 Jeff ASSERT(db->db.db_size == space); 190 10922 Jeff ASSERT(db->db_blkid == blkid); 191 10922 Jeff bp = db->db_blkptr; 192 10922 Jeff parent = db->db_parent; 193 8768 Mark } 194 8768 Mark 195 10922 Jeff freeable = (bp && (freeable || 196 10922 Jeff dsl_dataset_block_freeable(ds, bp->blk_birth))); 197 8768 Mark 198 10922 Jeff if (freeable) 199 10922 Jeff txh->txh_space_tooverwrite += space; 200 10922 Jeff else 201 10922 Jeff txh->txh_space_towrite += space; 202 10922 Jeff if (bp) 203 10922 Jeff txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp); 204 10922 Jeff 205 10922 Jeff dmu_tx_count_twig(txh, dn, parent, level + 1, 206 10922 Jeff blkid >> epbs, freeable, history); 207 8768 Mark } 208 8768 Mark 209 789 ahrens /* ARGSUSED */ 210 789 ahrens static void 211 2113 ahrens dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) 212 789 ahrens { 213 2113 ahrens dnode_t *dn = txh->txh_dnode; 214 2113 ahrens uint64_t start, end, i; 215 789 ahrens int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; 216 2113 ahrens int err = 0; 217 789 ahrens 218 789 ahrens if (len == 0) 219 789 ahrens return; 220 789 ahrens 221 789 ahrens min_bs = 
SPA_MINBLOCKSHIFT; 222 789 ahrens max_bs = SPA_MAXBLOCKSHIFT; 223 789 ahrens min_ibs = DN_MIN_INDBLKSHIFT; 224 789 ahrens max_ibs = DN_MAX_INDBLKSHIFT; 225 1544 eschrock 226 8768 Mark if (dn) { 227 10922 Jeff uint64_t history[DN_MAX_LEVELS]; 228 8768 Mark int nlvls = dn->dn_nlevels; 229 8768 Mark int delta; 230 1544 eschrock 231 8768 Mark /* 232 8768 Mark * For i/o error checking, read the first and last level-0 233 8768 Mark * blocks (if they are not aligned), and all the level-1 blocks. 234 8768 Mark */ 235 1544 eschrock if (dn->dn_maxblkid == 0) { 236 8768 Mark delta = dn->dn_datablksz; 237 8768 Mark start = (off < dn->dn_datablksz) ? 0 : 1; 238 8768 Mark end = (off+len <= dn->dn_datablksz) ? 0 : 1; 239 8768 Mark if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { 240 7872 Tim err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 241 7872 Tim if (err) 242 7872 Tim goto out; 243 8768 Mark delta -= off; 244 7872 Tim } 245 1544 eschrock } else { 246 2113 ahrens zio_t *zio = zio_root(dn->dn_objset->os_spa, 247 1544 eschrock NULL, NULL, ZIO_FLAG_CANFAIL); 248 1544 eschrock 249 1544 eschrock /* first level-0 block */ 250 2082 eschrock start = off >> dn->dn_datablkshift; 251 2082 eschrock if (P2PHASE(off, dn->dn_datablksz) || 252 2082 eschrock len < dn->dn_datablksz) { 253 2082 eschrock err = dmu_tx_check_ioerr(zio, dn, 0, start); 254 2113 ahrens if (err) 255 2113 ahrens goto out; 256 1544 eschrock } 257 1544 eschrock 258 1544 eschrock /* last level-0 block */ 259 2082 eschrock end = (off+len-1) >> dn->dn_datablkshift; 260 7872 Tim if (end != start && end <= dn->dn_maxblkid && 261 2082 eschrock P2PHASE(off+len, dn->dn_datablksz)) { 262 1544 eschrock err = dmu_tx_check_ioerr(zio, dn, 0, end); 263 2113 ahrens if (err) 264 2113 ahrens goto out; 265 1544 eschrock } 266 1544 eschrock 267 1544 eschrock /* level-1 blocks */ 268 8768 Mark if (nlvls > 1) { 269 8768 Mark int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 270 8768 Mark for (i = (start>>shft)+1; i < end>>shft; i++) { 271 
1544 eschrock err = dmu_tx_check_ioerr(zio, dn, 1, i); 272 2113 ahrens if (err) 273 2113 ahrens goto out; 274 1544 eschrock } 275 1544 eschrock } 276 1544 eschrock 277 1544 eschrock err = zio_wait(zio); 278 2113 ahrens if (err) 279 2113 ahrens goto out; 280 8768 Mark delta = P2NPHASE(off, dn->dn_datablksz); 281 1544 eschrock } 282 789 ahrens 283 8768 Mark if (dn->dn_maxblkid > 0) { 284 8768 Mark /* 285 8768 Mark * The blocksize can't change, 286 8768 Mark * so we can make a more precise estimate. 287 8768 Mark */ 288 8768 Mark ASSERT(dn->dn_datablkshift != 0); 289 789 ahrens min_bs = max_bs = dn->dn_datablkshift; 290 8768 Mark min_ibs = max_ibs = dn->dn_indblkshift; 291 8768 Mark } else if (dn->dn_indblkshift > max_ibs) { 292 8768 Mark /* 293 8768 Mark * This ensures that if we reduce DN_MAX_INDBLKSHIFT, 294 8768 Mark * the code will still work correctly on older pools. 295 8768 Mark */ 296 8768 Mark min_ibs = max_ibs = dn->dn_indblkshift; 297 8768 Mark } 298 8768 Mark 299 8768 Mark /* 300 8768 Mark * If this write is not off the end of the file 301 8768 Mark * we need to account for overwrites/unref. 302 8768 Mark */ 303 10922 Jeff if (start <= dn->dn_maxblkid) { 304 10922 Jeff for (int l = 0; l < DN_MAX_LEVELS; l++) 305 10922 Jeff history[l] = -1ULL; 306 10922 Jeff } 307 8768 Mark while (start <= dn->dn_maxblkid) { 308 8768 Mark dmu_buf_impl_t *db; 309 8768 Mark 310 8768 Mark rw_enter(&dn->dn_struct_rwlock, RW_READER); 311 8768 Mark db = dbuf_hold_level(dn, 0, start, FTAG); 312 8768 Mark rw_exit(&dn->dn_struct_rwlock); 313 10922 Jeff dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, 314 10922 Jeff history); 315 8768 Mark dbuf_rele(db, FTAG); 316 8768 Mark if (++start > end) { 317 8768 Mark /* 318 8768 Mark * Account for new indirects appearing 319 8768 Mark * before this IO gets assigned into a txg. 
320 8768 Mark */ 321 8768 Mark bits = 64 - min_bs; 322 8768 Mark epbs = min_ibs - SPA_BLKPTRSHIFT; 323 8768 Mark for (bits -= epbs * (nlvls - 1); 324 8768 Mark bits >= 0; bits -= epbs) 325 8768 Mark txh->txh_fudge += 1ULL << max_ibs; 326 8768 Mark goto out; 327 8768 Mark } 328 8768 Mark off += delta; 329 8768 Mark if (len >= delta) 330 8768 Mark len -= delta; 331 8768 Mark delta = dn->dn_datablksz; 332 8768 Mark } 333 789 ahrens } 334 789 ahrens 335 789 ahrens /* 336 789 ahrens * 'end' is the last thing we will access, not one past. 337 789 ahrens * This way we won't overflow when accessing the last byte. 338 789 ahrens */ 339 789 ahrens start = P2ALIGN(off, 1ULL << max_bs); 340 789 ahrens end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; 341 2113 ahrens txh->txh_space_towrite += end - start + 1; 342 789 ahrens 343 789 ahrens start >>= min_bs; 344 789 ahrens end >>= min_bs; 345 789 ahrens 346 789 ahrens epbs = min_ibs - SPA_BLKPTRSHIFT; 347 789 ahrens 348 789 ahrens /* 349 789 ahrens * The object contains at most 2^(64 - min_bs) blocks, 350 789 ahrens * and each indirect level maps 2^epbs. 351 789 ahrens */ 352 789 ahrens for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { 353 789 ahrens start >>= epbs; 354 789 ahrens end >>= epbs; 355 8768 Mark ASSERT3U(end, >=, start); 356 8768 Mark txh->txh_space_towrite += (end - start + 1) << max_ibs; 357 8768 Mark if (start != 0) { 358 8768 Mark /* 359 8768 Mark * We also need a new blkid=0 indirect block 360 8768 Mark * to reference any existing file data. 
361 8768 Mark */ 362 2113 ahrens txh->txh_space_towrite += 1ULL << max_ibs; 363 8768 Mark } 364 789 ahrens } 365 789 ahrens 366 8768 Mark out: 367 8768 Mark if (txh->txh_space_towrite + txh->txh_space_tooverwrite > 368 8768 Mark 2 * DMU_MAX_ACCESS) 369 8768 Mark err = EFBIG; 370 789 ahrens 371 2113 ahrens if (err) 372 2113 ahrens txh->txh_tx->tx_err = err; 373 789 ahrens } 374 789 ahrens 375 789 ahrens static void 376 2113 ahrens dmu_tx_count_dnode(dmu_tx_hold_t *txh) 377 789 ahrens { 378 2113 ahrens dnode_t *dn = txh->txh_dnode; 379 10298 Matthew dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode; 380 2113 ahrens uint64_t space = mdn->dn_datablksz + 381 2113 ahrens ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); 382 789 ahrens 383 789 ahrens if (dn && dn->dn_dbuf->db_blkptr && 384 789 ahrens dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 385 1544 eschrock dn->dn_dbuf->db_blkptr->blk_birth)) { 386 2113 ahrens txh->txh_space_tooverwrite += space; 387 8768 Mark txh->txh_space_tounref += space; 388 2113 ahrens } else { 389 2113 ahrens txh->txh_space_towrite += space; 390 5378 ck153898 if (dn && dn->dn_dbuf->db_blkptr) 391 5378 ck153898 txh->txh_space_tounref += space; 392 789 ahrens } 393 789 ahrens } 394 789 ahrens 395 789 ahrens void 396 789 ahrens dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) 397 789 ahrens { 398 2113 ahrens dmu_tx_hold_t *txh; 399 2113 ahrens 400 789 ahrens ASSERT(tx->tx_txg == 0); 401 1544 eschrock ASSERT(len < DMU_MAX_ACCESS); 402 1819 maybee ASSERT(len == 0 || UINT64_MAX - off >= len - 1); 403 789 ahrens 404 2113 ahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 405 2113 ahrens object, THT_WRITE, off, len); 406 2113 ahrens if (txh == NULL) 407 2113 ahrens return; 408 2113 ahrens 409 2113 ahrens dmu_tx_count_write(txh, off, len); 410 2113 ahrens dmu_tx_count_dnode(txh); 411 789 ahrens } 412 789 ahrens 413 789 ahrens static void 414 2113 ahrens dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t 
len) 415 789 ahrens { 416 6992 maybee uint64_t blkid, nblks, lastblk; 417 6992 maybee uint64_t space = 0, unref = 0, skipped = 0; 418 2113 ahrens dnode_t *dn = txh->txh_dnode; 419 789 ahrens dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 420 2113 ahrens spa_t *spa = txh->txh_tx->tx_pool->dp_spa; 421 6992 maybee int epbs; 422 789 ahrens 423 6992 maybee if (dn->dn_nlevels == 0) 424 789 ahrens return; 425 1596 ahrens 426 789 ahrens /* 427 6992 maybee * The struct_rwlock protects us against dn_nlevels 428 1596 ahrens * changing, in case (against all odds) we manage to dirty & 429 1596 ahrens * sync out the changes after we check for being dirty. 430 6992 maybee * Also, dbuf_hold_level() wants us to have the struct_rwlock. 431 789 ahrens */ 432 789 ahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 433 6992 maybee epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 434 6992 maybee if (dn->dn_maxblkid == 0) { 435 1596 ahrens if (off == 0 && len >= dn->dn_datablksz) { 436 1596 ahrens blkid = 0; 437 1596 ahrens nblks = 1; 438 1596 ahrens } else { 439 1596 ahrens rw_exit(&dn->dn_struct_rwlock); 440 1596 ahrens return; 441 1596 ahrens } 442 1596 ahrens } else { 443 1596 ahrens blkid = off >> dn->dn_datablkshift; 444 6992 maybee nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; 445 789 ahrens 446 6992 maybee if (blkid >= dn->dn_maxblkid) { 447 1596 ahrens rw_exit(&dn->dn_struct_rwlock); 448 1596 ahrens return; 449 1596 ahrens } 450 6992 maybee if (blkid + nblks > dn->dn_maxblkid) 451 6992 maybee nblks = dn->dn_maxblkid - blkid; 452 1596 ahrens 453 1544 eschrock } 454 6992 maybee if (dn->dn_nlevels == 1) { 455 789 ahrens int i; 456 789 ahrens for (i = 0; i < nblks; i++) { 457 789 ahrens blkptr_t *bp = dn->dn_phys->dn_blkptr; 458 6992 maybee ASSERT3U(blkid + i, <, dn->dn_nblkptr); 459 789 ahrens bp += blkid + i; 460 1544 eschrock if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { 461 789 ahrens dprintf_bp(bp, "can free old%s", ""); 462 10922 Jeff space += 
bp_get_dsize(spa, bp); 463 789 ahrens } 464 5378 ck153898 unref += BP_GET_ASIZE(bp); 465 789 ahrens } 466 1544 eschrock nblks = 0; 467 789 ahrens } 468 789 ahrens 469 6992 maybee /* 470 7016 maybee * Add in memory requirements of higher-level indirects. 471 7016 maybee * This assumes a worst-possible scenario for dn_nlevels. 472 6992 maybee */ 473 7016 maybee { 474 6992 maybee uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); 475 7080 maybee int level = (dn->dn_nlevels > 1) ? 2 : 1; 476 6992 maybee 477 7016 maybee while (level++ < DN_MAX_LEVELS) { 478 6992 maybee txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; 479 6992 maybee blkcnt = 1 + (blkcnt >> epbs); 480 6992 maybee } 481 6992 maybee ASSERT(blkcnt <= dn->dn_nblkptr); 482 6992 maybee } 483 6992 maybee 484 6992 maybee lastblk = blkid + nblks - 1; 485 789 ahrens while (nblks) { 486 789 ahrens dmu_buf_impl_t *dbuf; 487 6992 maybee uint64_t ibyte, new_blkid; 488 6992 maybee int epb = 1 << epbs; 489 6992 maybee int err, i, blkoff, tochk; 490 6992 maybee blkptr_t *bp; 491 789 ahrens 492 6992 maybee ibyte = blkid << dn->dn_datablkshift; 493 6992 maybee err = dnode_next_offset(dn, 494 6992 maybee DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); 495 6992 maybee new_blkid = ibyte >> dn->dn_datablkshift; 496 7080 maybee if (err == ESRCH) { 497 7080 maybee skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; 498 6992 maybee break; 499 7080 maybee } 500 6992 maybee if (err) { 501 2113 ahrens txh->txh_tx->tx_err = err; 502 1596 ahrens break; 503 789 ahrens } 504 7080 maybee if (new_blkid > lastblk) { 505 7080 maybee skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; 506 6992 maybee break; 507 7080 maybee } 508 6992 maybee 509 6992 maybee if (new_blkid > blkid) { 510 7080 maybee ASSERT((new_blkid >> epbs) > (blkid >> epbs)); 511 7080 maybee skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; 512 6992 maybee nblks -= new_blkid - blkid; 513 6992 maybee blkid = new_blkid; 514 6992 maybee } 515 6992 maybee blkoff = P2PHASE(blkid, 
epb); 516 6992 maybee tochk = MIN(epb - blkoff, nblks); 517 6992 maybee 518 6992 maybee dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); 519 6992 maybee 520 6992 maybee txh->txh_memory_tohold += dbuf->db.db_size; 521 6992 maybee if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { 522 6992 maybee txh->txh_tx->tx_err = E2BIG; 523 6992 maybee dbuf_rele(dbuf, FTAG); 524 6992 maybee break; 525 6992 maybee } 526 6992 maybee err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); 527 6992 maybee if (err != 0) { 528 6992 maybee txh->txh_tx->tx_err = err; 529 6992 maybee dbuf_rele(dbuf, FTAG); 530 6992 maybee break; 531 6992 maybee } 532 6992 maybee 533 6992 maybee bp = dbuf->db.db_data; 534 6992 maybee bp += blkoff; 535 6992 maybee 536 6992 maybee for (i = 0; i < tochk; i++) { 537 6992 maybee if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { 538 6992 maybee dprintf_bp(&bp[i], "can free old%s", ""); 539 10922 Jeff space += bp_get_dsize(spa, &bp[i]); 540 6992 maybee } 541 6992 maybee unref += BP_GET_ASIZE(bp); 542 6992 maybee } 543 6992 maybee dbuf_rele(dbuf, FTAG); 544 789 ahrens 545 789 ahrens blkid += tochk; 546 789 ahrens nblks -= tochk; 547 789 ahrens } 548 789 ahrens rw_exit(&dn->dn_struct_rwlock); 549 789 ahrens 550 6992 maybee /* account for new level 1 indirect blocks that might show up */ 551 7080 maybee if (skipped > 0) { 552 7016 maybee txh->txh_fudge += skipped << dn->dn_indblkshift; 553 6992 maybee skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); 554 6992 maybee txh->txh_memory_tohold += skipped << dn->dn_indblkshift; 555 6992 maybee } 556 2113 ahrens txh->txh_space_tofree += space; 557 5378 ck153898 txh->txh_space_tounref += unref; 558 789 ahrens } 559 789 ahrens 560 2113 ahrens void 561 2113 ahrens dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) 562 789 ahrens { 563 2113 ahrens dmu_tx_hold_t *txh; 564 2113 ahrens dnode_t *dn; 565 1544 eschrock uint64_t start, end, i; 566 1596 ahrens int err, shift; 567 1544 eschrock 
zio_t *zio; 568 789 ahrens 569 2113 ahrens ASSERT(tx->tx_txg == 0); 570 2113 ahrens 571 2113 ahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 572 2113 ahrens object, THT_FREE, off, len); 573 2113 ahrens if (txh == NULL) 574 2113 ahrens return; 575 2113 ahrens dn = txh->txh_dnode; 576 2113 ahrens 577 789 ahrens /* first block */ 578 1793 ahrens if (off != 0) 579 2113 ahrens dmu_tx_count_write(txh, off, 1); 580 789 ahrens /* last block */ 581 789 ahrens if (len != DMU_OBJECT_END) 582 2113 ahrens dmu_tx_count_write(txh, off+len, 1); 583 789 ahrens 584 10922 Jeff dmu_tx_count_dnode(txh); 585 10922 Jeff 586 789 ahrens if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) 587 789 ahrens return; 588 789 ahrens if (len == DMU_OBJECT_END) 589 789 ahrens len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; 590 1544 eschrock 591 1544 eschrock /* 592 1544 eschrock * For i/o error checking, read the first and last level-0 593 1544 eschrock * blocks, and all the level-1 blocks. The above count_write's 594 6992 maybee * have already taken care of the level-0 blocks. 595 1544 eschrock */ 596 1793 ahrens if (dn->dn_nlevels > 1) { 597 1793 ahrens shift = dn->dn_datablkshift + dn->dn_indblkshift - 598 1793 ahrens SPA_BLKPTRSHIFT; 599 1793 ahrens start = off >> shift; 600 1793 ahrens end = dn->dn_datablkshift ? 
((off+len) >> shift) : 0; 601 1544 eschrock 602 1793 ahrens zio = zio_root(tx->tx_pool->dp_spa, 603 1793 ahrens NULL, NULL, ZIO_FLAG_CANFAIL); 604 1793 ahrens for (i = start; i <= end; i++) { 605 1793 ahrens uint64_t ibyte = i << shift; 606 6992 maybee err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); 607 1793 ahrens i = ibyte >> shift; 608 1793 ahrens if (err == ESRCH) 609 1793 ahrens break; 610 1793 ahrens if (err) { 611 1793 ahrens tx->tx_err = err; 612 1793 ahrens return; 613 1793 ahrens } 614 1793 ahrens 615 1793 ahrens err = dmu_tx_check_ioerr(zio, dn, 1, i); 616 1793 ahrens if (err) { 617 1793 ahrens tx->tx_err = err; 618 1793 ahrens return; 619 1793 ahrens } 620 1793 ahrens } 621 1793 ahrens err = zio_wait(zio); 622 1544 eschrock if (err) { 623 1544 eschrock tx->tx_err = err; 624 1544 eschrock return; 625 1544 eschrock } 626 1544 eschrock } 627 1544 eschrock 628 2113 ahrens dmu_tx_count_free(txh, off, len); 629 789 ahrens } 630 789 ahrens 631 789 ahrens void 632 9396 Matthew dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) 633 789 ahrens { 634 2113 ahrens dmu_tx_hold_t *txh; 635 2113 ahrens dnode_t *dn; 636 2113 ahrens uint64_t nblocks; 637 2113 ahrens int epbs, err; 638 2113 ahrens 639 789 ahrens ASSERT(tx->tx_txg == 0); 640 789 ahrens 641 2113 ahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 642 2113 ahrens object, THT_ZAP, add, (uintptr_t)name); 643 2113 ahrens if (txh == NULL) 644 2113 ahrens return; 645 2113 ahrens dn = txh->txh_dnode; 646 789 ahrens 647 2113 ahrens dmu_tx_count_dnode(txh); 648 789 ahrens 649 789 ahrens if (dn == NULL) { 650 789 ahrens /* 651 1544 eschrock * We will be able to fit a new object's entries into one leaf 652 789 ahrens * block. So there will be at most 2 blocks total, 653 789 ahrens * including the header block. 
654 789 ahrens */ 655 2113 ahrens dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); 656 789 ahrens return; 657 789 ahrens } 658 789 ahrens 659 789 ahrens ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); 660 789 ahrens 661 1544 eschrock if (dn->dn_maxblkid == 0 && !add) { 662 789 ahrens /* 663 789 ahrens * If there is only one block (i.e. this is a micro-zap) 664 1544 eschrock * and we are not adding anything, the accounting is simple. 665 789 ahrens */ 666 1544 eschrock err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 667 1544 eschrock if (err) { 668 1544 eschrock tx->tx_err = err; 669 1544 eschrock return; 670 1544 eschrock } 671 1544 eschrock 672 3245 maybee /* 673 3245 maybee * Use max block size here, since we don't know how much 674 3245 maybee * the size will change between now and the dbuf dirty call. 675 3245 maybee */ 676 789 ahrens if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 677 5378 ck153898 dn->dn_phys->dn_blkptr[0].blk_birth)) { 678 3245 maybee txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; 679 5378 ck153898 } else { 680 3245 maybee txh->txh_space_towrite += SPA_MAXBLOCKSIZE; 681 5378 ck153898 } 682 8890 chris if (dn->dn_phys->dn_blkptr[0].blk_birth) 683 8890 chris txh->txh_space_tounref += SPA_MAXBLOCKSIZE; 684 789 ahrens return; 685 789 ahrens } 686 789 ahrens 687 1544 eschrock if (dn->dn_maxblkid > 0 && name) { 688 1544 eschrock /* 689 1544 eschrock * access the name in this fat-zap so that we'll check 690 1544 eschrock * for i/o errors to the leaf blocks, etc. 
691 1544 eschrock */ 692 10298 Matthew err = zap_lookup(dn->dn_objset, dn->dn_object, name, 693 1544 eschrock 8, 0, NULL); 694 1544 eschrock if (err == EIO) { 695 1544 eschrock tx->tx_err = err; 696 1544 eschrock return; 697 1544 eschrock } 698 1544 eschrock } 699 1544 eschrock 700 10298 Matthew err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, 701 9873 Sanjeev &txh->txh_space_towrite, &txh->txh_space_tooverwrite); 702 789 ahrens 703 789 ahrens /* 704 789 ahrens * If the modified blocks are scattered to the four winds, 705 789 ahrens * we'll have to modify an indirect twig for each. 706 789 ahrens */ 707 789 ahrens epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 708 789 ahrens for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) 709 9653 Sanjeev if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) 710 9653 Sanjeev txh->txh_space_towrite += 3 << dn->dn_indblkshift; 711 9653 Sanjeev else 712 9653 Sanjeev txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; 713 789 ahrens } 714 789 ahrens 715 789 ahrens void 716 789 ahrens dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) 717 789 ahrens { 718 2113 ahrens dmu_tx_hold_t *txh; 719 2113 ahrens 720 789 ahrens ASSERT(tx->tx_txg == 0); 721 789 ahrens 722 2113 ahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 723 2113 ahrens object, THT_BONUS, 0, 0); 724 2113 ahrens if (txh) 725 2113 ahrens dmu_tx_count_dnode(txh); 726 789 ahrens } 727 789 ahrens 728 789 ahrens void 729 789 ahrens dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) 730 789 ahrens { 731 2113 ahrens dmu_tx_hold_t *txh; 732 789 ahrens ASSERT(tx->tx_txg == 0); 733 789 ahrens 734 2113 ahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 735 2113 ahrens DMU_NEW_OBJECT, THT_SPACE, space, 0); 736 2113 ahrens 737 2113 ahrens txh->txh_space_towrite += space; 738 789 ahrens } 739 789 ahrens 740 789 ahrens int 741 789 ahrens dmu_tx_holds(dmu_tx_t *tx, uint64_t object) 742 789 ahrens { 743 2113 ahrens dmu_tx_hold_t *txh; 744 789 
ahrens int holds = 0; 745 789 ahrens 746 789 ahrens /* 747 789 ahrens * By asserting that the tx is assigned, we're counting the 748 789 ahrens * number of dn_tx_holds, which is the same as the number of 749 789 ahrens * dn_holds. Otherwise, we'd be counting dn_holds, but 750 789 ahrens * dn_tx_holds could be 0. 751 789 ahrens */ 752 789 ahrens ASSERT(tx->tx_txg != 0); 753 789 ahrens 754 789 ahrens /* if (tx->tx_anyobj == TRUE) */ 755 789 ahrens /* return (0); */ 756 789 ahrens 757 2113 ahrens for (txh = list_head(&tx->tx_holds); txh; 758 2113 ahrens txh = list_next(&tx->tx_holds, txh)) { 759 2113 ahrens if (txh->txh_dnode && txh->txh_dnode->dn_object == object) 760 789 ahrens holds++; 761 789 ahrens } 762 789 ahrens 763 789 ahrens return (holds); 764 789 ahrens } 765 789 ahrens

#ifdef ZFS_DEBUG
/*
 * Debug-only check that the dbuf being dirtied is covered by a hold
 * on this (already assigned) tx.
 *
 * Walks tx_holds looking for a hold that matches both the dbuf's dnode
 * ("match_object") and its block id ("match_offset").  Returns as soon
 * as both have been seen; panics if the list is exhausted first.
 * Nothing is checked when the tx has tx_anyobj set or when the dbuf
 * belongs to the meta dnode.
 */
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	dmu_tx_hold_t *txh;
	int match_object = FALSE, match_offset = FALSE;
	dnode_t *dn = db->db_dnode;

	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj)
		return;

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT)
		return;

	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			/*
			 * Convert the hold's byte range (txh_arg1 is used
			 * as the start, txh_arg2 as the length) into block
			 * ids at this dbuf's level.  A shift of 64 or more
			 * would be undefined, so treat it as block 0.
			 */
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			/* Cast so the arguments match %llx, as panic() does. */
			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, (u_longlong_t)beginblk,
			    (u_longlong_t)endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * buffer so that we don't need to hold it
				 * when creating a new object.
				 */
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				ASSERT(!"bad txh_type");
			}
		}
		if (match_object && match_offset)
			return;
	}
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif

/*
 * Make a single attempt to assign tx to the txg returned by
 * txg_hold_open().
 *
 * Returns:
 *	0		success
 *	tx->tx_err	if an error was already recorded on the tx
 *	EIO		pool suspended, failmode is "continue" and the
 *			caller did not ask for TXG_WAIT
 *	ERESTART	pool suspended (other cases), a held dnode is
 *			still assigned to the previous txg, or the caller
 *			asked for a specific txg other than the one obtained
 *	other		whatever dsl_dir_tempreserve_space() returned
 *
 * Once txg_hold_open() has been called, a failing return leaves
 * tx_txg set; the caller (dmu_tx_assign()) undoes that with
 * dmu_tx_unassign().
 */
static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	dmu_tx_hold_t *txh;
	spa_t *spa = tx->tx_pool->dp_spa;
	uint64_t memory, asize, fsize, usize;
	uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;

	ASSERT3U(tx->tx_txg, ==, 0);

	if (tx->tx_err)
		return (tx->tx_err);

	if (spa_suspended(spa)) {
		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    txg_how != TXG_WAIT)
			return (EIO);

		return (ERESTART);
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				/*
				 * Record where we stopped so that
				 * dmu_tx_unassign() only releases the
				 * holds taken before this one.
				 */
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				return (ERESTART);
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += txh->txh_space_towrite;
		tofree += txh->txh_space_tofree;
		tooverwrite += txh->txh_space_tooverwrite;
		tounref += txh->txh_space_tounref;
		tohold += txh->txh_memory_tohold;
		fudge += txh->txh_fudge;
	}

	/*
	 * NB: This check must be after we've held the dnodes, so that
	 * the dmu_tx_unassign() logic will work properly
	 */
	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
		return (ERESTART);

	/*
	 * If a snapshot has been taken since we made our estimates,
	 * assume that we won't be able to free or overwrite anything.
	 */
	if (tx->tx_objset &&
	    dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
	    tx->tx_lastsnap_txg) {
		towrite += tooverwrite;
		tooverwrite = tofree = 0;
	}

	/* needed allocation: worst-case estimate of write space */
	asize = spa_get_asize(spa, towrite + tooverwrite);
	/* freed space estimate: worst-case overwrite + free estimate */
	fsize = spa_get_asize(spa, tooverwrite) + tofree;
	/* convert unrefd space to worst-case estimate */
	usize = spa_get_asize(spa, tounref);
	/* calculate memory footprint estimate */
	memory = towrite + tooverwrite + tohold;

#ifdef ZFS_DEBUG
	/*
	 * Add in 'tohold' to account for our dirty holds on this memory
	 * XXX - the "fudge" factor is to account for skipped blocks that
	 * we missed because dnode_next_offset() misses in-core-only blocks.
	 */
	tx->tx_space_towrite = asize +
	    spa_get_asize(spa, tohold + fudge);
	tx->tx_space_tofree = tofree;
	tx->tx_space_tooverwrite = tooverwrite;
	tx->tx_space_tounref = tounref;
#endif

	if (tx->tx_dir && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
		if (err)
			return (err);
	}

	return (0);
}

/*
 * Undo a failed dmu_tx_try_assign().
 *
 * Does nothing if the tx never got a txg.  Otherwise drops the
 * dn_tx_holds reference on every held dnode up to, but not including,
 * tx_needassign_txh (NULL means the whole list); when refcount_remove()
 * reports zero the dnode's dn_assigned_txg is cleared and dn_notxholds
 * waiters are woken.  Finally remembers the txg in tx_lasttried_txg
 * and resets tx_txg to 0.
 */
static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}

/*
 * Assign tx to a transaction group.  txg_how can be one of:
 *
 * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
 *	a new one.  This should be used when you're not holding locks.
 *	It will only fail if we're truly out of space (or over quota).
 *
 * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
 *	blocking, returns immediately with ERESTART.  This should be used
 *	whenever you're holding locks.  On an ERESTART error, the caller
 *	should drop locks, do a dmu_tx_wait(tx), and try again.
 *
 * (3)	A specific txg.  Use this if you need to ensure that multiple
 *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
 *	returns ERESTART if it can't assign you into the requested txg.
 */
int
dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	int err;

	ASSERT(tx->tx_txg == 0);
	ASSERT(txg_how != 0);
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
		dmu_tx_unassign(tx);

		/* Only ERESTART under TXG_WAIT is retried here. */
		if (err != ERESTART || txg_how != TXG_WAIT)
			return (err);

		dmu_tx_wait(tx);
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}

/*
 * Block until retrying assignment of an unassigned tx is worthwhile.
 *
 * Three cases, checked in order:
 *  - the pool is suspended, or no assignment has been attempted yet
 *    (tx_lasttried_txg == 0): wait for the txg after the last synced
 *    one to sync;
 *  - the last attempt stopped at a particular hold
 *    (tx_needassign_txh): wait on that dnode's dn_notxholds until it is
 *    no longer assigned to the txg before the one we tried;
 *  - otherwise: wait for the txg after the one we tried to open.
 */
void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT(tx->tx_txg == 0);

	/*
	 * It's possible that the pool has become active after this thread
	 * has tried to obtain a tx.  If that's the case then its
	 * tx_lasttried_txg would not have been assigned.
	 */
	if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
		txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
	} else if (tx->tx_needassign_txh) {
		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

		mutex_enter(&dn->dn_mtx);
		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
		mutex_exit(&dn->dn_mtx);
		tx->tx_needassign_txh = NULL;
	} else {
		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
	}
}

/*
 * Debug-only space accounting; compiles to an empty function without
 * ZFS_DEBUG.
 *
 * A positive delta is added to tx_space_written after asserting that
 * the running total stays within tx_space_towrite; a negative delta is
 * added (as a positive amount) to tx_space_freed.  Nothing is recorded
 * when the tx has no tx_dir or delta is 0.
 */
void
dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
{
#ifdef ZFS_DEBUG
	if (tx->tx_dir == NULL || delta == 0)
		return;

	if (delta > 0) {
		ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
		    tx->tx_space_towrite);
		(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
	} else {
		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
	}
#endif
}

/*
 * Commit an assigned tx and free it.
 *
 * Every hold is unlinked and freed; for holds with a dnode the
 * dn_tx_holds reference is dropped (clearing dn_assigned_txg and waking
 * dn_notxholds waiters when refcount_remove() reports zero) and the
 * dnode is released.  Any temporary reservation is cleared, pending
 * callbacks are handed to txg_register_callbacks(), and the txg hold is
 * released to sync unless tx_anyobj is set.  The tx must not be used
 * after this returns.
 */
void
dmu_tx_commit(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg != 0);

	while ((txh = list_head(&tx->tx_holds)) != NULL) {
		/* Save the dnode: txh is freed before it is used below. */
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
		dnode_rele(dn, tx);
	}

	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	if (!list_is_empty(&tx->tx_callbacks))
		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);

	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);

	list_destroy(&tx->tx_callbacks);
	list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
	/* Cast so the arguments match %llu. */
	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
	    (u_longlong_t)tx->tx_space_towrite,
	    (u_longlong_t)refcount_count(&tx->tx_space_written),
	    (u_longlong_t)tx->tx_space_tofree,
	    (u_longlong_t)refcount_count(&tx->tx_space_freed));
	refcount_destroy_many(&tx->tx_space_written,
	    refcount_count(&tx->tx_space_written));
	refcount_destroy_many(&tx->tx_space_freed,
	    refcount_count(&tx->tx_space_freed));
#endif
	kmem_free(tx, sizeof (dmu_tx_t));
}

/*
 * Discard a tx that is not assigned to a txg and free it.
 *
 * Every hold is unlinked and freed and its dnode (if any) released.
 * Registered callbacks are invoked with ECANCELED.  The tx must not be
 * used after this returns.
 */
void
dmu_tx_abort(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	while ((txh = list_head(&tx->tx_holds)) != NULL) {
		/* Save the dnode: txh is freed before it is used below. */
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn != NULL)
			dnode_rele(dn, tx);
	}

	/*
	 * Call any registered callbacks with an error code.
	 */
	if (!list_is_empty(&tx->tx_callbacks))
		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);

	list_destroy(&tx->tx_callbacks);
	list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
	refcount_destroy_many(&tx->tx_space_written,
	    refcount_count(&tx->tx_space_written));
	refcount_destroy_many(&tx->tx_space_freed,
	    refcount_count(&tx->tx_space_freed));
#endif
	kmem_free(tx, sizeof (dmu_tx_t));
}

/*
 * Return the txg this tx is assigned to.  The tx must be assigned.
 */
uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}

/*
 * Append a callback (func, data) to the tx's callback list.
 *
 * The entry is allocated here and freed by dmu_tx_do_callbacks() after
 * the callback has run.
 */
void
dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
{
	dmu_tx_callback_t *dcb;

	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);

	dcb->dcb_func = func;
	dcb->dcb_data = data;

	list_insert_tail(&tx->tx_callbacks, dcb);
}

/*
 * Call all the commit callbacks on a list, with a given error code.
 *
 * Each entry is unlinked before its function is called and freed
 * afterwards, so the list is empty on return.
 */
void
dmu_tx_do_callbacks(list_t *cb_list, int error)
{
	dmu_tx_callback_t *dcb;

	while ((dcb = list_head(cb_list)) != NULL) {
		list_remove(cb_list, dcb);
		dcb->dcb_func(dcb->dcb_data, error);
		kmem_free(dcb, sizeof (dmu_tx_callback_t));
	}
}