OpenGrok

Cross Reference: dmu_tx.c
xref: /onnv/onnv-gate/usr/src/uts/common/fs/zfs/dmu_tx.c
Home | History | Annotate | Line # | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
     23  */
     24 
     25 #include <sys/dmu.h>
     26 #include <sys/dmu_impl.h>
     27 #include <sys/dbuf.h>
     28 #include <sys/dmu_tx.h>
     29 #include <sys/dmu_objset.h>
     30 #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
     31 #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
     32 #include <sys/dsl_pool.h>
     33 #include <sys/zap_impl.h> /* for fzap_default_block_shift */
     34 #include <sys/spa.h>
     35 #include <sys/sa.h>
     36 #include <sys/sa_impl.h>
     37 #include <sys/zfs_context.h>
     38 #include <sys/varargs.h>
     39 
     40 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     41     uint64_t arg1, uint64_t arg2);
     42 
     43 
     44 dmu_tx_t *
     45 dmu_tx_create_dd(dsl_dir_t *dd)
     46 {
     47 	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
     48 	tx->tx_dir = dd;
     49 	if (dd)
     50 		tx->tx_pool = dd->dd_pool;
     51 	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
     52 	    offsetof(dmu_tx_hold_t, txh_node));
     53 	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
     54 	    offsetof(dmu_tx_callback_t, dcb_node));
     55 #ifdef ZFS_DEBUG
     56 	refcount_create(&tx->tx_space_written);
     57 	refcount_create(&tx->tx_space_freed);
     58 #endif
     59 	return (tx);
     60 }
     61 
     62 dmu_tx_t *
     63 dmu_tx_create(objset_t *os)
     64 {
     65 	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
     66 	tx->tx_objset = os;
     67 	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
     68 	return (tx);
     69 }
     70 
     71 dmu_tx_t *
     72 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
     73 {
     74 	dmu_tx_t *tx = dmu_tx_create_dd(NULL);
     75 
     76 	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
     77 	tx->tx_pool = dp;
     78 	tx->tx_txg = txg;
     79 	tx->tx_anyobj = TRUE;
     80 
     81 	return (tx);
     82 }
     83 
     84 int
     85 dmu_tx_is_syncing(dmu_tx_t *tx)
     86 {
     87 	return (tx->tx_anyobj);
     88 }
     89 
     90 int
     91 dmu_tx_private_ok(dmu_tx_t *tx)
     92 {
     93 	return (tx->tx_anyobj);
     94 }
     95 
     96 static dmu_tx_hold_t *
     97 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
     98     enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
     99 {
    100 	dmu_tx_hold_t *txh;
    101 	dnode_t *dn = NULL;
    102 	int err;
    103 
    104 	if (object != DMU_NEW_OBJECT) {
    105 		err = dnode_hold(os, object, tx, &dn);
    106 		if (err) {
    107 			tx->tx_err = err;
    108 			return (NULL);
    109 		}
    110 
    111 		if (err == 0 && tx->tx_txg != 0) {
    112 			mutex_enter(&dn->dn_mtx);
    113 			/*
    114 			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
    115 			 * problem, but there's no way for it to happen (for
    116 			 * now, at least).
    117 			 */
    118 			ASSERT(dn->dn_assigned_txg == 0);
    119 			dn->dn_assigned_txg = tx->tx_txg;
    120 			(void) refcount_add(&dn->dn_tx_holds, tx);
    121 			mutex_exit(&dn->dn_mtx);
    122 		}
    123 	}
    124 
    125 	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
    126 	txh->txh_tx = tx;
    127 	txh->txh_dnode = dn;
    128 #ifdef ZFS_DEBUG
    129 	txh->txh_type = type;
    130 	txh->txh_arg1 = arg1;
    131 	txh->txh_arg2 = arg2;
    132 #endif
    133 	list_insert_tail(&tx->tx_holds, txh);
    134 
    135 	return (txh);
    136 }
    137 
    138 void
    139 dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
    140 {
    141 	/*
    142 	 * If we're syncing, they can manipulate any object anyhow, and
    143 	 * the hold on the dnode_t can cause problems.
    144 	 */
    145 	if (!dmu_tx_is_syncing(tx)) {
    146 		(void) dmu_tx_hold_object_impl(tx, os,
    147 		    object, THT_NEWOBJECT, 0, 0);
    148 	}
    149 }
    150 
    151 static int
    152 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
    153 {
    154 	int err;
    155 	dmu_buf_impl_t *db;
    156 
    157 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
    158 	db = dbuf_hold_level(dn, level, blkid, FTAG);
    159 	rw_exit(&dn->dn_struct_rwlock);
    160 	if (db == NULL)
    161 		return (EIO);
    162 	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
    163 	dbuf_rele(db, FTAG);
    164 	return (err);
    165 }
    166 
    167 static void
    168 dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
    169     int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
    170 {
    171 	objset_t *os = dn->dn_objset;
    172 	dsl_dataset_t *ds = os->os_dsl_dataset;
    173 	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
    174 	dmu_buf_impl_t *parent = NULL;
    175 	blkptr_t *bp = NULL;
    176 	uint64_t space;
    177 
    178 	if (level >= dn->dn_nlevels || history[level] == blkid)
    179 		return;
    180 
    181 	history[level] = blkid;
    182 
    183 	space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
    184 
    185 	if (db == NULL || db == dn->dn_dbuf) {
    186 		ASSERT(level != 0);
    187 		db = NULL;
    188 	} else {
    189 		ASSERT(DB_DNODE(db) == dn);
    190 		ASSERT(db->db_level == level);
    191 		ASSERT(db->db.db_size == space);
    192 		ASSERT(db->db_blkid == blkid);
    193 		bp = db->db_blkptr;
    194 		parent = db->db_parent;
    195 	}
    196 
    197 	freeable = (bp && (freeable ||
    198 	    dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
    199 
    200 	if (freeable)
    201 		txh->txh_space_tooverwrite += space;
    202 	else
    203 		txh->txh_space_towrite += space;
    204 	if (bp)
    205 		txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
    206 
    207 	dmu_tx_count_twig(txh, dn, parent, level + 1,
    208 	    blkid >> epbs, freeable, history);
    209 }
    210 
    211 /* ARGSUSED */
    212 static void
    213 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
    214 {
    215 	dnode_t *dn = txh->txh_dnode;
    216 	uint64_t start, end, i;
    217 	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
    218 	int err = 0;
    219 
    220 	if (len == 0)
    221 		return;
    222 
    223 	min_bs = SPA_MINBLOCKSHIFT;
    224 	max_bs = SPA_MAXBLOCKSHIFT;
    225 	min_ibs = DN_MIN_INDBLKSHIFT;
    226 	max_ibs = DN_MAX_INDBLKSHIFT;
    227 
    228 	if (dn) {
    229 		uint64_t history[DN_MAX_LEVELS];
    230 		int nlvls = dn->dn_nlevels;
    231 		int delta;
    232 
    233 		/*
    234 		 * For i/o error checking, read the first and last level-0
    235 		 * blocks (if they are not aligned), and all the level-1 blocks.
    236 		 */
    237 		if (dn->dn_maxblkid == 0) {
    238 			delta = dn->dn_datablksz;
    239 			start = (off < dn->dn_datablksz) ? 0 : 1;
    240 			end = (off+len <= dn->dn_datablksz) ? 0 : 1;
    241 			if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
    242 				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
    243 				if (err)
    244 					goto out;
    245 				delta -= off;
    246 			}
    247 		} else {
    248 			zio_t *zio = zio_root(dn->dn_objset->os_spa,
    249 			    NULL, NULL, ZIO_FLAG_CANFAIL);
    250 
    251 			/* first level-0 block */
    252 			start = off >> dn->dn_datablkshift;
    253 			if (P2PHASE(off, dn->dn_datablksz) ||
    254 			    len < dn->dn_datablksz) {
    255 				err = dmu_tx_check_ioerr(zio, dn, 0, start);
    256 				if (err)
    257 					goto out;
    258 			}
    259 
    260 			/* last level-0 block */
    261 			end = (off+len-1) >> dn->dn_datablkshift;
    262 			if (end != start && end <= dn->dn_maxblkid &&
    263 			    P2PHASE(off+len, dn->dn_datablksz)) {
    264 				err = dmu_tx_check_ioerr(zio, dn, 0, end);
    265 				if (err)
    266 					goto out;
    267 			}
    268 
    269 			/* level-1 blocks */
    270 			if (nlvls > 1) {
    271 				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
    272 				for (i = (start>>shft)+1; i < end>>shft; i++) {
    273 					err = dmu_tx_check_ioerr(zio, dn, 1, i);
    274 					if (err)
    275 						goto out;
    276 				}
    277 			}
    278 
    279 			err = zio_wait(zio);
    280 			if (err)
    281 				goto out;
    282 			delta = P2NPHASE(off, dn->dn_datablksz);
    283 		}
    284 
    285 		if (dn->dn_maxblkid > 0) {
    286 			/*
    287 			 * The blocksize can't change,
    288 			 * so we can make a more precise estimate.
    289 			 */
    290 			ASSERT(dn->dn_datablkshift != 0);
    291 			min_bs = max_bs = dn->dn_datablkshift;
    292 			min_ibs = max_ibs = dn->dn_indblkshift;
    293 		} else if (dn->dn_indblkshift > max_ibs) {
    294 			/*
    295 			 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
    296 			 * the code will still work correctly on older pools.
    297 			 */
    298 			min_ibs = max_ibs = dn->dn_indblkshift;
    299 		}
    300 
    301 		/*
    302 		 * If this write is not off the end of the file
    303 		 * we need to account for overwrites/unref.
    304 		 */
    305 		if (start <= dn->dn_maxblkid) {
    306 			for (int l = 0; l < DN_MAX_LEVELS; l++)
    307 				history[l] = -1ULL;
    308 		}
    309 		while (start <= dn->dn_maxblkid) {
    310 			dmu_buf_impl_t *db;
    311 
    312 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
    313 			err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
    314 			rw_exit(&dn->dn_struct_rwlock);
    315 
    316 			if (err) {
    317 				txh->txh_tx->tx_err = err;
    318 				return;
    319 			}
    320 
    321 			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
    322 			    history);
    323 			dbuf_rele(db, FTAG);
    324 			if (++start > end) {
    325 				/*
    326 				 * Account for new indirects appearing
    327 				 * before this IO gets assigned into a txg.
    328 				 */
    329 				bits = 64 - min_bs;
    330 				epbs = min_ibs - SPA_BLKPTRSHIFT;
    331 				for (bits -= epbs * (nlvls - 1);
    332 				    bits >= 0; bits -= epbs)
    333 					txh->txh_fudge += 1ULL << max_ibs;
    334 				goto out;
    335 			}
    336 			off += delta;
    337 			if (len >= delta)
    338 				len -= delta;
    339 			delta = dn->dn_datablksz;
    340 		}
    341 	}
    342 
    343 	/*
    344 	 * 'end' is the last thing we will access, not one past.
    345 	 * This way we won't overflow when accessing the last byte.
    346 	 */
    347 	start = P2ALIGN(off, 1ULL << max_bs);
    348 	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
    349 	txh->txh_space_towrite += end - start + 1;
    350 
    351 	start >>= min_bs;
    352 	end >>= min_bs;
    353 
    354 	epbs = min_ibs - SPA_BLKPTRSHIFT;
    355 
    356 	/*
    357 	 * The object contains at most 2^(64 - min_bs) blocks,
    358 	 * and each indirect level maps 2^epbs.
    359 	 */
    360 	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
    361 		start >>= epbs;
    362 		end >>= epbs;
    363 		ASSERT3U(end, >=, start);
    364 		txh->txh_space_towrite += (end - start + 1) << max_ibs;
    365 		if (start != 0) {
    366 			/*
    367 			 * We also need a new blkid=0 indirect block
    368 			 * to reference any existing file data.
    369 			 */
    370 			txh->txh_space_towrite += 1ULL << max_ibs;
    371 		}
    372 	}
    373 
    374 out:
    375 	if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
    376 	    2 * DMU_MAX_ACCESS)
    377 		err = EFBIG;
    378 
    379 	if (err)
    380 		txh->txh_tx->tx_err = err;
    381 }
    382 
    383 static void
    384 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
    385 {
    386 	dnode_t *dn = txh->txh_dnode;
    387 	dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
    388 	uint64_t space = mdn->dn_datablksz +
    389 	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
    390 
    391 	if (dn && dn->dn_dbuf->db_blkptr &&
    392 	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
    393 	    dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
    394 		txh->txh_space_tooverwrite += space;
    395 		txh->txh_space_tounref += space;
    396 	} else {
    397 		txh->txh_space_towrite += space;
    398 		if (dn && dn->dn_dbuf->db_blkptr)
    399 			txh->txh_space_tounref += space;
    400 	}
    401 }
    402 
    403 void
    404 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
    405 {
    406 	dmu_tx_hold_t *txh;
    407 
    408 	ASSERT(tx->tx_txg == 0);
    409 	ASSERT(len < DMU_MAX_ACCESS);
    410 	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
    411 
    412 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
    413 	    object, THT_WRITE, off, len);
    414 	if (txh == NULL)
    415 		return;
    416 
    417 	dmu_tx_count_write(txh, off, len);
    418 	dmu_tx_count_dnode(txh);
    419 }
    420 
    421 static void
    422 dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
    423 {
    424 	uint64_t blkid, nblks, lastblk;
    425 	uint64_t space = 0, unref = 0, skipped = 0;
    426 	dnode_t *dn = txh->txh_dnode;
    427 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
    428 	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
    429 	int epbs;
    430 
    431 	if (dn->dn_nlevels == 0)
    432 		return;
    433 
    434 	/*
    435 	 * The struct_rwlock protects us against dn_nlevels
    436 	 * changing, in case (against all odds) we manage to dirty &
    437 	 * sync out the changes after we check for being dirty.
    438 	 * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
    439 	 */
    440 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
    441 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
    442 	if (dn->dn_maxblkid == 0) {
    443 		if (off == 0 && len >= dn->dn_datablksz) {
    444 			blkid = 0;
    445 			nblks = 1;
    446 		} else {
    447 			rw_exit(&dn->dn_struct_rwlock);
    448 			return;
    449 		}
    450 	} else {
    451 		blkid = off >> dn->dn_datablkshift;
    452 		nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
    453 
    454 		if (blkid >= dn->dn_maxblkid) {
    455 			rw_exit(&dn->dn_struct_rwlock);
    456 			return;
    457 		}
    458 		if (blkid + nblks > dn->dn_maxblkid)
    459 			nblks = dn->dn_maxblkid - blkid;
    460 
    461 	}
    462 	if (dn->dn_nlevels == 1) {
    463 		int i;
    464 		for (i = 0; i < nblks; i++) {
    465 			blkptr_t *bp = dn->dn_phys->dn_blkptr;
    466 			ASSERT3U(blkid + i, <, dn->dn_nblkptr);
    467 			bp += blkid + i;
    468 			if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
    469 				dprintf_bp(bp, "can free old%s", "");
    470 				space += bp_get_dsize(spa, bp);
    471 			}
    472 			unref += BP_GET_ASIZE(bp);
    473 		}
    474 		nblks = 0;
    475 	}
    476 
    477 	/*
    478 	 * Add in memory requirements of higher-level indirects.
    479 	 * This assumes a worst-possible scenario for dn_nlevels.
    480 	 */
    481 	{
    482 		uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
    483 		int level = (dn->dn_nlevels > 1) ? 2 : 1;
    484 
    485 		while (level++ < DN_MAX_LEVELS) {
    486 			txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
    487 			blkcnt = 1 + (blkcnt >> epbs);
    488 		}
    489 		ASSERT(blkcnt <= dn->dn_nblkptr);
    490 	}
    491 
    492 	lastblk = blkid + nblks - 1;
    493 	while (nblks) {
    494 		dmu_buf_impl_t *dbuf;
    495 		uint64_t ibyte, new_blkid;
    496 		int epb = 1 << epbs;
    497 		int err, i, blkoff, tochk;
    498 		blkptr_t *bp;
    499 
    500 		ibyte = blkid << dn->dn_datablkshift;
    501 		err = dnode_next_offset(dn,
    502 		    DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
    503 		new_blkid = ibyte >> dn->dn_datablkshift;
    504 		if (err == ESRCH) {
    505 			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
    506 			break;
    507 		}
    508 		if (err) {
    509 			txh->txh_tx->tx_err = err;
    510 			break;
    511 		}
    512 		if (new_blkid > lastblk) {
    513 			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
    514 			break;
    515 		}
    516 
    517 		if (new_blkid > blkid) {
    518 			ASSERT((new_blkid >> epbs) > (blkid >> epbs));
    519 			skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
    520 			nblks -= new_blkid - blkid;
    521 			blkid = new_blkid;
    522 		}
    523 		blkoff = P2PHASE(blkid, epb);
    524 		tochk = MIN(epb - blkoff, nblks);
    525 
    526 		err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
    527 		if (err) {
    528 			txh->txh_tx->tx_err = err;
    529 			break;
    530 		}
    531 
    532 		txh->txh_memory_tohold += dbuf->db.db_size;
    533 
    534 		/*
    535 		 * We don't check memory_tohold against DMU_MAX_ACCESS because
    536 		 * memory_tohold is an over-estimation (especially the >L1
    537 		 * indirect blocks), so it could fail.  Callers should have
    538 		 * already verified that they will not be holding too much
    539 		 * memory.
    540 		 */
    541 
    542 		err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
    543 		if (err != 0) {
    544 			txh->txh_tx->tx_err = err;
    545 			dbuf_rele(dbuf, FTAG);
    546 			break;
    547 		}
    548 
    549 		bp = dbuf->db.db_data;
    550 		bp += blkoff;
    551 
    552 		for (i = 0; i < tochk; i++) {
    553 			if (dsl_dataset_block_freeable(ds, &bp[i],
    554 			    bp[i].blk_birth)) {
    555 				dprintf_bp(&bp[i], "can free old%s", "");
    556 				space += bp_get_dsize(spa, &bp[i]);
    557 			}
    558 			unref += BP_GET_ASIZE(bp);
    559 		}
    560 		dbuf_rele(dbuf, FTAG);
    561 
    562 		blkid += tochk;
    563 		nblks -= tochk;
    564 	}
    565 	rw_exit(&dn->dn_struct_rwlock);
    566 
    567 	/* account for new level 1 indirect blocks that might show up */
    568 	if (skipped > 0) {
    569 		txh->txh_fudge += skipped << dn->dn_indblkshift;
    570 		skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
    571 		txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
    572 	}
    573 	txh->txh_space_tofree += space;
    574 	txh->txh_space_tounref += unref;
    575 }
    576 
    577 void
    578 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
    579 {
    580 	dmu_tx_hold_t *txh;
    581 	dnode_t *dn;
    582 	uint64_t start, end, i;
    583 	int err, shift;
    584 	zio_t *zio;
    585 
    586 	ASSERT(tx->tx_txg == 0);
    587 
    588 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
    589 	    object, THT_FREE, off, len);
    590 	if (txh == NULL)
    591 		return;
    592 	dn = txh->txh_dnode;
    593 
    594 	/* first block */
    595 	if (off != 0)
    596 		dmu_tx_count_write(txh, off, 1);
    597 	/* last block */
    598 	if (len != DMU_OBJECT_END)
    599 		dmu_tx_count_write(txh, off+len, 1);
    600 
    601 	dmu_tx_count_dnode(txh);
    602 
    603 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
    604 		return;
    605 	if (len == DMU_OBJECT_END)
    606 		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
    607 
    608 	/*
    609 	 * For i/o error checking, read the first and last level-0
    610 	 * blocks, and all the level-1 blocks.  The above count_write's
    611 	 * have already taken care of the level-0 blocks.
    612 	 */
    613 	if (dn->dn_nlevels > 1) {
    614 		shift = dn->dn_datablkshift + dn->dn_indblkshift -
    615 		    SPA_BLKPTRSHIFT;
    616 		start = off >> shift;
    617 		end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
    618 
    619 		zio = zio_root(tx->tx_pool->dp_spa,
    620 		    NULL, NULL, ZIO_FLAG_CANFAIL);
    621 		for (i = start; i <= end; i++) {
    622 			uint64_t ibyte = i << shift;
    623 			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
    624 			i = ibyte >> shift;
    625 			if (err == ESRCH)
    626 				break;
    627 			if (err) {
    628 				tx->tx_err = err;
    629 				return;
    630 			}
    631 
    632 			err = dmu_tx_check_ioerr(zio, dn, 1, i);
    633 			if (err) {
    634 				tx->tx_err = err;
    635 				return;
    636 			}
    637 		}
    638 		err = zio_wait(zio);
    639 		if (err) {
    640 			tx->tx_err = err;
    641 			return;
    642 		}
    643 	}
    644 
    645 	dmu_tx_count_free(txh, off, len);
    646 }
    647 
    648 void
    649 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
    650 {
    651 	dmu_tx_hold_t *txh;
    652 	dnode_t *dn;
    653 	uint64_t nblocks;
    654 	int epbs, err;
    655 
    656 	ASSERT(tx->tx_txg == 0);
    657 
    658 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
    659 	    object, THT_ZAP, add, (uintptr_t)name);
    660 	if (txh == NULL)
    661 		return;
    662 	dn = txh->txh_dnode;
    663 
    664 	dmu_tx_count_dnode(txh);
    665 
    666 	if (dn == NULL) {
    667 		/*
    668 		 * We will be able to fit a new object's entries into one leaf
    669 		 * block.  So there will be at most 2 blocks total,
    670 		 * including the header block.
    671 		 */
    672 		dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
    673 		return;
    674 	}
    675 
    676 	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
    677 
    678 	if (dn->dn_maxblkid == 0 && !add) {
    679 		/*
    680 		 * If there is only one block  (i.e. this is a micro-zap)
    681 		 * and we are not adding anything, the accounting is simple.
    682 		 */
    683 		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
    684 		if (err) {
    685 			tx->tx_err = err;
    686 			return;
    687 		}
    688 
    689 		/*
    690 		 * Use max block size here, since we don't know how much
    691 		 * the size will change between now and the dbuf dirty call.
    692 		 */
    693 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
    694 		    &dn->dn_phys->dn_blkptr[0],
    695 		    dn->dn_phys->dn_blkptr[0].blk_birth)) {
    696 			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
    697 		} else {
    698 			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
    699 		}
    700 		if (dn->dn_phys->dn_blkptr[0].blk_birth)
    701 			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
    702 		return;
    703 	}
    704 
    705 	if (dn->dn_maxblkid > 0 && name) {
    706 		/*
    707 		 * access the name in this fat-zap so that we'll check
    708 		 * for i/o errors to the leaf blocks, etc.
    709 		 */
    710 		err = zap_lookup(dn->dn_objset, dn->dn_object, name,
    711 		    8, 0, NULL);
    712 		if (err == EIO) {
    713 			tx->tx_err = err;
    714 			return;
    715 		}
    716 	}
    717 
    718 	err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
    719 	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
    720 
    721 	/*
    722 	 * If the modified blocks are scattered to the four winds,
    723 	 * we'll have to modify an indirect twig for each.
    724 	 */
    725 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
    726 	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
    727 		if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
    728 			txh->txh_space_towrite += 3 << dn->dn_indblkshift;
    729 		else
    730 			txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
    731 }
    732 
    733 void
    734 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
    735 {
    736 	dmu_tx_hold_t *txh;
    737 
    738 	ASSERT(tx->tx_txg == 0);
    739 
    740 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
    741 	    object, THT_BONUS, 0, 0);
    742 	if (txh)
    743 		dmu_tx_count_dnode(txh);
    744 }
    745 
    746 void
    747 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
    748 {
    749 	dmu_tx_hold_t *txh;
    750 	ASSERT(tx->tx_txg == 0);
    751 
    752 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
    753 	    DMU_NEW_OBJECT, THT_SPACE, space, 0);
    754 
    755 	txh->txh_space_towrite += space;
    756 }
    757 
    758 int
    759 dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
    760 {
    761 	dmu_tx_hold_t *txh;
    762 	int holds = 0;
    763 
    764 	/*
    765 	 * By asserting that the tx is assigned, we're counting the
    766 	 * number of dn_tx_holds, which is the same as the number of
    767 	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
    768 	 * dn_tx_holds could be 0.
    769 	 */
    770 	ASSERT(tx->tx_txg != 0);
    771 
    772 	/* if (tx->tx_anyobj == TRUE) */
    773 		/* return (0); */
    774 
    775 	for (txh = list_head(&tx->tx_holds); txh;
    776 	    txh = list_next(&tx->tx_holds, txh)) {
    777 		if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
    778 			holds++;
    779 	}
    780 
    781 	return (holds);
    782 }
    783 
    784 #ifdef ZFS_DEBUG
    785 void
    786 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
    787 {
    788 	dmu_tx_hold_t *txh;
    789 	int match_object = FALSE, match_offset = FALSE;
    790 	dnode_t *dn;
    791 
    792 	DB_DNODE_ENTER(db);
    793 	dn = DB_DNODE(db);
    794 	ASSERT(tx->tx_txg != 0);
    795 	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
    796 	ASSERT3U(dn->dn_object, ==, db->db.db_object);
    797 
    798 	if (tx->tx_anyobj) {
    799 		DB_DNODE_EXIT(db);
    800 		return;
    801 	}
    802 
    803 	/* XXX No checking on the meta dnode for now */
    804 	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
    805 		DB_DNODE_EXIT(db);
    806 		return;
    807 	}
    808 
    809 	for (txh = list_head(&tx->tx_holds); txh;
    810 	    txh = list_next(&tx->tx_holds, txh)) {
    811 		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
    812 		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
    813 			match_object = TRUE;
    814 		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
    815 			int datablkshift = dn->dn_datablkshift ?
    816 			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
    817 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
    818 			int shift = datablkshift + epbs * db->db_level;
    819 			uint64_t beginblk = shift >= 64 ? 0 :
    820 			    (txh->txh_arg1 >> shift);
    821 			uint64_t endblk = shift >= 64 ? 0 :
    822 			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
    823 			uint64_t blkid = db->db_blkid;
    824 
    825 			/* XXX txh_arg2 better not be zero... */
    826 
    827 			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
    828 			    txh->txh_type, beginblk, endblk);
    829 
    830 			switch (txh->txh_type) {
    831 			case THT_WRITE:
    832 				if (blkid >= beginblk && blkid <= endblk)
    833 					match_offset = TRUE;
    834 				/*
    835 				 * We will let this hold work for the bonus
    836 				 * or spill buffer so that we don't need to
    837 				 * hold it when creating a new object.
    838 				 */
    839 				if (blkid == DMU_BONUS_BLKID ||
    840 				    blkid == DMU_SPILL_BLKID)
    841 					match_offset = TRUE;
    842 				/*
    843 				 * They might have to increase nlevels,
    844 				 * thus dirtying the new TLIBs.  Or the
    845 				 * might have to change the block size,
    846 				 * thus dirying the new lvl=0 blk=0.
    847 				 */
    848 				if (blkid == 0)
    849 					match_offset = TRUE;
    850 				break;
    851 			case THT_FREE:
    852 				/*
    853 				 * We will dirty all the level 1 blocks in
    854 				 * the free range and perhaps the first and
    855 				 * last level 0 block.
    856 				 */
    857 				if (blkid >= beginblk && (blkid <= endblk ||
    858 				    txh->txh_arg2 == DMU_OBJECT_END))
    859 					match_offset = TRUE;
    860 				break;
    861 			case THT_SPILL:
    862 				if (blkid == DMU_SPILL_BLKID)
    863 					match_offset = TRUE;
    864 				break;
    865 			case THT_BONUS:
    866 				if (blkid == DMU_BONUS_BLKID)
    867 					match_offset = TRUE;
    868 				break;
    869 			case THT_ZAP:
    870 				match_offset = TRUE;
    871 				break;
    872 			case THT_NEWOBJECT:
    873 				match_object = TRUE;
    874 				break;
    875 			default:
    876 				ASSERT(!"bad txh_type");
    877 			}
    878 		}
    879 		if (match_object && match_offset) {
    880 			DB_DNODE_EXIT(db);
    881 			return;
    882 		}
    883 	}
    884 	DB_DNODE_EXIT(db);
    885 	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
    886 	    (u_longlong_t)db->db.db_object, db->db_level,
    887 	    (u_longlong_t)db->db_blkid);
    888 }
    889 #endif
    890 
    891 static int
    892 dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
    893 {
    894 	dmu_tx_hold_t *txh;
    895 	spa_t *spa = tx->tx_pool->dp_spa;
    896 	uint64_t memory, asize, fsize, usize;
    897 	uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
    898 
    899 	ASSERT3U(tx->tx_txg, ==, 0);
    900 
    901 	if (tx->tx_err)
    902 		return (tx->tx_err);
    903 
    904 	if (spa_suspended(spa)) {
    905 		/*
    906 		 * If the user has indicated a blocking failure mode
    907 		 * then return ERESTART which will block in dmu_tx_wait().
    908 		 * Otherwise, return EIO so that an error can get
    909 		 * propagated back to the VOP calls.
    910 		 *
    911 		 * Note that we always honor the txg_how flag regardless
    912 		 * of the failuremode setting.
    913 		 */
    914 		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
    915 		    txg_how != TXG_WAIT)
    916 			return (EIO);
    917 
    918 		return (ERESTART);
    919 	}
    920 
    921 	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
    922 	tx->tx_needassign_txh = NULL;
    923 
    924 	/*
    925 	 * NB: No error returns are allowed after txg_hold_open, but
    926 	 * before processing the dnode holds, due to the
    927 	 * dmu_tx_unassign() logic.
    928 	 */
    929 
    930 	towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
    931 	for (txh = list_head(&tx->tx_holds); txh;
    932 	    txh = list_next(&tx->tx_holds, txh)) {
    933 		dnode_t *dn = txh->txh_dnode;
    934 		if (dn != NULL) {
    935 			mutex_enter(&dn->dn_mtx);
    936 			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
    937 				mutex_exit(&dn->dn_mtx);
    938 				tx->tx_needassign_txh = txh;
    939 				return (ERESTART);
    940 			}
    941 			if (dn->dn_assigned_txg == 0)
    942 				dn->dn_assigned_txg = tx->tx_txg;
    943 			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
    944 			(void) refcount_add(&dn->dn_tx_holds, tx);
    945 			mutex_exit(&dn->dn_mtx);
    946 		}
    947 		towrite += txh->txh_space_towrite;
    948 		tofree += txh->txh_space_tofree;
    949 		tooverwrite += txh->txh_space_tooverwrite;
    950 		tounref += txh->txh_space_tounref;
    951 		tohold += txh->txh_memory_tohold;
    952 		fudge += txh->txh_fudge;
    953 	}
    954 
    955 	/*
    956 	 * NB: This check must be after we've held the dnodes, so that
    957 	 * the dmu_tx_unassign() logic will work properly
    958 	 */
    959 	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
    960 		return (ERESTART);
    961 
    962 	/*
    963 	 * If a snapshot has been taken since we made our estimates,
    964 	 * assume that we won't be able to free or overwrite anything.
    965 	 */
    966 	if (tx->tx_objset &&
    967 	    dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
    968 	    tx->tx_lastsnap_txg) {
    969 		towrite += tooverwrite;
    970 		tooverwrite = tofree = 0;
    971 	}
    972 
    973 	/* needed allocation: worst-case estimate of write space */
    974 	asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
    975 	/* freed space estimate: worst-case overwrite + free estimate */
    976 	fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
    977 	/* convert unrefd space to worst-case estimate */
    978 	usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
    979 	/* calculate memory footprint estimate */
    980 	memory = towrite + tooverwrite + tohold;
    981 
    982 #ifdef ZFS_DEBUG
    983 	/*
    984 	 * Add in 'tohold' to account for our dirty holds on this memory
    985 	 * XXX - the "fudge" factor is to account for skipped blocks that
    986 	 * we missed because dnode_next_offset() misses in-core-only blocks.
    987 	 */
    988 	tx->tx_space_towrite = asize +
    989 	    spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
    990 	tx->tx_space_tofree = tofree;
    991 	tx->tx_space_tooverwrite = tooverwrite;
    992 	tx->tx_space_tounref = tounref;
    993 #endif
    994 
    995 	if (tx->tx_dir && asize != 0) {
    996 		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
    997 		    asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
    998 		if (err)
    999 			return (err);
   1000 	}
   1001 
   1002 	return (0);
   1003 }
   1004 
   1005 static void
   1006 dmu_tx_unassign(dmu_tx_t *tx)
   1007 {
   1008 	dmu_tx_hold_t *txh;
   1009 
   1010 	if (tx->tx_txg == 0)
   1011 		return;
   1012 
   1013 	txg_rele_to_quiesce(&tx->tx_txgh);
   1014 
   1015 	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
   1016 	    txh = list_next(&tx->tx_holds, txh)) {
   1017 		dnode_t *dn = txh->txh_dnode;
   1018 
   1019 		if (dn == NULL)
   1020 			continue;
   1021 		mutex_enter(&dn->dn_mtx);
   1022 		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
   1023 
   1024 		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
   1025 			dn->dn_assigned_txg = 0;
   1026 			cv_broadcast(&dn->dn_notxholds);
   1027 		}
   1028 		mutex_exit(&dn->dn_mtx);
   1029 	}
   1030 
   1031 	txg_rele_to_sync(&tx->tx_txgh);
   1032 
   1033 	tx->tx_lasttried_txg = tx->tx_txg;
   1034 	tx->tx_txg = 0;
   1035 }
   1036 
   1037 /*
   1038  * Assign tx to a transaction group.  txg_how can be one of:
   1039  *
   1040  * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
   1041  *	a new one.  This should be used when you're not holding locks.
   1042  *	If will only fail if we're truly out of space (or over quota).
   1043  *
   1044  * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
   1045  *	blocking, returns immediately with ERESTART.  This should be used
   1046  *	whenever you're holding locks.  On an ERESTART error, the caller
   1047  *	should drop locks, do a dmu_tx_wait(tx), and try again.
   1048  *
   1049  * (3)	A specific txg.  Use this if you need to ensure that multiple
   1050  *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
   1051  *	returns ERESTART if it can't assign you into the requested txg.
   1052  */
   1053 int
   1054 dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
   1055 {
   1056 	int err;
   1057 
   1058 	ASSERT(tx->tx_txg == 0);
   1059 	ASSERT(txg_how != 0);
   1060 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
   1061 
   1062 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
   1063 		dmu_tx_unassign(tx);
   1064 
   1065 		if (err != ERESTART || txg_how != TXG_WAIT)
   1066 			return (err);
   1067 
   1068 		dmu_tx_wait(tx);
   1069 	}
   1070 
   1071 	txg_rele_to_quiesce(&tx->tx_txgh);
   1072 
   1073 	return (0);
   1074 }
   1075 
   1076 void
   1077 dmu_tx_wait(dmu_tx_t *tx)
   1078 {
   1079 	spa_t *spa = tx->tx_pool->dp_spa;
   1080 
   1081 	ASSERT(tx->tx_txg == 0);
   1082 
   1083 	/*
   1084 	 * It's possible that the pool has become active after this thread
   1085 	 * has tried to obtain a tx. If that's the case then his
   1086 	 * tx_lasttried_txg would not have been assigned.
   1087 	 */
   1088 	if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
   1089 		txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
   1090 	} else if (tx->tx_needassign_txh) {
   1091 		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
   1092 
   1093 		mutex_enter(&dn->dn_mtx);
   1094 		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
   1095 			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
   1096 		mutex_exit(&dn->dn_mtx);
   1097 		tx->tx_needassign_txh = NULL;
   1098 	} else {
   1099 		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
   1100 	}
   1101 }
   1102 
   1103 void
   1104 dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
   1105 {
   1106 #ifdef ZFS_DEBUG
   1107 	if (tx->tx_dir == NULL || delta == 0)
   1108 		return;
   1109 
   1110 	if (delta > 0) {
   1111 		ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
   1112 		    tx->tx_space_towrite);
   1113 		(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
   1114 	} else {
   1115 		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
   1116 	}
   1117 #endif
   1118 }
   1119 
   1120 void
   1121 dmu_tx_commit(dmu_tx_t *tx)
   1122 {
   1123 	dmu_tx_hold_t *txh;
   1124 
   1125 	ASSERT(tx->tx_txg != 0);
   1126 
   1127 	while (txh = list_head(&tx->tx_holds)) {
   1128 		dnode_t *dn = txh->txh_dnode;
   1129 
   1130 		list_remove(&tx->tx_holds, txh);
   1131 		kmem_free(txh, sizeof (dmu_tx_hold_t));
   1132 		if (dn == NULL)
   1133 			continue;
   1134 		mutex_enter(&dn->dn_mtx);
   1135 		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
   1136 
   1137 		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
   1138 			dn->dn_assigned_txg = 0;
   1139 			cv_broadcast(&dn->dn_notxholds);
   1140 		}
   1141 		mutex_exit(&dn->dn_mtx);
   1142 		dnode_rele(dn, tx);
   1143 	}
   1144 
   1145 	if (tx->tx_tempreserve_cookie)
   1146 		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
   1147 
   1148 	if (!list_is_empty(&tx->tx_callbacks))
   1149 		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
   1150 
   1151 	if (tx->tx_anyobj == FALSE)
   1152 		txg_rele_to_sync(&tx->tx_txgh);
   1153 
   1154 	list_destroy(&tx->tx_callbacks);
   1155 	list_destroy(&tx->tx_holds);
   1156 #ifdef ZFS_DEBUG
   1157 	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
   1158 	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
   1159 	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
   1160 	refcount_destroy_many(&tx->tx_space_written,
   1161 	    refcount_count(&tx->tx_space_written));
   1162 	refcount_destroy_many(&tx->tx_space_freed,
   1163 	    refcount_count(&tx->tx_space_freed));
   1164 #endif
   1165 	kmem_free(tx, sizeof (dmu_tx_t));
   1166 }
   1167 
   1168 void
   1169 dmu_tx_abort(dmu_tx_t *tx)
   1170 {
   1171 	dmu_tx_hold_t *txh;
   1172 
   1173 	ASSERT(tx->tx_txg == 0);
   1174 
   1175 	while (txh = list_head(&tx->tx_holds)) {
   1176 		dnode_t *dn = txh->txh_dnode;
   1177 
   1178 		list_remove(&tx->tx_holds, txh);
   1179 		kmem_free(txh, sizeof (dmu_tx_hold_t));
   1180 		if (dn != NULL)
   1181 			dnode_rele(dn, tx);
   1182 	}
   1183 
   1184 	/*
   1185 	 * Call any registered callbacks with an error code.
   1186 	 */
   1187 	if (!list_is_empty(&tx->tx_callbacks))
   1188 		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
   1189 
   1190 	list_destroy(&tx->tx_callbacks);
   1191 	list_destroy(&tx->tx_holds);
   1192 #ifdef ZFS_DEBUG
   1193 	refcount_destroy_many(&tx->tx_space_written,
   1194 	    refcount_count(&tx->tx_space_written));
   1195 	refcount_destroy_many(&tx->tx_space_freed,
   1196 	    refcount_count(&tx->tx_space_freed));
   1197 #endif
   1198 	kmem_free(tx, sizeof (dmu_tx_t));
   1199 }
   1200 
   1201 uint64_t
   1202 dmu_tx_get_txg(dmu_tx_t *tx)
   1203 {
   1204 	ASSERT(tx->tx_txg != 0);
   1205 	return (tx->tx_txg);
   1206 }
   1207 
   1208 void
   1209 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
   1210 {
   1211 	dmu_tx_callback_t *dcb;
   1212 
   1213 	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
   1214 
   1215 	dcb->dcb_func = func;
   1216 	dcb->dcb_data = data;
   1217 
   1218 	list_insert_tail(&tx->tx_callbacks, dcb);
   1219 }
   1220 
   1221 /*
   1222  * Call all the commit callbacks on a list, with a given error code.
   1223  */
   1224 void
   1225 dmu_tx_do_callbacks(list_t *cb_list, int error)
   1226 {
   1227 	dmu_tx_callback_t *dcb;
   1228 
   1229 	while (dcb = list_head(cb_list)) {
   1230 		list_remove(cb_list, dcb);
   1231 		dcb->dcb_func(dcb->dcb_data, error);
   1232 		kmem_free(dcb, sizeof (dmu_tx_callback_t));
   1233 	}
   1234 }
   1235 
/*
 * dmu_tx_hold_sa_create() is the interface for holding a bunch of
 * attributes; it is used when creating new files.
 * attrsize is the total size of all attributes
 * to be added during object creation.
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */
   1244 
   1245 /*
   1246  * hold necessary attribute name for attribute registration.
   1247  * should be a very rare case where this is needed.  If it does
   1248  * happen it would only happen on the first write to the file system.
   1249  */
   1250 static void
   1251 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
   1252 {
   1253 	int i;
   1254 
   1255 	if (!sa->sa_need_attr_registration)
   1256 		return;
   1257 
   1258 	for (i = 0; i != sa->sa_num_attrs; i++) {
   1259 		if (!sa->sa_attr_table[i].sa_registered) {
   1260 			if (sa->sa_reg_attr_obj)
   1261 				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
   1262 				    B_TRUE, sa->sa_attr_table[i].sa_name);
   1263 			else
   1264 				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
   1265 				    B_TRUE, sa->sa_attr_table[i].sa_name);
   1266 		}
   1267 	}
   1268 }
   1269 
   1270 
   1271 void
   1272 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
   1273 {
   1274 	dnode_t *dn;
   1275 	dmu_tx_hold_t *txh;
   1276 	blkptr_t *bp;
   1277 
   1278 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
   1279 	    THT_SPILL, 0, 0);
   1280 
   1281 	dn = txh->txh_dnode;
   1282 
   1283 	if (dn == NULL)
   1284 		return;
   1285 
   1286 	/* If blkptr doesn't exist then add space to towrite */
   1287 	bp = &dn->dn_phys->dn_spill;
   1288 	if (BP_IS_HOLE(bp)) {
   1289 		txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
   1290 		txh->txh_space_tounref = 0;
   1291 	} else {
   1292 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
   1293 		    bp, bp->blk_birth))
   1294 			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
   1295 		else
   1296 			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
   1297 		if (bp->blk_birth)
   1298 			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
   1299 	}
   1300 }
   1301 
   1302 void
   1303 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
   1304 {
   1305 	sa_os_t *sa = tx->tx_objset->os_sa;
   1306 
   1307 	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
   1308 
   1309 	if (tx->tx_objset->os_sa->sa_master_obj == 0)
   1310 		return;
   1311 
   1312 	if (tx->tx_objset->os_sa->sa_layout_attr_obj)
   1313 		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
   1314 	else {
   1315 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
   1316 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
   1317 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
   1318 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
   1319 	}
   1320 
   1321 	dmu_tx_sa_registration_hold(sa, tx);
   1322 
   1323 	if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
   1324 		return;
   1325 
   1326 	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
   1327 	    THT_SPILL, 0, 0);
   1328 }
   1329 
   1330 /*
   1331  * Hold SA attribute
   1332  *
   1333  * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
   1334  *
   1335  * variable_size is the total size of all variable sized attributes
   1336  * passed to this function.  It is not the total size of all
   1337  * variable size attributes that *may* exist on this object.
   1338  */
   1339 void
   1340 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
   1341 {
   1342 	uint64_t object;
   1343 	sa_os_t *sa = tx->tx_objset->os_sa;
   1344 
   1345 	ASSERT(hdl != NULL);
   1346 
   1347 	object = sa_handle_object(hdl);
   1348 
   1349 	dmu_tx_hold_bonus(tx, object);
   1350 
   1351 	if (tx->tx_objset->os_sa->sa_master_obj == 0)
   1352 		return;
   1353 
   1354 	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
   1355 	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
   1356 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
   1357 		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
   1358 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
   1359 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
   1360 	}
   1361 
   1362 	dmu_tx_sa_registration_hold(sa, tx);
   1363 
   1364 	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
   1365 		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
   1366 
   1367 	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
   1368 		ASSERT(tx->tx_txg == 0);
   1369 		dmu_tx_hold_spill(tx, object);
   1370 	} else {
   1371 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
   1372 		dnode_t *dn;
   1373 
   1374 		DB_DNODE_ENTER(db);
   1375 		dn = DB_DNODE(db);
   1376 		if (dn->dn_have_spill) {
   1377 			ASSERT(tx->tx_txg == 0);
   1378 			dmu_tx_hold_spill(tx, object);
   1379 		}
   1380 		DB_DNODE_EXIT(db);
   1381 	}
   1382 }
   1383