Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/dmu.h>
     29 #include <sys/dmu_impl.h>
     30 #include <sys/dbuf.h>
     31 #include <sys/dmu_tx.h>
     32 #include <sys/dmu_objset.h>
     33 #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
     34 #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
     35 #include <sys/dsl_pool.h>
     36 #include <sys/zap_impl.h> /* for fzap_default_block_shift */
     37 #include <sys/spa.h>
     38 #include <sys/zfs_context.h>
     39 
/*
 * Function-pointer type for a per-hold callback: it receives the
 * transaction, a dnode, and two 64-bit arguments, and returns nothing.
 * NOTE(review): no use of this type is visible in this part of the file --
 * confirm whether it is still needed.
 */
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);
     42 
     43 
     44 dmu_tx_t *
     45 dmu_tx_create_dd(dsl_dir_t *dd)
     46 {
     47 	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
     48 	tx->tx_dir = dd;
     49 	if (dd)
     50 		tx->tx_pool = dd->dd_pool;
     51 	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
     52 	    offsetof(dmu_tx_hold_t, txh_node));
     53 #ifdef ZFS_DEBUG
     54 	refcount_create(&tx->tx_space_written);
     55 	refcount_create(&tx->tx_space_freed);
     56 #endif
     57 	return (tx);
     58 }
     59 
     60 dmu_tx_t *
     61 dmu_tx_create(objset_t *os)
     62 {
     63 	dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
     64 	tx->tx_objset = os;
     65 	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
     66 	return (tx);
     67 }
     68 
     69 dmu_tx_t *
     70 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
     71 {
     72 	dmu_tx_t *tx = dmu_tx_create_dd(NULL);
     73 
     74 	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
     75 	tx->tx_pool = dp;
     76 	tx->tx_txg = txg;
     77 	tx->tx_anyobj = TRUE;
     78 
     79 	return (tx);
     80 }
     81 
     82 int
     83 dmu_tx_is_syncing(dmu_tx_t *tx)
     84 {
     85 	return (tx->tx_anyobj);
     86 }
     87 
     88 int
     89 dmu_tx_private_ok(dmu_tx_t *tx)
     90 {
     91 	return (tx->tx_anyobj);
     92 }
     93 
     94 static dmu_tx_hold_t *
     95 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
     96     enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
     97 {
     98 	dmu_tx_hold_t *txh;
     99 	dnode_t *dn = NULL;
    100 	int err;
    101 
    102 	if (object != DMU_NEW_OBJECT) {
    103 		err = dnode_hold(os->os, object, tx, &dn);
    104 		if (err) {
    105 			tx->tx_err = err;
    106 			return (NULL);
    107 		}
    108 
    109 		if (err == 0 && tx->tx_txg != 0) {
    110 			mutex_enter(&dn->dn_mtx);
    111 			/*
    112 			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
    113 			 * problem, but there's no way for it to happen (for
    114 			 * now, at least).
    115 			 */
    116 			ASSERT(dn->dn_assigned_txg == 0);
    117 			dn->dn_assigned_txg = tx->tx_txg;
    118 			(void) refcount_add(&dn->dn_tx_holds, tx);
    119 			mutex_exit(&dn->dn_mtx);
    120 		}
    121 	}
    122 
    123 	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
    124 	txh->txh_tx = tx;
    125 	txh->txh_dnode = dn;
    126 #ifdef ZFS_DEBUG
    127 	txh->txh_type = type;
    128 	txh->txh_arg1 = arg1;
    129 	txh->txh_arg2 = arg2;
    130 #endif
    131 	list_insert_tail(&tx->tx_holds, txh);
    132 
    133 	return (txh);
    134 }
    135 
    136 void
    137 dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
    138 {
    139 	/*
    140 	 * If we're syncing, they can manipulate any object anyhow, and
    141 	 * the hold on the dnode_t can cause problems.
    142 	 */
    143 	if (!dmu_tx_is_syncing(tx)) {
    144 		(void) dmu_tx_hold_object_impl(tx, os,
    145 		    object, THT_NEWOBJECT, 0, 0);
    146 	}
    147 }
    148 
    149 static int
    150 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
    151 {
    152 	int err;
    153 	dmu_buf_impl_t *db;
    154 
    155 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
    156 	db = dbuf_hold_level(dn, level, blkid, FTAG);
    157 	rw_exit(&dn->dn_struct_rwlock);
    158 	if (db == NULL)
    159 		return (EIO);
    160 	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
    161 	dbuf_rele(db, FTAG);
    162 	return (err);
    163 }
    164 
    165 /* ARGSUSED */
    166 static void
    167 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
    168 {
    169 	dnode_t *dn = txh->txh_dnode;
    170 	uint64_t start, end, i;
    171 	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
    172 	int err = 0;
    173 
    174 	if (len == 0)
    175 		return;
    176 
    177 	min_bs = SPA_MINBLOCKSHIFT;
    178 	max_bs = SPA_MAXBLOCKSHIFT;
    179 	min_ibs = DN_MIN_INDBLKSHIFT;
    180 	max_ibs = DN_MAX_INDBLKSHIFT;
    181 
    182 
    183 	/*
    184 	 * For i/o error checking, read the first and last level-0
    185 	 * blocks (if they are not aligned), and all the level-1 blocks.
    186 	 */
    187 
    188 	if (dn) {
    189 		if (dn->dn_maxblkid == 0) {
    190 			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
    191 			if (err)
    192 				goto out;
    193 		} else {
    194 			zio_t *zio = zio_root(dn->dn_objset->os_spa,
    195 			    NULL, NULL, ZIO_FLAG_CANFAIL);
    196 
    197 			/* first level-0 block */
    198 			start = off >> dn->dn_datablkshift;
    199 			if (P2PHASE(off, dn->dn_datablksz) ||
    200 			    len < dn->dn_datablksz) {
    201 				err = dmu_tx_check_ioerr(zio, dn, 0, start);
    202 				if (err)
    203 					goto out;
    204 			}
    205 
    206 			/* last level-0 block */
    207 			end = (off+len-1) >> dn->dn_datablkshift;
    208 			if (end != start &&
    209 			    P2PHASE(off+len, dn->dn_datablksz)) {
    210 				err = dmu_tx_check_ioerr(zio, dn, 0, end);
    211 				if (err)
    212 					goto out;
    213 			}
    214 
    215 			/* level-1 blocks */
    216 			if (dn->dn_nlevels > 1) {
    217 				start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
    218 				end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
    219 				for (i = start+1; i < end; i++) {
    220 					err = dmu_tx_check_ioerr(zio, dn, 1, i);
    221 					if (err)
    222 						goto out;
    223 				}
    224 			}
    225 
    226 			err = zio_wait(zio);
    227 			if (err)
    228 				goto out;
    229 		}
    230 	}
    231 
    232 	/*
    233 	 * If there's more than one block, the blocksize can't change,
    234 	 * so we can make a more precise estimate.  Alternatively,
    235 	 * if the dnode's ibs is larger than max_ibs, always use that.
    236 	 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
    237 	 * the code will still work correctly on existing pools.
    238 	 */
    239 	if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
    240 		min_ibs = max_ibs = dn->dn_indblkshift;
    241 		if (dn->dn_datablkshift != 0)
    242 			min_bs = max_bs = dn->dn_datablkshift;
    243 	}
    244 
    245 	/*
    246 	 * 'end' is the last thing we will access, not one past.
    247 	 * This way we won't overflow when accessing the last byte.
    248 	 */
    249 	start = P2ALIGN(off, 1ULL << max_bs);
    250 	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
    251 	txh->txh_space_towrite += end - start + 1;
    252 
    253 	start >>= min_bs;
    254 	end >>= min_bs;
    255 
    256 	epbs = min_ibs - SPA_BLKPTRSHIFT;
    257 
    258 	/*
    259 	 * The object contains at most 2^(64 - min_bs) blocks,
    260 	 * and each indirect level maps 2^epbs.
    261 	 */
    262 	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
    263 		start >>= epbs;
    264 		end >>= epbs;
    265 		/*
    266 		 * If we increase the number of levels of indirection,
    267 		 * we'll need new blkid=0 indirect blocks.  If start == 0,
    268 		 * we're already accounting for that blocks; and if end == 0,
    269 		 * we can't increase the number of levels beyond that.
    270 		 */
    271 		if (start != 0 && end != 0)
    272 			txh->txh_space_towrite += 1ULL << max_ibs;
    273 		txh->txh_space_towrite += (end - start + 1) << max_ibs;
    274 	}
    275 
    276 	ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS);
    277 
    278 out:
    279 	if (err)
    280 		txh->txh_tx->tx_err = err;
    281 }
    282 
    283 static void
    284 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
    285 {
    286 	dnode_t *dn = txh->txh_dnode;
    287 	dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
    288 	uint64_t space = mdn->dn_datablksz +
    289 	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
    290 
    291 	if (dn && dn->dn_dbuf->db_blkptr &&
    292 	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
    293 	    dn->dn_dbuf->db_blkptr->blk_birth)) {
    294 		txh->txh_space_tooverwrite += space;
    295 	} else {
    296 		txh->txh_space_towrite += space;
    297 		if (dn && dn->dn_dbuf->db_blkptr)
    298 			txh->txh_space_tounref += space;
    299 	}
    300 }
    301 
    302 void
    303 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
    304 {
    305 	dmu_tx_hold_t *txh;
    306 
    307 	ASSERT(tx->tx_txg == 0);
    308 	ASSERT(len < DMU_MAX_ACCESS);
    309 	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
    310 
    311 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
    312 	    object, THT_WRITE, off, len);
    313 	if (txh == NULL)
    314 		return;
    315 
    316 	dmu_tx_count_write(txh, off, len);
    317 	dmu_tx_count_dnode(txh);
    318 }
    319 
    320 static void
    321 dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
    322 {
    323 	uint64_t blkid, nblks;
    324 	uint64_t space = 0, unref = 0;
    325 	dnode_t *dn = txh->txh_dnode;
    326 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
    327 	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
    328 	int dirty;
    329 
    330 	/*
    331 	 * We don't need to use any locking to check for dirtyness
    332 	 * because it's OK if we get stale data -- the dnode may become
    333 	 * dirty immediately after our check anyway.  This is just a
    334 	 * means to avoid the expensive count when we aren't sure we
    335 	 * need it.  We need to be able to deal with a dirty dnode.
    336 	 */
    337 	dirty = list_link_active(&dn->dn_dirty_link[0]) |
    338 	    list_link_active(&dn->dn_dirty_link[1]) |
    339 	    list_link_active(&dn->dn_dirty_link[2]) |
    340 	    list_link_active(&dn->dn_dirty_link[3]);
    341 	if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0)
    342 		return;
    343 
    344 	/*
    345 	 * the struct_rwlock protects us against dn_phys->dn_nlevels
    346 	 * changing, in case (against all odds) we manage to dirty &
    347 	 * sync out the changes after we check for being dirty.
    348 	 * also, dbuf_hold_impl() wants us to have the struct_rwlock.
    349 	 *
    350 	 * It's fine to use dn_datablkshift rather than the dn_phys
    351 	 * equivalent because if it is changing, maxblkid==0 and we will
    352 	 * bail.
    353 	 */
    354 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
    355 	if (dn->dn_phys->dn_maxblkid == 0) {
    356 		if (off == 0 && len >= dn->dn_datablksz) {
    357 			blkid = 0;
    358 			nblks = 1;
    359 		} else {
    360 			rw_exit(&dn->dn_struct_rwlock);
    361 			return;
    362 		}
    363 	} else {
    364 		blkid = off >> dn->dn_datablkshift;
    365 		nblks = (off + len) >> dn->dn_datablkshift;
    366 
    367 		if (blkid >= dn->dn_phys->dn_maxblkid) {
    368 			rw_exit(&dn->dn_struct_rwlock);
    369 			return;
    370 		}
    371 		if (blkid + nblks > dn->dn_phys->dn_maxblkid)
    372 			nblks = dn->dn_phys->dn_maxblkid - blkid;
    373 
    374 		/* don't bother after 128,000 blocks */
    375 		nblks = MIN(nblks, 128*1024);
    376 	}
    377 
    378 	if (dn->dn_phys->dn_nlevels == 1) {
    379 		int i;
    380 		for (i = 0; i < nblks; i++) {
    381 			blkptr_t *bp = dn->dn_phys->dn_blkptr;
    382 			ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
    383 			bp += blkid + i;
    384 			if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
    385 				dprintf_bp(bp, "can free old%s", "");
    386 				space += bp_get_dasize(spa, bp);
    387 			}
    388 			unref += BP_GET_ASIZE(bp);
    389 		}
    390 		nblks = 0;
    391 	}
    392 
    393 	while (nblks) {
    394 		dmu_buf_impl_t *dbuf;
    395 		int err, epbs, blkoff, tochk;
    396 
    397 		epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
    398 		blkoff = P2PHASE(blkid, 1<<epbs);
    399 		tochk = MIN((1<<epbs) - blkoff, nblks);
    400 
    401 		err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
    402 		if (err == 0) {
    403 			int i;
    404 			blkptr_t *bp;
    405 
    406 			err = dbuf_read(dbuf, NULL,
    407 			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
    408 			if (err != 0) {
    409 				txh->txh_tx->tx_err = err;
    410 				dbuf_rele(dbuf, FTAG);
    411 				break;
    412 			}
    413 
    414 			bp = dbuf->db.db_data;
    415 			bp += blkoff;
    416 
    417 			for (i = 0; i < tochk; i++) {
    418 				if (dsl_dataset_block_freeable(ds,
    419 				    bp[i].blk_birth)) {
    420 					dprintf_bp(&bp[i],
    421 					    "can free old%s", "");
    422 					space += bp_get_dasize(spa, &bp[i]);
    423 				}
    424 				unref += BP_GET_ASIZE(bp);
    425 			}
    426 			dbuf_rele(dbuf, FTAG);
    427 		}
    428 		if (err && err != ENOENT) {
    429 			txh->txh_tx->tx_err = err;
    430 			break;
    431 		}
    432 
    433 		blkid += tochk;
    434 		nblks -= tochk;
    435 	}
    436 	rw_exit(&dn->dn_struct_rwlock);
    437 
    438 	txh->txh_space_tofree += space;
    439 	txh->txh_space_tounref += unref;
    440 }
    441 
    442 void
    443 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
    444 {
    445 	dmu_tx_hold_t *txh;
    446 	dnode_t *dn;
    447 	uint64_t start, end, i;
    448 	int err, shift;
    449 	zio_t *zio;
    450 
    451 	ASSERT(tx->tx_txg == 0);
    452 
    453 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
    454 	    object, THT_FREE, off, len);
    455 	if (txh == NULL)
    456 		return;
    457 	dn = txh->txh_dnode;
    458 
    459 	/* first block */
    460 	if (off != 0)
    461 		dmu_tx_count_write(txh, off, 1);
    462 	/* last block */
    463 	if (len != DMU_OBJECT_END)
    464 		dmu_tx_count_write(txh, off+len, 1);
    465 
    466 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
    467 		return;
    468 	if (len == DMU_OBJECT_END)
    469 		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
    470 
    471 	/*
    472 	 * For i/o error checking, read the first and last level-0
    473 	 * blocks, and all the level-1 blocks.  The above count_write's
    474 	 * will take care of the level-0 blocks.
    475 	 */
    476 	if (dn->dn_nlevels > 1) {
    477 		shift = dn->dn_datablkshift + dn->dn_indblkshift -
    478 		    SPA_BLKPTRSHIFT;
    479 		start = off >> shift;
    480 		end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
    481 
    482 		zio = zio_root(tx->tx_pool->dp_spa,
    483 		    NULL, NULL, ZIO_FLAG_CANFAIL);
    484 		for (i = start; i <= end; i++) {
    485 			uint64_t ibyte = i << shift;
    486 			err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0);
    487 			i = ibyte >> shift;
    488 			if (err == ESRCH)
    489 				break;
    490 			if (err) {
    491 				tx->tx_err = err;
    492 				return;
    493 			}
    494 
    495 			err = dmu_tx_check_ioerr(zio, dn, 1, i);
    496 			if (err) {
    497 				tx->tx_err = err;
    498 				return;
    499 			}
    500 		}
    501 		err = zio_wait(zio);
    502 		if (err) {
    503 			tx->tx_err = err;
    504 			return;
    505 		}
    506 	}
    507 
    508 	dmu_tx_count_dnode(txh);
    509 	dmu_tx_count_free(txh, off, len);
    510 }
    511 
    512 void
    513 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
    514 {
    515 	dmu_tx_hold_t *txh;
    516 	dnode_t *dn;
    517 	uint64_t nblocks;
    518 	int epbs, err;
    519 
    520 	ASSERT(tx->tx_txg == 0);
    521 
    522 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
    523 	    object, THT_ZAP, add, (uintptr_t)name);
    524 	if (txh == NULL)
    525 		return;
    526 	dn = txh->txh_dnode;
    527 
    528 	dmu_tx_count_dnode(txh);
    529 
    530 	if (dn == NULL) {
    531 		/*
    532 		 * We will be able to fit a new object's entries into one leaf
    533 		 * block.  So there will be at most 2 blocks total,
    534 		 * including the header block.
    535 		 */
    536 		dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
    537 		return;
    538 	}
    539 
    540 	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
    541 
    542 	if (dn->dn_maxblkid == 0 && !add) {
    543 		/*
    544 		 * If there is only one block  (i.e. this is a micro-zap)
    545 		 * and we are not adding anything, the accounting is simple.
    546 		 */
    547 		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
    548 		if (err) {
    549 			tx->tx_err = err;
    550 			return;
    551 		}
    552 
    553 		/*
    554 		 * Use max block size here, since we don't know how much
    555 		 * the size will change between now and the dbuf dirty call.
    556 		 */
    557 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
    558 		    dn->dn_phys->dn_blkptr[0].blk_birth)) {
    559 			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
    560 		} else {
    561 			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
    562 			txh->txh_space_tounref +=
    563 			    BP_GET_ASIZE(dn->dn_phys->dn_blkptr);
    564 		}
    565 		return;
    566 	}
    567 
    568 	if (dn->dn_maxblkid > 0 && name) {
    569 		/*
    570 		 * access the name in this fat-zap so that we'll check
    571 		 * for i/o errors to the leaf blocks, etc.
    572 		 */
    573 		err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
    574 		    8, 0, NULL);
    575 		if (err == EIO) {
    576 			tx->tx_err = err;
    577 			return;
    578 		}
    579 	}
    580 
    581 	/*
    582 	 * 3 blocks overwritten: target leaf, ptrtbl block, header block
    583 	 * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
    584 	 */
    585 	dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
    586 	    (3 + add ? 3 : 0) << dn->dn_datablkshift);
    587 
    588 	/*
    589 	 * If the modified blocks are scattered to the four winds,
    590 	 * we'll have to modify an indirect twig for each.
    591 	 */
    592 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
    593 	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
    594 		txh->txh_space_towrite += 3 << dn->dn_indblkshift;
    595 }
    596 
    597 void
    598 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
    599 {
    600 	dmu_tx_hold_t *txh;
    601 
    602 	ASSERT(tx->tx_txg == 0);
    603 
    604 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
    605 	    object, THT_BONUS, 0, 0);
    606 	if (txh)
    607 		dmu_tx_count_dnode(txh);
    608 }
    609 
    610 void
    611 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
    612 {
    613 	dmu_tx_hold_t *txh;
    614 	ASSERT(tx->tx_txg == 0);
    615 
    616 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
    617 	    DMU_NEW_OBJECT, THT_SPACE, space, 0);
    618 
    619 	txh->txh_space_towrite += space;
    620 }
    621 
    622 int
    623 dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
    624 {
    625 	dmu_tx_hold_t *txh;
    626 	int holds = 0;
    627 
    628 	/*
    629 	 * By asserting that the tx is assigned, we're counting the
    630 	 * number of dn_tx_holds, which is the same as the number of
    631 	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
    632 	 * dn_tx_holds could be 0.
    633 	 */
    634 	ASSERT(tx->tx_txg != 0);
    635 
    636 	/* if (tx->tx_anyobj == TRUE) */
    637 		/* return (0); */
    638 
    639 	for (txh = list_head(&tx->tx_holds); txh;
    640 	    txh = list_next(&tx->tx_holds, txh)) {
    641 		if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
    642 			holds++;
    643 	}
    644 
    645 	return (holds);
    646 }
    647 
#ifdef ZFS_DEBUG
/*
 * Debug-only check that dirtying 'db' is covered by a hold in 'tx'.
 *
 * The check passes (returns) when:
 *  - the tx is marked tx_anyobj, or
 *  - the buffer belongs to the meta dnode, or
 *  - walking the hold list finds both an object match and an offset match.
 *    The two flags accumulate across holds, so they do not have to come
 *    from the same hold.
 * Otherwise the function panics.
 *
 * The tx must be assigned (asserted).
 */
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	dmu_tx_hold_t *txh;
	int match_object = FALSE, match_offset = FALSE;
	dnode_t *dn = db->db_dnode;

	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj)
		return;

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT)
		return;

	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		/*
		 * NOTE(review): dn has already been dereferenced above, so
		 * the "dn == NULL" half of this assertion cannot be what
		 * protects us here.
		 */
		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			/*
			 * Translate the hold's byte range (arg1 = offset,
			 * arg2 = length for write/free holds) into block ids
			 * at this dbuf's level.  A zero dn_datablkshift is
			 * treated as the maximum block shift.
			 */
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			/* a shift of 64 or more would be undefined; use 0 */
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, beginblk, endblk);

			/*
			 * NOTE(review): holds created by dmu_tx_hold_space()
			 * have type THT_SPACE and no dnode, so they reach
			 * this switch and land in the default case -- confirm
			 * that this is intended.
			 */
			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * buffer so that we don't need to hold it
				 * when creating a new object.
				 */
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * Only the partially freed blocks at either
				 * end of the range may be dirtied.
				 */
				if (blkid == beginblk &&
				    (txh->txh_arg1 != 0 ||
				    dn->dn_maxblkid == 0))
					match_offset = TRUE;
				if (blkid == endblk &&
				    txh->txh_arg2 != DMU_OBJECT_END)
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				/* a ZAP hold covers every block of the object */
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				ASSERT(!"bad txh_type");
			}
		}
		if (match_object && match_offset)
			return;
	}
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif
    739 
/*
 * Make one attempt to assign 'tx' to the currently open transaction group.
 *
 * Returns 0 on success.  On failure returns:
 *  - tx->tx_err, if an earlier hold recorded an error;
 *  - EIO or ERESTART when the pool is in the i/o failure state;
 *  - ERESTART when a held dnode is still assigned to the previous txg (the
 *    offending hold is remembered in tx_needassign_txh), or when 'txg_how'
 *    names a specific txg other than the one obtained;
 *  - whatever dsl_dir_tempreserve_space() returns.
 *
 * After a failure that happens once tx_txg has been set, the caller must
 * undo the partial assignment with dmu_tx_unassign().
 */
static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	dmu_tx_hold_t *txh;
	spa_t *spa = tx->tx_pool->dp_spa;
	uint64_t lsize, asize, fsize, usize;
	uint64_t towrite, tofree, tooverwrite, tounref;

	ASSERT3U(tx->tx_txg, ==, 0);

	if (tx->tx_err)
		return (tx->tx_err);

	if (spa_state(spa) == POOL_STATE_IO_FAILURE) {
		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    txg_how != TXG_WAIT)
			return (EIO);

		return (ERESTART);
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	/*
	 * Bind every held dnode to this txg and total up the per-hold
	 * space estimates.
	 */
	towrite = tofree = tooverwrite = tounref = 0;
	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				/*
				 * Still assigned to the previous txg.
				 * Remember where we stopped so that
				 * dmu_tx_unassign() only undoes the holds
				 * before this one and dmu_tx_wait() knows
				 * which dnode to wait on.
				 */
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				return (ERESTART);
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += txh->txh_space_towrite;
		tofree += txh->txh_space_tofree;
		tooverwrite += txh->txh_space_tooverwrite;
		tounref += txh->txh_space_tounref;
	}

	/*
	 * NB: This check must be after we've held the dnodes, so that
	 * the dmu_tx_unassign() logic will work properly
	 */
	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
		return (ERESTART);

	/*
	 * If a snapshot has been taken since we made our estimates,
	 * assume that we won't be able to free or overwrite anything.
	 */
	if (tx->tx_objset &&
	    dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
	    tx->tx_lastsnap_txg) {
		towrite += tooverwrite;
		tooverwrite = tofree = 0;
	}

	/*
	 * Convert logical size to worst-case allocated size.
	 */
	fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
	lsize = towrite + tooverwrite;
	asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
	usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);

#ifdef ZFS_DEBUG
	/* Recorded for the checks in dmu_tx_willuse_space(). */
	tx->tx_space_towrite = asize;
	tx->tx_space_tofree = tofree;
	tx->tx_space_tooverwrite = tooverwrite;
	tx->tx_space_tounref = tounref;
#endif

	/* Only transactions with a dsl_dir and something to write reserve. */
	if (tx->tx_dir && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir,
		    lsize, asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
		if (err)
			return (err);
	}

	return (0);
}
    844 
/*
 * Undo a (possibly partial) assignment made by dmu_tx_try_assign().
 *
 * Does nothing if the tx has no txg.  Otherwise the txg hold is released,
 * the tx hold is dropped on every dnode up to -- but not including --
 * tx_needassign_txh (the hold at which dmu_tx_try_assign() stopped, or NULL
 * meaning the whole list), the txg that was tried is saved in
 * tx_lasttried_txg for dmu_tx_wait(), and tx_txg is reset to 0.
 */
static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		/*
		 * When the last tx hold goes away the dnode is no longer
		 * bound to a txg; wake anybody waiting for that in
		 * dmu_tx_wait().
		 */
		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}
    876 
    877 /*
    878  * Assign tx to a transaction group.  txg_how can be one of:
    879  *
    880  * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
    881  *	a new one.  This should be used when you're not holding locks.
    882  *	If will only fail if we're truly out of space (or over quota).
    883  *
    884  * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
    885  *	blocking, returns immediately with ERESTART.  This should be used
    886  *	whenever you're holding locks.  On an ERESTART error, the caller
    887  *	should drop locks, do a dmu_tx_wait(tx), and try again.
    888  *
    889  * (3)	A specific txg.  Use this if you need to ensure that multiple
    890  *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
    891  *	returns ERESTART if it can't assign you into the requested txg.
    892  */
    893 int
    894 dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
    895 {
    896 	int err;
    897 
    898 	ASSERT(tx->tx_txg == 0);
    899 	ASSERT(txg_how != 0);
    900 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
    901 
    902 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
    903 		dmu_tx_unassign(tx);
    904 
    905 		if (err != ERESTART || txg_how != TXG_WAIT)
    906 			return (err);
    907 
    908 		dmu_tx_wait(tx);
    909 	}
    910 
    911 	txg_rele_to_quiesce(&tx->tx_txgh);
    912 
    913 	return (0);
    914 }
    915 
    916 void
    917 dmu_tx_wait(dmu_tx_t *tx)
    918 {
    919 	spa_t *spa = tx->tx_pool->dp_spa;
    920 
    921 	ASSERT(tx->tx_txg == 0);
    922 
    923 	/*
    924 	 * It's possible that the pool has become active after this thread
    925 	 * has tried to obtain a tx. If that's the case then his
    926 	 * tx_lasttried_txg would not have been assigned.
    927 	 */
    928 	if (spa_state(spa) == POOL_STATE_IO_FAILURE ||
    929 	    tx->tx_lasttried_txg == 0) {
    930 		txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
    931 	} else if (tx->tx_needassign_txh) {
    932 		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
    933 
    934 		mutex_enter(&dn->dn_mtx);
    935 		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
    936 			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
    937 		mutex_exit(&dn->dn_mtx);
    938 		tx->tx_needassign_txh = NULL;
    939 	} else {
    940 		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
    941 	}
    942 }
    943 
    944 void
    945 dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
    946 {
    947 #ifdef ZFS_DEBUG
    948 	if (tx->tx_dir == NULL || delta == 0)
    949 		return;
    950 
    951 	if (delta > 0) {
    952 		ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
    953 		    tx->tx_space_towrite);
    954 		(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
    955 	} else {
    956 		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
    957 	}
    958 #endif
    959 }
    960 
    961 void
    962 dmu_tx_commit(dmu_tx_t *tx)
    963 {
    964 	dmu_tx_hold_t *txh;
    965 
    966 	ASSERT(tx->tx_txg != 0);
    967 
    968 	while (txh = list_head(&tx->tx_holds)) {
    969 		dnode_t *dn = txh->txh_dnode;
    970 
    971 		list_remove(&tx->tx_holds, txh);
    972 		kmem_free(txh, sizeof (dmu_tx_hold_t));
    973 		if (dn == NULL)
    974 			continue;
    975 		mutex_enter(&dn->dn_mtx);
    976 		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
    977 
    978 		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
    979 			dn->dn_assigned_txg = 0;
    980 			cv_broadcast(&dn->dn_notxholds);
    981 		}
    982 		mutex_exit(&dn->dn_mtx);
    983 		dnode_rele(dn, tx);
    984 	}
    985 
    986 	if (tx->tx_tempreserve_cookie)
    987 		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
    988 
    989 	if (tx->tx_anyobj == FALSE)
    990 		txg_rele_to_sync(&tx->tx_txgh);
    991 	list_destroy(&tx->tx_holds);
    992 #ifdef ZFS_DEBUG
    993 	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
    994 	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
    995 	    tx->tx_space_tofree, refcount_count(&tx->