Home | History | Annotate | Download | only in zfs
      1    789     ahrens /*
      2    789     ahrens  * CDDL HEADER START
      3    789     ahrens  *
      4    789     ahrens  * The contents of this file are subject to the terms of the
      5   1491     ahrens  * Common Development and Distribution License (the "License").
      6   1491     ahrens  * You may not use this file except in compliance with the License.
      7    789     ahrens  *
      8    789     ahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9    789     ahrens  * or http://www.opensolaris.org/os/licensing.
     10    789     ahrens  * See the License for the specific language governing permissions
     11    789     ahrens  * and limitations under the License.
     12    789     ahrens  *
     13    789     ahrens  * When distributing Covered Code, include this CDDL HEADER in each
     14    789     ahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15    789     ahrens  * If applicable, add the following below this CDDL HEADER, with the
     16    789     ahrens  * fields enclosed by brackets "[]" replaced with your own identifying
     17    789     ahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
     18    789     ahrens  *
     19    789     ahrens  * CDDL HEADER END
     20    789     ahrens  */
     21    789     ahrens /*
     22   8582    Brendan  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23    789     ahrens  * Use is subject to license terms.
     24    789     ahrens  */
     25    789     ahrens 
     26    789     ahrens #include <sys/zfs_context.h>
     27    789     ahrens #include <sys/dmu.h>
     28    789     ahrens #include <sys/dmu_impl.h>
     29    789     ahrens #include <sys/dbuf.h>
     30    789     ahrens #include <sys/dmu_objset.h>
     31    789     ahrens #include <sys/dsl_dataset.h>
     32    789     ahrens #include <sys/dsl_dir.h>
     33    789     ahrens #include <sys/dmu_tx.h>
     34    789     ahrens #include <sys/spa.h>
     35    789     ahrens #include <sys/zio.h>
     36    789     ahrens #include <sys/dmu_zfetch.h>
     37    789     ahrens 
     38    789     ahrens static void dbuf_destroy(dmu_buf_impl_t *db);
     39    789     ahrens static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
     40   7046     ahrens static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
     41    789     ahrens 
     42    789     ahrens /*
     43    789     ahrens  * Global data structures and functions for the dbuf cache.
     44    789     ahrens  */
     45    789     ahrens static kmem_cache_t *dbuf_cache;
     46    789     ahrens 
/*
 * Constructor for dmu_buf_impl_t objects (registered with the dbuf kmem
 * cache in dbuf_init()).
 *
 * Zeroes the whole structure, then initializes the embedded mutex,
 * condition variable and hold refcount.  The 'unused' and 'kmflag'
 * arguments are ignored.  Always returns 0.
 */
/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);
	return (0);
}
     59    789     ahrens 
/*
 * Destructor for dmu_buf_impl_t objects (registered with the dbuf kmem
 * cache in dbuf_init()).
 *
 * Tears down the mutex, condition variable and hold refcount that
 * dbuf_cons() set up.  The 'unused' argument is ignored.
 */
/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}
     69    789     ahrens 
     70    789     ahrens /*
     71    789     ahrens  * dbuf hash table routines
     72    789     ahrens  */
     73    789     ahrens static dbuf_hash_table_t dbuf_hash_table;
     74    789     ahrens 
     75    789     ahrens static uint64_t dbuf_hash_count;
     76    789     ahrens 
     77    789     ahrens static uint64_t
     78    789     ahrens dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
     79    789     ahrens {
     80    789     ahrens 	uintptr_t osv = (uintptr_t)os;
     81    789     ahrens 	uint64_t crc = -1ULL;
     82    789     ahrens 
     83    789     ahrens 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
     84    789     ahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
     85    789     ahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
     86    789     ahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
     87    789     ahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
     88    789     ahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
     89    789     ahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
     90    789     ahrens 
     91    789     ahrens 	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
     92    789     ahrens 
     93    789     ahrens 	return (crc);
     94    789     ahrens }
     95    789     ahrens 
     96    789     ahrens #define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
     97    789     ahrens 
     98    789     ahrens #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
     99    789     ahrens 	((dbuf)->db.db_object == (obj) &&		\
    100    789     ahrens 	(dbuf)->db_objset == (os) &&			\
    101    789     ahrens 	(dbuf)->db_level == (level) &&			\
    102    789     ahrens 	(dbuf)->db_blkid == (blkid))
    103    789     ahrens 
    104    789     ahrens dmu_buf_impl_t *
    105    789     ahrens dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
    106    789     ahrens {
    107    789     ahrens 	dbuf_hash_table_t *h = &dbuf_hash_table;
    108  10298    Matthew 	objset_t *os = dn->dn_objset;
    109    789     ahrens 	uint64_t obj = dn->dn_object;
    110    789     ahrens 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
    111    789     ahrens 	uint64_t idx = hv & h->hash_table_mask;
    112    789     ahrens 	dmu_buf_impl_t *db;
    113    789     ahrens 
    114    789     ahrens 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
    115    789     ahrens 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
    116    789     ahrens 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
    117    789     ahrens 			mutex_enter(&db->db_mtx);
    118   1544   eschrock 			if (db->db_state != DB_EVICTING) {
    119    789     ahrens 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
    120    789     ahrens 				return (db);
    121    789     ahrens 			}
    122    789     ahrens 			mutex_exit(&db->db_mtx);
    123    789     ahrens 		}
    124    789     ahrens 	}
    125    789     ahrens 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
    126    789     ahrens 	return (NULL);
    127    789     ahrens }
    128    789     ahrens 
    129    789     ahrens /*
    130    789     ahrens  * Insert an entry into the hash table.  If there is already an element
    131    789     ahrens  * equal to elem in the hash table, then the already existing element
    132    789     ahrens  * will be returned and the new element will not be inserted.
    133    789     ahrens  * Otherwise returns NULL.
    134    789     ahrens  */
    135    789     ahrens static dmu_buf_impl_t *
    136    789     ahrens dbuf_hash_insert(dmu_buf_impl_t *db)
    137    789     ahrens {
    138    789     ahrens 	dbuf_hash_table_t *h = &dbuf_hash_table;
    139  10298    Matthew 	objset_t *os = db->db_objset;
    140    789     ahrens 	uint64_t obj = db->db.db_object;
    141    789     ahrens 	int level = db->db_level;
    142    789     ahrens 	uint64_t blkid = db->db_blkid;
    143    789     ahrens 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
    144    789     ahrens 	uint64_t idx = hv & h->hash_table_mask;
    145    789     ahrens 	dmu_buf_impl_t *dbf;
    146    789     ahrens 
    147    789     ahrens 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
    148    789     ahrens 	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
    149    789     ahrens 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
    150    789     ahrens 			mutex_enter(&dbf->db_mtx);
    151   1544   eschrock 			if (dbf->db_state != DB_EVICTING) {
    152    789     ahrens 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
    153    789     ahrens 				return (dbf);
    154    789     ahrens 			}
    155    789     ahrens 			mutex_exit(&dbf->db_mtx);
    156    789     ahrens 		}
    157    789     ahrens 	}
    158    789     ahrens 
    159    789     ahrens 	mutex_enter(&db->db_mtx);
    160    789     ahrens 	db->db_hash_next = h->hash_table[idx];
    161    789     ahrens 	h->hash_table[idx] = db;
    162    789     ahrens 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
    163    789     ahrens 	atomic_add_64(&dbuf_hash_count, 1);
    164    789     ahrens 
    165    789     ahrens 	return (NULL);
    166    789     ahrens }
    167    789     ahrens 
    168    789     ahrens /*
    169    789     ahrens  * Remove an entry from the hash table.  This operation will
    170    789     ahrens  * fail if there are any existing holds on the db.
    171    789     ahrens  */
    172    789     ahrens static void
    173    789     ahrens dbuf_hash_remove(dmu_buf_impl_t *db)
    174    789     ahrens {
    175    789     ahrens 	dbuf_hash_table_t *h = &dbuf_hash_table;
    176    789     ahrens 	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
    177    789     ahrens 	    db->db_level, db->db_blkid);
    178    789     ahrens 	uint64_t idx = hv & h->hash_table_mask;
    179    789     ahrens 	dmu_buf_impl_t *dbf, **dbp;
    180    789     ahrens 
    181    789     ahrens 	/*
    182    789     ahrens 	 * We musn't hold db_mtx to maintin lock ordering:
    183    789     ahrens 	 * DBUF_HASH_MUTEX > db_mtx.
    184    789     ahrens 	 */
    185    789     ahrens 	ASSERT(refcount_is_zero(&db->db_holds));
    186   1544   eschrock 	ASSERT(db->db_state == DB_EVICTING);
    187    789     ahrens 	ASSERT(!MUTEX_HELD(&db->db_mtx));
    188    789     ahrens 
    189    789     ahrens 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
    190    789     ahrens 	dbp = &h->hash_table[idx];
    191    789     ahrens 	while ((dbf = *dbp) != db) {
    192    789     ahrens 		dbp = &dbf->db_hash_next;
    193    789     ahrens 		ASSERT(dbf != NULL);
    194    789     ahrens 	}
    195    789     ahrens 	*dbp = db->db_hash_next;
    196    789     ahrens 	db->db_hash_next = NULL;
    197    789     ahrens 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
    198    789     ahrens 	atomic_add_64(&dbuf_hash_count, -1);
    199    789     ahrens }
    200    789     ahrens 
    201   1544   eschrock static arc_evict_func_t dbuf_do_evict;
    202    789     ahrens 
    203    789     ahrens static void
    204    789     ahrens dbuf_evict_user(dmu_buf_impl_t *db)
    205    789     ahrens {
    206    789     ahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
    207    789     ahrens 
    208   3547     maybee 	if (db->db_level != 0 || db->db_evict_func == NULL)
    209    789     ahrens 		return;
    210    789     ahrens 
    211   3547     maybee 	if (db->db_user_data_ptr_ptr)
    212   3547     maybee 		*db->db_user_data_ptr_ptr = db->db.db_data;
    213   3547     maybee 	db->db_evict_func(&db->db, db->db_user_ptr);
    214   3547     maybee 	db->db_user_ptr = NULL;
    215   3547     maybee 	db->db_user_data_ptr_ptr = NULL;
    216   3547     maybee 	db->db_evict_func = NULL;
    217    789     ahrens }
    218    789     ahrens 
/*
 * Evict a dbuf: clear it with dbuf_clear() and then destroy it with
 * dbuf_destroy().
 *
 * The caller must hold db_mtx, and the dbuf must have no ARC buffer
 * attached (db_buf == NULL) and no pending dirty data
 * (db_data_pending == NULL); all three conditions are asserted.
 */
void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}
    229   1544   eschrock 
    230   1544   eschrock void
    231    789     ahrens dbuf_init(void)
    232    789     ahrens {
    233   1544   eschrock 	uint64_t hsize = 1ULL << 16;
    234    789     ahrens 	dbuf_hash_table_t *h = &dbuf_hash_table;
    235    789     ahrens 	int i;
    236    789     ahrens 
    237    789     ahrens 	/*
    238    789     ahrens 	 * The hash table is big enough to fill all of physical memory
    239   1544   eschrock 	 * with an average 4K block size.  The table will take up
    240   1544   eschrock 	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
    241    789     ahrens 	 */
    242   1544   eschrock 	while (hsize * 4096 < physmem * PAGESIZE)
    243    789     ahrens 		hsize <<= 1;
    244    789     ahrens 
    245   1544   eschrock retry:
    246    789     ahrens 	h->hash_table_mask = hsize - 1;
    247   1544   eschrock 	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
    248   1544   eschrock 	if (h->hash_table == NULL) {
    249   1544   eschrock 		/* XXX - we should really return an error instead of assert */
    250   1544   eschrock 		ASSERT(hsize > (1ULL << 10));
    251   1544   eschrock 		hsize >>= 1;
    252   1544   eschrock 		goto retry;
    253   1544   eschrock 	}
    254    789     ahrens 
    255    789     ahrens 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
    256    789     ahrens 	    sizeof (dmu_buf_impl_t),
    257    789     ahrens 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
    258    789     ahrens 
    259    789     ahrens 	for (i = 0; i < DBUF_MUTEXES; i++)
    260    789     ahrens 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
    261    789     ahrens }
    262    789     ahrens 
/*
 * Tear down the global dbuf state created by dbuf_init(): destroy the
 * hash chain mutexes, free the hash table, and destroy the dbuf kmem cache.
 */
void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	/* the table holds hash_table_mask + 1 pointer-sized slots */
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
}
    274    789     ahrens 
    275    789     ahrens /*
    276    789     ahrens  * Other stuff.
    277    789     ahrens  */
    278    789     ahrens 
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of a dbuf's internal state.
 *
 * The caller must hold db_mtx.  The checks are only performed when
 * ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags; otherwise the function
 * returns right after the lock assertion.  All checks are ASSERTs, so
 * nothing is returned.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	/* identity: the dbuf must agree with the dnode it belongs to */
	ASSERT(db->db_objset != NULL);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
		    list_head(&dn->dn_dbufs));
	}
	/* offset: fixed for the bonus buffer, blkid * size otherwise */
	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	/* every dirty record reachable from this dbuf must point back to it */
	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			/* epb: number of block pointers in the parent block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	/*
	 * NOTE(review): this condition reads dn->dn_free_txg without a
	 * dn != NULL test, although dn == NULL is tolerated earlier in this
	 * function -- confirm a dbuf with data always has a dnode here.
	 */
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			/* scan the buffer as 64-bit words (db_size / 8) */
			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
}
#endif
    378    789     ahrens 
    379    789     ahrens static void
    380    789     ahrens dbuf_update_data(dmu_buf_impl_t *db)
    381    789     ahrens {
    382    789     ahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
    383   3547     maybee 	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
    384    789     ahrens 		ASSERT(!refcount_is_zero(&db->db_holds));
    385   3547     maybee 		*db->db_user_data_ptr_ptr = db->db.db_data;
    386    789     ahrens 	}
    387    789     ahrens }
    388    789     ahrens 
    389    789     ahrens static void
    390    789     ahrens dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
    391    789     ahrens {
    392    789     ahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
    393   1544   eschrock 	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
    394    789     ahrens 	db->db_buf = buf;
    395   1544   eschrock 	if (buf != NULL) {
    396   1544   eschrock 		ASSERT(buf->b_data != NULL);
    397   1544   eschrock 		db->db.db_data = buf->b_data;
    398   1544   eschrock 		if (!arc_released(buf))
    399   1544   eschrock 			arc_set_callback(buf, dbuf_do_evict, db);
    400   1544   eschrock 		dbuf_update_data(db);
    401   1544   eschrock 	} else {
    402   1544   eschrock 		dbuf_evict_user(db);
    403   1544   eschrock 		db->db.db_data = NULL;
    404   7872        Tim 		if (db->db_state != DB_NOFILL)
    405   7872        Tim 			db->db_state = DB_UNCACHED;
    406   1544   eschrock 	}
    407    789     ahrens }
    408    789     ahrens 
    409    789     ahrens uint64_t
    410    789     ahrens dbuf_whichblock(dnode_t *dn, uint64_t offset)
    411    789     ahrens {
    412    789     ahrens 	if (dn->dn_datablkshift) {
    413    789     ahrens 		return (offset >> dn->dn_datablkshift);
    414    789     ahrens 	} else {
    415    789     ahrens 		ASSERT3U(offset, <, dn->dn_datablksz);
    416    789     ahrens 		return (0);
    417    789     ahrens 	}
    418    789     ahrens }
    419    789     ahrens 
/*
 * Read-completion callback; dbuf_read_impl() passes it to arc_read().
 *
 *   zio - the read's zio; may be NULL, which is treated as success
 *   buf - the ARC buffer produced by the read
 *   vdb - the dmu_buf_impl_t that was being read
 *
 * Moves the dbuf out of DB_READ into DB_CACHED (data attached) or, on
 * error, DB_UNCACHED (buffer released); wakes any waiters on db_changed;
 * and drops a hold on the dbuf with dbuf_rele().
 */
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		/* hand back an all-zero buffer instead of what was read */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		/* successful read: attach the buffer */
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		/* failed read: drop the buffer and go back to uncached */
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);
	dbuf_rele(db, NULL);
}
    454    789     ahrens 
/*
 * Start filling an uncached dbuf with data.
 *
 *   db    - the dbuf to read; must be held, in DB_UNCACHED state, with no
 *           ARC buffer attached
 *   zio   - passed through to arc_read() for the on-disk case
 *   flags - in/out DB_RF_* flags; DB_RF_CANFAIL selects ZIO_FLAG_CANFAIL,
 *           and DB_RF_CACHED is set on return when the data did not
 *           require a disk read (hole/freed block, or ARC reported a hit)
 *
 * Must be called with db_mtx and the dnode's dn_struct_rwlock held (both
 * asserted).  db_mtx is dropped before returning on every path.
 *
 * Three cases are handled:
 *   - bonus buffer: copied from the dnode's on-disk bonus area;
 *   - hole or freed block: a zero-filled ARC buffer is allocated;
 *   - otherwise: the dbuf enters DB_READ and arc_read() is issued with
 *     dbuf_read_done() as the completion callback.
 */
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn = db->db_dnode;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;
	arc_buf_t *pbuf;

	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DB_BONUS_BLKID) {
		/* copy no more than both the in-core and on-disk lengths */
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		/* bonus data lives in a plain zio buffer, not an ARC buffer */
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		/* nothing on disk to read: supply a zero-filled buffer */
		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	/* this hold is dropped by dbuf_read_done() */
	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */

	/* pbuf: the buffer that contains this dbuf's block pointer */
	if (db->db_parent)
		pbuf = db->db_parent->db_buf;
	else
		pbuf = db->db_objset->os_phys_buf;

	(void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}
    530    789     ahrens 
/*
 * Make sure the data of a held dbuf is (or is being) read in.
 *
 *   db    - the dbuf; the caller must have a hold on it
 *   zio   - optional parent zio.  When NULL, a root zio is created for
 *           an uncached dbuf and waited on before returning; when
 *           non-NULL, the read is issued under it and not waited for here.
 *   flags - DB_RF_* flags:
 *           DB_RF_HAVESTRUCT - caller already holds dn_struct_rwlock, so
 *                              it is neither taken nor dropped here
 *           DB_RF_NOPREFETCH - do not call dmu_zfetch()
 *           DB_RF_NEVERWAIT  - do not wait for a read or fill that is
 *                              already in progress
 *
 * Returns 0 on success; EIO if the dbuf is in DB_NOFILL state or ends up
 * DB_UNCACHED after waiting for an in-progress read/fill; otherwise the
 * result of zio_wait() when this function created the zio.
 */
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (EIO);

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * Prefetch only for cacheable level-0 data blocks, and only when
	 * the caller did not ask to suppress it.
	 * NOTE(review): db_dnode is already used by the rw_enter() above
	 * before it is compared against NULL here -- confirm whether it
	 * can actually be NULL at this point.
	 */
	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		/* data already present: nothing to read */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
	} else if (db->db_state == DB_UNCACHED) {
		/* no data yet: start the read ourselves */
		if (zio == NULL) {
			zio = zio_root(db->db_dnode->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);
		}
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		/* only wait on a zio that we created ourselves */
		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/* any other state, e.g. a read or fill already in progress */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			/* still no data after the wait: report failure */
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}
    605    789     ahrens 
    606    789     ahrens static void
    607    789     ahrens dbuf_noread(dmu_buf_impl_t *db)
    608    789     ahrens {
    609    789     ahrens 	ASSERT(!refcount_is_zero(&db->db_holds));
    610   1544   eschrock 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
    611    789     ahrens 	mutex_enter(&db->db_mtx);
    612    789     ahrens 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
    613    789     ahrens 		cv_wait(&db->db_changed, &db->db_mtx);
    614    789     ahrens 	if (db->db_state == DB_UNCACHED) {
    615   3290   johansen 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
    616   3290   johansen 
    617   1544   eschrock 		ASSERT(db->db_buf == NULL);
    618    789     ahrens 		ASSERT(db->db.db_data == NULL);
    619    789     ahrens 		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
    620   3290   johansen 		    db->db.db_size, db, type));
    621    789     ahrens 		db->db_state = DB_FILL;
    622   7872        Tim 	} else if (db->db_state == DB_NOFILL) {
    623   7872        Tim 		dbuf_set_data(db, NULL);
    624    789     ahrens 	} else {
    625    789     ahrens 		ASSERT3U(db->db_state, ==, DB_CACHED);
    626    789     ahrens 	}
    627    789     ahrens 	mutex_exit(&db->db_mtx);
    628    789     ahrens }
    629    789     ahrens 
    630    789     ahrens /*
    631    789     ahrens  * This is our just-in-time copy function.  It makes a copy of
    632    789     ahrens  * buffers, that have been modified in a previous transaction
    633    789     ahrens  * group, before we modify them in the current active group.
    634    789     ahrens  *
    635    789     ahrens  * This function is used in two places: when we are dirtying a
    636    789     ahrens  * buffer for the first time in a txg, and when we are freeing
    637    789     ahrens  * a range in a dnode that includes this buffer.
    638    789     ahrens  *
    639    789     ahrens  * Note that when we are called from dbuf_free_range() we do
    640    789     ahrens  * not put a hold on the buffer, we just traverse the active
    641    789     ahrens  * dbuf list for the dnode.
    642    789     ahrens  */
    643    789     ahrens static void
    644    789     ahrens dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
    645    789     ahrens {
    646   3547     maybee 	dbuf_dirty_record_t *dr = db->db_last_dirty;
    647    789     ahrens 
    648    789     ahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
    649    789     ahrens 	ASSERT(db->db.db_data != NULL);
    650   3547     maybee 	ASSERT(db->db_level == 0);
    651   3547     maybee 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
    652    789     ahrens 
    653   3711     maybee 	if (dr == NULL ||
    654   3711     maybee 	    (dr->dt.dl.dr_data !=
    655   3711     maybee 	    ((db->db_blkid  == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
    656   3547     maybee 		return;
    657    789     ahrens 
    658    789     ahrens 	/*
    659   3547     maybee 	 * If the last dirty record for this dbuf has not yet synced
    660   3547     maybee 	 * and its referencing the dbuf data, either:
    661   3547     maybee 	 * 	reset the reference to point to a new copy,
    662   3547     maybee 	 * or (if there a no active holders)
    663   3547     maybee 	 *	just null out the current db_data pointer.
    664    789     ahrens 	 */
    665   3547     maybee 	ASSERT(dr->dr_txg >= txg - 2);
    666   3547     maybee 	if (db->db_blkid == DB_BONUS_BLKID) {
    667   3547     maybee 		/* Note that the data bufs here are zio_bufs */
    668   3547     maybee 		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
    669   8582    Brendan 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
    670   3547     maybee 		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
    671   3547     maybee 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
    672   3547     maybee 		int size = db->db.db_size;
    673   3547     maybee 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
    674   3547     maybee 		dr->dt.dl.dr_data = arc_buf_alloc(
    675   3547     maybee 		    db->db_dnode->dn_objset->os_spa, size, db, type);
    676   3547     maybee 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
    677   3547     maybee 	} else {
    678   3547     maybee 		dbuf_set_data(db, NULL);
    679    789     ahrens 	}
    680    789     ahrens }
    681    789     ahrens 
    682    789     ahrens void
    683   3547     maybee dbuf_unoverride(dbuf_dirty_record_t *dr)
    684    789     ahrens {
    685   3547     maybee 	dmu_buf_impl_t *db = dr->dr_dbuf;
    686  10922       Jeff 	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
    687   3547     maybee 	uint64_t txg = dr->dr_txg;
    688   3547     maybee 
    689    789     ahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
    690   3547     maybee 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
    691   3547     maybee 	ASSERT(db->db_level == 0);
    692   2237     maybee 
    693   3547     maybee 	if (db->db_blkid == DB_BONUS_BLKID ||
    694   3547     maybee 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
    695   3547     maybee 		return;
    696   3547     maybee 
    697  10922       Jeff 	ASSERT(db->db_data_pending != dr);
    698  10922       Jeff 
    699   3547     maybee 	/* free this block */
    700  10922       Jeff 	if (!BP_IS_HOLE(bp))
    701  10922       Jeff 		dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp);
    702  10922       Jeff 
    703   3547     maybee 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
    704   3547     maybee 	/*
    705   3547     maybee 	 * Release the already-written buffer, so we leave it in
    706   3547     maybee 	 * a consistent dirty state.  Note that all callers are
    707   3547     maybee 	 * modifying the buffer, so they will immediately do
    708   3547     maybee 	 * another (redundant) arc_release().  Therefore, leave
    709   3547     maybee 	 * the buf thawed to save the effort of freezing &
    710   3547     maybee 	 * immediately re-thawing it.
    711   3547     maybee 	 */
    712   3547     maybee 	arc_release(dr->dt.dl.dr_data, db);
    713    789     ahrens }
    714    789     ahrens 
    715   6992     maybee /*
    716   6992     maybee  * Evict (if its unreferenced) or clear (if its referenced) any level-0
    717   6992     maybee  * data blocks in the free range, so that any future readers will find
    718   6992     maybee  * empty blocks.  Also, if we happen accross any level-1 dbufs in the
    719   6992     maybee  * range that have not already been marked dirty, mark them dirty so
    720   6992     maybee  * they stay in memory.
    721   6992     maybee  */
    722    789     ahrens void
    723   6992     maybee dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
    724    789     ahrens {
    725    789     ahrens 	dmu_buf_impl_t *db, *db_next;
    726    789     ahrens 	uint64_t txg = tx->tx_txg;
    727   6992     maybee 	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
    728   6992     maybee 	uint64_t first_l1 = start >> epbs;
    729   6992     maybee 	uint64_t last_l1 = end >> epbs;
    730    789     ahrens 
    731   6992     maybee 	if (end > dn->dn_maxblkid) {
    732   6992     maybee 		end = dn->dn_maxblkid;
    733   6992     maybee 		last_l1 = end >> epbs;
    734   6992     maybee 	}
    735   6992     maybee 	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
    736    789     ahrens 	mutex_enter(&dn->dn_dbufs_mtx);
    737    789     ahrens 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
    738    789     ahrens 		db_next = list_next(&dn->dn_dbufs, db);
    739   1544   eschrock 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
    740   6992     maybee 
    741   6992     maybee 		if (db->db_level == 1 &&
    742   6992     maybee 		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
    743   6992     maybee 			mutex_enter(&db->db_mtx);
    744   6992     maybee 			if (db->db_last_dirty &&
    745   6992     maybee 			    db->db_last_dirty->dr_txg < txg) {
    746   6992     maybee 				dbuf_add_ref(db, FTAG);
    747   6992     maybee 				mutex_exit(&db->db_mtx);
    748   6992     maybee 				dbuf_will_dirty(db, tx);
    749   6992     maybee 				dbuf_rele(db, FTAG);
    750   6992     maybee 			} else {
    751   6992     maybee 				mutex_exit(&db->db_mtx);
    752   6992     maybee 			}
    753   6992     maybee 		}
    754   6992     maybee 
    755   1544   eschrock 		if (db->db_level != 0)
    756    789     ahrens 			continue;
    757    789     ahrens 		dprintf_dbuf(db, "found buf %s\n", "");
    758   6992     maybee 		if (db->db_blkid < start || db->db_blkid > end)
    759    789     ahrens 			continue;
    760    789     ahrens 
    761    789     ahrens 		/* found a level 0 buffer in the range */
    762    789     ahrens 		if (dbuf_undirty(db, tx))
    763    789     ahrens 			continue;
    764    789     ahrens 
    765    789     ahrens 		mutex_enter(&db->db_mtx);
    766   1544   eschrock 		if (db->db_state == DB_UNCACHED ||
    767   7872        Tim 		    db->db_state == DB_NOFILL ||
    768   1544   eschrock 		    db->db_state == DB_EVICTING) {
    769    789     ahrens 			ASSERT(db->db.db_data == NULL);
    770    789     ahrens 			mutex_exit(&db->db_mtx);
    771    789     ahrens 			continue;
    772    789     ahrens 		}
    773   1596     ahrens 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
    774   1596     ahrens 			/* will be handled in dbuf_read_done or dbuf_rele */
    775   3547     maybee 			db->db_freed_in_flight = TRUE;
    776    789     ahrens 			mutex_exit(&db->db_mtx);
    777    789     ahrens 			continue;
    778    789     ahrens 		}
    779   1544   eschrock 		if (refcount_count(&db->db_holds) == 0) {
    780   1544   eschrock 			ASSERT(db->db_buf);
    781   1544   eschrock 			dbuf_clear(db);
    782   1544   eschrock 			continue;
    783   1544   eschrock 		}
    784   3547     maybee 		/* The dbuf is referenced */
    785    789     ahrens 
    786   3547     maybee 		if (db->db_last_dirty != NULL) {
    787   3547     maybee 			dbuf_dirty_record_t *dr = db->db_last_dirty;
    788   3547     maybee 
    789   3547     maybee 			if (dr->dr_txg == txg) {
    790   2688     maybee 				/*
    791   3547     maybee 				 * This buffer is "in-use", re-adjust the file
    792   3547     maybee 				 * size to reflect that this buffer may
    793   3547     maybee 				 * contain new data when we sync.
    794   2688     maybee 				 */
    795   3547     maybee 				if (db->db_blkid > dn->dn_maxblkid)
    796   3547     maybee 					dn->dn_maxblkid = db->db_blkid;
    797   3547     maybee 				dbuf_unoverride(dr);
    798   3547     maybee 			} else {
    799   3547     maybee 				/*
    800   3547     maybee 				 * This dbuf is not dirty in the open context.
    801   3547     maybee 				 * Either uncache it (if its not referenced in
    802   3547     maybee 				 * the open context) or reset its contents to
    803   3547     maybee 				 * empty.
    804   3547     maybee 				 */
    805   3547     maybee 				dbuf_fix_old_data(db, txg);
    806   2688     maybee 			}
    807   1544   eschrock 		}
    808   3547     maybee 		/* clear the contents if its cached */
    809   1544   eschrock 		if (db->db_state == DB_CACHED) {
    810   1544   eschrock 			ASSERT(db->db.db_data != NULL);
    811    789     ahrens 			arc_release(db->db_buf, db);
    812    789     ahrens 			bzero(db->db.db_data, db->db.db_size);
    813   3093     ahrens 			arc_buf_freeze(db->db_buf);
    814    789     ahrens 		}
    815   1544   eschrock 
    816    789     ahrens 		mutex_exit(&db->db_mtx);
    817    789     ahrens 	}
    818    789     ahrens 	mutex_exit(&dn->dn_dbufs_mtx);
    819    789     ahrens }
    820    789     ahrens 
    821    789     ahrens static int
    822   4944     maybee dbuf_block_freeable(dmu_buf_impl_t *db)
    823    789     ahrens {
    824    789     ahrens 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
    825    789     ahrens 	uint64_t birth_txg = 0;
    826    789     ahrens 
    827    789     ahrens 	/*
    828    789     ahrens 	 * We don't need any locking to protect db_blkptr:
    829   3547     maybee 	 * If it's syncing, then db_last_dirty will be set
    830   3547     maybee 	 * so we'll ignore db_blkptr.
    831    789     ahrens 	 */
    832   3547     maybee 	ASSERT(MUTEX_HELD(&db->db_mtx));
    833   3547     maybee 	if (db->db_last_dirty)
    834   3547     maybee 		birth_txg = db->db_last_dirty->dr_txg;
    835    789     ahrens 	else if (db->db_blkptr)
    836    789     ahrens 		birth_txg = db->db_blkptr->blk_birth;
    837    789     ahrens 
    838   4944     maybee 	/* If we don't exist or are in a snapshot, we can't be freed */
    839    789     ahrens 	if (birth_txg)
    840   4944     maybee 		return (ds == NULL ||
    841   4944     maybee 		    dsl_dataset_block_freeable(ds, birth_txg));
    842    789     ahrens 	else
    843   4944     maybee 		return (FALSE);
    844    789     ahrens }
    845    789     ahrens 
    846    789     ahrens void
    847    789     ahrens dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
    848    789     ahrens {
    849    789     ahrens 	arc_buf_t *buf, *obuf;
    850    789     ahrens 	int osize = db->db.db_size;
    851   3290   johansen 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
    852   1544   eschrock 
    853   1544   eschrock 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
    854    789     ahrens 
    855    789     ahrens 	/* XXX does *this* func really need the lock? */
    856    789     ahrens 	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
    857    789     ahrens 
    858    789     ahrens 	/*
    859    789     ahrens 	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
    860    789     ahrens 	 * is OK, because there can be no other references to the db
    861    789     ahrens 	 * when we are changing its size, so no concurrent DB_FILL can
    862    789     ahrens 	 * be happening.
    863    789     ahrens 	 */
    864   1544   eschrock 	/*
    865   1544   eschrock 	 * XXX we should be doing a dbuf_read, checking the return
    866   1544   eschrock 	 * value and returning that up to our callers
    867   1544   eschrock 	 */
    868    789     ahrens 	dbuf_will_dirty(db, tx);
    869    789     ahrens 
    870    789     ahrens 	/* create the data buffer for the new block */
    871   3290   johansen 	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
    872    789     ahrens 
    873    789     ahrens 	/* copy old block data to the new block */
    874    789     ahrens 	obuf = db->db_buf;
    875   1491     ahrens 	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
    876    789     ahrens 	/* zero the remainder */
    877   1491     ahrens 	if (size > osize)
    878   1491     ahrens 		bzero((uint8_t *)buf->b_data + osize, size - osize);
    879    789     ahrens 
    880    789     ahrens 	mutex_enter(&db->db_mtx);
    881    789     ahrens 	dbuf_set_data(db, buf);
    882   1544   eschrock 	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
    883    789     ahrens 	db->db.db_size = size;
    884    789     ahrens 
    885   3547     maybee 	if (db->db_level == 0) {
    886   3547     maybee 		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
    887   3547     maybee 		db->db_last_dirty->dt.dl.dr_data = buf;
    888   3547     maybee 	}
    889    789     ahrens 	mutex_exit(&db->db_mtx);
    890    789     ahrens 
    891    789     ahrens 	dnode_willuse_space(db->db_dnode, size-osize, tx);
    892    789     ahrens }
    893    789     ahrens 
    894   3547     maybee dbuf_dirty_record_t *
    895    789     ahrens dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
    896    789     ahrens {
    897    789     ahrens 	dnode_t *dn = db->db_dnode;
    898  10298    Matthew 	objset_t *os = dn->dn_objset;
    899   3547     maybee 	dbuf_dirty_record_t **drp, *dr;
    900    789     ahrens 	int drop_struct_lock = FALSE;
    901   7467       Mark 	boolean_t do_free_accounting = B_FALSE;
    902    789     ahrens 	int txgoff = tx->tx_txg & TXG_MASK;
    903    789     ahrens 
    904    789     ahrens 	ASSERT(tx->tx_txg != 0);
    905    789     ahrens 	ASSERT(!refcount_is_zero(&db->db_holds));
    906    873   ek110237 	DMU_TX_DIRTY_BUF(tx, db);
    907    789     ahrens 
    908    789     ahrens 	/*
    909    789     ahrens 	 * Shouldn't dirty a regular buffer in syncing context.  Private
    910    789     ahrens 	 * objects may be dirtied in syncing context, but only if they
    911    789     ahrens 	 * were already pre-dirtied in open context.
    912    789     ahrens 	 */
    913   3547     maybee 	ASSERT(!dmu_tx_is_syncing(tx) ||
    914   3547     maybee 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
    915   9396    Matthew 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
    916   9396    Matthew 	    dn->dn_objset->os_dsl_dataset == NULL);
    917    789     ahrens 	/*
    918    789     ahrens 	 * We make this assert for private objects as well, but after we
    919    789     ahrens 	 * check if we're already dirty.  They are allowed to re-dirty
    920    789     ahrens 	 * in syncing context.
    921    789     ahrens 	 */
    922   1544   eschrock 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
    923   3547     maybee 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
    924    789     ahrens 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
    925    789     ahrens 
    926    789     ahrens 	mutex_enter(&db->db_mtx);
    927    789     ahrens 	/*
    928   3547     maybee 	 * XXX make this true for indirects too?  The problem is that
    929   3547     maybee 	 * transactions created with dmu_tx_create_assigned() from
    930   3547     maybee 	 * syncing context don't bother holding ahead.
    931    789     ahrens 	 */
    932   3547     maybee 	ASSERT(db->db_level != 0 ||
    933   7872        Tim 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
    934   7872        Tim 	    db->db_state == DB_NOFILL);
    935    789     ahrens 
    936    789     ahrens 	mutex_enter(&dn->dn_mtx);
    937    789     ahrens 	/*
    938    789     ahrens 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
    939    789     ahrens 	 * initialize the objset.
    940    789     ahrens 	 */
    941    789     ahrens 	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
    942   3547     maybee 	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
    943    789     ahrens 		dn->dn_dirtyctx =
    944    789     ahrens 		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
    945    789     ahrens 		ASSERT(dn->dn_dirtyctx_firstset == NULL);
    946    789     ahrens 		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
    947    789     ahrens 	}
    948    789     ahrens 	mutex_exit(&dn->dn_mtx);
    949    789     ahrens 
    950    789     ahrens 	/*
    951    789     ahrens 	 * If this buffer is already dirty, we're done.
    952    789     ahrens 	 */
    953   3547     maybee 	drp = &db->db_last_dirty;
    954   3547     maybee 	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
    955   3547     maybee 	    db->db.db_object == DMU_META_DNODE_OBJECT);
    956   5370    bonwick 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
    957   5370    bonwick 		drp = &dr->dr_next;
    958   5370    bonwick 	if (dr && dr->dr_txg == tx->tx_txg) {
    959   3547     maybee 		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
    960   3547     maybee 			/*
    961   3547     maybee 			 * If this buffer has already been written out,
    962   3547     maybee 			 * we now need to reset its state.
    963   3547     maybee 			 */
    964   5370    bonwick 			dbuf_unoverride(dr);
    965  10922       Jeff 			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
    966  10922       Jeff 			    db->db_state != DB_NOFILL)
    967   3547     maybee 				arc_buf_thaw(db->db_buf);
    968   3547     maybee 		}
    969    789     ahrens 		mutex_exit(&db->db_mtx);
    970   5370    bonwick 		return (dr);
    971    789     ahrens 	}
    972    789     ahrens 
    973    789     ahrens 	/*
    974    789     ahrens 	 * Only valid if not already dirty.
    975    789     ahrens 	 */
    976   9396    Matthew 	ASSERT(dn->dn_object == 0 ||
    977   9396    Matthew 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
    978    789     ahrens 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
    979    789     ahrens 
    980    789     ahrens 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
    981    789     ahrens 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
    982    789     ahrens 	    dn->dn_phys->dn_nlevels > db->db_level ||
    983    789     ahrens 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
    984    789     ahrens 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
    985    789     ahrens 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
    986    789     ahrens 
    987    789     ahrens 	/*
    988    789     ahrens 	 * We should only be dirtying in syncing context if it's the
    989   9396    Matthew 	 * mos or we're initializing the os or it's a special object.
    990   9396    Matthew 	 * However, we are allowed to dirty in syncing context provided
    991   9396    Matthew 	 * we already dirtied it in open context.  Hence we must make
    992   9396    Matthew 	 * this assertion only if we're not already dirty.
    993    789     ahrens 	 */
    994   9396    Matthew 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
    995   9396    Matthew 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
    996    789     ahrens 	ASSERT(db->db.db_size != 0);
    997    789     ahrens 
    998    789     ahrens 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
    999    789     ahrens 
   1000   4944     maybee 	if (db->db_blkid != DB_BONUS_BLKID) {
   1001   4944     maybee 		/*
   1002   4944     maybee 		 * Update the accounting.
   1003   7467       Mark 		 * Note: we delay "free accounting" until after we drop
   1004   7467       Mark 		 * the db_mtx.  This keeps us from grabbing other locks
   1005  10922       Jeff 		 * (and possibly deadlocking) in bp_get_dsize() while
   1006   7467       Mark 		 * also holding the db_mtx.
   1007   4944     maybee 		 */
   1008   4944     maybee 		dnode_willuse_space(dn, db->db.db_size, tx);
   1009   7467       Mark 		do_free_accounting = dbuf_block_freeable(db);
   1010   4944     maybee 	}
   1011   4944     maybee 
   1012   1544   eschrock 	/*
   1013   1544   eschrock 	 * If this buffer is dirty in an old transaction group we need
   1014   1544   eschrock 	 * to make a copy of it so that the changes we make in this
   1015   1544   eschrock 	 * transaction group won't leak out when we sync the older txg.
   1016   1544   eschrock 	 */
   1017   3547     maybee 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
   1018   3547     maybee 	if (db->db_level == 0) {
   1019   3547     maybee 		void *data_old = db->db_buf;
   1020   3547     maybee 
   1021   7872        Tim 		if (db->db_state != DB_NOFILL) {
   1022   7872        Tim 			if (db->db_blkid == DB_BONUS_BLKID) {
   1023   7872        Tim 				dbuf_fix_old_data(db, tx->tx_txg);
   1024   7872        Tim 				data_old = db->db.db_data;
   1025   7872        Tim 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
   1026   7872        Tim 				/*
   1027   7872        Tim 				 * Release the data buffer from the cache so
   1028   7872        Tim 				 * that we can modify it without impacting
   1029   7872        Tim 				 * possible other users of this cached data
   1030   7872        Tim 				 * block.  Note that indirect blocks and
   1031   7872        Tim 				 * private objects are not released until the
   1032   7872        Tim 				 * syncing state (since they are only modified
   1033   7872        Tim 				 * then).
   1034   7872        Tim 				 */
   1035   7872        Tim 				arc_release(db->db_buf, db);
   1036   7872        Tim 				dbuf_fix_old_data(db, tx->tx_txg);
   1037   7872        Tim 				data_old = db->db_buf;
   1038   7872        Tim 			}
   1039   7872        Tim 			ASSERT(data_old != NULL);
   1040    789     ahrens 		}
   1041   3547     maybee 		dr->dt.dl.dr_data = data_old;
   1042   3547     maybee 	} else {
   1043   3547     maybee 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
   1044   3547     maybee 		list_create(&dr->dt.di.dr_children,
   1045   3547     maybee 		    sizeof (dbuf_dirty_record_t),
   1046   3547     maybee 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
   1047    789     ahrens 	}
   1048   3547     maybee 	dr->dr_dbuf = db;
   1049   3547     maybee 	dr->dr_txg = tx->tx_txg;
   1050   3547     maybee 	dr->dr_next = *drp;
   1051   3547     maybee 	*drp = dr;
   1052    789     ahrens 
   1053    789     ahrens 	/*
   1054    789     ahrens 	 * We could have been freed_in_flight between the dbuf_noread
   1055    789     ahrens 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
   1056    789     ahrens 	 * happened after the free.
   1057    789     ahrens 	 */
   1058    789     ahrens 	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
   1059   3547     maybee 		mutex_enter(&dn->dn_mtx);
   1060    789     ahrens 		dnode_clear_range(dn, db->db_blkid, 1, tx);
   1061   3547     maybee 		mutex_exit(&dn->dn_mtx);
   1062   3547     maybee 		db->db_freed_in_flight = FALSE;
   1063    789     ahrens 	}
   1064    789     ahrens 
   1065    789     ahrens 	/*
   1066    789     ahrens 	 * This buffer is now part of this txg
   1067    789     ahrens 	 */
   1068    789     ahrens 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
   1069    789     ahrens 	db->db_dirtycnt += 1;
   1070    789     ahrens 	ASSERT3U(db->db_dirtycnt, <=, 3);
   1071    789     ahrens 
   1072    789     ahrens 	mutex_exit(&db->db_mtx);
   1073    789     ahrens 
   1074    789     ahrens 	if (db->db_blkid == DB_BONUS_BLKID) {
   1075   3547     maybee 		mutex_enter(&dn->dn_mtx);
   1076   3547     maybee 		ASSERT(!list_link_active(&dr->dr_dirty_node));
   1077   3547     maybee 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
   1078   3547     maybee 		mutex_exit(&dn->dn_mtx);
   1079    789     ahrens 		dnode_setdirty(dn, tx);
   1080   3547     maybee 		return (dr);
   1081   7467       Mark 	} else if (do_free_accounting) {
   1082   7467       Mark 		blkptr_t *bp = db->db_blkptr;
   1083   7467       Mark 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
   1084  10922       Jeff 		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
   1085   7467       Mark 		/*
   1086   7467       Mark 		 * This is only a guess -- if the dbuf is dirty
   1087   7467       Mark 		 * in a previous txg, we don't know how much
   1088   7467       Mark 		 * space it will use on disk yet.  We should
   1089   7467       Mark 		 * really have the struct_rwlock to access
   1090   7467       Mark 		 * db_blkptr, but since this is just a guess,
   1091   7467       Mark 		 * it's OK if we get an odd answer.
   1092   7467       Mark 		 */
   1093   7467       Mark 		dnode_willuse_space(dn, -willfree, tx);
   1094    789     ahrens 	}
   1095    789     ahrens 
   1096    789     ahrens 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
   1097    789     ahrens 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
   1098    789     ahrens 		drop_struct_lock = TRUE;
   1099   7332   Jonathan 	}
   1100   7332   Jonathan 
   1101   7332   Jonathan 	if (db->db_level == 0) {
   1102   7332   Jonathan 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
   1103   7332   Jonathan 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
   1104    789     ahrens 	}
   1105    789     ahrens 
   1106   2688     maybee 	if (db->db_level+1 < dn->dn_nlevels) {
   1107   3547     maybee 		dmu_buf_impl_t *parent = db->db_parent;
   1108   3547     maybee 		dbuf_dirty_record_t *di;
   1109   3547     maybee 		int parent_held = FALSE;
   1110   3547     maybee 
   1111   3547     maybee 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
   1112   3547     maybee 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
   1113   3547     maybee 
   1114   3547     maybee 			parent = dbuf_hold_level(dn, db->db_level+1,
   1115   3547     maybee 			    db->db_blkid >> epbs, FTAG);
   1116   3547     maybee 			parent_held = TRUE;
   1117   3547     maybee 		}
   1118    789     ahrens 		if (drop_struct_lock)
   1119    789     ahrens 			rw_exit(&dn->dn_struct_rwlock);
   1120   3547     maybee 		ASSERT3U(db->db_level+1, ==, parent->db_level);
   1121   3547     maybee 		di = dbuf_dirty(parent, tx);
   1122   3547     maybee 		if (parent_held)
   1123   3547     maybee 			dbuf_rele(parent, FTAG);
   1124   3547     maybee 
   1125   3547     maybee 		mutex_enter(&db->db_mtx);
   1126   3547     maybee 		/*  possible race with dbuf_undirty() */
   1127   3547     maybee 		if (db->db_last_dirty == dr ||
   1128   3547     maybee 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
   1129   3547     maybee 			mutex_enter(&di->dt.di.dr_mtx);
   1130   3547     maybee 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
   1131   3547     maybee 			ASSERT(!list_link_active(&dr->dr_dirty_node));
   1132   3547     maybee 			list_insert_tail(&di->dt.di.dr_children, dr);
   1133   3547     maybee 			mutex_exit(&di->dt.di.dr_mtx);
   1134   3547     maybee 			dr->dr_parent = di;
   1135   3547     maybee 		}
   1136   3547     maybee 		mutex_exit(&db->db_mtx);
   1137    789     ahrens 	} else {
   1138   3547     maybee 		ASSERT(db->db_level+1 == dn->dn_nlevels);
   1139   3547     maybee 		ASSERT(db->db_blkid < dn->dn_nblkptr);
   1140   3547     maybee 		ASSERT(db->db_parent == NULL ||
   1141   3547     maybee 		    db->db_parent == db->db_dnode->dn_dbuf);
   1142   3547     maybee 		mutex_enter(&dn->dn_mtx);
   1143   3547     maybee 		ASSERT(!list_link_active(&dr->dr_dirty_node));
   1144   3547     maybee 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
   1145   3547     maybee 		mutex_exit(&dn->dn_mtx);
   1146    789     ahrens 		if (drop_struct_lock)
   1147    789     ahrens 			rw_exit(&dn->dn_struct_rwlock);
   1148    789     ahrens 	}
   1149    789     ahrens 
   1150    789     ahrens 	dnode_setdirty(dn, tx);
   1151   3547     maybee 	return (dr);
   1152    789     ahrens }
   1153    789     ahrens 
   1154    789     ahrens static int
   1155    789     ahrens dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
   1156    789     ahrens {
   1157    789     ahrens 	dnode_t *dn = db->db_dnode;
   1158   3547     maybee 	uint64_t txg = tx->tx_txg;
   1159   5688    bonwick 	dbuf_dirty_record_t *dr, **drp;
   1160    789     ahrens 
   1161   3547     maybee 	ASSERT(txg != 0);
   1162   1544   eschrock 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
   1163    789     ahrens 
   1164    789     ahrens 	mutex_enter(&db->db_mtx);
   1165    789     ahrens 
   1166    789     ahrens 	/*
   1167    789     ahrens 	 * If this buffer is not dirty, we're done.
   1168    789     ahrens 	 */
   1169   5688    bonwick 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
   1170   3547     maybee 		if (dr->dr_txg <= txg)
   1171   3547     maybee 			break;
   1172   3547     maybee 	if (dr == NULL || dr->dr_txg < txg) {
   1173    789     ahrens 		mutex_exit(&db->db_mtx);
   1174    789     ahrens 		return (0);
   1175    789     ahrens 	}
   1176   3547     maybee 	ASSERT(dr->dr_txg == txg);
   1177  10922       Jeff 	ASSERT(dr->dr_dbuf == db);
   1178    789     ahrens 
   1179    789     ahrens 	/*
   1180    789     ahrens 	 * If this buffer is currently held, we cannot undirty
   1181    789     ahrens 	 * it, since one of the current holders may be in the
   1182    789     ahrens 	 * middle of an update.  Note that users of dbuf_undirty()
   1183    789     ahrens 	 * should not place a hold on the dbuf before the call.
   1184    789     ahrens 	 */
   1185    789     ahrens 	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
   1186    789     ahrens 		mutex_exit(&db->db_mtx);
   1187   2688     maybee 		/* Make sure we don't toss this buffer at sync phase */
   1188    789     ahrens 		mutex_enter(&dn->dn_mtx);
   1189    789     ahrens 		dnode_clear_range(dn, db->db_blkid, 1, tx);
   1190    789     ahrens 		mutex_exit(&dn->dn_mtx);
   1191    789     ahrens 		return (0);
   1192    789     ahrens 	}
   1193    789     ahrens 
   1194    789     ahrens 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
   1195    789     ahrens 
   1196   3547     maybee 	ASSERT(db->db.db_size != 0);
   1197    789     ahrens 
   1198   3547     maybee 	/* XXX would be nice to fix up dn_towrite_space[] */
   1199   3547     maybee 
   1200   5688    bonwick 	*drp = dr->dr_next;
   1201   3547     maybee 
   1202   3547     maybee 	if (dr->dr_parent) {
   1203   3547     maybee 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
   1204   3547     maybee 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
   1205   3547     maybee 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
   1206   3547     maybee 	} else if (db->db_level+1 == dn->dn_nlevels) {
   1207   6992     maybee 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
   1208   3547     maybee 		mutex_enter(&dn->dn_mtx);
   1209   3547     maybee 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
   1210   3547     maybee 		mutex_exit(&dn->dn_mtx);
   1211    789     ahrens 	}
   1212    789     ahrens 
   1213   3547     maybee 	if (db->db_level == 0) {
   1214   7872        Tim 		if (db->db_state != DB_NOFILL) {
   1215   7872        Tim 			dbuf_unoverride(dr);
   1216    789     ahrens 
   1217   7872        Tim 			ASSERT(db->db_buf != NULL);
   1218   7872        Tim 			ASSERT(dr->dt.dl.dr_data != NULL);
   1219   7872        Tim 			if (dr->dt.dl.dr_data != db->db_buf)
   1220   7872        Tim 				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
   1221   7872        Tim 				    db) == 1);
   1222   7872        Tim 		}
   1223   3547     maybee 	} else {
   1224   3547     maybee 		ASSERT(db->db_buf != NULL);
   1225   3547     maybee 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
   1226   4831    gw25295 		mutex_destroy(&dr->dt.di.dr_mtx);
   1227   4831    gw25295 		list_destroy(&dr->dt.di.dr_children);
   1228   3547     maybee 	}
   1229   3547     maybee 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
   1230    789     ahrens 
   1231    789     ahrens 	ASSERT(db->db_dirtycnt > 0);
   1232    789     ahrens 	db->db_dirtycnt -= 1;
   1233    789     ahrens 
   1234   3547     maybee 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
   1235   1544   eschrock 		arc_buf_t *buf = db->db_buf;
   1236    789     ahrens 
   1237  10922       Jeff 		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
   1238   1544   eschrock 		dbuf_set_data(db, NULL);
   1239   1544   eschrock 		VERIFY(arc_buf_remove_ref(buf, db) == 1);
   1240    789     ahrens 		dbuf_evict(db);
   1241    789     ahrens 		return (1);
   1242    789     ahrens 	}
   1243    789     ahrens 
   1244    789     ahrens 	mutex_exit(&db->db_mtx);
   1245    789     ahrens 	return (0);
   1246    789     ahrens }
   1247    789     ahrens 
   1248    789     ahrens #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
   1249    789     ahrens void
   1250    789     ahrens dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
   1251    789     ahrens {
   1252   6245     maybee 	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
   1253    789     ahrens 
   1254    789     ahrens 	ASSERT(tx->tx_txg != 0);
   1255    789     ahrens 	ASSERT(!refcount_is_zero(&db->db_holds));
   1256    789     ahrens 
   1257    789     ahrens 	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
   1258    789     ahrens 		rf |= DB_RF_HAVESTRUCT;
   1259   1544   eschrock 	(void) dbuf_read(db, NULL, rf);
   1260   3547     maybee 	(void) dbuf_dirty(db, tx);
   1261   7872        Tim }
   1262   7872        Tim 
   1263   7872        Tim void
   1264   7872        Tim dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
   1265   7872        Tim {
   1266   7872        Tim 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
   1267   7872        Tim 
   1268   7872        Tim 	db->db_state = DB_NOFILL;
   1269   7872        Tim 
   1270   7872        Tim 	dmu_buf_will_fill(db_fake, tx);
   1271    789     ahrens }
   1272    789     ahrens 
   1273    789     ahrens void
   1274   1544   eschrock dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
   1275    789     ahrens {
   1276   1544   eschrock 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
   1277   1544   eschrock 
   1278   1544   eschrock 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
   1279    789     ahrens 	ASSERT(tx->tx_txg != 0);
   1280    789     ahrens 	ASSERT(db->db_level == 0);
   1281    789     ahrens 	ASSERT(!refcount_is_zero(&db->db_holds));
   1282    789     ahrens 
   1283   1544   eschrock 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
   1284    789     ahrens 	    dmu_tx_private_ok(tx));
   1285    789     ahrens 
   1286    789     ahrens 	dbuf_noread(db);
   1287   3547     maybee 	(void) dbuf_dirty(db, tx);
   1288    789     ahrens }
   1289    789     ahrens 
   1290    789     ahrens #pragma weak dmu_buf_fill_done = dbuf_fill_done
   1291    789     ahrens /* ARGSUSED */
   1292    789     ahrens void
   1293    789     ahrens dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
   1294    789     ahrens {
   1295    789     ahrens 	mutex_enter(&db->db_mtx);
   1296    873   ek110237 	DBUF_VERIFY(db);
   1297    789     ahrens 
   1298    789     ahrens 	if (db->db_state == DB_FILL) {
   1299   3547     maybee 		if (db->db_level == 0 && db->db_freed_in_flight) {
   1300   1544   eschrock 			ASSERT(db->db_blkid != DB_BONUS_BLKID);
   1301    789     ahrens 			/* we were freed while filling */
   1302    789     ahrens 			/* XXX dbuf_undirty? */
   1303    789     ahrens 			bzero(db->db.db_data, db->db.db_size);
   1304   3547     maybee 			db->db_freed_in_flight = FALSE;
   1305    789     ahrens 		}
   1306    789     ahrens 		db->db_state = DB_CACHED;
   1307    789     ahrens 		cv_broadcast(&db->db_changed);
   1308    789     ahrens 	}
   1309    789     ahrens 	mutex_exit(&db->db_mtx);
   1310    789     ahrens }
   1311    789     ahrens 
   1312   1544   eschrock /*
   1313   9412  Aleksandr  * Directly assign a provided arc buf to a given dbuf if it's not referenced
   1314   9412  Aleksandr  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
   1315   9412  Aleksandr  */
   1316   9412  Aleksandr void
   1317   9412  Aleksandr dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
   1318   9412  Aleksandr {
   1319   9412  Aleksandr 	ASSERT(!refcount_is_zero(&db->db_holds));
   1320   9412  Aleksandr 	ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
   1321   9412  Aleksandr 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
   1322   9412  Aleksandr 	ASSERT(db->db_level == 0);
   1323   9412  Aleksandr 	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
   1324   9412  Aleksandr 	ASSERT(buf != NULL);
   1325   9412  Aleksandr 	ASSERT(arc_buf_size(buf) == db->db.db_size);
   1326   9412  Aleksandr 	ASSERT(tx->tx_txg != 0);
   1327   9412  Aleksandr 
   1328   9412  Aleksandr 	arc_return_buf(buf, db);
   1329   9412  Aleksandr 	ASSERT(arc_released(buf));
   1330   9412  Aleksandr 
   1331   9412  Aleksandr 	mutex_enter(&db->db_mtx);
   1332   9412  Aleksandr 
   1333   9412  Aleksandr 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
   1334   9412  Aleksandr 		cv_wait(&db->db_changed, &db->db_mtx);
   1335   9412  Aleksandr 
   1336   9412  Aleksandr 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
   1337   9412  Aleksandr 
   1338   9412  Aleksandr 	if (db->db_state == DB_CACHED &&
   1339   9412  Aleksandr 	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
   1340   9412  Aleksandr 		mutex_exit(&db->db_mtx);
   1341   9412  Aleksandr 		(void) dbuf_dirty(db, tx);
   1342   9412  Aleksandr 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
   1343   9412  Aleksandr 		VERIFY(arc_buf_remove_ref(buf, db) == 1);
   1344   9412  Aleksandr 		return;
   1345   9412  Aleksandr 	}
   1346   9412  Aleksandr 
   1347   9412  Aleksandr 	if (db->db_state == DB_CACHED) {
   1348   9412  Aleksandr 		dbuf_dirty_record_t *dr = db->db_last_dirty;
   1349   9412  Aleksandr 
   1350   9412  Aleksandr 		ASSERT(db->db_buf != NULL);
   1351   9412  Aleksandr 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
   1352   9412  Aleksandr 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
   1353   9412  Aleksandr 			if (!arc_released(db->db_buf)) {
   1354   9412  Aleksandr 				ASSERT(dr->dt.dl.dr_override_state ==
   1355   9412  Aleksandr 				    DR_OVERRIDDEN);
   1356   9412  Aleksandr 				arc_release(db->db_buf, db);
   1357   9412  Aleksandr 			}
   1358   9412  Aleksandr 			dr->dt.dl.dr_data = buf;
   1359   9412  Aleksandr 			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
   1360   9412  Aleksandr 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
   1361   9412  Aleksandr 			arc_release(db->db_buf, db);
   1362   9412  Aleksandr 			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
   1363   9412  Aleksandr 		}
   1364   9412  Aleksandr 		db->db_buf = NULL;
   1365   9412  Aleksandr 	}
   1366   9412  Aleksandr 	ASSERT(db->db_buf == NULL);
   1367   9412  Aleksandr 	dbuf_set_data(db, buf);
   1368   9412  Aleksandr 	db->db_state = DB_FILL;
   1369   9412  Aleksandr 	mutex_exit(&db->db_mtx);
   1370   9412  Aleksandr 	(void) dbuf_dirty(db, tx);
   1371   9412  Aleksandr 	dbuf_fill_done(db, tx);
   1372   9412  Aleksandr }
   1373   9412  Aleksandr 
   1374   9412  Aleksandr /*
   1375   1544   eschrock  * "Clear" the contents of this dbuf.  This will mark the dbuf
   1376   1544   eschrock  * EVICTING and clear *most* of its references.  Unfortunetely,
   1377   1544   eschrock  * when we are not holding the dn_dbufs_mtx, we can't clear the
   1378   1544   eschrock  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
   1379   1544   eschrock  * in this case.  For callers from the DMU we will usually see:
   1380   1544   eschrock  *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
   1381   1544   eschrock  * For the arc callback, we will usually see:
   1382   1544   eschrock  * 	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
   1383   1544   eschrock  * Sometimes, though, we will get a mix of these two:
   1384   1544   eschrock  *	DMU: dbuf_clear()->arc_buf_evict()
   1385   1544   eschrock  *	ARC: dbuf_do_evict()->dbuf_destroy()
   1386   1544   eschrock  */
   1387   1544   eschrock void
   1388    789     ahrens dbuf_clear(dmu_buf_impl_t *db)
   1389    789     ahrens {
   1390    789     ahrens 	dnode_t *dn = db->db_dnode;
   1391   1544   eschrock 	dmu_buf_impl_t *parent = db->db_parent;
   1392   1596     ahrens 	dmu_buf_impl_t *dndb = dn->dn_dbuf;
   1393   1544   eschrock 	int dbuf_gone = FALSE;
   1394    789     ahrens 
   1395    789     ahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
   1396    789     ahrens 	ASSERT(refcount_is_zero(&db->db_holds));
   1397    789     ahrens 
   1398   1544   eschrock 	dbuf_evict_user(db);
   1399   1544   eschrock 
   1400    789     ahrens 	if (db->db_state == DB_CACHED) {
   1401   1544   eschrock 		ASSERT(db->db.db_data != NULL);
   1402   4309     maybee 		if (db->db_blkid == DB_BONUS_BLKID) {
   1403   1544   eschrock 			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
   1404   8582    Brendan 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
   1405   4309     maybee 		}
   1406    789     ahrens 		db->db.db_data = NULL;
   1407    789     ahrens 		db->db_state = DB_UNCACHED;
   1408    789     ahrens 	}
   1409    789     ahrens 
   1410   7872        Tim 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
   1411    789     ahrens 	ASSERT(db->db_data_pending == NULL);
   1412    789     ahrens 
   1413   1544   eschrock 	db->db_state = DB_EVICTING;
   1414   1544   eschrock 	db->db_blkptr = NULL;
   1415   1544   eschrock 
   1416   1544   eschrock 	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
   1417   1544   eschrock 		list_remove(&dn->dn_dbufs, db);
   1418   1544   eschrock 		dnode_rele(dn, db);
   1419   4944     maybee 		db->db_dnode = NULL;
   1420   1544   eschrock 	}
   1421   1544   eschrock 
   1422   1544   eschrock 	if (db->db_buf)
   1423   1544   eschrock 		dbuf_gone = arc_buf_evict(db->db_buf);
   1424   1544   eschrock 
   1425   1544   eschrock 	if (!dbuf_gone)
   1426   1544   eschrock 		mutex_exit(&db->db_mtx);
   1427    789     ahrens 
   1428    789     ahrens 	/*
   1429    789     ahrens 	 * If this dbuf is referened from an indirect dbuf,
   1430    789     ahrens 	 * decrement the ref count on the indirect dbuf.
   1431    789     ahrens 	 */
   1432   1596     ahrens 	if (parent && parent != dndb)
   1433   1544   eschrock 		dbuf_rele(parent, db);
   1434    789     ahrens }
   1435    789     ahrens 
   1436    789     ahrens static int
   1437    789     ahrens dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
   1438    789     ahrens     dmu_buf_impl_t **parentp, blkptr_t **bpp)
   1439    789     ahrens {
   1440    789     ahrens 	int nlevels, epbs;
   1441   1544   eschrock 
   1442   2417     ahrens 	*parentp = NULL;
   1443   2417     ahrens 	*bpp = NULL;
   1444   2417     ahrens 
   1445   1544   eschrock 	ASSERT(blkid != DB_BONUS_BLKID);
   1446    789     ahrens 
   1447    789     ahrens 	if (dn->dn_phys->dn_nlevels == 0)
   1448    789     ahrens 		nlevels = 1;
   1449    789     ahrens 	else
   1450    789     ahrens 		nlevels = dn->dn_phys->dn_nlevels;
   1451    789     ahrens 
   1452    789     ahrens 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
   1453    789     ahrens 
   1454    789     ahrens 	ASSERT3U(level * epbs, <, 64);
   1455    789     ahrens 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
   1456   1544   eschrock 	if (level >= nlevels ||
   1457    789     ahrens 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
   1458    789     ahrens 		/* the buffer has no parent yet */
   1459    789     ahrens 		return (ENOENT);
   1460    789     ahrens 	} else if (level < nlevels-1) {
   1461    789     ahrens 		/* this block is referenced from an indirect block */
   1462    789     ahrens 		int err = dbuf_hold_impl(dn, level+1,
   1463    789     ahrens 		    blkid >> epbs, fail_sparse, NULL, parentp);
   1464    789     ahrens 		if (err)
   1465    789     ahrens 			return (err);
   1466   1544   eschrock 		err = dbuf_read(*parentp, NULL,
   1467   1544   eschrock 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
   1468   1596     ahrens 		if (err) {
   1469   1596     ahrens 			dbuf_rele(*parentp, NULL);
   1470   1596     ahrens 			*parentp = NULL;
   1471   1596     ahrens 			return (err);
   1472   1544   eschrock 		}
   1473   1596     ahrens 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
   1474   1596     ahrens 		    (blkid & ((1ULL << epbs) - 1));
   1475   1596     ahrens 		return (0);
   1476    789     ahrens 	} else {
   1477    789     ahrens 		/* the block is referenced from the dnode */
   1478    789     ahrens 		ASSERT3U(level, ==, nlevels-1);
   1479    789     ahrens 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
   1480    789     ahrens 		    blkid < dn->dn_phys->dn_nblkptr);
   1481   1596     ahrens 		if (dn->dn_dbuf) {
   1482   1596     ahrens 			dbuf_add_ref(dn->dn_dbuf, NULL);
   1483   1596     ahrens 			*parentp = dn->dn_dbuf;
   1484   1596     ahrens 		}
   1485    789     ahrens 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
   1486    789     ahrens 		return (0);
   1487    789     ahrens 	}
   1488    789     ahrens }
   1489    789     ahrens 
   1490    789     ahrens static dmu_buf_impl_t *
   1491    789     ahrens dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
   1492    789     ahrens     dmu_buf_impl_t *parent, blkptr_t *blkptr)
   1493    789     ahrens {
   1494  10298    Matthew 	objset_t *os = dn->dn_objset;
   1495    789     ahrens 	dmu_buf_impl_t *db, *odb;
   1496    789     ahrens 
   1497    789     ahrens 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
   1498    789     ahrens 	ASSERT(dn->dn_type != DMU_OT_NONE);
   1499    789     ahrens 
   1500    789     ahrens 	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
   1501    789     ahrens 
   1502    789     ahrens 	db->db_objset = os;
   1503    789     ahrens 	db->db.db_object = dn->dn_object;
   1504    789     ahrens 	db->db_level = level;
   1505    789     ahrens 	db->db_blkid = blkid;
   1506   3547     maybee 	db->db_last_dirty = NULL;
   1507   1544   eschrock 	db->db_dirtycnt = 0;
   1508   1544   eschrock 	db->db_dnode = dn;
   1509   1544   eschrock 	db->db_parent = parent;
   1510   1544   eschrock 	db->db_blkptr = blkptr;
   1511    789     ahrens 
   1512   3547     maybee 	db->db_user_ptr = NULL;
   1513   3547     maybee 	db->db_user_data_ptr_ptr = NULL;
   1514   3547     maybee 	db->db_evict_func = NULL;
   1515   3547     maybee 	db->db_immediate_evict = 0;
   1516   3547     maybee 	db->db_freed_in_flight = 0;
   1517   1544   eschrock 
   1518   1544   eschrock 	if (blkid == DB_BONUS_BLKID) {
   1519   1544   eschrock 		ASSERT3P(parent, ==, dn->dn_dbuf);
   1520   4944     maybee 		db->db.db_size = DN_MAX_BONUSLEN -
   1521   4944     maybee 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
   1522   4944     maybee 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
   1523    789     ahrens 		db->db.db_offset = DB_BONUS_BLKID;
   1524   1544   eschrock 		db->db_state = DB_UNCACHED;
   1525   1544   eschrock 		/* the bonus dbuf is not placed in the hash table */
   1526   8582    Brendan 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
   1527   1544   eschrock 		return (db);
   1528    789     ahrens 	} else {
   1529    789     ahrens 		int blocksize =
   1530    789     ahrens 		    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
   1531    789     ahrens 		db->db.db_size = blocksize;
   1532    789     ahrens 		db->db.db_offset = db->db_blkid * blocksize;
   1533    789     ahrens 	}
   1534    789     ahrens 
   1535    789     ahrens 	/*
   1536    789     ahrens 	 * Hold the dn_dbufs_mtx while we get the new dbuf
   1537    789     ahrens 	 * in the hash table *and* added to the dbufs list.
   1538    789     ahrens 	 * This prevents a possible deadlock with someone
   1539    789     ahrens 	 * trying to look up this dbuf before its added to the
   1540    789     ahrens 	 * dn_dbufs list.
   1541    789     ahrens 	 */
   1542    789     ahrens 	mutex_enter(&dn->dn_dbufs_mtx);
   1543   1544   eschrock 	db->db_state = DB_EVICTING;
   1544    789     ahrens 	if ((odb = dbuf_hash_insert(db)) != NULL) {
   1545    789     ahrens 		/* someone else inserted it first */
   1546    789     ahrens 		kmem_cache_free(dbuf_cache, db);
   1547    789     ahrens 		mutex_exit(&dn->dn_dbufs_mtx);
   1548    789     ahrens 		return (odb);
   1549    789     ahrens 	}
   1550    789     ahrens 	list_insert_head(&dn->dn_dbufs, db);
   1551   1544   eschrock 	db->db_state = DB_UNCACHED;
   1552    789     ahrens 	mutex_exit(&dn->dn_dbufs_mtx);
   1553   8582    Brendan 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
   1554    789     ahrens 
   1555    789     ahrens 	if (parent && parent != dn->dn_dbuf)
   1556    789     ahrens 		dbuf_add_ref(parent, db);
   1557    789     ahrens 
   1558   1544   eschrock 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
   1559   1544   eschrock 	    refcount_count(&dn->dn_holds) > 0);
   1560    789     ahrens 	(void) refcount_add(&dn->dn_holds, db);
   1561    789     ahrens 
   1562    789     ahrens 	dprintf_dbuf(db, "db=%p\n", db);
   1563    789     ahrens 
   1564    789     ahrens 	return (db);
   1565    789     ahrens }
   1566    789     ahrens 
   1567    789     ahrens static int
   1568   1544   eschrock dbuf_do_evict(void *private)
   1569    789     ahrens {
   1570   1544   eschrock 	arc_buf_t *buf = private;
   1571   1544   eschrock 	dmu_buf_impl_t *db = buf->b_private;
   1572    789     ahrens 
   1573   1544   eschrock 	if (!MUTEX_HELD(&db->db_mtx))
   1574   1544   eschrock 		mutex_enter(&db->db_mtx);
   1575    789     ahrens 
   1576   1544   eschrock 	ASSERT(refcount_is_zero(&db->db_holds));
   1577    789     ahrens 
   1578   1544   eschrock 	if (db->db_state != DB_EVICTING) {
   1579   1544   eschrock 		ASSERT(db->db_state == DB_CACHED);
   1580   1544   eschrock 		DBUF_VERIFY(db);
   1581   1544   eschrock 		db->db_buf = NULL;
   1582   1544   eschrock 		dbuf_evict(db);
   1583   1544   eschrock 	} else {
   1584   1544   eschrock 		mutex_exit(&db->db_mtx);
   1585   1544   eschrock 		dbuf_destroy(db);
   1586    789     ahrens 	}
   1587   1544   eschrock 	return (0);
   1588    789     ahrens }
   1589    789     ahrens 
   1590    789     ahrens static void
   1591    789     ahrens dbuf_destroy(dmu_buf_impl_t *db)
   1592    789     ahrens {
   1593    789     ahrens 	ASSERT(refcount_is_zero(&db->db_holds));
   1594    789     ahrens 
   1595   1544   eschrock 	if (db->db_blkid != DB_BONUS_BLKID) {
   1596   1544   eschrock 		/*
   1597   1544   eschrock 		 * If this dbuf is still on the dn_dbufs list,
   1598   1544   eschrock 		 * remove it from that list.
   1599   1544   eschrock 		 */
   1600   4944     maybee 		if (db->db_dnode) {
   1601   4944     maybee 			dnode_t *dn = db->db_dnode;
   1602   4944     maybee 
   1603   4944     maybee 			mutex_enter(&dn->dn_dbufs_mtx);
   1604   1544   eschrock 			list_remove(&dn->dn_dbufs, db);
   1605   1596     ahrens 			mutex_exit(&dn->dn_dbufs_mtx);
   1606   1544   eschrock 
   1607   1544   eschrock 			dnode_rele(dn, db);
   1608   4944     maybee 			db->db_dnode = NULL;
   1609   1544   eschrock 		}
   1610   1544   eschrock 		dbuf_hash_remove(db);
   1611   1544   eschrock 	}
   1612   1544   eschrock 	db->db_parent = NULL;
   1613   1544   eschrock 	db->db_buf = NULL;
   1614   1544   eschrock 
   1615   4312    gw25295 	ASSERT(!list_link_active(&db->db_link));
   1616    789     ahrens 	ASSERT(db->db.db_data == NULL);
   1617    789     ahrens 	ASSERT(db->db_hash_next == NULL);
   1618    789     ahrens 	ASSERT(db->db_blkptr == NULL);
   1619    789     ahrens 	ASSERT(db->db_data_pending == NULL);
   1620    789     ahrens 
   1621    789     ahrens 	kmem_cache_free(dbuf_cache, db);
   1622   8582    Brendan 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
   1623    789     ahrens }
   1624    789     ahrens 
   1625    789     ahrens void
   1626    789     ahrens dbuf_prefetch(dnode_t *dn, uint64_t blkid)
   1627    789     ahrens {
   1628   2391     maybee 	dmu_buf_impl_t *db = NULL;
   1629    789     ahrens 	blkptr_t *bp = NULL;
   1630    789     ahrens 
   1631    789     ahrens 	ASSERT(blkid != DB_BONUS_BLKID);
   1632    789     ahrens 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
   1633    789     ahrens 
   1634    789     ahrens 	if (dnode_block_freed(dn, blkid))
   1635    789     ahrens 		return;
   1636    789     ahrens 
   1637    789     ahrens 	/* dbuf_find() returns with db_mtx held */
   1638    789     ahrens 	if (db = dbuf_find(dn, 0, blkid)) {
   1639   2391     maybee 		if (refcount_count(&db->db_holds) > 0) {
   1640   2391     maybee 			/*
   1641   2391     maybee 			 * This dbuf is active.  We assume that it is
   1642   2391     maybee 			 * already CACHED, or else about to be either
   1643   2391     maybee 			 * read or filled.
   1644   2391     maybee 			 */
   1645   2391     maybee 			mutex_exit(&db->db_mtx);
   1646   2391     maybee 			return;
   1647   2391     maybee 		}
   1648    789     ahrens 		mutex_exit(&db->db_mtx);
   1649   2417     ahrens 		db = NULL;
   1650    789     ahrens 	}
   1651    789     ahrens 
   1652   2391     maybee 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
   1653    789     ahrens 		if (bp && !BP_IS_HOLE(bp)) {
   1654   7046     ahrens 			arc_buf_t *pbuf;
   1655  10922       Jeff 			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
   1656   2391     maybee 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
   1657   1544   eschrock 			zbookmark_t zb;
   1658  10922       Jeff 
   1659  10922       Jeff 			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
   1660  10922       Jeff 			    dn->dn_object, 0, blkid);
   1661   1544   eschrock 
   1662   7046     ahrens 			if (db)
   1663   7046     ahrens 				pbuf = db->db_buf;
   1664   7046     ahrens 			else
   1665   7046     ahrens 				pbuf = dn->dn_objset->os_phys_buf;
   1666   7046     ahrens 
   1667   7046     ahrens 			(void) arc_read(NULL, dn->dn_objset->os_spa,
   1668   7046     ahrens 			    bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
   1669    789     ahrens 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
   1670   2391     maybee 			    &aflags, &zb);
   1671    789     ahrens 		}
   1672   2391     maybee 		if (db)
   1673   2391     maybee 			dbuf_rele(db, NULL);
   1674    789     ahrens 	}
   1675    789     ahrens }
   1676    789     ahrens 
   1677    789     ahrens /*
   1678    789     ahrens  * Returns with db_holds incremented, and db_mtx not held.
   1679    789     ahrens  * Note: dn_struct_rwlock must be held.
   1680    789     ahrens  */
   1681    789     ahrens int
   1682    789     ahrens dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
   1683    789     ahrens     void *tag, dmu_buf_impl_t **dbp)
   1684    789     ahrens {
   1685    789     ahrens 	dmu_buf_impl_t *db, *parent = NULL;
   1686    789     ahrens 
   1687   1544   eschrock 	ASSERT(blkid != DB_BONUS_BLKID);
   1688    789     ahrens 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
   1689    789     ahrens 	ASSERT3U(dn->dn_nlevels, >, level);
   1690    789     ahrens 
   1691    789     ahrens 	*dbp = NULL;
   1692   1544   eschrock top:
   1693    789     ahrens 	/* dbuf_find() returns with db_mtx held */
   1694    789     ahrens 	db = dbuf_find(dn, level, blkid);
   1695    789     ahrens 
   1696    789     ahrens 	if (db == NULL) {
   1697    789     ahrens 		blkptr_t *bp = NULL;
   1698    789     ahrens 		int err;
   1699    789     ahrens 
   1700   1596     ahrens 		ASSERT3P(parent, ==, NULL);
   1701    789     ahrens 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
   1702    789     ahrens 		if (fail_sparse) {
   1703    789     ahrens 			if (err == 0 && bp && BP_IS_HOLE(bp))
   1704    789     ahrens 				err = ENOENT;
   1705    789     ahrens 			if (err) {
   1706   1596     ahrens 				if (parent)
   1707   1544   eschrock 					dbuf_rele(parent, NULL);
   1708    789     ahrens 				return (err);
   1709    789     ahrens 			}
   1710    789     ahrens 		}
   1711   1544   eschrock 		if (err && err != ENOENT)
   1712   1544   eschrock 			return (err);
   1713    789     ahrens 		db = dbuf_create(dn, level, blkid, parent, bp);
   1714    789     ahrens 	}
   1715   1544   eschrock 
   1716   1544   eschrock 	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
   1717   1544   eschrock 		arc_buf_add_ref(db->db_buf, db);
   1718   1544   eschrock 		if (db->db_buf->b_data == NULL) {
   1719   1544   eschrock 			dbuf_clear(db);
   1720   1596     ahrens 			if (parent) {
   1721   1596     ahrens 				dbuf_rele(parent, NULL);
   1722   1596     ahrens 				parent = NULL;
   1723   1596     ahrens 			}
   1724   1544   eschrock 			goto top;
   1725   1544   eschrock 		}
   1726   1544   eschrock 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
   1727   1544   eschrock 	}
   1728   1544   eschrock 
   1729   1544   eschrock 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
   1730    789     ahrens 
   1731    789     ahrens 	/*
   1732   3547     maybee 	 * If this buffer is currently syncing out, and we are are
   1733   3547     maybee 	 * still referencing it from db_data, we need to make a copy
   1734   3547     maybee 	 * of it in case we decide we want to dirty it again in this txg.
   1735    789     ahrens 	 */
   1736   3547     maybee 	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
   1737   1544   eschrock 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
   1738   3547     maybee 	    db->db_state == DB_CACHED && db->db_data_pending) {
   1739   3547     maybee 		dbuf_dirty_record_t *dr = db->db_data_pending;
   1740    789     ahrens 
   1741   3547     maybee 		if (dr->dt.dl.dr_data == db->db_buf) {
   1742   3547     maybee 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
   1743   3547     maybee 
   1744   3547     maybee 			dbuf_set_data(db,
   1745   3547     maybee 			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
   1746   3547     maybee 			    db->db.db_size, db, type));
   1747   3547     maybee 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
   1748   3547     maybee 			    db->db.db_size);
   1749   3547     maybee 		}
   1750    789     ahrens 	}
   1751    789     ahrens 
   1752   1544   eschrock 	(void) refcount_add(&db->db_holds, tag);
   1753    789     ahrens 	dbuf_update_data(db);
   1754    873   ek110237 	DBUF_VERIFY(db);
   1755    789     ahrens 	mutex_exit(&db->db_mtx);
   1756    789     ahrens 
   1757    789     ahrens 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
   1758   1596     ahrens 	if (parent)
   1759   1544   eschrock 		dbuf_rele(parent, NULL);
   1760    789     ahrens 
   1761    789     ahrens 	ASSERT3P(db->db_dnode, ==, dn);
   1762    789     ahrens 	ASSERT3U(db->db_blkid, ==, blkid);
   1763    789     ahrens 	ASSERT3U(db->db_level, ==, level);
   1764    789     ahrens 	*dbp = db;
   1765    789     ahrens 
   1766    789     ahrens 	return (0);
   1767    789     ahrens }
   1768    789     ahrens 
   1769    789     ahrens dmu_buf_impl_t *
   1770   1544   eschrock dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
   1771    789     ahrens {
   1772    789     ahrens 	dmu_buf_impl_t *db;
   1773   1544   eschrock 	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
   1774   1544   eschrock 	return (err ? NULL : db);
   1775    789     ahrens }
   1776    789     ahrens 
   1777    789     ahrens dmu_buf_impl_t *
   1778    789     ahrens dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
   1779    789     ahrens {
   1780    789     ahrens 	dmu_buf_impl_t *db;
   1781   1544   eschrock 	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
   1782   1544   eschrock 	return (err ? NULL : db);
   1783   1544   eschrock }
   1784   1544   eschrock 
   1785   4944     maybee void
   1786   1544   eschrock dbuf_create_bonus(dnode_t *dn)
   1787   1544   eschrock {
   1788   1544   eschrock 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
   1789   1544   eschrock 
   1790   1544   eschrock 	ASSERT(dn->dn_bonus == NULL);
   1791   4944     maybee 	dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
   1792    789     ahrens }
   1793    789     ahrens 
   1794   1544   eschrock #pragma weak dmu_buf_add_ref = dbuf_add_ref
   1795    789     ahrens void
   1796    789     ahrens dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
   1797    789     ahrens {
   1798   1544   eschrock 	int64_t holds = refcount_add(&db->db_holds, tag);
   1799   1544   eschrock 	ASSERT(holds > 1);
   1800    789     ahrens }
   1801    789     ahrens 
   1802   1544   eschrock #pragma weak dmu_buf_rele = dbuf_rele
   1803    789     ahrens void
   1804   1544   eschrock dbuf_rele(dmu_buf_impl_t *db, void *tag)
   1805    789     ahrens {
   1806  10922       Jeff 	mutex_enter(&db->db_mtx);
   1807  10922       Jeff 	dbuf_rele_and_unlock(db, tag);
   1808  10922       Jeff }
   1809  10922       Jeff 
   1810  10922       Jeff /*
   1811  10922       Jeff  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
   1812  10922       Jeff  * db_dirtycnt and db_holds to be updated atomically.
   1813  10922       Jeff  */
   1814  10922       Jeff void
   1815  10922       Jeff dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
   1816  10922       Jeff {
   1817    789     ahrens 	int64_t holds;
   1818    789     ahrens 
   1819  10922       Jeff 	ASSERT(MUTEX_HELD(&db->db_mtx));
   1820    873   ek110237 	DBUF_VERIFY(db);
   1821    789     ahrens 
   1822    789     ahrens 	holds = refcount_remove(&db->db_holds, tag);
   1823   1544   eschrock 	ASSERT(holds >= 0);
   1824   1544   eschrock 
   1825   3547     maybee 	/*
   1826   3547     maybee 	 * We can't freeze indirects if there is a possibility that they
   1827   3547     maybee 	 * may be modified in the current syncing context.
   1828   3547     maybee 	 */
   1829   3547     maybee 	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
   1830   3093     ahrens 		arc_buf_freeze(db->db_buf);
   1831   3093     ahrens 
   1832   1544   eschrock 	if (holds == db->db_dirtycnt &&
   1833   3547     maybee 	    db->db_level == 0 && db->db_immediate_evict)
   1834   1544   eschrock 		dbuf_evict_user(db);
   1835    789     ahrens 
   1836    789     ahrens 	if (holds == 0) {
   1837   1544   eschrock 		if (db->db_blkid == DB_BONUS_BLKID) {
   1838   1544   eschrock 			mutex_exit(&db->db_mtx);
   1839   1544   eschrock 			dnode_rele(db->db_dnode, db);
   1840   1544   eschrock 		} else if (db->db_buf == NULL) {
   1841   1544   eschrock 			/*
   1842   1544   eschrock 			 * This is a special case: we never associated this
   1843   1544   eschrock 			 * dbuf with any data allocated from the ARC.
   1844   1544   eschrock 			 */
   1845   7872        Tim 			ASSERT(db->db_state == DB_UNCACHED ||
   1846   7872        Tim 			    db->db_state == DB_NOFILL);
   1847   1544   eschrock 			dbuf_evict(db);
   1848   3093     ahrens 		} else if (arc_released(db->db_buf)) {
   1849   1544   eschrock 			arc_buf_t *buf = db->db_buf;
   1850   1544   eschrock 			/*
   1851   1544   eschrock 			 * This dbuf has anonymous data associated with it.
   1852   1544   eschrock 			 */
   1853   1544   eschrock 			dbuf_set_data(db, NULL);
   1854   1544   eschrock 			VERIFY(arc_buf_remove_ref(buf, db) == 1);
   1855   1544   eschrock 			dbuf_evict(db);
   1856   1544   eschrock 		} else {
   1857   1544   eschrock 			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
   1858   7237   ek110237 			if (!DBUF_IS_CACHEABLE(db))
   1859   7237   ek110237 				dbuf_clear(db);
   1860   7237   ek110237 			else
   1861   7237   ek110237 				mutex_exit(&db->db_mtx);
   1862   1544   eschrock 		}
   1863    789     ahrens 	} else {
   1864    789     ahrens 		mutex_exit(&db->db_mtx);
   1865    789     ahrens 	}
   1866    789     ahrens }
   1867    789     ahrens 
   1868    789     ahrens #pragma weak dmu_buf_refcount = dbuf_refcount
   1869    789     ahrens uint64_t
   1870    789     ahrens dbuf_refcount(dmu_buf_impl_t *db)
   1871    789     ahrens {
   1872    789     ahrens 	return (refcount_count(&db->db_holds));
   1873    789     ahrens }
   1874    789     ahrens 
   1875    789     ahrens void *
   1876    789     ahrens dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
   1877    789     ahrens     dmu_buf_evict_func_t *evict_func)
   1878    789     ahrens {
   1879    789     ahrens 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
   1880    789     ahrens 	    user_data_ptr_ptr, evict_func));
   1881    789     ahrens }
   1882    789     ahrens 
   1883    789     ahrens void *
   1884    789     ahrens dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
   1885    789     ahrens     dmu_buf_evict_func_t *evict_func)
   1886    789     ahrens {
   1887    789     ahrens 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
   1888    789     ahrens 
   1889   3547     maybee 	db->db_immediate_evict = TRUE;
   1890    789     ahrens 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
   1891    789     ahrens 	    user_data_ptr_ptr, evict_func));
   1892    789     ahrens }
   1893    789     ahrens 
   1894    789     ahrens void *
   1895    789     ahrens dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
   1896    789     ahrens     void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
   1897    789     ahrens {
   1898    789     ahrens 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
   1899    789     ahrens 	ASSERT(db->db_level == 0);
   1900    789     ahrens 
   1901    789     ahrens 	ASSERT((user_ptr == NULL) == (evict_func == NULL));
   1902    789     ahrens 
   1903    789     ahrens 	mutex_enter(&db->db_mtx);
   1904    789     ahrens 
   1905   3547     maybee 	if (db->db_user_ptr == old_user_ptr) {
   1906   3547     maybee 		db->db_user_ptr = user_ptr;
   1907   3547     maybee 		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
   1908   3547     maybee 		db->db_evict_func = evict_func;
   1909    789     ahrens 
   1910    789     ahrens 		dbuf_update_data(db);
   1911    789     ahrens 	} else {
   1912   3547     maybee 		old_user_ptr = db->db_user_ptr;
   1913    789     ahrens 	}
   1914    789     ahrens 
   1915    789     ahrens 	mutex_exit(&db->db_mtx);
   1916    789     ahrens 	return (old_user_ptr);
   1917    789     ahrens }
   1918    789     ahrens 
   1919    789     ahrens void *
   1920    789     ahrens dmu_buf_get_user(dmu_buf_t *db_fake)
   1921    789     ahrens {
   1922    789     ahrens 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
   1923    789     ahrens 	ASSERT(!refcount_is_zero(&db->db_holds));
   1924    789     ahrens 
   1925   3547     maybee 	return (db->db_user_ptr);
   1926    789     ahrens }
   1927    789     ahrens 
   1928   9653    Sanjeev boolean_t
   1929   9653    Sanjeev dmu_buf_freeable(dmu_buf_t *dbuf)
   1930   9653    Sanjeev {
   1931   9653    Sanjeev 	boolean_t res = B_FALSE;
   1932   9653    Sanjeev 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
   1933   9653    Sanjeev 
   1934   9653    Sanjeev 	if (db->db_blkptr)
   1935   9653    Sanjeev 		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
   1936   9653    Sanjeev 		    db->db_blkptr->blk_birth);
   1937   9653    Sanjeev 
   1938   9653    Sanjeev 	return (res);
   1939   9653    Sanjeev }
   1940   9653    Sanjeev 
   1941   3547     maybee static void
   1942   3547     maybee dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
   1943    789     ahrens {
   1944   3547     maybee 	/* ASSERT(dmu_tx_is_syncing(tx) */
   1945   3547     maybee 	ASSERT(MUTEX_HELD(&db->db_mtx));
   1946   3547     maybee 
   1947   3547     maybee 	if (db->db_blkptr != NULL)
   1948   3547     maybee 		return;
   1949   3547     maybee 
   1950   3547     maybee 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
   1951   3547     maybee 		/*
   1952   3547     maybee 		 * This buffer was allocated at a time when there was
   1953   3547     maybee 		 * no available blkptrs from the dnode, or it was
   1954   3547     maybee 		 * inappropriate to hook it in (i.e., nlevels mis-match).
   1955   3547     maybee 		 */
   1956   3547     maybee 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
   1957   3547     maybee 		ASSERT(db->db_parent == NULL);
   1958   3547     maybee 		db->db_parent = dn->dn_dbuf;
   1959   3547     maybee 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
   1960   3547     maybee 		DBUF_VERIFY(db);
   1961   3547     maybee 	} else {
   1962   3547     maybee 		dmu_buf_impl_t *parent = db->db_parent;
   1963   3547     maybee 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
   1964   3547     maybee 
   1965   3547     maybee 		ASSERT(dn->dn_phys->dn_nlevels > 1);
   1966   3547     maybee 		if (parent == NULL) {
   1967   3547     maybee 			mutex_exit(&db->db_mtx);
   1968   3547     maybee 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
   1969   3547     maybee 			(void) dbuf_hold_impl(dn, db->db_level+1,
   1970   3547     maybee 			    db->db_blkid >> epbs, FALSE, db, &parent);
   1971   3547     maybee 			rw_exit(&dn->dn_struct_rwlock);
   1972   3547     maybee 			mutex_enter(&db->db_mtx);
   1973   3547     maybee 			db->db_parent = parent;
   1974   3547     maybee 		}
   1975   3547     maybee 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
   1976   3547     maybee 		    (db->db_blkid & ((1ULL << epbs) - 1));
   1977   3547     maybee 		DBUF_VERIFY(db);
   1978   3547     maybee 	}
   1979   3547     maybee }
   1980   3547     maybee 
   1981   3547     maybee static void
   1982   3547     maybee dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
   1983   3547     maybee {
   1984   3547     maybee 	dmu_buf_impl_t *db = dr->dr_dbuf;
   1985   3547     maybee 	dnode_t *dn = db->db_dnode;
   1986   3547     maybee 	zio_t *zio;
   1987   3547     maybee 
   1988   3547     maybee 	ASSERT(dmu_tx_is_syncing(tx));
   1989   3547     maybee 
   1990   3547     maybee 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
   1991   3547     maybee 
   1992   3547     maybee 	mutex_enter(&db->db_mtx);
   1993   3547     maybee 
   1994   3547     maybee 	ASSERT(db->db_level > 0);
   1995   3547     maybee 	DBUF_VERIFY(db);
   1996   3547     maybee 
   1997   3547     maybee 	if (db->db_buf == NULL) {
   1998   3547     maybee 		mutex_exit(&db->db_mtx);
   1999   3547     maybee 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
   2000   3547     maybee 		mutex_enter(&db->db_mtx);
   2001   3547     maybee 	}
   2002   3547     maybee 	ASSERT3U(db->db_state, ==, DB_CACHED);
   2003   3547     maybee 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
   2004   3547     maybee 	ASSERT(db->db_buf != NULL);
   2005   3547     maybee 
   2006   3547     maybee 	dbuf_check_blkptr(dn, db);
   2007   3547     maybee 
   2008   3547     maybee 	db->db_data_pending = dr;
   2009   3547     maybee 
   2010   3897     maybee 	mutex_exit(&db->db_mtx);
   2011   7046     ahrens 	dbuf_write(dr, db->db_buf, tx);
   2012   3547     maybee 
   2013   3547     maybee 	zio = dr->dr_zio;
   2014   3547     maybee 	mutex_enter(&dr->dt.di.dr_mtx);
   2015   3547     maybee 	dbuf_sync_list(&dr->dt.di.dr_children, tx);
   2016   3547     maybee 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
   2017   3547     maybee 	mutex_exit(&dr->dt.di.dr_mtx);
   2018   3547     maybee 	zio_nowait(zio);
   2019   3547     maybee }
   2020   3547     maybee 
   2021   3547     maybee static void
   2022   3547     maybee dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
   2023   3547     maybee {
   2024   3547     maybee 	arc_buf_t **datap = &dr->dt.dl.dr_data;
   2025   3547     maybee 	dmu_buf_impl_t *db = dr->dr_dbuf;
   2026    789     ahrens 	dnode_t *dn = db->db_dnode;
   2027  10298    Matthew 	objset_t *os = dn->dn_objset;
   2028   3547     maybee 	uint64_t txg = tx->tx_txg;
   2029    789     ahrens 
   2030    789     ahrens 	ASSERT(dmu_tx_is_syncing(tx));
   2031    789     ahrens 
   2032    789     ahrens 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
   2033    789     ahrens 
   2034    789     ahrens 	mutex_enter(&db->db_mtx);
   2035    789     ahrens 	/*
   2036    789     ahrens 	 * To be synced, we must be dirtied.  But we
   2037    789     ahrens 	 * might have been freed after the dirty.
   2038    789     ahrens 	 */
   2039    789     ahrens 	if (db->db_state == DB_UNCACHED) {
   2040    789     ahrens 		/* This buffer has been freed since it was dirtied */
   2041    789     ahrens 		ASSERT(db->db.db_data == NULL);
   2042    789     ahrens 	} else if (db->db_state == DB_FILL) {
   2043    789     ahrens 		/* This buffer was freed and is now being re-filled */
   2044   3547     maybee 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
   2045    789     ahrens 	} else {
   2046   7872        Tim 		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
   2047    789     ahrens 	}
   2048    873   ek110237 	DBUF_VERIFY(db);
   2049    789     ahrens 
   2050    789     ahrens 	/*
   2051   3547     maybee 	 * If this is a bonus buffer, simply copy the bonus data into the
   2052   3547     maybee 	 * dnode.  It will be written out when the dnode is synced (and it
   2053   3547     maybee 	 * will be synced, since it must have been dirty for dbuf_sync to
   2054   3547     maybee 	 * be called).
   2055    789     ahrens 	 */
   2056   1544   eschrock 	if (db->db_blkid == DB_BONUS_BLKID) {
   2057   3547     maybee 		dbuf_dirty_record_t **drp;
   2058   4944     maybee 
   2059   1544   eschrock 		ASSERT(*datap != NULL);
   2060   1544   eschrock 		ASSERT3U(db->db_level, ==, 0);
   2061   1544   eschrock 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
   2062   1544   eschrock 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
   2063   4309     maybee 		if (*datap != db->db.db_data) {
   2064   1544   eschrock 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
   2065   8582    Brendan 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
   2066   4309     maybee 		}
   2067   1544   eschrock 		db->db_data_pending = NULL;
   2068   3547     maybee 		drp = &db->db_last_dirty;
   2069   3547     maybee 		while (*drp != dr)
   2070   3547     maybee 			drp = &(*drp)->dr_next;
   2071   5688    bonwick 		ASSERT(dr->dr_next == NULL);
   2072  10922       Jeff 		ASSERT(dr->dr_dbuf == db);
   2073   5688    bonwick 		*drp = dr->dr_next;
   2074   3547     maybee 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
   2075   1544   eschrock 		ASSERT(db->db_dirtycnt > 0);
   2076   1544   eschrock 		db->db_dirtycnt -= 1;
   2077  10922       Jeff 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
   2078   1544   eschrock 		return;
   2079   1544   eschrock 	}
   2080   1544   eschrock 
   2081   3547     maybee 	/*
   2082   4312    gw25295 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
   2083   4312    gw25295 	 * operation to sneak in. As a result, we need to ensure that we
   2084   4312    gw25295 	 * don't check the dr_override_state until we have returned from
   2085   4312    gw25295 	 * dbuf_check_blkptr.
   2086   4312    gw25295 	 */
   2087   4312    gw25295 	dbuf_check_blkptr(dn, db);
   2088   4312    gw25295 
   2089   4312    gw25295 	/*
   2090   3547     maybee 	 * If this buffer is in the middle of an immdiate write,
   2091   3547     maybee 	 * wait for the synchronous IO to complete.
   2092   3547     maybee 	 */
   2093   3547     maybee 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
   2094   3547     maybee 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
   2095   3547     maybee 		cv_wait(&db->db_changed, &db->db_mtx);
   2096   3547     maybee 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
   2097   3547     maybee 	}
   2098   3547     maybee 
   2099   8746    Matthew 	if (db->db_state != DB_NOFILL &&
   2100   8746    Matthew 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
   2101   8746    Matthew 	    refcount_count(&db->db_holds) > 1 &&
   2102  10922       Jeff 	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
   2103   8746    Matthew 	    *datap == db->db_buf) {
   2104   8746    Matthew 		/*
   2105   8746    Matthew 		 * If this buffer is currently "in use" (i.e., there
   2106   8746    Matthew 		 * are active holds and db_data still references it),
   2107   8746    Matthew 		 * then make a copy before we start the write so that
   2108   8746    Matthew 		 * any modifications from the open txg will not leak
   2109   8746    Matthew 		 * into this write.
   2110   8746    Matthew 		 *
   2111   8746    Matthew 		 * NOTE: this copy does not need to be made for
   2112   8746    Matthew 		 * objects only modified in the syncing context (e.g.
   2113   8746    Matthew 		 * DNONE_DNODE blocks).
   2114   8746    Matthew 		 */
   2115   8746    Matthew 		int blksz = arc_buf_size(*datap);
   2116   8746    Matthew 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
   2117   8746    Matthew 		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
   2118   8746    Matthew 		bcopy(db->db.db_data, (*datap)->b_data, blksz);
   2119    789     ahrens 	}
   2120   3547     maybee 	db->db_data_pending = dr;
   2121    789     ahrens 
   2122   3547     maybee 	mutex_exit(&db->db_mtx);
   2123    789     ahrens 
   2124   7046     ahrens 	dbuf_write(dr, *datap, tx);
   2125    789     ahrens 
   2126   3547     maybee 	ASSERT(!list_link_active(&dr->dr_dirty_node));
   2127   3547     maybee 	if (dn->dn_object == DMU_META_DNODE_OBJECT)
   2128   3547     maybee 		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
   2129   3547     maybee 	else
   2130   3547     maybee 		zio_nowait(dr->dr_zio);
   2131   3547     maybee }
   2132   1163     maybee 
   2133   3547     maybee void
   2134   3547     maybee dbuf_sync_list(list_t *list, dmu_tx_t *tx)
   2135   3547     maybee {
   2136   3547     maybee 	dbuf_dirty_record_t *dr;
   2137   3547     maybee 
   2138   3547     maybee 	while (dr = list_head(list)) {
   2139   3547     maybee 		if (dr->dr_zio != NULL) {
   2140   3547     maybee 			/*
   2141   3547     maybee 			 * If we find an already initialized zio then we
   2142   3547     maybee 			 * are processing the meta-dnode, and we have finished.
   2143   3547     maybee 			 * The dbufs for all dnodes are put back on the list
   2144   3547     maybee 			 * during processing, so that we can zio_wait()
   2145   3547     maybee 			 * these IOs after initiating all child IOs.
   2146   3547     maybee 			 */
   2147   3547     maybee 			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
   2148   3547     maybee 			    DMU_META_DNODE_OBJECT);
   2149   3547     maybee 			break;
   2150   1163     maybee 		}
   2151   3547     maybee 		list_remove(list, dr);
   2152   3547     maybee 		if (dr->dr_dbuf->db_level > 0)
   2153   3547     maybee 			dbuf_sync_indirect(dr, tx);
   2154   3547     maybee 		else
   2155   3547     maybee 			dbuf_sync_leaf(dr, tx);
   2156   1163     maybee 	}
   2157   3547     maybee }
   2158   1163     maybee 
   2159    789     ahrens /* ARGSUSED */
   2160    789     ahrens static void
   2161   3547     maybee dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
   2162    789     ahrens {
   2163    789     ahrens 	dmu_buf_impl_t *db = vdb;
   2164   7754       Jeff 	blkptr_t *bp = zio->io_bp;
   2165   3547     maybee 	blkptr_t *bp_orig = &zio->io_bp_orig;
   2166  10922       Jeff 	dnode_t *dn = db->db_dnode;
   2167  10922       Jeff 	spa_t *spa = zio->io_spa;
   2168  10922       Jeff 	int64_t delta;
   2169    789     ahrens 	uint64_t fill = 0;
   2170  10922       Jeff 	int i;
   2171    789     ahrens 
   2172   7754       Jeff 	ASSERT(db->db_blkptr == bp);
   2173   7754       Jeff 
   2174  10922       Jeff 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
   2175  10922       Jeff 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
   2176  10922       Jeff 	zio->io_prev_space_delta = delta;
   2177    789     ahrens 
   2178   7754       Jeff 	if (BP_IS_HOLE(bp)) {
   2179  10922       Jeff 		ASSERT(bp->blk_fill == 0);
   2180   3547     maybee 		return;
   2181   3547     maybee 	}
   2182   7754       Jeff 
   2183   7754       Jeff 	ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
   2184   7754       Jeff 	ASSERT(BP_GET_LEVEL(bp) == db->db_level);
   2185   3547     maybee 
   2186    789     ahrens 	mutex_enter(&db->db_mtx);
   2187    789     ahrens 
   2188    789     ahrens 	if (db->db_level == 0) {
   2189    789     ahrens 		mutex_enter(&dn->dn_mtx);
   2190   3547     maybee 		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
   2191    789     ahrens 			dn->dn_phys->dn_maxblkid = db->db_blkid;
   2192    789     ahrens 		mutex_exit(&dn->dn_mtx);
   2193    789     ahrens 
   2194    789     ahrens 		if (dn->dn_type == DMU_OT_DNODE) {
   2195    789     ahrens 			dnode_phys_t *dnp = db->db.db_data;
   2196    789     ahrens 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
   2197    789     ahrens 			    i--, dnp++) {
   2198    789     ahrens 				if (dnp->dn_type != DMU_OT_NONE)
   2199    789     ahrens 					fill++;
   2200    789     ahrens 			}
   2201    789     ahrens 		} else {
   2202   3547     maybee 			fill = 1;
   2203    789     ahrens 		}
   2204    789     ahrens 	} else {
   2205   7754       Jeff 		blkptr_t *ibp = db->db.db_data;
   2206    789     ahrens 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
   2207   7754       Jeff 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
   2208   7754       Jeff 			if (BP_IS_HOLE(ibp))
   2209    789     ahrens 				continue;
   2210   7754       Jeff 			ASSERT3U(BP_GET_LSIZE(ibp), ==,
   2211    789     ahrens 			    db->db_level == 1 ? dn->dn_datablksz :
   2212    789     ahrens 			    (1<<dn->dn_phys->dn_indblkshift));
   2213   7754       Jeff 			fill += ibp->blk_fill;
   2214    789     ahrens 		}
   2215    789     ahrens 	}
   2216    789     ahrens 
   2217   7754       Jeff 	bp->blk_fill = fill;
   2218   3547     maybee 
   2219   3547     maybee 	mutex_exit(&db->db_mtx);
   2220   3547     maybee }
   2221   3547     maybee 
   2222   3547     maybee /* ARGSUSED */
   2223   3547     maybee static void
   2224   3547     maybee dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
   2225   3547     maybee {
   2226   3547     maybee 	dmu_buf_impl_t *db = vdb;
   2227  10922       Jeff 	blkptr_t *bp = zio->io_bp;
   2228  10922       Jeff 	blkptr_t *bp_orig = &zio->io_bp_orig;
   2229  10922       Jeff 	dnode_t *dn = db->db_dnode;
   2230  10922       Jeff 	objset_t *os = dn->dn_objset;
   2231   3547     maybee 	uint64_t txg = zio->io_txg;
   2232   3547     maybee 	dbuf_dirty_record_t **drp, *dr;
   2233   3547     maybee 
   2234   3547     maybee 	ASSERT3U(zio->io_error, ==, 0);
   2235  10922       Jeff 	ASSERT(db->db_blkptr == bp);
   2236  10922       Jeff 
   2237  10922       Jeff 	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
   2238  10922       Jeff 		ASSERT(BP_EQUAL(bp, bp_orig));
   2239  10922       Jeff 	} else {
   2240  10922       Jeff 		dsl_dataset_t *ds = os->os_dsl_dataset;
   2241  10922       Jeff 		dmu_tx_t *tx = os->os_synctx;
   2242  10922       Jeff 
   2243  10922       Jeff 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
   2244  10922       Jeff 		dsl_dataset_block_born(ds, bp, tx);
   2245  10922       Jeff 	}
   2246   3547     maybee 
   2247   3547     maybee 	mutex_enter(&db->db_mtx);
   2248  10922       Jeff 
   2249  10922       Jeff 	DBUF_VERIFY(db);
   2250   3547     maybee 
   2251   3547     maybee 	drp = &db->db_last_dirty;
   2252   5688    bonwick 	while ((dr = *drp) != db->db_data_pending)
   2253   5688    bonwick 		drp = &dr->dr_next;
   2254   5688    bonwick 	ASSERT(!list_link_active(&dr->dr_dirty_node));
   2255   5688    bonwick 	ASSERT(dr->dr_txg == txg);
   2256  10922       Jeff 	ASSERT(dr->dr_dbuf == db);
   2257   5688    bonwick 	ASSERT(dr->dr_next == NULL);
   2258   5688    bonwick 	*drp = dr->dr_next;
   2259   3547     maybee 
   2260   3547     maybee 	if (db->db_level == 0) {
   2261   3547     maybee 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
   2262   3547     maybee 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
   2263   7872        Tim 		if (db->db_state != DB_NOFILL) {
   2264   7872        Tim 			if (dr->dt.dl.dr_data != db->db_buf)
   2265   7872        Tim 				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
   2266   7872        Tim 				    db) == 1);
   2267  10922       Jeff 			else if (!arc_released(db->db_buf))
   2268   7872        Tim 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
   2269   7872        Tim 		}
   2270    789     ahrens 	} else {
   2271   3547     maybee 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
   2272   3547     maybee 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
   2273   3547     maybee 		if (!BP_IS_HOLE(db->db_blkptr)) {
   2274   3547     maybee 			int epbs =
   2275   3547     maybee 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
   2276   3547     maybee 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
   2277   3547     maybee 			    db->db.db_size);
   2278   3547     maybee 			ASSERT3U(dn->dn_phys->dn_maxblkid
   2279   3547     maybee 			    >> (db->db_level * epbs), >=, db->db_blkid);
   2280   3547     maybee 			arc_set_callback(db->db_buf, dbuf_do_evict, db);
   2281   3547     maybee 		}
   2282   4831    gw25295 		mutex_destroy(&dr->dt.di.dr_mtx);
   2283   4831    gw25295 		list_destroy(&dr->dt.di.dr_children);
   2284    789     ahrens 	}
   2285   3547     maybee 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
   2286    789     ahrens 
   2287    789     ahrens 	cv_broadcast(&db->db_changed);
   2288    789     ahrens 	ASSERT(db->db_dirtycnt > 0);
   2289    789     ahrens 	db->db_dirtycnt -= 1;
   2290   3547     maybee 	db->db_data_pending = NULL;
   2291  10922       Jeff 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
   2292  10922       Jeff }
   2293  10922       Jeff 
   2294  10922       Jeff static void
   2295  10922       Jeff dbuf_write_nofill_ready(zio_t *zio)
   2296  10922       Jeff {
   2297  10922       Jeff 	dbuf_write_ready(zio, NULL, zio->io_private);
   2298  10922       Jeff }
   2299  10922       Jeff 
   2300  10922       Jeff static void
   2301  10922       Jeff dbuf_write_nofill_done(zio_t *zio)
   2302  10922       Jeff {
   2303  10922       Jeff 	dbuf_write_done(zio, NULL, zio->io_private);
   2304  10922       Jeff }
   2305  10922       Jeff 
   2306  10922       Jeff static void
   2307  10922       Jeff dbuf_write_override_ready(zio_t *zio)
   2308  10922       Jeff {
   2309  10922       Jeff 	dbuf_dirty_record_t *dr = zio->io_private;
   2310  10922       Jeff 	dmu_buf_impl_t *db = dr->dr_dbuf;
   2311  10922       Jeff 
   2312  10922       Jeff 	dbuf_write_ready(zio, NULL, db);
   2313  10922       Jeff }
   2314  10922       Jeff 
   2315  10922       Jeff static void
   2316  10922       Jeff dbuf_write_override_done(zio_t *zio)
   2317  10922       Jeff {
   2318  10922       Jeff 	dbuf_dirty_record_t *dr = zio->io_private;
   2319  10922       Jeff 	dmu_buf_impl_t *db = dr->dr_dbuf;
   2320  10922       Jeff 	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
   2321  10922       Jeff 
   2322  10922       Jeff 	mutex_enter(&db->db_mtx);
   2323  10922       Jeff 	if (!BP_EQUAL(zio->io_bp, obp)) {
   2324  10922       Jeff 		if (!BP_IS_HOLE(obp))
   2325  10922       Jeff 			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
   2326  10922       Jeff 		arc_release(dr->dt.dl.dr_data, db);
   2327  10922       Jeff 	}
   2328    789     ahrens 	mutex_exit(&db->db_mtx);
   2329    789     ahrens 
   2330  10922       Jeff 	dbuf_write_done(zio, NULL, db);
   2331  10922       Jeff }
   2332    789     ahrens 
   2333  10922       Jeff static void
   2334  10922       Jeff dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
   2335  10922       Jeff {
   2336  10922       Jeff 	dmu_buf_impl_t *db = dr->dr_dbuf;
   2337  10922       Jeff 	dnode_t *dn = db->db_dnode;
   2338  10922       Jeff 	objset_t *os = dn->dn_objset;
   2339  10922       Jeff 	dmu_buf_impl_t *parent = db->db_parent;
   2340  10922       Jeff 	uint64_t txg = tx->tx_txg;
   2341  10922       Jeff 	zbookmark_t zb;
   2342  10922       Jeff 	zio_prop_t zp;
   2343  10922       Jeff 	zio_t *zio;
   2344  10922       Jeff 
   2345  10922       Jeff 	if (db->db_state != DB_NOFILL) {
   2346  10922       Jeff 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
   2347  10922       Jeff 			/*
   2348  10922       Jeff 			 * Private object buffers are released here rather
   2349  10922       Jeff 			 * than in dbuf_dirty() since they are only modified
   2350  10922       Jeff 			 * in the syncing context and we don't want the
   2351  10922       Jeff 			 * overhead of making multiple copies of the data.
   2352  10922       Jeff 			 */
   2353  10922       Jeff 			if (BP_IS_HOLE(db->db_blkptr)) {
   2354  10922       Jeff 				arc_buf_thaw(data);
   2355  10922       Jeff 			} else {
   2356  10922       Jeff 				arc_release(data, db);
   2357  10922       Jeff 			}
   2358  10922       Jeff 		}
   2359  10922       Jeff 	}
   2360  10922       Jeff 
   2361  10922       Jeff 	if (parent != dn->dn_dbuf) {
   2362  10922       Jeff 		ASSERT(parent && parent->db_data_pending);
   2363  10922       Jeff 		ASSERT(db->db_level == parent->db_level-1);
   2364  10922       Jeff 		ASSERT(arc_released(parent->db_buf));
   2365  10922       Jeff 		zio = parent->db_data_pending->dr_zio;
   2366  10922       Jeff 	} else {
   2367  10922       Jeff 		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
   2368  10922       Jeff 		ASSERT3P(db->db_blkptr, ==,
   2369  10922       Jeff 		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
   2370  10922       Jeff 		zio = dn->dn_zio;
   2371  10922       Jeff 	}
   2372  10922       Jeff 
   2373  10922       Jeff 	ASSERT(db->db_level == 0 || data == db->db_buf);
   2374  10922       Jeff 	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
   2375  10922       Jeff 	ASSERT(zio);
   2376  10922       Jeff 
   2377  10922       Jeff 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
   2378  10922       Jeff 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
   2379  10922       Jeff 	    db->db.db_object, db->db_level, db->db_blkid);
   2380  10922       Jeff 
   2381  10922       Jeff 	dmu_write_policy(os, dn, db->db_level,
   2382  10922       Jeff 	    db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp);
   2383  10922       Jeff 
   2384  10922       Jeff 	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
   2385  10922       Jeff 		ASSERT(db->db_state != DB_NOFILL);
   2386  10922       Jeff 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
   2387  10922       Jeff 		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
   2388  10922       Jeff 		    dbuf_write_override_ready, dbuf_write_override_done, dr,
   2389  10922       Jeff 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
   2390  10922       Jeff 		mutex_enter(&db->db_mtx);
   2391  10922       Jeff 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
   2392  10922       Jeff 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
   2393  10922       Jeff 		    dr->dt.dl.dr_copies);
   2394  10922       Jeff 		mutex_exit(&db->db_mtx);
   2395  10922       Jeff 	} else if (db->db_state == DB_NOFILL) {
   2396  10922       Jeff 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
   2397  10922       Jeff 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
   2398  10922       Jeff 		    db->db_blkptr, NULL, db->db.db_size, &zp,
   2399  10922       Jeff 		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
   2400  10922       Jeff 		    ZIO_PRIORITY_ASYNC_WRITE,
   2401  10922       Jeff 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
   2402  10922       Jeff 	} else {
   2403  10922       Jeff 		ASSERT(arc_released(data));
   2404  10922       Jeff 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
   2405  10922       Jeff 		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
   2406  10922       Jeff 		    dbuf_write_ready, dbuf_write_done, db,
   2407  10922       Jeff 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
   2408  10922       Jeff 	}
   2409    789     ahrens }
   2410