Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/zfs_context.h>
     29 #include <sys/dmu.h>
     30 #include <sys/dmu_impl.h>
     31 #include <sys/dbuf.h>
     32 #include <sys/dmu_objset.h>
     33 #include <sys/dsl_dataset.h>
     34 #include <sys/dsl_dir.h>
     35 #include <sys/dmu_tx.h>
     36 #include <sys/spa.h>
     37 #include <sys/zio.h>
     38 #include <sys/dmu_zfetch.h>
     39 
/*
 * Forward declarations for file-local routines that are referenced
 * before their definitions (dbuf_destroy() is called from dbuf_evict()
 * and dbuf_undirty() from dbuf_free_range() below).
 */
static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
    int compress, dmu_tx_t *tx);
static arc_done_func_t dbuf_write_ready;
static arc_done_func_t dbuf_write_done;

/*
 * NOTE(review): not referenced in this part of the file; by its name this
 * is presumably a tunable that disables metadata compression -- confirm
 * against its users.
 */
int zfs_mdcomp_disable = 0;

/*
 * Global data structures and functions for the dbuf cache.
 *
 * dbuf_cache is the kmem cache that backs dmu_buf_impl_t; it is created
 * in dbuf_init() and destroyed in dbuf_fini().
 */
static kmem_cache_t *dbuf_cache;
     53 
     54 /* ARGSUSED */
     55 static int
     56 dbuf_cons(void *vdb, void *unused, int kmflag)
     57 {
     58 	dmu_buf_impl_t *db = vdb;
     59 	bzero(db, sizeof (dmu_buf_impl_t));
     60 
     61 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
     62 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
     63 	refcount_create(&db->db_holds);
     64 	return (0);
     65 }
     66 
     67 /* ARGSUSED */
     68 static void
     69 dbuf_dest(void *vdb, void *unused)
     70 {
     71 	dmu_buf_impl_t *db = vdb;
     72 	mutex_destroy(&db->db_mtx);
     73 	cv_destroy(&db->db_changed);
     74 	refcount_destroy(&db->db_holds);
     75 }
     76 
     77 /*
     78  * dbuf hash table routines
     79  */
/* The single, file-global dbuf hash table (set up in dbuf_init()). */
static dbuf_hash_table_t dbuf_hash_table;

/*
 * Number of dbufs currently in the hash table; adjusted atomically by
 * dbuf_hash_insert() (+1) and dbuf_hash_remove() (-1).
 */
static uint64_t dbuf_hash_count;
     83 
     84 static uint64_t
     85 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
     86 {
     87 	uintptr_t osv = (uintptr_t)os;
     88 	uint64_t crc = -1ULL;
     89 
     90 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
     91 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
     92 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
     93 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
     94 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
     95 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
     96 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
     97 
     98 	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
     99 
    100 	return (crc);
    101 }
    102 
    103 #define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
    104 
    105 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
    106 	((dbuf)->db.db_object == (obj) &&		\
    107 	(dbuf)->db_objset == (os) &&			\
    108 	(dbuf)->db_level == (level) &&			\
    109 	(dbuf)->db_blkid == (blkid))
    110 
    111 dmu_buf_impl_t *
    112 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
    113 {
    114 	dbuf_hash_table_t *h = &dbuf_hash_table;
    115 	objset_impl_t *os = dn->dn_objset;
    116 	uint64_t obj = dn->dn_object;
    117 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
    118 	uint64_t idx = hv & h->hash_table_mask;
    119 	dmu_buf_impl_t *db;
    120 
    121 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
    122 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
    123 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
    124 			mutex_enter(&db->db_mtx);
    125 			if (db->db_state != DB_EVICTING) {
    126 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
    127 				return (db);
    128 			}
    129 			mutex_exit(&db->db_mtx);
    130 		}
    131 	}
    132 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
    133 	return (NULL);
    134 }
    135 
    136 /*
    137  * Insert an entry into the hash table.  If there is already an element
    138  * equal to elem in the hash table, then the already existing element
    139  * will be returned and the new element will not be inserted.
    140  * Otherwise returns NULL.
    141  */
    142 static dmu_buf_impl_t *
    143 dbuf_hash_insert(dmu_buf_impl_t *db)
    144 {
    145 	dbuf_hash_table_t *h = &dbuf_hash_table;
    146 	objset_impl_t *os = db->db_objset;
    147 	uint64_t obj = db->db.db_object;
    148 	int level = db->db_level;
    149 	uint64_t blkid = db->db_blkid;
    150 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
    151 	uint64_t idx = hv & h->hash_table_mask;
    152 	dmu_buf_impl_t *dbf;
    153 
    154 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
    155 	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
    156 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
    157 			mutex_enter(&dbf->db_mtx);
    158 			if (dbf->db_state != DB_EVICTING) {
    159 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
    160 				return (dbf);
    161 			}
    162 			mutex_exit(&dbf->db_mtx);
    163 		}
    164 	}
    165 
    166 	mutex_enter(&db->db_mtx);
    167 	db->db_hash_next = h->hash_table[idx];
    168 	h->hash_table[idx] = db;
    169 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
    170 	atomic_add_64(&dbuf_hash_count, 1);
    171 
    172 	return (NULL);
    173 }
    174 
    175 /*
    176  * Remove an entry from the hash table.  This operation will
    177  * fail if there are any existing holds on the db.
    178  */
    179 static void
    180 dbuf_hash_remove(dmu_buf_impl_t *db)
    181 {
    182 	dbuf_hash_table_t *h = &dbuf_hash_table;
    183 	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
    184 	    db->db_level, db->db_blkid);
    185 	uint64_t idx = hv & h->hash_table_mask;
    186 	dmu_buf_impl_t *dbf, **dbp;
    187 
    188 	/*
    189 	 * We musn't hold db_mtx to maintin lock ordering:
    190 	 * DBUF_HASH_MUTEX > db_mtx.
    191 	 */
    192 	ASSERT(refcount_is_zero(&db->db_holds));
    193 	ASSERT(db->db_state == DB_EVICTING);
    194 	ASSERT(!MUTEX_HELD(&db->db_mtx));
    195 
    196 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
    197 	dbp = &h->hash_table[idx];
    198 	while ((dbf = *dbp) != db) {
    199 		dbp = &dbf->db_hash_next;
    200 		ASSERT(dbf != NULL);
    201 	}
    202 	*dbp = db->db_hash_next;
    203 	db->db_hash_next = NULL;
    204 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
    205 	atomic_add_64(&dbuf_hash_count, -1);
    206 }
    207 
    208 static arc_evict_func_t dbuf_do_evict;
    209 
    210 static void
    211 dbuf_evict_user(dmu_buf_impl_t *db)
    212 {
    213 	ASSERT(MUTEX_HELD(&db->db_mtx));
    214 
    215 	if (db->db_level != 0 || db->db_evict_func == NULL)
    216 		return;
    217 
    218 	if (db->db_user_data_ptr_ptr)
    219 		*db->db_user_data_ptr_ptr = db->db.db_data;
    220 	db->db_evict_func(&db->db, db->db_user_ptr);
    221 	db->db_user_ptr = NULL;
    222 	db->db_user_data_ptr_ptr = NULL;
    223 	db->db_evict_func = NULL;
    224 }
    225 
/*
 * Evict a dbuf by calling dbuf_clear() and then dbuf_destroy() on it.
 *
 * The caller must hold db_mtx, and the dbuf must have neither an ARC
 * buffer (db_buf) nor a pending dirty record (db_data_pending); all
 * three conditions are asserted.
 */
void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}
    236 
    237 void
    238 dbuf_init(void)
    239 {
    240 	uint64_t hsize = 1ULL << 16;
    241 	dbuf_hash_table_t *h = &dbuf_hash_table;
    242 	int i;
    243 
    244 	/*
    245 	 * The hash table is big enough to fill all of physical memory
    246 	 * with an average 4K block size.  The table will take up
    247 	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
    248 	 */
    249 	while (hsize * 4096 < physmem * PAGESIZE)
    250 		hsize <<= 1;
    251 
    252 retry:
    253 	h->hash_table_mask = hsize - 1;
    254 	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
    255 	if (h->hash_table == NULL) {
    256 		/* XXX - we should really return an error instead of assert */
    257 		ASSERT(hsize > (1ULL << 10));
    258 		hsize >>= 1;
    259 		goto retry;
    260 	}
    261 
    262 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
    263 	    sizeof (dmu_buf_impl_t),
    264 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
    265 
    266 	for (i = 0; i < DBUF_MUTEXES; i++)
    267 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
    268 }
    269 
    270 void
    271 dbuf_fini(void)
    272 {
    273 	dbuf_hash_table_t *h = &dbuf_hash_table;
    274 	int i;
    275 
    276 	for (i = 0; i < DBUF_MUTEXES; i++)
    277 		mutex_destroy(&h->hash_mutexes[i]);
    278 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
    279 	kmem_cache_destroy(dbuf_cache);
    280 }
    281 
    282 /*
    283  * Other stuff.
    284  */
    285 
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of a dbuf (compiled only under ZFS_DEBUG).
 *
 * The caller must hold db_mtx.  The function returns immediately unless
 * ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags; otherwise it asserts a set
 * of invariants relating the dbuf to its dnode, its parent and its block
 * pointer.  It has no effect other than the assertions.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	/* Identity checks against the owning dnode, if there is one. */
	ASSERT(db->db_objset != NULL);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
		    list_head(&dn->dn_dbufs));
	}
	/* Offset/size checks: bonus buffers use a sentinel offset. */
	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	if (db->db_level == 0) {
		/* we can be momentarily larger in dnode_set_blksz() */
		if (db->db_blkid != DB_BONUS_BLKID && dn) {
			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
		}
		if (db->db.db_object == DMU_META_DNODE_OBJECT) {
			dbuf_dirty_record_t *dr = db->db_data_pending;
			/*
			 * it should only be modified in syncing
			 * context, so make sure we only have
			 * one copy of the data.
			 */
			ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
		}
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		/*
		 * dn is dereferenced here; the dn == NULL case above
		 * asserts db_blkptr == NULL, so it should not reach this.
		 */
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (db->db.db_object == DMU_META_DNODE_OBJECT)
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			/* epb: number of block pointers in the parent */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	/*
	 * NOTE(review): the condition below dereferences dn (dn_free_txg)
	 * without a NULL check, although dn == NULL is treated as possible
	 * earlier in this function -- confirm that a dbuf with db_data set
	 * always has a dnode.
	 */
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			/* Scan the buffer as 64-bit words (size / 8). */
			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
}
#endif
    380 
    381 static void
    382 dbuf_update_data(dmu_buf_impl_t *db)
    383 {
    384 	ASSERT(MUTEX_HELD(&db->db_mtx));
    385 	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
    386 		ASSERT(!refcount_is_zero(&db->db_holds));
    387 		*db->db_user_data_ptr_ptr = db->db.db_data;
    388 	}
    389 }
    390 
    391 static void
    392 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
    393 {
    394 	ASSERT(MUTEX_HELD(&db->db_mtx));
    395 	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
    396 	db->db_buf = buf;
    397 	if (buf != NULL) {
    398 		ASSERT(buf->b_data != NULL);
    399 		db->db.db_data = buf->b_data;
    400 		if (!arc_released(buf))
    401 			arc_set_callback(buf, dbuf_do_evict, db);
    402 		dbuf_update_data(db);
    403 	} else {
    404 		dbuf_evict_user(db);
    405 		db->db.db_data = NULL;
    406 		db->db_state = DB_UNCACHED;
    407 	}
    408 }
    409 
    410 uint64_t
    411 dbuf_whichblock(dnode_t *dn, uint64_t offset)
    412 {
    413 	if (dn->dn_datablkshift) {
    414 		return (offset >> dn->dn_datablkshift);
    415 	} else {
    416 		ASSERT3U(offset, <, dn->dn_datablksz);
    417 		return (0);
    418 	}
    419 }
    420 
    421 static void
    422 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
    423 {
    424 	dmu_buf_impl_t *db = vdb;
    425 
    426 	mutex_enter(&db->db_mtx);
    427 	ASSERT3U(db->db_state, ==, DB_READ);
    428 	/*
    429 	 * All reads are synchronous, so we must have a hold on the dbuf
    430 	 */
    431 	ASSERT(refcount_count(&db->db_holds) > 0);
    432 	ASSERT(db->db_buf == NULL);
    433 	ASSERT(db->db.db_data == NULL);
    434 	if (db->db_level == 0 && db->db_freed_in_flight) {
    435 		/* we were freed in flight; disregard any error */
    436 		arc_release(buf, db);
    437 		bzero(buf->b_data, db->db.db_size);
    438 		arc_buf_freeze(buf);
    439 		db->db_freed_in_flight = FALSE;
    440 		dbuf_set_data(db, buf);
    441 		db->db_state = DB_CACHED;
    442 	} else if (zio == NULL || zio->io_error == 0) {
    443 		dbuf_set_data(db, buf);
    444 		db->db_state = DB_CACHED;
    445 	} else {
    446 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
    447 		ASSERT3P(db->db_buf, ==, NULL);
    448 		VERIFY(arc_buf_remove_ref(buf, db) == 1);
    449 		db->db_state = DB_UNCACHED;
    450 	}
    451 	cv_broadcast(&db->db_changed);
    452 	mutex_exit(&db->db_mtx);
    453 	dbuf_rele(db, NULL);
    454 }
    455 
/*
 * Fill an uncached dbuf.
 *
 * Entered with db_mtx held, a hold on the dbuf and the dnode's
 * dn_struct_rwlock held; db_mtx is dropped on every return path.
 *
 *  - Bonus buffers are filled by copying from the dnode's bonus area
 *    into a freshly allocated DN_MAX_BONUSLEN buffer.
 *  - If there is no block pointer, the block pointer is a hole, or the
 *    level-0 block is reported freed by dnode_block_freed(), a zeroed
 *    ARC buffer is installed and DB_RF_CACHED is set in *flags.
 *  - Otherwise the dbuf goes to DB_READ and arc_read() is issued with
 *    dbuf_read_done() as the completion callback; an extra reference is
 *    taken first, which dbuf_read_done() drops.
 *
 * flags is in/out: DB_RF_CANFAIL selects ZIO_FLAG_CANFAIL (otherwise
 * ZIO_FLAG_MUSTSUCCEED), and DB_RF_CACHED is or'ed in when arc_read()
 * reports ARC_CACHED.
 */
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	blkptr_t *bp;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;

	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DB_BONUS_BLKID) {
		int bonuslen = db->db_dnode->dn_bonuslen;

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN);
		/* zero first so the tail beyond bonuslen is not garbage */
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
		    bonuslen);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/* A level-0 block that has been freed is treated as having no bp. */
	if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
		bp = NULL;
	else
		bp = db->db_blkptr;

	if (bp == NULL)
		dprintf_dbuf(db, "blkptr: %s\n", "NULL");
	else
		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");

	/* Nothing on disk to read: hand back a zero-filled buffer. */
	if (bp == NULL || BP_IS_HOLE(bp)) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		ASSERT(bp == NULL || BP_IS_HOLE(bp));
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/* Mark the read as in progress before dropping the lock. */
	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	/* Bookmark identifying this block; objset 0 if there is no dataset. */
	zb.zb_objset = db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	/* This reference is released by dbuf_read_done(). */
	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
	ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
	/* Indirect blocks are byteswapped as arrays of 64-bit words. */
	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
	    db->db_level > 0 ? byteswap_uint64_array :
	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}
    530 
/*
 * Make sure a held dbuf's contents are read in.
 *
 * db	 the dbuf; the caller must hold a reference on it.
 * zio	 optional parent zio.  When NULL and a read has to be issued, a
 *	 root zio is created here and waited on before returning.
 * flags DB_RF_* flags: DB_RF_HAVESTRUCT means the caller already holds
 *	 dn_struct_rwlock (otherwise it is taken as reader here),
 *	 DB_RF_NOPREFETCH suppresses the dmu_zfetch() call, and
 *	 DB_RF_NEVERWAIT skips waiting for an in-progress read or fill.
 *
 * Returns 0 on success, the zio_wait() result when this function
 * created the root zio, or EIO if the dbuf is DB_UNCACHED after waiting
 * for another thread's read/fill to finish.
 */
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * Prefetch only applies to level-0, non-bonus blocks.
	 * NOTE(review): db->db_dnode is already dereferenced by the
	 * rw_enter() above, before the NULL test below -- confirm whether
	 * db_dnode can really be NULL here.
	 */
	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		/* Already cached: just feed the prefetcher. */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
	} else if (db->db_state == DB_UNCACHED) {
		/* We are the reader: issue the read ourselves. */
		if (zio == NULL) {
			zio = zio_root(db->db_dnode->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);
		}
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		/* Only wait on a zio that we created ourselves. */
		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/* Some other state: possibly wait for a read or fill. */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			/* The other thread's read ended without data. */
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}
    601 
    602 static void
    603 dbuf_noread(dmu_buf_impl_t *db)
    604 {
    605 	ASSERT(!refcount_is_zero(&db->db_holds));
    606 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
    607 	mutex_enter(&db->db_mtx);
    608 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
    609 		cv_wait(&db->db_changed, &db->db_mtx);
    610 	if (db->db_state == DB_UNCACHED) {
    611 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
    612 
    613 		ASSERT(db->db_buf == NULL);
    614 		ASSERT(db->db.db_data == NULL);
    615 		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
    616 		    db->db.db_size, db, type));
    617 		db->db_state = DB_FILL;
    618 	} else {
    619 		ASSERT3U(db->db_state, ==, DB_CACHED);
    620 	}
    621 	mutex_exit(&db->db_mtx);
    622 }
    623 
    624 /*
    625  * This is our just-in-time copy function.  It makes a copy of
    626  * buffers, that have been modified in a previous transaction
    627  * group, before we modify them in the current active group.
    628  *
    629  * This function is used in two places: when we are dirtying a
    630  * buffer for the first time in a txg, and when we are freeing
    631  * a range in a dnode that includes this buffer.
    632  *
    633  * Note that when we are called from dbuf_free_range() we do
    634  * not put a hold on the buffer, we just traverse the active
    635  * dbuf list for the dnode.
    636  */
    637 static void
    638 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
    639 {
    640 	dbuf_dirty_record_t *dr = db->db_last_dirty;
    641 
    642 	ASSERT(MUTEX_HELD(&db->db_mtx));
    643 	ASSERT(db->db.db_data != NULL);
    644 	ASSERT(db->db_level == 0);
    645 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
    646 
    647 	if (dr == NULL ||
    648 	    (dr->dt.dl.dr_data !=
    649 	    ((db->db_blkid  == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
    650 		return;
    651 
    652 	/*
    653 	 * If the last dirty record for this dbuf has not yet synced
    654 	 * and its referencing the dbuf data, either:
    655 	 * 	reset the reference to point to a new copy,
    656 	 * or (if there a no active holders)
    657 	 *	just null out the current db_data pointer.
    658 	 */
    659 	ASSERT(dr->dr_txg >= txg - 2);
    660 	if (db->db_blkid == DB_BONUS_BLKID) {
    661 		/* Note that the data bufs here are zio_bufs */
    662 		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
    663 		arc_space_consume(DN_MAX_BONUSLEN);
    664 		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
    665 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
    666 		int size = db->db.db_size;
    667 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
    668 		dr->dt.dl.dr_data = arc_buf_alloc(
    669 		    db->db_dnode->dn_objset->os_spa, size, db, type);
    670 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
    671 	} else {
    672 		dbuf_set_data(db, NULL);
    673 	}
    674 }
    675 
    676 void
    677 dbuf_unoverride(dbuf_dirty_record_t *dr)
    678 {
    679 	dmu_buf_impl_t *db = dr->dr_dbuf;
    680 	uint64_t txg = dr->dr_txg;
    681 
    682 	ASSERT(MUTEX_HELD(&db->db_mtx));
    683 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
    684 	ASSERT(db->db_level == 0);
    685 
    686 	if (db->db_blkid == DB_BONUS_BLKID ||
    687 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
    688 		return;
    689 
    690 	/* free this block */
    691 	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
    692 		/* XXX can get silent EIO here */
    693 		(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
    694 		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
    695 	}
    696 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
    697 	/*
    698 	 * Release the already-written buffer, so we leave it in
    699 	 * a consistent dirty state.  Note that all callers are
    700 	 * modifying the buffer, so they will immediately do
    701 	 * another (redundant) arc_release().  Therefore, leave
    702 	 * the buf thawed to save the effort of freezing &
    703 	 * immediately re-thawing it.
    704 	 */
    705 	arc_release(dr->dt.dl.dr_data, db);
    706 }
    707 
/*
 * Process the cached level-0 dbufs of dn whose block ids fall within
 * [blkid, blkid + nblks) as part of freeing that range in tx.
 *
 * The dnode's dbuf list is walked under dn_dbufs_mtx.  For each dbuf in
 * range, depending on its state, it is undirtied, skipped, flagged as
 * freed-in-flight, cleared, or has its contents zeroed.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;

	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		/* fetch the successor before db is possibly cleared below */
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < blkid ||
		    db->db_blkid >= blkid+nblks)
			continue;

		/* found a level 0 buffer in the range */
		/*
		 * NOTE(review): a nonzero return from dbuf_undirty()
		 * presumably means the dbuf needs no further handling
		 * here -- confirm against dbuf_undirty().
		 */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			/*
			 * NOTE(review): db_mtx is not released on this
			 * path; presumably dbuf_clear() drops it --
			 * confirm.
			 */
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if its not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if its cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}
    784 
    785 static int
    786 dbuf_block_freeable(dmu_buf_impl_t *db)
    787 {
    788 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
    789 	uint64_t birth_txg = 0;
    790 
    791 	/*
    792 	 * We don't need any locking to protect db_blkptr:
    793 	 * If it's syncing, then db_last_dirty will be set
    794 	 * so we'll ignore db_blkptr.
    795 	 */
    796 	ASSERT(MUTEX_HELD(&db->db_mtx));
    797 	if (db->db_last_dirty)
    798 		birth_txg = db->db_last_dirty->dr_txg;
    799 	else if (db->db_blkptr)
    800 		birth_txg = db->db_blkptr->blk_birth;
    801 
    802 	/* If we don't exist or are in a snapshot, we can't be freed */
    803 	if (birth_txg)
    804 		return (ds == NULL ||
    805 		    dsl_dataset_block_freeable(ds, birth_txg));
    806 	else
    807 		return (FALSE);
    808 }
    809 
    810 void
    811 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
    812 {
    813 	arc_buf_t *buf, *obuf;
    814 	int osize = db->db.db_size;
    815 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
    816 
    817 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
    818 
    819 	/* XXX does *this* func really need the lock? */
    820 	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
    821 
    822 	/*
    823 	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
    824 	 * is OK, because there can be no other references to the db
    825 	 * when we are changing its size, so no concurrent DB_FILL can
    826 	 * be happening.
    827 	 */
    828 	/*
    829 	 * XXX we should be doing a dbuf_read, checking the return
    830 	 * value and returning that up to our callers
    831 	 */
    832 	dbuf_will_dirty(db, tx);
    833 
    834 	/* create the data buffer for the new block */
    835 	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
    836 
    837 	/* copy old block data to the new block */
    838 	obuf = db->db_buf;
    839 	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
    840 	/* zero the remainder */
    841 	if (size > osize)
    842 		bzero((uint8_t *)buf->b_data + osize, size - osize);
    843 
    844 	mutex_enter(&db->db_mtx);
    845 	dbuf_set_data(db, buf);
    846 	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
    847 	db->db.db_size = size;
    848 
    849 	if (db->db_level == 0) {
    850 		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
    851 		db->db_last_dirty->dt.dl.dr_data = buf;
    852 	}
    853 	mutex_exit(&db->db_mtx);
    854 
    855 	dnode_willuse_space(db->db_dnode, size-osize, tx);
    856 }
    857 
    858 dbuf_dirty_record_t *
    859 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
    860 {
    861 	dnode_t *dn = db->db_dnode;
    862 	objset_impl_t *os = dn->dn_objset;
    863 	dbuf_dirty_record_t **drp, *dr;
    864 	int drop_struct_lock = FALSE;
    865 	int txgoff = tx->tx_txg & TXG_MASK;
    866 
    867 	ASSERT(tx->tx_txg != 0);
    868 	ASSERT(!refcount_is_zero(&db->db_holds));
    869 	DMU_TX_DIRTY_BUF(tx, db);
    870 
    871 	/*
    872 	 * Shouldn't dirty a regular buffer in syncing context.  Private
    873 	 * objects may be dirtied in syncing context, but only if they
    874 	 * were already pre-dirtied in open context.
    875 	 * XXX We may want to prohibit dirtying in syncing context even
    876 	 * if they did pre-dirty.
    877 	 */
    878 	ASSERT(!dmu_tx_is_syncing(tx) ||
    879 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
    880 	    dn->dn_object == DMU_META_DNODE_OBJECT ||
    881 	    dn->dn_objset->os_dsl_dataset == NULL ||
    882 	    dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
    883 
    884 	/*
    885 	 * We make this assert for private objects as well, but after we
    886 	 * check if we're already dirty.  They are allowed to re-dirty
    887 	 * in syncing context.
    888 	 */
    889 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
    890 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
    891 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
    892 
    893 	mutex_enter(&db->db_mtx);
    894 	/*
    895 	 * XXX make this true for indirects too?  The problem is that
    896 	 * transactions created with dmu_tx_create_assigned() from
    897 	 * syncing context don't bother holding ahead.
    898 	 */
    899 	ASSERT(db->db_level != 0 ||
    900 	    db-