1 789 ahrens /* 2 789 ahrens * CDDL HEADER START 3 789 ahrens * 4 789 ahrens * The contents of this file are subject to the terms of the 5 1491 ahrens * Common Development and Distribution License (the "License"). 6 1491 ahrens * You may not use this file except in compliance with the License. 7 789 ahrens * 8 789 ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 789 ahrens * or http://www.opensolaris.org/os/licensing. 10 789 ahrens * See the License for the specific language governing permissions 11 789 ahrens * and limitations under the License. 12 789 ahrens * 13 789 ahrens * When distributing Covered Code, include this CDDL HEADER in each 14 789 ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 789 ahrens * If applicable, add the following below this CDDL HEADER, with the 16 789 ahrens * fields enclosed by brackets "[]" replaced with your own identifying 17 789 ahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18 789 ahrens * 19 789 ahrens * CDDL HEADER END 20 789 ahrens */ 21 789 ahrens /* 22 8582 Brendan * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 789 ahrens * Use is subject to license terms. 
24 789 ahrens */ 25 789 ahrens 26 789 ahrens #include <sys/zfs_context.h> 27 789 ahrens #include <sys/dmu.h> 28 789 ahrens #include <sys/dmu_impl.h> 29 789 ahrens #include <sys/dbuf.h> 30 789 ahrens #include <sys/dmu_objset.h> 31 789 ahrens #include <sys/dsl_dataset.h> 32 789 ahrens #include <sys/dsl_dir.h> 33 789 ahrens #include <sys/dmu_tx.h> 34 789 ahrens #include <sys/spa.h> 35 789 ahrens #include <sys/zio.h> 36 789 ahrens #include <sys/dmu_zfetch.h> 37 789 ahrens 38 789 ahrens static void dbuf_destroy(dmu_buf_impl_t *db); 39 789 ahrens static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 40 7046 ahrens static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 41 789 ahrens 42 789 ahrens /* 43 789 ahrens * Global data structures and functions for the dbuf cache. 44 789 ahrens */ 45 789 ahrens static kmem_cache_t *dbuf_cache; 46 789 ahrens 47 789 ahrens /* ARGSUSED */ 48 789 ahrens static int 49 789 ahrens dbuf_cons(void *vdb, void *unused, int kmflag) 50 789 ahrens { 51 789 ahrens dmu_buf_impl_t *db = vdb; 52 789 ahrens bzero(db, sizeof (dmu_buf_impl_t)); 53 789 ahrens 54 789 ahrens mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 55 789 ahrens cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 56 789 ahrens refcount_create(&db->db_holds); 57 789 ahrens return (0); 58 789 ahrens } 59 789 ahrens 60 789 ahrens /* ARGSUSED */ 61 789 ahrens static void 62 789 ahrens dbuf_dest(void *vdb, void *unused) 63 789 ahrens { 64 789 ahrens dmu_buf_impl_t *db = vdb; 65 789 ahrens mutex_destroy(&db->db_mtx); 66 789 ahrens cv_destroy(&db->db_changed); 67 789 ahrens refcount_destroy(&db->db_holds); 68 789 ahrens } 69 789 ahrens 70 789 ahrens /* 71 789 ahrens * dbuf hash table routines 72 789 ahrens */ 73 789 ahrens static dbuf_hash_table_t dbuf_hash_table; 74 789 ahrens 75 789 ahrens static uint64_t dbuf_hash_count; 76 789 ahrens 77 789 ahrens static uint64_t 78 789 ahrens dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 79 789 ahrens 
{ 80 789 ahrens uintptr_t osv = (uintptr_t)os; 81 789 ahrens uint64_t crc = -1ULL; 82 789 ahrens 83 789 ahrens ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 84 789 ahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 85 789 ahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 86 789 ahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 87 789 ahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 88 789 ahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 89 789 ahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 90 789 ahrens 91 789 ahrens crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 92 789 ahrens 93 789 ahrens return (crc); 94 789 ahrens } 95 789 ahrens 96 789 ahrens #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 97 789 ahrens 98 789 ahrens #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 99 789 ahrens ((dbuf)->db.db_object == (obj) && \ 100 789 ahrens (dbuf)->db_objset == (os) && \ 101 789 ahrens (dbuf)->db_level == (level) && \ 102 789 ahrens (dbuf)->db_blkid == (blkid)) 103 789 ahrens 104 789 ahrens dmu_buf_impl_t * 105 789 ahrens dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 106 789 ahrens { 107 789 ahrens dbuf_hash_table_t *h = &dbuf_hash_table; 108 10298 Matthew objset_t *os = dn->dn_objset; 109 789 ahrens uint64_t obj = dn->dn_object; 110 789 ahrens uint64_t hv = DBUF_HASH(os, obj, level, blkid); 111 789 ahrens uint64_t idx = hv & h->hash_table_mask; 112 789 ahrens dmu_buf_impl_t *db; 113 789 ahrens 114 789 ahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 115 789 ahrens for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 116 789 ahrens if (DBUF_EQUAL(db, os, obj, level, blkid)) { 117 789 ahrens mutex_enter(&db->db_mtx); 118 1544 eschrock if (db->db_state != DB_EVICTING) { 119 789 ahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 120 789 ahrens return (db); 121 789 ahrens } 122 789 ahrens mutex_exit(&db->db_mtx); 123 
789 ahrens } 124 789 ahrens } 125 789 ahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 126 789 ahrens return (NULL); 127 789 ahrens } 128 789 ahrens 129 789 ahrens /* 130 789 ahrens * Insert an entry into the hash table. If there is already an element 131 789 ahrens * equal to elem in the hash table, then the already existing element 132 789 ahrens * will be returned and the new element will not be inserted. 133 789 ahrens * Otherwise returns NULL. 134 789 ahrens */ 135 789 ahrens static dmu_buf_impl_t * 136 789 ahrens dbuf_hash_insert(dmu_buf_impl_t *db) 137 789 ahrens { 138 789 ahrens dbuf_hash_table_t *h = &dbuf_hash_table; 139 10298 Matthew objset_t *os = db->db_objset; 140 789 ahrens uint64_t obj = db->db.db_object; 141 789 ahrens int level = db->db_level; 142 789 ahrens uint64_t blkid = db->db_blkid; 143 789 ahrens uint64_t hv = DBUF_HASH(os, obj, level, blkid); 144 789 ahrens uint64_t idx = hv & h->hash_table_mask; 145 789 ahrens dmu_buf_impl_t *dbf; 146 789 ahrens 147 789 ahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 148 789 ahrens for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 149 789 ahrens if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 150 789 ahrens mutex_enter(&dbf->db_mtx); 151 1544 eschrock if (dbf->db_state != DB_EVICTING) { 152 789 ahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 153 789 ahrens return (dbf); 154 789 ahrens } 155 789 ahrens mutex_exit(&dbf->db_mtx); 156 789 ahrens } 157 789 ahrens } 158 789 ahrens 159 789 ahrens mutex_enter(&db->db_mtx); 160 789 ahrens db->db_hash_next = h->hash_table[idx]; 161 789 ahrens h->hash_table[idx] = db; 162 789 ahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 163 789 ahrens atomic_add_64(&dbuf_hash_count, 1); 164 789 ahrens 165 789 ahrens return (NULL); 166 789 ahrens } 167 789 ahrens 168 789 ahrens /* 169 789 ahrens * Remove an entry from the hash table. This operation will 170 789 ahrens * fail if there are any existing holds on the db. 
171 789 ahrens */ 172 789 ahrens static void 173 789 ahrens dbuf_hash_remove(dmu_buf_impl_t *db) 174 789 ahrens { 175 789 ahrens dbuf_hash_table_t *h = &dbuf_hash_table; 176 789 ahrens uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 177 789 ahrens db->db_level, db->db_blkid); 178 789 ahrens uint64_t idx = hv & h->hash_table_mask; 179 789 ahrens dmu_buf_impl_t *dbf, **dbp; 180 789 ahrens 181 789 ahrens /* 182 789 ahrens * We musn't hold db_mtx to maintin lock ordering: 183 789 ahrens * DBUF_HASH_MUTEX > db_mtx. 184 789 ahrens */ 185 789 ahrens ASSERT(refcount_is_zero(&db->db_holds)); 186 1544 eschrock ASSERT(db->db_state == DB_EVICTING); 187 789 ahrens ASSERT(!MUTEX_HELD(&db->db_mtx)); 188 789 ahrens 189 789 ahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 190 789 ahrens dbp = &h->hash_table[idx]; 191 789 ahrens while ((dbf = *dbp) != db) { 192 789 ahrens dbp = &dbf->db_hash_next; 193 789 ahrens ASSERT(dbf != NULL); 194 789 ahrens } 195 789 ahrens *dbp = db->db_hash_next; 196 789 ahrens db->db_hash_next = NULL; 197 789 ahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 198 789 ahrens atomic_add_64(&dbuf_hash_count, -1); 199 789 ahrens } 200 789 ahrens 201 1544 eschrock static arc_evict_func_t dbuf_do_evict; 202 789 ahrens 203 789 ahrens static void 204 789 ahrens dbuf_evict_user(dmu_buf_impl_t *db) 205 789 ahrens { 206 789 ahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 207 789 ahrens 208 3547 maybee if (db->db_level != 0 || db->db_evict_func == NULL) 209 789 ahrens return; 210 789 ahrens 211 3547 maybee if (db->db_user_data_ptr_ptr) 212 3547 maybee *db->db_user_data_ptr_ptr = db->db.db_data; 213 3547 maybee db->db_evict_func(&db->db, db->db_user_ptr); 214 3547 maybee db->db_user_ptr = NULL; 215 3547 maybee db->db_user_data_ptr_ptr = NULL; 216 3547 maybee db->db_evict_func = NULL; 217 789 ahrens } 218 789 ahrens 219 789 ahrens void 220 1544 eschrock dbuf_evict(dmu_buf_impl_t *db) 221 1544 eschrock { 222 1544 eschrock ASSERT(MUTEX_HELD(&db->db_mtx)); 223 1544 eschrock 
ASSERT(db->db_buf == NULL); 224 3547 maybee ASSERT(db->db_data_pending == NULL); 225 1544 eschrock 226 1544 eschrock dbuf_clear(db); 227 1544 eschrock dbuf_destroy(db); 228 1544 eschrock } 229 1544 eschrock 230 1544 eschrock void 231 789 ahrens dbuf_init(void) 232 789 ahrens { 233 1544 eschrock uint64_t hsize = 1ULL << 16; 234 789 ahrens dbuf_hash_table_t *h = &dbuf_hash_table; 235 789 ahrens int i; 236 789 ahrens 237 789 ahrens /* 238 789 ahrens * The hash table is big enough to fill all of physical memory 239 1544 eschrock * with an average 4K block size. The table will take up 240 1544 eschrock * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 241 789 ahrens */ 242 1544 eschrock while (hsize * 4096 < physmem * PAGESIZE) 243 789 ahrens hsize <<= 1; 244 789 ahrens 245 1544 eschrock retry: 246 789 ahrens h->hash_table_mask = hsize - 1; 247 1544 eschrock h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 248 1544 eschrock if (h->hash_table == NULL) { 249 1544 eschrock /* XXX - we should really return an error instead of assert */ 250 1544 eschrock ASSERT(hsize > (1ULL << 10)); 251 1544 eschrock hsize >>= 1; 252 1544 eschrock goto retry; 253 1544 eschrock } 254 789 ahrens 255 789 ahrens dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 256 789 ahrens sizeof (dmu_buf_impl_t), 257 789 ahrens 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 258 789 ahrens 259 789 ahrens for (i = 0; i < DBUF_MUTEXES; i++) 260 789 ahrens mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 261 789 ahrens } 262 789 ahrens 263 789 ahrens void 264 789 ahrens dbuf_fini(void) 265 789 ahrens { 266 789 ahrens dbuf_hash_table_t *h = &dbuf_hash_table; 267 789 ahrens int i; 268 789 ahrens 269 789 ahrens for (i = 0; i < DBUF_MUTEXES; i++) 270 789 ahrens mutex_destroy(&h->hash_mutexes[i]); 271 789 ahrens kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 272 789 ahrens kmem_cache_destroy(dbuf_cache); 273 789 ahrens } 274 789 ahrens 275 789 ahrens /* 
276 789 ahrens * Other stuff. 277 789 ahrens */ 278 789 ahrens 279 873 ek110237 #ifdef ZFS_DEBUG 280 789 ahrens static void 281 789 ahrens dbuf_verify(dmu_buf_impl_t *db) 282 789 ahrens { 283 789 ahrens dnode_t *dn = db->db_dnode; 284 10922 Jeff dbuf_dirty_record_t *dr; 285 789 ahrens 286 789 ahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 287 789 ahrens 288 789 ahrens if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 289 789 ahrens return; 290 789 ahrens 291 789 ahrens ASSERT(db->db_objset != NULL); 292 789 ahrens if (dn == NULL) { 293 789 ahrens ASSERT(db->db_parent == NULL); 294 789 ahrens ASSERT(db->db_blkptr == NULL); 295 789 ahrens } else { 296 789 ahrens ASSERT3U(db->db.db_object, ==, dn->dn_object); 297 789 ahrens ASSERT3P(db->db_objset, ==, dn->dn_objset); 298 789 ahrens ASSERT3U(db->db_level, <, dn->dn_nlevels); 299 1544 eschrock ASSERT(db->db_blkid == DB_BONUS_BLKID || 300 1544 eschrock list_head(&dn->dn_dbufs)); 301 789 ahrens } 302 789 ahrens if (db->db_blkid == DB_BONUS_BLKID) { 303 789 ahrens ASSERT(dn != NULL); 304 4944 maybee ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 305 789 ahrens ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); 306 789 ahrens } else { 307 789 ahrens ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 308 789 ahrens } 309 789 ahrens 310 10922 Jeff for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 311 10922 Jeff ASSERT(dr->dr_dbuf == db); 312 10922 Jeff 313 10922 Jeff for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 314 10922 Jeff ASSERT(dr->dr_dbuf == db); 315 10922 Jeff 316 7837 Matthew /* 317 7837 Matthew * We can't assert that db_size matches dn_datablksz because it 318 7837 Matthew * can be momentarily different when another thread is doing 319 7837 Matthew * dnode_set_blksz(). 
320 7837 Matthew */ 321 7837 Matthew if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 322 10922 Jeff dr = db->db_data_pending; 323 7837 Matthew /* 324 7837 Matthew * It should only be modified in syncing context, so 325 7837 Matthew * make sure we only have one copy of the data. 326 7837 Matthew */ 327 7837 Matthew ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 328 789 ahrens } 329 789 ahrens 330 789 ahrens /* verify db->db_blkptr */ 331 789 ahrens if (db->db_blkptr) { 332 789 ahrens if (db->db_parent == dn->dn_dbuf) { 333 789 ahrens /* db is pointed to by the dnode */ 334 789 ahrens /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 335 9396 Matthew if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 336 789 ahrens ASSERT(db->db_parent == NULL); 337 789 ahrens else 338 789 ahrens ASSERT(db->db_parent != NULL); 339 789 ahrens ASSERT3P(db->db_blkptr, ==, 340 789 ahrens &dn->dn_phys->dn_blkptr[db->db_blkid]); 341 789 ahrens } else { 342 789 ahrens /* db is pointed to by an indirect block */ 343 789 ahrens int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 344 789 ahrens ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 345 789 ahrens ASSERT3U(db->db_parent->db.db_object, ==, 346 789 ahrens db->db.db_object); 347 789 ahrens /* 348 789 ahrens * dnode_grow_indblksz() can make this fail if we don't 349 789 ahrens * have the struct_rwlock. XXX indblksz no longer 350 789 ahrens * grows. safe to do this now? 
351 789 ahrens */ 352 789 ahrens if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { 353 789 ahrens ASSERT3P(db->db_blkptr, ==, 354 789 ahrens ((blkptr_t *)db->db_parent->db.db_data + 355 789 ahrens db->db_blkid % epb)); 356 789 ahrens } 357 789 ahrens } 358 789 ahrens } 359 789 ahrens if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 360 789 ahrens db->db.db_data && db->db_blkid != DB_BONUS_BLKID && 361 789 ahrens db->db_state != DB_FILL && !dn->dn_free_txg) { 362 789 ahrens /* 363 789 ahrens * If the blkptr isn't set but they have nonzero data, 364 789 ahrens * it had better be dirty, otherwise we'll lose that 365 789 ahrens * data when we evict this buffer. 366 789 ahrens */ 367 789 ahrens if (db->db_dirtycnt == 0) { 368 789 ahrens uint64_t *buf = db->db.db_data; 369 789 ahrens int i; 370 789 ahrens 371 789 ahrens for (i = 0; i < db->db.db_size >> 3; i++) { 372 789 ahrens ASSERT(buf[i] == 0); 373 789 ahrens } 374 789 ahrens } 375 789 ahrens } 376 873 ek110237 } 377 789 ahrens #endif 378 789 ahrens 379 789 ahrens static void 380 789 ahrens dbuf_update_data(dmu_buf_impl_t *db) 381 789 ahrens { 382 789 ahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 383 3547 maybee if (db->db_level == 0 && db->db_user_data_ptr_ptr) { 384 789 ahrens ASSERT(!refcount_is_zero(&db->db_holds)); 385 3547 maybee *db->db_user_data_ptr_ptr = db->db.db_data; 386 789 ahrens } 387 789 ahrens } 388 789 ahrens 389 789 ahrens static void 390 789 ahrens dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 391 789 ahrens { 392 789 ahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 393 1544 eschrock ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 394 789 ahrens db->db_buf = buf; 395 1544 eschrock if (buf != NULL) { 396 1544 eschrock ASSERT(buf->b_data != NULL); 397 1544 eschrock db->db.db_data = buf->b_data; 398 1544 eschrock if (!arc_released(buf)) 399 1544 eschrock arc_set_callback(buf, dbuf_do_evict, db); 400 1544 eschrock dbuf_update_data(db); 401 1544 eschrock } else { 402 1544 eschrock 
dbuf_evict_user(db); 403 1544 eschrock db->db.db_data = NULL; 404 7872 Tim if (db->db_state != DB_NOFILL) 405 7872 Tim db->db_state = DB_UNCACHED; 406 1544 eschrock } 407 789 ahrens } 408 789 ahrens 409 789 ahrens uint64_t 410 789 ahrens dbuf_whichblock(dnode_t *dn, uint64_t offset) 411 789 ahrens { 412 789 ahrens if (dn->dn_datablkshift) { 413 789 ahrens return (offset >> dn->dn_datablkshift); 414 789 ahrens } else { 415 789 ahrens ASSERT3U(offset, <, dn->dn_datablksz); 416 789 ahrens return (0); 417 789 ahrens } 418 789 ahrens } 419 789 ahrens 420 789 ahrens static void 421 789 ahrens dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 422 789 ahrens { 423 789 ahrens dmu_buf_impl_t *db = vdb; 424 789 ahrens 425 789 ahrens mutex_enter(&db->db_mtx); 426 789 ahrens ASSERT3U(db->db_state, ==, DB_READ); 427 789 ahrens /* 428 789 ahrens * All reads are synchronous, so we must have a hold on the dbuf 429 789 ahrens */ 430 789 ahrens ASSERT(refcount_count(&db->db_holds) > 0); 431 1544 eschrock ASSERT(db->db_buf == NULL); 432 789 ahrens ASSERT(db->db.db_data == NULL); 433 3547 maybee if (db->db_level == 0 && db->db_freed_in_flight) { 434 789 ahrens /* we were freed in flight; disregard any error */ 435 789 ahrens arc_release(buf, db); 436 789 ahrens bzero(buf->b_data, db->db.db_size); 437 3093 ahrens arc_buf_freeze(buf); 438 3547 maybee db->db_freed_in_flight = FALSE; 439 789 ahrens dbuf_set_data(db, buf); 440 789 ahrens db->db_state = DB_CACHED; 441 789 ahrens } else if (zio == NULL || zio->io_error == 0) { 442 789 ahrens dbuf_set_data(db, buf); 443 789 ahrens db->db_state = DB_CACHED; 444 789 ahrens } else { 445 789 ahrens ASSERT(db->db_blkid != DB_BONUS_BLKID); 446 1544 eschrock ASSERT3P(db->db_buf, ==, NULL); 447 1544 eschrock VERIFY(arc_buf_remove_ref(buf, db) == 1); 448 789 ahrens db->db_state = DB_UNCACHED; 449 789 ahrens } 450 789 ahrens cv_broadcast(&db->db_changed); 451 789 ahrens mutex_exit(&db->db_mtx); 452 1544 eschrock dbuf_rele(db, NULL); 453 789 ahrens } 

/*
 * Begin filling an uncached dbuf.
 *
 * Entered with db_mtx held and the dbuf in DB_UNCACHED with no ARC buffer
 * (all asserted); db_mtx is dropped on every path before returning.
 *
 *  - Bonus buffer: a DN_MAX_BONUSLEN zio buffer is allocated and filled
 *    from the dnode's on-disk bonus area; state becomes DB_CACHED.
 *  - No block pointer, a hole, or a level-0 block reported as freed: a
 *    zero-filled ARC buffer is attached, state becomes DB_CACHED and
 *    DB_RF_CACHED is set in *flags.
 *  - Otherwise: state becomes DB_READ, a reference is added with
 *    dbuf_add_ref(), and arc_read() is issued with dbuf_read_done() as the
 *    completion callback.  DB_RF_CACHED is set in *flags when arc_read()
 *    reports ARC_CACHED.
 */
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn = db->db_dnode;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;
	arc_buf_t *pbuf;

	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DB_BONUS_BLKID) {
		/* Use the smaller of the in-core and on-disk bonus lengths. */
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		/* Zero first so bytes past bonuslen are well defined. */
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		/* Nothing on disk to read: hand back a zero-filled buffer. */
		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	/* This reference is dropped by dbuf_read_done(). */
	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */

	/* The parent buffer is either the parent dbuf's or the objset's. */
	if (db->db_parent)
		pbuf = db->db_parent->db_buf;
	else
		pbuf = db->db_objset->os_phys_buf;

	(void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}

/*
 * Make the contents of a held dbuf available, reading from disk if needed.
 *
 * db:    the dbuf; the caller must hold a reference on it (asserted).
 * zio:   optional parent zio.  When NULL and a read must be issued, a root
 *        zio is created here and waited on before returning.  When
 *        non-NULL, the read is issued under it and NOT waited on here.
 * flags: DB_RF_* flags.  DB_RF_HAVESTRUCT means the caller already holds
 *        dn_struct_rwlock; otherwise it is taken as reader for the duration
 *        of the dispatch.  DB_RF_NOPREFETCH suppresses the dmu_zfetch()
 *        call.  DB_RF_NEVERWAIT suppresses waiting on an in-progress
 *        read or fill.
 *
 * Returns 0 on success, EIO if the dbuf is in DB_NOFILL or is left
 * DB_UNCACHED after waiting, or the error from zio_wait().
 */
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (EIO);

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);

	/* Prefetch only for cacheable level-0 data blocks, unless disabled. */
	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		/* Already in memory: nothing to read. */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
	} else if (db->db_state == DB_UNCACHED) {
		if (zio == NULL) {
			zio = zio_root(db->db_dnode->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);
		}
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		/* Only wait on a zio that was created here. */
		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/* Some other state (e.g. a read or fill already running). */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

/*
 * Prepare a held, non-bonus dbuf to be overwritten without reading its
 * old contents.  Waits for any read or fill in progress to finish.  An
 * uncached dbuf gets a fresh ARC buffer and enters DB_FILL; a DB_NOFILL
 * dbuf has its data detached; the only other state expected is DB_CACHED.
 */
static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_set_data(db, NULL);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers, that have been modified in a previous transaction
 * group, before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 *
 * Caller must hold db_mtx; db must be a level-0 dbuf with data that
 * does not belong to the meta-dnode object (all asserted).
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	/*
	 * Nothing to do unless the newest dirty record still shares the
	 * dbuf's current data (db_data for bonus buffers, db_buf otherwise).
	 */
	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DB_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		/* More holds than dirty records: give the record a copy. */
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		dr->dt.dl.dr_data = arc_buf_alloc(
		    db->db_dnode->dn_objset->os_spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

/*
 * Undo an override on a level-0 dirty record: free the overriding block
 * (unless it is a hole), mark the record DR_NOT_OVERRIDDEN and release its
 * ARC buffer.  A no-op for bonus buffers and for records that are not
 * overridden.  Caller must hold db_mtx.
 */
void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DB_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp))
		dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.  Also, if we happen across any level-1 dbufs in the
 * range that were last dirtied in an earlier txg, dirty them in this
 * tx so they stay in memory.
 *
 * `start' and `end' are inclusive level-0 block ids; `end' is clamped to
 * dn_maxblkid.  dn_dbufs_mtx is held across the walk of the dnode's dbuf
 * list.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t first_l1 = start >> epbs;
	uint64_t last_l1 = end >> epbs;

	if (end > dn->dn_maxblkid) {
		end = dn->dn_maxblkid;
		last_l1 = end >> epbs;
	}
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		/* Fetch the successor first; db may be cleared below. */
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		if (db->db_level == 1 &&
		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
			mutex_enter(&db->db_mtx);
			if (db->db_last_dirty &&
			    db->db_last_dirty->dr_txg < txg) {
				/* Hold it while db_mtx is dropped to dirty. */
				dbuf_add_ref(db, FTAG);
				mutex_exit(&db->db_mtx);
				dbuf_will_dirty(db, tx);
				dbuf_rele(db, FTAG);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}

		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < start || db->db_blkid > end)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			/*
			 * No mutex_exit() on this path.
			 * NOTE(review): presumably dbuf_clear() drops
			 * db_mtx itself -- confirm against its definition.
			 */
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

/*
 * Report whether the block currently backing this dbuf could be freed.
 *
 * The birth txg is taken from the newest dirty record if there is one,
 * otherwise from db_blkptr.  With no birth txg the answer is FALSE; with
 * one, the answer is TRUE when the objset has no dataset, else whatever
 * dsl_dataset_block_freeable() returns.  Caller must hold db_mtx.
 */
static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	/* If we don't exist or are in a snapshot, we can't be freed */
	if (birth_txg)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, birth_txg));
	else
		return (FALSE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
/*
 * Mark 'db' dirty in the transaction group of 'tx' and return the dirty
 * record describing it for that txg.
 *
 * If db already has a dirty record for tx->tx_txg, that record is
 * returned (after resetting any override state on a level-0 data
 * buffer).  Otherwise a new record is allocated, linked into the
 * db_last_dirty chain, a hold tagged with the txg is added to db, and
 * the record is attached either to the parent indirect's dirty record
 * (dr_children) or, for top-level and bonus buffers, to the dnode's
 * dn_dirty_records[] list.  The parent dbuf is dirtied recursively.
 *
 * The caller must already hold db (asserted).  db_mtx and dn_mtx are
 * taken and dropped internally; dn_struct_rwlock is taken as reader if
 * the caller does not hold it as writer.
 */
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	objset_t *os = dn->dn_objset;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	/* slot in the per-txg arrays (dn_dirty_records, dn_next_nlevels) */
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	/*
	 * If this buffer is already dirty, we're done.
	 *
	 * Walk the dirty-record chain past any records for txgs newer
	 * than ours; drp is left pointing at the link where a record for
	 * this txg lives or would be inserted.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DB_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DB_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		/* indirect block: the record tracks its dirty children */
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	/* splice the new record into the chain at the position found above */
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DB_BONUS_BLKID) {
		/* bonus buffers hang directly off the dnode's dirty list */
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		dnode_willuse_space(dn, -willfree, tx);
	}

	/*
	 * Take dn_struct_rwlock as reader unless the caller already
	 * holds it as writer; remember whether we must drop it.
	 */
	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		/* there is an indirect block above us: dirty it too */
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/*
		 * possible race with dbuf_undirty(): db_mtx was dropped
		 * above, so only link the record under the parent if it
		 * is still the newest dirty record of this dbuf.
		 */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		/* top-level block: referenced directly from the dnode */
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL ||
		    db->db_parent == db->db_dnode->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	return (dr);
}
1168 789 ahrens */ 1169 5688 bonwick for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1170 3547 maybee if (dr->dr_txg <= txg) 1171 3547 maybee break; 1172 3547 maybee if (dr == NULL || dr->dr_txg < txg) { 1173 789 ahrens mutex_exit(&db->db_mtx); 1174 789 ahrens return (0); 1175 789 ahrens } 1176 3547 maybee ASSERT(dr->dr_txg == txg); 1177 10922 Jeff ASSERT(dr->dr_dbuf == db); 1178 789 ahrens 1179 789 ahrens /* 1180 789 ahrens * If this buffer is currently held, we cannot undirty 1181 789 ahrens * it, since one of the current holders may be in the 1182 789 ahrens * middle of an update. Note that users of dbuf_undirty() 1183 789 ahrens * should not place a hold on the dbuf before the call. 1184 789 ahrens */ 1185 789 ahrens if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1186 789 ahrens mutex_exit(&db->db_mtx); 1187 2688 maybee /* Make sure we don't toss this buffer at sync phase */ 1188 789 ahrens mutex_enter(&dn->dn_mtx); 1189 789 ahrens dnode_clear_range(dn, db->db_blkid, 1, tx); 1190 789 ahrens mutex_exit(&dn->dn_mtx); 1191 789 ahrens return (0); 1192 789 ahrens } 1193 789 ahrens 1194 789 ahrens dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1195 789 ahrens 1196 3547 maybee ASSERT(db->db.db_size != 0); 1197 789 ahrens 1198 3547 maybee /* XXX would be nice to fix up dn_towrite_space[] */ 1199 3547 maybee 1200 5688 bonwick *drp = dr->dr_next; 1201 3547 maybee 1202 3547 maybee if (dr->dr_parent) { 1203 3547 maybee mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1204 3547 maybee list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1205 3547 maybee mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1206 3547 maybee } else if (db->db_level+1 == dn->dn_nlevels) { 1207 6992 maybee ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1208 3547 maybee mutex_enter(&dn->dn_mtx); 1209 3547 maybee list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1210 3547 maybee mutex_exit(&dn->dn_mtx); 1211 789 ahrens } 1212 789 ahrens 1213 3547 
maybee if (db->db_level == 0) { 1214 7872 Tim if (db->db_state != DB_NOFILL) { 1215 7872 Tim dbuf_unoverride(dr); 1216 789 ahrens 1217 7872 Tim ASSERT(db->db_buf != NULL); 1218 7872 Tim ASSERT(dr->dt.dl.dr_data != NULL); 1219 7872 Tim if (dr->dt.dl.dr_data != db->db_buf) 1220 7872 Tim VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 1221 7872 Tim db) == 1); 1222 7872 Tim } 1223 3547 maybee } else { 1224 3547 maybee ASSERT(db->db_buf != NULL); 1225 3547 maybee ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 1226 4831 gw25295 mutex_destroy(&dr->dt.di.dr_mtx); 1227 4831 gw25295 list_destroy(&dr->dt.di.dr_children); 1228 3547 maybee } 1229 3547 maybee kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1230 789 ahrens 1231 789 ahrens ASSERT(db->db_dirtycnt > 0); 1232 789 ahrens db->db_dirtycnt -= 1; 1233 789 ahrens 1234 3547 maybee if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1235 1544 eschrock arc_buf_t *buf = db->db_buf; 1236 789 ahrens 1237 10922 Jeff ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); 1238 1544 eschrock dbuf_set_data(db, NULL); 1239 1544 eschrock VERIFY(arc_buf_remove_ref(buf, db) == 1); 1240 789 ahrens dbuf_evict(db); 1241 789 ahrens return (1); 1242 789 ahrens } 1243 789 ahrens 1244 789 ahrens mutex_exit(&db->db_mtx); 1245 789 ahrens return (0); 1246 789 ahrens } 1247 789 ahrens 1248 789 ahrens #pragma weak dmu_buf_will_dirty = dbuf_will_dirty 1249 789 ahrens void 1250 789 ahrens dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1251 789 ahrens { 1252 6245 maybee int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1253 789 ahrens 1254 789 ahrens ASSERT(tx->tx_txg != 0); 1255 789 ahrens ASSERT(!refcount_is_zero(&db->db_holds)); 1256 789 ahrens 1257 789 ahrens if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) 1258 789 ahrens rf |= DB_RF_HAVESTRUCT; 1259 1544 eschrock (void) dbuf_read(db, NULL, rf); 1260 3547 maybee (void) dbuf_dirty(db, tx); 1261 7872 Tim } 1262 7872 Tim 1263 7872 Tim void 1264 7872 Tim 
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1265 7872 Tim { 1266 7872 Tim dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1267 7872 Tim 1268 7872 Tim db->db_state = DB_NOFILL; 1269 7872 Tim 1270 7872 Tim dmu_buf_will_fill(db_fake, tx); 1271 789 ahrens } 1272 789 ahrens 1273 789 ahrens void 1274 1544 eschrock dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1275 789 ahrens { 1276 1544 eschrock dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1277 1544 eschrock 1278 1544 eschrock ASSERT(db->db_blkid != DB_BONUS_BLKID); 1279 789 ahrens ASSERT(tx->tx_txg != 0); 1280 789 ahrens ASSERT(db->db_level == 0); 1281 789 ahrens ASSERT(!refcount_is_zero(&db->db_holds)); 1282 789 ahrens 1283 1544 eschrock ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1284 789 ahrens dmu_tx_private_ok(tx)); 1285 789 ahrens 1286 789 ahrens dbuf_noread(db); 1287 3547 maybee (void) dbuf_dirty(db, tx); 1288 789 ahrens } 1289 789 ahrens 1290 789 ahrens #pragma weak dmu_buf_fill_done = dbuf_fill_done 1291 789 ahrens /* ARGSUSED */ 1292 789 ahrens void 1293 789 ahrens dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1294 789 ahrens { 1295 789 ahrens mutex_enter(&db->db_mtx); 1296 873 ek110237 DBUF_VERIFY(db); 1297 789 ahrens 1298 789 ahrens if (db->db_state == DB_FILL) { 1299 3547 maybee if (db->db_level == 0 && db->db_freed_in_flight) { 1300 1544 eschrock ASSERT(db->db_blkid != DB_BONUS_BLKID); 1301 789 ahrens /* we were freed while filling */ 1302 789 ahrens /* XXX dbuf_undirty? */ 1303 789 ahrens bzero(db->db.db_data, db->db.db_size); 1304 3547 maybee db->db_freed_in_flight = FALSE; 1305 789 ahrens } 1306 789 ahrens db->db_state = DB_CACHED; 1307 789 ahrens cv_broadcast(&db->db_changed); 1308 789 ahrens } 1309 789 ahrens mutex_exit(&db->db_mtx); 1310 789 ahrens } 1311 789 ahrens 1312 1544 eschrock /* 1313 9412 Aleksandr * Directly assign a provided arc buf to a given dbuf if it's not referenced 1314 9412 Aleksandr * by anybody except our caller. 
/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller.  Otherwise copy arcbuf's contents to dbuf.
 *
 * db  - held, level-0, non-bonus data dbuf of a non-meta-dnode object
 * buf - arc buffer whose size equals db->db.db_size; on every path it is
 *       either installed as the dbuf's data or has its reference removed
 *       here
 * tx  - transaction in which the dbuf is dirtied
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	/* wait out any read or fill that is currently in progress */
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	/*
	 * Cached and held by someone besides our caller and the dirty
	 * records: we cannot swap the buffer out from under them, so
	 * dirty the dbuf, copy the data in, and drop the supplied buf.
	 */
	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		return;
	}

	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			/*
			 * Already dirty in this txg: point the dirty
			 * record at the new buffer and discard the old.
			 */
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			/*
			 * No dirty record references the current buffer,
			 * so it can be released and dropped.
			 */
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		}
		/*
		 * In the remaining case an older dirty record still
		 * points at the old buffer, so only our pointer to it
		 * is cleared.
		 */
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	/* act as a filler so dbuf_dirty()/dbuf_fill_done() complete it */
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dbuf_fill_done(db, tx);
}
/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_buf_evict()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 *
 * Must be called with db_mtx held and no holds on the dbuf (both are
 * asserted).  db_mtx is not held on return: it is exited here unless
 * arc_buf_evict() reports that the dbuf is already gone.
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	/*
	 * Capture everything we need from the dbuf up front; db_dnode is
	 * cleared below and the dbuf itself may be gone after
	 * arc_buf_evict().
	 */
	dnode_t *dn = db->db_dnode;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb = dn->dn_dbuf;
	int dbuf_gone = FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DB_BONUS_BLKID) {
			/* bonus data is a zio buffer and is freed here */
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	/*
	 * The dn_dbufs list can only be modified with dn_dbufs_mtx held;
	 * otherwise the unlink is left to dbuf_destroy().
	 */
	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		dnode_rele(dn, db);
		db->db_dnode = NULL;
	}

	if (db->db_buf)
		dbuf_gone = arc_buf_evict(db->db_buf);

	/* only touch db_mtx if the dbuf still exists */
	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}
((1ULL << epbs) - 1)); 1475 1596 ahrens return (0); 1476 789 ahrens } else { 1477 789 ahrens /* the block is referenced from the dnode */ 1478 789 ahrens ASSERT3U(level, ==, nlevels-1); 1479 789 ahrens ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1480 789 ahrens blkid < dn->dn_phys->dn_nblkptr); 1481 1596 ahrens if (dn->dn_dbuf) { 1482 1596 ahrens dbuf_add_ref(dn->dn_dbuf, NULL); 1483 1596 ahrens *parentp = dn->dn_dbuf; 1484 1596 ahrens } 1485 789 ahrens *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1486 789 ahrens return (0); 1487 789 ahrens } 1488 789 ahrens } 1489 789 ahrens 1490 789 ahrens static dmu_buf_impl_t * 1491 789 ahrens dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1492 789 ahrens dmu_buf_impl_t *parent, blkptr_t *blkptr) 1493 789 ahrens { 1494 10298 Matthew objset_t *os = dn->dn_objset; 1495 789 ahrens dmu_buf_impl_t *db, *odb; 1496 789 ahrens 1497 789 ahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1498 789 ahrens ASSERT(dn->dn_type != DMU_OT_NONE); 1499 789 ahrens 1500 789 ahrens db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1501 789 ahrens 1502 789 ahrens db->db_objset = os; 1503 789 ahrens db->db.db_object = dn->dn_object; 1504 789 ahrens db->db_level = level; 1505 789 ahrens db->db_blkid = blkid; 1506 3547 maybee db->db_last_dirty = NULL; 1507 1544 eschrock db->db_dirtycnt = 0; 1508 1544 eschrock db->db_dnode = dn; 1509 1544 eschrock db->db_parent = parent; 1510 1544 eschrock db->db_blkptr = blkptr; 1511 789 ahrens 1512 3547 maybee db->db_user_ptr = NULL; 1513 3547 maybee db->db_user_data_ptr_ptr = NULL; 1514 3547 maybee db->db_evict_func = NULL; 1515 3547 maybee db->db_immediate_evict = 0; 1516 3547 maybee db->db_freed_in_flight = 0; 1517 1544 eschrock 1518 1544 eschrock if (blkid == DB_BONUS_BLKID) { 1519 1544 eschrock ASSERT3P(parent, ==, dn->dn_dbuf); 1520 4944 maybee db->db.db_size = DN_MAX_BONUSLEN - 1521 4944 maybee (dn->dn_nblkptr-1) * sizeof (blkptr_t); 1522 4944 maybee ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 1523 789 ahrens 
db->db.db_offset = DB_BONUS_BLKID; 1524 1544 eschrock db->db_state = DB_UNCACHED; 1525 1544 eschrock /* the bonus dbuf is not placed in the hash table */ 1526 8582 Brendan arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1527 1544 eschrock return (db); 1528 789 ahrens } else { 1529 789 ahrens int blocksize = 1530 789 ahrens db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; 1531 789 ahrens db->db.db_size = blocksize; 1532 789 ahrens db->db.db_offset = db->db_blkid * blocksize; 1533 789 ahrens } 1534 789 ahrens 1535 789 ahrens /* 1536 789 ahrens * Hold the dn_dbufs_mtx while we get the new dbuf 1537 789 ahrens * in the hash table *and* added to the dbufs list. 1538 789 ahrens * This prevents a possible deadlock with someone 1539 789 ahrens * trying to look up this dbuf before its added to the 1540 789 ahrens * dn_dbufs list. 1541 789 ahrens */ 1542 789 ahrens mutex_enter(&dn->dn_dbufs_mtx); 1543 1544 eschrock db->db_state = DB_EVICTING; 1544 789 ahrens if ((odb = dbuf_hash_insert(db)) != NULL) { 1545 789 ahrens /* someone else inserted it first */ 1546 789 ahrens kmem_cache_free(dbuf_cache, db); 1547 789 ahrens mutex_exit(&dn->dn_dbufs_mtx); 1548 789 ahrens return (odb); 1549 789 ahrens } 1550 789 ahrens list_insert_head(&dn->dn_dbufs, db); 1551 1544 eschrock db->db_state = DB_UNCACHED; 1552 789 ahrens mutex_exit(&dn->dn_dbufs_mtx); 1553 8582 Brendan arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1554 789 ahrens 1555 789 ahrens if (parent && parent != dn->dn_dbuf) 1556 789 ahrens dbuf_add_ref(parent, db); 1557 789 ahrens 1558 1544 eschrock ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1559 1544 eschrock refcount_count(&dn->dn_holds) > 0); 1560 789 ahrens (void) refcount_add(&dn->dn_holds, db); 1561 789 ahrens 1562 789 ahrens dprintf_dbuf(db, "db=%p\n", db); 1563 789 ahrens 1564 789 ahrens return (db); 1565 789 ahrens } 1566 789 ahrens 1567 789 ahrens static int 1568 1544 eschrock dbuf_do_evict(void *private) 1569 789 ahrens { 1570 
1544 eschrock arc_buf_t *buf = private; 1571 1544 eschrock dmu_buf_impl_t *db = buf->b_private; 1572 789 ahrens 1573 1544 eschrock if (!MUTEX_HELD(&db->db_mtx)) 1574 1544 eschrock mutex_enter(&db->db_mtx); 1575 789 ahrens 1576 1544 eschrock ASSERT(refcount_is_zero(&db->db_holds)); 1577 789 ahrens 1578 1544 eschrock if (db->db_state != DB_EVICTING) { 1579 1544 eschrock ASSERT(db->db_state == DB_CACHED); 1580 1544 eschrock DBUF_VERIFY(db); 1581 1544 eschrock db->db_buf = NULL; 1582 1544 eschrock dbuf_evict(db); 1583 1544 eschrock } else { 1584 1544 eschrock mutex_exit(&db->db_mtx); 1585 1544 eschrock dbuf_destroy(db); 1586 789 ahrens } 1587 1544 eschrock return (0); 1588 789 ahrens } 1589 789 ahrens 1590 789 ahrens static void 1591 789 ahrens dbuf_destroy(dmu_buf_impl_t *db) 1592 789 ahrens { 1593 789 ahrens ASSERT(refcount_is_zero(&db->db_holds)); 1594 789 ahrens 1595 1544 eschrock if (db->db_blkid != DB_BONUS_BLKID) { 1596 1544 eschrock /* 1597 1544 eschrock * If this dbuf is still on the dn_dbufs list, 1598 1544 eschrock * remove it from that list. 
1599 1544 eschrock */ 1600 4944 maybee if (db->db_dnode) { 1601 4944 maybee dnode_t *dn = db->db_dnode; 1602 4944 maybee 1603 4944 maybee mutex_enter(&dn->dn_dbufs_mtx); 1604 1544 eschrock list_remove(&dn->dn_dbufs, db); 1605 1596 ahrens mutex_exit(&dn->dn_dbufs_mtx); 1606 1544 eschrock 1607 1544 eschrock dnode_rele(dn, db); 1608 4944 maybee db->db_dnode = NULL; 1609 1544 eschrock } 1610 1544 eschrock dbuf_hash_remove(db); 1611 1544 eschrock } 1612 1544 eschrock db->db_parent = NULL; 1613 1544 eschrock db->db_buf = NULL; 1614 1544 eschrock 1615 4312 gw25295 ASSERT(!list_link_active(&db->db_link)); 1616 789 ahrens ASSERT(db->db.db_data == NULL); 1617 789 ahrens ASSERT(db->db_hash_next == NULL); 1618 789 ahrens ASSERT(db->db_blkptr == NULL); 1619 789 ahrens ASSERT(db->db_data_pending == NULL); 1620 789 ahrens 1621 789 ahrens kmem_cache_free(dbuf_cache, db); 1622 8582 Brendan arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1623 789 ahrens } 1624 789 ahrens 1625 789 ahrens void 1626 789 ahrens dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1627 789 ahrens { 1628 2391 maybee dmu_buf_impl_t *db = NULL; 1629 789 ahrens blkptr_t *bp = NULL; 1630 789 ahrens 1631 789 ahrens ASSERT(blkid != DB_BONUS_BLKID); 1632 789 ahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1633 789 ahrens 1634 789 ahrens if (dnode_block_freed(dn, blkid)) 1635 789 ahrens return; 1636 789 ahrens 1637 789 ahrens /* dbuf_find() returns with db_mtx held */ 1638 789 ahrens if (db = dbuf_find(dn, 0, blkid)) { 1639 2391 maybee if (refcount_count(&db->db_holds) > 0) { 1640 2391 maybee /* 1641 2391 maybee * This dbuf is active. We assume that it is 1642 2391 maybee * already CACHED, or else about to be either 1643 2391 maybee * read or filled. 
1644 2391 maybee */ 1645 2391 maybee mutex_exit(&db->db_mtx); 1646 2391 maybee return; 1647 2391 maybee } 1648 789 ahrens mutex_exit(&db->db_mtx); 1649 2417 ahrens db = NULL; 1650 789 ahrens } 1651 789 ahrens 1652 2391 maybee if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1653 789 ahrens if (bp && !BP_IS_HOLE(bp)) { 1654 7046 ahrens arc_buf_t *pbuf; 1655 10922 Jeff dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 1656 2391 maybee uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1657 1544 eschrock zbookmark_t zb; 1658 10922 Jeff 1659 10922 Jeff SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 1660 10922 Jeff dn->dn_object, 0, blkid); 1661 1544 eschrock 1662 7046 ahrens if (db) 1663 7046 ahrens pbuf = db->db_buf; 1664 7046 ahrens else 1665 7046 ahrens pbuf = dn->dn_objset->os_phys_buf; 1666 7046 ahrens 1667 7046 ahrens (void) arc_read(NULL, dn->dn_objset->os_spa, 1668 7046 ahrens bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 1669 789 ahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1670 2391 maybee &aflags, &zb); 1671 789 ahrens } 1672 2391 maybee if (db) 1673 2391 maybee dbuf_rele(db, NULL); 1674 789 ahrens } 1675 789 ahrens } 1676 789 ahrens 1677 789 ahrens /* 1678 789 ahrens * Returns with db_holds incremented, and db_mtx not held. 1679 789 ahrens * Note: dn_struct_rwlock must be held. 
1680 789 ahrens */ 1681 789 ahrens int 1682 789 ahrens dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1683 789 ahrens void *tag, dmu_buf_impl_t **dbp) 1684 789 ahrens { 1685 789 ahrens dmu_buf_impl_t *db, *parent = NULL; 1686 789 ahrens 1687 1544 eschrock ASSERT(blkid != DB_BONUS_BLKID); 1688 789 ahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1689 789 ahrens ASSERT3U(dn->dn_nlevels, >, level); 1690 789 ahrens 1691 789 ahrens *dbp = NULL; 1692 1544 eschrock top: 1693 789 ahrens /* dbuf_find() returns with db_mtx held */ 1694 789 ahrens db = dbuf_find(dn, level, blkid); 1695 789 ahrens 1696 789 ahrens if (db == NULL) { 1697 789 ahrens blkptr_t *bp = NULL; 1698 789 ahrens int err; 1699 789 ahrens 1700 1596 ahrens ASSERT3P(parent, ==, NULL); 1701 789 ahrens err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1702 789 ahrens if (fail_sparse) { 1703 789 ahrens if (err == 0 && bp && BP_IS_HOLE(bp)) 1704 789 ahrens err = ENOENT; 1705 789 ahrens if (err) { 1706 1596 ahrens if (parent) 1707 1544 eschrock dbuf_rele(parent, NULL); 1708 789 ahrens return (err); 1709 789 ahrens } 1710 789 ahrens } 1711 1544 eschrock if (err && err != ENOENT) 1712 1544 eschrock return (err); 1713 789 ahrens db = dbuf_create(dn, level, blkid, parent, bp); 1714 789 ahrens } 1715 1544 eschrock 1716 1544 eschrock if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1717 1544 eschrock arc_buf_add_ref(db->db_buf, db); 1718 1544 eschrock if (db->db_buf->b_data == NULL) { 1719 1544 eschrock dbuf_clear(db); 1720 1596 ahrens if (parent) { 1721 1596 ahrens dbuf_rele(parent, NULL); 1722 1596 ahrens parent = NULL; 1723 1596 ahrens } 1724 1544 eschrock goto top; 1725 1544 eschrock } 1726 1544 eschrock ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1727 1544 eschrock } 1728 1544 eschrock 1729 1544 eschrock ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1730 789 ahrens 1731 789 ahrens /* 1732 3547 maybee * If this buffer is currently syncing out, and we 
are 1733 3547 maybee * still referencing it from db_data, we need to make a copy 1734 3547 maybee * of it in case we decide we want to dirty it again in this txg. 1735 789 ahrens */ 1736 3547 maybee if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 1737 1544 eschrock dn->dn_object != DMU_META_DNODE_OBJECT && 1738 3547 maybee db->db_state == DB_CACHED && db->db_data_pending) { 1739 3547 maybee dbuf_dirty_record_t *dr = db->db_data_pending; 1740 789 ahrens 1741 3547 maybee if (dr->dt.dl.dr_data == db->db_buf) { 1742 3547 maybee arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1743 3547 maybee 1744 3547 maybee dbuf_set_data(db, 1745 3547 maybee arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 1746 3547 maybee db->db.db_size, db, type)); 1747 3547 maybee bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 1748 3547 maybee db->db.db_size); 1749 3547 maybee } 1750 789 ahrens } 1751 789 ahrens 1752 1544 eschrock (void) refcount_add(&db->db_holds, tag); 1753 789 ahrens dbuf_update_data(db); 1754 873 ek110237 DBUF_VERIFY(db); 1755 789 ahrens mutex_exit(&db->db_mtx); 1756 789 ahrens 1757 789 ahrens /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1758 1596 ahrens if (parent) 1759 1544 eschrock dbuf_rele(parent, NULL); 1760 789 ahrens 1761 789 ahrens ASSERT3P(db->db_dnode, ==, dn); 1762 789 ahrens ASSERT3U(db->db_blkid, ==, blkid); 1763 789 ahrens ASSERT3U(db->db_level, ==, level); 1764 789 ahrens *dbp = db; 1765 789 ahrens 1766 789 ahrens return (0); 1767 789 ahrens } 1768 789 ahrens 1769 789 ahrens dmu_buf_impl_t * 1770 1544 eschrock dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1771 789 ahrens { 1772 789 ahrens dmu_buf_impl_t *db; 1773 1544 eschrock int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1774 1544 eschrock return (err ? 
NULL : db); 1775 789 ahrens } 1776 789 ahrens 1777 789 ahrens dmu_buf_impl_t * 1778 789 ahrens dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1779 789 ahrens { 1780 789 ahrens dmu_buf_impl_t *db; 1781 1544 eschrock int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1782 1544 eschrock return (err ? NULL : db); 1783 1544 eschrock } 1784 1544 eschrock 1785 4944 maybee void 1786 1544 eschrock dbuf_create_bonus(dnode_t *dn) 1787 1544 eschrock { 1788 1544 eschrock ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1789 1544 eschrock 1790 1544 eschrock ASSERT(dn->dn_bonus == NULL); 1791 4944 maybee dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); 1792 789 ahrens } 1793 789 ahrens 1794 1544 eschrock #pragma weak dmu_buf_add_ref = dbuf_add_ref 1795 789 ahrens void 1796 789 ahrens dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1797 789 ahrens { 1798 1544 eschrock int64_t holds = refcount_add(&db->db_holds, tag); 1799 1544 eschrock ASSERT(holds > 1); 1800 789 ahrens } 1801 789 ahrens 1802 1544 eschrock #pragma weak dmu_buf_rele = dbuf_rele 1803 789 ahrens void 1804 1544 eschrock dbuf_rele(dmu_buf_impl_t *db, void *tag) 1805 789 ahrens { 1806 10922 Jeff mutex_enter(&db->db_mtx); 1807 10922 Jeff dbuf_rele_and_unlock(db, tag); 1808 10922 Jeff } 1809 10922 Jeff 1810 10922 Jeff /* 1811 10922 Jeff * dbuf_rele() for an already-locked dbuf. This is necessary to allow 1812 10922 Jeff * db_dirtycnt and db_holds to be updated atomically. 
1813 10922 Jeff */ 1814 10922 Jeff void 1815 10922 Jeff dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 1816 10922 Jeff { 1817 789 ahrens int64_t holds; 1818 789 ahrens 1819 10922 Jeff ASSERT(MUTEX_HELD(&db->db_mtx)); 1820 873 ek110237 DBUF_VERIFY(db); 1821 789 ahrens 1822 789 ahrens holds = refcount_remove(&db->db_holds, tag); 1823 1544 eschrock ASSERT(holds >= 0); 1824 1544 eschrock 1825 3547 maybee /* 1826 3547 maybee * We can't freeze indirects if there is a possibility that they 1827 3547 maybee * may be modified in the current syncing context. 1828 3547 maybee */ 1829 3547 maybee if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 1830 3093 ahrens arc_buf_freeze(db->db_buf); 1831 3093 ahrens 1832 1544 eschrock if (holds == db->db_dirtycnt && 1833 3547 maybee db->db_level == 0 && db->db_immediate_evict) 1834 1544 eschrock dbuf_evict_user(db); 1835 789 ahrens 1836 789 ahrens if (holds == 0) { 1837 1544 eschrock if (db->db_blkid == DB_BONUS_BLKID) { 1838 1544 eschrock mutex_exit(&db->db_mtx); 1839 1544 eschrock dnode_rele(db->db_dnode, db); 1840 1544 eschrock } else if (db->db_buf == NULL) { 1841 1544 eschrock /* 1842 1544 eschrock * This is a special case: we never associated this 1843 1544 eschrock * dbuf with any data allocated from the ARC. 1844 1544 eschrock */ 1845 7872 Tim ASSERT(db->db_state == DB_UNCACHED || 1846 7872 Tim db->db_state == DB_NOFILL); 1847 1544 eschrock dbuf_evict(db); 1848 3093 ahrens } else if (arc_released(db->db_buf)) { 1849 1544 eschrock arc_buf_t *buf = db->db_buf; 1850 1544 eschrock /* 1851 1544 eschrock * This dbuf has anonymous data associated with it. 
1852 1544 eschrock */ 1853 1544 eschrock dbuf_set_data(db, NULL); 1854 1544 eschrock VERIFY(arc_buf_remove_ref(buf, db) == 1); 1855 1544 eschrock dbuf_evict(db); 1856 1544 eschrock } else { 1857 1544 eschrock VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); 1858 7237 ek110237 if (!DBUF_IS_CACHEABLE(db)) 1859 7237 ek110237 dbuf_clear(db); 1860 7237 ek110237 else 1861 7237 ek110237 mutex_exit(&db->db_mtx); 1862 1544 eschrock } 1863 789 ahrens } else { 1864 789 ahrens mutex_exit(&db->db_mtx); 1865 789 ahrens } 1866 789 ahrens } 1867 789 ahrens 1868 789 ahrens #pragma weak dmu_buf_refcount = dbuf_refcount 1869 789 ahrens uint64_t 1870 789 ahrens dbuf_refcount(dmu_buf_impl_t *db) 1871 789 ahrens { 1872 789 ahrens return (refcount_count(&db->db_holds)); 1873 789 ahrens } 1874 789 ahrens 1875 789 ahrens void * 1876 789 ahrens dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1877 789 ahrens dmu_buf_evict_func_t *evict_func) 1878 789 ahrens { 1879 789 ahrens return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1880 789 ahrens user_data_ptr_ptr, evict_func)); 1881 789 ahrens } 1882 789 ahrens 1883 789 ahrens void * 1884 789 ahrens dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1885 789 ahrens dmu_buf_evict_func_t *evict_func) 1886 789 ahrens { 1887 789 ahrens dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1888 789 ahrens 1889 3547 maybee db->db_immediate_evict = TRUE; 1890 789 ahrens return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1891 789 ahrens user_data_ptr_ptr, evict_func)); 1892 789 ahrens } 1893 789 ahrens 1894 789 ahrens void * 1895 789 ahrens dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 1896 789 ahrens void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 1897 789 ahrens { 1898 789 ahrens dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1899 789 ahrens ASSERT(db->db_level == 0); 1900 789 ahrens 1901 789 ahrens ASSERT((user_ptr == NULL) == (evict_func == NULL)); 
1902 789 ahrens 1903 789 ahrens mutex_enter(&db->db_mtx); 1904 789 ahrens 1905 3547 maybee if (db->db_user_ptr == old_user_ptr) { 1906 3547 maybee db->db_user_ptr = user_ptr; 1907 3547 maybee db->db_user_data_ptr_ptr = user_data_ptr_ptr; 1908 3547 maybee db->db_evict_func = evict_func; 1909 789 ahrens 1910 789 ahrens dbuf_update_data(db); 1911 789 ahrens } else { 1912 3547 maybee old_user_ptr = db->db_user_ptr; 1913 789 ahrens } 1914 789 ahrens 1915 789 ahrens mutex_exit(&db->db_mtx); 1916 789 ahrens return (old_user_ptr); 1917 789 ahrens } 1918 789 ahrens 1919 789 ahrens void * 1920 789 ahrens dmu_buf_get_user(dmu_buf_t *db_fake) 1921 789 ahrens { 1922 789 ahrens dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1923 789 ahrens ASSERT(!refcount_is_zero(&db->db_holds)); 1924 789 ahrens 1925 3547 maybee return (db->db_user_ptr); 1926 789 ahrens } 1927 789 ahrens 1928 9653 Sanjeev boolean_t 1929 9653 Sanjeev dmu_buf_freeable(dmu_buf_t *dbuf) 1930 9653 Sanjeev { 1931 9653 Sanjeev boolean_t res = B_FALSE; 1932 9653 Sanjeev dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 1933 9653 Sanjeev 1934 9653 Sanjeev if (db->db_blkptr) 1935 9653 Sanjeev res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 1936 9653 Sanjeev db->db_blkptr->blk_birth); 1937 9653 Sanjeev 1938 9653 Sanjeev return (res); 1939 9653 Sanjeev } 1940 9653 Sanjeev 1941 3547 maybee static void 1942 3547 maybee dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 1943 789 ahrens { 1944 3547 maybee /* ASSERT(dmu_tx_is_syncing(tx) */ 1945 3547 maybee ASSERT(MUTEX_HELD(&db->db_mtx)); 1946 3547 maybee 1947 3547 maybee if (db->db_blkptr != NULL) 1948 3547 maybee return; 1949 3547 maybee 1950 3547 maybee if (db->db_level == dn->dn_phys->dn_nlevels-1) { 1951 3547 maybee /* 1952 3547 maybee * This buffer was allocated at a time when there was 1953 3547 maybee * no available blkptrs from the dnode, or it was 1954 3547 maybee * inappropriate to hook it in (i.e., nlevels mis-match). 
1955 3547 maybee */ 1956 3547 maybee ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 1957 3547 maybee ASSERT(db->db_parent == NULL); 1958 3547 maybee db->db_parent = dn->dn_dbuf; 1959 3547 maybee db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 1960 3547 maybee DBUF_VERIFY(db); 1961 3547 maybee } else { 1962 3547 maybee dmu_buf_impl_t *parent = db->db_parent; 1963 3547 maybee int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 1964 3547 maybee 1965 3547 maybee ASSERT(dn->dn_phys->dn_nlevels > 1); 1966 3547 maybee if (parent == NULL) { 1967 3547 maybee mutex_exit(&db->db_mtx); 1968 3547 maybee rw_enter(&dn->dn_struct_rwlock, RW_READER); 1969 3547 maybee (void) dbuf_hold_impl(dn, db->db_level+1, 1970 3547 maybee db->db_blkid >> epbs, FALSE, db, &parent); 1971 3547 maybee rw_exit(&dn->dn_struct_rwlock); 1972 3547 maybee mutex_enter(&db->db_mtx); 1973 3547 maybee db->db_parent = parent; 1974 3547 maybee } 1975 3547 maybee db->db_blkptr = (blkptr_t *)parent->db.db_data + 1976 3547 maybee (db->db_blkid & ((1ULL << epbs) - 1)); 1977 3547 maybee DBUF_VERIFY(db); 1978 3547 maybee } 1979 3547 maybee } 1980 3547 maybee 1981 3547 maybee static void 1982 3547 maybee dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 1983 3547 maybee { 1984 3547 maybee dmu_buf_impl_t *db = dr->dr_dbuf; 1985 3547 maybee dnode_t *dn = db->db_dnode; 1986 3547 maybee zio_t *zio; 1987 3547 maybee 1988 3547 maybee ASSERT(dmu_tx_is_syncing(tx)); 1989 3547 maybee 1990 3547 maybee dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 1991 3547 maybee 1992 3547 maybee mutex_enter(&db->db_mtx); 1993 3547 maybee 1994 3547 maybee ASSERT(db->db_level > 0); 1995 3547 maybee DBUF_VERIFY(db); 1996 3547 maybee 1997 3547 maybee if (db->db_buf == NULL) { 1998 3547 maybee mutex_exit(&db->db_mtx); 1999 3547 maybee (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2000 3547 maybee mutex_enter(&db->db_mtx); 2001 3547 maybee } 2002 3547 maybee ASSERT3U(db->db_state, ==, DB_CACHED); 2003 3547 
maybee ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2004 3547 maybee ASSERT(db->db_buf != NULL); 2005 3547 maybee 2006 3547 maybee dbuf_check_blkptr(dn, db); 2007 3547 maybee 2008 3547 maybee db->db_data_pending = dr; 2009 3547 maybee 2010 3897 maybee mutex_exit(&db->db_mtx); 2011 7046 ahrens dbuf_write(dr, db->db_buf, tx); 2012 3547 maybee 2013 3547 maybee zio = dr->dr_zio; 2014 3547 maybee mutex_enter(&dr->dt.di.dr_mtx); 2015 3547 maybee dbuf_sync_list(&dr->dt.di.dr_children, tx); 2016 3547 maybee ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2017 3547 maybee mutex_exit(&dr->dt.di.dr_mtx); 2018 3547 maybee zio_nowait(zio); 2019 3547 maybee } 2020 3547 maybee 2021 3547 maybee static void 2022 3547 maybee dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2023 3547 maybee { 2024 3547 maybee arc_buf_t **datap = &dr->dt.dl.dr_data; 2025 3547 maybee dmu_buf_impl_t *db = dr->dr_dbuf; 2026 789 ahrens dnode_t *dn = db->db_dnode; 2027 10298 Matthew objset_t *os = dn->dn_objset; 2028 3547 maybee uint64_t txg = tx->tx_txg; 2029 789 ahrens 2030 789 ahrens ASSERT(dmu_tx_is_syncing(tx)); 2031 789 ahrens 2032 789 ahrens dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2033 789 ahrens 2034 789 ahrens mutex_enter(&db->db_mtx); 2035 789 ahrens /* 2036 789 ahrens * To be synced, we must be dirtied. But we 2037 789 ahrens * might have been freed after the dirty. 
2038 789 ahrens */ 2039 789 ahrens if (db->db_state == DB_UNCACHED) { 2040 789 ahrens /* This buffer has been freed since it was dirtied */ 2041 789 ahrens ASSERT(db->db.db_data == NULL); 2042 789 ahrens } else if (db->db_state == DB_FILL) { 2043 789 ahrens /* This buffer was freed and is now being re-filled */ 2044 3547 maybee ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2045 789 ahrens } else { 2046 7872 Tim ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2047 789 ahrens } 2048 873 ek110237 DBUF_VERIFY(db); 2049 789 ahrens 2050 789 ahrens /* 2051 3547 maybee * If this is a bonus buffer, simply copy the bonus data into the 2052 3547 maybee * dnode. It will be written out when the dnode is synced (and it 2053 3547 maybee * will be synced, since it must have been dirty for dbuf_sync to 2054 3547 maybee * be called). 2055 789 ahrens */ 2056 1544 eschrock if (db->db_blkid == DB_BONUS_BLKID) { 2057 3547 maybee dbuf_dirty_record_t **drp; 2058 4944 maybee 2059 1544 eschrock ASSERT(*datap != NULL); 2060 1544 eschrock ASSERT3U(db->db_level, ==, 0); 2061 1544 eschrock ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 2062 1544 eschrock bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 2063 4309 maybee if (*datap != db->db.db_data) { 2064 1544 eschrock zio_buf_free(*datap, DN_MAX_BONUSLEN); 2065 8582 Brendan arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2066 4309 maybee } 2067 1544 eschrock db->db_data_pending = NULL; 2068 3547 maybee drp = &db->db_last_dirty; 2069 3547 maybee while (*drp != dr) 2070 3547 maybee drp = &(*drp)->dr_next; 2071 5688 bonwick ASSERT(dr->dr_next == NULL); 2072 10922 Jeff ASSERT(dr->dr_dbuf == db); 2073 5688 bonwick *drp = dr->dr_next; 2074 3547 maybee kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2075 1544 eschrock ASSERT(db->db_dirtycnt > 0); 2076 1544 eschrock db->db_dirtycnt -= 1; 2077 10922 Jeff dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2078 1544 eschrock return; 2079 1544 eschrock } 2080 1544 
eschrock 2081 3547 maybee /* 2082 4312 gw25295 * This function may have dropped the db_mtx lock allowing a dmu_sync 2083 4312 gw25295 * operation to sneak in. As a result, we need to ensure that we 2084 4312 gw25295 * don't check the dr_override_state until we have returned from 2085 4312 gw25295 * dbuf_check_blkptr. 2086 4312 gw25295 */ 2087 4312 gw25295 dbuf_check_blkptr(dn, db); 2088 4312 gw25295 2089 4312 gw25295 /* 2090 3547 maybee * If this buffer is in the middle of an immediate write, 2091 3547 maybee * wait for the synchronous IO to complete. 2092 3547 maybee */ 2093 3547 maybee while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 2094 3547 maybee ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 2095 3547 maybee cv_wait(&db->db_changed, &db->db_mtx); 2096 3547 maybee ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 2097 3547 maybee } 2098 3547 maybee 2099 8746 Matthew if (db->db_state != DB_NOFILL && 2100 8746 Matthew dn->dn_object != DMU_META_DNODE_OBJECT && 2101 8746 Matthew refcount_count(&db->db_holds) > 1 && 2102 10922 Jeff dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 2103 8746 Matthew *datap == db->db_buf) { 2104 8746 Matthew /* 2105 8746 Matthew * If this buffer is currently "in use" (i.e., there 2106 8746 Matthew * are active holds and db_data still references it), 2107 8746 Matthew * then make a copy before we start the write so that 2108 8746 Matthew * any modifications from the open txg will not leak 2109 8746 Matthew * into this write. 2110 8746 Matthew * 2111 8746 Matthew * NOTE: this copy does not need to be made for 2112 8746 Matthew * objects only modified in the syncing context (e.g. 2113 8746 Matthew * DNONE_DNODE blocks). 
2114 8746 Matthew */ 2115 8746 Matthew int blksz = arc_buf_size(*datap); 2116 8746 Matthew arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2117 8746 Matthew *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2118 8746 Matthew bcopy(db->db.db_data, (*datap)->b_data, blksz); 2119 789 ahrens } 2120 3547 maybee db->db_data_pending = dr; 2121 789 ahrens 2122 3547 maybee mutex_exit(&db->db_mtx); 2123 789 ahrens 2124 7046 ahrens dbuf_write(dr, *datap, tx); 2125 789 ahrens 2126 3547 maybee ASSERT(!list_link_active(&dr->dr_dirty_node)); 2127 3547 maybee if (dn->dn_object == DMU_META_DNODE_OBJECT) 2128 3547 maybee list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2129 3547 maybee else 2130 3547 maybee zio_nowait(dr->dr_zio); 2131 3547 maybee } 2132 1163 maybee 2133 3547 maybee void 2134 3547 maybee dbuf_sync_list(list_t *list, dmu_tx_t *tx) 2135 3547 maybee { 2136 3547 maybee dbuf_dirty_record_t *dr; 2137 3547 maybee 2138 3547 maybee while (dr = list_head(list)) { 2139 3547 maybee if (dr->dr_zio != NULL) { 2140 3547 maybee /* 2141 3547 maybee * If we find an already initialized zio then we 2142 3547 maybee * are processing the meta-dnode, and we have finished. 2143 3547 maybee * The dbufs for all dnodes are put back on the list 2144 3547 maybee * during processing, so that we can zio_wait() 2145 3547 maybee * these IOs after initiating all child IOs. 
2146 3547 maybee */ 2147 3547 maybee ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2148 3547 maybee DMU_META_DNODE_OBJECT); 2149 3547 maybee break; 2150 1163 maybee } 2151 3547 maybee list_remove(list, dr); 2152 3547 maybee if (dr->dr_dbuf->db_level > 0) 2153 3547 maybee dbuf_sync_indirect(dr, tx); 2154 3547 maybee else 2155 3547 maybee dbuf_sync_leaf(dr, tx); 2156 1163 maybee } 2157 3547 maybee } 2158 1163 maybee 2159 789 ahrens /* ARGSUSED */ 2160 789 ahrens static void 2161 3547 maybee dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2162 789 ahrens { 2163 789 ahrens dmu_buf_impl_t *db = vdb; 2164 7754 Jeff blkptr_t *bp = zio->io_bp; 2165 3547 maybee blkptr_t *bp_orig = &zio->io_bp_orig; 2166 10922 Jeff dnode_t *dn = db->db_dnode; 2167 10922 Jeff spa_t *spa = zio->io_spa; 2168 10922 Jeff int64_t delta; 2169 789 ahrens uint64_t fill = 0; 2170 10922 Jeff int i; 2171 789 ahrens 2172 7754 Jeff ASSERT(db->db_blkptr == bp); 2173 7754 Jeff 2174 10922 Jeff delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 2175 10922 Jeff dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2176 10922 Jeff zio->io_prev_space_delta = delta; 2177 789 ahrens 2178 7754 Jeff if (BP_IS_HOLE(bp)) { 2179 10922 Jeff ASSERT(bp->blk_fill == 0); 2180 3547 maybee return; 2181 3547 maybee } 2182 7754 Jeff 2183 7754 Jeff ASSERT(BP_GET_TYPE(bp) == dn->dn_type); 2184 7754 Jeff ASSERT(BP_GET_LEVEL(bp) == db->db_level); 2185 3547 maybee 2186 789 ahrens mutex_enter(&db->db_mtx); 2187 789 ahrens 2188 789 ahrens if (db->db_level == 0) { 2189 789 ahrens mutex_enter(&dn->dn_mtx); 2190 3547 maybee if (db->db_blkid > dn->dn_phys->dn_maxblkid) 2191 789 ahrens dn->dn_phys->dn_maxblkid = db->db_blkid; 2192 789 ahrens mutex_exit(&dn->dn_mtx); 2193 789 ahrens 2194 789 ahrens if (dn->dn_type == DMU_OT_DNODE) { 2195 789 ahrens dnode_phys_t *dnp = db->db.db_data; 2196 789 ahrens for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2197 789 ahrens i--, dnp++) { 2198 789 ahrens if (dnp->dn_type != 
DMU_OT_NONE) 2199 789 ahrens fill++; 2200 789 ahrens } 2201 789 ahrens } else { 2202 3547 maybee fill = 1; 2203 789 ahrens } 2204 789 ahrens } else { 2205 7754 Jeff blkptr_t *ibp = db->db.db_data; 2206 789 ahrens ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2207 7754 Jeff for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2208 7754 Jeff if (BP_IS_HOLE(ibp)) 2209 789 ahrens continue; 2210 7754 Jeff ASSERT3U(BP_GET_LSIZE(ibp), ==, 2211 789 ahrens db->db_level == 1 ? dn->dn_datablksz : 2212 789 ahrens (1<<dn->dn_phys->dn_indblkshift)); 2213 7754 Jeff fill += ibp->blk_fill; 2214 789 ahrens } 2215 789 ahrens } 2216 789 ahrens 2217 7754 Jeff bp->blk_fill = fill; 2218 3547 maybee 2219 3547 maybee mutex_exit(&db->db_mtx); 2220 3547 maybee } 2221 3547 maybee 2222 3547 maybee /* ARGSUSED */ 2223 3547 maybee static void 2224 3547 maybee dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2225 3547 maybee { 2226 3547 maybee dmu_buf_impl_t *db = vdb; 2227 10922 Jeff blkptr_t *bp = zio->io_bp; 2228 10922 Jeff blkptr_t *bp_orig = &zio->io_bp_orig; 2229 10922 Jeff dnode_t *dn = db->db_dnode; 2230 10922 Jeff objset_t *os = dn->dn_objset; 2231 3547 maybee uint64_t txg = zio->io_txg; 2232 3547 maybee dbuf_dirty_record_t **drp, *dr; 2233 3547 maybee 2234 3547 maybee ASSERT3U(zio->io_error, ==, 0); 2235 10922 Jeff ASSERT(db->db_blkptr == bp); 2236 10922 Jeff 2237 10922 Jeff if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 2238 10922 Jeff ASSERT(BP_EQUAL(bp, bp_orig)); 2239 10922 Jeff } else { 2240 10922 Jeff dsl_dataset_t *ds = os->os_dsl_dataset; 2241 10922 Jeff dmu_tx_t *tx = os->os_synctx; 2242 10922 Jeff 2243 10922 Jeff (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 2244 10922 Jeff dsl_dataset_block_born(ds, bp, tx); 2245 10922 Jeff } 2246 3547 maybee 2247 3547 maybee mutex_enter(&db->db_mtx); 2248 10922 Jeff 2249 10922 Jeff DBUF_VERIFY(db); 2250 3547 maybee 2251 3547 maybee drp = &db->db_last_dirty; 2252 5688 bonwick while ((dr = *drp) != 
db->db_data_pending) 2253 5688 bonwick drp = &dr->dr_next; 2254 5688 bonwick ASSERT(!list_link_active(&dr->dr_dirty_node)); 2255 5688 bonwick ASSERT(dr->dr_txg == txg); 2256 10922 Jeff ASSERT(dr->dr_dbuf == db); 2257 5688 bonwick ASSERT(dr->dr_next == NULL); 2258 5688 bonwick *drp = dr->dr_next; 2259 3547 maybee 2260 3547 maybee if (db->db_level == 0) { 2261 3547 maybee ASSERT(db->db_blkid != DB_BONUS_BLKID); 2262 3547 maybee ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2263 7872 Tim if (db->db_state != DB_NOFILL) { 2264 7872 Tim if (dr->dt.dl.dr_data != db->db_buf) 2265 7872 Tim VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 2266 7872 Tim db) == 1); 2267 10922 Jeff else if (!arc_released(db->db_buf)) 2268 7872 Tim arc_set_callback(db->db_buf, dbuf_do_evict, db); 2269 7872 Tim } 2270 789 ahrens } else { 2271 3547 maybee ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2272 3547 maybee ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2273 3547 maybee if (!BP_IS_HOLE(db->db_blkptr)) { 2274 3547 maybee int epbs = 2275 3547 maybee dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2276 3547 maybee ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2277 3547 maybee db->db.db_size); 2278 3547 maybee ASSERT3U(dn->dn_phys->dn_maxblkid 2279 3547 maybee >> (db->db_level * epbs), >=, db->db_blkid); 2280 3547 maybee arc_set_callback(db->db_buf, dbuf_do_evict, db); 2281 3547 maybee } 2282 4831 gw25295 mutex_destroy(&dr->dt.di.dr_mtx); 2283 4831 gw25295 list_destroy(&dr->dt.di.dr_children); 2284 789 ahrens } 2285 3547 maybee kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2286 789 ahrens 2287 789 ahrens cv_broadcast(&db->db_changed); 2288 789 ahrens ASSERT(db->db_dirtycnt > 0); 2289 789 ahrens db->db_dirtycnt -= 1; 2290 3547 maybee db->db_data_pending = NULL; 2291 10922 Jeff dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2292 10922 Jeff } 2293 10922 Jeff 2294 10922 Jeff static void 2295 10922 Jeff dbuf_write_nofill_ready(zio_t *zio) 2296 10922 Jeff { 2297 10922 Jeff 
dbuf_write_ready(zio, NULL, zio->io_private); 2298 10922 Jeff } 2299 10922 Jeff 2300 10922 Jeff static void 2301 10922 Jeff dbuf_write_nofill_done(zio_t *zio) 2302 10922 Jeff { 2303 10922 Jeff dbuf_write_done(zio, NULL, zio->io_private); 2304 10922 Jeff } 2305 10922 Jeff 2306 10922 Jeff static void 2307 10922 Jeff dbuf_write_override_ready(zio_t *zio) 2308 10922 Jeff { 2309 10922 Jeff dbuf_dirty_record_t *dr = zio->io_private; 2310 10922 Jeff dmu_buf_impl_t *db = dr->dr_dbuf; 2311 10922 Jeff 2312 10922 Jeff dbuf_write_ready(zio, NULL, db); 2313 10922 Jeff } 2314 10922 Jeff 2315 10922 Jeff static void 2316 10922 Jeff dbuf_write_override_done(zio_t *zio) 2317 10922 Jeff { 2318 10922 Jeff dbuf_dirty_record_t *dr = zio->io_private; 2319 10922 Jeff dmu_buf_impl_t *db = dr->dr_dbuf; 2320 10922 Jeff blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 2321 10922 Jeff 2322 10922 Jeff mutex_enter(&db->db_mtx); 2323 10922 Jeff if (!BP_EQUAL(zio->io_bp, obp)) { 2324 10922 Jeff if (!BP_IS_HOLE(obp)) 2325 10922 Jeff dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 2326 10922 Jeff arc_release(dr->dt.dl.dr_data, db); 2327 10922 Jeff } 2328 789 ahrens mutex_exit(&db->db_mtx); 2329 789 ahrens 2330 10922 Jeff dbuf_write_done(zio, NULL, db); 2331 10922 Jeff } 2332 789 ahrens 2333 10922 Jeff static void 2334 10922 Jeff dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 2335 10922 Jeff { 2336 10922 Jeff dmu_buf_impl_t *db = dr->dr_dbuf; 2337 10922 Jeff dnode_t *dn = db->db_dnode; 2338 10922 Jeff objset_t *os = dn->dn_objset; 2339 10922 Jeff dmu_buf_impl_t *parent = db->db_parent; 2340 10922 Jeff uint64_t txg = tx->tx_txg; 2341 10922 Jeff zbookmark_t zb; 2342 10922 Jeff zio_prop_t zp; 2343 10922 Jeff zio_t *zio; 2344 10922 Jeff 2345 10922 Jeff if (db->db_state != DB_NOFILL) { 2346 10922 Jeff if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 2347 10922 Jeff /* 2348 10922 Jeff * Private object buffers are released here rather 2349 10922 Jeff * than in dbuf_dirty() since 
they are only modified 2350 10922 Jeff * in the syncing context and we don't want the 2351 10922 Jeff * overhead of making multiple copies of the data. 2352 10922 Jeff */ 2353 10922 Jeff if (BP_IS_HOLE(db->db_blkptr)) { 2354 10922 Jeff arc_buf_thaw(data); 2355 10922 Jeff } else { 2356 10922 Jeff arc_release(data, db); 2357 10922 Jeff } 2358 10922 Jeff } 2359 10922 Jeff } 2360 10922 Jeff 2361 10922 Jeff if (parent != dn->dn_dbuf) { 2362 10922 Jeff ASSERT(parent && parent->db_data_pending); 2363 10922 Jeff ASSERT(db->db_level == parent->db_level-1); 2364 10922 Jeff ASSERT(arc_released(parent->db_buf)); 2365 10922 Jeff zio = parent->db_data_pending->dr_zio; 2366 10922 Jeff } else { 2367 10922 Jeff ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); 2368 10922 Jeff ASSERT3P(db->db_blkptr, ==, 2369 10922 Jeff &dn->dn_phys->dn_blkptr[db->db_blkid]); 2370 10922 Jeff zio = dn->dn_zio; 2371 10922 Jeff } 2372 10922 Jeff 2373 10922 Jeff ASSERT(db->db_level == 0 || data == db->db_buf); 2374 10922 Jeff ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 2375 10922 Jeff ASSERT(zio); 2376 10922 Jeff 2377 10922 Jeff SET_BOOKMARK(&zb, os->os_dsl_dataset ? 2378 10922 Jeff os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 2379 10922 Jeff db->db.db_object, db->db_level, db->db_blkid); 2380 10922 Jeff 2381 10922 Jeff dmu_write_policy(os, dn, db->db_level, 2382 10922 Jeff db->db_state == DB_NOFILL ? 
WP_NOFILL : 0, &zp); 2383 10922 Jeff 2384 10922 Jeff if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 2385 10922 Jeff ASSERT(db->db_state != DB_NOFILL); 2386 10922 Jeff dr->dr_zio = zio_write(zio, os->os_spa, txg, 2387 10922 Jeff db->db_blkptr, data->b_data, arc_buf_size(data), &zp, 2388 10922 Jeff dbuf_write_override_ready, dbuf_write_override_done, dr, 2389 10922 Jeff ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2390 10922 Jeff mutex_enter(&db->db_mtx); 2391 10922 Jeff dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 2392 10922 Jeff zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 2393 10922 Jeff dr->dt.dl.dr_copies); 2394 10922 Jeff mutex_exit(&db->db_mtx); 2395 10922 Jeff } else if (db->db_state == DB_NOFILL) { 2396 10922 Jeff ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); 2397 10922 Jeff dr->dr_zio = zio_write(zio, os->os_spa, txg, 2398 10922 Jeff db->db_blkptr, NULL, db->db.db_size, &zp, 2399 10922 Jeff dbuf_write_nofill_ready, dbuf_write_nofill_done, db, 2400 10922 Jeff ZIO_PRIORITY_ASYNC_WRITE, 2401 10922 Jeff ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 2402 10922 Jeff } else { 2403 10922 Jeff ASSERT(arc_released(data)); 2404 10922 Jeff dr->dr_zio = arc_write(zio, os->os_spa, txg, 2405 10922 Jeff db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, 2406 10922 Jeff dbuf_write_ready, dbuf_write_done, db, 2407 10922 Jeff ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2408 10922 Jeff } 2409 789 ahrens } 2410