/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory. This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about. Our cache is not so simple.
At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them. Blocks are only evictable
 * when there are no external references active. This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space. In these circumstances we are unable to adjust the cache
 * size. To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss. Our model has a variable sized cache. It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes). We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D.
Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use: mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()). Note however that the data associated
 * with the buffer may be evicted prior to the callback.
The callback 103 1544 eschrock * must be made with *no locks held* (to prevent deadlock). Additionally, 104 1544 eschrock * the users of callbacks must ensure that their private data is 105 1544 eschrock * protected from simultaneous callbacks from arc_buf_evict() 106 1544 eschrock * and arc_do_user_evicts(). 107 1544 eschrock * 108 789 ahrens * Note that the majority of the performance stats are manipulated 109 789 ahrens * with atomic operations. 110 5450 brendan * 111 5450 brendan * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: 112 5450 brendan * 113 5450 brendan * - L2ARC buflist creation 114 5450 brendan * - L2ARC buflist eviction 115 5450 brendan * - L2ARC write completion, which walks L2ARC buflists 116 5450 brendan * - ARC header destruction, as it removes from L2ARC buflists 117 5450 brendan * - ARC header release, as it removes from L2ARC buflists 118 789 ahrens */ 119 789 ahrens 120 789 ahrens #include <sys/spa.h> 121 789 ahrens #include <sys/zio.h> 122 789 ahrens #include <sys/zfs_context.h> 123 789 ahrens #include <sys/arc.h> 124 789 ahrens #include <sys/refcount.h> 125 6643 eschrock #include <sys/vdev.h> 126 9816 George #include <sys/vdev_impl.h> 127 789 ahrens #ifdef _KERNEL 128 789 ahrens #include <sys/vmsystm.h> 129 789 ahrens #include <vm/anon.h> 130 789 ahrens #include <sys/fs/swapnode.h> 131 1484 ek110237 #include <sys/dnlc.h> 132 789 ahrens #endif 133 789 ahrens #include <sys/callb.h> 134 3403 bmc #include <sys/kstat.h> 135 10922 Jeff #include <zfs_fletcher.h> 136 789 ahrens 137 789 ahrens static kmutex_t arc_reclaim_thr_lock; 138 789 ahrens static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 139 789 ahrens static uint8_t arc_thread_exit; 140 6245 maybee 141 6245 maybee extern int zfs_write_limit_shift; 142 6245 maybee extern uint64_t zfs_write_limit_max; 143 7468 Mark extern kmutex_t zfs_write_limit_lock; 144 1484 ek110237 145 1484 ek110237 #define ARC_REDUCE_DNLC_PERCENT 3 146 1484 ek110237 uint_t 
arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 147 789 ahrens 148 789 ahrens typedef enum arc_reclaim_strategy { 149 789 ahrens ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 150 789 ahrens ARC_RECLAIM_CONS /* Conservative reclaim strategy */ 151 789 ahrens } arc_reclaim_strategy_t; 152 789 ahrens 153 789 ahrens /* number of seconds before growing cache again */ 154 789 ahrens static int arc_grow_retry = 60; 155 789 ahrens 156 8582 Brendan /* shift of arc_c for calculating both min and max arc_p */ 157 8582 Brendan static int arc_p_min_shift = 4; 158 8582 Brendan 159 8582 Brendan /* log2(fraction of arc to reclaim) */ 160 8582 Brendan static int arc_shrink_shift = 5; 161 8582 Brendan 162 2391 maybee /* 163 2638 perrin * minimum lifespan of a prefetch block in clock ticks 164 2638 perrin * (initialized in arc_init()) 165 2391 maybee */ 166 2638 perrin static int arc_min_prefetch_lifespan; 167 2391 maybee 168 789 ahrens static int arc_dead; 169 6987 brendan 170 6987 brendan /* 171 6987 brendan * The arc has filled available memory and has now warmed up. 172 6987 brendan */ 173 6987 brendan static boolean_t arc_warm; 174 2885 ahrens 175 2885 ahrens /* 176 2885 ahrens * These tunables are for performance analysis. 
177 2885 ahrens */ 178 2885 ahrens uint64_t zfs_arc_max; 179 2885 ahrens uint64_t zfs_arc_min; 180 4645 ek110237 uint64_t zfs_arc_meta_limit = 0; 181 8582 Brendan int zfs_arc_grow_retry = 0; 182 8582 Brendan int zfs_arc_shrink_shift = 0; 183 8582 Brendan int zfs_arc_p_min_shift = 0; 184 789 ahrens 185 789 ahrens /* 186 5450 brendan * Note that buffers can be in one of 6 states: 187 789 ahrens * ARC_anon - anonymous (discussed below) 188 1544 eschrock * ARC_mru - recently used, currently cached 189 1544 eschrock * ARC_mru_ghost - recentely used, no longer in cache 190 1544 eschrock * ARC_mfu - frequently used, currently cached 191 1544 eschrock * ARC_mfu_ghost - frequently used, no longer in cache 192 5450 brendan * ARC_l2c_only - exists in L2ARC but not other states 193 4309 maybee * When there are no active references to the buffer, they are 194 4309 maybee * are linked onto a list in one of these arc states. These are 195 4309 maybee * the only buffers that can be evicted or deleted. Within each 196 4309 maybee * state there are multiple lists, one for meta-data and one for 197 4309 maybee * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 198 4309 maybee * etc.) is tracked separately so that it can be managed more 199 5450 brendan * explicitly: favored over data, limited explicitly. 200 789 ahrens * 201 789 ahrens * Anonymous buffers are buffers that are not associated with 202 789 ahrens * a DVA. These are buffers that hold dirty block copies 203 789 ahrens * before they are written to stable storage. By definition, 204 1544 eschrock * they are "ref'd" and are considered part of arc_mru 205 789 ahrens * that cannot be freed. Generally, they will aquire a DVA 206 1544 eschrock * as they are written and migrate onto the arc_mru list. 207 5450 brendan * 208 5450 brendan * The ARC_l2c_only state is for buffers that are in the second 209 5450 brendan * level ARC but no longer in any of the ARC_m* lists. 
The second 210 5450 brendan * level ARC itself may also contain buffers that are in any of 211 5450 brendan * the ARC_m* states - meaning that a buffer can exist in two 212 5450 brendan * places. The reason for the ARC_l2c_only state is to keep the 213 5450 brendan * buffer header in the hash table, so that reads that hit the 214 5450 brendan * second level ARC benefit from these fast lookups. 215 789 ahrens */ 216 789 ahrens 217 789 ahrens typedef struct arc_state { 218 4309 maybee list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ 219 4309 maybee uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 220 4309 maybee uint64_t arcs_size; /* total amount of data in this state */ 221 3403 bmc kmutex_t arcs_mtx; 222 789 ahrens } arc_state_t; 223 789 ahrens 224 5450 brendan /* The 6 states: */ 225 789 ahrens static arc_state_t ARC_anon; 226 1544 eschrock static arc_state_t ARC_mru; 227 1544 eschrock static arc_state_t ARC_mru_ghost; 228 1544 eschrock static arc_state_t ARC_mfu; 229 1544 eschrock static arc_state_t ARC_mfu_ghost; 230 5450 brendan static arc_state_t ARC_l2c_only; 231 789 ahrens 232 3403 bmc typedef struct arc_stats { 233 3403 bmc kstat_named_t arcstat_hits; 234 3403 bmc kstat_named_t arcstat_misses; 235 3403 bmc kstat_named_t arcstat_demand_data_hits; 236 3403 bmc kstat_named_t arcstat_demand_data_misses; 237 3403 bmc kstat_named_t arcstat_demand_metadata_hits; 238 3403 bmc kstat_named_t arcstat_demand_metadata_misses; 239 3403 bmc kstat_named_t arcstat_prefetch_data_hits; 240 3403 bmc kstat_named_t arcstat_prefetch_data_misses; 241 3403 bmc kstat_named_t arcstat_prefetch_metadata_hits; 242 3403 bmc kstat_named_t arcstat_prefetch_metadata_misses; 243 3403 bmc kstat_named_t arcstat_mru_hits; 244 3403 bmc kstat_named_t arcstat_mru_ghost_hits; 245 3403 bmc kstat_named_t arcstat_mfu_hits; 246 3403 bmc kstat_named_t arcstat_mfu_ghost_hits; 247 3403 bmc kstat_named_t arcstat_deleted; 248 3403 bmc kstat_named_t 
arcstat_recycle_miss; 249 3403 bmc kstat_named_t arcstat_mutex_miss; 250 3403 bmc kstat_named_t arcstat_evict_skip; 251 10357 Brendan kstat_named_t arcstat_evict_l2_cached; 252 10357 Brendan kstat_named_t arcstat_evict_l2_eligible; 253 10357 Brendan kstat_named_t arcstat_evict_l2_ineligible; 254 3403 bmc kstat_named_t arcstat_hash_elements; 255 3403 bmc kstat_named_t arcstat_hash_elements_max; 256 3403 bmc kstat_named_t arcstat_hash_collisions; 257 3403 bmc kstat_named_t arcstat_hash_chains; 258 3403 bmc kstat_named_t arcstat_hash_chain_max; 259 3403 bmc kstat_named_t arcstat_p; 260 3403 bmc kstat_named_t arcstat_c; 261 3403 bmc kstat_named_t arcstat_c_min; 262 3403 bmc kstat_named_t arcstat_c_max; 263 3403 bmc kstat_named_t arcstat_size; 264 5450 brendan kstat_named_t arcstat_hdr_size; 265 8582 Brendan kstat_named_t arcstat_data_size; 266 8582 Brendan kstat_named_t arcstat_other_size; 267 5450 brendan kstat_named_t arcstat_l2_hits; 268 5450 brendan kstat_named_t arcstat_l2_misses; 269 5450 brendan kstat_named_t arcstat_l2_feeds; 270 5450 brendan kstat_named_t arcstat_l2_rw_clash; 271 8582 Brendan kstat_named_t arcstat_l2_read_bytes; 272 8582 Brendan kstat_named_t arcstat_l2_write_bytes; 273 5450 brendan kstat_named_t arcstat_l2_writes_sent; 274 5450 brendan kstat_named_t arcstat_l2_writes_done; 275 5450 brendan kstat_named_t arcstat_l2_writes_error; 276 5450 brendan kstat_named_t arcstat_l2_writes_hdr_miss; 277 5450 brendan kstat_named_t arcstat_l2_evict_lock_retry; 278 5450 brendan kstat_named_t arcstat_l2_evict_reading; 279 5450 brendan kstat_named_t arcstat_l2_free_on_write; 280 5450 brendan kstat_named_t arcstat_l2_abort_lowmem; 281 5450 brendan kstat_named_t arcstat_l2_cksum_bad; 282 5450 brendan kstat_named_t arcstat_l2_io_error; 283 5450 brendan kstat_named_t arcstat_l2_size; 284 5450 brendan kstat_named_t arcstat_l2_hdr_size; 285 6245 maybee kstat_named_t arcstat_memory_throttle_count; 286 3403 bmc } arc_stats_t; 287 789 ahrens 288 3403 bmc static 
arc_stats_t arc_stats = { 289 3403 bmc { "hits", KSTAT_DATA_UINT64 }, 290 3403 bmc { "misses", KSTAT_DATA_UINT64 }, 291 3403 bmc { "demand_data_hits", KSTAT_DATA_UINT64 }, 292 3403 bmc { "demand_data_misses", KSTAT_DATA_UINT64 }, 293 3403 bmc { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 294 3403 bmc { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 295 3403 bmc { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 296 3403 bmc { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 297 3403 bmc { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 298 3403 bmc { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 299 3403 bmc { "mru_hits", KSTAT_DATA_UINT64 }, 300 3403 bmc { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 301 3403 bmc { "mfu_hits", KSTAT_DATA_UINT64 }, 302 3403 bmc { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 303 3403 bmc { "deleted", KSTAT_DATA_UINT64 }, 304 3403 bmc { "recycle_miss", KSTAT_DATA_UINT64 }, 305 3403 bmc { "mutex_miss", KSTAT_DATA_UINT64 }, 306 3403 bmc { "evict_skip", KSTAT_DATA_UINT64 }, 307 10357 Brendan { "evict_l2_cached", KSTAT_DATA_UINT64 }, 308 10357 Brendan { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 309 10357 Brendan { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 310 3403 bmc { "hash_elements", KSTAT_DATA_UINT64 }, 311 3403 bmc { "hash_elements_max", KSTAT_DATA_UINT64 }, 312 3403 bmc { "hash_collisions", KSTAT_DATA_UINT64 }, 313 3403 bmc { "hash_chains", KSTAT_DATA_UINT64 }, 314 3403 bmc { "hash_chain_max", KSTAT_DATA_UINT64 }, 315 3403 bmc { "p", KSTAT_DATA_UINT64 }, 316 3403 bmc { "c", KSTAT_DATA_UINT64 }, 317 3403 bmc { "c_min", KSTAT_DATA_UINT64 }, 318 3403 bmc { "c_max", KSTAT_DATA_UINT64 }, 319 5450 brendan { "size", KSTAT_DATA_UINT64 }, 320 5450 brendan { "hdr_size", KSTAT_DATA_UINT64 }, 321 8582 Brendan { "data_size", KSTAT_DATA_UINT64 }, 322 8582 Brendan { "other_size", KSTAT_DATA_UINT64 }, 323 5450 brendan { "l2_hits", KSTAT_DATA_UINT64 }, 324 5450 brendan { "l2_misses", KSTAT_DATA_UINT64 }, 325 5450 brendan { "l2_feeds", KSTAT_DATA_UINT64 }, 326 5450 
brendan { "l2_rw_clash", KSTAT_DATA_UINT64 }, 327 8582 Brendan { "l2_read_bytes", KSTAT_DATA_UINT64 }, 328 8582 Brendan { "l2_write_bytes", KSTAT_DATA_UINT64 }, 329 5450 brendan { "l2_writes_sent", KSTAT_DATA_UINT64 }, 330 5450 brendan { "l2_writes_done", KSTAT_DATA_UINT64 }, 331 5450 brendan { "l2_writes_error", KSTAT_DATA_UINT64 }, 332 5450 brendan { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 333 5450 brendan { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 334 5450 brendan { "l2_evict_reading", KSTAT_DATA_UINT64 }, 335 5450 brendan { "l2_free_on_write", KSTAT_DATA_UINT64 }, 336 5450 brendan { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 337 5450 brendan { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 338 5450 brendan { "l2_io_error", KSTAT_DATA_UINT64 }, 339 5450 brendan { "l2_size", KSTAT_DATA_UINT64 }, 340 6245 maybee { "l2_hdr_size", KSTAT_DATA_UINT64 }, 341 6245 maybee { "memory_throttle_count", KSTAT_DATA_UINT64 } 342 3403 bmc }; 343 789 ahrens 344 3403 bmc #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 345 789 ahrens 346 3403 bmc #define ARCSTAT_INCR(stat, val) \ 347 3403 bmc atomic_add_64(&arc_stats.stat.value.ui64, (val)); 348 3403 bmc 349 10922 Jeff #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 350 3403 bmc #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 351 3403 bmc 352 3403 bmc #define ARCSTAT_MAX(stat, val) { \ 353 3403 bmc uint64_t m; \ 354 3403 bmc while ((val) > (m = arc_stats.stat.value.ui64) && \ 355 3403 bmc (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 356 3403 bmc continue; \ 357 3403 bmc } 358 3403 bmc 359 3403 bmc #define ARCSTAT_MAXSTAT(stat) \ 360 3403 bmc ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 361 3403 bmc 362 3403 bmc /* 363 3403 bmc * We define a macro to allow ARC hits/misses to be easily broken down by 364 3403 bmc * two separate conditions, giving a total of four different subtypes for 365 3403 bmc * each of hits and misses (so eight statistics total). 
366 3403 bmc */ 367 3403 bmc #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 368 3403 bmc if (cond1) { \ 369 3403 bmc if (cond2) { \ 370 3403 bmc ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 371 3403 bmc } else { \ 372 3403 bmc ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 373 3403 bmc } \ 374 3403 bmc } else { \ 375 3403 bmc if (cond2) { \ 376 3403 bmc ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 377 3403 bmc } else { \ 378 3403 bmc ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 379 3403 bmc } \ 380 3403 bmc } 381 3403 bmc 382 3403 bmc kstat_t *arc_ksp; 383 10922 Jeff static arc_state_t *arc_anon; 384 3403 bmc static arc_state_t *arc_mru; 385 3403 bmc static arc_state_t *arc_mru_ghost; 386 3403 bmc static arc_state_t *arc_mfu; 387 3403 bmc static arc_state_t *arc_mfu_ghost; 388 5450 brendan static arc_state_t *arc_l2c_only; 389 3403 bmc 390 3403 bmc /* 391 3403 bmc * There are several ARC variables that are critical to export as kstats -- 392 3403 bmc * but we don't want to have to grovel around in the kstat whenever we wish to 393 3403 bmc * manipulate them. For these variables, we therefore define them to be in 394 3403 bmc * terms of the statistic variable. This assures that we are not introducing 395 3403 bmc * the possibility of inconsistency by having shadow copies of the variables, 396 3403 bmc * while still allowing the code to be readable. 
397 3403 bmc */ 398 3403 bmc #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 399 3403 bmc #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 400 3403 bmc #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 401 3403 bmc #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 402 3403 bmc #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 403 3403 bmc 404 3403 bmc static int arc_no_grow; /* Don't try to grow cache size */ 405 3403 bmc static uint64_t arc_tempreserve; 406 9412 Aleksandr static uint64_t arc_loaned_bytes; 407 4309 maybee static uint64_t arc_meta_used; 408 4309 maybee static uint64_t arc_meta_limit; 409 4309 maybee static uint64_t arc_meta_max = 0; 410 5450 brendan 411 5450 brendan typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 412 789 ahrens 413 789 ahrens typedef struct arc_callback arc_callback_t; 414 789 ahrens 415 789 ahrens struct arc_callback { 416 3547 maybee void *acb_private; 417 789 ahrens arc_done_func_t *acb_done; 418 789 ahrens arc_buf_t *acb_buf; 419 789 ahrens zio_t *acb_zio_dummy; 420 789 ahrens arc_callback_t *acb_next; 421 3547 maybee }; 422 3547 maybee 423 3547 maybee typedef struct arc_write_callback arc_write_callback_t; 424 3547 maybee 425 3547 maybee struct arc_write_callback { 426 3547 maybee void *awcb_private; 427 3547 maybee arc_done_func_t *awcb_ready; 428 3547 maybee arc_done_func_t *awcb_done; 429 3547 maybee arc_buf_t *awcb_buf; 430 789 ahrens }; 431 789 ahrens 432 789 ahrens struct arc_buf_hdr { 433 789 ahrens /* protected by hash lock */ 434 789 ahrens dva_t b_dva; 435 789 ahrens uint64_t b_birth; 436 789 ahrens uint64_t b_cksum0; 437 3093 ahrens 438 3093 ahrens kmutex_t b_freeze_lock; 439 3093 ahrens zio_cksum_t *b_freeze_cksum; 440 789 ahrens 441 789 ahrens arc_buf_hdr_t *b_hash_next; 442 789 ahrens arc_buf_t *b_buf; 443 789 ahrens uint32_t b_flags; 444 1544 eschrock uint32_t b_datacnt; 445 789 ahrens 446 3290 johansen arc_callback_t *b_acb; 447 789 ahrens 
kcondvar_t b_cv; 448 3290 johansen 449 3290 johansen /* immutable */ 450 3290 johansen arc_buf_contents_t b_type; 451 3290 johansen uint64_t b_size; 452 8636 Mark uint64_t b_spa; 453 789 ahrens 454 789 ahrens /* protected by arc state mutex */ 455 789 ahrens arc_state_t *b_state; 456 789 ahrens list_node_t b_arc_node; 457 789 ahrens 458 789 ahrens /* updated atomically */ 459 789 ahrens clock_t b_arc_access; 460 789 ahrens 461 789 ahrens /* self protecting */ 462 789 ahrens refcount_t b_refcnt; 463 5450 brendan 464 5450 brendan l2arc_buf_hdr_t *b_l2hdr; 465 5450 brendan list_node_t b_l2node; 466 789 ahrens }; 467 789 ahrens 468 1544 eschrock static arc_buf_t *arc_eviction_list; 469 1544 eschrock static kmutex_t arc_eviction_mtx; 470 2887 maybee static arc_buf_hdr_t arc_eviction_hdr; 471 2688 maybee static void arc_get_data_buf(arc_buf_t *buf); 472 2688 maybee static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 473 4309 maybee static int arc_evict_needed(arc_buf_contents_t type); 474 8636 Mark static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); 475 1544 eschrock 476 10357 Brendan static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); 477 10357 Brendan 478 1544 eschrock #define GHOST_STATE(state) \ 479 5450 brendan ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 480 5450 brendan (state) == arc_l2c_only) 481 1544 eschrock 482 789 ahrens /* 483 789 ahrens * Private ARC flags. These flags are private ARC only flags that will show up 484 789 ahrens * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 485 789 ahrens * be passed in as arc_flags in things like arc_read. However, these flags 486 789 ahrens * should never be passed and should only be set by ARC code. When adding new 487 789 ahrens * public flags, make sure not to smash the private ones. 
488 789 ahrens */ 489 789 ahrens 490 1544 eschrock #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 491 789 ahrens #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 492 789 ahrens #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 493 789 ahrens #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 494 1544 eschrock #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 495 2391 maybee #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 496 5450 brendan #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ 497 7237 ek110237 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ 498 7237 ek110237 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ 499 7237 ek110237 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ 500 789 ahrens 501 1544 eschrock #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 502 789 ahrens #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 503 789 ahrens #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 504 8582 Brendan #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) 505 789 ahrens #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 506 1544 eschrock #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 507 5450 brendan #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 508 7237 ek110237 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 509 6987 brendan #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 510 6987 brendan (hdr)->b_l2hdr != NULL) 511 5450 brendan #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 512 5450 brendan #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 513 5450 brendan #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 514 789 ahrens 515 789 ahrens /* 516 6018 brendan * Other sizes 517 6018 brendan */ 518 6018 brendan 519 6018 brendan #define 
HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 520 6018 brendan #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 521 6018 brendan 522 6018 brendan /* 523 789 ahrens * Hash table routines 524 789 ahrens */ 525 789 ahrens 526 789 ahrens #define HT_LOCK_PAD 64 527 789 ahrens 528 789 ahrens struct ht_lock { 529 789 ahrens kmutex_t ht_lock; 530 789 ahrens #ifdef _KERNEL 531 789 ahrens unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 532 789 ahrens #endif 533 789 ahrens }; 534 789 ahrens 535 789 ahrens #define BUF_LOCKS 256 536 789 ahrens typedef struct buf_hash_table { 537 789 ahrens uint64_t ht_mask; 538 789 ahrens arc_buf_hdr_t **ht_table; 539 789 ahrens struct ht_lock ht_locks[BUF_LOCKS]; 540 789 ahrens } buf_hash_table_t; 541 789 ahrens 542 789 ahrens static buf_hash_table_t buf_hash_table; 543 789 ahrens 544 789 ahrens #define BUF_HASH_INDEX(spa, dva, birth) \ 545 789 ahrens (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 546 789 ahrens #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 547 789 ahrens #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 548 789 ahrens #define HDR_LOCK(buf) \ 549 789 ahrens (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 550 789 ahrens 551 789 ahrens uint64_t zfs_crc64_table[256]; 552 789 ahrens 553 5450 brendan /* 554 5450 brendan * Level 2 ARC 555 5450 brendan */ 556 5450 brendan 557 5450 brendan #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 558 8582 Brendan #define L2ARC_HEADROOM 2 /* num of writes */ 559 8582 Brendan #define L2ARC_FEED_SECS 1 /* caching interval secs */ 560 8582 Brendan #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 561 5450 brendan 562 5450 brendan #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 563 5450 brendan #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 564 5450 brendan 565 5450 brendan /* 566 5450 brendan * L2ARC Performance Tunables 567 5450 brendan */ 568 5450 brendan uint64_t 
l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 569 6987 brendan uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 570 5450 brendan uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 571 5450 brendan uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 572 8582 Brendan uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 573 5450 brendan boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 574 8582 Brendan boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 575 8582 Brendan boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 576 5450 brendan 577 5450 brendan /* 578 5450 brendan * L2ARC Internals 579 5450 brendan */ 580 5450 brendan typedef struct l2arc_dev { 581 5450 brendan vdev_t *l2ad_vdev; /* vdev */ 582 5450 brendan spa_t *l2ad_spa; /* spa */ 583 5450 brendan uint64_t l2ad_hand; /* next write location */ 584 5450 brendan uint64_t l2ad_write; /* desired write size, bytes */ 585 6987 brendan uint64_t l2ad_boost; /* warmup write boost, bytes */ 586 5450 brendan uint64_t l2ad_start; /* first addr on device */ 587 5450 brendan uint64_t l2ad_end; /* last addr on device */ 588 5450 brendan uint64_t l2ad_evict; /* last addr eviction reached */ 589 5450 brendan boolean_t l2ad_first; /* first sweep through */ 590 8582 Brendan boolean_t l2ad_writing; /* currently writing */ 591 5450 brendan list_t *l2ad_buflist; /* buffer list */ 592 5450 brendan list_node_t l2ad_node; /* device list node */ 593 5450 brendan } l2arc_dev_t; 594 5450 brendan 595 5450 brendan static list_t L2ARC_dev_list; /* device list */ 596 5450 brendan static list_t *l2arc_dev_list; /* device list pointer */ 597 5450 brendan static kmutex_t l2arc_dev_mtx; /* device list mutex */ 598 5450 brendan static l2arc_dev_t *l2arc_dev_last; /* last device used */ 599 5450 brendan static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 600 5450 brendan static list_t 
L2ARC_free_on_write; /* free after write buf list */ 601 5450 brendan static list_t *l2arc_free_on_write; /* free after write list ptr */ 602 5450 brendan static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 603 5450 brendan static uint64_t l2arc_ndev; /* number of devices */ 604 5450 brendan 605 5450 brendan typedef struct l2arc_read_callback { 606 5450 brendan arc_buf_t *l2rcb_buf; /* read buffer */ 607 5450 brendan spa_t *l2rcb_spa; /* spa */ 608 5450 brendan blkptr_t l2rcb_bp; /* original blkptr */ 609 5450 brendan zbookmark_t l2rcb_zb; /* original bookmark */ 610 5450 brendan int l2rcb_flags; /* original flags */ 611 5450 brendan } l2arc_read_callback_t; 612 5450 brendan 613 5450 brendan typedef struct l2arc_write_callback { 614 5450 brendan l2arc_dev_t *l2wcb_dev; /* device info */ 615 5450 brendan arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 616 5450 brendan } l2arc_write_callback_t; 617 5450 brendan 618 5450 brendan struct l2arc_buf_hdr { 619 5450 brendan /* protected by arc_buf_hdr mutex */ 620 5450 brendan l2arc_dev_t *b_dev; /* L2ARC device */ 621 9215 George uint64_t b_daddr; /* disk address, offset byte */ 622 5450 brendan }; 623 5450 brendan 624 5450 brendan typedef struct l2arc_data_free { 625 5450 brendan /* protected by l2arc_free_on_write_mtx */ 626 5450 brendan void *l2df_data; 627 5450 brendan size_t l2df_size; 628 5450 brendan void (*l2df_func)(void *, size_t); 629 5450 brendan list_node_t l2df_list_node; 630 5450 brendan } l2arc_data_free_t; 631 5450 brendan 632 5450 brendan static kmutex_t l2arc_feed_thr_lock; 633 5450 brendan static kcondvar_t l2arc_feed_thr_cv; 634 5450 brendan static uint8_t l2arc_thread_exit; 635 5450 brendan 636 5450 brendan static void l2arc_read_done(zio_t *zio); 637 5450 brendan static void l2arc_hdr_stat_add(void); 638 5450 brendan static void l2arc_hdr_stat_remove(void); 639 5450 brendan 640 789 ahrens static uint64_t 641 8636 Mark buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 642 8636 
Mark { 643 789 ahrens uint8_t *vdva = (uint8_t *)dva; 644 789 ahrens uint64_t crc = -1ULL; 645 789 ahrens int i; 646 789 ahrens 647 789 ahrens ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 648 789 ahrens 649 789 ahrens for (i = 0; i < sizeof (dva_t); i++) 650 789 ahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 651 789 ahrens 652 8636 Mark crc ^= (spa>>8) ^ birth; 653 789 ahrens 654 789 ahrens return (crc); 655 789 ahrens } 656 789 ahrens 657 789 ahrens #define BUF_EMPTY(buf) \ 658 789 ahrens ((buf)->b_dva.dva_word[0] == 0 && \ 659 789 ahrens (buf)->b_dva.dva_word[1] == 0 && \ 660 789 ahrens (buf)->b_birth == 0) 661 789 ahrens 662 789 ahrens #define BUF_EQUAL(spa, dva, birth, buf) \ 663 789 ahrens ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 664 789 ahrens ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 665 789 ahrens ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 666 789 ahrens 667 789 ahrens static arc_buf_hdr_t * 668 8636 Mark buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) 669 789 ahrens { 670 789 ahrens uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 671 789 ahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 672 789 ahrens arc_buf_hdr_t *buf; 673 789 ahrens 674 789 ahrens mutex_enter(hash_lock); 675 789 ahrens for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 676 789 ahrens buf = buf->b_hash_next) { 677 789 ahrens if (BUF_EQUAL(spa, dva, birth, buf)) { 678 789 ahrens *lockp = hash_lock; 679 789 ahrens return (buf); 680 789 ahrens } 681 789 ahrens } 682 789 ahrens mutex_exit(hash_lock); 683 789 ahrens *lockp = NULL; 684 789 ahrens return (NULL); 685 789 ahrens } 686 789 ahrens 687 789 ahrens /* 688 789 ahrens * Insert an entry into the hash table. If there is already an element 689 789 ahrens * equal to elem in the hash table, then the already existing element 690 789 ahrens * will be returned and the new element will not be inserted. 691 789 ahrens * Otherwise returns NULL. 
692 789 ahrens */ 693 789 ahrens static arc_buf_hdr_t * 694 789 ahrens buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 695 789 ahrens { 696 789 ahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 697 789 ahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 698 789 ahrens arc_buf_hdr_t *fbuf; 699 3403 bmc uint32_t i; 700 789 ahrens 701 1544 eschrock ASSERT(!HDR_IN_HASH_TABLE(buf)); 702 789 ahrens *lockp = hash_lock; 703 789 ahrens mutex_enter(hash_lock); 704 789 ahrens for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 705 789 ahrens fbuf = fbuf->b_hash_next, i++) { 706 789 ahrens if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 707 789 ahrens return (fbuf); 708 789 ahrens } 709 789 ahrens 710 789 ahrens buf->b_hash_next = buf_hash_table.ht_table[idx]; 711 789 ahrens buf_hash_table.ht_table[idx] = buf; 712 1544 eschrock buf->b_flags |= ARC_IN_HASH_TABLE; 713 789 ahrens 714 789 ahrens /* collect some hash table performance data */ 715 789 ahrens if (i > 0) { 716 3403 bmc ARCSTAT_BUMP(arcstat_hash_collisions); 717 789 ahrens if (i == 1) 718 3403 bmc ARCSTAT_BUMP(arcstat_hash_chains); 719 3403 bmc 720 3403 bmc ARCSTAT_MAX(arcstat_hash_chain_max, i); 721 789 ahrens } 722 3403 bmc 723 3403 bmc ARCSTAT_BUMP(arcstat_hash_elements); 724 3403 bmc ARCSTAT_MAXSTAT(arcstat_hash_elements); 725 789 ahrens 726 789 ahrens return (NULL); 727 789 ahrens } 728 789 ahrens 729 789 ahrens static void 730 789 ahrens buf_hash_remove(arc_buf_hdr_t *buf) 731 789 ahrens { 732 789 ahrens arc_buf_hdr_t *fbuf, **bufp; 733 789 ahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 734 789 ahrens 735 789 ahrens ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 736 1544 eschrock ASSERT(HDR_IN_HASH_TABLE(buf)); 737 789 ahrens 738 789 ahrens bufp = &buf_hash_table.ht_table[idx]; 739 789 ahrens while ((fbuf = *bufp) != buf) { 740 789 ahrens ASSERT(fbuf != NULL); 741 789 ahrens bufp = &fbuf->b_hash_next; 742 789 ahrens } 743 789 ahrens *bufp 
= buf->b_hash_next; 744 789 ahrens buf->b_hash_next = NULL; 745 1544 eschrock buf->b_flags &= ~ARC_IN_HASH_TABLE; 746 789 ahrens 747 789 ahrens /* collect some hash table performance data */ 748 3403 bmc ARCSTAT_BUMPDOWN(arcstat_hash_elements); 749 3403 bmc 750 789 ahrens if (buf_hash_table.ht_table[idx] && 751 789 ahrens buf_hash_table.ht_table[idx]->b_hash_next == NULL) 752 3403 bmc ARCSTAT_BUMPDOWN(arcstat_hash_chains); 753 789 ahrens } 754 789 ahrens 755 789 ahrens /* 756 789 ahrens * Global data structures and functions for the buf kmem cache. 757 789 ahrens */ 758 789 ahrens static kmem_cache_t *hdr_cache; 759 789 ahrens static kmem_cache_t *buf_cache; 760 789 ahrens 761 789 ahrens static void 762 789 ahrens buf_fini(void) 763 789 ahrens { 764 789 ahrens int i; 765 789 ahrens 766 789 ahrens kmem_free(buf_hash_table.ht_table, 767 789 ahrens (buf_hash_table.ht_mask + 1) * sizeof (void *)); 768 789 ahrens for (i = 0; i < BUF_LOCKS; i++) 769 789 ahrens mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 770 789 ahrens kmem_cache_destroy(hdr_cache); 771 789 ahrens kmem_cache_destroy(buf_cache); 772 789 ahrens } 773 789 ahrens 774 789 ahrens /* 775 789 ahrens * Constructor callback - called when the cache is empty 776 789 ahrens * and a new buf is requested. 
777 789 ahrens */ 778 789 ahrens /* ARGSUSED */ 779 789 ahrens static int 780 789 ahrens hdr_cons(void *vbuf, void *unused, int kmflag) 781 789 ahrens { 782 789 ahrens arc_buf_hdr_t *buf = vbuf; 783 789 ahrens 784 789 ahrens bzero(buf, sizeof (arc_buf_hdr_t)); 785 789 ahrens refcount_create(&buf->b_refcnt); 786 789 ahrens cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 787 4831 gw25295 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 788 8582 Brendan arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 789 8582 Brendan 790 7545 Mark return (0); 791 7545 Mark } 792 7545 Mark 793 7545 Mark /* ARGSUSED */ 794 7545 Mark static int 795 7545 Mark buf_cons(void *vbuf, void *unused, int kmflag) 796 7545 Mark { 797 7545 Mark arc_buf_t *buf = vbuf; 798 7545 Mark 799 7545 Mark bzero(buf, sizeof (arc_buf_t)); 800 7545 Mark rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); 801 8582 Brendan arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 802 8582 Brendan 803 789 ahrens return (0); 804 789 ahrens } 805 789 ahrens 806 789 ahrens /* 807 789 ahrens * Destructor callback - called when a cached buf is 808 789 ahrens * no longer required. 
809 789 ahrens */ 810 789 ahrens /* ARGSUSED */ 811 789 ahrens static void 812 789 ahrens hdr_dest(void *vbuf, void *unused) 813 789 ahrens { 814 789 ahrens arc_buf_hdr_t *buf = vbuf; 815 789 ahrens 816 10922 Jeff ASSERT(BUF_EMPTY(buf)); 817 789 ahrens refcount_destroy(&buf->b_refcnt); 818 789 ahrens cv_destroy(&buf->b_cv); 819 4831 gw25295 mutex_destroy(&buf->b_freeze_lock); 820 8582 Brendan arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 821 7545 Mark } 822 7545 Mark 823 7545 Mark /* ARGSUSED */ 824 7545 Mark static void 825 7545 Mark buf_dest(void *vbuf, void *unused) 826 7545 Mark { 827 7545 Mark arc_buf_t *buf = vbuf; 828 7545 Mark 829 7545 Mark rw_destroy(&buf->b_lock); 830 8582 Brendan arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 831 789 ahrens } 832 789 ahrens 833 789 ahrens /* 834 789 ahrens * Reclaim callback -- invoked when memory is low. 835 789 ahrens */ 836 789 ahrens /* ARGSUSED */ 837 789 ahrens static void 838 789 ahrens hdr_recl(void *unused) 839 789 ahrens { 840 789 ahrens dprintf("hdr_recl called\n"); 841 3158 maybee /* 842 3158 maybee * umem calls the reclaim func when we destroy the buf cache, 843 3158 maybee * which is after we do arc_fini(). 844 3158 maybee */ 845 3158 maybee if (!arc_dead) 846 3158 maybee cv_signal(&arc_reclaim_thr_cv); 847 789 ahrens } 848 789 ahrens 849 789 ahrens static void 850 789 ahrens buf_init(void) 851 789 ahrens { 852 789 ahrens uint64_t *ct; 853 1544 eschrock uint64_t hsize = 1ULL << 12; 854 789 ahrens int i, j; 855 789 ahrens 856 789 ahrens /* 857 789 ahrens * The hash table is big enough to fill all of physical memory 858 1544 eschrock * with an average 64K block size. The table will take up 859 1544 eschrock * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 
860 789 ahrens */ 861 1544 eschrock while (hsize * 65536 < physmem * PAGESIZE) 862 789 ahrens hsize <<= 1; 863 1544 eschrock retry: 864 789 ahrens buf_hash_table.ht_mask = hsize - 1; 865 1544 eschrock buf_hash_table.ht_table = 866 1544 eschrock kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 867 1544 eschrock if (buf_hash_table.ht_table == NULL) { 868 1544 eschrock ASSERT(hsize > (1ULL << 8)); 869 1544 eschrock hsize >>= 1; 870 1544 eschrock goto retry; 871 1544 eschrock } 872 789 ahrens 873 789 ahrens hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 874 789 ahrens 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 875 789 ahrens buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 876 7545 Mark 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 877 789 ahrens 878 789 ahrens for (i = 0; i < 256; i++) 879 789 ahrens for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 880 789 ahrens *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 881 789 ahrens 882 789 ahrens for (i = 0; i < BUF_LOCKS; i++) { 883 789 ahrens mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 884 789 ahrens NULL, MUTEX_DEFAULT, NULL); 885 789 ahrens } 886 789 ahrens } 887 789 ahrens 888 789 ahrens #define ARC_MINTIME (hz>>4) /* 62 ms */ 889 789 ahrens 890 789 ahrens static void 891 3093 ahrens arc_cksum_verify(arc_buf_t *buf) 892 3093 ahrens { 893 3093 ahrens zio_cksum_t zc; 894 3093 ahrens 895 3312 ahrens if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 896 3093 ahrens return; 897 3093 ahrens 898 3093 ahrens mutex_enter(&buf->b_hdr->b_freeze_lock); 899 3265 ahrens if (buf->b_hdr->b_freeze_cksum == NULL || 900 3265 ahrens (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 901 3093 ahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 902 3093 ahrens return; 903 3093 ahrens } 904 3093 ahrens fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 905 3093 ahrens if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 906 3093 ahrens panic("buffer modified while frozen!"); 907 3093 ahrens 
mutex_exit(&buf->b_hdr->b_freeze_lock); 908 3093 ahrens } 909 3093 ahrens 910 5450 brendan static int 911 5450 brendan arc_cksum_equal(arc_buf_t *buf) 912 5450 brendan { 913 5450 brendan zio_cksum_t zc; 914 5450 brendan int equal; 915 5450 brendan 916 5450 brendan mutex_enter(&buf->b_hdr->b_freeze_lock); 917 5450 brendan fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 918 5450 brendan equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 919 5450 brendan mutex_exit(&buf->b_hdr->b_freeze_lock); 920 5450 brendan 921 5450 brendan return (equal); 922 5450 brendan } 923 5450 brendan 924 5450 brendan static void 925 5450 brendan arc_cksum_compute(arc_buf_t *buf, boolean_t force) 926 5450 brendan { 927 5450 brendan if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 928 3093 ahrens return; 929 3093 ahrens 930 3093 ahrens mutex_enter(&buf->b_hdr->b_freeze_lock); 931 3093 ahrens if (buf->b_hdr->b_freeze_cksum != NULL) { 932 3093 ahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 933 3093 ahrens return; 934 3093 ahrens } 935 3093 ahrens buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 936 3093 ahrens fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 937 3093 ahrens buf->b_hdr->b_freeze_cksum); 938 3093 ahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 939 3093 ahrens } 940 3093 ahrens 941 3093 ahrens void 942 3093 ahrens arc_buf_thaw(arc_buf_t *buf) 943 3093 ahrens { 944 5450 brendan if (zfs_flags & ZFS_DEBUG_MODIFY) { 945 5450 brendan if (buf->b_hdr->b_state != arc_anon) 946 5450 brendan panic("modifying non-anon buffer!"); 947 5450 brendan if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 948 5450 brendan panic("modifying buffer while i/o in progress!"); 949 5450 brendan arc_cksum_verify(buf); 950 5450 brendan } 951 5450 brendan 952 3093 ahrens mutex_enter(&buf->b_hdr->b_freeze_lock); 953 3093 ahrens if (buf->b_hdr->b_freeze_cksum != NULL) { 954 3093 ahrens kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 955 3093 ahrens 
buf->b_hdr->b_freeze_cksum = NULL; 956 3093 ahrens } 957 3093 ahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 958 3093 ahrens } 959 3093 ahrens 960 3093 ahrens void 961 3093 ahrens arc_buf_freeze(arc_buf_t *buf) 962 3093 ahrens { 963 3312 ahrens if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 964 3312 ahrens return; 965 3312 ahrens 966 3093 ahrens ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 967 3403 bmc buf->b_hdr->b_state == arc_anon); 968 5450 brendan arc_cksum_compute(buf, B_FALSE); 969 3093 ahrens } 970 3093 ahrens 971 3093 ahrens static void 972 789 ahrens add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 973 789 ahrens { 974 789 ahrens ASSERT(MUTEX_HELD(hash_lock)); 975 789 ahrens 976 789 ahrens if ((refcount_add(&ab->b_refcnt, tag) == 1) && 977 3403 bmc (ab->b_state != arc_anon)) { 978 3700 ek110237 uint64_t delta = ab->b_size * ab->b_datacnt; 979 4309 maybee list_t *list = &ab->b_state->arcs_list[ab->b_type]; 980 4309 maybee uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 981 789 ahrens 982 3403 bmc ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 983 3403 bmc mutex_enter(&ab->b_state->arcs_mtx); 984 789 ahrens ASSERT(list_link_active(&ab->b_arc_node)); 985 4309 maybee list_remove(list, ab); 986 1544 eschrock if (GHOST_STATE(ab->b_state)) { 987 1544 eschrock ASSERT3U(ab->b_datacnt, ==, 0); 988 1544 eschrock ASSERT3P(ab->b_buf, ==, NULL); 989 1544 eschrock delta = ab->b_size; 990 1544 eschrock } 991 1544 eschrock ASSERT(delta > 0); 992 4309 maybee ASSERT3U(*size, >=, delta); 993 4309 maybee atomic_add_64(size, -delta); 994 3403 bmc mutex_exit(&ab->b_state->arcs_mtx); 995 7046 ahrens /* remove the prefetch flag if we get a reference */ 996 2391 maybee if (ab->b_flags & ARC_PREFETCH) 997 2391 maybee ab->b_flags &= ~ARC_PREFETCH; 998 789 ahrens } 999 789 ahrens } 1000 789 ahrens 1001 789 ahrens static int 1002 789 ahrens remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1003 789 ahrens { 1004 789 ahrens int cnt; 1005 3403 bmc 
arc_state_t *state = ab->b_state; 1006 789 ahrens 1007 3403 bmc ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1008 3403 bmc ASSERT(!GHOST_STATE(state)); 1009 789 ahrens 1010 789 ahrens if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 1011 3403 bmc (state != arc_anon)) { 1012 4309 maybee uint64_t *size = &state->arcs_lsize[ab->b_type]; 1013 4309 maybee 1014 3403 bmc ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1015 3403 bmc mutex_enter(&state->arcs_mtx); 1016 789 ahrens ASSERT(!list_link_active(&ab->b_arc_node)); 1017 4309 maybee list_insert_head(&state->arcs_list[ab->b_type], ab); 1018 1544 eschrock ASSERT(ab->b_datacnt > 0); 1019 4309 maybee atomic_add_64(size, ab->b_size * ab->b_datacnt); 1020 3403 bmc mutex_exit(&state->arcs_mtx); 1021 789 ahrens } 1022 789 ahrens return (cnt); 1023 789 ahrens } 1024 789 ahrens 1025 789 ahrens /* 1026 789 ahrens * Move the supplied buffer to the indicated state. The mutex 1027 789 ahrens * for the buffer must be held by the caller. 1028 789 ahrens */ 1029 789 ahrens static void 1030 1544 eschrock arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 1031 789 ahrens { 1032 1544 eschrock arc_state_t *old_state = ab->b_state; 1033 3700 ek110237 int64_t refcnt = refcount_count(&ab->b_refcnt); 1034 3700 ek110237 uint64_t from_delta, to_delta; 1035 789 ahrens 1036 789 ahrens ASSERT(MUTEX_HELD(hash_lock)); 1037 1544 eschrock ASSERT(new_state != old_state); 1038 1544 eschrock ASSERT(refcnt == 0 || ab->b_datacnt > 0); 1039 1544 eschrock ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 1040 10922 Jeff ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon); 1041 10922 Jeff ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); 1042 1544 eschrock 1043 1544 eschrock from_delta = to_delta = ab->b_datacnt * ab->b_size; 1044 789 ahrens 1045 789 ahrens /* 1046 789 ahrens * If this buffer is evictable, transfer it from the 1047 789 ahrens * old state list to the new state list. 
1048 789 ahrens */ 1049 1544 eschrock if (refcnt == 0) { 1050 3403 bmc if (old_state != arc_anon) { 1051 3403 bmc int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 1052 4309 maybee uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 1053 789 ahrens 1054 1544 eschrock if (use_mutex) 1055 3403 bmc mutex_enter(&old_state->arcs_mtx); 1056 1544 eschrock 1057 1544 eschrock ASSERT(list_link_active(&ab->b_arc_node)); 1058 4309 maybee list_remove(&old_state->arcs_list[ab->b_type], ab); 1059 1544 eschrock 1060 2391 maybee /* 1061 2391 maybee * If prefetching out of the ghost cache, 1062 2391 maybee * we will have a non-null datacnt. 1063 2391 maybee */ 1064 2391 maybee if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 1065 2391 maybee /* ghost elements have a ghost size */ 1066 1544 eschrock ASSERT(ab->b_buf == NULL); 1067 1544 eschrock from_delta = ab->b_size; 1068 789 ahrens } 1069 4309 maybee ASSERT3U(*size, >=, from_delta); 1070 4309 maybee atomic_add_64(size, -from_delta); 1071 1544 eschrock 1072 1544 eschrock if (use_mutex) 1073 3403 bmc mutex_exit(&old_state->arcs_mtx); 1074 789 ahrens } 1075 3403 bmc if (new_state != arc_anon) { 1076 3403 bmc int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 1077 4309 maybee uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 1078 789 ahrens 1079 1544 eschrock if (use_mutex) 1080 3403 bmc mutex_enter(&new_state->arcs_mtx); 1081 1544 eschrock 1082 4309 maybee list_insert_head(&new_state->arcs_list[ab->b_type], ab); 1083 1544 eschrock 1084 1544 eschrock /* ghost elements have a ghost size */ 1085 1544 eschrock if (GHOST_STATE(new_state)) { 1086 1544 eschrock ASSERT(ab->b_datacnt == 0); 1087 1544 eschrock ASSERT(ab->b_buf == NULL); 1088 1544 eschrock to_delta = ab->b_size; 1089 789 ahrens } 1090 4309 maybee atomic_add_64(size, to_delta); 1091 1544 eschrock 1092 1544 eschrock if (use_mutex) 1093 3403 bmc mutex_exit(&new_state->arcs_mtx); 1094 789 ahrens } 1095 789 ahrens } 1096 789 ahrens 1097 789 ahrens ASSERT(!BUF_EMPTY(ab)); 
1098 5450 brendan if (new_state == arc_anon) { 1099 789 ahrens buf_hash_remove(ab); 1100 789 ahrens } 1101 789 ahrens 1102 1544 eschrock /* adjust state sizes */ 1103 1544 eschrock if (to_delta) 1104 3403 bmc atomic_add_64(&new_state->arcs_size, to_delta); 1105 1544 eschrock if (from_delta) { 1106 3403 bmc ASSERT3U(old_state->arcs_size, >=, from_delta); 1107 3403 bmc atomic_add_64(&old_state->arcs_size, -from_delta); 1108 789 ahrens } 1109 789 ahrens ab->b_state = new_state; 1110 5450 brendan 1111 5450 brendan /* adjust l2arc hdr stats */ 1112 5450 brendan if (new_state == arc_l2c_only) 1113 5450 brendan l2arc_hdr_stat_add(); 1114 5450 brendan else if (old_state == arc_l2c_only) 1115 5450 brendan l2arc_hdr_stat_remove(); 1116 4309 maybee } 1117 4309 maybee 1118 4309 maybee void 1119 8582 Brendan arc_space_consume(uint64_t space, arc_space_type_t type) 1120 8582 Brendan { 1121 8582 Brendan ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1122 8582 Brendan 1123 8582 Brendan switch (type) { 1124 8582 Brendan case ARC_SPACE_DATA: 1125 8582 Brendan ARCSTAT_INCR(arcstat_data_size, space); 1126 8582 Brendan break; 1127 8582 Brendan case ARC_SPACE_OTHER: 1128 8582 Brendan ARCSTAT_INCR(arcstat_other_size, space); 1129 8582 Brendan break; 1130 8582 Brendan case ARC_SPACE_HDRS: 1131 8582 Brendan ARCSTAT_INCR(arcstat_hdr_size, space); 1132 8582 Brendan break; 1133 8582 Brendan case ARC_SPACE_L2HDRS: 1134 8582 Brendan ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1135 8582 Brendan break; 1136 8582 Brendan } 1137 8582 Brendan 1138 4309 maybee atomic_add_64(&arc_meta_used, space); 1139 4309 maybee atomic_add_64(&arc_size, space); 1140 4309 maybee } 1141 4309 maybee 1142 4309 maybee void 1143 8582 Brendan arc_space_return(uint64_t space, arc_space_type_t type) 1144 8582 Brendan { 1145 8582 Brendan ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1146 8582 Brendan 1147 8582 Brendan switch (type) { 1148 8582 Brendan case ARC_SPACE_DATA: 1149 8582 Brendan ARCSTAT_INCR(arcstat_data_size, 
-space); 1150 8582 Brendan break; 1151 8582 Brendan case ARC_SPACE_OTHER: 1152 8582 Brendan ARCSTAT_INCR(arcstat_other_size, -space); 1153 8582 Brendan break; 1154 8582 Brendan case ARC_SPACE_HDRS: 1155 8582 Brendan ARCSTAT_INCR(arcstat_hdr_size, -space); 1156 8582 Brendan break; 1157 8582 Brendan case ARC_SPACE_L2HDRS: 1158 8582 Brendan ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1159 8582 Brendan break; 1160 8582 Brendan } 1161 8582 Brendan 1162 4309 maybee ASSERT(arc_meta_used >= space); 1163 4309 maybee if (arc_meta_max < arc_meta_used) 1164 4309 maybee arc_meta_max = arc_meta_used; 1165 4309 maybee atomic_add_64(&arc_meta_used, -space); 1166 4309 maybee ASSERT(arc_size >= space); 1167 4309 maybee atomic_add_64(&arc_size, -space); 1168 4309 maybee } 1169 4309 maybee 1170 4309 maybee void * 1171 4309 maybee arc_data_buf_alloc(uint64_t size) 1172 4309 maybee { 1173 4309 maybee if (arc_evict_needed(ARC_BUFC_DATA)) 1174 4309 maybee cv_signal(&arc_reclaim_thr_cv); 1175 4309 maybee atomic_add_64(&arc_size, size); 1176 4309 maybee return (zio_data_buf_alloc(size)); 1177 4309 maybee } 1178 4309 maybee 1179 4309 maybee void 1180 4309 maybee arc_data_buf_free(void *buf, uint64_t size) 1181 4309 maybee { 1182 4309 maybee zio_data_buf_free(buf, size); 1183 4309 maybee ASSERT(arc_size >= size); 1184 4309 maybee atomic_add_64(&arc_size, -size); 1185 789 ahrens } 1186 789 ahrens 1187 789 ahrens arc_buf_t * 1188 3290 johansen arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 1189 789 ahrens { 1190 789 ahrens arc_buf_hdr_t *hdr; 1191 789 ahrens arc_buf_t *buf; 1192 789 ahrens 1193 789 ahrens ASSERT3U(size, >, 0); 1194 6245 maybee hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1195 789 ahrens ASSERT(BUF_EMPTY(hdr)); 1196 789 ahrens hdr->b_size = size; 1197 3290 johansen hdr->b_type = type; 1198 8636 Mark hdr->b_spa = spa_guid(spa); 1199 3403 bmc hdr->b_state = arc_anon; 1200 789 ahrens hdr->b_arc_access = 0; 1201 6245 maybee buf = 
kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1202 789 ahrens buf->b_hdr = hdr; 1203 2688 maybee buf->b_data = NULL; 1204 1544 eschrock buf->b_efunc = NULL; 1205 1544 eschrock buf->b_private = NULL; 1206 789 ahrens buf->b_next = NULL; 1207 789 ahrens hdr->b_buf = buf; 1208 2688 maybee arc_get_data_buf(buf); 1209 1544 eschrock hdr->b_datacnt = 1; 1210 789 ahrens hdr->b_flags = 0; 1211 789 ahrens ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1212 789 ahrens (void) refcount_add(&hdr->b_refcnt, tag); 1213 789 ahrens 1214 789 ahrens return (buf); 1215 789 ahrens } 1216 789 ahrens 1217 9412 Aleksandr static char *arc_onloan_tag = "onloan"; 1218 9412 Aleksandr 1219 9412 Aleksandr /* 1220 9412 Aleksandr * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1221 9412 Aleksandr * flight data by arc_tempreserve_space() until they are "returned". Loaned 1222 9412 Aleksandr * buffers must be returned to the arc before they can be used by the DMU or 1223 9412 Aleksandr * freed. 1224 9412 Aleksandr */ 1225 9412 Aleksandr arc_buf_t * 1226 9412 Aleksandr arc_loan_buf(spa_t *spa, int size) 1227 9412 Aleksandr { 1228 9412 Aleksandr arc_buf_t *buf; 1229 9412 Aleksandr 1230 9412 Aleksandr buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1231 9412 Aleksandr 1232 9412 Aleksandr atomic_add_64(&arc_loaned_bytes, size); 1233 9412 Aleksandr return (buf); 1234 9412 Aleksandr } 1235 9412 Aleksandr 1236 9412 Aleksandr /* 1237 9412 Aleksandr * Return a loaned arc buffer to the arc. 
1238 9412 Aleksandr */ 1239 9412 Aleksandr void 1240 9412 Aleksandr arc_return_buf(arc_buf_t *buf, void *tag) 1241 9412 Aleksandr { 1242 9412 Aleksandr arc_buf_hdr_t *hdr = buf->b_hdr; 1243 9412 Aleksandr 1244 9412 Aleksandr ASSERT(hdr->b_state == arc_anon); 1245 9412 Aleksandr ASSERT(buf->b_data != NULL); 1246 9412 Aleksandr VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0); 1247 9412 Aleksandr VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1); 1248 9412 Aleksandr 1249 9412 Aleksandr atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1250 9412 Aleksandr } 1251 9412 Aleksandr 1252 2688 maybee static arc_buf_t * 1253 2688 maybee arc_buf_clone(arc_buf_t *from) 1254 1544 eschrock { 1255 2688 maybee arc_buf_t *buf; 1256 2688 maybee arc_buf_hdr_t *hdr = from->b_hdr; 1257 2688 maybee uint64_t size = hdr->b_size; 1258 1544 eschrock 1259 10922 Jeff ASSERT(hdr->b_state != arc_anon); 1260 10922 Jeff 1261 6245 maybee buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1262 2688 maybee buf->b_hdr = hdr; 1263 2688 maybee buf->b_data = NULL; 1264 2688 maybee buf->b_efunc = NULL; 1265 2688 maybee buf->b_private = NULL; 1266 2688 maybee buf->b_next = hdr->b_buf; 1267 2688 maybee hdr->b_buf = buf; 1268 2688 maybee arc_get_data_buf(buf); 1269 2688 maybee bcopy(from->b_data, buf->b_data, size); 1270 2688 maybee hdr->b_datacnt += 1; 1271 2688 maybee return (buf); 1272 1544 eschrock } 1273 1544 eschrock 1274 1544 eschrock void 1275 1544 eschrock arc_buf_add_ref(arc_buf_t *buf, void* tag) 1276 1544 eschrock { 1277 2887 maybee arc_buf_hdr_t *hdr; 1278 1544 eschrock kmutex_t *hash_lock; 1279 1544 eschrock 1280 2724 maybee /* 1281 7545 Mark * Check to see if this buffer is evicted. Callers 1282 7545 Mark * must verify b_data != NULL to know if the add_ref 1283 7545 Mark * was successful. 
1284 7545 Mark */ 1285 7545 Mark rw_enter(&buf->b_lock, RW_READER); 1286 7545 Mark if (buf->b_data == NULL) { 1287 7545 Mark rw_exit(&buf->b_lock); 1288 7545 Mark return; 1289 7545 Mark } 1290 7545 Mark hdr = buf->b_hdr; 1291 7545 Mark ASSERT(hdr != NULL); 1292 2887 maybee hash_lock = HDR_LOCK(hdr); 1293 7545 Mark mutex_enter(hash_lock); 1294 7545 Mark rw_exit(&buf->b_lock); 1295 7545 Mark 1296 3403 bmc ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 1297 1544 eschrock add_reference(hdr, hash_lock, tag); 1298 8582 Brendan DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 1299 2688 maybee arc_access(hdr, hash_lock); 1300 2688 maybee mutex_exit(hash_lock); 1301 3403 bmc ARCSTAT_BUMP(arcstat_hits); 1302 3403 bmc ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 1303 3403 bmc demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 1304 3403 bmc data, metadata, hits); 1305 1544 eschrock } 1306 1544 eschrock 1307 5450 brendan /* 1308 5450 brendan * Free the arc data buffer. If it is an l2arc write in progress, 1309 5450 brendan * the buffer is placed on l2arc_free_on_write to be freed later. 
1310 5450 brendan */ 1311 5450 brendan static void 1312 5450 brendan arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), 1313 5450 brendan void *data, size_t size) 1314 5450 brendan { 1315 5450 brendan if (HDR_L2_WRITING(hdr)) { 1316 5450 brendan l2arc_data_free_t *df; 1317 5450 brendan df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1318 5450 brendan df->l2df_data = data; 1319 5450 brendan df->l2df_size = size; 1320 5450 brendan df->l2df_func = free_func; 1321 5450 brendan mutex_enter(&l2arc_free_on_write_mtx); 1322 5450 brendan list_insert_head(l2arc_free_on_write, df); 1323 5450 brendan mutex_exit(&l2arc_free_on_write_mtx); 1324 5450 brendan ARCSTAT_BUMP(arcstat_l2_free_on_write); 1325 5450 brendan } else { 1326 5450 brendan free_func(data, size); 1327 5450 brendan } 1328 5450 brendan } 1329 5450 brendan 1330 789 ahrens static void 1331 2688 maybee arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 1332 1544 eschrock { 1333 1544 eschrock arc_buf_t **bufp; 1334 1544 eschrock 1335 1544 eschrock /* free up data associated with the buf */ 1336 1544 eschrock if (buf->b_data) { 1337 1544 eschrock arc_state_t *state = buf->b_hdr->b_state; 1338 1544 eschrock uint64_t size = buf->b_hdr->b_size; 1339 3290 johansen arc_buf_contents_t type = buf->b_hdr->b_type; 1340 1544 eschrock 1341 3093 ahrens arc_cksum_verify(buf); 1342 10922 Jeff 1343 2688 maybee if (!recycle) { 1344 3290 johansen if (type == ARC_BUFC_METADATA) { 1345 5450 brendan arc_buf_data_free(buf->b_hdr, zio_buf_free, 1346 5450 brendan buf->b_data, size); 1347 8582 Brendan arc_space_return(size, ARC_SPACE_DATA); 1348 3290 johansen } else { 1349 3290 johansen ASSERT(type == ARC_BUFC_DATA); 1350 5450 brendan arc_buf_data_free(buf->b_hdr, 1351 5450 brendan zio_data_buf_free, buf->b_data, size); 1352 8582 Brendan ARCSTAT_INCR(arcstat_data_size, -size); 1353 4309 maybee atomic_add_64(&arc_size, -size); 1354 3290 johansen } 1355 2688 maybee } 1356 1544 eschrock if 
(list_link_active(&buf->b_hdr->b_arc_node)) { 1357 4309 maybee uint64_t *cnt = &state->arcs_lsize[type]; 1358 4309 maybee 1359 1544 eschrock ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 1360 3403 bmc ASSERT(state != arc_anon); 1361 4309 maybee 1362 4309 maybee ASSERT3U(*cnt, >=, size); 1363 4309 maybee atomic_add_64(cnt, -size); 1364 1544 eschrock } 1365 3403 bmc ASSERT3U(state->arcs_size, >=, size); 1366 3403 bmc atomic_add_64(&state->arcs_size, -size); 1367 1544 eschrock buf->b_data = NULL; 1368 1544 eschrock ASSERT(buf->b_hdr->b_datacnt > 0); 1369 1544 eschrock buf->b_hdr->b_datacnt -= 1; 1370 1544 eschrock } 1371 1544 eschrock 1372 1544 eschrock /* only remove the buf if requested */ 1373 1544 eschrock if (!all) 1374 1544 eschrock return; 1375 1544 eschrock 1376 1544 eschrock /* remove the buf from the hdr list */ 1377 1544 eschrock for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 1378 1544 eschrock continue; 1379 1544 eschrock *bufp = buf->b_next; 1380 1544 eschrock 1381 1544 eschrock ASSERT(buf->b_efunc == NULL); 1382 1544 eschrock 1383 1544 eschrock /* clean up the buf */ 1384 1544 eschrock buf->b_hdr = NULL; 1385 1544 eschrock kmem_cache_free(buf_cache, buf); 1386 1544 eschrock } 1387 1544 eschrock 1388 1544 eschrock static void 1389 1544 eschrock arc_hdr_destroy(arc_buf_hdr_t *hdr) 1390 789 ahrens { 1391 789 ahrens ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1392 3403 bmc ASSERT3P(hdr->b_state, ==, arc_anon); 1393 1544 eschrock ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1394 10922 Jeff l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; 1395 10922 Jeff 1396 10922 Jeff if (l2hdr != NULL) { 1397 10922 Jeff boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); 1398 10922 Jeff /* 1399 10922 Jeff * To prevent arc_free() and l2arc_evict() from 1400 10922 Jeff * attempting to free the same buffer at the same time, 1401 10922 Jeff * a FREE_IN_PROGRESS flag is given to arc_free() to 1402 10922 Jeff * give it priority. 
l2arc_evict() can't destroy this 1403 10922 Jeff * header while we are waiting on l2arc_buflist_mtx. 1404 10922 Jeff * 1405 10922 Jeff * The hdr may be removed from l2ad_buflist before we 1406 10922 Jeff * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 1407 10922 Jeff */ 1408 10922 Jeff if (!buflist_held) { 1409 5450 brendan mutex_enter(&l2arc_buflist_mtx); 1410 10922 Jeff l2hdr = hdr->b_l2hdr; 1411 10922 Jeff } 1412 10922 Jeff 1413 10922 Jeff if (l2hdr != NULL) { 1414 10922 Jeff list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 1415 10922 Jeff ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1416 10922 Jeff kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 1417 10922 Jeff if (hdr->b_state == arc_l2c_only) 1418 10922 Jeff l2arc_hdr_stat_remove(); 1419 10922 Jeff hdr->b_l2hdr = NULL; 1420 10922 Jeff } 1421 10922 Jeff 1422 10922 Jeff if (!buflist_held) 1423 5450 brendan mutex_exit(&l2arc_buflist_mtx); 1424 5450 brendan } 1425 789 ahrens 1426 789 ahrens if (!BUF_EMPTY(hdr)) { 1427 1544 eschrock ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1428 789 ahrens bzero(&hdr->b_dva, sizeof (dva_t)); 1429 789 ahrens hdr->b_birth = 0; 1430 789 ahrens hdr->b_cksum0 = 0; 1431 789 ahrens } 1432 1544 eschrock while (hdr->b_buf) { 1433 789 ahrens arc_buf_t *buf = hdr->b_buf; 1434 789 ahrens 1435 1544 eschrock if (buf->b_efunc) { 1436 1544 eschrock mutex_enter(&arc_eviction_mtx); 1437 7545 Mark rw_enter(&buf->b_lock, RW_WRITER); 1438 1544 eschrock ASSERT(buf->b_hdr != NULL); 1439 2688 maybee arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1440 1544 eschrock hdr->b_buf = buf->b_next; 1441 2887 maybee buf->b_hdr = &arc_eviction_hdr; 1442 1544 eschrock buf->b_next = arc_eviction_list; 1443 1544 eschrock arc_eviction_list = buf; 1444 7545 Mark rw_exit(&buf->b_lock); 1445 1544 eschrock mutex_exit(&arc_eviction_mtx); 1446 1544 eschrock } else { 1447 2688 maybee arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1448 1544 eschrock } 1449 3093 ahrens } 1450 3093 ahrens if (hdr->b_freeze_cksum != NULL) { 1451 3093 ahrens 
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1452 3093 ahrens hdr->b_freeze_cksum = NULL; 1453 789 ahrens } 1454 1544 eschrock 1455 789 ahrens ASSERT(!list_link_active(&hdr->b_arc_node)); 1456 789 ahrens ASSERT3P(hdr->b_hash_next, ==, NULL); 1457 789 ahrens ASSERT3P(hdr->b_acb, ==, NULL); 1458 789 ahrens kmem_cache_free(hdr_cache, hdr); 1459 789 ahrens } 1460 789 ahrens 1461 789 ahrens void 1462 789 ahrens arc_buf_free(arc_buf_t *buf, void *tag) 1463 789 ahrens { 1464 789 ahrens arc_buf_hdr_t *hdr = buf->b_hdr; 1465 3403 bmc int hashed = hdr->b_state != arc_anon; 1466 1544 eschrock 1467 1544 eschrock ASSERT(buf->b_efunc == NULL); 1468 1544 eschrock ASSERT(buf->b_data != NULL); 1469 1544 eschrock 1470 1544 eschrock if (hashed) { 1471 1544 eschrock kmutex_t *hash_lock = HDR_LOCK(hdr); 1472 1544 eschrock 1473 1544 eschrock mutex_enter(hash_lock); 1474 1544 eschrock (void) remove_reference(hdr, hash_lock, tag); 1475 10922 Jeff if (hdr->b_datacnt > 1) { 1476 2688 maybee arc_buf_destroy(buf, FALSE, TRUE); 1477 10922 Jeff } else { 1478 10922 Jeff ASSERT(buf == hdr->b_buf); 1479 10922 Jeff ASSERT(buf->b_efunc == NULL); 1480 1544 eschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 1481 10922 Jeff } 1482 1544 eschrock mutex_exit(hash_lock); 1483 1544 eschrock } else if (HDR_IO_IN_PROGRESS(hdr)) { 1484 1544 eschrock int destroy_hdr; 1485 1544 eschrock /* 1486 1544 eschrock * We are in the middle of an async write. Don't destroy 1487 1544 eschrock * this buffer unless the write completes before we finish 1488 1544 eschrock * decrementing the reference count. 
1489 1544 eschrock */ 1490 1544 eschrock mutex_enter(&arc_eviction_mtx); 1491 1544 eschrock (void) remove_reference(hdr, NULL, tag); 1492 1544 eschrock ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1493 1544 eschrock destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1494 1544 eschrock mutex_exit(&arc_eviction_mtx); 1495 1544 eschrock if (destroy_hdr) 1496 1544 eschrock arc_hdr_destroy(hdr); 1497 1544 eschrock } else { 1498 1544 eschrock if (remove_reference(hdr, NULL, tag) > 0) { 1499 1544 eschrock ASSERT(HDR_IO_ERROR(hdr)); 1500 2688 maybee arc_buf_destroy(buf, FALSE, TRUE); 1501 1544 eschrock } else { 1502 1544 eschrock arc_hdr_destroy(hdr); 1503 1544 eschrock } 1504 1544 eschrock } 1505 1544 eschrock } 1506 1544 eschrock 1507 1544 eschrock int 1508 1544 eschrock arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1509 1544 eschrock { 1510 1544 eschrock arc_buf_hdr_t *hdr = buf->b_hdr; 1511 789 ahrens kmutex_t *hash_lock = HDR_LOCK(hdr); 1512 1544 eschrock int no_callback = (buf->b_efunc == NULL); 1513 1544 eschrock 1514 3403 bmc if (hdr->b_state == arc_anon) { 1515 10922 Jeff ASSERT(hdr->b_datacnt == 1); 1516 1544 eschrock arc_buf_free(buf, tag); 1517 1544 eschrock return (no_callback); 1518 1544 eschrock } 1519 789 ahrens 1520 789 ahrens mutex_enter(hash_lock); 1521 3403 bmc ASSERT(hdr->b_state != arc_anon); 1522 1544 eschrock ASSERT(buf->b_data != NULL); 1523 789 ahrens 1524 1544 eschrock (void) remove_reference(hdr, hash_lock, tag); 1525 1544 eschrock if (hdr->b_datacnt > 1) { 1526 1544 eschrock if (no_callback) 1527 2688 maybee arc_buf_destroy(buf, FALSE, TRUE); 1528 1544 eschrock } else if (no_callback) { 1529 1544 eschrock ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1530 10922 Jeff ASSERT(buf->b_efunc == NULL); 1531 1544 eschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 1532 789 ahrens } 1533 1544 eschrock ASSERT(no_callback || hdr->b_datacnt > 1 || 1534 1544 eschrock refcount_is_zero(&hdr->b_refcnt)); 1535 789 ahrens mutex_exit(hash_lock); 1536 1544 eschrock return 
(no_callback); 1537 789 ahrens } 1538 789 ahrens 1539 789 ahrens int 1540 789 ahrens arc_buf_size(arc_buf_t *buf) 1541 789 ahrens { 1542 789 ahrens return (buf->b_hdr->b_size); 1543 789 ahrens } 1544 789 ahrens 1545 789 ahrens /* 1546 789 ahrens * Evict buffers from list until we've removed the specified number of 1547 789 ahrens * bytes. Move the removed buffers to the appropriate evict state. 1548 2688 maybee * If the recycle flag is set, then attempt to "recycle" a buffer: 1549 2688 maybee * - look for a buffer to evict that is `bytes' long. 1550 2688 maybee * - return the data block from this buffer rather than freeing it. 1551 2688 maybee * This flag is used by callers that are trying to make space for a 1552 2688 maybee * new buffer in a full arc cache. 1553 5642 maybee * 1554 5642 maybee * This function makes a "best effort". It skips over any buffers 1555 5642 maybee * it can't get a hash_lock on, and so may not catch all candidates. 1556 5642 maybee * It may also return without evicting as much space as requested. 1557 789 ahrens */ 1558 2688 maybee static void * 1559 8636 Mark arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 1560 3290 johansen arc_buf_contents_t type) 1561 789 ahrens { 1562 789 ahrens arc_state_t *evicted_state; 1563 2688 maybee uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1564 2918 maybee arc_buf_hdr_t *ab, *ab_prev = NULL; 1565 4309 maybee list_t *list = &state->arcs_list[type]; 1566 789 ahrens kmutex_t *hash_lock; 1567 2688 maybee boolean_t have_lock; 1568 2918 maybee void *stolen = NULL; 1569 789 ahrens 1570 3403 bmc ASSERT(state == arc_mru || state == arc_mfu); 1571 789 ahrens 1572 3403 bmc evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 1573 789 ahrens 1574 3403 bmc mutex_enter(&state->arcs_mtx); 1575 3403 bmc mutex_enter(&evicted_state->arcs_mtx); 1576 789 ahrens 1577 4309 maybee for (ab = list_tail(list); ab; ab = ab_prev) { 1578 4309 maybee ab_prev = list_prev(list, ab); 1579 2391 maybee /* prefetch buffers have a minimum lifespan */ 1580 2688 maybee if (HDR_IO_IN_PROGRESS(ab) || 1581 5642 maybee (spa && ab->b_spa != spa) || 1582 2688 maybee (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1583 11066 rafael ddi_get_lbolt() - ab->b_arc_access < 1584 11066 rafael arc_min_prefetch_lifespan)) { 1585 2391 maybee skipped++; 1586 2391 maybee continue; 1587 2391 maybee } 1588 2918 maybee /* "lookahead" for better eviction candidate */ 1589 2918 maybee if (recycle && ab->b_size != bytes && 1590 2918 maybee ab_prev && ab_prev->b_size == bytes) 1591 2688 maybee continue; 1592 789 ahrens hash_lock = HDR_LOCK(ab); 1593 2688 maybee have_lock = MUTEX_HELD(hash_lock); 1594 2688 maybee if (have_lock || mutex_tryenter(hash_lock)) { 1595 789 ahrens ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1596 1544 eschrock ASSERT(ab->b_datacnt > 0); 1597 1544 eschrock while (ab->b_buf) { 1598 1544 eschrock arc_buf_t *buf = ab->b_buf; 1599 7545 Mark if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { 1600 7545 Mark missed += 1; 1601 7545 Mark break; 1602 7545 Mark } 1603 2688 maybee if (buf->b_data) { 1604 1544 eschrock bytes_evicted += ab->b_size; 1605 3290 johansen if (recycle && ab->b_type == type && 1606 5450 brendan ab->b_size == bytes && 1607 5450 brendan !HDR_L2_WRITING(ab)) { 1608 2918 maybee stolen = buf->b_data; 1609 2918 maybee recycle = FALSE; 1610 2918 maybee } 1611 2688 maybee } 1612 1544 eschrock if (buf->b_efunc) { 1613 1544 eschrock mutex_enter(&arc_eviction_mtx); 1614 2918 maybee arc_buf_destroy(buf, 1615 2918 maybee buf->b_data == stolen, FALSE); 1616 1544 eschrock ab->b_buf = buf->b_next; 1617 2887 maybee buf->b_hdr = &arc_eviction_hdr; 1618 1544 eschrock buf->b_next = 
arc_eviction_list; 1619 1544 eschrock arc_eviction_list = buf; 1620 1544 eschrock mutex_exit(&arc_eviction_mtx); 1621 7545 Mark rw_exit(&buf->b_lock); 1622 1544 eschrock } else { 1623 7545 Mark rw_exit(&buf->b_lock); 1624 2918 maybee arc_buf_destroy(buf, 1625 2918 maybee buf->b_data == stolen, TRUE); 1626 1544 eschrock } 1627 1544 eschrock } 1628 10357 Brendan 1629 10357 Brendan if (ab->b_l2hdr) { 1630 10357 Brendan ARCSTAT_INCR(arcstat_evict_l2_cached, 1631 10357 Brendan ab->b_size); 1632 10357 Brendan } else { 1633 10357 Brendan if (l2arc_write_eligible(ab->b_spa, ab)) { 1634 10357 Brendan ARCSTAT_INCR(arcstat_evict_l2_eligible, 1635 10357 Brendan ab->b_size); 1636 10357 Brendan } else { 1637 10357 Brendan ARCSTAT_INCR( 1638 10357 Brendan arcstat_evict_l2_ineligible, 1639 10357 Brendan ab->b_size); 1640 10357 Brendan } 1641 10357 Brendan } 1642 10357 Brendan 1643 7545 Mark if (ab->b_datacnt == 0) { 1644 7545 Mark arc_change_state(evicted_state, ab, hash_lock); 1645 7545 Mark ASSERT(HDR_IN_HASH_TABLE(ab)); 1646 7545 Mark ab->b_flags |= ARC_IN_HASH_TABLE; 1647 7545 Mark ab->b_flags &= ~ARC_BUF_AVAILABLE; 1648 7545 Mark DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1649 7545 Mark } 1650 2688 maybee if (!have_lock) 1651 2688 maybee mutex_exit(hash_lock); 1652 1544 eschrock if (bytes >= 0 && bytes_evicted >= bytes) 1653 789 ahrens break; 1654 789 ahrens } else { 1655 2688 maybee missed += 1; 1656 789 ahrens } 1657 789 ahrens } 1658 3403 bmc 1659 3403 bmc mutex_exit(&evicted_state->arcs_mtx); 1660 3403 bmc mutex_exit(&state->arcs_mtx); 1661 789 ahrens 1662 789 ahrens if (bytes_evicted < bytes) 1663 789 ahrens dprintf("only evicted %lld bytes from %x", 1664 789 ahrens (longlong_t)bytes_evicted, state); 1665 789 ahrens 1666 2688 maybee if (skipped) 1667 3403 bmc ARCSTAT_INCR(arcstat_evict_skip, skipped); 1668 3403 bmc 1669 2688 maybee if (missed) 1670 3403 bmc ARCSTAT_INCR(arcstat_mutex_miss, missed); 1671 4709 maybee 1672 4709 maybee /* 1673 4709 maybee * We have just 
evicted some date into the ghost state, make 1674 4709 maybee * sure we also adjust the ghost state size if necessary. 1675 4709 maybee */ 1676 4709 maybee if (arc_no_grow && 1677 4709 maybee arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 1678 4709 maybee int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 1679 4709 maybee arc_mru_ghost->arcs_size - arc_c; 1680 4709 maybee 1681 4709 maybee if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 1682 4709 maybee int64_t todelete = 1683 4709 maybee MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 1684 5642 maybee arc_evict_ghost(arc_mru_ghost, NULL, todelete); 1685 4709 maybee } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 1686 4709 maybee int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 1687 4709 maybee arc_mru_ghost->arcs_size + 1688 4709 maybee arc_mfu_ghost->arcs_size - arc_c); 1689 5642 maybee arc_evict_ghost(arc_mfu_ghost, NULL, todelete); 1690 4709 maybee } 1691 4709 maybee } 1692 3403 bmc 1693 2918 maybee return (stolen); 1694 789 ahrens } 1695 789 ahrens 1696 789 ahrens /* 1697 789 ahrens * Remove buffers from list until we've removed the specified number of 1698 789 ahrens * bytes. Destroy the buffers that are removed. 
1699 789 ahrens */ 1700 789 ahrens static void 1701 8636 Mark arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 1702 789 ahrens { 1703 789 ahrens arc_buf_hdr_t *ab, *ab_prev; 1704 4309 maybee list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 1705 789 ahrens kmutex_t *hash_lock; 1706 1544 eschrock uint64_t bytes_deleted = 0; 1707 3700 ek110237 uint64_t bufs_skipped = 0; 1708 789 ahrens 1709 1544 eschrock ASSERT(GHOST_STATE(state)); 1710 789 ahrens top: 1711 3403 bmc mutex_enter(&state->arcs_mtx); 1712 4309 maybee for (ab = list_tail(list); ab; ab = ab_prev) { 1713 4309 maybee ab_prev = list_prev(list, ab); 1714 5642 maybee if (spa && ab->b_spa != spa) 1715 5642 maybee continue; 1716 789 ahrens hash_lock = HDR_LOCK(ab); 1717 789 ahrens if (mutex_tryenter(hash_lock)) { 1718 2391 maybee ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1719 1544 eschrock ASSERT(ab->b_buf == NULL); 1720 3403 bmc ARCSTAT_BUMP(arcstat_deleted); 1721 1544 eschrock bytes_deleted += ab->b_size; 1722 5450 brendan 1723 5450 brendan if (ab->b_l2hdr != NULL) { 1724 5450 brendan /* 1725 5450 brendan * This buffer is cached on the 2nd Level ARC; 1726 5450 brendan * don't destroy the header. 
1727 5450 brendan */ 1728 5450 brendan arc_change_state(arc_l2c_only, ab, hash_lock); 1729 5450 brendan mutex_exit(hash_lock); 1730 5450 brendan } else { 1731 5450 brendan arc_change_state(arc_anon, ab, hash_lock); 1732 5450 brendan mutex_exit(hash_lock); 1733 5450 brendan arc_hdr_destroy(ab); 1734 5450 brendan } 1735 5450 brendan 1736 789 ahrens DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1737 789 ahrens if (bytes >= 0 && bytes_deleted >= bytes) 1738 789 ahrens break; 1739 789 ahrens } else { 1740 789 ahrens if (bytes < 0) { 1741 3403 bmc mutex_exit(&state->arcs_mtx); 1742 789 ahrens mutex_enter(hash_lock); 1743 789 ahrens mutex_exit(hash_lock); 1744 789 ahrens goto top; 1745 789 ahrens } 1746 789 ahrens bufs_skipped += 1; 1747 789 ahrens } 1748 789 ahrens } 1749 3403 bmc mutex_exit(&state->arcs_mtx); 1750 789 ahrens 1751 4309 maybee if (list == &state->arcs_list[ARC_BUFC_DATA] && 1752 4309 maybee (bytes < 0 || bytes_deleted < bytes)) { 1753 4309 maybee list = &state->arcs_list[ARC_BUFC_METADATA]; 1754 4309 maybee goto top; 1755 4309 maybee } 1756 4309 maybee 1757 789 ahrens if (bufs_skipped) { 1758 3403 bmc ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1759 789 ahrens ASSERT(bytes >= 0); 1760 789 ahrens } 1761 789 ahrens 1762 789 ahrens if (bytes_deleted < bytes) 1763 789 ahrens dprintf("only deleted %lld bytes from %p", 1764 789 ahrens (longlong_t)bytes_deleted, state); 1765 789 ahrens } 1766 789 ahrens 1767 789 ahrens static void 1768 789 ahrens arc_adjust(void) 1769 789 ahrens { 1770 8582 Brendan int64_t adjustment, delta; 1771 8582 Brendan 1772 8582 Brendan /* 1773 8582 Brendan * Adjust MRU size 1774 8582 Brendan */ 1775 8582 Brendan 1776 8582 Brendan adjustment = MIN(arc_size - arc_c, 1777 8582 Brendan arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p); 1778 8582 Brendan 1779 8582 Brendan if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 1780 8582 Brendan delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 1781 
8582 Brendan (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); 1782 8582 Brendan adjustment -= delta; 1783 8582 Brendan } 1784 8582 Brendan 1785 8582 Brendan if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1786 8582 Brendan delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 1787 8582 Brendan (void) arc_evict(arc_mru, NULL, delta, FALSE, 1788 5642 maybee ARC_BUFC_METADATA); 1789 8582 Brendan } 1790 8582 Brendan 1791 8582 Brendan /* 1792 8582 Brendan * Adjust MFU size 1793 8582 Brendan */ 1794 8582 Brendan 1795 8582 Brendan adjustment = arc_size - arc_c; 1796 8582 Brendan 1797 8582 Brendan if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 1798 8582 Brendan delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 1799 8582 Brendan (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); 1800 8582 Brendan adjustment -= delta; 1801 8582 Brendan } 1802 8582 Brendan 1803 8582 Brendan if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1804 8582 Brendan int64_t delta = MIN(adjustment, 1805 8582 Brendan arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 1806 8582 Brendan (void) arc_evict(arc_mfu, NULL, delta, FALSE, 1807 8582 Brendan ARC_BUFC_METADATA); 1808 8582 Brendan } 1809 8582 Brendan 1810 8582 Brendan /* 1811 8582 Brendan * Adjust ghost lists 1812 8582 Brendan */ 1813 8582 Brendan 1814 8582 Brendan adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 1815 8582 Brendan 1816 8582 Brendan if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 1817 8582 Brendan delta = MIN(arc_mru_ghost->arcs_size, adjustment); 1818 8582 Brendan arc_evict_ghost(arc_mru_ghost, NULL, delta); 1819 8582 Brendan } 1820 8582 Brendan 1821 8582 Brendan adjustment = 1822 8582 Brendan arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 1823 8582 Brendan 1824 8582 Brendan if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 1825 8582 Brendan delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 1826 8582 
Brendan arc_evict_ghost(arc_mfu_ghost, NULL, delta); 1827 789 ahrens } 1828 1544 eschrock } 1829 1544 eschrock 1830 1544 eschrock static void 1831 1544 eschrock arc_do_user_evicts(void) 1832 1544 eschrock { 1833 1544 eschrock mutex_enter(&arc_eviction_mtx); 1834 1544 eschrock while (arc_eviction_list != NULL) { 1835 1544 eschrock arc_buf_t *buf = arc_eviction_list; 1836 1544 eschrock arc_eviction_list = buf->b_next; 1837 7545 Mark rw_enter(&buf->b_lock, RW_WRITER); 1838 1544 eschrock buf->b_hdr = NULL; 1839 7545 Mark rw_exit(&buf->b_lock); 1840 1544 eschrock mutex_exit(&arc_eviction_mtx); 1841 1544 eschrock 1842 1819 maybee if (buf->b_efunc != NULL) 1843 1819 maybee VERIFY(buf->b_efunc(buf) == 0); 1844 1544 eschrock 1845 1544 eschrock buf->b_efunc = NULL; 1846 1544 eschrock buf->b_private = NULL; 1847 1544 eschrock kmem_cache_free(buf_cache, buf); 1848 1544 eschrock mutex_enter(&arc_eviction_mtx); 1849 1544 eschrock } 1850 1544 eschrock mutex_exit(&arc_eviction_mtx); 1851 789 ahrens } 1852 789 ahrens 1853 789 ahrens /* 1854 5642 maybee * Flush all *evictable* data from the cache for the given spa. 1855 789 ahrens * NOTE: this will not touch "active" (i.e. referenced) data. 
1856 789 ahrens */ 1857 789 ahrens void 1858 5642 maybee arc_flush(spa_t *spa) 1859 5642 maybee { 1860 8636 Mark uint64_t guid = 0; 1861 8636 Mark 1862 8636 Mark if (spa) 1863 8636 Mark guid = spa_guid(spa); 1864 8636 Mark 1865 5642 maybee while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { 1866 8636 Mark (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 1867 5642 maybee if (spa) 1868 5642 maybee break; 1869 5642 maybee } 1870 5642 maybee while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { 1871 8636 Mark (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 1872 5642 maybee if (spa) 1873 5642 maybee break; 1874 5642 maybee } 1875 5642 maybee while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { 1876 8636 Mark (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 1877 5642 maybee if (spa) 1878 5642 maybee break; 1879 5642 maybee } 1880 5642 maybee while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { 1881 8636 Mark (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 1882 5642 maybee if (spa) 1883 5642 maybee break; 1884 5642 maybee } 1885 5642 maybee 1886 8636 Mark arc_evict_ghost(arc_mru_ghost, guid, -1); 1887 8636 Mark arc_evict_ghost(arc_mfu_ghost, guid, -1); 1888 1544 eschrock 1889 1544 eschrock mutex_enter(&arc_reclaim_thr_lock); 1890 1544 eschrock arc_do_user_evicts(); 1891 1544 eschrock mutex_exit(&arc_reclaim_thr_lock); 1892 5642 maybee ASSERT(spa || arc_eviction_list == NULL); 1893 789 ahrens } 1894 2391 maybee 1895 789 ahrens void 1896 3158 maybee arc_shrink(void) 1897 789 ahrens { 1898 3403 bmc if (arc_c > arc_c_min) { 1899 3158 maybee uint64_t to_free; 1900 789 ahrens 1901 2048 stans #ifdef _KERNEL 1902 3403 bmc to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 1903 2048 stans #else 1904 3403 bmc to_free = arc_c >> arc_shrink_shift; 1905 2048 stans #endif 1906 3403 bmc if (arc_c > arc_c_min + to_free) 1907 3403 bmc atomic_add_64(&arc_c, -to_free); 1908 3158 maybee else 1909 3403 bmc arc_c = 
arc_c_min; 1910 2048 stans 1911 3403 bmc atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 1912 3403 bmc if (arc_c > arc_size) 1913 3403 bmc arc_c = MAX(arc_size, arc_c_min); 1914 3403 bmc if (arc_p > arc_c) 1915 3403 bmc arc_p = (arc_c >> 1); 1916 3403 bmc ASSERT(arc_c >= arc_c_min); 1917 3403 bmc ASSERT((int64_t)arc_p >= 0); 1918 3158 maybee } 1919 789 ahrens 1920 3403 bmc if (arc_size > arc_c) 1921 3158 maybee arc_adjust(); 1922 789 ahrens } 1923 789 ahrens 1924 789 ahrens static int 1925 789 ahrens arc_reclaim_needed(void) 1926 789 ahrens { 1927 789 ahrens uint64_t extra; 1928 789 ahrens 1929 789 ahrens #ifdef _KERNEL 1930 2048 stans 1931 2048 stans if (needfree) 1932 2048 stans return (1); 1933 2048 stans 1934 789 ahrens /* 1935 789 ahrens * take 'desfree' extra pages, so we reclaim sooner, rather than later 1936 789 ahrens */ 1937 789 ahrens extra = desfree; 1938 789 ahrens 1939 789 ahrens /* 1940 789 ahrens * check that we're out of range of the pageout scanner. It starts to 1941 789 ahrens * schedule paging if freemem is less than lotsfree and needfree. 1942 789 ahrens * lotsfree is the high-water mark for pageout, and needfree is the 1943 789 ahrens * number of needed free pages. We add extra pages here to make sure 1944 789 ahrens * the scanner doesn't start up while we're freeing memory. 1945 789 ahrens */ 1946 789 ahrens if (freemem < lotsfree + needfree + extra) 1947 789 ahrens return (1); 1948 789 ahrens 1949 789 ahrens /* 1950 789 ahrens * check to make sure that swapfs has enough space so that anon 1951 5450 brendan * reservations can still succeed. anon_resvmem() checks that the 1952 789 ahrens * availrmem is greater than swapfs_minfree, and the number of reserved 1953 789 ahrens * swap pages. We also add a bit of extra here just to prevent 1954 789 ahrens * circumstances from getting really dire. 
1955 789 ahrens */ 1956 789 ahrens if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1957 789 ahrens return (1); 1958 789 ahrens 1959 1936 maybee #if defined(__i386) 1960 789 ahrens /* 1961 789 ahrens * If we're on an i386 platform, it's possible that we'll exhaust the 1962 789 ahrens * kernel heap space before we ever run out of available physical 1963 789 ahrens * memory. Most checks of the size of the heap_area compare against 1964 789 ahrens * tune.t_minarmem, which is the minimum available real memory that we 1965 789 ahrens * can have in the system. However, this is generally fixed at 25 pages 1966 789 ahrens * which is so low that it's useless. In this comparison, we seek to 1967 789 ahrens * calculate the total heap-size, and reclaim if more than 3/4ths of the 1968 5450 brendan * heap is allocated. (Or, in the calculation, if less than 1/4th is 1969 789 ahrens * free) 1970 789 ahrens */ 1971 789 ahrens if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1972 789 ahrens (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1973 789 ahrens return (1); 1974 789 ahrens #endif 1975 789 ahrens 1976 789 ahrens #else 1977 789 ahrens if (spa_get_random(100) == 0) 1978 789 ahrens return (1); 1979 789 ahrens #endif 1980 789 ahrens return (0); 1981 789 ahrens } 1982 789 ahrens 1983 789 ahrens static void 1984 789 ahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1985 789 ahrens { 1986 789 ahrens size_t i; 1987 789 ahrens kmem_cache_t *prev_cache = NULL; 1988 3290 johansen kmem_cache_t *prev_data_cache = NULL; 1989 789 ahrens extern kmem_cache_t *zio_buf_cache[]; 1990 3290 johansen extern kmem_cache_t *zio_data_buf_cache[]; 1991 1484 ek110237 1992 1484 ek110237 #ifdef _KERNEL 1993 4309 maybee if (arc_meta_used >= arc_meta_limit) { 1994 4309 maybee /* 1995 4309 maybee * We are exceeding our meta-data cache limit. 1996 4309 maybee * Purge some DNLC entries to release holds on meta-data. 
1997 4309 maybee */ 1998 4309 maybee dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 1999 4309 maybee } 2000 1936 maybee #if defined(__i386) 2001 1936 maybee /* 2002 1936 maybee * Reclaim unused memory from all kmem caches. 2003 1936 maybee */ 2004 1936 maybee kmem_reap(); 2005 1936 maybee #endif 2006 1484 ek110237 #endif 2007 789 ahrens 2008 789 ahrens /* 2009 5450 brendan * An aggressive reclamation will shrink the cache size as well as 2010 1544 eschrock * reap free buffers from the arc kmem caches. 2011 789 ahrens */ 2012 789 ahrens if (strat == ARC_RECLAIM_AGGR) 2013 3158 maybee arc_shrink(); 2014 789 ahrens 2015 789 ahrens for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 2016 789 ahrens if (zio_buf_cache[i] != prev_cache) { 2017 789 ahrens prev_cache = zio_buf_cache[i]; 2018 789 ahrens kmem_cache_reap_now(zio_buf_cache[i]); 2019 3290 johansen } 2020 3290 johansen if (zio_data_buf_cache[i] != prev_data_cache) { 2021 3290 johansen prev_data_cache = zio_data_buf_cache[i]; 2022 3290 johansen kmem_cache_reap_now(zio_data_buf_cache[i]); 2023 789 ahrens } 2024 789 ahrens } 2025 1544 eschrock kmem_cache_reap_now(buf_cache); 2026 1544 eschrock kmem_cache_reap_now(hdr_cache); 2027 789 ahrens } 2028 789 ahrens 2029 789 ahrens static void 2030 789 ahrens arc_reclaim_thread(void) 2031 789 ahrens { 2032 789 ahrens clock_t growtime = 0; 2033 789 ahrens arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 2034 789 ahrens callb_cpr_t cpr; 2035 789 ahrens 2036 789 ahrens CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 2037 789 ahrens 2038 789 ahrens mutex_enter(&arc_reclaim_thr_lock); 2039 789 ahrens while (arc_thread_exit == 0) { 2040 789 ahrens if (arc_reclaim_needed()) { 2041 789 ahrens 2042 3403 bmc if (arc_no_grow) { 2043 789 ahrens if (last_reclaim == ARC_RECLAIM_CONS) { 2044 789 ahrens last_reclaim = ARC_RECLAIM_AGGR; 2045 789 ahrens } else { 2046 789 ahrens last_reclaim = ARC_RECLAIM_CONS; 2047 789 ahrens } 2048 789 
ahrens } else { 2049 3403 bmc arc_no_grow = TRUE; 2050 789 ahrens last_reclaim = ARC_RECLAIM_AGGR; 2051 789 ahrens membar_producer(); 2052 789 ahrens } 2053 789 ahrens 2054 789 ahrens /* reset the growth delay for every reclaim */ 2055 11066 rafael growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 2056 789 ahrens 2057 789 ahrens arc_kmem_reap_now(last_reclaim); 2058 6987 brendan arc_warm = B_TRUE; 2059 789 ahrens 2060 11066 rafael } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 2061 3403 bmc arc_no_grow = FALSE; 2062 789 ahrens } 2063 789 ahrens 2064 3403 bmc if (2 * arc_c < arc_size + 2065 3403 bmc arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) 2066 3298 maybee arc_adjust(); 2067 3298 maybee 2068 1544 eschrock if (arc_eviction_list != NULL) 2069 1544 eschrock arc_do_user_evicts(); 2070 1544 eschrock 2071 789 ahrens /* block until needed, or one second, whichever is shorter */ 2072 789 ahrens CALLB_CPR_SAFE_BEGIN(&cpr); 2073 789 ahrens (void) cv_timedwait(&arc_reclaim_thr_cv, 2074 11066 rafael &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); 2075 789 ahrens CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 2076 789 ahrens } 2077 789 ahrens 2078 789 ahrens arc_thread_exit = 0; 2079 789 ahrens cv_broadcast(&arc_reclaim_thr_cv); 2080 789 ahrens CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 2081 789 ahrens thread_exit(); 2082 789 ahrens } 2083 789 ahrens 2084 1544 eschrock /* 2085 1544 eschrock * Adapt arc info given the number of bytes we are trying to add and 2086 1544 eschrock * the state that we are comming from. This function is only called 2087 1544 eschrock * when we are adding new content to the cache. 
2088 1544 eschrock */ 2089 789 ahrens static void 2090 1544 eschrock arc_adapt(int bytes, arc_state_t *state) 2091 789 ahrens { 2092 1544 eschrock int mult; 2093 8582 Brendan uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 2094 5450 brendan 2095 5450 brendan if (state == arc_l2c_only) 2096 5450 brendan return; 2097 1544 eschrock 2098 1544 eschrock ASSERT(bytes > 0); 2099 789 ahrens /* 2100 1544 eschrock * Adapt the target size of the MRU list: 2101 1544 eschrock * - if we just hit in the MRU ghost list, then increase 2102 1544 eschrock * the target size of the MRU list. 2103 1544 eschrock * - if we just hit in the MFU ghost list, then increase 2104 1544 eschrock * the target size of the MFU list by decreasing the 2105 1544 eschrock * target size of the MRU list. 2106 789 ahrens */ 2107 3403 bmc if (state == arc_mru_ghost) { 2108 3403 bmc mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 2109 3403 bmc 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 2110 1544 eschrock 2111 8582 Brendan arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 2112 3403 bmc } else if (state == arc_mfu_ghost) { 2113 8582 Brendan uint64_t delta; 2114 8582 Brendan 2115 3403 bmc mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 
2116 3403 bmc 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 2117 1544 eschrock 2118 8582 Brendan delta = MIN(bytes * mult, arc_p); 2119 8582 Brendan arc_p = MAX(arc_p_min, arc_p - delta); 2120 1544 eschrock } 2121 3403 bmc ASSERT((int64_t)arc_p >= 0); 2122 789 ahrens 2123 789 ahrens if (arc_reclaim_needed()) { 2124 789 ahrens cv_signal(&arc_reclaim_thr_cv); 2125 789 ahrens return; 2126 789 ahrens } 2127 789 ahrens 2128 3403 bmc if (arc_no_grow) 2129 789 ahrens return; 2130 789 ahrens 2131 3403 bmc if (arc_c >= arc_c_max) 2132 1544 eschrock return; 2133 1544 eschrock 2134 789 ahrens /* 2135 1544 eschrock * If we're within (2 * maxblocksize) bytes of the target 2136 1544 eschrock * cache size, increment the target cache size 2137 789 ahrens */ 2138 3403 bmc if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 2139 3403 bmc atomic_add_64(&arc_c, (int64_t)bytes); 2140 3403 bmc if (arc_c > arc_c_max) 2141 3403 bmc arc_c = arc_c_max; 2142 3403 bmc else if (state == arc_anon) 2143 3403 bmc atomic_add_64(&arc_p, (int64_t)bytes); 2144 3403 bmc if (arc_p > arc_c) 2145 3403 bmc arc_p = arc_c; 2146 789 ahrens } 2147 3403 bmc ASSERT((int64_t)arc_p >= 0); 2148 789 ahrens } 2149 789 ahrens 2150 789 ahrens /* 2151 1544 eschrock * Check if the cache has reached its limits and eviction is required 2152 1544 eschrock * prior to insert. 2153 789 ahrens */ 2154 789 ahrens static int 2155 4309 maybee arc_evict_needed(arc_buf_contents_t type) 2156 789 ahrens { 2157 4309 maybee if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 2158 4309 maybee return (1); 2159 4309 maybee 2160 4309 maybee #ifdef _KERNEL 2161 4309 maybee /* 2162 4309 maybee * If zio data pages are being allocated out of a separate heap segment, 2163 4309 maybee * then enforce that the size of available vmem for this area remains 2164 4309 maybee * above about 1/32nd free. 
2165 4309 maybee */ 2166 4309 maybee if (type == ARC_BUFC_DATA && zio_arena != NULL && 2167 4309 maybee vmem_size(zio_arena, VMEM_FREE) < 2168 4309 maybee (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 2169 4309 maybee return (1); 2170 4309 maybee #endif 2171 4309 maybee 2172 789 ahrens if (arc_reclaim_needed()) 2173 789 ahrens return (1); 2174 789 ahrens 2175 3403 bmc return (arc_size > arc_c); 2176 789 ahrens } 2177 789 ahrens 2178 789 ahrens /* 2179 2688 maybee * The buffer, supplied as the first argument, needs a data block. 2180 2688 maybee * So, if we are at cache max, determine which cache should be victimized. 2181 2688 maybee * We have the following cases: 2182 789 ahrens * 2183 3403 bmc * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2184 789 ahrens * In this situation if we're out of space, but the resident size of the MFU is 2185 789 ahrens * under the limit, victimize the MFU cache to satisfy this insertion request. 2186 789 ahrens * 2187 3403 bmc * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2188 789 ahrens * Here, we've used up all of the available space for the MRU, so we need to 2189 789 ahrens * evict from our own cache instead. Evict from the set of resident MRU 2190 789 ahrens * entries. 2191 789 ahrens * 2192 3403 bmc * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2193 789 ahrens * c minus p represents the MFU space in the cache, since p is the size of the 2194 789 ahrens * cache that is dedicated to the MRU. In this situation there's still space on 2195 789 ahrens * the MFU side, so the MRU side needs to be victimized. 2196 789 ahrens * 2197 3403 bmc * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2198 789 ahrens * MFU's resident set is consuming more space than it has been allotted. In 2199 789 ahrens * this situation, we must victimize our own cache, the MFU, for this insertion. 
2200 789 ahrens */ 2201 789 ahrens static void 2202 2688 maybee arc_get_data_buf(arc_buf_t *buf) 2203 789 ahrens { 2204 3290 johansen arc_state_t *state = buf->b_hdr->b_state; 2205 3290 johansen uint64_t size = buf->b_hdr->b_size; 2206 3290 johansen arc_buf_contents_t type = buf->b_hdr->b_type; 2207 789 ahrens 2208 2688 maybee arc_adapt(size, state); 2209 789 ahrens 2210 2688 maybee /* 2211 2688 maybee * We have not yet reached cache maximum size, 2212 2688 maybee * just allocate a new buffer. 2213 2688 maybee */ 2214 4309 maybee if (!arc_evict_needed(type)) { 2215 3290 johansen if (type == ARC_BUFC_METADATA) { 2216 3290 johansen buf->b_data = zio_buf_alloc(size); 2217 8582 Brendan arc_space_consume(size, ARC_SPACE_DATA); 2218 3290 johansen } else { 2219 3290 johansen ASSERT(type == ARC_BUFC_DATA); 2220 3290 johansen buf->b_data = zio_data_buf_alloc(size); 2221 8582 Brendan ARCSTAT_INCR(arcstat_data_size, size); 2222 4309 maybee atomic_add_64(&arc_size, size); 2223 3290 johansen } 2224 2688 maybee goto out; 2225 2688 maybee } 2226 2688 maybee 2227 2688 maybee /* 2228 2688 maybee * If we are prefetching from the mfu ghost list, this buffer 2229 2688 maybee * will end up on the mru list; so steal space from there. 2230 2688 maybee */ 2231 3403 bmc if (state == arc_mfu_ghost) 2232 3403 bmc state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 2233 3403 bmc else if (state == arc_mru_ghost) 2234 3403 bmc state = arc_mru; 2235 2688 maybee 2236 3403 bmc if (state == arc_mru || state == arc_anon) { 2237 3403 bmc uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2238 8582 Brendan state = (arc_mfu->arcs_lsize[type] >= size && 2239 4309 maybee arc_p > mru_used) ? arc_mfu : arc_mru; 2240 789 ahrens } else { 2241 2688 maybee /* MFU cases */ 2242 3403 bmc uint64_t mfu_space = arc_c - arc_p; 2243 8582 Brendan state = (arc_mru->arcs_lsize[type] >= size && 2244 4309 maybee mfu_space > arc_mfu->arcs_size) ? 
arc_mru : arc_mfu; 2245 2688 maybee } 2246 5642 maybee if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { 2247 3290 johansen if (type == ARC_BUFC_METADATA) { 2248 3290 johansen buf->b_data = zio_buf_alloc(size); 2249 8582 Brendan arc_space_consume(size, ARC_SPACE_DATA); 2250 3290 johansen } else { 2251 3290 johansen ASSERT(type == ARC_BUFC_DATA); 2252 3290 johansen buf->b_data = zio_data_buf_alloc(size); 2253 8582 Brendan ARCSTAT_INCR(arcstat_data_size, size); 2254 4309 maybee atomic_add_64(&arc_size, size); 2255 3290 johansen } 2256 3403 bmc ARCSTAT_BUMP(arcstat_recycle_miss); 2257 2688 maybee } 2258 2688 maybee ASSERT(buf->b_data != NULL); 2259 2688 maybee out: 2260 2688 maybee /* 2261 2688 maybee * Update the state size. Note that ghost states have a 2262 2688 maybee * "ghost size" and so don't need to be updated. 2263 2688 maybee */ 2264 2688 maybee if (!GHOST_STATE(buf->b_hdr->b_state)) { 2265 2688 maybee arc_buf_hdr_t *hdr = buf->b_hdr; 2266 2688 maybee 2267 3403 bmc atomic_add_64(&hdr->b_state->arcs_size, size); 2268 2688 maybee if (list_link_active(&hdr->b_arc_node)) { 2269 2688 maybee ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2270 4309 maybee atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2271 789 ahrens } 2272 3298 maybee /* 2273 3298 maybee * If we are growing the cache, and we are adding anonymous 2274 3403 bmc * data, and we have outgrown arc_p, update arc_p 2275 3298 maybee */ 2276 3403 bmc if (arc_size < arc_c && hdr->b_state == arc_anon && 2277 3403 bmc arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2278 3403 bmc arc_p = MIN(arc_c, arc_p + size); 2279 789 ahrens } 2280 789 ahrens } 2281 789 ahrens 2282 789 ahrens /* 2283 789 ahrens * This routine is called whenever a buffer is accessed. 2284 1544 eschrock * NOTE: the hash lock is dropped in this function. 
2285 789 ahrens */ 2286 789 ahrens static void 2287 2688 maybee arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2288 789 ahrens { 2289 11066 rafael clock_t now; 2290 11066 rafael 2291 789 ahrens ASSERT(MUTEX_HELD(hash_lock)); 2292 789 ahrens 2293 3403 bmc if (buf->b_state == arc_anon) { 2294 789 ahrens /* 2295 789 ahrens * This buffer is not in the cache, and does not 2296 789 ahrens * appear in our "ghost" list. Add the new buffer 2297 789 ahrens * to the MRU state. 2298 789 ahrens */ 2299 789 ahrens 2300 789 ahrens ASSERT(buf->b_arc_access == 0); 2301 11066 rafael buf->b_arc_access = ddi_get_lbolt(); 2302 1544 eschrock DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2303 3403 bmc arc_change_state(arc_mru, buf, hash_lock); 2304 789 ahrens 2305 3403 bmc } else if (buf->b_state == arc_mru) { 2306 11066 rafael now = ddi_get_lbolt(); 2307 11066 rafael 2308 789 ahrens /* 2309 2391 maybee * If this buffer is here because of a prefetch, then either: 2310 2391 maybee * - clear the flag if this is a "referencing" read 2311 2391 maybee * (any subsequent access will bump this into the MFU state). 2312 2391 maybee * or 2313 2391 maybee * - move the buffer to the head of the list if this is 2314 2391 maybee * another prefetch (to make it less likely to be evicted). 2315 789 ahrens */ 2316 789 ahrens if ((buf->b_flags & ARC_PREFETCH) != 0) { 2317 2391 maybee if (refcount_count(&buf->b_refcnt) == 0) { 2318 2391 maybee ASSERT(list_link_active(&buf->b_arc_node)); 2319 2391 maybee } else { 2320 2391 maybee buf->b_flags &= ~ARC_PREFETCH; 2321 3403 bmc ARCSTAT_BUMP(arcstat_mru_hits); 2322 2391 maybee } 2323 11066 rafael buf->b_arc_access = now; 2324 789 ahrens return; 2325 789 ahrens } 2326 789 ahrens 2327 789 ahrens /* 2328 789 ahrens * This buffer has been "accessed" only once so far, 2329 789 ahrens * but it is still in the cache. Move it to the MFU 2330 789 ahrens * state. 
2331 789 ahrens */ 2332 11066 rafael if (now > buf->b_arc_access + ARC_MINTIME) { 2333 789 ahrens /* 2334 789 ahrens * More than 125ms have passed since we 2335 789 ahrens * instantiated this buffer. Move it to the 2336 789 ahrens * most frequently used state. 2337 789 ahrens */ 2338 11066 rafael buf->b_arc_access = now; 2339 1544 eschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2340 3403 bmc arc_change_state(arc_mfu, buf, hash_lock); 2341 789 ahrens } 2342 3403 bmc ARCSTAT_BUMP(arcstat_mru_hits); 2343 3403 bmc } else if (buf->b_state == arc_mru_ghost) { 2344 789 ahrens arc_state_t *new_state; 2345 789 ahrens /* 2346 789 ahrens * This buffer has been "accessed" recently, but 2347 789 ahrens * was evicted from the cache. Move it to the 2348 789 ahrens * MFU state. 2349 789 ahrens */ 2350 789 ahrens 2351 789 ahrens if (buf->b_flags & ARC_PREFETCH) { 2352 3403 bmc new_state = arc_mru; 2353 2391 maybee if (refcount_count(&buf->b_refcnt) > 0) 2354 2391 maybee buf->b_flags &= ~ARC_PREFETCH; 2355 1544 eschrock DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2356 789 ahrens } else { 2357 3403 bmc new_state = arc_mfu; 2358 1544 eschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2359 789 ahrens } 2360 789 ahrens 2361 11066 rafael buf->b_arc_access = ddi_get_lbolt(); 2362 789 ahrens arc_change_state(new_state, buf, hash_lock); 2363 789 ahrens 2364 3403 bmc ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2365 3403 bmc } else if (buf->b_state == arc_mfu) { 2366 789 ahrens /* 2367 789 ahrens * This buffer has been accessed more than once and is 2368 789 ahrens * still in the cache. Keep it in the MFU state. 2369 789 ahrens * 2370 2391 maybee * NOTE: an add_reference() that occurred when we did 2371 2391 maybee * the arc_read() will have kicked this off the list. 2372 2391 maybee * If it was a prefetch, we will explicitly move it to 2373 2391 maybee * the head of the list now. 
2374 789 ahrens */ 2375 2391 maybee if ((buf->b_flags & ARC_PREFETCH) != 0) { 2376 2391 maybee ASSERT(refcount_count(&buf->b_refcnt) == 0); 2377 2391 maybee ASSERT(list_link_active(&buf->b_arc_node)); 2378 2391 maybee } 2379 3403 bmc ARCSTAT_BUMP(arcstat_mfu_hits); 2380 11066 rafael buf->b_arc_access = ddi_get_lbolt(); 2381 3403 bmc } else if (buf->b_state == arc_mfu_ghost) { 2382 3403 bmc arc_state_t *new_state = arc_mfu; 2383 789 ahrens /* 2384 789 ahrens * This buffer has been accessed more than once but has 2385 789 ahrens * been evicted from the cache. Move it back to the 2386 789 ahrens * MFU state. 2387 789 ahrens */ 2388 789 ahrens 2389 2391 maybee if (buf->b_flags & ARC_PREFETCH) { 2390 2391 maybee /* 2391 2391 maybee * This is a prefetch access... 2392 2391 maybee * move this block back to the MRU state. 2393 2391 maybee */ 2394 2391 maybee ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 2395 3403 bmc new_state = arc_mru; 2396 2391 maybee } 2397 2391 maybee 2398 11066 rafael buf->b_arc_access = ddi_get_lbolt(); 2399 1544 eschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2400 2391 maybee arc_change_state(new_state, buf, hash_lock); 2401 789 ahrens 2402 3403 bmc ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2403 5450 brendan } else if (buf->b_state == arc_l2c_only) { 2404 5450 brendan /* 2405 5450 brendan * This buffer is on the 2nd Level ARC. 
2406 5450 brendan */ 2407 5450 brendan 2408 11066 rafael buf->b_arc_access = ddi_get_lbolt(); 2409 5450 brendan DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2410 5450 brendan arc_change_state(arc_mfu, buf, hash_lock); 2411 789 ahrens } else { 2412 789 ahrens ASSERT(!"invalid arc state"); 2413 789 ahrens } 2414 789 ahrens } 2415 789 ahrens 2416 789 ahrens /* a generic arc_done_func_t which you can use */ 2417 789 ahrens /* ARGSUSED */ 2418 789 ahrens void 2419 789 ahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2420 789 ahrens { 2421 789 ahrens bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2422 1544 eschrock VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2423 789 ahrens } 2424 789 ahrens 2425 4309 maybee /* a generic arc_done_func_t */ 2426 789 ahrens void 2427 789 ahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2428 789 ahrens { 2429 789 ahrens arc_buf_t **bufp = arg; 2430 789 ahrens if (zio && zio->io_error) { 2431 1544 eschrock VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2432 789 ahrens *bufp = NULL; 2433 789 ahrens } else { 2434 789 ahrens *bufp = buf; 2435 789 ahrens } 2436 789 ahrens } 2437 789 ahrens 2438 789 ahrens static void 2439 789 ahrens arc_read_done(zio_t *zio) 2440 789 ahrens { 2441 1589 maybee arc_buf_hdr_t *hdr, *found; 2442 789 ahrens arc_buf_t *buf; 2443 789 ahrens arc_buf_t *abuf; /* buffer we're assigning to callback */ 2444 789 ahrens kmutex_t *hash_lock; 2445 789 ahrens arc_callback_t *callback_list, *acb; 2446 789 ahrens int freeable = FALSE; 2447 789 ahrens 2448 789 ahrens buf = zio->io_private; 2449 789 ahrens hdr = buf->b_hdr; 2450 789 ahrens 2451 1589 maybee /* 2452 1589 maybee * The hdr was inserted into hash-table and removed from lists 2453 1589 maybee * prior to starting I/O. We should find this header, since 2454 1589 maybee * it's in the hash table, and it should be legit since it's 2455 1589 maybee * not possible to evict it during the I/O. 
The only possible 2456 1589 maybee * reason for it not to be found is if we were freed during the 2457 1589 maybee * read. 2458 1589 maybee */ 2459 8636 Mark found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, 2460 3093 ahrens &hash_lock); 2461 789 ahrens 2462 1589 maybee ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 2463 5450 brendan (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 2464 5450 brendan (found == hdr && HDR_L2_READING(hdr))); 2465 5450 brendan 2466 6987 brendan hdr->b_flags &= ~ARC_L2_EVICTED; 2467 5450 brendan if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2468 7237 ek110237 hdr->b_flags &= ~ARC_L2CACHE; 2469 789 ahrens 2470 789 ahrens /* byteswap if necessary */ 2471 789 ahrens callback_list = hdr->b_acb; 2472 789 ahrens ASSERT(callback_list != NULL); 2473 10839 william if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 2474 7046 ahrens arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 2475 7046 ahrens byteswap_uint64_array : 2476 7046 ahrens dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; 2477 7046 ahrens func(buf->b_data, hdr->b_size); 2478 7046 ahrens } 2479 3093 ahrens 2480 5450 brendan arc_cksum_compute(buf, B_FALSE); 2481 789 ahrens 2482 10922 Jeff if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 2483 10922 Jeff /* 2484 10922 Jeff * Only call arc_access on anonymous buffers. This is because 2485 10922 Jeff * if we've issued an I/O for an evicted buffer, we've already 2486 10922 Jeff * called arc_access (to prevent any simultaneous readers from 2487 10922 Jeff * getting confused). 
2488 10922 Jeff */ 2489 10922 Jeff arc_access(hdr, hash_lock); 2490 10922 Jeff } 2491 10922 Jeff 2492 789 ahrens /* create copies of the data buffer for the callers */ 2493 789 ahrens abuf = buf; 2494 789 ahrens for (acb = callback_list; acb; acb = acb->acb_next) { 2495 789 ahrens if (acb->acb_done) { 2496 2688 maybee if (abuf == NULL) 2497 2688 maybee abuf = arc_buf_clone(buf); 2498 789 ahrens acb->acb_buf = abuf; 2499 789 ahrens abuf = NULL; 2500 789 ahrens } 2501 789 ahrens } 2502 789 ahrens hdr->b_acb = NULL; 2503 789 ahrens hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2504 1544 eschrock ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2505 10922 Jeff if (abuf == buf) { 2506 10922 Jeff ASSERT(buf->b_efunc == NULL); 2507 10922 Jeff ASSERT(hdr->b_datacnt == 1); 2508 1544 eschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 2509 10922 Jeff } 2510 789 ahrens 2511 789 ahrens ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2512 789 ahrens 2513 789 ahrens if (zio->io_error != 0) { 2514 789 ahrens hdr->b_flags |= ARC_IO_ERROR; 2515 3403 bmc if (hdr->b_state != arc_anon) 2516 3403 bmc arc_change_state(arc_anon, hdr, hash_lock); 2517 1544 eschrock if (HDR_IN_HASH_TABLE(hdr)) 2518 1544 eschrock buf_hash_remove(hdr); 2519 789 ahrens freeable = refcount_is_zero(&hdr->b_refcnt); 2520 789 ahrens } 2521 1544 eschrock 2522 1544 eschrock /* 2523 2391 maybee * Broadcast before we drop the hash_lock to avoid the possibility 2524 2391 maybee * that the hdr (and hence the cv) might be freed before we get to 2525 2391 maybee * the cv_broadcast(). 2526 1544 eschrock */ 2527 1544 eschrock cv_broadcast(&hdr->b_cv); 2528 789 ahrens 2529 1589 maybee if (hash_lock) { 2530 2688 maybee mutex_exit(hash_lock); 2531 789 ahrens } else { 2532 789 ahrens /* 2533 789 ahrens * This block was freed while we waited for the read to 2534 789 ahrens * complete. It has been removed from the hash table and 2535 789 ahrens * moved to the anonymous state (so that it won't show up 2536 789 ahrens * in the cache). 
2537 789 ahrens */ 2538 3403 bmc ASSERT3P(hdr->b_state, ==, arc_anon); 2539 789 ahrens freeable = refcount_is_zero(&hdr->b_refcnt); 2540 789 ahrens } 2541 789 ahrens 2542 789 ahrens /* execute each callback and free its structure */ 2543 789 ahrens while ((acb = callback_list) != NULL) { 2544 789 ahrens if (acb->acb_done) 2545 789 ahrens acb->acb_done(zio, acb->acb_buf, acb->acb_private); 2546 789 ahrens 2547 789 ahrens if (acb->acb_zio_dummy != NULL) { 2548 789 ahrens acb->acb_zio_dummy->io_error = zio->io_error; 2549 789 ahrens zio_nowait(acb->acb_zio_dummy); 2550 789 ahrens } 2551 789 ahrens 2552 789 ahrens callback_list = acb->acb_next; 2553 789 ahrens kmem_free(acb, sizeof (arc_callback_t)); 2554 789 ahrens } 2555 789 ahrens 2556 789 ahrens if (freeable) 2557 1544 eschrock arc_hdr_destroy(hdr); 2558 789 ahrens } 2559 789 ahrens 2560 789 ahrens /* 2561 789 ahrens * "Read" the block block at the specified DVA (in bp) via the 2562 789 ahrens * cache. If the block is found in the cache, invoke the provided 2563 789 ahrens * callback immediately and return. Note that the `zio' parameter 2564 789 ahrens * in the callback will be NULL in this case, since no IO was 2565 789 ahrens * required. If the block is not in the cache pass the read request 2566 789 ahrens * on to the spa with a substitute callback function, so that the 2567 789 ahrens * requested block will be added to the cache. 2568 789 ahrens * 2569 789 ahrens * If a read request arrives for a block that has a read in-progress, 2570 789 ahrens * either wait for the in-progress read to complete (and return the 2571 789 ahrens * results); or, if this is a read with a "done" func, add a record 2572 789 ahrens * to the read to invoke the "done" func when the read completes, 2573 789 ahrens * and return; or just return. 2574 789 ahrens * 2575 789 ahrens * arc_read_done() will invoke all the requested "done" functions 2576 789 ahrens * for readers of this block. 
2577 7046 ahrens * 2578 7046 ahrens * Normal callers should use arc_read and pass the arc buffer and offset 2579 7046 ahrens * for the bp. But if you know you don't need locking, you can use 2580 8213 Suhasini * arc_read_bp. 2581 7046 ahrens */ 2582 7046 ahrens int 2583 10922 Jeff arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, 2584 7237 ek110237 arc_done_func_t *done, void *private, int priority, int zio_flags, 2585 7046 ahrens uint32_t *arc_flags, const zbookmark_t *zb) 2586 7046 ahrens { 2587 7046 ahrens int err; 2588 7046 ahrens 2589 7046 ahrens ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); 2590 7046 ahrens ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); 2591 7545 Mark rw_enter(&pbuf->b_lock, RW_READER); 2592 7046 ahrens 2593 7046 ahrens err = arc_read_nolock(pio, spa, bp, done, private, priority, 2594 7237 ek110237 zio_flags, arc_flags, zb); 2595 7545 Mark rw_exit(&pbuf->b_lock); 2596 9396 Matthew 2597 7046 ahrens return (err); 2598 7046 ahrens } 2599 7046 ahrens 2600 7046 ahrens int 2601 10922 Jeff arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, 2602 7237 ek110237 arc_done_func_t *done, void *private, int priority, int zio_flags, 2603 7046 ahrens uint32_t *arc_flags, const zbookmark_t *zb) 2604 789 ahrens { 2605 789 ahrens arc_buf_hdr_t *hdr; 2606 789 ahrens arc_buf_t *buf; 2607 789 ahrens kmutex_t *hash_lock; 2608 5450 brendan zio_t *rzio; 2609 8636 Mark uint64_t guid = spa_guid(spa); 2610 789 ahrens 2611 789 ahrens top: 2612 10922 Jeff hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 2613 10922 Jeff &hash_lock); 2614 1544 eschrock if (hdr && hdr->b_datacnt > 0) { 2615 789 ahrens 2616 2391 maybee *arc_flags |= ARC_CACHED; 2617 2391 maybee 2618 789 ahrens if (HDR_IO_IN_PROGRESS(hdr)) { 2619 2391 maybee 2620 2391 maybee if (*arc_flags & ARC_WAIT) { 2621 2391 maybee cv_wait(&hdr->b_cv, hash_lock); 2622 2391 maybee mutex_exit(hash_lock); 2623 2391 maybee goto top; 2624 2391 maybee } 
2625 2391 maybee ASSERT(*arc_flags & ARC_NOWAIT); 2626 2391 maybee 2627 2391 maybee if (done) { 2628 789 ahrens arc_callback_t *acb = NULL; 2629 789 ahrens 2630 789 ahrens acb = kmem_zalloc(sizeof (arc_callback_t), 2631 789 ahrens KM_SLEEP); 2632 789 ahrens acb->acb_done = done; 2633 789 ahrens acb->acb_private = private; 2634 789 ahrens if (pio != NULL) 2635 789 ahrens acb->acb_zio_dummy = zio_null(pio, 2636 8632 Bill spa, NULL, NULL, NULL, zio_flags); 2637 789 ahrens 2638 789 ahrens ASSERT(acb->acb_done != NULL); 2639 789 ahrens acb->acb_next = hdr->b_acb; 2640 789 ahrens hdr->b_acb = acb; 2641 789 ahrens add_reference(hdr, hash_lock, private); 2642 789 ahrens mutex_exit(hash_lock); 2643 789 ahrens return (0); 2644 789 ahrens } 2645 789 ahrens mutex_exit(hash_lock); 2646 789 ahrens return (0); 2647 789 ahrens } 2648 789 ahrens 2649 3403 bmc ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2650 789 ahrens 2651 1544 eschrock if (done) { 2652 2688 maybee add_reference(hdr, hash_lock, private); 2653 1544 eschrock /* 2654 1544 eschrock * If this block is already in use, create a new 2655 1544 eschrock * copy of the data so that we will be guaranteed 2656 1544 eschrock * that arc_release() will always succeed. 
2657 1544 eschrock */ 2658 1544 eschrock buf = hdr->b_buf; 2659 1544 eschrock ASSERT(buf); 2660 1544 eschrock ASSERT(buf->b_data); 2661 2688 maybee if (HDR_BUF_AVAILABLE(hdr)) { 2662 1544 eschrock ASSERT(buf->b_efunc == NULL); 2663 1544 eschrock hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2664 2688 maybee } else { 2665 2688 maybee buf = arc_buf_clone(buf); 2666 1544 eschrock } 2667 10922 Jeff 2668 2391 maybee } else if (*arc_flags & ARC_PREFETCH && 2669 2391 maybee refcount_count(&hdr->b_refcnt) == 0) { 2670 2391 maybee hdr->b_flags |= ARC_PREFETCH; 2671 789 ahrens } 2672 789 ahrens DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2673 2688 maybee arc_access(hdr, hash_lock); 2674 7237 ek110237 if (*arc_flags & ARC_L2CACHE) 2675 7237 ek110237 hdr->b_flags |= ARC_L2CACHE; 2676 2688 maybee mutex_exit(hash_lock); 2677 3403 bmc ARCSTAT_BUMP(arcstat_hits); 2678 3403 bmc ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2679 3403 bmc demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2680 3403 bmc data, metadata, hits); 2681 3403 bmc 2682 789 ahrens if (done) 2683 789 ahrens done(NULL, buf, private); 2684 789 ahrens } else { 2685 789 ahrens uint64_t size = BP_GET_LSIZE(bp); 2686 789 ahrens arc_callback_t *acb; 2687 6987 brendan vdev_t *vd = NULL; 2688 9215 George uint64_t addr; 2689 8582 Brendan boolean_t devw = B_FALSE; 2690 789 ahrens 2691 789 ahrens if (hdr == NULL) { 2692 789 ahrens /* this block is not in the cache */ 2693 789 ahrens arc_buf_hdr_t *exists; 2694 3290 johansen arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2695 3290 johansen buf = arc_buf_alloc(spa, size, private, type); 2696 789 ahrens hdr = buf->b_hdr; 2697 789 ahrens hdr->b_dva = *BP_IDENTITY(bp); 2698 10922 Jeff hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 2699 789 ahrens hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2700 789 ahrens exists = buf_hash_insert(hdr, &hash_lock); 2701 789 ahrens if (exists) { 2702 789 ahrens /* somebody beat us to the hash insert */ 2703 789 ahrens mutex_exit(hash_lock); 2704 789 ahrens 
bzero(&hdr->b_dva, sizeof (dva_t)); 2705 789 ahrens hdr->b_birth = 0; 2706 789 ahrens hdr->b_cksum0 = 0; 2707 1544 eschrock (void) arc_buf_remove_ref(buf, private); 2708 789 ahrens goto top; /* restart the IO request */ 2709 789 ahrens } 2710 2391 maybee /* if this is a prefetch, we don't have a reference */ 2711 2391 maybee if (*arc_flags & ARC_PREFETCH) { 2712 2391 maybee (void) remove_reference(hdr, hash_lock, 2713 2391 maybee private); 2714 2391 maybee hdr->b_flags |= ARC_PREFETCH; 2715 2391 maybee } 2716 7237 ek110237 if (*arc_flags & ARC_L2CACHE) 2717 7237 ek110237 hdr->b_flags |= ARC_L2CACHE; 2718 2391 maybee if (BP_GET_LEVEL(bp) > 0) 2719 2391 maybee hdr->b_flags |= ARC_INDIRECT; 2720 789 ahrens } else { 2721 789 ahrens /* this block is in the ghost cache */ 2722 1544 eschrock ASSERT(GHOST_STATE(hdr->b_state)); 2723 1544 eschrock ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2724 2391 maybee ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); 2725 2391 maybee ASSERT(hdr->b_buf == NULL); 2726 789 ahrens 2727 2391 maybee /* if this is a prefetch, we don't have a reference */ 2728 2391 maybee if (*arc_flags & ARC_PREFETCH) 2729 2391 maybee hdr->b_flags |= ARC_PREFETCH; 2730 2391 maybee else 2731 2391 maybee add_reference(hdr, hash_lock, private); 2732 7237 ek110237 if (*arc_flags & ARC_L2CACHE) 2733 7237 ek110237 hdr->b_flags |= ARC_L2CACHE; 2734 6245 maybee buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2735 1544 eschrock buf->b_hdr = hdr; 2736 2688 maybee buf->b_data = NULL; 2737 1544 eschrock buf->b_efunc = NULL; 2738 1544 eschrock buf->b_private = NULL; 2739 1544 eschrock buf->b_next = NULL; 2740 1544 eschrock hdr->b_buf = buf; 2741 2688 maybee arc_get_data_buf(buf); 2742 1544 eschrock ASSERT(hdr->b_datacnt == 0); 2743 1544 eschrock hdr->b_datacnt = 1; 2744 789 ahrens } 2745 789 ahrens 2746 789 ahrens acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2747 789 ahrens acb->acb_done = done; 2748 789 ahrens acb->acb_private = private; 2749 789 ahrens 2750 789 ahrens 
ASSERT(hdr->b_acb == NULL); 2751 789 ahrens hdr->b_acb = acb; 2752 789 ahrens hdr->b_flags |= ARC_IO_IN_PROGRESS; 2753 789 ahrens 2754 789 ahrens /* 2755 789 ahrens * If the buffer has been evicted, migrate it to a present state 2756 789 ahrens * before issuing the I/O. Once we drop the hash-table lock, 2757 789 ahrens * the header will be marked as I/O in progress and have an 2758 789 ahrens * attached buffer. At this point, anybody who finds this 2759 789 ahrens * buffer ought to notice that it's legit but has a pending I/O. 2760 789 ahrens */ 2761 789 ahrens 2762 1544 eschrock if (GHOST_STATE(hdr->b_state)) 2763 2688 maybee arc_access(hdr, hash_lock); 2764 789 ahrens 2765 7754 Jeff if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && 2766 7754 Jeff (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { 2767 8582 Brendan devw = hdr->b_l2hdr->b_dev->l2ad_writing; 2768 6987 brendan addr = hdr->b_l2hdr->b_daddr; 2769 7754 Jeff /* 2770 7754 Jeff * Lock out device removal. 2771 7754 Jeff */ 2772 7754 Jeff if (vdev_is_dead(vd) || 2773 7754 Jeff !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 2774 7754 Jeff vd = NULL; 2775 6987 brendan } 2776 6987 brendan 2777 6987 brendan mutex_exit(hash_lock); 2778 6987 brendan 2779 789 ahrens ASSERT3U(hdr->b_size, ==, size); 2780 10409 Brendan DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 2781 10409 Brendan uint64_t, size, zbookmark_t *, zb); 2782 3403 bmc ARCSTAT_BUMP(arcstat_misses); 2783 3403 bmc ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2784 3403 bmc demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2785 3403 bmc data, metadata, misses); 2786 1544 eschrock 2787 8582 Brendan if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 2788 6987 brendan /* 2789 5450 brendan * Read from the L2ARC if the following are true: 2790 6987 brendan * 1. The L2ARC vdev was previously cached. 2791 6987 brendan * 2. This buffer still has L2ARC metadata. 2792 6987 brendan * 3. 
This buffer isn't currently writing to the L2ARC. 2793 6987 brendan * 4. The L2ARC entry wasn't evicted, which may 2794 6987 brendan * also have invalidated the vdev. 2795 8582 Brendan * 5. This isn't prefetch and l2arc_noprefetch is set. 2796 6987 brendan */ 2797 7754 Jeff if (hdr->b_l2hdr != NULL && 2798 8582 Brendan !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 2799 8582 Brendan !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 2800 5450 brendan l2arc_read_callback_t *cb; 2801 6643 eschrock 2802 5450 brendan DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 2803 5450 brendan ARCSTAT_BUMP(arcstat_l2_hits); 2804 5450 brendan 2805 5450 brendan cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 2806 5450 brendan KM_SLEEP); 2807 5450 brendan cb->l2rcb_buf = buf; 2808 5450 brendan cb->l2rcb_spa = spa; 2809 5450 brendan cb->l2rcb_bp = *bp; 2810 5450 brendan cb->l2rcb_zb = *zb; 2811 7237 ek110237 cb->l2rcb_flags = zio_flags; 2812 5450 brendan 2813 5450 brendan /* 2814 7754 Jeff * l2arc read. The SCL_L2ARC lock will be 2815 7754 Jeff * released by l2arc_read_done(). 
2816 5450 brendan */ 2817 5450 brendan rzio = zio_read_phys(pio, vd, addr, size, 2818 5450 brendan buf->b_data, ZIO_CHECKSUM_OFF, 2819 7237 ek110237 l2arc_read_done, cb, priority, zio_flags | 2820 7361 Brendan ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | 2821 7754 Jeff ZIO_FLAG_DONT_PROPAGATE | 2822 7754 Jeff ZIO_FLAG_DONT_RETRY, B_FALSE); 2823 5450 brendan DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 2824 5450 brendan zio_t *, rzio); 2825 8582 Brendan ARCSTAT_INCR(arcstat_l2_read_bytes, size); 2826 6987 brendan 2827 6987 brendan if (*arc_flags & ARC_NOWAIT) { 2828 6987 brendan zio_nowait(rzio); 2829 6987 brendan return (0); 2830 6987 brendan } 2831 6987 brendan 2832 6987 brendan ASSERT(*arc_flags & ARC_WAIT); 2833 6987 brendan if (zio_wait(rzio) == 0) 2834 6987 brendan return (0); 2835 6987 brendan 2836 6987 brendan /* l2arc read error; goto zio_read() */ 2837 5450 brendan } else { 2838 5450 brendan DTRACE_PROBE1(l2arc__miss, 2839 5450 brendan arc_buf_hdr_t *, hdr); 2840 5450 brendan ARCSTAT_BUMP(arcstat_l2_misses); 2841 5450 brendan if (HDR_L2_WRITING(hdr)) 2842 5450 brendan ARCSTAT_BUMP(arcstat_l2_rw_clash); 2843 7754 Jeff spa_config_exit(spa, SCL_L2ARC, vd); 2844 8582 Brendan } 2845 8582 Brendan } else { 2846 8628 Bill if (vd != NULL) 2847 8628 Bill spa_config_exit(spa, SCL_L2ARC, vd); 2848 8582 Brendan if (l2arc_ndev != 0) { 2849 8582 Brendan DTRACE_PROBE1(l2arc__miss, 2850 8582 Brendan arc_buf_hdr_t *, hdr); 2851 8582 Brendan ARCSTAT_BUMP(arcstat_l2_misses); 2852 6987 brendan } 2853 6987 brendan } 2854 5450 brendan 2855 789 ahrens rzio = zio_read(pio, spa, bp, buf->b_data, size, 2856 7237 ek110237 arc_read_done, buf, priority, zio_flags, zb); 2857 789 ahrens 2858 2391 maybee if (*arc_flags & ARC_WAIT) 2859 789 ahrens return (zio_wait(rzio)); 2860 789 ahrens 2861 2391 maybee ASSERT(*arc_flags & ARC_NOWAIT); 2862 789 ahrens zio_nowait(rzio); 2863 789 ahrens } 2864 789 ahrens return (0); 2865 789 ahrens } 2866 789 ahrens 2867 1544 eschrock void 2868 1544 eschrock 
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 2869 1544 eschrock { 2870 1544 eschrock ASSERT(buf->b_hdr != NULL); 2871 3403 bmc ASSERT(buf->b_hdr->b_state != arc_anon); 2872 1544 eschrock ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 2873 10922 Jeff ASSERT(buf->b_efunc == NULL); 2874 10922 Jeff ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 2875 10922 Jeff 2876 1544 eschrock buf->b_efunc = func; 2877 1544 eschrock buf->b_private = private; 2878 1544 eschrock } 2879 1544 eschrock 2880 1544 eschrock /* 2881 1544 eschrock * This is used by the DMU to let the ARC know that a buffer is 2882 1544 eschrock * being evicted, so the ARC should clean up. If this arc buf 2883 1544 eschrock * is not yet in the evicted state, it will be put there. 2884 1544 eschrock */ 2885 1544 eschrock int 2886 1544 eschrock arc_buf_evict(arc_buf_t *buf) 2887 1544 eschrock { 2888 2887 maybee arc_buf_hdr_t *hdr; 2889 1544 eschrock kmutex_t *hash_lock; 2890 1544 eschrock arc_buf_t **bufp; 2891 1544 eschrock 2892 7545 Mark rw_enter(&buf->b_lock, RW_WRITER); 2893 2887 maybee hdr = buf->b_hdr; 2894 1544 eschrock if (hdr == NULL) { 2895 1544 eschrock /* 2896 1544 eschrock * We are in arc_do_user_evicts(). 2897 1544 eschrock */ 2898 1544 eschrock ASSERT(buf->b_data == NULL); 2899 7545 Mark rw_exit(&buf->b_lock); 2900 1544 eschrock return (0); 2901 7545 Mark } else if (buf->b_data == NULL) { 2902 7545 Mark arc_buf_t copy = *buf; /* structure assignment */ 2903 7545 Mark /* 2904 7545 Mark * We are on the eviction list; process this buffer now 2905 7545 Mark * but let arc_do_user_evicts() do the reaping. 
2906 7545 Mark */ 2907 7545 Mark buf->b_efunc = NULL; 2908 7545 Mark rw_exit(&buf->b_lock); 2909 7545 Mark VERIFY(copy.b_efunc(©) == 0); 2910 7545 Mark return (1); 2911 1544 eschrock } 2912 2887 maybee hash_lock = HDR_LOCK(hdr); 2913 7545 Mark mutex_enter(hash_lock); 2914 2724 maybee 2915 2724 maybee ASSERT(buf->b_hdr == hdr); 2916 2724 maybee ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 2917 3403 bmc ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2918 1544 eschrock 2919 1544 eschrock /* 2920 1544 eschrock * Pull this buffer off of the hdr 2921 1544 eschrock */ 2922 1544 eschrock bufp = &hdr->b_buf; 2923 1544 eschrock while (*bufp != buf) 2924 1544 eschrock bufp = &(*bufp)->b_next; 2925 1544 eschrock *bufp = buf->b_next; 2926 1544 eschrock 2927 1544 eschrock ASSERT(buf->b_data != NULL); 2928 2688 maybee arc_buf_destroy(buf, FALSE, FALSE); 2929 1544 eschrock 2930 1544 eschrock if (hdr->b_datacnt == 0) { 2931 1544 eschrock arc_state_t *old_state = hdr->b_state; 2932 1544 eschrock arc_state_t *evicted_state; 2933 1544 eschrock 2934 1544 eschrock ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2935 1544 eschrock 2936 1544 eschrock evicted_state = 2937 3403 bmc (old_state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 2938 1544 eschrock 2939 3403 bmc mutex_enter(&old_state->arcs_mtx); 2940 3403 bmc mutex_enter(&evicted_state->arcs_mtx); 2941 1544 eschrock 2942 1544 eschrock arc_change_state(evicted_state, hdr, hash_lock); 2943 1544 eschrock ASSERT(HDR_IN_HASH_TABLE(hdr)); 2944 5450 brendan hdr->b_flags |= ARC_IN_HASH_TABLE; 2945 5450 brendan hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2946 1544 eschrock 2947 3403 bmc mutex_exit(&evicted_state->arcs_mtx); 2948 3403 bmc mutex_exit(&old_state->arcs_mtx); 2949 1544 eschrock } 2950 1544 eschrock mutex_exit(hash_lock); 2951 7545 Mark rw_exit(&buf->b_lock); 2952 1819 maybee 2953 1544 eschrock VERIFY(buf->b_efunc(buf) == 0); 2954 1544 eschrock buf->b_efunc = NULL; 2955 1544 eschrock buf->b_private = NULL; 2956 1544 eschrock buf->b_hdr = NULL; 2957 1544 eschrock kmem_cache_free(buf_cache, buf); 2958 1544 eschrock return (1); 2959 789 ahrens } 2960 <