Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * DVA-based Adjustable Replacement Cache
     28  *
     29  * While much of the theory of operation used here is
     30  * based on the self-tuning, low overhead replacement cache
     31  * presented by Megiddo and Modha at FAST 2003, there are some
     32  * significant differences:
     33  *
     34  * 1. The Megiddo and Modha model assumes any page is evictable.
     35  * Pages in its cache cannot be "locked" into memory.  This makes
     36  * the eviction algorithm simple: evict the last page in the list.
     37  * This also makes the performance characteristics easy to reason
     38  * about.  Our cache is not so simple.  At any given moment, some
     39  * subset of the blocks in the cache are un-evictable because we
     40  * have handed out a reference to them.  Blocks are only evictable
     41  * when there are no external references active.  This makes
     42  * eviction far more problematic:  we choose to evict the evictable
     43  * blocks that are the "lowest" in the list.
     44  *
     45  * There are times when it is not possible to evict the requested
     46  * space.  In these circumstances we are unable to adjust the cache
     47  * size.  To prevent the cache growing unbounded at these times we
     48  * implement a "cache throttle" that slows the flow of new data
     49  * into the cache until we can make space available.
     50  *
     51  * 2. The Megiddo and Modha model assumes a fixed cache size.
     52  * Pages are evicted when the cache is full and there is a cache
     53  * miss.  Our model has a variable sized cache.  It grows with
     54  * high use, but also tries to react to memory pressure from the
     55  * operating system: decreasing its size when system memory is
     56  * tight.
     57  *
     58  * 3. The Megiddo and Modha model assumes a fixed page size. All
     59  * elements of the cache are therefore exactly the same size.  So
     60  * when adjusting the cache size following a cache miss, it's simply
     61  * a matter of choosing a single page to evict.  In our model, we
     62  * have variable sized cache blocks (ranging from 512 bytes to
     63  * 128K bytes).  We therefore choose a set of blocks to evict to make
     64  * space for a cache miss that approximates as closely as possible
     65  * the space used by the new block.
     66  *
     67  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
     68  * by N. Megiddo & D. Modha, FAST 2003
     69  */
     70 
     71 /*
     72  * The locking model:
     73  *
     74  * A new reference to a cache buffer can be obtained in two
     75  * ways: 1) via a hash table lookup using the DVA as a key,
     76  * or 2) via one of the ARC lists.  The arc_read() interface
     77  * uses method 1, while the internal arc algorithms for
     78  * adjusting the cache use method 2.  We therefore provide two
     79  * types of locks: 1) the hash table lock array, and 2) the
     80  * arc list locks.
     81  *
     82  * Buffers do not have their own mutexes, rather they rely on the
     83  * hash table mutexes for the bulk of their protection (i.e. most
     84  * fields in the arc_buf_hdr_t are protected by these mutexes).
     85  *
     86  * buf_hash_find() returns the appropriate mutex (held) when it
     87  * locates the requested buffer in the hash table.  It returns
     88  * NULL for the mutex if the buffer was not in the table.
     89  *
     90  * buf_hash_remove() expects the appropriate hash mutex to be
     91  * already held before it is invoked.
     92  *
     93  * Each arc state also has a mutex which is used to protect the
     94  * buffer list associated with the state.  When attempting to
     95  * obtain a hash table lock while holding an arc list lock you
     96  * must use: mutex_tryenter() to avoid deadlock.  Also note that
     97  * the active state mutex must be held before the ghost state mutex.
     98  *
     99  * Arc buffers may have an associated eviction callback function.
    100  * This function will be invoked prior to removing the buffer (e.g.
    101  * in arc_do_user_evicts()).  Note however that the data associated
    102  * with the buffer may be evicted prior to the callback.  The callback
    103  * must be made with *no locks held* (to prevent deadlock).  Additionally,
    104  * the users of callbacks must ensure that their private data is
    105  * protected from simultaneous callbacks from arc_buf_evict()
    106  * and arc_do_user_evicts().
    107  *
    108  * Note that the majority of the performance stats are manipulated
    109  * with atomic operations.
    110  *
    111  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
    112  *
    113  *	- L2ARC buflist creation
    114  *	- L2ARC buflist eviction
    115  *	- L2ARC write completion, which walks L2ARC buflists
    116  *	- ARC header destruction, as it removes from L2ARC buflists
    117  *	- ARC header release, as it removes from L2ARC buflists
    118  */
    119 
    120 #include <sys/spa.h>
    121 #include <sys/zio.h>
    122 #include <sys/zio_checksum.h>
    123 #include <sys/zfs_context.h>
    124 #include <sys/arc.h>
    125 #include <sys/refcount.h>
    126 #include <sys/vdev.h>
    127 #include <sys/vdev_impl.h>
    128 #ifdef _KERNEL
    129 #include <sys/vmsystm.h>
    130 #include <vm/anon.h>
    131 #include <sys/fs/swapnode.h>
    132 #include <sys/dnlc.h>
    133 #endif
    134 #include <sys/callb.h>
    135 #include <sys/kstat.h>
    136 
    137 static kmutex_t		arc_reclaim_thr_lock;
    138 static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
    139 static uint8_t		arc_thread_exit;
    140 
    141 extern int zfs_write_limit_shift;
    142 extern uint64_t zfs_write_limit_max;
    143 extern kmutex_t zfs_write_limit_lock;
    144 
    145 #define	ARC_REDUCE_DNLC_PERCENT	3
    146 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
    147 
    148 typedef enum arc_reclaim_strategy {
    149 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
    150 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
    151 } arc_reclaim_strategy_t;
    152 
    153 /* number of seconds before growing cache again */
    154 static int		arc_grow_retry = 60;
    155 
    156 /* shift of arc_c for calculating both min and max arc_p */
    157 static int		arc_p_min_shift = 4;
    158 
    159 /* log2(fraction of arc to reclaim) */
    160 static int		arc_shrink_shift = 5;
    161 
    162 /*
    163  * minimum lifespan of a prefetch block in clock ticks
    164  * (initialized in arc_init())
    165  */
    166 static int		arc_min_prefetch_lifespan;
    167 
    168 static int arc_dead;
    169 
    170 /*
    171  * The arc has filled available memory and has now warmed up.
    172  */
    173 static boolean_t arc_warm;
    174 
    175 /*
    176  * These tunables are for performance analysis.
    177  */
    178 uint64_t zfs_arc_max;
    179 uint64_t zfs_arc_min;
    180 uint64_t zfs_arc_meta_limit = 0;
    181 int zfs_mdcomp_disable = 0;
    182 int zfs_arc_grow_retry = 0;
    183 int zfs_arc_shrink_shift = 0;
    184 int zfs_arc_p_min_shift = 0;
    185 
    186 /*
    187  * Note that buffers can be in one of 6 states:
    188  *	ARC_anon	- anonymous (discussed below)
    189  *	ARC_mru		- recently used, currently cached
    190  *	ARC_mru_ghost	- recently used, no longer in cache
    191  *	ARC_mfu		- frequently used, currently cached
    192  *	ARC_mfu_ghost	- frequently used, no longer in cache
    193  *	ARC_l2c_only	- exists in L2ARC but not other states
    194  * When there are no active references to the buffer, they are
    195  * linked onto a list in one of these arc states.  These are
    196  * the only buffers that can be evicted or deleted.  Within each
    197  * state there are multiple lists, one for meta-data and one for
    198  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
    199  * etc.) is tracked separately so that it can be managed more
    200  * explicitly: favored over data, limited explicitly.
    201  *
    202  * Anonymous buffers are buffers that are not associated with
    203  * a DVA.  These are buffers that hold dirty block copies
    204  * before they are written to stable storage.  By definition,
    205  * they are "ref'd" and are considered part of arc_mru
    206  * that cannot be freed.  Generally, they will acquire a DVA
    207  * as they are written and migrate onto the arc_mru list.
    208  *
    209  * The ARC_l2c_only state is for buffers that are in the second
    210  * level ARC but no longer in any of the ARC_m* lists.  The second
    211  * level ARC itself may also contain buffers that are in any of
    212  * the ARC_m* states - meaning that a buffer can exist in two
    213  * places.  The reason for the ARC_l2c_only state is to keep the
    214  * buffer header in the hash table, so that reads that hit the
    215  * second level ARC benefit from these fast lookups.
    216  */
    217 
    218 typedef struct arc_state {
    219 	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
    220 	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
    221 	uint64_t arcs_size;	/* total amount of data in this state */
    222 	kmutex_t arcs_mtx;
    223 } arc_state_t;
    224 
    225 /* The 6 states: */
    226 static arc_state_t ARC_anon;
    227 static arc_state_t ARC_mru;
    228 static arc_state_t ARC_mru_ghost;
    229 static arc_state_t ARC_mfu;
    230 static arc_state_t ARC_mfu_ghost;
    231 static arc_state_t ARC_l2c_only;
    232 
    233 typedef struct arc_stats {
    234 	kstat_named_t arcstat_hits;
    235 	kstat_named_t arcstat_misses;
    236 	kstat_named_t arcstat_demand_data_hits;
    237 	kstat_named_t arcstat_demand_data_misses;
    238 	kstat_named_t arcstat_demand_metadata_hits;
    239 	kstat_named_t arcstat_demand_metadata_misses;
    240 	kstat_named_t arcstat_prefetch_data_hits;
    241 	kstat_named_t arcstat_prefetch_data_misses;
    242 	kstat_named_t arcstat_prefetch_metadata_hits;
    243 	kstat_named_t arcstat_prefetch_metadata_misses;
    244 	kstat_named_t arcstat_mru_hits;
    245 	kstat_named_t arcstat_mru_ghost_hits;
    246 	kstat_named_t arcstat_mfu_hits;
    247 	kstat_named_t arcstat_mfu_ghost_hits;
    248 	kstat_named_t arcstat_deleted;
    249 	kstat_named_t arcstat_recycle_miss;
    250 	kstat_named_t arcstat_mutex_miss;
    251 	kstat_named_t arcstat_evict_skip;
    252 	kstat_named_t arcstat_hash_elements;
    253 	kstat_named_t arcstat_hash_elements_max;
    254 	kstat_named_t arcstat_hash_collisions;
    255 	kstat_named_t arcstat_hash_chains;
    256 	kstat_named_t arcstat_hash_chain_max;
    257 	kstat_named_t arcstat_p;
    258 	kstat_named_t arcstat_c;
    259 	kstat_named_t arcstat_c_min;
    260 	kstat_named_t arcstat_c_max;
    261 	kstat_named_t arcstat_size;
    262 	kstat_named_t arcstat_hdr_size;
    263 	kstat_named_t arcstat_data_size;
    264 	kstat_named_t arcstat_other_size;
    265 	kstat_named_t arcstat_l2_hits;
    266 	kstat_named_t arcstat_l2_misses;
    267 	kstat_named_t arcstat_l2_feeds;
    268 	kstat_named_t arcstat_l2_rw_clash;
    269 	kstat_named_t arcstat_l2_read_bytes;
    270 	kstat_named_t arcstat_l2_write_bytes;
    271 	kstat_named_t arcstat_l2_writes_sent;
    272 	kstat_named_t arcstat_l2_writes_done;
    273 	kstat_named_t arcstat_l2_writes_error;
    274 	kstat_named_t arcstat_l2_writes_hdr_miss;
    275 	kstat_named_t arcstat_l2_evict_lock_retry;
    276 	kstat_named_t arcstat_l2_evict_reading;
    277 	kstat_named_t arcstat_l2_free_on_write;
    278 	kstat_named_t arcstat_l2_abort_lowmem;
    279 	kstat_named_t arcstat_l2_cksum_bad;
    280 	kstat_named_t arcstat_l2_io_error;
    281 	kstat_named_t arcstat_l2_size;
    282 	kstat_named_t arcstat_l2_hdr_size;
    283 	kstat_named_t arcstat_memory_throttle_count;
    284 } arc_stats_t;
    285 
    286 static arc_stats_t arc_stats = {
    287 	{ "hits",			KSTAT_DATA_UINT64 },
    288 	{ "misses",			KSTAT_DATA_UINT64 },
    289 	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
    290 	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
    291 	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
    292 	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
    293 	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
    294 	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
    295 	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
    296 	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
    297 	{ "mru_hits",			KSTAT_DATA_UINT64 },
    298 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
    299 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
    300 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
    301 	{ "deleted",			KSTAT_DATA_UINT64 },
    302 	{ "recycle_miss",		KSTAT_DATA_UINT64 },
    303 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
    304 	{ "evict_skip",			KSTAT_DATA_UINT64 },
    305 	{ "hash_elements",		KSTAT_DATA_UINT64 },
    306 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
    307 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
    308 	{ "hash_chains",		KSTAT_DATA_UINT64 },
    309 	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
    310 	{ "p",				KSTAT_DATA_UINT64 },
    311 	{ "c",				KSTAT_DATA_UINT64 },
    312 	{ "c_min",			KSTAT_DATA_UINT64 },
    313 	{ "c_max",			KSTAT_DATA_UINT64 },
    314 	{ "size",			KSTAT_DATA_UINT64 },
    315 	{ "hdr_size",			KSTAT_DATA_UINT64 },
    316 	{ "data_size",			KSTAT_DATA_UINT64 },
    317 	{ "other_size",			KSTAT_DATA_UINT64 },
    318 	{ "l2_hits",			KSTAT_DATA_UINT64 },
    319 	{ "l2_misses",			KSTAT_DATA_UINT64 },
    320 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
    321 	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
    322 	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
    323 	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
    324 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
    325 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
    326 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
    327 	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
    328 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
    329 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
    330 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
    331 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
    332 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
    333 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
    334 	{ "l2_size",			KSTAT_DATA_UINT64 },
    335 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
    336 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 }
    337 };
    338 
    339 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
    340 
    341 #define	ARCSTAT_INCR(stat, val) \
    342 	atomic_add_64(&arc_stats.stat.value.ui64, (val));
    343 
    344 #define	ARCSTAT_BUMP(stat) 	ARCSTAT_INCR(stat, 1)
    345 #define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
    346 
    347 #define	ARCSTAT_MAX(stat, val) {					\
    348 	uint64_t m;							\
    349 	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
    350 	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
    351 		continue;						\
    352 }
    353 
    354 #define	ARCSTAT_MAXSTAT(stat) \
    355 	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
    356 
    357 /*
    358  * We define a macro to allow ARC hits/misses to be easily broken down by
    359  * two separate conditions, giving a total of four different subtypes for
    360  * each of hits and misses (so eight statistics total).
    361  */
    362 #define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
    363 	if (cond1) {							\
    364 		if (cond2) {						\
    365 			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
    366 		} else {						\
    367 			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
    368 		}							\
    369 	} else {							\
    370 		if (cond2) {						\
    371 			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
    372 		} else {						\
    373 			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
    374 		}							\
    375 	}
    376 
    377 kstat_t			*arc_ksp;
    378 static arc_state_t 	*arc_anon;
    379 static arc_state_t	*arc_mru;
    380 static arc_state_t	*arc_mru_ghost;
    381 static arc_state_t	*arc_mfu;
    382 static arc_state_t	*arc_mfu_ghost;
    383 static arc_state_t	*arc_l2c_only;
    384 
    385 /*
    386  * There are several ARC variables that are critical to export as kstats --
    387  * but we don't want to have to grovel around in the kstat whenever we wish to
    388  * manipulate them.  For these variables, we therefore define them to be in
    389  * terms of the statistic variable.  This assures that we are not introducing
    390  * the possibility of inconsistency by having shadow copies of the variables,
    391  * while still allowing the code to be readable.
    392  */
    393 #define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
    394 #define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
    395 #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
    396 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
    397 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
    398 
    399 static int		arc_no_grow;	/* Don't try to grow cache size */
    400 static uint64_t		arc_tempreserve;
    401 static uint64_t		arc_loaned_bytes;
    402 static uint64_t		arc_meta_used;
    403 static uint64_t		arc_meta_limit;
    404 static uint64_t		arc_meta_max = 0;
    405 
    406 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
    407 
    408 typedef struct arc_callback arc_callback_t;
    409 
    410 struct arc_callback {
    411 	void			*acb_private;
    412 	arc_done_func_t		*acb_done;
    413 	arc_buf_t		*acb_buf;
    414 	zio_t			*acb_zio_dummy;
    415 	arc_callback_t		*acb_next;
    416 };
    417 
    418 typedef struct arc_write_callback arc_write_callback_t;
    419 
    420 struct arc_write_callback {
    421 	void		*awcb_private;
    422 	arc_done_func_t	*awcb_ready;
    423 	arc_done_func_t	*awcb_done;
    424 	arc_buf_t	*awcb_buf;
    425 };
    426 
    427 struct arc_buf_hdr {
    428 	/* protected by hash lock */
    429 	dva_t			b_dva;
    430 	uint64_t		b_birth;
    431 	uint64_t		b_cksum0;
    432 
    433 	kmutex_t		b_freeze_lock;
    434 	zio_cksum_t		*b_freeze_cksum;
    435 
    436 	arc_buf_hdr_t		*b_hash_next;
    437 	arc_buf_t		*b_buf;
    438 	uint32_t		b_flags;
    439 	uint32_t		b_datacnt;
    440 
    441 	arc_callback_t		*b_acb;
    442 	kcondvar_t		b_cv;
    443 
    444 	/* immutable */
    445 	arc_buf_contents_t	b_type;
    446 	uint64_t		b_size;
    447 	uint64_t		b_spa;
    448 
    449 	/* protected by arc state mutex */
    450 	arc_state_t		*b_state;
    451 	list_node_t		b_arc_node;
    452 
    453 	/* updated atomically */
    454 	clock_t			b_arc_access;
    455 
    456 	/* self protecting */
    457 	refcount_t		b_refcnt;
    458 
    459 	l2arc_buf_hdr_t		*b_l2hdr;
    460 	list_node_t		b_l2node;
    461 };
    462 
    463 static arc_buf_t *arc_eviction_list;
    464 static kmutex_t arc_eviction_mtx;
    465 static arc_buf_hdr_t arc_eviction_hdr;
    466 static void arc_get_data_buf(arc_buf_t *buf);
    467 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
    468 static int arc_evict_needed(arc_buf_contents_t type);
    469 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
    470 
    471 #define	GHOST_STATE(state)	\
    472 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
    473 	(state) == arc_l2c_only)
    474 
    475 /*
    476  * Private ARC flags.  These flags are private ARC only flags that will show up
    477  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
    478  * be passed in as arc_flags in things like arc_read.  However, these flags
    479  * should never be passed and should only be set by ARC code.  When adding new
    480  * public flags, make sure not to smash the private ones.
    481  */
    482 
    483 #define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
    484 #define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
    485 #define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
    486 #define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
    487 #define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
    488 #define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
    489 #define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
    490 #define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
    491 #define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
    492 #define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
    493 #define	ARC_STORED		(1 << 19)	/* has been store()d to */
    494 
    495 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
    496 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
    497 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
    498 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
    499 #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
    500 #define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
    501 #define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
    502 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
    503 #define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
    504 				    (hdr)->b_l2hdr != NULL)
    505 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
    506 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
    507 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
    508 
    509 /*
    510  * Other sizes
    511  */
    512 
    513 #define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
    514 #define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
    515 
    516 /*
    517  * Hash table routines
    518  */
    519 
    520 #define	HT_LOCK_PAD	64
    521 
    522 struct ht_lock {
    523 	kmutex_t	ht_lock;
    524 #ifdef _KERNEL
    525 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
    526 #endif
    527 };
    528 
    529 #define	BUF_LOCKS 256
    530 typedef struct buf_hash_table {
    531 	uint64_t ht_mask;
    532 	arc_buf_hdr_t **ht_table;
    533 	struct ht_lock ht_locks[BUF_LOCKS];
    534 } buf_hash_table_t;
    535 
    536 static buf_hash_table_t buf_hash_table;
    537 
    538 #define	BUF_HASH_INDEX(spa, dva, birth) \
    539 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
    540 #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
    541 #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
    542 #define	HDR_LOCK(buf) \
    543 	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
    544 
    545 uint64_t zfs_crc64_table[256];
    546 
    547 /*
    548  * Level 2 ARC
    549  */
    550 
    551 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
    552 #define	L2ARC_HEADROOM		2		/* num of writes */
    553 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
    554 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
    555 
    556 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
    557 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
    558 
    559 /*
    560  * L2ARC Performance Tunables
    561  */
    562 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
    563 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
    564 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
    565 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
    566 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
    567 boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
    568 boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
    569 boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
    570 
    571 /*
    572  * L2ARC Internals
    573  */
    574 typedef struct l2arc_dev {
    575 	vdev_t			*l2ad_vdev;	/* vdev */
    576 	spa_t			*l2ad_spa;	/* spa */
    577 	uint64_t		l2ad_hand;	/* next write location */
    578 	uint64_t		l2ad_write;	/* desired write size, bytes */
    579 	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
    580 	uint64_t		l2ad_start;	/* first addr on device */
    581 	uint64_t		l2ad_end;	/* last addr on device */
    582 	uint64_t		l2ad_evict;	/* last addr eviction reached */
    583 	boolean_t		l2ad_first;	/* first sweep through */
    584 	boolean_t		l2ad_writing;	/* currently writing */
    585 	list_t			*l2ad_buflist;	/* buffer list */
    586 	list_node_t		l2ad_node;	/* device list node */
    587 } l2arc_dev_t;
    588 
    589 static list_t L2ARC_dev_list;			/* device list */
    590 static list_t *l2arc_dev_list;			/* device list pointer */
    591 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
    592 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
    593 static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
    594 static list_t L2ARC_free_on_write;		/* free after write buf list */
    595 static list_t *l2arc_free_on_write;		/* free after write list ptr */
    596 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
    597 static uint64_t l2arc_ndev;			/* number of devices */
    598 
    599 typedef struct l2arc_read_callback {
    600 	arc_buf_t	*l2rcb_buf;		/* read buffer */
    601 	spa_t		*l2rcb_spa;		/* spa */
    602 	blkptr_t	l2rcb_bp;		/* original blkptr */
    603 	zbookmark_t	l2rcb_zb;		/* original bookmark */
    604 	int		l2rcb_flags;		/* original flags */
    605 } l2arc_read_callback_t;
    606 
    607 typedef struct l2arc_write_callback {
    608 	l2arc_dev_t	*l2wcb_dev;		/* device info */
    609 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
    610 } l2arc_write_callback_t;
    611 
    612 struct l2arc_buf_hdr {
    613 	/* protected by arc_buf_hdr  mutex */
    614 	l2arc_dev_t	*b_dev;			/* L2ARC device */
    615 	uint64_t	b_daddr;		/* disk address, offset byte */
    616 };
    617 
    618 typedef struct l2arc_data_free {
    619 	/* protected by l2arc_free_on_write_mtx */
    620 	void		*l2df_data;
    621 	size_t		l2df_size;
    622 	void		(*l2df_func)(void *, size_t);
    623 	list_node_t	l2df_list_node;
    624 } l2arc_data_free_t;
    625 
    626 static kmutex_t l2arc_feed_thr_lock;
    627 static kcondvar_t l2arc_feed_thr_cv;
    628 static uint8_t l2arc_thread_exit;
    629 
    630 static void l2arc_read_done(zio_t *zio);
    631 static void l2arc_hdr_stat_add(void);
    632 static void l2arc_hdr_stat_remove(void);
    633 
    634 static uint64_t
    635 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
    636 {
    637 	uint8_t *vdva = (uint8_t *)dva;
    638 	uint64_t crc = -1ULL;
    639 	int i;
    640 
    641 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
    642 
    643 	for (i = 0; i < sizeof (dva_t); i++)
    644 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
    645 
    646 	crc ^= (spa>>8) ^ birth;
    647 
    648 	return (crc);
    649 }
    650 
    651 #define	BUF_EMPTY(buf)						\
    652 	((buf)->b_dva.dva_word[0] == 0 &&			\
    653 	(buf)->b_dva.dva_word[1] == 0 &&			\
    654 	(buf)->b_birth == 0)
    655 
    656 #define	BUF_EQUAL(spa, dva, birth, buf)				\
    657 	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
    658 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
    659 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
    660 
    661 static arc_buf_hdr_t *
    662 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
    663 {
    664 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
    665 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
    666 	arc_buf_hdr_t *buf;
    667 
    668 	mutex_enter(hash_lock);
    669 	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
    670 	    buf = buf->b_hash_next) {
    671 		if (BUF_EQUAL(spa, dva, birth, buf)) {
    672 			*lockp = hash_lock;
    673 			return (buf);
    674 		}
    675 	}
    676 	mutex_exit(hash_lock);
    677 	*lockp = NULL;
    678 	return (NULL);
    679 }
    680 
    681 /*
    682  * Insert an entry into the hash table.  If there is already an element
    683  * equal to elem in the hash table, then the already existing element
    684  * will be returned and the new element will not be inserted.
    685  * Otherwise returns NULL.
    686  */
    687 static arc_buf_hdr_t *
    688 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
    689 {
    690 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
    691 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
    692 	arc_buf_hdr_t *fbuf;
    693 	uint32_t i;
    694 
    695 	ASSERT(!HDR_IN_HASH_TABLE(buf));
    696 	*lockp = hash_lock;
    697 	mutex_enter(hash_lock);
    698 	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
    699 	    fbuf = fbuf->b_hash_next, i++) {
    700 		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
    701 			return (fbuf);
    702 	}
    703 
    704 	buf->b_hash_next = buf_hash_table.ht_table[idx];
    705 	buf_hash_table.ht_table[idx] = buf;
    706 	buf->b_flags |= ARC_IN_HASH_TABLE;
    707 
    708 	/* collect some hash table performance data */
    709 	if (i > 0) {
    710 		ARCSTAT_BUMP(arcstat_hash_collisions);
    711 		if (i == 1)
    712 			ARCSTAT_BUMP(arcstat_hash_chains);
    713 
    714 		ARCSTAT_MAX(arcstat_hash_chain_max, i);
    715 	}
    716 
    717 	ARCSTAT_BUMP(arcstat_hash_elements);
    718 	ARCSTAT_MAXSTAT(arcstat_hash_elements);
    719 
    720 	return (NULL);
    721 }
    722 
    723 static void
    724 buf_hash_remove(arc_buf_hdr_t *buf)
    725 {
    726 	arc_buf_hdr_t *fbuf, **bufp;
    727 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
    728 
    729 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
    730 	ASSERT(HDR_IN_HASH_TABLE(buf));
    731 
    732 	bufp = &buf_hash_table.ht_table[idx];
    733 	while ((fbuf = *bufp) != buf) {
    734 		ASSERT(fbuf != NULL);
    735 		bufp = &fbuf->b_hash_next;
    736 	}
    737 	*bufp = buf->b_hash_next;
    738 	buf->b_hash_next = NULL;
    739 	buf->b_flags &= ~ARC_IN_HASH_TABLE;
    740 
    741 	/* collect some hash table performance data */
    742 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
    743 
    744 	if (buf_hash_table.ht_table[idx] &&
    745 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
    746 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
    747 }
    748 
    749 /*
    750  * Global data structures and functions for the buf kmem cache.
    751  */
    752 static kmem_cache_t *hdr_cache;
    753 static kmem_cache_t *buf_cache;
    754 
    755 static void
    756 buf_fini(void)
    757 {
    758 	int i;
    759 
    760 	kmem_free(buf_hash_table.ht_table,
    761 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
    762 	for (i = 0; i < BUF_LOCKS; i++)
    763 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
    764 	kmem_cache_destroy(hdr_cache);
    765 	kmem_cache_destroy(buf_cache);
    766 }
    767 
    768 /*
    769  * Constructor callback - called when the cache is empty
    770  * and a new buf is requested.
    771  */
    772 /* ARGSUSED */
    773 static int
    774 hdr_cons(void *vbuf, void *unused, int kmflag)
    775 {
    776 	arc_buf_hdr_t *buf = vbuf;
    777 
    778 	bzero(buf, sizeof (arc_buf_hdr_t));
    779 	refcount_create(&buf->b_refcnt);
    780 	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
    781 	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
    782 	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
    783 
    784 	return (0);
    785 }
    786 
    787 /* ARGSUSED */
    788 static int
    789 buf_cons(void *vbuf, void *unused, int kmflag)
    790 {
    791 	arc_buf_t *buf = vbuf;
    792 
    793 	bzero(buf, sizeof (arc_buf_t));
    794 	rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
    795 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
    796 
    797 	return (0);
    798 }
    799 
    800 /*
    801  * Destructor callback - called when a cached buf is
    802  * no longer required.
    803  */
    804 /* ARGSUSED */
    805 static void
    806 hdr_dest(void *vbuf, void *unused)
    807 {
    808 	arc_buf_hdr_t *buf = vbuf;
    809 
    810 	refcount_destroy(&buf->b_refcnt);
    811 	cv_destroy(&buf->b_cv);
    812 	mutex_destroy(&buf->b_freeze_lock);
    813 	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
    814 }
    815 
    816 /* ARGSUSED */
    817 static void
    818 buf_dest(void *vbuf, void *unused)
    819 {
    820 	arc_buf_t *buf = vbuf;
    821 
    822 	rw_destroy(&buf->b_lock);
    823 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
    824 }
    825 
    826 /*
    827  * Reclaim callback -- invoked when memory is low.
    828  */
    829 /* ARGSUSED */
    830 static void
    831 hdr_recl(void *unused)
    832 {
    833 	dprintf("hdr_recl called\n");
    834 	/*
    835 	 * umem calls the reclaim func when we destroy the buf cache,
    836 	 * which is after we do arc_fini().
    837 	 */
    838 	if (!arc_dead)
    839 		cv_signal(&arc_reclaim_thr_cv);
    840 }
    841 
    842 static void
    843 buf_init(void)
    844 {
    845 	uint64_t *ct;
    846 	uint64_t hsize = 1ULL << 12;
    847 	int i, j;
    848 
    849 	/*
    850 	 * The hash table is big enough to fill all of physical memory
    851 	 * with an average 64K block size.  The table will take up
    852 	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
    853 	 */
    854 	while (hsize * 65536 < physmem * PAGESIZE)
    855 		hsize <<= 1;
    856 retry:
    857 	buf_hash_table.ht_mask = hsize - 1;
    858 	buf_hash_table.ht_table =
    859 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
    860 	if (buf_hash_table.ht_table == NULL) {
    861 		ASSERT(hsize > (1ULL << 8));
    862 		hsize >>= 1;
    863 		goto retry;
    864 	}
    865 
    866 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
    867 	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
    868 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
    869 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
    870 
    871 	for (i = 0; i < 256; i++)
    872 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
    873 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
    874 
    875 	for (i = 0; i < BUF_LOCKS; i++) {
    876 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
    877 		    NULL, MUTEX_DEFAULT, NULL);
    878 	}
    879 }
    880 
    881 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
    882 
    883 static void
    884 arc_cksum_verify(arc_buf_t *buf)
    885 {
    886 	zio_cksum_t zc;
    887 
    888 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
    889 		return;
    890 
    891 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    892 	if (buf->b_hdr->b_freeze_cksum == NULL ||
    893 	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
    894 		mutex_exit(&buf->b_hdr->b_freeze_lock);
    895 		return;
    896 	}
    897 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
    898 	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
    899 		panic("buffer modified while frozen!");
    900 	mutex_exit(&buf->b_hdr->b_freeze_lock);
    901 }
    902 
    903 static int
    904 arc_cksum_equal(arc_buf_t *buf)
    905 {
    906 	zio_cksum_t zc;
    907 	int equal;
    908 
    909 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    910 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
    911 	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
    912 	mutex_exit(&buf->b_hdr->b_freeze_lock);
    913 
    914 	return (equal);
    915 }
    916 
    917 static void
    918 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
    919 {
    920 	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
    921 		return;
    922 
    923 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    924 	if (buf->b_hdr->b_freeze_cksum != NULL) {
    925 		mutex_exit(&buf->b_hdr->b_freeze_lock);
    926 		return;
    927 	}
    928 	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
    929 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
    930 	    buf->b_hdr->b_freeze_cksum);
    931 	mutex_exit(&buf->b_hdr->b_freeze_lock);
    932 }
    933 
    934 void
    935 arc_buf_thaw(arc_buf_t *buf)
    936 {
    937 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
    938 		if (buf->b_hdr->b_state != arc_anon)
    939 			panic("modifying non-anon buffer!");
    940 		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
    941 			panic("modifying buffer while i/o in progress!");
    942 		arc_cksum_verify(buf);
    943 	}
    944 
    945 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    946 	if (buf->b_hdr->b_freeze_cksum != NULL) {
    947 		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
    948 		buf->b_hdr->b_freeze_cksum = NULL;
    949 	}
    950 	mutex_exit(&buf->b_hdr->b_freeze_lock);
    951 }
    952 
    953 void
    954 arc_buf_freeze(arc_buf_t *buf)
    955 {
    956 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
    957 		return;
    958 
    959 	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
    960 	    buf->b_hdr->b_state == arc_anon);
    961 	arc_cksum_compute(buf, B_FALSE);
    962 }
    963 
    964 static void
    965 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
    966 {
    967 	ASSERT(MUTEX_HELD(hash_lock));
    968 
    969 	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
    970 	    (ab->b_state != arc_anon)) {
    971 		uint64_t delta = ab->b_size * ab->b_datacnt;
    972 		list_t *list = &ab->b_state->arcs_list[ab->b_type];
    973 		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
    974 
    975 		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
    976 		mutex_enter(&ab->b_state->arcs_mtx);
    977 		ASSERT(list_link_active(&ab->b_arc_node));
    978 		list_remove(list, ab);
    979 		if (GHOST_STATE(ab->b_state)) {
    980 			ASSERT3U(ab->b_datacnt, ==, 0);
    981 			ASSERT3P(ab->b_buf, ==, NULL);
    982 			delta = ab->b_size;
    983 		}
    984 		ASSERT(delta > 0);
    985 		ASSERT3U(*size, >=, delta);
    986 		atomic_add_64(size, -delta);
    987 		mutex_exit(&ab->b_state->arcs_mtx);
    988 		/* remove the prefetch flag if we get a reference */
    989 		if (ab->b_flags & ARC_PREFETCH)
    990 			ab->b_flags &= ~ARC_PREFETCH;
    991 	}
    992 }
    993 
    994 static int
    995 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
    996 {
    997 	int cnt;
    998 	arc_state_t *state = ab->b_state;
    999 
   1000 	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
   1001 	ASSERT(!GHOST_STATE(state));
   1002 
   1003 	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
   1004 	    (state != arc_anon)) {
   1005 		uint64_t *size = &state->arcs_lsize[ab->b_type];
   1006 
   1007 		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
   1008 		mutex_enter(&state->arcs_mtx);
   1009 		ASSERT(!list_link_active(&ab->b_arc_node));
   1010 		list_insert_head(&state->arcs_list[ab->b_type], ab);
   1011 		ASSERT(ab->b_datacnt > 0);
   1012 		atomic_add_64(size, ab->b_size * ab->b_datacnt);
   1013 		mutex_exit(&state->arcs_mtx);
   1014 	}
   1015 	return (cnt);
   1016 }
   1017 
   1018 /*
   1019  * Move the supplied buffer to the indicated state.  The mutex
   1020  * for the buffer must be held by the caller.
   1021  */
   1022 static void
   1023 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
   1024 {
   1025 	arc_state_t *old_state = ab->b_state;
   1026 	int64_t refcnt = refcount_count(&ab->b_refcnt);
   1027 	uint64_t from_delta, to_delta;
   1028 
   1029 	ASSERT(MUTEX_HELD(hash_lock));
   1030 	ASSERT(new_state != old_state);
   1031 	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
   1032 	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
   1033 
   1034 	from_delta = to_delta = ab->b_datacnt * ab->b_size;
   1035 
   1036 	/*
   1037 	 * If this buffer is evictable, transfer it from the
   1038 	 * old state list to the new state list.
   1039 	 */
   1040 	if (refcnt == 0) {
   1041 		if (old_state != arc_anon) {
   1042 			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
   1043 			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
   1044 
   1045 			if (use_mutex)
   1046 				mutex_enter(&old_state->arcs_mtx);
   1047 
   1048 			ASSERT(list_link_active(&ab->b_arc_node));
   1049 			list_remove(&old_state->arcs_list[ab->b_type], ab);
   1050 
   1051 			/*
   1052 			 * If prefetching out of the ghost cache,
   1053 			 * we will have a non-null datacnt.
   1054 			 */
   1055 			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
   1056 				/* ghost elements have a ghost size */
   1057 				ASSERT(ab->b_buf == NULL);
   1058 				from_delta = ab->b_size;
   1059 			}
   1060 			ASSERT3U(*size, >=, from_delta);
   1061 			atomic_add_64(size, -from_delta);
   1062 
   1063 			if (use_mutex)
   1064 				mutex_exit(&old_state->arcs_mtx);
   1065 		}
   1066 		if (new_state != arc_anon) {
   1067 			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
   1068 			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
   1069 
   1070 			if (use_mutex)
   1071 				mutex_enter(&new_state->arcs_mtx);
   1072 
   1073 			list_insert_head(&new_state->arcs_list[ab->b_type], ab);
   1074 
   1075 			/* ghost elements have a ghost size */
   1076 			if (GHOST_STATE(new_state)) {
   1077 				ASSERT(ab->b_datacnt == 0);
   1078 				ASSERT(ab->b_buf == NULL);
   1079 				to_delta = ab->b_size;
   1080 			}
   1081 			atomic_add_64(size, to_delta);
   1082 
   1083 			if (use_mutex)
   1084 				mutex_exit(&new_state->arcs_mtx);
   1085 		}
   1086 	}
   1087 
   1088 	ASSERT(!BUF_EMPTY(ab));
   1089 	if (new_state == arc_anon) {
   1090 		buf_hash_remove(ab);
   1091 	}
   1092 
   1093 	/* adjust state sizes */
   1094 	if (to_delta)
   1095 		atomic_add_64(&new_state->arcs_size, to_delta);
   1096 	if (from_delta) {
   1097 		ASSERT3U(old_state->arcs_size, >=, from_delta);
   1098 		atomic_add_64(&old_state->arcs_size, -from_delta);
   1099 	}
   1100 	ab->b_state = new_state;
   1101 
   1102 	/* adjust l2arc hdr stats */
   1103 	if (new_state == arc_l2c_only)
   1104 		l2arc_hdr_stat_add();
   1105 	else if (old_state == arc_l2c_only)
   1106 		l2arc_hdr_stat_remove();
   1107 }
   1108 
   1109 void
   1110 arc_space_consume(uint64_t space, arc_space_type_t type)
   1111 {
   1112 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
   1113 
   1114 	switch (type) {
   1115 	case ARC_SPACE_DATA:
   1116 		ARCSTAT_INCR(arcstat_data_size, space);
   1117 		break;
   1118 	case ARC_SPACE_OTHER:
   1119 		ARCSTAT_INCR(arcstat_other_size, space);
   1120 		break;
   1121 	case ARC_SPACE_HDRS:
   1122 		ARCSTAT_INCR(arcstat_hdr_size, space);
   1123 		break;
   1124 	case ARC_SPACE_L2HDRS:
   1125 		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
   1126 		break;
   1127 	}
   1128 
   1129 	atomic_add_64(&arc_meta_used, space);
   1130 	atomic_add_64(&arc_size, space);
   1131 }
   1132 
   1133 void
   1134 arc_space_return(uint64_t space, arc_space_type_t type)
   1135 {
   1136 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
   1137 
   1138 	switch (type) {
   1139 	case ARC_SPACE_DATA:
   1140 		ARCSTAT_INCR(arcstat_data_size, -space);
   1141 		break;
   1142 	case ARC_SPACE_OTHER:
   1143 		ARCSTAT_INCR(arcstat_other_size, -space);
   1144 		break;
   1145 	case ARC_SPACE_HDRS:
   1146 		ARCSTAT_INCR(arcstat_hdr_size, -space);
   1147 		break;
   1148 	case ARC_SPACE_L2HDRS:
   1149 		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
   1150 		break;
   1151 	}
   1152 
   1153 	ASSERT(arc_meta_used >= space);
   1154 	if (arc_meta_max < arc_meta_used)
   1155 		arc_meta_max = arc_meta_used;
   1156 	atomic_add_64(&arc_meta_used, -space);
   1157 	ASSERT(arc_size >= space);
   1158 	atomic_add_64(&arc_size, -space);
   1159 }
   1160 
   1161 void *
   1162 arc_data_buf_alloc(uint64_t size)
   1163 {
   1164 	if (arc_evict_needed(ARC_BUFC_DATA))
   1165 		cv_signal(&arc_reclaim_thr_cv);
   1166 	atomic_add_64(&arc_size, size);
   1167 	return (zio_data_buf_alloc(size));
   1168 }
   1169 
   1170 void
   1171 arc_data_buf_free(void *buf, uint64_t size)
   1172 {
   1173 	zio_data_buf_free(buf, size);
   1174 	ASSERT(arc_size >= size);
   1175 	atomic_add_64(&arc_size, -size);
   1176 }
   1177 
   1178 arc_buf_t *
   1179 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
   1180 {
   1181 	arc_buf_hdr_t *hdr;
   1182 	arc_buf_t *buf;
   1183 
   1184 	ASSERT3U(size, >, 0);
   1185 	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
   1186 	ASSERT(BUF_EMPTY(hdr));
   1187 	hdr->b_size = size;
   1188 	hdr->b_type = type;
   1189 	hdr->b_spa = spa_guid(spa);
   1190 	hdr->b_state = arc_anon;
   1191 	hdr->b_arc_access = 0;
   1192 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
   1193 	buf->b_hdr = hdr;
   1194 	buf->b_data = NULL;
   1195 	buf->b_efunc = NULL;
   1196 	buf->b_private = NULL;
   1197 	buf->b_next = NULL;
   1198 	hdr->b_buf = buf;
   1199 	arc_get_data_buf(buf);
   1200 	hdr->b_datacnt = 1;
   1201 	hdr->b_flags = 0;
   1202 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
   1203 	(void) refcount_add(&hdr->b_refcnt, tag);
   1204 
   1205 	return (buf);
   1206 }
   1207 
/*
 * Refcount tag that holds a buffer while it is on loan; taken in
 * arc_loan_buf() and swapped for the caller's tag in arc_return_buf().
 */
static char *arc_onloan_tag = "onloan";
   1209 
   1210 /*
   1211  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
   1212  * flight data by arc_tempreserve_space() until they are "returned". Loaned
   1213  * buffers must be returned to the arc before they can be used by the DMU or
   1214  * freed.
   1215  */
   1216 arc_buf_t *
   1217 arc_loan_buf(spa_t *spa, int size)
   1218 {
   1219 	arc_buf_t *buf;
   1220 
   1221 	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
   1222 
   1223 	atomic_add_64(&arc_loaned_bytes, size);
   1224 	return (buf);
   1225 }
   1226 
   1227 /*
   1228  * Return a loaned arc buffer to the arc.
   1229  */
   1230 void
   1231 arc_return_buf(arc_buf_t *buf, void *tag)
   1232 {
   1233 	arc_buf_hdr_t *hdr = buf->b_hdr;
   1234 
   1235 	ASSERT(hdr->b_state == arc_anon);
   1236 	ASSERT(buf->b_data != NULL);
   1237 	VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
   1238 	VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
   1239 
   1240 	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
   1241 }
   1242 
   1243 static arc_buf_t *
   1244 arc_buf_clone(arc_buf_t *from)
   1245 {
   1246 	arc_buf_t *buf;
   1247 	arc_buf_hdr_t *hdr = from->b_hdr;
   1248 	uint64_t size = hdr->b_size;
   1249 
   1250 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
   1251 	buf->b_hdr = hdr;
   1252 	buf->b_data = NULL;
   1253 	buf->b_efunc = NULL;
   1254 	buf->b_private = NULL;
   1255 	buf->b_next = hdr->b_buf;
   1256 	hdr->b_buf = buf;
   1257 	arc_get_data_buf(buf);
   1258 	bcopy(from->b_data, buf->b_data, size);
   1259 	hdr->b_datacnt += 1;
   1260 	return (buf);
   1261 }
   1262 
   1263 void
   1264 arc_buf_add_ref(arc_buf_t *buf, void* tag)
   1265 {
   1266 	arc_buf_hdr_t *hdr;
   1267 	kmutex_t *hash_lock;
   1268 
   1269 	/*
   1270 	 * Check to see if this buffer is evicted.  Callers
   1271 	 * must verify b_data != NULL to know if the add_ref
   1272 	 * was successful.
   1273 	 */
   1274 	rw_enter(&buf->b_lock, RW_READER);
   1275 	if (buf->b_data == NULL) {
   1276 		rw_exit(&buf->b_lock);
   1277 		return;
   1278 	}
   1279 	hdr = buf->b_hdr;
   1280 	ASSERT(hdr != NULL);
   1281 	hash_lock = HDR_LOCK(hdr);
   1282 	mutex_enter(hash_lock);
   1283 	rw_exit(&buf->b_lock);
   1284 
   1285 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
   1286 	add_reference(hdr, hash_lock, tag);
   1287 	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
   1288 	arc_access(hdr, hash_lock);
   1289 	mutex_exit(hash_lock);
   1290 	ARCSTAT_BUMP(arcstat_hits);
   1291 	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
   1292 	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
   1293 	    data, metadata, hits);
   1294 }
   1295 
   1296 /*
   1297  * Free the arc data buffer.  If it is an l2arc write in progress,
   1298  * the buffer is placed on l2arc_free_on_write to be freed later.
   1299  */
   1300 static void
   1301 arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
   1302     void *data, size_t size)
   1303 {
   1304 	if (HDR_L2_WRITING(hdr)) {
   1305 		l2arc_data_free_t *df;
   1306 		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
   1307 		df->l2df_data = data;
   1308 		df->l2df_size = size;
   1309 		df->l2df_func = free_func;
   1310 		mutex_enter(&l2arc_free_on_write_mtx);
   1311 		list_insert_head(l2arc_free_on_write, df);
   1312 		mutex_exit(&l2arc_free_on_write_mtx);
   1313 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
   1314 	} else {
   1315 		free_func(data, size);
   1316 	}
   1317 }
   1318 
   1319 static void
   1320 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
   1321 {
   1322 	arc_buf_t **bufp;
   1323 
   1324 	/* free up data associated with the buf */
   1325 	if (buf->b_data) {
   1326 		arc_state_t *state = buf->b_hdr->b_state;
   1327 		uint64_t size = buf->b_hdr->b_size;
   1328 		arc_buf_contents_t type = buf->b_hdr->b_type;
   1329 
   1330 		arc_cksum_verify(buf);
   1331 		if (!recycle) {
   1332 			if (type == ARC_BUFC_METADATA) {
   1333 				arc_buf_data_free(buf->b_hdr, zio_buf_free,
   1334 				    buf->b_data, size);
   1335 				arc_space_return(size, ARC_SPACE_DATA);
   1336 			} else {
   1337 				ASSERT(type == ARC_BUFC_DATA);
   1338 				arc_buf_data_free(buf->b_hdr,
   1339 				    zio_data_buf_free, buf->b_data, size);
   1340 				ARCSTAT_INCR(arcstat_data_size, -size);
   1341 				atomic_add_64(&arc_size, -size);
   1342 			}
   1343 		}
   1344 		if (list_link_active(&buf->b_hdr->b_arc_node)) {
   1345 			uint64_t *cnt = &state->arcs_lsize[type];
   1346 
   1347 			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
   1348 			ASSERT(state != arc_anon);
   1349 
   1350 			ASSERT3U(*cnt, >=, size);
   1351 			atomic_add_64(cnt, -size);
   1352 		}
   1353 		ASSERT3U(state->arcs_size, >=, size);
   1354 		atomic_add_64(&state->arcs_size, -size);
   1355 		buf->b_data = NULL;
   1356 		ASSERT(buf->b_hdr->b_datacnt > 0);
   1357 		buf->b_hdr->b_datacnt -= 1;
   1358 	}
   1359 
   1360 	/* only remove the buf if requested */
   1361 	if (!all)
   1362 		return;
   1363 
   1364 	/* remove the buf from the hdr list */
   1365 	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
   1366 		continue;
   1367 	*bufp = buf->b_next;
   1368 
   1369 	ASSERT(buf->b_efunc == NULL);
   1370 
   1371 	/* clean up the buf */
   1372 	buf->b_hdr = NULL;
   1373 	kmem_cache_free(buf_cache, buf);
   1374 }
   1375 
   1376 static void
   1377 arc_hdr_destroy(arc_buf_hdr_t *hdr)
   1378 {
   1379 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
   1380 	ASSERT3P(hdr->b_state, ==, arc_anon);
   1381 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
   1382 	ASSERT(!(hdr->b_flags & ARC_STORED));
   1383 
   1384 	if (hdr->b_l2hdr != NULL) {
   1385 		if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
   1386 			/*
   1387 			 * To prevent arc_free() and l2arc_evict() from
   1388 			 * attempting to free the same buffer at the same time,
   1389 			 * a FREE_IN_PROGRESS flag is given to arc_free() to
   1390 			 * give it priority.  l2arc_evict() can't destroy this
   1391 			 * header while we are waiting on l2arc_buflist_mtx.
   1392 			 *
   1393 			 * The hdr may be removed from l2ad_buflist before we
   1394 			 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
   1395 			 */
   1396 			mutex_enter(&l2arc_buflist_mtx);
   1397 			if (hdr->b_l2hdr != NULL) {
   1398 				list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
   1399 				    hdr);
   1400 			}
   1401 			mutex_exit(&l2arc_buflist_mtx);
   1402 		} else {
   1403 			list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
   1404 		}
   1405 		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
   1406 		kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
   1407 		if (hdr->b_state == arc_l2c_only)
   1408 			l2arc_hdr_stat_remove();
   1409 		hdr->b_l2hdr = NULL;
   1410 	}
   1411 
   1412 	if (!BUF_EMPTY(hdr)) {
   1413 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
   1414 		bzero(&hdr->b_dva, sizeof (dva_t));
   1415 		hdr->b_birth = 0;
   1416 		hdr->b_cksum0 = 0;
   1417 	}
   1418 	while (hdr->b_buf) {
   1419 		arc_buf_t *buf = hdr->b_buf;
   1420 
   1421 		if (buf->b_efunc) {
   1422 			mutex_enter(&arc_eviction_mtx);
   1423 			rw_enter(&buf->b_lock, RW_WRITER);
   1424 			ASSERT(buf->b_hdr != NULL);
   1425 			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
   1426 			hdr->b_buf = buf->b_next;
   1427 			buf->b_hdr = &arc_eviction_hdr;
   1428 			buf->b_next = arc_eviction_list;
   1429 			arc_eviction_list = buf;
   1430 			rw_exit(&buf->b_lock);
   1431 			mutex_exit(&arc_eviction_mtx);
   1432 		} else {
   1433 			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
   1434 		}
   1435 	}
   1436 	if (hdr->b_freeze_cksum != NULL) {
   1437 		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
   1438 		hdr->b_freeze_cksum = NULL;
   1439 	}
   1440 
   1441 	ASSERT(!list_link_active(&hdr->b_arc_node));
   1442 	ASSERT3P(hdr->b_hash_next, ==, NULL);
   1443 	ASSERT3P(hdr->b_acb, ==, NULL);
   1444 	kmem_cache_free(hdr_cache, hdr);
   1445 }
   1446 
   1447 void
   1448 arc_buf_free(arc_buf_t *buf, void *tag)
   1449 {
   1450 	arc_buf_hdr_t *hdr = buf->b_hdr;
   1451 	int hashed = hdr->b_state != arc_anon;
   1452 
   1453 	ASSERT(buf->b_efunc == NULL);
   1454 	ASSERT(buf->b_data != NULL);
   1455 
   1456 	if (hashed) {
   1457 		kmutex_t *hash_lock = HDR_LOCK(hdr);
   1458 
   1459 		mutex_enter(hash_lock);
   1460 		(void) remove_reference(hdr, hash_lock, tag);
   1461 		if (hdr->b_datacnt > 1)
   1462 			arc_buf_destroy(buf, FALSE, TRUE);
   1463 		else
   1464 			hdr->b_flags |= ARC_BUF_AVAILABLE;
   1465 		mutex_exit(hash_lock);
   1466 	} else if (HDR_IO_IN_PROGRESS(hdr)) {
   1467 		int destroy_hdr;
   1468 		/*
   1469 		 * We are in the middle of an async write.  Don't destroy
   1470 		 * this buffer unless the write completes before we finish
   1471 		 * decrementing the reference count.
   1472 		 */
   1473 		mutex_enter(&arc_eviction_mtx);
   1474 		(void) remove_reference(hdr, NULL, tag);
   1475 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
   1476 		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
   1477 		mutex_exit(&arc_eviction_mtx);
   1478 		if (destroy_hdr)
   1479 			arc_hdr_destroy(hdr);
   1480 	} else {
   1481 		if (remove_reference(hdr, NULL, tag) > 0) {
   1482 			ASSERT(HDR_IO_ERROR(hdr));
   1483 			arc_buf_destroy(buf, FALSE, TRUE);
   1484 		} else {
   1485 			arc_hdr_destroy(hdr);
   1486 		}
   1487 	}
   1488 }
   1489 
   1490 int
   1491 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
   1492 {
   1493 	arc_buf_hdr_t *hdr = buf->b_hdr;
   1494 	kmutex_t *hash_lock = HDR_LOCK(hdr);
   1495 	int no_callback = (buf->b_efunc == NULL);
   1496 
   1497 	if (hdr->b_state == arc_anon) {
   1498 		arc_buf_free(buf, tag);
   1499 		return (no_callback);
   1500 	}
   1501 
   1502 	mutex_enter(hash_lock);
   1503 	ASSERT(hdr->b_state != arc_anon);
   1504 	ASSERT(buf->b_data != NULL);
   1505 
   1506 	(void) remove_reference(hdr, hash_lock, tag);
   1507 	if (hdr->b_datacnt > 1) {
   1508 		if (no_callback)
   1509 			arc_buf_destroy(buf, FALSE, TRUE);
   1510 	} else if (no_callback) {
   1511 		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
   1512 		hdr->b_flags |= ARC_BUF_AVAILABLE;
   1513 	}
   1514 	ASSERT(no_callback || hdr->b_datacnt > 1 ||
   1515 	    refcount_is_zero(&hdr->b_refcnt));
   1516 	mutex_exit(hash_lock);
   1517 	return (no_callback);
   1518 }
   1519 
   1520 int
   1521 arc_buf_size(arc_buf_t *buf)
   1522 {
   1523 	return (buf->b_hdr->b_size);
   1524 }
   1525 
   1526 /*
   1527  * Evict buffers from list until we've removed the specified number of
   1528  * bytes.  Move the removed buffers to the appropriate evict state.
   1529  * If the recycle flag is set, then attempt to "recycle" a buffer:
   1530  * - look for a buffer to evict that is `bytes' long.
   1531  * - return the data block from this buffer rather than freeing it.
   1532  * This flag is used by callers that are trying to make space for a
   1533  * new buffer in a full arc cache.
   1534  *
   1535  * This function makes a "best effort".  It skips over any buffers
   1536  * it can't get a hash_lock on, and so may not catch all candidates.
   1537  * It may also return without evicting as much space as requested.
   1538  */
   1539 static void *
   1540 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
   1541     arc_buf_contents_t type)
   1542 {
   1543 	arc_state_t *evicted_state;
   1544 	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
   1545 	arc_buf_hdr_t *ab, *ab_prev = NULL;
   1546 	list_t *list = &state->arcs_list[type];
   1547 	kmutex_t *hash_lock;
   1548 	boolean_t have_lock;
   1549 	void *stolen = NULL;
   1550 
   1551 	ASSERT(state == arc_mru || state == arc_mfu);
   1552 
   1553 	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
   1554 
   1555 	mutex_enter(&state->arcs_mtx);
   1556 	mutex_enter(&evicted_state->arcs_mtx);
   1557 
   1558 	for (ab = list_tail(list); ab; ab = ab_prev) {
   1559 		ab_prev = list_prev(list, ab);
   1560 		/* prefetch buffers have a minimum lifespan */
   1561 		if (HDR_IO_IN_PROGRESS(ab) ||
   1562 		    (spa && ab->b_spa != spa) ||
   1563 		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
   1564 		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
   1565 			skipped++;
   1566 			continue;
   1567 		}
   1568 		/* "lookahead" for better eviction candidate */
   1569 		if (recycle && ab->b_size != bytes &&
   1570 		    ab_prev && ab_prev->b_size == bytes)
   1571 			continue;
   1572 		hash_lock = HDR_LOCK(ab);
   1573 		have_lock = MUTEX_HELD(hash_lock);
   1574 		if (have_lock || mutex_tryenter(hash_lock)) {
   1575 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
   1576 			ASSERT(ab->b_datacnt > 0);
   1577 			while (ab->b_buf) {
   1578 				arc_buf_t *buf = ab->b_buf;
   1579 				if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
   1580 					missed += 1;
   1581 					break;
   1582 				}
   1583 				if (buf->b_data) {
   1584 					bytes_evicted += ab->b_size;
   1585 					if (recycle && ab->b_type == type &&
   1586 					    ab->b_size == bytes &&
   1587 					    !HDR_L2_WRITING(ab)) {
   1588 						stolen = buf->b_data;
   1589 						recycle = FALSE;
   1590 					}
   1591 				}
   1592 				if (buf->b_efunc) {
   1593 					mutex_enter(&arc_eviction_mtx);
   1594 					arc_buf_destroy(buf,
   1595 					    buf->b_data == stolen, FALSE);
   1596 					ab->b_buf = buf->b_next;
   1597 					buf->b_hdr = &arc_eviction_hdr;
   1598 					buf->b_next = arc_eviction_list;
   1599 					arc_eviction_list = buf;
   1600 					mutex_exit(&arc_eviction_mtx);
   1601 					rw_exit(&buf->b_lock);
   1602 				} else {
   1603 					rw_exit(&buf->b_lock);
   1604 					arc_buf_destroy(buf,
   1605 					    buf->b_data == stolen, TRUE);
   1606 				}
   1607 			}
   1608 			if (ab->b_datacnt == 0) {
   1609 				arc_change_state(evicted_state, ab, hash_lock);
   1610 				ASSERT(HDR_IN_HASH_TABLE(ab));
   1611 				ab->b_flags |= ARC_IN_HASH_TABLE;
   1612 				ab->b_flags &= ~ARC_BUF_AVAILABLE;
   1613 				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
   1614 			}
   1615 			if (!have_lock)
   1616 				mutex_exit(hash_lock);
   1617 			if (bytes >= 0 && bytes_evicted >= bytes)
   1618 				break;
   1619 		} else {
   1620 			missed += 1;
   1621 		}
   1622 	}
   1623 
   1624 	mutex_exit(&evicted_state->arcs_mtx);
   1625 	mutex_exit(&state->arcs_mtx);
   1626 
   1627 	if (bytes_evicted < bytes)
   1628 		dprintf("only evicted %lld bytes from %x",
   1629 		    (longlong_t)bytes_evicted, state);
   1630 
   1631 	if (skipped)
   1632 		ARCSTAT_INCR(arcstat_evict_skip, skipped);
   1633 
   1634 	if (missed)
   1635 		ARCSTAT_INCR(arcstat_mutex_miss, missed);
   1636 
   1637 	/*
   1638 	 * We have just evicted some date into the ghost state, make
   1639 	 * sure we also adjust the ghost state size if necessary.
   1640 	 */
   1641 	if (arc_no_grow &&
   1642 	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
   1643 		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
   1644 		    arc_mru_ghost->arcs_size - arc_c;
   1645 
   1646 		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
   1647 			int64_t todelete =
   1648 			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
   1649 			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
   1650 		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
   1651 			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
   1652 			    arc_mru_ghost->arcs_size +
   1653 			    arc_mfu_ghost->arcs_size - arc_c);
   1654 			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
   1655 		}
   1656 	}
   1657 
   1658 	return (stolen);
   1659 }
   1660 
   1661 /*
   1662  * Remove buffers from list until we've removed the specified number of
   1663  * bytes.  Destroy the buffers that are removed.
   1664  */
   1665 static void
   1666 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
   1667 {
   1668 	arc_buf_hdr_t *ab, *ab_prev;
   1669 	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
   1670 	kmutex_t *hash_lock;
   1671 	uint64_t bytes_deleted = 0;
   1672 	uint64_t bufs_skipped = 0;
   1673 
   1674 	ASSERT(GHOST_STATE(state));
   1675 top:
   1676 	mutex_enter(&state->arcs_mtx);
   1677 	for (ab = list_tail(list); ab; ab = ab_prev) {
   1678 		ab_prev = list_prev(list, ab);
   1679 		if (spa && ab->b_spa != spa)
   1680 			continue;
   1681 		hash_lock = HDR_LOCK(ab);
   1682 		if (mutex_tryenter(hash_lock)) {
   1683 			ASSERT(!HDR_IO_IN_PROGRESS(ab));
   1684 			ASSERT(ab->b_buf == NULL);
   1685 			ARCSTAT_BUMP(arcstat_deleted);
   1686 			bytes_deleted += ab->b_size;
   1687 
   1688 			if (ab->b_l2hdr != NULL) {
   1689 				/*
   1690 				 * This buffer is cached on the 2nd Level ARC;
   1691 				 * don't destroy the header.
   1692 				 */
   1693 				arc_change_state(arc_l2c_only, ab, hash_lock);
   1694 				mutex_exit(hash_lock);
   1695 			} else {
   1696 				arc_change_state(arc_anon, ab, hash_lock);
   1697 				mutex_exit(hash_lock);
   1698 				arc_hdr_destroy(ab);
   1699 			}
   1700 
   1701 			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
   1702 			if (bytes >= 0 && bytes_deleted >= bytes)
   1703 				break;
   1704 		} else {
   1705 			if (bytes < 0) {
   1706 				mutex_exit(&state->arcs_mtx);
   1707 				mutex_enter(hash_lock);
   1708 				mutex_exit(hash_lock);
   1709 				goto top;
   1710 			}
   1711 			bufs_skipped += 1;
   1712 		}
   1713 	}
   1714 	mutex_exit(&state->arcs_mtx);
   1715 
   1716 	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
   1717 	    (bytes < 0 || bytes_deleted < bytes)) {
   1718 		list = &state->arcs_list[ARC_BUFC_METADATA];
   1719 		goto top;
   1720 	}
   1721 
   1722 	if (bufs_skipped) {
   1723 		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
   1724 		ASSERT(bytes >= 0);
   1725 	}
   1726 
   1727 	if (bytes_deleted < bytes)
   1728 		dprintf("only deleted %lld bytes from %p",
   1729 		    (longlong_t)bytes_deleted, state);
   1730 }
   1731 
   1732 static void
   1733 arc_adjust(void)
   1734 {
   1735 	int64_t adjustment, delta;
   1736 
   1737 	/*
   1738 	 * Adjust MRU size
   1739 	 */
   1740 
   1741 	adjustment = MIN(arc_size - arc_c,
   1742 	    arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
   1743 
   1744 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
   1745 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
   1746 		(void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
   1747 		adjustment -= delta;
   1748 	}
   1749 
   1750 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
   1751 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
   1752 		(void) arc_evict(arc_mru, NULL, delta, FALSE,
   1753 		    ARC_BUFC_METADATA);
   1754 	}
   1755 
   1756 	/*
   1757 	 * Adjust MFU size
   1758 	 */
   1759 
   1760 	adjustment = arc_size - arc_c;
   1761 
   1762 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
   1763 		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
   1764 		(void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
   1765 		adjustment -= delta;
   1766 	}
   1767 
   1768 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
   1769 		int64_t delta = MIN(adjustment,
   1770 		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
   1771 		(void) arc_evict(arc_mfu, NULL, delta, FALSE,
   1772 		    ARC_BUFC_METADATA);
   1773 	}
   1774 
   1775 	/*
   1776 	 * Adjust ghost lists
   1777 	 */
   1778 
   1779 	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
   1780 
   1781 	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
   1782 		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
   1783 		arc_evict_ghost(arc_mru_ghost, NULL, delta);
   1784 	}
   1785 
   1786 	adjustment =
   1787 	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
   1788 
   1789 	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
   1790 		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
   1791 		arc_evict_ghost(arc_mfu_ghost, NULL, delta);
   1792 	}
   1793 }
   1794 
   1795 static void
   1796 arc_do_user_evicts(void)
   1797 {
   1798 	mutex_enter(&arc_eviction_mtx);
   1799 	while (arc_eviction_list != NULL) {
   1800 		arc_buf_t *buf = arc_eviction_list;
   1801 		arc_eviction_list = buf->b_next;
   1802 		rw_enter(&buf->b_lock, RW_WRITER);
   1803 		buf->b_hdr = NULL;
   1804 		rw_exit(&buf->b_lock);
   1805 		mutex_exit(&arc_eviction_mtx);
   1806 
   1807 		if (buf->b_efunc != NULL)
   1808 			VERIFY(buf->b_efunc(buf) == 0);
   1809 
   1810 		buf->b_efunc = NULL;
   1811 		buf->b_private = NULL;
   1812 		kmem_cache_free(buf_cache, buf);
   1813 		mutex_enter(&arc_eviction_mtx);
   1814 	}
   1815 	mutex_exit(&arc_eviction_mtx);
   1816 }
   1817 
   1818 /*
   1819  * Flush all *evictable* data from the cache for the given spa.
   1820  * NOTE: this will not touch "active" (i.e. referenced) data.
   1821  */
   1822 void
   1823 arc_flush(spa_t *spa)
   1824 {
   1825 	uint64_t guid = 0;
   1826 
   1827 	if (spa)
   1828 		guid = spa_guid(spa);
   1829 
   1830 	while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
   1831 		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
   1832 		if (spa)
   1833 			break;
   1834 	}
   1835 	while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
   1836 		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
   1837 		if (spa)
   1838 			break;
   1839 	}
   1840 	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
   1841 		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
   1842 		if (spa)
   1843 			break;
   1844 	}
   1845 	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
   1846 		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
   1847 		if (spa)
   1848 			break;
   1849 	}
   1850 
   1851 	arc_evict_ghost(arc_mru_ghost, guid, -1);
   1852 	arc_evict_ghost(arc_mfu_ghost, guid, -1);
   1853 
   1854 	mutex_enter(&arc_reclaim_thr_lock);
   1855 	arc_do_user_evicts();
   1856 	mutex_exit(&arc_reclaim_thr_lock);
   1857 	ASSERT(spa || arc_eviction_list == NULL);
   1858 }
   1859 
   1860 void
   1861 arc_shrink(void)
   1862 {
   1863 	if (arc_c > arc_c_min) {
   1864 		uint64_t to_free;
   1865 
   1866 #ifdef _KERNEL
   1867 		to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
   1868 #else
   1869 		to_free = arc_c >> arc_shrink_shift;
   1870 #endif
   1871 		if (arc_c > arc_c_min + to_free)
   1872 			atomic_add_64(&arc_c, -to_free);
   1873 		else
   1874 			arc_c = arc_c_min;
   1875 
   1876 		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
   1877 		if (arc_c > arc_size)
   1878 			arc_c = MAX(arc_size, arc_c_min);
   1879 		if (arc_p > arc_c)
   1880 			arc_p = (arc_c >> 1);
   1881 		ASSERT(arc_c >= arc_c_min);
   1882 		ASSERT((int64_t)arc_p >= 0);
   1883 	}
   1884 
   1885 	if (arc_size > arc_c)
   1886 		arc_adjust();
   1887 }
   1888 
   1889 static int
   1890 arc_reclaim_needed(void)
   1891 {
   1892 	uint64_t extra;
   1893 
   1894 #ifdef _KERNEL
   1895 
   1896 	if (needfree)
   1897 		return (1);
   1898 
   1899 	/*
   1900 	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
   1901 	 */
   1902 	extra = desfree;
   1903 
   1904 	/*
   1905 	 * check that we're out of range of the pageout scanner.  It starts to
   1906 	 * schedule paging if freemem is less than lotsfree and needfree.
   1907 	 * lotsfree is the high-water mark for pageout, and needfree is the
   1908 	 * number of needed free pages.  We add extra pages here to make sure
   1909 	 * the scanner doesn't start up while we're freeing memory.
   1910 	 */
   1911 	if (freemem < lotsfree + needfree + extra)
   1912 		return (1);
   1913 
   1914 	/*
   1915 	 * check to make sure that swapfs has enough space so that anon
   1916 	 * reservations can still succeed. anon_resvmem() checks that the
   1917 	 * availrmem is greater than swapfs_minfree, and the number of reserved
   1918 	 * swap pages.  We also add a bit of extra here just to prevent
   1919 	 * circumstances from getting really dire.
   1920 	 */
   1921 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
   1922 		return (1);
   1923 
   1924 #if defined(__i386)
   1925 	/*
   1926 	 * If we're on an i386 platform, it's possible that we'll exhaust the
   1927 	 * kernel heap space before we ever run out of available physical
   1928 	 * memory.  Most checks of the size of the heap_area compare against
   1929 	 * tune.t_minarmem, which is the minimum available real memory that we
   1930 	 * can have in the system.  However, this is generally fixed at 25 pages
   1931 	 * which is so low that it's useless.  In this comparison, we seek to
   1932 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
   1933 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
   1934 	 * free)
   1935 	 */
   1936 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
   1937 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
   1938 		return (1);
   1939 #endif
   1940 
   1941 #else
   1942 	if (spa_get_random(100) == 0)
   1943 		return (1);
   1944 #endif
   1945 	return (0);
   1946 }
   1947 
   1948 static void
   1949 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
   1950 {
   1951 	size_t			i;
   1952 	kmem_cache_t		*prev_cache = NULL;
   1953 	kmem_cache_t		*prev_data_cache = NULL;
   1954 	extern kmem_cache_t	*zio_buf_cache[];
   1955 	extern kmem_cache_t	*zio_data_buf_cache[];
   1956 
   1957 #ifdef _KERNEL
   1958 	if (arc_meta_used >= arc_meta_limit) {
   1959 		/*
   1960 		 * We are exceeding our meta-data cache limit.
   1961 		 * Purge some DNLC entries to release holds on meta-data.
   1962 		 */
   1963 		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
   1964 	}
   1965 #if defined(__i386)
   1966 	/*
   1967 	 * Reclaim unused memory from all kmem caches.
   1968 	 */
   1969 	kmem_reap();
   1970 #endif
   1971 #endif
   1972 
   1973 	/*
   1974 	 * An aggressive reclamation will shrink the cache size as well as
   1975 	 * reap free buffers from the arc kmem caches.
   1976 	 */
   1977 	if (strat == ARC_RECLAIM_AGGR)
   1978 		arc_shrink();
   1979 
   1980 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
   1981 		if (zio_buf_cache[i] != prev_cache) {
   1982 			prev_cache = zio_buf_cache[i];
   1983 			kmem_cache_reap_now(zio_buf_cache[i]);
   1984 		}
   1985 		if (zio_data_buf_cache[i] != prev_data_cache) {
   1986 			prev_data_cache = zio_data_buf_cache[i];
   1987 			kmem_cache_reap_now(zio_data_buf_cache[i]);
   1988 		}
   1989 	}
   1990 	kmem_cache_reap_now(buf_cache);
   1991 	kmem_cache_reap_now(hdr_cache);
   1992 }
   1993 
   1994 static void
   1995 arc_reclaim_thread(void)
   1996 {
   1997 	clock_t			growtime = 0;
   1998 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
   1999 	callb_cpr_t		cpr;
   2000 
   2001 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
   2002 
   2003 	mutex_enter(&arc_reclaim_thr_lock);
   2004 	while (arc_thread_exit == 0) {
   2005 		if (arc_reclaim_needed()) {
   2006 
   2007 			if (arc_no_grow) {
   2008 				if (last_reclaim == ARC_RECLAIM_CONS) {
   2009 					last_reclaim = ARC_RECLAIM_AGGR;
   2010 				} else {
   2011 					last_reclaim = ARC_RECLAIM_CONS;
   2012 				}
   2013 			} else {
   2014 				arc_no_grow = TRUE;
   2015 				last_reclaim = ARC_RECLAIM_AGGR;
   2016 				membar_producer();
   2017 			}
   2018 
   2019 			/* reset the growth delay for every reclaim */
   2020 			growtime = lbolt + (arc_grow_retry * hz);
   2021 
   2022 			arc_kmem_reap_now(last_reclaim);
   2023 			arc_warm = B_TRUE;
   2024 
   2025 		} else if (arc_no_grow && lbolt >= growtime) {
   2026 			arc_no_grow = FALSE;
   2027 		}
   2028 
   2029 		if (2 * arc_c < arc_size +
   2030 		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
   2031 			arc_adjust();
   2032 
   2033 		if (arc_eviction_list != NULL)
   2034 			arc_do_user_evicts();
   2035 
   2036 		/* block until needed, or one second, whichever is shorter */
   2037 		CALLB_CPR_SAFE_BEGIN(&cpr);
   2038 		(void) cv_timedwait(&arc_reclaim_thr_cv,
   2039 		    &arc_reclaim_thr_lock, (lbolt + hz));
   2040 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
   2041 	}
   2042 
   2043 	arc_thread_exit = 0;
   2044 	cv_broadcast(&arc_reclaim_thr_cv);
   2045 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
   2046 	thread_exit();
   2047 }
   2048 
   2049 /*
   2050  * Adapt arc info given the number of bytes we are trying to add and
   2051  * the state that we are comming from.  This function is only called
   2052  * when we are adding new content to the cache.
   2053  */
   2054 static void
   2055 arc_adapt(int bytes, arc_state_t *state)
   2056 {
   2057 	int mult;
   2058 	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
   2059 
   2060 	if (state == arc_l2c_only)
   2061 		return;
   2062 
   2063 	ASSERT(bytes > 0);
   2064 	/*
   2065 	 * Adapt the target size of the MRU list:
   2066 	 *	- if we just hit in the MRU ghost list, then increase
   2067 	 *	  the target size of the MRU list.
   2068 	 *	- if we just hit in the MFU ghost list, then increase
   2069 	 *	  the target size of the MFU list by decreasing the
   2070 	 *	  target size of the MRU list.
   2071 	 */
   2072 	if (state == arc_mru_ghost) {
   2073 		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
   2074 		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
   2075 
   2076 		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
   2077 	} else if (state == arc_mfu_ghost) {
   2078 		uint64_t delta;
   2079 
   2080 		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
   2081 		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
   2082 
   2083 		delta = MIN(bytes * mult, arc_p);
   2084 		arc_p = MAX(arc_p_min, arc_p - delta);
   2085 	}
   2086 	ASSERT((int64_t)arc_p >= 0);
   2087 
   2088 	if (arc_reclaim_needed()) {
   2089 		cv_signal(&arc_reclaim_thr_cv);
   2090 		return;
   2091 	}
   2092 
   2093 	if (arc_no_grow)
   2094 		return;
   2095 
   2096 	if (arc_c >= arc_c_max)
   2097 		return;
   2098 
   2099 	/*
   2100 	 * If we're within (2 * maxblocksize) bytes of the target
   2101 	 * cache size, increment the target cache size
   2102 	 */
   2103 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
   2104 		atomic_add_64(&arc_c, (int64_t)bytes);
   2105 		if (arc_c > arc_c_max)
   2106 			arc_c = arc_c_max;
   2107 		else if (state == arc_anon)
   2108 			atomic_add_64(&arc_p, (int64_t)bytes);
   2109 		if (arc_p > arc_c)
   2110 			arc_p = arc_c;
   2111 	}
   2112 	ASSERT((int64_t)arc_p >= 0);
   2113 }
   2114 
   2115 /*
   2116  * Check if the cache has reached its limits and eviction is required
   2117  * prior to insert.
   2118  */
   2119 static int
   2120 arc_evict_needed(arc_buf_contents_t type)
   2121 {
   2122 	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
   2123 		return (1);
   2124 
   2125 #ifdef _KERNEL
   2126 	/*
   2127 	 * If zio data pages are being allocated out of a separate heap segment,
   2128 	 * then enforce that the size of available vmem for this area remains
   2129 	 * above about 1/32nd free.
   2130 	 */
   2131 	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
   2132 	    vmem_size(zio_arena, VMEM_FREE) <
   2133 	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
   2134 		return (1);
   2135 #endif
   2136 
   2137 	if (arc_reclaim_needed())
   2138 		return (1);
   2139 
   2140 	return (arc_size > arc_c);
   2141 }
   2142 
   2143 /*
   2144  * The buffer, supplied as the first argument, needs a data block.
   2145  * So, if we are at cache max, determine which cache should be victimized.
   2146  * We have the following cases:
   2147  *
   2148  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
   2149  * In this situation if we're out of space, but the resident size of the MFU is
   2150  * under the limit, victimize the MFU cache to satisfy this insertion request.
   2151  *
   2152  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
   2153  * Here, we've used up all of the available space for the MRU, so we need to
   2154  * evict from our own cache instead.  Evict from the set of resident MRU
   2155  * entries.
   2156  *
   2157  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
   2158  * c minus p represents the MFU space in the cache, since p is the size of the
   2159  * cache that is dedicated to the MRU.  In this situation there's still space on
   2160  * the MFU side, so the MRU side needs to be victimized.
   2161  *
   2162  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
   2163  * MFU's resident set is consuming more space than it has been allotted.  In
   2164  * this situation, we must victimize our own cache, the MFU, for this insertion.
   2165  */
   2166 static void
   2167 arc_get_data_buf(arc_buf_t *buf)
   2168 {
   2169 	arc_state_t		*state = buf->b_hdr->b_state;
   2170 	uint64_t		size = buf->b_hdr->b_size;
   2171 	arc_buf_contents_t	type = buf->b_hdr->b_type;
   2172 
   2173 	arc_adapt(size, state);
   2174 
   2175 	/*
   2176 	 * We have not yet reached cache maximum size,
   2177 	 * just allocate a new buffer.
   2178 	 */
   2179 	if (!arc_evict_needed(type)) {
   2180 		if (type == ARC_BUFC_METADATA) {
   2181 			buf->b_data = zio_buf_alloc(size);
   2182 			arc_space_consume(size, ARC_SPACE_DATA);
   2183 		} else {
   2184 			ASSERT(type == ARC_BUFC_DATA);
   2185 			buf->b_data = zio_data_buf_alloc(size);
   2186 			ARCSTAT_INCR(arcstat_data_size, size);
   2187 			atomic_add_64(&arc_size, size);
   2188 		}
   2189 		goto out;
   2190 	}
   2191 
   2192 	/*
   2193 	 * If we are prefetching from the mfu ghost list, this buffer
   2194 	 * will end up on the mru list; so steal space from there.
   2195 	 */
   2196 	if (state == arc_mfu_ghost)
   2197 		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
   2198 	else if (state == arc_mru_ghost)
   2199 		state = arc_mru;
   2200 
   2201 	if (state == arc_mru || state == arc_anon) {
   2202 		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
   2203 		state = (arc_mfu->arcs_lsize[type] >= size &&
   2204 		    arc_p > mru_used) ? arc_mfu : arc_mru;
   2205 	} else {
   2206 		/* MFU cases */
   2207 		uint64_t mfu_space = arc_c - arc_p;
   2208 		state =  (arc_mru->arcs_lsize[type] >= size &&
   2209 		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
   2210 	}
   2211 	if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
   2212 		if (type == ARC_BUFC_METADATA) {
   2213 			buf->b_data = zio_buf_alloc(size);
   2214 			arc_space_consume(size, ARC_SPACE_DATA);
   2215 		} else {
   2216 			ASSERT(type == ARC_BUFC_DATA);
   2217 			buf->b_data = zio_data_buf_alloc(size);
   2218 			ARCSTAT_INCR(arcstat_data_size, size);
   2219 			atomic_add_64(&arc_size, size);
   2220 		}
   2221 		ARCSTAT_BUMP(arcstat_recycle_miss);
   2222 	}
   2223 	ASSERT(buf->b_data != NULL);
   2224 out:
   2225 	/*
   2226 	 * Update the state size.  Note that ghost states have a
   2227 	 * "ghost size" and so don't need to be updated.
   2228 	 */
   2229 	if (!GHOST_STATE(buf->b_hdr->b_state)) {
   2230 		arc_buf_hdr_t *hdr = buf->b_hdr;
   2231 
   2232 		atomic_add_64(&hdr->b_state->arcs_size, size);
   2233 		if (list_link_active(&hdr->b_arc_node)) {
   2234 			ASSERT(refcount_is_zero(&hdr->b_refcnt));
   2235 			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
   2236 		}
   2237 		/*
   2238 		 * If we are growing the cache, and we are adding anonymous
   2239 		 * data, and we have outgrown arc_p, update arc_p
   2240 		 */
   2241 		if (arc_size < arc_c && hdr->b_state == arc_anon &&
   2242 		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
   2243 			arc_p = MIN(arc_c, arc_p + size);
   2244 	}
   2245 }
   2246 
   2247 /*
   2248  * This routine is called whenever a buffer is accessed.
   2249  * NOTE: the hash lock is dropped in this function.
   2250  */
   2251 static void
   2252 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
   2253 {
   2254 	ASSERT(MUTEX_HELD(hash_lock));
   2255 
   2256 	if (buf->b_state == arc_anon) {
   2257 		/*
   2258 		 * This buffer is not in the cache, and does not
   2259 		 * appear in our "ghost" list.  Add the new buffer
   2260 		 * to the MRU state.
   2261 		 */
   2262 
   2263 		ASSERT(buf->b_arc_access == 0);
   2264 		buf->b_arc_access = lbolt;
   2265 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
   2266 		arc_change_state(arc_mru, buf, hash_lock);
   2267 
   2268 	} else if (buf->b_state == arc_mru) {
   2269 		/*
   2270 		 * If this buffer is here because of a prefetch, then either:
   2271 		 * - clear the flag if this is a "referencing" read
   2272 		 *   (any subsequent access will bump this into the MFU state).
   2273 		 * or
   2274 		 * - move the buffer to the head of the list if this is
   2275 		 *   another prefetch (to make it less likely to be evicted).
   2276 		 */
   2277 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
   2278 			if (refcount_count(&buf->b_refcnt) == 0) {
   2279 				ASSERT(list_link_active(&buf->b_arc_node));
   2280 			} else {
   2281 				buf->b_flags &= ~ARC_PREFETCH;
   2282 				ARCSTAT_BUMP(arcstat_mru_hits);
   2283 			}
   2284 			buf->b_arc_access = lbolt;
   2285 			return;
   2286 		}
   2287 
   2288 		/*
   2289 		 * This buffer has been "accessed" only once so far,
   2290 		 * but it is still in the cache. Move it to the MFU
   2291 		 * state.
   2292 		 */
   2293 		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
   2294 			/*
   2295 			 * More than 125ms have passed since we
   2296 			 * instantiated this buffer.  Move it to the
   2297 			 * most frequently used state.
   2298 			 */
   2299 			buf->b_arc_access = lbolt;
   2300 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
   2301 			arc_change_state(arc_mfu, buf, hash_lock);
   2302 		}
   2303 		ARCSTAT_BUMP(arcstat_mru_hits);
   2304 	} else if (buf->b_state == arc_mru_ghost) {
   2305 		arc_state_t	*new_state;
   2306 		/*
   2307 		 * This buffer has been "accessed" recently, but
   2308 		 * was evicted from the cache.  Move it to the
   2309 		 * MFU state.
   2310 		 */
   2311 
   2312 		if (buf->b_flags & ARC_PREFETCH) {
   2313 			new_state = arc_mru;
   2314 			if (refcount_count(&buf->b_refcnt) > 0)
   2315 				buf->b_flags &= ~ARC_PREFETCH;
   2316 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
   2317 		} else {
   2318 			new_state = arc_mfu;
   2319 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
   2320 		}
   2321 
   2322 		buf->b_arc_access = lbolt;
   2323 		arc_change_state(new_state, buf, hash_lock);
   2324 
   2325 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
   2326 	} else if (buf->b_state == arc_mfu) {
   2327 		/*
   2328 		 * This buffer has been accessed more than once and is
   2329 		 * still in the cache.  Keep it in the MFU state.
   2330 		 *
   2331 		 * NOTE: an add_reference() that occurred when we did
   2332 		 * the arc_read() will have kicked this off the list.
   2333 		 * If it was a prefetch, we will explicitly move it to
   2334 		 * the head of the list now.
   2335 		 */
   2336 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
   2337 			ASSERT(refcount_count(&buf->b_refcnt) == 0);
   2338 			ASSERT(list_link_active(&buf->b_arc_node));
   2339 		}
   2340 		ARCSTAT_BUMP(arcstat_mfu_hits);
   2341 		buf->b_arc_access = lbolt;
   2342 	} else if (buf->b_state == arc_mfu_ghost) {
   2343 		arc_state_t	*new_state = arc_mfu;
   2344 		/*
   2345 		 * This buffer has been accessed more than once but has
   2346 		 * been evicted from the cache.  Move it back to the
   2347 		 * MFU state.
   2348 		 */
   2349 
   2350 		if (buf->b_flags & ARC_PREFETCH) {
   2351 			/*
   2352 			 * This is a prefetch access...
   2353 			 * move this block back to the MRU state.
   2354 			 */
   2355 			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
   2356 			new_state = arc_mru;
   2357 		}
   2358 
   2359 		buf->b_arc_access = lbolt;
   2360 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
   2361 		arc_change_state(new_state, buf, hash_lock);
   2362 
   2363 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
   2364 	} else if (buf->b_state == arc_l2c_only) {
   2365 		/*
   2366 		 * This buffer is on the 2nd Level ARC.
   2367 		 */
   2368 
   2369 		buf->b_arc_access = lbolt;
   2370 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
   2371 		arc_change_state(arc_mfu, buf, hash_lock);
   2372 	} else {
   2373 		ASSERT(!"invalid arc state");
   2374 	}
   2375 }
   2376 
   2377 /* a generic arc_done_func_t which you can use */
   2378 /* ARGSUSED */
   2379 void
   2380 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
   2381 {
   2382 	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
   2383 	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
   2384 }
   2385 
   2386 /* a generic arc_done_func_t */
   2387 void
   2388 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
   2389 {
   2390 	arc_buf_t **bufp = arg;
   2391 	if (zio && zio->io_error) {
   2392 		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
   2393 		*bufp = NULL;
   2394 	} else {
   2395 		*bufp = buf;
   2396 	}
   2397 }
   2398 
   2399 static void
   2400 arc_read_done(zio_t *zio)
   2401 {
   2402 	arc_buf_hdr_t	*hdr, *found;
   2403 	arc_buf_t	*buf;
   2404 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
   2405 	kmutex_t	*hash_lock;
   2406 	arc_callback_t	*callback_list, *acb;
   2407 	int		freeable = FALSE;
   2408 
   2409 	buf = zio->io_private;
   2410 	hdr = buf->b_hdr;
   2411 
   2412 	/*
   2413 	 * The hdr was inserted into hash-table and removed from lists
   2414 	 * prior to starting I/O.  We should find this header, since
   2415 	 * it's in the hash table, and it should be legit since it's
   2416 	 * not possible to evict it during the I/O.  The only possible
   2417 	 * reason for it not to be found is if we were freed during the
   2418 	 * read.
   2419 	 */
   2420 	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
   2421 	    &hash_lock);
   2422 
   2423 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
   2424 	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
   2425 	    (found == hdr && HDR_L2_READING(hdr)));
   2426 
   2427 	hdr->b_flags &= ~ARC_L2_EVICTED;
   2428 	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
   2429 		hdr->b_flags &= ~ARC_L2CACHE;
   2430 
   2431 	/* byteswap if necessary */
   2432 	callback_list = hdr->b_acb;
   2433 	ASSERT(callback_list != NULL);
   2434 	if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
   2435 		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
   2436 		    byteswap_uint64_array :
   2437 		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
   2438 		func(buf->b_data, hdr->b_size);
   2439 	}
   2440 
   2441 	arc_cksum_compute(buf, B_FALSE);
   2442 
   2443 	/* create copies of the data buffer for the callers */
   2444 	abuf = buf;
   2445 	for (acb = callback_list; acb; acb = acb->acb_next) {
   2446 		if (acb->acb_done) {
   2447 			if (abuf == NULL)
   2448 				abuf = arc_buf_clone(buf);
   2449 			acb->acb_buf = abuf;
   2450 			abuf = NULL;
   2451 		}
   2452 	}
   2453 	hdr->b_acb = NULL;
   2454 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
   2455 	ASSERT(!HDR_BUF_AVAILABLE(hdr));
   2456 	if (abuf == buf)
   2457 		hdr->b_flags |= ARC_BUF_AVAILABLE;
   2458 
   2459 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
   2460 
   2461 	if (zio->io_error != 0) {
   2462 		hdr->b_flags |= ARC_IO_ERROR;
   2463 		if (hdr->b_state != arc_anon)
   2464 			arc_change_state(arc_anon, hdr, hash_lock);
   2465 		if (HDR_IN_HASH_TABLE(hdr))
   2466 			buf_hash_remove(hdr);
   2467 		freeable = refcount_is_zero(&hdr->b_refcnt);
   2468 	}
   2469 
   2470 	/*
   2471 	 * Broadcast before we drop the hash_lock to avoid the possibility
   2472 	 * that the hdr (and hence the cv) might be freed before we get to
   2473 	 * the cv_broadcast().
   2474 	 */
   2475 	cv_broadcast(&hdr->b_cv);
   2476 
   2477 	if (hash_lock) {
   2478 		/*
   2479 		 * Only call arc_access on anonymous buffers.  This is because
   2480 		 * if we've issued an I/O for an evicted buffer, we've already
   2481 		 * called arc_access (to prevent any simultaneous readers from
   2482 		 * getting confused).
   2483 		 */
   2484 		if (zio->io_error == 0 && hdr->b_state == arc_anon)
   2485 			arc_access(hdr, hash_lock);
   2486 		mutex_exit(hash_lock);
   2487 	} else {
   2488 		/*
   2489 		 * This block was freed while we waited for the read to
   2490 		 * complete.  It has been removed from the hash table and
   2491 		 * moved to the anonymous state (so that it won't show up
   2492 		 * in the cache).
   2493 		 */
   2494 		ASSERT3P(hdr->b_state, ==, arc_anon);
   2495 		freeable = refcount_is_zero(&hdr->b_refcnt);
   2496 	}
   2497 
   2498 	/* execute each callback and free its structure */
   2499 	while ((acb = callback_list) != NULL) {
   2500 		if (acb->acb_done)
   2501 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
   2502 
   2503 		if (acb->acb_zio_dummy != NULL) {
   2504 			acb->acb_zio_dummy->io_error = zio->io_error;
   2505 			zio_nowait(acb->acb_zio_dummy);
   2506 		}
   2507 
   2508 		callback_list = acb->acb_next;
   2509 		kmem_free(acb, sizeof (arc_callback_t));
   2510 	}
   2511 
   2512 	if (freeable)
   2513 		arc_hdr_destroy(hdr);
   2514 }
   2515 
   2516 /*
   2517  * "Read" the block block at the specified DVA (in bp) via the
   2518  * cache.  If the block is found in the cache, invoke the provided
   2519  * callback immediately and return.  Note that the `zio' parameter
   2520  * in the callback will be NULL in this case, since no IO was
   2521  * required.  If the block is not in the cache pass the read request
   2522  * on to the spa with a substitute callback function, so that the
   2523  * requested block will be added to the cache.
   2524  *
   2525  * If a read request arrives for a block that has a read in-progress,
   2526  * either wait for the in-progress read to complete (and return the
   2527  * results); or, if this is a read with a "done" func, add a record
   2528  * to the read to invoke the "done" func when the read completes,
   2529  * and return; or just return.
   2530  *
   2531  * arc_read_done() will invoke all the requested "done" functions
   2532  * for readers of this block.
   2533  *
   2534  * Normal callers should use arc_read and pass the arc buffer and offset
   2535  * for the bp.  But if you know you don't need locking, you can use
   2536  * arc_read_bp.
   2537  */
   2538 int
   2539 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
   2540     arc_done_func_t *done, void *private, int priority, int zio_flags,
   2541     uint32_t *arc_flags, const zbookmark_t *zb)
   2542 {
   2543 	int err;
   2544 
   2545 	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
   2546 	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
   2547 	rw_enter(&pbuf->b_lock, RW_READER);
   2548 
   2549 	err = arc_read_nolock(pio, spa, bp, done, private, priority,
   2550 	    zio_flags, arc_flags, zb);
   2551 	rw_exit(&pbuf->b_lock);
   2552 
   2553 	return (err);
   2554 }
   2555 
   2556 int
   2557 arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
   2558     arc_done_func_t *done, void *private, int priority, int zio_flags,
   2559     uint32_t *arc_flags, const zbookmark_t *zb)
   2560 {
        	/*
        	 * Read the block described by bp through the ARC.
        	 *
        	 * If the hash table holds a header for bp that has data buffers,
        	 * this is a hit: ARC_CACHED is set in *arc_flags and either the
        	 * caller waits for / hooks onto a read that is still in progress,
        	 * or the cached buffer is passed straight to done().  Otherwise a
        	 * header and buffer are set up, the header is flagged
        	 * ARC_IO_IN_PROGRESS, and a read is issued -- from the L2ARC device
        	 * when the conditions further down hold, else through zio_read().
        	 *
        	 * *arc_flags is read for ARC_WAIT/ARC_NOWAIT, ARC_PREFETCH and
        	 * ARC_L2CACHE.  With ARC_WAIT the return value of a miss is the
        	 * result of zio_wait() (0 if the L2ARC read succeeded); every other
        	 * path returns 0.
        	 */
   2561 	arc_buf_hdr_t *hdr;
   2562 	arc_buf_t *buf;
   2563 	kmutex_t *hash_lock;
   2564 	zio_t *rzio;
   2565 	uint64_t guid = spa_guid(spa);
   2566 
   2567 top:
   2568 	hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
   2569 	if (hdr && hdr->b_datacnt > 0) {
   2570 
        		/* hit: the header exists and has at least one data buffer */
   2571 		*arc_flags |= ARC_CACHED;
   2572 
   2573 		if (HDR_IO_IN_PROGRESS(hdr)) {
   2574 
   2575 			if (*arc_flags & ARC_WAIT) {
        				/*
        				 * Synchronous caller: sleep on the header's
        				 * cv (hash_lock is the mutex handed to
        				 * cv_wait()), then redo the lookup from
        				 * scratch.
        				 */
   2576 				cv_wait(&hdr->b_cv, hash_lock);
   2577 				mutex_exit(hash_lock);
   2578 				goto top;
   2579 			}
   2580 			ASSERT(*arc_flags & ARC_NOWAIT);
   2581 
   2582 			if (done) {
   2583 				arc_callback_t	*acb = NULL;
   2584 
        				/*
        				 * Asynchronous caller with a callback: push
        				 * a new entry onto the header's callback list
        				 * and take a reference for 'private'.
        				 * NOTE(review): presumably the completion
        				 * path of the read in flight walks b_acb and
        				 * invokes each entry -- confirm.
        				 */
   2585 				acb = kmem_zalloc(sizeof (arc_callback_t),
   2586 				    KM_SLEEP);
   2587 				acb->acb_done = done;
   2588 				acb->acb_private = private;
   2589 				if (pio != NULL)
   2590 					acb->acb_zio_dummy = zio_null(pio,
   2591 					    spa, NULL, NULL, NULL, zio_flags);
   2592 
   2593 				ASSERT(acb->acb_done != NULL);
   2594 				acb->acb_next = hdr->b_acb;
   2595 				hdr->b_acb = acb;
   2596 				add_reference(hdr, hash_lock, private);
   2597 				mutex_exit(hash_lock);
   2598 				return (0);
   2599 			}
        			/* no callback to queue; the pending read is left alone */
   2600 			mutex_exit(hash_lock);
   2601 			return (0);
   2602 		}
   2603 
   2604 		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
   2605 
   2606 		if (done) {
   2607 			add_reference(hdr, hash_lock, private);
   2608 			/*
   2609 			 * If this block is already in use, create a new
   2610 			 * copy of the data so that we will be guaranteed
   2611 			 * that arc_release() will always succeed.
   2612 			 */
   2613 			buf = hdr->b_buf;
   2614 			ASSERT(buf);
   2615 			ASSERT(buf->b_data);
   2616 			if (HDR_BUF_AVAILABLE(hdr)) {
   2617 				ASSERT(buf->b_efunc == NULL);
   2618 				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
   2619 			} else {
   2620 				buf = arc_buf_clone(buf);
   2621 			}
   2622 		} else if (*arc_flags & ARC_PREFETCH &&
   2623 		    refcount_count(&hdr->b_refcnt) == 0) {
        			/* prefetch of an unreferenced hdr: just tag it */
   2624 			hdr->b_flags |= ARC_PREFETCH;
   2625 		}
   2626 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
   2627 		arc_access(hdr, hash_lock);
   2628 		if (*arc_flags & ARC_L2CACHE)
   2629 			hdr->b_flags |= ARC_L2CACHE;
   2630 		mutex_exit(hash_lock);
   2631 		ARCSTAT_BUMP(arcstat_hits);
   2632 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
   2633 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
   2634 		    data, metadata, hits);
   2635 
        		/* deliver the cached buffer with the hash lock dropped */
   2636 		if (done)
   2637 			done(NULL, buf, private);
   2638 	} else {
        		/* miss: no header at all, or a header without data */
   2639 		uint64_t size = BP_GET_LSIZE(bp);
   2640 		arc_callback_t	*acb;
   2641 		vdev_t *vd = NULL;
   2642 		uint64_t addr;
   2643 		boolean_t devw = B_FALSE;
   2644 
   2645 		if (hdr == NULL) {
   2646 			/* this block is not in the cache */
   2647 			arc_buf_hdr_t	*exists;
   2648 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
   2649 			buf = arc_buf_alloc(spa, size, private, type);
   2650 			hdr = buf->b_hdr;
   2651 			hdr->b_dva = *BP_IDENTITY(bp);
   2652 			hdr->b_birth = bp->blk_birth;
   2653 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
   2654 			exists = buf_hash_insert(hdr, &hash_lock);
   2655 			if (exists) {
   2656 				/* somebody beat us to the hash insert */
        				/*
        				 * Wipe the identity we just assigned, drop
        				 * the buffer allocated above, and retry.
        				 */
   2657 				mutex_exit(hash_lock);
   2658 				bzero(&hdr->b_dva, sizeof (dva_t));
   2659 				hdr->b_birth = 0;
   2660 				hdr->b_cksum0 = 0;
   2661 				(void) arc_buf_remove_ref(buf, private);
   2662 				goto top; /* restart the IO request */
   2663 			}
   2664 			/* if this is a prefetch, we don't have a reference */
   2665 			if (*arc_flags & ARC_PREFETCH) {
   2666 				(void) remove_reference(hdr, hash_lock,
   2667 				    private);
   2668 				hdr->b_flags |= ARC_PREFETCH;
   2669 			}
   2670 			if (*arc_flags & ARC_L2CACHE)
   2671 				hdr->b_flags |= ARC_L2CACHE;
   2672 			if (BP_GET_LEVEL(bp) > 0)
   2673 				hdr->b_flags |= ARC_INDIRECT;
   2674 		} else {
   2675 			/* this block is in the ghost cache */
   2676 			ASSERT(GHOST_STATE(hdr->b_state));
   2677 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
   2678 			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
   2679 			ASSERT(hdr->b_buf == NULL);
   2680 
   2681 			/* if this is a prefetch, we don't have a reference */
   2682 			if (*arc_flags & ARC_PREFETCH)
   2683 				hdr->b_flags |= ARC_PREFETCH;
   2684 			else
   2685 				add_reference(hdr, hash_lock, private);
   2686 			if (*arc_flags & ARC_L2CACHE)
   2687 				hdr->b_flags |= ARC_L2CACHE;
        			/*
        			 * The ghost header has no buffer; build one by hand,
        			 * hang it off the header and get data space for it.
        			 */
   2688 			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
   2689 			buf->b_hdr = hdr;
   2690 			buf->b_data = NULL;
   2691 			buf->b_efunc = NULL;
   2692 			buf->b_private = NULL;
   2693 			buf->b_next = NULL;
   2694 			hdr->b_buf = buf;
   2695 			arc_get_data_buf(buf);
   2696 			ASSERT(hdr->b_datacnt == 0);
   2697 			hdr->b_datacnt = 1;
   2698 
   2699 		}
   2700 
        		/*
        		 * First (and only) callback entry for this read; acb_done
        		 * may be NULL when the caller supplied no callback.
        		 */
   2701 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
   2702 		acb->acb_done = done;
   2703 		acb->acb_private = private;
   2704 
   2705 		ASSERT(hdr->b_acb == NULL);
   2706 		hdr->b_acb = acb;
   2707 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
   2708 
   2709 		/*
   2710 		 * If the buffer has been evicted, migrate it to a present state
   2711 		 * before issuing the I/O.  Once we drop the hash-table lock,
   2712 		 * the header will be marked as I/O in progress and have an
   2713 		 * attached buffer.  At this point, anybody who finds this
   2714 		 * buffer ought to notice that it's legit but has a pending I/O.
   2715 		 */
   2716 
   2717 		if (GHOST_STATE(hdr->b_state))
   2718 			arc_access(hdr, hash_lock);
   2719 
        		/*
        		 * While still under the hash lock, sample the L2ARC device,
        		 * its writing state and the block's device address.  vd is
        		 * left non-NULL only if the SCL_L2ARC config lock was taken.
        		 */
   2720 		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
   2721 		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
   2722 			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
   2723 			addr = hdr->b_l2hdr->b_daddr;
   2724 			/*
   2725 			 * Lock out device removal.
   2726 			 */
   2727 			if (vdev_is_dead(vd) ||
   2728 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
   2729 				vd = NULL;
   2730 		}
   2731 
   2732 		mutex_exit(hash_lock);
   2733 
   2734 		ASSERT3U(hdr->b_size, ==, size);
   2735 		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
   2736 		    zbookmark_t *, zb);
   2737 		ARCSTAT_BUMP(arcstat_misses);
   2738 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
   2739 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
   2740 		    data, metadata, misses);
   2741 
   2742 		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
   2743 			/*
   2744 			 * Read from the L2ARC if the following are true:
   2745 			 * 1. The L2ARC vdev was previously cached.
   2746 			 * 2. This buffer still has L2ARC metadata.
   2747 			 * 3. This buffer isn't currently writing to the L2ARC.
   2748 			 * 4. The L2ARC entry wasn't evicted, which may
   2749 			 *    also have invalidated the vdev.
   2750 			 * 5. It is not the case that this is a prefetch while
        			 *    l2arc_noprefetch is set.
   2751 			 */
   2752 			if (hdr->b_l2hdr != NULL &&
   2753 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
   2754 			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
   2755 				l2arc_read_callback_t *cb;
   2756 
   2757 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
   2758 				ARCSTAT_BUMP(arcstat_l2_hits);
   2759 
        				/*
        				 * Snapshot what l2arc_read_done() is handed:
        				 * the buffer plus copies of the block pointer
        				 * and bookmark (zb is dereferenced here).
        				 */
   2760 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
   2761 				    KM_SLEEP);
   2762 				cb->l2rcb_buf = buf;
   2763 				cb->l2rcb_spa = spa;
   2764 				cb->l2rcb_bp = *bp;
   2765 				cb->l2rcb_zb = *zb;
   2766 				cb->l2rcb_flags = zio_flags;
   2767 
   2768 				/*
   2769 				 * l2arc read.  The SCL_L2ARC lock will be
   2770 				 * released by l2arc_read_done().
   2771 				 */
   2772 				rzio = zio_read_phys(pio, vd, addr, size,
   2773 				    buf->b_data, ZIO_CHECKSUM_OFF,
   2774 				    l2arc_read_done, cb, priority, zio_flags |
   2775 				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
   2776 				    ZIO_FLAG_DONT_PROPAGATE |
   2777 				    ZIO_FLAG_DONT_RETRY, B_FALSE);
   2778 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
   2779 				    zio_t *, rzio);
   2780 				ARCSTAT_INCR(arcstat_l2_read_bytes, size);
   2781 
   2782 				if (*arc_flags & ARC_NOWAIT) {
   2783 					zio_nowait(rzio);
   2784 					return (0);
   2785 				}
   2786 
   2787 				ASSERT(*arc_flags & ARC_WAIT);
   2788 				if (zio_wait(rzio) == 0)
   2789 					return (0);
   2790 
   2791 				/* l2arc read error; goto zio_read() */
   2792 			} else {
        				/* not eligible: give the config lock back */
   2793 				DTRACE_PROBE1(l2arc__miss,
   2794 				    arc_buf_hdr_t *, hdr);
   2795 				ARCSTAT_BUMP(arcstat_l2_misses);
   2796 				if (HDR_L2_WRITING(hdr))
   2797 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
   2798 				spa_config_exit(spa, SCL_L2ARC, vd);
   2799 			}
   2800 		} else {
   2801 			if (vd != NULL)
   2802 				spa_config_exit(spa, SCL_L2ARC, vd);
   2803 			if (l2arc_ndev != 0) {
   2804 				DTRACE_PROBE1(l2arc__miss,
   2805 				    arc_buf_hdr_t *, hdr);
   2806 				ARCSTAT_BUMP(arcstat_l2_misses);
   2807 			}
   2808 		}
   2809 
        		/* read through the regular zio path into the buffer */
   2810 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
   2811 		    arc_read_done, buf, priority, zio_flags, zb);
   2812 
   2813 		if (*arc_flags & ARC_WAIT)
   2814 			return (zio_wait(rzio));
   2815 
   2816 		ASSERT(*arc_flags & ARC_NOWAIT);
   2817 		zio_nowait(rzio);
   2818 	}
   2819 	return (0);
   2820 }
   2821 
   2822 /*
   2823  * arc_read() variant to support pool traversal.  If the block is already
   2824  * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
   2825  * The idea is that we don't want pool traversal filling up memory, but
   2826  * if the ARC already has the data anyway, we shouldn't pay for the I/O.
   2827  */
   2828 int
   2829 arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
   2830 {
   2831 	arc_buf_hdr_t *hdr;
   2832 	kmutex_t *hash_mtx;
   2833 	uint64_t guid = spa_guid(spa);
   2834 	int rc = 0;
   2835 
   2836 	hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
   2837 
   2838 	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
   2839 		arc_buf_t *buf = hdr->b_buf;
   2840 
   2841 		ASSERT(buf);
   2842 		while (buf->b_data == NULL) {
   2843 			buf = buf->b_next;
   2844 			ASSERT(buf);
   2845 		}
   2846 		bcopy(buf->b_data, data, hdr->b_size);
   2847 	} else {
   2848 		rc = ENOENT;
   2849 	}
   2850 
   2851 	if (hash_mtx)
   2852 		mutex_exit(hash_mtx);
   2853 
   2854 	return (rc);
   2855 }
   2856 
   2857 void
   2858 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
   2859 {
   2860 	ASSERT(buf->b_hdr != NULL);
   2861 	ASSERT(buf->b_hdr->b_state != arc_anon);
   2862 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
   2863 	buf->b_efunc = func;
   2864 	buf->b_private = private;
   2865 }
   2866 
   2867 /*
   2868  * This is used by the DMU to let the ARC know that a buffer is
   2869  * being evicted, so the ARC should clean up.  If this arc buf
   2870  * is not yet in the evicted state, it will be put there.
   2871  */
   2872 int
   2873 arc_buf_evict(arc_buf_t *buf)
   2874 {
   2875 	arc_buf_hdr_t *hdr;
   2876 	kmutex_t *hash_lock;
   2877 	arc_buf_t **bufp;
   2878 
   2879 	rw_enter(&buf->b_lock, RW_WRITER);
   2880 	hdr = buf->b_hdr;
   2881 	if (hdr == NULL) {
   2882 		/*
   2883 		 * We are in arc_do_user_evicts().
   2884 		 */
   2885 		ASSERT(buf->b_data == NULL);
   2886 		rw_exit(&buf->b_lock);
   2887 		return (0);
   2888 	} else if (buf->b_data == NULL) {
   2889 		arc_buf_t copy = *buf; /* structure assignment */
   2890 		/*
   2891 		 * We are on the eviction list; process this buffer now
   2892 		 * but let arc_do_user_evicts() do the reaping.
   2893 		 */
   2894 		buf->b_efunc = NULL;
   2895 		rw_exit(&buf->b_lock);
   2896 		VERIFY(copy.b_efunc(&copy) == 0);
   2897 		return (1);
   2898 	}
   2899 	hash_lock = HDR_LOCK(hdr);
   2900 	mutex_enter(hash_lock);
   2901 
   2902 	ASSERT(buf->b_hdr == hdr);
   2903 	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
   2904 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
   2905 
   2906 	/*
   2907 	 * Pull this buffer off of the hdr
   2908 	 */
   2909 	bufp = &hdr->b_buf;
   2910 	while (*bufp != buf)
   2911 		bufp = &(*bufp)->b_next;
   2912 	*bufp = buf->b_next;
   2913 
   2914 	ASSERT(buf->b_data != NULL);
   2915 	arc_buf_destroy(buf, FALSE, FALSE);
   2916 
   2917 	if (hdr->b_datacnt == 0) {
   2918 		arc_state_t *old_state = hdr->b_state;
   2919 		arc_state_t *evicted_state;
   2920 
   2921 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
   2922 
   2923 		evicted_state =
   2924 		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
   2925 
   2926 		mutex_enter(&old_state->arcs_mtx);
   2927 		mutex_enter(&evicted_state->arcs_mtx);
   2928 
   2929 		arc_change_state(evicted_state, hdr, hash_lock);
   2930 		ASSERT(HDR_IN_HASH_TABLE(hdr));
   2931 		hdr->b_flags |= ARC_IN_HASH_TABLE;
   2932 		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
   2933 
   2934 		mutex_exit(&evicted_state->arcs_mtx);
   2935 		mutex_exit(&old_state->arcs_mtx);
   2936 	}
   2937 	mutex_exit(hash_lock);
   2938 	rw_exit(&buf->b_lock);
   2939 
   2940 	VERIFY(buf->b_efunc(buf) == 0);
   2941 	buf->b_efunc = NULL;
   2942 	buf->b_private = NULL;
   2943 	buf->b_hdr = NULL;
   2944 	kmem_cache_free(buf_cache, buf);
   2945 	return (1);
   2946 }
   2947 
   2948 /*
   2949  * Release this buffer from the cache.  This must be done
   2950  * after a read and prior to modifying the buffer contents.
   2951  * If the buffer has more than one reference, we must make
   2952  * a new hdr for the buffer.
   2953  */
   2954 void
   2955 arc_release(arc_buf_t *buf, void *tag)
   2956 {
   2957 	arc_buf_hdr_t *hdr;
   2958 	kmutex_t *hash_lock;
   2959 	l2arc_buf_hdr_t *l2hdr;
   2960 	uint64_t buf_size;
   2961 	boolean_t released = B_FALSE;
   2962 
   2963 	rw_enter(&buf->b_lock, RW_WRITER);
   2964 	hdr = buf->b_hdr;
   2965 
   2966 	/* this buffer is not on any list */
   2967 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
   2968 	ASSERT(!(hdr->b_flags & ARC_STORED));
   2969 
   2970 	if (hdr->b_state == arc_anon) {
   2971 		/* this buffer is already released */
   2972 		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
   2973 		ASSERT(BUF_EMPTY(hdr));
   2974 		ASSERT(buf->b_efunc == NULL);
   2975 		arc_buf_thaw(buf);
   2976 		rw_exit(&buf->b_lock);
   2977 		released = B_TRUE;
   2978 	} else {
   2979 		hash_lock = HDR_LOCK(hdr);
   2980 		mutex_enter(hash_lock);
   2981 	}
   2982 
   2983 	l2hdr = hdr->b_l2hdr;
   2984 	if (l2hdr) {
   2985 		mutex_enter(&l2arc_buflist_mtx);
   2986 		hdr->b_l2hdr = NULL;
   2987 		buf_size = hdr->b_size;
   2988 	}
   2989 
   2990 	if (released)
   2991 		goto out;
   2992 
   2993 	/*
   2994 	 * Do we have more than one buf?
   2995 	 */
   2996 	if (hdr->b_datacnt > 1) {
   2997 		arc_buf_hdr_t *nhdr;
   2998 		arc_buf_t **bufp;
   2999 		uint64_t blksz = hdr->b_size;
   3000 		uint64_t spa = hdr->b_spa;
   3001 		arc_buf_contents_t type = hdr->b_type;
   3002 		uint32_t flags = hdr->b_flags;
   3003 
   3004 		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
   3005 		/*
   3006 		 * Pull the data off of this buf and attach it to
   3007 		 * a new anonymous buf.
   3008 		 */
   3009 		(void) remove_reference(hdr, hash_lock, tag);
   3010 		bufp = &hdr->b_buf;
   3011 		while (*bufp != buf)
   3012 			bufp = &(*bufp)->b_next;
   3013 		*bufp = (*bufp)->b_next;
   3014 		buf->b_next = NULL;
   3015 
   3016 		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
   3017 		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
   3018 		if (refcount_is_zero(&hdr->b_refcnt)) {
   3019 			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
   3020 			ASSERT3U(*size, >=, hdr->b_size);
   3021 			atomic_add_64(size, -hdr->b_size);
   3022 		}
   3023 		hdr->b_datacnt -= 1;
   3024 		arc_cksum_verify(buf);
   3025 
   3026 		mutex_exit(hash_lock);
   3027 
   3028 		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
   3029 		nhdr->b_size = blksz;
   3030 		nhdr->b_spa = spa;
   3031 		nhdr->b_type = type;
   3032 		nhdr->b_buf = buf;
   3033 		nhdr->b_state = arc_anon;
   3034 		nhdr->b_arc_access = 0;
   3035 		nhdr->b_flags = flags & ARC_L2_WRITING;
   3036 		nhdr->b_l2hdr = NULL;
   3037 		nhdr->b_datacnt = 1;
   3038 		nhdr->b_freeze_cksum = NULL;
   3039 		(void) refcount_add(&nhdr->b_refcnt, tag);
   3040 		buf->b_hdr = nhdr;
   3041 		rw_exit(&buf->b_lock);
   3042 		atomic_add_64(&arc_anon->arcs_size, blksz);
   3043 	} else {
   3044 		rw_exit(&buf->b_lock);
   3045 		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
   3046 		ASSERT(!list_link_active(&hdr->b_arc_node));
   3047 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
   3048 		arc_change_state(arc_anon, hdr, hash_lock);
   3049 		hdr->b_arc_access = 0;
   3050 		mutex_exit(hash_lock);
   3051 
   3052 		bzero(&hdr->b_dva, sizeof (dva_t));
   3053 		hdr->b_birth = 0;
   3054 		hdr->b_cksum0 = 0;
   3055 		arc_buf_thaw(buf);
   3056 	}
   3057 	buf->b_efunc = NULL;
   3058 	buf->b_private = NULL;
   3059 
   3060 out:
   3061 	if (l2hdr) {
   3062 		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
   3063 		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
   3064 		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
   3065 		mutex_exit(&l2arc_buflist_mtx);
   3066 	}
   3067 }
   3068 
   3069 int
   3070 arc_released(arc_buf_t *buf)
   3071 {
   3072 	int released;
   3073 
   3074 	rw_enter(&buf->b_lock, RW_READER);
   3075 	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
   3076 	rw_exit(&buf->b_lock);
   3077 	return (released);
   3078 }
   3079 
   3080 int
   3081 arc_has_callback(arc_buf_t *buf)
   3082 {
   3083 	int callback;
   3084 
   3085 	rw_enter(&buf->b_lock, RW_READER);
   3086 	callback = (buf->b_efunc != NULL);
   3087 	rw_exit(&buf->b_lock);
   3088 	return (callback);
   3089 }
   3090 
   #ifdef ZFS_DEBUG
   /*
    * Debug-only helper: returns the reference count of the buffer's header
    * (nonzero means referenced), read under the buffer's lock held as reader.
    */
   int
   arc_referenced(arc_buf_t *buf)
   {
   	int nrefs;

   	rw_enter(&buf->b_lock, RW_READER);
   	nrefs = refcount_count(&buf->b_hdr->b_refcnt);
   	rw_exit(&buf->b_lock);

   	return (nrefs);
   }
   #endif
   3103 
   3104 static void
   3105 arc_write_ready(zio_t *zio)
   3106 {
   3107 	arc_write_callback_t *callback = zio->io_private;
   3108 	arc_buf_t *buf = callback->awcb_buf;
   3109 	arc_buf_hdr_t *hdr = buf->b_hdr;
   3110 
   3111 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
   3112 	callback->awcb_ready(zio, buf, callback->awcb_private);
   3113 
   3114 	/*
   3115 	 * If the IO is already in progress, then this is a re-write
   3116 	 * attempt, so we need to thaw and re-compute the cksum.
   3117 	 * It is the responsibility of the callback to handle the
   3118 	 * accounting for any re-write attempt.
   3119 	 */
   3120 	if (HDR_IO_IN_PROGRESS(hdr)) {
   3121 		mutex_enter(&hdr->b_freeze_lock);
   3122 		if (hdr->b_freeze_cksum != NULL) {
   3123 			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
   3124 			hdr->b_freeze_cksum = NULL;
   3125 		}
   3126 		mutex_exit(&hdr->b_freeze_lock);
   3127 	}
   3128 	arc_cksum_compute(buf, B_FALSE);
   3129 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
   3130 }
   3131 
   3132 static void
   3133 arc_write_done(zio_t *zio)
   3134 {
   3135 	arc_write_callback_t *callback = zio->io_private;
   3136 	arc_buf_t *buf = callback->awcb_buf;
   3137 	arc_buf_hdr_t *hdr = buf->b_hdr;
   3138 
   3139 	hdr->b_acb = NULL;
   3140 
   3141 	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
   3142 	hdr->b_birth = zio->io_bp->blk_birth;
   3143 	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
   3144 	/*
   3145 	 * If the block to be written was all-zero, we may have
   3146 	 * compressed it away.  In this case no write was performed
   3147 	 * so there will be no dva/birth-date/checksum.  The buffer
   3148 	 * must therefor remain anonymous (and uncached).
   3149 	 */
   3150 	if (!BUF_EMPTY(hdr)) {
   3151 		arc_buf_hdr_t *exists;
   3152 		kmutex_t *hash_lock;
   3153 
   3154 		arc_cksum_verify(buf);
   3155 
   3156 		exists = buf_hash_insert(hdr, &hash_lock);
   3157 		if (exists) {
   3158 			/*
   3159 			 * This can only happen if we overwrite for
   3160 			 * sync-to-convergence, because we remove
   3161 			 * buffers from the hash table when we arc_free().
   3162 			 */
   3163 			ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
   3164 			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
   3165 			    BP_IDENTITY(zio->io_bp)));
   3166 			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
   3167 			    zio->io_bp->blk_birth);
   3168 
   3169 			ASSERT(refcount_is_zero(&exists->b_refcnt));
   3170 			arc_change_state(arc_anon, exists, hash_lock);
   3171 			mutex_exit(hash_lock);
   3172 			arc_hdr_destroy(exists);
   3173 			exists = buf_hash_insert(hdr, &hash_lock);
   3174 			ASSERT3P(exists, ==, NULL);
   3175 		}
   3176 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
   3177 		/* if it's not anon, we are doing a scrub */
   3178 		if (hdr->b_state == arc_anon)
   3179 			arc_access(hdr, hash_lock);
   3180 		mutex_exit(hash_lock);
   3181 	} else if (callback->awcb_done == NULL) {
   3182 		int destroy_hdr;
   3183 		/*
   3184 		 * This is an anonymous buffer with no user callback,
   3185 		 * destroy it if there are no active references.
   3186 		 */
   3187 		mutex_enter(&arc_eviction_mtx);
   3188 		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
   3189 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
   3190 		mutex_exit(&arc_eviction_mtx);
   3191 		if (destroy_hdr)
   3192 			arc_hdr_destroy(hdr);
   3193 	} else {
   3194 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
   3195 	}
   3196 	hdr->b_flags &= ~ARC_STORED;
   3197 
   3198 	if (callback->awcb_done) {
   3199 		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
   3200 		callback->awcb_done(zio, buf, callback->awcb_private);
   3201 	}
   3202 
   3203 	kmem_free(callback, sizeof (arc_write_callback_t));
   3204 }
   3205 
   3206 void
   3207 write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
   3208 {
   3209 	boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
   3210 
   3211 	/* Determine checksum setting */
   3212 	if (ismd) {
   3213 		/*
   3214 		 * Metadata always gets checksummed.  If the data
   3215 		 * checksum is multi-bit correctable, and it's not a
   3216 		 * ZBT-style checksum, then it's suitable for metadata
   3217 		 * as well.  Otherwise, the metadata checksum defaults
   3218 		 * to fletcher4.
   3219 		 */
   3220 		if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
   3221 		    !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
   3222 			zp->zp_checksum = wp->wp_oschecksum;
   3223 		else
   3224 			zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
   3225 	} else {
   3226 		zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
   3227 		    wp->wp_oschecksum);
   3228 	}
   3229 
   3230 	/* Determine compression setting */
   3231 	if (ismd) {
   3232 		/*
   3233 		 * XXX -- we should design a compression algorithm
   3234 		 * that specializes in arrays of bps.
   3235 		 */
   3236 		zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
   3237 		    ZIO_COMPRESS_LZJB;
   3238 	} else {
   3239 		zp->zp_compress = zio_compress_select(wp->wp_dncompress,
   3240 		    wp->wp_oscompress);
   3241 	}
   3242 
   3243 	zp->zp_type = wp->wp_type;
   3244 	zp->zp_level = wp->wp_level;
   3245 	zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
   3246 }
   3247 
   3248 zio_t *
   3249 arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
   3250     boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
   3251     arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
   3252     int zio_flags, const zbookmark_t *zb)
   3253 {
   3254 	arc_buf_hdr_t *hdr = buf->b_hdr;
   3255 	arc_write_callback_t *callback;
   3256 	zio_t *zio;
   3257 	zio_prop_t zp;
   3258 
   3259 	ASSERT(ready != NULL);
   3260 	ASSERT(!HDR_IO_ERROR(hdr));
   3261 	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
   3262 	ASSERT(hdr->b_acb == 0);
   3263 	if (l2arc)
   3264 		hdr->b_flags |= ARC_L2CACHE;
   3265 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
   3266 	callback->awcb_ready = ready;
   3267 	callback->awcb_done = done;
   3268 	callback->awcb_private = private;
   3269 	callback->awcb_buf = buf;
   3270 
   3271 	write_policy(spa, wp, &zp);
   3272 	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
   3273 	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
   3274 
   3275 	return (zio);
   3276 }
   3277 
   3278 int
   3279 arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
   3280     zio_done_func_t *done, void *private, uint32_t arc_flags)
   3281 {
   3282 	arc_buf_hdr_t *ab;
   3283 	kmutex_t *hash_lock;
   3284 	zio_t	*zio;
   3285 	uint64_t guid = spa_guid(spa);
   3286 
   3287 	/*
   3288 	 * If this buffer is in the cache, release it, so it
   3289 	 * can be re-used.
   3290 	 */
   3291 	ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
   3292 	if (ab != NULL) {
   3293 		/*
   3294 		 * The checksum of blocks to free is not always
   3295 		 * preserved (eg. on the deadlist).  However, if it is
   3296 		 * nonzero, it should match what we have in the cache.
   3297 		 */
   3298 		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
   3299 		    bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
   3300 		    bp->blk_fill == BLK_FILL_ALREADY_FREED);
   3301 
   3302 		if (ab->b_state != arc_anon)
   3303 			arc_change_state(arc_anon, ab, hash_lock);
   3304 		if (HDR_IO_IN_PROGRESS(ab)) {
   3305 			/*
   3306 			 * This should only happen when we prefetch.
   3307 			 */
   3308 			ASSERT(ab->b_flags & ARC_PREFETCH);
   3309 			ASSERT3U(ab->b_datacnt, ==, 1);
   3310 			ab->b_flags |= ARC_FREED_IN_READ;
   3311 			if (HDR_IN_HASH_TABLE(ab))
   3312 				buf_hash_remove(ab);
   3313 			ab->b_arc_access = 0;
   3314 			bzero(&ab->b_dva, sizeof (dva_t));
   3315 			ab->b_birth = 0;
   3316 			ab->b_cksum0 = 0;
   3317 			ab->b_buf->b_efunc = NULL;
   3318 			ab->b_buf->b_private = NULL;
   3319 			mutex_exit(hash_lock);
   3320 		} else if (refcount_is_zero(&ab->b_refcnt)) {
   3321 			ab->b_flags |= ARC_FREE_IN_PROGRESS;
   3322 			mutex_exit(hash_lock);
   3323 			arc_hdr_destroy(ab);
   3324 			ARCSTAT_BUMP(arcstat_deleted);
   3325 		} else {
   3326 			/*
   3327 			 * We still have an active reference on this
   3328 			 * buffer.  This can happen, e.g., from
   3329 			 * dbuf_unoverride().
   3330 			 */
   3331 			ASSERT(!HDR_IN_HASH_TABLE(ab));
   3332 			ab->b_arc_access = 0;
   3333 			bzero(&ab->b_dva, sizeof (dva_t));
   3334 			ab->b_birth = 0;
   3335 			ab->b_cksum0 = 0;
   3336 			ab->b_buf->b_efunc = NULL;
   3337 			ab->b_buf->b_private = NULL;
   3338 			mutex_exit(hash_lock);
   3339 		}
   3340 	}
   3341 
   3342 	zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
   3343 
   3344 	if (arc_flags & ARC_WAIT)
   3345 		return (zio_wait(zio));
   3346 
   3347 	ASSERT(arc_flags & ARC_NOWAIT);
   3348 	zio_nowait(zio);
   3349 
   3350 	return (0);
   3351 }
   3352 
   /*
    * Decide whether a write reservation must be throttled because memory
    * is tight.  Returns 0 to let the reservation proceed, ERESTART or
    * EAGAIN to push back.  Outside the kernel build nothing is checked and
    * 0 is always returned.
    */
   static int
   arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
   {
   #ifdef _KERNEL
   	static uint64_t last_txg = 0;
   	static uint64_t page_load = 0;
   	uint64_t available_memory = ptob(freemem);

   #if defined(__i386)
   	available_memory =
   	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
   #endif
   	/* plenty of free memory: no throttling at all */
   	if (available_memory >= zfs_write_limit_max)
   		return (0);

   	/* page_load is accumulated per txg; start over on a newer txg */
   	if (txg > last_txg) {
   		last_txg = txg;
   		page_load = 0;
   	}

   	/*
   	 * If we are in pageout, we know that memory is already tight,
   	 * the arc is already going to be evicting, so we just want to
   	 * continue to let page writes occur as quickly as possible.
   	 */
   	if (curproc == proc_pageout) {
   		if (page_load > MAX(ptob(minfree), available_memory) / 4)
   			return (ERESTART);
   		/* Note: reserve is inflated, so we deflate */
   		page_load += reserve / 8;
   		return (0);
   	}

   	if (page_load > 0 && arc_reclaim_needed()) {
   		/* memory is low, delay before restarting */
   		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
   		return (EAGAIN);
   	}
   	page_load = 0;

   	/* count what the ARC could evict (down to arc_c_min) as available */
   	if (arc_size > arc_c_min) {
   		uint64_t evictable_memory = 0;

   		evictable_memory += arc_mru->arcs_lsize[ARC_BUFC_DATA];
   		evictable_memory += arc_mru->arcs_lsize[ARC_BUFC_METADATA];
   		evictable_memory += arc_mfu->arcs_lsize[ARC_BUFC_DATA];
   		evictable_memory += arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
   		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
   	}

   	if (inflight_data > available_memory / 4) {
   		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
   		return (ERESTART);
   	}
   #endif
   	return (0);
   }
   3406 
   3407 void
   3408 arc_tempreserve_clear(uint64_t reserve)
   3409 {
   3410 	atomic_add_64(&arc_tempreserve, -reserve);
   3411 	ASSERT((int64_t)arc_tempreserve >= 0);
   3412 }
   3413 
   3414 int
   3415 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
   3416 {
   3417 	int error;
   3418 	uint64_t anon_size;
   3419 
   3420 #ifdef ZFS_DEBUG
   3421 	/*
   3422 	 * Once in a while, fail for no reason.  Everything should cope.
   3423 	 */
   3424 	if (spa_get_random(10000) == 0) {
   3425 		dprintf("forcing random failure\n");
   3426 		return (ERESTART);
   3427 	}
   3428 #endif
   3429 	if (reserve > arc_c/4 && !arc_no_grow)
   3430 		arc_c = MIN(arc_c_max, reserve * 4);
   3431 	if (reserve > arc_c)
   3432 		return (ENOMEM);
   3433 
   3434 	/*
   3435 	 * Don't count loaned bufs as in flight dirty data to prevent long
   3436 	 * network delays from blocking transactions that are ready to be
   3437 	 * assigned to a txg.
   3438 	 */
   3439 	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
   3440 
   3441 	/*
   3442 	 * Writes will, almost always, require additional memory allocations
   3443 	 * in order to compress/encrypt/etc the data.  We therefor need to
   3444 	 * make sure that there is sufficient available memory for this.
   3445 	 */
   3446 	if (error = arc_memory_throttle(reserve, anon_size, txg))
   3447 		return (error);
   3448 
   3449 	/*
   3450 	 * Throttle writes when the amount of dirty data in the cache
   3451 	 * gets too large.  We try to keep the cache less than half full
   3452 	 * of dirty blocks so that our sync times don't grow too large.
   3453 	 * Note: if two requests come in concurrently, we might let them
   3454 	 * both succeed, when one of them should fail.  Not a huge deal.
   3455 	 */
   3456 
   3457 	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
   3458 	    anon_size > arc_c / 4) {
   3459 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
   3460 		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
   3461 		    arc_tempreserve>>10,
   3462 		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
   3463 		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
   3464 		    reserve>>10, arc_c>>10);
   3465 		return (ERESTART);
   3466 	}
   3467 	atomic_add_64(&arc_tempreserve, reserve);
   3468 	return (0);
   3469 }
   3470 
   3471 void
   3472 arc_init(void)
   3473 {
   3474 	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
   3475 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
   3476 
   3477 	/* Convert seconds to clock ticks */
   3478 	arc_min_prefetch_lifespan = 1 * hz;
   3479 
   3480 	/* Start out with 1/8 of all memory */
   3481 	arc_c = physmem * PAGESIZE / 8;
   3482 
   3483 #ifdef _KERNEL
   3484 	/*
   3485 	 * On architectures where the physical memory can be larger
   3486 	 * than the addressable space (intel in 32-bit mode), we may
   3487 	 * need to limit the cache to 1/8 of VM size.
   3488 	 */
   3489 	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
   3490 #endif
   3491 
   3492 	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
   3493 	arc_c_min = MAX(arc_c / 4, 64<<20);
   3494 	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
   3495 	if (arc_c * 8 >= 1<<30)
   3496 		arc_c_max = (arc_c * 8) - (1<<30);
   3497 	else
   3498 		arc_c_max = arc_c_min;
   3499 	arc_c_max = MAX(arc_c * 6, arc_c_max);
   3500 
   3501 	/*
   3502 	 * Allow the tunables to override our calculations if they are
   3503 	 * reasonable (ie. over 64MB)
   3504 	 */
   3505 	if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
   3506 		arc_c_max = zfs_arc_max;
   3507 	if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
   3508 		arc_c_min = zfs_arc_min;
   3509 
   3510 	arc_c = arc_c_max;
   3511 	arc_p = (arc_c >> 1);
   3512 
   3513 	/* limit meta-data to 1/4 of the arc capacity */
   3514 	arc_meta_limit = arc_c_max / 4;
   3515 
   3516 	/* Allow the tunable to override if it is reasonable */
   3517 	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
   3518 		arc_meta_limit = zfs_arc_meta_limit;
   3519 
   3520 	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
   3521 		arc_c_min = arc_meta_limit / 2;
   3522 
   3523 	if (zfs_arc_grow_retry > 0)
   3524 		arc_grow_retry = zfs_arc_grow_retry;
   3525 
   3526 	if (zfs_arc_shrink_shift > 0)
   3527 		arc_shrink_shift = zfs_arc_shrink_shift;
   3528 
   3529 	if (zfs_arc_p_min_shift > 0)
   3530 		arc_p_min_shift = zfs_arc_p_min_shift;
   3531 
   3532 	/* if kmem_flags are set, lets try to use less memory */
   3533 	if (kmem_debugging())
   3534 		arc_c = arc_c / 2;
   3535 	if (arc_c < arc_c_min)
   3536 		arc_c = arc_c_min;
   3537 
   3538 	arc_anon = &ARC_anon;
   3539 	arc_mru = &ARC_mru;
   3540 	arc_mru_ghost = &ARC_mru_ghost;
   3541 	arc_mfu = &ARC_mfu;
   3542 	arc_mfu_ghost = &ARC_mfu_ghost;
   3543 	arc_l2c_only = &ARC_l2c_only;
   3544 	arc_size = 0;
   3545 
   3546 	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
   3547 	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
   3548 	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
   3549 	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
   3550 	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
   3551 	mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
   3552 
   3553 	list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
   3554 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
   3555 	list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
   3556 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
   3557 	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
   3558 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
   3559 	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
   3560 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
   3561 	list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
   3562 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
   3563 	list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
   3564 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
   3565 	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
   3566 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
   3567 	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
   3568 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
   3569 	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
   3570 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
   3571 	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
   3572 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
   3573 
   3574 	buf_init();
   3575 
   3576 	arc_thread_exit = 0;
   3577 	arc_eviction_list = NULL;
   3578 	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
   3579 	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
   3580 
   3581 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
   3582 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
   3583 
   3584 	if (arc_ksp != NULL) {
   3585 		arc_ksp->ks_data = &arc_stats;
   3586 		kstat_install(arc_ksp);
   3587 	}
   3588 
   3589 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
   3590 	    TS_RUN, minclsyspri);
   3591 
   3592 	arc_dead = FALSE;
   3593 	arc_warm = B_FALSE;
   3594 
   3595 	if (zfs_write_limit_max == 0)
   3596 		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
   3597 	else
   3598 		zfs_write_limit_shift = 0;
   3599 	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
   3600 }
   3601 
   3602 void
   3603 arc_fini(void)
   3604 {
   3605 	mutex_enter(&arc_reclaim_thr_lock);
   3606 	arc_thread_exit = 1;
   3607 	while (arc_thread_exit != 0)
   3608 		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
   3609 	mutex_exit(&arc_reclaim_thr_lock);
   3610 
   3611 	arc_flush(NULL);
   3612 
   3613 	arc_dead = TRUE;
   3614 
   3615 	if (arc_ksp != NULL) {
   3616 		kstat_delete(arc_ksp);
   3617 		arc_ksp = NULL;