Home | History | Annotate | Download | only in zfs
      1    789     ahrens /*
      2    789     ahrens  * CDDL HEADER START
      3    789     ahrens  *
      4    789     ahrens  * The contents of this file are subject to the terms of the
      5   1484   ek110237  * Common Development and Distribution License (the "License").
      6   1484   ek110237  * You may not use this file except in compliance with the License.
      7    789     ahrens  *
      8    789     ahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9    789     ahrens  * or http://www.opensolaris.org/os/licensing.
     10    789     ahrens  * See the License for the specific language governing permissions
     11    789     ahrens  * and limitations under the License.
     12    789     ahrens  *
     13    789     ahrens  * When distributing Covered Code, include this CDDL HEADER in each
     14    789     ahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15    789     ahrens  * If applicable, add the following below this CDDL HEADER, with the
     16    789     ahrens  * fields enclosed by brackets "[]" replaced with your own identifying
     17    789     ahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
     18    789     ahrens  *
     19    789     ahrens  * CDDL HEADER END
     20    789     ahrens  */
     21    789     ahrens /*
     22   8582    Brendan  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23    789     ahrens  * Use is subject to license terms.
     24    789     ahrens  */
     25    789     ahrens 
     26    789     ahrens /*
     27   3403        bmc  * DVA-based Adjustable Replacement Cache
     28    789     ahrens  *
     29   1544   eschrock  * While much of the theory of operation used here is
     30   1544   eschrock  * based on the self-tuning, low overhead replacement cache
     31    789     ahrens  * presented by Megiddo and Modha at FAST 2003, there are some
     32    789     ahrens  * significant differences:
     33    789     ahrens  *
     34    789     ahrens  * 1. The Megiddo and Modha model assumes any page is evictable.
     35    789     ahrens  * Pages in its cache cannot be "locked" into memory.  This makes
     36    789     ahrens  * the eviction algorithm simple: evict the last page in the list.
     37    789     ahrens  * This also makes the performance characteristics easy to reason
     38    789     ahrens  * about.  Our cache is not so simple.  At any given moment, some
     39    789     ahrens  * subset of the blocks in the cache are un-evictable because we
     40    789     ahrens  * have handed out a reference to them.  Blocks are only evictable
     41    789     ahrens  * when there are no external references active.  This makes
     42    789     ahrens  * eviction far more problematic:  we choose to evict the evictable
     43    789     ahrens  * blocks that are the "lowest" in the list.
     44    789     ahrens  *
     45    789     ahrens  * There are times when it is not possible to evict the requested
     46    789     ahrens  * space.  In these circumstances we are unable to adjust the cache
     47    789     ahrens  * size.  To prevent the cache growing unbounded at these times we
     48   5450    brendan  * implement a "cache throttle" that slows the flow of new data
     49   5450    brendan  * into the cache until we can make space available.
     50    789     ahrens  *
     51    789     ahrens  * 2. The Megiddo and Modha model assumes a fixed cache size.
     52    789     ahrens  * Pages are evicted when the cache is full and there is a cache
     53    789     ahrens  * miss.  Our model has a variable sized cache.  It grows with
     54   5450    brendan  * high use, but also tries to react to memory pressure from the
     55    789     ahrens  * operating system: decreasing its size when system memory is
     56    789     ahrens  * tight.
     57    789     ahrens  *
     58    789     ahrens  * 3. The Megiddo and Modha model assumes a fixed page size. All
     59    789     ahrens  * elements of the cache are therefore exactly the same size.  So
     60    789     ahrens  * when adjusting the cache size following a cache miss, it's simply
     61    789     ahrens  * a matter of choosing a single page to evict.  In our model, we
     62    789     ahrens  * have variable sized cache blocks (ranging from 512 bytes to
     63    789     ahrens  * 128K bytes).  We therefore choose a set of blocks to evict to make
     64    789     ahrens  * space for a cache miss that approximates as closely as possible
     65    789     ahrens  * the space used by the new block.
     66    789     ahrens  *
     67    789     ahrens  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
     68    789     ahrens  * by N. Megiddo & D. Modha, FAST 2003
     69    789     ahrens  */
     70    789     ahrens 
     71    789     ahrens /*
     72    789     ahrens  * The locking model:
     73    789     ahrens  *
     74    789     ahrens  * A new reference to a cache buffer can be obtained in two
     75    789     ahrens  * ways: 1) via a hash table lookup using the DVA as a key,
     76   5450    brendan  * or 2) via one of the ARC lists.  The arc_read() interface
     77    789     ahrens  * uses method 1, while the internal arc algorithms for
     78    789     ahrens  * adjusting the cache use method 2.  We therefore provide two
     79    789     ahrens  * types of locks: 1) the hash table lock array, and 2) the
     80    789     ahrens  * arc list locks.
     81    789     ahrens  *
     82    789     ahrens  * Buffers do not have their own mutexes, rather they rely on the
     83    789     ahrens  * hash table mutexes for the bulk of their protection (i.e. most
     84    789     ahrens  * fields in the arc_buf_hdr_t are protected by these mutexes).
     85    789     ahrens  *
     86    789     ahrens  * buf_hash_find() returns the appropriate mutex (held) when it
     87    789     ahrens  * locates the requested buffer in the hash table.  It returns
     88    789     ahrens  * NULL for the mutex if the buffer was not in the table.
     89    789     ahrens  *
     90    789     ahrens  * buf_hash_remove() expects the appropriate hash mutex to be
     91    789     ahrens  * already held before it is invoked.
     92    789     ahrens  *
     93    789     ahrens  * Each arc state also has a mutex which is used to protect the
     94    789     ahrens  * buffer list associated with the state.  When attempting to
     95    789     ahrens  * obtain a hash table lock while holding an arc list lock you
     96    789     ahrens  * must use: mutex_tryenter() to avoid deadlock.  Also note that
     97   2688     maybee  * the active state mutex must be held before the ghost state mutex.
     98    789     ahrens  *
     99   1544   eschrock  * Arc buffers may have an associated eviction callback function.
    100   1544   eschrock  * This function will be invoked prior to removing the buffer (e.g.
    101   1544   eschrock  * in arc_do_user_evicts()).  Note however that the data associated
    102   1544   eschrock  * with the buffer may be evicted prior to the callback.  The callback
    103   1544   eschrock  * must be made with *no locks held* (to prevent deadlock).  Additionally,
    104   1544   eschrock  * the users of callbacks must ensure that their private data is
    105   1544   eschrock  * protected from simultaneous callbacks from arc_buf_evict()
    106   1544   eschrock  * and arc_do_user_evicts().
    107   1544   eschrock  *
    108    789     ahrens  * Note that the majority of the performance stats are manipulated
    109    789     ahrens  * with atomic operations.
    110   5450    brendan  *
    111   5450    brendan  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
    112   5450    brendan  *
    113   5450    brendan  *	- L2ARC buflist creation
    114   5450    brendan  *	- L2ARC buflist eviction
    115   5450    brendan  *	- L2ARC write completion, which walks L2ARC buflists
    116   5450    brendan  *	- ARC header destruction, as it removes from L2ARC buflists
    117   5450    brendan  *	- ARC header release, as it removes from L2ARC buflists
    118    789     ahrens  */
    119    789     ahrens 
    120    789     ahrens #include <sys/spa.h>
    121    789     ahrens #include <sys/zio.h>
    122    789     ahrens #include <sys/zfs_context.h>
    123    789     ahrens #include <sys/arc.h>
    124    789     ahrens #include <sys/refcount.h>
    125   6643   eschrock #include <sys/vdev.h>
    126   9816     George #include <sys/vdev_impl.h>
    127    789     ahrens #ifdef _KERNEL
    128    789     ahrens #include <sys/vmsystm.h>
    129    789     ahrens #include <vm/anon.h>
    130    789     ahrens #include <sys/fs/swapnode.h>
    131   1484   ek110237 #include <sys/dnlc.h>
    132    789     ahrens #endif
    133    789     ahrens #include <sys/callb.h>
    134   3403        bmc #include <sys/kstat.h>
    135  10922       Jeff #include <zfs_fletcher.h>
    136    789     ahrens 
    137    789     ahrens static kmutex_t		arc_reclaim_thr_lock;
    138    789     ahrens static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
    139    789     ahrens static uint8_t		arc_thread_exit;
    140   6245     maybee 
    141   6245     maybee extern int zfs_write_limit_shift;
    142   6245     maybee extern uint64_t zfs_write_limit_max;
    143   7468       Mark extern kmutex_t zfs_write_limit_lock;
    144   1484   ek110237 
    145   1484   ek110237 #define	ARC_REDUCE_DNLC_PERCENT	3
    146   1484   ek110237 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
    147    789     ahrens 
    148    789     ahrens typedef enum arc_reclaim_strategy {
    149    789     ahrens 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
    150    789     ahrens 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
    151    789     ahrens } arc_reclaim_strategy_t;
    152    789     ahrens 
    153    789     ahrens /* number of seconds before growing cache again */
    154    789     ahrens static int		arc_grow_retry = 60;
    155    789     ahrens 
    156   8582    Brendan /* shift of arc_c for calculating both min and max arc_p */
    157   8582    Brendan static int		arc_p_min_shift = 4;
    158   8582    Brendan 
    159   8582    Brendan /* log2(fraction of arc to reclaim) */
    160   8582    Brendan static int		arc_shrink_shift = 5;
    161   8582    Brendan 
    162   2391     maybee /*
    163   2638     perrin  * minimum lifespan of a prefetch block in clock ticks
    164   2638     perrin  * (initialized in arc_init())
    165   2391     maybee  */
    166   2638     perrin static int		arc_min_prefetch_lifespan;
    167   2391     maybee 
    168    789     ahrens static int arc_dead;
    169   6987    brendan 
    170   6987    brendan /*
    171   6987    brendan  * The arc has filled available memory and has now warmed up.
    172   6987    brendan  */
    173   6987    brendan static boolean_t arc_warm;
    174   2885     ahrens 
    175   2885     ahrens /*
    176   2885     ahrens  * These tunables are for performance analysis.
    177   2885     ahrens  */
    178   2885     ahrens uint64_t zfs_arc_max;
    179   2885     ahrens uint64_t zfs_arc_min;
    180   4645   ek110237 uint64_t zfs_arc_meta_limit = 0;
    181   8582    Brendan int zfs_arc_grow_retry = 0;
    182   8582    Brendan int zfs_arc_shrink_shift = 0;
    183   8582    Brendan int zfs_arc_p_min_shift = 0;
    184    789     ahrens 
    185    789     ahrens /*
    186   5450    brendan  * Note that buffers can be in one of 6 states:
    187    789     ahrens  *	ARC_anon	- anonymous (discussed below)
    188   1544   eschrock  *	ARC_mru		- recently used, currently cached
    189   1544   eschrock  *	ARC_mru_ghost	- recently used, no longer in cache
    190   1544   eschrock  *	ARC_mfu		- frequently used, currently cached
    191   1544   eschrock  *	ARC_mfu_ghost	- frequently used, no longer in cache
    192   5450    brendan  *	ARC_l2c_only	- exists in L2ARC but not other states
    193   4309     maybee  * When there are no active references to a buffer, it is
    194   4309     maybee  * linked onto a list in one of these arc states.  These are
    195   4309     maybee  * the only buffers that can be evicted or deleted.  Within each
    196   4309     maybee  * state there are multiple lists, one for meta-data and one for
    197   4309     maybee  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
    198   4309     maybee  * etc.) is tracked separately so that it can be managed more
    199   5450    brendan  * explicitly: favored over data, limited explicitly.
    200    789     ahrens  *
    201    789     ahrens  * Anonymous buffers are buffers that are not associated with
    202    789     ahrens  * a DVA.  These are buffers that hold dirty block copies
    203    789     ahrens  * before they are written to stable storage.  By definition,
    204   1544   eschrock  * they are "ref'd" and are considered part of arc_mru
    205    789     ahrens  * that cannot be freed.  Generally, they will acquire a DVA
    206   1544   eschrock  * as they are written and migrate onto the arc_mru list.
    207   5450    brendan  *
    208   5450    brendan  * The ARC_l2c_only state is for buffers that are in the second
    209   5450    brendan  * level ARC but no longer in any of the ARC_m* lists.  The second
    210   5450    brendan  * level ARC itself may also contain buffers that are in any of
    211   5450    brendan  * the ARC_m* states - meaning that a buffer can exist in two
    212   5450    brendan  * places.  The reason for the ARC_l2c_only state is to keep the
    213   5450    brendan  * buffer header in the hash table, so that reads that hit the
    214   5450    brendan  * second level ARC benefit from these fast lookups.
    215    789     ahrens  */
    216    789     ahrens 
/*
 * Bookkeeping for one ARC state: a list of evictable buffers and a byte
 * count for each buffer contents type, the total size of the state, and
 * the mutex that protects the state's buffer lists.
 */
typedef struct arc_state {
	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	kmutex_t arcs_mtx;	/* protects the buffer lists of this state */
} arc_state_t;
    223    789     ahrens 
    224   5450    brendan /* The 6 states: */
    225    789     ahrens static arc_state_t ARC_anon;
    226   1544   eschrock static arc_state_t ARC_mru;
    227   1544   eschrock static arc_state_t ARC_mru_ghost;
    228   1544   eschrock static arc_state_t ARC_mfu;
    229   1544   eschrock static arc_state_t ARC_mfu_ghost;
    230   5450    brendan static arc_state_t ARC_l2c_only;
    231    789     ahrens 
/*
 * ARC statistics, one kstat_named_t per statistic.  The arc_stats
 * instance is initialized positionally, so members must stay in the same
 * order as the entries of that initializer.
 */
typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_recycle_miss;
	kstat_named_t arcstat_mutex_miss;
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	/* p, c, c_min, c_max and size back the arc_p/arc_c/... macros */
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	kstat_named_t arcstat_hdr_size;
	kstat_named_t arcstat_data_size;
	kstat_named_t arcstat_other_size;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_hdr_miss;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_size;
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_memory_throttle_count;
} arc_stats_t;
    287    789     ahrens 
/*
 * Name and data type of every ARC statistic.  The entries are positional:
 * they must stay in the same order as the members of arc_stats_t, and each
 * name is the member name without its "arcstat_" prefix.
 */
static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "recycle_miss",		KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
	{ "hdr_size",			KSTAT_DATA_UINT64 },
	{ "data_size",			KSTAT_DATA_UINT64 },
	{ "other_size",			KSTAT_DATA_UINT64 },
	{ "l2_hits",			KSTAT_DATA_UINT64 },
	{ "l2_misses",			KSTAT_DATA_UINT64 },
	{ "l2_feeds",			KSTAT_DATA_UINT64 },
	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 }
};
    343    789     ahrens 
    344   3403        bmc #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
    345    789     ahrens 
    346   3403        bmc #define	ARCSTAT_INCR(stat, val) \
    347   3403        bmc 	atomic_add_64(&arc_stats.stat.value.ui64, (val));
    348   3403        bmc 
    349  10922       Jeff #define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
    350   3403        bmc #define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
    351   3403        bmc 
    352   3403        bmc #define	ARCSTAT_MAX(stat, val) {					\
    353   3403        bmc 	uint64_t m;							\
    354   3403        bmc 	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
    355   3403        bmc 	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
    356   3403        bmc 		continue;						\
    357   3403        bmc }
    358   3403        bmc 
    359   3403        bmc #define	ARCSTAT_MAXSTAT(stat) \
    360   3403        bmc 	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
    361   3403        bmc 
    362   3403        bmc /*
    363   3403        bmc  * We define a macro to allow ARC hits/misses to be easily broken down by
    364   3403        bmc  * two separate conditions, giving a total of four different subtypes for
    365   3403        bmc  * each of hits and misses (so eight statistics total).
    366   3403        bmc  */
    367   3403        bmc #define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
    368   3403        bmc 	if (cond1) {							\
    369   3403        bmc 		if (cond2) {						\
    370   3403        bmc 			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
    371   3403        bmc 		} else {						\
    372   3403        bmc 			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
    373   3403        bmc 		}							\
    374   3403        bmc 	} else {							\
    375   3403        bmc 		if (cond2) {						\
    376   3403        bmc 			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
    377   3403        bmc 		} else {						\
    378   3403        bmc 			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
    379   3403        bmc 		}							\
    380   3403        bmc 	}
    381   3403        bmc 
    382   3403        bmc kstat_t			*arc_ksp;
    383  10922       Jeff static arc_state_t	*arc_anon;
    384   3403        bmc static arc_state_t	*arc_mru;
    385   3403        bmc static arc_state_t	*arc_mru_ghost;
    386   3403        bmc static arc_state_t	*arc_mfu;
    387   3403        bmc static arc_state_t	*arc_mfu_ghost;
    388   5450    brendan static arc_state_t	*arc_l2c_only;
    389   3403        bmc 
    390   3403        bmc /*
    391   3403        bmc  * There are several ARC variables that are critical to export as kstats --
    392   3403        bmc  * but we don't want to have to grovel around in the kstat whenever we wish to
    393   3403        bmc  * manipulate them.  For these variables, we therefore define them to be in
    394   3403        bmc  * terms of the statistic variable.  This assures that we are not introducing
    395   3403        bmc  * the possibility of inconsistency by having shadow copies of the variables,
    396   3403        bmc  * while still allowing the code to be readable.
    397   3403        bmc  */
    398   3403        bmc #define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
    399   3403        bmc #define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
    400   3403        bmc #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
    401   3403        bmc #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
    402   3403        bmc #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
    403   3403        bmc 
    404   3403        bmc static int		arc_no_grow;	/* Don't try to grow cache size */
    405   3403        bmc static uint64_t		arc_tempreserve;
    406   9412  Aleksandr static uint64_t		arc_loaned_bytes;
    407   4309     maybee static uint64_t		arc_meta_used;
    408   4309     maybee static uint64_t		arc_meta_limit;
    409   4309     maybee static uint64_t		arc_meta_max = 0;
    410   5450    brendan 
    411   5450    brendan typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
    412    789     ahrens 
typedef struct arc_callback arc_callback_t;

/*
 * Callback record.  The b_acb member of struct arc_buf_hdr points at one
 * of these, and acb_next points at another record of the same type.
 * NOTE(review): presumably a singly linked list of pending completion
 * callbacks for a header -- confirm against the users of b_acb.
 */
struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

/*
 * Write callback record: a private pointer, a "ready" and a "done"
 * function (both of type arc_done_func_t) and a buffer pointer.
 * NOTE(review): when each function is invoked is not visible from this
 * declaration -- confirm against the write path.
 */
struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};
    431    789     ahrens 
/*
 * ARC buffer header (referred to elsewhere in this file as
 * arc_buf_hdr_t).  The comment heading each group of members names the
 * rule that protects that group.
 */
struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	uint64_t		b_cksum0;

	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_hdr_t		*b_hash_next;
	arc_buf_t		*b_buf;
	/* ARC_* flag bits; tested through the HDR_*() macros */
	uint32_t		b_flags;
	uint32_t		b_datacnt;

	arc_callback_t		*b_acb;
	kcondvar_t		b_cv;

	/* immutable */
	arc_buf_contents_t	b_type;
	uint64_t		b_size;
	uint64_t		b_spa;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	/*
	 * L2ARC members; HDR_L2_READING() tests b_l2hdr against NULL.
	 * NOTE(review): no protection rule is stated for these two -- confirm
	 * whether l2arc_buflist_mtx and/or the hash lock applies.
	 */
	l2arc_buf_hdr_t		*b_l2hdr;
	list_node_t		b_l2node;
};
    467    789     ahrens 
    468   1544   eschrock static arc_buf_t *arc_eviction_list;
    469   1544   eschrock static kmutex_t arc_eviction_mtx;
    470   2887     maybee static arc_buf_hdr_t arc_eviction_hdr;
    471   2688     maybee static void arc_get_data_buf(arc_buf_t *buf);
    472   2688     maybee static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
    473   4309     maybee static int arc_evict_needed(arc_buf_contents_t type);
    474   8636       Mark static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
    475   1544   eschrock 
    476  10357    Brendan static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
    477  10357    Brendan 
    478   1544   eschrock #define	GHOST_STATE(state)	\
    479   5450    brendan 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
    480   5450    brendan 	(state) == arc_l2c_only)
    481   1544   eschrock 
    482    789     ahrens /*
    483    789     ahrens  * Private ARC flags.  These flags are private ARC only flags that will show up
    484    789     ahrens  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
    485    789     ahrens  * be passed in as arc_flags in things like arc_read.  However, these flags
    486    789     ahrens  * should never be passed and should only be set by ARC code.  When adding new
    487    789     ahrens  * public flags, make sure not to smash the private ones.
    488    789     ahrens  */
    489    789     ahrens 
    490   1544   eschrock #define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
    491    789     ahrens #define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
    492    789     ahrens #define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
    493    789     ahrens #define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
    494   1544   eschrock #define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
    495   2391     maybee #define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
    496   5450    brendan #define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
    497   7237   ek110237 #define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
    498   7237   ek110237 #define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
    499   7237   ek110237 #define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
    500    789     ahrens 
    501   1544   eschrock #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
    502    789     ahrens #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
    503    789     ahrens #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
    504   8582    Brendan #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
    505    789     ahrens #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
    506   1544   eschrock #define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
    507   5450    brendan #define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
    508   7237   ek110237 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
    509   6987    brendan #define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
    510   6987    brendan 				    (hdr)->b_l2hdr != NULL)
    511   5450    brendan #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
    512   5450    brendan #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
    513   5450    brendan #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
    514    789     ahrens 
    515    789     ahrens /*
    516   6018    brendan  * Other sizes
    517   6018    brendan  */
    518   6018    brendan 
    519   6018    brendan #define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
    520   6018    brendan #define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
    521   6018    brendan 
    522   6018    brendan /*
    523    789     ahrens  * Hash table routines
    524    789     ahrens  */
    525    789     ahrens 
    526    789     ahrens #define	HT_LOCK_PAD	64
    527    789     ahrens 
    528    789     ahrens struct ht_lock {
    529    789     ahrens 	kmutex_t	ht_lock;
    530    789     ahrens #ifdef _KERNEL
    531    789     ahrens 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
    532    789     ahrens #endif
    533    789     ahrens };
    534    789     ahrens 
    535    789     ahrens #define	BUF_LOCKS 256
    536    789     ahrens typedef struct buf_hash_table {
    537    789     ahrens 	uint64_t ht_mask;
    538    789     ahrens 	arc_buf_hdr_t **ht_table;
    539    789     ahrens 	struct ht_lock ht_locks[BUF_LOCKS];
    540    789     ahrens } buf_hash_table_t;
    541    789     ahrens 
    542    789     ahrens static buf_hash_table_t buf_hash_table;
    543    789     ahrens 
    544    789     ahrens #define	BUF_HASH_INDEX(spa, dva, birth) \
    545    789     ahrens 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
    546    789     ahrens #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
    547    789     ahrens #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
    548    789     ahrens #define	HDR_LOCK(buf) \
    549    789     ahrens 	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
    550    789     ahrens 
    551    789     ahrens uint64_t zfs_crc64_table[256];
    552    789     ahrens 
    553   5450    brendan /*
    554   5450    brendan  * Level 2 ARC
    555   5450    brendan  */
    556   5450    brendan 
    557   5450    brendan #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
    558   8582    Brendan #define	L2ARC_HEADROOM		2		/* num of writes */
    559   8582    Brendan #define	L2ARC_FEED_SECS		1		/* caching interval secs */
    560   8582    Brendan #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
    561   5450    brendan 
    562   5450    brendan #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
    563   5450    brendan #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
    564   5450    brendan 
    565   5450    brendan /*
    566   5450    brendan  * L2ARC Performance Tunables
    567   5450    brendan  */
    568   5450    brendan uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
    569   6987    brendan uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
    570   5450    brendan uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
    571   5450    brendan uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
    572   8582    Brendan uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
    573   5450    brendan boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
    574   8582    Brendan boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
    575   8582    Brendan boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
    576   5450    brendan 
    577   5450    brendan /*
    578   5450    brendan  * L2ARC Internals
    579   5450    brendan  */
    580   5450    brendan typedef struct l2arc_dev {
    581   5450    brendan 	vdev_t			*l2ad_vdev;	/* vdev */
    582   5450    brendan 	spa_t			*l2ad_spa;	/* spa */
    583   5450    brendan 	uint64_t		l2ad_hand;	/* next write location */
    584   5450    brendan 	uint64_t		l2ad_write;	/* desired write size, bytes */
    585   6987    brendan 	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
    586   5450    brendan 	uint64_t		l2ad_start;	/* first addr on device */
    587   5450    brendan 	uint64_t		l2ad_end;	/* last addr on device */
    588   5450    brendan 	uint64_t		l2ad_evict;	/* last addr eviction reached */
    589   5450    brendan 	boolean_t		l2ad_first;	/* first sweep through */
    590   8582    Brendan 	boolean_t		l2ad_writing;	/* currently writing */
    591   5450    brendan 	list_t			*l2ad_buflist;	/* buffer list */
    592   5450    brendan 	list_node_t		l2ad_node;	/* device list node */
    593   5450    brendan } l2arc_dev_t;
    594   5450    brendan 
    595   5450    brendan static list_t L2ARC_dev_list;			/* device list */
    596   5450    brendan static list_t *l2arc_dev_list;			/* device list pointer */
    597   5450    brendan static kmutex_t l2arc_dev_mtx;			/* device list mutex */
    598   5450    brendan static l2arc_dev_t *l2arc_dev_last;		/* last device used */
    599   5450    brendan static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
    600   5450    brendan static list_t L2ARC_free_on_write;		/* free after write buf list */
    601   5450    brendan static list_t *l2arc_free_on_write;		/* free after write list ptr */
    602   5450    brendan static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
    603   5450    brendan static uint64_t l2arc_ndev;			/* number of devices */
    604   5450    brendan 
    605   5450    brendan typedef struct l2arc_read_callback {
    606   5450    brendan 	arc_buf_t	*l2rcb_buf;		/* read buffer */
    607   5450    brendan 	spa_t		*l2rcb_spa;		/* spa */
    608   5450    brendan 	blkptr_t	l2rcb_bp;		/* original blkptr */
    609   5450    brendan 	zbookmark_t	l2rcb_zb;		/* original bookmark */
    610   5450    brendan 	int		l2rcb_flags;		/* original flags */
    611   5450    brendan } l2arc_read_callback_t;
    612   5450    brendan 
    613   5450    brendan typedef struct l2arc_write_callback {
    614   5450    brendan 	l2arc_dev_t	*l2wcb_dev;		/* device info */
    615   5450    brendan 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
    616   5450    brendan } l2arc_write_callback_t;
    617   5450    brendan 
    618   5450    brendan struct l2arc_buf_hdr {
    619   5450    brendan 	/* protected by arc_buf_hdr  mutex */
    620   5450    brendan 	l2arc_dev_t	*b_dev;			/* L2ARC device */
    621   9215     George 	uint64_t	b_daddr;		/* disk address, offset byte */
    622   5450    brendan };
    623   5450    brendan 
    624   5450    brendan typedef struct l2arc_data_free {
    625   5450    brendan 	/* protected by l2arc_free_on_write_mtx */
    626   5450    brendan 	void		*l2df_data;
    627   5450    brendan 	size_t		l2df_size;
    628   5450    brendan 	void		(*l2df_func)(void *, size_t);
    629   5450    brendan 	list_node_t	l2df_list_node;
    630   5450    brendan } l2arc_data_free_t;
    631   5450    brendan 
    632   5450    brendan static kmutex_t l2arc_feed_thr_lock;
    633   5450    brendan static kcondvar_t l2arc_feed_thr_cv;
    634   5450    brendan static uint8_t l2arc_thread_exit;
    635   5450    brendan 
    636   5450    brendan static void l2arc_read_done(zio_t *zio);
    637   5450    brendan static void l2arc_hdr_stat_add(void);
    638   5450    brendan static void l2arc_hdr_stat_remove(void);
    639   5450    brendan 
    640    789     ahrens static uint64_t
    641   8636       Mark buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
    642   8636       Mark {
    643    789     ahrens 	uint8_t *vdva = (uint8_t *)dva;
    644    789     ahrens 	uint64_t crc = -1ULL;
    645    789     ahrens 	int i;
    646    789     ahrens 
    647    789     ahrens 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
    648    789     ahrens 
    649    789     ahrens 	for (i = 0; i < sizeof (dva_t); i++)
    650    789     ahrens 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
    651    789     ahrens 
    652   8636       Mark 	crc ^= (spa>>8) ^ birth;
    653    789     ahrens 
    654    789     ahrens 	return (crc);
    655    789     ahrens }
    656    789     ahrens 
    657    789     ahrens #define	BUF_EMPTY(buf)						\
    658    789     ahrens 	((buf)->b_dva.dva_word[0] == 0 &&			\
    659    789     ahrens 	(buf)->b_dva.dva_word[1] == 0 &&			\
    660    789     ahrens 	(buf)->b_birth == 0)
    661    789     ahrens 
    662    789     ahrens #define	BUF_EQUAL(spa, dva, birth, buf)				\
    663    789     ahrens 	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
    664    789     ahrens 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
    665    789     ahrens 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
    666    789     ahrens 
    667    789     ahrens static arc_buf_hdr_t *
    668   8636       Mark buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
    669    789     ahrens {
    670    789     ahrens 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
    671    789     ahrens 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
    672    789     ahrens 	arc_buf_hdr_t *buf;
    673    789     ahrens 
    674    789     ahrens 	mutex_enter(hash_lock);
    675    789     ahrens 	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
    676    789     ahrens 	    buf = buf->b_hash_next) {
    677    789     ahrens 		if (BUF_EQUAL(spa, dva, birth, buf)) {
    678    789     ahrens 			*lockp = hash_lock;
    679    789     ahrens 			return (buf);
    680    789     ahrens 		}
    681    789     ahrens 	}
    682    789     ahrens 	mutex_exit(hash_lock);
    683    789     ahrens 	*lockp = NULL;
    684    789     ahrens 	return (NULL);
    685    789     ahrens }
    686    789     ahrens 
    687    789     ahrens /*
    688    789     ahrens  * Insert an entry into the hash table.  If there is already an element
    689    789     ahrens  * equal to elem in the hash table, then the already existing element
    690    789     ahrens  * will be returned and the new element will not be inserted.
    691    789     ahrens  * Otherwise returns NULL.
    692    789     ahrens  */
    693    789     ahrens static arc_buf_hdr_t *
    694    789     ahrens buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
    695    789     ahrens {
    696    789     ahrens 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
    697    789     ahrens 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
    698    789     ahrens 	arc_buf_hdr_t *fbuf;
    699   3403        bmc 	uint32_t i;
    700    789     ahrens 
    701   1544   eschrock 	ASSERT(!HDR_IN_HASH_TABLE(buf));
    702    789     ahrens 	*lockp = hash_lock;
    703    789     ahrens 	mutex_enter(hash_lock);
    704    789     ahrens 	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
    705    789     ahrens 	    fbuf = fbuf->b_hash_next, i++) {
    706    789     ahrens 		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
    707    789     ahrens 			return (fbuf);
    708    789     ahrens 	}
    709    789     ahrens 
    710    789     ahrens 	buf->b_hash_next = buf_hash_table.ht_table[idx];
    711    789     ahrens 	buf_hash_table.ht_table[idx] = buf;
    712   1544   eschrock 	buf->b_flags |= ARC_IN_HASH_TABLE;
    713    789     ahrens 
    714    789     ahrens 	/* collect some hash table performance data */
    715    789     ahrens 	if (i > 0) {
    716   3403        bmc 		ARCSTAT_BUMP(arcstat_hash_collisions);
    717    789     ahrens 		if (i == 1)
    718   3403        bmc 			ARCSTAT_BUMP(arcstat_hash_chains);
    719   3403        bmc 
    720   3403        bmc 		ARCSTAT_MAX(arcstat_hash_chain_max, i);
    721    789     ahrens 	}
    722   3403        bmc 
    723   3403        bmc 	ARCSTAT_BUMP(arcstat_hash_elements);
    724   3403        bmc 	ARCSTAT_MAXSTAT(arcstat_hash_elements);
    725    789     ahrens 
    726    789     ahrens 	return (NULL);
    727    789     ahrens }
    728    789     ahrens 
    729    789     ahrens static void
    730    789     ahrens buf_hash_remove(arc_buf_hdr_t *buf)
    731    789     ahrens {
    732    789     ahrens 	arc_buf_hdr_t *fbuf, **bufp;
    733    789     ahrens 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
    734    789     ahrens 
    735    789     ahrens 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
    736   1544   eschrock 	ASSERT(HDR_IN_HASH_TABLE(buf));
    737    789     ahrens 
    738    789     ahrens 	bufp = &buf_hash_table.ht_table[idx];
    739    789     ahrens 	while ((fbuf = *bufp) != buf) {
    740    789     ahrens 		ASSERT(fbuf != NULL);
    741    789     ahrens 		bufp = &fbuf->b_hash_next;
    742    789     ahrens 	}
    743    789     ahrens 	*bufp = buf->b_hash_next;
    744    789     ahrens 	buf->b_hash_next = NULL;
    745   1544   eschrock 	buf->b_flags &= ~ARC_IN_HASH_TABLE;
    746    789     ahrens 
    747    789     ahrens 	/* collect some hash table performance data */
    748   3403        bmc 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
    749   3403        bmc 
    750    789     ahrens 	if (buf_hash_table.ht_table[idx] &&
    751    789     ahrens 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
    752   3403        bmc 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
    753    789     ahrens }
    754    789     ahrens 
    755    789     ahrens /*
    756    789     ahrens  * Global data structures and functions for the buf kmem cache.
    757    789     ahrens  */
    758    789     ahrens static kmem_cache_t *hdr_cache;
    759    789     ahrens static kmem_cache_t *buf_cache;
    760    789     ahrens 
    761    789     ahrens static void
    762    789     ahrens buf_fini(void)
    763    789     ahrens {
    764    789     ahrens 	int i;
    765    789     ahrens 
    766    789     ahrens 	kmem_free(buf_hash_table.ht_table,
    767    789     ahrens 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
    768    789     ahrens 	for (i = 0; i < BUF_LOCKS; i++)
    769    789     ahrens 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
    770    789     ahrens 	kmem_cache_destroy(hdr_cache);
    771    789     ahrens 	kmem_cache_destroy(buf_cache);
    772    789     ahrens }
    773    789     ahrens 
    774    789     ahrens /*
    775    789     ahrens  * Constructor callback - called when the cache is empty
    776    789     ahrens  * and a new buf is requested.
    777    789     ahrens  */
    778    789     ahrens /* ARGSUSED */
    779    789     ahrens static int
    780    789     ahrens hdr_cons(void *vbuf, void *unused, int kmflag)
    781    789     ahrens {
    782    789     ahrens 	arc_buf_hdr_t *buf = vbuf;
    783    789     ahrens 
    784    789     ahrens 	bzero(buf, sizeof (arc_buf_hdr_t));
    785    789     ahrens 	refcount_create(&buf->b_refcnt);
    786    789     ahrens 	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
    787   4831    gw25295 	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
    788   8582    Brendan 	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
    789   8582    Brendan 
    790   7545       Mark 	return (0);
    791   7545       Mark }
    792   7545       Mark 
    793   7545       Mark /* ARGSUSED */
    794   7545       Mark static int
    795   7545       Mark buf_cons(void *vbuf, void *unused, int kmflag)
    796   7545       Mark {
    797   7545       Mark 	arc_buf_t *buf = vbuf;
    798   7545       Mark 
    799   7545       Mark 	bzero(buf, sizeof (arc_buf_t));
    800   7545       Mark 	rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
    801   8582    Brendan 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
    802   8582    Brendan 
    803    789     ahrens 	return (0);
    804    789     ahrens }
    805    789     ahrens 
    806    789     ahrens /*
    807    789     ahrens  * Destructor callback - called when a cached buf is
    808    789     ahrens  * no longer required.
    809    789     ahrens  */
    810    789     ahrens /* ARGSUSED */
    811    789     ahrens static void
    812    789     ahrens hdr_dest(void *vbuf, void *unused)
    813    789     ahrens {
    814    789     ahrens 	arc_buf_hdr_t *buf = vbuf;
    815    789     ahrens 
    816  10922       Jeff 	ASSERT(BUF_EMPTY(buf));
    817    789     ahrens 	refcount_destroy(&buf->b_refcnt);
    818    789     ahrens 	cv_destroy(&buf->b_cv);
    819   4831    gw25295 	mutex_destroy(&buf->b_freeze_lock);
    820   8582    Brendan 	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
    821   7545       Mark }
    822   7545       Mark 
    823   7545       Mark /* ARGSUSED */
    824   7545       Mark static void
    825   7545       Mark buf_dest(void *vbuf, void *unused)
    826   7545       Mark {
    827   7545       Mark 	arc_buf_t *buf = vbuf;
    828   7545       Mark 
    829   7545       Mark 	rw_destroy(&buf->b_lock);
    830   8582    Brendan 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
    831    789     ahrens }
    832    789     ahrens 
    833    789     ahrens /*
    834    789     ahrens  * Reclaim callback -- invoked when memory is low.
    835    789     ahrens  */
    836    789     ahrens /* ARGSUSED */
    837    789     ahrens static void
    838    789     ahrens hdr_recl(void *unused)
    839    789     ahrens {
    840    789     ahrens 	dprintf("hdr_recl called\n");
    841   3158     maybee 	/*
    842   3158     maybee 	 * umem calls the reclaim func when we destroy the buf cache,
    843   3158     maybee 	 * which is after we do arc_fini().
    844   3158     maybee 	 */
    845   3158     maybee 	if (!arc_dead)
    846   3158     maybee 		cv_signal(&arc_reclaim_thr_cv);
    847    789     ahrens }
    848    789     ahrens 
    849    789     ahrens static void
    850    789     ahrens buf_init(void)
    851    789     ahrens {
    852    789     ahrens 	uint64_t *ct;
    853   1544   eschrock 	uint64_t hsize = 1ULL << 12;
    854    789     ahrens 	int i, j;
    855    789     ahrens 
    856    789     ahrens 	/*
    857    789     ahrens 	 * The hash table is big enough to fill all of physical memory
    858   1544   eschrock 	 * with an average 64K block size.  The table will take up
    859   1544   eschrock 	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
    860    789     ahrens 	 */
    861   1544   eschrock 	while (hsize * 65536 < physmem * PAGESIZE)
    862    789     ahrens 		hsize <<= 1;
    863   1544   eschrock retry:
    864    789     ahrens 	buf_hash_table.ht_mask = hsize - 1;
    865   1544   eschrock 	buf_hash_table.ht_table =
    866   1544   eschrock 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
    867   1544   eschrock 	if (buf_hash_table.ht_table == NULL) {
    868   1544   eschrock 		ASSERT(hsize > (1ULL << 8));
    869   1544   eschrock 		hsize >>= 1;
    870   1544   eschrock 		goto retry;
    871   1544   eschrock 	}
    872    789     ahrens 
    873    789     ahrens 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
    874    789     ahrens 	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
    875    789     ahrens 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
    876   7545       Mark 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
    877    789     ahrens 
    878    789     ahrens 	for (i = 0; i < 256; i++)
    879    789     ahrens 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
    880    789     ahrens 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
    881    789     ahrens 
    882    789     ahrens 	for (i = 0; i < BUF_LOCKS; i++) {
    883    789     ahrens 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
    884    789     ahrens 		    NULL, MUTEX_DEFAULT, NULL);
    885    789     ahrens 	}
    886    789     ahrens }
    887    789     ahrens 
    888    789     ahrens #define	ARC_MINTIME	(hz>>4) /* 62 ms */
    889    789     ahrens 
    890    789     ahrens static void
    891   3093     ahrens arc_cksum_verify(arc_buf_t *buf)
    892   3093     ahrens {
    893   3093     ahrens 	zio_cksum_t zc;
    894   3093     ahrens 
    895   3312     ahrens 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
    896   3093     ahrens 		return;
    897   3093     ahrens 
    898   3093     ahrens 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    899   3265     ahrens 	if (buf->b_hdr->b_freeze_cksum == NULL ||
    900   3265     ahrens 	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
    901   3093     ahrens 		mutex_exit(&buf->b_hdr->b_freeze_lock);
    902   3093     ahrens 		return;
    903   3093     ahrens 	}
    904   3093     ahrens 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
    905   3093     ahrens 	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
    906   3093     ahrens 		panic("buffer modified while frozen!");
    907   3093     ahrens 	mutex_exit(&buf->b_hdr->b_freeze_lock);
    908   3093     ahrens }
    909   3093     ahrens 
    910   5450    brendan static int
    911   5450    brendan arc_cksum_equal(arc_buf_t *buf)
    912   5450    brendan {
    913   5450    brendan 	zio_cksum_t zc;
    914   5450    brendan 	int equal;
    915   5450    brendan 
    916   5450    brendan 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    917   5450    brendan 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
    918   5450    brendan 	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
    919   5450    brendan 	mutex_exit(&buf->b_hdr->b_freeze_lock);
    920   5450    brendan 
    921   5450    brendan 	return (equal);
    922   5450    brendan }
    923   5450    brendan 
    924   5450    brendan static void
    925   5450    brendan arc_cksum_compute(arc_buf_t *buf, boolean_t force)
    926   5450    brendan {
    927   5450    brendan 	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
    928   3093     ahrens 		return;
    929   3093     ahrens 
    930   3093     ahrens 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    931   3093     ahrens 	if (buf->b_hdr->b_freeze_cksum != NULL) {
    932   3093     ahrens 		mutex_exit(&buf->b_hdr->b_freeze_lock);
    933   3093     ahrens 		return;
    934   3093     ahrens 	}
    935   3093     ahrens 	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
    936   3093     ahrens 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
    937   3093     ahrens 	    buf->b_hdr->b_freeze_cksum);
    938   3093     ahrens 	mutex_exit(&buf->b_hdr->b_freeze_lock);
    939   3093     ahrens }
    940   3093     ahrens 
    941   3093     ahrens void
    942   3093     ahrens arc_buf_thaw(arc_buf_t *buf)
    943   3093     ahrens {
    944   5450    brendan 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
    945   5450    brendan 		if (buf->b_hdr->b_state != arc_anon)
    946   5450    brendan 			panic("modifying non-anon buffer!");
    947   5450    brendan 		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
    948   5450    brendan 			panic("modifying buffer while i/o in progress!");
    949   5450    brendan 		arc_cksum_verify(buf);
    950   5450    brendan 	}
    951   5450    brendan 
    952   3093     ahrens 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    953   3093     ahrens 	if (buf->b_hdr->b_freeze_cksum != NULL) {
    954   3093     ahrens 		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
    955   3093     ahrens 		buf->b_hdr->b_freeze_cksum = NULL;
    956   3093     ahrens 	}
    957   3093     ahrens 	mutex_exit(&buf->b_hdr->b_freeze_lock);
    958   3093     ahrens }
    959   3093     ahrens 
    960   3093     ahrens void
    961   3093     ahrens arc_buf_freeze(arc_buf_t *buf)
    962   3093     ahrens {
    963   3312     ahrens 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
    964   3312     ahrens 		return;
    965   3312     ahrens 
    966   3093     ahrens 	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
    967   3403        bmc 	    buf->b_hdr->b_state == arc_anon);
    968   5450    brendan 	arc_cksum_compute(buf, B_FALSE);
    969   3093     ahrens }
    970   3093     ahrens 
    971   3093     ahrens static void
    972    789     ahrens add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
    973    789     ahrens {
    974    789     ahrens 	ASSERT(MUTEX_HELD(hash_lock));
    975    789     ahrens 
    976    789     ahrens 	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
    977   3403        bmc 	    (ab->b_state != arc_anon)) {
    978   3700   ek110237 		uint64_t delta = ab->b_size * ab->b_datacnt;
    979   4309     maybee 		list_t *list = &ab->b_state->arcs_list[ab->b_type];
    980   4309     maybee 		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
    981    789     ahrens 
    982   3403        bmc 		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
    983   3403        bmc 		mutex_enter(&ab->b_state->arcs_mtx);
    984    789     ahrens 		ASSERT(list_link_active(&ab->b_arc_node));
    985   4309     maybee 		list_remove(list, ab);
    986   1544   eschrock 		if (GHOST_STATE(ab->b_state)) {
    987   1544   eschrock 			ASSERT3U(ab->b_datacnt, ==, 0);
    988   1544   eschrock 			ASSERT3P(ab->b_buf, ==, NULL);
    989   1544   eschrock 			delta = ab->b_size;
    990   1544   eschrock 		}
    991   1544   eschrock 		ASSERT(delta > 0);
    992   4309     maybee 		ASSERT3U(*size, >=, delta);
    993   4309     maybee 		atomic_add_64(size, -delta);
    994   3403        bmc 		mutex_exit(&ab->b_state->arcs_mtx);
    995   7046     ahrens 		/* remove the prefetch flag if we get a reference */
    996   2391     maybee 		if (ab->b_flags & ARC_PREFETCH)
    997   2391     maybee 			ab->b_flags &= ~ARC_PREFETCH;
    998    789     ahrens 	}
    999    789     ahrens }
   1000    789     ahrens 
   1001    789     ahrens static int
   1002    789     ahrens remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
   1003    789     ahrens {
   1004    789     ahrens 	int cnt;
   1005   3403        bmc 	arc_state_t *state = ab->b_state;
   1006    789     ahrens 
   1007   3403        bmc 	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
   1008   3403        bmc 	ASSERT(!GHOST_STATE(state));
   1009    789     ahrens 
   1010    789     ahrens 	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
   1011   3403        bmc 	    (state != arc_anon)) {
   1012   4309     maybee 		uint64_t *size = &state->arcs_lsize[ab->b_type];
   1013   4309     maybee 
   1014   3403        bmc 		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
   1015   3403        bmc 		mutex_enter(&state->arcs_mtx);
   1016    789     ahrens 		ASSERT(!list_link_active(&ab->b_arc_node));
   1017   4309     maybee 		list_insert_head(&state->arcs_list[ab->b_type], ab);
   1018   1544   eschrock 		ASSERT(ab->b_datacnt > 0);
   1019   4309     maybee 		atomic_add_64(size, ab->b_size * ab->b_datacnt);
   1020   3403        bmc 		mutex_exit(&state->arcs_mtx);
   1021    789     ahrens 	}
   1022    789     ahrens 	return (cnt);
   1023    789     ahrens }
   1024    789     ahrens 
   1025    789     ahrens /*
   1026    789     ahrens  * Move the supplied buffer to the indicated state.  The mutex
   1027    789     ahrens  * for the buffer must be held by the caller.
   1028    789     ahrens  */
   1029    789     ahrens static void
   1030   1544   eschrock arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
   1031    789     ahrens {
   1032   1544   eschrock 	arc_state_t *old_state = ab->b_state;
   1033   3700   ek110237 	int64_t refcnt = refcount_count(&ab->b_refcnt);
   1034   3700   ek110237 	uint64_t from_delta, to_delta;
   1035    789     ahrens 
   1036    789     ahrens 	ASSERT(MUTEX_HELD(hash_lock));
   1037   1544   eschrock 	ASSERT(new_state != old_state);
   1038   1544   eschrock 	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
   1039   1544   eschrock 	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
   1040  10922       Jeff 	ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon);
   1041  10922       Jeff 	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
   1042   1544   eschrock 
   1043   1544   eschrock 	from_delta = to_delta = ab->b_datacnt * ab->b_size;
   1044    789     ahrens 
   1045    789     ahrens 	/*
   1046    789     ahrens 	 * If this buffer is evictable, transfer it from the
   1047    789     ahrens 	 * old state list to the new state list.
   1048    789     ahrens 	 */
   1049   1544   eschrock 	if (refcnt == 0) {
   1050   3403        bmc 		if (old_state != arc_anon) {
   1051   3403        bmc 			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
   1052   4309     maybee 			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
   1053    789     ahrens 
   1054   1544   eschrock 			if (use_mutex)
   1055   3403        bmc 				mutex_enter(&old_state->arcs_mtx);
   1056   1544   eschrock 
   1057   1544   eschrock 			ASSERT(list_link_active(&ab->b_arc_node));
   1058   4309     maybee 			list_remove(&old_state->arcs_list[ab->b_type], ab);
   1059   1544   eschrock 
   1060   2391     maybee 			/*
   1061   2391     maybee 			 * If prefetching out of the ghost cache,
   1062   2391     maybee 			 * we will have a non-null datacnt.
   1063   2391     maybee 			 */
   1064   2391     maybee 			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
   1065   2391     maybee 				/* ghost elements have a ghost size */
   1066   1544   eschrock 				ASSERT(ab->b_buf == NULL);
   1067   1544   eschrock 				from_delta = ab->b_size;
   1068    789     ahrens 			}
   1069   4309     maybee 			ASSERT3U(*size, >=, from_delta);
   1070   4309     maybee 			atomic_add_64(size, -from_delta);
   1071   1544   eschrock 
   1072   1544   eschrock 			if (use_mutex)
   1073   3403        bmc 				mutex_exit(&old_state->arcs_mtx);
   1074    789     ahrens 		}
   1075   3403        bmc 		if (new_state != arc_anon) {
   1076   3403        bmc 			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
   1077   4309     maybee 			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
   1078    789     ahrens 
   1079   1544   eschrock 			if (use_mutex)
   1080   3403        bmc 				mutex_enter(&new_state->arcs_mtx);
   1081   1544   eschrock 
   1082   4309     maybee 			list_insert_head(&new_state->arcs_list[ab->b_type], ab);
   1083   1544   eschrock 
   1084   1544   eschrock 			/* ghost elements have a ghost size */
   1085   1544   eschrock 			if (GHOST_STATE(new_state)) {
   1086   1544   eschrock 				ASSERT(ab->b_datacnt == 0);
   1087   1544   eschrock 				ASSERT(ab->b_buf == NULL);
   1088   1544   eschrock 				to_delta = ab->b_size;
   1089    789     ahrens 			}
   1090   4309     maybee 			atomic_add_64(size, to_delta);
   1091   1544   eschrock 
   1092   1544   eschrock 			if (use_mutex)
   1093   3403        bmc 				mutex_exit(&new_state->arcs_mtx);
   1094    789     ahrens 		}
   1095    789     ahrens 	}
   1096    789     ahrens 
   1097    789     ahrens 	ASSERT(!BUF_EMPTY(ab));
   1098   5450    brendan 	if (new_state == arc_anon) {
   1099    789     ahrens 		buf_hash_remove(ab);
   1100    789     ahrens 	}
   1101    789     ahrens 
   1102   1544   eschrock 	/* adjust state sizes */
   1103   1544   eschrock 	if (to_delta)
   1104   3403        bmc 		atomic_add_64(&new_state->arcs_size, to_delta);
   1105   1544   eschrock 	if (from_delta) {
   1106   3403        bmc 		ASSERT3U(old_state->arcs_size, >=, from_delta);
   1107   3403        bmc 		atomic_add_64(&old_state->arcs_size, -from_delta);
   1108    789     ahrens 	}
   1109    789     ahrens 	ab->b_state = new_state;
   1110   5450    brendan 
   1111   5450    brendan 	/* adjust l2arc hdr stats */
   1112   5450    brendan 	if (new_state == arc_l2c_only)
   1113   5450    brendan 		l2arc_hdr_stat_add();
   1114   5450    brendan 	else if (old_state == arc_l2c_only)
   1115   5450    brendan 		l2arc_hdr_stat_remove();
   1116   4309     maybee }
   1117   4309     maybee 
   1118   4309     maybee void
   1119   8582    Brendan arc_space_consume(uint64_t space, arc_space_type_t type)
   1120   8582    Brendan {
   1121   8582    Brendan 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
   1122   8582    Brendan 
   1123   8582    Brendan 	switch (type) {
   1124   8582    Brendan 	case ARC_SPACE_DATA:
   1125   8582    Brendan 		ARCSTAT_INCR(arcstat_data_size, space);
   1126   8582    Brendan 		break;
   1127   8582    Brendan 	case ARC_SPACE_OTHER:
   1128   8582    Brendan 		ARCSTAT_INCR(arcstat_other_size, space);
   1129   8582    Brendan 		break;
   1130   8582    Brendan 	case ARC_SPACE_HDRS:
   1131   8582    Brendan 		ARCSTAT_INCR(arcstat_hdr_size, space);
   1132   8582    Brendan 		break;
   1133   8582    Brendan 	case ARC_SPACE_L2HDRS:
   1134   8582    Brendan 		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
   1135   8582    Brendan 		break;
   1136   8582    Brendan 	}
   1137   8582    Brendan 
   1138   4309     maybee 	atomic_add_64(&arc_meta_used, space);
   1139   4309     maybee 	atomic_add_64(&arc_size, space);
   1140   4309     maybee }
   1141   4309     maybee 
   1142   4309     maybee void
   1143   8582    Brendan arc_space_return(uint64_t space, arc_space_type_t type)
   1144   8582    Brendan {
   1145   8582    Brendan 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
   1146   8582    Brendan 
   1147   8582    Brendan 	switch (type) {
   1148   8582    Brendan 	case ARC_SPACE_DATA:
   1149   8582    Brendan 		ARCSTAT_INCR(arcstat_data_size, -space);
   1150   8582    Brendan 		break;
   1151   8582    Brendan 	case ARC_SPACE_OTHER:
   1152   8582    Brendan 		ARCSTAT_INCR(arcstat_other_size, -space);
   1153   8582    Brendan 		break;
   1154   8582    Brendan 	case ARC_SPACE_HDRS:
   1155   8582    Brendan 		ARCSTAT_INCR(arcstat_hdr_size, -space);
   1156   8582    Brendan 		break;
   1157   8582    Brendan 	case ARC_SPACE_L2HDRS:
   1158   8582    Brendan 		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
   1159   8582    Brendan 		break;
   1160   8582    Brendan 	}
   1161   8582    Brendan 
   1162   4309     maybee 	ASSERT(arc_meta_used >= space);
   1163   4309     maybee 	if (arc_meta_max < arc_meta_used)
   1164   4309     maybee 		arc_meta_max = arc_meta_used;
   1165   4309     maybee 	atomic_add_64(&arc_meta_used, -space);
   1166   4309     maybee 	ASSERT(arc_size >= space);
   1167   4309     maybee 	atomic_add_64(&arc_size, -space);
   1168   4309     maybee }
   1169   4309     maybee 
   1170   4309     maybee void *
   1171   4309     maybee arc_data_buf_alloc(uint64_t size)
   1172   4309     maybee {
   1173   4309     maybee 	if (arc_evict_needed(ARC_BUFC_DATA))
   1174   4309     maybee 		cv_signal(&arc_reclaim_thr_cv);
   1175   4309     maybee 	atomic_add_64(&arc_size, size);
   1176   4309     maybee 	return (zio_data_buf_alloc(size));
   1177   4309     maybee }
   1178   4309     maybee 
   1179   4309     maybee void
   1180   4309     maybee arc_data_buf_free(void *buf, uint64_t size)
   1181   4309     maybee {
   1182   4309     maybee 	zio_data_buf_free(buf, size);
   1183   4309     maybee 	ASSERT(arc_size >= size);
   1184   4309     maybee 	atomic_add_64(&arc_size, -size);
   1185    789     ahrens }
   1186    789     ahrens 
   1187    789     ahrens arc_buf_t *
   1188   3290   johansen arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
   1189    789     ahrens {
   1190    789     ahrens 	arc_buf_hdr_t *hdr;
   1191    789     ahrens 	arc_buf_t *buf;
   1192    789     ahrens 
   1193    789     ahrens 	ASSERT3U(size, >, 0);
   1194   6245     maybee 	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
   1195    789     ahrens 	ASSERT(BUF_EMPTY(hdr));
   1196    789     ahrens 	hdr->b_size = size;
   1197   3290   johansen 	hdr->b_type = type;
   1198   8636       Mark 	hdr->b_spa = spa_guid(spa);
   1199   3403        bmc 	hdr->b_state = arc_anon;
   1200    789     ahrens 	hdr->b_arc_access = 0;
   1201   6245     maybee 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
   1202    789     ahrens 	buf->b_hdr = hdr;
   1203   2688     maybee 	buf->b_data = NULL;
   1204   1544   eschrock 	buf->b_efunc = NULL;
   1205   1544   eschrock 	buf->b_private = NULL;
   1206    789     ahrens 	buf->b_next = NULL;
   1207    789     ahrens 	hdr->b_buf = buf;
   1208   2688     maybee 	arc_get_data_buf(buf);
   1209   1544   eschrock 	hdr->b_datacnt = 1;
   1210    789     ahrens 	hdr->b_flags = 0;
   1211    789     ahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
   1212    789     ahrens 	(void) refcount_add(&hdr->b_refcnt, tag);
   1213    789     ahrens 
   1214    789     ahrens 	return (buf);
   1215    789     ahrens }
   1216    789     ahrens 
   1217   9412  Aleksandr static char *arc_onloan_tag = "onloan";
   1218   9412  Aleksandr 
   1219   9412  Aleksandr /*
   1220   9412  Aleksandr  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
   1221   9412  Aleksandr  * flight data by arc_tempreserve_space() until they are "returned". Loaned
   1222   9412  Aleksandr  * buffers must be returned to the arc before they can be used by the DMU or
   1223   9412  Aleksandr  * freed.
   1224   9412  Aleksandr  */
   1225   9412  Aleksandr arc_buf_t *
   1226   9412  Aleksandr arc_loan_buf(spa_t *spa, int size)
   1227   9412  Aleksandr {
   1228   9412  Aleksandr 	arc_buf_t *buf;
   1229   9412  Aleksandr 
   1230   9412  Aleksandr 	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
   1231   9412  Aleksandr 
   1232   9412  Aleksandr 	atomic_add_64(&arc_loaned_bytes, size);
   1233   9412  Aleksandr 	return (buf);
   1234   9412  Aleksandr }
   1235   9412  Aleksandr 
   1236   9412  Aleksandr /*
   1237   9412  Aleksandr  * Return a loaned arc buffer to the arc.
   1238   9412  Aleksandr  */
   1239   9412  Aleksandr void
   1240   9412  Aleksandr arc_return_buf(arc_buf_t *buf, void *tag)
   1241   9412  Aleksandr {
   1242   9412  Aleksandr 	arc_buf_hdr_t *hdr = buf->b_hdr;
   1243   9412  Aleksandr 
   1244   9412  Aleksandr 	ASSERT(hdr->b_state == arc_anon);
   1245   9412  Aleksandr 	ASSERT(buf->b_data != NULL);
   1246   9412  Aleksandr 	VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
   1247   9412  Aleksandr 	VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
   1248   9412  Aleksandr 
   1249   9412  Aleksandr 	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
   1250   9412  Aleksandr }
   1251   9412  Aleksandr 
   1252   2688     maybee static arc_buf_t *
   1253   2688     maybee arc_buf_clone(arc_buf_t *from)
   1254   1544   eschrock {
   1255   2688     maybee 	arc_buf_t *buf;
   1256   2688     maybee 	arc_buf_hdr_t *hdr = from->b_hdr;
   1257   2688     maybee 	uint64_t size = hdr->b_size;
   1258   1544   eschrock 
   1259  10922       Jeff 	ASSERT(hdr->b_state != arc_anon);
   1260  10922       Jeff 
   1261   6245     maybee 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
   1262   2688     maybee 	buf->b_hdr = hdr;
   1263   2688     maybee 	buf->b_data = NULL;
   1264   2688     maybee 	buf->b_efunc = NULL;
   1265   2688     maybee 	buf->b_private = NULL;
   1266   2688     maybee 	buf->b_next = hdr->b_buf;
   1267   2688     maybee 	hdr->b_buf = buf;
   1268   2688     maybee 	arc_get_data_buf(buf);
   1269   2688     maybee 	bcopy(from->b_data, buf->b_data, size);
   1270   2688     maybee 	hdr->b_datacnt += 1;
   1271   2688     maybee 	return (buf);
   1272   1544   eschrock }
   1273   1544   eschrock 
   1274   1544   eschrock void
   1275   1544   eschrock arc_buf_add_ref(arc_buf_t *buf, void* tag)
   1276   1544   eschrock {
   1277   2887     maybee 	arc_buf_hdr_t *hdr;
   1278   1544   eschrock 	kmutex_t *hash_lock;
   1279   1544   eschrock 
   1280   2724     maybee 	/*
   1281   7545       Mark 	 * Check to see if this buffer is evicted.  Callers
   1282   7545       Mark 	 * must verify b_data != NULL to know if the add_ref
   1283   7545       Mark 	 * was successful.
   1284   7545       Mark 	 */
   1285   7545       Mark 	rw_enter(&buf->b_lock, RW_READER);
   1286   7545       Mark 	if (buf->b_data == NULL) {
   1287   7545       Mark 		rw_exit(&buf->b_lock);
   1288   7545       Mark 		return;
   1289   7545       Mark 	}
   1290   7545       Mark 	hdr = buf->b_hdr;
   1291   7545       Mark 	ASSERT(hdr != NULL);
   1292   2887     maybee 	hash_lock = HDR_LOCK(hdr);
   1293   7545       Mark 	mutex_enter(hash_lock);
   1294   7545       Mark 	rw_exit(&buf->b_lock);
   1295   7545       Mark 
   1296   3403        bmc 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
   1297   1544   eschrock 	add_reference(hdr, hash_lock, tag);
   1298   8582    Brendan 	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
   1299   2688     maybee 	arc_access(hdr, hash_lock);
   1300   2688     maybee 	mutex_exit(hash_lock);
   1301   3403        bmc 	ARCSTAT_BUMP(arcstat_hits);
   1302   3403        bmc 	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
   1303   3403        bmc 	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
   1304   3403        bmc 	    data, metadata, hits);
   1305   1544   eschrock }
   1306   1544   eschrock 
   1307   5450    brendan /*
   1308   5450    brendan  * Free the arc data buffer.  If it is an l2arc write in progress,
   1309   5450    brendan  * the buffer is placed on l2arc_free_on_write to be freed later.
   1310   5450    brendan  */
   1311   5450    brendan static void
   1312   5450    brendan arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
   1313   5450    brendan     void *data, size_t size)
   1314   5450    brendan {
   1315   5450    brendan 	if (HDR_L2_WRITING(hdr)) {
   1316   5450    brendan 		l2arc_data_free_t *df;
   1317   5450    brendan 		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
   1318   5450    brendan 		df->l2df_data = data;
   1319   5450    brendan 		df->l2df_size = size;
   1320   5450    brendan 		df->l2df_func = free_func;
   1321   5450    brendan 		mutex_enter(&l2arc_free_on_write_mtx);
   1322   5450    brendan 		list_insert_head(l2arc_free_on_write, df);
   1323   5450    brendan 		mutex_exit(&l2arc_free_on_write_mtx);
   1324   5450    brendan 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
   1325   5450    brendan 	} else {
   1326   5450    brendan 		free_func(data, size);
   1327   5450    brendan 	}
   1328   5450    brendan }
   1329   5450    brendan 
   1330    789     ahrens static void
   1331   2688     maybee arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
   1332   1544   eschrock {
   1333   1544   eschrock 	arc_buf_t **bufp;
   1334   1544   eschrock 
   1335   1544   eschrock 	/* free up data associated with the buf */
   1336   1544   eschrock 	if (buf->b_data) {
   1337   1544   eschrock 		arc_state_t *state = buf->b_hdr->b_state;
   1338   1544   eschrock 		uint64_t size = buf->b_hdr->b_size;
   1339   3290   johansen 		arc_buf_contents_t type = buf->b_hdr->b_type;
   1340   1544   eschrock 
   1341   3093     ahrens 		arc_cksum_verify(buf);
   1342  10922       Jeff 
   1343   2688     maybee 		if (!recycle) {
   1344   3290   johansen 			if (type == ARC_BUFC_METADATA) {
   1345   5450    brendan 				arc_buf_data_free(buf->b_hdr, zio_buf_free,
   1346   5450    brendan 				    buf->b_data, size);
   1347   8582    Brendan 				arc_space_return(size, ARC_SPACE_DATA);
   1348   3290   johansen 			} else {
   1349   3290   johansen 				ASSERT(type == ARC_BUFC_DATA);
   1350   5450    brendan 				arc_buf_data_free(buf->b_hdr,
   1351   5450    brendan 				    zio_data_buf_free, buf->b_data, size);
   1352   8582    Brendan 				ARCSTAT_INCR(arcstat_data_size, -size);
   1353   4309     maybee 				atomic_add_64(&arc_size, -size);
   1354   3290   johansen 			}
   1355   2688     maybee 		}
   1356   1544   eschrock 		if (list_link_active(&buf->b_hdr->b_arc_node)) {
   1357   4309     maybee 			uint64_t *cnt = &state->arcs_lsize[type];
   1358   4309     maybee 
   1359   1544   eschrock 			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
   1360   3403        bmc 			ASSERT(state != arc_anon);
   1361   4309     maybee 
   1362   4309     maybee 			ASSERT3U(*cnt, >=, size);
   1363   4309     maybee 			atomic_add_64(cnt, -size);
   1364   1544   eschrock 		}
   1365   3403        bmc 		ASSERT3U(state->arcs_size, >=, size);
   1366   3403        bmc 		atomic_add_64(&state->arcs_size, -size);
   1367   1544   eschrock 		buf->b_data = NULL;
   1368   1544   eschrock 		ASSERT(buf->b_hdr->b_datacnt > 0);
   1369   1544   eschrock 		buf->b_hdr->b_datacnt -= 1;
   1370   1544   eschrock 	}
   1371   1544   eschrock 
   1372   1544   eschrock 	/* only remove the buf if requested */
   1373   1544   eschrock 	if (!all)
   1374   1544   eschrock 		return;
   1375   1544   eschrock 
   1376   1544   eschrock 	/* remove the buf from the hdr list */
   1377   1544   eschrock 	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
   1378   1544   eschrock 		continue;
   1379   1544   eschrock 	*bufp = buf->b_next;
   1380   1544   eschrock 
   1381   1544   eschrock 	ASSERT(buf->b_efunc == NULL);
   1382   1544   eschrock 
   1383   1544   eschrock 	/* clean up the buf */
   1384   1544   eschrock 	buf->b_hdr = NULL;
   1385   1544   eschrock 	kmem_cache_free(buf_cache, buf);
   1386   1544   eschrock }
   1387   1544   eschrock 
   1388   1544   eschrock static void
   1389   1544   eschrock arc_hdr_destroy(arc_buf_hdr_t *hdr)
   1390    789     ahrens {
   1391    789     ahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
   1392   3403        bmc 	ASSERT3P(hdr->b_state, ==, arc_anon);
   1393   1544   eschrock 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
   1394  10922       Jeff 	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
   1395  10922       Jeff 
   1396  10922       Jeff 	if (l2hdr != NULL) {
   1397  10922       Jeff 		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
   1398  10922       Jeff 		/*
   1399  10922       Jeff 		 * To prevent arc_free() and l2arc_evict() from
   1400  10922       Jeff 		 * attempting to free the same buffer at the same time,
   1401  10922       Jeff 		 * a FREE_IN_PROGRESS flag is given to arc_free() to
   1402  10922       Jeff 		 * give it priority.  l2arc_evict() can't destroy this
   1403  10922       Jeff 		 * header while we are waiting on l2arc_buflist_mtx.
   1404  10922       Jeff 		 *
   1405  10922       Jeff 		 * The hdr may be removed from l2ad_buflist before we
   1406  10922       Jeff 		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
   1407  10922       Jeff 		 */
   1408  10922       Jeff 		if (!buflist_held) {
   1409   5450    brendan 			mutex_enter(&l2arc_buflist_mtx);
   1410  10922       Jeff 			l2hdr = hdr->b_l2hdr;
   1411  10922       Jeff 		}
   1412  10922       Jeff 
   1413  10922       Jeff 		if (l2hdr != NULL) {
   1414  10922       Jeff 			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
   1415  10922       Jeff 			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
   1416  10922       Jeff 			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
   1417  10922       Jeff 			if (hdr->b_state == arc_l2c_only)
   1418  10922       Jeff 				l2arc_hdr_stat_remove();
   1419  10922       Jeff 			hdr->b_l2hdr = NULL;
   1420  10922       Jeff 		}
   1421  10922       Jeff 
   1422  10922       Jeff 		if (!buflist_held)
   1423   5450    brendan 			mutex_exit(&l2arc_buflist_mtx);
   1424   5450    brendan 	}
   1425    789     ahrens 
   1426    789     ahrens 	if (!BUF_EMPTY(hdr)) {
   1427   1544   eschrock 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
   1428    789     ahrens 		bzero(&hdr->b_dva, sizeof (dva_t));
   1429    789     ahrens 		hdr->b_birth = 0;
   1430    789     ahrens 		hdr->b_cksum0 = 0;
   1431    789     ahrens 	}
   1432   1544   eschrock 	while (hdr->b_buf) {
   1433    789     ahrens 		arc_buf_t *buf = hdr->b_buf;
   1434    789     ahrens 
   1435   1544   eschrock 		if (buf->b_efunc) {
   1436   1544   eschrock 			mutex_enter(&arc_eviction_mtx);
   1437   7545       Mark 			rw_enter(&buf->b_lock, RW_WRITER);
   1438   1544   eschrock 			ASSERT(buf->b_hdr != NULL);
   1439   2688     maybee 			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
   1440   1544   eschrock 			hdr->b_buf = buf->b_next;
   1441   2887     maybee 			buf->b_hdr = &arc_eviction_hdr;
   1442   1544   eschrock 			buf->b_next = arc_eviction_list;
   1443   1544   eschrock 			arc_eviction_list = buf;
   1444   7545       Mark 			rw_exit(&buf->b_lock);
   1445   1544   eschrock 			mutex_exit(&arc_eviction_mtx);
   1446   1544   eschrock 		} else {
   1447   2688     maybee 			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
   1448   1544   eschrock 		}
   1449   3093     ahrens 	}
   1450   3093     ahrens 	if (hdr->b_freeze_cksum != NULL) {
   1451   3093     ahrens 		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
   1452   3093     ahrens 		hdr->b_freeze_cksum = NULL;
   1453    789     ahrens 	}
   1454   1544   eschrock 
   1455    789     ahrens 	ASSERT(!list_link_active(&hdr->b_arc_node));
   1456    789     ahrens 	ASSERT3P(hdr->b_hash_next, ==, NULL);
   1457    789     ahrens 	ASSERT3P(hdr->b_acb, ==, NULL);
   1458    789     ahrens 	kmem_cache_free(hdr_cache, hdr);
   1459    789     ahrens }
   1460    789     ahrens 
   1461    789     ahrens void
   1462    789     ahrens arc_buf_free(arc_buf_t *buf, void *tag)
   1463    789     ahrens {
   1464    789     ahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
   1465   3403        bmc 	int hashed = hdr->b_state != arc_anon;
   1466   1544   eschrock 
   1467   1544   eschrock 	ASSERT(buf->b_efunc == NULL);
   1468   1544   eschrock 	ASSERT(buf->b_data != NULL);
   1469   1544   eschrock 
   1470   1544   eschrock 	if (hashed) {
   1471   1544   eschrock 		kmutex_t *hash_lock = HDR_LOCK(hdr);
   1472   1544   eschrock 
   1473   1544   eschrock 		mutex_enter(hash_lock);
   1474   1544   eschrock 		(void) remove_reference(hdr, hash_lock, tag);
   1475  10922       Jeff 		if (hdr->b_datacnt > 1) {
   1476   2688     maybee 			arc_buf_destroy(buf, FALSE, TRUE);
   1477  10922       Jeff 		} else {
   1478  10922       Jeff 			ASSERT(buf == hdr->b_buf);
   1479  10922       Jeff 			ASSERT(buf->b_efunc == NULL);
   1480   1544   eschrock 			hdr->b_flags |= ARC_BUF_AVAILABLE;
   1481  10922       Jeff 		}
   1482   1544   eschrock 		mutex_exit(hash_lock);
   1483   1544   eschrock 	} else if (HDR_IO_IN_PROGRESS(hdr)) {
   1484   1544   eschrock 		int destroy_hdr;
   1485   1544   eschrock 		/*
   1486   1544   eschrock 		 * We are in the middle of an async write.  Don't destroy
   1487   1544   eschrock 		 * this buffer unless the write completes before we finish
   1488   1544   eschrock 		 * decrementing the reference count.
   1489   1544   eschrock 		 */
   1490   1544   eschrock 		mutex_enter(&arc_eviction_mtx);
   1491   1544   eschrock 		(void) remove_reference(hdr, NULL, tag);
   1492   1544   eschrock 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
   1493   1544   eschrock 		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
   1494   1544   eschrock 		mutex_exit(&arc_eviction_mtx);
   1495   1544   eschrock 		if (destroy_hdr)
   1496   1544   eschrock 			arc_hdr_destroy(hdr);
   1497   1544   eschrock 	} else {
   1498   1544   eschrock 		if (remove_reference(hdr, NULL, tag) > 0) {
   1499   1544   eschrock 			ASSERT(HDR_IO_ERROR(hdr));
   1500   2688     maybee 			arc_buf_destroy(buf, FALSE, TRUE);
   1501   1544   eschrock 		} else {
   1502   1544   eschrock 			arc_hdr_destroy(hdr);
   1503   1544   eschrock 		}
   1504   1544   eschrock 	}
   1505   1544   eschrock }
   1506   1544   eschrock 
   1507   1544   eschrock int
   1508   1544   eschrock arc_buf_remove_ref(arc_buf_t *buf, void* tag)
   1509   1544   eschrock {
   1510   1544   eschrock 	arc_buf_hdr_t *hdr = buf->b_hdr;
   1511    789     ahrens 	kmutex_t *hash_lock = HDR_LOCK(hdr);
   1512   1544   eschrock 	int no_callback = (buf->b_efunc == NULL);
   1513   1544   eschrock 
   1514   3403        bmc 	if (hdr->b_state == arc_anon) {
   1515  10922       Jeff 		ASSERT(hdr->b_datacnt == 1);
   1516   1544   eschrock 		arc_buf_free(buf, tag);
   1517   1544   eschrock 		return (no_callback);
   1518   1544   eschrock 	}
   1519    789     ahrens 
   1520    789     ahrens 	mutex_enter(hash_lock);
   1521   3403        bmc 	ASSERT(hdr->b_state != arc_anon);
   1522   1544   eschrock 	ASSERT(buf->b_data != NULL);
   1523    789     ahrens 
   1524   1544   eschrock 	(void) remove_reference(hdr, hash_lock, tag);
   1525   1544   eschrock 	if (hdr->b_datacnt > 1) {
   1526   1544   eschrock 		if (no_callback)
   1527   2688     maybee 			arc_buf_destroy(buf, FALSE, TRUE);
   1528   1544   eschrock 	} else if (no_callback) {
   1529   1544   eschrock 		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
   1530  10922       Jeff 		ASSERT(buf->b_efunc == NULL);
   1531   1544   eschrock 		hdr->b_flags |= ARC_BUF_AVAILABLE;
   1532    789     ahrens 	}
   1533   1544   eschrock 	ASSERT(no_callback || hdr->b_datacnt > 1 ||
   1534   1544   eschrock 	    refcount_is_zero(&hdr->b_refcnt));
   1535    789     ahrens 	mutex_exit(hash_lock);
   1536   1544   eschrock 	return (no_callback);
   1537    789     ahrens }
   1538    789     ahrens 
   1539    789     ahrens int
   1540    789     ahrens arc_buf_size(arc_buf_t *buf)
   1541    789     ahrens {
   1542    789     ahrens 	return (buf->b_hdr->b_size);
   1543    789     ahrens }
   1544    789     ahrens 
   1545    789     ahrens /*
   1546    789     ahrens  * Evict buffers from list until we've removed the specified number of
   1547    789     ahrens  * bytes.  Move the removed buffers to the appropriate evict state.
   1548   2688     maybee  * If the recycle flag is set, then attempt to "recycle" a buffer:
   1549   2688     maybee  * - look for a buffer to evict that is `bytes' long.
   1550   2688     maybee  * - return the data block from this buffer rather than freeing it.
   1551   2688     maybee  * This flag is used by callers that are trying to make space for a
   1552   2688     maybee  * new buffer in a full arc cache.
   1553   5642     maybee  *
   1554   5642     maybee  * This function makes a "best effort".  It skips over any buffers
   1555   5642     maybee  * it can't get a hash_lock on, and so may not catch all candidates.
   1556   5642     maybee  * It may also return without evicting as much space as requested.
   1557    789     ahrens  */
   1558   2688     maybee static void *
   1559   8636       Mark arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
   1560   3290   johansen     arc_buf_contents_t type)
   1561    789     ahrens {
   1562    789     ahrens 	arc_state_t *evicted_state;
   1563   2688     maybee 	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
   1564   2918     maybee 	arc_buf_hdr_t *ab, *ab_prev = NULL;
   1565   4309     maybee 	list_t *list = &state->arcs_list[type];
   1566    789     ahrens 	kmutex_t *hash_lock;
   1567   2688     maybee 	boolean_t have_lock;
   1568   2918     maybee 	void *stolen = NULL;
   1569    789     ahrens 
   1570   3403        bmc 	ASSERT(state == arc_mru || state == arc_mfu);
   1571    789     ahrens 
   1572   3403        bmc 	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
   1573    789     ahrens 
   1574   3403        bmc 	mutex_enter(&state->arcs_mtx);
   1575   3403        bmc 	mutex_enter(&evicted_state->arcs_mtx);
   1576    789     ahrens 
   1577   4309     maybee 	for (ab = list_tail(list); ab; ab = ab_prev) {
   1578   4309     maybee 		ab_prev = list_prev(list, ab);
   1579   2391     maybee 		/* prefetch buffers have a minimum lifespan */
   1580   2688     maybee 		if (HDR_IO_IN_PROGRESS(ab) ||
   1581   5642     maybee 		    (spa && ab->b_spa != spa) ||
   1582   2688     maybee 		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
   1583  11066     rafael 		    ddi_get_lbolt() - ab->b_arc_access <
   1584  11066     rafael 		    arc_min_prefetch_lifespan)) {
   1585   2391     maybee 			skipped++;
   1586   2391     maybee 			continue;
   1587   2391     maybee 		}
   1588   2918     maybee 		/* "lookahead" for better eviction candidate */
   1589   2918     maybee 		if (recycle && ab->b_size != bytes &&
   1590   2918     maybee 		    ab_prev && ab_prev->b_size == bytes)
   1591   2688     maybee 			continue;
   1592    789     ahrens 		hash_lock = HDR_LOCK(ab);
   1593   2688     maybee 		have_lock = MUTEX_HELD(hash_lock);
   1594   2688     maybee 		if (have_lock || mutex_tryenter(hash_lock)) {
   1595    789     ahrens 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
   1596   1544   eschrock 			ASSERT(ab->b_datacnt > 0);
   1597   1544   eschrock 			while (ab->b_buf) {
   1598   1544   eschrock 				arc_buf_t *buf = ab->b_buf;
   1599   7545       Mark 				if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
   1600   7545       Mark 					missed += 1;
   1601   7545       Mark 					break;
   1602   7545       Mark 				}
   1603   2688     maybee 				if (buf->b_data) {
   1604   1544   eschrock 					bytes_evicted += ab->b_size;
   1605   3290   johansen 					if (recycle && ab->b_type == type &&
   1606   5450    brendan 					    ab->b_size == bytes &&
   1607   5450    brendan 					    !HDR_L2_WRITING(ab)) {
   1608   2918     maybee 						stolen = buf->b_data;
   1609   2918     maybee 						recycle = FALSE;
   1610   2918     maybee 					}
   1611   2688     maybee 				}
   1612   1544   eschrock 				if (buf->b_efunc) {
   1613   1544   eschrock 					mutex_enter(&arc_eviction_mtx);
   1614   2918     maybee 					arc_buf_destroy(buf,
   1615   2918     maybee 					    buf->b_data == stolen, FALSE);
   1616   1544   eschrock 					ab->b_buf = buf->b_next;
   1617   2887     maybee 					buf->b_hdr = &arc_eviction_hdr;
   1618   1544   eschrock 					buf->b_next = arc_eviction_list;
   1619   1544   eschrock 					arc_eviction_list = buf;
   1620   1544   eschrock 					mutex_exit(&arc_eviction_mtx);
   1621   7545       Mark 					rw_exit(&buf->b_lock);
   1622   1544   eschrock 				} else {
   1623   7545       Mark 					rw_exit(&buf->b_lock);
   1624   2918     maybee 					arc_buf_destroy(buf,
   1625   2918     maybee 					    buf->b_data == stolen, TRUE);
   1626   1544   eschrock 				}
   1627   1544   eschrock 			}
   1628  10357    Brendan 
   1629  10357    Brendan 			if (ab->b_l2hdr) {
   1630  10357    Brendan 				ARCSTAT_INCR(arcstat_evict_l2_cached,
   1631  10357    Brendan 				    ab->b_size);
   1632  10357    Brendan 			} else {
   1633  10357    Brendan 				if (l2arc_write_eligible(ab->b_spa, ab)) {
   1634  10357    Brendan 					ARCSTAT_INCR(arcstat_evict_l2_eligible,
   1635  10357    Brendan 					    ab->b_size);
   1636  10357    Brendan 				} else {
   1637  10357    Brendan 					ARCSTAT_INCR(
   1638  10357    Brendan 					    arcstat_evict_l2_ineligible,
   1639  10357    Brendan 					    ab->b_size);
   1640  10357    Brendan 				}
   1641  10357    Brendan 			}
   1642  10357    Brendan 
   1643   7545       Mark 			if (ab->b_datacnt == 0) {
   1644   7545       Mark 				arc_change_state(evicted_state, ab, hash_lock);
   1645   7545       Mark 				ASSERT(HDR_IN_HASH_TABLE(ab));
   1646   7545       Mark 				ab->b_flags |= ARC_IN_HASH_TABLE;
   1647   7545       Mark 				ab->b_flags &= ~ARC_BUF_AVAILABLE;
   1648   7545       Mark 				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
   1649   7545       Mark 			}
   1650   2688     maybee 			if (!have_lock)
   1651   2688     maybee 				mutex_exit(hash_lock);
   1652   1544   eschrock 			if (bytes >= 0 && bytes_evicted >= bytes)
   1653    789     ahrens 				break;
   1654    789     ahrens 		} else {
   1655   2688     maybee 			missed += 1;
   1656    789     ahrens 		}
   1657    789     ahrens 	}
   1658   3403        bmc 
   1659   3403        bmc 	mutex_exit(&evicted_state->arcs_mtx);
   1660   3403        bmc 	mutex_exit(&state->arcs_mtx);
   1661    789     ahrens 
   1662    789     ahrens 	if (bytes_evicted < bytes)
   1663    789     ahrens 		dprintf("only evicted %lld bytes from %x",
   1664    789     ahrens 		    (longlong_t)bytes_evicted, state);
   1665    789     ahrens 
   1666   2688     maybee 	if (skipped)
   1667   3403        bmc 		ARCSTAT_INCR(arcstat_evict_skip, skipped);
   1668   3403        bmc 
   1669   2688     maybee 	if (missed)
   1670   3403        bmc 		ARCSTAT_INCR(arcstat_mutex_miss, missed);
   1671   4709     maybee 
   1672   4709     maybee 	/*
   1673   4709     maybee 	 * We have just evicted some date into the ghost state, make
   1674   4709     maybee 	 * sure we also adjust the ghost state size if necessary.
   1675   4709     maybee 	 */
   1676   4709     maybee 	if (arc_no_grow &&
   1677   4709     maybee 	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
   1678   4709     maybee 		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
   1679   4709     maybee 		    arc_mru_ghost->arcs_size - arc_c;
   1680   4709     maybee 
   1681   4709     maybee 		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
   1682   4709     maybee 			int64_t todelete =
   1683   4709     maybee 			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
   1684   5642     maybee 			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
   1685   4709     maybee 		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
   1686   4709     maybee 			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
   1687   4709     maybee 			    arc_mru_ghost->arcs_size +
   1688   4709     maybee 			    arc_mfu_ghost->arcs_size - arc_c);
   1689   5642     maybee 			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
   1690   4709     maybee 		}
   1691   4709     maybee 	}
   1692   3403        bmc 
   1693   2918     maybee 	return (stolen);
   1694    789     ahrens }
   1695    789     ahrens 
   1696    789     ahrens /*
   1697    789     ahrens  * Remove buffers from list until we've removed the specified number of
   1698    789     ahrens  * bytes.  Destroy the buffers that are removed.
   1699    789     ahrens  */
   1700    789     ahrens static void
   1701   8636       Mark arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
   1702    789     ahrens {
   1703    789     ahrens 	arc_buf_hdr_t *ab, *ab_prev;
   1704   4309     maybee 	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
   1705    789     ahrens 	kmutex_t *hash_lock;
   1706   1544   eschrock 	uint64_t bytes_deleted = 0;
   1707   3700   ek110237 	uint64_t bufs_skipped = 0;
   1708    789     ahrens 
   1709   1544   eschrock 	ASSERT(GHOST_STATE(state));
   1710    789     ahrens top:
   1711   3403        bmc 	mutex_enter(&state->arcs_mtx);
   1712   4309     maybee 	for (ab = list_tail(list); ab; ab = ab_prev) {
   1713   4309     maybee 		ab_prev = list_prev(list, ab);
   1714   5642     maybee 		if (spa && ab->b_spa != spa)
   1715   5642     maybee 			continue;
   1716    789     ahrens 		hash_lock = HDR_LOCK(ab);
   1717    789     ahrens 		if (mutex_tryenter(hash_lock)) {
   1718   2391     maybee 			ASSERT(!HDR_IO_IN_PROGRESS(ab));
   1719   1544   eschrock 			ASSERT(ab->b_buf == NULL);
   1720   3403        bmc 			ARCSTAT_BUMP(arcstat_deleted);
   1721   1544   eschrock 			bytes_deleted += ab->b_size;
   1722   5450    brendan 
   1723   5450    brendan 			if (ab->b_l2hdr != NULL) {
   1724   5450    brendan 				/*
   1725   5450    brendan 				 * This buffer is cached on the 2nd Level ARC;
   1726   5450    brendan 				 * don't destroy the header.
   1727   5450    brendan 				 */
   1728   5450    brendan 				arc_change_state(arc_l2c_only, ab, hash_lock);
   1729   5450    brendan 				mutex_exit(hash_lock);
   1730   5450    brendan 			} else {
   1731   5450    brendan 				arc_change_state(arc_anon, ab, hash_lock);
   1732   5450    brendan 				mutex_exit(hash_lock);
   1733   5450    brendan 				arc_hdr_destroy(ab);
   1734   5450    brendan 			}
   1735   5450    brendan 
   1736    789     ahrens 			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
   1737    789     ahrens 			if (bytes >= 0 && bytes_deleted >= bytes)
   1738    789     ahrens 				break;
   1739    789     ahrens 		} else {
   1740    789     ahrens 			if (bytes < 0) {
   1741   3403        bmc 				mutex_exit(&state->arcs_mtx);
   1742    789     ahrens 				mutex_enter(hash_lock);
   1743    789     ahrens 				mutex_exit(hash_lock);
   1744    789     ahrens 				goto top;
   1745    789     ahrens 			}
   1746    789     ahrens 			bufs_skipped += 1;
   1747    789     ahrens 		}
   1748    789     ahrens 	}
   1749   3403        bmc 	mutex_exit(&state->arcs_mtx);
   1750    789     ahrens 
   1751   4309     maybee 	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
   1752   4309     maybee 	    (bytes < 0 || bytes_deleted < bytes)) {
   1753   4309     maybee 		list = &state->arcs_list[ARC_BUFC_METADATA];
   1754   4309     maybee 		goto top;
   1755   4309     maybee 	}
   1756   4309     maybee 
   1757    789     ahrens 	if (bufs_skipped) {
   1758   3403        bmc 		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
   1759    789     ahrens 		ASSERT(bytes >= 0);
   1760    789     ahrens 	}
   1761    789     ahrens 
   1762    789     ahrens 	if (bytes_deleted < bytes)
   1763    789     ahrens 		dprintf("only deleted %lld bytes from %p",
   1764    789     ahrens 		    (longlong_t)bytes_deleted, state);
   1765    789     ahrens }
   1766    789     ahrens 
   1767    789     ahrens static void
   1768    789     ahrens arc_adjust(void)
   1769    789     ahrens {
   1770   8582    Brendan 	int64_t adjustment, delta;
   1771   8582    Brendan 
   1772   8582    Brendan 	/*
   1773   8582    Brendan 	 * Adjust MRU size
   1774   8582    Brendan 	 */
   1775   8582    Brendan 
   1776   8582    Brendan 	adjustment = MIN(arc_size - arc_c,
   1777   8582    Brendan 	    arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
   1778   8582    Brendan 
   1779   8582    Brendan 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
   1780   8582    Brendan 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
   1781   8582    Brendan 		(void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
   1782   8582    Brendan 		adjustment -= delta;
   1783   8582    Brendan 	}
   1784   8582    Brendan 
   1785   8582    Brendan 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
   1786   8582    Brendan 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
   1787   8582    Brendan 		(void) arc_evict(arc_mru, NULL, delta, FALSE,
   1788   5642     maybee 		    ARC_BUFC_METADATA);
   1789   8582    Brendan 	}
   1790   8582    Brendan 
   1791   8582    Brendan 	/*
   1792   8582    Brendan 	 * Adjust MFU size
   1793   8582    Brendan 	 */
   1794   8582    Brendan 
   1795   8582    Brendan 	adjustment = arc_size - arc_c;
   1796   8582    Brendan 
   1797   8582    Brendan 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
   1798   8582    Brendan 		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
   1799   8582    Brendan 		(void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
   1800   8582    Brendan 		adjustment -= delta;
   1801   8582    Brendan 	}
   1802   8582    Brendan 
   1803   8582    Brendan 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
   1804   8582    Brendan 		int64_t delta = MIN(adjustment,
   1805   8582    Brendan 		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
   1806   8582    Brendan 		(void) arc_evict(arc_mfu, NULL, delta, FALSE,
   1807   8582    Brendan 		    ARC_BUFC_METADATA);
   1808   8582    Brendan 	}
   1809   8582    Brendan 
   1810   8582    Brendan 	/*
   1811   8582    Brendan 	 * Adjust ghost lists
   1812   8582    Brendan 	 */
   1813   8582    Brendan 
   1814   8582    Brendan 	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
   1815   8582    Brendan 
   1816   8582    Brendan 	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
   1817   8582    Brendan 		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
   1818   8582    Brendan 		arc_evict_ghost(arc_mru_ghost, NULL, delta);
   1819   8582    Brendan 	}
   1820   8582    Brendan 
   1821   8582    Brendan 	adjustment =
   1822   8582    Brendan 	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
   1823   8582    Brendan 
   1824   8582    Brendan 	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
   1825   8582    Brendan 		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
   1826   8582    Brendan 		arc_evict_ghost(arc_mfu_ghost, NULL, delta);
   1827    789     ahrens 	}
   1828   1544   eschrock }
   1829   1544   eschrock 
   1830   1544   eschrock static void
   1831   1544   eschrock arc_do_user_evicts(void)
   1832   1544   eschrock {
   1833   1544   eschrock 	mutex_enter(&arc_eviction_mtx);
   1834   1544   eschrock 	while (arc_eviction_list != NULL) {
   1835   1544   eschrock 		arc_buf_t *buf = arc_eviction_list;
   1836   1544   eschrock 		arc_eviction_list = buf->b_next;
   1837   7545       Mark 		rw_enter(&buf->b_lock, RW_WRITER);
   1838   1544   eschrock 		buf->b_hdr = NULL;
   1839   7545       Mark 		rw_exit(&buf->b_lock);
   1840   1544   eschrock 		mutex_exit(&arc_eviction_mtx);
   1841   1544   eschrock 
   1842   1819     maybee 		if (buf->b_efunc != NULL)
   1843   1819     maybee 			VERIFY(buf->b_efunc(buf) == 0);
   1844   1544   eschrock 
   1845   1544   eschrock 		buf->b_efunc = NULL;
   1846   1544   eschrock 		buf->b_private = NULL;
   1847   1544   eschrock 		kmem_cache_free(buf_cache, buf);
   1848   1544   eschrock 		mutex_enter(&arc_eviction_mtx);
   1849   1544   eschrock 	}
   1850   1544   eschrock 	mutex_exit(&arc_eviction_mtx);
   1851    789     ahrens }
   1852    789     ahrens 
   1853    789     ahrens /*
   1854   5642     maybee  * Flush all *evictable* data from the cache for the given spa.
   1855    789     ahrens  * NOTE: this will not touch "active" (i.e. referenced) data.
   1856    789     ahrens  */
   1857    789     ahrens void
   1858   5642     maybee arc_flush(spa_t *spa)
   1859   5642     maybee {
   1860   8636       Mark 	uint64_t guid = 0;
   1861   8636       Mark 
   1862   8636       Mark 	if (spa)
   1863   8636       Mark 		guid = spa_guid(spa);
   1864   8636       Mark 
   1865   5642     maybee 	while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
   1866   8636       Mark 		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
   1867   5642     maybee 		if (spa)
   1868   5642     maybee 			break;
   1869   5642     maybee 	}
   1870   5642     maybee 	while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
   1871   8636       Mark 		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
   1872   5642     maybee 		if (spa)
   1873   5642     maybee 			break;
   1874   5642     maybee 	}
   1875   5642     maybee 	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
   1876   8636       Mark 		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
   1877   5642     maybee 		if (spa)
   1878   5642     maybee 			break;
   1879   5642     maybee 	}
   1880   5642     maybee 	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
   1881   8636       Mark 		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
   1882   5642     maybee 		if (spa)
   1883   5642     maybee 			break;
   1884   5642     maybee 	}
   1885   5642     maybee 
   1886   8636       Mark 	arc_evict_ghost(arc_mru_ghost, guid, -1);
   1887   8636       Mark 	arc_evict_ghost(arc_mfu_ghost, guid, -1);
   1888   1544   eschrock 
   1889   1544   eschrock 	mutex_enter(&arc_reclaim_thr_lock);
   1890   1544   eschrock 	arc_do_user_evicts();
   1891   1544   eschrock 	mutex_exit(&arc_reclaim_thr_lock);
   1892   5642     maybee 	ASSERT(spa || arc_eviction_list == NULL);
   1893    789     ahrens }
   1894   2391     maybee 
   1895    789     ahrens void
   1896   3158     maybee arc_shrink(void)
   1897    789     ahrens {
   1898   3403        bmc 	if (arc_c > arc_c_min) {
   1899   3158     maybee 		uint64_t to_free;
   1900    789     ahrens 
   1901   2048      stans #ifdef _KERNEL
   1902   3403        bmc 		to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
   1903   2048      stans #else
   1904   3403        bmc 		to_free = arc_c >> arc_shrink_shift;
   1905   2048      stans #endif
   1906   3403        bmc 		if (arc_c > arc_c_min + to_free)
   1907   3403        bmc 			atomic_add_64(&arc_c, -to_free);
   1908   3158     maybee 		else
   1909   3403        bmc 			arc_c = arc_c_min;
   1910   2048      stans 
   1911   3403        bmc 		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
   1912   3403        bmc 		if (arc_c > arc_size)
   1913   3403        bmc 			arc_c = MAX(arc_size, arc_c_min);
   1914   3403        bmc 		if (arc_p > arc_c)
   1915   3403        bmc 			arc_p = (arc_c >> 1);
   1916   3403        bmc 		ASSERT(arc_c >= arc_c_min);
   1917   3403        bmc 		ASSERT((int64_t)arc_p >= 0);
   1918   3158     maybee 	}
   1919    789     ahrens 
   1920   3403        bmc 	if (arc_size > arc_c)
   1921   3158     maybee 		arc_adjust();
   1922    789     ahrens }
   1923    789     ahrens 
   1924    789     ahrens static int
   1925    789     ahrens arc_reclaim_needed(void)
   1926    789     ahrens {
   1927    789     ahrens 	uint64_t extra;
   1928    789     ahrens 
   1929    789     ahrens #ifdef _KERNEL
   1930   2048      stans 
   1931   2048      stans 	if (needfree)
   1932   2048      stans 		return (1);
   1933   2048      stans 
   1934    789     ahrens 	/*
   1935    789     ahrens 	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
   1936    789     ahrens 	 */
   1937    789     ahrens 	extra = desfree;
   1938    789     ahrens 
   1939    789     ahrens 	/*
   1940    789     ahrens 	 * check that we're out of range of the pageout scanner.  It starts to
   1941    789     ahrens 	 * schedule paging if freemem is less than lotsfree and needfree.
   1942    789     ahrens 	 * lotsfree is the high-water mark for pageout, and needfree is the
   1943    789     ahrens 	 * number of needed free pages.  We add extra pages here to make sure
   1944    789     ahrens 	 * the scanner doesn't start up while we're freeing memory.
   1945    789     ahrens 	 */
   1946    789     ahrens 	if (freemem < lotsfree + needfree + extra)
   1947    789     ahrens 		return (1);
   1948    789     ahrens 
   1949    789     ahrens 	/*
   1950    789     ahrens 	 * check to make sure that swapfs has enough space so that anon
   1951   5450    brendan 	 * reservations can still succeed. anon_resvmem() checks that the
   1952    789     ahrens 	 * availrmem is greater than swapfs_minfree, and the number of reserved
   1953    789     ahrens 	 * swap pages.  We also add a bit of extra here just to prevent
   1954    789     ahrens 	 * circumstances from getting really dire.
   1955    789     ahrens 	 */
   1956    789     ahrens 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
   1957    789     ahrens 		return (1);
   1958    789     ahrens 
   1959   1936     maybee #if defined(__i386)
   1960    789     ahrens 	/*
   1961    789     ahrens 	 * If we're on an i386 platform, it's possible that we'll exhaust the
   1962    789     ahrens 	 * kernel heap space before we ever run out of available physical
   1963    789     ahrens 	 * memory.  Most checks of the size of the heap_area compare against
   1964    789     ahrens 	 * tune.t_minarmem, which is the minimum available real memory that we
   1965    789     ahrens 	 * can have in the system.  However, this is generally fixed at 25 pages
   1966    789     ahrens 	 * which is so low that it's useless.  In this comparison, we seek to
   1967    789     ahrens 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
   1968   5450    brendan 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
   1969    789     ahrens 	 * free)
   1970    789     ahrens 	 */
   1971    789     ahrens 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
   1972    789     ahrens 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
   1973    789     ahrens 		return (1);
   1974    789     ahrens #endif
   1975    789     ahrens 
   1976    789     ahrens #else
   1977    789     ahrens 	if (spa_get_random(100) == 0)
   1978    789     ahrens 		return (1);
   1979    789     ahrens #endif
   1980    789     ahrens 	return (0);
   1981    789     ahrens }
   1982    789     ahrens 
   1983    789     ahrens static void
   1984    789     ahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat)
   1985    789     ahrens {
   1986    789     ahrens 	size_t			i;
   1987    789     ahrens 	kmem_cache_t		*prev_cache = NULL;
   1988   3290   johansen 	kmem_cache_t		*prev_data_cache = NULL;
   1989    789     ahrens 	extern kmem_cache_t	*zio_buf_cache[];
   1990   3290   johansen 	extern kmem_cache_t	*zio_data_buf_cache[];
   1991   1484   ek110237 
   1992   1484   ek110237 #ifdef _KERNEL
   1993   4309     maybee 	if (arc_meta_used >= arc_meta_limit) {
   1994   4309     maybee 		/*
   1995   4309     maybee 		 * We are exceeding our meta-data cache limit.
   1996   4309     maybee 		 * Purge some DNLC entries to release holds on meta-data.
   1997   4309     maybee 		 */
   1998   4309     maybee 		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
   1999   4309     maybee 	}
   2000   1936     maybee #if defined(__i386)
   2001   1936     maybee 	/*
   2002   1936     maybee 	 * Reclaim unused memory from all kmem caches.
   2003   1936     maybee 	 */
   2004   1936     maybee 	kmem_reap();
   2005   1936     maybee #endif
   2006   1484   ek110237 #endif
   2007    789     ahrens 
   2008    789     ahrens 	/*
   2009   5450    brendan 	 * An aggressive reclamation will shrink the cache size as well as
   2010   1544   eschrock 	 * reap free buffers from the arc kmem caches.
   2011    789     ahrens 	 */
   2012    789     ahrens 	if (strat == ARC_RECLAIM_AGGR)
   2013   3158     maybee 		arc_shrink();
   2014    789     ahrens 
   2015    789     ahrens 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
   2016    789     ahrens 		if (zio_buf_cache[i] != prev_cache) {
   2017    789     ahrens 			prev_cache = zio_buf_cache[i];
   2018    789     ahrens 			kmem_cache_reap_now(zio_buf_cache[i]);
   2019   3290   johansen 		}
   2020   3290   johansen 		if (zio_data_buf_cache[i] != prev_data_cache) {
   2021   3290   johansen 			prev_data_cache = zio_data_buf_cache[i];
   2022   3290   johansen 			kmem_cache_reap_now(zio_data_buf_cache[i]);
   2023    789     ahrens 		}
   2024    789     ahrens 	}
   2025   1544   eschrock 	kmem_cache_reap_now(buf_cache);
   2026   1544   eschrock 	kmem_cache_reap_now(hdr_cache);
   2027    789     ahrens }
   2028    789     ahrens 
   2029    789     ahrens static void
   2030    789     ahrens arc_reclaim_thread(void)
   2031    789     ahrens {
   2032    789     ahrens 	clock_t			growtime = 0;
   2033    789     ahrens 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
   2034    789     ahrens 	callb_cpr_t		cpr;
   2035    789     ahrens 
   2036    789     ahrens 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
   2037    789     ahrens 
   2038    789     ahrens 	mutex_enter(&arc_reclaim_thr_lock);
   2039    789     ahrens 	while (arc_thread_exit == 0) {
   2040    789     ahrens 		if (arc_reclaim_needed()) {
   2041    789     ahrens 
   2042   3403        bmc 			if (arc_no_grow) {
   2043    789     ahrens 				if (last_reclaim == ARC_RECLAIM_CONS) {
   2044    789     ahrens 					last_reclaim = ARC_RECLAIM_AGGR;
   2045    789     ahrens 				} else {
   2046    789     ahrens 					last_reclaim = ARC_RECLAIM_CONS;
   2047    789     ahrens 				}
   2048    789     ahrens 			} else {
   2049   3403        bmc 				arc_no_grow = TRUE;
   2050    789     ahrens 				last_reclaim = ARC_RECLAIM_AGGR;
   2051    789     ahrens 				membar_producer();
   2052    789     ahrens 			}
   2053    789     ahrens 
   2054    789     ahrens 			/* reset the growth delay for every reclaim */
   2055  11066     rafael 			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
   2056    789     ahrens 
   2057    789     ahrens 			arc_kmem_reap_now(last_reclaim);
   2058   6987    brendan 			arc_warm = B_TRUE;
   2059    789     ahrens 
   2060  11066     rafael 		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
   2061   3403        bmc 			arc_no_grow = FALSE;
   2062    789     ahrens 		}
   2063    789     ahrens 
   2064   3403        bmc 		if (2 * arc_c < arc_size +
   2065   3403        bmc 		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
   2066   3298     maybee 			arc_adjust();
   2067   3298     maybee 
   2068   1544   eschrock 		if (arc_eviction_list != NULL)
   2069   1544   eschrock 			arc_do_user_evicts();
   2070   1544   eschrock 
   2071    789     ahrens 		/* block until needed, or one second, whichever is shorter */
   2072    789     ahrens 		CALLB_CPR_SAFE_BEGIN(&cpr);
   2073    789     ahrens 		(void) cv_timedwait(&arc_reclaim_thr_cv,
   2074  11066     rafael 		    &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
   2075    789     ahrens 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
   2076    789     ahrens 	}
   2077    789     ahrens 
   2078    789     ahrens 	arc_thread_exit = 0;
   2079    789     ahrens 	cv_broadcast(&arc_reclaim_thr_cv);
   2080    789     ahrens 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
   2081    789     ahrens 	thread_exit();
   2082    789     ahrens }
   2083    789     ahrens 
   2084   1544   eschrock /*
   2085   1544   eschrock  * Adapt arc info given the number of bytes we are trying to add and
   2086   1544   eschrock  * the state that we are comming from.  This function is only called
   2087   1544   eschrock  * when we are adding new content to the cache.
   2088   1544   eschrock  */
   2089    789     ahrens static void
   2090   1544   eschrock arc_adapt(int bytes, arc_state_t *state)
   2091    789     ahrens {
   2092   1544   eschrock 	int mult;
   2093   8582    Brendan 	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
   2094   5450    brendan 
   2095   5450    brendan 	if (state == arc_l2c_only)
   2096   5450    brendan 		return;
   2097   1544   eschrock 
   2098   1544   eschrock 	ASSERT(bytes > 0);
   2099    789     ahrens 	/*
   2100   1544   eschrock 	 * Adapt the target size of the MRU list:
   2101   1544   eschrock 	 *	- if we just hit in the MRU ghost list, then increase
   2102   1544   eschrock 	 *	  the target size of the MRU list.
   2103   1544   eschrock 	 *	- if we just hit in the MFU ghost list, then increase
   2104   1544   eschrock 	 *	  the target size of the MFU list by decreasing the
   2105   1544   eschrock 	 *	  target size of the MRU list.
   2106    789     ahrens 	 */
   2107   3403        bmc 	if (state == arc_mru_ghost) {
   2108   3403        bmc 		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
   2109   3403        bmc 		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
   2110   1544   eschrock 
   2111   8582    Brendan 		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
   2112   3403        bmc 	} else if (state == arc_mfu_ghost) {
   2113   8582    Brendan 		uint64_t delta;
   2114   8582    Brendan 
   2115   3403        bmc 		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
   2116   3403        bmc 		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
   2117   1544   eschrock 
   2118   8582    Brendan 		delta = MIN(bytes * mult, arc_p);
   2119   8582    Brendan 		arc_p = MAX(arc_p_min, arc_p - delta);
   2120   1544   eschrock 	}
   2121   3403        bmc 	ASSERT((int64_t)arc_p >= 0);
   2122    789     ahrens 
   2123    789     ahrens 	if (arc_reclaim_needed()) {
   2124    789     ahrens 		cv_signal(&arc_reclaim_thr_cv);
   2125    789     ahrens 		return;
   2126    789     ahrens 	}
   2127    789     ahrens 
   2128   3403        bmc 	if (arc_no_grow)
   2129    789     ahrens 		return;
   2130    789     ahrens 
   2131   3403        bmc 	if (arc_c >= arc_c_max)
   2132   1544   eschrock 		return;
   2133   1544   eschrock 
   2134    789     ahrens 	/*
   2135   1544   eschrock 	 * If we're within (2 * maxblocksize) bytes of the target
   2136   1544   eschrock 	 * cache size, increment the target cache size
   2137    789     ahrens 	 */
   2138   3403        bmc 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
   2139   3403        bmc 		atomic_add_64(&arc_c, (int64_t)bytes);
   2140   3403        bmc 		if (arc_c > arc_c_max)
   2141   3403        bmc 			arc_c = arc_c_max;
   2142   3403        bmc 		else if (state == arc_anon)
   2143   3403        bmc 			atomic_add_64(&arc_p, (int64_t)bytes);
   2144   3403        bmc 		if (arc_p > arc_c)
   2145   3403        bmc 			arc_p = arc_c;
   2146    789     ahrens 	}
   2147   3403        bmc 	ASSERT((int64_t)arc_p >= 0);
   2148    789     ahrens }
   2149    789     ahrens 
   2150    789     ahrens /*
   2151   1544   eschrock  * Check if the cache has reached its limits and eviction is required
   2152   1544   eschrock  * prior to insert.
   2153    789     ahrens  */
   2154    789     ahrens static int
   2155   4309     maybee arc_evict_needed(arc_buf_contents_t type)
   2156    789     ahrens {
   2157   4309     maybee 	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
   2158   4309     maybee 		return (1);
   2159   4309     maybee 
   2160   4309     maybee #ifdef _KERNEL
   2161   4309     maybee 	/*
   2162   4309     maybee 	 * If zio data pages are being allocated out of a separate heap segment,
   2163   4309     maybee 	 * then enforce that the size of available vmem for this area remains
   2164   4309     maybee 	 * above about 1/32nd free.
   2165   4309     maybee 	 */
   2166   4309     maybee 	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
   2167   4309     maybee 	    vmem_size(zio_arena, VMEM_FREE) <
   2168   4309     maybee 	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
   2169   4309     maybee 		return (1);
   2170   4309     maybee #endif
   2171   4309     maybee 
   2172    789     ahrens 	if (arc_reclaim_needed())
   2173    789     ahrens 		return (1);
   2174    789     ahrens 
   2175   3403        bmc 	return (arc_size > arc_c);
   2176    789     ahrens }
   2177    789     ahrens 
   2178    789     ahrens /*
   2179   2688     maybee  * The buffer, supplied as the first argument, needs a data block.
   2180   2688     maybee  * So, if we are at cache max, determine which cache should be victimized.
   2181   2688     maybee  * We have the following cases:
   2182    789     ahrens  *
   2183   3403        bmc  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
   2184    789     ahrens  * In this situation if we're out of space, but the resident size of the MFU is
   2185    789     ahrens  * under the limit, victimize the MFU cache to satisfy this insertion request.
   2186    789     ahrens  *
   2187   3403        bmc  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
   2188    789     ahrens  * Here, we've used up all of the available space for the MRU, so we need to
   2189    789     ahrens  * evict from our own cache instead.  Evict from the set of resident MRU
   2190    789     ahrens  * entries.
   2191    789     ahrens  *
   2192   3403        bmc  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
   2193    789     ahrens  * c minus p represents the MFU space in the cache, since p is the size of the
   2194    789     ahrens  * cache that is dedicated to the MRU.  In this situation there's still space on
   2195    789     ahrens  * the MFU side, so the MRU side needs to be victimized.
   2196    789     ahrens  *
   2197   3403        bmc  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
   2198    789     ahrens  * MFU's resident set is consuming more space than it has been allotted.  In
   2199    789     ahrens  * this situation, we must victimize our own cache, the MFU, for this insertion.
   2200    789     ahrens  */
   2201    789     ahrens static void
   2202   2688     maybee arc_get_data_buf(arc_buf_t *buf)
   2203    789     ahrens {
   2204   3290   johansen 	arc_state_t		*state = buf->b_hdr->b_state;
   2205   3290   johansen 	uint64_t		size = buf->b_hdr->b_size;
   2206   3290   johansen 	arc_buf_contents_t	type = buf->b_hdr->b_type;
   2207    789     ahrens 
   2208   2688     maybee 	arc_adapt(size, state);
   2209    789     ahrens 
   2210   2688     maybee 	/*
   2211   2688     maybee 	 * We have not yet reached cache maximum size,
   2212   2688     maybee 	 * just allocate a new buffer.
   2213   2688     maybee 	 */
   2214   4309     maybee 	if (!arc_evict_needed(type)) {
   2215   3290   johansen 		if (type == ARC_BUFC_METADATA) {
   2216   3290   johansen 			buf->b_data = zio_buf_alloc(size);
   2217   8582    Brendan 			arc_space_consume(size, ARC_SPACE_DATA);
   2218   3290   johansen 		} else {
   2219   3290   johansen 			ASSERT(type == ARC_BUFC_DATA);
   2220   3290   johansen 			buf->b_data = zio_data_buf_alloc(size);
   2221   8582    Brendan 			ARCSTAT_INCR(arcstat_data_size, size);
   2222   4309     maybee 			atomic_add_64(&arc_size, size);
   2223   3290   johansen 		}
   2224   2688     maybee 		goto out;
   2225   2688     maybee 	}
   2226   2688     maybee 
   2227   2688     maybee 	/*
   2228   2688     maybee 	 * If we are prefetching from the mfu ghost list, this buffer
   2229   2688     maybee 	 * will end up on the mru list; so steal space from there.
   2230   2688     maybee 	 */
   2231   3403        bmc 	if (state == arc_mfu_ghost)
   2232   3403        bmc 		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
   2233   3403        bmc 	else if (state == arc_mru_ghost)
   2234   3403        bmc 		state = arc_mru;
   2235   2688     maybee 
   2236   3403        bmc 	if (state == arc_mru || state == arc_anon) {
   2237   3403        bmc 		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
   2238   8582    Brendan 		state = (arc_mfu->arcs_lsize[type] >= size &&
   2239   4309     maybee 		    arc_p > mru_used) ? arc_mfu : arc_mru;
   2240    789     ahrens 	} else {
   2241   2688     maybee 		/* MFU cases */
   2242   3403        bmc 		uint64_t mfu_space = arc_c - arc_p;
   2243   8582    Brendan 		state =  (arc_mru->arcs_lsize[type] >= size &&
   2244   4309     maybee 		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
   2245   2688     maybee 	}
   2246   5642     maybee 	if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
   2247   3290   johansen 		if (type == ARC_BUFC_METADATA) {
   2248   3290   johansen 			buf->b_data = zio_buf_alloc(size);
   2249   8582    Brendan 			arc_space_consume(size, ARC_SPACE_DATA);
   2250   3290   johansen 		} else {
   2251   3290   johansen 			ASSERT(type == ARC_BUFC_DATA);
   2252   3290   johansen 			buf->b_data = zio_data_buf_alloc(size);
   2253   8582    Brendan 			ARCSTAT_INCR(arcstat_data_size, size);
   2254   4309     maybee 			atomic_add_64(&arc_size, size);
   2255   3290   johansen 		}
   2256   3403        bmc 		ARCSTAT_BUMP(arcstat_recycle_miss);
   2257   2688     maybee 	}
   2258   2688     maybee 	ASSERT(buf->b_data != NULL);
   2259   2688     maybee out:
   2260   2688     maybee 	/*
   2261   2688     maybee 	 * Update the state size.  Note that ghost states have a
   2262   2688     maybee 	 * "ghost size" and so don't need to be updated.
   2263   2688     maybee 	 */
   2264   2688     maybee 	if (!GHOST_STATE(buf->b_hdr->b_state)) {
   2265   2688     maybee 		arc_buf_hdr_t *hdr = buf->b_hdr;
   2266   2688     maybee 
   2267   3403        bmc 		atomic_add_64(&hdr->b_state->arcs_size, size);
   2268   2688     maybee 		if (list_link_active(&hdr->b_arc_node)) {
   2269   2688     maybee 			ASSERT(refcount_is_zero(&hdr->b_refcnt));
   2270   4309     maybee 			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
   2271    789     ahrens 		}
   2272   3298     maybee 		/*
   2273   3298     maybee 		 * If we are growing the cache, and we are adding anonymous
   2274   3403        bmc 		 * data, and we have outgrown arc_p, update arc_p
   2275   3298     maybee 		 */
   2276   3403        bmc 		if (arc_size < arc_c && hdr->b_state == arc_anon &&
   2277   3403        bmc 		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
   2278   3403        bmc 			arc_p = MIN(arc_c, arc_p + size);
   2279    789     ahrens 	}
   2280    789     ahrens }
   2281    789     ahrens 
   2282    789     ahrens /*
   2283    789     ahrens  * This routine is called whenever a buffer is accessed.
   2284   1544   eschrock  * NOTE: the hash lock is dropped in this function.
   2285    789     ahrens  */
   2286    789     ahrens static void
   2287   2688     maybee arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
   2288    789     ahrens {
   2289  11066     rafael 	clock_t now;
   2290  11066     rafael 
   2291    789     ahrens 	ASSERT(MUTEX_HELD(hash_lock));
   2292    789     ahrens 
   2293   3403        bmc 	if (buf->b_state == arc_anon) {
   2294    789     ahrens 		/*
   2295    789     ahrens 		 * This buffer is not in the cache, and does not
   2296    789     ahrens 		 * appear in our "ghost" list.  Add the new buffer
   2297    789     ahrens 		 * to the MRU state.
   2298    789     ahrens 		 */
   2299    789     ahrens 
   2300    789     ahrens 		ASSERT(buf->b_arc_access == 0);
   2301  11066     rafael 		buf->b_arc_access = ddi_get_lbolt();
   2302   1544   eschrock 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
   2303   3403        bmc 		arc_change_state(arc_mru, buf, hash_lock);
   2304    789     ahrens 
   2305   3403        bmc 	} else if (buf->b_state == arc_mru) {
   2306  11066     rafael 		now = ddi_get_lbolt();
   2307  11066     rafael 
   2308    789     ahrens 		/*
   2309   2391     maybee 		 * If this buffer is here because of a prefetch, then either:
   2310   2391     maybee 		 * - clear the flag if this is a "referencing" read
   2311   2391     maybee 		 *   (any subsequent access will bump this into the MFU state).
   2312   2391     maybee 		 * or
   2313   2391     maybee 		 * - move the buffer to the head of the list if this is
   2314   2391     maybee 		 *   another prefetch (to make it less likely to be evicted).
   2315    789     ahrens 		 */
   2316    789     ahrens 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
   2317   2391     maybee 			if (refcount_count(&buf->b_refcnt) == 0) {
   2318   2391     maybee 				ASSERT(list_link_active(&buf->b_arc_node));
   2319   2391     maybee 			} else {
   2320   2391     maybee 				buf->b_flags &= ~ARC_PREFETCH;
   2321   3403        bmc 				ARCSTAT_BUMP(arcstat_mru_hits);
   2322   2391     maybee 			}
   2323  11066     rafael 			buf->b_arc_access = now;
   2324    789     ahrens 			return;
   2325    789     ahrens 		}
   2326    789     ahrens 
   2327    789     ahrens 		/*
   2328    789     ahrens 		 * This buffer has been "accessed" only once so far,
   2329    789     ahrens 		 * but it is still in the cache. Move it to the MFU
   2330    789     ahrens 		 * state.
   2331    789     ahrens 		 */
   2332  11066     rafael 		if (now > buf->b_arc_access + ARC_MINTIME) {
   2333    789     ahrens 			/*
   2334    789     ahrens 			 * More than 125ms have passed since we
   2335    789     ahrens 			 * instantiated this buffer.  Move it to the
   2336    789     ahrens 			 * most frequently used state.
   2337    789     ahrens 			 */
   2338  11066     rafael 			buf->b_arc_access = now;
   2339   1544   eschrock 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
   2340   3403        bmc 			arc_change_state(arc_mfu, buf, hash_lock);
   2341    789     ahrens 		}
   2342   3403        bmc 		ARCSTAT_BUMP(arcstat_mru_hits);
   2343   3403        bmc 	} else if (buf->b_state == arc_mru_ghost) {
   2344    789     ahrens 		arc_state_t	*new_state;
   2345    789     ahrens 		/*
   2346    789     ahrens 		 * This buffer has been "accessed" recently, but
   2347    789     ahrens 		 * was evicted from the cache.  Move it to the
   2348    789     ahrens 		 * MFU state.
   2349    789     ahrens 		 */
   2350    789     ahrens 
   2351    789     ahrens 		if (buf->b_flags & ARC_PREFETCH) {
   2352   3403        bmc 			new_state = arc_mru;
   2353   2391     maybee 			if (refcount_count(&buf->b_refcnt) > 0)
   2354   2391     maybee 				buf->b_flags &= ~ARC_PREFETCH;
   2355   1544   eschrock 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
   2356    789     ahrens 		} else {
   2357   3403        bmc 			new_state = arc_mfu;
   2358   1544   eschrock 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
   2359    789     ahrens 		}
   2360    789     ahrens 
   2361  11066     rafael 		buf->b_arc_access = ddi_get_lbolt();
   2362    789     ahrens 		arc_change_state(new_state, buf, hash_lock);
   2363    789     ahrens 
   2364   3403        bmc 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
   2365   3403        bmc 	} else if (buf->b_state == arc_mfu) {
   2366    789     ahrens 		/*
   2367    789     ahrens 		 * This buffer has been accessed more than once and is
   2368    789     ahrens 		 * still in the cache.  Keep it in the MFU state.
   2369    789     ahrens 		 *
   2370   2391     maybee 		 * NOTE: an add_reference() that occurred when we did
   2371   2391     maybee 		 * the arc_read() will have kicked this off the list.
   2372   2391     maybee 		 * If it was a prefetch, we will explicitly move it to
   2373   2391     maybee 		 * the head of the list now.
   2374    789     ahrens 		 */
   2375   2391     maybee 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
   2376   2391     maybee 			ASSERT(refcount_count(&buf->b_refcnt) == 0);
   2377   2391     maybee 			ASSERT(list_link_active(&buf->b_arc_node));
   2378   2391     maybee 		}
   2379   3403        bmc 		ARCSTAT_BUMP(arcstat_mfu_hits);
   2380  11066     rafael 		buf->b_arc_access = ddi_get_lbolt();
   2381   3403        bmc 	} else if (buf->b_state == arc_mfu_ghost) {
   2382   3403        bmc 		arc_state_t	*new_state = arc_mfu;
   2383    789     ahrens 		/*
   2384    789     ahrens 		 * This buffer has been accessed more than once but has
   2385    789     ahrens 		 * been evicted from the cache.  Move it back to the
   2386    789     ahrens 		 * MFU state.
   2387    789     ahrens 		 */
   2388    789     ahrens 
   2389   2391     maybee 		if (buf->b_flags & ARC_PREFETCH) {
   2390   2391     maybee 			/*
   2391   2391     maybee 			 * This is a prefetch access...
   2392   2391     maybee 			 * move this block back to the MRU state.
   2393   2391     maybee 			 */
   2394   2391     maybee 			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
   2395   3403        bmc 			new_state = arc_mru;
   2396   2391     maybee 		}
   2397   2391     maybee 
   2398  11066     rafael 		buf->b_arc_access = ddi_get_lbolt();
   2399   1544   eschrock 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
   2400   2391     maybee 		arc_change_state(new_state, buf, hash_lock);
   2401    789     ahrens 
   2402   3403        bmc 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
   2403   5450    brendan 	} else if (buf->b_state == arc_l2c_only) {
   2404   5450    brendan 		/*
   2405   5450    brendan 		 * This buffer is on the 2nd Level ARC.
   2406   5450    brendan 		 */
   2407   5450    brendan 
   2408  11066     rafael 		buf->b_arc_access = ddi_get_lbolt();
   2409   5450    brendan 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
   2410   5450    brendan 		arc_change_state(arc_mfu, buf, hash_lock);
   2411    789     ahrens 	} else {
   2412    789     ahrens 		ASSERT(!"invalid arc state");
   2413    789     ahrens 	}
   2414    789     ahrens }
   2415    789     ahrens 
   2416    789     ahrens /* a generic arc_done_func_t which you can use */
   2417    789     ahrens /* ARGSUSED */
   2418    789     ahrens void
   2419    789     ahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
   2420    789     ahrens {
   2421    789     ahrens 	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
   2422   1544   eschrock 	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
   2423    789     ahrens }
   2424    789     ahrens 
   2425   4309     maybee /* a generic arc_done_func_t */
   2426    789     ahrens void
   2427    789     ahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
   2428    789     ahrens {
   2429    789     ahrens 	arc_buf_t **bufp = arg;
   2430    789     ahrens 	if (zio && zio->io_error) {
   2431   1544   eschrock 		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
   2432    789     ahrens 		*bufp = NULL;
   2433    789     ahrens 	} else {
   2434    789     ahrens 		*bufp = buf;
   2435    789     ahrens 	}
   2436    789     ahrens }
   2437    789     ahrens 
                          /*
                           * Completion callback for the read zio issued by arc_read_nolock();
                           * zio->io_private is the arc_buf_t the data was read into.
                           *
                           * Byteswaps the data if the bp calls for it, computes the buffer
                           * checksum, assigns a buffer to every queued callback that has a done
                           * function, clears ARC_IO_IN_PROGRESS and wakes waiters on hdr->b_cv.
                           * The callbacks are then run and freed after the hash lock has been
                           * dropped.  On I/O error the hdr is flagged ARC_IO_ERROR, moved to
                           * arc_anon and removed from the hash table.  The hdr is destroyed at
                           * the end if it was found to be unreferenced on the error path or on
                           * the freed-during-read path.
                           */
   2438    789     ahrens static void
   2439    789     ahrens arc_read_done(zio_t *zio)
   2440    789     ahrens {
   2441   1589     maybee 	arc_buf_hdr_t	*hdr, *found;
   2442    789     ahrens 	arc_buf_t	*buf;
   2443    789     ahrens 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
   2444    789     ahrens 	kmutex_t	*hash_lock;
   2445    789     ahrens 	arc_callback_t	*callback_list, *acb;
   2446    789     ahrens 	int		freeable = FALSE;
   2447    789     ahrens 
   2448    789     ahrens 	buf = zio->io_private;
   2449    789     ahrens 	hdr = buf->b_hdr;
   2450    789     ahrens 
   2451   1589     maybee 	/*
   2452   1589     maybee 	 * The hdr was inserted into hash-table and removed from lists
   2453   1589     maybee 	 * prior to starting I/O.  We should find this header, since
   2454   1589     maybee 	 * it's in the hash table, and it should be legit since it's
   2455   1589     maybee 	 * not possible to evict it during the I/O.  The only possible
   2456   1589     maybee 	 * reason for it not to be found is if we were freed during the
   2457   1589     maybee 	 * read.
   2458   1589     maybee 	 */
   2459   8636       Mark 	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
   2460   3093     ahrens 	    &hash_lock);
   2461    789     ahrens 
                          	/*
                          	 * hash_lock is expected to be NULL exactly when the hdr was not
                          	 * found; when it is non-NULL it is released further down.
                          	 */
   2462   1589     maybee 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
   2463   5450    brendan 	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
   2464   5450    brendan 	    (found == hdr && HDR_L2_READING(hdr)));
   2465   5450    brendan 
   2466   6987    brendan 	hdr->b_flags &= ~ARC_L2_EVICTED;
   2467   5450    brendan 	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
   2468   7237   ek110237 		hdr->b_flags &= ~ARC_L2CACHE;
   2469    789     ahrens 
   2470    789     ahrens 	/* byteswap if necessary */
   2471    789     ahrens 	callback_list = hdr->b_acb;
   2472    789     ahrens 	ASSERT(callback_list != NULL);
   2473  10839    william 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
   2474   7046     ahrens 		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
   2475   7046     ahrens 		    byteswap_uint64_array :
   2476   7046     ahrens 		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
   2477   7046     ahrens 		func(buf->b_data, hdr->b_size);
   2478   7046     ahrens 	}
   2479   3093     ahrens 
   2480   5450    brendan 	arc_cksum_compute(buf, B_FALSE);
   2481    789     ahrens 
   2482  10922       Jeff 	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
   2483  10922       Jeff 		/*
   2484  10922       Jeff 		 * Only call arc_access on anonymous buffers.  This is because
   2485  10922       Jeff 		 * if we've issued an I/O for an evicted buffer, we've already
   2486  10922       Jeff 		 * called arc_access (to prevent any simultaneous readers from
   2487  10922       Jeff 		 * getting confused).
   2488  10922       Jeff 		 */
   2489  10922       Jeff 		arc_access(hdr, hash_lock);
   2490  10922       Jeff 	}
   2491  10922       Jeff 
   2492    789     ahrens 	/* create copies of the data buffer for the callers */
                          	/*
                          	 * The first callback with a done func is given buf itself; each
                          	 * later one is given a clone of it.
                          	 */
   2493    789     ahrens 	abuf = buf;
   2494    789     ahrens 	for (acb = callback_list; acb; acb = acb->acb_next) {
   2495    789     ahrens 		if (acb->acb_done) {
   2496   2688     maybee 			if (abuf == NULL)
   2497   2688     maybee 				abuf = arc_buf_clone(buf);
   2498    789     ahrens 			acb->acb_buf = abuf;
   2499    789     ahrens 			abuf = NULL;
   2500    789     ahrens 		}
   2501    789     ahrens 	}
   2502    789     ahrens 	hdr->b_acb = NULL;
   2503    789     ahrens 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
   2504   1544   eschrock 	ASSERT(!HDR_BUF_AVAILABLE(hdr));
                          	/* No callback took buf, so flag it as available on the hdr. */
   2505  10922       Jeff 	if (abuf == buf) {
   2506  10922       Jeff 		ASSERT(buf->b_efunc == NULL);
   2507  10922       Jeff 		ASSERT(hdr->b_datacnt == 1);
   2508   1544   eschrock 		hdr->b_flags |= ARC_BUF_AVAILABLE;
   2509  10922       Jeff 	}
   2510    789     ahrens 
   2511    789     ahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
   2512    789     ahrens 
   2513    789     ahrens 	if (zio->io_error != 0) {
   2514    789     ahrens 		hdr->b_flags |= ARC_IO_ERROR;
   2515   3403        bmc 		if (hdr->b_state != arc_anon)
   2516   3403        bmc 			arc_change_state(arc_anon, hdr, hash_lock);
   2517   1544   eschrock 		if (HDR_IN_HASH_TABLE(hdr))
   2518   1544   eschrock 			buf_hash_remove(hdr);
   2519    789     ahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
   2520    789     ahrens 	}
   2521   1544   eschrock 
   2522   1544   eschrock 	/*
   2523   2391     maybee 	 * Broadcast before we drop the hash_lock to avoid the possibility
   2524   2391     maybee 	 * that the hdr (and hence the cv) might be freed before we get to
   2525   2391     maybee 	 * the cv_broadcast().
   2526   1544   eschrock 	 */
   2527   1544   eschrock 	cv_broadcast(&hdr->b_cv);
   2528    789     ahrens 
   2529   1589     maybee 	if (hash_lock) {
   2530   2688     maybee 		mutex_exit(hash_lock);
   2531    789     ahrens 	} else {
   2532    789     ahrens 		/*
   2533    789     ahrens 		 * This block was freed while we waited for the read to
   2534    789     ahrens 		 * complete.  It has been removed from the hash table and
   2535    789     ahrens 		 * moved to the anonymous state (so that it won't show up
   2536    789     ahrens 		 * in the cache).
   2537    789     ahrens 		 */
   2538   3403        bmc 		ASSERT3P(hdr->b_state, ==, arc_anon);
   2539    789     ahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
   2540    789     ahrens 	}
   2541    789     ahrens 
   2542    789     ahrens 	/* execute each callback and free its structure */
   2543    789     ahrens 	while ((acb = callback_list) != NULL) {
   2544    789     ahrens 		if (acb->acb_done)
   2545    789     ahrens 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
   2546    789     ahrens 
                          		/* Pass the I/O result on to the dummy zio, if one was set up. */
   2547    789     ahrens 		if (acb->acb_zio_dummy != NULL) {
   2548    789     ahrens 			acb->acb_zio_dummy->io_error = zio->io_error;
   2549    789     ahrens 			zio_nowait(acb->acb_zio_dummy);
   2550    789     ahrens 		}
   2551    789     ahrens 
   2552    789     ahrens 		callback_list = acb->acb_next;
   2553    789     ahrens 		kmem_free(acb, sizeof (arc_callback_t));
   2554    789     ahrens 	}
   2555    789     ahrens 
                          	/* The hdr had no references on the error/freed path; destroy it. */
   2556    789     ahrens 	if (freeable)
   2557   1544   eschrock 		arc_hdr_destroy(hdr);
   2558    789     ahrens }
   2559    789     ahrens 
   2560    789     ahrens /*
   2561    789     ahrens  * "Read" the block at the specified DVA (in bp) via the
   2562    789     ahrens  * cache.  If the block is found in the cache, invoke the provided
   2563    789     ahrens  * callback immediately and return.  Note that the `zio' parameter
   2564    789     ahrens  * in the callback will be NULL in this case, since no IO was
   2565    789     ahrens  * required.  If the block is not in the cache pass the read request
   2566    789     ahrens  * on to the spa with a substitute callback function, so that the
   2567    789     ahrens  * requested block will be added to the cache.
   2568    789     ahrens  *
   2569    789     ahrens  * If a read request arrives for a block that has a read in-progress,
   2570    789     ahrens  * either wait for the in-progress read to complete (and return the
   2571    789     ahrens  * results); or, if this is a read with a "done" func, add a record
   2572    789     ahrens  * to the read to invoke the "done" func when the read completes,
   2573    789     ahrens  * and return; or just return.
   2574    789     ahrens  *
   2575    789     ahrens  * arc_read_done() will invoke all the requested "done" functions
   2576    789     ahrens  * for readers of this block.
   2577   7046     ahrens  *
   2578   7046     ahrens  * Normal callers should use arc_read and pass the arc buffer and offset
   2579   7046     ahrens  * for the bp.  But if you know you don't need locking, you can use
   2580   8213   Suhasini  * arc_read_nolock.
   2581   7046     ahrens  */
   2582   7046     ahrens int
   2583  10922       Jeff arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
   2584   7237   ek110237     arc_done_func_t *done, void *private, int priority, int zio_flags,
   2585   7046     ahrens     uint32_t *arc_flags, const zbookmark_t *zb)
   2586   7046     ahrens {
   2587   7046     ahrens 	int err;
   2588   7046     ahrens 
   2589   7046     ahrens 	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
   2590   7046     ahrens 	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
   2591   7545       Mark 	rw_enter(&pbuf->b_lock, RW_READER);
   2592   7046     ahrens 
   2593   7046     ahrens 	err = arc_read_nolock(pio, spa, bp, done, private, priority,
   2594   7237   ek110237 	    zio_flags, arc_flags, zb);
   2595   7545       Mark 	rw_exit(&pbuf->b_lock);
   2596   9396    Matthew 
   2597   7046     ahrens 	return (err);
   2598   7046     ahrens }
   2599   7046     ahrens 
                          /*
                           * Worker behind arc_read(): looks the block up in the ARC hash table
                           * and either returns cached data or issues the read, without taking
                           * any lock on a containing buffer.
                           *
                           *	pio		zio handed on to zio_null()/zio_read()/zio_read_phys()
                           *	done		optional callback; called as done(NULL, buf, private)
                           *			on a cache hit, otherwise run from arc_read_done()
                           *	private		tag for the reference taken on the hdr; also passed
                           *			to done
                           *	arc_flags	in/out: ARC_WAIT, ARC_NOWAIT, ARC_PREFETCH and
                           *			ARC_L2CACHE are examined; ARC_CACHED is set on a hit
                           *
                           * Returns 0, except that an ARC_WAIT read that reaches zio_read()
                           * returns the value of zio_wait() on that zio.
                           */
   2600   7046     ahrens int
   2601  10922       Jeff arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
   2602   7237   ek110237     arc_done_func_t *done, void *private, int priority, int zio_flags,
   2603   7046     ahrens     uint32_t *arc_flags, const zbookmark_t *zb)
   2604    789     ahrens {
   2605    789     ahrens 	arc_buf_hdr_t *hdr;
   2606    789     ahrens 	arc_buf_t *buf;
   2607    789     ahrens 	kmutex_t *hash_lock;
   2608   5450    brendan 	zio_t *rzio;
   2609   8636       Mark 	uint64_t guid = spa_guid(spa);
   2610    789     ahrens 
   2611    789     ahrens top:
   2612  10922       Jeff 	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
   2613  10922       Jeff 	    &hash_lock);
                          	/* Hit: a hdr exists and it has at least one data buffer. */
   2614   1544   eschrock 	if (hdr && hdr->b_datacnt > 0) {
   2615    789     ahrens 
   2616   2391     maybee 		*arc_flags |= ARC_CACHED;
   2617   2391     maybee 
   2618    789     ahrens 		if (HDR_IO_IN_PROGRESS(hdr)) {
   2619   2391     maybee 
                          			/*
                          			 * Block on b_cv (arc_read_done() broadcasts it once
                          			 * the read has finished), then redo the lookup.
                          			 */
   2620   2391     maybee 			if (*arc_flags & ARC_WAIT) {
   2621   2391     maybee 				cv_wait(&hdr->b_cv, hash_lock);
   2622   2391     maybee 				mutex_exit(hash_lock);
   2623   2391     maybee 				goto top;
   2624   2391     maybee 			}
   2625   2391     maybee 			ASSERT(*arc_flags & ARC_NOWAIT);
   2626   2391     maybee 
                          			/*
                          			 * Queue a callback record on the in-flight read so
                          			 * that arc_read_done() will invoke `done' for us.
                          			 */
   2627   2391     maybee 			if (done) {
   2628    789     ahrens 				arc_callback_t	*acb = NULL;
   2629    789     ahrens 
   2630    789     ahrens 				acb = kmem_zalloc(sizeof (arc_callback_t),
   2631    789     ahrens 				    KM_SLEEP);
   2632    789     ahrens 				acb->acb_done = done;
   2633    789     ahrens 				acb->acb_private = private;
   2634    789     ahrens 				if (pio != NULL)
   2635    789     ahrens 					acb->acb_zio_dummy = zio_null(pio,
   2636   8632       Bill 					    spa, NULL, NULL, NULL, zio_flags);
   2637    789     ahrens 
   2638    789     ahrens 				ASSERT(acb->acb_done != NULL);
   2639    789     ahrens 				acb->acb_next = hdr->b_acb;
   2640    789     ahrens 				hdr->b_acb = acb;
   2641    789     ahrens 				add_reference(hdr, hash_lock, private);
   2642    789     ahrens 				mutex_exit(hash_lock);
   2643    789     ahrens 				return (0);
   2644    789     ahrens 			}
                          			/* NOWAIT with no done func: the read is already under way. */
   2645    789     ahrens 			mutex_exit(hash_lock);
   2646    789     ahrens 			return (0);
   2647    789     ahrens 		}
   2648    789     ahrens 
   2649   3403        bmc 		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
   2650    789     ahrens 
   2651   1544   eschrock 		if (done) {
   2652   2688     maybee 			add_reference(hdr, hash_lock, private);
   2653   1544   eschrock 			/*
   2654   1544   eschrock 			 * If this block is already in use, create a new
   2655   1544   eschrock 			 * copy of the data so that we will be guaranteed
   2656   1544   eschrock 			 * that arc_release() will always succeed.
   2657   1544   eschrock 			 */
   2658   1544   eschrock 			buf = hdr->b_buf;
   2659   1544   eschrock 			ASSERT(buf);
   2660   1544   eschrock 			ASSERT(buf->b_data);
   2661   2688     maybee 			if (HDR_BUF_AVAILABLE(hdr)) {
   2662   1544   eschrock 				ASSERT(buf->b_efunc == NULL);
   2663   1544   eschrock 				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
   2664   2688     maybee 			} else {
   2665   2688     maybee 				buf = arc_buf_clone(buf);
   2666   1544   eschrock 			}
   2667  10922       Jeff 
   2668   2391     maybee 		} else if (*arc_flags & ARC_PREFETCH &&
   2669   2391     maybee 		    refcount_count(&hdr->b_refcnt) == 0) {
   2670   2391     maybee 			hdr->b_flags |= ARC_PREFETCH;
   2671    789     ahrens 		}
   2672    789     ahrens 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
   2673   2688     maybee 		arc_access(hdr, hash_lock);
   2674   7237   ek110237 		if (*arc_flags & ARC_L2CACHE)
   2675   7237   ek110237 			hdr->b_flags |= ARC_L2CACHE;
   2676   2688     maybee 		mutex_exit(hash_lock);
   2677   3403        bmc 		ARCSTAT_BUMP(arcstat_hits);
   2678   3403        bmc 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
   2679   3403        bmc 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
   2680   3403        bmc 		    data, metadata, hits);
   2681   3403        bmc 
                          		/* No I/O was needed, so the callback's zio argument is NULL. */
   2682    789     ahrens 		if (done)
   2683    789     ahrens 			done(NULL, buf, private);
   2684    789     ahrens 	} else {
                          		/*
                          		 * Miss: there is no hdr, or the hdr has no data buffers.
                          		 * Set up a buffer and issue a read for it.
                          		 */
   2685    789     ahrens 		uint64_t size = BP_GET_LSIZE(bp);
   2686    789     ahrens 		arc_callback_t	*acb;
   2687   6987    brendan 		vdev_t *vd = NULL;
                          		/* addr is only assigned, and only used, while vd is non-NULL */
   2688   9215     George 		uint64_t addr;
   2689   8582    Brendan 		boolean_t devw = B_FALSE;
   2690    789     ahrens 
   2691    789     ahrens 		if (hdr == NULL) {
   2692    789     ahrens 			/* this block is not in the cache */
   2693    789     ahrens 			arc_buf_hdr_t	*exists;
   2694   3290   johansen 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
   2695   3290   johansen 			buf = arc_buf_alloc(spa, size, private, type);
   2696    789     ahrens 			hdr = buf->b_hdr;
   2697    789     ahrens 			hdr->b_dva = *BP_IDENTITY(bp);
   2698  10922       Jeff 			hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
   2699    789     ahrens 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
   2700    789     ahrens 			exists = buf_hash_insert(hdr, &hash_lock);
   2701    789     ahrens 			if (exists) {
   2702    789     ahrens 				/* somebody beat us to the hash insert */
   2703    789     ahrens 				mutex_exit(hash_lock);
   2704    789     ahrens 				bzero(&hdr->b_dva, sizeof (dva_t));
   2705    789     ahrens 				hdr->b_birth = 0;
   2706    789     ahrens 				hdr->b_cksum0 = 0;
   2707   1544   eschrock 				(void) arc_buf_remove_ref(buf, private);
   2708    789     ahrens 				goto top; /* restart the IO request */
   2709    789     ahrens 			}
   2710   2391     maybee 			/* if this is a prefetch, we don't have a reference */
   2711   2391     maybee 			if (*arc_flags & ARC_PREFETCH) {
   2712   2391     maybee 				(void) remove_reference(hdr, hash_lock,
   2713   2391     maybee 				    private);
   2714   2391     maybee 				hdr->b_flags |= ARC_PREFETCH;
   2715   2391     maybee 			}
   2716   7237   ek110237 			if (*arc_flags & ARC_L2CACHE)
   2717   7237   ek110237 				hdr->b_flags |= ARC_L2CACHE;
   2718   2391     maybee 			if (BP_GET_LEVEL(bp) > 0)
   2719   2391     maybee 				hdr->b_flags |= ARC_INDIRECT;
   2720    789     ahrens 		} else {
   2721    789     ahrens 			/* this block is in the ghost cache */
   2722   1544   eschrock 			ASSERT(GHOST_STATE(hdr->b_state));
   2723   1544   eschrock 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
   2724   2391     maybee 			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
   2725   2391     maybee 			ASSERT(hdr->b_buf == NULL);
   2726    789     ahrens 
   2727   2391     maybee 			/* if this is a prefetch, we don't have a reference */
   2728   2391     maybee 			if (*arc_flags & ARC_PREFETCH)
   2729   2391     maybee 				hdr->b_flags |= ARC_PREFETCH;
   2730   2391     maybee 			else
   2731   2391     maybee 				add_reference(hdr, hash_lock, private);
   2732   7237   ek110237 			if (*arc_flags & ARC_L2CACHE)
   2733   7237   ek110237 				hdr->b_flags |= ARC_L2CACHE;
                          			/* Attach a fresh arc_buf_t and data buffer to the hdr. */
   2734   6245     maybee 			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
   2735   1544   eschrock 			buf->b_hdr = hdr;
   2736   2688     maybee 			buf->b_data = NULL;
   2737   1544   eschrock 			buf->b_efunc = NULL;
   2738   1544   eschrock 			buf->b_private = NULL;
   2739   1544   eschrock 			buf->b_next = NULL;
   2740   1544   eschrock 			hdr->b_buf = buf;
   2741   2688     maybee 			arc_get_data_buf(buf);
   2742   1544   eschrock 			ASSERT(hdr->b_datacnt == 0);
   2743   1544   eschrock 			hdr->b_datacnt = 1;
   2744    789     ahrens 		}
   2745    789     ahrens 
                          		/* Record the caller's callback; arc_read_done() will run it. */
   2746    789     ahrens 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
   2747    789     ahrens 		acb->acb_done = done;
   2748    789     ahrens 		acb->acb_private = private;
   2749    789     ahrens 
   2750    789     ahrens 		ASSERT(hdr->b_acb == NULL);
   2751    789     ahrens 		hdr->b_acb = acb;
   2752    789     ahrens 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
   2753    789     ahrens 
   2754    789     ahrens 		/*
   2755    789     ahrens 		 * If the buffer has been evicted, migrate it to a present state
   2756    789     ahrens 		 * before issuing the I/O.  Once we drop the hash-table lock,
   2757    789     ahrens 		 * the header will be marked as I/O in progress and have an
   2758    789     ahrens 		 * attached buffer.  At this point, anybody who finds this
   2759    789     ahrens 		 * buffer ought to notice that it's legit but has a pending I/O.
   2760    789     ahrens 		 */
   2761    789     ahrens 
   2762   1544   eschrock 		if (GHOST_STATE(hdr->b_state))
   2763   2688     maybee 			arc_access(hdr, hash_lock);
   2764    789     ahrens 
                          		/*
                          		 * While the hash lock is still held, capture the L2ARC device,
                          		 * its writing flag and the block's device address.
                          		 */
   2765   7754       Jeff 		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
   2766   7754       Jeff 		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
   2767   8582    Brendan 			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
   2768   6987    brendan 			addr = hdr->b_l2hdr->b_daddr;
   2769   7754       Jeff 			/*
   2770   7754       Jeff 			 * Lock out device removal.
   2771   7754       Jeff 			 */
   2772   7754       Jeff 			if (vdev_is_dead(vd) ||
   2773   7754       Jeff 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
   2774   7754       Jeff 				vd = NULL;
   2775   6987    brendan 		}
   2776   6987    brendan 
   2777   6987    brendan 		mutex_exit(hash_lock);
   2778   6987    brendan 
   2779    789     ahrens 		ASSERT3U(hdr->b_size, ==, size);
   2780  10409    Brendan 		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
   2781  10409    Brendan 		    uint64_t, size, zbookmark_t *, zb);
   2782   3403        bmc 		ARCSTAT_BUMP(arcstat_misses);
   2783   3403        bmc 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
   2784   3403        bmc 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
   2785   3403        bmc 		    data, metadata, misses);
   2786   1544   eschrock 
   2787   8582    Brendan 		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
   2788   6987    brendan 			/*
   2789   5450    brendan 			 * Read from the L2ARC if the following are true:
   2790   6987    brendan 			 * 1. The L2ARC vdev was previously cached.
   2791   6987    brendan 			 * 2. This buffer still has L2ARC metadata.
   2792   6987    brendan 			 * 3. This buffer isn't currently writing to the L2ARC.
   2793   6987    brendan 			 * 4. The L2ARC entry wasn't evicted, which may
   2794   6987    brendan 			 *    also have invalidated the vdev.
   2795   8582    Brendan 			 * 5. This isn't a prefetch while l2arc_noprefetch is set.
   2796   6987    brendan 			 */
   2797   7754       Jeff 			if (hdr->b_l2hdr != NULL &&
   2798   8582    Brendan 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
   2799   8582    Brendan 			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
   2800   5450    brendan 				l2arc_read_callback_t *cb;
   2801   6643   eschrock 
   2802   5450    brendan 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
   2803   5450    brendan 				ARCSTAT_BUMP(arcstat_l2_hits);
   2804   5450    brendan 
   2805   5450    brendan 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
   2806   5450    brendan 				    KM_SLEEP);
   2807   5450    brendan 				cb->l2rcb_buf = buf;
   2808   5450    brendan 				cb->l2rcb_spa = spa;
   2809   5450    brendan 				cb->l2rcb_bp = *bp;
   2810   5450    brendan 				cb->l2rcb_zb = *zb;
   2811   7237   ek110237 				cb->l2rcb_flags = zio_flags;
   2812   5450    brendan 
   2813   5450    brendan 				/*
   2814   7754       Jeff 				 * l2arc read.  The SCL_L2ARC lock will be
   2815   7754       Jeff 				 * released by l2arc_read_done().
   2816   5450    brendan 				 */
   2817   5450    brendan 				rzio = zio_read_phys(pio, vd, addr, size,
   2818   5450    brendan 				    buf->b_data, ZIO_CHECKSUM_OFF,
   2819   7237   ek110237 				    l2arc_read_done, cb, priority, zio_flags |
   2820   7361    Brendan 				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
   2821   7754       Jeff 				    ZIO_FLAG_DONT_PROPAGATE |
   2822   7754       Jeff 				    ZIO_FLAG_DONT_RETRY, B_FALSE);
   2823   5450    brendan 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
   2824   5450    brendan 				    zio_t *, rzio);
   2825   8582    Brendan 				ARCSTAT_INCR(arcstat_l2_read_bytes, size);
   2826   6987    brendan 
   2827   6987    brendan 				if (*arc_flags & ARC_NOWAIT) {
   2828   6987    brendan 					zio_nowait(rzio);
   2829   6987    brendan 					return (0);
   2830   6987    brendan 				}
   2831   6987    brendan 
   2832   6987    brendan 				ASSERT(*arc_flags & ARC_WAIT);
   2833   6987    brendan 				if (zio_wait(rzio) == 0)
   2834   6987    brendan 					return (0);
   2835   6987    brendan 
   2836   6987    brendan 				/* l2arc read error; goto zio_read() */
   2837   5450    brendan 			} else {
   2838   5450    brendan 				DTRACE_PROBE1(l2arc__miss,
   2839   5450    brendan 				    arc_buf_hdr_t *, hdr);
   2840   5450    brendan 				ARCSTAT_BUMP(arcstat_l2_misses);
   2841   5450    brendan 				if (HDR_L2_WRITING(hdr))
   2842   5450    brendan 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
   2843   7754       Jeff 				spa_config_exit(spa, SCL_L2ARC, vd);
   2844   8582    Brendan 			}
   2845   8582    Brendan 		} else {
                          			/* Not reading from the L2ARC: drop SCL_L2ARC if we took it. */
   2846   8628       Bill 			if (vd != NULL)
   2847   8628       Bill 				spa_config_exit(spa, SCL_L2ARC, vd);
   2848   8582    Brendan 			if (l2arc_ndev != 0) {
   2849   8582    Brendan 				DTRACE_PROBE1(l2arc__miss,
   2850   8582    Brendan 				    arc_buf_hdr_t *, hdr);
   2851   8582    Brendan 				ARCSTAT_BUMP(arcstat_l2_misses);
   2852   6987    brendan 			}
   2853   6987    brendan 		}
   2854   5450    brendan 
                          		/* Read from the pool; arc_read_done() completes the request. */
   2855    789     ahrens 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
   2856   7237   ek110237 		    arc_read_done, buf, priority, zio_flags, zb);
   2857    789     ahrens 
   2858   2391     maybee 		if (*arc_flags & ARC_WAIT)
   2859    789     ahrens 			return (zio_wait(rzio));
   2860    789     ahrens 
   2861   2391     maybee 		ASSERT(*arc_flags & ARC_NOWAIT);
   2862    789     ahrens 		zio_nowait(rzio);
   2863    789     ahrens 	}
   2864    789     ahrens 	return (0);
   2865    789     ahrens }
   2866    789     ahrens 
   2867   1544   eschrock void
   2868   1544   eschrock arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
   2869   1544   eschrock {
   2870   1544   eschrock 	ASSERT(buf->b_hdr != NULL);
   2871   3403        bmc 	ASSERT(buf->b_hdr->b_state != arc_anon);
   2872   1544   eschrock 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
   2873  10922       Jeff 	ASSERT(buf->b_efunc == NULL);
   2874  10922       Jeff 	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
   2875  10922       Jeff 
   2876   1544   eschrock 	buf->b_efunc = func;
   2877   1544   eschrock 	buf->b_private = private;
   2878   1544   eschrock }
   2879   1544   eschrock 
   2880   1544   eschrock /*
   2881   1544   eschrock  * This is used by the DMU to let the ARC know that a buffer is
   2882   1544   eschrock  * being evicted, so the ARC should clean up.  If this arc buf
   2883   1544   eschrock  * is not yet in the evicted state, it will be put there.
                           *
                           * Returns 0, without calling anything, if buf->b_hdr is NULL.
                           * Otherwise the buffer's b_efunc is invoked (it must return 0) and
                           * 1 is returned.  When the buf has a hdr but no data, the callback is
                           * run on a stack copy and buf itself is left in place.  When the buf
                           * still has data, it is unlinked from its hdr, its data is destroyed
                           * and the arc_buf_t is freed back to buf_cache.
   2884   1544   eschrock  */
   2885   1544   eschrock int
   2886   1544   eschrock arc_buf_evict(arc_buf_t *buf)
   2887   1544   eschrock {
   2888   2887     maybee 	arc_buf_hdr_t *hdr;
   2889   1544   eschrock 	kmutex_t *hash_lock;
   2890   1544   eschrock 	arc_buf_t **bufp;
   2891   1544   eschrock 
   2892   7545       Mark 	rw_enter(&buf->b_lock, RW_WRITER);
   2893   2887     maybee 	hdr = buf->b_hdr;
   2894   1544   eschrock 	if (hdr == NULL) {
   2895   1544   eschrock 		/*
   2896   1544   eschrock 		 * We are in arc_do_user_evicts().
   2897   1544   eschrock 		 */
   2898   1544   eschrock 		ASSERT(buf->b_data == NULL);
   2899   7545       Mark 		rw_exit(&buf->b_lock);
   2900   1544   eschrock 		return (0);
   2901   7545       Mark 	} else if (buf->b_data == NULL) {
   2902   7545       Mark 		arc_buf_t copy = *buf; /* structure assignment */
   2903   7545       Mark 		/*
   2904   7545       Mark 		 * We are on the eviction list; process this buffer now
   2905   7545       Mark 		 * but let arc_do_user_evicts() do the reaping.
   2906   7545       Mark 		 */
                          		/*
                          		 * Clear b_efunc under the lock, then call the saved callback
                          		 * on the copy once the lock has been dropped.
                          		 */
   2907   7545       Mark 		buf->b_efunc = NULL;
   2908   7545       Mark 		rw_exit(&buf->b_lock);
   2909   7545       Mark 		VERIFY(copy.b_efunc(&copy) == 0);
   2910   7545       Mark 		return (1);
   2911   1544   eschrock 	}
   2912   2887     maybee 	hash_lock = HDR_LOCK(hdr);
   2913   7545       Mark 	mutex_enter(hash_lock);
   2914   2724     maybee 
   2915   2724     maybee 	ASSERT(buf->b_hdr == hdr);
   2916   2724     maybee 	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
   2917   3403        bmc 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
   2918   1544   eschrock 
   2919   1544   eschrock 	/*
   2920   1544   eschrock 	 * Pull this buffer off of the hdr
   2921   1544   eschrock 	 */
   2922   1544   eschrock 	bufp = &hdr->b_buf;
   2923   1544   eschrock 	while (*bufp != buf)
   2924   1544   eschrock 		bufp = &(*bufp)->b_next;
   2925   1544   eschrock 	*bufp = buf->b_next;
   2926   1544   eschrock 
   2927   1544   eschrock 	ASSERT(buf->b_data != NULL);
   2928   2688     maybee 	arc_buf_destroy(buf, FALSE, FALSE);
   2929   1544   eschrock 
                          	/*
                          	 * If the hdr now has no data buffers, move it to the ghost state
                          	 * that corresponds to its current state (mru -> mru_ghost,
                          	 * otherwise mfu_ghost), holding both state mutexes meanwhile.
                          	 */
   2930   1544   eschrock 	if (hdr->b_datacnt == 0) {
   2931   1544   eschrock 		arc_state_t *old_state = hdr->b_state;
   2932   1544   eschrock 		arc_state_t *evicted_state;
   2933   1544   eschrock 
   2934   1544   eschrock 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
   2935   1544   eschrock 
   2936   1544   eschrock 		evicted_state =
   2937   3403        bmc 		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
   2938   1544   eschrock 
   2939   3403        bmc 		mutex_enter(&old_state->arcs_mtx);
   2940   3403        bmc 		mutex_enter(&evicted_state->arcs_mtx);
   2941   1544   eschrock 
   2942   1544   eschrock 		arc_change_state(evicted_state, hdr, hash_lock);
   2943   1544   eschrock 		ASSERT(HDR_IN_HASH_TABLE(hdr));
   2944   5450    brendan 		hdr->b_flags |= ARC_IN_HASH_TABLE;
   2945   5450    brendan 		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
   2946   1544   eschrock 
   2947   3403        bmc 		mutex_exit(&evicted_state->arcs_mtx);
   2948   3403        bmc 		mutex_exit(&old_state->arcs_mtx);
   2949   1544   eschrock 	}
   2950   1544   eschrock 	mutex_exit(hash_lock);
   2951   7545       Mark 	rw_exit(&buf->b_lock);
   2952   1819     maybee 
                          	/*
                          	 * Both the hash lock and the buf lock have been released; run the
                          	 * eviction callback and then free the arc_buf_t.
                          	 */
   2953   1544   eschrock 	VERIFY(buf->b_efunc(buf) == 0);
   2954   1544   eschrock 	buf->b_efunc = NULL;
   2955   1544   eschrock 	buf->b_private = NULL;
   2956   1544   eschrock 	buf->b_hdr = NULL;
   2957   1544   eschrock 	kmem_cache_free(buf_cache, buf);
   2958   1544   eschrock 	return (1);
   2959    789     ahrens }
   2960    <