Home | History | Annotate | Download | only in os
      1      0    stevel /*
      2      0    stevel  * CDDL HEADER START
      3      0    stevel  *
      4      0    stevel  * The contents of this file are subject to the terms of the
      5   2208  dp201428  * Common Development and Distribution License (the "License").
      6   2208  dp201428  * You may not use this file except in compliance with the License.
      7      0    stevel  *
      8      0    stevel  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9      0    stevel  * or http://www.opensolaris.org/os/licensing.
     10      0    stevel  * See the License for the specific language governing permissions
     11      0    stevel  * and limitations under the License.
     12      0    stevel  *
     13      0    stevel  * When distributing Covered Code, include this CDDL HEADER in each
     14      0    stevel  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15      0    stevel  * If applicable, add the following below this CDDL HEADER, with the
     16      0    stevel  * fields enclosed by brackets "[]" replaced with your own identifying
     17      0    stevel  * information: Portions Copyright [yyyy] [name of copyright owner]
     18      0    stevel  *
     19      0    stevel  * CDDL HEADER END
     20      0    stevel  */
     21      0    stevel /*
     22   9367  Jonathan  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23      0    stevel  * Use is subject to license terms.
     24      0    stevel  */
     25      0    stevel 
     26      0    stevel /*
     27   6712     tomee  * Kernel memory allocator, as described in the following two papers and a
     28   6712     tomee  * statement about the consolidator:
     29      0    stevel  *
     30      0    stevel  * Jeff Bonwick,
     31      0    stevel  * The Slab Allocator: An Object-Caching Kernel Memory Allocator.
     32      0    stevel  * Proceedings of the Summer 1994 Usenix Conference.
     33      0    stevel  * Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf.
     34      0    stevel  *
     35      0    stevel  * Jeff Bonwick and Jonathan Adams,
     36      0    stevel  * Magazines and vmem: Extending the Slab Allocator to Many CPUs and
     37      0    stevel  * Arbitrary Resources.
     38      0    stevel  * Proceedings of the 2001 Usenix Conference.
     39      0    stevel  * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
     40   6712     tomee  *
     41   6712     tomee  * kmem Slab Consolidator Big Theory Statement:
     42   6712     tomee  *
     43   6712     tomee  * 1. Motivation
     44   6712     tomee  *
     45   6712     tomee  * As stated in Bonwick94, slabs provide the following advantages over other
     46   6712     tomee  * allocation structures in terms of memory fragmentation:
     47   6712     tomee  *
     48   6712     tomee  *  - Internal fragmentation (per-buffer wasted space) is minimal.
     49   6712     tomee  *  - Severe external fragmentation (unused buffers on the free list) is
     50   6712     tomee  *    unlikely.
     51   6712     tomee  *
     52   6712     tomee  * Segregating objects by size eliminates one source of external fragmentation,
     53   6712     tomee  * and according to Bonwick:
     54   6712     tomee  *
     55   6712     tomee  *   The other reason that slabs reduce external fragmentation is that all
     56   6712     tomee  *   objects in a slab are of the same type, so they have the same lifetime
     57   6712     tomee  *   distribution. The resulting segregation of short-lived and long-lived
     58   6712     tomee  *   objects at slab granularity reduces the likelihood of an entire page being
     59   6712     tomee  *   held hostage due to a single long-lived allocation [Barrett93, Hanson90].
     60   6712     tomee  *
     61   6712     tomee  * While unlikely, severe external fragmentation remains possible. Clients that
     62   6712     tomee  * allocate both short- and long-lived objects from the same cache cannot
     63   6712     tomee  * anticipate the distribution of long-lived objects within the allocator's slab
     64   6712     tomee  * implementation. Even a small percentage of long-lived objects distributed
     65   6712     tomee  * randomly across many slabs can lead to a worst case scenario where the client
     66   6712     tomee  * frees the majority of its objects and the system gets back almost none of the
     67   6712     tomee  * slabs. Despite the client doing what it reasonably can to help the system
     68   6712     tomee  * reclaim memory, the allocator cannot shake free enough slabs because of
     69   6712     tomee  * lonely allocations stubbornly hanging on. Although the allocator is in a
     70   6712     tomee  * position to diagnose the fragmentation, there is nothing that the allocator
     71   6712     tomee  * by itself can do about it. It only takes a single allocated object to prevent
     72   6712     tomee  * an entire slab from being reclaimed, and any object handed out by
     73   6712     tomee  * kmem_cache_alloc() is by definition in the client's control. Conversely,
     74   6712     tomee  * although the client is in a position to move a long-lived object, it has no
     75   6712     tomee  * way of knowing if the object is causing fragmentation, and if so, where to
     76   6712     tomee  * move it. A solution necessarily requires further cooperation between the
     77   6712     tomee  * allocator and the client.
     78   6712     tomee  *
     79   6712     tomee  * 2. Move Callback
     80   6712     tomee  *
     81   6712     tomee  * The kmem slab consolidator therefore adds a move callback to the
     82   6712     tomee  * allocator/client interface, improving worst-case external fragmentation in
     83   6712     tomee  * kmem caches that supply a function to move objects from one memory location
     84   6712     tomee  * to another. In a situation of low memory, kmem attempts to consolidate all of
     85   6712     tomee  * a cache's slabs at once; otherwise it works slowly to bring external
     86   6712     tomee  * fragmentation within the 1/8 limit guaranteed for internal fragmentation,
     87   6712     tomee  * thereby helping to avoid a low memory situation in the future.
     88   6712     tomee  *
     89   6712     tomee  * The callback has the following signature:
     90   6712     tomee  *
     91   6712     tomee  *   kmem_cbrc_t move(void *old, void *new, size_t size, void *user_arg)
     92   6712     tomee  *
     93   6712     tomee  * It supplies the kmem client with two addresses: the allocated object that
     94   6712     tomee  * kmem wants to move and a buffer selected by kmem for the client to use as the
     95   6712     tomee  * copy destination. The callback is kmem's way of saying "Please get off of
     96   6712     tomee  * this buffer and use this one instead." kmem knows where it wants to move the
     97   6712     tomee  * object in order to best reduce fragmentation. All the client needs to know
     98   6712     tomee  * about the second argument (void *new) is that it is an allocated, constructed
     99   6712     tomee  * object ready to take the contents of the old object. When the move function
    100   6712     tomee  * is called, the system is likely to be low on memory, and the new object
    101   6712     tomee  * spares the client from having to worry about allocating memory for the
    102   6712     tomee  * requested move. The third argument supplies the size of the object, in case a
    103   6712     tomee  * single move function handles multiple caches whose objects differ only in
    104   6712     tomee  * size (such as zio_buf_512, zio_buf_1024, etc). Finally, the same optional
    105   6712     tomee  * user argument passed to the constructor, destructor, and reclaim functions is
    106   6712     tomee  * also passed to the move callback.
    107   6712     tomee  *
    108   6712     tomee  * 2.1 Setting the Move Callback
    109   6712     tomee  *
    110   6712     tomee  * The client sets the move callback after creating the cache and before
    111   6712     tomee  * allocating from it:
    112   6712     tomee  *
    113   6712     tomee  *	object_cache = kmem_cache_create(...);
    114   6712     tomee  *      kmem_cache_set_move(object_cache, object_move);
    115   6712     tomee  *
    116   6712     tomee  * 2.2 Move Callback Return Values
    117   6712     tomee  *
    118   6712     tomee  * Only the client knows about its own data and when is a good time to move it.
    119   6712     tomee  * The client is cooperating with kmem to return unused memory to the system,
    120   6712     tomee  * and kmem respectfully accepts this help at the client's convenience. When
    121   6712     tomee  * asked to move an object, the client can respond with any of the following:
    122   6712     tomee  *
    123   6712     tomee  *   typedef enum kmem_cbrc {
    124   6712     tomee  *           KMEM_CBRC_YES,
    125   6712     tomee  *           KMEM_CBRC_NO,
    126   6712     tomee  *           KMEM_CBRC_LATER,
    127   6712     tomee  *           KMEM_CBRC_DONT_NEED,
    128   6712     tomee  *           KMEM_CBRC_DONT_KNOW
    129   6712     tomee  *   } kmem_cbrc_t;
    130   6712     tomee  *
    131   6712     tomee  * The client must not explicitly kmem_cache_free() either of the objects passed
    132   6712     tomee  * to the callback, since kmem wants to free them directly to the slab layer
    133   6712     tomee  * (bypassing the per-CPU magazine layer). The response tells kmem which of the
    134   6712     tomee  * objects to free:
    135   6712     tomee  *
    136   6712     tomee  *       YES: (Did it) The client moved the object, so kmem frees the old one.
    137   6712     tomee  *        NO: (Never) The client refused, so kmem frees the new object (the
    138   6712     tomee  *            unused copy destination). kmem also marks the slab of the old
    139   6712     tomee  *            object so as not to bother the client with further callbacks for
    140   6712     tomee  *            that object as long as the slab remains on the partial slab list.
    141   6712     tomee  *            (The system won't be getting the slab back as long as the
    142   6712     tomee  *            immovable object holds it hostage, so there's no point in moving
    143   6712     tomee  *            any of its objects.)
    144   6712     tomee  *     LATER: The client is using the object and cannot move it now, so kmem
    145   6712     tomee  *            frees the new object (the unused copy destination). kmem still
    146   6712     tomee  *            attempts to move other objects off the slab, since it expects to
    147   6712     tomee  *            succeed in clearing the slab in a later callback. The client
    148   6712     tomee  *            should use LATER instead of NO if the object is likely to become
    149   6712     tomee  *            movable very soon.
    150   6712     tomee  * DONT_NEED: The client no longer needs the object, so kmem frees the old along
    151   6712     tomee  *            with the new object (the unused copy destination). This response
    152   6712     tomee  *            is the client's opportunity to be a model citizen and give back as
    153   6712     tomee  *            much as it can.
    154   6712     tomee  * DONT_KNOW: The client does not know about the object because
    155   6712     tomee  *            a) the client has just allocated the object and not yet put it
    156   6712     tomee  *               wherever it expects to find known objects
    157   6712     tomee  *            b) the client has removed the object from wherever it expects to
    158   6712     tomee  *               find known objects and is about to free it, or
    159   6712     tomee  *            c) the client has freed the object.
    160   6712     tomee  *            In all these cases (a, b, and c) kmem frees the new object (the
    161   6712     tomee  *            unused copy destination) and searches for the old object in the
    162   6712     tomee  *            magazine layer. If found, the object is removed from the magazine
    163   6712     tomee  *            layer and freed to the slab layer so it will no longer hold the
    164   6712     tomee  *            slab hostage.
    165   6712     tomee  *
    166   6712     tomee  * 2.3 Object States
    167   6712     tomee  *
    168   6712     tomee  * Neither kmem nor the client can be assumed to know the object's whereabouts
    169   6712     tomee  * at the time of the callback. An object belonging to a kmem cache may be in
    170   6712     tomee  * any of the following states:
    171   6712     tomee  *
    172   6712     tomee  * 1. Uninitialized on the slab
    173   6712     tomee  * 2. Allocated from the slab but not constructed (still uninitialized)
    174   6712     tomee  * 3. Allocated from the slab, constructed, but not yet ready for business
    175   6712     tomee  *    (not in a valid state for the move callback)
    176   6712     tomee  * 4. In use (valid and known to the client)
    177   6712     tomee  * 5. About to be freed (no longer in a valid state for the move callback)
    178   6712     tomee  * 6. Freed to a magazine (still constructed)
    179   6712     tomee  * 7. Allocated from a magazine, not yet ready for business (not in a valid
    180   6712     tomee  *    state for the move callback), and about to return to state #4
    181   6712     tomee  * 8. Deconstructed on a magazine that is about to be freed
    182   6712     tomee  * 9. Freed to the slab
    183   6712     tomee  *
    184   6712     tomee  * Since the move callback may be called at any time while the object is in any
    185   6712     tomee  * of the above states (except state #1), the client needs a safe way to
    186   6712     tomee  * determine whether or not it knows about the object. Specifically, the client
    187   6712     tomee  * needs to know whether or not the object is in state #4, the only state in
    188   6712     tomee  * which a move is valid. If the object is in any other state, the client should
    189   6712     tomee  * immediately return KMEM_CBRC_DONT_KNOW, since it is unsafe to access any of
    190   6712     tomee  * the object's fields.
    191   6712     tomee  *
    192   6712     tomee  * Note that although an object may be in state #4 when kmem initiates the move
    193   6712     tomee  * request, the object may no longer be in that state by the time kmem actually
    194   6712     tomee  * calls the move function. Not only does the client free objects
    195   6712     tomee  * asynchronously, kmem itself puts move requests on a queue where they are
    196   6712     tomee  * pending until kmem processes them from another context. Also, objects freed
    197   6712     tomee  * to a magazine appear allocated from the point of view of the slab layer, so
    198   6712     tomee  * kmem may even initiate requests for objects in a state other than state #4.
    199   6712     tomee  *
    200   6712     tomee  * 2.3.1 Magazine Layer
    201   6712     tomee  *
    202   6712     tomee  * An important insight revealed by the states listed above is that the magazine
    203   6712     tomee  * layer is populated only by kmem_cache_free(). Magazines of constructed
    204   6712     tomee  * objects are never populated directly from the slab layer (which contains raw,
    205   6712     tomee  * unconstructed objects). Whenever an allocation request cannot be satisfied
    206   6712     tomee  * from the magazine layer, the magazines are bypassed and the request is
    207   6712     tomee  * satisfied from the slab layer (creating a new slab if necessary). kmem calls
    208   6712     tomee  * the object constructor only when allocating from the slab layer, and only in
    209   6712     tomee  * response to kmem_cache_alloc() or to prepare the destination buffer passed in
    210   6712     tomee  * the move callback. kmem does not preconstruct objects in anticipation of
    211   6712     tomee  * kmem_cache_alloc().
    212   6712     tomee  *
    213   6712     tomee  * 2.3.2 Object Constructor and Destructor
    214   6712     tomee  *
    215   6712     tomee  * If the client supplies a destructor, it must be valid to call the destructor
    216   6712     tomee  * on a newly created object (immediately after the constructor).
    217   6712     tomee  *
    218   6712     tomee  * 2.4 Recognizing Known Objects
    219   6712     tomee  *
    220   6712     tomee  * There is a simple test to determine safely whether or not the client knows
    221   6712     tomee  * about a given object in the move callback. It relies on the fact that kmem
    222   6712     tomee  * guarantees that the object of the move callback has only been touched by the
    223   6712     tomee  * client itself or else by kmem. kmem does this by ensuring that none of the
    224   6712     tomee  * cache's slabs are freed to the virtual memory (VM) subsystem while a move
    225   6712     tomee  * callback is pending. When the last object on a slab is freed, if there is a
    226   6712     tomee  * pending move, kmem puts the slab on a per-cache dead list and defers freeing
    227   6712     tomee  * slabs on that list until all pending callbacks are completed. That way,
    228   6712     tomee  * clients can be certain that the object of a move callback is in one of the
    229   6712     tomee  * states listed above, making it possible to distinguish known objects (in
    230   6712     tomee  * state #4) using the two low order bits of any pointer member (with the
    231   6712     tomee  * exception of 'char *' or 'short *' which may not be 4-byte aligned on some
    232   6712     tomee  * platforms).
    233   6712     tomee  *
    234   6712     tomee  * The test works as long as the client always transitions objects from state #4
    235   6712     tomee  * (known, in use) to state #5 (about to be freed, invalid) by setting the low
    236   6712     tomee  * order bit of the client-designated pointer member. Since kmem only writes
    237   6712     tomee  * invalid memory patterns, such as 0xbaddcafe to uninitialized memory and
    238   6712     tomee  * 0xdeadbeef to freed memory, any scribbling on the object done by kmem is
    239   6712     tomee  * guaranteed to set at least one of the two low order bits. Therefore, given an
    240   6712     tomee  * object with a back pointer to a 'container_t *o_container', the client can
    241   6712     tomee  * test
    242   6712     tomee  *
    243   6712     tomee  *      container_t *container = object->o_container;
    244   6712     tomee  *      if ((uintptr_t)container & 0x3) {
    245   6712     tomee  *              return (KMEM_CBRC_DONT_KNOW);
    246   6712     tomee  *      }
    247   6712     tomee  *
    248   6712     tomee  * Typically, an object will have a pointer to some structure with a list or
    249   6712     tomee  * hash where objects from the cache are kept while in use. Assuming that the
    250   6712     tomee  * client has some way of knowing that the container structure is valid and will
    251   6712     tomee  * not go away during the move, and assuming that the structure includes a lock
    252   6712     tomee  * to protect whatever collection is used, then the client would continue as
    253   6712     tomee  * follows:
    254   6712     tomee  *
    255   6712     tomee  *	// Ensure that the container structure does not go away.
    256   6712     tomee  *      if (container_hold(container) == 0) {
    257   6712     tomee  *              return (KMEM_CBRC_DONT_KNOW);
    258   6712     tomee  *      }
    259   6712     tomee  *      mutex_enter(&container->c_objects_lock);
    260   6712     tomee  *      if (container != object->o_container) {
    261   6712     tomee  *              mutex_exit(&container->c_objects_lock);
    262   6712     tomee  *              container_rele(container);
    263   6712     tomee  *              return (KMEM_CBRC_DONT_KNOW);
    264   6712     tomee  *      }
    265   6712     tomee  *
    266   6712     tomee  * At this point the client knows that the object cannot be freed as long as
    267   6712     tomee  * c_objects_lock is held. Note that after acquiring the lock, the client must
    268   6712     tomee  * recheck the o_container pointer in case the object was removed just before
    269   6712     tomee  * acquiring the lock.
    270   6712     tomee  *
    271   6712     tomee  * When the client is about to free an object, it must first remove that object
    272   6712     tomee  * from the list, hash, or other structure where it is kept. At that time, to
    273   6712     tomee  * mark the object so it can be distinguished from the remaining, known objects,
    274   6712     tomee  * the client sets the designated low order bit:
    275   6712     tomee  *
    276   6712     tomee  *      mutex_enter(&container->c_objects_lock);
    277   6712     tomee  *      object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
    278   6712     tomee  *      list_remove(&container->c_objects, object);
    279   6712     tomee  *      mutex_exit(&container->c_objects_lock);
    280   6712     tomee  *
    281   6712     tomee  * In the common case, the object is freed to the magazine layer, where it may
    282   6712     tomee  * be reused on a subsequent allocation without the overhead of calling the
    283   6712     tomee  * constructor. While in the magazine it appears allocated from the point of
    284   6712     tomee  * view of the slab layer, making it a candidate for the move callback. Most
    285   6712     tomee  * objects unrecognized by the client in the move callback fall into this
    286   6712     tomee  * category and are cheaply distinguished from known objects by the test
    287   6712     tomee  * described earlier. Since recognition is cheap for the client, and searching
    288   6712     tomee  * magazines is expensive for kmem, kmem defers searching until the client first
    289   6712     tomee  * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem
    290   6712     tomee  * elsewhere does what it can to avoid bothering the client unnecessarily.
    291   6712     tomee  *
    292   6712     tomee  * Invalidating the designated pointer member before freeing the object marks
    293   6712     tomee  * the object to be avoided in the callback, and conversely, assigning a valid
    294   6712     tomee  * value to the designated pointer member after allocating the object makes the
    295   6712     tomee  * object fair game for the callback:
    296   6712     tomee  *
    297   6712     tomee  *      ... allocate object ...
    298   6712     tomee  *      ... set any initial state not set by the constructor ...
    299   6712     tomee  *
    300   6712     tomee  *      mutex_enter(&container->c_objects_lock);
    301   6712     tomee  *      list_insert_tail(&container->c_objects, object);
    302   6712     tomee  *      membar_producer();
    303   6712     tomee  *      object->o_container = container;
    304   6712     tomee  *      mutex_exit(&container->c_objects_lock);
    305   6712     tomee  *
    306   6712     tomee  * Note that everything else must be valid before setting o_container makes the
    307   6712     tomee  * object fair game for the move callback. The membar_producer() call ensures
    308   6712     tomee  * that all the object's state is written to memory before setting the pointer
    309   6712     tomee  * that transitions the object from state #3 or #7 (allocated, constructed, not
    310   6712     tomee  * yet in use) to state #4 (in use, valid). That's important because the move
    311   6712     tomee  * function has to check the validity of the pointer before it can safely
    312   6712     tomee  * acquire the lock protecting the collection where it expects to find known
    313   6712     tomee  * objects.
    314   6712     tomee  *
    315   6712     tomee  * This method of distinguishing known objects observes the usual symmetry:
    316   6712     tomee  * invalidating the designated pointer is the first thing the client does before
    317   6712     tomee  * freeing the object, and setting the designated pointer is the last thing the
    318   6712     tomee  * client does after allocating the object. Of course, the client is not
    319   6712     tomee  * required to use this method. Fundamentally, how the client recognizes known
    320   6712     tomee  * objects is completely up to the client, but this method is recommended as an
    321   6712     tomee  * efficient and safe way to take advantage of the guarantees made by kmem. If
    322   6712     tomee  * the entire object is arbitrary data without any markable bits from a suitable
    323   6712     tomee  * pointer member, then the client must find some other method, such as
    324   6712     tomee  * searching a hash table of known objects.
    325   6712     tomee  *
    326   6712     tomee  * 2.5 Preventing Objects From Moving
    327   6712     tomee  *
    328   6712     tomee  * Besides a way to distinguish known objects, the other thing that the client
    329   6712     tomee  * needs is a strategy to ensure that an object will not move while the client
    330   6712     tomee  * is actively using it. The details of satisfying this requirement tend to be
    331   6712     tomee  * highly cache-specific. It might seem that the same rules that let a client
    332   6712     tomee  * remove an object safely should also decide when an object can be moved
    333   6712     tomee  * safely. However, any object state that makes a removal attempt invalid is
    334   6712     tomee  * likely to be long-lasting for objects that the client does not expect to
    335   6712     tomee  * remove. kmem knows nothing about the object state and is equally likely (from
    336   6712     tomee  * the client's point of view) to request a move for any object in the cache,
    337   6712     tomee  * whether prepared for removal or not. Even a low percentage of objects stuck
    338   6712     tomee  * in place by unremovability will defeat the consolidator if the stuck objects
    339   6712     tomee  * are the same long-lived allocations likely to hold slabs hostage.
    340   6712     tomee  * Fundamentally, the consolidator is not aimed at common cases. Severe external
    341   6712     tomee  * fragmentation is a worst case scenario manifested as sparsely allocated
    342   6712     tomee  * slabs, by definition a low percentage of the cache's objects. When deciding
    343   6712     tomee  * what makes an object movable, keep in mind the goal of the consolidator: to
    344   6712     tomee  * bring worst-case external fragmentation within the limits guaranteed for
    345   6712     tomee  * internal fragmentation. Removability is a poor criterion if it is likely to
    346   6712     tomee  * exclude more than an insignificant percentage of objects for long periods of
    347   6712     tomee  * time.
    348   6712     tomee  *
    349   6712     tomee  * A tricky general solution exists, and it has the advantage of letting you
    350   6712     tomee  * move any object at almost any moment, practically eliminating the likelihood
    351   6712     tomee  * that an object can hold a slab hostage. However, if there is a cache-specific
    352   6712     tomee  * way to ensure that an object is not actively in use in the vast majority of
    353   6712     tomee  * cases, a simpler solution that leverages this cache-specific knowledge is
    354   6712     tomee  * preferred.
    355   6712     tomee  *
    356   6712     tomee  * 2.5.1 Cache-Specific Solution
    357   6712     tomee  *
    358   6712     tomee  * As an example of a cache-specific solution, the ZFS znode cache takes
    359   6712     tomee  * advantage of the fact that the vast majority of znodes are only being
    360   6712     tomee  * referenced from the DNLC. (A typical case might be a few hundred in active
    361   6712     tomee  * use and a hundred thousand in the DNLC.) In the move callback, after the ZFS
    362   6712     tomee  * client has established that it recognizes the znode and can access its fields
    363   6712     tomee  * safely (using the method described earlier), it then tests whether the znode
    364   6712     tomee  * is referenced by anything other than the DNLC. If so, it assumes that the
    365   6712     tomee  * znode may be in active use and is unsafe to move, so it drops its locks and
    366   6712     tomee  * returns KMEM_CBRC_LATER. The advantage of this strategy is that everywhere
    367   6712     tomee  * else znodes are used, no change is needed to protect against the possibility
    368   6712     tomee  * of the znode moving. The disadvantage is that it remains possible for an
    369   6712     tomee  * application to hold a znode slab hostage with an open file descriptor.
    370   6712     tomee  * However, this case ought to be rare and the consolidator has a way to deal
    371   6712     tomee  * with it: If the client responds KMEM_CBRC_LATER repeatedly for the same
    372   6712     tomee  * object, kmem eventually stops believing it and treats the slab as if the
    373   6712     tomee  * client had responded KMEM_CBRC_NO. Having marked the hostage slab, kmem can
    374   6712     tomee  * then focus on getting it off of the partial slab list by allocating rather
    375   6712     tomee  * than freeing all of its objects. (Either way of getting a slab off the
    376   6712     tomee  * free list reduces fragmentation.)
    377   6712     tomee  *
    378   6712     tomee  * 2.5.2 General Solution
    379   6712     tomee  *
    380   6712     tomee  * The general solution, on the other hand, requires an explicit hold everywhere
    381   6712     tomee  * the object is used to prevent it from moving. To keep the client locking
    382   6712     tomee  * strategy as uncomplicated as possible, kmem guarantees the simplifying
    383   6712     tomee  * assumption that move callbacks are sequential, even across multiple caches.
    384   6712     tomee  * Internally, a global queue processed by a single thread supports all caches
    385   6712     tomee  * implementing the callback function. No matter how many caches supply a move
    386   6712     tomee  * function, the consolidator never moves more than one object at a time, so the
    387   6712     tomee  * client does not have to worry about tricky lock ordering involving several
    388   6712     tomee  * related objects from different kmem caches.
    389   6712     tomee  *
    390   6712     tomee  * The general solution implements the explicit hold as a read-write lock, which
    391   6712     tomee  * allows multiple readers to access an object from the cache simultaneously
    392   6712     tomee  * while a single writer is excluded from moving it. A single rwlock for the
    393   6712     tomee  * entire cache would lock out all threads from using any of the cache's objects
    394   6712     tomee  * even though only a single object is being moved, so to reduce contention,
    395   6712     tomee  * the client can fan out the single rwlock into an array of rwlocks hashed by
    396   6712     tomee  * the object address, making it probable that moving one object will not
    397   6712     tomee  * prevent other threads from using a different object. The rwlock cannot be a
    398   6712     tomee  * member of the object itself, because the possibility of the object moving
    399   6712     tomee  * makes it unsafe to access any of the object's fields until the lock is
    400   6712     tomee  * acquired.
    401   6712     tomee  *
    402   6712     tomee  * Assuming a small, fixed number of locks, it's possible that multiple objects
    403   6712     tomee  * will hash to the same lock. A thread that needs to use multiple objects in
    404   6712     tomee  * the same function may acquire the same lock multiple times. Since rwlocks are
    405   6712     tomee  * reentrant for readers, and since there is never more than a single writer at
    406   6712     tomee  * a time (assuming that the client acquires the lock as a writer only when
    407   6712     tomee  * moving an object inside the callback), there would seem to be no problem.
    408   6712     tomee  * However, a client locking multiple objects in the same function must handle
    409   6712     tomee  * one case of potential deadlock: Assume that thread A needs to prevent both
    410   6712     tomee  * object 1 and object 2 from moving, and thread B, the callback, meanwhile
    411   6712     tomee  * tries to move object 3. It's possible, if objects 1, 2, and 3 all hash to the
    412   6712     tomee  * same lock, that thread A will acquire the lock for object 1 as a reader
    413   6712     tomee  * before thread B sets the lock's write-wanted bit, preventing thread A from
    414   6712     tomee  * reacquiring the lock for object 2 as a reader. Unable to make forward
    415   6712     tomee  * progress, thread A will never release the lock for object 1, resulting in
    416   6712     tomee  * deadlock.
    417   6712     tomee  *
    418   6712     tomee  * There are two ways of avoiding the deadlock just described. The first is to
    419   6712     tomee  * use rw_tryenter() rather than rw_enter() in the callback function when
    420   6712     tomee  * attempting to acquire the lock as a writer. If tryenter discovers that the
     421   6712     tomee  * same object (or another object hashed to the same lock) is already in use,
     422   6712     tomee  * the callback aborts and returns KMEM_CBRC_LATER. The second way is to use
    423   6712     tomee  * rprwlock_t (declared in common/fs/zfs/sys/rprwlock.h) instead of rwlock_t,
    424   6712     tomee  * since it allows a thread to acquire the lock as a reader in spite of a
    425   6712     tomee  * waiting writer. This second approach insists on moving the object now, no
    426   6712     tomee  * matter how many readers the move function must wait for in order to do so,
    427   6712     tomee  * and could delay the completion of the callback indefinitely (blocking
    428   6712     tomee  * callbacks to other clients). In practice, a less insistent callback using
    429   6712     tomee  * rw_tryenter() returns KMEM_CBRC_LATER infrequently enough that there seems
    430   6712     tomee  * little reason to use anything else.
    431   6712     tomee  *
    432   6712     tomee  * Avoiding deadlock is not the only problem that an implementation using an
    433   6712     tomee  * explicit hold needs to solve. Locking the object in the first place (to
    434   6712     tomee  * prevent it from moving) remains a problem, since the object could move
    435   6712     tomee  * between the time you obtain a pointer to the object and the time you acquire
    436   6712     tomee  * the rwlock hashed to that pointer value. Therefore the client needs to
    437   6712     tomee  * recheck the value of the pointer after acquiring the lock, drop the lock if
    438   6712     tomee  * the value has changed, and try again. This requires a level of indirection:
    439   6712     tomee  * something that points to the object rather than the object itself, that the
    440   6712     tomee  * client can access safely while attempting to acquire the lock. (The object
    441   6712     tomee  * itself cannot be referenced safely because it can move at any time.)
    442   6712     tomee  * The following lock-acquisition function takes whatever is safe to reference
    443   6712     tomee  * (arg), follows its pointer to the object (using function f), and tries as
    444   6712     tomee  * often as necessary to acquire the hashed lock and verify that the object
    445   6712     tomee  * still has not moved:
    446   6712     tomee  *
    447   6712     tomee  *      object_t *
    448   6712     tomee  *      object_hold(object_f f, void *arg)
    449   6712     tomee  *      {
    450   6712     tomee  *              object_t *op;
    451   6712     tomee  *
    452   6712     tomee  *              op = f(arg);
    453   6712     tomee  *              if (op == NULL) {
    454   6712     tomee  *                      return (NULL);
    455   6712     tomee  *              }
    456   6712     tomee  *
    457   6712     tomee  *              rw_enter(OBJECT_RWLOCK(op), RW_READER);
    458   6712     tomee  *              while (op != f(arg)) {
    459   6712     tomee  *                      rw_exit(OBJECT_RWLOCK(op));
    460   6712     tomee  *                      op = f(arg);
    461   6712     tomee  *                      if (op == NULL) {
    462   6712     tomee  *                              break;
    463   6712     tomee  *                      }
    464   6712     tomee  *                      rw_enter(OBJECT_RWLOCK(op), RW_READER);
    465   6712     tomee  *              }
    466   6712     tomee  *
    467   6712     tomee  *              return (op);
    468   6712     tomee  *      }
    469   6712     tomee  *
    470   6712     tomee  * The OBJECT_RWLOCK macro hashes the object address to obtain the rwlock. The
    471   6712     tomee  * lock reacquisition loop, while necessary, almost never executes. The function
    472   6712     tomee  * pointer f (used to obtain the object pointer from arg) has the following type
    473   6712     tomee  * definition:
    474   6712     tomee  *
    475   6712     tomee  *      typedef object_t *(*object_f)(void *arg);
    476   6712     tomee  *
    477   6712     tomee  * An object_f implementation is likely to be as simple as accessing a structure
    478   6712     tomee  * member:
    479   6712     tomee  *
    480   6712     tomee  *      object_t *
    481   6712     tomee  *      s_object(void *arg)
    482   6712     tomee  *      {
    483   6712     tomee  *              something_t *sp = arg;
    484   6712     tomee  *              return (sp->s_object);
    485   6712     tomee  *      }
    486   6712     tomee  *
    487   6712     tomee  * The flexibility of a function pointer allows the path to the object to be
    488   6712     tomee  * arbitrarily complex and also supports the notion that depending on where you
    489   6712     tomee  * are using the object, you may need to get it from someplace different.
    490   6712     tomee  *
    491   6712     tomee  * The function that releases the explicit hold is simpler because it does not
    492   6712     tomee  * have to worry about the object moving:
    493   6712     tomee  *
    494   6712     tomee  *      void
    495   6712     tomee  *      object_rele(object_t *op)
    496   6712     tomee  *      {
    497   6712     tomee  *              rw_exit(OBJECT_RWLOCK(op));
    498   6712     tomee  *      }
    499   6712     tomee  *
    500   6712     tomee  * The caller is spared these details so that obtaining and releasing an
    501   6712     tomee  * explicit hold feels like a simple mutex_enter()/mutex_exit() pair. The caller
    502   6712     tomee  * of object_hold() only needs to know that the returned object pointer is valid
    503   6712     tomee  * if not NULL and that the object will not move until released.
    504   6712     tomee  *
    505   6712     tomee  * Although object_hold() prevents an object from moving, it does not prevent it
    506   6712     tomee  * from being freed. The caller must take measures before calling object_hold()
    507   6712     tomee  * (afterwards is too late) to ensure that the held object cannot be freed. The
    508   6712     tomee  * caller must do so without accessing the unsafe object reference, so any lock
    509   6712     tomee  * or reference count used to ensure the continued existence of the object must
    510   6712     tomee  * live outside the object itself.
    511   6712     tomee  *
    512   6712     tomee  * Obtaining a new object is a special case where an explicit hold is impossible
    513   6712     tomee  * for the caller. Any function that returns a newly allocated object (either as
     514   6712     tomee  * a return value, or as an in-out parameter) must return it already held; after
    515   6712     tomee  * the caller gets it is too late, since the object cannot be safely accessed
    516   6712     tomee  * without the level of indirection described earlier. The following
    517   6712     tomee  * object_alloc() example uses the same code shown earlier to transition a new
    518   6712     tomee  * object into the state of being recognized (by the client) as a known object.
    519   6712     tomee  * The function must acquire the hold (rw_enter) before that state transition
    520   6712     tomee  * makes the object movable:
    521   6712     tomee  *
    522   6712     tomee  *      static object_t *
    523   6712     tomee  *      object_alloc(container_t *container)
    524   6712     tomee  *      {
    525   7546     tomee  *              object_t *object = kmem_cache_alloc(object_cache, 0);
    526   6712     tomee  *              ... set any initial state not set by the constructor ...
    527   6712     tomee  *              rw_enter(OBJECT_RWLOCK(object), RW_READER);
    528   6712     tomee  *              mutex_enter(&container->c_objects_lock);
    529   6712     tomee  *              list_insert_tail(&container->c_objects, object);
    530   6712     tomee  *              membar_producer();
    531   6712     tomee  *              object->o_container = container;
    532   6712     tomee  *              mutex_exit(&container->c_objects_lock);
    533   6712     tomee  *              return (object);
    534   6712     tomee  *      }
    535   6712     tomee  *
    536   6712     tomee  * Functions that implicitly acquire an object hold (any function that calls
    537   6712     tomee  * object_alloc() to supply an object for the caller) need to be carefully noted
    538   6712     tomee  * so that the matching object_rele() is not neglected. Otherwise, leaked holds
    539   6712     tomee  * prevent all objects hashed to the affected rwlocks from ever being moved.
    540   6712     tomee  *
    541   6712     tomee  * The pointer to a held object can be hashed to the holding rwlock even after
    542   6712     tomee  * the object has been freed. Although it is possible to release the hold
    543   6712     tomee  * after freeing the object, you may decide to release the hold implicitly in
    544   6712     tomee  * whatever function frees the object, so as to release the hold as soon as
    545   6712     tomee  * possible, and for the sake of symmetry with the function that implicitly
    546   6712     tomee  * acquires the hold when it allocates the object. Here, object_free() releases
    547   6712     tomee  * the hold acquired by object_alloc(). Its implicit object_rele() forms a
    548   6712     tomee  * matching pair with object_hold():
    549   6712     tomee  *
    550   6712     tomee  *      void
    551   6712     tomee  *      object_free(object_t *object)
    552   6712     tomee  *      {
    553   6712     tomee  *              container_t *container;
    554   6712     tomee  *
    555   6712     tomee  *              ASSERT(object_held(object));
    556   6712     tomee  *              container = object->o_container;
    557   6712     tomee  *              mutex_enter(&container->c_objects_lock);
    558   6712     tomee  *              object->o_container =
    559   6712     tomee  *                  (void *)((uintptr_t)object->o_container | 0x1);
    560   6712     tomee  *              list_remove(&container->c_objects, object);
    561   6712     tomee  *              mutex_exit(&container->c_objects_lock);
    562   6712     tomee  *              object_rele(object);
    563   6712     tomee  *              kmem_cache_free(object_cache, object);
    564   6712     tomee  *      }
    565   6712     tomee  *
    566   6712     tomee  * Note that object_free() cannot safely accept an object pointer as an argument
    567   6712     tomee  * unless the object is already held. Any function that calls object_free()
    568   6712     tomee  * needs to be carefully noted since it similarly forms a matching pair with
    569   6712     tomee  * object_hold().
    570   6712     tomee  *
    571   6712     tomee  * To complete the picture, the following callback function implements the
    572   6712     tomee  * general solution by moving objects only if they are currently unheld:
    573   6712     tomee  *
    574   6712     tomee  *      static kmem_cbrc_t
    575   6712     tomee  *      object_move(void *buf, void *newbuf, size_t size, void *arg)
    576   6712     tomee  *      {
    577   6712     tomee  *              object_t *op = buf, *np = newbuf;
    578   6712     tomee  *              container_t *container;
    579   6712     tomee  *
    580   6712     tomee  *              container = op->o_container;
    581   6712     tomee  *              if ((uintptr_t)container & 0x3) {
    582   6712     tomee  *                      return (KMEM_CBRC_DONT_KNOW);
    583   6712     tomee  *              }
    584   6712     tomee  *
     585   6712     tomee  *              // Ensure that the container structure does not go away.
    586   6712     tomee  *              if (container_hold(container) == 0) {
    587   6712     tomee  *                      return (KMEM_CBRC_DONT_KNOW);
    588   6712     tomee  *              }
    589   6712     tomee  *
    590   6712     tomee  *              mutex_enter(&container->c_objects_lock);
    591   6712     tomee  *              if (container != op->o_container) {
    592   6712     tomee  *                      mutex_exit(&container->c_objects_lock);
    593   6712     tomee  *                      container_rele(container);
    594   6712     tomee  *                      return (KMEM_CBRC_DONT_KNOW);
    595   6712     tomee  *              }
    596   6712     tomee  *
    597   6712     tomee  *              if (rw_tryenter(OBJECT_RWLOCK(op), RW_WRITER) == 0) {
    598   6712     tomee  *                      mutex_exit(&container->c_objects_lock);
    599   6712     tomee  *                      container_rele(container);
    600   6712     tomee  *                      return (KMEM_CBRC_LATER);
    601   6712     tomee  *              }
    602   6712     tomee  *
    603   6712     tomee  *              object_move_impl(op, np); // critical section
    604   6712     tomee  *              rw_exit(OBJECT_RWLOCK(op));
    605   6712     tomee  *
    606   6712     tomee  *              op->o_container = (void *)((uintptr_t)op->o_container | 0x1);
    607   6712     tomee  *              list_link_replace(&op->o_link_node, &np->o_link_node);
    608   6712     tomee  *              mutex_exit(&container->c_objects_lock);
    609   6712     tomee  *              container_rele(container);
    610   6712     tomee  *              return (KMEM_CBRC_YES);
    611   6712     tomee  *      }
    612   6712     tomee  *
    613   6712     tomee  * Note that object_move() must invalidate the designated o_container pointer of
    614   6712     tomee  * the old object in the same way that object_free() does, since kmem will free
    615   6712     tomee  * the object in response to the KMEM_CBRC_YES return value.
    616   6712     tomee  *
    617   6712     tomee  * The lock order in object_move() differs from object_alloc(), which locks
    618   6712     tomee  * OBJECT_RWLOCK first and &container->c_objects_lock second, but as long as the
    619   6712     tomee  * callback uses rw_tryenter() (preventing the deadlock described earlier), it's
    620   6712     tomee  * not a problem. Holding the lock on the object list in the example above
    621   6712     tomee  * through the entire callback not only prevents the object from going away, it
    622   6712     tomee  * also allows you to lock the list elsewhere and know that none of its elements
    623   6712     tomee  * will move during iteration.
    624   6712     tomee  *
    625   6712     tomee  * Adding an explicit hold everywhere an object from the cache is used is tricky
    626   6712     tomee  * and involves much more change to client code than a cache-specific solution
    627   6712     tomee  * that leverages existing state to decide whether or not an object is
    628   6712     tomee  * movable. However, this approach has the advantage that no object remains
    629   6712     tomee  * immovable for any significant length of time, making it extremely unlikely
    630   6712     tomee  * that long-lived allocations can continue holding slabs hostage; and it works
    631   6712     tomee  * for any cache.
    632   6712     tomee  *
    633   6712     tomee  * 3. Consolidator Implementation
    634   6712     tomee  *
    635   6712     tomee  * Once the client supplies a move function that a) recognizes known objects and
    636   6712     tomee  * b) avoids moving objects that are actively in use, the remaining work is up
    637   6712     tomee  * to the consolidator to decide which objects to move and when to issue
    638   6712     tomee  * callbacks.
    639   6712     tomee  *
    640   6712     tomee  * The consolidator relies on the fact that a cache's slabs are ordered by
    641   6712     tomee  * usage. Each slab has a fixed number of objects. Depending on the slab's
    642   6712     tomee  * "color" (the offset of the first object from the beginning of the slab;
    643   6712     tomee  * offsets are staggered to mitigate false sharing of cache lines) it is either
    644   6712     tomee  * the maximum number of objects per slab determined at cache creation time or
    645   6712     tomee  * else the number closest to the maximum that fits within the space remaining
    646   6712     tomee  * after the initial offset. A completely allocated slab may contribute some
    647   6712     tomee  * internal fragmentation (per-slab overhead) but no external fragmentation, so
    648   6712     tomee  * it is of no interest to the consolidator. At the other extreme, slabs whose
    649   6712     tomee  * objects have all been freed to the slab are released to the virtual memory
    650   6712     tomee  * (VM) subsystem (objects freed to magazines are still allocated as far as the
    651   6712     tomee  * slab is concerned). External fragmentation exists when there are slabs
    652   6712     tomee  * somewhere between these extremes. A partial slab has at least one but not all
    653   6712     tomee  * of its objects allocated. The more partial slabs, and the fewer allocated
    654   6712     tomee  * objects on each of them, the higher the fragmentation. Hence the
    655   6712     tomee  * consolidator's overall strategy is to reduce the number of partial slabs by
    656   6712     tomee  * moving allocated objects from the least allocated slabs to the most allocated
    657   6712     tomee  * slabs.
    658   6712     tomee  *
    659   6712     tomee  * Partial slabs are kept in an AVL tree ordered by usage. Completely allocated
    660   6712     tomee  * slabs are kept separately in an unordered list. Since the majority of slabs
    661   6712     tomee  * tend to be completely allocated (a typical unfragmented cache may have
    662   6712     tomee  * thousands of complete slabs and only a single partial slab), separating
    663   6712     tomee  * complete slabs improves the efficiency of partial slab ordering, since the
    664   6712     tomee  * complete slabs do not affect the depth or balance of the AVL tree. This
    665   6712     tomee  * ordered sequence of partial slabs acts as a "free list" supplying objects for
    666   6712     tomee  * allocation requests.
    667   6712     tomee  *
    668   6712     tomee  * Objects are always allocated from the first partial slab in the free list,
    669   6712     tomee  * where the allocation is most likely to eliminate a partial slab (by
    670   6712     tomee  * completely allocating it). Conversely, when a single object from a completely
    671   6712     tomee  * allocated slab is freed to the slab, that slab is added to the front of the
    672   6712     tomee  * free list. Since most free list activity involves highly allocated slabs
    673   6712     tomee  * coming and going at the front of the list, slabs tend naturally toward the
    674   6712     tomee  * ideal order: highly allocated at the front, sparsely allocated at the back.
    675   6712     tomee  * Slabs with few allocated objects are likely to become completely free if they
    676   6712     tomee  * keep a safe distance away from the front of the free list. Slab misorders
    677   6712     tomee  * interfere with the natural tendency of slabs to become completely free or
    678   6712     tomee  * completely allocated. For example, a slab with a single allocated object
    679   6712     tomee  * needs only a single free to escape the cache; its natural desire is
    680   6712     tomee  * frustrated when it finds itself at the front of the list where a second
    681   6712     tomee  * allocation happens just before the free could have released it. Another slab
    682   6712     tomee  * with all but one object allocated might have supplied the buffer instead, so
    683   6712     tomee  * that both (as opposed to neither) of the slabs would have been taken off the
    684   6712     tomee  * free list.
    685   6712     tomee  *
    686   6712     tomee  * Although slabs tend naturally toward the ideal order, misorders allowed by a
    687   6712     tomee  * simple list implementation defeat the consolidator's strategy of merging
    688   6712     tomee  * least- and most-allocated slabs. Without an AVL tree to guarantee order, kmem
    689   6712     tomee  * needs another way to fix misorders to optimize its callback strategy. One
    690   6712     tomee  * approach is to periodically scan a limited number of slabs, advancing a
    691   6712     tomee  * marker to hold the current scan position, and to move extreme misorders to
    692   6712     tomee  * the front or back of the free list and to the front or back of the current
    693   6712     tomee  * scan range. By making consecutive scan ranges overlap by one slab, the least
    694   6712     tomee  * allocated slab in the current range can be carried along from the end of one
    695   6712     tomee  * scan to the start of the next.
    696   6712     tomee  *
    697   6712     tomee  * Maintaining partial slabs in an AVL tree relieves kmem of this additional
    698   6712     tomee  * task, however. Since most of the cache's activity is in the magazine layer,
    699   6712     tomee  * and allocations from the slab layer represent only a startup cost, the
    700   6712     tomee  * overhead of maintaining a balanced tree is not a significant concern compared
    701   6712     tomee  * to the opportunity of reducing complexity by eliminating the partial slab
    702   6712     tomee  * scanner just described. The overhead of an AVL tree is minimized by
    703   6712     tomee  * maintaining only partial slabs in the tree and keeping completely allocated
    704   6712     tomee  * slabs separately in a list. To avoid increasing the size of the slab
    705   6712     tomee  * structure the AVL linkage pointers are reused for the slab's list linkage,
    706   6712     tomee  * since the slab will always be either partial or complete, never stored both
    707   6712     tomee  * ways at the same time. To further minimize the overhead of the AVL tree the
    708   6712     tomee  * compare function that orders partial slabs by usage divides the range of
    709   6712     tomee  * allocated object counts into bins such that counts within the same bin are
    710   6712     tomee  * considered equal. Binning partial slabs makes it less likely that allocating
    711   6712     tomee  * or freeing a single object will change the slab's order, requiring a tree
    712   6712     tomee  * reinsertion (an avl_remove() followed by an avl_add(), both potentially
    713   6712     tomee  * requiring some rebalancing of the tree). Allocation counts closest to
    714   6712     tomee  * completely free and completely allocated are left unbinned (finely sorted) to
    715   6712     tomee  * better support the consolidator's strategy of merging slabs at either
    716   6712     tomee  * extreme.
    717   6712     tomee  *
    718   6712     tomee  * 3.1 Assessing Fragmentation and Selecting Candidate Slabs
    719   6712     tomee  *
    720   6712     tomee  * The consolidator piggybacks on the kmem maintenance thread and is called on
    721   6712     tomee  * the same interval as kmem_cache_update(), once per cache every fifteen
    722   6712     tomee  * seconds. kmem maintains a running count of unallocated objects in the slab
    723   6712     tomee  * layer (cache_bufslab). The consolidator checks whether that number exceeds
    724   6712     tomee  * 12.5% (1/8) of the total objects in the cache (cache_buftotal), and whether
    725   6712     tomee  * there is a significant number of slabs in the cache (arbitrarily a minimum
    726   6712     tomee  * 101 total slabs). Unused objects that have fallen out of the magazine layer's
    727   6712     tomee  * working set are included in the assessment, and magazines in the depot are
    728   6712     tomee  * reaped if those objects would lift cache_bufslab above the fragmentation
    729   6712     tomee  * threshold. Once the consolidator decides that a cache is fragmented, it looks
    730   6712     tomee  * for a candidate slab to reclaim, starting at the end of the partial slab free
    731   6712     tomee  * list and scanning backwards. At first the consolidator is choosy: only a slab
    732   6712     tomee  * with fewer than 12.5% (1/8) of its objects allocated qualifies (or else a
    733   6712     tomee  * single allocated object, regardless of percentage). If there is difficulty
    734   6712     tomee  * finding a candidate slab, kmem raises the allocation threshold incrementally,
    735   6712     tomee  * up to a maximum 87.5% (7/8), so that eventually the consolidator will reduce
    736   6712     tomee  * external fragmentation (unused objects on the free list) below 12.5% (1/8),
    737   6712     tomee  * even in the worst case of every slab in the cache being almost 7/8 allocated.
    738   6712     tomee  * The threshold can also be lowered incrementally when candidate slabs are easy
    739   6712     tomee  * to find, and the threshold is reset to the minimum 1/8 as soon as the cache
    740   6712     tomee  * is no longer fragmented.
    741   6712     tomee  *
    742   6712     tomee  * 3.2 Generating Callbacks
    743   6712     tomee  *
    744   6712     tomee  * Once an eligible slab is chosen, a callback is generated for every allocated
    745   6712     tomee  * object on the slab, in the hope that the client will move everything off the
    746   6712     tomee  * slab and make it reclaimable. Objects selected as move destinations are
    747   6712     tomee  * chosen from slabs at the front of the free list. Assuming slabs in the ideal
    748   6712     tomee  * order (most allocated at the front, least allocated at the back) and a
    749   6712     tomee  * cooperative client, the consolidator will succeed in removing slabs from both
    750   6712     tomee  * ends of the free list, completely allocating on the one hand and completely
    751   6712     tomee  * freeing on the other. Objects selected as move destinations are allocated in
    752   6712     tomee  * the kmem maintenance thread where move requests are enqueued. A separate
    753   6712     tomee  * callback thread removes pending callbacks from the queue and calls the
    754   6712     tomee  * client. The separate thread ensures that client code (the move function) does
    755   6712     tomee  * not interfere with internal kmem maintenance tasks. A map of pending
    756   6712     tomee  * callbacks keyed by object address (the object to be moved) is checked to
    757   6712     tomee  * ensure that duplicate callbacks are not generated for the same object.
    758   6712     tomee  * Allocating the move destination (the object to move to) prevents subsequent
    759   6712     tomee  * callbacks from selecting the same destination as an earlier pending callback.
    760   6712     tomee  *
    761   6712     tomee  * Move requests can also be generated by kmem_cache_reap() when the system is
    762   6712     tomee  * desperate for memory and by kmem_cache_move_notify(), called by the client to
    763   6712     tomee  * notify kmem that a move refused earlier with KMEM_CBRC_LATER is now possible.
    764   6712     tomee  * The map of pending callbacks is protected by the same lock that protects the
    765   6712     tomee  * slab layer.
    766   6712     tomee  *
    767   6712     tomee  * When the system is desperate for memory, kmem does not bother to determine
    768   6712     tomee  * whether or not the cache exceeds the fragmentation threshold, but tries to
    769   6712     tomee  * consolidate as many slabs as possible. Normally, the consolidator chews
    770   6712     tomee  * slowly, one sparsely allocated slab at a time during each maintenance
    771   6712     tomee  * interval that the cache is fragmented. When desperate, the consolidator
    772   6712     tomee  * starts at the last partial slab and enqueues callbacks for every allocated
    773   6712     tomee  * object on every partial slab, working backwards until it reaches the first
    774   6712     tomee  * partial slab. The first partial slab, meanwhile, advances in pace with the
    775   6712     tomee  * consolidator as allocations to supply move destinations for the enqueued
    776   6712     tomee  * callbacks use up the highly allocated slabs at the front of the free list.
    777   6712     tomee  * Ideally, the overgrown free list collapses like an accordion, starting at
    778   6712     tomee  * both ends and ending at the center with a single partial slab.
    779   6712     tomee  *
    780   6712     tomee  * 3.3 Client Responses
    781   6712     tomee  *
    782   6712     tomee  * When the client returns KMEM_CBRC_NO in response to the move callback, kmem
    783   6712     tomee  * marks the slab that supplied the stuck object non-reclaimable and moves it to
     784   6712     tomee  * the front of the free list. The slab remains marked as long as it remains on
     785   6712     tomee  * the free list, and it appears more allocated to the partial slab compare
     786   6712     tomee  * function than any unmarked slab, no matter how many of its objects are
     787   6712     tomee  * allocated. Since even one immovable object ties up the entire slab, the goal
     788   6712     tomee  * is to completely allocate any slab that cannot be completely freed. kmem does
     789   6712     tomee  * not bother generating callbacks to move objects from a marked slab unless the
    790   6712     tomee  * system is desperate.
    791   6712     tomee  *
    792   6712     tomee  * When the client responds KMEM_CBRC_LATER, kmem increments a count for the
    793   6712     tomee  * slab. If the client responds LATER too many times, kmem disbelieves and
    794   6712     tomee  * treats the response as a NO. The count is cleared when the slab is taken off
    795   6712     tomee  * the partial slab list or when the client moves one of the slab's objects.
    796   6712     tomee  *
    797   6712     tomee  * 4. Observability
    798   6712     tomee  *
    799   6712     tomee  * A kmem cache's external fragmentation is best observed with 'mdb -k' using
    800   6712     tomee  * the ::kmem_slabs dcmd. For a complete description of the command, enter
    801   6712     tomee  * '::help kmem_slabs' at the mdb prompt.
    802      0    stevel  */
    803      0    stevel 
    804      0    stevel #include <sys/kmem_impl.h>
    805      0    stevel #include <sys/vmem_impl.h>
    806      0    stevel #include <sys/param.h>
    807      0    stevel #include <sys/sysmacros.h>
    808      0    stevel #include <sys/vm.h>
    809      0    stevel #include <sys/proc.h>
    810      0    stevel #include <sys/tuneable.h>
    811      0    stevel #include <sys/systm.h>
    812      0    stevel #include <sys/cmn_err.h>
    813      0    stevel #include <sys/debug.h>
    814   6712     tomee #include <sys/sdt.h>
    815      0    stevel #include <sys/mutex.h>
    816      0    stevel #include <sys/bitmap.h>
    817      0    stevel #include <sys/atomic.h>
    818      0    stevel #include <sys/kobj.h>
    819      0    stevel #include <sys/disp.h>
    820      0    stevel #include <vm/seg_kmem.h>
    821      0    stevel #include <sys/log.h>
    822      0    stevel #include <sys/callb.h>
    823      0    stevel #include <sys/taskq.h>
    824      0    stevel #include <sys/modctl.h>
    825      0    stevel #include <sys/reboot.h>
    826      0    stevel #include <sys/id32.h>
    827      0    stevel #include <sys/zone.h>
    828   3448  dh155122 #include <sys/netstack.h>
    829   6712     tomee #ifdef	DEBUG
    830   6712     tomee #include <sys/random.h>
    831   6712     tomee #endif
    832      0    stevel 
    833      0    stevel extern void streams_msg_init(void);
    834      0    stevel extern int segkp_fromheap;
    835      0    stevel extern void segkp_cache_free(void);
    836      0    stevel 
/*
 * Template of the named statistics reported for a kmem cache.  Each member
 * is a kstat_named_t, and the initializer that follows the structure body
 * supplies a name string and KSTAT_DATA_UINT64 for every member.
 *
 * The initializer is positional: its Nth entry initializes the Nth member.
 * When a statistic is added, removed or reordered, the member list and the
 * initializer list must be changed together so the names stay attached to
 * the right fields.
 */
    837      0    stevel struct kmem_cache_kstat {
    838      0    stevel 	kstat_named_t	kmc_buf_size;
    839      0    stevel 	kstat_named_t	kmc_align;
    840      0    stevel 	kstat_named_t	kmc_chunk_size;
    841      0    stevel 	kstat_named_t	kmc_slab_size;
    842      0    stevel 	kstat_named_t	kmc_alloc;
    843      0    stevel 	kstat_named_t	kmc_alloc_fail;
    844      0    stevel 	kstat_named_t	kmc_free;
    845      0    stevel 	kstat_named_t	kmc_depot_alloc;
    846      0    stevel 	kstat_named_t	kmc_depot_free;
    847      0    stevel 	kstat_named_t	kmc_depot_contention;
    848      0    stevel 	kstat_named_t	kmc_slab_alloc;
    849      0    stevel 	kstat_named_t	kmc_slab_free;
    850      0    stevel 	kstat_named_t	kmc_buf_constructed;
    851      0    stevel 	kstat_named_t	kmc_buf_avail;
    852      0    stevel 	kstat_named_t	kmc_buf_inuse;
    853      0    stevel 	kstat_named_t	kmc_buf_total;
    854      0    stevel 	kstat_named_t	kmc_buf_max;
    855      0    stevel 	kstat_named_t	kmc_slab_create;
    856      0    stevel 	kstat_named_t	kmc_slab_destroy;
    857      0    stevel 	kstat_named_t	kmc_vmem_source;
    858      0    stevel 	kstat_named_t	kmc_hash_size;
    859      0    stevel 	kstat_named_t	kmc_hash_lookup_depth;
    860      0    stevel 	kstat_named_t	kmc_hash_rescale;
    861      0    stevel 	kstat_named_t	kmc_full_magazines;
    862      0    stevel 	kstat_named_t	kmc_empty_magazines;
    863      0    stevel 	kstat_named_t	kmc_magazine_size;
    864  10217       Tom 	kstat_named_t	kmc_reap; /* number of kmem_cache_reap() calls */
    865  10217       Tom 	kstat_named_t	kmc_defrag; /* attempts to defrag all partial slabs */
    866  10217       Tom 	kstat_named_t	kmc_scan; /* attempts to defrag one partial slab */
    867  10217       Tom 	kstat_named_t	kmc_move_callbacks; /* sum of yes, no, later, dn, dk */
    868   6712     tomee 	kstat_named_t	kmc_move_yes;
    869   6712     tomee 	kstat_named_t	kmc_move_no;
    870   6712     tomee 	kstat_named_t	kmc_move_later;
    871   6712     tomee 	kstat_named_t	kmc_move_dont_need;
    872  10217       Tom 	kstat_named_t	kmc_move_dont_know; /* obj unrecognized by client ... */
    873  10217       Tom 	kstat_named_t	kmc_move_hunt_found; /* ... but found in mag layer */
    874  10217       Tom 	kstat_named_t	kmc_move_slabs_freed; /* slabs freed by consolidator */
    875  10217       Tom 	kstat_named_t	kmc_move_reclaimable; /* buffers, if consolidator ran */
    876      0    stevel } kmem_cache_kstat = {
    877      0    stevel 	{ "buf_size",		KSTAT_DATA_UINT64 },
    878      0    stevel 	{ "align",		KSTAT_DATA_UINT64 },
    879      0    stevel 	{ "chunk_size",		KSTAT_DATA_UINT64 },
    880      0    stevel 	{ "slab_size",		KSTAT_DATA_UINT64 },
    881      0    stevel 	{ "alloc",		KSTAT_DATA_UINT64 },
    882      0    stevel 	{ "alloc_fail",		KSTAT_DATA_UINT64 },
    883      0    stevel 	{ "free",		KSTAT_DATA_UINT64 },
    884      0    stevel 	{ "depot_alloc",	KSTAT_DATA_UINT64 },
    885      0    stevel 	{ "depot_free",		KSTAT_DATA_UINT64 },
    886      0    stevel 	{ "depot_contention",	KSTAT_DATA_UINT64 },
    887      0    stevel 	{ "slab_alloc",		KSTAT_DATA_UINT64 },
    888      0    stevel 	{ "slab_free",		KSTAT_DATA_UINT64 },
    889      0    stevel 	{ "buf_constructed",	KSTAT_DATA_UINT64 },
    890      0    stevel 	{ "buf_avail",		KSTAT_DATA_UINT64 },
    891      0    stevel 	{ "buf_inuse",		KSTAT_DATA_UINT64 },
    892      0    stevel 	{ "buf_total",		KSTAT_DATA_UINT64 },
    893      0    stevel 	{ "buf_max",		KSTAT_DATA_UINT64 },
    894      0    stevel 	{ "slab_create",	KSTAT_DATA_UINT64 },
    895      0    stevel 	{ "slab_destroy",	KSTAT_DATA_UINT64 },
    896      0    stevel 	{ "vmem_source",	KSTAT_DATA_UINT64 },
    897      0    stevel 	{ "hash_size",		KSTAT_DATA_UINT64 },
    898      0    stevel 	{ "hash_lookup_depth",	KSTAT_DATA_UINT64 },
    899      0    stevel 	{ "hash_rescale",	KSTAT_DATA_UINT64 },
    900      0    stevel 	{ "full_magazines",	KSTAT_DATA_UINT64 },
    901      0    stevel 	{ "empty_magazines",	KSTAT_DATA_UINT64 },
    902      0    stevel 	{ "magazine_size",	KSTAT_DATA_UINT64 },
    903  10217       Tom 	{ "reap",		KSTAT_DATA_UINT64 },
    904  10217       Tom 	{ "defrag",		KSTAT_DATA_UINT64 },
    905  10217       Tom 	{ "scan",		KSTAT_DATA_UINT64 },
    906   6712     tomee 	{ "move_callbacks",	KSTAT_DATA_UINT64 },
    907   6712     tomee 	{ "move_yes",		KSTAT_DATA_UINT64 },
    908   6712     tomee 	{ "move_no",		KSTAT_DATA_UINT64 },
    909   6712     tomee 	{ "move_later",		KSTAT_DATA_UINT64 },
    910   6712     tomee 	{ "move_dont_need",	KSTAT_DATA_UINT64 },
    911   6712     tomee 	{ "move_dont_know",	KSTAT_DATA_UINT64 },
    912   6712     tomee 	{ "move_hunt_found",	KSTAT_DATA_UINT64 },
    913  10217       Tom 	{ "move_slabs_freed",	KSTAT_DATA_UINT64 },
    914  10217       Tom 	{ "move_reclaimable",	KSTAT_DATA_UINT64 },
    915      0    stevel };
    916      0    stevel 
    917      0    stevel static kmutex_t kmem_cache_kstat_lock;
    918      0    stevel 
    919      0    stevel /*
    920      0    stevel  * The default set of caches to back kmem_alloc().
    921      0    stevel  * These sizes should be reevaluated periodically.
    922      0    stevel  *
    923      0    stevel  * We want allocations that are multiples of the coherency granularity
    924      0    stevel  * (64 bytes) to be satisfied from a cache which is a multiple of 64
    925      0    stevel  * bytes, so that it will be 64-byte aligned.  For all multiples of 64,
    926      0    stevel  * the next kmem_cache_size greater than or equal to it must be a
    927      0    stevel  * multiple of 64.
    928   9367  Jonathan  *
    929   9367  Jonathan  * We split the table into two sections:  size <= 4k and size > 4k.  This
    930   9367  Jonathan  * saves a lot of space and cache footprint in our cache tables.
    931      0    stevel  */
    932      0    stevel static const int kmem_alloc_sizes[] = {
    933      0    stevel 	1 * 8,
    934      0    stevel 	2 * 8,
    935      0    stevel 	3 * 8,
    936      0    stevel 	4 * 8,		5 * 8,		6 * 8,		7 * 8,
    937      0    stevel 	4 * 16,		5 * 16,		6 * 16,		7 * 16,
    938      0    stevel 	4 * 32,		5 * 32,		6 * 32,		7 * 32,
    939      0    stevel 	4 * 64,		5 * 64,		6 * 64,		7 * 64,
    940      0    stevel 	4 * 128,	5 * 128,	6 * 128,	7 * 128,
    941      0    stevel 	P2ALIGN(8192 / 7, 64),
    942      0    stevel 	P2ALIGN(8192 / 6, 64),
    943      0    stevel 	P2ALIGN(8192 / 5, 64),
    944      0    stevel 	P2ALIGN(8192 / 4, 64),
    945      0    stevel 	P2ALIGN(8192 / 3, 64),
    946      0    stevel 	P2ALIGN(8192 / 2, 64),
    947      0    stevel };
    948      0    stevel 
/*
 * Cache sizes for the "size > 4k" section of the table.  The entries are
 * listed in increasing order, from 2 * 4096 (8192) up to 16 * 8192 (131072),
 * and every one of them is a multiple of KMEM_BIG_MULTIPLE (4096).
 */
    949   9367  Jonathan static const int kmem_big_alloc_sizes[] = {
    950   9367  Jonathan 	2 * 4096,	3 * 4096,
    951   9367  Jonathan 	2 * 8192,	3 * 8192,
    952   9367  Jonathan 	4 * 8192,	5 * 8192,	6 * 8192,	7 * 8192,
    953   9367  Jonathan 	8 * 8192,	9 * 8192,	10 * 8192,	11 * 8192,
    954   9367  Jonathan 	12 * 8192,	13 * 8192,	14 * 8192,	15 * 8192,
    955   9367  Jonathan 	16 * 8192
    956   9367  Jonathan };
    957   9367  Jonathan 
/*
 * Size limits for the two table sections.  KMEM_MAXBUF matches the 4k split
 * point; KMEM_BIG_MAXBUF (131072) equals the last kmem_big_alloc_sizes entry,
 * 16 * 8192, and KMEM_BIG_MAXBUF_32BIT (32768) equals the 4 * 8192 entry.
 */
    958   9367  Jonathan #define	KMEM_MAXBUF		4096
    959   9367  Jonathan #define	KMEM_BIG_MAXBUF_32BIT	32768
    960   9367  Jonathan #define	KMEM_BIG_MAXBUF		131072
    961   9367  Jonathan 
/* KMEM_BIG_SHIFT must be kept in step with KMEM_BIG_MULTIPLE: 1 << 12 == 4096. */
    962   9367  Jonathan #define	KMEM_BIG_MULTIPLE	4096	/* big_alloc_sizes must be a multiple */
    963   9367  Jonathan #define	KMEM_BIG_SHIFT		12	/* lg(KMEM_BIG_MULTIPLE) */
    964      0    stevel 
    965      0    stevel static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT];
    966   9367  Jonathan static kmem_cache_t *kmem_big_alloc_table[KMEM_BIG_MAXBUF >> KMEM_BIG_SHIFT];
    967   9367  Jonathan 
    968   9367  Jonathan #define	KMEM_ALLOC_TABLE_MAX	(KMEM_MAXBUF >> KMEM_ALIGN_SHIFT)
    969   9367  Jonathan static size_t kmem_big_alloc_table_max = 0;	/* # of filled elements */
    970      0    stevel 
/*
 * Table of magazine types, one four-value row per type.
 *
 * NOTE(review): kmem_magtype_t is not declared in this part of the file
 * (presumably it comes from <sys/kmem_impl.h>).  The columns look like
 * magazine size in rounds, alignment, minimum buffer size and maximum buffer
 * size -- confirm against the structure definition before relying on that.
 * The first column grows and the last column shrinks from row to row.
 */
    971      0    stevel static kmem_magtype_t kmem_magtype[] = {
    972      0    stevel 	{ 1,	8,	3200,	65536	},
    973      0    stevel 	{ 3,	16,	256,	32768	},
    974      0    stevel 	{ 7,	32,	64,	16384	},
    975      0    stevel 	{ 15,	64,	0,	8192	},
    976      0    stevel 	{ 31,	64,	0,	4096	},
    977      0    stevel 	{ 47,	64,	0,	2048	},
    978      0    stevel 	{ 63,	64,	0,	1024	},
    979      0    stevel 	{ 95,	64,	0,	512	},
    980      0    stevel 	{ 143,	64,	0,	0	},
    981      0    stevel };
    982      0    stevel 
    983      0    stevel static uint32_t kmem_reaping;
    984      0    stevel static uint32_t kmem_reaping_idspace;
    985      0    stevel 
    986      0    stevel /*
    987      0    stevel  * kmem tunables
    988      0    stevel  */
    989      0    stevel clock_t kmem_reap_interval;	/* cache reaping rate [15 * HZ ticks] */
    990      0    stevel int kmem_depot_contention = 3;	/* max failed tryenters per real interval */
    991      0    stevel pgcnt_t kmem_reapahead = 0;	/* start reaping N pages before pageout */
    992      0    stevel int kmem_panic = 1;		/* whether to panic on error */
    993      0    stevel int kmem_logging = 1;		/* kmem_log_enter() override */
    994      0    stevel uint32_t kmem_mtbf = 0;		/* mean time between failures [default: off] */
    995      0    stevel size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
    996      0    stevel size_t kmem_content_log_size;	/* content log size [2% of memory] */
    997      0    stevel size_t kmem_failure_log_size;	/* failure log [4 pages per CPU] */
    998      0    stevel size_t kmem_slab_log_size;	/* slab create log [4 pages per CPU] */
    999      0    stevel size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
   1000      0    stevel size_t kmem_lite_minsize = 0;	/* minimum buffer size for KMF_LITE */
   1001      0    stevel size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
   1002      0    stevel int kmem_lite_pcs = 4;		/* number of PCs to store in KMF_LITE mode */
   1003      0    stevel size_t kmem_maxverify;		/* maximum bytes to inspect in debug routines */
   1004      0    stevel size_t kmem_minfirewall;	/* hardware-enforced redzone threshold */
   1005   9367  Jonathan 
   1006   9367  Jonathan #ifdef _LP64
   1007   9367  Jonathan size_t	kmem_max_cached = KMEM_BIG_MAXBUF;	/* maximum kmem_alloc cache */
   1008   9367  Jonathan #else
   1009   9367  Jonathan size_t	kmem_max_cached = KMEM_BIG_MAXBUF_32BIT; /* maximum kmem_alloc cache */
   1010   9367  Jonathan #endif
   1011      0    stevel 
   1012      0    stevel #ifdef DEBUG
   1013      0    stevel int kmem_flags = KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS;
   1014      0    stevel #else
   1015      0    stevel int kmem_flags = 0;
   1016      0    stevel #endif
   1017      0    stevel int kmem_ready;
   1018      0    stevel 
   1019      0    stevel static kmem_cache_t	*kmem_slab_cache;
   1020      0    stevel static kmem_cache_t	*kmem_bufctl_cache;
   1021      0    stevel static kmem_cache_t	*kmem_bufctl_audit_cache;
   1022      0    stevel 
   1023      0    stevel static kmutex_t		kmem_cache_lock;	/* inter-cache linkage only */
   1024   6712     tomee static list_t		kmem_caches;
   1025      0    stevel 
   1026      0    stevel static taskq_t		*kmem_taskq;
   1027      0    stevel static kmutex_t		kmem_flags_lock;
   1028      0    stevel static vmem_t		*kmem_metadata_arena;
   1029      0    stevel static vmem_t		*kmem_msb_arena;	/* arena for metadata caches */
   1030      0    stevel static vmem_t		*kmem_cache_arena;
   1031      0    stevel static vmem_t		*kmem_hash_arena;
   1032      0    stevel static vmem_t		*kmem_log_arena;
   1033      0    stevel static vmem_t		*kmem_oversize_arena;
   1034      0    stevel static vmem_t		*kmem_va_arena;
   1035      0    stevel static vmem_t		*kmem_default_arena;
   1036      0    stevel static vmem_t		*kmem_firewall_va_arena;
   1037      0    stevel static vmem_t		*kmem_firewall_arena;
   1038      0    stevel 
   1039   6712     tomee /*
   1040   6712     tomee  * Define KMEM_STATS to turn on statistic gathering. By default, it is only
   1041   6712     tomee  * turned on when DEBUG is also defined.
   1042   6712     tomee  */
   1043   6712     tomee #ifdef	DEBUG
   1044   6712     tomee #define	KMEM_STATS
   1045   6712     tomee #endif	/* DEBUG */
   1046   6712     tomee 
/*
 * KMEM_STAT_ADD(stat) post-increments stat.
 * KMEM_STAT_COND_ADD(cond, stat) post-increments stat only when cond is
 * true: !(cond) short-circuits the || when cond is false, and the overall
 * value is discarded by the (void) cast.
 *
 * Without KMEM_STATS both macros expand to nothing, so their arguments are
 * not evaluated at all; do not pass expressions with side effects.  The
 * increment is a plain ++ with no locking or atomic operation in the macro.
 */
   1047   6712     tomee #ifdef	KMEM_STATS
   1048   6712     tomee #define	KMEM_STAT_ADD(stat)			((stat)++)
   1049   6712     tomee #define	KMEM_STAT_COND_ADD(cond, stat)		((void) (!(cond) || (stat)++))
   1050   6712     tomee #else
   1051   6712     tomee #define	KMEM_STAT_ADD(stat)			/* nothing */
   1052   6712     tomee #define	KMEM_STAT_COND_ADD(cond, stat)		/* nothing */
   1053   6712     tomee #endif	/* KMEM_STATS */
   1054   6712     tomee 
   1055   6712     tomee /*
   1056   6712     tomee  * kmem slab consolidator thresholds (tunables)
   1057   6712     tomee  */
   1058  10217       Tom size_t kmem_frag_minslabs = 101;	/* minimum total slabs */
   1059  10217       Tom size_t kmem_frag_numer = 1;		/* free buffers (numerator) */
   1060  10217       Tom size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */
   1061   6712     tomee /*
   1062   6712     tomee  * Maximum number of slabs from which to move buffers during a single
   1063   6712     tomee  * maintenance interval while the system is not low on memory.
   1064   6712     tomee  */
   1065  10217       Tom size_t kmem_reclaim_max_slabs = 1;
   1066   6712     tomee /*
   1067   6712     tomee  * Number of slabs to scan backwards from the end of the partial slab list
   1068   6712     tomee  * when searching for buffers to relocate.
   1069   6712     tomee  */
   1070  10217       Tom size_t kmem_reclaim_scan_range = 12;
   1071   6712     tomee 
/*
 * Event counters for the slab consolidator.  The structure exists only when
 * KMEM_STATS is defined, so any reference to it outside the KMEM_STAT_*
 * macros must also be guarded by #ifdef KMEM_STATS.
 *
 * NOTE(review): the counters are presumably bumped through KMEM_STAT_ADD()
 * and KMEM_STAT_COND_ADD() at the corresponding points in the move/scan
 * code -- the individual meanings are not visible here; confirm at the
 * update sites.
 */
   1072   6712     tomee #ifdef	KMEM_STATS
   1073   6712     tomee static struct {
   1074   6712     tomee 	uint64_t kms_callbacks;
   1075   6712     tomee 	uint64_t kms_yes;
   1076   6712     tomee 	uint64_t kms_no;
   1077   6712     tomee 	uint64_t kms_later;
   1078   6712     tomee 	uint64_t kms_dont_need;
   1079   6712     tomee 	uint64_t kms_dont_know;
   1080  10217       Tom 	uint64_t kms_hunt_found_mag;
   1081   6712     tomee 	uint64_t kms_hunt_found_slab;
   1082   6712     tomee 	uint64_t kms_hunt_alloc_fail;
   1083   6712     tomee 	uint64_t kms_hunt_lucky;
   1084   6712     tomee 	uint64_t kms_notify;
   1085   6712     tomee 	uint64_t kms_notify_callbacks;
   1086   6712     tomee 	uint64_t kms_disbelief;
   1087   6712     tomee 	uint64_t kms_already_pending;
   1088   6712     tomee 	uint64_t kms_callback_alloc_fail;
   1089   6908     tomee 	uint64_t kms_callback_taskq_fail;
   1090  10217       Tom 	uint64_t kms_endscan_slab_dead;
   1091   6712     tomee 	uint64_t kms_endscan_slab_destroyed;
   1092   6712     tomee 	uint64_t kms_endscan_nomem;
   1093   6712     tomee 	uint64_t kms_endscan_refcnt_changed;
   1094   6712     tomee 	uint64_t kms_endscan_nomove_changed;
   1095   6712     tomee 	uint64_t kms_endscan_freelist;
   1096   6712     tomee 	uint64_t kms_avl_update;
   1097   6712     tomee 	uint64_t kms_avl_noupdate;
   1098   6712     tomee 	uint64_t kms_no_longer_reclaimable;
   1099   6712     tomee 	uint64_t kms_notify_no_longer_reclaimable;
   1100  10217       Tom 	uint64_t kms_notify_slab_dead;
   1101  10217       Tom 	uint64_t kms_notify_slab_destroyed;
   1102   6712     tomee 	uint64_t kms_alloc_fail;
   1103   6712     tomee 	uint64_t kms_constructor_fail;
   1104   6712     tomee 	uint64_t kms_dead_slabs_freed;
   1105   6712     tomee 	uint64_t kms_defrags;
   1106  10217       Tom 	uint64_t kms_scans;
   1107   6712     tomee 	uint64_t kms_scan_depot_ws_reaps;
   1108   6712     tomee 	uint64_t kms_debug_reaps;
   1109  10217       Tom 	uint64_t kms_debug_scans;
   1110   6712     tomee } kmem_move_stats;
   1111   6712     tomee #endif	/* KMEM_STATS */
   1112   6712     tomee 
   1113   6712     tomee /* consolidator knobs */
   1114   6712     tomee static boolean_t kmem_move_noreap;
   1115   6712     tomee static boolean_t kmem_move_blocked;
   1116   6712     tomee static boolean_t kmem_move_fulltilt;
   1117   6712     tomee static boolean_t kmem_move_any_partial;
   1118   6712     tomee 
   1119   6712     tomee #ifdef	DEBUG
   1120   6712     tomee /*
   1121  10217       Tom  * kmem consolidator debug tunables:
   1122   6712     tomee  * Ensure code coverage by occasionally running the consolidator even when the
   1123   6712     tomee  * caches are not fragmented (they may never be). These intervals are mean time
   1124   6712     tomee  * in cache maintenance intervals (kmem_cache_update).
   1125   6712     tomee  */
   1126  10217       Tom uint32_t kmem_mtb_move = 60;	/* defrag 1 slab (~15min) */
   1127  10217       Tom uint32_t kmem_mtb_reap = 1800;	/* defrag all slabs (~7.5hrs) */
   1128   6712     tomee #endif	/* DEBUG */
   1129   6712     tomee 
   1130   6712     tomee static kmem_cache_t	*kmem_defrag_cache;
   1131   6712     tomee static kmem_cache_t	*kmem_move_cache;
   1132   6712     tomee static taskq_t		*kmem_move_taskq;
   1133   6712     tomee 
   1134   6712     tomee static void kmem_cache_scan(kmem_cache_t *);
   1135   6712     tomee static void kmem_cache_defrag(kmem_cache_t *);
   1136   6712     tomee 
   1137   6712     tomee 
   1138      0    stevel kmem_log_header_t	*kmem_transaction_log;
   1139      0    stevel kmem_log_header_t	*kmem_content_log;
   1140      0    stevel kmem_log_header_t	*kmem_failure_log;
   1141      0    stevel kmem_log_header_t	*kmem_slab_log;
   1142      0    stevel 
   1143      0    stevel static int		kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
   1144      0    stevel 
/*
 * Record `caller' as the newest entry in the bt_history array of the
 * kmem_buftag_lite_t at `bt'.  The existing entries are shifted one slot
 * toward the end of the array (the entry in slot count - 1 is overwritten
 * and lost) and `caller' is stored in slot 0.  Nothing happens when
 * `count' is not greater than zero.
 *
 * Usage notes:
 *  - `count' is expanded twice, so it must not have side effects.
 *  - The macro expands to a bare `if' statement rather than a
 *    do { } while (0) block.  Do not use it as the unbraced body of an
 *    if/else: a following `else' would attach to the macro's own `if'.
 *  - _s and _e are local to the expansion's block.
 */
   1145      0    stevel #define	KMEM_BUFTAG_LITE_ENTER(bt, count, caller)			\
   1146      0    stevel 	if ((count) > 0) {						\
   1147      0    stevel 		pc_t *_s = ((kmem_buftag_lite_t *)(bt))->bt_history;	\
   1148      0    stevel 		pc_t *_e;						\
   1149      0    stevel 		/* memmove() the old entries down one notch */		\
   1150      0    stevel 		for (_e = &_s[(count) - 1]; _e > _s; _e--)		\
   1151      0    stevel 			*_e = *(_e - 1);				\
   1152      0    stevel 		*_s = (uintptr_t)(caller);				\
   1153      0    stevel 	}
   1154      0    stevel 
   1155      0    stevel #define	KMERR_MODIFIED	0	/* buffer modified while on freelist */
   1156      0    stevel #define	KMERR_REDZONE	1	/* redzone violation (write past end of buf) */
   1157      0    stevel #define	KMERR_DUPFREE	2	/* freed a buffer twice */
   1158      0    stevel #define	KMERR_BADADDR	3	/* freed a bad (unallocated) address */
   1159      0    stevel #define	KMERR_BADBUFTAG	4	/* buftag corrupted */
   1160      0    stevel #define	KMERR_BADBUFCTL	5	/* bufctl corrupted */
   1161      0    stevel #define	KMERR_BADCACHE	6	/* freed a buffer to the wrong cache */
   1162      0    stevel #define	KMERR_BADSIZE	7	/* alloc size != free size */
   1163      0    stevel #define	KMERR_BADBASE	8	/* buffer base address wrong */
   1164      0    stevel 
/*
 * Record of the most recent allocator error.  kmem_error() fills in every
 * field before it prints its diagnostics.
 * NOTE(review): presumably kept in a global so the details can be located
 * from a crash dump or debugger -- confirm against the consumers.
 */
   1165      0    stevel struct {
   1166      0    stevel 	hrtime_t	kmp_timestamp;	/* timestamp of panic */
   1167      0    stevel 	int		kmp_error;	/* type of kmem error */
   1168      0    stevel 	void		*kmp_buffer;	/* buffer that induced panic */
   1169      0    stevel 	void		*kmp_realbuf;	/* real start address for buffer */
   1170      0    stevel 	kmem_cache_t	*kmp_cache;	/* buffer's cache according to client */
   1171      0    stevel 	kmem_cache_t	*kmp_realcache;	/* actual cache containing buffer */
   1172      0    stevel 	kmem_slab_t	*kmp_slab;	/* slab according to kmem_findslab() */
   1173      0    stevel 	kmem_bufctl_t	*kmp_bufctl;	/* bufctl */
   1174      0    stevel } kmem_panic_info;
   1175      0    stevel 
   1176      0    stevel 
   1177      0    stevel static void
   1178      0    stevel copy_pattern(uint64_t pattern, void *buf_arg, size_t size)
   1179      0    stevel {
   1180      0    stevel 	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
   1181      0    stevel 	uint64_t *buf = buf_arg;
   1182      0    stevel 
   1183      0    stevel 	while (buf < bufend)
   1184      0    stevel 		*buf++ = pattern;
   1185      0    stevel }
   1186      0    stevel 
   1187      0    stevel static void *
   1188      0    stevel verify_pattern(uint64_t pattern, void *buf_arg, size_t size)
   1189      0    stevel {
   1190      0    stevel 	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
   1191      0    stevel 	uint64_t *buf;
   1192      0    stevel 
   1193      0    stevel 	for (buf = buf_arg; buf < bufend; buf++)
   1194      0    stevel 		if (*buf != pattern)
   1195      0    stevel 			return (buf);
   1196      0    stevel 	return (NULL);
   1197      0    stevel }
   1198      0    stevel 
   1199      0    stevel static void *
   1200      0    stevel verify_and_copy_pattern(uint64_t old, uint64_t new, void *buf_arg, size_t size)
   1201      0    stevel {
   1202      0    stevel 	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
   1203      0    stevel 	uint64_t *buf;
   1204      0    stevel 
   1205      0    stevel 	for (buf = buf_arg; buf < bufend; buf++) {
   1206      0    stevel 		if (*buf != old) {
   1207      0    stevel 			copy_pattern(old, buf_arg,
   1208   6306     tomee 			    (char *)buf - (char *)buf_arg);
   1209      0    stevel 			return (buf);
   1210      0    stevel 		}
   1211      0    stevel 		*buf = new;
   1212      0    stevel 	}
   1213      0    stevel 
   1214      0    stevel 	return (NULL);
   1215      0    stevel }
   1216      0    stevel 
   1217      0    stevel static void
   1218      0    stevel kmem_cache_applyall(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag)
   1219      0    stevel {
   1220      0    stevel 	kmem_cache_t *cp;
   1221      0    stevel 
   1222      0    stevel 	mutex_enter(&kmem_cache_lock);
   1223   6712     tomee 	for (cp = list_head(&kmem_caches); cp != NULL;
   1224   6712     tomee 	    cp = list_next(&kmem_caches, cp))
   1225      0    stevel 		if (tq != NULL)
   1226      0    stevel 			(void) taskq_dispatch(tq, (task_func_t *)func, cp,
   1227      0    stevel 			    tqflag);
   1228      0    stevel 		else
   1229      0    stevel 			func(cp);
   1230      0    stevel 	mutex_exit(&kmem_cache_lock);
   1231      0    stevel }
   1232      0    stevel 
   1233      0    stevel static void
   1234      0    stevel kmem_cache_applyall_id(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag)
   1235      0    stevel {
   1236      0    stevel 	kmem_cache_t *cp;
   1237      0    stevel 
   1238      0    stevel 	mutex_enter(&kmem_cache_lock);
   1239   6712     tomee 	for (cp = list_head(&kmem_caches); cp != NULL;
   1240   6712     tomee 	    cp = list_next(&kmem_caches, cp)) {
   1241      0    stevel 		if (!(cp->cache_cflags & KMC_IDENTIFIER))
   1242      0    stevel 			continue;
   1243      0    stevel 		if (tq != NULL)
   1244      0    stevel 			(void) taskq_dispatch(tq, (task_func_t *)func, cp,
   1245      0    stevel 			    tqflag);
   1246      0    stevel 		else
   1247      0    stevel 			func(cp);
   1248      0    stevel 	}
   1249      0    stevel 	mutex_exit(&kmem_cache_lock);
   1250      0    stevel }
   1251      0    stevel 
   1252      0    stevel /*
   1253      0    stevel  * Debugging support.  Given a buffer address, find its slab.
   1254      0    stevel  */
   1255      0    stevel static kmem_slab_t *
   1256      0    stevel kmem_findslab(kmem_cache_t *cp, void *buf)
   1257      0    stevel {
   1258      0    stevel 	kmem_slab_t *sp;
   1259      0    stevel 
   1260      0    stevel 	mutex_enter(&cp->cache_lock);
   1261   6712     tomee 	for (sp = list_head(&cp->cache_complete_slabs); sp != NULL;
   1262   6712     tomee 	    sp = list_next(&cp->cache_complete_slabs, sp)) {
   1263   6712     tomee 		if (KMEM_SLAB_MEMBER(sp, buf)) {
   1264   6712     tomee 			mutex_exit(&cp->cache_lock);
   1265   6712     tomee 			return (sp);
   1266   6712     tomee 		}
   1267   6712     tomee 	}
   1268   6712     tomee 	for (sp = avl_first(&cp->cache_partial_slabs); sp != NULL;
   1269   6712     tomee 	    sp = AVL_NEXT(&cp->cache_partial_slabs, sp)) {
   1270      0    stevel 		if (KMEM_SLAB_MEMBER(sp, buf)) {
   1271      0    stevel 			mutex_exit(&cp->cache_lock);
   1272      0    stevel 			return (sp);
   1273      0    stevel 		}
   1274      0    stevel 	}
   1275      0    stevel 	mutex_exit(&cp->cache_lock);
   1276      0    stevel 
   1277      0    stevel 	return (NULL);
   1278      0    stevel }
   1279      0    stevel 
   1280      0    stevel static void
   1281      0    stevel kmem_error(int error, kmem_cache_t *cparg, void *bufarg)
   1282      0    stevel {
   1283      0    stevel 	kmem_buftag_t *btp = NULL;
   1284      0    stevel 	kmem_bufctl_t *bcp = NULL;
   1285      0    stevel 	kmem_cache_t *cp = cparg;
   1286      0    stevel 	kmem_slab_t *sp;
   1287      0    stevel 	uint64_t *off;
   1288      0    stevel 	void *buf = bufarg;
   1289      0    stevel 
   1290      0    stevel 	kmem_logging = 0;	/* stop logging when a bad thing happens */
   1291      0    stevel 
   1292      0    stevel 	kmem_panic_info.kmp_timestamp = gethrtime();
   1293      0    stevel 
   1294      0    stevel 	sp = kmem_findslab(cp, buf);
   1295      0    stevel 	if (sp == NULL) {
   1296   6712     tomee 		for (cp = list_tail(&kmem_caches); cp != NULL;
   1297   6712     tomee 		    cp = list_prev(&kmem_caches, cp)) {
   1298      0    stevel 			if ((sp = kmem_findslab(cp, buf)) != NULL)
   1299      0    stevel 				break;
   1300      0    stevel 		}
   1301      0    stevel 	}
   1302      0    stevel 
   1303      0    stevel 	if (sp == NULL) {
   1304      0    stevel 		cp = NULL;
   1305      0    stevel 		error = KMERR_BADADDR;
   1306      0    stevel 	} else {
   1307      0    stevel 		if (cp != cparg)
   1308      0    stevel 			error = KMERR_BADCACHE;
   1309      0    stevel 		else
   1310      0    stevel 			buf = (char *)bufarg - ((uintptr_t)bufarg -
   1311      0    stevel 			    (uintptr_t)sp->slab_base) % cp->cache_chunksize;
   1312      0    stevel 		if (buf != bufarg)
   1313      0    stevel 			error = KMERR_BADBASE;
   1314      0    stevel 		if (cp->cache_flags & KMF_BUFTAG)
   1315      0    stevel 			btp = KMEM_BUFTAG(cp, buf);
   1316      0    stevel 		if (cp->cache_flags & KMF_HASH) {
   1317      0    stevel 			mutex_enter(&cp->cache_lock);
   1318      0    stevel 			for (bcp = *KMEM_HASH(cp, buf); bcp; bcp = bcp->bc_next)
   1319      0    stevel 				if (bcp->bc_addr == buf)
   1320      0    stevel 					break;
   1321      0    stevel 			mutex_exit(&cp->cache_lock);
   1322      0    stevel 			if (bcp == NULL && btp != NULL)
   1323      0    stevel 				bcp = btp->bt_bufctl;
   1324      0    stevel 			if (kmem_findslab(cp->cache_bufctl_cache, bcp) ==
   1325      0    stevel 			    NULL || P2PHASE((uintptr_t)bcp, KMEM_ALIGN) ||
   1326      0    stevel 			    bcp->bc_addr != buf) {
   1327      0    stevel 				error = KMERR_BADBUFCTL;
   1328      0    stevel 				bcp = NULL;
   1329      0    stevel 			}
   1330      0    stevel 		}
   1331      0    stevel 	}
   1332      0    stevel 
   1333      0    stevel 	kmem_panic_info.kmp_error = error;
   1334      0    stevel 	kmem_panic_info.kmp_buffer = bufarg;
   1335      0    stevel 	kmem_panic_info.kmp_realbuf = buf;
   1336      0    stevel 	kmem_panic_info.kmp_cache = cparg;
   1337      0    stevel 	kmem_panic_info.kmp_realcache = cp;
   1338      0    stevel 	kmem_panic_info.kmp_slab = sp;
   1339      0    stevel 	kmem_panic_info.kmp_bufctl = bcp;
   1340      0    stevel 
   1341      0    stevel 	printf("kernel memory allocator: ");
   1342      0    stevel 
   1343      0    stevel 	switch (error) {
   1344      0    stevel 
   1345      0    stevel 	case KMERR_MODIFIED:
   1346      0    stevel 		printf("buffer modified after being freed\n");
   1347      0    stevel 		off = verify_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
   1348      0    stevel 		if (off == NULL)	/* shouldn't happen */
   1349      0    stevel 			off = buf;
   1350      0    stevel 		printf("modification occurred at offset 0x%lx "
   1351      0    stevel 		    "(0x%llx replaced by 0x%llx)\n",
   1352      0    stevel 		    (uintptr_t)off - (uintptr_t)buf,
   1353      0    stevel 		    (longlong_t)KMEM_FREE_PATTERN, (longlong_t)*off);
   1354      0    stevel 		break;
   1355      0    stevel 
   1356      0    stevel 	case KMERR_REDZONE:
   1357      0    stevel 		printf("redzone violation: write past end of buffer\n");
   1358      0    stevel 		break;
   1359      0    stevel 
   1360      0    stevel 	case KMERR_BADADDR:
   1361      0    stevel 		printf("invalid free: buffer not in cache\n");
   1362      0    stevel 		break;
   1363      0    stevel 
   1364      0    stevel 	case KMERR_DUPFREE:
   1365      0    stevel 		printf("duplicate free: buffer freed twice\n");
   1366      0    stevel 		break;
   1367      0    stevel 
   1368      0    stevel 	case KMERR_BADBUFTAG:
   1369      0    stevel 		printf("boundary tag corrupted\n");
   1370      0    stevel 		printf("bcp ^ bxstat = %lx, should be %lx\n",
   1371      0    stevel 		    (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat,
   1372      0    stevel 		    KMEM_BUFTAG_FREE);
   1373      0    stevel 		break;
   1374      0    stevel 
   1375      0    stevel 	case KMERR_BADBUFCTL:
   1376      0    stevel 		printf("bufctl corrupted\n");
   1377      0    stevel 		break;
   1378      0    stevel 
   1379      0    stevel 	case KMERR_BADCACHE:
   1380      0    stevel 		printf("buffer freed to wrong cache\n");
   1381      0    stevel 		printf("buffer was allocated from %s,\n", cp->cache_name);
   1382      0    stevel 		printf("caller attempting free to %s.\n", cparg->cache_name);
   1383      0    stevel 		break;
   1384      0    stevel 
   1385      0    stevel 	case KMERR_BADSIZE:
   1386      0    stevel 		printf("bad free: free size (%u) != alloc size (%u)\n",
   1387      0    stevel 		    KMEM_SIZE_DECODE(((uint32_t *)btp)[0]),
   1388      0    stevel 		    KMEM_SIZE_DECODE(((uint32_t *)btp)[1]));
   1389      0    stevel 		break;
   1390      0    stevel 
   1391      0    stevel 	case KMERR_BADBASE:
   1392      0    stevel 		printf("bad free: free address (%p) != alloc address (%p)\n",
   1393      0    stevel 		    bufarg, buf);
   1394      0    stevel 		break;
   1395      0    stevel 	}
   1396      0    stevel 
   1397      0    stevel 	printf("buffer=%p  bufctl=%p  cache: %s\n",
   1398      0    stevel 	    bufarg, (void *)bcp, cparg->cache_name);
   1399      0    stevel 
   1400      0    stevel 	if (bcp != NULL && (cp->cache_flags & KMF_AUDIT) &&
   1401      0    stevel 	    error != KMERR_BADBUFCTL) {
   1402      0    stevel 		int d;
   1403      0    stevel 		timestruc_t ts;
   1404      0    stevel 		kmem_bufctl_audit_t *bcap = (kmem_bufctl_audit_t *)bcp;
   1405      0    stevel 
   1406      0    stevel 		hrt2ts(kmem_panic_info.kmp_timestamp - bcap->bc_timestamp, &ts);
   1407      0    stevel 		printf("previous transaction on buffer %p:\n", buf);
   1408      0    stevel 		printf("thread=%p  time=T-%ld.%09ld  slab=%p  cache: %s\n",
   1409      0    stevel 		    (void *)bcap->bc_thread, ts.tv_sec, ts.tv_nsec,
   1410      0    stevel 		    (void *)sp, cp->cache_name);
   1411      0    stevel 		for (d = 0; d < MIN(bcap->bc_depth, KMEM_STACK_DEPTH); d++) {
   1412      0    stevel 			ulong_t off;
   1413      0    stevel 			char *sym = kobj_getsymname(bcap->bc_stack[d], &off);
   1414      0    stevel 			printf("%s+%lx\n", sym ? sym : "?", off);
   1415      0    stevel 		}
   1416      0    stevel 	}
   1417      0    stevel 	if (kmem_panic > 0)
   1418      0    stevel 		panic("kernel heap corruption detected");
   1419      0    stevel 	if (kmem_panic == 0)
   1420      0    stevel 		debug_enter(NULL);
   1421      0    stevel 	kmem_logging = 1;	/* resume logging */
   1422      0    stevel }
   1423      0    stevel 
   1424      0    stevel static kmem_log_header_t *
   1425      0    stevel kmem_log_init(size_t logsize)
   1426      0    stevel {
   1427      0    stevel 	kmem_log_header_t *lhp;
   1428      0    stevel 	int nchunks = 4 * max_ncpus;
   1429      0    stevel 	size_t lhsize = (size_t)&((kmem_log_header_t *)0)->lh_cpu[max_ncpus];
   1430      0    stevel 	int i;
   1431      0    stevel 
   1432      0    stevel 	/*
   1433      0    stevel 	 * Make sure that lhp->lh_cpu[] is nicely aligned
   1434      0    stevel 	 * to prevent false sharing of cache lines.
   1435      0    stevel 	 */
   1436      0    stevel 	lhsize = P2ROUNDUP(lhsize, KMEM_ALIGN);
   1437      0    stevel 	lhp = vmem_xalloc(kmem_log_arena, lhsize, 64, P2NPHASE(lhsize, 64), 0,
   1438      0    stevel 	    NULL, NULL, VM_SLEEP);
   1439      0    stevel 	bzero(lhp, lhsize);
   1440      0    stevel 
   1441      0    stevel 	mutex_init(&lhp->lh_lock, NULL, MUTEX_DEFAULT, NULL);
   1442      0    stevel 	lhp->lh_nchunks = nchunks;
   1443      0    stevel 	lhp->lh_chunksize = P2ROUNDUP(logsize / nchunks + 1, PAGESIZE);
   1444      0    stevel 	lhp->lh_base = vmem_alloc(kmem_log_arena,
   1445      0    stevel 	    lhp->lh_chunksize * nchunks, VM_SLEEP);
   1446      0    stevel 	lhp->lh_free = vmem_alloc(kmem_log_arena,
   1447      0    stevel 	    nchunks * sizeof (int), VM_SLEEP);
   1448      0    stevel 	bzero(lhp->lh_base, lhp->lh_chunksize * nchunks);
   1449      0    stevel 
   1450      0    stevel 	for (i = 0; i < max_ncpus; i++) {
   1451      0    stevel 		kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[i];
   1452      0    stevel 		mutex_init(&clhp->clh_lock, NULL, MUTEX_DEFAULT, NULL);
   1453      0    stevel 		clhp->clh_chunk = i;
   1454      0    stevel 	}
   1455      0    stevel 
   1456      0    stevel 	for (i = max_ncpus; i < nchunks; i++)
   1457      0    stevel 		lhp->lh_free[i] = i;
   1458      0    stevel 
   1459      0    stevel 	lhp->lh_head = max_ncpus;
   1460      0    stevel 	lhp->lh_tail = 0;
   1461      0    stevel 
   1462      0    stevel 	return (lhp);
   1463      0    stevel }
   1464      0    stevel 
   1465      0    stevel static void *
   1466      0    stevel kmem_log_enter(kmem_log_header_t *lhp, void *data, size_t size)
   1467      0    stevel {
   1468      0    stevel 	void *logspace;
   1469      0    stevel 	kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[CPU->cpu_seqid];
   1470      0    stevel 
   1471      0    stevel 	if (lhp == NULL || kmem_logging == 0 || panicstr)
   1472      0    stevel 		return (NULL);
   1473      0    stevel 
   1474      0    stevel 	mutex_enter(&clhp->clh_lock);
   1475      0    stevel 	clhp->clh_hits++;
   1476      0    stevel 	if (size > clhp->clh_avail) {
   1477      0    stevel 		mutex_enter(&lhp->lh_lock);
   1478      0    stevel 		lhp->lh_hits++;
   1479      0    stevel 		lhp->lh_free[lhp->lh_tail] = clhp->clh_chunk;
   1480      0    stevel 		lhp->lh_tail = (lhp->lh_tail + 1) % lhp->lh_nchunks;
   1481      0    stevel 		clhp->clh_chunk = lhp->lh_free[lhp->lh_head];
   1482      0    stevel 		lhp->lh_head = (lhp->lh_head + 1) % lhp->lh_nchunks;
   1483      0    stevel 		clhp->clh_current = lhp->lh_base +
   1484   6306     tomee 		    clhp->clh_chunk * lhp->lh_chunksize;
   1485      0    stevel 		clhp->clh_avail = lhp->lh_chunksize;
   1486      0    stevel 		if (size > lhp->lh_chunksize)
   1487      0    stevel 			size = lhp->lh_chunksize;
   1488      0    stevel 		mutex_exit(&lhp->lh_lock);
   1489      0    stevel 	}
   1490      0    stevel 	logspace = clhp->clh_current;
   1491      0    stevel 	clhp->clh_current += size;
   1492      0    stevel 	clhp->clh_avail -= size;
   1493      0    stevel 	bcopy(data, logspace, size);
   1494      0    stevel 	mutex_exit(&clhp->clh_lock);
   1495      0    stevel 	return (logspace);
   1496      0    stevel }
   1497      0    stevel 
   1498      0    stevel #define	KMEM_AUDIT(lp, cp, bcp)						\
   1499      0    stevel {									\
   1500      0    stevel 	kmem_bufctl_audit_t *_bcp = (kmem_bufctl_audit_t *)(bcp);	\
   1501      0    stevel 	_bcp->bc_timestamp = gethrtime();				\
   1502      0    stevel 	_bcp->bc_thread = curthread;					\
   1503      0    stevel 	_bcp->bc_depth = getpcstack(_bcp->bc_stack, KMEM_STACK_DEPTH);	\
   1504      0    stevel 	_bcp->bc_lastlog = kmem_log_enter((lp), _bcp, sizeof (*_bcp));	\
   1505      0    stevel }
   1506      0    stevel 
   1507      0    stevel static void
   1508      0    stevel kmem_log_event(kmem_log_header_t *lp, kmem_cache_t *cp,
   1509      0    stevel 	kmem_slab_t *sp, void *addr)
   1510      0    stevel {
   1511      0    stevel 	kmem_bufctl_audit_t bca;
   1512      0    stevel 
   1513      0    stevel 	bzero(&bca, sizeof (kmem_bufctl_audit_t));
   1514      0    stevel 	bca.bc_addr = addr;
   1515      0    stevel 	bca.bc_slab = sp;
   1516      0    stevel 	bca.bc_cache = cp;
   1517      0    stevel 	KMEM_AUDIT(lp, cp, &bca);
   1518      0    stevel }
   1519      0    stevel 
   1520      0    stevel /*
   1521      0    stevel  * Create a new slab for cache cp.
   1522      0    stevel  */
   1523      0    stevel static kmem_slab_t *
   1524      0    stevel kmem_slab_create(kmem_cache_t *cp, int kmflag)
   1525      0    stevel {
   1526      0    stevel 	size_t slabsize = cp->cache_slabsize;
   1527      0    stevel 	size_t chunksize = cp->cache_chunksize;
   1528      0    stevel 	int cache_flags = cp->cache_flags;
   1529      0    stevel 	size_t color, chunks;
   1530      0    stevel 	char *buf, *slab;
   1531      0    stevel 	kmem_slab_t *sp;
   1532      0    stevel 	kmem_bufctl_t *bcp;
   1533      0    stevel 	vmem_t *vmp = cp->cache_arena;
   1534      0    stevel 
   1535   6712     tomee 	ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
   1536   6712     tomee 
   1537      0    stevel 	color = cp->cache_color + cp->cache_align;
   1538      0    stevel 	if (color > cp->cache_maxcolor)
   1539      0    stevel 		color = cp->cache_mincolor;
   1540      0    stevel 	cp->cache_color = color;
   1541      0    stevel 
   1542      0    stevel 	slab = vmem_alloc(vmp, slabsize, kmflag & KM_VMFLAGS);
   1543      0    stevel 
   1544      0    stevel 	if (slab == NULL)
   1545      0    stevel 		goto vmem_alloc_failure;
   1546      0    stevel 
   1547      0    stevel 	ASSERT(P2PHASE((uintptr_t)slab, vmp->vm_quantum) == 0);
   1548      0    stevel 
   1549   6712     tomee 	/*
   1550   6712     tomee 	 * Reverify what was already checked in kmem_cache_set_move(), since the
   1551   6712     tomee 	 * consolidator depends (for correctness) on slabs being initialized
   1552   6712     tomee 	 * with the 0xbaddcafe memory pattern (setting a low order bit usable by
   1553   6712     tomee 	 * clients to distinguish uninitialized memory from known objects).
   1554   6712     tomee 	 */
   1555   6712     tomee 	ASSERT((cp->cache_move == NULL) || !(cp->cache_cflags & KMC_NOTOUCH));
   1556      0    stevel 	if (!(cp->cache_cflags & KMC_NOTOUCH))
   1557      0    stevel 		copy_pattern(KMEM_UNINITIALIZED_PATTERN, slab, slabsize);
   1558      0    stevel 
   1559      0    stevel 	if (cache_flags & KMF_HASH) {
   1560      0    stevel 		if ((sp = kmem_cache_alloc(kmem_slab_cache, kmflag)) == NULL)
   1561      0    stevel 			goto slab_alloc_failure;
   1562      0    stevel 		chunks = (slabsize - color) / chunksize;
   1563      0    stevel 	} else {
   1564      0    stevel 		sp = KMEM_SLAB(cp, slab);
   1565      0    stevel 		chunks = (slabsize - sizeof (kmem_slab_t) - color) / chunksize;
   1566      0    stevel 	}
   1567      0    stevel 
   1568      0    stevel 	sp->slab_cache	= cp;
   1569      0    stevel 	sp->slab_head	= NULL;
   1570      0    stevel 	sp->slab_refcnt	= 0;
   1571      0    stevel 	sp->slab_base	= buf = slab + color;
   1572      0    stevel 	sp->slab_chunks	= chunks;
   1573   6712     tomee 	sp->slab_stuck_offset = (uint32_t)-1;
   1574   6712     tomee 	sp->slab_later_count = 0;
   1575   6712     tomee 	sp->slab_flags = 0;
   1576      0    stevel 
   1577      0    stevel 	ASSERT(chunks > 0);
   1578      0    stevel 	while (chunks-- != 0) {
   1579      0    stevel 		if (cache_flags & KMF_HASH) {
   1580      0    stevel 			bcp = kmem_cache_alloc(cp->cache_bufctl_cache, kmflag);
   1581      0    stevel 			if (bcp == NULL)
   1582      0    stevel 				goto bufctl_alloc_failure;
   1583      0    stevel 			if (cache_flags & KMF_AUDIT) {
   1584      0    stevel 				kmem_bufctl_audit_t *bcap =
   1585      0    stevel 				    (kmem_bufctl_audit_t *)bcp;
   1586      0    stevel 				bzero(bcap, sizeof (kmem_bufctl_audit_t));
   1587      0    stevel 				bcap->bc_cache = cp;
   1588      0    stevel 			}
   1589      0    stevel 			bcp->bc_addr = buf;
   1590      0    stevel 			bcp->bc_slab = sp;
   1591      0    stevel 		} else {
   1592      0    stevel 			bcp = KMEM_BUFCTL(cp, buf);
   1593      0    stevel 		}
   1594      0    stevel 		if (cache_flags & KMF_BUFTAG) {
   1595      0    stevel 			kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
   1596      0    stevel 			btp->bt_redzone = KMEM_REDZONE_PATTERN;
   1597      0    stevel 			btp->bt_bufctl = bcp;
   1598      0    stevel 			btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
   1599      0    stevel 			if (cache_flags & KMF_DEADBEEF) {
   1600      0    stevel 				copy_pattern(KMEM_FREE_PATTERN, buf,
   1601      0    stevel 				    cp->cache_verify);
   1602      0    stevel 			}
   1603      0    stevel 		}
   1604      0    stevel 		bcp->bc_next = sp->slab_head;
   1605      0    stevel 		sp->slab_head = bcp;
   1606      0    stevel 		buf += chunksize;
   1607      0    stevel 	}
   1608      0    stevel 
   1609      0    stevel 	kmem_log_event(kmem_slab_log, cp, sp, slab);
   1610      0    stevel 
   1611      0    stevel 	return (sp);
   1612      0    stevel 
   1613      0    stevel bufctl_alloc_failure:
   1614      0    stevel 
   1615      0    stevel 	while ((bcp = sp->slab_head) != NULL) {
   1616      0    stevel 		sp->slab_head = bcp->bc_next;
   1617      0    stevel 		kmem_cache_free(cp->cache_bufctl_cache, bcp);
   1618      0    stevel 	}
   1619      0    stevel 	kmem_cache_free(kmem_slab_cache, sp);
   1620      0    stevel 
   1621      0    stevel slab_alloc_failure:
   1622      0    stevel 
   1623      0    stevel 	vmem_free(vmp, slab, slabsize);
   1624      0    stevel 
   1625      0    stevel vmem_alloc_failure:
   1626      0    stevel 
   1627      0    stevel 	kmem_log_event(kmem_failure_log, cp, NULL, NULL);
   1628      0    stevel 	atomic_add_64(&cp->cache_alloc_fail, 1);
   1629      0    stevel 
   1630      0    stevel 	return (NULL);
   1631      0    stevel }
   1632      0    stevel 
   1633      0    stevel /*
   1634      0    stevel  * Destroy a slab.
   1635      0    stevel  */
   1636      0    stevel static void
   1637      0    stevel kmem_slab_destroy(kmem_cache_t *cp, kmem_slab_t *sp)
   1638      0    stevel {
   1639      0    stevel 	vmem_t *vmp = cp->cache_arena;
   1640      0    stevel 	void *slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum);
   1641      0    stevel 
   1642   6712     tomee 	ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
   1643   6712     tomee 	ASSERT(sp->slab_refcnt == 0);
   1644   6712     tomee 
   1645      0    stevel 	if (cp->cache_flags & KMF_HASH) {
   1646      0    stevel 		kmem_bufctl_t *bcp;
   1647      0    stevel 		while ((bcp = sp->slab_head) != NULL) {
   1648      0    stevel 			sp->slab_head = bcp->bc_next;
   1649      0    stevel 			kmem_cache_free(cp->cache_bufctl_cache, bcp);
   1650      0    stevel 		}
   1651      0    stevel 		kmem_cache_free(kmem_slab_cache, sp);
   1652      0    stevel 	}
   1653      0    stevel 	vmem_free(vmp, slab, cp->cache_slabsize);
   1654      0    stevel }
   1655      0    stevel 
   1656   6712     tomee static void *
   1657   6712     tomee kmem_slab_alloc_impl(kmem_cache_t *cp, kmem_slab_t *sp)
   1658      0    stevel {
   1659      0    stevel 	kmem_bufctl_t *bcp, **hash_bucket;
   1660   6712     tomee 	void *buf;
   1661   6712     tomee 
   1662   6712     tomee 	ASSERT(MUTEX_HELD(&cp->cache_lock));
   1663   6712     tomee 	/*
   1664   6712     tomee 	 * kmem_slab_alloc() drops cache_lock when it creates a new slab, so we
   1665   6712     tomee 	 * can't ASSERT(avl_is_empty(&cp->cache_partial_slabs)) here when the
   1666   6712     tomee 	 * slab is newly created (sp->slab_refcnt == 0).
   1667   6712     tomee 	 */
   1668   6712     tomee 	ASSERT((sp->slab_refcnt == 0) || (KMEM_SLAB_IS_PARTIAL(sp) &&
   1669   6712     tomee 	    (sp == avl_first(&cp->cache_partial_slabs))));
   1670   6712     tomee 	ASSERT(sp->slab_cache == cp);
   1671   6712     tomee 
   1672      0    stevel 	cp->cache_slab_alloc++;
   1673   6306     tomee 	cp->cache_bufslab--;
   1674      0    stevel 	sp->slab_refcnt++;
   1675   6712     tomee 
   1676      0    stevel 	bcp = sp->slab_head;
   1677      0    stevel 	if ((sp->slab_head = bcp->bc_next) == NULL) {
   1678   6712     tomee 		ASSERT(KMEM_SLAB_IS_ALL_USED(sp));
   1679   6712     tomee 		if (sp->slab_refcnt == 1) {
   1680   6712     tomee 			ASSERT(sp->slab_chunks == 1);
   1681   6712     tomee 		} else {
   1682   6712     tomee 			ASSERT(sp->slab_chunks > 1); /* the slab was partial */
   1683   6712     tomee 			avl_remove(&cp->cache_partial_slabs, sp);
   1684   6712     tomee 			sp->slab_later_count = 0; /* clear history */
   1685   6712     tomee 			sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
   1686   6712     tomee 			sp->slab_stuck_offset = (uint32_t)-1;
   1687   6712     tomee 		}
   1688   6712     tomee 		list_insert_head(&cp->cache_complete_slabs, sp);
   1689   6712     tomee 		cp->cache_complete_slab_count++;
   1690   6712     tomee 	} else {
   1691   6712     tomee 		ASSERT(KMEM_SLAB_IS_PARTIAL(sp));
   1692   6712     tomee 		if (sp->slab_refcnt == 1) {
   1693   6712     tomee 			avl_add(&cp->cache_partial_slabs, sp);
   1694   6712     tomee 		} else {
   1695   6712     tomee 			/*
   1696   6712     tomee 			 * The slab is now more allocated than it was, so the
   1697   6712     tomee 			 * order remains unchanged.
   1698   6712     tomee 			 */
   1699   6712     tomee 			ASSERT(!avl_update(&cp->cache_partial_slabs, sp));
   1700   6712     tomee 		}
   1701      0    stevel 	}
   1702      0    stevel 
   1703      0    stevel 	if (cp->cache_flags & KMF_HASH) {
   1704      0    stevel 		/*
   1705      0    stevel 		 * Add buffer to allocated-address hash table.
   1706      0    stevel 		 */
   1707      0    stevel 		buf = bcp->bc_addr;
   1708      0    stevel 		hash_bucket = KMEM_HASH(cp, buf);
   1709      0    stevel 		bcp->bc_next = *hash_bucket;
   1710      0    stevel 		*hash_bucket = bcp;
   1711      0    stevel 		if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) {
   1712      0    stevel 			KMEM_AUDIT(kmem_transaction_log, cp, bcp);
   1713      0    stevel 		}
   1714      0    stevel 	} else {
   1715      0    stevel 		buf = KMEM_BUF(cp, bcp);
   1716      0    stevel 	}
   1717      0    stevel 
   1718      0    stevel 	ASSERT(KMEM_SLAB_MEMBER(sp, buf));
   1719   6712     tomee 	return (buf);
   1720   6712     tomee }
   1721   6712     tomee 
   1722   6712     tomee /*
   1723   6712     tomee  * Allocate a raw (unconstructed) buffer from cp's slab layer.
   1724   6712     tomee  */
   1725   6712     tomee static void *
   1726   6712     tomee kmem_slab_alloc(kmem_cache_t *cp, int kmflag)
   1727   6712     tomee {
   1728   6712     tomee 	kmem_slab_t *sp;
   1729   6712     tomee 	void *buf;
   1730   7546     tomee 	boolean_t test_destructor;
   1731   7546     tomee 
   1732   7546     tomee 	mutex_enter(&cp->cache_lock);
   1733   7546     tomee 	test_destructor = (cp->cache_slab_alloc == 0);
   1734   6712     tomee 	sp = avl_first(&cp->cache_partial_slabs);
   1735   6712     tomee 	if (sp == NULL) {
   1736   6712     tomee 		ASSERT(cp->cache_bufslab == 0);
   1737   6712     tomee 
   1738   6712     tomee 		/*
   1739   6712     tomee 		 * The freelist is empty.  Create a new slab.
   1740   6712     tomee 		 */
   1741   6712     tomee 		mutex_exit(&cp->cache_lock);
   1742   6712     tomee 		if ((sp = kmem_slab_create(cp, kmflag)) == NULL) {
   1743   6712     tomee 			return (NULL);
   1744   6712     tomee 		}
   1745   6712     tomee 		mutex_enter(&cp->cache_lock);
   1746   6712     tomee 		cp->cache_slab_create++;
   1747   6712     tomee 		if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax)
   1748   6712     tomee 			cp->cache_bufmax = cp->cache_buftotal;
   1749   6712     tomee 		cp->cache_bufslab += sp->slab_chunks;
   1750   6712     tomee 	}
   1751   6712     tomee 
   1752   6712     tomee 	buf = kmem_slab_alloc_impl(cp, sp);
   1753   6712     tomee 	ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
   1754   6712     tomee 	    (cp->cache_complete_slab_count +
   1755   6712     tomee 	    avl_numnodes(&cp->cache_partial_slabs) +
   1756   6712     tomee 	    (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
   1757   6712     tomee 	mutex_exit(&cp->cache_lock);
   1758   7546     tomee 
   1759   7546     tomee 	if (test_destructor && cp->cache_destructor != NULL) {
   1760   7546     tomee 		/*
   1761   7546     tomee 		 * On the first kmem_slab_alloc(), assert that it is valid to
   1762   7546     tomee 		 * call the destructor on a newly constructed object without any
   1763   7546     tomee 		 * client involvement.
   1764   7546     tomee 		 */
   1765   7546     tomee 		if ((cp->cache_constructor == NULL) ||
   1766   7546     tomee 		    cp->cache_constructor(buf, cp->cache_private,
   1767   7546     tomee 		    kmflag) == 0) {
   1768   7546     tomee 			cp->cache_destructor(buf, cp->cache_private);
   1769   7546     tomee 		}
   1770   7546     tomee 		copy_pattern(KMEM_UNINITIALIZED_PATTERN, buf,
   1771   7546     tomee 		    cp->cache_bufsize);
   1772   7546     tomee 		if (cp->cache_flags & KMF_DEADBEEF) {
   1773   7546     tomee 			copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
   1774   7546     tomee 		}
   1775   7546     tomee 	}
   1776   6712     tomee 
   1777   6712     tomee 	return (buf);
   1778   6712     tomee }
   1779   6712     tomee 
   1780   6712     tomee static void kmem_slab_move_yes(kmem_cache_t *, kmem_slab_t *, void *);
   1781      0    stevel 
   1782      0    stevel /*
   1783      0    stevel  * Free a raw (unconstructed) buffer to cp's slab layer.
   1784      0    stevel  */
   1785      0    stevel static void
   1786      0    stevel kmem_slab_free(kmem_cache_t *cp, void *buf)
   1787      0    stevel {
   1788      0    stevel 	kmem_slab_t *sp;
   1789      0    stevel 	kmem_bufctl_t *bcp, **prev_bcpp;
   1790      0    stevel 
   1791      0    stevel 	ASSERT(buf != NULL);
   1792      0    stevel 
   1793      0    stevel 	mutex_enter(&cp->cache_lock);
   1794      0    stevel 	cp->cache_slab_free++;
   1795      0    stevel 
   1796      0    stevel 	if (cp->cache_flags & KMF_HASH) {
   1797      0    stevel 		/*
   1798      0    stevel 		 * Look up buffer in allocated-address hash table.
   1799      0    stevel 		 */
   1800      0    stevel 		prev_bcpp = KMEM_HASH(cp, buf);
   1801      0    stevel 		while ((bcp = *prev_bcpp) != NULL) {
   1802      0    stevel 			if (bcp->bc_addr == buf) {
   1803      0    stevel 				*prev_bcpp = bcp->bc_next;
   1804      0    stevel 				sp = bcp->bc_slab;
   1805      0    stevel 				break;
   1806      0    stevel 			}
   1807      0    stevel 			cp->cache_lookup_depth++;
   1808      0    stevel 			prev_bcpp = &bcp->bc_next;
   1809      0    stevel 		}
   1810      0    stevel 	} else {
   1811      0    stevel 		bcp = KMEM_BUFCTL(cp, buf);
   1812      0    stevel 		sp = KMEM_SLAB(cp, buf);
   1813      0    stevel 	}
   1814      0    stevel 
   1815      0    stevel 	if (bcp == NULL || sp->slab_cache != cp || !KMEM_SLAB_MEMBER(sp, buf)) {
   1816      0    stevel 		mutex_exit(&cp->cache_lock);
   1817      0    stevel 		kmem_error(KMERR_BADADDR, cp, buf);
   1818      0    stevel 		return;
   1819      0    stevel 	}
   1820      0    stevel 
   1821   6712     tomee 	if (KMEM_SLAB_OFFSET(sp, buf) == sp->slab_stuck_offset) {
   1822   6712     tomee 		/*
   1823   6712     tomee 		 * If this is the buffer that prevented the consolidator from
   1824   6712     tomee 		 * clearing the slab, we can reset the slab flags now that the
   1825   6712     tomee 		 * buffer is freed. (It makes sense to do this in
   1826   6712     tomee 		 * kmem_cache_free(), where the client gives up ownership of the
   1827   6712     tomee 		 * buffer, but on the hot path the test is too expensive.)
   1828   6712     tomee 		 */
   1829   6712     tomee 		kmem_slab_move_yes(cp, sp, buf);
   1830   6712     tomee 	}
   1831   6712     tomee 
   1832      0    stevel 	if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) {
   1833      0    stevel 		if (cp->cache_flags & KMF_CONTENTS)
   1834      0    stevel 			((kmem_bufctl_audit_t *)bcp)->bc_contents =
   1835      0    stevel 			    kmem_log_enter(kmem_content_log, buf,
   1836   6306     tomee 			    cp->cache_contents);
   1837      0    stevel 		KMEM_AUDIT(kmem_transaction_log, cp, bcp);
   1838      0    stevel 	}
   1839      0    stevel 
   1840      0    stevel 	bcp->bc_next = sp->slab_head;
   1841      0    stevel 	sp->slab_head = bcp;
   1842      0    stevel 
   1843   6306     tomee 	cp->cache_bufslab++;
   1844      0    stevel 	ASSERT(sp->slab_refcnt >= 1);
   1845   6712     tomee 
   1846      0    stevel 	if (--sp->slab_refcnt == 0) {
   1847      0    stevel 		/*
   1848      0    stevel 		 * There are no outstanding allocations from this slab,
   1849      0    stevel 		 * so we can reclaim the memory.
   1850      0    stevel 		 */
   1851   6712     tomee 		if (sp->slab_chunks == 1) {
   1852   6712     tomee 			list_remove(&cp->cache_complete_slabs, sp);
   1853   6712     tomee 			cp->cache_complete_slab_count--;
   1854   6712     tomee 		} else {
   1855   6712     tomee 			avl_remove(&cp->cache_partial_slabs, sp);
   1856   6712     tomee 		}
   1857   6712     tomee 
   1858      0    stevel 		cp->cache_buftotal -= sp->slab_chunks;
   1859   6306     tomee 		cp->cache_bufslab -= sp->slab_chunks;
   1860   6712     tomee 		/*
   1861   6712     tomee 		 * Defer releasing the slab to the virtual memory subsystem
   1862   6712     tomee 		 * while there is a pending move callback, since we guarantee
   1863   6712     tomee 		 * that buffers passed to the move callback have only been
   1864   6712     tomee 		 * touched by kmem or by the client itself. Since the memory
   1865   6712     tomee 		 * patterns baddcafe (uninitialized) and deadbeef (freed) both
   1866   6712     tomee 		 * set at least one of the two lowest order bits, the client can
   1867   6712     tomee 		 * test those bits in the move callback to determine whether or
   1868   6712     tomee 		 * not it knows about the buffer (assuming that the client also
   1869   6712     tomee 		 * sets one of those low order bits whenever it frees a buffer).
   1870   6712     tomee 		 */
   1871   6712     tomee 		if (cp->cache_defrag == NULL ||
   1872   6712     tomee 		    (avl_is_empty(&cp->cache_defrag->kmd_moves_pending) &&
   1873   6712     tomee 		    !(sp->slab_flags & KMEM_SLAB_MOVE_PENDING))) {
   1874   6712     tomee 			cp->cache_slab_destroy++;
   1875   6712     tomee 			mutex_exit(&cp->cache_lock);
   1876   6712     tomee 			kmem_slab_destroy(cp, sp);
   1877   6712     tomee 		} else {
   1878   6712     tomee 			list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
   1879   6712     tomee 			/*
   1880   6712     tomee 			 * Slabs are inserted at both ends of the deadlist to
   1881   6712     tomee 			 * distinguish between slabs freed while move callbacks
   1882   6712     tomee 			 * are pending (list head) and a slab freed while the
   1883   6712     tomee 			 * lock is dropped in kmem_move_buffers() (list tail) so
   1884   6712     tomee 			 * that in both cases slab_destroy() is called from the
   1885   6712     tomee 			 * right context.
   1886   6712     tomee 			 */
   1887   6712     tomee 			if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
   1888   6712     tomee 				list_insert_tail(deadlist, sp);
   1889   6712     tomee 			} else {
   1890   6712     tomee 				list_insert_head(deadlist, sp);
   1891   6712     tomee 			}
   1892   6712     tomee 			cp->cache_defrag->kmd_deadcount++;
   1893   6712     tomee 			mutex_exit(&cp->cache_lock);
   1894   6712     tomee 		}
   1895   6712     tomee 		return;
   1896   6712     tomee 	}
   1897   6712     tomee 
   1898   6712     tomee 	if (bcp->bc_next == NULL) {
   1899   6712     tomee 		/* Transition the slab from completely allocated to partial. */
   1900   6712     tomee 		ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1));
   1901   6712     tomee 		ASSERT(sp->slab_chunks > 1);
   1902   6712     tomee 		list_remove(&cp->cache_complete_slabs, sp);
   1903   6712     tomee 		cp->cache_complete_slab_count--;
   1904   6712     tomee 		avl_add(&cp->cache_partial_slabs, sp);
   1905   6712     tomee 	} else {
   1906   6712     tomee #ifdef	DEBUG
   1907   6712     tomee 		if (avl_update_gt(&cp->cache_partial_slabs, sp)) {
   1908   6712     tomee 			KMEM_STAT_ADD(kmem_move_stats.kms_avl_update);
   1909   6712     tomee 		} else {
   1910   6712     tomee 			KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate);
   1911   6712     tomee 		}
   1912   6712     tomee #else
   1913   6712     tomee 		(void) avl_update_gt(&cp->cache_partial_slabs, sp);
   1914   6712     tomee #endif
   1915   6712     tomee 	}
   1916   6712     tomee 
   1917   6712     tomee 	ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
   1918   6712     tomee 	    (cp->cache_complete_slab_count +
   1919   6712     tomee 	    avl_numnodes(&cp->cache_partial_slabs) +
   1920   6712     tomee 	    (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
   1921   6712     tomee 	mutex_exit(&cp->cache_lock);
   1922   6712     tomee }
   1923   6712     tomee 
   1924   6712     tomee /*
   1925   6712     tomee  * Return -1 if kmem_error, 1 if constructor fails, 0 if successful.
   1926   6712     tomee  */
   1927      0    stevel static int
   1928      0    stevel kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
   1929      0    stevel     caddr_t caller)
   1930      0    stevel {
   1931      0    stevel 	kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
   1932      0    stevel 	kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
   1933      0    stevel 	uint32_t mtbf;
   1934      0    stevel 
   1935      0    stevel 	if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) {
   1936      0    stevel 		kmem_error(KMERR_BADBUFTAG, cp, buf);
   1937      0    stevel 		return (-1);
   1938      0    stevel 	}
   1939      0    stevel 
   1940      0    stevel 	btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_ALLOC;
   1941      0    stevel 
   1942      0    stevel 	if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) {
   1943      0    stevel 		kmem_error(KMERR_BADBUFCTL, cp, buf);
   1944      0    stevel 		return (-1);
   1945      0    stevel 	}
   1946      0    stevel 
   1947      0    stevel 	if (cp->cache_flags & KMF_DEADBEEF) {
   1948      0    stevel 		if (!construct && (cp->cache_flags & KMF_LITE)) {
   1949      0    stevel 			if (*(uint64_t *)buf != KMEM_FREE_PATTERN) {
   1950      0    stevel 				kmem_error(KMERR_MODIFIED, cp, buf);
   1951      0    stevel 				return (-1);
   1952      0    stevel 			}
   1953      0    stevel 			if (cp->cache_constructor != NULL)
   1954      0    stevel 				*(uint64_t *)buf = btp->bt_redzone;
   1955      0    stevel 			else
   1956      0    stevel 				*(uint64_t *)buf = KMEM_UNINITIALIZED_PATTERN;
   1957      0    stevel 		} else {
   1958      0    stevel 			construct = 1;
   1959      0    stevel 			if (verify_and_copy_pattern(KMEM_FREE_PATTERN,
   1960      0    stevel 			    KMEM_UNINITIALIZED_PATTERN, buf,
   1961      0    stevel 			    cp->cache_verify)) {
   1962      0    stevel 				kmem_error(KMERR_MODIFIED, cp, buf);
   1963      0    stevel 				return (-1);
   1964      0    stevel 			}
   1965      0    stevel 		}
   1966      0    stevel 	}
   1967      0    stevel 	btp->bt_redzone = KMEM_REDZONE_PATTERN;
   1968      0    stevel 
   1969      0    stevel 	if ((mtbf = kmem_mtbf | cp->cache_mtbf) != 0 &&
   1970      0    stevel 	    gethrtime() % mtbf == 0 &&
   1971      0    stevel 	    (kmflag & (KM_NOSLEEP | KM_PANIC)) == KM_NOSLEEP) {
   1972      0    stevel 		kmem_log_event(kmem_failure_log, cp, NULL, NULL);
   1973      0    stevel 		if (!construct && cp->cache_destructor != NULL)
   1974      0    stevel 			cp->cache_destructor(buf, cp->cache_private);
   1975      0    stevel 	} else {
   1976      0    stevel 		mtbf = 0;
   1977      0    stevel 	}
   1978      0    stevel 
   1979      0    stevel 	if (mtbf || (construct && cp->cache_constructor != NULL &&
   1980      0    stevel 	    cp->cache_constructor(buf, cp->cache_private, kmflag) != 0)) {
   1981      0    stevel 		atomic_add_64(&cp->cache_alloc_fail, 1);
   1982      0    stevel 		btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
   1983      0    stevel 		if (cp->cache_flags & KMF_DEADBEEF)
   1984      0    stevel 			copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
   1985      0    stevel 		kmem_slab_free(cp, buf);
   1986   6712     tomee 		return (1);
   1987      0    stevel 	}
   1988      0    stevel 
   1989      0    stevel 	if (cp->cache_flags & KMF_AUDIT) {
   1990      0    stevel 		KMEM_AUDIT(kmem_transaction_log, cp, bcp);
   1991      0    stevel 	}
   1992      0    stevel 
   1993      0    stevel 	if ((cp->cache_flags & KMF_LITE) &&
   1994      0    stevel 	    !(cp->cache_cflags & KMC_KMEM_ALLOC)) {
   1995      0    stevel 		KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller);
   1996      0    stevel 	}
   1997      0    stevel 
   1998      0    stevel 	return (0);
   1999      0    stevel }
   2000      0    stevel 
   2001      0    stevel static int
   2002      0    stevel kmem_cache_free_debug(kmem_cache_t *cp, void *buf, caddr_t caller)
   2003      0    stevel {
   2004      0    stevel 	kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
   2005      0    stevel 	kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
   2006      0    stevel 	kmem_slab_t *sp;
   2007      0    stevel 
   2008      0    stevel 	if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_ALLOC)) {
   2009      0    stevel 		if (btp->bt_bxstat == ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) {
   2010      0    stevel 			kmem_error(KMERR_DUPFREE, cp, buf);
   2011      0    stevel 			return (-1);
   2012      0    stevel 		}
   2013      0    stevel 		sp = kmem_findslab(cp, buf);
   2014      0    stevel 		if (sp == NULL || sp->slab_cache != cp)
   2015      0    stevel 			kmem_error(KMERR_BADADDR, cp, buf);
   2016      0    stevel 		else
   2017      0    stevel 			kmem_error(KMERR_REDZONE, cp, buf);
   2018      0    stevel 		return (-1);
   2019      0    stevel 	}
   2020      0    stevel 
   2021      0    stevel 	btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
   2022      0    stevel 
   2023      0    stevel 	if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) {
   2024      0    stevel 		kmem_error(KMERR_BADBUFCTL, cp, buf);
   2025      0    stevel 		return (-1);
   2026      0    stevel 	}
   2027      0    stevel 
   2028      0    stevel 	if (btp->bt_redzone != KMEM_REDZONE_PATTERN) {
   2029      0    stevel 		kmem_error(KMERR_REDZONE, cp, buf);
   2030      0    stevel 		return (-1);
   2031      0    stevel 	}
   2032      0    stevel 
   2033      0    stevel 	if (cp->cache_flags & KMF_AUDIT) {
   2034      0    stevel 		if (cp->cache_flags & KMF_CONTENTS)
   2035      0    stevel 			bcp->bc_contents = kmem_log_enter(kmem_content_log,
   2036      0    stevel 			    buf, cp->cache_contents);
   2037      0    stevel 		KMEM_AUDIT(kmem_transaction_log, cp, bcp);
   2038      0    stevel 	}
   2039      0    stevel 
   2040      0    stevel 	if ((cp->cache_flags & KMF_LITE) &&
   2041      0    stevel 	    !(cp->cache_cflags & KMC_KMEM_ALLOC)) {
   2042      0    stevel 		KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller);
   2043      0    stevel 	}
   2044      0    stevel 
   2045      0    stevel 	if (cp->cache_flags & KMF_DEADBEEF) {
   2046      0    stevel 		if (cp->cache_flags & KMF_LITE)
   2047      0    stevel 			btp->bt_redzone = *(uint64_t *)buf;
   2048      0    stevel 		else if (cp->cache_destructor != NULL)
   2049      0    stevel 			cp->cache_destructor(buf, cp->cache_private);
   2050      0    stevel 
   2051      0    stevel 		copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
   2052      0    stevel 	}
   2053      0    stevel 
   2054      0    stevel 	return (0);
   2055      0    stevel }
   2056      0    stevel 
   2057      0    stevel /*
   2058      0    stevel  * Free each object in magazine mp to cp's slab layer, and free mp itself.
   2059      0    stevel  */
   2060      0    stevel static void
   2061      0    stevel kmem_magazine_destroy(kmem_cache_t *cp, kmem_magazine_t *mp, int nrounds)
   2062      0    stevel {
   2063      0    stevel 	int round;
   2064      0    stevel 
   2065   6712     tomee 	ASSERT(!list_link_active(&cp->cache_link) ||
   2066   6712     tomee 	    taskq_member(kmem_taskq, curthread));
   2067      0    stevel 
   2068      0    stevel 	for (round = 0; round < nrounds; round++) {
   2069      0    stevel 		void *buf = mp->mag_round[round];
   2070      0    stevel 
   2071      0    stevel 		if (cp->cache_flags & KMF_DEADBEEF) {
   2072      0    stevel 			if (verify_pattern(KMEM_FREE_PATTERN, buf,
   2073      0    stevel 			    cp->cache_verify) != NULL) {
   2074      0    stevel 				kmem_error(KMERR_MODIFIED, cp, buf);
   2075      0    stevel 				continue;
   2076      0    stevel 			}
   2077      0    stevel 			if ((cp->cache_flags & KMF_LITE) &&
   2078      0    stevel 			    cp->cache_destructor != NULL) {
   2079      0    stevel 				kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
   2080      0    stevel 				*(uint64_t *)buf = btp->bt_redzone;
   2081      0    stevel 				cp->cache_destructor(buf, cp->cache_private);
   2082      0    stevel 				*(uint64_t *)buf = KMEM_FREE_PATTERN;
   2083      0    stevel 			}
   2084      0    stevel 		} else if (cp->cache_destructor != NULL) {
   2085      0    stevel 			cp->cache_destructor(buf, cp->cache_private);
   2086      0    stevel 		}
   2087      0    stevel 
   2088      0    stevel 		kmem_slab_free(cp, buf);
   2089      0    stevel 	}
   2090      0    stevel 	ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
   2091      0    stevel 	kmem_cache_free(cp->cache_magtype->mt_cache, mp);
   2092      0    stevel }
   2093      0    stevel 
   2094      0    stevel /*
   2095      0    stevel  * Allocate a magazine from the depot.
   2096      0    stevel  */
   2097      0    stevel static kmem_magazine_t *
   2098      0    stevel kmem_depot_alloc(kmem_cache_t *cp, kmem_maglist_t *mlp)
   2099      0    stevel {
   2100      0    stevel 	kmem_magazine_t *mp;
   2101      0    stevel 
   2102      0    stevel 	/*
   2103      0    stevel 	 * If we can't get the depot lock without contention,
   2104      0    stevel 	 * update our contention count.  We use the depot
   2105      0    stevel 	 * contention rate to determine whether we need to
   2106      0    stevel 	 * increase the magazine size for better scalability.
   2107      0    stevel 	 */
   2108      0    stevel 	if (!mutex_tryenter(&cp->cache_depot_lock)) {
   2109      0    stevel 		mutex_enter(&cp->cache_depot_lock);
   2110      0    stevel 		cp->cache_depot_contention++;
   2111      0    stevel 	}
   2112      0    stevel 
   2113      0    stevel 	if ((mp = mlp->ml_list) != NULL) {
   2114      0    stevel 		ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
   2115      0    stevel 		mlp->ml_list = mp->mag_next;
   2116      0    stevel 		if (--mlp->ml_total < mlp->ml_min)
   2117      0    stevel 			mlp->ml_min = mlp->ml_total;
   2118      0    stevel 		mlp->ml_alloc++;
   2119      0    stevel 	}
   2120      0    stevel 
   2121      0    stevel 	mutex_exit(&cp->cache_depot_lock);
   2122      0    stevel 
   2123      0    stevel 	return (mp);
   2124      0    stevel }
   2125      0    stevel 
   2126      0    stevel /*
   2127      0    stevel  * Free a magazine to the depot.
   2128      0    stevel  */
   2129      0    stevel static void
   2130      0    stevel kmem_depot_free(kmem_cache_t *cp, kmem_maglist_t *mlp, kmem_magazine_t *mp)
   2131      0    stevel {
   2132      0    stevel 	mutex_enter(&cp->cache_depot_lock);
   2133      0    stevel 	ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
   2134      0    stevel 	mp->mag_next = mlp->ml_list;
   2135      0    stevel 	mlp->ml_list = mp;
   2136      0    stevel 	mlp->ml_total++;
   2137      0    stevel 	mutex_exit(&cp->cache_depot_lock);
   2138      0    stevel }
   2139      0    stevel 
   2140      0    stevel /*
   2141      0    stevel  * Update the working set statistics for cp's depot.
   2142      0    stevel  */
   2143      0    stevel static void
   2144      0    stevel kmem_depot_ws_update(kmem_cache_t *cp)
   2145      0    stevel {
   2146      0    stevel 	mutex_enter(&cp->cache_depot_lock);
   2147      0    stevel 	cp->cache_full.ml_reaplimit = cp->cache_full.ml_min;
   2148      0    stevel 	cp->cache_full.ml_min = cp->cache_full.ml_total;
   2149      0    stevel 	cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_min;
   2150      0    stevel 	cp->cache_empty.ml_min = cp->cache_empty.ml_total;
   2151      0    stevel 	mutex_exit(&cp->cache_depot_lock);
   2152      0    stevel }
   2153      0    stevel 
   2154      0    stevel /*
   2155      0    stevel  * Reap all magazines that have fallen out of the depot's working set.
   2156      0    stevel  */
   2157      0    stevel static void
   2158      0    stevel kmem_depot_ws_reap(kmem_cache_t *cp)
   2159      0    stevel {
   2160      0    stevel 	long reap;
   2161      0    stevel 	kmem_magazine_t *mp;
   2162      0    stevel 
   2163   6712     tomee 	ASSERT(!list_link_active(&cp->cache_link) ||
   2164   6712     tomee 	    taskq_member(kmem_taskq, curthread));
   2165      0    stevel 
   2166      0    stevel 	reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);
   2167      0    stevel 	while (reap-- && (mp = kmem_depot_alloc(cp, &cp->cache_full)) != NULL)
   2168      0    stevel 		kmem_magazine_destroy(cp, mp, cp->cache_magtype->mt_magsize);
   2169      0    stevel 
   2170      0    stevel 	reap = MIN(cp->cache_empty.ml_reaplimit, cp->cache_empty.ml_min);
   2171      0    stevel 	while (reap-- && (mp = kmem_depot_alloc(cp, &cp->cache_empty)) != NULL)
   2172      0    stevel 		kmem_magazine_destroy(cp, mp, 0);
   2173      0    stevel }
   2174      0    stevel 
   2175      0    stevel static void
   2176      0    stevel kmem_cpu_reload(kmem_cpu_cache_t *ccp, kmem_magazine_t *mp, int rounds)
   2177      0    stevel {
   2178      0    stevel 	ASSERT((ccp->cc_loaded == NULL && ccp->cc_rounds == -1) ||
   2179      0    stevel 	    (ccp->cc_loaded && ccp->cc_rounds + rounds == ccp->cc_magsize));
   2180      0    stevel 	ASSERT(ccp->cc_magsize > 0);
   2181      0    stevel 
   2182      0    stevel 	ccp->cc_ploaded = ccp->cc_loaded;
   2183      0    stevel 	ccp->cc_prounds = ccp->cc_rounds;
   2184      0    stevel 	ccp->cc_loaded = mp;
   2185      0    stevel 	ccp->cc_rounds = rounds;
   2186      0    stevel }
   2187      0    stevel 
   2188      0    stevel /*
   2189  11178      Dave  * Intercept kmem alloc/free calls during crash dump in order to avoid
   2190  11178      Dave  * changing kmem state while memory is being saved to the dump device.
   2191  11178      Dave  * Otherwise, ::kmem_verify will report "corrupt buffers".  Note that
   2192  11178      Dave  * there are no locks because only one CPU calls kmem during a crash
   2193  11178      Dave  * dump. To enable this feature, first create the associated vmem
   2194  11178      Dave  * arena with VMC_DUMPSAFE.
   2195  11178      Dave  */
   2196  11178      Dave static void *kmem_dump_start;	/* start of pre-reserved heap */
   2197  11178      Dave static void *kmem_dump_end;	/* end of heap area */
   2198  11178      Dave static void *kmem_dump_curr;	/* current free heap pointer */
   2199  11178      Dave static size_t kmem_dump_size;	/* size of heap area */
   2200  11178      Dave 
   2201  11178      Dave /* append to each buf created in the pre-reserved heap */
   2202  11178      Dave typedef struct kmem_dumpctl {
   2203  11178      Dave 	void	*kdc_next;	/* cache dump free list linkage */
   2204  11178      Dave } kmem_dumpctl_t;
   2205  11178      Dave 
   2206  11178      Dave #define	KMEM_DUMPCTL(cp, buf)	\
   2207  11178      Dave 	((kmem_dumpctl_t *)P2ROUNDUP((uintptr_t)(buf) + (cp)->cache_bufsize, \
   2208  11178      Dave 	    sizeof (void *)))
   2209  11178      Dave 
   2210  11178      Dave /* Keep some simple stats. */
   2211  11178      Dave #define	KMEM_DUMP_LOGS	(100)
   2212  11178      Dave 
   2213  11178      Dave typedef struct kmem_dump_log {
   2214  11178      Dave 	kmem_cache_t	*kdl_cache;
   2215  11178      Dave 	uint_t		kdl_allocs;		/* # of dump allocations */
   2216  11178      Dave 	uint_t		kdl_frees;		/* # of dump frees */
   2217  11178      Dave 	uint_t		kdl_alloc_fails;	/* # of allocation failures */
   2218  11178      Dave 	uint_t		kdl_free_nondump;	/* # of non-dump frees */
   2219  11178      Dave 	uint_t		kdl_unsafe;		/* cache was used, but unsafe */
   2220  11178      Dave } kmem_dump_log_t;
   2221  11178      Dave 
   2222  11178      Dave static kmem_dump_log_t *kmem_dump_log;
   2223  11178      Dave static int kmem_dump_log_idx;
   2224  11178      Dave 
   2225  11178      Dave #define	KDI_LOG(cp, stat) {						\
   2226  11178      Dave 	kmem_dump_log_t *kdl;						\
   2227  11178      Dave 	if ((kdl = (kmem_dump_log_t *)((cp)->cache_dumplog)) != NULL) {	\
   2228  11178      Dave 		kdl->stat++;						\
   2229  11178      Dave 	} else if (kmem_dump_log_idx < KMEM_DUMP_LOGS) {		\
   2230  11178      Dave 		kdl = &kmem_dump_log[kmem_dump_log_idx++];		\
   2231  11178      Dave 		kdl->stat++;						\
   2232  11178      Dave 		kdl->kdl_cache = (cp);					\
   2233  11178      Dave 		(cp)->cache_dumplog = kdl;				\
   2234  11178      Dave 	}								\
   2235  11178      Dave }
   2236  11178      Dave 
   2237  11178      Dave /* set non zero for full report */
   2238  11178      Dave uint_t kmem_dump_verbose = 0;
   2239  11178      Dave 
   2240  11178      Dave /* stats for overize heap */
   2241  11178      Dave uint_t kmem_dump_oversize_allocs = 0;
   2242  11178      Dave uint_t kmem_dump_oversize_max = 0;
   2243  11178      Dave 
   2244  11178      Dave static void
   2245  11178      Dave kmem_dumppr(char **pp, char *e, const char *format, ...)
   2246  11178      Dave {
   2247  11178      Dave 	char *p = *pp;
   2248  11178      Dave 
   2249  11178      Dave 	if (p < e) {
   2250  11178      Dave 		int n;
   2251  11178      Dave 		va_list ap;
   2252  11178      Dave 
   2253  11178      Dave 		va_start(ap, format);
   2254  11178      Dave 		n = vsnprintf(p, e - p, format, ap);
   2255  11178      Dave 		va_end(ap);
   2256  11178      Dave 		*pp = p + n;
   2257  11178      Dave 	}
   2258  11178      Dave }
   2259  11178      Dave 
   2260  11178      Dave /*
   2261  11178      Dave  * Called when dumpadm(1M) configures dump parameters.
   2262  11178      Dave  */
   2263  11178      Dave void
   2264  11178      Dave kmem_dump_init(size_t size)
   2265  11178      Dave {
   2266  11178      Dave 	if (kmem_dump_start != NULL)
   2267  11178      Dave 		kmem_free(kmem_dump_start, kmem_dump_size);
   2268  11178      Dave 
   2269  11178      Dave 	if (kmem_dump_log == NULL)
   2270  11178      Dave 		kmem_dump_log = (kmem_dump_log_t *)kmem_zalloc(KMEM_DUMP_LOGS *
   2271  11178      Dave 		    sizeof (kmem_dump_log_t), KM_SLEEP);
   2272  11178      Dave 
   2273  11178      Dave 	kmem_dump_start = kmem_alloc(size, KM_SLEEP);
   2274  11178      Dave 
   2275  11178      Dave 	if (kmem_dump_start != NULL) {
   2276  11178      Dave 		kmem_dump_size = size;
   2277  11178      Dave 		kmem_dump_curr = kmem_dump_start;
   2278  11178      Dave 		kmem_dump_end = (void *)((char *)kmem_dump_start + size);
   2279  11178      Dave 		copy_pattern(KMEM_UNINITIALIZED_PATTERN, kmem_dump_start, size);
   2280  11178      Dave 	} else {
   2281  11178      Dave 		kmem_dump_size = 0;
   2282  11178      Dave 		kmem_dump_curr = NULL;
   2283  11178      Dave 		kmem_dump_end = NULL;
   2284  11178      Dave 	}
   2285  11178      Dave }
   2286  11178      Dave 
   2287  11178      Dave /*
   2288  11178      Dave  * Set flag for each kmem_cache_t if is safe to use alternate dump
   2289  11178      Dave  * memory. Called just before panic crash dump starts. Set the flag
   2290  11178      Dave  * for the calling CPU.
   2291  11178      Dave  */
   2292  11178      Dave void
   2293  11178      Dave kmem_dump_begin(void)
   2294  11178      Dave {
   2295  11178      Dave 	ASSERT(panicstr != NULL);
   2296  11178      Dave 	if (kmem_dump_start != NULL) {
   2297  11178      Dave 		kmem_cache_t *cp;
   2298  11178      Dave 
   2299  11178      Dave 		for (cp = list_head(&kmem_caches); cp != NULL;
   2300  11178      Dave 		    cp = list_next(&kmem_caches, cp)) {
   2301  11178      Dave 			kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
   2302  11178      Dave 
   2303  11178      Dave 			if (cp->cache_arena->vm_cflags & VMC_DUMPSAFE) {
   2304  11178      Dave 				cp->cache_flags |= KMF_DUMPDIVERT;
   2305  11178      Dave 				ccp->cc_flags |= KMF_DUMPDIVERT;
   2306  11178      Dave 				ccp->cc_dump_rounds = ccp->cc_rounds;
   2307  11178      Dave 				ccp->cc_dump_prounds = ccp->cc_prounds;
   2308  11178      Dave 				ccp->cc_rounds = ccp->cc_prounds = -1;
   2309  11178      Dave 			} else {
   2310  11178      Dave 				cp->cache_flags |= KMF_DUMPUNSAFE;
   2311  11178      Dave 				ccp->cc_flags |= KMF_DUMPUNSAFE;
   2312  11178      Dave 			}
   2313  11178      Dave 		}
   2314  11178      Dave 	}
   2315  11178      Dave }
   2316  11178      Dave 
   2317  11178      Dave /*
   2318  11178      Dave  * finished dump intercept
   2319  11178      Dave  * print any warnings on the console
   2320  11178      Dave  * return verbose information to dumpsys() in the given buffer
   2321  11178      Dave  */
   2322  11178      Dave size_t
   2323  11178      Dave kmem_dump_finish(char *buf, size_t size)
   2324  11178      Dave {
   2325  11178      Dave 	int kdi_idx;
   2326  11178      Dave 	int kdi_end = kmem_dump_log_idx;
   2327  11178      Dave 	int percent = 0;
   2328  11178      Dave 	int header = 0;
   2329  11178      Dave 	int warn = 0;
   2330  11178      Dave 	size_t used;
   2331  11178      Dave 	kmem_cache_t *cp;
   2332  11178      Dave 	kmem_dump_log_t *kdl;
   2333  11178      Dave 	char *e = buf + size;
   2334  11178      Dave 	char *p = buf;
   2335  11178      Dave 
   2336  11178      Dave 	if (kmem_dump_size == 0 || kmem_dump_verbose == 0)
   2337  11178      Dave 		return (0);
   2338  11178      Dave 
   2339  11178      Dave 	used = (char *)kmem_dump_curr - (char *)kmem_dump_start;
   2340  11178      Dave 	percent = (used * 100) / kmem_dump_size;
   2341  11178      Dave 
   2342  11178      Dave 	kmem_dumppr(&p, e, "%% heap used,%d\n", percent);
   2343  11178      Dave 	kmem_dumppr(&p, e, "used bytes,%ld\n", used);
   2344  11178      Dave 	kmem_dumppr(&p, e, "heap size,%ld\n", kmem_dump_size);
   2345  11178      Dave 	kmem_dumppr(&p, e, "Oversize allocs,%d\n",
   2346  11178      Dave 	    kmem_dump_oversize_allocs);
   2347  11178      Dave 	kmem_dumppr(&p, e, "Oversize max size,%ld\n",
   2348  11178      Dave 	    kmem_dump_oversize_max);
   2349  11178      Dave 
   2350  11178      Dave 	for (kdi_idx = 0; kdi_idx < kdi_end; kdi_idx++) {
   2351  11178      Dave 		kdl = &kmem_dump_log[kdi_idx];
   2352  11178      Dave 		cp = kdl->kdl_cache;
   2353  11178      Dave 		if (cp == NULL)
   2354  11178      Dave 			break;
   2355  11178      Dave 		if (kdl->kdl_alloc_fails)
   2356  11178      Dave 			++warn;
   2357  11178      Dave 		if (header == 0) {
   2358  11178      Dave 			kmem_dumppr(&p, e,
   2359  11178      Dave 			    "Cache Name,Allocs,Frees,Alloc Fails,"
   2360  11178      Dave 			    "Nondump Frees,Unsafe Allocs/Frees\n");
   2361  11178      Dave 			header = 1;
   2362  11178      Dave 		}
   2363  11178      Dave 		kmem_dumppr(&p, e, "%s,%d,%d,%d,%d,%d\n",
   2364  11178      Dave 		    cp->cache_name, kdl->kdl_allocs, kdl->kdl_frees,
   2365  11178      Dave 		    kdl->kdl_alloc_fails, kdl->kdl_free_nondump,
   2366  11178      Dave 		    kdl->kdl_unsafe);
   2367  11178      Dave 	}
   2368  11178      Dave 
   2369  11178      Dave 	/* return buffer size used */
   2370  11178      Dave 	if (p < e)
   2371  11178      Dave 		bzero(p, e - p);
   2372  11178      Dave 	return (p - buf);
   2373  11178      Dave }
   2374  11178      Dave 
   2375  11178      Dave /*
   2376  11178      Dave  * Allocate a constructed object from alternate dump memory.
   2377  11178      Dave  */
   2378  11178      Dave void *
   2379  11178      Dave kmem_cache_alloc_dump(kmem_cache_t *cp, int kmflag)
   2380  11178      Dave {
   2381  11178      Dave 	void *buf;
   2382  11178      Dave 	void *curr;
   2383  11178      Dave 	char *bufend;
   2384  11178      Dave 
   2385  11178      Dave 	/* return a constructed object */
   2386  11178      Dave 	if ((buf = cp->cache_dumpfreelist) != NULL) {
   2387  11178      Dave 		cp->cache_dumpfreelist = KMEM_DUMPCTL(cp, buf)->kdc_next;
   2388  11178      Dave 		KDI_LOG(cp, kdl_allocs);
   2389  11178      Dave 		return (buf);
   2390  11178      Dave 	}
   2391  11178      Dave 
   2392  11178      Dave 	/* create a new constructed object */
   2393  11178      Dave 	curr = kmem_dump_curr;
   2394  11178      Dave 	buf = (void *)P2ROUNDUP((uintptr_t)curr, cp->cache_align);
   2395  11178      Dave 	bufend = (char *)KMEM_DUMPCTL(cp, buf) + sizeof (kmem_dumpctl_t);
   2396  11178      Dave 
   2397  11178      Dave 	/* hat layer objects cannot cross a page boundary */
   2398  11178      Dave 	if (cp->cache_align < PAGESIZE) {
   2399  11178      Dave 		char *page = (char *)P2ROUNDUP((uintptr_t)buf, PAGESIZE);
   2400  11178      Dave 		if (bufend > page) {
   2401  11178      Dave 			bufend += page - (char *)buf;
   2402  11178      Dave 			buf = (void *)page;
   2403  11178      Dave 		}
   2404  11178      Dave 	}
   2405  11178      Dave 
   2406  11178      Dave 	/* fall back to normal alloc if reserved area is used up */
   2407  11178      Dave 	if (bufend > (char *)kmem_dump_end) {
   2408  11178      Dave 		kmem_dump_curr = kmem_dump_end;
   2409  11178      Dave 		KDI_LOG(cp, kdl_alloc_fails);
   2410  11178      Dave 		return (NULL);
   2411  11178      Dave 	}
   2412  11178      Dave 
   2413  11178      Dave 	/*
   2414  11178      Dave 	 * Must advance curr pointer before calling a constructor that
   2415  11178      Dave 	 * may also allocate memory.
   2416  11178      Dave 	 */
   2417  11178      Dave 	kmem_dump_curr = bufend;
   2418  11178      Dave 
   2419  11178      Dave 	/* run constructor */
   2420  11178      Dave 	if (cp->cache_constructor != NULL &&
   2421  11178      Dave 	    cp->cache_constructor(buf, cp->cache_private, kmflag)
   2422  11178      Dave 	    != 0) {
   2423  11178      Dave #ifdef DEBUG
   2424  11178      Dave 		printf("name='%s' cache=0x%p: kmem cache constructor failed\n",
   2425  11178      Dave 		    cp->cache_name, (void *)cp);
   2426  11178      Dave #endif
   2427  11178      Dave 		/* reset curr pointer iff no allocs were done */
   2428  11178      Dave 		if (kmem_dump_curr == bufend)
   2429  11178      Dave 			kmem_dump_curr = curr;
   2430  11178      Dave 
   2431  11178      Dave 		/* fall back to normal alloc if the constructor fails */
   2432  11178      Dave 		KDI_LOG(cp, kdl_alloc_fails);
   2433  11178      Dave 		return (NULL);
   2434  11178      Dave 	}
   2435  11178      Dave 
   2436  11178      Dave 	KDI_LOG(cp, kdl_allocs);
   2437  11178      Dave 	return (buf);
   2438  11178      Dave }
   2439  11178      Dave 
   2440  11178      Dave /*
   2441  11178      Dave  * Free a constructed object in alternate dump memory.
   2442  11178      Dave  */
   2443  11178      Dave int
   2444  11178      Dave kmem_cache_free_dump(kmem_cache_t *cp, void *buf)
   2445  11178      Dave {
   2446  11178      Dave 	/* save constructed buffers for next time */
   2447  11178      Dave 	if ((char *)buf >= (char *)kmem_dump_start &&
   2448  11178      Dave 	    (char *)buf < (char *)kmem_dump_end) {
   2449  11178      Dave 		KMEM_DUMPCTL(cp, buf)->kdc_next = cp->cache_dumpfreelist;
   2450  11178      Dave 		cp->cache_dumpfreelist = buf;
   2451  11178      Dave 		KDI_LOG(cp, kdl_frees);
   2452  11178      Dave 		return (0);
   2453  11178      Dave 	}
   2454  11178      Dave 
   2455  11178      Dave 	/* count all non-dump buf frees */
   2456  11178      Dave 	KDI_LOG(cp, kdl_free_nondump);
   2457  11178      Dave 
   2458  11178      Dave 	/* just drop buffers that were allocated before dump started */
   2459  11178      Dave 	if (kmem_dump_curr < kmem_dump_end)
   2460  11178      Dave 		return (0);
   2461  11178      Dave 
   2462  11178      Dave 	/* fall back to normal free if reserved area is used up */
   2463  11178      Dave 	return (1);
   2464  11178      Dave }
   2465  11178      Dave 
   2466  11178      Dave /*
   2467      0    stevel  * Allocate a constructed object from cache cp.
   2468      0    stevel  */
   2469      0    stevel void *
   2470      0    stevel kmem_cache_alloc(kmem_cache_t *cp, int kmflag)
   2471      0    stevel {
   2472      0    stevel 	kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
   2473      0    stevel 	kmem_magazine_t *fmp;
   2474      0    stevel 	void *buf;
   2475      0    stevel 
   2476      0    stevel 	mutex_enter(&ccp->cc_lock);
   2477      0    stevel 	for (;;) {
   2478      0    stevel 		/*
   2479      0    stevel 		 * If there's an object available in the current CPU's
   2480      0    stevel 		 * loaded magazine, just take it and return.
   2481      0    stevel 		 */
   2482      0    stevel 		if (ccp->cc_rounds > 0) {
   2483      0    stevel 			buf = ccp->cc_loaded->mag_round[--ccp->cc_rounds];
   2484      0    stevel 			ccp->cc_alloc++;
   2485      0    stevel 			mutex_exit(&ccp->cc_lock);
   2486  11178      Dave 			if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPUNSAFE)) {
   2487  11178      Dave 				if (ccp->cc_flags & KMF_DUMPUNSAFE) {
   2488  11178      Dave 					ASSERT(!(ccp->cc_flags &
   2489  11178      Dave 					    KMF_DUMPDIVERT));
   2490  11178      Dave 					KDI_LOG(cp, kdl_unsafe);
   2491  11178      Dave 				}
   2492  11178      Dave 				if ((ccp->cc_flags & KMF_BUFTAG) &&
   2493  11178      Dave 				    kmem_cache_alloc_debug(cp, buf, kmflag, 0,
   2494  11178      Dave 				    caller()) != 0) {
   2495  11178      Dave 					if (kmflag & KM_NOSLEEP)
   2496  11178      Dave 						return (NULL);
   2497  11178      Dave 					mutex_enter(&ccp->cc_lock);
   2498  11178      Dave 					continue;
   2499  11178      Dave 				}
   2500      0    stevel 			}
   2501      0    stevel 			return (buf);
   2502      0    stevel 		}
   2503      0    stevel 
   2504      0    stevel 		/*
   2505      0    stevel 		 * The loaded magazine is empty.  If the previously loaded
   2506      0    stevel 		 * magazine was full, exchange them and try again.
   2507      0    stevel 		 */
   2508      0    stevel 		if (ccp->cc_prounds > 0) {
   2509      0    stevel 			kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
   2510      0    stevel 			continue;
   2511  11178      Dave 		}
   2512  11178      Dave 
   2513  11178      Dave 		/*
   2514  11178      Dave 		 * Return an alternate buffer at dump time to preserve
   2515  11178      Dave 		 * the heap.
   2516  11178      Dave 		 */
   2517  11178      Dave 		if (ccp->cc_flags & (KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) {
   2518  11178      Dave 			if (ccp->cc_flags & KMF_DUMPUNSAFE) {
   2519  11178      Dave 				ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT));
   2520  11178      Dave 				/* log it so that we can warn about it */
   2521  11178      Dave 				KDI_LOG(cp, kdl_unsafe);
   2522  11178      Dave 			} else {
   2523  11178      Dave 				if ((buf = kmem_cache_alloc_dump(cp, kmflag)) !=
   2524  11178      Dave 				    NULL) {
   2525  11178      Dave 					mutex_exit(&ccp->cc_lock);
   2526  11178      Dave 					return (buf);
   2527  11178      Dave 				}
   2528  11178      Dave 				break;		/* fall back to slab layer */
   2529  11178      Dave 			}
   2530      0    stevel 		}
   2531      0    stevel 
   2532      0    stevel 		/*
   2533      0    stevel 		 * If the magazine layer is disabled, break out now.
   2534      0    stevel 		 */
   2535      0    stevel 		if (ccp->cc_magsize == 0)
   2536      0    stevel 			break;
   2537      0    stevel 
   2538      0    stevel 		/*
   2539      0    stevel 		 * Try to get a full magazine from the depot.
   2540      0    stevel 		 */
   2541      0    stevel 		fmp = kmem_depot_alloc(cp, &cp->cache_full);
   2542      0    stevel 		if (fmp != NULL) {
   2543      0    stevel 			if (ccp->cc_ploaded != NULL)
   2544      0    stevel 				kmem_depot_free(cp, &cp->cache_empty,
   2545      0    stevel 				    ccp->cc_ploaded);
   2546      0    stevel 			kmem_cpu_reload(ccp, fmp, ccp->cc_magsize);
   2547      0    stevel 			continue;
   2548      0    stevel 		}
   2549      0    stevel 
   2550      0    stevel 		/*
   2551      0    stevel 		 * There are no full magazines in the depot,
   2552      0    stevel 		 * so fall through to the slab layer.
   2553      0    stevel 		 */
   2554      0    stevel 		break;
   2555      0    stevel 	}
   2556      0    stevel 	mutex_exit(&ccp->cc_lock);
   2557      0    stevel 
   2558      0    stevel 	/*
   2559      0    stevel 	 * We couldn't allocate a constructed object from the magazine layer,
   2560      0    stevel 	 * so get a raw buffer from the slab layer and apply its constructor.
   2561      0    stevel 	 */
   2562      0    stevel 	buf = kmem_slab_alloc(cp, kmflag);
   2563      0    stevel 
   2564      0    stevel 	if (buf == NULL)
   2565      0    stevel 		return (NULL);
   2566      0    stevel 
   2567      0    stevel 	if (cp->cache_flags & KMF_BUFTAG) {
   2568      0    stevel 		/*
   2569      0    stevel 		 * Make kmem_cache_alloc_debug() apply the constructor for us.
   2570      0    stevel 		 */
   2571   6712     tomee 		int rc = kmem_cache_alloc_debug(cp, buf, kmflag, 1, caller());
   2572   6712     tomee 		if (rc != 0) {
   2573      0    stevel 			if (kmflag & KM_NOSLEEP)
   2574      0    stevel 				return (NULL);
   2575      0    stevel 			/*
   2576      0    stevel 			 * kmem_cache_alloc_debug() detected corruption
   2577   6712     tomee 			 * but didn't panic (kmem_panic <= 0). We should not be
   2578   6712     tomee 			 * here because the constructor failed (indicated by a
   2579   6712     tomee 			 * return code of 1). Try again.
   2580   6712     tomee 			 */
   2581   6712     tomee 			ASSERT(rc == -1);
   2582      0    stevel 			return (kmem_cache_alloc(cp, kmflag));
   2583      0    stevel 		}
   2584      0    stevel 		return (buf);
   2585      0    stevel 	}
   2586      0    stevel 
   2587      0    stevel 	if (cp->cache_constructor != NULL &&
   2588      0    stevel 	    cp->cache_constructor(buf, cp->cache_private, kmflag) != 0) {
   2589      0    stevel 		atomic_add_64(&cp->cache_alloc_fail, 1);
   2590      0    stevel 		kmem_slab_free(cp, buf);
   2591      0    stevel 		return (NULL);
   2592      0    stevel 	}
   2593      0    stevel 
   2594      0    stevel 	return (buf);
   2595      0    stevel }
   2596      0    stevel 
   2597      0    stevel /*
   2598   6712     tomee  * The freed argument tells whether or not kmem_cache_free_debug() has already
   2599   6712     tomee  * been called so that we can avoid the duplicate free error. For example, a
   2600   6712     tomee  * buffer on a magazine has already been freed by the client but is still
   2601   6712     tomee  * constructed.
   2602   6712     tomee  */
   2603   6712     tomee static void
   2604   6712     tomee kmem_slab_free_constructed(kmem_cache_t *cp, void *buf, boolean_t freed)
   2605   6712     tomee {
   2606   6712     tomee 	if (!freed && (cp->cache_flags & KMF_BUFTAG))
   2607   6712     tomee 		if (kmem_cache_free_debug(cp, buf, caller()) == -1)
   2608   6712     tomee 			return;
   2609   6712     tomee 
   2610   6712     tomee 	/*
   2611   6712     tomee 	 * Note that if KMF_DEADBEEF is in effect and KMF_LITE is not,
   2612   6712     tomee 	 * kmem_cache_free_debug() will have already applied the destructor.
   2613   6712     tomee 	 */
   2614   6712     tomee 	if ((cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) != KMF_DEADBEEF &&
   2615   6712     tomee 	    cp->cache_destructor != NULL) {
   2616   6712     tomee 		if (cp->cache_flags & KMF_DEADBEEF) {	/* KMF_LITE implied */
   2617   6712     tomee 			kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
   2618   6712     tomee 			*(uint64_t *)buf = btp->bt_redzone;
   2619   6712     tomee 			cp->cache_destructor(buf, cp->cache_private);
   2620   6712     tomee 			*(uint64_t *)buf = KMEM_FREE_PATTERN;
   2621   6712     tomee 		} else {
   2622   6712     tomee 			cp->cache_destructor(buf, cp->cache_private);
   2623   6712     tomee 		}
   2624   6712     tomee 	}
   2625   6712     tomee 
   2626   6712     tomee 	kmem_slab_free(cp, buf);
   2627   6712     tomee }
   2628   6712     tomee 
   2629   6712     tomee /*
   2630      0    stevel  * Free a constructed object to cache cp.
   2631      0    stevel  */
   2632      0    stevel void
   2633      0    stevel kmem_cache_free(kmem_cache_t *cp, void *buf)
   2634      0    stevel {
   2635      0    stevel 	kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
   2636      0    stevel 	kmem_magazine_t *emp;
   2637      0    stevel 	kmem_magtype_t *mtp;
   2638   6712     tomee 
   2639   6712     tomee 	/*
   2640   6712     tomee 	 * The client must not free either of the buffers passed to the move
   2641   6712     tomee 	 * callback function.
   2642   6712     tomee 	 */
   2643   6712     tomee 	ASSERT(cp->cache_defrag == NULL ||
   2644   6712     tomee 	    cp->cache_defrag->kmd_thread != curthread ||
   2645   6712     tomee 	    (buf != cp->cache_defrag->kmd_from_buf &&
   2646   6712     tomee 	    buf != cp->cache_defrag->kmd_to_buf));
   2647      0    stevel 
   2648  11178      Dave 	if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) {
   2649  11178      Dave 		if (ccp->cc_flags & KMF_DUMPUNSAFE) {
   2650  11178      Dave 			ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT));
   2651  11178      Dave 			/* log it so that we can warn about it */
   2652  11178      Dave 			KDI_LOG(cp, kdl_unsafe);
   2653  11178      Dave 		} else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, buf)) {
   2654  11178      Dave 			return;
   2655  11178      Dave 		}
   2656  11178      Dave 		if (ccp->cc_flags & KMF_BUFTAG) {
   2657  11178      Dave 			if (kmem_cache_free_debug(cp, buf, caller()) == -1)
   2658  11178      Dave 				return;
   2659  11178      Dave 		}
   2660  11178      Dave 	}
   2661      0    stevel 
   2662      0    stevel 	mutex_enter(&ccp->cc_lock);
   2663      0    stevel 	for (;;) {
   2664      0    stevel 		/*
   2665      0    stevel 		 * If there's a slot available in the current CPU's
   2666      0    stevel 		 * loaded magazine, just put the object there and return.
   2667      0    stevel 		 */
   2668      0    stevel 		if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) {
   2669      0    stevel 			ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf;
   2670      0    stevel 			ccp->cc_free++;
   2671      0    stevel 			mutex_exit(&ccp->cc_lock);
   2672      0    stevel 			return;
   2673      0    stevel 		}
   2674      0    stevel 
   2675      0    stevel 		/*
   2676      0    stevel 		 * The loaded magazine is full.  If the previously loaded
   2677      0    stevel 		 * magazine was empty, exchange them and try again.
   2678      0    stevel 		 */
   2679      0    stevel 		if (ccp->cc_prounds == 0) {
   2680      0    stevel 			kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
   2681      0    stevel 			continue;
   2682      0    stevel 		}
   2683      0    stevel 
   2684      0    stevel 		/*
   2685      0    stevel 		 * If the magazine layer is disabled, break out now.
   2686      0    stevel 		 */
   2687      0    stevel 		if (ccp->cc_magsize == 0)
   2688      0    stevel 			break;
   2689      0    stevel 
   2690      0    stevel 		/*
   2691      0    stevel 		 * Try to get an empty magazine from the depot.
   2692      0    stevel 		 */
   2693      0    stevel 		emp = kmem_depot_alloc(cp, &cp->cache_empty);
   2694      0    stevel 		if (emp != NULL) {
   2695      0    stevel 			if (ccp->cc_ploaded != NULL)
   2696      0    stevel 				kmem_depot_free(cp, &cp->cache_full,
   2697      0    stevel 				    ccp->cc_ploaded);
   2698      0    stevel 			kmem_cpu_reload(ccp, emp, 0);
   2699      0    stevel 			continue;
   2700      0    stevel 		}
   2701      0    stevel 
   2702      0    stevel 		/*
   2703      0    stevel 		 * There are no empty magazines in the depot,
   2704      0    stevel 		 * so try to allocate a new one.  We must drop all locks
   2705      0    stevel 		 * across kmem_cache_alloc() because lower layers may
   2706      0    stevel 		 * attempt to allocate from this cache.
   2707      0    stevel 		 */
   2708      0    stevel 		mtp = cp->cache_magtype;
   2709      0    stevel 		mutex_exit(&ccp->cc_lock);
   2710      0    stevel 		emp = kmem_cache_alloc(mtp->mt_cache, KM_NOSLEEP);
   2711      0    stevel 		mutex_enter(&ccp->cc_lock);
   2712      0    stevel 
   2713      0    stevel 		if (emp != NULL) {
   2714      0    stevel 			/*
   2715      0    stevel 			 * We successfully allocated an empty magazine.
   2716      0    stevel 			 * However, we had to drop ccp->cc_lock to do it,
   2717      0    stevel 			 * so the cache's magazine size may have changed.
   2718      0    stevel 			 * If so, free the magazine and try again.
   2719      0    stevel 			 */
   2720      0    stevel 			if (ccp->cc_magsize != mtp->mt_magsize) {
   2721      0    stevel 				mutex_exit(&ccp->cc_lock);
   2722      0    stevel 				kmem_cache_free(mtp->mt_cache, emp);
   2723      0    stevel 				mutex_enter(&ccp->cc_lock);
   2724      0    stevel 				continue;
   2725      0    stevel 			}
   2726      0    stevel 
   2727      0    stevel 			/*
   2728      0    stevel 			 * We got a magazine of the right size.  Add it to
   2729      0    stevel 			 * the depot and try the whole dance again.
   2730      0    stevel 			 */
   2731      0    stevel 			kmem_depot_free(cp, &cp->cache_empty, emp);
   2732      0    stevel 			continue;
   2733      0    stevel 		}
   2734      0    stevel 
   2735      0    stevel 		/*
   2736      0    stevel 		 * We couldn't allocate an empty magazine,
   2737      0    stevel 		 * so fall through to the slab layer.
   2738      0    stevel 		 */
   2739      0    stevel 		break;
   2740      0    stevel 	}
   2741      0    stevel 	mutex_exit(&ccp->cc_lock);
   2742      0    stevel 
   2743      0    stevel 	/*
   2744      0    stevel 	 * We couldn't free our constructed object to the magazine layer,
   2745      0    stevel 	 * so apply its destructor and free it to the slab layer.
   2746   6712     tomee 	 */
   2747   6712     tomee 	kmem_slab_free_constructed(cp, buf, B_TRUE);
   2748      0    stevel }
   2749      0    stevel 
   2750      0    stevel void *
   2751      0    stevel kmem_zalloc(size_t size, int kmflag)
   2752      0    stevel {
   2753   9367  Jonathan 	size_t index;
   2754   9367  Jonathan 	void *buf;
   2755   9367  Jonathan 
   2756   9367  Jonathan 	if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) {
   2757      0    stevel 		kmem_cache_t *cp = kmem_alloc_table[index];
   2758      0    stevel 		buf = kmem_cache_alloc(cp, kmflag);
   2759      0    stevel 		if (buf != NULL) {
   2760  11178      Dave 			if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) {
   2761      0    stevel 				kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
   2762      0    stevel 				((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
   2763      0    stevel 				((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
   2764      0    stevel 
   2765      0    stevel 				if (cp->cache_flags & KMF_LITE) {
   2766      0    stevel 					KMEM_BUFTAG_LITE_ENTER(btp,
   2767      0    stevel 					    kmem_lite_count, caller());
   2768      0    stevel 				}
   2769      0    stevel 			}
   2770      0    stevel 			bzero(buf, size);
   2771      0    stevel 		}
   2772      0    stevel 	} else {
   2773      0    stevel 		buf = kmem_alloc(size, kmflag);
   2774      0    stevel 		if (buf != NULL)
   2775      0    stevel 			bzero(buf, size);
   2776      0    stevel 	}
   2777      0    stevel 	return (buf);
   2778      0    stevel }
   2779      0    stevel 
   2780      0    stevel void *
   2781      0    stevel kmem_alloc(size_t size, int kmflag)
   2782      0    stevel {
   2783   9367  Jonathan 	size_t index;
   2784   9367  Jonathan 	kmem_cache_t *cp;
   2785   9367  Jonathan 	void *buf;
   2786   9367  Jonathan 
   2787   9367  Jonathan 	if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) {
   2788   9367  Jonathan 		cp = kmem_alloc_table[index];
   2789   9367  Jonathan 		/* fall through to kmem_cache_alloc() */
   2790   9367  Jonathan 
   2791   9367  Jonathan 	} else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) <
   2792   9367  Jonathan 	    kmem_big_alloc_table_max) {
   2793   9367  Jonathan 		cp = kmem_big_alloc_table[index];
   2794   9367  Jonathan 		/* fall through to kmem_cache_alloc() */
   2795   9367  Jonathan 
   2796   9367  Jonathan 	} else {
   2797   9367  Jonathan 		if (size == 0)
   2798   9367  Jonathan 			return (NULL);
   2799   9367  Jonathan 
   2800   9367  Jonathan 		buf = vmem_alloc(kmem_oversize_arena, size,
   2801   9367  Jonathan 		    kmflag & KM_VMFLAGS);
   2802   9367  Jonathan 		if (buf == NULL)
   2803   9367  Jonathan 			kmem_log_event(kmem_failure_log, NULL, NULL,
   2804   9367  Jonathan 			    (void *)size);
   2805  11178      Dave 		else if (KMEM_DUMP(kmem_slab_cache)) {
   2806  11178      Dave 			/* stats for dump intercept */
   2807  11178      Dave 			kmem_dump_oversize_allocs++;
   2808  11178      Dave 			if (size > kmem_dump_oversize_max)
   2809  11178      Dave 				kmem_dump_oversize_max = size;
   2810  11178      Dave 		}
   2811      0    stevel 		return (buf);
   2812      0    stevel 	}
   2813   9367  Jonathan 
   2814   9367  Jonathan 	buf = kmem_cache_alloc(cp, kmflag);
   2815  11178      Dave 	if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp) && buf != NULL) {
   2816   9367  Jonathan 		kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
   2817   9367  Jonathan 		((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
   2818   9367  Jonathan 		((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
   2819   9367  Jonathan 
   2820   9367  Jonathan 		if (cp->cache_flags & KMF_LITE) {
   2821   9367  Jonathan 			KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller());
   2822   9367  Jonathan 		}
   2823   9367  Jonathan 	}
   2824      0    stevel 	return (buf);
   2825      0    stevel }
   2826      0    stevel 
   2827      0    stevel void
   2828      0    stevel kmem_free(void *buf, size_t size)
   2829      0    stevel {
   2830   9367  Jonathan 	size_t index;
   2831   9367  Jonathan 	kmem_cache_t *cp;
   2832   9367  Jonathan 
   2833   9367  Jonathan 	if ((index = (size - 1) >> KMEM_ALIGN_SHIFT) < KMEM_ALLOC_TABLE_MAX) {
   2834   9367  Jonathan 		cp = kmem_alloc_table[index];
   2835   9367  Jonathan 		/* fall through to kmem_cache_free() */
   2836   9367  Jonathan 
   2837   9367  Jonathan 	} else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) <
   2838   9367  Jonathan 	    kmem_big_alloc_table_max) {
   2839   9367  Jonathan 		cp = kmem_big_alloc_table[index];
   2840   9367  Jonathan 		/* fall through to kmem_cache_free() */
   2841   9367  Jonathan 
   2842   9367  Jonathan 	} else {
   2843   9367  Jonathan 		if (buf == NULL && size == 0)
   2844   9367  Jonathan 			return;
   2845   9367  Jonathan 		vmem_free(kmem_oversize_arena, buf, size);
   2846   9367  Jonathan 		return;
   2847   9367  Jonathan 	}
   2848   9367  Jonathan 
   2849  11178      Dave 	if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) {
   2850   9367  Jonathan 		kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
   2851   9367  Jonathan 		uint32_t *ip = (uint32_t *)btp;
   2852   9367  Jonathan 		if (ip[1] != KMEM_SIZE_ENCODE(size)) {
   2853   9367  Jonathan 			if (*(uint64_t *)buf == KMEM_FREE_PATTERN) {
   2854   9367  Jonathan 				kmem_error(KMERR_DUPFREE, cp, buf);
   2855      0    stevel 				return;
   2856      0    stevel 			}
   2857   9367  Jonathan 			if (KMEM_SIZE_VALID(ip[1])) {
   2858   9367  Jonathan 				ip[0] = KMEM_SIZE_ENCODE(size);
   2859   9367  Jonathan 				kmem_error(KMERR_BADSIZE, cp, buf);
   2860   9367  Jonathan 			} else {
   2861      0    stevel 				kmem_error(KMERR_REDZONE, cp, buf);
   2862   9367  Jonathan 			}
   2863   9367  Jonathan 			return;
   2864   9367  Jonathan 		}
   2865   9367  Jonathan 		if (((uint8_t *)buf)[size] != KMEM_REDZONE_BYTE) {
   2866   9367  Jonathan 			kmem_error(KMERR_REDZONE, cp, buf);
   2867   9367  Jonathan 			return;
   2868   9367  Jonathan 		}
   2869   9367  Jonathan 		btp->bt_redzone = KMEM_REDZONE_PATTERN;
   2870   9367  Jonathan 		if (cp->cache_flags & KMF_LITE) {
   2871   9367  Jonathan 			KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count,
   2872   9367  Jonathan 			    caller());
   2873   9367  Jonathan 		}
   2874   9367  Jonathan 	}
   2875   9367  Jonathan 	kmem_cache_free(cp, buf);
   2876      0    stevel }
   2877      0    stevel 
   2878      0    stevel void *
   2879      0    stevel kmem_firewall_va_alloc(vmem_t *vmp, size_t size, int vmflag)
   2880      0    stevel {
   2881      0    stevel 	size_t realsize = size + vmp->vm_quantum;
   2882      0    stevel 	void *addr;
   2883      0    stevel 
   2884      0    stevel 	/*
   2885      0    stevel 	 * Annoying edge case: if 'size' is just shy of ULONG_MAX, adding
   2886      0    stevel 	 * vm_quantum will cause integer wraparound.  Check for this, and
   2887      0    stevel 	 * blow off the firewall page in this case.  Note that such a
   2888      0    stevel 	 * giant allocation (the entire kernel address space) can never
   2889      0    stevel 	 * be satisfied, so it will either fail immediately (VM_NOSLEEP)
   2890      0    stevel 	 * or sleep forever (VM_SLEEP).  Thus, there is no need for a
   2891      0    stevel 	 * corresponding check in kmem_firewall_va_free().
   2892      0    stevel 	 */
   2893      0    stevel 	if (realsize < size)
   2894      0    stevel 		realsize = size;
   2895      0    stevel 
   2896      0    stevel 	/*
   2897      0    stevel 	 * While boot still owns resource management, make sure that this
   2898      0    stevel 	 * redzone virtual address allocation is properly accounted for in
   2899      0    stevel 	 * OBPs "virtual-memory" "available" lists because we're
   2900      0    stevel 	 * effectively claiming them for a red zone.  If we don't do this,
   2901      0    stevel 	 * the available lists become too fragmented and too large for the
   2902      0    stevel 	 * current boot/kernel memory list interface.
   2903      0    stevel 	 */
   2904      0    stevel 	addr = vmem_alloc(vmp, realsize, vmflag | VM_NEXTFIT);
   2905      0    stevel 
   2906      0    stevel 	if (addr != NULL && kvseg.s_base == NULL && realsize != size)
   2907      0    stevel 		(void) boot_virt_alloc((char *)addr + size, vmp->vm_quantum);
   2908      0    stevel 
   2909      0    stevel 	return (addr);
   2910      0    stevel }
   2911      0    stevel 
   2912      0    stevel void
   2913      0    stevel kmem_firewall_va_free(vmem_t *vmp, void *addr, size_t size)
   2914      0    stevel {
   2915      0    stevel 	ASSERT((kvseg.s_base == NULL ?
   2916      0    stevel 	    va_to_pfn((char *)addr + size) :
   2917      0    stevel 	    hat_getpfnum(kas.a_hat, (caddr_t)addr + size)) == PFN_INVALID);
   2918      0    stevel 
   2919      0    stevel 	vmem_free(vmp, addr, size + vmp->vm_quantum);
   2920      0    stevel }
   2921      0    stevel 
   2922      0    stevel /*
   2923      0    stevel  * Try to allocate at least `size' bytes of memory without sleeping or
   2924      0    stevel  * panicking. Return actual allocated size in `asize'. If allocation failed,
   2925      0    stevel  * try final allocation with sleep or panic allowed.
   2926      0    stevel  */
   2927      0    stevel void *
   2928      0    stevel kmem_alloc_tryhard(size_t size, size_t *asize, int kmflag)
   2929      0    stevel {
   2930      0    stevel 	void *p;
   2931      0    stevel 
   2932      0    stevel 	*asize = P2ROUNDUP(size, KMEM_ALIGN);
   2933      0    stevel 	do {
   2934      0    stevel 		p = kmem_alloc(*asize, (kmflag | KM_NOSLEEP) & ~KM_PANIC);
   2935      0    stevel 		if (p != NULL)
   2936      0    stevel 			return (p);
   2937      0    stevel 		*asize += KMEM_ALIGN;
   2938      0    stevel 	} while (*asize <= PAGESIZE);
   2939      0    stevel 
   2940      0    stevel 	*asize = P2ROUNDUP(size, KMEM_ALIGN);
   2941      0    stevel 	return (kmem_alloc(*asize, kmflag));
   2942      0    stevel }
   2943      0    stevel 
   2944      0    stevel /*
   2945      0    stevel  * Reclaim all unused memory from a cache.
   2946      0    stevel  */
   2947      0    stevel static void
   2948      0    stevel kmem_cache_reap(kmem_cache_t *cp)
   2949      0    stevel {
   2950   6712     tomee 	ASSERT(taskq_member(kmem_taskq, curthread));
   2951  10217       Tom 	cp->cache_reap++;
   2952   6712     tomee 
   2953      0    stevel 	/*
   2954      0    stevel 	 * Ask the cache's owner to free some memory if possible.
   2955      0    stevel 	 * The idea is to handle things like the inode cache, which
   2956      0    stevel 	 * typically sits on a bunch of memory that it doesn't truly
   2957      0    stevel 	 * *need*.  Reclaim policy is entirely up to the owner; this
   2958      0    stevel 	 * callback is just an advisory plea for help.
   2959      0    stevel 	 */
   2960   6712     tomee 	if (cp->cache_reclaim != NULL) {
   2961   6712     tomee 		long delta;
   2962   6712     tomee 
   2963   6712     tomee 		/*
   2964   6712     tomee 		 * Reclaimed memory should be reapable (not included in the
   2965   6712     tomee 		 * depot's working set).
   2966   6712     tomee 		 */
   2967   6712     tomee 		delta = cp->cache_full.ml_total;
   2968      0    stevel 		cp->cache_reclaim(cp->cache_private);
   2969   6712     tomee 		delta = cp->cache_full.ml_total - delta;
   2970   6712     tomee 		if (delta > 0) {
   2971   6712     tomee 			mutex_enter(&cp->cache_depot_lock);
   2972   6712     tomee 			cp->cache_full.ml_reaplimit += delta;
   2973   6712     tomee 			cp->cache_full.ml_min += delta;
   2974   6712     tomee 			mutex_exit(&cp->cache_depot_lock);
   2975   6712     tomee 		}
   2976   6712     tomee 	}
   2977      0    stevel 
   2978      0    stevel 	kmem_depot_ws_reap(cp);
   2979   6712     tomee 
   2980   6712     tomee 	if (cp->cache_defrag != NULL && !kmem_move_noreap) {
   2981   6712     tomee 		kmem_cache_defrag(cp);
   2982   6712     tomee 	}
   2983      0    stevel }
   2984      0    stevel 
   2985      0    stevel static void
   2986      0    stevel kmem_reap_timeout(void *flag_arg)
   2987      0    stevel {
   2988      0    stevel 	uint32_t *flag = (uint32_t *)flag_arg;
   2989      0    stevel 
   2990      0    stevel 	ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace);
   2991      0    stevel 	*flag = 0;
   2992      0    stevel }
   2993      0    stevel 
   2994      0    stevel static void
   2995      0    stevel kmem_reap_done(void *flag)
   2996      0    stevel {
   2997      0    stevel 	(void) timeout(kmem_reap_timeout, flag, kmem_reap_interval);
   2998      0    stevel }
   2999      0    stevel 
   3000      0    stevel static void
   3001      0    stevel kmem_reap_start(void *flag)
   3002      0    stevel {
   3003      0    stevel 	ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace);
   3004      0    stevel 
   3005      0    stevel 	if (flag == &kmem_reaping) {
   3006      0    stevel 		kmem_cache_applyall(kmem_cache_reap, kmem_taskq, TQ_NOSLEEP);
   3007      0    stevel 		/*
   3008      0    stevel 		 * if we have segkp under heap, reap segkp cache.
   3009      0    stevel 		 */
   3010      0    stevel 		if (segkp_fromheap)
   3011      0    stevel 			segkp_cache_free();
   3012      0    stevel 	}
   3013      0    stevel 	else
   3014      0    stevel 		kmem_cache_applyall_id(kmem_cache_reap, kmem_taskq, TQ_NOSLEEP);
   3015      0    stevel 
   3016      0    stevel 	/*
   3017      0    stevel 	 * We use taskq_dispatch() to schedule a timeout to clear
   3018      0    stevel 	 * the flag so that kmem_reap() becomes self-throttling:
   3019      0    stevel 	 * we won't reap again until the current reap completes *and*
   3020      0    stevel 	 * at least kmem_reap_interval ticks have elapsed.
   3021      0    stevel 	 */
   3022      0    stevel 	if (!taskq_dispatch(kmem_taskq, kmem_reap_done, flag, TQ_NOSLEEP))
   3023      0    stevel 		kmem_reap_done(flag);
   3024      0    stevel }
   3025      0    stevel 
   3026      0    stevel static void
   3027      0    stevel kmem_reap_common(void *flag_arg)
   3028      0    stevel {
   3029      0    stevel 	uint32_t *flag = (uint32_t *)flag_arg;
   3030      0    stevel 
   3031      0    stevel 	if (MUTEX_HELD(&kmem_cache_lock) || kmem_taskq == NULL ||
   3032      0    stevel 	    cas32(flag, 0, 1) != 0)
   3033      0    stevel 		return;
   3034      0    stevel 
   3035      0    stevel 	/*
   3036      0    stevel 	 * It may not be kosher to do memory allocation when a reap is called
   3037      0    stevel 	 * is called (for example, if vmem_populate() is in the call chain).
   3038      0    stevel 	 * So we start the reap going with a TQ_NOALLOC dispatch.  If the
   3039      0    stevel 	 * dispatch fails, we reset the flag, and the next reap will try again.
   3040      0    stevel 	 */
   3041      0    stevel 	if (!taskq_dispatch(kmem_taskq, kmem_reap_start, flag, TQ_NOALLOC))
   3042      0    stevel 		*flag = 0;
   3043      0    stevel }
   3044      0    stevel 
   3045      0    stevel /*
   3046      0    stevel  * Reclaim all unused memory from all caches.  Called from the VM system
   3047      0    stevel  * when memory gets tight.
   3048      0    stevel  */
   3049      0    stevel void
   3050      0    stevel kmem_reap(void)
   3051      0    stevel {
   3052      0    stevel 	kmem_reap_common(&kmem_reaping);
   3053      0    stevel }
   3054      0    stevel 
   3055      0    stevel /*
   3056      0    stevel  * Reclaim all unused memory from identifier arenas, called when a vmem
   3057      0    stevel  * arena not back by memory is exhausted.  Since reaping memory-backed caches
   3058      0    stevel  * cannot help with identifier exhaustion, we avoid both a large amount of
   3059      0    stevel  * work and unwanted side-effects from reclaim callbacks.
   3060      0    stevel  */
   3061      0    stevel void
   3062      0    stevel kmem_reap_idspace(void)
   3063      0    stevel {
   3064      0    stevel 	kmem_reap_common(&kmem_reaping_idspace);
   3065      0    stevel }
   3066      0    stevel 
   3067      0    stevel /*
   3068      0    stevel  * Purge all magazines from a cache and set its magazine limit to zero.
   3069      0    stevel  * All calls are serialized by the kmem_taskq lock, except for the final
   3070      0    stevel  * call from kmem_cache_destroy().
   3071      0    stevel  */
   3072      0    stevel static void
   3073      0    stevel kmem_cache_magazine_purge(kmem_cache_t *cp)
   3074      0    stevel {
   3075      0    stevel 	kmem_cpu_cache_t *ccp;
   3076      0    stevel 	kmem_magazine_t *mp, *pmp;
   3077      0    stevel 	int rounds, prounds, cpu_seqid;
   3078      0    stevel 
   3079   6712     tomee 	ASSERT(!list_link_active(&cp->cache_link) ||
   3080   6712     tomee 	    taskq_member(kmem_taskq, curthread));
   3081      0    stevel 	ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
   3082      0    stevel 
   3083      0    stevel 	for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
   3084      0    stevel 		ccp = &cp->cache_cpu[cpu_seqid];
   3085      0    stevel 
   3086      0    stevel 		mutex_enter(&ccp->cc_lock);
   3087      0    stevel 		mp = ccp->cc_loaded;
   3088      0    stevel 		pmp = ccp->cc_ploaded;
   3089      0    stevel 		rounds = ccp->cc_rounds;
   3090      0    stevel 		prounds = ccp->cc_prounds;
   3091      0    stevel 		ccp->cc_loaded = NULL;
   3092      0    stevel 		ccp->cc_ploaded = NULL;
   3093      0    stevel 		ccp->cc_rounds = -1;
   3094      0    stevel 		ccp->cc_prounds = -1;
   3095      0    stevel 		ccp->cc_magsize = 0;
   3096      0    stevel 		mutex_exit(&ccp->cc_lock);
   3097      0    stevel 
   3098      0    stevel 		if (mp)
   3099      0    stevel 			kmem_magazine_destroy(cp, mp, rounds);
   3100      0    stevel 		if (pmp)
   3101      0    stevel 			kmem_magazine_destroy(cp, pmp, prounds);
   3102      0    stevel 	}
   3103      0    stevel 
   3104      0    stevel 	/*
   3105      0    stevel 	 * Updating the working set statistics twice in a row has the
   3106      0    stevel 	 * effect of setting the working set size to zero, so everything
   3107      0    stevel 	 * is eligible for reaping.
   3108      0    stevel 	 */
   3109      0    stevel 	kmem_depot_ws_update(cp);
   3110      0    stevel 	kmem_depot_ws_update(cp);
   3111      0    stevel 
   3112      0    stevel 	kmem_depot_ws_reap(cp);
   3113      0    stevel }
   3114