Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #ifndef	_VM_PAGE_H
     40 #define	_VM_PAGE_H
     41 
     42 #include <vm/seg.h>
     43 
     44 #ifdef	__cplusplus
     45 extern "C" {
     46 #endif
     47 
     48 #if defined(_KERNEL) || defined(_KMEMUSER)
     49 
     50 /*
     51  * Shared/Exclusive lock.
     52  */
     53 
     54 /*
     55  * Types of page locking supported by page_lock & friends.
     56  */
     57 typedef enum {
     58 	SE_SHARED,		/* shared lock; enumerator value is 0 */
     59 	SE_EXCL			/* exclusive lock; enumerator value is 1 (it is p_selock that goes negative when held exclusively, see PAGE_EXCL) */
     60 } se_t;
     61 
     62 /*
     63  * For requesting that page_lock reclaim the page from the free list.
     64  */
     65 typedef enum {
     66 	P_RECLAIM,		/* reclaim page from free list */
     67 	P_NO_RECLAIM		/* DON'T reclaim the page	*/
     68 } reclaim_t;
     69 
     70 /*
     71  * Callers of page_try_reclaim_lock and page_lock_es can use this flag
     72  * to get SE_EXCL access before reader/writers are given access.
     73  */
     74 #define	SE_EXCL_WANTED	0x02	/* request flag; not the SE_EWANTED bit kept in p_selock */
     75 
     76 /*
     77  * All page_*lock() requests will be denied unless this flag is set in
     78  * the 'es' parameter.  NOTE(review): presumably applies to retired pages - confirm.
     79  */
     80 #define	SE_RETIRED	0x04	/* request flag; bit does not overlap SE_EXCL_WANTED */
     81 
     82 #endif	/* _KERNEL | _KMEMUSER */
     83 
     84 typedef int	selock_t;	/* type of page_t.p_selock; signed, PAGE_EXCL() tests it for < 0 */
     85 
     86 /*
     87  * Define VM_STATS to turn on all sorts of statistic gathering about
     88  * the VM layer.  By default, it is only turned on when DEBUG is
     89  * also defined.
     90  */
     91 #ifdef DEBUG
     92 #define	VM_STATS
     93 #endif	/* DEBUG */
     94 
     95 #ifdef VM_STATS
     96 #define	VM_STAT_ADD(stat)			(stat)++	/* increment the counter */
     97 #define	VM_STAT_COND_ADD(cond, stat)		((void) (!(cond) || (stat)++))	/* increment only when cond is true */
     98 #else
     99 #define	VM_STAT_ADD(stat)	/* expands to nothing; stat is not evaluated */
    100 #define	VM_STAT_COND_ADD(cond, stat)	/* expands to nothing; cond and stat are not evaluated */
    101 #endif	/* VM_STATS */
    102 
    103 #ifdef _KERNEL
    104 
    105 /*
    106  * Macros to acquire and release the page logical lock (the global page_llock).
    107  */
    108 #define	page_struct_lock(pp)	mutex_enter(&page_llock)	/* pp is not used */
    109 #define	page_struct_unlock(pp)	mutex_exit(&page_llock)	/* pp is not used */
    110 
    111 #endif	/* _KERNEL */
    112 
    113 #include <sys/t_lock.h>
    114 
    115 struct as;	/* forward declaration (incomplete type) */
    116 
    117 /*
    118  * Each physical page has a page structure, which is used to maintain
    119  * these pages as a cache.  A page can be found via a hashed lookup
    120  * based on the [vp, offset].  If a page has a [vp, offset] identity,
    121  * then it is entered on a doubly linked circular list off the
    122  * vnode using the vpnext/vpprev pointers.   If the p_free bit
    123  * is on, then the page is also on a doubly linked circular free
    124  * list using next/prev pointers.  If the "p_selock" and "p_iolock"
    125  * are held, then the page is currently being read in (exclusive p_selock)
    126  * or written back (shared p_selock).  In this case, the next/prev pointers
    127  * are used to link the pages together for a consecutive i/o request.  If
    128  * the page is being brought in from its backing store, then other processes
    129  * will wait for the i/o to complete before attaching to the page since it
    130  * will have an "exclusive" lock.
    131  *
    132  * Each page structure has the locks described below along with
    133  * the fields they protect:
    134  *
    135  *	p_selock	This is a per-page shared/exclusive lock that is
    136  *			used to implement the logical shared/exclusive
    137  *			lock for each page.  The "shared" lock is normally
    138  *			used in most cases while the "exclusive" lock is
    139  *			required to destroy or retain exclusive access to
    140  *			a page (e.g., while reading in pages).  The appropriate
    141  *			lock is always held whenever there is any reference
    142  *			to a page structure (e.g., during i/o).
    143  *			(Note that with the addition of the "writer-lock-wanted"
    144  *			semantics (via SE_EWANTED), threads must not acquire
    145  *			multiple reader locks or else a deadly embrace will
    146  *			occur in the following situation: thread 1 obtains a
    147  *			reader lock; next thread 2 fails to get a writer lock
    148  *			but specified SE_EWANTED so it will wait by either
    149  *			blocking (when using page_lock_es) or spinning while
    150  *			retrying (when using page_try_reclaim_lock) until the
    151  *			reader lock is released; then thread 1 attempts to
    152  *			get another reader lock but is denied due to
    153  *			SE_EWANTED being set, and now both threads are in a
    154  *			deadly embrace.)
    155  *
    156  *				p_hash
    157  *				p_vnode
    158  *				p_offset
    159  *
    160  *				p_free
    161  *				p_age
    162  *
    163  *	p_iolock	This is a binary semaphore lock that provides
    164  *			exclusive access to the i/o list links in each
    165  *			page structure.  It is always held while the page
    166  *			is on an i/o list (i.e., involved in i/o).  That is,
    167  *			even though a page may be only `shared' locked
    168  *			while it is doing a write, the following fields may
    169  *			change anyway.  Normally, the page must be
    170  *			`exclusively' locked to change anything in it.
    171  *
    172  *				p_next
    173  *				p_prev
    174  *
    175  * The following fields are protected by the global page_llock:
    176  *
    177  *				p_lckcnt
    178  *				p_cowcnt
    179  *
    180  * The following lists are protected by the global page_freelock:
    181  *
    182  *				page_cachelist
    183  *				page_freelist
    184  *
    185  * The following, for our purposes, are protected by
    186  * the global freemem_lock:
    187  *
    188  *				freemem
    189  *				freemem_wait
    190  *				freemem_cv
    191  *
    192  * The following fields are protected by hat layer lock(s).  When a page
    193  * structure is not mapped and is not associated with a vnode (after a call
    194  * to page_hashout() for example) the p_nrm field may be modified without
    195  * holding the hat layer lock:
    196  *
    197  *				p_nrm
    198  *				p_mapping
    199  *				p_share
    200  *
    201  * The following field is file system dependent.  How it is used and
    202  * the locking strategies applied are up to the individual file system
    203  * implementation.
    204  *
    205  *				p_fsdata
    206  *
    207  * The page structure is used to represent and control the system's
    208  * physical pages.  There is one instance of the structure for each
    209  * page that is not permanently allocated.  For example, the pages that
    210  * hold the page structures are permanently held by the kernel
    211  * and hence do not need page structures to track them.  The array
    212  * of page structures is allocated early on in the kernel's life and
    213  * is based on the amount of available physical memory.
    214  *
    215  * Each page structure may simultaneously appear on several linked lists.
    216  * The lists are:  hash list, free or in i/o list, and a vnode's page list.
    217  * Each type of list is protected by a different group of mutexes as described
    218  * below:
    219  *
    220  * The hash list is used to quickly find a page when the page's vnode and
    221  * offset within the vnode are known.  Each page that is hashed is
    222  * connected via the `p_hash' field.  The anchor for each hash is in the
    223  * array `page_hash'.  An array of mutexes, `ph_mutex', protects the
    224  * lists anchored by page_hash[].  To either search or modify a given hash
    225  * list, the appropriate mutex in the ph_mutex array must be held.
    226  *
    227  * The free list contains pages that are `free to be given away'.  For
    228  * efficiency reasons, pages on this list are placed in two categories:
    229  * pages that are still associated with a vnode, and pages that are not
    230  * associated with a vnode.  Free pages always have their `p_free' bit set,
    231  * free pages that are still associated with a vnode also have their
    232  * `p_age' bit set.  Pages on the free list are connected via their
    233  * `p_next' and `p_prev' fields.  When a page is involved in some sort
    234  * of i/o, it is not free and these fields may be used to link associated
    235  * pages together.  At the moment, the free list is protected by a
    236  * single mutex `page_freelock'.  The list of free pages still associated
    237  * with a vnode is anchored by `page_cachelist' while other free pages
    238  * are anchored in architecture dependent ways (to handle page coloring etc.).
    239  *
    240  * Pages associated with a given vnode appear on a list anchored in the
    241  * vnode by the `v_pages' field.  They are linked together with
    242  * `p_vpnext' and `p_vpprev'.  The field `p_offset' contains a page's
    243  * offset within the vnode.  The pages on this list are not kept in
    244  * offset order.  These lists, in a manner similar to the hash lists,
    245  * are protected by an array of mutexes called `vph_mutex'.  Before
    246  * searching or modifying this chain the appropriate mutex in the
    247  * vph_mutex[] array must be held.
    248  *
    249  * Again, each of the lists that a page can appear on is protected by a
    250  * mutex.  Before reading or writing any of the fields comprising the
    251  * list, the appropriate lock must be held.  These list locks should only
    252  * be held for very short intervals.
    253  *
    254  * In addition to the list locks, each page structure contains a
    255  * shared/exclusive lock that protects various fields within it.
    256  * To modify one of these fields, the `p_selock' must be exclusively held.
    257  * To read a field with a degree of certainty, the lock must be at least
    258  * held shared.
    259  *
    260  * Removing a page structure from one of the lists requires holding
    261  * the appropriate list lock and the page's p_selock.  A page may be
    262  * prevented from changing identity, being freed, or otherwise modified
    263  * by acquiring p_selock shared.
    264  *
    265  * To avoid deadlocks, a strict locking protocol must be followed.  Basically
    266  * there are two cases:  In the first case, the page structure in question
    267  * is known ahead of time (e.g., when the page is to be added or removed
    268  * from a list).  In the second case, the page structure is not known and
    269  * must be found by searching one of the lists.
    270  *
    271  * When adding or removing a known page to one of the lists, first the
    272  * page must be exclusively locked (since at least one of its fields
    273  * will be modified), second the lock protecting the list must be acquired,
    274  * third the page inserted or deleted, and finally the list lock dropped.
    275  *
    276  * The more interesting case occurs when the particular page structure
    277  * is not known ahead of time.  For example, when a call is made to
    278  * page_lookup(), it is not known if a page with the desired (vnode and
    279  * offset pair) identity exists.  So the appropriate mutex in ph_mutex is
    280  * acquired, the hash list searched, and if the desired page is found
    281  * an attempt is made to lock it.  The attempt to acquire p_selock must
    282  * not block while the hash list lock is held.  A deadlock could occur
    283  * if some other process was trying to remove the page from the list.
    284  * The removing process (following the above protocol) would have exclusively
    285  * locked the page, and be spinning waiting to acquire the lock protecting
    286  * the hash list.  Since the searching process holds the hash list lock
    287  * and is waiting to acquire the page lock, a deadlock occurs.
    288  *
    289  * The proper scheme to follow is: first, lock the appropriate list,
    290  * search the list, and if the desired page is found either use
    291  * page_trylock() (which will not block) or pass the address of the
    292  * list lock to page_lock().  If page_lock() can not acquire the page's
    293  * lock, it will drop the list lock before going to sleep.  page_lock()
    294  * returns a value to indicate if the list lock was dropped allowing the
    295  * calling program to react appropriately (i.e., retry the operation).
    296  *
    297  * If the list lock was dropped before the attempt at locking the page
    298  * was made, checks would have to be made to ensure that the page had
    299  * not changed identity before its lock was obtained.  This is because
    300  * the interval between dropping the list lock and acquiring the page
    301  * lock is indeterminate.
    302  *
    303  * In addition, when both a hash list lock (ph_mutex[]) and a vnode list
    304  * lock (vph_mutex[]) are needed, the hash list lock must be acquired first.
    305  * The routine page_hashin() is a good example of this sequence.
    306  * This sequence is ASSERTed by checking that the vph_mutex[] is not held
    307  * just before each acquisition of one of the mutexes in ph_mutex[].
    308  *
    309  * So, as a quick summary:
    310  *
    311  * 	pse_mutex[]'s protect the p_selock and p_cv fields.
    312  *
    313  * 	p_selock protects the p_free, p_age, p_vnode, p_offset and p_hash fields.
    314  *
    315  * 	ph_mutex[]'s protect the page_hash[] array and its chains.
    316  *
    317  * 	vph_mutex[]'s protect the v_pages field and the vp page chains.
    318  *
    319  *	First lock the page, then the hash chain, then the vnode chain.  When
    320  *	this is not possible `trylocks' must be used.  Sleeping while holding
    321  *	any of these mutexes (p_selock is not a mutex) is not allowed.
    322  *
    323  *
    324  *	field		reading		writing		    ordering
    325  *	======================================================================
    326  *	p_vnode		p_selock(E,S)	p_selock(E)
    327  *	p_offset
    328  *	p_free
    329  *	p_age
    330  *	=====================================================================
    331  *	p_hash		p_selock(E,S)	p_selock(E) &&	    p_selock, ph_mutex
    332  *					ph_mutex[]
    333  *	=====================================================================
    334  *	p_vpnext	p_selock(E,S)	p_selock(E) &&	    p_selock, vph_mutex
    335  *	p_vpprev			vph_mutex[]
    336  *	=====================================================================
    337  *	When the p_free bit is set:
    338  *
    339  *	p_next		p_selock(E,S)	p_selock(E) &&	    p_selock,
    340  *	p_prev				page_freelock	    page_freelock
    341  *
    342  *	When the p_free bit is not set:
    343  *
    344  *	p_next		p_selock(E,S)	p_selock(E) &&	    p_selock, p_iolock
    345  *	p_prev				p_iolock
    346  *	=====================================================================
    347  *	p_selock	pse_mutex[]	pse_mutex[]	    can't acquire any
    348  *	p_cv						    other mutexes or
    349  *							    sleep while holding
    350  *							    this lock.
    351  *	=====================================================================
    352  *	p_lckcnt	p_selock(E,S)	p_selock(E) &&
    353  *	p_cowcnt			page_llock
    354  *	=====================================================================
    355  *	p_nrm		hat layer lock	hat layer lock
    356  *	p_mapping
    357  *	p_pagenum
    358  *	=====================================================================
    359  *
    360  *	where:
    361  *		E----> exclusive version of p_selock.
    362  *		S----> shared version of p_selock.
    363  *
    364  *
    365  *	Global data structures and variables:
    366  *
    367  *	field		reading		writing		    ordering
    368  *	=====================================================================
    369  *	page_hash[]	ph_mutex[]	ph_mutex[]	    can hold this lock
    370  *							    before acquiring
    371  *							    a vph_mutex or
    372  *							    pse_mutex.
    373  *	=====================================================================
    374  *	vp->v_pages	vph_mutex[]	vph_mutex[]	    can only acquire
    375  *							    a pse_mutex while
    376  *							    holding this lock.
    377  *	=====================================================================
    378  *	page_cachelist	page_freelock	page_freelock	    can't acquire any
    379  *	page_freelist	page_freelock	page_freelock
    380  *	=====================================================================
    381  *	freemem		freemem_lock	freemem_lock	    can't acquire any
    382  *	freemem_wait					    other mutexes while
    383  *	freemem_cv					    holding this mutex.
    384  *	=====================================================================
    385  *
    386  * Page relocation, PG_NORELOC and P_NORELOC.
    387  *
    388  * Pages may be relocated using the page_relocate() interface. Relocation
    389  * involves moving the contents and identity of a page to another, free page.
    390  * To relocate a page, the SE_EXCL lock must be obtained. The way to prevent
    391  * a page from being relocated is to hold the SE_SHARED lock (the SE_EXCL
    392  * lock must not be held indefinitely). If the page is going to be held
    393  * SE_SHARED indefinitely, then the PG_NORELOC hint should be passed
    394  * to page_create_va so that pages that are prevented from being relocated
    395  * can be managed differently by the platform specific layer.
    396  *
    397  * Pages locked in memory using page_pp_lock (p_lckcnt/p_cowcnt != 0)
    398  * are guaranteed to be held in memory, but can still be relocated
    399  * providing the SE_EXCL lock can be obtained.
    400  *
    401  * The P_NORELOC bit in the page_t.p_state field is provided for use by
    402  * the platform specific code in managing pages when the PG_NORELOC
    403  * hint is used.
    404  *
    405  * Memory delete and page locking.
    406  *
    407  * The set of all usable pages is managed using the global page list as
    408  * implemented by the memseg structure defined below. When memory is added
    409  * or deleted this list changes. Additions to this list guarantee that the
    410  * list is never corrupt.  In order to avoid the necessity of an additional
    411  * lock to protect against failed accesses to the memseg being deleted and,
    412  * more importantly, the page_ts, the memseg structure is never freed and the
    413  * page_t virtual address space is remapped to a page (or pages) of
    414  * zeros.  If a page_t is manipulated while it is p_selock'd, or if it is
    415  * locked indirectly via a hash or freelist lock, it is not possible for
    416  * memory delete to collect the page and so that part of the page list is
    417  * prevented from being deleted. If the page is referenced outside of one
    418  * of these locks, it is possible for the page_t being referenced to be
    419  * deleted.  Examples of this are page_t pointers returned by
    420  * page_numtopp_nolock, page_first and page_next.  Providing the page_t
    421  * is re-checked after taking the p_selock (for p_vnode != NULL), the
    422  * remapping to the zero pages will be detected.
    423  *
    424  *
    425  * Page size (p_szc field) and page locking.
    426  *
    427  * p_szc field of free pages is changed by free list manager under freelist
    428  * locks and is of no concern to the rest of VM subsystem.
    429  *
    430  * p_szc changes of allocated anonymous (swapfs) pages can be done only after
    431  * exclusively locking all constituent pages and calling hat_pageunload() on
    432  * each of them. To prevent p_szc changes of non free anonymous (swapfs) large
    433  * pages it's enough to either lock SHARED any of constituent pages or prevent
    434  * hat_pageunload() by holding hat level lock that protects mapping lists (this
    435  * method is for hat code only).
    436  *
    437  * To increase (promote) p_szc of allocated non anonymous file system pages
    438  * one has to first lock exclusively all involved constituent pages and call
    439  * hat_pageunload() on each of them. To prevent p_szc promote it's enough to
    440  * either lock SHARED any of constituent pages that will be needed to make a
    441  * large page or prevent hat_pageunload() by holding hat level lock that
    442  * protects mapping lists (this method is for hat code only).
    443  *
    444  * To decrease (demote) p_szc of an allocated non anonymous file system large
    445  * page one can either use the same method as used for changing p_szc of
    446  * anonymous large pages or if it's not possible to lock all constituent pages
    447  * exclusively a different method can be used. In the second method one only
    448  * has to exclusively lock one of constituent pages but then one has to
    449  * acquire further locks by calling page_szc_lock() and
    450  * hat_page_demote(). hat_page_demote() acquires hat level locks and then
    451  * demotes the page. This mechanism relies on the fact that any code that
    452  * needs to prevent p_szc of a file system large page from changing either
    453  * locks all constituent large pages at least SHARED or locks some pages at
    454  * least SHARED and calls page_szc_lock() or uses hat level page locks.
    455  * Demotion using this method is implemented by page_demote_vp_pages().
    456  * Please see comments in front of page_demote_vp_pages(), hat_page_demote()
    457  * and page_szc_lock() for more details.
    458  *
    459  * Lock order: p_selock, page_szc_lock, ph_mutex/vph_mutex/freelist,
    460  * hat level locks.
    461  */
    462 
    463 typedef struct page {
    464 	u_offset_t	p_offset;	/* offset into vnode for this page */
    465 	struct vnode	*p_vnode;	/* vnode that this page is named by */
    466 	selock_t	p_selock;	/* shared/exclusive lock on the page; updated under pse_mutex[] */
    467 #if defined(_LP64)
    468 	uint_t		p_vpmref;	/* vpm ref - index of the vpmap_t */
    469 #endif
    470 	struct page	*p_hash;	/* hash chain link, by [vnode, offset]; chain guarded by ph_mutex[] */
    471 	struct page	*p_vpnext;	/* next page in vnode list (vph_mutex[]) */
    472 	struct page	*p_vpprev;	/* prev page in vnode list (vph_mutex[]) */
    473 	struct page	*p_next;	/* next page in free/intrans lists */
    474 	struct page	*p_prev;	/* prev page in free/intrans lists */
    475 	ushort_t	p_lckcnt;	/* number of locks on page data (page_llock) */
    476 	ushort_t	p_cowcnt;	/* number of copy on write locks (page_llock) */
    477 	kcondvar_t	p_cv;		/* page struct's condition var (pse_mutex[]) */
    478 	kcondvar_t	p_io_cv;	/* for iolock */
    479 	uchar_t		p_iolock_state;	/* replaces p_iolock */
    480 	volatile uchar_t p_szc;		/* page size code; see "Page size (p_szc field) and page locking" above */
    481 	uchar_t		p_fsdata;	/* file system dependent byte */
    482 	uchar_t		p_state;	/* state bits: p_free, p_noreloc (P_NORELOC) */
    483 	uchar_t		p_nrm;		/* non-cache, ref, mod readonly bits (hat layer lock) */
    484 #if defined(__sparc)
    485 	uchar_t		p_vcolor;	/* virtual color */
    486 #else
    487 	uchar_t		p_embed;	/* x86 - changes p_mapping & p_index */
    488 #endif
    489 	uchar_t		p_index;	/* MPSS mapping info. Not used on x86 */
    490 	uchar_t		p_toxic;	/* page has an unrecoverable error */
    491 	void		*p_mapping;	/* hat specific translation info (hat layer lock) */
    492 	pfn_t		p_pagenum;	/* physical page number */
    493 
    494 	uint_t		p_share;	/* number of translations */
    495 #if defined(_LP64)
    496 	uint_t		p_sharepad;	/* pad for growing p_share */
    497 #endif
    498 	uint_t		p_slckcnt;	/* number of softlocks */
    499 #if defined(__sparc)
    500 	uint_t		p_kpmref;	/* number of kpm mapping sharers */
    501 	struct kpme	*p_kpmelist;	/* kpm specific mapping info */
    502 #else
    503 	/* index of entry in p_map when p_embed is set */
    504 	uint_t		p_mlentry;
    505 #endif
    506 #if defined(_LP64)
    507 	kmutex_t	p_ilock;	/* protects p_vpmref */
    508 #else
    509 	uint64_t	p_msresv_2;	/* page allocation debugging */
    510 #endif
    511 } page_t;
    512 
    513 
    514 typedef	page_t	devpage_t;	/* devpage_t is another name for page_t */
    515 #define	devpage	page	/* so that "struct devpage" names struct page */
    516 
    517 #define	PAGE_LOCK_MAXIMUM \
    518 	((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1)	/* all-ones value of p_lckcnt, assuming NBBY is the number of bits per byte */
    519 
    520 #define	PAGE_SLOCK_MAXIMUM UINT_MAX	/* NOTE(review): presumably the ceiling for p_slckcnt (a uint_t) - confirm */
    521 
    522 /*
    523  * Page hash table is a power-of-two in size, externally chained
    524  * through the hash field.  PAGE_HASHAVELEN is the average length
    525  * desired for this chain, from which the size of the page_hash
    526  * table is derived at boot time and stored in the kernel variable
    527  * page_hashsz.  In the hash function it is given by PAGE_HASHSZ.
    528  *
    529  * PAGE_HASH_FUNC returns an index into the page_hash[] array.  This
    530  * index is also used to derive the mutex that protects the chain.
    531  *
    532  * In constructing the hash function, first we dispose of unimportant bits
    533  * (page offset from "off" and the low 3 bits of "vp" which are zero for
    534  * struct alignment). Then shift and sum the remaining bits a couple times
    535  * in order to get as many source bits from the two source values into the
    536  * resulting hashed value.  (Note that this will perform quickly, since the
    537  * shifting/summing are fast register to register operations with no additional
    538  * memory references).
    539  */
    540 #if defined(_LP64)
    541 
    542 #if NCPU < 4
    543 #define	PH_TABLE_SIZE	128	/* PAGE_HASH_MUTEX() masks its index with PH_TABLE_SIZE - 1 */
    544 #define	VP_SHIFT	7	/* right shift applied in PAGE_HASH_MUTEX() */
    545 #else
    546 #define	PH_TABLE_SIZE	1024
    547 #define	VP_SHIFT	9
    548 #endif	/* NCPU < 4 */
    549 
    550 #else	/* 32 bits */
    551 
    552 #if NCPU < 4
    553 #define	PH_TABLE_SIZE	16
    554 #define	VP_SHIFT	7
    555 #else
    556 #define	PH_TABLE_SIZE	128
    557 #define	VP_SHIFT	9
    558 #endif	/* NCPU < 4 */
    559 
    560 #endif	/* _LP64 */
    561 
    562 /*
    563  * The amount to use for the successive shifts in the hash function below.
    564  * The actual value is LOG2(PH_TABLE_SIZE), so that as many bits as
    565  * possible will filter thru PAGE_HASH_FUNC() and PAGE_HASH_MUTEX().
    566  */
    567 #define	PH_SHIFT_SIZE   (7)	/* NOTE(review): fixed at 7 == LOG2(128) although PH_TABLE_SIZE above may also be 16 or 1024 - confirm intent */
    568 
    569 #define	PAGE_HASHSZ	page_hashsz	/* table size is read from the page_hashsz variable */
    570 #define	PAGE_HASHAVELEN		4	/* desired average hash chain length */
    571 #define	PAGE_HASH_FUNC(vp, off) \
    572 	((((uintptr_t)(off) >> PAGESHIFT) + \
    573 		((uintptr_t)(off) >> (PAGESHIFT + PH_SHIFT_SIZE)) + \
    574 		((uintptr_t)(vp) >> 3) + \
    575 		((uintptr_t)(vp) >> (3 + PH_SHIFT_SIZE)) + \
    576 		((uintptr_t)(vp) >> (3 + 2 * PH_SHIFT_SIZE))) & \
    577 		(PAGE_HASHSZ - 1))	/* index into page_hash[]; the mask relies on the table size being a power of two; vp and off are each evaluated more than once */
    578 #ifdef _KERNEL
    579 
    580 /*
    581  * The page hash value is re-hashed to an index for the ph_mutex array.
    582  *
    583  * For 64 bit kernels, the mutex array is padded out to prevent false
    584  * sharing of cache sub-blocks (64 bytes) of adjacent mutexes.
    585  *
    586  * For 32 bit kernels, we don't want to waste kernel address space with
    587  * padding, so instead we rely on the hash function to introduce skew of
    588  * adjacent vnode/offset indexes (the left shift part of the hash function).
    589  * Since sizeof (kmutex_t) is 8, we shift an additional 3 to skew to a different
    590  * 64 byte sub-block.
    591  */
    592 typedef struct pad_mutex {
    593 	kmutex_t	pad_mutex;	/* the mutex itself */
    594 #ifdef _LP64
    595 	char		pad_pad[64 - sizeof (kmutex_t)];	/* fills the entry out to 64 bytes */
    596 #endif
    597 } pad_mutex_t;
    598 extern pad_mutex_t ph_mutex[];	/* indexed by PAGE_HASH_MUTEX() */
    599 
    600 #define	PAGE_HASH_MUTEX(x) \
    601 	(&(ph_mutex[((x) + ((x) >> VP_SHIFT) + ((x) << 3)) & \
    602 		(PH_TABLE_SIZE - 1)].pad_mutex))	/* address of the ph_mutex[] entry for page hash index x; the whole expansion is parenthesized so it is safe in any expression context; x is evaluated three times, so pass an expression without side effects */
    603 
    604 /*
    605  * Flags used while creating pages.  Each flag is a distinct bit.
    606  */
    607 #define	PG_EXCL		0x0001		/* NOTE(review): meaning not stated in this header - confirm */
    608 #define	PG_WAIT		0x0002		/* NOTE(review): meaning not stated in this header - confirm */
    609 #define	PG_PHYSCONTIG	0x0004		/* NOT SUPPORTED */
    610 #define	PG_MATCH_COLOR	0x0008		/* SUPPORTED by free list routines */
    611 #define	PG_NORELOC	0x0010		/* Non-relocatable alloc hint. */
    612 					/* Page must be PP_ISNORELOC */
    613 #define	PG_PANIC	0x0020		/* system will panic if alloc fails */
    614 #define	PG_PUSHPAGE	0x0040		/* alloc may use reserve */
    615 #define	PG_LOCAL	0x0080		/* alloc from given lgrp only */
    616 
    617 /*
    618  * When p_selock has the SE_EWANTED bit set, threads waiting for SE_EXCL
    619  * access are given priority over all other waiting threads.
    620  */
    621 #define	SE_EWANTED	0x40000000
    622 #define	PAGE_LOCKED(pp)		(((pp)->p_selock & ~SE_EWANTED) != 0)	/* nonzero once the SE_EWANTED bit is masked off */
    623 #define	PAGE_SHARED(pp)		(((pp)->p_selock & ~SE_EWANTED) > 0)	/* positive once the SE_EWANTED bit is masked off */
    624 #define	PAGE_EXCL(pp)		((pp)->p_selock < 0)	/* p_selock is negative */
    625 #define	PAGE_LOCKED_SE(pp, se)	\
    626 	((se) == SE_EXCL ? PAGE_EXCL(pp) : PAGE_SHARED(pp))	/* SE_EXCL selects PAGE_EXCL(); any other se selects PAGE_SHARED() */
    627 
    628 extern	long page_hashsz;	/* size of the page_hash table; see PAGE_HASHSZ */
    629 extern	page_t **page_hash;	/* hash chain anchors; pages are chained via p_hash */
    630 
    631 extern	kmutex_t page_llock;		/* page logical lock mutex (p_lckcnt, p_cowcnt) */
    632 extern	kmutex_t freemem_lock;		/* freemem lock (freemem, freemem_wait, freemem_cv) */
    633 
    634 extern	pgcnt_t	total_pages;		/* total pages in the system */
    635 
    636 /*
    637  * Variables controlling locking of physical memory.
    638  */
    639 extern	pgcnt_t	pages_pp_maximum;	/* tuning: lock + claim <= max */
    640 extern	void init_pages_pp_maximum(void);	/* NOTE(review): presumably sets pages_pp_maximum - confirm */
    641 
    642 struct lgrp;	/* forward declaration; used as struct lgrp * in prototypes below */
    643 
    644 /* page_list_{add,sub} flags; the three groups below use disjoint bits */
    645 
    646 /* which list */
    647 #define	PG_FREE_LIST	0x0001
    648 #define	PG_CACHE_LIST	0x0002
    649 
    650 /* where on list */
    651 #define	PG_LIST_TAIL	0x0010
    652 #define	PG_LIST_HEAD	0x0020
    653 
    654 /* called from (NOTE(review): presumably marks calls made during initialization - confirm) */
    655 #define	PG_LIST_ISINIT	0x1000
    656 
    657 /*
    658  * Page frame operations.  A (vnode, u_offset_t) argument pair is a page's [vp, offset] identity.
    659  */
    660 page_t	*page_lookup(struct vnode *, u_offset_t, se_t);	/* searches the hash list, then tries to lock the page found */
    661 page_t	*page_lookup_create(struct vnode *, u_offset_t, se_t, page_t *,
    662 	spgcnt_t *, int);
    663 page_t	*page_lookup_nowait(struct vnode *, u_offset_t, se_t);
    664 page_t	*page_find(struct vnode *, u_offset_t);
    665 page_t	*page_exists(struct vnode *, u_offset_t);
    666 int	page_exists_physcontig(vnode_t *, u_offset_t, uint_t, page_t *[]);
    667 int	page_exists_forreal(struct vnode *, u_offset_t, uint_t *);
    668 void	page_needfree(spgcnt_t);
    669 page_t	*page_create(struct vnode *, u_offset_t, size_t, uint_t);
    670 int	page_alloc_pages(struct vnode *, struct seg *, caddr_t, page_t **,
    671 	page_t **, uint_t, int, int);
    672 page_t  *page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes,
    673 	uint_t flags, struct seg *seg, caddr_t vaddr, void *arg);
    674 page_t	*page_create_va(struct vnode *, u_offset_t, size_t, uint_t,
    675 	struct seg *, caddr_t);	/* the PG_NORELOC hint is passed to this routine */
    676 int	page_create_wait(pgcnt_t npages, uint_t flags);
    677 void    page_create_putback(spgcnt_t npages);
    678 void	page_free(page_t *, int);
    679 void	page_free_at_startup(page_t *);
    680 void	page_free_pages(page_t *);
    681 void	free_vp_pages(struct vnode *, u_offset_t, size_t);
    682 int	page_reclaim(page_t *, kmutex_t *);
    683 int	page_reclaim_pages(page_t *, kmutex_t *, uint_t);
    684 void	page_destroy(page_t *, int);
    685 void	page_destroy_pages(page_t *);
    686 void	page_destroy_free(page_t *);
    687 void	page_rename(page_t *, struct vnode *, u_offset_t);
    688 int	page_hashin(page_t *, struct vnode *, u_offset_t, kmutex_t *);	/* example of taking ph_mutex[] before vph_mutex[] */
    689 void	page_hashout(page_t *, kmutex_t *);
    690 int	page_num_hashin(pfn_t, struct vnode *, u_offset_t);
    691 void	page_add(page_t **, page_t *);
    692 void	page_add_common(page_t **, page_t *);
    693 void	page_sub(page_t **, page_t *);
    694 void	page_sub_common(page_t **, page_t *);
    695 page_t	*page_get_freelist(struct vnode *, u_offset_t, struct seg *,
    696 		caddr_t, size_t, uint_t, struct lgrp *);
    697 
    698 page_t	*page_get_cachelist(struct vnode *, u_offset_t, struct seg *,
    699 		caddr_t, uint_t, struct lgrp *);
    700 #if defined(__i386) || defined(__amd64)
    701 int	page_chk_freelist(uint_t);
    702 #endif	/* __i386 || __amd64 */
    703 void	page_list_add(page_t *, int);
    704 void	page_boot_demote(page_t *);
    705 void	page_promote_size(page_t *, uint_t);
    706 void	page_list_add_pages(page_t *, int);
    707 void	page_list_sub(page_t *, int);
    708 void	page_list_sub_pages(page_t *, uint_t);
    709 void	page_list_xfer(page_t *, int, int);
    710 void	page_list_break(page_t **, page_t **, size_t);
    711 void	page_list_concat(page_t **, page_t **);
    712 void	page_vpadd(page_t **, page_t *);
    713 void	page_vpsub(page_t **, page_t *);
    714 int	page_lock(page_t *, se_t, kmutex_t *, reclaim_t);
    715 int	page_lock_es(page_t *, se_t, kmutex_t *, reclaim_t, int);
    716 void page_lock_clr_exclwanted(page_t *);
    717 int	page_trylock(page_t *, se_t);
    718 int	page_try_reclaim_lock(page_t *, se_t, int);
    719 int	page_tryupgrade(page_t *);
    720 void	page_downgrade(page_t *);
    721 void	page_unlock(page_t *);
    722 void	page_unlock_nocapture(page_t *);
    723 void	page_lock_delete(page_t *);
    724 int	page_deleted(page_t *);
    725 int	page_pp_lock(page_t *, int, int);
    726 void	page_pp_unlock(page_t *, int, int);
    727 int	page_resv(pgcnt_t, uint_t);
    728 void	page_unresv(pgcnt_t);
    729 void	page_pp_useclaim(page_t *, page_t *, uint_t);
    730 int	page_addclaim(page_t *);
    731 int	page_subclaim(page_t *);
    732 int	page_addclaim_pages(page_t **);
    733 int	page_subclaim_pages(page_t **);
    734 pfn_t	page_pptonum(page_t *);
    735 page_t	*page_numtopp(pfn_t, se_t);
    736 page_t	*page_numtopp_noreclaim(pfn_t, se_t);
    737 page_t	*page_numtopp_nolock(pfn_t);
    738 page_t	*page_numtopp_nowait(pfn_t, se_t);
    739 page_t  *page_first();
    740 page_t  *page_next(page_t *);
    741 page_t  *page_list_next(page_t *);
    742 page_t	*page_nextn(page_t *, ulong_t);
    743 page_t	*page_next_scan_init(void **);
    744 page_t	*page_next_scan_large(page_t *, ulong_t *, void **);
    745 void    prefetch_page_r(void *);
    746 int	ppcopy(page_t *, page_t *);
    747 void	page_relocate_hash(page_t *, page_t *);
    748 void	pagezero(page_t *, uint_t, uint_t);
    749 void	pagescrub(page_t *, uint_t, uint_t);
    750 void	page_io_lock(page_t *);
    751 void	page_io_unlock(page_t *);
    752 int	page_io_trylock(page_t *);
    753 int	page_iolock_assert(page_t *);
    754 void	page_iolock_init(page_t *);
    755 void	page_io_wait(page_t *);
    756 int	page_io_locked(page_t *);
    757 pgcnt_t	page_busy(int);
    758 void	page_lock_init(void);
    759 ulong_t	page_share_cnt(page_t *);
    760 int	page_isshared(page_t *);
    761 int	page_isfree(page_t *);
    762 int	page_isref(page_t *);
    763 int	page_ismod(page_t *);
    764 int	page_release(page_t *, int);
    765 void	page_retire_init(void);
    766 int	page_retire(uint64_t, uchar_t);
    767 int	page_retire_check(uint64_t, uint64_t *);
    768 int	page_unretire(uint64_t);
    769 int	page_unretire_pp(page_t *, int);
    770 void	page_tryretire(page_t *);
    771 void	page_retire_mdboot();
    772 uint64_t	page_retire_pend_count(void);
    773 uint64_t	page_retire_pend_kas_count(void);
    774 void	page_retire_incr_pend_count(void *);
    775 void	page_retire_decr_pend_count(void *);
    776 void	page_clrtoxic(page_t *, uchar_t);
    777 void	page_settoxic(page_t *, uchar_t);
    778 
    779 int	page_mem_avail(pgcnt_t);
    780 int	page_reclaim_mem(pgcnt_t, pgcnt_t, int);
    781 
    782 void page_set_props(page_t *, uint_t);
    783 void page_clr_all_props(page_t *);
    784 int page_clear_lck_cow(page_t *, int);
    785 
    786 kmutex_t	*page_vnode_mutex(struct vnode *);
    787 kmutex_t	*page_se_mutex(struct page *);
    788 kmutex_t	*page_szc_lock(struct page *);
    789 int		page_szc_lock_assert(struct page *pp);
    790 
     791 /*
     792  * Page relocation interfaces. page_relocate() is generic.
     793  * page_get_replacement_page() is provided by the PSM.
     794  * page_free_replacement_page() is generic.
     795  */
     796 int group_page_trylock(page_t *, se_t);
     797 void group_page_unlock(page_t *);
     798 int page_relocate(page_t **, page_t **, int, int, spgcnt_t *, struct lgrp *);
     799 int do_page_relocate(page_t **, page_t **, int, spgcnt_t *, struct lgrp *);
     800 page_t *page_get_replacement_page(page_t *, struct lgrp *, uint_t);	/* NOTE(review): the uint_t presumably carries the PGR_* flags -- confirm */
     801 void page_free_replacement_page(page_t *);
     802 int page_relocate_cage(page_t **, page_t **);
     803 
     804 int page_try_demote_pages(page_t *);
     805 int page_try_demote_free_pages(page_t *);
     806 void page_demote_free_pages(page_t *);
     807 
     808 struct anon_map;	/* forward declaration; page_mark_migrate() below only takes a pointer to it */
     809 
     810 void page_mark_migrate(struct seg *, caddr_t, size_t, struct anon_map *,
     811     ulong_t, vnode_t *, u_offset_t, int);
     812 void page_migrate(struct seg *, caddr_t, page_t **, pgcnt_t);
     813 
     814 /*
     815  * Tell the PIM we are adding physical memory
     816  */
     817 void add_physmem(page_t *, size_t, pfn_t);
     818 void add_physmem_cb(page_t *, pfn_t);	/* callback for page_t part */
    819 
     820 /*
     821  * hw_page_array[] is configured with hardware supported page sizes by
     822  * platform specific code.
     823  */
     824 typedef struct {
     825 	size_t	hp_size;	/* NOTE(review): presumably the page size in bytes -- confirm */
     826 	uint_t	hp_shift;	/* NOTE(review): presumably log2 of hp_size -- confirm */
     827 	uint_t  hp_colors;	/* NOTE(review): presumably the number of page colors at this size -- confirm */
     828 	pgcnt_t	hp_pgcnt;	/* base pagesize cnt */
     829 } hw_pagesize_t;
     830 
     831 extern hw_pagesize_t	hw_page_array[];	/* filled in by platform specific code; length is not visible here */
     832 extern uint_t		page_coloring_shift;
     833 extern uint_t		page_colors_mask;
     834 extern int		cpu_page_colors;
     835 extern uint_t		colorequiv;
     836 extern uchar_t		colorequivszc[];
     837 
     838 uint_t	page_num_pagesizes(void);
     839 uint_t	page_num_user_pagesizes(int);
     840 size_t	page_get_pagesize(uint_t);
     841 size_t	page_get_user_pagesize(uint_t n);
     842 pgcnt_t	page_get_pagecnt(uint_t);
     843 uint_t	page_get_shift(uint_t);
     844 int	page_szc(size_t);
     845 int	page_szc_user_filtered(size_t);
    846 
     847 /* page_get_replacement_page() flags */
     848 #define	PGR_SAMESZC	0x1	/* only look for page size same as orig */
     849 #define	PGR_NORELOC	0x2	/* allocate a P_NORELOC page */
    850 
    851 /*
    852  * macros for "masked arithmetic"
    853  * The purpose is to step through all combinations of a set of bits while
    854  * keeping some other bits fixed. Fixed bits need not be contiguous. The
    855  * variable bits need not be contiguous either, or even right aligned. The
    856  * trick is to set all fixed bits to 1, then increment, then restore the
    857  * fixed bits. If incrementing causes a carry from a low bit position, the
    858  * carry propagates thru the fixed bits, because they are temporarily set to 1.
    859  *	v is the value
    860  *	i is the increment
    861  *	eq_mask defines the fixed bits
    862  *	mask limits the size of the result
    863  */
     864 #define	ADD_MASKED(v, i, eq_mask, mask) \
     865 	(((((v) | (eq_mask)) + (i)) & (mask) & ~(eq_mask)) | ((v) & (eq_mask)))	/* v and eq_mask are expanded more than once: pass side-effect-free arguments */
     866 
     867 /*
     868  * convenience macro which increments by 1
     869  */
     870 #define	INC_MASKED(v, eq_mask, mask) ADD_MASKED(v, 1, eq_mask, mask)	/* inherits ADD_MASKED's repeated expansion of v and eq_mask */
    871 
    872 #endif	/* _KERNEL */
    873 
     874 /*
     875  * Constants used for the p_iolock_state
     876  */
     877 #define	PAGE_IO_INUSE	0x1	/* bit 0 */
     878 #define	PAGE_IO_WANTED	0x2	/* bit 1; does not overlap PAGE_IO_INUSE */
     879 
     880 /*
     881  * Constants used for page_release status
     882  */
     883 #define	PGREL_NOTREL    0x1
     884 #define	PGREL_CLEAN	0x2
     885 #define	PGREL_MOD	0x3	/* numerically 0x1 | 0x2; NOTE(review): treat PGREL_* as distinct codes, not bit flags -- confirm against page_release() */
    886 
     887 /*
     888  * The p_state field holds what used to be the p_age and p_free
     889  * bits.  These fields are protected by p_selock (see above).
     890  */
     891 #define	P_FREE		0x80		/* Page on free list */
     892 #define	P_NORELOC	0x40		/* Page is non-relocatable */
     893 #define	P_MIGRATE	0x20		/* Migrate page on next touch */
     894 #define	P_SWAP		0x10		/* belongs to vnode that is V_ISSWAP */
     895 #define	P_BOOTPAGES	0x08		/* member of bootpages list */
     896 
     897 #define	PP_ISFREE(pp)		((pp)->p_state & P_FREE)	/* yields the masked bit (0 or P_FREE), not 0/1 */
     898 #define	PP_ISAGED(pp)		(((pp)->p_state & P_FREE) && \
     899 					((pp)->p_vnode == NULL))	/* P_FREE set and no vnode attached */
     900 #define	PP_ISNORELOC(pp)	((pp)->p_state & P_NORELOC)	/* yields the masked bit, not 0/1 */
     901 #define	PP_ISKAS(pp)		(VN_ISKAS((pp)->p_vnode))	/* VN_ISKAS() applied to the page's vnode */
     902 #define	PP_ISNORELOCKERNEL(pp)	(PP_ISNORELOC(pp) && PP_ISKAS(pp))	/* both of the two tests above hold */
     903 #define	PP_ISMIGRATE(pp)	((pp)->p_state & P_MIGRATE)	/* yields the masked bit, not 0/1 */
     904 #define	PP_ISSWAP(pp)		((pp)->p_state & P_SWAP)	/* yields the masked bit, not 0/1 */
     905 #define	PP_ISBOOTPAGES(pp)	((pp)->p_state & P_BOOTPAGES)	/* yields the masked bit, not 0/1 */
     906 
     907 #define	PP_SETFREE(pp)		((pp)->p_state = ((pp)->p_state & ~P_MIGRATE) \
     908 				| P_FREE)	/* sets P_FREE and, in the same store, clears P_MIGRATE */
     909 #define	PP_SETAGED(pp)		ASSERT(PP_ISAGED(pp))	/* expands to an ASSERT only; p_state is not modified */
     910 #define	PP_SETNORELOC(pp)	((pp)->p_state |= P_NORELOC)
     911 #define	PP_SETMIGRATE(pp)	((pp)->p_state |= P_MIGRATE)
     912 #define	PP_SETSWAP(pp)		((pp)->p_state |= P_SWAP)
     913 #define	PP_SETBOOTPAGES(pp)	((pp)->p_state |= P_BOOTPAGES)
     914 
     915 #define	PP_CLRFREE(pp)		((pp)->p_state &= ~P_FREE)
     916 #define	PP_CLRAGED(pp)		ASSERT(!PP_ISAGED(pp))	/* expands to an ASSERT only; p_state is not modified */
     917 #define	PP_CLRNORELOC(pp)	((pp)->p_state &= ~P_NORELOC)
     918 #define	PP_CLRMIGRATE(pp)	((pp)->p_state &= ~P_MIGRATE)
     919 #define	PP_CLRSWAP(pp)		((pp)->p_state &= ~P_SWAP)
     920 #define	PP_CLRBOOTPAGES(pp)	((pp)->p_state &= ~P_BOOTPAGES)
    921 
    922 /*
    923  * Flags for page_t p_toxic, for tracking memory hardware errors.
    924  *
    925  * These flags are OR'ed into p_toxic with page_settoxic() to track which
    926  * error(s) have occurred on a given page. The flags are cleared with
     927  * page_clrtoxic(). Both page_settoxic() and page_clrtoxic() use atomic
    928  * primitives to manipulate the p_toxic field so no other locking is needed.
    929  *
    930  * When an error occurs on a page, p_toxic is set to record the error. The
    931  * error could be a memory error or something else (i.e. a datapath). The Page
    932  * Retire mechanism does not try to determine the exact cause of the error;
    933  * Page Retire rightly leaves that sort of determination to FMA's Diagnostic
    934  * Engine (DE).
    935  *
    936  * Note that, while p_toxic bits can be set without holding any locks, they
    937  * should only be cleared while holding the page exclusively locked.
    938  * There is one exception to this, the PR_CAPTURE bit is protected by a mutex
    939  * within the page capture logic and thus to set or clear the bit, that mutex
    940  * needs to be held.  The page does not need to be locked but the page_clrtoxic
    941  * function must be used as we need an atomic operation.
    942  * Also note that there is what amounts to a hack to prevent recursion with
    943  * large pages such that if we are unlocking a page and the PR_CAPTURE bit is
     944  * set, we will only try to capture the page if the current thread's T_CAPTURING
    945  * flag is not set.  If the flag is set, the unlock will not try to capture
    946  * the page even though the PR_CAPTURE bit is set.
    947  *
    948  * Pages with PR_UE or PR_FMA flags are retired unconditionally, while pages
    949  * with PR_MCE are retired if the system has not retired too many of them.
    950  *
    951  * A page must be exclusively locked to be retired. Pages can be retired if
    952  * they are mapped, modified, or both, as long as they are not marked PR_UE,
    953  * since pages with uncorrectable errors cannot be relocated in memory.
    954  * Once a page has been successfully retired it is zeroed, attached to the
    955  * retired_pages vnode and, finally, PR_RETIRED is set in p_toxic. The other
    956  * p_toxic bits are NOT cleared. Pages are not left locked after retiring them
    957  * to avoid special case code throughout the kernel; rather, page_*lock() will
    958  * fail to lock the page, unless SE_RETIRED is passed as an argument.
    959  *
    960  * While we have your attention, go take a look at the comments at the
    961  * beginning of page_retire.c too.
    962  */
     963 #define	PR_OK		0x00	/* no problem */
     964 #define	PR_MCE		0x01	/* page has seen two or more CEs */
     965 #define	PR_UE		0x02	/* page has an unhandled UE */
     966 #define	PR_UE_SCRUBBED	0x04	/* page has seen a UE but was cleaned */
     967 #define	PR_FMA		0x08	/* A DE wants this page retired */
     968 #define	PR_CAPTURE	0x10	/* Generic page capture flag */
     969 #define	PR_RESV		0x20	/* Reserved for future use */
     970 #define	PR_MSG		0x40	/* message(s) already printed for this page */
     971 #define	PR_RETIRED	0x80	/* This page has been retired */
     972 
     973 #define	PR_REASONS	(PR_UE | PR_MCE | PR_FMA)	/* the bits PP_PR_REQ() looks for */
     974 #define	PR_TOXIC	(PR_UE)	/* the bit PP_TOXIC() looks for */
     975 #define	PR_ERRMASK	(PR_UE | PR_UE_SCRUBBED | PR_MCE | PR_FMA)	/* PR_REASONS plus PR_UE_SCRUBBED */
     976 #define	PR_TOXICFLAGS	(0xCF)	/* every PR_* bit above except PR_CAPTURE and PR_RESV */
     977 
     978 #define	PP_RETIRED(pp)	((pp)->p_toxic & PR_RETIRED)	/* yields the masked bit, not 0/1 */
     979 #define	PP_TOXIC(pp)	((pp)->p_toxic & PR_TOXIC)	/* yields the masked bit, not 0/1 */
     980 #define	PP_PR_REQ(pp)	(((pp)->p_toxic & PR_REASONS) && !PP_RETIRED(pp))	/* a PR_REASONS bit is set and PR_RETIRED is not */
     981 #define	PP_PR_NOSHARE(pp)						\
     982 	((((pp)->p_toxic & (PR_RETIRED | PR_FMA | PR_UE)) == PR_FMA) &&	\
     983 	!PP_ISKAS(pp))	/* PR_FMA set, PR_RETIRED and PR_UE clear, and !PP_ISKAS(pp) */
     984 
     985 /*
     986  * Flags for page_unretire_pp()
     987  */
     988 #define	PR_UNR_FREE	0x1
     989 #define	PR_UNR_CLEAN	0x2
     990 #define	PR_UNR_TEMP	0x4
    991 
    992 /*
    993  * kpm large page description.
    994  * The virtual address range of segkpm is divided into chunks of
     995  * kpm_pgsz. Each chunk is controlled by a kpm_page_t. The short
     996  * is sufficient for 2^15 * PAGESIZE, so e.g. the maximum kpm_pgsz
     997  * for 8K is 256M and 2G for 64K pages. It is kept as small as
    998  * possible to save physical memory space.
    999  *
    1000  * There are 2 segkpm mapping windows within the virtual address
   1001  * space when we have to prevent VAC alias conflicts. The so called
   1002  * Alias window (mappings are always by PAGESIZE) is controlled by
   1003  * kp_refcnta. The regular window is controlled by kp_refcnt for the
   1004  * normal operation, which is to use the largest available pagesize.
   1005  * When VAC alias conflicts are present within a chunk in the regular
   1006  * window the large page mapping is broken up into smaller PAGESIZE
    1007  * mappings. kp_refcntc is used to control the pages that are involved
   1008  * in the conflict and kp_refcnts holds the active mappings done
   1009  * with the small page size. In non vac conflict mode kp_refcntc is
   1010  * also used as "go" indication (-1) for the trap level tsbmiss
   1011  * handler.
   1012  */
    1013 typedef struct kpm_page {
    1014 	short kp_refcnt;	/* pages mapped large */
    1015 	short kp_refcnta;	/* pages mapped in Alias window */
    1016 	short kp_refcntc;	/* TL-tsbmiss flag (-1 is the "go" indication in non vac conflict mode); #vac alias conflict pages */
    1017 	short kp_refcnts;	/* vac alias: pages mapped small */
    1018 } kpm_page_t;
   1019 
    1020 /*
    1021  * Note: khl_lock offset changes must be reflected in sfmmu_asm.s
    1022  */
    1023 typedef struct kpm_hlk {
    1024 	kmutex_t khl_mutex;	/* kpm_page mutex */
    1025 	uint_t   khl_lock;	/* trap level tsbmiss handling; its offset is also known to sfmmu_asm.s */
    1026 } kpm_hlk_t;
   1027 
   1028 /*
   1029  * kpm small page description.
   1030  * When kpm_pgsz is equal to PAGESIZE a smaller representation is used
   1031  * to save memory space. Alias range mappings and regular segkpm
   1032  * mappings are done in units of PAGESIZE and can share the mapping
   1033  * information and the mappings are always distinguishable by their
   1034  * virtual address. Other information needed for VAC conflict prevention
   1035  * is already available on a per page basis.
   1036  *
   1037  * The state about how a kpm page is mapped and whether it is ready to go
   1038  * is indicated by the following 1 byte kpm_spage structure. This byte is
   1039  * split into two 4-bit parts - kp_mapped and kp_mapped_go.
   1040  * 	- kp_mapped == 1	the page is mapped cacheable
   1041  *	- kp_mapped == 2	the page is mapped non-cacheable
   1042  *	- kp_mapped_go == 1	the mapping is ready to be dropped in
   1043  *	- kp_mapped_go == 0	the mapping is not ready to be dropped in.
   1044  * When kp_mapped_go == 0, we will have C handler resolve the VAC conflict.
   1045  * Otherwise, the assembly tsb miss handler can simply drop in the mapping
   1046  * when a tsb miss occurs.
   1047  */
    1048 typedef union kpm_spage {
    1049 	struct {
    1050 #ifdef  _BIG_ENDIAN	/* NOTE(review): field order is flipped per endianness, presumably to keep the nibble layout of kp_mapped_flag fixed -- confirm */
    1051 		uchar_t mapped_go: 4;	/* go or nogo flag */
    1052 		uchar_t mapped: 4;	/* page mapped small */
    1053 #else
    1054 		uchar_t mapped: 4;	/* page mapped small */
    1055 		uchar_t mapped_go: 4;	/* go or nogo flag */
    1056 #endif	/* _BIG_ENDIAN */
    1057 	} kpm_spage_un;
    1058 	uchar_t kp_mapped_flag;	/* the same byte viewed whole, i.e. both 4-bit fields at once */
    1059 } kpm_spage_t;
    1060 
    1061 #define	kp_mapped	kpm_spage_un.mapped	/* shorthand for the nested bit-field */
    1062 #define	kp_mapped_go	kpm_spage_un.mapped_go	/* shorthand for the nested bit-field */
   1063 
    1064 /*
    1065  * Note: kshl_lock offset changes must be reflected in sfmmu_asm.s
    1066  */
    1067 typedef struct kpm_shlk {
    1068 	uint_t   kshl_lock;	/* trap level tsbmiss handling; its offset is also known to sfmmu_asm.s */
    1069 } kpm_shlk_t;
   1070 
   1071 /*
   1072  * Each segment of physical memory is described by a memseg struct.
   1073  * Within a segment, memory is considered contiguous. The members
   1074  * can be categorized as follows:
   1075  * . Platform independent:
   1076  *         pages, epages, pages_base, pages_end, next, lnext.
   1077  * . 64bit only but platform independent:
   1078  *         kpm_pbase, kpm_nkpmpgs, kpm_pages, kpm_spages.
   1079  * . Really platform or mmu specific:
   1080  *         pagespa, epagespa, nextpa, kpm_pagespa.
   1081  * . Mixed:
   1082  *         msegflags.
   1083  */
    1084 struct memseg {
    1085 	page_t *pages, *epages;		/* [from, to] in page array */
    1086 	pfn_t pages_base, pages_end;	/* [from, to] in page numbers; NOTE(review): MSEG_NPAGES() is pages_end - pages_base, so pages_end looks exclusive -- confirm */
    1087 	struct memseg *next;		/* next segment in list */
    1088 #if defined(__sparc)	/* NOTE(review): every member down to the #endif, lnext and kpm_* included, exists only in sparc builds */
    1089 	struct memseg *lnext;		/* next segment in deleted list */
    1090 	uint64_t pagespa, epagespa;	/* [from, to] page array physical */
    1091 	uint64_t nextpa;		/* physical next pointer */
    1092 	pfn_t	kpm_pbase;		/* start of kpm range */
    1093 	pgcnt_t kpm_nkpmpgs;		/* # of kpm_pgsz pages */
    1094 	union _mseg_un {
    1095 		kpm_page_t  *kpm_lpgs;	/* ptr to kpm_page array */
    1096 		kpm_spage_t *kpm_spgs;	/* ptr to kpm_spage array */
    1097 	} mseg_un;
    1098 	uint64_t kpm_pagespa;		/* physical ptr to kpm (s)pages array */
    1099 	uint_t msegflags;		/* memseg flags */
    1100 #endif /* __sparc */
    1101 };
   1102 
    1103 /* memseg union aliases (the mseg_un member itself is declared only under __sparc) */
    1104 #define	kpm_pages	mseg_un.kpm_lpgs
    1105 #define	kpm_spages	mseg_un.kpm_spgs
    1106 
    1107 /* msegflags */
    1108 #define	MEMSEG_DYNAMIC		0x1	/* DR: memory was added dynamically */
    1109 #define	MEMSEG_META_INCL	0x2	/* DR: memseg includes its metadata */
    1110 #define	MEMSEG_META_ALLOC	0x4	/* DR: memseg allocated its metadata */
    1111 
    1112 /* memseg support macros */
    1113 #define	MSEG_NPAGES(SEG)	((SEG)->pages_end - (SEG)->pages_base)	/* difference of the two pfn bounds */
    1114 
    1115 /* memseg hash */
    1116 #define	MEM_HASH_SHIFT		0x9
    1117 #define	N_MEM_SLOTS		0x200		/* must be a power of 2; equals 1 << MEM_HASH_SHIFT */
    1118 #define	MEMSEG_PFN_HASH(pfn)	(((pfn)/mhash_per_slot) & (N_MEM_SLOTS - 1))	/* the & mask is why N_MEM_SLOTS must be a power of 2 */
   1119 
   1120 /* memseg  externals */
   1121 extern struct memseg *memsegs;		/* list of memory segments */
   1122 extern ulong_t mhash_per_slot;
   1123 extern uint64_t memsegspa;		/* memsegs as physical address */
   1124 
   1125 void build_pfn_hash();
   1126 extern struct memseg *page_numtomemseg_nolock(pfn_t pfnum);
   1127 
   1128 /*
   1129  * page capture related info:
   1130  * The page capture routines allow us to asynchronously capture given pages
   1131  * for the explicit use of the requestor.  New requestors can be added by
   1132  * explicitly adding themselves to the PC_* flags below and incrementing
   1133  * PC_NUM_CALLBACKS as necessary.
   1134  *
   1135  * Subsystems using page capture must register a callback before attempting
   1136  * to capture a page.  A duration of -1 will indicate that we will never give
   1137  * up while trying to capture a page and will only stop trying to capture the
   1138  * given page once we have successfully captured it.  Thus the user needs to be
   1139  * aware of the behavior of all callers who have a duration of -1.
   1140  *
   1141  * For now, only /dev/physmem and page retire use the page capture interface
   1142  * and only a single request can be outstanding for a given page.  Thus, if
    1143  * /dev/physmem wants a page and page retire also wants the same page, only
   1144  * the page retire request will be honored until the point in time that the
   1145  * page is actually retired, at which point in time, subsequent requests by
   1146  * /dev/physmem will succeed if the CAPTURE_GET_RETIRED flag was set.
   1147  */
   1148 
    1149 #define	PC_RETIRE		(0)	/* requestor number for page retire */
    1150 #define	PC_PHYSMEM		(1)	/* requestor number for /dev/physmem */
    1151 #define	PC_NUM_CALLBACKS	(2)	/* count of PC_* requestors defined above */
    1152 #define	PC_MASK			((1 << PC_NUM_CALLBACKS) - 1)	/* 0x3: one bit per requestor */
    1153 
    1154 #define	CAPTURE_RETIRE		(1 << PC_RETIRE)	/* 0x1 */
    1155 #define	CAPTURE_PHYSMEM		(1 << PC_PHYSMEM)	/* 0x2 */
    1156 
    1157 #define	CAPTURE_ASYNC		(0x0200)	/* lies outside PC_MASK */
    1158 
    1159 #define	CAPTURE_GET_RETIRED	(0x1000)	/* lies outside PC_MASK */
    1160 #define	CAPTURE_GET_CAGE	(0x2000)	/* lies outside PC_MASK */
   1161 
    1162 struct page_capture_callback {
    1163 	int cb_active;		/* 1 means active, 0 means inactive */
    1164 	clock_t duration;	/* the length in time that we'll attempt to */
    1165 				/* capture this page asynchronously. (in HZ) */
    1166 	krwlock_t cb_rwlock;	/* NOTE(review): presumably guards this entry -- confirm in the page capture code */
    1167 	int (*cb_func)(page_t *, void *, uint_t); /* callback function; same signature page_capture_register_callback() takes */
    1168 };
   1169 
    1170 extern kcondvar_t pc_cv;	/* NOTE(review): what this condition variable signals is not visible in this header */
    1171 
    1172 void page_capture_register_callback(uint_t index, clock_t duration,
    1173     int (*cb_func)(page_t *, void *, uint_t));	/* a duration of -1 means the capture attempt never gives up */
    1174 void page_capture_unregister_callback(uint_t index);
    1175 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);	/* NOTE(review): flags presumably take CAPTURE_* values -- confirm */
    1176 void page_unlock_capture(page_t *pp);
    1177 int page_capture_unretire_pp(page_t *);
    1178 
    1179 extern int memsegs_trylock(int);
    1180 extern void memsegs_lock(int);
    1181 extern void memsegs_unlock(int);
    1182 extern int memsegs_lock_held(void);
    1183 extern void memlist_read_lock(void);
    1184 extern void memlist_read_unlock(void);
    1185 extern void memlist_write_lock(void);
    1186 extern void memlist_write_unlock(void);
   1187 
   1188 #ifdef	__cplusplus
   1189 }
   1190 #endif
   1191 
   1192 #endif	/* _VM_PAGE_H */
   1193