Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
     27 /*	All Rights Reserved   */
     28 
     29 /*
     30  * Portions of this source code were derived from Berkeley 4.3 BSD
     31  * under license from the Regents of the University of California.
     32  */
     33 
     34 
     35 /*
     36  * This file contains common functions to access and manage the page lists.
     37  * Many of these routines originated from platform dependent modules
     38  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
     39  * a platform independent manner.
     40  *
     41  * vm/vm_dep.h provides for platform specific support.
     42  */
     43 
     44 #include <sys/types.h>
     45 #include <sys/debug.h>
     46 #include <sys/cmn_err.h>
     47 #include <sys/systm.h>
     48 #include <sys/atomic.h>
     49 #include <sys/sysmacros.h>
     50 #include <vm/as.h>
     51 #include <vm/page.h>
     52 #include <vm/seg_kmem.h>
     53 #include <vm/seg_vn.h>
     54 #include <sys/vmsystm.h>
     55 #include <sys/memnode.h>
     56 #include <vm/vm_dep.h>
     57 #include <sys/lgrp.h>
     58 #include <sys/mem_config.h>
     59 #include <sys/callb.h>
     60 #include <sys/mem_cage.h>
     61 #include <sys/sdt.h>
     62 #include <sys/dumphdr.h>
     63 
     64 extern uint_t	vac_colors;
     65 
     66 #define	MAX_PRAGMA_ALIGN	128
     67 
     68 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
     69 
     70 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
     71 #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
     72 #else
     73 #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
     74 #endif
     75 char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
     76 
     77 /*
     78  * number of page colors equivalent to requested color in page_get routines.
     79  * If set, keeps large pages intact longer and keeps MPO allocation
     80  * from the local mnode in favor of acquiring the 'correct' page color from
     81  * a demoted large page or from a remote mnode.
     82  */
     83 uint_t	colorequiv;
     84 
     85 /*
     86  * color equivalency mask for each page size.
     87  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
     88  * High 4 bits determine the number of high order bits of the color to ignore.
     89  * Low 4 bits determine the number of low order bits of color to ignore (it's only
     90  * relevant for hashed index based page coloring).
     91  */
     92 uchar_t colorequivszc[MMU_PAGE_SIZES];
     93 
     94 /*
     95  * if set, specifies the percentage of large pages that are free from within
     96  * a large page region before attempting to lock those pages for
     97  * page_get_contig_pages processing.
     98  *
     99  * Should be turned on when kpr is available when page_trylock_contig_pages
    100  * can be more selective.
    101  */
    102 
    103 int	ptcpthreshold;
    104 
    105 /*
    106  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
    107  * Enabled by default via pgcplimitsearch.
    108  *
    109  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
    110  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
    111  * bound. This upper bound range guarantees:
    112  *    - all large page 'slots' will be searched over time
    113  *    - the minimum (1) large page candidates considered on each pgcp call
    114  *    - count doesn't wrap around to 0
    115  */
    116 pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
    117 int	pgcplimitsearch = 1;
    118 
    119 #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
    120 #define	SETPGCPFAILCNT(szc)						\
    121 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
    122 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
    123 
    124 #ifdef VM_STATS
    125 struct vmm_vmstats_str  vmm_vmstats;
    126 
    127 #endif /* VM_STATS */
    128 
    129 #if defined(__sparc)
    130 #define	LPGCREATE	0
    131 #else
    132 /* enable page_get_contig_pages */
    133 #define	LPGCREATE	1
    134 #endif
    135 
    136 int pg_contig_disable;
    137 int pg_lpgcreate_nocage = LPGCREATE;
    138 
    139 /*
    140  * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
    141  */
    142 #define	PFNNULL		0
    143 
    144 /* Flags involved in promotion and demotion routines */
    145 #define	PC_FREE		0x1	/* put page on freelist */
    146 #define	PC_ALLOC	0x2	/* return page for allocation */
    147 
    148 /*
    149  * Flag for page_demote to be used with PC_FREE to denote that we don't care
    150  * what the color is as the color parameter to the function is ignored.
    151  */
    152 #define	PC_NO_COLOR	(-1)
    153 
    154 /* mtype value for page_promote to use when mtype does not matter */
    155 #define	PC_MTYPE_ANY	(-1)
    156 
    157 /*
    158  * page counters candidates info
    159  * See page_ctrs_cands comment below for more details.
    160  * fields are as follows:
    161  *	pcc_pages_free:		# pages which freelist coalesce can create
    162  *	pcc_color_free:		pointer to page free counts per color
    163  */
    164 typedef struct pcc_info {
    165 	pgcnt_t	pcc_pages_free;		/* # pages freelist coalesce can create */
    166 	pgcnt_t	*pcc_color_free;	/* per-color array of free page counts */
    167 	uint_t	pad[12];		/* NOTE(review): presumably padding; purpose not shown here */
    168 } pcc_info_t;
    169 
    170 /*
    171  * On big machines it can take a long time to check page_counters
    172  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
    173  * updated sum of all elements of the corresponding page_counters arrays.
    174  * page_freelist_coalesce() searches page_counters only if an appropriate
    175  * element of page_ctrs_cands array is greater than 0.
    176  *
    177  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
    178  */
    179 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
    180 
    181 /*
    182  * Return in val the total number of free pages which can be created
    183  * for the given mnode (m), mrange (g), and region size (r)
    184  */
    185 #define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
    186 	int i;								\
    187 	val = 0;							\
    188 	for (i = 0; i < NPC_MUTEX; i++) {				\
    189 	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
    190 	}								\
    191 }
    192 
    193 /*
    194  * Return in val the total number of free pages which can be created
    195  * for the given mnode (m), mrange (g), region size (r), and color (c)
    196  */
    197 #define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
    198 	int i;								\
    199 	val = 0;							\
    200 	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
    201 	for (i = 0; i < NPC_MUTEX; i++) {				\
    202 	    val +=							\
    203 		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
    204 	}								\
    205 }
    206 
    207 /*
    208  * We can only allow a single thread to update a counter within the physical
    209  * range of the largest supported page size. That is the finest granularity
    210  * possible since the counter values are dependent on each other
    211  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
    212  * ctr_mutex lock index for a particular physical range.
    213  */
    214 static kmutex_t	*ctr_mutex[NPC_MUTEX];
    215 
    216 #define	PP_CTR_LOCK_INDX(pp)						\
    217 	(((pp)->p_pagenum >>						\
    218 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
    219 
    220 #define	INVALID_COLOR 0xffffffff
    221 #define	INVALID_MASK  0xffffffff
    222 
    223 /*
    224  * Local functions prototypes.
    225  */
    226 
    227 void page_ctr_add(int, int, page_t *, int);
    228 void page_ctr_add_internal(int, int, page_t *, int);
    229 void page_ctr_sub(int, int, page_t *, int);
    230 void page_ctr_sub_internal(int, int, page_t *, int);
    231 void page_freelist_lock(int);
    232 void page_freelist_unlock(int);
    233 page_t *page_promote(int, pfn_t, uchar_t, int, int);
    234 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
    235 page_t *page_freelist_split(uchar_t,
    236     uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
    237 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
    238 static int page_trylock_cons(page_t *pp, se_t se);
    239 
    240 /*
    241  * The page_counters array below is used to keep track of free contiguous
    242  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
    243  * This contains an array of counters, the size of the array, a shift value
    244  * used to convert a pagenum into a counter array index or vice versa, as
    245  * well as a cache of the last successful index to be promoted to a larger
    246  * page size.  As an optimization, we keep track of the last successful index
    247  * to be promoted per page color for the given size region, and this is
    248  * allocated dynamically based upon the number of colors for a given
    249  * region size.
    250  *
    251  * Conceptually, the page counters are represented as:
    252  *
    253  *	page_counters[region_size][mnode]
    254  *
    255  *	region_size:	size code of a candidate larger page made up
    256  *			of contiguous free smaller pages.
    257  *
    258  *	page_counters[region_size][mnode].hpm_counters[index]:
    259  *		represents how many (region_size - 1) pages either
    260  *		exist or can be created within the given index range.
    261  *
    262  * Let's look at a sparc example:
    263  *	If we want to create a free 512k page, we look at region_size 2
    264  *	for the mnode we want.  We calculate the index and look at a specific
    265  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
    266  *	this location, it means that 8 64k pages either exist or can be created
    267  *	from 8K pages in order to make a single free 512k page at the given
    268  *	index.  Note that when a region is full, it will contribute to the
    269  *	counts in the region above it.  Thus we will not know what page
    270  *	size the free pages will be which can be promoted to this new free
    271  *	page unless we look at all regions below the current region.
    272  */
    273 
    274 /*
    275  * Note: hpmctr_t is defined in platform vm_dep.h
    276  * hw_page_map_t contains all the information needed for the page_counters
    277  * logic. The fields are as follows:
    278  *
    279  *	hpm_counters:	dynamically allocated array to hold counter data
    280  *	hpm_entries:	entries in hpm_counters
    281  *	hpm_shift:	shift for pnum/array index conv
    282  *	hpm_base:	PFN mapped to counter index 0
    283  *	hpm_color_current:	last index in counter array for this color at
    284  *				which we successfully created a large page
    285  */
    286 typedef struct hw_page_map {
    287 	hpmctr_t	*hpm_counters;	/* dynamically allocated counter array */
    288 	size_t		hpm_entries;	/* number of entries in hpm_counters */
    289 	int		hpm_shift;	/* shift converting pfn <-> counter index */
    290 	pfn_t		hpm_base;	/* pfn that maps to counter index 0 */
    291 	size_t		*hpm_color_current[MAX_MNODE_MRANGES];	/* per mrange, per color: last promoted index */
    292 #if defined(__sparc)
    293 	uint_t		pad[4];		/* NOTE(review): sparc-only padding; reason not shown here */
    294 #endif
    295 } hw_page_map_t;
    296 
    297 /*
    298  * Element zero is not used, but is allocated for convenience.
    299  */
    300 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
    301 
    302 /*
    303  * Cached value of MNODE_RANGE_CNT(mnode).
    304  * This is a function call in x86.
    305  */
    306 static int mnode_nranges[MAX_MEM_NODES];
    307 static int mnode_maxmrange[MAX_MEM_NODES];
    308 
    309 /*
    310  * The following macros are convenient ways to get access to the individual
    311  * elements of the page_counters arrays.  They can be used on both
    312  * the left side and right side of equations.
    313  */
    314 #define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
    315 	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
    316 
    317 #define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
    318 	(page_counters[(rg_szc)][(mnode)].hpm_counters)
    319 
    320 #define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
    321 	(page_counters[(rg_szc)][(mnode)].hpm_shift)
    322 
    323 #define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
    324 	(page_counters[(rg_szc)][(mnode)].hpm_entries)
    325 
    326 #define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
    327 	(page_counters[(rg_szc)][(mnode)].hpm_base)
    328 
    329 #define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
    330 	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
    331 
    332 #define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
    333 	(page_counters[(rg_szc)][(mnode)].				\
    334 	hpm_color_current[(mrange)][(color)])
    335 
    336 #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
    337 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
    338 		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
    339 
    340 #define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
    341 	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
    342 		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
    343 
    344 /*
    345  * Protects the hpm_counters and hpm_color_current memory from changing while
    346  * looking at page counters information.
    347  * Grab the write lock to modify what these fields point at.
    348  * Grab the read lock to prevent any pointers from changing.
    349  * The write lock can not be held during memory allocation due to a possible
    350  * recursion deadlock with trying to grab the read lock while the
    351  * write lock is already held.
    352  */
    353 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
    354 
    355 
    356 /*
    357  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
    358  */
    359 void
    360 cpu_vm_data_init(struct cpu *cp)
    361 {
    362 	if (cp == CPU0) {
    363 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
    364 	} else {
    365 		void	*kmptr;
    366 		int	align;
    367 		size_t	sz;
    368 
    369 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
    370 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
    371 		kmptr = kmem_zalloc(sz, KM_SLEEP);
    372 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
    373 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
    374 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
    375 	}
    376 }
    377 
    378 /*
    379  * free cpu_vm_data
    380  */
    381 void
    382 cpu_vm_data_destroy(struct cpu *cp)
    383 {
    384 	if (cp->cpu_seqid && cp->cpu_vm_data) {
    385 		ASSERT(cp != CPU0);
    386 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
    387 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
    388 	}
    389 	cp->cpu_vm_data = NULL;
    390 }
    391 
    392 
    393 /*
    394  * page size to page size code
    395  */
    396 int
    397 page_szc(size_t pagesize)
    398 {
    399 	int	i = 0;
    400 
    401 	while (hw_page_array[i].hp_size) {
    402 		if (pagesize == hw_page_array[i].hp_size)
    403 			return (i);
    404 		i++;
    405 	}
    406 	return (-1);
    407 }
    408 
    409 /*
    410  * page size to page size code with the restriction that it be a supported
    411  * user page size.  If it's not a supported user page size, -1 will be returned.
    412  */
    413 int
    414 page_szc_user_filtered(size_t pagesize)
    415 {
    416 	int szc = page_szc(pagesize);
    417 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
    418 		return (szc);
    419 	}
    420 	return (-1);
    421 }
    422 
    423 /*
    424  * Return how many page sizes are available for the user to use.  This is
    425  * what the hardware supports and not based upon how the OS implements the
    426  * support of different page sizes.
    427  *
    428  * If legacy is non-zero, return the number of pagesizes available to legacy
    429  * applications. The number of legacy page sizes might be less than the
    430  * exported user page sizes. This is to prevent legacy applications that
    431  * use the largest page size returned from getpagesizes(3c) from inadvertantly
    432  * using the 'new' large pagesizes.
    433  */
    434 uint_t
    435 page_num_user_pagesizes(int legacy)
    436 {
    437 	if (legacy)
    438 		return (mmu_legacy_page_sizes);
    439 	return (mmu_exported_page_sizes);
    440 }
    441 
    442 uint_t
    443 page_num_pagesizes(void)
    444 {
    445 	return (mmu_page_sizes);
    446 }
    447 
    448 /*
    449  * returns the count of the number of base pagesize pages associated with szc
    450  */
    451 pgcnt_t
    452 page_get_pagecnt(uint_t szc)
    453 {
    454 	if (szc >= mmu_page_sizes)
    455 		panic("page_get_pagecnt: out of range %d", szc);
    456 	return (hw_page_array[szc].hp_pgcnt);
    457 }
    458 
    459 size_t
    460 page_get_pagesize(uint_t szc)
    461 {
    462 	if (szc >= mmu_page_sizes)
    463 		panic("page_get_pagesize: out of range %d", szc);
    464 	return (hw_page_array[szc].hp_size);
    465 }
    466 
    467 /*
    468  * Return the size of a page based upon the index passed in.  An index of
    469  * zero refers to the smallest page size in the system, and as index increases
    470  * it refers to the next larger supported page size in the system.
    471  * Note that szc and userszc may not be the same due to unsupported szc's on
    472  * some systems.
    473  */
    474 size_t
    475 page_get_user_pagesize(uint_t userszc)
    476 {
    477 	uint_t szc = USERSZC_2_SZC(userszc);
    478 
    479 	if (szc >= mmu_page_sizes)
    480 		panic("page_get_user_pagesize: out of range %d", szc);
    481 	return (hw_page_array[szc].hp_size);
    482 }
    483 
    484 uint_t
    485 page_get_shift(uint_t szc)
    486 {
    487 	if (szc >= mmu_page_sizes)
    488 		panic("page_get_shift: out of range %d", szc);
    489 	return (PAGE_GET_SHIFT(szc));
    490 }
    491 
    492 uint_t
    493 page_get_pagecolors(uint_t szc)
    494 {
    495 	if (szc >= mmu_page_sizes)
    496 		panic("page_get_pagecolors: out of range %d", szc);
    497 	return (PAGE_GET_PAGECOLORS(szc));
    498 }
    499 
    500 /*
    501  * this assigns the desired equivalent color after a split
    502  */
    503 uint_t
    504 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
    505     uint_t ncolor, uint_t ceq_mask)
    506 {
    507 	ASSERT(nszc > szc);
    508 	ASSERT(szc < mmu_page_sizes);
    509 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
    510 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
    511 
    512 	color &= ceq_mask;
    513 	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
    514 	return (color | (ncolor & ~ceq_mask));
    515 }
    516 
    517 /*
    518  * The interleaved_mnodes flag is set when mnodes overlap in
    519  * the physbase..physmax range, but have disjoint slices.
    520  * In this case hpm_counters is shared by all mnodes.
    521  * This flag is set dynamically by the platform.
    522  */
    523 int interleaved_mnodes = 0;
    524 
    525 /*
    526  * Called by startup().
    527  * Size up the per page size free list counters based on physmax
    528  * of each node and max_mem_nodes.
    529  *
    530  * If interleaved_mnodes is set we need to find the first mnode that
    531  * exists. hpm_counters for the first mnode will then be shared by
    532  * all other mnodes. If interleaved_mnodes is not set, just set
    533  * first=mnode each time. That means there will be no sharing.
    534  */
    535 size_t
    536 page_ctrs_sz(void)
    537 {
    538 	int	r;		/* region size */
    539 	int	mnode;
    540 	int	firstmn;	/* first mnode that exists */
    541 	int	nranges;
    542 	pfn_t	physbase;
    543 	pfn_t	physmax;
    544 	uint_t	ctrs_sz = 0;
    545 	int 	i;
    546 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
    547 
    548 	/*
    549 	 * We need to determine how many page colors there are for each
    550 	 * page size in order to allocate memory for any color specific
    551 	 * arrays.
    552 	 */
    553 	for (i = 0; i < mmu_page_sizes; i++) {
    554 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
    555 	}
    556 
    557 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
    558 
    559 		pgcnt_t r_pgcnt;
    560 		pfn_t   r_base;
    561 		pgcnt_t r_align;
    562 
    563 		if (mem_node_config[mnode].exists == 0)
    564 			continue;
    565 
    566 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
    567 		nranges = MNODE_RANGE_CNT(mnode);
    568 		mnode_nranges[mnode] = nranges;
    569 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
    570 
    571 		/*
    572 		 * determine size needed for page counter arrays with
    573 		 * base aligned to large page size.
    574 		 */
    575 		for (r = 1; r < mmu_page_sizes; r++) {
    576 			/* add in space for hpm_color_current */
    577 			ctrs_sz += sizeof (size_t) *
    578 			    colors_per_szc[r] * nranges;
    579 
    580 			if (firstmn != mnode)
    581 				continue;
    582 
    583 			/* add in space for hpm_counters */
    584 			r_align = page_get_pagecnt(r);
    585 			r_base = physbase;
    586 			r_base &= ~(r_align - 1);
    587 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
    588 
    589 			/*
    590 			 * Round up to always allocate on pointer sized
    591 			 * boundaries.
    592 			 */
    593 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
    594 			    sizeof (hpmctr_t *));
    595 		}
    596 	}
    597 
    598 	for (r = 1; r < mmu_page_sizes; r++) {
    599 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
    600 	}
    601 
    602 	/* add in space for page_ctrs_cands and pcc_color_free */
    603 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
    604 	    mmu_page_sizes * NPC_MUTEX;
    605 
    606 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
    607 
    608 		if (mem_node_config[mnode].exists == 0)
    609 			continue;
    610 
    611 		nranges = mnode_nranges[mnode];
    612 		ctrs_sz += sizeof (pcc_info_t) * nranges *
    613 		    mmu_page_sizes * NPC_MUTEX;
    614 		for (r = 1; r < mmu_page_sizes; r++) {
    615 			ctrs_sz += sizeof (pgcnt_t) * nranges *
    616 			    colors_per_szc[r] * NPC_MUTEX;
    617 		}
    618 	}
    619 
    620 	/* ctr_mutex */
    621 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
    622 
    623 	/* size for page list counts */
    624 	PLCNT_SZ(ctrs_sz);
    625 
    626 	/*
    627 	 * add some slop for roundups. page_ctrs_alloc will roundup the start
    628 	 * address of the counters to ecache_alignsize boundary for every
    629 	 * memory node.
    630 	 */
    631 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
    632 }
    633 
    634 caddr_t
    635 page_ctrs_alloc(caddr_t alloc_base)
    636 {
    637 	int	mnode;
    638 	int	mrange, nranges;
    639 	int	r;		/* region size */
    640 	int	i;
    641 	int	firstmn;	/* first mnode that exists */
    642 	pfn_t	physbase;
    643 	pfn_t	physmax;
    644 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
    645 
    646 	/*
    647 	 * We need to determine how many page colors there are for each
    648 	 * page size in order to allocate memory for any color specific
    649 	 * arrays.
    650 	 */
    651 	for (i = 0; i < mmu_page_sizes; i++) {
    652 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
    653 	}
    654 
    655 	for (r = 1; r < mmu_page_sizes; r++) {
    656 		page_counters[r] = (hw_page_map_t *)alloc_base;
    657 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
    658 	}
    659 
    660 	/* page_ctrs_cands and pcc_color_free array */
    661 	for (i = 0; i < NPC_MUTEX; i++) {
    662 		for (r = 1; r < mmu_page_sizes; r++) {
    663 
    664 			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
    665 			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
    666 
    667 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
    668 				pcc_info_t *pi;
    669 
    670 				if (mem_node_config[mnode].exists == 0)
    671 					continue;
    672 
    673 				nranges = mnode_nranges[mnode];
    674 
    675 				pi = (pcc_info_t *)alloc_base;
    676 				alloc_base += sizeof (pcc_info_t) * nranges;
    677 				page_ctrs_cands[i][r][mnode] = pi;
    678 
    679 				for (mrange = 0; mrange < nranges; mrange++) {
    680 					pi->pcc_color_free =
    681 					    (pgcnt_t *)alloc_base;
    682 					alloc_base += sizeof (pgcnt_t) *
    683 					    colors_per_szc[r];
    684 					pi++;
    685 				}
    686 			}
    687 		}
    688 	}
    689 
    690 	/* ctr_mutex */
    691 	for (i = 0; i < NPC_MUTEX; i++) {
    692 		ctr_mutex[i] = (kmutex_t *)alloc_base;
    693 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
    694 	}
    695 
    696 	/* initialize page list counts */
    697 	PLCNT_INIT(alloc_base);
    698 
    699 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
    700 
    701 		pgcnt_t r_pgcnt;
    702 		pfn_t	r_base;
    703 		pgcnt_t r_align;
    704 		int	r_shift;
    705 		int	nranges = mnode_nranges[mnode];
    706 
    707 		if (mem_node_config[mnode].exists == 0)
    708 			continue;
    709 
    710 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
    711 
    712 		for (r = 1; r < mmu_page_sizes; r++) {
    713 			/*
    714 			 * the page_counters base has to be aligned to the
    715 			 * page count of page size code r otherwise the counts
    716 			 * will cross large page boundaries.
    717 			 */
    718 			r_align = page_get_pagecnt(r);
    719 			r_base = physbase;
    720 			/* base needs to be aligned - lower to aligned value */
    721 			r_base &= ~(r_align - 1);
    722 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
    723 			r_shift = PAGE_BSZS_SHIFT(r);
    724 
    725 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
    726 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
    727 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
    728 			for (mrange = 0; mrange < nranges; mrange++) {
    729 				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
    730 				    r, mrange) = (size_t *)alloc_base;
    731 				alloc_base += sizeof (size_t) *
    732 				    colors_per_szc[r];
    733 			}
    734 			for (i = 0; i < colors_per_szc[r]; i++) {
    735 				uint_t color_mask = colors_per_szc[r] - 1;
    736 				pfn_t  pfnum = r_base;
    737 				size_t idx;
    738 				int mrange;
    739 				MEM_NODE_ITERATOR_DECL(it);
    740 
    741 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
    742 				if (pfnum == (pfn_t)-1) {
    743 					idx = 0;
    744 				} else {
    745 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
    746 					    color_mask, color_mask, &it);
    747 					idx = PNUM_TO_IDX(mnode, r, pfnum);
    748 					idx = (idx >= r_pgcnt) ? 0 : idx;
    749 				}
    750 				for (mrange = 0; mrange < nranges; mrange++) {
    751 					PAGE_COUNTERS_CURRENT_COLOR(mnode,
    752 					    r, i, mrange) = idx;
    753 				}
    754 			}
    755 
    756 			/* hpm_counters may be shared by all mnodes */
    757 			if (firstmn == mnode) {
    758 				PAGE_COUNTERS_COUNTERS(mnode, r) =
    759 				    (hpmctr_t *)alloc_base;
    760 				alloc_base +=
    761 				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
    762 				    sizeof (hpmctr_t *));
    763 			} else {
    764 				PAGE_COUNTERS_COUNTERS(mnode, r) =
    765 				    PAGE_COUNTERS_COUNTERS(firstmn, r);
    766 			}
    767 
    768 			/*
    769 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
    770 			 * satisfy the identity requirement.
    771 			 * We should be able to go from one to the other
    772 			 * and get consistent values.
    773 			 */
    774 			ASSERT(PNUM_TO_IDX(mnode, r,
    775 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
    776 			ASSERT(IDX_TO_PNUM(mnode, r,
    777 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
    778 		}
    779 		/*
    780 		 * Roundup the start address of the page_counters to
    781 		 * cache aligned boundary for every memory node.
    782 		 * page_ctrs_sz() has added some slop for these roundups.
    783 		 */
    784 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
    785 		    L2CACHE_ALIGN);
    786 	}
    787 
    788 	/* Initialize other page counter specific data structures. */
    789 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
    790 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
    791 	}
    792 
    793 	return (alloc_base);
    794 }
    795 
    796 /*
    797  * Functions to adjust region counters for each size free list.
    798  * Caller is responsible to acquire the ctr_mutex lock if necessary and
    799  * thus can be called during startup without locks.
    800  */
    801 /* ARGSUSED */
    802 void
    803 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
    804 {
    805 	ssize_t		r;	/* region size */
    806 	ssize_t		idx;
    807 	pfn_t		pfnum;
    808 	int		lckidx;
    809 
    810 	ASSERT(mnode == PP_2_MEM_NODE(pp));
    811 	ASSERT(mtype == PP_2_MTYPE(pp));
    812 
    813 	ASSERT(pp->p_szc < mmu_page_sizes);
    814 
    815 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
    816 
    817 	/* no counter update needed for largest page size */
    818 	if (pp->p_szc >= mmu_page_sizes - 1) {
    819 		return;
    820 	}
    821 
    822 	r = pp->p_szc + 1;
    823 	pfnum = pp->p_pagenum;
    824 	lckidx = PP_CTR_LOCK_INDX(pp);
    825 
    826 	/*
    827 	 * Increment the count of free pages for the current
    828 	 * region. Continue looping up in region size incrementing
    829 	 * count if the preceeding region is full.
    830 	 */
    831 	while (r < mmu_page_sizes) {
    832 		idx = PNUM_TO_IDX(mnode, r, pfnum);
    833 
    834 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
    835 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
    836 
    837 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
    838 			break;
    839 		} else {
    840 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
    841 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
    842 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
    843 
    844 			cand->pcc_pages_free++;
    845 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
    846 		}
    847 		r++;
    848 	}
    849 }
    850 
    851 void
    852 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
    853 {
    854 	int		lckidx = PP_CTR_LOCK_INDX(pp);
    855 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
    856 
    857 	mutex_enter(lock);
    858 	page_ctr_add_internal(mnode, mtype, pp, flags);
    859 	mutex_exit(lock);
    860 }
    861 
    862 void
    863 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
    864 {
    865 	int		lckidx;
    866 	ssize_t		r;	/* region size */
    867 	ssize_t		idx;
    868 	pfn_t		pfnum;
    869 
    870 	ASSERT(mnode == PP_2_MEM_NODE(pp));
    871 	ASSERT(mtype == PP_2_MTYPE(pp));
    872 
    873 	ASSERT(pp->p_szc < mmu_page_sizes);
    874 
    875 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
    876 
    877 	/* no counter update needed for largest page size */
    878 	if (pp->p_szc >= mmu_page_sizes - 1) {
    879 		return;
    880 	}
    881 
    882 	r = pp->p_szc + 1;
    883 	pfnum = pp->p_pagenum;
    884 	lckidx = PP_CTR_LOCK_INDX(pp);
    885 
    886 	/*
    887 	 * Decrement the count of free pages for the current
    888 	 * region. Continue looping up in region size decrementing
    889 	 * count if the preceeding region was full.
    890 	 */
    891 	while (r < mmu_page_sizes) {
    892 		idx = PNUM_TO_IDX(mnode, r, pfnum);
    893 
    894 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
    895 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
    896 
    897 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
    898 			break;
    899 		} else {
    900 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
    901 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
    902 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
    903 
    904 			ASSERT(cand->pcc_pages_free != 0);
    905 			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
    906 
    907 			cand->pcc_pages_free--;
    908 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
    909 		}
    910 		r++;
    911 	}
    912 }
    913 
    914 void
    915 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
    916 {
    917 	int		lckidx = PP_CTR_LOCK_INDX(pp);
    918 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
    919 
    920 	mutex_enter(lock);
    921 	page_ctr_sub_internal(mnode, mtype, pp, flags);
    922 	mutex_exit(lock);
    923 }
    924 
/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * It is possible this mnode did not exist at startup. In that case
 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
 * to change (a theoretical possibility on x86), which means pcc_color_free
 * arrays must be extended.
 *
 * The function works in three phases:
 *   1. Preallocate (KM_NOSLEEP) every new array into local caches.
 *   2. Under PAGE_CTRS_WRITE_LOCK, install the new arrays, copy over the
 *      overlapping part of the old counters, and stash the old pointers
 *      back into the same caches.
 *   3. After dropping the lock, free whatever the caches still hold.
 *
 * Returns 0 on success, or ENOMEM if any preallocation fails. On failure
 * the jump to cleanup happens before the write lock is taken, so the
 * installed counters are not modified.
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	pfn_t	physbase, physmax;
	size_t	old_npgs;
	/* per-size caches: new arrays before the swap, old arrays after it */
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	size_t	*old_color_array[MAX_MNODE_MRANGES];
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
	/* flat [NPC_MUTEX][MMU_PAGE_SIZES] table, indexed i * MMU_PAGE_SIZES + r */
	pcc_info_t **cands_cache;
	pcc_info_t *old_pi, *pi;
	pgcnt_t *pgcntp;
	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
	/* number of pcc_info_t entries per array currently held in cands_cache */
	int cands_cache_nranges;
	int old_maxmrange, new_maxmrange;
	int rc = 0;
	int oldmnode;

	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
	    MMU_PAGE_SIZES, KM_NOSLEEP);
	if (cands_cache == NULL)
		return (ENOMEM);

	i = -1;
	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);

	/* align the covered pfn range to PC_BASE_ALIGN on both ends */
	newbase = physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;

	/* prepare to free non-null pointers on the way out */
	cands_cache_nranges = nranges;
	bzero(ctr_cache, sizeof (ctr_cache));
	bzero(color_cache, sizeof (color_cache));

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (r = 0; r < mmu_page_sizes; r++) {
		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
		size_cache[r] = pcsz;
		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			rc = ENOMEM;
			goto cleanup;
		}
	}

	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (mrange = 0; mrange < nranges; mrange++) {
			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
			    colors_per_szc[r], KM_NOSLEEP);
			if (color_cache[r][mrange] == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
		}
	}

	/*
	 * Preallocate all of the new pcc_info_t arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
			    KM_NOSLEEP);
			if (pi == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
			/*
			 * Record the array before filling it in so that the
			 * cleanup path can free a partially populated one.
			 */
			cands_cache[i * MMU_PAGE_SIZES + r] = pi;

			for (mrange = 0; mrange < nranges; mrange++, pi++) {
				pgcntp = kmem_zalloc(colors_per_szc[r] *
				    sizeof (pgcnt_t), KM_NOSLEEP);
				if (pgcntp == NULL) {
					rc = ENOMEM;
					goto cleanup;
				}
				pi->pcc_color_free = pgcntp;
			}
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	PAGE_CTRS_WRITE_LOCK(mnode);

	/*
	 * For interleaved mnodes, find the first mnode
	 * with valid page counters since the current
	 * mnode may have just been added and not have
	 * valid page counters.
	 */
	if (interleaved_mnodes) {
		for (i = 0; i < max_mem_nodes; i++)
			if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
				break;
		ASSERT(i < max_mem_nodes);
		oldmnode = i;
	} else
		oldmnode = mnode;

	/*
	 * From here on cands_cache receives the old pcc_info_t arrays,
	 * which were sized by the old range count.
	 */
	old_nranges = mnode_nranges[mnode];
	cands_cache_nranges = old_nranges;
	mnode_nranges[mnode] = nranges;
	old_maxmrange = mnode_maxmrange[mnode];
	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
	new_maxmrange = mnode_maxmrange[mnode];

	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
		oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			old_color_array[mrange] =
			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
			    r, mrange);
		}

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		/* take ownership of the preallocated counter array */
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;

		/* update shared hpm_counters in other mnodes */
		if (interleaved_mnodes) {
			for (i = 0; i < max_mem_nodes; i++) {
				if ((i == mnode) ||
				    (mem_node_config[i].exists == 0))
					continue;
				ASSERT(
				    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
				    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
				PAGE_COUNTERS_BASE(i, r) = newbase;
			}
		}

		/*
		 * Install the new current-color arrays. Entries at or
		 * beyond nranges were never allocated, so those slots are
		 * set to NULL.
		 */
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
			    color_cache[r][mrange];
			color_cache[r][mrange] = NULL;
		}
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			uint_t color_mask = colors_per_szc[r] - 1;
			int mlo = interleaved_mnodes ? 0 : mnode;
			int mhi = interleaved_mnodes ? max_mem_nodes :
			    (mnode + 1);
			int m;
			pfn_t  pfnum;
			size_t idx;
			MEM_NODE_ITERATOR_DECL(it);

			for (m = mlo; m < mhi; m++) {
				if (mem_node_config[m].exists == 0)
					continue;
				pfnum = newbase;
				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(m, r, pfnum);
					/* fall back to 0 if out of range */
					idx = (idx < pcsz) ? idx : 0;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
					    r, mrange) != NULL)
						PAGE_COUNTERS_CURRENT_COLOR(m,
						    r, i, mrange) = idx;
				}
			}
		}

		/*
		 * cache info for freeing out of the critical path; only
		 * pointers inside [kernelheap, ekernelheap) are kept for
		 * kmem_free().
		 * NOTE(review): presumably anything outside that range was
		 * set up at boot by another allocator -- confirm.
		 */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			size_t *tmp = old_color_array[mrange];
			if ((caddr_t)tmp >= kernelheap &&
			    (caddr_t)tmp < ekernelheap) {
				color_cache[r][mrange] = tmp;
			}
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

		/* pcc_info_t and pcc_color_free */
		for (i = 0; i < NPC_MUTEX; i++) {
			pcc_info_t *epi;
			pcc_info_t *eold_pi;

			/* swap: install the new array, cache the old one */
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			old_pi = page_ctrs_cands[i][r][mnode];
			page_ctrs_cands[i][r][mnode] = pi;
			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

			/* preserve old pcc_color_free values, if any */
			if (old_pi == NULL)
				continue;

			/*
			 * when/if x86 does DR, must account for
			 * possible change in range index when
			 * preserving pcc_info
			 */
			epi = &pi[nranges];
			eold_pi = &old_pi[old_nranges];
			if (new_maxmrange > old_maxmrange) {
				pi += new_maxmrange - old_maxmrange;
			} else if (new_maxmrange < old_maxmrange) {
				old_pi += old_maxmrange - new_maxmrange;
			}
			/*
			 * Exchange corresponding entries: the installed array
			 * gets the old contents, and the cached old array
			 * receives the freshly allocated entries so that they
			 * are freed at cleanup.
			 */
			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
				pcc_info_t tmp = *pi;
				*pi = *old_pi;
				*old_pi = tmp;
			}
		}
	}
	PAGE_CTRS_WRITE_UNLOCK(mnode);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 * We come thru here to free memory when pre-alloc fails, and also to
	 * free old pointers which were recorded while locked.
	 */
cleanup:
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			if (color_cache[r][mrange] != NULL) {
				kmem_free(color_cache[r][mrange],
				    colors_per_szc[r] * sizeof (size_t));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if (pi == NULL)
				continue;
			/* nranges on pre-alloc failure, old_nranges otherwise */
			nr = cands_cache_nranges;
			for (mrange = 0; mrange < nr; mrange++, pi++) {
				pgcntp = pi->pcc_color_free;
				if (pgcntp == NULL)
					continue;
				if ((caddr_t)pgcntp >= kernelheap &&
				    (caddr_t)pgcntp < ekernelheap) {
					kmem_free(pgcntp,
					    colors_per_szc[r] *
					    sizeof (pgcnt_t));
				}
			}
			/* pi was advanced by the loop above; reload the base */
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if ((caddr_t)pi >= kernelheap &&
			    (caddr_t)pi < ekernelheap) {
				kmem_free(pi, nr * sizeof (pcc_info_t));
			}
		}
	}

	kmem_free(cands_cache,
	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
	return (rc);
}
   1278 
   1279 /*
   1280  * Cleanup the hpm_counters field in the page counters
   1281  * array.
   1282  */
   1283 void
   1284 page_ctrs_cleanup(void)
   1285 {
   1286 	int r;	/* region size */
   1287 	int i;	/* mnode index */
   1288 
   1289 	/*
   1290 	 * Get the page counters write lock while we are
   1291 	 * setting the page hpm_counters field to NULL
   1292 	 * for non-existent mnodes.
   1293 	 */
   1294 	for (i = 0; i < max_mem_nodes; i++) {
   1295 		PAGE_CTRS_WRITE_LOCK(i);
   1296 		if (mem_node_config[i].exists) {
   1297 			PAGE_CTRS_WRITE_UNLOCK(i);
   1298 			continue;
   1299 		}
   1300 		for (r = 1; r < mmu_page_sizes; r++) {
   1301 			PAGE_COUNTERS_COUNTERS(i, r) = NULL;
   1302 		}
   1303 		PAGE_CTRS_WRITE_UNLOCK(i);
   1304 	}
   1305 }
   1306 
   1307 #ifdef DEBUG
   1308 
   1309 /*
   1310  * confirm pp is a large page corresponding to szc
   1311  */
   1312 void
   1313 chk_lpg(page_t *pp, uchar_t szc)
   1314 {
   1315 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
   1316 	uint_t noreloc;
   1317 
   1318 	if (npgs == 1) {
   1319 		ASSERT(pp->p_szc == 0);
   1320 		ASSERT(pp->p_next == pp);
   1321 		ASSERT(pp->p_prev == pp);
   1322 		return;
   1323 	}
   1324 
   1325 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
   1326 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
   1327 
   1328 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
   1329 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
   1330 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
   1331 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
   1332 
   1333 	/*
   1334 	 * Check list of pages.
   1335 	 */
   1336 	noreloc = PP_ISNORELOC(pp);
   1337 	while (npgs--) {
   1338 		if (npgs != 0) {
   1339 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
   1340 			ASSERT(pp->p_next == (pp + 1));
   1341 		}
   1342 		ASSERT(pp->p_szc == szc);
   1343 		ASSERT(PP_ISFREE(pp));
   1344 		ASSERT(PP_ISAGED(pp));
   1345 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
   1346 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
   1347 		ASSERT(pp->p_vnode  == NULL);
   1348 		ASSERT(PP_ISNORELOC(pp) == noreloc);
   1349 
   1350 		pp = pp->p_next;
   1351 	}
   1352 }
   1353 #endif /* DEBUG */
   1354 
   1355 void
   1356 page_freelist_lock(int mnode)
   1357 {
   1358 	int i;
   1359 	for (i = 0; i < NPC_MUTEX; i++) {
   1360 		mutex_enter(FPC_MUTEX(mnode, i));
   1361 		mutex_enter(CPC_MUTEX(mnode, i));
   1362 	}
   1363 }
   1364 
   1365 void
   1366 page_freelist_unlock(int mnode)
   1367 {
   1368 	int i;
   1369 	for (i = 0; i < NPC_MUTEX; i++) {
   1370 		mutex_exit(FPC_MUTEX(mnode, i));
   1371 		mutex_exit(CPC_MUTEX(mnode, i));
   1372 	}
   1373 }
   1374 
   1375 /*
   1376  * add pp to the specified page list. Defaults to head of the page list
   1377  * unless PG_LIST_TAIL is specified.
   1378  */
   1379 void
   1380 page_list_add(page_t *pp, int flags)
   1381 {
   1382 	page_t		**ppp;
   1383 	kmutex_t	*pcm;
   1384 	uint_t		bin, mtype;
   1385 	int		mnode;
   1386 
   1387 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
   1388 	ASSERT(PP_ISFREE(pp));
   1389 	ASSERT(!hat_page_is_mapped(pp));
   1390 	ASSERT(hat_page_getshare(pp) == 0);
   1391 
   1392 	/*
   1393 	 * Large pages should be freed via page_list_add_pages().
   1394 	 */
   1395 	ASSERT(pp->p_szc == 0);
   1396 
   1397 	/*
   1398 	 * Don't need to lock the freelist first here
   1399 	 * because the page isn't on the freelist yet.
   1400 	 * This means p_szc can't change on us.
   1401 	 */
   1402 
   1403 	bin = PP_2_BIN(pp);
   1404 	mnode = PP_2_MEM_NODE(pp);
   1405 	mtype = PP_2_MTYPE(pp);
   1406 
   1407 	if (flags & PG_LIST_ISINIT) {
   1408 		/*
   1409 		 * PG_LIST_ISINIT is set during system startup (ie. single
   1410 		 * threaded), add a page to the free list and add to the
   1411 		 * the free region counters w/o any locking
   1412 		 */
   1413 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
   1414 
   1415 		/* inline version of page_add() */
   1416 		if (*ppp != NULL) {
   1417 			pp->p_next = *ppp;
   1418 			pp->p_prev = (*ppp)->p_prev;
   1419 			(*ppp)->p_prev = pp;
   1420 			pp->p_prev->p_next = pp;
   1421 		} else
   1422 			*ppp = pp;
   1423 
   1424 		page_ctr_add_internal(mnode, mtype, pp, flags);
   1425 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
   1426 	} else {
   1427 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
   1428 
   1429 		if (flags & PG_FREE_LIST) {
   1430 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
   1431 			ASSERT(PP_ISAGED(pp));
   1432 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
   1433 
   1434 		} else {
   1435 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
   1436 			ASSERT(pp->p_vnode);
   1437 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
   1438 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
   1439 		}
   1440 		mutex_enter(pcm);
   1441 		page_add(ppp, pp);
   1442 
   1443 		if (flags & PG_LIST_TAIL)
   1444 			*ppp = (*ppp)->p_next;
   1445 		/*
   1446 		 * Add counters before releasing pcm mutex to avoid a race with
   1447 		 * page_freelist_coalesce and page_freelist_split.
   1448 		 */
   1449 		page_ctr_add(mnode, mtype, pp, flags);
   1450 		mutex_exit(pcm);
   1451 	}
   1452 
   1453 
   1454 #if defined(__sparc)
   1455 	if (PP_ISNORELOC(pp)) {
   1456 		kcage_freemem_add(1);
   1457 	}
   1458 #endif
   1459 	/*
   1460 	 * It is up to the caller to unlock the page!
   1461 	 */
   1462 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
   1463 }
   1464 
   1465 
   1466 #ifdef __sparc
   1467 /*
   1468  * This routine is only used by kcage_init during system startup.
   1469  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
   1470  * without the overhead of taking locks and updating counters.
   1471  */
   1472 void
   1473 page_list_noreloc_startup(page_t *pp)
   1474 {
   1475 	page_t		**ppp;
   1476 	uint_t		bin;
   1477 	int		mnode;
   1478 	int		mtype;
   1479 	int		flags = 0;
   1480 
   1481 	/*
   1482 	 * If this is a large page on the freelist then
   1483 	 * break it up into smaller pages.
   1484 	 */
   1485 	if (pp->p_szc != 0)
   1486 		page_boot_demote(pp);
   1487 
   1488 	/*
   1489 	 * Get list page is currently on.
   1490 	 */
   1491 	bin = PP_2_BIN(pp);
   1492 	mnode = PP_2_MEM_NODE(pp);
   1493 	mtype = PP_2_MTYPE(pp);
   1494 	ASSERT(mtype == MTYPE_RELOC);
   1495 	ASSERT(pp->p_szc == 0);
   1496 
   1497 	if (PP_ISAGED(pp)) {
   1498 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
   1499 		flags |= PG_FREE_LIST;
   1500 	} else {
   1501 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
   1502 		flags |= PG_CACHE_LIST;
   1503 	}
   1504 
   1505 	ASSERT(*ppp != NULL);
   1506 
   1507 	/*
   1508 	 * Delete page from current list.
   1509 	 */
   1510 	if (*ppp == pp)
   1511 		*ppp = pp->p_next;		/* go to next page */
   1512 	if (*ppp == pp) {
   1513 		*ppp = NULL;			/* page list is gone */
   1514 	} else {
   1515 		pp->p_prev->p_next = pp->p_next;
   1516 		pp->p_next->p_prev = pp->p_prev;
   1517 	}
   1518 
   1519 	/*
   1520 	 * Decrement page counters
   1521 	 */
   1522 	page_ctr_sub_internal(mnode, mtype, pp, flags);
   1523 
   1524 	/*
   1525 	 * Set no reloc for cage initted pages.
   1526 	 */
   1527 	PP_SETNORELOC(pp);
   1528 
   1529 	mtype = PP_2_MTYPE(pp);
   1530 	ASSERT(mtype == MTYPE_NORELOC);
   1531 
   1532 	/*
   1533 	 * Get new list for page.
   1534 	 */
   1535 	if (PP_ISAGED(pp)) {
   1536 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
   1537 	} else {
   1538 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
   1539 	}
   1540 
   1541 	/*
   1542 	 * Insert page on new list.
   1543 	 */
   1544 	if (*ppp == NULL) {
   1545 		*ppp = pp;
   1546 		pp->p_next = pp->p_prev = pp;
   1547 	} else {
   1548 		pp->p_next = *ppp;
   1549 		pp->p_prev = (*ppp)->p_prev;
   1550 		(*ppp)->p_prev = pp;
   1551 		pp->p_prev->p_next = pp;
   1552 	}
   1553 
   1554 	/*
   1555 	 * Increment page counters
   1556 	 */
   1557 	page_ctr_add_internal(mnode, mtype, pp, flags);
   1558 
   1559 	/*
   1560 	 * Update cage freemem counter
   1561 	 */
   1562 	atomic_add_long(&kcage_freemem, 1);
   1563 }
   1564 #else	/* __sparc */
   1565 
/*
 * Stub built when __sparc is not defined: panics unconditionally if it is
 * ever called; pp is not used.
 */
/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
   1572 #endif
   1573 
   1574 void
   1575 page_list_add_pages(page_t *pp, int flags)
   1576 {
   1577 	kmutex_t *pcm;
   1578 	pgcnt_t	pgcnt;
   1579 	uint_t	bin, mtype, i;
   1580 	int	mnode;
   1581 
   1582 	/* default to freelist/head */
   1583 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
   1584 
   1585 	CHK_LPG(pp, pp->p_szc);
   1586 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
   1587 
   1588 	bin = PP_2_BIN(pp);
   1589 	mnode = PP_2_MEM_NODE(pp);
   1590 	mtype = PP_2_MTYPE(pp);
   1591 
   1592 	if (flags & PG_LIST_ISINIT) {
   1593 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
   1594 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
   1595 		ASSERT(!PP_ISNORELOC(pp));
   1596 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
   1597 	} else {
   1598 
   1599 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
   1600 
   1601 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
   1602 
   1603 		mutex_enter(pcm);
   1604 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
   1605 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
   1606 		mutex_exit(pcm);
   1607 
   1608 		pgcnt = page_get_pagecnt(pp->p_szc);
   1609 #if defined(__sparc)
   1610 		if (PP_ISNORELOC(pp))
   1611 			kcage_freemem_add(pgcnt);
   1612 #endif
   1613 		for (i = 0; i < pgcnt; i++, pp++)
   1614 			page_unlock_nocapture(pp);
   1615 	}
   1616 }
   1617 
   1618 /*
   1619  * During boot, need to demote a large page to base
   1620  * pagesize pages for seg_kmem for use in boot_alloc()
   1621  */
   1622 void
   1623 page_boot_demote(page_t *pp)
   1624 {
   1625 	ASSERT(pp->p_szc != 0);
   1626 	ASSERT(PP_ISFREE(pp));
   1627 	ASSERT(PP_ISAGED(pp));
   1628 
   1629 	(void) page_demote(PP_2_MEM_NODE(pp),
   1630 	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
   1631 	    PC_FREE);
   1632 
   1633 	ASSERT(PP_ISFREE(pp));
   1634 	ASSERT(PP_ISAGED(pp));
   1635 	ASSERT(pp->p_szc == 0);
   1636 }
   1637 
   1638 /*
   1639  * Take a particular page off of whatever freelist the page
   1640  * is claimed to be on.
   1641  *
   1642  * NOTE: Only used for PAGESIZE pages.
   1643  */
   1644 void
   1645 page_list_sub(page_t *pp, int flags)
   1646 {
   1647 	int		bin;
   1648 	uint_t		mtype;
   1649 	int		mnode;
   1650 	kmutex_t	*pcm;
   1651 	page_t		**ppp;
   1652 
   1653 	ASSERT(PAGE_EXCL(pp));
   1654 	ASSERT(PP_ISFREE(pp));
   1655 
   1656 	/*
   1657 	 * The p_szc field can only be changed by page_promote()
   1658 	 * and page_demote(). Only free pages can be promoted and
   1659 	 * demoted and the free list MUST be locked during these
   1660 	 * operations. So to prevent a race in page_list_sub()
   1661 	 * between computing which bin of the freelist lock to
   1662 	 * grab and actually grabing the lock we check again that
   1663 	 * the bin we locked is still the correct one. Notice that
   1664 	 * the p_szc field could have actually changed on us but
   1665 	 * if the bin happens to still be the same we are safe.
   1666 	 */
   1667 try_again:
   1668 	bin = PP_2_BIN(pp);
   1669 	mnode = PP_2_MEM_NODE(pp);
   1670 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
   1671 	mutex_enter(pcm);
   1672 	if (PP_2_BIN(pp) != bin) {
   1673 		mutex_exit(pcm);
   1674 		goto try_again;
   1675 	}
   1676 	mtype = PP_2_MTYPE(pp);
   1677 
   1678 	if (flags & PG_FREE_LIST) {
   1679 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
   1680 		ASSERT(PP_ISAGED(pp));
   1681 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
   1682 	} else {
   1683 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
   1684 		ASSERT(!PP_ISAGED(pp));
   1685 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
   1686 	}
   1687 
   1688 	/*
   1689 	 * Common PAGESIZE case.
   1690 	 *
   1691 	 * Note that we locked the freelist. This prevents
   1692 	 * any page promotion/demotion operations. Therefore
   1693 	 * the p_szc will not change until we drop pcm mutex.
   1694 	 */
   1695 	if (pp->p_szc == 0) {
   1696 		page_sub(ppp, pp);
   1697 		/*
   1698 		 * Subtract counters before releasing pcm mutex
   1699 		 * to avoid race with page_freelist_coalesce.
   1700 		 */
   1701 		page_ctr_sub(mnode, mtype, pp, flags);
   1702 		mutex_exit(pcm);
   1703 
   1704 #if defined(__sparc)
   1705 		if (PP_ISNORELOC(pp)) {
   1706 			kcage_freemem_sub(1);
   1707 		}
   1708 #endif
   1709 		return;
   1710 	}
   1711 
   1712 	/*
   1713 	 * Large pages on the cache list are not supported.
   1714 	 */
   1715 	if (flags & PG_CACHE_LIST)
   1716 		panic("page_list_sub: large page on cachelist");
   1717 
   1718 	/*
   1719 	 * Slow but rare.
   1720 	 *
   1721 	 * Somebody wants this particular page which is part
   1722 	 * of a large page. In this case we just demote the page
   1723 	 * if it's on the freelist.
   1724 	 *
   1725 	 * We have to drop pcm before locking the entire freelist.
   1726 	 * Once we have re-locked the freelist check to make sure
   1727 	 * the page hasn't already been demoted or completely
   1728 	 * freed.
   1729 	 */
   1730 	mutex_exit(pcm);
   1731 	page_freelist_lock(mnode);
   1732 	if (pp->p_szc != 0) {
   1733 		/*
   1734 		 * Large page is on freelist.
   1735 		 */
   1736 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
   1737 		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
   1738 	}
   1739 	ASSERT(PP_ISFREE(pp));
   1740 	ASSERT(PP_ISAGED(pp));
   1741 	ASSERT(pp->p_szc == 0);
   1742 
   1743 	/*
   1744 	 * Subtract counters before releasing pcm mutex
   1745 	 * to avoid race with page_freelist_coalesce.
   1746 	 */
   1747 	bin = PP_2_BIN(pp);
   1748 	mtype = PP_2_MTYPE(pp);
   1749 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
   1750 
   1751 	page_sub(ppp, pp);
   1752 	page_ctr_sub(mnode, mtype, pp, flags);
   1753 	page_freelist_unlock(mnode);
   1754 
   1755 #if defined(__sparc)
   1756 	if (PP_ISNORELOC(pp)) {
   1757 		kcage_freemem_sub(1);
   1758 	}
   1759 #endif
   1760 }
   1761 
   1762 void
   1763 page_list_sub_pages(page_t *pp, uint_t szc)
   1764 {
   1765 	kmutex_t *pcm;
   1766 	uint_t	bin, mtype;
   1767 	int	mnode;
   1768 
   1769 	ASSERT(PAGE_EXCL(pp));
   1770 	ASSERT(PP_ISFREE(pp));
   1771 	ASSERT(PP_ISAGED(pp));
   1772 
   1773 	/*
   1774 	 * See comment in page_list_sub().
   1775 	 */
   1776 try_again:
   1777 	bin = PP_2_BIN(pp);
   1778 	mnode = PP_2_MEM_NODE(pp);
   1779 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
   1780 	mutex_enter(pcm);
   1781 	if (PP_2_BIN(pp) != bin) {
   1782 		mutex_exit(pcm);
   1783 		goto	try_again;
   1784 	}
   1785 
   1786 	/*
   1787 	 * If we're called with a page larger than szc or it got
   1788 	 * promoted above szc before we locked the freelist then
   1789 	 * drop pcm and re-lock entire freelist. If page still larger
   1790 	 * than szc then demote it.
   1791 	 */
   1792 	if (pp->p_szc > szc) {
   1793 		mutex_exit(pcm);
   1794 		pcm = NULL;
   1795 		page_freelist_lock(mnode);
   1796 		if (pp->p_szc > szc) {
   1797 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
   1798 			(void) page_demote(mnode,
   1799 			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
   1800 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
   1801 		}
   1802 		bin = PP_2_BIN(pp);
   1803 	}
   1804 	ASSERT(PP_ISFREE(pp));
   1805 	ASSERT(PP_ISAGED(pp));
   1806 	ASSERT(pp->p_szc <= szc);
   1807 	ASSERT(pp == PP_PAGEROOT(pp));
   1808 
   1809 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
   1810 
   1811 	mtype = PP_2_MTYPE(pp);
   1812 	if (pp->p_szc != 0) {
   1813 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
   1814 		CHK_LPG(pp, pp->p_szc);
   1815 	} else {
   1816 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
   1817 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
   1818 	}
   1819 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
   1820 
   1821 	if (pcm != NULL) {
   1822 		mutex_exit(pcm);
   1823 	} else {
   1824 		page_freelist_unlock(mnode);
   1825 	}
   1826 
   1827 #if defined(__sparc)
   1828 	if (PP_ISNORELOC(pp)) {
   1829 		pgcnt_t	pgcnt;
   1830 
   1831 		pgcnt = page_get_pagecnt(pp->p_szc);
   1832 		kcage_freemem_sub(pgcnt);
   1833 	}
   1834 #endif
   1835 }
   1836 
   1837 /*
   1838  * Add the page to the front of a linked list of pages
   1839  * using the p_next & p_prev pointers for the list.
   1840  * The caller is responsible for protecting the list pointers.
   1841  */
   1842 void
   1843 mach_page_add(page_t **ppp, page_t *pp)
   1844 {
   1845 	if (*ppp == NULL) {
   1846 		pp->p_next = pp->p_prev = pp;
   1847 	} else {
   1848 		pp->p_next = *ppp;
   1849 		pp->p_prev = (*ppp)->p_prev;
   1850 		(*ppp)->p_prev = pp;
   1851 		pp->p_prev->p_next = pp;
   1852 	}
   1853 	*ppp = pp;
   1854 }
   1855 
   1856 /*
   1857  * Remove this page from a linked list of pages
   1858  * using the p_next & p_prev pointers for the list.
   1859  *
   1860  * The caller is responsible for protecting the list pointers.
   1861  */
   1862 void
   1863 mach_page_sub(page_t **ppp, page_t *pp)
   1864 {
   1865 	ASSERT(PP_ISFREE(pp));
   1866 
   1867 	if (*ppp == NULL || pp == NULL)
   1868 		panic("mach_page_sub");
   1869 
   1870 	if (*ppp == pp)
   1871 		*ppp = pp->p_next;		/* go to next page */
   1872 
   1873 	if (*ppp == pp)
   1874 		*ppp = NULL;			/* page list is gone */
   1875 	else {
   1876 		pp->p_prev->p_next = pp->p_next;
   1877 		pp->p_next->p_prev = pp->p_prev;
   1878 	}
   1879 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
   1880 }
   1881 
   1882 /*
   1883  * Routine fsflush uses to gradually coalesce the free list into larger pages.
   1884  */
   1885 void
   1886 page_promote_size(page_t *pp, uint_t cur_szc)
   1887 {
   1888 	pfn_t pfn;
   1889 	int mnode;
   1890 	int idx;
   1891 	int new_szc = cur_szc + 1;
   1892 	int full = FULL_REGION_CNT(new_szc);
   1893 
   1894 	pfn = page_pptonum(pp);
   1895 	mnode = PFN_2_MEM_NODE(pfn);
   1896 
   1897 	page_freelist_lock(mnode);
   1898 
   1899 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
   1900 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
   1901 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
   1902 
   1903 	page_freelist_unlock(mnode);
   1904 }
   1905 
/*
 * Failure counters for page_promote().
 * page_promote_err is bumped on every failed promotion;
 * page_promote_noreloc_err is additionally bumped when the failure was
 * caused by constituent pages disagreeing on PP_ISNORELOC().
 */
static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;

/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist.  If
 * we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * Arguments:
 *	mnode	- memory node the pages belong to (asserted for every page).
 *	pfnum	- pfn of the first constituent page; asserted to be aligned
 *		  to the number of PAGESIZE pages in a new_szc page.
 *	new_szc	- size code of the large page to create.
 *	flags	- PC_ALLOC or PC_FREE, see above.
 *	mtype	- PC_MTYPE_ANY, or the mtype the first page must have;
 *		  if it does not match, NULL is returned and nothing is
 *		  changed.
 *
 * NULL is also returned, with nothing changed, if the constituent pages do
 * not all give the same result for PP_ISNORELOC().
 *
 * RFE: For performance pass in pp instead of pfnum so
 * 	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		noreloc;
	int 		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and what
	 * ever other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/* don't return page of the wrong mtype */
	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
			return (NULL);

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 *
	 * The walk goes backwards from the last constituent page and stops
	 * before start_pp, whose value is the reference being compared to.
	 */
	noreloc = PP_ISNORELOC(start_pp);
	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
		if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
		 */
		ASSERT(PP_ISFREE(pp));
		bin = PP_2_BIN(pp);
		ASSERT(mnode == PP_2_MEM_NODE(pp));
		/* from here on mtype tracks the page, not the caller's filter */
		mtype = PP_2_MTYPE(pp);
		if (PP_ISAGED(pp)) {

			/*
			 * PG_FREE_LIST
			 */
			if (pp->p_szc) {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    pp->p_szc, bin, mtype), pp);
			} else {
				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
				    bin, mtype), pp);
			}
			which_list = PG_FREE_LIST;
		} else {
			ASSERT(pp->p_szc == 0);

			/*
			 * PG_CACHE_LIST
			 *
			 * Since this page comes from the
			 * cachelist, we must destroy the
			 * vnode association.
			 */
			if (!page_trylock(pp, SE_EXCL)) {
				goto fail_promote;
			}

			/*
			 * We need to be careful not to deadlock
			 * with another thread in page_lookup().
			 * The page_lookup() thread could be holding
			 * the same phm that we need if the two
			 * pages happen to hash to the same phm lock.
			 * At this point we have locked the entire
			 * freelist and page_lookup() could be trying
			 * to grab a freelist lock.
			 */
			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
			phm = PAGE_HASH_MUTEX(index);
			if (!mutex_tryenter(phm)) {
				page_unlock_nocapture(pp);
				goto fail_promote;
			}

			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
			page_hashout(pp, phm);
			mutex_exit(phm);
			PP_SETAGED(pp);
			page_unlock_nocapture(pp);
			which_list = PG_CACHE_LIST;
		}
		page_ctr_sub(mnode, mtype, pp, which_list);

		/*
		 * Concatenate the smaller page(s) onto
		 * the large page list.
		 *
		 * npgs is consumed by the relabeling loop below, so the
		 * page count is kept in tmpnpgs for advancing pp afterwards.
		 */
		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
		pages_left -= npgs;
		tpp = pp;
		while (npgs--) {
			tpp->p_szc = new_szc;
			tpp = tpp->p_next;
		}
		page_list_concat(&pplist, &pp);
		/* step over the page structs just consumed */
		pp += tmpnpgs;
	}
	CHK_LPG(pplist, new_szc);

	/*
	 * return the page to the user if requested
	 * in the properly locked state.
	 */
	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
		return (pplist);
	}

	/*
	 * Otherwise place the new large page on the freelist
	 */
	bin = PP_2_BIN(pplist);
	mnode = PP_2_MEM_NODE(pplist);
	mtype = PP_2_MTYPE(pplist);
	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);

	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
	return (NULL);

fail_promote:
	/*
	 * A thread must have still been freeing or
	 * reclaiming the page on the cachelist.
	 * To prevent a deadlock undo what we have
	 * done so far and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 *
	 * Every page collected so far is put back on the PAGESIZE
	 * freelist (not the cachelist) with p_szc reset to 0.
	 */
	page_promote_err++;
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
	}
	return (NULL);

}
   2132 
/*
 * Break up a large page into smaller size pages.
 * Pages involved are on the freelist before the call and may
 * be returned to the caller if requested, otherwise they will
 * be placed back on the freelist.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 * If flags is not PC_ALLOC, the color argument is ignored, and thus
 * technically, any value may be passed in but PC_NO_COLOR is the standard
 * which should be followed for clarity's sake.
 *
 * Arguments:
 *	mnode	- memory node of the large page (asserted).
 *	pfnum	- pfn used to look up the large page to demote.
 *	pfnmax	- if non-zero, a page is only returned when its pfn(s)
 *		  are < pfnmax; 0 means no limit.
 *	cur_szc	- current size code of the page; must be non-zero.
 *	new_szc	- size code to demote to; must be < cur_szc.
 *	color	- bin a returned page must have (PC_ALLOC only).
 *	flags	- PC_ALLOC to request one of the resulting pages back.
 *
 * Returns a page of size new_szc whose pfn is < pfnmax, locked SE_EXCL via
 * page_trylock_cons(), or NULL if none was requested or none qualified.
 * At most one page is returned; all other pieces go on the freelist.
 */
page_t *
page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
    uchar_t new_szc, int color, int flags)
{
	page_t	*pp, *pplist, *npplist;
	pgcnt_t	npgs, n;
	uint_t	bin;
	uint_t	mtype;
	page_t	*ret_pp = NULL;

	ASSERT(cur_szc != 0);
	ASSERT(new_szc < cur_szc);

	pplist = page_numtopp_nolock(pfnum);
	ASSERT(pplist != NULL);

	ASSERT(pplist->p_szc == cur_szc);

	/* take the whole large page off its freelist before carving it up */
	bin = PP_2_BIN(pplist);
	ASSERT(mnode == PP_2_MEM_NODE(pplist));
	mtype = PP_2_MTYPE(pplist);
	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);

	CHK_LPG(pplist, cur_szc);
	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);

	/*
	 * Number of PAGESIZE pages for smaller new_szc
	 * page.
	 */
	npgs = page_get_pagecnt(new_szc);

	while (pplist) {
		pp = pplist;

		ASSERT(pp->p_szc == cur_szc);

		/*
		 * We either break it up into PAGESIZE pages or larger.
		 */
		if (npgs == 1) {	/* PAGESIZE case */
			mach_page_sub(&pplist, pp);
			ASSERT(pp->p_szc == cur_szc);
			ASSERT(new_szc == 0);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			pp->p_szc = new_szc;
			bin = PP_2_BIN(pp);
			/*
			 * Hand this page back only if it is the first one
			 * that matches the requested color and pfn limit
			 * and it can be locked; otherwise free it.
			 */
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && (pfnmax == 0 ||
			    pp->p_pagenum < pfnmax) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
				    mtype), pp);
				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
			}
		} else {
			page_t *try_to_return_this_page = NULL;
			int count = 0;

			/*
			 * Break down into smaller lists of pages.
			 */
			page_list_break(&pplist, &npplist, npgs);

			pp = pplist;
			n = npgs;
			while (n--) {
				ASSERT(pp->p_szc == cur_szc);
				/*
				 * Check whether all the pages in this list
				 * fit the request criteria.
				 */
				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
					count++;
				}
				pp->p_szc = new_szc;
				pp = pp->p_next;
			}

			/*
			 * NOTE(review): pp has been advanced npgs links from
			 * pplist here; presumably page_list_break() leaves
			 * pplist as a ring of exactly npgs pages so pp is
			 * back at pplist -- confirm.
			 */
			if (count == npgs &&
			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
				try_to_return_this_page = pp;
			}

			CHK_LPG(pplist, new_szc);

			bin = PP_2_BIN(pplist);
			if (try_to_return_this_page)
				ASSERT(mnode ==
				    PP_2_MEM_NODE(try_to_return_this_page));
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && try_to_return_this_page &&
			    page_trylock_cons(try_to_return_this_page,
			    SE_EXCL)) {
				ret_pp = try_to_return_this_page;
			} else {
				mtype = PP_2_MTYPE(pp);
				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
				    bin, mtype), pplist);

				page_ctr_add(mnode, mtype, pplist,
				    PG_FREE_LIST);
			}
			/* continue with the remainder of the large page */
			pplist = npplist;
		}
	}
	return (ret_pp);
}
   2256 
/*
 * When non-zero, page_freelist_coalesce() returns NULL without searching
 * and page_freelist_coalesce_all() returns without doing any work.
 */
int mpss_coalesce_disable = 0;

/*
 * Coalesce free pages into a page of the given szc and color if possible.
 * Return the pointer to the page created, otherwise, return NULL.
 *
 * If pfnhi is not PFNNULL, search for large page with pfn range less than
 * pfnhi.
 *
 * Arguments:
 *	mnode	 - memory node to search.
 *	szc	 - size code of the page wanted (also the counter region).
 *	color	 - requested color; only the bits in ceq_mask are compared.
 *	ceq_mask - mask of color bits that must match.
 *	mtype	 - memory type; selects the pfn range and is passed on to
 *		   page_promote().
 *	pfnhi	 - exclusive upper pfn limit, or PFNNULL for none.
 *
 * The search walks the page counters of the region, starting from the
 * saved per-color position, and calls page_promote(PC_ALLOC) for each
 * region whose counter is full, until one succeeds or the candidate
 * count is exhausted.  page_ctrs_rwlock[mnode] is held as reader for the
 * whole search; the freelist lock is taken only around each promotion
 * attempt.
 */
page_t *
page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
    int mtype, pfn_t pfnhi)
{
	int 	r = szc;		/* region size */
	int	mrange;
	uint_t 	full, bin, color_mask, wrap = 0;
	pfn_t	pfnum, lo, hi;
	size_t	len, idx, idx0;
	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
	page_t	*ret_pp;
	MEM_NODE_ITERATOR_DECL(it);
#if defined(__sparc)
	pfn_t pfnum0, nlo, nhi;
#endif

	if (mpss_coalesce_disable) {
		ASSERT(szc < MMU_PAGE_SIZES);
		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
		return (NULL);
	}

	ASSERT(szc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
	ASSERT(ceq_mask <= color_mask);
	ASSERT(color <= color_mask);
	color &= ceq_mask;

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);

	mrange = MTYPE_2_MRANGE(mnode, mtype);
	ASSERT(mrange < mnode_nranges[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);

	/* get pfn range for mtype */
	len = PAGE_COUNTERS_ENTRIES(mnode, r);
	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
	/* hi is treated as an exclusive bound from here on */
	hi++;

	/* use lower limit if given */
	if (pfnhi != PFNNULL && pfnhi < hi)
		hi = pfnhi;

	/* round to szcpgcnt boundaries */
	lo = P2ROUNDUP(lo, szcpgcnt);
	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
	if (lo == (pfn_t)-1) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}
	hi = hi & ~(szcpgcnt - 1);

	/* set lo to the closest pfn of the right color */
	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
		    &it);
	}

	if (hi <= lo) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	full = FULL_REGION_CNT(r);

	/*
	 * calculate the number of page candidates and initial search index
	 *
	 * Every bin equivalent to color under ceq_mask is visited; cands
	 * accumulates their candidate counts and idx0 ends up as the lowest
	 * saved search position among bins that have candidates.
	 */
	bin = color;
	idx0 = (size_t)(-1);
	do {
		pgcnt_t acand;

		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
		if (acand) {
			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
			    r, bin, mrange);
			idx0 = MIN(idx0, idx);
			cands += acand;
		}
		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
	} while (bin != color);

	if (cands == 0) {
		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	/*
	 * Turn the saved index into a starting pfn, falling back to lo when
	 * it lies outside [lo, hi) or cannot be used by the iterator.
	 */
	pfnum = IDX_TO_PNUM(mnode, r, idx0);
	if (pfnum < lo || pfnum >= hi) {
		pfnum = lo;
	} else {
		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
		if (pfnum == (pfn_t)-1) {
			pfnum = lo;
			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			ASSERT(pfnum != (pfn_t)-1);
		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
			/* invalid color, get the closest correct pfn */
			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
			    color_mask, &it);
			if (pfnum >= hi) {
				pfnum = lo;
				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			}
		}
	}

	/* set starting index */
	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
	ASSERT(idx0 < len);

#if defined(__sparc)
	pfnum0 = pfnum;		/* page corresponding to idx0 */
	nhi = 0;		/* search kcage ranges */
#endif

	/*
	 * Scan from idx0 to the end of the range, wrap around once to lo,
	 * and stop when the scan gets back to idx0 (or wraps a second time).
	 * The loop has no increment expression: idx and pfnum are advanced
	 * at the "next" and "wrapit" labels at the bottom of the body.
	 */
	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {

#if defined(__sparc)
		/*
		 * Find lowest intersection of kcage ranges and mnode.
		 * MTYPE_NORELOC means look in the cage, otherwise outside.
		 */
		if (nhi <= pfnum) {
			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
				goto wrapit;

			/* jump to the next page in the range */
			if (pfnum < nlo) {
				pfnum = P2ROUNDUP(nlo, szcpgcnt);
				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
				idx = PNUM_TO_IDX(mnode, r, pfnum);
				if (idx >= len || pfnum >= hi)
					goto wrapit;
				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
				    ceq_mask)
					goto next;
				if (interleaved_mnodes &&
				    PFN_2_MEM_NODE(pfnum) != mnode)
					goto next;
			}
		}
#endif

		/* cheap unlocked check; repeated under the freelist lock */
		if (PAGE_COUNTERS(mnode, r, idx) != full)
			goto next;

		/*
		 * RFE: For performance maybe we can do something less
		 *	brutal than locking the entire freelist. So far
		 * 	this doesn't seem to be a performance problem?
		 */
		page_freelist_lock(mnode);
		if (PAGE_COUNTERS(mnode, r, idx) == full) {
			ret_pp =
			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
			if (ret_pp != NULL) {
				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
				/* remember where the next search should begin */
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
				page_freelist_unlock(mnode);
				rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
				if (PP_ISNORELOC(ret_pp)) {
					pgcnt_t npgs;

					npgs = page_get_pagecnt(ret_pp->p_szc);
					kcage_freemem_sub(npgs);
				}
#endif
				return (ret_pp);
			}
		} else {
			/* counter changed between the two checks */
			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
		}

		page_freelist_unlock(mnode);
		/*
		 * No point looking for another page if we've
		 * already tried all of the ones that
		 * page_ctr_cands indicated.  Stash off where we left
		 * off.
		 * Note: this is not exact since we don't hold the
		 * page_freelist_locks before we initially get the
		 * value of cands for performance reasons, but should
		 * be a decent approximation.
		 */
		if (--cands == 0) {
			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
			    idx;
			break;
		}
next:
		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
		    color_mask, &it);
		idx = PNUM_TO_IDX(mnode, r, pfnum);
		if (idx >= len || pfnum >= hi) {
wrapit:
			/* ran off the end of the range: restart at lo */
			pfnum = lo;
			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			idx = PNUM_TO_IDX(mnode, r, pfnum);
			wrap++;
#if defined(__sparc)
			nhi = 0;	/* search kcage ranges */
#endif
		}
	}

	rw_exit(&page_ctrs_rwlock[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
	return (NULL);
}
   2481 
   2482 /*
   2483  * For the given mnode, promote as many small pages to large pages as possible.
   2484  * mnode can be -1, which means do them all
   2485  */
   2486 void
   2487 page_freelist_coalesce_all(int mnode)
   2488 {
   2489 	int 	r;		/* region size */
   2490 	int 	idx, full;
   2491 	size_t	len;
   2492 	int doall = interleaved_mnodes || mnode < 0;
   2493 	int mlo = doall ? 0 : mnode;
   2494 	int mhi = doall ? max_mem_nodes : (mnode + 1);
   2495 
   2496 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
   2497 
   2498 	if (mpss_coalesce_disable) {
   2499 		return;
   2500 	}
   2501 
   2502 	/*
   2503 	 * Lock the entire freelist and coalesce what we can.
   2504 	 *
   2505 	 * Always promote to the largest page possible
   2506 	 * first to reduce the number of page promotions.
   2507 	 */
   2508 	for (mnode = mlo; mnode < mhi; mnode++) {
   2509 		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
   2510 		page_freelist_lock(mnode);
   2511 	}
   2512 	for (r = mmu_page_sizes - 1; r > 0; r--) {
   2513 		for (mnode = mlo; mnode < mhi; mnode++) {
   2514 			pgcnt_t cands = 0;
   2515 			int mrange, nranges = mnode_nranges[mnode];
   2516 
   2517 			for (mrange = 0; mrange < nranges; mrange++) {
   2518 				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
   2519 				if (cands != 0)
   2520 					break;
   2521 			}
   2522 			if (cands == 0) {
   2523 				VM_STAT_ADD(vmm_vmstats.
   2524 				    page_ctrs_cands_skip_all);
   2525 				continue;
   2526 			}
   2527 
   2528 			full = FULL_REGION_CNT(r);
   2529 			len  = PAGE_COUNTERS_ENTRIES(mnode, r);
   2530 
   2531 			for (idx = 0; idx < len; idx++) {
   2532 				if (PAGE_COUNTERS(mnode, r, idx) == full) {
   2533 					pfn_t pfnum =
   2534 					    IDX_TO_PNUM(mnode, r, idx);
   2535 					int tmnode = interleaved_mnodes ?
   2536 					    PFN_2_MEM_NODE(pfnum) : mnode;
   2537 
   2538 					ASSERT(pfnum >=
   2539 					    mem_node_config[tmnode].physbase &&
   2540 					    pfnum <
   2541 					    mem_node_config[tmnode].physmax);
   2542 
   2543 					(void) page_promote(tmnode,
   2544 					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
   2545 				}
   2546 			}
   2547 			/* shared hpm_counters covers all mnodes, so we quit */
   2548 			if (interleaved_mnodes)
   2549 				break;
   2550 		}
   2551 	}
   2552 	for (mnode = mlo; mnode < mhi; mnode++) {
   2553 		page_freelist_unlock(mnode);
   2554 		rw_exit(&page_ctrs_rwlock[mnode]);
   2555 	}
   2556 }
   2557 
/*
 * This is where all policies for moving pages around
 * to different page size free lists are implemented.
 * Returns a page of size szc obtained by demoting a larger free page,
 * or NULL on failure.
 *
 * So far these are the priorities for this algorithm in descending
 * order:
 *
 *	1) When servicing a request try to do so with a free page
 *	   from next size up. Helps defer fragmentation as long
 *	   as possible.
 *
 *	2) Page coalesce on demand. Only when a freelist
 *	   larger than PAGESIZE is empty and step 1
 *	   will not work since all larger size lists are
 *	   also empty.
 *
 * If pfnhi is not PFNNULL, search for large page with pfn less than pfnhi.
 * If pfnlo is not PFNNULL, search for large page with pfn not below pfnlo.
 *
 * Arguments:
 *	szc	- size code of the page wanted.
 *	color	- requested color at size szc.
 *	mnode	- memory node whose freelists are searched.
 *	mtype	- memory type of the freelists searched.
 *	pfnlo, pfnhi - pfn limits as described above.
 *	plw	- page list walker state; plw_bins[] is decremented here as
 *		  bins of the larger sizes are consumed.
 */

page_t *
page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
    pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
{
	uchar_t nszc = szc + 1;
	uint_t 	bin, sbin, bin_prev;
	page_t	*pp, *firstpp;
	page_t	*ret_pp = NULL;
	uint_t  color_mask;

	/* already at the largest page size: nothing bigger to split */
	if (nszc == mmu_page_sizes)
		return (NULL);

	ASSERT(nszc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);

	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
	/*
	 * First try to break up a larger page to fill current size freelist.
	 */
	while (plw->plw_bins[nszc] != 0) {

		ASSERT(nszc < mmu_page_sizes);

		/*
		 * If page found then demote it.
		 *
		 * The list head is first tested without the freelist lock
		 * and then re-read once the lock is held.
		 */
		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
			page_freelist_lock(mnode);
			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);

			/*
			 * If pfnhi is not PFNNULL, look for large page below
			 * pfnhi. PFNNULL signifies no pfn requirement.
			 * The same applies to pfnlo as a lower bound.
			 */
			if (pp &&
			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
				/*
				 * Walk the p_vpnext ring until a page within
				 * the limits is found or we are back at the
				 * first page (pp becomes NULL).
				 */
				do {
					pp = pp->p_vpnext;
					if (pp == firstpp) {
						pp = NULL;
						break;
					}
				} while ((pfnhi != PFNNULL &&
				    pp->p_pagenum >= pfnhi) ||
				    (pfnlo != PFNNULL &&
				    pp->p_pagenum < pfnlo));

				if (pfnhi != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum < pfnhi);

				if (pfnlo != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum >= pfnlo);
			}
			if (pp) {
				uint_t ccolor = page_correct_color(szc, nszc,
				    color, bin, plw->plw_ceq_mask[szc]);

				ASSERT(pp->p_szc == nszc);
				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
				ret_pp = page_demote(mnode, pp->p_pagenum,
				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
				if (ret_pp) {
					page_freelist_unlock(mnode);
#if defined(__sparc)
					if (PP_ISNORELOC(ret_pp)) {
						pgcnt_t npgs;

						npgs = page_get_pagecnt(
						    ret_pp->p_szc);
						kcage_freemem_sub(npgs);
					}
#endif
					return (ret_pp);
				}
			}
			page_freelist_unlock(mnode);
		}

		/* loop through next size bins */
		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
		plw->plw_bins[nszc]--;

		if (bin == sbin) {
			uchar_t nnszc = nszc + 1;

			/*
			 * we are done with this page size - check next
			 *
			 * page_list_walk_init() stores 0 in plw_bins[] one
			 * past the last size it set up, which ends the
			 * search here.
			 */
			if (plw->plw_bins[nnszc] == 0)
				/* we have already checked next size bins */
				break;

			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
			if (bin_prev != INVALID_COLOR) {
				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
				if (!((bin ^ bin_prev) &
				    plw->plw_ceq_mask[nnszc]))
					break;
			}
			ASSERT(nnszc < mmu_page_sizes);
			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
			nszc = nnszc;
			ASSERT(nszc < mmu_page_sizes);
		}
	}

	return (ret_pp);
}
   2689 
   2690 /*
   2691  * Helper routine used only by the freelist code to lock
   2692  * a page. If the page is a large page then it succeeds in
   2693  * locking all the constituent pages or none at all.
   2694  * Returns 1 on sucess, 0 on failure.
   2695  */
   2696 static int
   2697 page_trylock_cons(page_t *pp, se_t se)
   2698 {
   2699 	page_t	*tpp, *first_pp = pp;
   2700 
   2701 	/*
   2702 	 * Fail if can't lock first or only page.
   2703 	 */
   2704 	if (!page_trylock(pp, se)) {
   2705 		return (0);
   2706 	}
   2707 
   2708 	/*
   2709 	 * PAGESIZE: common case.
   2710 	 */
   2711 	if (pp->p_szc == 0) {
   2712 		return (1);
   2713 	}
   2714 
   2715 	/*
   2716 	 * Large page case.
   2717 	 */
   2718 	tpp = pp->p_next;
   2719 	while (tpp != pp) {
   2720 		if (!page_trylock(tpp, se)) {
   2721 			/*
   2722 			 * On failure unlock what we have locked so far.
   2723 			 * We want to avoid attempting to capture these
   2724 			 * pages as the pcm mutex may be held which could
   2725 			 * lead to a recursive mutex panic.
   2726 			 */
   2727 			while (first_pp != tpp) {
   2728 				page_unlock_nocapture(first_pp);
   2729 				first_pp = first_pp->p_next;
   2730 			}
   2731 			return (0);
   2732 		}
   2733 		tpp = tpp->p_next;
   2734 	}
   2735 	return (1);
   2736 }
   2737 
/*
 * init context for walking page lists
 * Called when a page of the given szc is unavailable. Sets markers
 * for the beginning of the search to detect when search has
 * completed a full cycle. Sets flags for splitting larger pages
 * and coalescing smaller pages. Page walking proceeds until a page
 * of the desired equivalent color is found.
 *
 * Arguments:
 *	szc	  - size code of the page being searched for.
 *	flags	  - only PG_MATCH_COLOR is examined here.
 *	bin	  - starting bin; recorded as plw_bin0, plw_bin_marker and
 *		    plw_bin_split_prev.
 *	can_split - non-zero if larger pages may be split (freelist);
 *		    when zero, szc must be 0.
 *	use_ceq	  - non-zero to apply colorequivszc[szc] when computing the
 *		    color equivalency mask.
 *	plw	  - walker state to initialize.
 */
void
page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
    int use_ceq, page_list_walker_t *plw)
{
	uint_t  nszc, ceq_mask, colors;
	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;

	ASSERT(szc < mmu_page_sizes);
	colors = PAGE_GET_PAGECOLORS(szc);

	plw->plw_colors = colors;
	plw->plw_color_mask = colors - 1;
	plw->plw_bin_marker = plw->plw_bin0 = bin;
	plw->plw_bin_split_prev = bin;
	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;

	/*
	 * if vac aliasing is possible make sure lower order color
	 * bits are never ignored
	 * (this clears the low nibble of ceq, which is used below as the
	 * number of low-order color bits to skip)
	 */
	if (vac_colors > 1)
		ceq &= 0xf0;

	/*
	 * calculate the number of non-equivalent colors and
	 * color equivalency mask
	 *
	 * The two nibbles of ceq are shift counts: their sum reduces the
	 * number of distinct colors, and the low nibble positions the mask.
	 */
	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
	ASSERT(plw->plw_ceq_dif > 0);
	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);

	if (flags & PG_MATCH_COLOR) {
		if (cpu_page_colors <  0) {
			/*
			 * this is a heterogeneous machine with different CPUs
			 * having different size e$ (not supported for
			 * ni2/rock)
			 */
			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
			cpucolors = MAX(cpucolors, 1);
			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
			plw->plw_ceq_mask[szc] =
			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
		}
		plw->plw_ceq_dif = 1;
	}

	/* we can split pages in the freelist, but not the cachelist */
	if (can_split) {
		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;

		/*
		 * set next szc color masks and number of free list bins
		 *
		 * Note that szc is advanced together with nszc, so each
		 * mask is derived from the mask of the size just below it.
		 */
		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
			    plw->plw_ceq_mask[szc]);
			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
		}
		/* terminate the arrays one past the largest page size */
		plw->plw_ceq_mask[nszc] = INVALID_MASK;
		plw->plw_bins[nszc] = 0;

	} else {
		ASSERT(szc == 0);
		plw->plw_do_split = 0;
		plw->plw_bins[1] = 0;
		plw->plw_ceq_mask[1] = INVALID_MASK;
	}
}
   2813 
   2814 /*
   2815  * set mark to flag where next split should occur
         *
         * Stores in plw->plw_split_next the next-size (nszc) color at which the
         * walker should next attempt a split:
         *   - 'bin' and plw->plw_bin0 are both translated to nszc colors with
         *     PAGE_GET_NSZ_COLOR;
         *   - the translated 'bin' is advanced with INC_MASKED over the color
         *     bits that are NOT part of plw_ceq_mask[nszc];
         *   - if the result equals bin0's nszc color in the plw_ceq_mask[nszc]
         *     bits, it is advanced once more.
         *
         * Expands to a braced block that declares locals, so it can only be used
         * where a statement is allowed. 'plw' is expanded several times and is
         * not parenthesized; pass a plain pointer expression.
   2816  */
   2817 #define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
   2818 	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
   2819 	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	     \
   2820 	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
   2821 	plw->plw_split_next =						     \
   2822 		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	     \
   2823 	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
   2824 		plw->plw_split_next =					     \
   2825 		INC_MASKED(plw->plw_split_next,				     \
   2826 		    neq_mask, plw->plw_color_mask);			     \
   2827 	}								     \
   2828 }
   2829 
   2830 uint_t
   2831 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
   2832 {
        	/*
        	 * Return the bin to examine after 'bin' for page size 'szc' and
        	 * update the walker state in 'plw' (plw_bin_split_prev,
        	 * plw_split_next, plw_bin_marker and plw_do_split) on the way.
        	 *
        	 * neq_mask selects the color bits that are not part of the
        	 * equivalency mask for this size; ADD_MASKED/INC_MASKED are given
        	 * that mask together with plw_color_mask (presumably so that only
        	 * those bits change -- confirm against the macro definitions).
        	 */
   2833 	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
   2834 	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
   2835 	uchar_t nszc = szc + 1;
   2836 
        	/* default candidate: one plw_bin_step further on */
   2837 	nbin = ADD_MASKED(bin,
   2838 	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);
   2839 
        	/*
        	 * A split was pending for 'bin': remember it, compute where the
        	 * next split should occur and clear the request.
        	 */
   2840 	if (plw->plw_do_split) {
   2841 		plw->plw_bin_split_prev = bin;
   2842 		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
   2843 		plw->plw_do_split = 0;
   2844 	}
   2845 
   2846 	if (szc == 0) {
   2847 		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
        			/*
        			 * The candidate landed back on the starting bin
        			 * (and, with vac colors, is not the marker): step
        			 * past it and restart split tracking from bin0.
        			 */
   2848 			if (nbin == plw->plw_bin0 &&
   2849 			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
   2850 				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
   2851 				    neq_mask, plw->plw_color_mask);
   2852 				plw->plw_bin_split_prev = plw->plw_bin0;
   2853 			}
   2854 
        			/*
        			 * With vac colors, reaching the marker moves both
        			 * the marker and the candidate on by INC_MASKED and
        			 * returns without requesting a split.
        			 */
   2855 			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
   2856 				plw->plw_bin_marker =
   2857 				    nbin = INC_MASKED(nbin, neq_mask,
   2858 				    plw->plw_color_mask);
   2859 				plw->plw_bin_split_prev = plw->plw_bin0;
   2860 				/*
   2861 				 * large pages all have the same vac color
   2862 				 * so by now we should be done with next
   2863 				 * size page splitting process
   2864 				 */
   2865 				ASSERT(plw->plw_bins[1] == 0);
   2866 				plw->plw_do_split = 0;
   2867 				return (nbin);
   2868 			}
   2869 
   2870 		} else {
        			/*
        			 * Reached only when plw_count == 0 and plw_ceq_dif
        			 * differs from vac_colors: try a jump of about
        			 * BIN_STEP from bin0, kept a multiple of vac_colors.
        			 */
   2871 			uint_t bin_jump = (vac_colors == 1) ?
   2872 			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
   2873 
   2874 			bin_jump &= ~(vac_colors - 1);
   2875 
   2876 			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
   2877 			    plw->plw_color_mask);
   2878 
        			/* use the jump only if it leaves bin0's color class */
   2879 			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
   2880 
   2881 				plw->plw_bin_marker = nbin = nbin0;
   2882 
   2883 				if (plw->plw_bins[nszc] != 0) {
   2884 					/*
   2885 					 * check if next page size bin is the
   2886 					 * same as the next page size bin for
   2887 					 * bin0
   2888 					 */
   2889 					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
   2890 					    nbin);
   2891 					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
   2892 					    plw->plw_bin0);
   2893 
        					/* it differs: request a split there */
   2894 					if ((bin0_nsz ^ nbin_nsz) &
   2895 					    plw->plw_ceq_mask[nszc])
   2896 						plw->plw_do_split = 1;
   2897 				}
   2898 				return (nbin);
   2899 			}
   2900 		}
   2901 	}
   2902 
        	/*
        	 * If the next size has bins and the candidate's next-size color
        	 * matches plw_split_next in the plw_ceq_mask[nszc] bits, ask the
        	 * caller to try a split at this bin.
        	 */
   2903 	if (plw->plw_bins[nszc] != 0) {
   2904 		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
   2905 		if (!((plw->plw_split_next ^ nbin_nsz) &
   2906 		    plw->plw_ceq_mask[nszc]))
   2907 			plw->plw_do_split = 1;
   2908 	}
   2909 
   2910 	return (nbin);
   2911 }
   2912 
   2913 page_t *
   2914 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
   2915     uint_t flags)
   2916 {
        	/*
        	 * Take a free page of size 'szc' from memory node 'mnode',
        	 * starting at color 'bin' in memory type 'mtype'.
        	 *
        	 * On success the page has been locked with
        	 * page_trylock_cons(SE_EXCL) and removed from its freelist. When
        	 * the equivalent-color bins are all empty the routine falls back,
        	 * in order, to page_freelist_split(), page_freelist_coalesce()
        	 * (szc > 0 only), other colors via page_list_walk_next_bin(), and
        	 * finally further mtypes via MTYPE_NEXT().
        	 *
        	 * Returns the page, or NULL when nothing could be found.
        	 */
   2917 	kmutex_t		*pcm;
   2918 	page_t			*pp, *first_pp;
   2919 	uint_t			sbin;
   2920 	int			plw_initialized;
   2921 	page_list_walker_t	plw;
   2922 
   2923 	ASSERT(szc < mmu_page_sizes);
   2924 
   2925 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
   2926 
   2927 	MTYPE_START(mnode, mtype, flags);
   2928 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
   2929 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
   2930 		return (NULL);
   2931 	}
   2932 try_again:
   2933 
        	/*
        	 * The walker is initialized lazily, on the first empty bin.
        	 * plw_ceq_dif is preset to 1 so the outer loop body runs at
        	 * least once before that happens.
        	 */
   2934 	plw_initialized = 0;
   2935 	plw.plw_ceq_dif = 1;
   2936 
   2937 	/*
   2938 	 * Only hold one freelist lock at a time, that way we
   2939 	 * can start anywhere and not have to worry about lock
   2940 	 * ordering.
   2941 	 */
   2942 	for (plw.plw_count = 0;
   2943 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
        		/* sbin marks where this pass over equivalent bins began */
   2944 		sbin = bin;
   2945 		do {
        			/* unlocked peek: skip an empty bin without locking */
   2946 			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
   2947 				goto bin_empty_1;
   2948 
   2949 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
   2950 			mutex_enter(pcm);
        			/* re-read the list head now that pcm is held */
   2951 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
   2952 			if (pp == NULL)
   2953 				goto bin_empty_0;
   2954 
   2955 			/*
   2956 			 * These were set before the page
   2957 			 * was put on the free list,
   2958 			 * they must still be set.
   2959 			 */
   2960 			ASSERT(PP_ISFREE(pp));
   2961 			ASSERT(PP_ISAGED(pp));
   2962 			ASSERT(pp->p_vnode == NULL);
   2963 			ASSERT(pp->p_hash == NULL);
   2964 			ASSERT(pp->p_offset == (u_offset_t)-1);
   2965 			ASSERT(pp->p_szc == szc);
   2966 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
   2967 
   2968 			/*
   2969 			 * Walk down the hash chain.
   2970 			 * 8k pages are linked on p_next
   2971 			 * and p_prev fields. Large pages
   2972 			 * are a contiguous group of
   2973 			 * constituent pages linked together
   2974 			 * on their p_next and p_prev fields.
   2975 			 * The large pages are linked together
   2976 			 * on the hash chain using p_vpnext
   2977 			 * p_vpprev of the base constituent
   2978 			 * page of each large page.
   2979 			 */
   2980 			first_pp = pp;
        			/*
        			 * Skip pages that cannot be locked or that
        			 * IS_DUMP_PAGE() flags; arriving back at first_pp
        			 * means this bin has no usable page.
        			 */
   2981 			while (!page_trylock_cons(pp, SE_EXCL) ||
   2982 			    IS_DUMP_PAGE(pp)) {
   2983 				if (szc == 0) {
   2984 					pp = pp->p_next;
   2985 				} else {
   2986 					pp = pp->p_vpnext;
   2987 				}
   2988 
   2989 				ASSERT(PP_ISFREE(pp));
   2990 				ASSERT(PP_ISAGED(pp));
   2991 				ASSERT(pp->p_vnode == NULL);
   2992 				ASSERT(pp->p_hash == NULL);
   2993 				ASSERT(pp->p_offset == (u_offset_t)-1);
   2994 				ASSERT(pp->p_szc == szc);
   2995 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
   2996 
   2997 				if (pp == first_pp)
   2998 					goto bin_empty_0;
   2999 			}
   3000 
   3001 			ASSERT(pp != NULL);
   3002 			ASSERT(mtype == PP_2_MTYPE(pp));
   3003 			ASSERT(pp->p_szc == szc);
        			/* unlink: base pages use p_next, large pages p_vpnext */
   3004 			if (szc == 0) {
   3005 				page_sub(&PAGE_FREELISTS(mnode,
   3006 				    szc, bin, mtype), pp);
   3007 			} else {
   3008 				page_vpsub(&PAGE_FREELISTS(mnode,
   3009 				    szc, bin, mtype), pp);
   3010 				CHK_LPG(pp, szc);
   3011 			}
   3012 			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
   3013 
   3014 			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
   3015 				panic("free page is not. pp %p", (void *)pp);
   3016 			mutex_exit(pcm);
   3017 
   3018 #if defined(__sparc)
   3019 			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
   3020 			    (flags & PG_NORELOC) == 0);
   3021 
   3022 			if (PP_ISNORELOC(pp))
   3023 				kcage_freemem_sub(page_get_pagecnt(szc));
   3024 #endif
   3025 			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
   3026 			return (pp);
   3027 
        			/*
        			 * bin_empty_0 is entered with pcm held and drops
        			 * it; bin_empty_1 is entered without the lock.
        			 */
   3028 bin_empty_0:
   3029 			mutex_exit(pcm);
   3030 bin_empty_1:
   3031 			if (plw_initialized == 0) {
        				/* can_split = 1 and use_ceq = 1 */
   3032 				page_list_walk_init(szc, flags, bin, 1, 1,
   3033 				    &plw);
   3034 				plw_initialized = 1;
   3035 				ASSERT(plw.plw_colors <=
   3036 				    PAGE_GET_PAGECOLORS(szc));
   3037 				ASSERT(plw.plw_colors > 0);
   3038 				ASSERT((plw.plw_colors &
   3039 				    (plw.plw_colors - 1)) == 0);
   3040 				ASSERT(bin < plw.plw_colors);
   3041 				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
   3042 			}
   3043 			/* calculate the next bin with equivalent color */
   3044 			bin = ADD_MASKED(bin, plw.plw_bin_step,
   3045 			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
   3046 		} while (sbin != bin);
   3047 
   3048 		/*
   3049 		 * color bins are all empty if color match. Try and
   3050 		 * satisfy the request by breaking up or coalescing
   3051 		 * pages from a different size freelist of the correct
   3052 		 * color that satisfies the ORIGINAL color requested.
   3053 		 * If that fails then try pages of the same size but
   3054 		 * different colors assuming we are not called with
   3055 		 * PG_MATCH_COLOR.
   3056 		 */
   3057 		if (plw.plw_do_split &&
   3058 		    (pp = page_freelist_split(szc, bin, mnode,
   3059 		    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
   3060 			return (pp);
   3061 
   3062 		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
   3063 		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
   3064 			return (pp);
   3065 
        		/* more than one color class to visit: move to the next */
   3066 		if (plw.plw_ceq_dif > 1)
   3067 			bin = page_list_walk_next_bin(szc, bin, &plw);
   3068 	}
   3069 
   3070 	/* if allowed, cycle through additional mtypes */
   3071 	MTYPE_NEXT(mnode, mtype, flags);
   3072 	if (mtype >= 0)
   3073 		goto try_again;
   3074 
   3075 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
   3076 
   3077 	return (NULL);
   3078 }
   3079 
   3080 /*
   3081  * Returns the count of free pages for 'pp' with size code 'szc'.
   3082  * Note: This function does not return an exact value as the page freelist
   3083  * locks are not held and thus the values in the page_counters may be
   3084  * changing as we walk through the data.
   3085  */
   3086 static int
   3087 page_freecnt(int mnode, page_t *pp, uchar_t szc)
   3088 {
   3089 	pgcnt_t	pgfree;
   3090 	pgcnt_t cnt;
   3091 	ssize_t	r = szc;	/* region size */
   3092 	ssize_t	idx;
   3093 	int	i;
   3094 	int	full, range;
   3095 
   3096 	/* Make sure pagenum passed in is aligned properly */
   3097 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
   3098 	ASSERT(szc > 0);
   3099 
   3100 	/* Prevent page_counters dynamic memory from being freed */
   3101 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
   3102 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
   3103 	cnt = PAGE_COUNTERS(mnode, r, idx);
   3104 	pgfree = cnt << PNUM_SHIFT(r - 1);
   3105 	range = FULL_REGION_CNT(szc);
   3106 
   3107 	/* Check for completely full region */
   3108 	if (cnt == range) {
   3109 		rw_exit(&page_ctrs_rwlock[mnode]);
   3110 		return (pgfree);
   3111 	}
   3112 
   3113 	while (--r > 0) {
   3114 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
   3115 		full = FULL_REGION_CNT(r);
   3116 		for (i = 0; i < range; i++, idx++) {
   3117 			cnt = PAGE_COUNTERS(mnode, r, idx);
   3118 			/*
   3119 			 * If cnt here is full, that means we have already
   3120 			 * accounted for these pages earlier.
   3121 			 */
   3122 			if (cnt != full) {
   3123 				pgfree += (cnt << PNUM_SHIFT(r - 1));
   3124 			}
   3125 		}
   3126 		range *= full;
   3127 	}
   3128 	rw_exit(&page_ctrs_rwlock[mnode]);
   3129 	return (pgfree);
   3130 }
   3131 
   3132 /*
   3133  * Called from page_geti_contig_pages to exclusively lock constituent pages
   3134  * starting from 'spp' for page size code 'szc'.
   3135  *
   3136  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
   3137  * region needs to be greater than or equal to the threshold.
   3138  */
   3139 static int
   3140 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
   3141 {
   3142 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
   3143 	pgcnt_t pgfree, i;
   3144 	page_t *pp;
   3145 
   3146 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
   3147 
   3148 
   3149 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
   3150 		goto skipptcpcheck;
   3151 	/*
   3152 	 * check if there are sufficient free pages available before attempting
   3153 	 * to trylock. Count is approximate as page counters can change.
   3154 	 */
   3155 	pgfree = page_freecnt(mnode, spp, szc);
   3156 
   3157 	/* attempt to trylock if there are sufficient already free pages */
   3158 	if (pgfree < pgcnt/ptcpthreshold) {
   3159 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
   3160 		return (0);
   3161 	}
   3162 
   3163 skipptcpcheck:
   3164 
   3165 	for (i = 0; i < pgcnt; i++) {
   3166 		pp = &spp[i];
   3167 		if (!page_trylock(pp, SE_EXCL)) {
   3168 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
   3169 			while (--i != (pgcnt_t)-1) {
   3170 				pp = &spp[i];
   3171 				ASSERT(PAGE_EXCL(pp));
   3172 				page_unlock_nocapture(pp);
   3173 			}
   3174 			return (0);
   3175 		}
   3176 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
   3177 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
   3178 		    !PP_ISFREE(pp)) {
   3179 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
   3180 			ASSERT(i == 0);
   3181 			page_unlock_nocapture(pp);
   3182 			return (0);
   3183 		}
   3184 		if (PP_ISNORELOC(pp)) {
   3185 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
   3186 			while (i != (pgcnt_t)-1) {
   3187 				pp = &spp[i];
   3188 				ASSERT(PAGE_EXCL(pp));
   3189 				page_unlock_nocapture(pp);
   3190 				i--;
   3191 			}
   3192 			return (0);
   3193 		}
   3194 	}
   3195 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
   3196 	return (1);
   3197 }
   3198 
   3199 /*
   3200  * Claim large page pointed to by 'pp'. 'pp' is the starting set
   3201  * of 'szc' constituent pages that had been locked exclusively previously.
   3202  * Will attempt to relocate constituent pages in use.
         *
         * Each constituent page is handled according to its state:
         *   - free and aged (PG_FREE_LIST): taken off the free list with
         *     page_list_sub_pages(); if it already has size 'szc' it is
         *     returned at once, otherwise its pieces are retagged to 'szc';
         *   - free but not aged (PG_CACHE_LIST): taken off the cache list,
         *     hashed out, marked aged and retagged;
         *   - in use: relocated to a replacement page, then marked free and
         *     retagged.
         *
         * Returns the head of the assembled page list on success. On failure
         * returns NULL after unlocking the unprocessed pages and putting the
         * processed ones back on the free list with p_szc reset to 0.
   3203  */
   3204 static page_t *
   3205 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
   3206 {
   3207 	spgcnt_t pgcnt, npgs, i;
   3208 	page_t *targpp, *rpp, *hpp;
   3209 	page_t *replpp = NULL;
   3210 	page_t *pplist = NULL;
   3211 
   3212 	ASSERT(pp != NULL);
   3213 
        	/* pgcnt counts the constituent pages still to be processed */
   3214 	pgcnt = page_get_pagecnt(szc);
   3215 	while (pgcnt) {
   3216 		ASSERT(PAGE_EXCL(pp));
   3217 		ASSERT(!PP_ISNORELOC(pp));
   3218 		if (PP_ISFREE(pp)) {
   3219 			/*
   3220 			 * If this is a PG_FREE_LIST page then its
   3221 			 * size code can change underneath us due to
   3222 			 * page promotion or demotion. As an optimization
   3223 			 * use page_list_sub_pages() instead of
   3224 			 * page_list_sub().
   3225 			 */
   3226 			if (PP_ISAGED(pp)) {
   3227 				page_list_sub_pages(pp, szc);
        				/* already a page of the wanted size: done */
   3228 				if (pp->p_szc == szc) {
   3229 					return (pp);
   3230 				}
   3231 				ASSERT(pp->p_szc < szc);
   3232 				npgs = page_get_pagecnt(pp->p_szc);
   3233 				hpp = pp;
        				/* retag every piece; pp ends past the group */
   3234 				for (i = 0; i < npgs; i++, pp++) {
   3235 					pp->p_szc = szc;
   3236 				}
   3237 				page_list_concat(&pplist, &hpp);
   3238 				pgcnt -= npgs;
   3239 				continue;
   3240 			}
        			/* free but not aged: a base page on the cache list */
   3241 			ASSERT(!PP_ISAGED(pp));
   3242 			ASSERT(pp->p_szc == 0);
   3243 			page_list_sub(pp, PG_CACHE_LIST);
   3244 			page_hashout(pp, NULL);
   3245 			PP_SETAGED(pp);
   3246 			pp->p_szc = szc;
   3247 			page_list_concat(&pplist, &pp);
   3248 			pp++;
   3249 			pgcnt--;
   3250 			continue;
   3251 		}
        		/* the page is in use: it has to be relocated */
   3252 		npgs = page_get_pagecnt(pp->p_szc);
   3253 
   3254 		/*
   3255 		 * page_create_wait freemem accounting done by caller of
   3256 		 * page_get_freelist and not necessary to call it prior to
   3257 		 * calling page_get_replacement_page.
   3258 		 *
   3259 		 * page_get_replacement_page can call page_get_contig_pages
   3260 		 * to acquire a large page (szc > 0); the replacement must be
   3261 		 * smaller than the contig page size to avoid looping or
   3262 		 * szc == 0 and PGI_PGCPSZC0 is set.
   3263 		 */
   3264 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
   3265 			replpp = page_get_replacement_page(pp, NULL, 0);
   3266 			if (replpp) {
   3267 				npgs = page_get_pagecnt(pp->p_szc);
   3268 				ASSERT(npgs <= pgcnt);
   3269 				targpp = pp;
   3270 			}
   3271 		}
   3272 
   3273 		/*
   3274 		 * If replacement is NULL or do_page_relocate fails, fail
   3275 		 * coalescing of pages.
        		 *
        		 * targpp is assigned only when replpp is non-NULL; the
        		 * short-circuit below keeps do_page_relocate() from being
        		 * called with it unset.
   3276 		 */
   3277 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
   3278 		    &npgs, NULL) != 0)) {
   3279 			/*
   3280 			 * Unlock un-processed target list
   3281 			 */
   3282 			while (pgcnt--) {
   3283 				ASSERT(PAGE_EXCL(pp));
   3284 				page_unlock_nocapture(pp);
   3285 				pp++;
   3286 			}
   3287 			/*
   3288 			 * Free the processed target list.
   3289 			 */
   3290 			while (pplist) {
   3291 				pp = pplist;
   3292 				page_sub(&pplist, pp);
   3293 				ASSERT(PAGE_EXCL(pp));
   3294 				ASSERT(pp->p_szc == szc);
   3295 				ASSERT(PP_ISFREE(pp));
   3296 				ASSERT(PP_ISAGED(pp));
        				/* back to a base page at the free list tail */
   3297 				pp->p_szc = 0;
   3298 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
   3299 				page_unlock_nocapture(pp);
   3300 			}
   3301 
   3302 			if (replpp != NULL)
   3303 				page_free_replacement_page(replpp);
   3304 
   3305 			return (NULL);
   3306 		}
   3307 		ASSERT(pp == targpp);
   3308 
        		/*
        		 * NOTE(review): hpp is assigned inside ASSERT and read only
        		 * by the ASSERT after the loop below, so it presumably
        		 * exists only where ASSERT is compiled in -- confirm.
        		 */
   3309 		/* LINTED */
   3310 		ASSERT(hpp = pp); /* That's right, it's an assignment */
   3311 
   3312 		pp += npgs;
   3313 		pgcnt -= npgs;
   3314 
        		/*
        		 * Mark each relocated target page free and retag it, and
        		 * release the lock on the replacement page that now stands
        		 * in for it.
        		 */
   3315 		while (npgs--) {
   3316 			ASSERT(PAGE_EXCL(targpp));
   3317 			ASSERT(!PP_ISFREE(targpp));
   3318 			ASSERT(!PP_ISNORELOC(targpp));
   3319 			PP_SETFREE(targpp);
   3320 			ASSERT(PP_ISAGED(targpp));
   3321 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
   3322 			    (flags & PGI_PGCPSZC0)));
   3323 			targpp->p_szc = szc;
   3324 			targpp = targpp->p_next;
   3325 
   3326 			rpp = replpp;
   3327 			ASSERT(rpp != NULL);
   3328 			page_sub(&replpp, rpp);
   3329 			ASSERT(PAGE_EXCL(rpp));
   3330 			ASSERT(!PP_ISFREE(rpp));
   3331 			page_unlock_nocapture(rpp);
   3332 		}
   3333 		ASSERT(targpp == hpp);
   3334 		ASSERT(replpp == NULL);
   3335 		page_list_concat(&pplist, &targpp);
   3336 	}
   3337 	CHK_LPG(pplist, szc);
   3338 	return (pplist);
   3339 }
   3340 
   3341 /*
   3342  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
   3343  * of 0 means nothing left after trim.
   3344  */
   3345 int
   3346 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
   3347 {
   3348 	pfn_t	kcagepfn;
   3349 	int	decr;
   3350 	int	rc = 0;
   3351 
   3352 	if (PP_ISNORELOC(mseg->pages)) {
   3353 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
   3354 
   3355 			/* lower part of this mseg inside kernel cage */
   3356 			decr = kcage_current_pfn(&kcagepfn);
   3357 
   3358 			/* kernel cage may have transitioned past mseg */
   3359 			if (kcagepfn >= mseg->pages_base &&
   3360 			    kcagepfn < mseg->pages_end) {
   3361 				ASSERT(decr == 0);
   3362 				*lo = MAX(kcagepfn, pfnlo);
   3363 				*hi = MIN(pfnhi, (mseg->pages_end - 1));
   3364 				rc = 1;
   3365 			}
   3366 		}
   3367 		/* else entire mseg in the cage */
   3368 	} else {
   3369 		if (PP_ISNORELOC(mseg->epages - 1)) {
   3370 
   3371 			/* upper part of this mseg inside kernel cage */
   3372 			decr = kcage_current_pfn(&kcagepfn);
   3373 
   3374 			/* kernel cage may have transitioned past mseg */
   3375 			if (kcagepfn >= mseg->pages_base &&
   3376 			    kcagepfn < mseg->pages_end) {
   3377 				ASSERT(decr);
   3378 				*hi = MIN(kcagepfn, pfnhi);
   3379 				*lo = MAX(pfnlo, mseg->pages_base);
   3380 				rc = 1;
   3381 			}
   3382 		} else {
   3383 			/* entire mseg outside of kernel cage */
   3384 			*lo = MAX(pfnlo, mseg->pages_base);
   3385 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
   3386 			rc = 1;
   3387 		}
   3388 	}
   3389 	return (rc);
   3390 }
   3391 
   3392 /*
   3393  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
   3394  * page with size code 'szc'. Claiming such a page requires acquiring
   3395  * exclusive locks on all constituent pages (page_trylock_contig_pages),
   3396  * relocating pages in use and concatenating these constituent pages into a
   3397  * large page.
   3398  *
   3399  * The page lists do not have such a large page and page_freelist_split has
   3400  * already failed to demote larger pages and/or coalesce smaller free pages.
   3401  *
   3402  * 'flags' may specify PG_MATCH_COLOR which would limit the search of large
   3403  * pages with the same color as 'bin'.
   3404  *
   3405  * 'pfnflag' specifies the subset of the pfn range to search.
         *
         * Returns the base page of the claimed large page, or NULL if no
         * candidate could be claimed or the memsegs lock could not be taken.
         * The memsegs lock is released again before returning.
   3406  */
   3407 
   3408 static page_t *
   3409 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
   3410     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
   3411 {
   3412 	struct memseg *mseg;
   3413 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
   3414 	pgcnt_t szcpgmask = szcpgcnt - 1;
   3415 	pfn_t	randpfn;
   3416 	page_t *pp, *randpp, *endpp;
   3417 	uint_t colors, ceq_mask;
   3418 	/* LINTED : set but not used in function */
   3419 	uint_t color_mask;
   3420 	pfn_t hi, lo;
   3421 	uint_t skip;
   3422 	MEM_NODE_ITERATOR_DECL(it);
   3423 
   3424 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
   3425 
        	/* candidates start on a large page boundary */
   3426 	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
   3427 
        	/* the range cannot hold even one page of this size */
   3428 	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
   3429 		return (NULL);
   3430 
   3431 	ASSERT(szc < mmu_page_sizes);
   3432 
        	/*
        	 * Build the color equivalency mask from colorequivszc[] only when
        	 * a color match is requested and the size has more than one
        	 * color; a mask of 0 makes every color acceptable.
        	 */
   3433 	colors = PAGE_GET_PAGECOLORS(szc);
   3434 	color_mask = colors - 1;
   3435 	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
   3436 		uchar_t ceq = colorequivszc[szc];
   3437 		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
   3438 
   3439 		ASSERT(ceq_dif > 0);
   3440 		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
   3441 	} else {
   3442 		ceq_mask = 0;
   3443 	}
   3444 
   3445 	ASSERT(bin < colors);
   3446 
   3447 	/* clear "non-significant" color bits */
   3448 	bin &= ceq_mask;
   3449 
   3450 	/*
   3451 	 * trim the pfn range to search based on pfnflag. pfnflag is set
   3452 	 * when there have been previous page_get_contig_page failures to
   3453 	 * limit the search.
   3454 	 *
   3455 	 * The high bit in pfnflag specifies the number of 'slots' in the
   3456 	 * pfn range and the remainder of pfnflag specifies which slot.
   3457 	 * For example, a value of 1010b would select slot 2 (counting
   3458 	 * from 0) of a pfn range that has been divided into 8 slots.
   3459 	 */
   3460 	if (pfnflag > 1) {
   3461 		int	slots = 1 << (highbit(pfnflag) - 1);
   3462 		int	slotid = pfnflag & (slots - 1);
   3463 		pgcnt_t	szcpages;
   3464 		int	slotlen;
   3465 
        		/* szcpages: whole large pages in range; slotlen: per slot */
   3466 		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
   3467 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
   3468 		slotlen = howmany(szcpages, slots);
   3469 		/* skip if 'slotid' slot is empty */
   3470 		if (slotid * slotlen >= szcpages)
   3471 			return (NULL);
   3472 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
   3473 		ASSERT(pfnlo < pfnhi);
   3474 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
   3475 			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
   3476 	}
   3477 
   3478 	/*
   3479 	 * This routine can be called recursively so we shouldn't
   3480 	 * acquire a reader lock if a write request is pending. This
   3481 	 * could lead to a deadlock with the DR thread.
   3482 	 *
   3483 	 * Returning NULL informs the caller that we could not get
   3484 	 * a contig page with the required characteristics.
   3485 	 */
   3486 
   3487 	if (!memsegs_trylock(0))
   3488 		return (NULL);
   3489 
   3490 	/*
   3491 	 * loop through memsegs to look for contig page candidates
   3492 	 */
   3493 
   3494 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
   3495 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
   3496 			/* no overlap */
   3497 			continue;
   3498 		}
   3499 
   3500 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
   3501 			/* mseg too small */
   3502 			continue;
   3503 
   3504 		/*
   3505 		 * trim off kernel cage pages from pfn range and check for
   3506 		 * a trimmed pfn range returned that does not span the
   3507 		 * desired large page size.
   3508 		 */
   3509 		if (kcage_on) {
   3510 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
   3511 			    lo >= hi || ((hi - lo) + 1) < szcpgcnt)
   3512 				continue;
   3513 		} else {
   3514 			lo = MAX(pfnlo, mseg->pages_base);
   3515 			hi = MIN(pfnhi, (mseg->pages_end - 1));
   3516 		}
   3517 
   3518 		/* round to szcpgcnt boundaries */
   3519 		lo = P2ROUNDUP(lo, szcpgcnt);
   3520 
   3521 		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
   3522 		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
   3523 
   3524 		if (hi <= lo)
   3525 			continue;
   3526 
   3527 		/*
   3528 		 * set lo to point to the pfn for the desired bin. Large
   3529 		 * page sizes may only have a single page color
   3530 		 */
   3531 		skip = szcpgcnt;
   3532 		if (ceq_mask > 0 || interleaved_mnodes) {
   3533 			/* set lo to point at appropriate color */
   3534 			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
   3535 			    (interleaved_mnodes &&
   3536 			    PFN_2_MEM_NODE(lo) != mnode)) {
   3537 				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
   3538 				    color_mask, &it);
   3539 			}
   3540 			if (hi <= lo)
   3541 				/* mseg cannot satisfy color request */
   3542 				continue;
   3543 		}
   3544 
   3545 		/* randomly choose a point between lo and hi to begin search */
   3546 
        		/* hi > lo was checked above, so the modulus is non-zero */
   3547 		randpfn = (pfn_t)GETTICK();
   3548 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
   3549 		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
        		/*
        		 * Move the start on to the next pfn of the wanted color,
        		 * falling back to lo when that reaches or passes hi.
        		 */
   3550 		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
   3551 			if (randpfn != (pfn_t)-1) {
   3552 				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
   3553 				    ceq_mask, color_mask, &it);
   3554 			}
   3555 			if (randpfn >= hi) {
   3556 				randpfn = lo;
   3557 				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
   3558 				    &it);
   3559 			}
   3560 		}
   3561 		randpp = mseg->pages + (randpfn - mseg->pages_base);
   3562 
   3563 		ASSERT(randpp->p_pagenum == randpfn);
   3564 
        		/* endpp is one past the last page_t of the trimmed range */
   3565 		pp = randpp;
   3566 		endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
   3567 
   3568 		ASSERT(randpp + szcpgcnt <= endpp);
   3569 
        		/*
        		 * Visit candidates from randpp onward, wrapping to lo at
        		 * the end of the range, until the walk is back at randpp.
        		 */
   3570 		do {
   3571 			ASSERT(!(pp->p_pagenum & szcpgmask));
   3572 			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
   3573 
   3574 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
   3575 				/* pages unlocked by page_claim on failure */
   3576 				if (page_claim_contig_pages(pp, szc, flags)) {
   3577 					memsegs_unlock(0);
   3578 					return (pp);
   3579 				}
   3580 			}
   3581 
        			/*
        			 * Without color or mnode constraints the next
        			 * candidate is simply one large page further on.
        			 */
   3582 			if (ceq_mask == 0 && !interleaved_mnodes) {
   3583 				pp += skip;
   3584 			} else {
   3585 				pfn_t pfn = pp->p_pagenum;
   3586 
   3587 				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
   3588 				    ceq_mask, color_mask, &it);
        				/* (pfn_t)-1 is handled as "no further pfn" */
   3589 				if (pfn == (pfn_t)-1) {
   3590 					pp = endpp;
   3591 				} else {
   3592 					pp = mseg->pages +
   3593 					    (pfn - mseg->pages_base);
   3594 				}
   3595 			}
   3596 			if (pp >= endpp) {
   3597 				/* start from the beginning */
   3598 				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
   3599 				pp = mseg->pages + (lo - mseg->pages_base);
   3600 				ASSERT(pp->p_pagenum == lo);
   3601 				ASSERT(pp + szcpgcnt <= endpp);
   3602 			}
   3603 		} while (pp != randpp);
   3604 	}
   3605 	memsegs_unlock(0);
   3606 	return (NULL);
   3607 }
   3608 
   3609 
   3610 /*
   3611  * controlling routine that searches through physical memory in an attempt to
   3612  * claim a large page based on the input parameters.
   3613  * on the page free lists.
   3614  *
   3615  * calls page_geti_contig_pages with an initial pfn range from the mnode
   3616  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
   3617  * that overlaps with the kernel cage or does not match the requested page
   3618  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
   3619  * page_geti_contig_pages may further limit the search range based on
   3620  * previous failure counts (pgcpfailcnt[]).
   3621  *
   3622  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
   3623  * pagesize page that satisfies mtype.
   3624  */
   3625 page_t *
   3626 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
   3627     uint_t flags)
   3628 {
   3629 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
   3630 	page_t		*pp;
   3631 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
   3632 
   3633 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
   3634 
   3635 	/* no allocations from cage */
   3636 	flags |= PGI_NOCAGE;
   3637 
   3638 	/* LINTED */
   3639 	MTYPE_START(mnode, mtype, flags);
   3640 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
   3641 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
   3642 		return (NULL);
   3643 	}
   3644 
   3645 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
   3646 
   3647 	/* do not limit search and ignore color if hi pri */
   3648 
   3649 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
   3650 		pfnflag = pgcpfailcnt[szc];
   3651 
   3652 	/* remove color match to improve chances */
   3653 
   3654 	if (flags & PGI_PGCPHIPRI || pfnflag)
   3655 		flags &= ~PG_MATCH_COLOR;
   3656 
   3657 	do {
   3658 		/* get pfn range based on mnode and mtype */
   3659 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
   3660 
   3661 		ASSERT(pfnhi >= pfnlo);
   3662 
   3663 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
   3664 		    pfnlo, pfnhi, pfnflag);
   3665 
   3666 		if (pp != NULL) {
   3667 			pfnflag = pgcpfailcnt[szc];
   3668 			if (pfnflag) {
   3669 				/* double the search size */
   3670 				pgcpfailcnt[szc] = pfnflag >> 1;
   3671 			}
   3672 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
   3673 			return (pp);
   3674 		}
   3675 		MTYPE_NEXT(mnode, mtype, flags);
   3676 	} while (mtype >= 0);
   3677 
   3678 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
   3679 	return (NULL);
   3680 }
   3681 
   3682 #if defined(__i386) || defined(__amd64)
   3683 /*
   3684  * Determine the likelihood of finding/coalescing a szc page.
   3685  * Return 0 if the likelihood is small otherwise return 1.
   3686  *
   3687  * For now, be conservative and check only 1g pages and return 0
   3688  * if there had been previous coalescing failures and the szc pages
   3689  * needed to satisfy request would exhaust most of freemem.
   3690  */
   3691 int
   3692 page_chk_freelist(uint_t szc)
   3693 {
   3694 	pgcnt_t		pgcnt;
   3695 
   3696 	if (szc <= 1)
   3697 		return (1);
   3698 
   3699 	pgcnt = page_get_pagecnt(szc);
   3700 	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
   3701 		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
   3702 		return (0);
   3703 	}
   3704 	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
   3705 	return (1);
   3706 }
   3707 #endif
   3708 
   3709 /*
   3710  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
   3711  *
   3712  * Does its own locking and accounting.
   3713  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
   3714  * pages of the proper color even if there are pages of a different color.
   3715  *
   3716  * Finds a page, removes it, THEN locks it.
   3717  */
   3718 
   3719 /*ARGSUSED*/
   3720 page_t *
   3721 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
   3722 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
   3723 {
   3724 	struct as	*as = seg->s_as;
   3725 	page_t		*pp = NULL;
   3726 	ulong_t		bin;
   3727 	uchar_t		szc;
   3728 	int		mnode;
   3729 	int		mtype;
   3730 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
   3731 	lgrp_mnode_cookie_t	lgrp_cookie;
   3732 
   3733 	page_get_func = page_get_mnode_freelist;
   3734 
   3735 	/*
   3736 	 * If we aren't passed a specific lgroup, or passed a freed lgrp
   3737 	 * assume we wish to allocate near to the current thread's home.
   3738 	 */
   3739 	if (!LGRP_EXISTS(lgrp))
   3740 		lgrp = lgrp_home_lgrp();
   3741 
   3742 	if (kcage_on) {
   3743 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
   3744 		    kcage_freemem < kcage_throttlefree + btop(size) &&
   3745 		    curthread != kcage_cageout_thread) {
   3746 			/*
   3747 			 * Set a "reserve" of kcage_throttlefree pages for
   3748 			 * PG_PANIC and cageout thread allocations.
   3749 			 *
   3750 			 * Everybody else has to serialize in
   3751 			 * page_create_get_something() to get a cage page, so
   3752 			 * that we don't deadlock cageout!
   3753 			 */
   3754 			return (NULL);
   3755 		}
   3756 	} else {
   3757 		flags &= ~PG_NORELOC;
   3758 		flags |= PGI_NOCAGE;
   3759 	}
   3760 
   3761 	/* LINTED */
   3762 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
   3763 
   3764 	/*
   3765 	 * Convert size to page size code.
   3766 	 */
   3767 	if ((szc = page_szc(size)) == (uchar_t)-1)
   3768 		panic("page_get_freelist: illegal page size request");
   3769 	ASSERT(szc < mmu_page_sizes);
   3770 
   3771 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
   3772 
   3773 	/* LINTED */
   3774 	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
   3775 
   3776 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
   3777 
   3778 	/*
   3779 	 * Try to get a local page first, but try remote if we can't
   3780 	 * get a page of the right color.
   3781 	 */
   3782 pgretry:
   3783 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
   3784 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
   3785 		pp = page_get_func(mnode, bin, mtype, szc, flags);
   3786 		if (pp != NULL) {
   3787 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
   3788 			DTRACE_PROBE4(page__get,
   3789 			    lgrp_t *, lgrp,
   3790 			    int, mnode,
   3791 			    ulong_t, bin,
   3792 			    uint_t, flags);
   3793 			return (pp);
   3794 		}
   3795 	}
   3796 	ASSERT(pp == NULL);
   3797 
   3798 	/*
   3799 	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
   3800 	 * remote free lists.  Caller expected to call page_get_cachelist which
   3801 	 * will check local cache lists and remote free lists.
   3802 	 */
   3803 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
   3804 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
   3805 		return (NULL);
   3806 	}
   3807 
   3808 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
   3809 
   3810 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
   3811 
   3812 	if (!(flags & PG_LOCAL)) {
   3813 		/*
   3814 		 * Try to get a non-local freelist page.
   3815 		 */
   3816 		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
   3817 		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
   3818 			pp = page_get_func(mnode, bin, mtype, szc, flags);
   3819 			if (pp != NULL) {
   3820 				DTRACE_PROBE4(page__get,
   3821 				    lgrp_t *, lgrp,
   3822 				    int, mnode,
   3823 				    ulong_t, bin,
   3824 				    uint_t, flags);
   3825 				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
   3826 				return (pp);
   3827 			}
   3828 		}
   3829 		ASSERT(pp == NULL);
   3830 	}
   3831 
   3832 	/*
   3833 	 * when the cage is off chances are page_get_contig_pages() will fail
   3834 	 * to lock a large page chunk therefore when the cage is off it's not
   3835 	 * called by default.  this can be changed via /etc/system.
   3836 	 *
   3837 	 * page_get_contig_pages() also called to acquire a base pagesize page
   3838 	 * for page_create_get_something().
   3839 	 */
   3840 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
   3841 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
   3842 	    (page_get_func != page_get_contig_pages)) {
   3843 
   3844 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
   3845 		page_get_func = page_get_contig_pages;
   3846 		goto pgretry;
   3847 	}
   3848 
   3849 	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
   3850 	    page_get_func == page_get_contig_pages)
   3851 		SETPGCPFAILCNT(szc);
   3852 
   3853 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
   3854 	return (NULL);
   3855 }
   3856 
   3857 /*
   3858  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
   3859  *
   3860  * Does its own locking.
   3861  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
   3862  * pages of the proper color even if there are pages of a different color.
   3863  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
   3864  * try to lock one of them.  If no page can be locked, try the
   3865  * next bin.  Return NULL if a page can not be found and locked.
   3866  *
   3867  * Finds a pages, trys to lock it, then removes it.
   3868  */
   3869 
   3870 /*ARGSUSED*/
   3871 page_t *
   3872 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
   3873     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
   3874 {
   3875 	page_t		*pp;
   3876 	struct as	*as = seg->s_as;
   3877 	ulong_t		bin;
   3878 	/*LINTED*/
   3879 	int		mnode;
   3880 	int		mtype;
   3881 	lgrp_mnode_cookie_t	lgrp_cookie;
   3882 
   3883 	/*
   3884 	 * If we aren't passed a specific lgroup, or pasased a freed lgrp
   3885 	 * assume we wish to allocate near to the current thread's home.
   3886 	 */
   3887 	if (!LGRP_EXISTS(lgrp))
   3888 		lgrp = lgrp_home_lgrp();
   3889 
   3890 	if (!kcage_on) {
   3891 		flags &= ~PG_NORELOC;
   3892 		flags |= PGI_NOCAGE;
   3893 	}
   3894 
   3895 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
   3896 	    kcage_freemem <= kcage_throttlefree) {
   3897 		/*
   3898 		 * Reserve kcage_throttlefree pages for critical kernel
   3899 		 * threads.
   3900 		 *
   3901 		 * Everybody else has to go to page_create_get_something()
   3902 		 * to get a cage page, so we don't deadlock cageout.
   3903 		 */
   3904 		return (NULL);
   3905 	}
   3906 
   3907 	/* LINTED */
   3908 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
   3909 
   3910 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
   3911 
   3912 	/* LINTED */
   3913 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
   3914 
   3915 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
   3916 
   3917 	/*
   3918 	 * Try local cachelists first
   3919 	 */
   3920 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
   3921 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
   3922 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
   3923 		if (pp != NULL) {
   3924 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
   3925 			DTRACE_PROBE4(page__get,
   3926 			    lgrp_t *, lgrp,
   3927 			    int, mnode,
   3928 			    ulong_t, bin,
   3929 			    uint_t, flags);
   3930 			return (pp);
   3931 		}
   3932 	}
   3933 
   3934 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
   3935 
   3936 	/*
   3937 	 * Try freelists/cachelists that are farther away
   3938 	 * This is our only chance to allocate remote pages for PAGESIZE
   3939 	 * requests.
   3940 	 */
   3941 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
   3942 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
   3943 		pp = page_get_mnode_freelist(mnode, bin, mtype,
   3944 		    0, flags);
   3945 		if (pp != NULL) {
   3946 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
   3947 			DTRACE_PROBE4(page__get,
   3948 			    lgrp_t *, lgrp,
   3949 			    int, mnode,
   3950 			    ulong_t, bin,
   3951 			    uint_t, flags);
   3952 			return (pp);
   3953 		}
   3954 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
   3955 		if (pp != NULL) {
   3956 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
   3957 			DTRACE_PROBE4(page__get,
   3958 			    lgrp_t *, lgrp,
   3959 			    int, mnode,
   3960 			    ulong_t, bin,
   3961 			    uint_t, flags);
   3962 			return (pp);
   3963 		}
   3964 	}
   3965 
   3966 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
   3967 	return (NULL);
   3968 }
   3969 
   3970 page_t *
   3971 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
   3972 {
   3973 	kmutex_t		*pcm;
   3974 	page_t			*pp, *first_pp;
   3975 	uint_t			sbin;
   3976 	int			plw_initialized;
   3977 	page_list_walker_t	plw;
   3978 
   3979 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
   3980 
   3981 	/* LINTED */
   3982 	MTYPE_START(mnode, mtype, flags);
   3983 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
   3984 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
   3985 		return (NULL);
   3986 	}
   3987 
   3988 try_again:
   3989 
   3990 	plw_initialized = 0;
   3991 	plw.plw_ceq_dif = 1;
   3992 
   3993 	/*
   3994 	 * Only hold one cachelist lock at a time, that way we
   3995 	 * can start anywhere and not have to worry about lock
   3996 	 * ordering.
   3997 	 */
   3998 
   3999 	for (plw.plw_count = 0;
   4000 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
   4001 		sbin = bin;
   4002 		do {
   4003 
   4004 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
   4005 				goto bin_empty_1;
   4006 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
   4007 			mutex_enter(pcm);
   4008 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
   4009 			if (pp == NULL)
   4010 				goto bin_empty_0;
   4011 
   4012 			first_pp = pp;
   4013 			ASSERT(pp->p_vnode);
   4014 			ASSERT(PP_ISAGED(pp) == 0);
   4015 			ASSERT(pp->p_szc == 0);
   4016 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
   4017 			while (!page_trylock(pp, SE_EXCL)) {
   4018 				pp = pp->p_next;
   4019 				ASSERT(pp->p_szc == 0);
   4020 				if (pp == first_pp) {
   4021 					/*
   4022 					 * We have searched the complete list!
   4023 					 * And all of them (might only be one)
   4024 					 * are locked. This can happen since
   4025 					 * these pages can also be found via
   4026 					 * the hash list. When found via the
   4027 					 * hash list, they are locked first,
   4028 					 * then removed. We give up to let the
   4029 					 * other thread run.
   4030 					 */
   4031 					pp = NULL;
   4032 					break;
   4033 				}
   4034 				ASSERT(pp->p_vnode);
   4035 				ASSERT(PP_ISFREE(pp));
   4036 				ASSERT(PP_ISAGED(pp) == 0);
   4037 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
   4038 				    mnode);
   4039 			}
   4040 
   4041 			if (pp) {
   4042 				page_t	**ppp;
   4043 				/*
   4044 				 * Found and locked a page.
   4045 				 * Pull it off the list.
   4046 				 */
   4047 				ASSERT(mtype == PP_2_MTYPE(pp));
   4048 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
   4049 				page_sub(ppp, pp);
   4050 				/*
   4051 				 * Subtract counters before releasing pcm mutex
   4052 				 * to avoid a race with page_freelist_coalesce
   4053 				 * and page_freelist_split.
   4054 				 */
   4055 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
   4056 				mutex_exit(pcm);
   4057 				ASSERT(pp->p_vnode);
   4058 				ASSERT(PP_ISAGED(pp) == 0);
   4059 #if defined(__sparc)
   4060 				ASSERT(!kcage_on ||
   4061 				    (flags & PG_NORELOC) == 0 ||
   4062 				    PP_ISNORELOC(pp));
   4063 				if (PP_ISNORELOC(pp)) {
   4064 					kcage_freemem_sub(1);
   4065 				}
   4066 #endif
   4067 				VM_STAT_ADD(vmm_vmstats. pgmc_allocok);
   4068 				return (pp);
   4069 			}
   4070 bin_empty_0:
   4071 			mutex_exit(pcm);
   4072 bin_empty_1:
   4073 			if (plw_initialized == 0) {
   4074 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
   4075 				plw_initialized = 1;
   4076 			}
   4077 			/* calculate the next bin with equivalent color */
   4078 			bin = ADD_MASKED(bin, plw.plw_bin_step,
   4079 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
   4080 		} while (sbin != bin);
   4081 
   4082 		if (plw.plw_ceq_dif > 1)
   4083 			bin = page_list_walk_next_bin(0, bin, &plw);
   4084 	}
   4085 
   4086 	MTYPE_NEXT(mnode, mtype, flags);
   4087 	if (mtype >= 0)
   4088 		goto try_again;
   4089 
   4090 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
   4091 	return (NULL);
   4092 }
   4093 
   4094 #ifdef DEBUG
   4095 #define	REPL_PAGE_STATS
   4096 #endif /* DEBUG */
   4097 
   4098 #ifdef REPL_PAGE_STATS
   4099 struct repl_page_stats {
   4100 	uint_t	ngets;
   4101 	uint_t	ngets_noreloc;
   4102 	uint_t	npgr_noreloc;
   4103 	uint_t	nnopage_first;
   4104 	uint_t	nnopage;
   4105 	uint_t	nhashout;
   4106 	uint_t	nnofree;
   4107 	uint_t	nnext_pp;
   4108 } repl_page_stats;
   4109 #define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
   4110 #else /* REPL_PAGE_STATS */
   4111 #define	REPL_STAT_INCR(v)
   4112 #endif /* REPL_PAGE_STATS */
   4113 
   4114 int	pgrppgcp;
   4115 
   4116 /*
   4117  * The freemem accounting must be done by the caller.
   4118  * First we try to get a replacement page of the same size as like_pp,
   4119  * if that is not possible, then we just get a set of discontiguous
   4120  * PAGESIZE pages.
   4121  */
   4122 page_t *
   4123 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
   4124     uint_t pgrflags)
   4125 {
   4126 	page_t		*like_pp;
   4127 	page_t		*pp, *pplist;
   4128 	page_t		*pl = NULL;
   4129 	ulong_t		bin;
   4130 	int		mnode, page_mnode;
   4131 	int		szc;
   4132 	spgcnt_t	npgs, pg_cnt;
   4133 	pfn_t		pfnum;
   4134 	int		mtype;
   4135 	int		flags = 0;
   4136 	lgrp_mnode_cookie_t	lgrp_cookie;
   4137 	lgrp_t		*lgrp;
   4138 
   4139 	REPL_STAT_INCR(ngets);
   4140 	like_pp = orig_like_pp;
   4141 	ASSERT(PAGE_EXCL(like_pp));
   4142 
   4143 	szc = like_pp->p_szc;
   4144 	npgs = page_get_pagecnt(szc);
   4145 	/*
   4146 	 * Now we reset like_pp to the base page_t.
   4147 	 * That way, we won't walk past the end of this 'szc' page.
   4148 	 */
   4149 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
   4150 	like_pp = page_numtopp_nolock(pfnum);
   4151 	ASSERT(like_pp->p_szc == szc);
   4152 
   4153 	if (PP_ISNORELOC(like_pp)) {
   4154 		ASSERT(kcage_on);
   4155 		REPL_STAT_INCR(ngets_noreloc);
   4156 		flags = PGI_RELOCONLY;
   4157 	} else if (pgrflags & PGR_NORELOC) {
   4158 		ASSERT(kcage_on);
   4159 		REPL_STAT_INCR(npgr_noreloc);
   4160 		flags = PG_NORELOC;
   4161 	}
   4162 
   4163 	/*
   4164 	 * Kernel pages must always be replaced with the same size
   4165 	 * pages, since we cannot properly handle demotion of kernel
   4166 	 * pages.
   4167 	 */
   4168 	if (PP_ISKAS(like_pp))
   4169 		pgrflags |= PGR_SAMESZC;
   4170 
   4171 	/* LINTED */
   4172 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
   4173 
   4174 	while (npgs) {
   4175 		pplist = NULL;
   4176 		for (;;) {
   4177 			pg_cnt = page_get_pagecnt(szc);
   4178 			bin = PP_2_BIN(like_pp);
   4179 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
   4180 			ASSERT(pg_cnt <= npgs);
   4181 
   4182 			/*
   4183 			 * If an lgroup was specified, try to get the
   4184 			 * page from that lgroup.
   4185 			 * NOTE: Must be careful with code below because
   4186 			 *	 lgroup may disappear and reappear since there
   4187 			 *	 is no locking for lgroup here.
   4188 			 */
   4189 			if (LGRP_EXISTS(lgrp_target)) {
   4190 				/*
   4191 				 * Keep local variable for lgroup separate
   4192 				 * from lgroup argument since this code should
   4193 				 * only be exercised when lgroup argument
   4194 				 * exists....
   4195 				 */
   4196 				lgrp = lgrp_target;
   4197 
   4198 				/* Try the lgroup's freelists first */
   4199 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
   4200 				    LGRP_SRCH_LOCAL);
   4201 				while ((pplist == NULL) &&
   4202 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
   4203 				    != -1) {
   4204 					pplist =
   4205 					    page_get_mnode_freelist(mnode, bin,
   4206 					    mtype, szc, flags);
   4207 				}
   4208 
   4209 				/*
   4210 				 * Now try it's cachelists if this is a
   4211 				 * small page. Don't need to do it for
   4212 				 * larger ones since page_freelist_coalesce()
   4213 				 * already failed.
   4214 				 */
   4215 				if (pplist != NULL || szc != 0)
   4216 					break;
   4217 
   4218 				/* Now try it's cachelists */
   4219 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
   4220 				    LGRP_SRCH_LOCAL);
   4221 
   4222 				while ((pplist == NULL) &&
   4223 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
   4224 				    != -1) {
   4225 					pplist =
   4226 					    page_get_mnode_cachelist(bin, flags,
   4227 					    mnode, mtype);
   4228 				}
   4229 				if (pplist != NULL) {
   4230 					page_hashout(pplist, NULL);
   4231 					PP_SETAGED(pplist);
   4232 					REPL_STAT_INCR(nhashout);
   4233 					break;
   4234 				}
   4235 				/* Done looking in this lgroup. Bail out. */
   4236 				break;
   4237 			}
   4238 
   4239 			/*
   4240 			 * No lgroup was specified (or lgroup was removed by
   4241 			 * DR, so just try to get the page as close to
   4242 			 * like_pp's mnode as possible.
   4243 			 * First try the local freelist...
   4244 			 */
   4245 			mnode = PP_2_MEM_NODE(like_pp);
   4246 			pplist = page_get_mnode_freelist(mnode, bin,
   4247 			    mtype, szc, flags);
   4248 			if (pplist != NULL)
   4249 				break;
   4250 
   4251 			REPL_STAT_INCR(nnofree);
   4252 
   4253 			/*
   4254 			 * ...then the local cachelist. Don't need to do it for
   4255 			 * larger pages cause page_freelist_coalesce() already
   4256 			 * failed there anyway.
   4257 			 */
   4258 			if (szc == 0) {
   4259 				pplist = page_get_mnode_cachelist(bin, flags,
   4260 				    mnode, mtype);
   4261 				if (pplist != NULL) {
   4262 					page_hashout(pplist, NULL);
   4263 					PP_SETAGED(pplist);
   4264 					REPL_STAT_INCR(nhashout);
   4265 					break;
   4266 				}
   4267 			}
   4268 
   4269 			/* Now try remote freelists */
   4270 			page_mnode = mnode;
   4271 			lgrp =
   4272 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
   4273 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
   4274 			    LGRP_SRCH_HIER);
   4275 			while (pplist == NULL &&
   4276 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
   4277 			    != -1) {
   4278 				/*
   4279 				 * Skip local mnode.
   4280 				 */
   4281 				if ((mnode == page_mnode) ||
   4282 				    (mem_node_config[mnode].exists == 0))
   4283 					continue;
   4284 
   4285 				pplist = page_get_mnode_freelist(mnode,
   4286 				    bin, mtype, szc, flags);
   4287 			}
   4288 
   4289 			if (pplist != NULL)
   4290 				break;
   4291 
   4292 
   4293 			/* Now try remote cachelists */
   4294 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
   4295 			    LGRP_SRCH_HIER);
   4296 			while (pplist == NULL && szc == 0) {
   4297 				mnode = lgrp_memnode_choose(&lgrp_cookie);
   4298 				if (mnode == -1)
   4299 					break;
   4300 				/*
   4301 				 * Skip local mnode.
   4302 				 */
   4303 				if ((mnode == page_mnode) ||
   4304 				    (mem_node_config[mnode].exists == 0))
   4305 					continue;
   4306 
   4307 				pplist = page_get_mnode_cachelist(bin,
   4308 				    flags, mnode, mtype);
   4309 
   4310 				if (pplist != NULL) {
   4311 					page_hashout(pplist, NULL);
   4312 					PP_SETAGED(pplist);
   4313 					REPL_STAT_INCR(nhashout);
   4314 					break;
   4315 				}
   4316 			}
   4317 
   4318 			/*
   4319 			 * Break out of while loop under the following cases:
   4320 			 * - If we successfully got a page.
   4321 			 * - If pgrflags specified only returning a specific
   4322 			 *   page size and we could not find that page size.
   4323 			 * - If we could not satisfy the request with PAGESIZE
   4324 			 *   or larger pages.
   4325 			 */
   4326 			if (pplist != NULL || szc == 0)
   4327 				break;
   4328 
   4329 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
   4330 				/* try to find contig page */
   4331 
   4332 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
   4333 				    LGRP_SRCH_HIER);
   4334 
   4335 				while ((pplist == NULL) &&
   4336 				    (mnode =
   4337 				    lgrp_memnode_choose(&lgrp_cookie))
   4338 				    != -1) {
   4339 					pplist = page_get_contig_pages(
   4340 					    mnode, bin, mtype, szc,
   4341 					    flags | PGI_PGCPHIPRI);
   4342 				}
   4343 				break;
   4344 			}
   4345 
   4346 			/*
   4347 			 * The correct thing to do here is try the next
   4348 			 * page size down using szc--. Due to a bug
   4349 			 * with the processing of HAT_RELOAD_SHARE
   4350 			 * where the sfmmu_ttecnt arrays of all
   4351 			 * hats sharing an ISM segment don't get updated,
   4352 			 * using intermediate size pages for relocation
   4353 			 * can lead to continuous page faults.
   4354 			 */
   4355 			szc = 0;
   4356 		}
   4357 
   4358 		if (pplist != NULL) {
   4359 			DTRACE_PROBE4(page__get,
   4360 			    lgrp_t *, lgrp,
   4361 			    int, mnode,
   4362 			    ulong_t, bin,
   4363 			    uint_t, flags);
   4364 
   4365 			while (pplist != NULL && pg_cnt--) {
   4366 				ASSERT(pplist != NULL);
   4367 				pp = pplist;
   4368 				page_sub(&pplist, pp);
   4369 				PP_CLRFREE(pp);
   4370 				PP_CLRAGED(pp);
   4371 				page_list_concat(&pl, &pp);
   4372 				npgs--;
   4373 				like_pp = like_pp + 1;
   4374 				REPL_STAT_INCR(nnext_pp);
   4375 			}
   4376 			ASSERT(pg_cnt == 0);
   4377 		} else {
   4378 			break;
   4379 		}
   4380 	}
   4381 
   4382 	if (npgs) {
   4383 		/*
   4384 		 * We were unable to allocate the necessary number
   4385 		 * of pages.
   4386 		 * We need to free up any pl.
   4387 		 */
   4388 		REPL_STAT_INCR(nnopage);
   4389 		page_free_replacement_page(pl);
   4390 		return (NULL);
   4391 	} else {
   4392 		return (pl);
   4393 	}
   4394 }
   4395 
   4396 /*
   4397  * demote a free large page to it's constituent pages
   4398  */
   4399 void
   4400 page_demote_free_pages(page_t *pp)
   4401 {
   4402 
   4403 	int mnode;
   4404 
   4405 	ASSERT(pp != NULL);
   4406 	ASSERT(PAGE_LOCKED(pp));
   4407 	ASSERT(PP_ISFREE(pp));
   4408 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
   4409 
   4410 	mnode = PP_2_MEM_NODE(pp);
   4411 	page_freelist_lock(mnode);
   4412 	if (pp->p_szc != 0) {
   4413 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
   4414 		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
   4415 	}
   4416 	page_freelist_unlock(mnode);
   4417 	ASSERT(pp->p_szc == 0);
   4418 }
   4419 
   4420 /*
   4421  * Factor in colorequiv to check additional 'equivalent' bins.
   4422  * colorequiv may be set in /etc/system
   4423  */
   4424 void
   4425 page_set_colorequiv_arr(void)
   4426 {
   4427 	if (colorequiv > 1) {
   4428 		int i;
   4429 		uint_t sv_a = lowbit(colorequiv) - 1;
   4430 
   4431 		if (sv_a > 15)
   4432 			sv_a = 15;
   4433 
   4434 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
   4435 			uint_t colors;
   4436 			uint_t a = sv_a;
   4437 
   4438 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
   4439 				continue;
   4440 			}
   4441 			while ((colors >> a) == 0)
   4442 				a--;
   4443 			if ((a << 4) > colorequivszc[i]) {
   4444 				colorequivszc[i] = (a << 4);
   4445 			}
   4446 		}
   4447 	}
   4448 }
   4449