Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * UNIX machine dependent virtual memory support.
     28  */
     29 
     30 #include <sys/vm.h>
     31 #include <sys/exec.h>
     32 
     33 #include <sys/exechdr.h>
     34 #include <vm/seg_kmem.h>
     35 #include <sys/atomic.h>
     36 #include <sys/archsystm.h>
     37 #include <sys/machsystm.h>
     38 #include <sys/kdi.h>
     39 #include <sys/cpu_module.h>
     40 
     41 #include <vm/hat_sfmmu.h>
     42 
     43 #include <sys/memnode.h>
     44 
     45 #include <sys/mem_config.h>
     46 #include <sys/mem_cage.h>
     47 #include <vm/vm_dep.h>
     48 #include <vm/page.h>
     49 #include <sys/platform_module.h>
     50 
     51 /*
     52  * These variables are set by module specific config routines.
     53  * They are only set by modules which will use physical cache page coloring.
     54  */
     55 int do_pg_coloring = 0;
     56 
     57 /*
     58  * These variables can be conveniently patched at kernel load time to
     59  * prevent do_pg_coloring from being enabled by
     60  * module specific config routines.
     61  */
     62 
     63 int use_page_coloring = 1;
     64 
     65 /*
     66  * initialized by page_coloring_init()
     67  */
     68 extern uint_t page_colors;
     69 extern uint_t page_colors_mask;
     70 extern uint_t page_coloring_shift;
     71 int cpu_page_colors;
     72 uint_t vac_colors = 0;
     73 uint_t vac_colors_mask = 0;
     74 
     75 /* cpu specific coloring initialization */
     76 extern void page_coloring_init_cpu();
     77 #pragma weak page_coloring_init_cpu
     78 
     79 /*
     80  * get the ecache setsize for the current cpu.
     81  */
     82 #define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)
     83 
     84 plcnt_t		plcnt;		/* page list count */
     85 
     86 /*
     87  * This variable is set by the cpu module to contain the lowest
     88  * address not affected by the SF_ERRATA_57 workaround.  It should
     89  * remain 0 if the workaround is not needed.
     90  */
     91 #if defined(SF_ERRATA_57)
     92 caddr_t errata57_limit;
     93 #endif
     94 
     95 extern void page_relocate_hash(page_t *, page_t *);
     96 
     97 /*
     98  * these must be defined in platform specific areas
     99  */
    100 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
    101 	struct proc *, uint_t);
    102 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
    103 	caddr_t, size_t, uint_t, struct lgrp *);
    104 /*
    105  * Convert page frame number to an OBMEM page frame number
    106  * (i.e. put in the type bits -- zero for this implementation)
    107  */
    108 pfn_t
    109 impl_obmem_pfnum(pfn_t pf)
    110 {
    111 	return (pf);
    112 }
    113 
    114 /*
    115  * Use physmax to determine the highest physical page of DRAM memory
    116  * It is assumed that any physical addresses above physmax is in IO space.
    117  * We don't bother checking the low end because we assume that memory space
    118  * begins at physical page frame 0.
    119  *
    120  * Return 1 if the page frame is onboard DRAM memory, else 0.
    121  * Returns 0 for nvram so it won't be cached.
    122  */
    123 int
    124 pf_is_memory(pfn_t pf)
    125 {
    126 	/* We must be IO space */
    127 	if (pf > physmax)
    128 		return (0);
    129 
    130 	/* We must be memory space */
    131 	return (1);
    132 }
    133 
    134 /*
    135  * Handle a pagefault.
    136  */
    137 faultcode_t
    138 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
    139 {
    140 	struct as *as;
    141 	struct proc *p;
    142 	faultcode_t res;
    143 	caddr_t base;
    144 	size_t len;
    145 	int err;
    146 
    147 	if (INVALID_VADDR(addr))
    148 		return (FC_NOMAP);
    149 
    150 	if (iskernel) {
    151 		as = &kas;
    152 	} else {
    153 		p = curproc;
    154 		as = p->p_as;
    155 #if defined(SF_ERRATA_57)
    156 		/*
    157 		 * Prevent infinite loops due to a segment driver
    158 		 * setting the execute permissions and the sfmmu hat
    159 		 * silently ignoring them.
    160 		 */
    161 		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
    162 		    addr < errata57_limit) {
    163 			res = FC_NOMAP;
    164 			goto out;
    165 		}
    166 #endif
    167 	}
    168 
    169 	/*
    170 	 * Dispatch pagefault.
    171 	 */
    172 	res = as_fault(as->a_hat, as, addr, 1, type, rw);
    173 
    174 	/*
    175 	 * If this isn't a potential unmapped hole in the user's
    176 	 * UNIX data or stack segments, just return status info.
    177 	 */
    178 	if (!(res == FC_NOMAP && iskernel == 0))
    179 		goto out;
    180 
    181 	/*
    182 	 * Check to see if we happened to faulted on a currently unmapped
    183 	 * part of the UNIX data or stack segments.  If so, create a zfod
    184 	 * mapping there and then try calling the fault routine again.
    185 	 */
    186 	base = p->p_brkbase;
    187 	len = p->p_brksize;
    188 
    189 	if (addr < base || addr >= base + len) {		/* data seg? */
    190 		base = (caddr_t)(p->p_usrstack - p->p_stksize);
    191 		len = p->p_stksize;
    192 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
    193 			/* not in either UNIX data or stack segments */
    194 			res = FC_NOMAP;
    195 			goto out;
    196 		}
    197 	}
    198 
    199 	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
    200 	/* This code is probably not needed anymore */
    201 
    202 	/* expand the gap to the page boundaries on each side */
    203 	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
    204 	    ((uintptr_t)base & PAGEMASK);
    205 	base = (caddr_t)((uintptr_t)base & PAGEMASK);
    206 
    207 	as_rangelock(as);
    208 	as_purge(as);
    209 	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
    210 		err = as_map(as, base, len, segvn_create, zfod_argsp);
    211 		as_rangeunlock(as);
    212 		if (err) {
    213 			res = FC_MAKE_ERR(err);
    214 			goto out;
    215 		}
    216 	} else {
    217 		/*
    218 		 * This page is already mapped by another thread after we
    219 		 * returned from as_fault() above.  We just fallthrough
    220 		 * as_fault() below.
    221 		 */
    222 		as_rangeunlock(as);
    223 	}
    224 
    225 	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
    226 
    227 out:
    228 
    229 	return (res);
    230 }
    231 
    232 /*
    233  * This is the routine which defines the address limit implied
    234  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
    235  * mappable address in a 32-bit process on this platform (though
    236  * perhaps we should make it be UINT32_MAX here?)
    237  */
    238 void
    239 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
    240 {
    241 	struct proc *p = curproc;
    242 	caddr_t userlimit = flags & _MAP_LOW32 ?
    243 	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
    244 	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
    245 }
    246 
    247 /*
    248  * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
    249  */
    250 caddr_t	hole_start, hole_end;
    251 
    252 /*
    253  * kpm mapping window
    254  */
    255 caddr_t kpm_vbase;
    256 size_t  kpm_size;
    257 uchar_t kpm_size_shift;
    258 
    259 int valid_va_range_aligned_wraparound;
    260 /*
    261  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
    262  * addresses at least "minlen" long, where the base of the range is at "off"
    263  * phase from an "align" boundary and there is space for a "redzone"-sized
    264  * redzone on either side of the range.  On success, 1 is returned and *basep
    265  * and *lenp are adjusted to describe the acceptable range (including
    266  * the redzone).  On failure, 0 is returned.
    267  */
    268 int
    269 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
    270     size_t align, size_t redzone, size_t off)
    271 {
    272 	caddr_t hi, lo;
    273 	size_t tot_len;
    274 
    275 	ASSERT(align == 0 ? off == 0 : off < align);
    276 	ASSERT(ISP2(align));
    277 	ASSERT(align == 0 || align >= PAGESIZE);
    278 
    279 	lo = *basep;
    280 	hi = lo + *lenp;
    281 	tot_len = minlen + 2 * redzone;	/* need at least this much space */
    282 
    283 	/* If hi rolled over the top try cutting back. */
    284 	if (hi < lo) {
    285 		*lenp = 0UL - (uintptr_t)lo - 1UL;
    286 		/* Trying to see if this really happens, and then if so, why */
    287 		valid_va_range_aligned_wraparound++;
    288 		hi = lo + *lenp;
    289 	}
    290 	if (*lenp < tot_len) {
    291 		return (0);
    292 	}
    293 
    294 	/*
    295 	 * Deal with a possible hole in the address range between
    296 	 * hole_start and hole_end that should never be mapped by the MMU.
    297 	 */
    298 
    299 	if (lo < hole_start) {
    300 		if (hi > hole_start)
    301 			if (hi < hole_end)
    302 				hi = hole_start;
    303 			else
    304 				/* lo < hole_start && hi >= hole_end */
    305 				if (dir == AH_LO) {
    306 					/*
    307 					 * prefer lowest range
    308 					 */
    309 					if (hole_start - lo >= tot_len)
    310 						hi = hole_start;
    311 					else if (hi - hole_end >= tot_len)
    312 						lo = hole_end;
    313 					else
    314 						return (0);
    315 				} else {
    316 					/*
    317 					 * prefer highest range
    318 					 */
    319 					if (hi - hole_end >= tot_len)
    320 						lo = hole_end;
    321 					else if (hole_start - lo >= tot_len)
    322 						hi = hole_start;
    323 					else
    324 						return (0);
    325 				}
    326 	} else {
    327 		/* lo >= hole_start */
    328 		if (hi < hole_end)
    329 			return (0);
    330 		if (lo < hole_end)
    331 			lo = hole_end;
    332 	}
    333 
    334 	/* Check if remaining length is too small */
    335 	if (hi - lo < tot_len) {
    336 		return (0);
    337 	}
    338 	if (align > 1) {
    339 		caddr_t tlo = lo + redzone;
    340 		caddr_t thi = hi - redzone;
    341 		tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
    342 		if (tlo < lo + redzone) {
    343 			return (0);
    344 		}
    345 		if (thi < tlo || thi - tlo < minlen) {
    346 			return (0);
    347 		}
    348 	}
    349 	*basep = lo;
    350 	*lenp = hi - lo;
    351 	return (1);
    352 }
    353 
    354 /*
    355  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
    356  * addresses at least "minlen" long.  On success, 1 is returned and *basep
    357  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
    358  * is returned.
    359  */
    360 int
    361 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
    362 {
    363 	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
    364 }
    365 
    366 /*
    367  * Determine whether [addr, addr+len] with protections `prot' are valid
    368  * for a user address space.
    369  */
    370 /*ARGSUSED*/
    371 int
    372 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    373     caddr_t userlimit)
    374 {
    375 	caddr_t eaddr = addr + len;
    376 
    377 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
    378 		return (RANGE_BADADDR);
    379 
    380 	/*
    381 	 * Determine if the address range falls within an illegal
    382 	 * range of the MMU.
    383 	 */
    384 	if (eaddr > hole_start && addr < hole_end)
    385 		return (RANGE_BADADDR);
    386 
    387 #if defined(SF_ERRATA_57)
    388 	/*
    389 	 * Make sure USERLIMIT isn't raised too high
    390 	 */
    391 	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
    392 	    errata57_limit == 0);
    393 
    394 	if (AS_TYPE_64BIT(as) &&
    395 	    (addr < errata57_limit) &&
    396 	    (prot & PROT_EXEC))
    397 		return (RANGE_BADPROT);
    398 #endif /* SF_ERRATA57 */
    399 	return (RANGE_OKAY);
    400 }
    401 
    402 /*
    403  * Routine used to check to see if an a.out can be executed
    404  * by the current machine/architecture.
    405  */
    406 int
    407 chkaout(struct exdata *exp)
    408 {
    409 	if (exp->ux_mach == M_SPARC)
    410 		return (0);
    411 	else
    412 		return (ENOEXEC);
    413 }
    414 
    415 /*
    416  * The following functions return information about an a.out
    417  * which is used when a program is executed.
    418  */
    419 
    420 /*
    421  * Return the load memory address for the data segment.
    422  */
    423 caddr_t
    424 getdmem(struct exec *exp)
    425 {
    426 	/*
    427 	 * XXX - Sparc Reference Hack approaching
    428 	 * Remember that we are loading
    429 	 * 8k executables into a 4k machine
    430 	 * DATA_ALIGN == 2 * PAGESIZE
    431 	 */
    432 	if (exp->a_text)
    433 		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
    434 	else
    435 		return ((caddr_t)USRTEXT);
    436 }
    437 
    438 /*
    439  * Return the starting disk address for the data segment.
    440  */
    441 ulong_t
    442 getdfile(struct exec *exp)
    443 {
    444 	if (exp->a_magic == ZMAGIC)
    445 		return (exp->a_text);
    446 	else
    447 		return (sizeof (struct exec) + exp->a_text);
    448 }
    449 
    450 /*
    451  * Return the load memory address for the text segment.
    452  */
    453 
    454 /*ARGSUSED*/
    455 caddr_t
    456 gettmem(struct exec *exp)
    457 {
    458 	return ((caddr_t)USRTEXT);
    459 }
    460 
    461 /*
    462  * Return the file byte offset for the text segment.
    463  */
    464 uint_t
    465 gettfile(struct exec *exp)
    466 {
    467 	if (exp->a_magic == ZMAGIC)
    468 		return (0);
    469 	else
    470 		return (sizeof (struct exec));
    471 }
    472 
    473 void
    474 getexinfo(
    475 	struct exdata *edp_in,
    476 	struct exdata *edp_out,
    477 	int *pagetext,
    478 	int *pagedata)
    479 {
    480 	*edp_out = *edp_in;	/* structure copy */
    481 
    482 	if ((edp_in->ux_mag == ZMAGIC) &&
    483 	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
    484 		*pagetext = 1;
    485 		*pagedata = 1;
    486 	} else {
    487 		*pagetext = 0;
    488 		*pagedata = 0;
    489 	}
    490 }
    491 
    492 /*
    493  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
    494  * KPM selects an address such that it's equal offset modulo shm_alignment and
    495  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
    496  */
    497 int
    498 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
    499 {
    500 	if (vac) {
    501 		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
    502 	} else {
    503 		return (0);
    504 	}
    505 }
    506 
/*
 * Sanity control.  Don't use large pages, regardless of user settings,
 * if there is less than privm_lpg_min_physmem or shm_lpg_min_physmem
 * memory installed.  The units for these variables are 8K pages.
 */
    512 pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
    513 pgcnt_t privm_lpg_min_physmem = 131072;			/* 1GB */
    514 
    515 static size_t
    516 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
    517 {
    518 	size_t		pgsz = MMU_PAGESIZE;
    519 	int		szc;
    520 
    521 	/*
    522 	 * If len is zero, retrieve from proc and don't demote the page size.
    523 	 * Use atleast the default pagesize.
    524 	 */
    525 	if (len == 0) {
    526 		len = p->p_brkbase + p->p_brksize - p->p_bssbase;
    527 	}
    528 	len = MAX(len, default_uheap_lpsize);
    529 
    530 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
    531 		pgsz = hw_page_array[szc].hp_size;
    532 		if ((disable_auto_data_large_pages & (1 << szc)) ||
    533 		    pgsz > max_uheap_lpsize)
    534 			continue;
    535 		if (len >= pgsz) {
    536 			break;
    537 		}
    538 	}
    539 
    540 	/*
    541 	 * If addr == 0 we were called by memcntl() when the
    542 	 * size code is 0.  Don't set pgsz less than current size.
    543 	 */
    544 	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
    545 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
    546 	}
    547 
    548 	return (pgsz);
    549 }
    550 
    551 static size_t
    552 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
    553 {
    554 	size_t		pgsz = MMU_PAGESIZE;
    555 	int		szc;
    556 
    557 	/*
    558 	 * If len is zero, retrieve from proc and don't demote the page size.
    559 	 * Use atleast the default pagesize.
    560 	 */
    561 	if (len == 0) {
    562 		len = p->p_stksize;
    563 	}
    564 	len = MAX(len, default_ustack_lpsize);
    565 
    566 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
    567 		pgsz = hw_page_array[szc].hp_size;
    568 		if ((disable_auto_data_large_pages & (1 << szc)) ||
    569 		    pgsz > max_ustack_lpsize)
    570 			continue;
    571 		if (len >= pgsz) {
    572 			break;
    573 		}
    574 	}
    575 
    576 	/*
    577 	 * If addr == 0 we were called by memcntl() or exec_args() when the
    578 	 * size code is 0.  Don't set pgsz less than current size.
    579 	 */
    580 	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
    581 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
    582 	}
    583 
    584 	return (pgsz);
    585 }
    586 
    587 static size_t
    588 map_pgszism(caddr_t addr, size_t len)
    589 {
    590 	uint_t szc;
    591 	size_t pgsz;
    592 
    593 	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
    594 		if (disable_ism_large_pages & (1 << szc))
    595 			continue;
    596 
    597 		pgsz = hw_page_array[szc].hp_size;
    598 		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
    599 			return (pgsz);
    600 	}
    601 
    602 	return (DEFAULT_ISM_PAGESIZE);
    603 }
    604 
    605 /*
    606  * Suggest a page size to be used to map a segment of type maptype and length
    607  * len.  Returns a page size (not a size code).
    608  */
    609 /* ARGSUSED */
    610 size_t
    611 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
    612 {
    613 	size_t	pgsz = MMU_PAGESIZE;
    614 
    615 	ASSERT(maptype != MAPPGSZ_VA);
    616 
    617 	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
    618 		return (MMU_PAGESIZE);
    619 	}
    620 
    621 	switch (maptype) {
    622 	case MAPPGSZ_ISM:
    623 		pgsz = map_pgszism(addr, len);
    624 		break;
    625 
    626 	case MAPPGSZ_STK:
    627 		if (max_ustack_lpsize > MMU_PAGESIZE) {
    628 			pgsz = map_pgszstk(p, addr, len);
    629 		}
    630 		break;
    631 
    632 	case MAPPGSZ_HEAP:
    633 		if (max_uheap_lpsize > MMU_PAGESIZE) {
    634 			pgsz = map_pgszheap(p, addr, len);
    635 		}
    636 		break;
    637 	}
    638 	return (pgsz);
    639 }
    640 
    641 
    642 /* assumes TTE8K...TTE4M == szc */
    643 
    644 static uint_t
    645 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
    646     size_t max_lpsize, size_t min_physmem)
    647 {
    648 	caddr_t eaddr = addr + size;
    649 	uint_t szcvec = 0;
    650 	caddr_t raddr;
    651 	caddr_t readdr;
    652 	size_t pgsz;
    653 	int i;
    654 
    655 	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
    656 		return (0);
    657 	}
    658 	for (i = mmu_page_sizes - 1; i > 0; i--) {
    659 		if (disable_lpgs & (1 << i)) {
    660 			continue;
    661 		}
    662 		pgsz = page_get_pagesize(i);
    663 		if (pgsz > max_lpsize) {
    664 			continue;
    665 		}
    666 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
    667 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
    668 		if (raddr < addr || raddr >= readdr) {
    669 			continue;
    670 		}
    671 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
    672 			continue;
    673 		}
    674 		szcvec |= (1 << i);
    675 		/*
    676 		 * And or in the remaining enabled page sizes.
    677 		 */
    678 		szcvec |= P2PHASE(~disable_lpgs, (1 << i));
    679 		szcvec &= ~1; /* no need to return 8K pagesize */
    680 		break;
    681 	}
    682 	return (szcvec);
    683 }
    684 
    685 /*
    686  * Return a bit vector of large page size codes that
    687  * can be used to map [addr, addr + len) region.
    688  */
    689 /* ARGSUSED */
    690 uint_t
    691 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    692     int memcntl)
    693 {
    694 	if (flags & MAP_TEXT) {
    695 		return (map_szcvec(addr, size, off,
    696 		    disable_auto_text_large_pages,
    697 		    max_utext_lpsize, shm_lpg_min_physmem));
    698 
    699 	} else if (flags & MAP_INITDATA) {
    700 		return (map_szcvec(addr, size, off,
    701 		    disable_auto_data_large_pages,
    702 		    max_uidata_lpsize, privm_lpg_min_physmem));
    703 
    704 	} else if (type == MAPPGSZC_SHM) {
    705 		return (map_szcvec(addr, size, off,
    706 		    disable_auto_data_large_pages,
    707 		    max_shm_lpsize, shm_lpg_min_physmem));
    708 
    709 	} else if (type == MAPPGSZC_HEAP) {
    710 		return (map_szcvec(addr, size, off,
    711 		    disable_auto_data_large_pages,
    712 		    max_uheap_lpsize, privm_lpg_min_physmem));
    713 
    714 	} else if (type == MAPPGSZC_STACK) {
    715 		return (map_szcvec(addr, size, off,
    716 		    disable_auto_data_large_pages,
    717 		    max_ustack_lpsize, privm_lpg_min_physmem));
    718 
    719 	} else {
    720 		return (map_szcvec(addr, size, off,
    721 		    disable_auto_data_large_pages,
    722 		    max_privmap_lpsize, privm_lpg_min_physmem));
    723 	}
    724 }
    725 
/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory. Each element of the table contains
 * the array of counters, the size of array which is allocated during
 * startup based on physmax and a shift value used to convert a pagenum
 * into a counter array index or vice versa. The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size: 	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page
 *			made up of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
 *
 * 	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M. It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.	So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */
    754 
    755 /*
    756  * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
    757  * dimensions are allocated dynamically.
    758  */
    759 page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
    760 
    761 /*
    762  * For now there is only a single size cache list.
    763  * Allocated dynamically.
    764  */
    765 page_t ***page_cachelists[MAX_MEM_TYPES];
    766 
    767 kmutex_t *fpc_mutex[NPC_MUTEX];
    768 kmutex_t *cpc_mutex[NPC_MUTEX];
    769 
    770 /*
    771  * Calculate space needed for page freelists and counters
    772  */
    773 size_t
    774 calc_free_pagelist_sz(void)
    775 {
    776 	int szc;
    777 	size_t alloc_sz, cache_sz, free_sz;
    778 
    779 	/*
    780 	 * one cachelist per color, node, and type
    781 	 */
    782 	cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
    783 	    sizeof (page_t **);
    784 	cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
    785 
    786 	/*
    787 	 * one freelist per size, color, node, and type
    788 	 */
    789 	free_sz = sizeof (page_t **);
    790 	for (szc = 0; szc < mmu_page_sizes; szc++)
    791 		free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
    792 	free_sz *= max_mem_nodes * MAX_MEM_TYPES;
    793 
    794 	alloc_sz = cache_sz + free_sz + page_ctrs_sz();
    795 	return (alloc_sz);
    796 }
    797 
    798 caddr_t
    799 alloc_page_freelists(caddr_t alloc_base)
    800 {
    801 	int	mnode, mtype;
    802 	int	szc, clrs;
    803 
    804 	/*
    805 	 * We only support small pages in the cachelist.
    806 	 */
    807 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
    808 		page_cachelists[mtype] = (page_t ***)alloc_base;
    809 		alloc_base += (max_mem_nodes * sizeof (page_t **));
    810 		for (mnode = 0; mnode < max_mem_nodes; mnode++) {
    811 			page_cachelists[mtype][mnode] = (page_t **)alloc_base;
    812 			alloc_base +=
    813 			    (page_get_pagecolors(0) * sizeof (page_t *));
    814 		}
    815 	}
    816 
    817 	/*
    818 	 * Allocate freelists bins for all
    819 	 * supported page sizes.
    820 	 */
    821 	for (szc = 0; szc < mmu_page_sizes; szc++) {
    822 		clrs = page_get_pagecolors(szc);
    823 		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
    824 			page_freelists[szc][mtype] = (page_t ***)alloc_base;
    825 			alloc_base += (max_mem_nodes * sizeof (page_t **));
    826 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
    827 				page_freelists[szc][mtype][mnode] =
    828 				    (page_t **)alloc_base;
    829 				alloc_base += (clrs * (sizeof (page_t *)));
    830 			}
    831 		}
    832 	}
    833 
    834 	alloc_base = page_ctrs_alloc(alloc_base);
    835 	return (alloc_base);
    836 }
    837 
    838 /*
    839  * Allocate page_freelists locks for a memnode from the nucleus data
    840  * area. This is the first time that mmu_page_sizes is used during
    841  * bootup, so check mmu_page_sizes initialization.
    842  */
    843 int
    844 ndata_alloc_page_mutexs(struct memlist *ndata)
    845 {
    846 	size_t alloc_sz;
    847 	caddr_t alloc_base;
    848 	int	i;
    849 	void	page_coloring_init();
    850 
    851 	page_coloring_init();
    852 	if (&mmu_init_mmu_page_sizes) {
    853 		if (!mmu_init_mmu_page_sizes(0)) {
    854 			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
    855 			    mmu_page_sizes);
    856 		}
    857 	}
    858 	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
    859 
    860 	/* fpc_mutex and cpc_mutex */
    861 	alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
    862 
    863 	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
    864 	if (alloc_base == NULL)
    865 		return (-1);
    866 
    867 	ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
    868 
    869 	for (i = 0; i < NPC_MUTEX; i++) {
    870 		fpc_mutex[i] = (kmutex_t *)alloc_base;
    871 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
    872 		cpc_mutex[i] = (kmutex_t *)alloc_base;
    873 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
    874 	}
    875 	return (0);
    876 }
    877 
    878 /*
    879  * To select our starting bin, we stride through the bins with a stride
    880  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
    881  * in simulation and practice for different workloads on varying cache sizes.
    882  */
    883 uint32_t color_start_current = 0;
    884 uint32_t color_start_stride = 337;
    885 int color_start_random = 0;
    886 
    887 /* ARGSUSED */
    888 uint_t
    889 get_color_start(struct as *as)
    890 {
    891 	uint32_t old, new;
    892 
    893 	if (consistent_coloring == 2 || color_start_random) {
    894 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
    895 		    (hw_page_array[0].hp_colors - 1)));
    896 	}
    897 
    898 	do {
    899 		old = color_start_current;
    900 		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
    901 	} while (cas32(&color_start_current, old, new) != old);
    902 
    903 	return ((uint_t)(new));
    904 }
    905 
    906 /*
    907  * Called once at startup from kphysm_init() -- before memialloc()
    908  * is invoked to do the 1st page_free()/page_freelist_add().
    909  *
    910  * initializes page_colors and page_colors_mask based on ecache_setsize.
    911  *
    912  * Also initializes the counter locks.
    913  */
    914 void
    915 page_coloring_init()
    916 {
    917 	int	a, i;
    918 	uint_t colors;
    919 
    920 	if (do_pg_coloring == 0) {
    921 		page_colors = 1;
    922 		for (i = 0; i < mmu_page_sizes; i++) {
    923 			colorequivszc[i] = 0;
    924 			hw_page_array[i].hp_colors = 1;
    925 		}
    926 		return;
    927 	}
    928 
    929 	/*
    930 	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
    931 	 * the max ecache setsize of all cpus configured in the system or, for
    932 	 * cheetah+ systems, the max possible ecache setsize for all possible
    933 	 * cheetah+ cpus.
    934 	 */
    935 	page_colors = ecache_setsize / MMU_PAGESIZE;
    936 	page_colors_mask = page_colors - 1;
    937 
    938 	vac_colors = vac_size / MMU_PAGESIZE;
    939 	vac_colors_mask = vac_colors -1;
    940 
    941 	page_coloring_shift = 0;
    942 	a = ecache_setsize;
    943 	while (a >>= 1) {
    944 		page_coloring_shift++;
    945 	}
    946 
    947 	/* initialize number of colors per page size */
    948 	for (i = 0; i < mmu_page_sizes; i++) {
    949 		hw_page_array[i].hp_colors = (page_colors_mask >>
    950 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
    951 		    + 1;
    952 		colorequivszc[i] = 0;
    953 	}
    954 
    955 	/*
    956 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
    957 	 * cpu_page_colors set to -1 during DR operation or during startup
    958 	 * if setsizes are heterogenous.
    959 	 *
    960 	 * The value of cpu_page_colors determines if additional color bins
    961 	 * need to be checked for a particular color in the page_get routines.
    962 	 */
    963 	if (cpu_setsize > 0 && cpu_page_colors == 0 &&
    964 	    cpu_setsize < ecache_setsize) {
    965 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
    966 		a = lowbit(page_colors) - lowbit(cpu_page_colors);
    967 		ASSERT(a > 0);
    968 		ASSERT(a < 16);
    969 
    970 		for (i = 0; i < mmu_page_sizes; i++) {
    971 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
    972 				continue;
    973 			}
    974 			while ((colors >> a) == 0)
    975 				a--;
    976 			ASSERT(a >= 0);
    977 
    978 			/* higher 4 bits encodes color equiv mask */
    979 			colorequivszc[i] = (a << 4);
    980 		}
    981 	}
    982 
    983 	/* do cpu specific color initialization */
    984 	if (&page_coloring_init_cpu) {
    985 		page_coloring_init_cpu();
    986 	}
    987 }
    988 
    989 int
    990 bp_color(struct buf *bp)
    991 {
    992 	int color = -1;
    993 
    994 	if (vac) {
    995 		if ((bp->b_flags & B_PAGEIO) != 0) {
    996 			color = sfmmu_get_ppvcolor(bp->b_pages);
    997 		} else if (bp->b_un.b_addr != NULL) {
    998 			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
    999 		}
   1000 	}
   1001 	return (color < 0 ? 0 : ptob(color));
   1002 }
   1003 
   1004 /*
   1005  * Function for flushing D-cache when performing module relocations
   1006  * to an alternate mapping.  Stubbed out on all platforms except sun4u,
   1007  * at least for now.
   1008  */
   1009 void
   1010 dcache_flushall()
   1011 {
   1012 	sfmmu_cache_flushall();
   1013 }
   1014 
   1015 static int
   1016 kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
   1017 {
   1018 	if (va1 < va2 && va1 + sz1 <= va2)
   1019 		return (0);
   1020 
   1021 	if (va2 < va1 && va2 + sz2 <= va1)
   1022 		return (0);
   1023 
   1024 	return (1);
   1025 }
   1026 
   1027 /*
   1028  * Return the number of bytes, relative to the beginning of a given range, that
   1029  * are non-toxic (can be read from and written to with relative impunity).
   1030  */
   1031 size_t
   1032 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
   1033 {
   1034 	/* OBP reads are harmless, but we don't want people writing there */
   1035 	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
   1036 	    OFW_START_ADDR + 1))
   1037 		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
   1038 
   1039 	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
   1040 		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
   1041 
   1042 	return (sz); /* no overlap */
   1043 }
   1044 
   1045 /*
   1046  * Minimum physmem required for enabling large pages for kernel heap
   1047  * Currently we do not enable lp for kmem on systems with less
   1048  * than 1GB of memory. This value can be changed via /etc/system
   1049  */
   1050 size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */
   1051 
   1052 /*
   1053  * this function chooses large page size for kernel heap
   1054  */
   1055 size_t
   1056 get_segkmem_lpsize(size_t lpsize)
   1057 {
   1058 	size_t memtotal = physmem * PAGESIZE;
   1059 	size_t mmusz;
   1060 	uint_t szc;
   1061 
   1062 	if (memtotal < segkmem_lpminphysmem)
   1063 		return (PAGESIZE);
   1064 
   1065 	if (plat_lpkmem_is_supported != NULL &&
   1066 	    plat_lpkmem_is_supported() == 0)
   1067 		return (PAGESIZE);
   1068 
   1069 	mmusz = mmu_get_kernel_lpsize(lpsize);
   1070 	szc = page_szc(mmusz);
   1071 
   1072 	while (szc) {
   1073 		if (!(disable_large_pages & (1 << szc)))
   1074 			return (page_get_pagesize(szc));
   1075 		szc--;
   1076 	}
   1077 	return (PAGESIZE);
   1078 }
   1079