Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 /*
     29  * UNIX machine dependent virtual memory support.
     30  */
     31 
     32 #include <sys/vm.h>
     33 #include <sys/exec.h>
     34 
     35 #include <sys/exechdr.h>
     36 #include <vm/seg_kmem.h>
     37 #include <sys/atomic.h>
     38 #include <sys/archsystm.h>
     39 #include <sys/machsystm.h>
     40 #include <sys/kdi.h>
     41 #include <sys/cpu_module.h>
     42 
     43 #include <vm/hat_sfmmu.h>
     44 
     45 #include <sys/memnode.h>
     46 
     47 #include <sys/mem_config.h>
     48 #include <sys/mem_cage.h>
     49 #include <vm/vm_dep.h>
     50 #include <vm/page.h>
     51 #include <sys/platform_module.h>
     52 
/*
 * Master switch for physical cache page coloring.  It is set by module
 * specific config routines, and only by modules which will use physical
 * cache page coloring.  While it is 0, page_coloring_init() sets up a
 * single color for every page size and returns early.
 */
int do_pg_coloring = 0;

/*
 * This variable can be conveniently patched at kernel load time to
 * prevent do_pg_coloring from being enabled by module specific config
 * routines.
 */

int use_page_coloring = 1;

/*
 * Coloring parameters initialized by page_coloring_init().
 *
 * page_colors, page_colors_mask and page_coloring_shift are defined
 * elsewhere and derived from ecache_setsize.  cpu_page_colors is only
 * filled in when cpu_setsize is smaller than ecache_setsize.  vac_colors
 * and vac_colors_mask are derived from vac_size.
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/*
 * cpu specific coloring initialization.  The symbol is weak, so
 * page_coloring_init() tests its address before calling it.
 */
extern void page_coloring_init_cpu();
#pragma weak page_coloring_init_cpu
     80 
/*
 * Get the ecache setsize for the current cpu, looked up in cpunodes[]
 * by CPU->cpu_id.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t		plcnt;		/* page list count */

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 *
 * pagefault() and valid_usr_range() compare user addresses against it
 * and refuse execute access below it for 64-bit address spaces.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

extern void page_relocate_hash(page_t *, page_t *);

/*
 * These must be defined in platform specific areas.  map_addr() in this
 * file is a thin wrapper around map_addr_proc().
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
	struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
	caddr_t, size_t, uint_t, struct lgrp *);
    106 /*
    107  * Convert page frame number to an OBMEM page frame number
    108  * (i.e. put in the type bits -- zero for this implementation)
    109  */
    110 pfn_t
    111 impl_obmem_pfnum(pfn_t pf)
    112 {
    113 	return (pf);
    114 }
    115 
    116 /*
    117  * Use physmax to determine the highest physical page of DRAM memory
    118  * It is assumed that any physical addresses above physmax is in IO space.
    119  * We don't bother checking the low end because we assume that memory space
    120  * begins at physical page frame 0.
    121  *
    122  * Return 1 if the page frame is onboard DRAM memory, else 0.
    123  * Returns 0 for nvram so it won't be cached.
    124  */
    125 int
    126 pf_is_memory(pfn_t pf)
    127 {
    128 	/* We must be IO space */
    129 	if (pf > physmax)
    130 		return (0);
    131 
    132 	/* We must be memory space */
    133 	return (1);
    134 }
    135 
    136 /*
    137  * Handle a pagefault.
    138  */
    139 faultcode_t
    140 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
    141 {
    142 	struct as *as;
    143 	struct proc *p;
    144 	faultcode_t res;
    145 	caddr_t base;
    146 	size_t len;
    147 	int err;
    148 
    149 	if (INVALID_VADDR(addr))
    150 		return (FC_NOMAP);
    151 
    152 	if (iskernel) {
    153 		as = &kas;
    154 	} else {
    155 		p = curproc;
    156 		as = p->p_as;
    157 #if defined(SF_ERRATA_57)
    158 		/*
    159 		 * Prevent infinite loops due to a segment driver
    160 		 * setting the execute permissions and the sfmmu hat
    161 		 * silently ignoring them.
    162 		 */
    163 		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
    164 		    addr < errata57_limit) {
    165 			res = FC_NOMAP;
    166 			goto out;
    167 		}
    168 #endif
    169 	}
    170 
    171 	/*
    172 	 * Dispatch pagefault.
    173 	 */
    174 	res = as_fault(as->a_hat, as, addr, 1, type, rw);
    175 
    176 	/*
    177 	 * If this isn't a potential unmapped hole in the user's
    178 	 * UNIX data or stack segments, just return status info.
    179 	 */
    180 	if (!(res == FC_NOMAP && iskernel == 0))
    181 		goto out;
    182 
    183 	/*
    184 	 * Check to see if we happened to faulted on a currently unmapped
    185 	 * part of the UNIX data or stack segments.  If so, create a zfod
    186 	 * mapping there and then try calling the fault routine again.
    187 	 */
    188 	base = p->p_brkbase;
    189 	len = p->p_brksize;
    190 
    191 	if (addr < base || addr >= base + len) {		/* data seg? */
    192 		base = (caddr_t)(p->p_usrstack - p->p_stksize);
    193 		len = p->p_stksize;
    194 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
    195 			/* not in either UNIX data or stack segments */
    196 			res = FC_NOMAP;
    197 			goto out;
    198 		}
    199 	}
    200 
    201 	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
    202 	/* This code is probably not needed anymore */
    203 
    204 	/* expand the gap to the page boundaries on each side */
    205 	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
    206 	    ((uintptr_t)base & PAGEMASK);
    207 	base = (caddr_t)((uintptr_t)base & PAGEMASK);
    208 
    209 	as_rangelock(as);
    210 	as_purge(as);
    211 	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
    212 		err = as_map(as, base, len, segvn_create, zfod_argsp);
    213 		as_rangeunlock(as);
    214 		if (err) {
    215 			res = FC_MAKE_ERR(err);
    216 			goto out;
    217 		}
    218 	} else {
    219 		/*
    220 		 * This page is already mapped by another thread after we
    221 		 * returned from as_fault() above.  We just fallthrough
    222 		 * as_fault() below.
    223 		 */
    224 		as_rangeunlock(as);
    225 	}
    226 
    227 	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
    228 
    229 out:
    230 
    231 	return (res);
    232 }
    233 
    234 /*
    235  * This is the routine which defines the address limit implied
    236  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
    237  * mappable address in a 32-bit process on this platform (though
    238  * perhaps we should make it be UINT32_MAX here?)
    239  */
    240 void
    241 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
    242 {
    243 	struct proc *p = curproc;
    244 	caddr_t userlimit = flags & _MAP_LOW32 ?
    245 	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
    246 	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
    247 }
    248 
/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 * valid_va_range_aligned() and valid_usr_range() keep mappings out of the
 * addresses from hole_start up to (but not including) hole_end.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 * NOTE(review): these are only defined here, not set or read in this
 * file; presumably base, size and log2(size) of the window -- confirm.
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;
    260 
    261 int valid_va_range_aligned_wraparound;
    262 /*
    263  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
    264  * addresses at least "minlen" long, where the base of the range is at "off"
    265  * phase from an "align" boundary and there is space for a "redzone"-sized
    266  * redzone on either side of the range.  On success, 1 is returned and *basep
    267  * and *lenp are adjusted to describe the acceptable range (including
    268  * the redzone).  On failure, 0 is returned.
    269  */
    270 int
    271 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
    272     size_t align, size_t redzone, size_t off)
    273 {
    274 	caddr_t hi, lo;
    275 	size_t tot_len;
    276 
    277 	ASSERT(align == 0 ? off == 0 : off < align);
    278 	ASSERT(ISP2(align));
    279 	ASSERT(align == 0 || align >= PAGESIZE);
    280 
    281 	lo = *basep;
    282 	hi = lo + *lenp;
    283 	tot_len = minlen + 2 * redzone;	/* need at least this much space */
    284 
    285 	/* If hi rolled over the top try cutting back. */
    286 	if (hi < lo) {
    287 		*lenp = 0UL - (uintptr_t)lo - 1UL;
    288 		/* Trying to see if this really happens, and then if so, why */
    289 		valid_va_range_aligned_wraparound++;
    290 		hi = lo + *lenp;
    291 	}
    292 	if (*lenp < tot_len) {
    293 		return (0);
    294 	}
    295 
    296 	/*
    297 	 * Deal with a possible hole in the address range between
    298 	 * hole_start and hole_end that should never be mapped by the MMU.
    299 	 */
    300 
    301 	if (lo < hole_start) {
    302 		if (hi > hole_start)
    303 			if (hi < hole_end)
    304 				hi = hole_start;
    305 			else
    306 				/* lo < hole_start && hi >= hole_end */
    307 				if (dir == AH_LO) {
    308 					/*
    309 					 * prefer lowest range
    310 					 */
    311 					if (hole_start - lo >= tot_len)
    312 						hi = hole_start;
    313 					else if (hi - hole_end >= tot_len)
    314 						lo = hole_end;
    315 					else
    316 						return (0);
    317 				} else {
    318 					/*
    319 					 * prefer highest range
    320 					 */
    321 					if (hi - hole_end >= tot_len)
    322 						lo = hole_end;
    323 					else if (hole_start - lo >= tot_len)
    324 						hi = hole_start;
    325 					else
    326 						return (0);
    327 				}
    328 	} else {
    329 		/* lo >= hole_start */
    330 		if (hi < hole_end)
    331 			return (0);
    332 		if (lo < hole_end)
    333 			lo = hole_end;
    334 	}
    335 
    336 	/* Check if remaining length is too small */
    337 	if (hi - lo < tot_len) {
    338 		return (0);
    339 	}
    340 	if (align > 1) {
    341 		caddr_t tlo = lo + redzone;
    342 		caddr_t thi = hi - redzone;
    343 		tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
    344 		if (tlo < lo + redzone) {
    345 			return (0);
    346 		}
    347 		if (thi < tlo || thi - tlo < minlen) {
    348 			return (0);
    349 		}
    350 	}
    351 	*basep = lo;
    352 	*lenp = hi - lo;
    353 	return (1);
    354 }
    355 
    356 /*
    357  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
    358  * addresses at least "minlen" long.  On success, 1 is returned and *basep
    359  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
    360  * is returned.
    361  */
    362 int
    363 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
    364 {
    365 	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
    366 }
    367 
    368 /*
    369  * Determine whether [addr, addr+len] with protections `prot' are valid
    370  * for a user address space.
    371  */
    372 /*ARGSUSED*/
    373 int
    374 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    375     caddr_t userlimit)
    376 {
    377 	caddr_t eaddr = addr + len;
    378 
    379 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
    380 		return (RANGE_BADADDR);
    381 
    382 	/*
    383 	 * Determine if the address range falls within an illegal
    384 	 * range of the MMU.
    385 	 */
    386 	if (eaddr > hole_start && addr < hole_end)
    387 		return (RANGE_BADADDR);
    388 
    389 #if defined(SF_ERRATA_57)
    390 	/*
    391 	 * Make sure USERLIMIT isn't raised too high
    392 	 */
    393 	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
    394 	    errata57_limit == 0);
    395 
    396 	if (AS_TYPE_64BIT(as) &&
    397 	    (addr < errata57_limit) &&
    398 	    (prot & PROT_EXEC))
    399 		return (RANGE_BADPROT);
    400 #endif /* SF_ERRATA57 */
    401 	return (RANGE_OKAY);
    402 }
    403 
    404 /*
    405  * Routine used to check to see if an a.out can be executed
    406  * by the current machine/architecture.
    407  */
    408 int
    409 chkaout(struct exdata *exp)
    410 {
    411 	if (exp->ux_mach == M_SPARC)
    412 		return (0);
    413 	else
    414 		return (ENOEXEC);
    415 }
    416 
    417 /*
    418  * The following functions return information about an a.out
    419  * which is used when a program is executed.
    420  */
    421 
    422 /*
    423  * Return the load memory address for the data segment.
    424  */
    425 caddr_t
    426 getdmem(struct exec *exp)
    427 {
    428 	/*
    429 	 * XXX - Sparc Reference Hack approaching
    430 	 * Remember that we are loading
    431 	 * 8k executables into a 4k machine
    432 	 * DATA_ALIGN == 2 * PAGESIZE
    433 	 */
    434 	if (exp->a_text)
    435 		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
    436 	else
    437 		return ((caddr_t)USRTEXT);
    438 }
    439 
    440 /*
    441  * Return the starting disk address for the data segment.
    442  */
    443 ulong_t
    444 getdfile(struct exec *exp)
    445 {
    446 	if (exp->a_magic == ZMAGIC)
    447 		return (exp->a_text);
    448 	else
    449 		return (sizeof (struct exec) + exp->a_text);
    450 }
    451 
    452 /*
    453  * Return the load memory address for the text segment.
    454  */
    455 
    456 /*ARGSUSED*/
    457 caddr_t
    458 gettmem(struct exec *exp)
    459 {
    460 	return ((caddr_t)USRTEXT);
    461 }
    462 
    463 /*
    464  * Return the file byte offset for the text segment.
    465  */
    466 uint_t
    467 gettfile(struct exec *exp)
    468 {
    469 	if (exp->a_magic == ZMAGIC)
    470 		return (0);
    471 	else
    472 		return (sizeof (struct exec));
    473 }
    474 
    475 void
    476 getexinfo(
    477 	struct exdata *edp_in,
    478 	struct exdata *edp_out,
    479 	int *pagetext,
    480 	int *pagedata)
    481 {
    482 	*edp_out = *edp_in;	/* structure copy */
    483 
    484 	if ((edp_in->ux_mag == ZMAGIC) &&
    485 	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
    486 		*pagetext = 1;
    487 		*pagedata = 1;
    488 	} else {
    489 		*pagetext = 0;
    490 		*pagedata = 0;
    491 	}
    492 }
    493 
    494 /*
    495  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
    496  * KPM selects an address such that it's equal offset modulo shm_alignment and
    497  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
    498  */
    499 int
    500 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
    501 {
    502 	if (vac) {
    503 		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
    504 	} else {
    505 		return (0);
    506 	}
    507 }
    508 
/*
 * Sanity control. Don't use large pages regardless of user
 * settings if there's less than priv or shm_lpg_min_physmem memory installed.
 * The units for these variables are 8K pages (131072 * 8K == 1GB).
 *
 * map_pgsz() checks privm_lpg_min_physmem for every non-ISM request;
 * map_pgszcvec() passes shm_lpg_min_physmem for MAP_TEXT and
 * MAPPGSZC_SHM requests and privm_lpg_min_physmem for the rest.
 */
pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
pgcnt_t privm_lpg_min_physmem = 131072;			/* 1GB */
    516 
    517 static size_t
    518 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
    519 {
    520 	size_t		pgsz = MMU_PAGESIZE;
    521 	int		szc;
    522 
    523 	/*
    524 	 * If len is zero, retrieve from proc and don't demote the page size.
    525 	 * Use atleast the default pagesize.
    526 	 */
    527 	if (len == 0) {
    528 		len = p->p_brkbase + p->p_brksize - p->p_bssbase;
    529 	}
    530 	len = MAX(len, default_uheap_lpsize);
    531 
    532 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
    533 		pgsz = hw_page_array[szc].hp_size;
    534 		if ((disable_auto_data_large_pages & (1 << szc)) ||
    535 		    pgsz > max_uheap_lpsize)
    536 			continue;
    537 		if (len >= pgsz) {
    538 			break;
    539 		}
    540 	}
    541 
    542 	/*
    543 	 * If addr == 0 we were called by memcntl() when the
    544 	 * size code is 0.  Don't set pgsz less than current size.
    545 	 */
    546 	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
    547 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
    548 	}
    549 
    550 	return (pgsz);
    551 }
    552 
    553 static size_t
    554 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
    555 {
    556 	size_t		pgsz = MMU_PAGESIZE;
    557 	int		szc;
    558 
    559 	/*
    560 	 * If len is zero, retrieve from proc and don't demote the page size.
    561 	 * Use atleast the default pagesize.
    562 	 */
    563 	if (len == 0) {
    564 		len = p->p_stksize;
    565 	}
    566 	len = MAX(len, default_ustack_lpsize);
    567 
    568 	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
    569 		pgsz = hw_page_array[szc].hp_size;
    570 		if ((disable_auto_data_large_pages & (1 << szc)) ||
    571 		    pgsz > max_ustack_lpsize)
    572 			continue;
    573 		if (len >= pgsz) {
    574 			break;
    575 		}
    576 	}
    577 
    578 	/*
    579 	 * If addr == 0 we were called by memcntl() or exec_args() when the
    580 	 * size code is 0.  Don't set pgsz less than current size.
    581 	 */
    582 	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
    583 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
    584 	}
    585 
    586 	return (pgsz);
    587 }
    588 
    589 static size_t
    590 map_pgszism(caddr_t addr, size_t len)
    591 {
    592 	uint_t szc;
    593 	size_t pgsz;
    594 
    595 	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
    596 		if (disable_ism_large_pages & (1 << szc))
    597 			continue;
    598 
    599 		pgsz = hw_page_array[szc].hp_size;
    600 		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
    601 			return (pgsz);
    602 	}
    603 
    604 	return (DEFAULT_ISM_PAGESIZE);
    605 }
    606 
    607 /*
    608  * Suggest a page size to be used to map a segment of type maptype and length
    609  * len.  Returns a page size (not a size code).
    610  */
    611 /* ARGSUSED */
    612 size_t
    613 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
    614 {
    615 	size_t	pgsz = MMU_PAGESIZE;
    616 
    617 	ASSERT(maptype != MAPPGSZ_VA);
    618 
    619 	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
    620 		return (MMU_PAGESIZE);
    621 	}
    622 
    623 	switch (maptype) {
    624 	case MAPPGSZ_ISM:
    625 		pgsz = map_pgszism(addr, len);
    626 		break;
    627 
    628 	case MAPPGSZ_STK:
    629 		if (max_ustack_lpsize > MMU_PAGESIZE) {
    630 			pgsz = map_pgszstk(p, addr, len);
    631 		}
    632 		break;
    633 
    634 	case MAPPGSZ_HEAP:
    635 		if (max_uheap_lpsize > MMU_PAGESIZE) {
    636 			pgsz = map_pgszheap(p, addr, len);
    637 		}
    638 		break;
    639 	}
    640 	return (pgsz);
    641 }
    642 
    643 
    644 /* assumes TTE8K...TTE4M == szc */
    645 
    646 static uint_t
    647 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
    648     size_t max_lpsize, size_t min_physmem)
    649 {
    650 	caddr_t eaddr = addr + size;
    651 	uint_t szcvec = 0;
    652 	caddr_t raddr;
    653 	caddr_t readdr;
    654 	size_t pgsz;
    655 	int i;
    656 
    657 	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
    658 		return (0);
    659 	}
    660 	for (i = mmu_page_sizes - 1; i > 0; i--) {
    661 		if (disable_lpgs & (1 << i)) {
    662 			continue;
    663 		}
    664 		pgsz = page_get_pagesize(i);
    665 		if (pgsz > max_lpsize) {
    666 			continue;
    667 		}
    668 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
    669 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
    670 		if (raddr < addr || raddr >= readdr) {
    671 			continue;
    672 		}
    673 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
    674 			continue;
    675 		}
    676 		szcvec |= (1 << i);
    677 		/*
    678 		 * And or in the remaining enabled page sizes.
    679 		 */
    680 		szcvec |= P2PHASE(~disable_lpgs, (1 << i));
    681 		szcvec &= ~1; /* no need to return 8K pagesize */
    682 		break;
    683 	}
    684 	return (szcvec);
    685 }
    686 
    687 /*
    688  * Return a bit vector of large page size codes that
    689  * can be used to map [addr, addr + len) region.
    690  */
    691 /* ARGSUSED */
    692 uint_t
    693 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    694     int memcntl)
    695 {
    696 	if (flags & MAP_TEXT) {
    697 		return (map_szcvec(addr, size, off,
    698 		    disable_auto_text_large_pages,
    699 		    max_utext_lpsize, shm_lpg_min_physmem));
    700 
    701 	} else if (flags & MAP_INITDATA) {
    702 		return (map_szcvec(addr, size, off,
    703 		    disable_auto_data_large_pages,
    704 		    max_uidata_lpsize, privm_lpg_min_physmem));
    705 
    706 	} else if (type == MAPPGSZC_SHM) {
    707 		return (map_szcvec(addr, size, off,
    708 		    disable_auto_data_large_pages,
    709 		    max_shm_lpsize, shm_lpg_min_physmem));
    710 
    711 	} else if (type == MAPPGSZC_HEAP) {
    712 		return (map_szcvec(addr, size, off,
    713 		    disable_auto_data_large_pages,
    714 		    max_uheap_lpsize, privm_lpg_min_physmem));
    715 
    716 	} else if (type == MAPPGSZC_STACK) {
    717 		return (map_szcvec(addr, size, off,
    718 		    disable_auto_data_large_pages,
    719 		    max_ustack_lpsize, privm_lpg_min_physmem));
    720 
    721 	} else {
    722 		return (map_szcvec(addr, size, off,
    723 		    disable_auto_data_large_pages,
    724 		    max_privmap_lpsize, privm_lpg_min_physmem));
    725 	}
    726 }
    727 
/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory. Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax) and a shift value used to convert a pagenum
 * into a counter array index or vice versa. The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size: 	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
 *
 * 	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M. It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.	So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */
    756 
/*
 * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 *
 * alloc_page_freelists() lays them out so that
 * page_freelists[szc][mtype][mnode] points at an array with one page_t *
 * per color of that page size.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically by alloc_page_freelists(), with one page_t * per
 * small-page color for each page_cachelists[mtype][mnode].
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

/*
 * Arrays of mutexes set up by ndata_alloc_page_mutexs(): each of the
 * NPC_MUTEX slots points at max_mem_nodes consecutive kmutex_t's.
 * NOTE(review): presumably fpc_ guards the freelists and cpc_ the
 * cachelists -- confirm against the users of these arrays.
 */
kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];
    771 
    772 /*
    773  * Calculate space needed for page freelists and counters
    774  */
    775 size_t
    776 calc_free_pagelist_sz(void)
    777 {
    778 	int szc;
    779 	size_t alloc_sz, cache_sz, free_sz;
    780 
    781 	/*
    782 	 * one cachelist per color, node, and type
    783 	 */
    784 	cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
    785 	    sizeof (page_t **);
    786 	cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
    787 
    788 	/*
    789 	 * one freelist per size, color, node, and type
    790 	 */
    791 	free_sz = sizeof (page_t **);
    792 	for (szc = 0; szc < mmu_page_sizes; szc++)
    793 		free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
    794 	free_sz *= max_mem_nodes * MAX_MEM_TYPES;
    795 
    796 	alloc_sz = cache_sz + free_sz + page_ctrs_sz();
    797 	return (alloc_sz);
    798 }
    799 
    800 caddr_t
    801 alloc_page_freelists(caddr_t alloc_base)
    802 {
    803 	int	mnode, mtype;
    804 	int	szc, clrs;
    805 
    806 	/*
    807 	 * We only support small pages in the cachelist.
    808 	 */
    809 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
    810 		page_cachelists[mtype] = (page_t ***)alloc_base;
    811 		alloc_base += (max_mem_nodes * sizeof (page_t **));
    812 		for (mnode = 0; mnode < max_mem_nodes; mnode++) {
    813 			page_cachelists[mtype][mnode] = (page_t **)alloc_base;
    814 			alloc_base +=
    815 			    (page_get_pagecolors(0) * sizeof (page_t *));
    816 		}
    817 	}
    818 
    819 	/*
    820 	 * Allocate freelists bins for all
    821 	 * supported page sizes.
    822 	 */
    823 	for (szc = 0; szc < mmu_page_sizes; szc++) {
    824 		clrs = page_get_pagecolors(szc);
    825 		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
    826 			page_freelists[szc][mtype] = (page_t ***)alloc_base;
    827 			alloc_base += (max_mem_nodes * sizeof (page_t **));
    828 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
    829 				page_freelists[szc][mtype][mnode] =
    830 				    (page_t **)alloc_base;
    831 				alloc_base += (clrs * (sizeof (page_t *)));
    832 			}
    833 		}
    834 	}
    835 
    836 	alloc_base = page_ctrs_alloc(alloc_base);
    837 	return (alloc_base);
    838 }
    839 
    840 /*
    841  * Allocate page_freelists locks for a memnode from the nucleus data
    842  * area. This is the first time that mmu_page_sizes is used during
    843  * bootup, so check mmu_page_sizes initialization.
    844  */
    845 int
    846 ndata_alloc_page_mutexs(struct memlist *ndata)
    847 {
    848 	size_t alloc_sz;
    849 	caddr_t alloc_base;
    850 	int	i;
    851 	void	page_coloring_init();
    852 
    853 	page_coloring_init();
    854 	if (&mmu_init_mmu_page_sizes) {
    855 		if (!mmu_init_mmu_page_sizes(0)) {
    856 			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
    857 			    mmu_page_sizes);
    858 		}
    859 	}
    860 	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
    861 
    862 	/* fpc_mutex and cpc_mutex */
    863 	alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
    864 
    865 	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
    866 	if (alloc_base == NULL)
    867 		return (-1);
    868 
    869 	ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
    870 
    871 	for (i = 0; i < NPC_MUTEX; i++) {
    872 		fpc_mutex[i] = (kmutex_t *)alloc_base;
    873 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
    874 		cpc_mutex[i] = (kmutex_t *)alloc_base;
    875 		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
    876 	}
    877 	return (0);
    878 }
    879 
    880 /*
    881  * To select our starting bin, we stride through the bins with a stride
    882  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
    883  * in simulation and practice for different workloads on varying cache sizes.
    884  */
    885 uint32_t color_start_current = 0;
    886 uint32_t color_start_stride = 337;
    887 int color_start_random = 0;
    888 
    889 /* ARGSUSED */
    890 uint_t
    891 get_color_start(struct as *as)
    892 {
    893 	uint32_t old, new;
    894 
    895 	if (consistent_coloring == 2 || color_start_random) {
    896 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
    897 		    (hw_page_array[0].hp_colors - 1)));
    898 	}
    899 
    900 	do {
    901 		old = color_start_current;
    902 		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
    903 	} while (cas32(&color_start_current, old, new) != old);
    904 
    905 	return ((uint_t)(new));
    906 }
    907 
    908 /*
    909  * Called once at startup from kphysm_init() -- before memialloc()
    910  * is invoked to do the 1st page_free()/page_freelist_add().
    911  *
    912  * initializes page_colors and page_colors_mask based on ecache_setsize.
    913  *
    914  * Also initializes the counter locks.
    915  */
    916 void
    917 page_coloring_init()
    918 {
    919 	int	a, i;
    920 	uint_t colors;
    921 
    922 	if (do_pg_coloring == 0) {
    923 		page_colors = 1;
    924 		for (i = 0; i < mmu_page_sizes; i++) {
    925 			colorequivszc[i] = 0;
    926 			hw_page_array[i].hp_colors = 1;
    927 		}
    928 		return;
    929 	}
    930 
    931 	/*
    932 	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
    933 	 * the max ecache setsize of all cpus configured in the system or, for
    934 	 * cheetah+ systems, the max possible ecache setsize for all possible
    935 	 * cheetah+ cpus.
    936 	 */
    937 	page_colors = ecache_setsize / MMU_PAGESIZE;
    938 	page_colors_mask = page_colors - 1;
    939 
    940 	vac_colors = vac_size / MMU_PAGESIZE;
    941 	vac_colors_mask = vac_colors -1;
    942 
    943 	page_coloring_shift = 0;
    944 	a = ecache_setsize;
    945 	while (a >>= 1) {
    946 		page_coloring_shift++;
    947 	}
    948 
    949 	/* initialize number of colors per page size */
    950 	for (i = 0; i < mmu_page_sizes; i++) {
    951 		hw_page_array[i].hp_colors = (page_colors_mask >>
    952 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
    953 		    + 1;
    954 		colorequivszc[i] = 0;
    955 	}
    956 
    957 	/*
    958 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
    959 	 * cpu_page_colors set to -1 during DR operation or during startup
    960 	 * if setsizes are heterogenous.
    961 	 *
    962 	 * The value of cpu_page_colors determines if additional color bins
    963 	 * need to be checked for a particular color in the page_get routines.
    964 	 */
    965 	if (cpu_setsize > 0 && cpu_page_colors == 0 &&
    966 	    cpu_setsize < ecache_setsize) {
    967 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
    968 		a = lowbit(page_colors) - lowbit(cpu_page_colors);
    969 		ASSERT(a > 0);
    970 		ASSERT(a < 16);
    971 
    972 		for (i = 0; i < mmu_page_sizes; i++) {
    973 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
    974 				continue;
    975 			}
    976 			while ((colors >> a) == 0)
    977 				a--;
    978 			ASSERT(a >= 0);
    979 
    980 			/* higher 4 bits encodes color equiv mask */
    981 			colorequivszc[i] = (a << 4);
    982 		}
    983 	}
    984 
    985 	/* do cpu specific color initialization */
    986 	if (&page_coloring_init_cpu) {
    987 		page_coloring_init_cpu();
    988 	}
    989 }
    990 
    991 int
    992 bp_color(struct buf *bp)
    993 {
    994 	int color = -1;
    995 
    996 	if (vac) {
    997 		if ((bp->b_flags & B_PAGEIO) != 0) {
    998 			color = sfmmu_get_ppvcolor(bp->b_pages);
    999 		} else if (bp->b_un.b_addr != NULL) {
   1000 			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
   1001 		}
   1002 	}
   1003 	return (color < 0 ? 0 : ptob(color));
   1004 }
   1005 
   1006 /*
   1007  * Create & Initialise pageout scanner thread. The thread has to
   1008  * start at procedure with process pp and priority pri.
   1009  */
   1010 void
   1011 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
   1012 {
   1013 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
   1014 }
   1015 
   1016 /*
   1017  * Function for flushing D-cache when performing module relocations
   1018  * to an alternate mapping.  Stubbed out on all platforms except sun4u,
   1019  * at least for now.
   1020  */
   1021 void
   1022 dcache_flushall()
   1023 {
   1024 	sfmmu_cache_flushall();
   1025 }
   1026 
   1027 static int
   1028 kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
   1029 {
   1030 	if (va1 < va2 && va1 + sz1 <= va2)
   1031 		return (0);
   1032 
   1033 	if (va2 < va1 && va2 + sz2 <= va1)
   1034 		return (0);
   1035 
   1036 	return (1);
   1037 }
   1038 
   1039 /*
   1040  * Return the number of bytes, relative to the beginning of a given range, that
   1041  * are non-toxic (can be read from and written to with relative impunity).
   1042  */
   1043 size_t
   1044 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
   1045 {
   1046 	/* OBP reads are harmless, but we don't want people writing there */
   1047 	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
   1048 	    OFW_START_ADDR + 1))
   1049 		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
   1050 
   1051 	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
   1052 		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
   1053 
   1054 	return (sz); /* no overlap */
   1055 }
   1056 
   1057 /*
   1058  * Minimum physmem required for enabling large pages for kernel heap
   1059  * Currently we do not enable lp for kmem on systems with less
   1060  * than 1GB of memory. This value can be changed via /etc/system
   1061  */
   1062 size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */
   1063 
   1064 /*
   1065  * this function chooses large page size for kernel heap
   1066  */
   1067 size_t
   1068 get_segkmem_lpsize(size_t lpsize)
   1069 {
   1070 	size_t memtotal = physmem * PAGESIZE;
   1071 	size_t mmusz;
   1072 	uint_t szc;
   1073 
   1074 	if (memtotal < segkmem_lpminphysmem)
   1075 		return (PAGESIZE);
   1076 
   1077 	if (plat_lpkmem_is_supported != NULL &&
   1078 	    plat_lpkmem_is_supported() == 0)
   1079 		return (PAGESIZE);
   1080 
   1081 	mmusz = mmu_get_kernel_lpsize(lpsize);
   1082 	szc = page_szc(mmusz);
   1083 
   1084 	while (szc) {
   1085 		if (!(disable_large_pages & (1 << szc)))
   1086 			return (page_get_pagesize(szc));
   1087 		szc--;
   1088 	}
   1089 	return (PAGESIZE);
   1090 }
   1091