Home | History | Annotate | Download | only in cpu
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/systm.h>
     29 #include <sys/archsystm.h>
     30 #include <sys/machparam.h>
     31 #include <sys/machsystm.h>
     32 #include <sys/cpu.h>
     33 #include <sys/elf_SPARC.h>
     34 #include <vm/page.h>
     35 #include <vm/vm_dep.h>
     36 #include <sys/cpuvar.h>
     37 #include <sys/async.h>
     38 #include <sys/cmn_err.h>
     39 #include <sys/debug.h>
     40 #include <sys/dditypes.h>
     41 #include <sys/sunddi.h>
     42 #include <sys/cpu_module.h>
     43 #include <sys/prom_debug.h>
     44 #include <sys/vmsystm.h>
     45 #include <sys/prom_plat.h>
     46 #include <sys/sysmacros.h>
     47 #include <sys/intreg.h>
     48 #include <sys/machtrap.h>
     49 #include <sys/ontrap.h>
     50 #include <sys/ivintr.h>
     51 #include <sys/atomic.h>
     52 #include <sys/panic.h>
     53 #include <sys/dtrace.h>
     54 #include <vm/seg_spt.h>
     55 #include <sys/hypervisor_api.h>
     56 #include <sys/rock_hypervisor_api.h>
     57 #include <sys/hsvc.h>
     58 #include <vm/hat_sfmmu.h>
     59 #include <sys/mutex_impl.h>
     60 
/*
 * NOTE(review): not referenced anywhere else in this file; presumably
 * consumed by code outside this module - confirm.
 */
uint_t root_phys_addr_lo_mask = 0xffffffffU;
/* Value handed to hv_tm_enable() on every strand by cpu_feature_init(). */
uint8_t	enable_tm = 1;

/* CPU module name; also the module name in the hsvc requests below. */
char cpu_module_name[] = "SUNW,UltraSPARC-AT10";
/* Set to B_FALSE by cpu_setup() if the TM API group cannot be negotiated. */
boolean_t	hsvc_tm_available = B_TRUE;

/* Hypervisor service request for the TM API group, used by cpu_setup(). */
static	hsvc_info_t rock_tm_hsvc = {
	HSVC_REV_1,		/* HSVC rev num */
	NULL,			/* Private */
	HSVC_GROUP_TM,		/* Requested API Group */
	ROCK_HSVC_MAJOR,	/* Requested Major */
	ROCK_HSVC_MINOR,	/* Requested Minor */
	cpu_module_name		/* Module name */
};

/*
 * Set to B_FALSE by cpu_setup() if the Rock MMU extension API group cannot
 * be negotiated; consulted by mmu_enable_pgsz_search().
 */
boolean_t	hsvc_mmu_ext_available = B_TRUE;

/* Hypervisor service request for the Rock MMU extension API group. */
static	hsvc_info_t rock_mmu_ext_hsvc = {
	HSVC_REV_1,		/* HSVC rev num */
	NULL,			/* Private */
	HSVC_GROUP_RKMMU_EXT,	/* Requested API Group */
	ROCK_HSVC_MAJOR,	/* Requested Major */
	ROCK_HSVC_MINOR,	/* Requested Minor */
	cpu_module_name		/* Module name */
};
     86 
/* Helpers used by mmu_set_pgsz_order(); both are defined later in this file. */
static void encode_pgsz_order(uint64_t, int, int, uint16_t *, uchar_t *);
static void set_pgsz_order(uchar_t, uchar_t, uint64_t *, int *, int *,
    sfmmu_t *);

/* Installed as the mutex_delay routine by cpu_init_private(). */
extern	void rock_mutex_delay(void);

/*
 * External /etc/system tunable, for controlling whether shared or private pages
 * come first in the pagesize order register.
 */
int pgsz_order_shared_first = 1;

/*
 * cpu_map_exec_units() derives cpu_core from the CPU id as
 * (cpu_id & MCOREID_MASK) >> MCOREID_SHIFT, i.e. CPU id bits 4:1.
 */
#define	MCOREID_MASK	0x1E
#define	MCOREID_SHIFT	1

/*
 * Bit masks (one bit per TTE size code) returned by
 * mmu_large_pages_disabled() for HAT_LOAD, HAT_LOAD_SHARE, HAT_AUTO_DATA
 * and HAT_AUTO_TEXT respectively. All four currently mark the 512K, 32M,
 * 2G and 16G page sizes.
 */
static uint_t mmu_disable_large_pages = ((1 << TTE512K) | (1 << TTE32M) |
		(1 << TTE2G) | (1 << TTE16G));
static uint_t mmu_disable_ism_large_pages = ((1 << TTE512K) | (1 << TTE32M) |
	(1 << TTE2G) | (1 << TTE16G));
static uint_t mmu_disable_auto_data_large_pages = ((1 << TTE512K) |
	(1 << TTE32M) | (1 << TTE2G) | (1 << TTE16G));
static uint_t mmu_disable_auto_text_large_pages = ((1 << TTE512K) |
	(1 << TTE32M) | (1 << TTE2G) | (1 << TTE16G));
    110 
    111 void
    112 cpu_setup(void)
    113 {
    114 	extern int	cpc_has_overflow_intr;
    115 	uint64_t	sup_minor;
    116 	int		status;
    117 
    118 	/*
    119 	 * The setup common to all CPU modules is done in cpu_setup_common
    120 	 * routine.
    121 	 */
    122 	cpu_setup_common(NULL);
    123 
    124 	/*
    125 	 * Rock's max nctxs is 64K. Set it accordingly.
    126 	 */
    127 	nctxs = MAX_NCTXS;
    128 
    129 	/*
    130 	 * Rock I$ is non-coherent.
    131 	 */
    132 	mach_setup_icache(0);
    133 
    134 #ifdef DEBUG
    135 	/*
    136 	 * These should always be present on Rock
    137 	 */
    138 	if (cpu_hwcap_flags == 0)
    139 		cmn_err(CE_WARN, "hwcap-list missing from MD");
    140 #endif
    141 	cpu_hwcap_flags |= AV_SPARC_ASI_CACHE_SPARING;
    142 
    143 	cache |= (CACHE_PTAG | CACHE_IOCOHERENT);
    144 
    145 	if (use_page_coloring) {
    146 		do_pg_coloring = 1;
    147 	}
    148 
    149 	/*
    150 	 * Rock generates hpriv performance event trap instead of pic overflow
    151 	 * trap. To get the attention of the guest hv in-turn generates pic
    152 	 * overflow trap. Therefore enable support for that.
    153 	 */
    154 	cpc_has_overflow_intr = 1;
    155 
    156 	/*
    157 	 * Enable 4M pages for OOB.
    158 	 */
    159 	max_uheap_lpsize = MMU_PAGESIZE4M;
    160 	max_ustack_lpsize = MMU_PAGESIZE4M;
    161 	max_privmap_lpsize = MMU_PAGESIZE4M;
    162 
    163 	/*
    164 	 * hv_tm_enable is a part of TM group. We need to
    165 	 * negotiate that API group before we can use it.
    166 	 */
    167 	status = hsvc_register(&rock_tm_hsvc, &sup_minor);
    168 	if ((status != 0) || (sup_minor < (uint64_t)ROCK_HSVC_MINOR)) {
    169 		cmn_err(CE_WARN, "%s cannot negotiate hypervisor services: "
    170 		    "major: 0x%lx minor: 0x%lx group: 0x%x errno: %d",
    171 		    cpu_module_name, rock_tm_hsvc.hsvc_major,
    172 		    rock_tm_hsvc.hsvc_minor, HSVC_GROUP_TM, status);
    173 		hsvc_tm_available = B_FALSE;
    174 	}
    175 
    176 	/*
    177 	 * Negotiate API group for rock mmu extensions.
    178 	 */
    179 	status = hsvc_register(&rock_mmu_ext_hsvc, &sup_minor);
    180 	if ((status != 0) || (sup_minor <
    181 	    (uint64_t)ROCK_HSVC_MINOR)) {
    182 		cmn_err(CE_WARN, "%s cannot negotiate hypervisor services: "
    183 		    "major: 0x%lx minor: 0x%lx group: 0x%x errno: %d",
    184 		    cpu_module_name, rock_mmu_ext_hsvc.hsvc_major,
    185 		    rock_mmu_ext_hsvc.hsvc_minor, HSVC_GROUP_RKMMU_EXT,
    186 		    status);
    187 		hsvc_mmu_ext_available = B_FALSE;
    188 	}
    189 }
    190 
    191 /*
    192  * Set the magic constants of the implementation.
    193  */
    194 void
    195 cpu_fiximp(struct cpu_node *cpunode)
    196 {
    197 	/*
    198 	 * The Cache node is optional in MD. Therefore in case it
    199 	 * does not exist, use hardcoded values.
    200 	 */
    201 #ifdef DEBUG
    202 	/*
    203 	 * ...that said, we do want this info to come from the MD.
    204 	 */
    205 	if (cpunode->ecache_size == 0 || cpunode->ecache_linesize == 0 ||
    206 	    cpunode->ecache_associativity == 0) {
    207 		cmn_err(CE_WARN, "ecache info missing from MD");
    208 	}
    209 #endif
    210 	if (cpunode->ecache_size == 0)
    211 		cpunode->ecache_size = 2 * 1024 * 1024;
    212 	if (cpunode->ecache_linesize == 0)
    213 		cpunode->ecache_linesize = 64;
    214 	if (cpunode->ecache_associativity == 0)
    215 		cpunode->ecache_associativity = 8;
    216 }
    217 
    218 void
    219 dtrace_flush_sec(uintptr_t addr)
    220 {
    221 	pfn_t pfn;
    222 	proc_t *procp = ttoproc(curthread);
    223 	page_t *pp;
    224 	caddr_t va;
    225 
    226 	pfn = hat_getpfnum(procp->p_as->a_hat, (void *)addr);
    227 	if (pfn != -1) {
    228 		ASSERT(pf_is_memory(pfn));
    229 		pp = page_numtopp_noreclaim(pfn, SE_SHARED);
    230 		if (pp != NULL) {
    231 			va = ppmapin(pp, PROT_READ | PROT_WRITE, (void *)addr);
    232 			/* sparc needs 8-byte align */
    233 			doflush((caddr_t)((uintptr_t)va & -8l));
    234 			ppmapout(va);
    235 			page_unlock(pp);
    236 		}
    237 	}
    238 }
    239 
    240 void
    241 cpu_map_exec_units(struct cpu *cp)
    242 {
    243 	ASSERT(MUTEX_HELD(&cpu_lock));
    244 
    245 	/*
    246 	 * The cpu_ipipe and cpu_fpu fields are initialized based on
    247 	 * the execution unit sharing information from the MD. They
    248 	 * default to the CPU id in the absence of such information.
    249 	 */
    250 	cp->cpu_m.cpu_ipipe = cpunodes[cp->cpu_id].exec_unit_mapping;
    251 	if (cp->cpu_m.cpu_ipipe == NO_EU_MAPPING_FOUND)
    252 		cp->cpu_m.cpu_ipipe = (id_t)(cp->cpu_id);
    253 
    254 	cp->cpu_m.cpu_fpu = cpunodes[cp->cpu_id].fpu_mapping;
    255 	if (cp->cpu_m.cpu_fpu == NO_EU_MAPPING_FOUND)
    256 		cp->cpu_m.cpu_fpu = (id_t)(cp->cpu_id);
    257 
    258 	cp->cpu_m.cpu_core = (cp->cpu_id & MCOREID_MASK) >> MCOREID_SHIFT;
    259 
    260 	/*
    261 	 * The cpu_chip field is initialized based on the information
    262 	 * in the MD and assume that all cpus within a chip
    263 	 * share the same L2 cache. If no such info is available, we
    264 	 * set the cpu to CPU_CHIPID_INVALID.
    265 	 */
    266 	cp->cpu_m.cpu_mpipe = cpunodes[cp->cpu_id].l2_cache_mapping;
    267 	if (cp->cpu_m.cpu_mpipe == NO_L2_CACHE_MAPPING_FOUND)
    268 		cp->cpu_m.cpu_mpipe = CPU_L2_CACHEID_INVALID;
    269 
    270 	cp->cpu_m.cpu_chip = cpunodes[cp->cpu_id].l2_cache_mapping;
    271 	if (cp->cpu_m.cpu_chip == NO_L2_CACHE_MAPPING_FOUND)
    272 		cp->cpu_m.cpu_chip = CPU_CHIPID_INVALID;
    273 }
    274 
/*
 * Per-CPU private initialization: fill in the execution unit mappings of
 * cp and install rock_mutex_delay as the global mutex_delay routine.
 * cpu_map_exec_units() asserts that cpu_lock is held, so the caller must
 * hold it.
 */
void
cpu_init_private(struct cpu *cp)
{
	cpu_map_exec_units(cp);
	mutex_delay = rock_mutex_delay;
}
    281 
/*
 * Counterpart of cpu_init_private(). Nothing is torn down here, so the
 * body is intentionally empty.
 */
/*ARGSUSED*/
void
cpu_uninit_private(struct cpu *cp)
{
}
    287 
    288 /*
    289  * cpu_feature_init
    290  *
    291  * This function is called once per strand.
    292  */
    293 void
    294 cpu_feature_init(void)
    295 {
    296 	static	int	set_mutex_backoff_tunables = 0;
    297 	/*
    298 	 * Set constants for mutex_backoff only once.
    299 	 * On Rock, setting this to 8 gives the best performance,
    300 	 * even for multi-chip systems.
    301 	 */
    302 	if (! set_mutex_backoff_tunables) {
    303 		mutex_backoff_base = 1;
    304 		mutex_cap_factor = 8;
    305 		set_mutex_backoff_tunables = 1;
    306 	}
    307 
    308 	/*
    309 	 * Enable or disable for each cpu if hypervisor API is negotiated.
    310 	 */
    311 	if (hsvc_tm_available == B_TRUE)
    312 		(void) hv_tm_enable((uint64_t)enable_tm);
    313 }
    314 
    315 /*
    316  * Flush specified address range from I$ via hv_mem_iflush interface
    317  * Note that the hypervisor interface expects physical address range
    318  * and can flush less than the requested size.
    319  */
    320 
    321 void
    322 rock_sync_icache(caddr_t addr, size_t size)
    323 {
    324 	uint64_t pa, i, flushlen, flushed;
    325 
    326 	if (!force_sync_icache_after_bcopy)
    327 		/*
    328 		 * Do not clear the I-cache after bcopy.
    329 		 * The default value is 0. This flag made be
    330 		 * set via /etc/system.
    331 		 */
    332 		return;
    333 
    334 	if (!tba_taken_over)
    335 		/*
    336 		 * Very early in boot, va_to_pa() will try to call back
    337 		 * into OBP.  Very *very* early in boot, this will fail
    338 		 * because we haven't set up the OBP callback handler.
    339 		 * (Without this check, kmdb boot will fail.)
    340 		 */
    341 		return;
    342 
    343 	for (i = 0; i < size; i += flushed) {
    344 		pa = va_to_pa(addr + i);
    345 		ASSERT(pa != -1);
    346 
    347 		/*
    348 		 * Only flush the required length up to a PAGESIZE.
    349 		 */
    350 
    351 		flushlen = MIN((size - i), (PAGESIZE - (pa & MMU_PAGEOFFSET)));
    352 
    353 		/*
    354 		 * Flush I$ up to the page bounday. This call should never
    355 		 * fail. If it does, we panic the system as I$ may contain
    356 		 * stale instructions, which can result in silent data
    357 		 * corruption.
    358 		 */
    359 
    360 		if (hv_mem_iflush(pa, flushlen, &flushed) != H_EOK) {
    361 			cmn_err(CE_PANIC, "Flushing the Icache failed");
    362 		}
    363 
    364 	}
    365 }
    366 
    367 /*
    368  * There are no Hypervisor trapstat(1m) interfaces for Rock
    369  * If trapstat(1m) wants to do its thing, it will have to
    370  * take over all TLB miss handling.
    371  */
    372 int
    373 cpu_trapstat_conf(int cmd)
    374 {
    375 	int status;
    376 
    377 	switch (cmd) {
    378 	case CPU_TSTATCONF_INIT:
    379 	case CPU_TSTATCONF_FINI:
    380 	case CPU_TSTATCONF_ENABLE:
    381 	case CPU_TSTATCONF_DISABLE:
    382 		status = ENOTSUP;
    383 		break;
    384 	default:
    385 		status = EINVAL;
    386 		break;
    387 	}
    388 	return (status);
    389 }
    390 
/*
 * No trapstat data is produced on Rock (cpu_trapstat_conf() rejects every
 * command), so both arguments are ignored and buf is left untouched.
 */
/*ARGSUSED*/
void
cpu_trapstat_data(void *buf, uint_t tstat_pgszs)
{
}
    396 
    397 #define	MAX_PAGE_COLORS		(1 << MAX_PAGE_COLORS_SHIFT)
    398 #define	MAX_PAGE_COLORS_SHIFT	(5)
    399 
    400 /*ARGSUSED*/
    401 uint_t
    402 page_pfn_2_color_cpu(pfn_t pfn, uchar_t szc, void *cookie)
    403 {
    404 	uint_t	color;
    405 
    406 	pfn = PFN_BASE(pfn, szc);
    407 	color = pfn ^ (pfn >> 20);
    408 	color = color ^ (color >> 10);
    409 	return ((color ^ (color >> 5)) & 0x1f);
    410 }
    411 
    412 /*
    413  * this macro rotates value "x" n steps to the right
    414  * mask consists of "n + m" bits
    415  * ASSERT(x < (1 << (n + m));
    416  */
    417 #define	ROTATE_BITS(x, n, m) (((x) >> (n)) | (((x) & ((1 << (n)) - 1)) << m))
    418 
    419 
/*
 * Page color to sequential color lookup, indexed by [page size code][color].
 * Filled in by clr2sqnclr_table_init() and read through clr2sqnclr().
 */
uchar_t clr2sqnclr_table[MMU_PAGE_SIZES][MAX_PAGE_COLORS];
    421 
    422 /*
    423  * on Rock, the hash cache index is calculated as follows:
    424  * pa[47:43]^pa[42:38]^pa[37:33]^pa[32:28]^
    425  * 	pa[27:23]^pa[22:18]^pa[17:13].pa[12:6]
    426  * That is, every 5 bits is folded and XORd together. Page sizes
    427  * differ by 3 bits, which is a factor of 8. This function computes
    428  * the next sequential color by rotating by 3 steps within a field of 5 bits
    429  * for every page size.
    430  */
    431 void
    432 clr2sqnclr_table_init()
    433 {
    434 	uchar_t szc;
    435 	uint_t  color;
    436 	uint_t  rot = 0;
    437 
    438 	for (szc = 0; szc < MMU_PAGE_SIZES; szc++) {
    439 		rot = (szc * 3) % MAX_PAGE_COLORS_SHIFT;
    440 		for (color = 0; color < MAX_PAGE_COLORS; color++) {
    441 			clr2sqnclr_table[szc][color] =
    442 			    ROTATE_BITS(color, rot,
    443 			    (MAX_PAGE_COLORS_SHIFT - rot));
    444 		}
    445 	}
    446 }
    447 
    448 uint_t
    449 clr2sqnclr(uchar_t szc, uint_t color)
    450 {
    451 	ASSERT(szc < MMU_PAGE_SIZES);
    452 	ASSERT(color < MAX_PAGE_COLORS);
    453 
    454 	return (clr2sqnclr_table[szc][color]);
    455 }
    456 
    457 #if MMU_PAGE_SIZES > 8
    458 #error MMU_PAGE_SIZES can be at most 8
    459 #endif
    460 
    461 uint_t
    462 page_get_nsz_color_mask_cpu(uchar_t szc, uint_t mask)
    463 {
    464 	static uint_t rock_color_masks[7] = {0x18, 6, 0x11, 0xc, 3, 0x18, 6};
    465 
    466 	ASSERT(szc < MMU_PAGE_SIZES - 1);
    467 	return (mask & rock_color_masks[szc]);
    468 }
    469 
/*
 * Return the color to use at the next page size. On Rock the color is
 * passed through unchanged; szc is ignored.
 */
/*ARGSUSED*/
uint_t
page_get_nsz_color_cpu(uchar_t szc, uint_t color)
{
	return (color);
}
    476 
/*
 * Return the color shift between page size codes szc and nszc, which is
 * always 0 on Rock. nszc must not be smaller than szc (asserted on DEBUG
 * kernels only; otherwise the arguments are unused).
 */
uint_t
page_get_color_shift_cpu(uchar_t szc, uchar_t nszc)
{
	ASSERT(nszc >= szc);
	return (0);
}
    483 
/*
 * Find the next pfn after "pfn" whose color, compared under ceq_mask, is
 * equal to "color" for page size code szc.
 *
 * Candidates are built as base_pfn | (sequential color << PNUM_SHIFT(szc)),
 * where base_pfn is pfn with the color_mask bits (at that shift) cleared.
 * The smallest candidate strictly greater than pfn is returned. If a
 * complete pass over the sequential colors of one base yields nothing,
 * base_pfn is advanced by (color_mask + 1) << pfn_shift and the search is
 * repeated.
 *
 * color must not have bits set outside ceq_mask. The cookie argument is
 * unused.
 *
 * NOTE(review): INC_MASKED() is defined outside this file; it is presumed
 * to step nsqn_color to the next value while preserving the bits selected
 * by sqn_ceq_mask - confirm against its definition.
 */
/*ARGSUSED*/
pfn_t
page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
    uint_t ceq_mask, uint_t color_mask, void *cookie)
{
	uint_t	sqn_ceq_mask = clr2sqnclr(szc, ceq_mask);
	uint_t	sqn_color = clr2sqnclr(szc, color);
	uint_t	pfn_shift = PNUM_SHIFT(szc);
	pfn_t	cpfn, npfn, base_pfn = pfn & (~(pfn_t)color_mask << pfn_shift);
	uint_t  base_sqn_color, nsqn_color, wrap = 0;

	ASSERT((color & ~ceq_mask) == 0);

	/* starting sequential color for this base pfn */
	base_sqn_color = clr2sqnclr(szc,
	    page_pfn_2_color_cpu(base_pfn, szc, NULL)) ^ sqn_color;
	nsqn_color = base_sqn_color;

	/* (pfn_t)-1 means "no candidate found yet" */
	cpfn = (pfn_t)-1L;
	do {
		npfn = base_pfn | (nsqn_color << pfn_shift);

		ASSERT(((page_pfn_2_color_cpu(npfn, szc, NULL) ^ color) &
		    ceq_mask) == 0);

		/* keep the smallest candidate that lies beyond pfn */
		if (npfn > pfn && npfn < cpfn)
			cpfn = npfn;

		nsqn_color = INC_MASKED(nsqn_color, sqn_ceq_mask, color_mask);
		/* not back at the start yet: try the next sequential color */
		if (nsqn_color != base_sqn_color)
			continue;

		/* a full pass is complete; stop if it produced a candidate */
		if (cpfn != (pfn_t)-1L)
			break;

		/* nothing found under this base, move on to the next one */
		base_pfn += ((pfn_t)color_mask + 1) << pfn_shift;

		base_sqn_color = clr2sqnclr(szc,
		    page_pfn_2_color_cpu(base_pfn, szc, NULL)) ^ sqn_color;
		nsqn_color = base_sqn_color;
		wrap++;

	} while (nsqn_color != base_sqn_color || wrap < 2);

	ASSERT(cpfn != (pfn_t)-1L);

	return (cpfn);
}
    531 
    532 void
    533 page_coloring_init_cpu()
    534 {
    535 	int i;
    536 	uint_t colors = 1 << MAX_PAGE_COLORS_SHIFT;
    537 
    538 	for (i = 0; i < mmu_page_sizes; i++) {
    539 		hw_page_array[i].hp_colors = colors;
    540 	}
    541 
    542 	/*
    543 	 * initialise conversion table between page colors and
    544 	 * sequential colors
    545 	 */
    546 	clr2sqnclr_table_init();
    547 
    548 }
    549 
    550 /*
    551  * group colorequiv colors on Rock by low order bits of the color first
    552  */
    553 void
    554 page_set_colorequiv_arr_cpu(void)
    555 {
    556 	static uint_t nequiv_shades_log2[MMU_PAGE_SIZES] = {0, 3, 0, 0, 0, 0};
    557 
    558 	if (colorequiv > 1) {
    559 		int i;
    560 		uint_t sv_a = lowbit(colorequiv) - 1;
    561 
    562 		if (sv_a > 15)
    563 			sv_a = 15;
    564 
    565 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
    566 			uint_t colors;
    567 			uint_t a = sv_a;
    568 
    569 			if ((colors = hw_page_array[i].hp_colors) <= 1)
    570 				continue;
    571 			while ((colors >> a) == 0)
    572 				a--;
    573 			if (a > (colorequivszc[i] & 0xf) +
    574 			    (colorequivszc[i] >> 4)) {
    575 				if (a <= nequiv_shades_log2[i]) {
    576 					colorequivszc[i] = (uchar_t)a;
    577 				} else {
    578 					colorequivszc[i] =
    579 					    ((a - nequiv_shades_log2[i]) << 4) |
    580 					    nequiv_shades_log2[i];
    581 				}
    582 			}
    583 		}
    584 	}
    585 }
    586 
/*
 * Calculate the page sizes needed to program Rock TLB page size register.
 * The invctx parameter is a flag which indicates that it will be necessary to
 * synchronize by invalidating contexts if the sfmmu pagesize register is
 * updated.
 *
 * The routine builds a bit mask of private and of shared page sizes for
 * sfmmup, turns them into an ordered search list (set_pgsz_order()),
 * encodes that list for the hypervisor (encode_pgsz_order()) and, only if
 * the result differs from what is cached in the sfmmu, stores the new
 * sfmmu_pgsz_order_hv[] and sfmmu_pgsz_map values. It does nothing when
 * pgsz_search_on is 0.
 */
void
mmu_set_pgsz_order(sfmmu_t *sfmmup, int invctx)
{
	uchar_t private_pgsz_mask;
	uchar_t shared_pgsz_mask;
	uint16_t pgsz_order_hv[MAX_PGSZ_SEARCH_ORDER];
	uint64_t pgsz_order = 0;
	uchar_t pgsz_map = 0;
	int private_pgsz_num = 0;
	int shared_pgsz_num = 0;
	int tot_pgsz_num;
	sf_scd_t *scdp;
	int ret;
	int i;

	/*
	 * The hatlock must be held in all cases except when the sfmmu is
	 * being initialized by hat_alloc() or we are calling hat_dup(), in
	 * these cases no other thread will be using the sfmmu yet.
	 */

	ASSERT(!invctx || sfmmu_hat_lock_held(sfmmup));

	if (pgsz_search_on == 0)
		return;

	/* Always enable 8K private mappings */
	private_pgsz_mask = 1 << TTE8K;

	/* Enable 64K private mappings unless specifically disabled */
	if (!(disable_large_pages & (1 << TTE64K))) {
		private_pgsz_mask |= 1 << TTE64K;
	}

	/*
	 * First check for ISM segments not in an SCD. The algorithm for
	 * creating an SCD is to create one when an (D)ISM segment is attached
	 * unless the process's shared segments are a subset of an SCD which
	 * already exists.
	 *
	 * This situation also arises when we attach to more than the maximum
	 * number of (D)ISM segments defined in the region bit map
	 * (currently 64).
	 *
	 * We have set mmu_disable_ism_large_pages to force ISM segments to use
	 * only 4M and 256M pages.
	 */
	if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMNOTINSCD)) {
		private_pgsz_mask |= 1 << TTE4M;
		if (SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) {
			private_pgsz_mask |= 1 << TTE256M;
		}
	}

	/* Now check for regions not included in the SCD. */
	if ((scdp = sfmmup->sfmmu_scdp) != NULL) {
		/* ret is set by the macro; non-zero means the maps are equal */
		SF_RGNMAP_EQUAL(&scdp->scd_hmeregion_map,
		    &sfmmup->sfmmu_hmeregion_map,
		    SFMMU_HMERGNMAP_WORDS, ret);
		if (!ret) {
			private_pgsz_mask |= sfmmup->sfmmu_rtteflags;
		}
	} else {
		/* No SCD at all: every region page size is private. */
		private_pgsz_mask |= sfmmup->sfmmu_rtteflags;
	}

	private_pgsz_mask |= sfmmup->sfmmu_tteflags;

	/*
	 * If the process is part of an SCD then enable 4M and 256M shared
	 * page sizes - unless these are specifically disabled. If the 4M
	 * shared page size is specifically disabled and the process has (D)ISM
	 * segments attached or 4M regions then enable the private 4M page size.
	 * If the 256M shared page size is disabled and the process has a 256M
	 * page size region then enable the 256M private page size. The trap
	 * handler looks at the shared page sizes enabled and if a shared
	 * mapping does not correspond to one these sizes then it is treated
	 * as a private mapping.
	 *
	 * The SCD includes the process's main text segment and (D)ISM segments
	 * but we only enable the 4M shared page size so an 8K main text
	 * segment will be treated as private due to the trap handler support.
	 *
	 * Note that for simplicity the ordering of the shared page sizes is
	 * hard coded.
	 */
	shared_pgsz_mask = 0;
	if (sfmmup->sfmmu_scdp != NULL) {
		if (!(disable_shctx_large_pages  & (1 << TTE4M))) {
			shared_pgsz_mask |= 1 << TTE4M;
		} else if (sfmmup->sfmmu_iblk != NULL ||
		    (sfmmup->sfmmu_rtteflags &
		    (1 << TTE4M))) {
			private_pgsz_mask |= 1 << TTE4M;
		}

		if (SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM) ||
		    (sfmmup->sfmmu_rtteflags & (1 << TTE256M))) {
			if (!(disable_shctx_large_pages  & (1 << TTE256M))) {
				shared_pgsz_mask |= 1 << TTE256M;
			} else {
				private_pgsz_mask |= 1 << TTE256M;
			}
		}
	}

	/* Turn the two bit masks into an ordered list plus entry counts. */
	set_pgsz_order(private_pgsz_mask, shared_pgsz_mask, &pgsz_order,
	    &private_pgsz_num, &shared_pgsz_num, sfmmup);

	/* Encode the list for the hypervisor and build the shared size map. */
	encode_pgsz_order(pgsz_order, private_pgsz_num, shared_pgsz_num,
	    pgsz_order_hv, &pgsz_map);

	tot_pgsz_num = private_pgsz_num + shared_pgsz_num;
	ASSERT(tot_pgsz_num <= MAX_PGSZ_SEARCH_ORDER);

	/* Find the first entry that differs from the cached search order. */
	for (i = 0; i < tot_pgsz_num; i++) {
		if (pgsz_order_hv[i] != sfmmup->sfmmu_pgsz_order_hv[i])
			break;
	}

	/*
	 * If either we've reached the maximum number of page sizes or the
	 * next element is 0, indicating the end of the list, then both the
	 * entries and their number in both arrays is the same and we return.
	 */
	if ((i == tot_pgsz_num) && (i == MAX_PGSZ_SEARCH_ORDER ||
	    sfmmup->sfmmu_pgsz_order_hv[i] == 0)) {
		ASSERT(pgsz_map == sfmmup->sfmmu_pgsz_map);
		return;
	}

	/* Otherwise update the sw page size register setting */
	if (invctx) {
		sfmmu_invalidate_ctx(sfmmup);
	}

	for (i = 0; i < tot_pgsz_num; i++) {
		sfmmup->sfmmu_pgsz_order_hv[i] = pgsz_order_hv[i];
	}

	/* Disable next entry in search list to mark the end */
	if (i < MAX_PGSZ_SEARCH_ORDER) {
		sfmmup->sfmmu_pgsz_order_hv[i] = 0;
	}
	sfmmup->sfmmu_pgsz_map = pgsz_map;
}
    739 
    740 /*
    741  * Encode the Rock TLB page size register.
    742  *
    743  * Input:
    744  *        pgsz_order, ordered list of page sizes, private and shared, the order
    745  *        between these depends on the pgsz_order_shared_first config variable.
    746  *        private_pgsz_num, number of private page sizes.
    747  *        shared_pgsz_num, number of shared page sizes.
    748  * Output:
    749  *        pgsz_order_hv contains the encoded pagesize search order for the hv
    750  *	  pgsz_map field contains the page size bit map used by the trap
    751  *        handler to prevent unauthorized shared page sizes being used.
    752  */
    753 
    754 static void
    755 encode_pgsz_order(uint64_t pgsz_order, int private_pgsz_num,
    756     int shared_pgsz_num, uint16_t *pgsz_order_hv, uchar_t *pgsz_map)
    757 {
    758 	int i;
    759 	int tot_pgsz_num;
    760 	uint16_t pgsz_entry;
    761 	uint16_t first_entry_mask, second_entry_mask;
    762 	int	first_pgsz_num;
    763 
    764 	ASSERT(private_pgsz_num < MMU_PAGE_SIZES);
    765 	ASSERT(shared_pgsz_num < MMU_PAGE_SIZES);
    766 	ASSERT(private_pgsz_num > 0);
    767 
    768 	if (pgsz_order_shared_first) {
    769 		first_entry_mask = TLB_PGSZ_CONTEXT1_ENABLE;
    770 		second_entry_mask = TLB_PGSZ_ENABLE;
    771 		first_pgsz_num = shared_pgsz_num;
    772 	} else {
    773 		first_entry_mask = TLB_PGSZ_ENABLE;
    774 		second_entry_mask = TLB_PGSZ_CONTEXT1_ENABLE;
    775 		first_pgsz_num = private_pgsz_num;
    776 	}
    777 
    778 	tot_pgsz_num = private_pgsz_num + shared_pgsz_num;
    779 	for (i = 0; i < tot_pgsz_num; i++) {
    780 		pgsz_entry = pgsz_order & TTE_SZ_BITS;
    781 		if (i < first_pgsz_num) {
    782 			if (pgsz_order_shared_first) {
    783 				*pgsz_map |= (1 << pgsz_entry);
    784 			}
    785 			pgsz_entry |= first_entry_mask;
    786 		} else {
    787 			if (!pgsz_order_shared_first) {
    788 				*pgsz_map |= (1 << pgsz_entry);
    789 			}
    790 			pgsz_entry |= second_entry_mask;
    791 		}
    792 		pgsz_order >>= 4;
    793 		pgsz_order_hv[i] = pgsz_entry;
    794 	}
    795 }
    796 
    797 /*
    798  * The function returns the mmu-specific values for the
    799  * hat's disable_large_pages, disable_ism_large_pages, and
    800  * disable_auto_data_large_pages and
    801  * disable_text_data_large_pages variables.
    802  */
    803 uint_t
    804 mmu_large_pages_disabled(uint_t flag)
    805 {
    806 	uint_t pages_disable = 0;
    807 
    808 	if (flag == HAT_LOAD) {
    809 		pages_disable =  mmu_disable_large_pages;
    810 	} else if (flag == HAT_LOAD_SHARE) {
    811 		pages_disable = mmu_disable_ism_large_pages;
    812 	} else if (flag == HAT_AUTO_DATA) {
    813 		pages_disable = mmu_disable_auto_data_large_pages;
    814 	} else if (flag == HAT_AUTO_TEXT) {
    815 		pages_disable = mmu_disable_auto_text_large_pages;
    816 	}
    817 	return (pages_disable);
    818 }
    819 
    820 /*
    821  * Uses private and shared page size bitmaps to produce an ordered list
    822  * of page sizes and counts to be passed to encode_pgsz_order().
    823  *
    824  * Input:
    825  *        private_pgsz_mask, bit map of private page sizes.
    826  *        shared_pgsz_mask,  bit map of private page sizes.
    827  *	  sfmmup, pointer to hat structure.
    828  *
    829  * Output:
    830  *        pgsz_order, ordered list of page sizes.
    831  *        private_pgsz_num, number of private page sizes in pgsz_order.
    832  *        shared_pgsz_num, number of shared page sizes in pgsz_order.
    833  */
    834 static void
    835 set_pgsz_order(uchar_t private_pgsz_mask, uchar_t shared_pgsz_mask,
    836     uint64_t *pgsz_order, int *private_pgsz_num, int *shared_pgsz_num,
    837     sfmmu_t *sfmmup)
    838 {
    839 	int64_t sortcnt[MMU_PAGE_SIZES];
    840 	int8_t tmp_pgsz[MMU_PAGE_SIZES];
    841 	ulong_t tmp;
    842 	uint8_t i, j, max;
    843 
    844 	*private_pgsz_num = 0;
    845 	*shared_pgsz_num = 0;
    846 	*pgsz_order = 0;
    847 
    848 	/* Sort pages by area mapped */
    849 	for (i = 0; i < mmu_page_sizes; i++) {
    850 		tmp = sfmmup->sfmmu_ttecnt[i] + sfmmup->sfmmu_ismttecnt[i];
    851 		sortcnt[i] = tmp << TTE_PAGE_SHIFT(i);
    852 	}
    853 
    854 	for (j = 0; j < mmu_page_sizes; j++) {
    855 		for (i = mmu_page_sizes - 1, max = 0; i > 0; i--) {
    856 			if (sortcnt[i] > sortcnt[max])
    857 				max = i;
    858 		}
    859 		tmp_pgsz[j] = max;
    860 		sortcnt[max] = -1;
    861 	}
    862 
    863 	/* Add shared page sizes to page order if these come first */
    864 	if (pgsz_order_shared_first) {
    865 		if (shared_pgsz_mask & (1 << TTE256M)) {
    866 			*pgsz_order =  TTE256M;
    867 			(*shared_pgsz_num)++;
    868 		}
    869 		if (shared_pgsz_mask & (1 << TTE4M)) {
    870 			*pgsz_order |= (TTE4M << (*shared_pgsz_num * 4));
    871 			(*shared_pgsz_num)++;
    872 		}
    873 	}
    874 
    875 
    876 	/* Add private page sizes to page order */
    877 	for (i = 0; i < mmu_page_sizes; i++) {
    878 		if (private_pgsz_mask & (1 << tmp_pgsz[i])) {
    879 			*pgsz_order |= (tmp_pgsz[i] <<
    880 			    ((*private_pgsz_num + *shared_pgsz_num) * 4));
    881 			(*private_pgsz_num)++;
    882 		}
    883 	}
    884 
    885 	/* Add shared page sizes to page order if these come last */
    886 	if (!pgsz_order_shared_first) {
    887 		if (shared_pgsz_mask & (1 << TTE256M)) {
    888 			*pgsz_order |=  (TTE256M <<
    889 			    ((*private_pgsz_num + *shared_pgsz_num) * 4));
    890 			(*shared_pgsz_num)++;
    891 		}
    892 		if (shared_pgsz_mask & (1 << TTE4M)) {
    893 			*pgsz_order |= (TTE4M <<
    894 			    ((*private_pgsz_num + *shared_pgsz_num) * 4));
    895 			(*shared_pgsz_num)++;
    896 		}
    897 	}
    898 
    899 	ASSERT(*pgsz_order);
    900 	ASSERT(*private_pgsz_num);
    901 	ASSERT((*private_pgsz_num + *shared_pgsz_num)
    902 	    <= MAX_PGSZ_SEARCH_ORDER);
    903 }
    904 
    905 /*
    906  * This routine is called without holding the hat lock to determine
    907  * whether the process's optimal page size order has changed significantly
    908  * since the page size register was last set. If it has changed we get the
    909  * hat lock and call mmu_set_pgsz_order() to update the effective pagesize
    910  * order.
    911  */
    912 void
    913 mmu_check_page_sizes(sfmmu_t *sfmmup, uint64_t *ttecnt)
    914 {
    915 	int64_t sortcnt[MMU_PAGE_SIZES];
    916 	int8_t tmp_pgsz[MMU_PAGE_SIZES];
    917 	ulong_t tmp;
    918 	int8_t i, j, max;
    919 	uint_t pgsz;
    920 	uint16_t *pgsz_order_hv;
    921 	int page_order_changed;
    922 	hatlock_t *hatlockp;
    923 	int pgsz_count = 0;
    924 
    925 	ASSERT(!sfmmu_hat_lock_held(sfmmup));
    926 
    927 	if (pgsz_search_on == 0)
    928 		return;
    929 
    930 	/*
    931 	 * Check if ttecnt has changed significantly, since the last time we
    932 	 * were called. If the shared page sizes have changed then this is
    933 	 * handled by mmu_set_pgsz_order() being called directly when we join
    934 	 * the SCD.
    935 	 */
    936 	for (i = 0; i < mmu_page_sizes; i++) {
    937 		if (ttecnt[i] > (sfmmup->sfmmu_mmuttecnt[i] << 1) ||
    938 		    ttecnt[i] < (sfmmup->sfmmu_mmuttecnt[i] >> 1))
    939 			break;
    940 	}
    941 
    942 	if (i == mmu_page_sizes) {
    943 		return;
    944 	}
    945 
    946 	/* Sort pages by area mapped */
    947 	for (i = 0; i < mmu_page_sizes; i++) {
    948 		tmp = ttecnt[i];
    949 		sortcnt[i] = tmp << TTE_PAGE_SHIFT(i);
    950 	}
    951 
    952 	for (j = 0; j < mmu_page_sizes; j++) {
    953 		for (i = mmu_page_sizes - 1, max = 0; i > 0; i--) {
    954 			if (sortcnt[i] > sortcnt[max])
    955 				max = i;
    956 		}
    957 		tmp_pgsz[j] = max;
    958 		sortcnt[max] = -1;
    959 	}
    960 
    961 	/*
    962 	 * Check if the order of the private page sizes has changed. We call
    963 	 * mmu_set_pgsz_order() directly if additional page sizes are used,
    964 	 * so we can assume that the number of entries is unchanged.
    965 	 */
    966 	pgsz_order_hv = sfmmup->sfmmu_pgsz_order_hv;
    967 	if (pgsz_order_shared_first) {
    968 		/* skip over shared pgsz entries */
    969 		while ((pgsz_order_hv[pgsz_count] & TLB_PGSZ_CONTEXT1_ENABLE) ==
    970 		    TLB_PGSZ_CONTEXT1_ENABLE) {
    971 			pgsz_count++;
    972 		}
    973 	}
    974 
    975 	i = 0;
    976 	page_order_changed = 0;
    977 	while ((pgsz_order_hv[pgsz_count] & TLB_PGSZ_ENABLE) &&
    978 	    !(pgsz_order_hv[pgsz_count] & TLB_PGSZ_CONTEXT1) &&
    979 	    (pgsz_count < MAX_PGSZ_SEARCH_ORDER)) {
    980 		pgsz = (pgsz_order_hv[pgsz_count] & TTE_SZ_BITS);
    981 		ASSERT(pgsz < MMU_PAGE_SIZES);
    982 
    983 		if (pgsz != tmp_pgsz[i]) {
    984 			page_order_changed = 1;
    985 			break;
    986 		}
    987 		pgsz_count++;
    988 		i++;
    989 	}
    990 
    991 	if (page_order_changed) {
    992 		hatlockp = sfmmu_hat_enter(sfmmup);
    993 		/* Save old values of ttecnt */
    994 		for (i = 0; i < mmu_page_sizes; i++) {
    995 			sfmmup->sfmmu_mmuttecnt[i] = ttecnt[i];
    996 		}
    997 		mmu_set_pgsz_order(sfmmup, 1);
    998 		sfmmu_hat_exit(hatlockp);
    999 	}
   1000 }
   1001 
   1002 /*
   1003  * If the mmu extension API is supported and pgsz_search_on is set,
   1004  * patch out the instruction to branch over the hypervisor call in
   1005  * sfmmu_load_mmustate().
   1006  */
   1007 void
   1008 mmu_enable_pgsz_search()
   1009 {
   1010 	if ((hsvc_mmu_ext_available == B_TRUE) && pgsz_search_on) {
   1011 		/* patch in hcall to set pgsz order */
   1012 		sfmmu_patch_pgsz_reg();
   1013 	}
   1014 }
   1015