Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/sysmacros.h>
     29 #include <sys/kmem.h>
     30 #include <sys/atomic.h>
     31 #include <sys/bitmap.h>
     32 #include <sys/machparam.h>
     33 #include <sys/machsystm.h>
     34 #include <sys/mman.h>
     35 #include <sys/systm.h>
     36 #include <sys/cpuvar.h>
     37 #include <sys/thread.h>
     38 #include <sys/proc.h>
     39 #include <sys/cpu.h>
     40 #include <sys/kmem.h>
     41 #include <sys/disp.h>
     42 #include <sys/vmem.h>
     43 #include <sys/vmsystm.h>
     44 #include <sys/promif.h>
     45 #include <sys/var.h>
     46 #include <sys/x86_archext.h>
     47 #include <sys/archsystm.h>
     48 #include <sys/bootconf.h>
     49 #include <sys/dumphdr.h>
     50 #include <vm/seg_kmem.h>
     51 #include <vm/seg_kpm.h>
     52 #include <vm/hat.h>
     53 #include <vm/hat_i86.h>
     54 #include <sys/cmn_err.h>
     55 #include <sys/panic.h>
     56 
     57 #ifdef __xpv
     58 #include <sys/hypervisor.h>
     59 #include <sys/xpv_panic.h>
     60 #endif
     61 
     62 #include <sys/bootinfo.h>
     63 #include <vm/kboot_mmu.h>
     64 
/*
 * Forward declaration: x86pte_zero() is used by htable_alloc() to clear a
 * freshly allocated page table before its definition appears in the file.
 */
static void x86pte_zero(htable_t *dest, uint_t entry, uint_t count);

/*
 * kmem cache from which all htable_t structures are allocated.
 */
kmem_cache_t *htable_cache;

/*
 * The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
 * is used in order to facilitate testing of the htable_steal() code.
 * By resetting htable_reserve_amount to a lower value, we can force
 * stealing to occur.  The reserve amount is a guess to get us through boot.
 *
 * htable_reserve_pool is a singly linked list (through ht_next) of spare
 * htables that have no page table page attached; htable_reserve_cnt is its
 * length. Both are modified only while holding htable_reserve_mutex, though
 * htable_reserve_cnt is also read without the mutex when deciding whether
 * the reserve is over or under its target size.
 */
#define	HTABLE_RESERVE_AMOUNT	(200)
uint_t htable_reserve_amount = HTABLE_RESERVE_AMOUNT;
kmutex_t htable_reserve_mutex;
uint_t htable_reserve_cnt;
htable_t *htable_reserve_pool;

/*
 * Used to hand test htable_steal().
 *
 * When force_steal is greater than 1, ptable_alloc() counts its calls in
 * ptable_cnt and fails one allocation each time the count exceeds
 * force_steal. DEBUG kernels only.
 */
#ifdef DEBUG
ulong_t force_steal = 0;
ulong_t ptable_cnt = 0;
#endif

/*
 * This variable is so that we can tune this via /etc/system
 * Any value works, but a power of two <= mmu.ptes_per_table is best.
 * htable_steal() clamps it to the range [1, mmu.ptes_per_table] on entry.
 */
uint_t htable_steal_passes = 8;

/*
 * Mutexes protecting the per-hat htable hash chains. A hash bucket index is
 * mapped onto one of NUM_HTABLE_MUTEX locks by masking, so NUM_HTABLE_MUTEX
 * must remain a power of two.
 *
 * NOTE(review): HTABLE_ENTER() and HTABLE_EXIT() expand to a complete
 * statement including the trailing semicolon. Call sites that add their own
 * ';' therefore produce an extra empty statement, which makes the macros
 * unsafe as the unbraced body of an if/else. Confirm all users before
 * removing the semicolons.
 */
#define	NUM_HTABLE_MUTEX 128
kmutex_t htable_mutex[NUM_HTABLE_MUTEX];
#define	HTABLE_MUTEX_HASH(h) ((h) & (NUM_HTABLE_MUTEX - 1))

#define	HTABLE_ENTER(h)	mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
#define	HTABLE_EXIT(h)	mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)]);

/*
 * forward declarations
 */
static void link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr);
static void unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr);
static void htable_free(htable_t *ht);
static x86pte_t *x86pte_access_pagetable(htable_t *ht, uint_t index);
static void x86pte_release_pagetable(htable_t *ht);
static x86pte_t x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old,
	x86pte_t new);

/*
 * A counter to track if we are stealing or reaping htables. When non-zero
 * htable_free() will directly free htables (either to the reserve or kmem)
 * instead of putting them in a hat's htable cache.
 *
 * It is adjusted with atomic_add_32() by htable_steal() and
 * htable_purge_hat() for the duration of their work.
 */
uint32_t htable_dont_cache = 0;

/*
 * Track the number of active pagetables, so we can know how many to reap.
 * Incremented by ptable_alloc() on success and decremented by ptable_free().
 */
static uint32_t active_ptables = 0;
    127 
    128 #ifdef __xpv
    129 /*
    130  * Deal with hypervisor complications.
    131  */
    132 void
    133 xen_flush_va(caddr_t va)
    134 {
    135 	struct mmuext_op t;
    136 	uint_t count;
    137 
    138 	if (IN_XPV_PANIC()) {
    139 		mmu_tlbflush_entry((caddr_t)va);
    140 	} else {
    141 		t.cmd = MMUEXT_INVLPG_LOCAL;
    142 		t.arg1.linear_addr = (uintptr_t)va;
    143 		if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
    144 			panic("HYPERVISOR_mmuext_op() failed");
    145 		ASSERT(count == 1);
    146 	}
    147 }
    148 
    149 void
    150 xen_gflush_va(caddr_t va, cpuset_t cpus)
    151 {
    152 	struct mmuext_op t;
    153 	uint_t count;
    154 
    155 	if (IN_XPV_PANIC()) {
    156 		mmu_tlbflush_entry((caddr_t)va);
    157 		return;
    158 	}
    159 
    160 	t.cmd = MMUEXT_INVLPG_MULTI;
    161 	t.arg1.linear_addr = (uintptr_t)va;
    162 	/*LINTED: constant in conditional context*/
    163 	set_xen_guest_handle(t.arg2.vcpumask, &cpus);
    164 	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
    165 		panic("HYPERVISOR_mmuext_op() failed");
    166 	ASSERT(count == 1);
    167 }
    168 
    169 void
    170 xen_flush_tlb()
    171 {
    172 	struct mmuext_op t;
    173 	uint_t count;
    174 
    175 	if (IN_XPV_PANIC()) {
    176 		xpv_panic_reload_cr3();
    177 	} else {
    178 		t.cmd = MMUEXT_TLB_FLUSH_LOCAL;
    179 		if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
    180 			panic("HYPERVISOR_mmuext_op() failed");
    181 		ASSERT(count == 1);
    182 	}
    183 }
    184 
    185 void
    186 xen_gflush_tlb(cpuset_t cpus)
    187 {
    188 	struct mmuext_op t;
    189 	uint_t count;
    190 
    191 	ASSERT(!IN_XPV_PANIC());
    192 	t.cmd = MMUEXT_TLB_FLUSH_MULTI;
    193 	/*LINTED: constant in conditional context*/
    194 	set_xen_guest_handle(t.arg2.vcpumask, &cpus);
    195 	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
    196 		panic("HYPERVISOR_mmuext_op() failed");
    197 	ASSERT(count == 1);
    198 }
    199 
    200 /*
    201  * Install/Adjust a kpm mapping under the hypervisor.
    202  * Value of "how" should be:
    203  *	PT_WRITABLE | PT_VALID - regular kpm mapping
    204  *	PT_VALID - make mapping read-only
    205  *	0	- remove mapping
    206  *
    207  * returns 0 on success. non-zero for failure.
    208  */
    209 int
    210 xen_kpm_page(pfn_t pfn, uint_t how)
    211 {
    212 	paddr_t pa = mmu_ptob((paddr_t)pfn);
    213 	x86pte_t pte = PT_NOCONSIST | PT_REF | PT_MOD;
    214 
    215 	if (kpm_vbase == NULL)
    216 		return (0);
    217 
    218 	if (how)
    219 		pte |= pa_to_ma(pa) | how;
    220 	else
    221 		pte = 0;
    222 	return (HYPERVISOR_update_va_mapping((uintptr_t)kpm_vbase + pa,
    223 	    pte, UVMF_INVLPG | UVMF_ALL));
    224 }
    225 
    226 void
    227 xen_pin(pfn_t pfn, level_t lvl)
    228 {
    229 	struct mmuext_op t;
    230 	uint_t count;
    231 
    232 	t.cmd = MMUEXT_PIN_L1_TABLE + lvl;
    233 	t.arg1.mfn = pfn_to_mfn(pfn);
    234 	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
    235 		panic("HYPERVISOR_mmuext_op() failed");
    236 	ASSERT(count == 1);
    237 }
    238 
    239 void
    240 xen_unpin(pfn_t pfn)
    241 {
    242 	struct mmuext_op t;
    243 	uint_t count;
    244 
    245 	t.cmd = MMUEXT_UNPIN_TABLE;
    246 	t.arg1.mfn = pfn_to_mfn(pfn);
    247 	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
    248 		panic("HYPERVISOR_mmuext_op() failed");
    249 	ASSERT(count == 1);
    250 }
    251 
    252 static void
    253 xen_map(uint64_t pte, caddr_t va)
    254 {
    255 	if (HYPERVISOR_update_va_mapping((uintptr_t)va, pte,
    256 	    UVMF_INVLPG | UVMF_LOCAL))
    257 		panic("HYPERVISOR_update_va_mapping() failed");
    258 }
    259 #endif /* __xpv */
    260 
    261 /*
    262  * Allocate a memory page for a hardware page table.
    263  *
    264  * A wrapper around page_get_physical(), with some extra checks.
    265  */
    266 static pfn_t
    267 ptable_alloc(uintptr_t seed)
    268 {
    269 	pfn_t pfn;
    270 	page_t *pp;
    271 
    272 	pfn = PFN_INVALID;
    273 
    274 	/*
    275 	 * The first check is to see if there is memory in the system. If we
    276 	 * drop to throttlefree, then fail the ptable_alloc() and let the
    277 	 * stealing code kick in. Note that we have to do this test here,
    278 	 * since the test in page_create_throttle() would let the NOSLEEP
    279 	 * allocation go through and deplete the page reserves.
    280 	 *
    281 	 * The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check.
    282 	 */
    283 	if (!NOMEMWAIT() && freemem <= throttlefree + 1)
    284 		return (PFN_INVALID);
    285 
    286 #ifdef DEBUG
    287 	/*
    288 	 * This code makes htable_steal() easier to test. By setting
    289 	 * force_steal we force pagetable allocations to fall
    290 	 * into the stealing code. Roughly 1 in ever "force_steal"
    291 	 * page table allocations will fail.
    292 	 */
    293 	if (proc_pageout != NULL && force_steal > 1 &&
    294 	    ++ptable_cnt > force_steal) {
    295 		ptable_cnt = 0;
    296 		return (PFN_INVALID);
    297 	}
    298 #endif /* DEBUG */
    299 
    300 	pp = page_get_physical(seed);
    301 	if (pp == NULL)
    302 		return (PFN_INVALID);
    303 	ASSERT(PAGE_SHARED(pp));
    304 	pfn = pp->p_pagenum;
    305 	if (pfn == PFN_INVALID)
    306 		panic("ptable_alloc(): Invalid PFN!!");
    307 	atomic_add_32(&active_ptables, 1);
    308 	HATSTAT_INC(hs_ptable_allocs);
    309 	return (pfn);
    310 }
    311 
    312 /*
    313  * Free an htable's associated page table page.  See the comments
    314  * for ptable_alloc().
    315  */
    316 static void
    317 ptable_free(pfn_t pfn)
    318 {
    319 	page_t *pp = page_numtopp_nolock(pfn);
    320 
    321 	/*
    322 	 * need to destroy the page used for the pagetable
    323 	 */
    324 	ASSERT(pfn != PFN_INVALID);
    325 	HATSTAT_INC(hs_ptable_frees);
    326 	atomic_add_32(&active_ptables, -1);
    327 	if (pp == NULL)
    328 		panic("ptable_free(): no page for pfn!");
    329 	ASSERT(PAGE_SHARED(pp));
    330 	ASSERT(pfn == pp->p_pagenum);
    331 	ASSERT(!IN_XPV_PANIC());
    332 
    333 	/*
    334 	 * Get an exclusive lock, might have to wait for a kmem reader.
    335 	 */
    336 	if (!page_tryupgrade(pp)) {
    337 		page_unlock(pp);
    338 		/*
    339 		 * RFE: we could change this to not loop forever
    340 		 * For now looping works - it's just like sfmmu.
    341 		 */
    342 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
    343 			continue;
    344 	}
    345 #ifdef __xpv
    346 	if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
    347 		panic("failure making kpm r/w pfn=0x%lx", pfn);
    348 #endif
    349 	page_free(pp, 1);
    350 	page_unresv(1);
    351 }
    352 
    353 /*
    354  * Put one htable on the reserve list.
    355  */
    356 static void
    357 htable_put_reserve(htable_t *ht)
    358 {
    359 	ht->ht_hat = NULL;		/* no longer tied to a hat */
    360 	ASSERT(ht->ht_pfn == PFN_INVALID);
    361 	HATSTAT_INC(hs_htable_rputs);
    362 	mutex_enter(&htable_reserve_mutex);
    363 	ht->ht_next = htable_reserve_pool;
    364 	htable_reserve_pool = ht;
    365 	++htable_reserve_cnt;
    366 	mutex_exit(&htable_reserve_mutex);
    367 }
    368 
    369 /*
    370  * Take one htable from the reserve.
    371  */
    372 static htable_t *
    373 htable_get_reserve(void)
    374 {
    375 	htable_t *ht = NULL;
    376 
    377 	mutex_enter(&htable_reserve_mutex);
    378 	if (htable_reserve_cnt != 0) {
    379 		ht = htable_reserve_pool;
    380 		ASSERT(ht != NULL);
    381 		ASSERT(ht->ht_pfn == PFN_INVALID);
    382 		htable_reserve_pool = ht->ht_next;
    383 		--htable_reserve_cnt;
    384 		HATSTAT_INC(hs_htable_rgets);
    385 	}
    386 	mutex_exit(&htable_reserve_mutex);
    387 	return (ht);
    388 }
    389 
    390 /*
    391  * Allocate initial htables and put them on the reserve list
    392  */
    393 void
    394 htable_initial_reserve(uint_t count)
    395 {
    396 	htable_t *ht;
    397 
    398 	count += HTABLE_RESERVE_AMOUNT;
    399 	while (count > 0) {
    400 		ht = kmem_cache_alloc(htable_cache, KM_NOSLEEP);
    401 		ASSERT(ht != NULL);
    402 
    403 		ASSERT(use_boot_reserve);
    404 		ht->ht_pfn = PFN_INVALID;
    405 		htable_put_reserve(ht);
    406 		--count;
    407 	}
    408 }
    409 
    410 /*
    411  * Readjust the reserves after a thread finishes using them.
    412  */
    413 void
    414 htable_adjust_reserve()
    415 {
    416 	htable_t *ht;
    417 
    418 	/*
    419 	 * Free any excess htables in the reserve list
    420 	 */
    421 	while (htable_reserve_cnt > htable_reserve_amount &&
    422 	    !USE_HAT_RESERVES()) {
    423 		ht = htable_get_reserve();
    424 		if (ht == NULL)
    425 			return;
    426 		ASSERT(ht->ht_pfn == PFN_INVALID);
    427 		kmem_cache_free(htable_cache, ht);
    428 	}
    429 }
    430 
    431 
/*
 * This routine steals htables from user processes for htable_alloc() or
 * for htable_reap().
 *
 * "cnt" is the number of htables wanted. The return value is a list of at
 * most "cnt" htables chained through ht_next, or NULL if nothing could be
 * stolen. Stolen htables are not reinitialized here: ht_hat and ht_pfn are
 * left as they were, and the caller decides whether to reuse or free the
 * page table page.
 *
 * The hat list is walked up to htable_steal_passes + 1 times. Pass 0 only
 * collects htables sitting unused on each hat's hat_ht_cached list. Each
 * later pass also unloads mappings from level 0 page tables, accepting
 * tables with progressively more valid entries (see "threshold") until, on
 * the last pass, any unlocked, non-busy, non-shared level 0 table qualifies.
 *
 * htable_dont_cache is held non-zero for the duration so that htable_free()
 * does not put htables back into a hat's cache while we are stealing.
 */
static htable_t *
htable_steal(uint_t cnt)
{
	hat_t		*hat = kas.a_hat;	/* list starts with khat */
	htable_t	*list = NULL;
	htable_t	*ht;
	htable_t	*higher;
	uint_t		h;
	uint_t		h_start;
	static uint_t	h_seed = 0;
	uint_t		e;
	uintptr_t	va;
	x86pte_t	pte;
	uint_t		stolen = 0;
	uint_t		pass;
	uint_t		threshold;

	/*
	 * Limit htable_steal_passes to something reasonable
	 */
	if (htable_steal_passes == 0)
		htable_steal_passes = 1;
	if (htable_steal_passes > mmu.ptes_per_table)
		htable_steal_passes = mmu.ptes_per_table;

	/*
	 * Loop through all user hats. The 1st pass takes cached htables that
	 * aren't in use. The later passes steal by removing mappings, too.
	 */
	atomic_add_32(&htable_dont_cache, 1);
	for (pass = 0; pass <= htable_steal_passes && stolen < cnt; ++pass) {
		/*
		 * Largest ht_valid_cnt a table may have and still be a
		 * candidate on this pass; grows from 0 on pass 0 up to
		 * mmu.ptes_per_table on the final pass.
		 */
		threshold = pass * mmu.ptes_per_table / htable_steal_passes;
		hat = kas.a_hat;
		for (;;) {

			/*
			 * Clear the victim flag and move to next hat
			 */
			mutex_enter(&hat_list_lock);
			if (hat != kas.a_hat) {
				hat->hat_flags &= ~HAT_VICTIM;
				cv_broadcast(&hat_list_cv);
			}
			hat = hat->hat_next;

			/*
			 * Skip any hat that is already being stolen from.
			 *
			 * We skip SHARED hats, as these are dummy
			 * hats that host ISM shared page tables.
			 *
			 * We also skip if HAT_FREEING because hat_pte_unmap()
			 * won't zero out the PTE's. That would lead to hitting
			 * stale PTEs either here or under hat_unload() when we
			 * steal and unload the same page table in competing
			 * threads.
			 */
			while (hat != NULL &&
			    (hat->hat_flags &
			    (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
				hat = hat->hat_next;

			/* end of the hat list: this pass is finished */
			if (hat == NULL) {
				mutex_exit(&hat_list_lock);
				break;
			}

			/*
			 * Are we finished?
			 */
			if (stolen == cnt) {
				/*
				 * Try to spread the pain of stealing,
				 * move victim HAT to the end of the HAT list.
				 */
				if (pass >= 1 && cnt == 1 &&
				    kas.a_hat->hat_prev != hat) {

					/* unlink victim hat */
					if (hat->hat_prev)
						hat->hat_prev->hat_next =
						    hat->hat_next;
					else
						kas.a_hat->hat_next =
						    hat->hat_next;
					if (hat->hat_next)
						hat->hat_next->hat_prev =
						    hat->hat_prev;
					else
						kas.a_hat->hat_prev =
						    hat->hat_prev;


					/* relink at end of hat list */
					hat->hat_next = NULL;
					hat->hat_prev = kas.a_hat->hat_prev;
					if (hat->hat_prev)
						hat->hat_prev->hat_next = hat;
					else
						kas.a_hat->hat_next = hat;
					kas.a_hat->hat_prev = hat;

				}

				mutex_exit(&hat_list_lock);
				break;
			}

			/*
			 * Mark the HAT as a stealing victim.
			 */
			hat->hat_flags |= HAT_VICTIM;
			mutex_exit(&hat_list_lock);

			/*
			 * Take any htables from the hat's cached "free" list.
			 */
			hat_enter(hat);
			while ((ht = hat->hat_ht_cached) != NULL &&
			    stolen < cnt) {
				hat->hat_ht_cached = ht->ht_next;
				ht->ht_next = list;
				list = ht;
				++stolen;
			}
			hat_exit(hat);

			/*
			 * Don't steal on first pass.
			 */
			if (pass == 0 || stolen == cnt)
				continue;

			/*
			 * Search the active htables for one to steal.
			 * Start at a different hash bucket every time to
			 * help spread the pain of stealing.
			 *
			 * NOTE(review): h_seed is a function-static counter
			 * updated here with no lock held; it appears to be
			 * used only as a starting-point hint.
			 */
			h = h_start = h_seed++ % hat->hat_num_hash;
			do {
				higher = NULL;
				HTABLE_ENTER(h);
				for (ht = hat->hat_ht_hash[h]; ht;
				    ht = ht->ht_next) {

					/*
					 * Can we rule out reaping?
					 */
					if (ht->ht_busy != 0 ||
					    (ht->ht_flags & HTABLE_SHARED_PFN)||
					    ht->ht_level > 0 ||
					    ht->ht_valid_cnt > threshold ||
					    ht->ht_lock_cnt != 0)
						continue;

					/*
					 * Increment busy so the htable can't
					 * disappear. We drop the htable mutex
					 * to avoid deadlocks with
					 * hat_pageunload() and the hment mutex
					 * while we call hat_pte_unmap()
					 */
					++ht->ht_busy;
					HTABLE_EXIT(h);

					/*
					 * Try stealing.
					 * - unload and invalidate all PTEs
					 *
					 * The loop stops early if the table
					 * becomes empty, or if another thread
					 * takes a hold or a lock on it.
					 */
					for (e = 0, va = ht->ht_vaddr;
					    e < HTABLE_NUM_PTES(ht) &&
					    ht->ht_valid_cnt > 0 &&
					    ht->ht_busy == 1 &&
					    ht->ht_lock_cnt == 0;
					    ++e, va += MMU_PAGESIZE) {
						pte = x86pte_get(ht, e);
						if (!PTE_ISVALID(pte))
							continue;
						hat_pte_unmap(ht, e,
						    HAT_UNLOAD, pte, NULL);
					}

					/*
					 * Reacquire htable lock. If we didn't
					 * remove all mappings in the table,
					 * or another thread added a new mapping
					 * behind us, give up on this table.
					 */
					HTABLE_ENTER(h);
					if (ht->ht_busy != 1 ||
					    ht->ht_valid_cnt != 0 ||
					    ht->ht_lock_cnt != 0) {
						--ht->ht_busy;
						continue;
					}

					/*
					 * Steal it and unlink the page table.
					 */
					higher = ht->ht_parent;
					unlink_ptp(higher, ht, ht->ht_vaddr);

					/*
					 * remove from the hash list
					 */
					if (ht->ht_next)
						ht->ht_next->ht_prev =
						    ht->ht_prev;

					if (ht->ht_prev) {
						ht->ht_prev->ht_next =
						    ht->ht_next;
					} else {
						ASSERT(hat->hat_ht_hash[h] ==
						    ht);
						hat->hat_ht_hash[h] =
						    ht->ht_next;
					}

					/*
					 * Break to outer loop to release the
					 * higher (ht_parent) pagetable. This
					 * spreads out the pain caused by
					 * pagefaults.
					 */
					ht->ht_next = list;
					list = ht;
					++stolen;
					break;
				}
				HTABLE_EXIT(h);
				/*
				 * The parent is released only after the hash
				 * mutex has been dropped.
				 */
				if (higher != NULL)
					htable_release(higher);
				if (++h == hat->hat_num_hash)
					h = 0;
			} while (stolen < cnt && h != h_start);
		}
	}
	atomic_add_32(&htable_dont_cache, -1);
	return (list);
}
    677 
    678 /*
    679  * This is invoked from kmem when the system is low on memory.  We try
    680  * to free hments, htables, and ptables to improve the memory situation.
    681  */
    682 /*ARGSUSED*/
    683 static void
    684 htable_reap(void *handle)
    685 {
    686 	uint_t		reap_cnt;
    687 	htable_t	*list;
    688 	htable_t	*ht;
    689 
    690 	HATSTAT_INC(hs_reap_attempts);
    691 	if (!can_steal_post_boot)
    692 		return;
    693 
    694 	/*
    695 	 * Try to reap 5% of the page tables bounded by a maximum of
    696 	 * 5% of physmem and a minimum of 10.
    697 	 */
    698 	reap_cnt = MAX(MIN(physmem / 20, active_ptables / 20), 10);
    699 
    700 	/*
    701 	 * Let htable_steal() do the work, we just call htable_free()
    702 	 */
    703 	XPV_DISALLOW_MIGRATE();
    704 	list = htable_steal(reap_cnt);
    705 	XPV_ALLOW_MIGRATE();
    706 	while ((ht = list) != NULL) {
    707 		list = ht->ht_next;
    708 		HATSTAT_INC(hs_reaped);
    709 		htable_free(ht);
    710 	}
    711 
    712 	/*
    713 	 * Free up excess reserves
    714 	 */
    715 	htable_adjust_reserve();
    716 	hment_adjust_reserve();
    717 }
    718 
/*
 * Allocate an htable, stealing one or using the reserve if necessary
 *
 *	hat	- the hat the new htable will belong to
 *	vaddr	- base virtual address stored in ht_vaddr
 *	level	- page table level; must be in [0, TOP_LEVEL(hat)] or we panic
 *	shared	- if non-NULL, an existing htable whose page table page this
 *		  htable will share (HTABLE_SHARED_PFN) instead of owning one
 *
 * A "bare" htable is one that gets no page table page of its own: either a
 * VLP htable or one sharing another htable's page.
 *
 * Sources are tried in this order: the hat's cache of free htables (which
 * still have a page attached), the reserve or kmem plus ptable_alloc(),
 * and finally htable_steal(). Returns an initialized htable with ht_busy
 * set to 1 and no parent linked; panics if nothing could be obtained.
 */
static htable_t *
htable_alloc(
	hat_t		*hat,
	uintptr_t	vaddr,
	level_t		level,
	htable_t	*shared)
{
	htable_t	*ht = NULL;
	uint_t		is_vlp;
	uint_t		is_bare = 0;
	uint_t		need_to_zero = 1;
	/* once stealing is possible we must not block in kmem */
	int		kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);

	if (level < 0 || level > TOP_LEVEL(hat))
		panic("htable_alloc(): level %d out of range\n", level);

	is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
	if (is_vlp || shared != NULL)
		is_bare = 1;

	/*
	 * First reuse a cached htable from the hat_ht_cached field, this
	 * avoids unnecessary trips through kmem/page allocators.
	 *
	 * The unlocked test is only a hint; the list head is re-read
	 * after hat_enter().
	 */
	if (hat->hat_ht_cached != NULL && !is_bare) {
		hat_enter(hat);
		ht = hat->hat_ht_cached;
		if (ht != NULL) {
			hat->hat_ht_cached = ht->ht_next;
			need_to_zero = 0;
			/* XX64 ASSERT() they're all zero somehow */
			ASSERT(ht->ht_pfn != PFN_INVALID);
		}
		hat_exit(hat);
	}

	if (ht == NULL) {
		/*
		 * Allocate an htable, possibly refilling the reserves.
		 */
		if (USE_HAT_RESERVES()) {
			ht = htable_get_reserve();
		} else {
			/*
			 * Donate successful htable allocations to the reserve.
			 * The loop ends with ht either NULL (kmem failed) or
			 * holding the one allocation we keep for ourselves.
			 */
			for (;;) {
				ht = kmem_cache_alloc(htable_cache, kmflags);
				if (ht == NULL)
					break;
				ht->ht_pfn = PFN_INVALID;
				if (USE_HAT_RESERVES() ||
				    htable_reserve_cnt >= htable_reserve_amount)
					break;
				htable_put_reserve(ht);
			}
		}

		/*
		 * allocate a page for the hardware page table if needed
		 */
		if (ht != NULL && !is_bare) {
			ht->ht_hat = hat;
			ht->ht_pfn = ptable_alloc((uintptr_t)ht);
			if (ht->ht_pfn == PFN_INVALID) {
				/* no page: give the htable back and steal */
				if (USE_HAT_RESERVES())
					htable_put_reserve(ht);
				else
					kmem_cache_free(htable_cache, ht);
				ht = NULL;
			}
		}
	}

	/*
	 * If allocations failed, kick off a kmem_reap() and resort to
	 * htable steal(). We may spin here if the system is very low on
	 * memory. If the kernel itself has consumed all memory and kmem_reap()
	 * can't free up anything, then we'll really get stuck here.
	 * That should only happen in a system where the administrator has
	 * misconfigured VM parameters via /etc/system.
	 */
	while (ht == NULL && can_steal_post_boot) {
		kmem_reap();
		ht = htable_steal(1);
		HATSTAT_INC(hs_steals);

		/*
		 * If we stole for a bare htable, release the pagetable page.
		 */
		if (ht != NULL) {
			if (is_bare) {
				ptable_free(ht->ht_pfn);
				ht->ht_pfn = PFN_INVALID;
#if defined(__xpv) && defined(__amd64)
			/*
			 * make stolen page table writable again in kpm
			 */
			} else if (kpm_vbase && xen_kpm_page(ht->ht_pfn,
			    PT_VALID | PT_WRITABLE) < 0) {
				panic("failure making kpm r/w pfn=0x%lx",
				    ht->ht_pfn);
#endif
			}
		}
	}

	/*
	 * All attempts to allocate or steal failed. This should only happen
	 * if we run out of memory during boot, due perhaps to a huge
	 * boot_archive. At this point there's no way to continue.
	 */
	if (ht == NULL)
		panic("htable_alloc(): couldn't steal\n");

#if defined(__amd64) && defined(__xpv)
	/*
	 * Under the 64-bit hypervisor, we have 2 top level page tables.
	 * If this allocation fails, we'll resort to stealing.
	 * We use the stolen page indirectly, by freeing the
	 * stolen htable first.
	 */
	if (level == mmu.max_level) {
		for (;;) {
			htable_t *stolen;

			hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1);
			if (hat->hat_user_ptable != PFN_INVALID)
				break;
			stolen = htable_steal(1);
			if (stolen == NULL)
				panic("2nd steal ptable failed\n");
			htable_free(stolen);
		}
		block_zero_no_xmm(kpm_vbase + pfn_to_pa(hat->hat_user_ptable),
		    MMU_PAGESIZE);
	}
#endif

	/*
	 * Shared page tables have all entries locked and entries may not
	 * be added or deleted.
	 */
	ht->ht_flags = 0;
	if (shared != NULL) {
		ASSERT(shared->ht_valid_cnt > 0);
		ht->ht_flags |= HTABLE_SHARED_PFN;
		ht->ht_pfn = shared->ht_pfn;
		ht->ht_lock_cnt = 0;
		ht->ht_valid_cnt = 0;		/* updated in hat_share() */
		ht->ht_shares = shared;
		need_to_zero = 0;
	} else {
		ht->ht_shares = NULL;
		ht->ht_lock_cnt = 0;
		ht->ht_valid_cnt = 0;
	}

	/*
	 * setup flags, etc. for VLP htables
	 */
	if (is_vlp) {
		ht->ht_flags |= HTABLE_VLP;
		ASSERT(ht->ht_pfn == PFN_INVALID);
		need_to_zero = 0;
	}

	/*
	 * fill in the htable
	 */
	ht->ht_hat = hat;
	ht->ht_parent = NULL;
	ht->ht_vaddr = vaddr;
	ht->ht_level = level;
	ht->ht_busy = 1;
	ht->ht_next = NULL;
	ht->ht_prev = NULL;

	/*
	 * Zero out any freshly allocated page table
	 */
	if (need_to_zero)
		x86pte_zero(ht, 0, mmu.ptes_per_table);

#if defined(__amd64) && defined(__xpv)
	/*
	 * Downgrade the kpm mappings of the new page table page(s) to
	 * read-only (PT_VALID without PT_WRITABLE). The return values are
	 * deliberately ignored here.
	 */
	if (!is_bare && kpm_vbase) {
		(void) xen_kpm_page(ht->ht_pfn, PT_VALID);
		if (level == mmu.max_level)
			(void) xen_kpm_page(hat->hat_user_ptable, PT_VALID);
	}
#endif

	return (ht);
}
    916 
    917 /*
    918  * Free up an htable, either to a hat's cached list, the reserves or
    919  * back to kmem.
    920  */
    921 static void
    922 htable_free(htable_t *ht)
    923 {
    924 	hat_t *hat = ht->ht_hat;
    925 
    926 	/*
    927 	 * If the process isn't exiting, cache the free htable in the hat
    928 	 * structure. We always do this for the boot time reserve. We don't
    929 	 * do this if the hat is exiting or we are stealing/reaping htables.
    930 	 */
    931 	if (hat != NULL &&
    932 	    !(ht->ht_flags & HTABLE_SHARED_PFN) &&
    933 	    (use_boot_reserve ||
    934 	    (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
    935 		ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
    936 		ASSERT(ht->ht_pfn != PFN_INVALID);
    937 		hat_enter(hat);
    938 		ht->ht_next = hat->hat_ht_cached;
    939 		hat->hat_ht_cached = ht;
    940 		hat_exit(hat);
    941 		return;
    942 	}
    943 
    944 	/*
    945 	 * If we have a hardware page table, free it.
    946 	 * We don't free page tables that are accessed by sharing.
    947 	 */
    948 	if (ht->ht_flags & HTABLE_SHARED_PFN) {
    949 		ASSERT(ht->ht_pfn != PFN_INVALID);
    950 	} else if (!(ht->ht_flags & HTABLE_VLP)) {
    951 		ptable_free(ht->ht_pfn);
    952 #if defined(__amd64) && defined(__xpv)
    953 		if (ht->ht_level == mmu.max_level) {
    954 			ptable_free(hat->hat_user_ptable);
    955 			hat->hat_user_ptable = PFN_INVALID;
    956 		}
    957 #endif
    958 	}
    959 	ht->ht_pfn = PFN_INVALID;
    960 
    961 	/*
    962 	 * Free it or put into reserves.
    963 	 */
    964 	if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
    965 		htable_put_reserve(ht);
    966 	} else {
    967 		kmem_cache_free(htable_cache, ht);
    968 		htable_adjust_reserve();
    969 	}
    970 }
    971 
    972 
    973 /*
    974  * This is called when a hat is being destroyed or swapped out. We reap all
    975  * the remaining htables in the hat cache. If destroying all left over
    976  * htables are also destroyed.
    977  *
    978  * We also don't need to invalidate any of the PTPs nor do any demapping.
    979  */
    980 void
    981 htable_purge_hat(hat_t *hat)
    982 {
    983 	htable_t *ht;
    984 	int h;
    985 
    986 	/*
    987 	 * Purge the htable cache if just reaping.
    988 	 */
    989 	if (!(hat->hat_flags & HAT_FREEING)) {
    990 		atomic_add_32(&htable_dont_cache, 1);
    991 		for (;;) {
    992 			hat_enter(hat);
    993 			ht = hat->hat_ht_cached;
    994 			if (ht == NULL) {
    995 				hat_exit(hat);
    996 				break;
    997 			}
    998 			hat->hat_ht_cached = ht->ht_next;
    999 			hat_exit(hat);
   1000 			htable_free(ht);
   1001 		}
   1002 		atomic_add_32(&htable_dont_cache, -1);
   1003 		return;
   1004 	}
   1005 
   1006 	/*
   1007 	 * if freeing, no locking is needed
   1008 	 */
   1009 	while ((ht = hat->hat_ht_cached) != NULL) {
   1010 		hat->hat_ht_cached = ht->ht_next;
   1011 		htable_free(ht);
   1012 	}
   1013 
   1014 	/*
   1015 	 * walk thru the htable hash table and free all the htables in it.
   1016 	 */
   1017 	for (h = 0; h < hat->hat_num_hash; ++h) {
   1018 		while ((ht = hat->hat_ht_hash[h]) != NULL) {
   1019 			if (ht->ht_next)
   1020 				ht->ht_next->ht_prev = ht->ht_prev;
   1021 
   1022 			if (ht->ht_prev) {
   1023 				ht->ht_prev->ht_next = ht->ht_next;
   1024 			} else {
   1025 				ASSERT(hat->hat_ht_hash[h] == ht);
   1026 				hat->hat_ht_hash[h] = ht->ht_next;
   1027 			}
   1028 			htable_free(ht);
   1029 		}
   1030 	}
   1031 }
   1032 
   1033 /*
   1034  * Unlink an entry for a table at vaddr and level out of the existing table
   1035  * one level higher. We are always holding the HASH_ENTER() when doing this.
   1036  */
   1037 static void
   1038 unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr)
   1039 {
   1040 	uint_t		entry = htable_va2entry(vaddr, higher);
   1041 	x86pte_t	expect = MAKEPTP(old->ht_pfn, old->ht_level);
   1042 	x86pte_t	found;
   1043 	hat_t		*hat = old->ht_hat;
   1044 
   1045 	ASSERT(higher->ht_busy > 0);
   1046 	ASSERT(higher->ht_valid_cnt > 0);
   1047 	ASSERT(old->ht_valid_cnt == 0);
   1048 	found = x86pte_cas(higher, entry, expect, 0);
   1049 #ifdef __xpv
   1050 	/*
   1051 	 * This is weird, but Xen apparently automatically unlinks empty
   1052 	 * pagetables from the upper page table. So allow PTP to be 0 already.
   1053 	 */
   1054 	if (found != expect && found != 0)
   1055 #else
   1056 	if (found != expect)
   1057 #endif
   1058 		panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
   1059 		    found, expect);
   1060 
   1061 	/*
   1062 	 * When a top level VLP page table entry changes, we must issue
   1063 	 * a reload of cr3 on all processors.
   1064 	 *
   1065 	 * If we don't need do do that, then we still have to INVLPG against
   1066 	 * an address covered by the inner page table, as the latest processors
   1067 	 * have TLB-like caches for non-leaf page table entries.
   1068 	 */
   1069 	if (!(hat->hat_flags & HAT_FREEING)) {
   1070 		hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
   1071 		    DEMAP_ALL_ADDR : old->ht_vaddr);
   1072 	}
   1073 
   1074 	HTABLE_DEC(higher->ht_valid_cnt);
   1075 }
   1076 
   1077 /*
   1078  * Link an entry for a new table at vaddr and level into the existing table
   1079  * one level higher. We are always holding the HASH_ENTER() when doing this.
   1080  */
   1081 static void
   1082 link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
   1083 {
   1084 	uint_t		entry = htable_va2entry(vaddr, higher);
   1085 	x86pte_t	newptp = MAKEPTP(new->ht_pfn, new->ht_level);
   1086 	x86pte_t	found;
   1087 
   1088 	ASSERT(higher->ht_busy > 0);
   1089 
   1090 	ASSERT(new->ht_level != mmu.max_level);
   1091 
   1092 	HTABLE_INC(higher->ht_valid_cnt);
   1093 
   1094 	found = x86pte_cas(higher, entry, 0, newptp);
   1095 	if ((found & ~PT_REF) != 0)
   1096 		panic("HAT: ptp not 0, found=" FMT_PTE, found);
   1097 
   1098 	/*
   1099 	 * When any top level VLP page table entry changes, we must issue
   1100 	 * a reload of cr3 on all processors using it.
   1101 	 * We also need to do this for the kernel hat on PAE 32 bit kernel.
   1102 	 */
   1103 	if (
   1104 #ifdef __i386
   1105 	    (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
   1106 #endif
   1107 	    (higher->ht_flags & HTABLE_VLP))
   1108 		hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
   1109 }
   1110 
   1111 /*
   1112  * Release of hold on an htable. If this is the last use and the pagetable
   1113  * is empty we may want to free it, then recursively look at the pagetable
   1114  * above it. The recursion is handled by the outer while() loop.
   1115  *
   1116  * On the metal, during process exit, we don't bother unlinking the tables from
   1117  * upper level pagetables. They are instead handled in bulk by hat_free_end().
   1118  * We can't do this on the hypervisor as we need the page table to be
   1119  * implicitly unpinnned before it goes to the free page lists. This can't
   1120  * happen unless we fully unlink it from the page table hierarchy.
   1121  */
   1122 void
   1123 htable_release(htable_t *ht)
   1124 {
   1125 	uint_t		hashval;
   1126 	htable_t	*shared;
   1127 	htable_t	*higher;
   1128 	hat_t		*hat;
   1129 	uintptr_t	va;
   1130 	level_t		level;
   1131 
   1132 	while (ht != NULL) {
   1133 		shared = NULL;
   1134 		for (;;) {
   1135 			hat = ht->ht_hat;
   1136 			va = ht->ht_vaddr;
   1137 			level = ht->ht_level;
   1138 			hashval = HTABLE_HASH(hat, va, level);
   1139 
   1140 			/*
   1141 			 * The common case is that this isn't the last use of
   1142 			 * an htable so we don't want to free the htable.
   1143 			 */
   1144 			HTABLE_ENTER(hashval);
   1145 			ASSERT(ht->ht_valid_cnt >= 0);
   1146 			ASSERT(ht->ht_busy > 0);
   1147 			if (ht->ht_valid_cnt > 0)
   1148 				break;
   1149 			if (ht->ht_busy > 1)
   1150 				break;
   1151 			ASSERT(ht->ht_lock_cnt == 0);
   1152 
   1153 #if !defined(__xpv)
   1154 			/*
   1155 			 * we always release empty shared htables
   1156 			 */
   1157 			if (!(ht->ht_flags & HTABLE_SHARED_PFN)) {
   1158 
   1159 				/*
   1160 				 * don't release if in address space tear down
   1161 				 */
   1162 				if (hat->hat_flags & HAT_FREEING)
   1163 					break;
   1164 
   1165 				/*
   1166 				 * At and above max_page_level, free if it's for
   1167 				 * a boot-time kernel mapping below kernelbase.
   1168 				 */
   1169 				if (level >= mmu.max_page_level &&
   1170 				    (hat != kas.a_hat || va >= kernelbase))
   1171 					break;
   1172 			}
   1173 #endif /* __xpv */
   1174 
   1175 			/*
   1176 			 * Remember if we destroy an htable that shares its PFN
   1177 			 * from elsewhere.
   1178 			 */
   1179 			if (ht->ht_flags & HTABLE_SHARED_PFN) {
   1180 				ASSERT(shared == NULL);
   1181 				shared = ht->ht_shares;
   1182 				HATSTAT_INC(hs_htable_unshared);
   1183 			}
   1184 
   1185 			/*
   1186 			 * Handle release of a table and freeing the htable_t.
   1187 			 * Unlink it from the table higher (ie. ht_parent).
   1188 			 */
   1189 			higher = ht->ht_parent;
   1190 			ASSERT(higher != NULL);
   1191 
   1192 			/*
   1193 			 * Unlink the pagetable.
   1194 			 */
   1195 			unlink_ptp(higher, ht, va);
   1196 
   1197 			/*
   1198 			 * remove this htable from its hash list
   1199 			 */
   1200 			if (ht->ht_next)
   1201 				ht->ht_next->ht_prev = ht->ht_prev;
   1202 
   1203 			if (ht->ht_prev) {
   1204 				ht->ht_prev->ht_next = ht->ht_next;
   1205 			} else {
   1206 				ASSERT(hat->hat_ht_hash[hashval] == ht);
   1207 				hat->hat_ht_hash[hashval] = ht->ht_next;
   1208 			}
   1209 			HTABLE_EXIT(hashval);
   1210 			htable_free(ht);
   1211 			ht = higher;
   1212 		}
   1213 
   1214 		ASSERT(ht->ht_busy >= 1);
   1215 		--ht->ht_busy;
   1216 		HTABLE_EXIT(hashval);
   1217 
   1218 		/*
   1219 		 * If we released a shared htable, do a release on the htable
   1220 		 * from which it shared
   1221 		 */
   1222 		ht = shared;
   1223 	}
   1224 }
   1225 
   1226 /*
   1227  * Find the htable for the pagetable at the given level for the given address.
   1228  * If found acquires a hold that eventually needs to be htable_release()d
   1229  */
   1230 htable_t *
   1231 htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
   1232 {
   1233 	uintptr_t	base;
   1234 	uint_t		hashval;
   1235 	htable_t	*ht = NULL;
   1236 
   1237 	ASSERT(level >= 0);
   1238 	ASSERT(level <= TOP_LEVEL(hat));
   1239 
   1240 	if (level == TOP_LEVEL(hat)) {
   1241 #if defined(__amd64)
   1242 		/*
   1243 		 * 32 bit address spaces on 64 bit kernels need to check
   1244 		 * for overflow of the 32 bit address space
   1245 		 */
   1246 		if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
   1247 			return (NULL);
   1248 #endif
   1249 		base = 0;
   1250 	} else {
   1251 		base = vaddr & LEVEL_MASK(level + 1);
   1252 	}
   1253 
   1254 	hashval = HTABLE_HASH(hat, base, level);
   1255 	HTABLE_ENTER(hashval);
   1256 	for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
   1257 		if (ht->ht_hat == hat &&
   1258 		    ht->ht_vaddr == base &&
   1259 		    ht->ht_level == level)
   1260 			break;
   1261 	}
   1262 	if (ht)
   1263 		++ht->ht_busy;
   1264 
   1265 	HTABLE_EXIT(hashval);
   1266 	return (ht);
   1267 }
   1268 
   1269 /*
   1270  * Acquires a hold on a known htable (from a locked hment entry).
   1271  */
   1272 void
   1273 htable_acquire(htable_t *ht)
   1274 {
   1275 	hat_t		*hat = ht->ht_hat;
   1276 	level_t		level = ht->ht_level;
   1277 	uintptr_t	base = ht->ht_vaddr;
   1278 	uint_t		hashval = HTABLE_HASH(hat, base, level);
   1279 
   1280 	HTABLE_ENTER(hashval);
   1281 #ifdef DEBUG
   1282 	/*
   1283 	 * make sure the htable is there
   1284 	 */
   1285 	{
   1286 		htable_t	*h;
   1287 
   1288 		for (h = hat->hat_ht_hash[hashval];
   1289 		    h && h != ht;
   1290 		    h = h->ht_next)
   1291 			;
   1292 		ASSERT(h == ht);
   1293 	}
   1294 #endif /* DEBUG */
   1295 	++ht->ht_busy;
   1296 	HTABLE_EXIT(hashval);
   1297 }
   1298 
   1299 /*
   1300  * Find the htable for the pagetable at the given level for the given address.
   1301  * If found acquires a hold that eventually needs to be htable_release()d
   1302  * If not found the table is created.
   1303  *
   1304  * Since we can't hold a hash table mutex during allocation, we have to
   1305  * drop it and redo the search on a create. Then we may have to free the newly
   1306  * allocated htable if another thread raced in and created it ahead of us.
   1307  */
   1308 htable_t *
   1309 htable_create(
   1310 	hat_t		*hat,
   1311 	uintptr_t	vaddr,
   1312 	level_t		level,
   1313 	htable_t	*shared)
   1314 {
   1315 	uint_t		h;
   1316 	level_t		l;
   1317 	uintptr_t	base;
   1318 	htable_t	*ht;
   1319 	htable_t	*higher = NULL;
   1320 	htable_t	*new = NULL;
   1321 
   1322 	if (level < 0 || level > TOP_LEVEL(hat))
   1323 		panic("htable_create(): level %d out of range\n", level);
   1324 
   1325 	/*
   1326 	 * Create the page tables in top down order.
   1327 	 */
   1328 	for (l = TOP_LEVEL(hat); l >= level; --l) {
   1329 		new = NULL;
   1330 		if (l == TOP_LEVEL(hat))
   1331 			base = 0;
   1332 		else
   1333 			base = vaddr & LEVEL_MASK(l + 1);
   1334 
   1335 		h = HTABLE_HASH(hat, base, l);
   1336 try_again:
   1337 		/*
   1338 		 * look up the htable at this level
   1339 		 */
   1340 		HTABLE_ENTER(h);
   1341 		if (l == TOP_LEVEL(hat)) {
   1342 			ht = hat->hat_htable;
   1343 		} else {
   1344 			for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
   1345 				ASSERT(ht->ht_hat == hat);
   1346 				if (ht->ht_vaddr == base &&
   1347 				    ht->ht_level == l)
   1348 					break;
   1349 			}
   1350 		}
   1351 
   1352 		/*
   1353 		 * if we found the htable, increment its busy cnt
   1354 		 * and if we had allocated a new htable, free it.
   1355 		 */
   1356 		if (ht != NULL) {
   1357 			/*
   1358 			 * If we find a pre-existing shared table, it must
   1359 			 * share from the same place.
   1360 			 */
   1361 			if (l == level && shared && ht->ht_shares &&
   1362 			    ht->ht_shares != shared) {
   1363 				panic("htable shared from wrong place "
   1364 				    "found htable=%p shared=%p",
   1365 				    (void *)ht, (void *)shared);
   1366 			}
   1367 			++ht->ht_busy;
   1368 			HTABLE_EXIT(h);
   1369 			if (new)
   1370 				htable_free(new);
   1371 			if (higher != NULL)
   1372 				htable_release(higher);
   1373 			higher = ht;
   1374 
   1375 		/*
   1376 		 * if we didn't find it on the first search
   1377 		 * allocate a new one and search again
   1378 		 */
   1379 		} else if (new == NULL) {
   1380 			HTABLE_EXIT(h);
   1381 			new = htable_alloc(hat, base, l,
   1382 			    l == level ? shared : NULL);
   1383 			goto try_again;
   1384 
   1385 		/*
   1386 		 * 2nd search and still not there, use "new" table
   1387 		 * Link new table into higher, when not at top level.
   1388 		 */
   1389 		} else {
   1390 			ht = new;
   1391 			if (higher != NULL) {
   1392 				link_ptp(higher, ht, base);
   1393 				ht->ht_parent = higher;
   1394 			}
   1395 			ht->ht_next = hat->hat_ht_hash[h];
   1396 			ASSERT(ht->ht_prev == NULL);
   1397 			if (hat->hat_ht_hash[h])
   1398 				hat->hat_ht_hash[h]->ht_prev = ht;
   1399 			hat->hat_ht_hash[h] = ht;
   1400 			HTABLE_EXIT(h);
   1401 
   1402 			/*
   1403 			 * Note we don't do htable_release(higher).
   1404 			 * That happens recursively when "new" is removed by
   1405 			 * htable_release() or htable_steal().
   1406 			 */
   1407 			higher = ht;
   1408 
   1409 			/*
   1410 			 * If we just created a new shared page table we
   1411 			 * increment the shared htable's busy count, so that
   1412 			 * it can't be the victim of a steal even if it's empty.
   1413 			 */
   1414 			if (l == level && shared) {
   1415 				(void) htable_lookup(shared->ht_hat,
   1416 				    shared->ht_vaddr, shared->ht_level);
   1417 				HATSTAT_INC(hs_htable_shared);
   1418 			}
   1419 		}
   1420 	}
   1421 
   1422 	return (ht);
   1423 }
   1424 
   1425 /*
   1426  * Inherit initial pagetables from the boot program. On the 64-bit
   1427  * hypervisor we also temporarily mark the p_index field of page table
   1428  * pages, so we know not to try making them writable in seg_kpm.
   1429  */
   1430 void
   1431 htable_attach(
   1432 	hat_t *hat,
   1433 	uintptr_t base,
   1434 	level_t level,
   1435 	htable_t *parent,
   1436 	pfn_t pfn)
   1437 {
   1438 	htable_t	*ht;
   1439 	uint_t		h;
   1440 	uint_t		i;
   1441 	x86pte_t	pte;
   1442 	x86pte_t	*ptep;
   1443 	page_t		*pp;
   1444 	extern page_t	*boot_claim_page(pfn_t);
   1445 
   1446 	ht = htable_get_reserve();
   1447 	if (level == mmu.max_level)
   1448 		kas.a_hat->hat_htable = ht;
   1449 	ht->ht_hat = hat;
   1450 	ht->ht_parent = parent;
   1451 	ht->ht_vaddr = base;
   1452 	ht->ht_level = level;
   1453 	ht->ht_busy = 1;
   1454 	ht->ht_next = NULL;
   1455 	ht->ht_prev = NULL;
   1456 	ht->ht_flags = 0;
   1457 	ht->ht_pfn = pfn;
   1458 	ht->ht_lock_cnt = 0;
   1459 	ht->ht_valid_cnt = 0;
   1460 	if (parent != NULL)
   1461 		++parent->ht_busy;
   1462 
   1463 	h = HTABLE_HASH(hat, base, level);
   1464 	HTABLE_ENTER(h);
   1465 	ht->ht_next = hat->hat_ht_hash[h];
   1466 	ASSERT(ht->ht_prev == NULL);
   1467 	if (hat->hat_ht_hash[h])
   1468 		hat->hat_ht_hash[h]->ht_prev = ht;
   1469 	hat->hat_ht_hash[h] = ht;
   1470 	HTABLE_EXIT(h);
   1471 
   1472 	/*
   1473 	 * make sure the page table physical page is not FREE
   1474 	 */
   1475 	if (page_resv(1, KM_NOSLEEP) == 0)
   1476 		panic("page_resv() failed in ptable alloc");
   1477 
   1478 	pp = boot_claim_page(pfn);
   1479 	ASSERT(pp != NULL);
   1480 	page_downgrade(pp);
   1481 #if defined(__xpv) && defined(__amd64)
   1482 	/*
   1483 	 * Record in the page_t that is a pagetable for segkpm setup.
   1484 	 */
   1485 	if (kpm_vbase)
   1486 		pp->p_index = 1;
   1487 #endif
   1488 
   1489 	/*
   1490 	 * Count valid mappings and recursively attach lower level pagetables.
   1491 	 */
   1492 	ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
   1493 	for (i = 0; i < HTABLE_NUM_PTES(ht); ++i) {
   1494 		if (mmu.pae_hat)
   1495 			pte = ptep[i];
   1496 		else
   1497 			pte = ((x86pte32_t *)ptep)[i];
   1498 		if (!IN_HYPERVISOR_VA(base) && PTE_ISVALID(pte)) {
   1499 			++ht->ht_valid_cnt;
   1500 			if (!PTE_ISPAGE(pte, level)) {
   1501 				htable_attach(hat, base, level - 1,
   1502 				    ht, PTE2PFN(pte, level));
   1503 				ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
   1504 			}
   1505 		}
   1506 		base += LEVEL_SIZE(level);
   1507 		if (base == mmu.hole_start)
   1508 			base = (mmu.hole_end + MMU_PAGEOFFSET) & MMU_PAGEMASK;
   1509 	}
   1510 
   1511 	/*
   1512 	 * As long as all the mappings we had were below kernel base
   1513 	 * we can release the htable.
   1514 	 */
   1515 	if (base < kernelbase)
   1516 		htable_release(ht);
   1517 }
   1518 
   1519 /*
   1520  * Walk through a given htable looking for the first valid entry.  This
   1521  * routine takes both a starting and ending address.  The starting address
   1522  * is required to be within the htable provided by the caller, but there is
   1523  * no such restriction on the ending address.
   1524  *
   1525  * If the routine finds a valid entry in the htable (at or beyond the
   1526  * starting address), the PTE (and its address) will be returned.
   1527  * This PTE may correspond to either a page or a pagetable - it is the
   1528  * caller's responsibility to determine which.  If no valid entry is
   1529  * found, 0 (and invalid PTE) and the next unexamined address will be
   1530  * returned.
   1531  *
   1532  * The loop has been carefully coded for optimization.
   1533  */
   1534 static x86pte_t
   1535 htable_scan(htable_t *ht, uintptr_t *vap, uintptr_t eaddr)
   1536 {
   1537 	uint_t e;
   1538 	x86pte_t found_pte = (x86pte_t)0;
   1539 	caddr_t pte_ptr;
   1540 	caddr_t end_pte_ptr;
   1541 	int l = ht->ht_level;
   1542 	uintptr_t va = *vap & LEVEL_MASK(l);
   1543 	size_t pgsize = LEVEL_SIZE(l);
   1544 
   1545 	ASSERT(va >= ht->ht_vaddr);
   1546 	ASSERT(va <= HTABLE_LAST_PAGE(ht));
   1547 
   1548 	/*
   1549 	 * Compute the starting index and ending virtual address
   1550 	 */
   1551 	e = htable_va2entry(va, ht);
   1552 
   1553 	/*
   1554 	 * The following page table scan code knows that the valid
   1555 	 * bit of a PTE is in the lowest byte AND that x86 is little endian!!
   1556 	 */
   1557 	pte_ptr = (caddr_t)x86pte_access_pagetable(ht, 0);
   1558 	end_pte_ptr = (caddr_t)PT_INDEX_PTR(pte_ptr, HTABLE_NUM_PTES(ht));
   1559 	pte_ptr = (caddr_t)PT_INDEX_PTR((x86pte_t *)pte_ptr, e);
   1560 	while (!PTE_ISVALID(*pte_ptr)) {
   1561 		va += pgsize;
   1562 		if (va >= eaddr)
   1563 			break;
   1564 		pte_ptr += mmu.pte_size;
   1565 		ASSERT(pte_ptr <= end_pte_ptr);
   1566 		if (pte_ptr == end_pte_ptr)
   1567 			break;
   1568 	}
   1569 
   1570 	/*
   1571 	 * if we found a valid PTE, load the entire PTE
   1572 	 */
   1573 	if (va < eaddr && pte_ptr != end_pte_ptr)
   1574 		found_pte = GET_PTE((x86pte_t *)pte_ptr);
   1575 	x86pte_release_pagetable(ht);
   1576 
   1577 #if defined(__amd64)
   1578 	/*
   1579 	 * deal with VA hole on amd64
   1580 	 */
   1581 	if (l == mmu.max_level && va >= mmu.hole_start && va <= mmu.hole_end)
   1582 		va = mmu.hole_end + va - mmu.hole_start;
   1583 #endif /* __amd64 */
   1584 
   1585 	*vap = va;
   1586 	return (found_pte);
   1587 }
   1588 
   1589 /*
   1590  * Find the address and htable for the first populated translation at or
   1591  * above the given virtual address.  The caller may also specify an upper
   1592  * limit to the address range to search.  Uses level information to quickly
   1593  * skip unpopulated sections of virtual address spaces.
   1594  *
   1595  * If not found returns NULL. When found, returns the htable and virt addr
   1596  * and has a hold on the htable.
   1597  */
   1598 x86pte_t
   1599 htable_walk(
   1600 	struct hat *hat,
   1601 	htable_t **htp,
   1602 	uintptr_t *vaddr,
   1603 	uintptr_t eaddr)
   1604 {
   1605 	uintptr_t va = *vaddr;
   1606 	htable_t *ht;
   1607 	htable_t *prev = *htp;
   1608 	level_t l;
   1609 	level_t max_mapped_level;
   1610 	x86pte_t pte;
   1611 
   1612 	ASSERT(eaddr > va);
   1613 
   1614 	/*
   1615 	 * If this is a user address, then we know we need not look beyond
   1616 	 * kernelbase.
   1617 	 */
   1618 	ASSERT(hat == kas.a_hat || eaddr <= kernelbase ||
   1619 	    eaddr == HTABLE_WALK_TO_END);
   1620 	if (hat != kas.a_hat && eaddr == HTABLE_WALK_TO_END)
   1621 		eaddr = kernelbase;
   1622 
   1623 	/*
   1624 	 * If we're coming in with a previous page table, search it first
   1625 	 * without doing an htable_lookup(), this should be frequent.
   1626 	 */
   1627 	if (prev) {
   1628 		ASSERT(prev->ht_busy > 0);
   1629 		ASSERT(prev->ht_vaddr <= va);
   1630 		l = prev->ht_level;
   1631 		if (va <= HTABLE_LAST_PAGE(prev)) {
   1632 			pte = htable_scan(prev, &va, eaddr);
   1633 
   1634 			if (PTE_ISPAGE(pte, l)) {
   1635 				*vaddr = va;
   1636 				*htp = prev;
   1637 				return (pte);
   1638 			}
   1639 		}
   1640 
   1641 		/*
   1642 		 * We found nothing in the htable provided by the caller,
   1643 		 * so fall through and do the full search
   1644 		 */
   1645 		htable_release(prev);
   1646 	}
   1647 
   1648 	/*
   1649 	 * Find the level of the largest pagesize used by this HAT.
   1650 	 */
   1651 	if (hat->hat_ism_pgcnt > 0) {
   1652 		max_mapped_level = mmu.umax_page_level;
   1653 	} else {
   1654 		max_mapped_level = 0;
   1655 		for (l = 1; l <= mmu.max_page_level; ++l)
   1656 			if (hat->hat_pages_mapped[l] != 0)
   1657 				max_mapped_level = l;
   1658 	}
   1659 
   1660 	while (va < eaddr && va >= *vaddr) {
   1661 		ASSERT(!IN_VA_HOLE(va));
   1662 
   1663 		/*
   1664 		 *  Find lowest table with any entry for given address.
   1665 		 */
   1666 		for (l = 0; l <= TOP_LEVEL(hat); ++l) {
   1667 			ht = htable_lookup(hat, va, l);
   1668 			if (ht != NULL) {
   1669 				pte = htable_scan(ht, &va, eaddr);
   1670 				if (PTE_ISPAGE(pte, l)) {
   1671 					*vaddr = va;
   1672 					*htp = ht;
   1673 					return (pte);
   1674 				}
   1675 				htable_release(ht);
   1676 				break;
   1677 			}
   1678 
   1679 			/*
   1680 			 * No htable at this level for the address. If there
   1681 			 * is no larger page size that could cover it, we can
   1682 			 * skip right to the start of the next page table.
   1683 			 */
   1684 			ASSERT(l < TOP_LEVEL(hat));
   1685 			if (l >= max_mapped_level) {
   1686 				va = NEXT_ENTRY_VA(va, l + 1);
   1687 				if (va >= eaddr)
   1688 					break;
   1689 			}
   1690 		}
   1691 	}
   1692 
   1693 	*vaddr = 0;
   1694 	*htp = NULL;
   1695 	return (0);
   1696 }
   1697 
   1698 /*
   1699  * Find the htable and page table entry index of the given virtual address
   1700  * with pagesize at or below given level.
   1701  * If not found returns NULL. When found, returns the htable, sets
   1702  * entry, and has a hold on the htable.
   1703  */
   1704 htable_t *
   1705 htable_getpte(
   1706 	struct hat *hat,
   1707 	uintptr_t vaddr,
   1708 	uint_t *entry,
   1709 	x86pte_t *pte,
   1710 	level_t level)
   1711 {
   1712 	htable_t	*ht;
   1713 	level_t		l;
   1714 	uint_t		e;
   1715 
   1716 	ASSERT(level <= mmu.max_page_level);
   1717 
   1718 	for (l = 0; l <= level; ++l) {
   1719 		ht = htable_lookup(hat, vaddr, l);
   1720 		if (ht == NULL)
   1721 			continue;
   1722 		e = htable_va2entry(vaddr, ht);
   1723 		if (entry != NULL)
   1724 			*entry = e;
   1725 		if (pte != NULL)
   1726 			*pte = x86pte_get(ht, e);
   1727 		return (ht);
   1728 	}
   1729 	return (NULL);
   1730 }
   1731 
   1732 /*
   1733  * Find the htable and page table entry index of the given virtual address.
   1734  * There must be a valid page mapped at the given address.
   1735  * If not found returns NULL. When found, returns the htable, sets
   1736  * entry, and has a hold on the htable.
   1737  */
   1738 htable_t *
   1739 htable_getpage(struct hat *hat, uintptr_t vaddr, uint_t *entry)
   1740 {
   1741 	htable_t	*ht;
   1742 	uint_t		e;
   1743 	x86pte_t	pte;
   1744 
   1745 	ht = htable_getpte(hat, vaddr, &e, &pte, mmu.max_page_level);
   1746 	if (ht == NULL)
   1747 		return (NULL);
   1748 
   1749 	if (entry)
   1750 		*entry = e;
   1751 
   1752 	if (PTE_ISPAGE(pte, ht->ht_level))
   1753 		return (ht);
   1754 	htable_release(ht);
   1755 	return (NULL);
   1756 }
   1757 
   1758 
   1759 void
   1760 htable_init()
   1761 {
   1762 	/*
   1763 	 * To save on kernel VA usage, we avoid debug information in 32 bit
   1764 	 * kernels.
   1765 	 */
   1766 #if defined(__amd64)
   1767 	int	kmem_flags = KMC_NOHASH;
   1768 #elif defined(__i386)
   1769 	int	kmem_flags = KMC_NOHASH | KMC_NODEBUG;
   1770 #endif
   1771 
   1772 	/*
   1773 	 * initialize kmem caches
   1774 	 */
   1775 	htable_cache = kmem_cache_create("htable_t",
   1776 	    sizeof (htable_t), 0, NULL, NULL,
   1777 	    htable_reap, NULL, hat_memload_arena, kmem_flags);
   1778 }
   1779 
   1780 /*
   1781  * get the pte index for the virtual address in the given htable's pagetable
   1782  */
   1783 uint_t
   1784 htable_va2entry(uintptr_t va, htable_t *ht)
   1785 {
   1786 	level_t	l = ht->ht_level;
   1787 
   1788 	ASSERT(va >= ht->ht_vaddr);
   1789 	ASSERT(va <= HTABLE_LAST_PAGE(ht));
   1790 	return ((va >> LEVEL_SHIFT(l)) & (HTABLE_NUM_PTES(ht) - 1));
   1791 }
   1792 
   1793 /*
   1794  * Given an htable and the index of a pte in it, return the virtual address
   1795  * of the page.
   1796  */
   1797 uintptr_t
   1798 htable_e2va(htable_t *ht, uint_t entry)
   1799 {
   1800 	level_t	l = ht->ht_level;
   1801 	uintptr_t va;
   1802 
   1803 	ASSERT(entry < HTABLE_NUM_PTES(ht));
   1804 	va = ht->ht_vaddr + ((uintptr_t)entry << LEVEL_SHIFT(l));
   1805 
   1806 	/*
   1807 	 * Need to skip over any VA hole in top level table
   1808 	 */
   1809 #if defined(__amd64)
   1810 	if (ht->ht_level == mmu.max_level && va >= mmu.hole_start)
   1811 		va += ((mmu.hole_end - mmu.hole_start) + 1);
   1812 #endif
   1813 
   1814 	return (va);
   1815 }
   1816 
/*
 * The code uses compare and swap instructions to read/write PTE's to
 * avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems.
 * On 64 bit systems an aligned 8 byte load/store will naturally be atomic.
 *
 * The combination of kpreempt_disable()/_enable() and the hci_mutex
 * is used to ensure that an interrupt won't overwrite a temporary mapping
 * while it's in use. If an interrupt thread tries to access a PTE, it will
 * yield briefly back to the pinned thread which holds the cpu's hci_mutex.
 */
   1827 void
   1828 x86pte_cpu_init(cpu_t *cpu)
   1829 {
   1830 	struct hat_cpu_info *hci;
   1831 
   1832 	hci = kmem_zalloc(sizeof (*hci), KM_SLEEP);
   1833 	mutex_init(&hci->hci_mutex, NULL, MUTEX_DEFAULT, NULL);
   1834 	cpu->cpu_hat_info = hci;
   1835 }
   1836 
   1837 void
   1838 x86pte_cpu_fini(cpu_t *cpu)
   1839 {
   1840 	struct hat_cpu_info *hci = cpu->cpu_hat_info;
   1841 
   1842 	kmem_free(hci, sizeof (*hci));
   1843 	cpu->cpu_hat_info = NULL;
   1844 }
   1845 
   1846 #ifdef __i386
   1847 /*
   1848  * On 32 bit kernels, loading a 64 bit PTE is a little tricky
   1849  */
   1850 x86pte_t
   1851 get_pte64(x86pte_t *ptr)
   1852 {
   1853 	volatile uint32_t *p = (uint32_t *)ptr;
   1854 	x86pte_t t;
   1855 
   1856 	ASSERT(mmu.pae_hat != 0);
   1857 	for (;;) {
   1858 		t = p[0];
   1859 		t |= (uint64_t)p[1] << 32;
   1860 		if ((t & 0xffffffff) == p[0])
   1861 			return (t);
   1862 	}
   1863 }
   1864 #endif /* __i386 */
   1865 
   1866 /*
   1867  * Disable preemption and establish a mapping to the pagetable with the
   1868  * given pfn. This is optimized for there case where it's the same
   1869  * pfn as we last used referenced from this CPU.
   1870  */
   1871 static x86pte_t *
   1872 x86pte_access_pagetable(htable_t *ht, uint_t index)
   1873 {
   1874 	/*
   1875 	 * VLP pagetables are contained in the hat_t
   1876 	 */
   1877 	if (ht->ht_flags & HTABLE_VLP)
   1878 		return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
   1879 	return (x86pte_mapin(ht->ht_pfn, index, ht));
   1880 }
   1881 
   1882 /*
   1883  * map the given pfn into the page table window.
   1884  */
   1885 /*ARGSUSED*/
   1886 x86pte_t *
   1887 x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
   1888 {
   1889 	x86pte_t *pteptr;
   1890 	x86pte_t pte = 0;
   1891 	x86pte_t newpte;
   1892 	int x;
   1893 
   1894 	ASSERT(pfn != PFN_INVALID);
   1895 
   1896 	if (!khat_running) {
   1897 		caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
   1898 		return (PT_INDEX_PTR(va, index));
   1899 	}
   1900 
   1901 	/*
   1902 	 * If kpm is available, use it.
   1903 	 */
   1904 	if (kpm_vbase)
   1905 		return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));
   1906 
   1907 	/*
   1908 	 * Disable preemption and grab the CPU's hci_mutex
   1909 	 */
   1910 	kpreempt_disable();
   1911 	ASSERT(CPU->cpu_hat_info != NULL);
   1912 	mutex_enter(&CPU->cpu_hat_info->hci_mutex);
   1913 	x = PWIN_TABLE(CPU->cpu_id);
   1914 	pteptr = (x86pte_t *)PWIN_PTE_VA(x);
   1915 #ifndef __xpv
   1916 	if (mmu.pae_hat)
   1917 		pte = *pteptr;
   1918 	else
   1919 		pte = *(x86pte32_t *)pteptr;
   1920 #endif
   1921 
   1922 	newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;
   1923 
   1924 	/*
   1925 	 * For hardware we can use a writable mapping.
   1926 	 */
   1927 #ifdef __xpv
   1928 	if (IN_XPV_PANIC())
   1929 #endif
   1930 		newpte |= PT_WRITABLE;
   1931 
   1932 	if (!PTE_EQUIV(newpte, pte)) {
   1933 
   1934 #ifdef __xpv
   1935 		if (!IN_XPV_PANIC()) {
   1936 			xen_map(newpte, PWIN_VA(x));
   1937 		} else
   1938 #endif
   1939 		{
   1940 			XPV_ALLOW_PAGETABLE_UPDATES();
   1941 			if (mmu.pae_hat)
   1942 				*pteptr = newpte;
   1943 			else
   1944 				*(x86pte32_t *)pteptr = newpte;
   1945 			XPV_DISALLOW_PAGETABLE_UPDATES();
   1946 			mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
   1947 		}
   1948 	}
   1949 	return (PT_INDEX_PTR(PWIN_VA(x), index));
   1950 }
   1951 
   1952 /*
   1953  * Release access to a page table.
   1954  */
   1955 static void
   1956 x86pte_release_pagetable(htable_t *ht)
   1957 {
   1958 	/*
   1959 	 * nothing to do for VLP htables
   1960 	 */
   1961 	if (ht->ht_flags & HTABLE_VLP)
   1962 		return;
   1963 
   1964 	x86pte_mapout();
   1965 }
   1966 
   1967 void
   1968 x86pte_mapout(void)
   1969 {
   1970 	if (kpm_vbase != NULL || !khat_running)
   1971 		return;
   1972 
   1973 	/*
   1974 	 * Drop the CPU's hci_mutex and restore preemption.
   1975 	 */
   1976 #ifdef __xpv
   1977 	if (!IN_XPV_PANIC()) {
   1978 		uintptr_t va;
   1979 
   1980 		/*
   1981 		 * We need to always clear the mapping in case a page
   1982 		 * that was once a page table page is ballooned out.
   1983 		 */
   1984 		va = (uintptr_t)PWIN_VA(PWIN_TABLE(CPU->cpu_id));
   1985 		(void) HYPERVISOR_update_va_mapping(va, 0,
   1986 		    UVMF_INVLPG | UVMF_LOCAL);
   1987 	}
   1988 #endif
   1989 	mutex_exit(&CPU->cpu_hat_info->hci_mutex);
   1990 	kpreempt_enable();
   1991 }
   1992 
   1993 /*
   1994  * Atomic retrieval of a pagetable entry
   1995  */
   1996 x86pte_t
   1997 x86pte_get(htable_t *ht, uint_t entry)
   1998 {
   1999 	x86pte_t	pte;
   2000 	x86pte_t	*ptep;
   2001 
   2002 	/*
   2003 	 * Be careful that loading PAE entries in 32 bit kernel is atomic.
   2004 	 */
   2005 	ASSERT(entry < mmu.ptes_per_table);
   2006 	ptep = x86pte_access_pagetable(ht, entry);
   2007 	pte = GET_PTE(ptep);
   2008 	x86pte_release_pagetable(ht);
   2009 	return (pte);
   2010 }
   2011 
   2012 /*
   2013  * Atomic unconditional set of a page table entry, it returns the previous
   2014  * value. For pre-existing mappings if the PFN changes, then we don't care
   2015  * about the old pte's REF / MOD bits. If the PFN remains the same, we leave
   2016  * the MOD/REF bits unchanged.
   2017  *
   2018  * If asked to overwrite a link to a lower page table with a large page
   2019  * mapping, this routine returns the special value of LPAGE_ERROR. This
   2020  * allows the upper HAT layers to retry with a smaller mapping size.
   2021  */
   2022 x86pte_t
   2023 x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr)
   2024 {
   2025 	x86pte_t	old;
   2026 	x86pte_t	prev;
   2027 	x86pte_t	*ptep;
   2028 	level_t		l = ht->ht_level;
   2029 	x86pte_t	pfn_mask = (l != 0) ? PT_PADDR_LGPG : PT_PADDR;
   2030 	x86pte_t	n;
   2031 	uintptr_t	addr = htable_e2va(ht, entry);
   2032 	hat_t		*hat = ht->ht_hat;
   2033 
   2034 	ASSERT(new != 0); /* don't use to invalidate a PTE, see x86pte_update */
   2035 	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
   2036 	if (ptr == NULL)
   2037 		ptep = x86pte_access_pagetable(ht, entry);
   2038 	else
   2039 		ptep = ptr;
   2040 
   2041 	/*
   2042 	 * Install the new PTE. If remapping the same PFN, then
   2043 	 * copy existing REF/MOD bits to new mapping.
   2044 	 */
   2045 	do {
   2046 		prev = GET_PTE(ptep);
   2047 		n = new;
   2048 		if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask))
   2049 			n |= prev & (PT_REF | PT_MOD);
   2050 
   2051 		/*
   2052 		 * Another thread may have installed this mapping already,
   2053 		 * flush the local TLB and be done.
   2054 		 */
   2055 		if (prev == n) {
   2056 			old = new;
   2057 #ifdef __xpv
   2058 			if (!IN_XPV_PANIC())
   2059 				xen_flush_va((caddr_t)addr);
   2060 			else
   2061 #endif
   2062 				mmu_tlbflush_entry((caddr_t)addr);
   2063 			goto done;
   2064 		}
   2065 
   2066 		/*
   2067 		 * Detect if we have a collision of installing a large
   2068 		 * page mapping where there already is a lower page table.
   2069 		 */
   2070 		if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) {
   2071 			old = LPAGE_ERROR;
   2072 			goto done;
   2073 		}
   2074 
   2075 		XPV_ALLOW_PAGETABLE_UPDATES();
   2076 		old = CAS_PTE(ptep, prev, n);
   2077 		XPV_DISALLOW_PAGETABLE_UPDATES();
   2078 	} while (old != prev);
   2079 
   2080 	/*
   2081 	 * Do a TLB demap if needed, ie. the old pte was valid.
   2082 	 *
   2083 	 * Note that a stale TLB writeback to the PTE here either can't happen
   2084 	 * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST
   2085 	 * mappings, but they were created with REF and MOD already set, so
   2086 	 * no stale writeback will happen.
   2087 	 *
   2088 	 * Segmap is the only place where remaps happen on the same pfn and for
   2089 	 * that we want to preserve the stale REF/MOD bits.
   2090 	 */
   2091 	if (old & PT_REF)
   2092 		hat_tlb_inval(hat, addr);
   2093 
   2094 done:
   2095 	if (ptr == NULL)
   2096 		x86pte_release_pagetable(ht);
   2097 	return (old);
   2098 }
   2099 
   2100 /*
   2101  * Atomic compare and swap of a page table entry. No TLB invalidates are done.
   2102  * This is used for links between pagetables of different levels.
   2103  * Note we always create these links with dirty/access set, so they should
   2104  * never change.
   2105  */
   2106 x86pte_t
   2107 x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
   2108 {
   2109 	x86pte_t	pte;
   2110 	x86pte_t	*ptep;
   2111 #ifdef __xpv
   2112 	/*
   2113 	 * We can't use writable pagetables for upper level tables, so fake it.
   2114 	 */
   2115 	mmu_update_t t[2];
   2116 	int cnt = 1;
   2117 	int count;
   2118 	maddr_t ma;
   2119 
   2120 	if (!IN_XPV_PANIC()) {
   2121 		ASSERT(!(ht->ht_flags & HTABLE_VLP));	/* no VLP yet */
   2122 		ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
   2123 		t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
   2124 		t[0].val = new;
   2125 
   2126 #if defined(__amd64)
   2127 		/*
   2128 		 * On the 64-bit hypervisor we need to maintain the user mode
   2129 		 * top page table too.
   2130 		 */
   2131 		if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {
   2132 			ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
   2133 			    ht->ht_hat->hat_user_ptable), entry));
   2134 			t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
   2135 			t[1].val = new;
   2136 			++cnt;
   2137 		}
   2138 #endif	/* __amd64 */
   2139 
   2140 		if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
   2141 			panic("HYPERVISOR_mmu_update() failed");
   2142 		ASSERT(count == cnt);
   2143 		return (old);
   2144 	}
   2145 #endif
   2146 	ptep = x86pte_access_pagetable(ht, entry);
   2147 	XPV_ALLOW_PAGETABLE_UPDATES();
   2148 	pte = CAS_PTE(ptep, old, new);
   2149 	XPV_DISALLOW_PAGETABLE_UPDATES();
   2150 	x86pte_release_pagetable(ht);
   2151 	return (pte);
   2152 }
   2153 
   2154 /*
   2155  * Invalidate a page table entry as long as it currently maps something that
   2156  * matches the value determined by expect.
   2157  *
   2158  * Also invalidates any TLB entries and returns the previous value of the PTE.
   2159  */
   2160 x86pte_t
   2161 x86pte_inval(
   2162 	htable_t *ht,
   2163 	uint_t entry,
   2164 	x86pte_t expect,
   2165 	x86pte_t *pte_ptr)
   2166 {
   2167 	x86pte_t	*ptep;
   2168 	x86pte_t	oldpte;
   2169 	x86pte_t	found;
   2170 
   2171 	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
   2172 	ASSERT(ht->ht_level <= mmu.max_page_level);
   2173 
   2174 	if (pte_ptr != NULL)
   2175 		ptep = pte_ptr;
   2176 	else
   2177 		ptep = x86pte_access_pagetable(ht, entry);
   2178 
   2179 #if defined(__xpv)
   2180 	/*
   2181 	 * If exit()ing just use HYPERVISOR_mmu_update(), as we can't be racing
   2182 	 * with anything else.
   2183 	 */
   2184 	if ((ht->ht_hat->hat_flags & HAT_FREEING) && !IN_XPV_PANIC()) {
   2185 		int count;
   2186 		mmu_update_t t[1];
   2187 		maddr_t ma;
   2188 
   2189 		oldpte = GET_PTE(ptep);
   2190 		if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
   2191 			goto done;
   2192 		ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
   2193 		t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
   2194 		t[0].val = 0;
   2195 		if (HYPERVISOR_mmu_update(t, 1, &count, DOMID_SELF))
   2196 			panic("HYPERVISOR_mmu_update() failed");
   2197 		ASSERT(count == 1);
   2198 		goto done;
   2199 	}
   2200 #endif /* __xpv */
   2201 
   2202 	/*
   2203 	 * Note that the loop is needed to handle changes due to h/w updating
   2204 	 * of PT_MOD/PT_REF.
   2205 	 */
   2206 	do {
   2207 		oldpte = GET_PTE(ptep);
   2208 		if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
   2209 			goto done;
   2210 		XPV_ALLOW_PAGETABLE_UPDATES();
   2211 		found = CAS_PTE(ptep, oldpte, 0);
   2212 		XPV_DISALLOW_PAGETABLE_UPDATES();
   2213 	} while (found != oldpte);
   2214 	if (oldpte & (PT_REF | PT_MOD))
   2215 		hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));
   2216 
   2217 done:
   2218 	if (pte_ptr == NULL)
   2219 		x86pte_release_pagetable(ht);
   2220 	return (oldpte);
   2221 }
   2222 
   2223 /*
   2224  * Change a page table entry af it currently matches the value in expect.
   2225  */
   2226 x86pte_t
   2227 x86pte_update(
   2228 	htable_t *ht,
   2229 	uint_t entry,
   2230 	x86pte_t expect,
   2231 	x86pte_t new)
   2232 {
   2233 	x86pte_t	*ptep;
   2234 	x86pte_t	found;
   2235 
   2236 	ASSERT(new != 0);
   2237 	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
   2238 	ASSERT(ht->ht_level <= mmu.max_page_level);
   2239 
   2240 	ptep = x86pte_access_pagetable(ht, entry);
   2241 	XPV_ALLOW_PAGETABLE_UPDATES();
   2242 	found = CAS_PTE(ptep, expect, new);
   2243 	XPV_DISALLOW_PAGETABLE_UPDATES();
   2244 	if (found == expect) {
   2245 		hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));
   2246 
   2247 		/*
   2248 		 * When removing write permission *and* clearing the
   2249 		 * MOD bit, check if a write happened via a stale
   2250 		 * TLB entry before the TLB shootdown finished.
   2251 		 *
   2252 		 * If it did happen, simply re-enable write permission and
   2253 		 * act like the original CAS failed.
   2254 		 */
   2255 		if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
   2256 		    (new & (PT_WRITABLE | PT_MOD)) == 0 &&
   2257 		    (GET_PTE(ptep) & PT_MOD) != 0) {
   2258 			do {
   2259 				found = GET_PTE(ptep);
   2260 				XPV_ALLOW_PAGETABLE_UPDATES();
   2261 				found =
   2262 				    CAS_PTE(ptep, found, found | PT_WRITABLE);
   2263 				XPV_DISALLOW_PAGETABLE_UPDATES();
   2264 			} while ((found & PT_WRITABLE) == 0);
   2265 		}
   2266 	}
   2267 	x86pte_release_pagetable(ht);
   2268 	return (found);
   2269 }
   2270 
   2271 #ifndef __xpv
   2272 /*
   2273  * Copy page tables - this is just a little more complicated than the
   2274  * previous routines. Note that it's also not atomic! It also is never
   2275  * used for VLP pagetables.
   2276  */
   2277 void
   2278 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
   2279 {
   2280 	caddr_t	src_va;
   2281 	caddr_t dst_va;
   2282 	size_t size;
   2283 	x86pte_t *pteptr;
   2284 	x86pte_t pte;
   2285 
   2286 	ASSERT(khat_running);
   2287 	ASSERT(!(dest->ht_flags & HTABLE_VLP));
   2288 	ASSERT(!(src->ht_flags & HTABLE_VLP));
   2289 	ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
   2290 	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
   2291 
   2292 	/*
   2293 	 * Acquire access to the CPU pagetable windows for the dest and source.
   2294 	 */
   2295 	dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
   2296 	if (kpm_vbase) {
   2297 		src_va = (caddr_t)
   2298 		    PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
   2299 	} else {
   2300 		uint_t x = PWIN_SRC(CPU->cpu_id);
   2301 
   2302 		/*
   2303 		 * Finish defining the src pagetable mapping
   2304 		 */
   2305 		src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
   2306 		pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
   2307 		pteptr = (x86pte_t *)PWIN_PTE_VA(x);
   2308 		if (mmu.pae_hat)
   2309 			*pteptr = pte;
   2310 		else
   2311 			*(x86pte32_t *)pteptr = pte;
   2312 		mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
   2313 	}
   2314 
   2315 	/*
   2316 	 * now do the copy
   2317 	 */
   2318 	size = count << mmu.pte_size_shift;
   2319 	bcopy(src_va, dst_va, size);
   2320 
   2321 	x86pte_release_pagetable(dest);
   2322 }
   2323 
   2324 #else /* __xpv */
   2325 
   2326 /*
   2327  * The hypervisor only supports writable pagetables at level 0, so we have
   2328  * to install these 1 by 1 the slow way.
   2329  */
   2330 void
   2331 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
   2332 {
   2333 	caddr_t	src_va;
   2334 	x86pte_t pte;
   2335 
   2336 	ASSERT(!IN_XPV_PANIC());
   2337 	src_va = (caddr_t)x86pte_access_pagetable(src, entry);
   2338 	while (count) {
   2339 		if (mmu.pae_hat)
   2340 			pte = *(x86pte_t *)src_va;
   2341 		else
   2342 			pte = *(x86pte32_t *)src_va;
   2343 		if (pte != 0) {
   2344 			set_pteval(pfn_to_pa(dest->ht_pfn), entry,
   2345 			    dest->ht_level, pte);
   2346 #ifdef __amd64
   2347 			if (dest->ht_level == mmu.max_level &&
   2348 			    htable_e2va(dest, entry) < HYPERVISOR_VIRT_END)
   2349 				set_pteval(
   2350 				    pfn_to_pa(dest->ht_hat->hat_user_ptable),
   2351 				    entry, dest->ht_level, pte);
   2352 #endif
   2353 		}
   2354 		--count;
   2355 		++entry;
   2356 		src_va += mmu.pte_size;
   2357 	}
   2358 	x86pte_release_pagetable(src);
   2359 }
   2360 #endif /* __xpv */
   2361 
   2362 /*
   2363  * Zero page table entries - Note this doesn't use atomic stores!
   2364  */
   2365 static void
   2366 x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
   2367 {
   2368 	caddr_t dst_va;
   2369 	size_t size;
   2370 #ifdef __xpv
   2371 	int x;
   2372 	x86pte_t newpte;
   2373 #endif
   2374 
   2375 	/*
   2376 	 * Map in the page table to be zeroed.
   2377 	 */
   2378 	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
   2379 	ASSERT(!(dest->ht_flags & HTABLE_VLP));
   2380 
   2381 	/*
   2382 	 * On the hypervisor we don't use x86pte_access_pagetable() since
   2383 	 * in this case the page is not pinned yet.
   2384 	 */
   2385 #ifdef __xpv
   2386 	if (kpm_vbase == NULL) {
   2387 		kpreempt_disable();
   2388 		ASSERT(CPU->cpu_hat_info != NULL);
   2389 		mutex_enter(&CPU->cpu_hat_info->hci_mutex);
   2390 		x = PWIN_TABLE(CPU->cpu_id);
   2391 		newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
   2392 		xen_map(newpte, PWIN_VA(x));
   2393 		dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
   2394 	} else
   2395 #endif
   2396 		dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
   2397 
   2398 	size = count << mmu.pte_size_shift;
   2399 	ASSERT(size > BLOCKZEROALIGN);
   2400 #ifdef __i386
   2401 	if ((x86_feature & X86_SSE2) == 0)
   2402 		bzero(dst_va, size);
   2403 	else
   2404 #endif
   2405 		block_zero_no_xmm(dst_va, size);
   2406 
   2407 #ifdef __xpv
   2408 	if (kpm_vbase == NULL) {
   2409 		xen_map(0, PWIN_VA(x));
   2410 		mutex_exit(&CPU->cpu_hat_info->hci_mutex);
   2411 		kpreempt_enable();
   2412 	} else
   2413 #endif
   2414 		x86pte_release_pagetable(dest);
   2415 }
   2416 
   2417 /*
   2418  * Called to ensure that all pagetables are in the system dump
   2419  */
   2420 void
   2421 hat_dump(void)
   2422 {
   2423 	hat_t *hat;
   2424 	uint_t h;
   2425 	htable_t *ht;
   2426 
   2427 	/*
   2428 	 * Dump all page tables
   2429 	 */
   2430 	for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
   2431 		for (h = 0; h < hat->hat_num_hash; ++h) {
   2432 			for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
   2433 				if ((ht->ht_flags & HTABLE_VLP) == 0)
   2434 					dump_page(ht->ht_pfn);
   2435 			}
   2436 		}
   2437 	}
   2438 }
   2439