Home | History | Annotate | Download | only in dboot
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 
     28 #include <sys/types.h>
     29 #include <sys/machparam.h>
     30 #include <sys/x86_archext.h>
     31 #include <sys/systm.h>
     32 #include <sys/mach_mmu.h>
     33 #include <sys/multiboot.h>
     34 
     35 #if defined(__xpv)
     36 
     37 #include <sys/hypervisor.h>
     38 uintptr_t xen_virt_start;
     39 pfn_t *mfn_to_pfn_mapping;
     40 
     41 #else /* !__xpv */
     42 
     43 extern multiboot_header_t mb_header;
     44 extern int have_cpuid(void);
     45 
     46 #endif /* !__xpv */
     47 
     48 #include <sys/inttypes.h>
     49 #include <sys/bootinfo.h>
     50 #include <sys/mach_mmu.h>
     51 #include <sys/boot_console.h>
     52 
     53 #include "dboot_asm.h"
     54 #include "dboot_printf.h"
     55 #include "dboot_xboot.h"
     56 #include "dboot_elfload.h"
     57 
     58 /*
     59  * This file contains code that runs to transition us from either a multiboot
     60  * compliant loader (32 bit non-paging) or a XPV domain loader to
     61  * regular kernel execution. Its task is to setup the kernel memory image
     62  * and page tables.
     63  *
     64  * The code executes as:
     65  *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
     66  * 	- a 32 bit program for the 32-bit PV hypervisor
     67  *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
     68  *
     69  * Under the PV hypervisor, we must create mappings for any memory beyond the
     70  * initial start of day allocation (such as the kernel itself).
     71  *
     72  * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since paging is not yet enabled there, all such memory is accessible.
     74  */
     75 
/*
 * Standard bits used in PTE (page level) and PTP (internal levels)
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * These are the target addresses (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is set up in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation: the next address the
 * allocator will try to hand out.
 */
static paddr_t next_avail_addr = 0;

#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 */
multiboot_info_t *mb_info;

#endif	/* __xpv */

/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 * Each array has a matching "_used" count of valid leading entries.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

/*
 * Boot modules handed to the kernel, and how many slots are filled.
 */
#define	MAX_MODULES (10)
struct boot_modules modules[MAX_MODULES];
uint_t modules_used = 0;

/*
 * Debugging flags: prom_debug gates the memlist/module dumps in this file,
 * map_debug gates the page table mapping messages.
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;
    169 
    170 /*
    171  * Either hypervisor-specific or grub-specific code builds the initial
    172  * memlists. This code does the sort/merge/link for final use.
    173  */
    174 static void
    175 sort_physinstall(void)
    176 {
    177 	int i;
    178 #if !defined(__xpv)
    179 	int j;
    180 	struct boot_memlist tmp;
    181 
    182 	/*
    183 	 * Now sort the memlists, in case they weren't in order.
    184 	 * Yeah, this is a bubble sort; small, simple and easy to get right.
    185 	 */
    186 	DBG_MSG("Sorting phys-installed list\n");
    187 	for (j = memlists_used - 1; j > 0; --j) {
    188 		for (i = 0; i < j; ++i) {
    189 			if (memlists[i].addr < memlists[i + 1].addr)
    190 				continue;
    191 			tmp = memlists[i];
    192 			memlists[i] = memlists[i + 1];
    193 			memlists[i + 1] = tmp;
    194 		}
    195 	}
    196 
    197 	/*
    198 	 * Merge any memlists that don't have holes between them.
    199 	 */
    200 	for (i = 0; i <= memlists_used - 1; ++i) {
    201 		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
    202 			continue;
    203 
    204 		if (prom_debug)
    205 			dboot_printf(
    206 			    "merging mem segs %" PRIx64 "...%" PRIx64
    207 			    " w/ %" PRIx64 "...%" PRIx64 "\n",
    208 			    memlists[i].addr,
    209 			    memlists[i].addr + memlists[i].size,
    210 			    memlists[i + 1].addr,
    211 			    memlists[i + 1].addr + memlists[i + 1].size);
    212 
    213 		memlists[i].size += memlists[i + 1].size;
    214 		for (j = i + 1; j < memlists_used - 1; ++j)
    215 			memlists[j] = memlists[j + 1];
    216 		--memlists_used;
    217 		DBG(memlists_used);
    218 		--i;	/* after merging we need to reexamine, so do this */
    219 	}
    220 #endif	/* __xpv */
    221 
    222 	if (prom_debug) {
    223 		dboot_printf("\nFinal memlists:\n");
    224 		for (i = 0; i < memlists_used; ++i) {
    225 			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
    226 			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
    227 		}
    228 	}
    229 
    230 	/*
    231 	 * link together the memlists with native size pointers
    232 	 */
    233 	memlists[0].next = 0;
    234 	memlists[0].prev = 0;
    235 	for (i = 1; i < memlists_used; ++i) {
    236 		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
    237 		memlists[i].next = 0;
    238 		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
    239 	}
    240 	bi->bi_phys_install = (native_ptr_t)memlists;
    241 	DBG(bi->bi_phys_install);
    242 }
    243 
    244 /*
    245  * build bios reserved memlists
    246  */
    247 static void
    248 build_rsvdmemlists(void)
    249 {
    250 	int i;
    251 
    252 	rsvdmemlists[0].next = 0;
    253 	rsvdmemlists[0].prev = 0;
    254 	for (i = 1; i < rsvdmemlists_used; ++i) {
    255 		rsvdmemlists[i].prev =
    256 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
    257 		rsvdmemlists[i].next = 0;
    258 		rsvdmemlists[i - 1].next =
    259 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
    260 	}
    261 	bi->bi_rsvdmem = (native_ptr_t)rsvdmemlists;
    262 	DBG(bi->bi_rsvdmem);
    263 }
    264 
    265 #if defined(__xpv)
    266 
    267 /*
    268  * halt on the hypervisor after a delay to drain console output
    269  */
    270 void
    271 dboot_halt(void)
    272 {
    273 	uint_t i = 10000;
    274 
    275 	while (--i)
    276 		(void) HYPERVISOR_yield();
    277 	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
    278 }
    279 
    280 /*
    281  * From a machine address, find the corresponding pseudo-physical address.
    282  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
    283  * Machine addresses are the real underlying hardware addresses.
    284  * These are needed for page table entries. Note that this routine is
    285  * poorly protected. A bad value of "ma" will cause a page fault.
    286  */
    287 paddr_t
    288 ma_to_pa(maddr_t ma)
    289 {
    290 	ulong_t pgoff = ma & MMU_PAGEOFFSET;
    291 	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
    292 	paddr_t pa;
    293 
    294 	if (pfn >= xen_info->nr_pages)
    295 		return (-(paddr_t)1);
    296 	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
    297 #ifdef DEBUG
    298 	if (ma != pa_to_ma(pa))
    299 		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
    300 		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
    301 #endif
    302 	return (pa);
    303 }
    304 
    305 /*
    306  * From a pseudo-physical address, find the corresponding machine address.
    307  */
    308 maddr_t
    309 pa_to_ma(paddr_t pa)
    310 {
    311 	pfn_t pfn;
    312 	ulong_t mfn;
    313 
    314 	pfn = mmu_btop(pa - mfn_base);
    315 	if (pa < mfn_base || pfn >= xen_info->nr_pages)
    316 		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
    317 	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
    318 #ifdef DEBUG
    319 	if (mfn_to_pfn_mapping[mfn] != pfn)
    320 		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
    321 		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
    322 #endif
    323 	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
    324 }
    325 
    326 #endif	/* __xpv */
    327 
    328 x86pte_t
    329 get_pteval(paddr_t table, uint_t index)
    330 {
    331 	if (pae_support)
    332 		return (((x86pte_t *)(uintptr_t)table)[index]);
    333 	return (((x86pte32_t *)(uintptr_t)table)[index]);
    334 }
    335 
    336 /*ARGSUSED*/
    337 void
    338 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
    339 {
    340 #ifdef __xpv
    341 	mmu_update_t t;
    342 	maddr_t mtable = pa_to_ma(table);
    343 	int retcnt;
    344 
    345 	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
    346 	t.val = pteval;
    347 	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
    348 		dboot_panic("HYPERVISOR_mmu_update() failed");
    349 #else /* __xpv */
    350 	uintptr_t tab_addr = (uintptr_t)table;
    351 
    352 	if (pae_support)
    353 		((x86pte_t *)tab_addr)[index] = pteval;
    354 	else
    355 		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
    356 	if (level == top_level && level == 2)
    357 		reload_cr3();
    358 #endif /* __xpv */
    359 }
    360 
    361 paddr_t
    362 make_ptable(x86pte_t *pteval, uint_t level)
    363 {
    364 	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
    365 
    366 	if (level == top_level && level == 2)
    367 		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
    368 	else
    369 		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
    370 
    371 #ifdef __xpv
    372 	/* Remove write permission to the new page table. */
    373 	if (HYPERVISOR_update_va_mapping(new_table,
    374 	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
    375 		dboot_panic("HYP_update_va_mapping error");
    376 #endif
    377 
    378 	if (map_debug)
    379 		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
    380 		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
    381 	return (new_table);
    382 }
    383 
    384 x86pte_t *
    385 map_pte(paddr_t table, uint_t index)
    386 {
    387 	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
    388 }
    389 
    390 /*
    391  * dump out the contents of page tables...
    392  */
    393 static void
    394 dump_tables(void)
    395 {
    396 	uint_t save_index[4];	/* for recursion */
    397 	char *save_table[4];	/* for recursion */
    398 	uint_t	l;
    399 	uint64_t va;
    400 	uint64_t pgsize;
    401 	int index;
    402 	int i;
    403 	x86pte_t pteval;
    404 	char *table;
    405 	static char *tablist = "\t\t\t";
    406 	char *tabs = tablist + 3 - top_level;
    407 	uint_t pa, pa1;
    408 #if !defined(__xpv)
    409 #define	maddr_t paddr_t
    410 #endif /* !__xpv */
    411 
    412 	dboot_printf("Finished pagetables:\n");
    413 	table = (char *)(uintptr_t)top_page_table;
    414 	l = top_level;
    415 	va = 0;
    416 	for (index = 0; index < ptes_per_table; ++index) {
    417 		pgsize = 1ull << shift_amt[l];
    418 		if (pae_support)
    419 			pteval = ((x86pte_t *)table)[index];
    420 		else
    421 			pteval = ((x86pte32_t *)table)[index];
    422 		if (pteval == 0)
    423 			goto next_entry;
    424 
    425 		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
    426 		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
    427 		pa = ma_to_pa(pteval & MMU_PAGEMASK);
    428 		dboot_printf(" physaddr=%x\n", pa);
    429 
    430 		/*
    431 		 * Don't try to walk hypervisor private pagetables
    432 		 */
    433 		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
    434 			save_table[l] = table;
    435 			save_index[l] = index;
    436 			--l;
    437 			index = -1;
    438 			table = (char *)(uintptr_t)
    439 			    ma_to_pa(pteval & MMU_PAGEMASK);
    440 			goto recursion;
    441 		}
    442 
    443 		/*
    444 		 * shorten dump for consecutive mappings
    445 		 */
    446 		for (i = 1; index + i < ptes_per_table; ++i) {
    447 			if (pae_support)
    448 				pteval = ((x86pte_t *)table)[index + i];
    449 			else
    450 				pteval = ((x86pte32_t *)table)[index + i];
    451 			if (pteval == 0)
    452 				break;
    453 			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
    454 			if (pa1 != pa + i * pgsize)
    455 				break;
    456 		}
    457 		if (i > 2) {
    458 			dboot_printf("%s...\n", tabs + l);
    459 			va += pgsize * (i - 2);
    460 			index += i - 2;
    461 		}
    462 next_entry:
    463 		va += pgsize;
    464 		if (l == 3 && index == 256)	/* VA hole */
    465 			va = 0xffff800000000000ull;
    466 recursion:
    467 		;
    468 	}
    469 	if (l < top_level) {
    470 		++l;
    471 		index = save_index[l];
    472 		table = save_table[l];
    473 		goto recursion;
    474 	}
    475 }
    476 
    477 /*
    478  * Add a mapping for the machine page at the given virtual address.
    479  */
    480 static void
    481 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
    482 {
    483 	x86pte_t *ptep;
    484 	x86pte_t pteval;
    485 
    486 	pteval = ma | pte_bits;
    487 	if (level > 0)
    488 		pteval |= PT_PAGESIZE;
    489 	if (va >= target_kernel_text && pge_support)
    490 		pteval |= PT_GLOBAL;
    491 
    492 	if (map_debug && ma != va)
    493 		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
    494 		    " pte=0x%" PRIx64 " l=%d\n",
    495 		    (uint64_t)ma, (uint64_t)va, pteval, level);
    496 
    497 #if defined(__xpv)
    498 	/*
    499 	 * see if we can avoid find_pte() on the hypervisor
    500 	 */
    501 	if (HYPERVISOR_update_va_mapping(va, pteval,
    502 	    UVMF_INVLPG | UVMF_LOCAL) == 0)
    503 		return;
    504 #endif
    505 
    506 	/*
    507 	 * Find the pte that will map this address. This creates any
    508 	 * missing intermediate level page tables
    509 	 */
    510 	ptep = find_pte(va, NULL, level, 0);
    511 
    512 	/*
    513 	 * When paravirtualized, we must use hypervisor calls to modify the
    514 	 * PTE, since paging is active. On real hardware we just write to
    515 	 * the pagetables which aren't in use yet.
    516 	 */
    517 #if defined(__xpv)
    518 	ptep = ptep;	/* shut lint up */
    519 	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
    520 		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
    521 		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
    522 		    (uint64_t)va, level, (uint64_t)ma, pteval);
    523 #else
    524 	if (va < 1024 * 1024)
    525 		pteval |= PT_NOCACHE;		/* for video RAM */
    526 	if (pae_support)
    527 		*ptep = pteval;
    528 	else
    529 		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
    530 #endif
    531 }
    532 
    533 /*
    534  * Add a mapping for the physical page at the given virtual address.
    535  */
    536 static void
    537 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
    538 {
    539 	map_ma_at_va(pa_to_ma(pa), va, level);
    540 }
    541 
    542 /*
    543  * This is called to remove start..end from the
    544  * possible range of PCI addresses.
    545  */
    546 const uint64_t pci_lo_limit = 0x00100000ul;
    547 const uint64_t pci_hi_limit = 0xfff00000ul;
    548 static void
    549 exclude_from_pci(uint64_t start, uint64_t end)
    550 {
    551 	int i;
    552 	int j;
    553 	struct boot_memlist *ml;
    554 
    555 	for (i = 0; i < pcimemlists_used; ++i) {
    556 		ml = &pcimemlists[i];
    557 
    558 		/* delete the entire range? */
    559 		if (start <= ml->addr && ml->addr + ml->size <= end) {
    560 			--pcimemlists_used;
    561 			for (j = i; j < pcimemlists_used; ++j)
    562 				pcimemlists[j] = pcimemlists[j + 1];
    563 			--i;	/* to revisit the new one at this index */
    564 		}
    565 
    566 		/* split a range? */
    567 		else if (ml->addr < start && end < ml->addr + ml->size) {
    568 
    569 			++pcimemlists_used;
    570 			if (pcimemlists_used > MAX_MEMLIST)
    571 				dboot_panic("too many pcimemlists");
    572 
    573 			for (j = pcimemlists_used - 1; j > i; --j)
    574 				pcimemlists[j] = pcimemlists[j - 1];
    575 			ml->size = start - ml->addr;
    576 
    577 			++ml;
    578 			ml->size = (ml->addr + ml->size) - end;
    579 			ml->addr = end;
    580 			++i;	/* skip on to next one */
    581 		}
    582 
    583 		/* cut memory off the start? */
    584 		else if (ml->addr < end && end < ml->addr + ml->size) {
    585 			ml->size -= end - ml->addr;
    586 			ml->addr = end;
    587 		}
    588 
    589 		/* cut memory off the end? */
    590 		else if (ml->addr <= start && start < ml->addr + ml->size) {
    591 			ml->size = start - ml->addr;
    592 		}
    593 	}
    594 }
    595 
/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 *
 * mmap_t is the memory map entry type consumed by build_pcimemlists(): the
 * size-less layout below under the hypervisor, the multiboot entry type
 * otherwise.
 */
#ifdef __xpv
typedef struct {
	uint32_t	base_addr_low;	/* low 32 bits of range start */
	uint32_t	base_addr_high;	/* high 32 bits of range start */
	uint32_t	length_low;	/* low 32 bits of range length */
	uint32_t	length_high;	/* high 32 bits of range length */
	uint32_t	type;		/* kind of memory in this range */
} mmap_t;
#else
typedef mb_memory_map_t mmap_t;
#endif
    611 
    612 static void
    613 build_pcimemlists(mmap_t *mem, int num)
    614 {
    615 	mmap_t *mmap;
    616 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
    617 	uint64_t start;
    618 	uint64_t end;
    619 	int i;
    620 
    621 	/*
    622 	 * initialize
    623 	 */
    624 	pcimemlists[0].addr = pci_lo_limit;
    625 	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
    626 	pcimemlists_used = 1;
    627 
    628 	/*
    629 	 * Fill in PCI memlists.
    630 	 */
    631 	for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
    632 		start = ((uint64_t)mmap->base_addr_high << 32) +
    633 		    mmap->base_addr_low;
    634 		end = start + ((uint64_t)mmap->length_high << 32) +
    635 		    mmap->length_low;
    636 
    637 		if (prom_debug)
    638 			dboot_printf("\ttype: %d %" PRIx64 "..%"
    639 			    PRIx64 "\n", mmap->type, start, end);
    640 
    641 		/*
    642 		 * page align start and end
    643 		 */
    644 		start = (start + page_offset) & ~page_offset;
    645 		end &= ~page_offset;
    646 		if (end <= start)
    647 			continue;
    648 
    649 		exclude_from_pci(start, end);
    650 	}
    651 
    652 	/*
    653 	 * Finish off the pcimemlist
    654 	 */
    655 	if (prom_debug) {
    656 		for (i = 0; i < pcimemlists_used; ++i) {
    657 			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
    658 			    PRIx64 "\n", pcimemlists[i].addr,
    659 			    pcimemlists[i].addr + pcimemlists[i].size);
    660 		}
    661 	}
    662 	pcimemlists[0].next = 0;
    663 	pcimemlists[0].prev = 0;
    664 	for (i = 1; i < pcimemlists_used; ++i) {
    665 		pcimemlists[i].prev =
    666 		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
    667 		pcimemlists[i].next = 0;
    668 		pcimemlists[i - 1].next =
    669 		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
    670 	}
    671 	bi->bi_pcimem = (native_ptr_t)pcimemlists;
    672 	DBG(bi->bi_pcimem);
    673 }
    674 
    675 #if defined(__xpv)
    676 /*
    677  * Initialize memory allocator stuff from hypervisor-supplied start info.
    678  *
    679  * There is 512KB of scratch area after the boot stack page.
    680  * We'll use that for everything except the kernel nucleus pages which are too
    681  * big to fit there and are allocated last anyway.
    682  */
    683 #define	MAXMAPS	100
    684 static mmap_t map_buffer[MAXMAPS];
    685 static void
    686 init_mem_alloc(void)
    687 {
    688 	int	local;	/* variables needed to find start region */
    689 	paddr_t	scratch_start;
    690 	xen_memory_map_t map;
    691 
    692 	DBG_MSG("Entered init_mem_alloc()\n");
    693 
    694 	/*
    695 	 * Free memory follows the stack. There's at least 512KB of scratch
    696 	 * space, rounded up to at least 2Mb alignment.  That should be enough
    697 	 * for the page tables we'll need to build.  The nucleus memory is
    698 	 * allocated last and will be outside the addressible range.  We'll
    699 	 * switch to new page tables before we unpack the kernel
    700 	 */
    701 	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
    702 	DBG(scratch_start);
    703 	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
    704 	DBG(scratch_end);
    705 
    706 	/*
    707 	 * For paranoia, leave some space between hypervisor data and ours.
    708 	 * Use 500 instead of 512.
    709 	 */
    710 	next_avail_addr = scratch_end - 500 * 1024;
    711 	DBG(next_avail_addr);
    712 
    713 	/*
    714 	 * The domain builder gives us at most 1 module
    715 	 */
    716 	DBG(xen_info->mod_len);
    717 	if (xen_info->mod_len > 0) {
    718 		DBG(xen_info->mod_start);
    719 		modules[0].bm_addr = xen_info->mod_start;
    720 		modules[0].bm_size = xen_info->mod_len;
    721 		bi->bi_module_cnt = 1;
    722 		bi->bi_modules = (native_ptr_t)modules;
    723 	} else {
    724 		bi->bi_module_cnt = 0;
    725 		bi->bi_modules = NULL;
    726 	}
    727 	DBG(bi->bi_module_cnt);
    728 	DBG(bi->bi_modules);
    729 
    730 	DBG(xen_info->mfn_list);
    731 	DBG(xen_info->nr_pages);
    732 	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
    733 	DBG(max_mem);
    734 
    735 	/*
    736 	 * Using pseudo-physical addresses, so only 1 memlist element
    737 	 */
    738 	memlists[0].addr = 0;
    739 	DBG(memlists[0].addr);
    740 	memlists[0].size = max_mem;
    741 	DBG(memlists[0].size);
    742 	memlists_used = 1;
    743 	DBG(memlists_used);
    744 
    745 	/*
    746 	 * finish building physinstall list
    747 	 */
    748 	sort_physinstall();
    749 
    750 	/*
    751 	 * build bios reserved memlists
    752 	 */
    753 	build_rsvdmemlists();
    754 
    755 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
    756 		/*
    757 		 * build PCI Memory list
    758 		 */
    759 		map.nr_entries = MAXMAPS;
    760 		/*LINTED: constant in conditional context*/
    761 		set_xen_guest_handle(map.buffer, map_buffer);
    762 		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
    763 			dboot_panic("getting XENMEM_machine_memory_map failed");
    764 		build_pcimemlists(map_buffer, map.nr_entries);
    765 	}
    766 }
    767 
    768 #else	/* !__xpv */
    769 
    770 /*
    771  * During memory allocation, find the highest address not used yet.
    772  */
    773 static void
    774 check_higher(paddr_t a)
    775 {
    776 	if (a < next_avail_addr)
    777 		return;
    778 	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
    779 	DBG(next_avail_addr);
    780 }
    781 
    782 /*
    783  * Walk through the module information finding the last used address.
    784  * The first available address will become the top level page table.
    785  *
    786  * We then build the phys_install memlist from the multiboot information.
    787  */
    788 static void
    789 init_mem_alloc(void)
    790 {
    791 	mb_memory_map_t *mmap;
    792 	mb_module_t *mod;
    793 	uint64_t start;
    794 	uint64_t end;
    795 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
    796 	extern char _end[];
    797 	int i;
    798 
    799 	DBG_MSG("Entered init_mem_alloc()\n");
    800 	DBG((uintptr_t)mb_info);
    801 
    802 	if (mb_info->mods_count > MAX_MODULES) {
    803 		dboot_panic("Too many modules (%d) -- the maximum is %d.",
    804 		    mb_info->mods_count, MAX_MODULES);
    805 	}
    806 	/*
    807 	 * search the modules to find the last used address
    808 	 * we'll build the module list while we're walking through here
    809 	 */
    810 	DBG_MSG("\nFinding Modules\n");
    811 	check_higher((paddr_t)&_end);
    812 	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
    813 	    i < mb_info->mods_count;
    814 	    ++mod, ++i) {
    815 		if (prom_debug) {
    816 			dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n",
    817 			    i, (char *)(mod->mod_name),
    818 			    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
    819 		}
    820 		modules[i].bm_addr = mod->mod_start;
    821 		if (mod->mod_start > mod->mod_end) {
    822 			dboot_panic("module[%d]: Invalid module start address "
    823 			    "(0x%llx)", i, (uint64_t)mod->mod_start);
    824 		}
    825 		modules[i].bm_size = mod->mod_end - mod->mod_start;
    826 
    827 		check_higher(mod->mod_end);
    828 	}
    829 	bi->bi_modules = (native_ptr_t)modules;
    830 	DBG(bi->bi_modules);
    831 	bi->bi_module_cnt = mb_info->mods_count;
    832 	DBG(bi->bi_module_cnt);
    833 
    834 	/*
    835 	 * Walk through the memory map from multiboot and build our memlist
    836 	 * structures. Note these will have native format pointers.
    837 	 */
    838 	DBG_MSG("\nFinding Memory Map\n");
    839 	DBG(mb_info->flags);
    840 	max_mem = 0;
    841 	if (mb_info->flags & 0x40) {
    842 		int cnt = 0;
    843 
    844 		DBG(mb_info->mmap_addr);
    845 		DBG(mb_info->mmap_length);
    846 		check_higher(mb_info->mmap_addr + mb_info->mmap_length);
    847 
    848 		for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
    849 		    (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
    850 		    mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
    851 		    + sizeof (mmap->size))) {
    852 			++cnt;
    853 			start = ((uint64_t)mmap->base_addr_high << 32) +
    854 			    mmap->base_addr_low;
    855 			end = start + ((uint64_t)mmap->length_high << 32) +
    856 			    mmap->length_low;
    857 
    858 			if (prom_debug)
    859 				dboot_printf("\ttype: %d %" PRIx64 "..%"
    860 				    PRIx64 "\n", mmap->type, start, end);
    861 
    862 			/*
    863 			 * page align start and end
    864 			 */
    865 			start = (start + page_offset) & ~page_offset;
    866 			end &= ~page_offset;
    867 			if (end <= start)
    868 				continue;
    869 
    870 			/*
    871 			 * only type 1 is usable RAM
    872 			 */
    873 			switch (mmap->type) {
    874 			case 1:
    875 				if (end > max_mem)
    876 					max_mem = end;
    877 				memlists[memlists_used].addr = start;
    878 				memlists[memlists_used].size = end - start;
    879 				++memlists_used;
    880 				if (memlists_used > MAX_MEMLIST)
    881 					dboot_panic("too many memlists");
    882 				break;
    883 			case 2:
    884 				rsvdmemlists[rsvdmemlists_used].addr = start;
    885 				rsvdmemlists[rsvdmemlists_used].size =
    886 				    end - start;
    887 				++rsvdmemlists_used;
    888 				if (rsvdmemlists_used > MAX_MEMLIST)
    889 					dboot_panic("too many rsvdmemlists");
    890 				break;
    891 			default:
    892 				continue;
    893 			}
    894 		}
    895 		build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
    896 	} else if (mb_info->flags & 0x01) {
    897 		DBG(mb_info->mem_lower);
    898 		memlists[memlists_used].addr = 0;
    899 		memlists[memlists_used].size = mb_info->mem_lower * 1024;
    900 		++memlists_used;
    901 		DBG(mb_info->mem_upper);
    902 		memlists[memlists_used].addr = 1024 * 1024;
    903 		memlists[memlists_used].size = mb_info->mem_upper * 1024;
    904 		++memlists_used;
    905 
    906 		/*
    907 		 * Old platform - assume I/O space at the end of memory.
    908 		 */
    909 		pcimemlists[0].addr =
    910 		    (mb_info->mem_upper * 1024) + (1024 * 1024);
    911 		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
    912 		pcimemlists[0].next = 0;
    913 		pcimemlists[0].prev = 0;
    914 		bi->bi_pcimem = (native_ptr_t)pcimemlists;
    915 		DBG(bi->bi_pcimem);
    916 	} else {
    917 		dboot_panic("No memory info from boot loader!!!");
    918 	}
    919 
    920 	check_higher(bi->bi_cmdline);
    921 
    922 	/*
    923 	 * finish processing the physinstall list
    924 	 */
    925 	sort_physinstall();
    926 
    927 	/*
    928 	 * build bios reserved mem lists
    929 	 */
    930 	build_rsvdmemlists();
    931 }
    932 #endif /* !__xpv */
    933 
    934 /*
    935  * Simple memory allocator, allocates aligned physical memory.
    936  * Note that startup_kernel() only allocates memory, never frees.
    937  * Memory usage just grows in an upward direction.
    938  */
    939 static void *
    940 do_mem_alloc(uint32_t size, uint32_t align)
    941 {
    942 	uint_t i;
    943 	uint64_t best;
    944 	uint64_t start;
    945 	uint64_t end;
    946 
    947 	/*
    948 	 * make sure size is a multiple of pagesize
    949 	 */
    950 	size = RNDUP(size, MMU_PAGESIZE);
    951 	next_avail_addr = RNDUP(next_avail_addr, align);
    952 
    953 	/*
    954 	 * XXPV fixme joe
    955 	 *
    956 	 * a really large bootarchive that causes you to run out of memory
    957 	 * may cause this to blow up
    958 	 */
    959 	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
    960 	best = (uint64_t)-size;
    961 	for (i = 0; i < memlists_used; ++i) {
    962 		start = memlists[i].addr;
    963 #if defined(__xpv)
    964 		start += mfn_base;
    965 #endif
    966 		end = start + memlists[i].size;
    967 
    968 		/*
    969 		 * did we find the desired address?
    970 		 */
    971 		if (start <= next_avail_addr && next_avail_addr + size <= end) {
    972 			best = next_avail_addr;
    973 			goto done;
    974 		}
    975 
    976 		/*
    977 		 * if not is this address the best so far?
    978 		 */
    979 		if (start > next_avail_addr && start < best &&
    980 		    RNDUP(start, align) + size <= end)
    981 			best = RNDUP(start, align);
    982 	}
    983 
    984 	/*
    985 	 * We didn't find exactly the address we wanted, due to going off the
    986 	 * end of a memory region. Return the best found memory address.
    987 	 */
    988 done:
    989 	next_avail_addr = best + size;
    990 #if defined(__xpv)
    991 	if (next_avail_addr > scratch_end)
    992 		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
    993 		    "0x%lx", (ulong_t)next_avail_addr,
    994 		    (ulong_t)scratch_end);
    995 #endif
    996 	(void) memset((void *)(uintptr_t)best, 0, size);
    997 	return ((void *)(uintptr_t)best);
    998 }
    999 
   1000 void *
   1001 mem_alloc(uint32_t size)
   1002 {
   1003 	return (do_mem_alloc(size, MMU_PAGESIZE));
   1004 }
   1005 
   1006 
   1007 /*
   1008  * Build page tables to map all of memory used so far as well as the kernel.
   1009  */
   1010 static void
   1011 build_page_tables(void)
   1012 {
   1013 	uint32_t psize;
   1014 	uint32_t level;
   1015 	uint32_t off;
   1016 	uint64_t start;
   1017 #if !defined(__xpv)
   1018 	uint32_t i;
   1019 	uint64_t end;
   1020 #endif	/* __xpv */
   1021 
   1022 	/*
   1023 	 * If we're on metal, we need to create the top level pagetable.
   1024 	 */
   1025 #if defined(__xpv)
   1026 	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
   1027 #else /* __xpv */
   1028 	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
   1029 #endif /* __xpv */
   1030 	DBG((uintptr_t)top_page_table);
   1031 
   1032 	/*
   1033 	 * Determine if we'll use large mappings for kernel, then map it.
   1034 	 */
   1035 	if (largepage_support) {
   1036 		psize = lpagesize;
   1037 		level = 1;
   1038 	} else {
   1039 		psize = MMU_PAGESIZE;
   1040 		level = 0;
   1041 	}
   1042 
   1043 	DBG_MSG("Mapping kernel\n");
   1044 	DBG(ktext_phys);
   1045 	DBG(target_kernel_text);
   1046 	DBG(ksize);
   1047 	DBG(psize);
   1048 	for (off = 0; off < ksize; off += psize)
   1049 		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
   1050 
   1051 	/*
   1052 	 * The kernel will need a 1 page window to work with page tables
   1053 	 */
   1054 	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
   1055 	DBG(bi->bi_pt_window);
   1056 	bi->bi_pte_to_pt_window =
   1057 	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
   1058 	DBG(bi->bi_pte_to_pt_window);
   1059 
   1060 #if defined(__xpv)
   1061 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
   1062 		/* If this is a domU we're done. */
   1063 		DBG_MSG("\nPage tables constructed\n");
   1064 		return;
   1065 	}
   1066 #endif /* __xpv */
   1067 
   1068 	/*
   1069 	 * We need 1:1 mappings for the lower 1M of memory to access
   1070 	 * BIOS tables used by a couple of drivers during boot.
   1071 	 *
   1072 	 * The following code works because our simple memory allocator
   1073 	 * only grows usage in an upwards direction.
   1074 	 *
   1075 	 * Note that by this point in boot some mappings for low memory
   1076 	 * may already exist because we've already accessed device in low
   1077 	 * memory.  (Specifically the video frame buffer and keyboard
   1078 	 * status ports.)  If we're booting on raw hardware then GRUB
   1079 	 * created these mappings for us.  If we're booting under a
   1080 	 * hypervisor then we went ahead and remapped these devices into
   1081 	 * memory allocated within dboot itself.
   1082 	 */
   1083 	if (map_debug)
   1084 		dboot_printf("1:1 map pa=0..1Meg\n");
   1085 	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
   1086 #if defined(__xpv)
   1087 		map_ma_at_va(start, start, 0);
   1088 #else /* __xpv */
   1089 		map_pa_at_va(start, start, 0);
   1090 #endif /* __xpv */
   1091 	}
   1092 
   1093 #if !defined(__xpv)
   1094 	for (i = 0; i < memlists_used; ++i) {
   1095 		start = memlists[i].addr;
   1096 
   1097 		end = start + memlists[i].size;
   1098 
   1099 		if (map_debug)
   1100 			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
   1101 			    start, end);
   1102 		while (start < end && start < next_avail_addr) {
   1103 			map_pa_at_va(start, start, 0);
   1104 			start += MMU_PAGESIZE;
   1105 		}
   1106 	}
   1107 #endif /* !__xpv */
   1108 
   1109 	DBG_MSG("\nPage tables constructed\n");
   1110 }
   1111 
   1112 #define	NO_MULTIBOOT	\
   1113 "multiboot is no longer used to boot the Solaris Operating System.\n\
   1114 The grub entry should be changed to:\n\
   1115 kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
   1116 module$ /platform/i86pc/$ISADIR/boot_archive\n\
   1117 See http://www.sun.com/msg/SUNOS-8000-AK for details.\n"
   1118 
   1119 /*
   1120  * startup_kernel has a pretty simple job. It builds pagetables which reflect
   1121  * 1:1 mappings for all memory in use. It then also adds mappings for
   1122  * the kernel nucleus at virtual address of target_kernel_text using large page
   1123  * mappings. The page table pages are also accessible at 1:1 mapped
   1124  * virtual addresses.
   1125  */
   1126 /*ARGSUSED*/
   1127 void
   1128 startup_kernel(void)
   1129 {
   1130 	char *cmdline;
   1131 	uintptr_t addr;
   1132 #if defined(__xpv)
   1133 	physdev_set_iopl_t set_iopl;
   1134 #endif /* __xpv */
   1135 
   1136 	/*
   1137 	 * At this point we are executing in a 32 bit real mode.
   1138 	 */
   1139 #if defined(__xpv)
   1140 	cmdline = (char *)xen_info->cmd_line;
   1141 #else /* __xpv */
   1142 	cmdline = (char *)mb_info->cmdline;
   1143 #endif /* __xpv */
   1144 
   1145 	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
   1146 	map_debug = (strstr(cmdline, "map_debug") != NULL);
   1147 
   1148 #if defined(__xpv)
   1149 	/*
   1150 	 * For dom0, before we initialize the console subsystem we'll
   1151 	 * need to enable io operations, so set I/O priveldge level to 1.
   1152 	 */
   1153 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
   1154 		set_iopl.iopl = 1;
   1155 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
   1156 	}
   1157 #endif /* __xpv */
   1158 
   1159 	bcons_init(cmdline);
   1160 	DBG_MSG("\n\nSolaris prekernel set: ");
   1161 	DBG_MSG(cmdline);
   1162 	DBG_MSG("\n");
   1163 
   1164 	if (strstr(cmdline, "multiboot") != NULL) {
   1165 		dboot_panic(NO_MULTIBOOT);
   1166 	}
   1167 
   1168 	/*
   1169 	 * boot info must be 16 byte aligned for 64 bit kernel ABI
   1170 	 */
   1171 	addr = (uintptr_t)boot_info;
   1172 	addr = (addr + 0xf) & ~0xf;
   1173 	bi = (struct xboot_info *)addr;
   1174 	DBG((uintptr_t)bi);
   1175 	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
   1176 
   1177 	/*
   1178 	 * Need correct target_kernel_text value
   1179 	 */
   1180 #if defined(_BOOT_TARGET_amd64)
   1181 	target_kernel_text = KERNEL_TEXT_amd64;
   1182 #elif defined(__xpv)
   1183 	target_kernel_text = KERNEL_TEXT_i386_xpv;
   1184 #else
   1185 	target_kernel_text = KERNEL_TEXT_i386;
   1186 #endif
   1187 	DBG(target_kernel_text);
   1188 
   1189 #if defined(__xpv)
   1190 
   1191 	/*
   1192 	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
   1193 	 */
   1194 
   1195 #if defined(_BOOT_TARGET_amd64)
   1196 	/*
   1197 	 * 64-bit hypervisor.
   1198 	 */
   1199 	amd64_support = 1;
   1200 	pae_support = 1;
   1201 
   1202 #else	/* _BOOT_TARGET_amd64 */
   1203 
   1204 	/*
   1205 	 * See if we are running on a PAE Hypervisor
   1206 	 */
   1207 	{
   1208 		xen_capabilities_info_t caps;
   1209 
   1210 		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
   1211 			dboot_panic("HYPERVISOR_xen_version(caps) failed");
   1212 		caps[sizeof (caps) - 1] = 0;
   1213 		if (prom_debug)
   1214 			dboot_printf("xen capabilities %s\n", caps);
   1215 		if (strstr(caps, "x86_32p") != NULL)
   1216 			pae_support = 1;
   1217 	}
   1218 
   1219 #endif	/* _BOOT_TARGET_amd64 */
   1220 	{
   1221 		xen_platform_parameters_t p;
   1222 
   1223 		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
   1224 			dboot_panic("HYPERVISOR_xen_version(parms) failed");
   1225 		DBG(p.virt_start);
   1226 		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
   1227 	}
   1228 
   1229 	/*
   1230 	 * The hypervisor loads stuff starting at 1Gig
   1231 	 */
   1232 	mfn_base = ONE_GIG;
   1233 	DBG(mfn_base);
   1234 
   1235 	/*
   1236 	 * enable writable page table mode for the hypervisor
   1237 	 */
   1238 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
   1239 	    VMASST_TYPE_writable_pagetables) < 0)
   1240 		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
   1241 
   1242 	/*
   1243 	 * check for NX support
   1244 	 */
   1245 	if (pae_support) {
   1246 		uint32_t eax = 0x80000000;
   1247 		uint32_t edx = get_cpuid_edx(&eax);
   1248 
   1249 		if (eax >= 0x80000001) {
   1250 			eax = 0x80000001;
   1251 			edx = get_cpuid_edx(&eax);
   1252 			if (edx & CPUID_AMD_EDX_NX)
   1253 				NX_support = 1;
   1254 		}
   1255 	}
   1256 
   1257 #if !defined(_BOOT_TARGET_amd64)
   1258 
   1259 	/*
   1260 	 * The 32-bit hypervisor uses segmentation to protect itself from
   1261 	 * guests. This means when a guest attempts to install a flat 4GB
   1262 	 * code or data descriptor the 32-bit hypervisor will protect itself
   1263 	 * by silently shrinking the segment such that if the guest attempts
   1264 	 * any access where the hypervisor lives a #gp fault is generated.
   1265 	 * The problem is that some applications expect a full 4GB flat
   1266 	 * segment for their current thread pointer and will use negative
   1267 	 * offset segment wrap around to access data. TLS support in linux
   1268 	 * brand is one example of this.
   1269 	 *
   1270 	 * The 32-bit hypervisor can catch the #gp fault in these cases
   1271 	 * and emulate the access without passing the #gp fault to the guest
   1272 	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
   1273 	 * Seems like this should have been the default.
   1274 	 * Either way, we want the hypervisor -- and not Solaris -- to deal
   1275 	 * to deal with emulating these accesses.
   1276 	 */
   1277 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
   1278 	    VMASST_TYPE_4gb_segments) < 0)
   1279 		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
   1280 #endif	/* !_BOOT_TARGET_amd64 */
   1281 
   1282 #else	/* __xpv */
   1283 
   1284 	/*
   1285 	 * use cpuid to enable MMU features
   1286 	 */
   1287 	if (have_cpuid()) {
   1288 		uint32_t eax, edx;
   1289 
   1290 		eax = 1;
   1291 		edx = get_cpuid_edx(&eax);
   1292 		if (edx & CPUID_INTC_EDX_PSE)
   1293 			largepage_support = 1;
   1294 		if (edx & CPUID_INTC_EDX_PGE)
   1295 			pge_support = 1;
   1296 		if (edx & CPUID_INTC_EDX_PAE)
   1297 			pae_support = 1;
   1298 
   1299 		eax = 0x80000000;
   1300 		edx = get_cpuid_edx(&eax);
   1301 		if (eax >= 0x80000001) {
   1302 			eax = 0x80000001;
   1303 			edx = get_cpuid_edx(&eax);
   1304 			if (edx & CPUID_AMD_EDX_LM)
   1305 				amd64_support = 1;
   1306 			if (edx & CPUID_AMD_EDX_NX)
   1307 				NX_support = 1;
   1308 		}
   1309 	} else {
   1310 		dboot_printf("cpuid not supported\n");
   1311 	}
   1312 #endif /* __xpv */
   1313 
   1314 
   1315 #if defined(_BOOT_TARGET_amd64)
   1316 	if (amd64_support == 0)
   1317 		dboot_panic("long mode not supported, rebooting");
   1318 	else if (pae_support == 0)
   1319 		dboot_panic("long mode, but no PAE; rebooting");
   1320 #else
   1321 	/*
   1322 	 * Allow the command line to over-ride use of PAE for 32 bit.
   1323 	 */
   1324 	if (strstr(cmdline, "disablePAE=true") != NULL) {
   1325 		pae_support = 0;
   1326 		NX_support = 0;
   1327 		amd64_support = 0;
   1328 	}
   1329 #endif
   1330 
   1331 	/*
   1332 	 * initialize the simple memory allocator
   1333 	 */
   1334 	init_mem_alloc();
   1335 
   1336 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
   1337 	/*
   1338 	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
   1339 	 */
   1340 	if (max_mem < FOUR_GIG && NX_support == 0)
   1341 		pae_support = 0;
   1342 #endif
   1343 
   1344 	/*
   1345 	 * configure mmu information
   1346 	 */
   1347 	if (pae_support) {
   1348 		shift_amt = shift_amt_pae;
   1349 		ptes_per_table = 512;
   1350 		pte_size = 8;
   1351 		lpagesize = TWO_MEG;
   1352 #if defined(_BOOT_TARGET_amd64)
   1353 		top_level = 3;
   1354 #else
   1355 		top_level = 2;
   1356 #endif
   1357 	} else {
   1358 		pae_support = 0;
   1359 		NX_support = 0;
   1360 		shift_amt = shift_amt_nopae;
   1361 		ptes_per_table = 1024;
   1362 		pte_size = 4;
   1363 		lpagesize = FOUR_MEG;
   1364 		top_level = 1;
   1365 	}
   1366 
   1367 	DBG(pge_support);
   1368 	DBG(NX_support);
   1369 	DBG(largepage_support);
   1370 	DBG(amd64_support);
   1371 	DBG(top_level);
   1372 	DBG(pte_size);
   1373 	DBG(ptes_per_table);
   1374 	DBG(lpagesize);
   1375 
   1376 #if defined(__xpv)
   1377 	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
   1378 #else
   1379 	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
   1380 #endif
   1381 
   1382 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
   1383 	/*
   1384 	 * For grub, copy kernel bits from the ELF64 file to final place.
   1385 	 */
   1386 	DBG_MSG("\nAllocating nucleus pages.\n");
   1387 	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
   1388 	if (ktext_phys == 0)
   1389 		dboot_panic("failed to allocate aligned kernel memory");
   1390 	if (dboot_elfload64(mb_header.load_addr) != 0)
   1391 		dboot_panic("failed to parse kernel ELF image, rebooting");
   1392 #endif
   1393 
   1394 	DBG(ktext_phys);
   1395 
   1396 	/*
   1397 	 * Allocate page tables.
   1398 	 */
   1399 	build_page_tables();
   1400 
   1401 	/*
   1402 	 * return to assembly code to switch to running kernel
   1403 	 */
   1404 	entry_addr_low = (uint32_t)target_kernel_text;
   1405 	DBG(entry_addr_low);
   1406 	bi->bi_use_largepage = largepage_support;
   1407 	bi->bi_use_pae = pae_support;
   1408 	bi->bi_use_pge = pge_support;
   1409 	bi->bi_use_nx = NX_support;
   1410 
   1411 #if defined(__xpv)
   1412 
   1413 	bi->bi_next_paddr = next_avail_addr - mfn_base;
   1414 	DBG(bi->bi_next_paddr);
   1415 	bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
   1416 	DBG(bi->bi_next_vaddr);
   1417 
   1418 	/*
   1419 	 * unmap unused pages in start area to make them available for DMA
   1420 	 */
   1421 	while (next_avail_addr < scratch_end) {
   1422 		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
   1423 		    0, UVMF_INVLPG | UVMF_LOCAL);
   1424 		next_avail_addr += MMU_PAGESIZE;
   1425 	}
   1426 
   1427 	bi->bi_xen_start_info = (uintptr_t)xen_info;
   1428 	DBG((uintptr_t)HYPERVISOR_shared_info);
   1429 	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
   1430 	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
   1431 
   1432 #else /* __xpv */
   1433 
   1434 	bi->bi_next_paddr = next_avail_addr;
   1435 	DBG(bi->bi_next_paddr);
   1436 	bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
   1437 	DBG(bi->bi_next_vaddr);
   1438 	bi->bi_mb_info = (uintptr_t)mb_info;
   1439 	bi->bi_top_page_table = (uintptr_t)top_page_table;
   1440 
   1441 #endif /* __xpv */
   1442 
   1443 	bi->bi_kseg_size = FOUR_MEG;
   1444 	DBG(bi->bi_kseg_size);
   1445 
   1446 #ifndef __xpv
   1447 	if (map_debug)
   1448 		dump_tables();
   1449 #endif
   1450 
   1451 	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
   1452 }
   1453