Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /* derived from netbsd's xen_machdep.c 1.1.2.1 */
     28 
     29 /*
     30  *
     31  * Copyright (c) 2004 Christian Limpach.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. This section intentionally left blank.
     43  * 4. The name of the author may not be used to endorse or promote products
     44  *    derived from this software without specific prior written permission.
     45  *
     46  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     47  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     48  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     49  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     50  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     51  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     52  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     53  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     54  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     55  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     56  */
     57 /*
     58  * Section 3 of the above license was updated in response to bug 6379571.
     59  */
     60 
     61 #include <sys/xpv_user.h>
     62 
     63 /* XXX 3.3. TODO remove this include */
     64 #include <xen/public/arch-x86/xen-mca.h>
     65 
     66 #include <sys/ctype.h>
     67 #include <sys/types.h>
     68 #include <sys/cmn_err.h>
     69 #include <sys/trap.h>
     70 #include <sys/segments.h>
     71 #include <sys/hypervisor.h>
     72 #include <sys/xen_mmu.h>
     73 #include <sys/machsystm.h>
     74 #include <sys/promif.h>
     75 #include <sys/bootconf.h>
     76 #include <sys/bootinfo.h>
     77 #include <sys/cpr.h>
     78 #include <sys/taskq.h>
     79 #include <sys/uadmin.h>
     80 #include <sys/evtchn_impl.h>
     81 #include <sys/archsystm.h>
     82 #include <xen/sys/xenbus_impl.h>
     83 #include <sys/mach_mmu.h>
     84 #include <vm/hat_i86.h>
     85 #include <sys/gnttab.h>
     86 #include <sys/reboot.h>
     87 #include <sys/stack.h>
     88 #include <sys/clock.h>
     89 #include <sys/bitmap.h>
     90 #include <sys/processor.h>
     91 #include <sys/xen_errno.h>
     92 #include <sys/xpv_panic.h>
     93 #include <sys/smp_impldefs.h>
     94 #include <sys/cpu.h>
     95 #include <sys/balloon_impl.h>
     96 #include <sys/ddi.h>
     97 
     98 #ifdef DEBUG
     99 #define	SUSPEND_DEBUG if (xen_suspend_debug) xen_printf
    100 #else
    101 #define	SUSPEND_DEBUG(...)
    102 #endif
    103 
/* Not referenced in the visible portion of this file. */
int cpr_debug;
/*
 * VCPUs that xen_suspend_domain() found in the P_POWEROFF state; members
 * are skipped by suspend_cpus() and resume_cpus().
 */
cpuset_t cpu_suspend_lost_set;
/*
 * When non-zero, SUSPEND_DEBUG messages are printed (DEBUG kernels only).
 * When set to 1, xen_hypervisor_supports_solaris() also accepts any
 * hypervisor version.
 */
static int xen_suspend_debug;

/* Physical CPU count and array, filled in by startup_xen_mca(). */
uint_t xen_phys_ncpus;
xen_mc_logical_cpu_t *xen_phys_cpus;
/* When non-zero, startup_xen_mca() logs the physical CPU list it obtained. */
int xen_physinfo_debug = 0;
    111 
/*
 * Determine helpful version information.
 *
 * (And leave copies in the data segment so we can look at them later
 * with e.g. kmdb.)
 */

/* Index into xenver[]: the snapshot taken at boot and the latest snapshot. */
typedef enum xen_version {
	XENVER_BOOT_IDX,
	XENVER_CURRENT_IDX
} xen_version_t;

/* One snapshot of hypervisor version data; filled in by xen_set_version(). */
struct xenver {
	ulong_t xv_major;		/* bits 31..16 of XENVER_version */
	ulong_t xv_minor;		/* bits 15..0 of XENVER_version */
	ulong_t xv_revision;		/* digit taken from xv_ver[1], else 0 */
	xen_extraversion_t xv_ver;	/* XENVER_extraversion result */
	ulong_t xv_is_xvm;		/* 1 if xv_ver contains "-xvm" */
	xen_changeset_info_t xv_chgset;	/* XENVER_changeset result */
	xen_compile_info_t xv_build;	/* XENVER_compile_info result */
	xen_capabilities_info_t xv_caps; /* XENVER_capabilities result */
} xenver[2];

/* Field accessors for the boot-time and the current snapshot. */
#define	XENVER_BOOT(m)	(xenver[XENVER_BOOT_IDX].m)
#define	XENVER_CURRENT(m)	(xenver[XENVER_CURRENT_IDX].m)
    137 
    138 /*
    139  * Update the xenver data. We maintain two copies, boot and
    140  * current. If we are setting the boot, then also set current.
    141  */
    142 static void
    143 xen_set_version(xen_version_t idx)
    144 {
    145 	ulong_t ver;
    146 
    147 	bzero(&xenver[idx], sizeof (xenver[idx]));
    148 
    149 	ver = HYPERVISOR_xen_version(XENVER_version, 0);
    150 
    151 	xenver[idx].xv_major = BITX(ver, 31, 16);
    152 	xenver[idx].xv_minor = BITX(ver, 15, 0);
    153 
    154 	(void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver);
    155 
    156 	/*
    157 	 * The revision is buried in the extraversion information that is
    158 	 * maintained by the hypervisor. For our purposes we expect that
    159 	 * the revision number is:
    160 	 * 	- the second character in the extraversion information
    161 	 *	- one character long
    162 	 *	- numeric digit
    163 	 * If it isn't then we can't extract the revision and we leave it
    164 	 * set to 0.
    165 	 */
    166 	if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1]))
    167 		xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0';
    168 	else
    169 		cmn_err(CE_WARN, "Cannot extract revision on this hypervisor "
    170 		    "version: v%s, unexpected version format",
    171 		    xenver[idx].xv_ver);
    172 
    173 	xenver[idx].xv_is_xvm = 0;
    174 
    175 	if (strstr(xenver[idx].xv_ver, "-xvm") != NULL)
    176 		xenver[idx].xv_is_xvm = 1;
    177 
    178 	(void) HYPERVISOR_xen_version(XENVER_changeset,
    179 	    &xenver[idx].xv_chgset);
    180 
    181 	(void) HYPERVISOR_xen_version(XENVER_compile_info,
    182 	    &xenver[idx].xv_build);
    183 	/*
    184 	 * Capabilities are a set of space separated ascii strings
    185 	 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64'
    186 	 */
    187 	(void) HYPERVISOR_xen_version(XENVER_capabilities,
    188 	    &xenver[idx].xv_caps);
    189 
    190 	cmn_err(CE_CONT, "?v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major,
    191 	    xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset);
    192 
    193 	if (idx == XENVER_BOOT_IDX)
    194 		bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX],
    195 		    sizeof (xenver[XENVER_BOOT_IDX]));
    196 }
    197 
    198 typedef enum xen_hypervisor_check {
    199 	XEN_RUN_CHECK,
    200 	XEN_SUSPEND_CHECK
    201 } xen_hypervisor_check_t;
    202 
    203 /*
    204  * To run the hypervisor must be 3.0.4 or better. To suspend/resume
    205  * we need 3.0.4 or better and if it is 3.0.4. then it must be provided
    206  * by the Solaris xVM project.
    207  * Checking can be disabled for testing purposes by setting the
    208  * xen_suspend_debug variable.
    209  */
    210 static int
    211 xen_hypervisor_supports_solaris(xen_hypervisor_check_t check)
    212 {
    213 	if (xen_suspend_debug == 1)
    214 		return (1);
    215 	if (XENVER_CURRENT(xv_major) < 3)
    216 		return (0);
    217 	if (XENVER_CURRENT(xv_major) > 3)
    218 		return (1);
    219 	if (XENVER_CURRENT(xv_minor) > 0)
    220 		return (1);
    221 	if (XENVER_CURRENT(xv_revision) < 4)
    222 		return (0);
    223 	if (check == XEN_SUSPEND_CHECK && XENVER_CURRENT(xv_revision) == 4 &&
    224 	    !XENVER_CURRENT(xv_is_xvm))
    225 		return (0);
    226 
    227 	return (1);
    228 }
    229 
    230 /*
    231  * If the hypervisor is -xvm, or 3.1.2 or higher, we don't need the
    232  * workaround.
    233  */
    234 static void
    235 xen_pte_workaround(void)
    236 {
    237 #if defined(__amd64)
    238 	extern int pt_kern;
    239 
    240 	if (XENVER_CURRENT(xv_major) != 3)
    241 		return;
    242 	if (XENVER_CURRENT(xv_minor) > 1)
    243 		return;
    244 	if (XENVER_CURRENT(xv_minor) == 1 &&
    245 	    XENVER_CURRENT(xv_revision) > 1)
    246 		return;
    247 	if (XENVER_CURRENT(xv_is_xvm))
    248 		return;
    249 
    250 	pt_kern = PT_USER;
    251 #endif
    252 }
    253 
    254 void
    255 xen_set_callback(void (*func)(void), uint_t type, uint_t flags)
    256 {
    257 	struct callback_register cb;
    258 
    259 	bzero(&cb, sizeof (cb));
    260 #if defined(__amd64)
    261 	cb.address = (ulong_t)func;
    262 #elif defined(__i386)
    263 	cb.address.cs = KCS_SEL;
    264 	cb.address.eip = (ulong_t)func;
    265 #endif
    266 	cb.type = type;
    267 	cb.flags = flags;
    268 
    269 	/*
    270 	 * XXPV always ignore return value for NMI
    271 	 */
    272 	if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 &&
    273 	    type != CALLBACKTYPE_nmi)
    274 		panic("HYPERVISOR_callback_op failed");
    275 }
    276 
/*
 * Register this domain's hypervisor callbacks via xen_set_callback():
 * the event handler, the failsafe handler, the NMI handler and, on amd64
 * only, the system call handler.
 */
void
xen_init_callbacks(void)
{
	/*
	 * register event (interrupt) handler.
	 */
	xen_set_callback(xen_callback, CALLBACKTYPE_event, 0);

	/*
	 * failsafe handler; registered with CALLBACKF_mask_events.
	 */
	xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe,
	    CALLBACKF_mask_events);

	/*
	 * NMI handler.  xen_set_callback() ignores a registration failure
	 * for this callback type.
	 */
	xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0);

	/*
	 * system call handler
	 * XXPV move to init_cpu_syscall?
	 */
#if defined(__amd64)
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);
#endif	/* __amd64 */
}
    305 
    306 
    307 /*
    308  * cmn_err() followed by a 1/4 second delay; this gives the
    309  * logging service a chance to flush messages and helps avoid
    310  * intermixing output from prom_printf().
    311  * XXPV: doesn't exactly help us on UP though.
    312  */
    313 /*PRINTFLIKE2*/
    314 void
    315 cpr_err(int ce, const char *fmt, ...)
    316 {
    317 	va_list adx;
    318 
    319 	va_start(adx, fmt);
    320 	vcmn_err(ce, fmt, adx);
    321 	va_end(adx);
    322 	drv_usecwait(MICROSEC >> 2);
    323 }
    324 
    325 void
    326 xen_suspend_devices(void)
    327 {
    328 	int rc;
    329 
    330 	SUSPEND_DEBUG("xen_suspend_devices\n");
    331 
    332 	if ((rc = cpr_suspend_devices(ddi_root_node())) != 0)
    333 		panic("failed to suspend devices: %d", rc);
    334 }
    335 
    336 void
    337 xen_resume_devices(void)
    338 {
    339 	int rc;
    340 
    341 	SUSPEND_DEBUG("xen_resume_devices\n");
    342 
    343 	if ((rc = cpr_resume_devices(ddi_root_node(), 0)) != 0)
    344 		panic("failed to resume devices: %d", rc);
    345 }
    346 
/*
 * The list of mfn pages is out of date.  Recompute it.
 *
 * Three levels are refreshed here:
 *   mfn_list_pages[j]       - MFN backing the j'th page of mfn_list
 *   mfn_list_pages_page[i]  - MFN backing the i'th page of mfn_list_pages
 *   shared_info frame list  - MFN backing mfn_list_pages_page itself
 */
static void
rebuild_mfn_list(void)
{
	int i = 0;
	size_t sz;
	size_t off;
	pfn_t pfn;

	SUSPEND_DEBUG("rebuild_mfn_list\n");

	/* Size of mfn_list in bytes, rounded up to a whole number of pages. */
	sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK;

	/* Walk mfn_list one page at a time; j is the page index. */
	for (off = 0; off < sz; off += MMU_PAGESIZE) {
		size_t j = mmu_btop(off);
		/*
		 * When entry j sits at a page-aligned offset within
		 * mfn_list_pages, it starts a new page of that array, so
		 * record that page's MFN in the next top-level slot.
		 */
		if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
			pfn = hat_getpfnum(kas.a_hat,
			    (caddr_t)&mfn_list_pages[j]);
			mfn_list_pages_page[i++] = pfn_to_mfn(pfn);
		}

		/* Record the MFN backing this page of mfn_list. */
		pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off);
		mfn_list_pages[j] = pfn_to_mfn(pfn);
	}

	/* Publish the MFN of the top-level page in the shared info page. */
	pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page);
	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list
	    = pfn_to_mfn(pfn);
}
    378 
    379 static void
    380 suspend_cpus(void)
    381 {
    382 	int i;
    383 
    384 	SUSPEND_DEBUG("suspend_cpus\n");
    385 
    386 	mp_enter_barrier();
    387 
    388 	for (i = 1; i < ncpus; i++) {
    389 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
    390 			SUSPEND_DEBUG("xen_vcpu_down %d\n", i);
    391 			(void) xen_vcpu_down(i);
    392 		}
    393 
    394 		mach_cpucontext_reset(cpu[i]);
    395 	}
    396 }
    397 
    398 static void
    399 resume_cpus(void)
    400 {
    401 	int i;
    402 
    403 	for (i = 1; i < ncpus; i++) {
    404 		if (cpu[i] == NULL)
    405 			continue;
    406 
    407 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
    408 			SUSPEND_DEBUG("xen_vcpu_up %d\n", i);
    409 			mach_cpucontext_restore(cpu[i]);
    410 			(void) xen_vcpu_up(i);
    411 		}
    412 	}
    413 
    414 	mp_leave_barrier();
    415 }
    416 
/*
 * Top level routine to direct suspend/resume of a domain.
 *
 * The sequence, in order:
 *   1. Refuse (with a warning) if the hypervisor version check fails.
 *   2. Suspend devices and xenbus, and note the MFN of the start info page.
 *   3. Take cpu_lock, bind to CPU 0, disable preemption, start the
 *      migration and quiesce the other CPUs.
 *   4. Take ec_lock, suspend event channels and grant tables, disable
 *      interrupts and suspend time.
 *   5. Record powered-off VCPUs, convert the store and console MFNs to
 *      PFNs, unmap the shared info page and call HYPERVISOR_suspend().
 *   6. On return from the hypercall, remap the shared info page and undo
 *      steps 4, 3 and 2 in reverse order.
 *   7. Refresh the cached hypervisor version and warn if it is too old.
 *
 * Failures between steps 5 and 6 cannot be recovered from; they are
 * reported with prom_printf() where possible and the domain is crashed via
 * HYPERVISOR_shutdown(SHUTDOWN_crash).
 */
void
xen_suspend_domain(void)
{
	extern void rtcsync(void);
	extern hrtime_t hres_last_tick;
	mfn_t start_info_mfn;
	ulong_t flags;
	pfn_t pfn;
	int i;

	/*
	 * Check that we are happy to suspend on this hypervisor.
	 */
	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) {
		cpr_err(CE_WARN, "Cannot suspend on this hypervisor "
		    "version: v%lu.%lu%s, need at least version v3.0.4 or "
		    "-xvm based hypervisor", XENVER_CURRENT(xv_major),
		    XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver));
		return;
	}

	/*
	 * XXPV - Are we definitely OK to suspend by the time we've connected
	 * the handler?
	 */

	cpr_err(CE_NOTE, "Domain suspending for save/migrate");

	SUSPEND_DEBUG("xen_suspend_domain\n");

	/*
	 * suspend interrupts and devices
	 * XXPV - we use suspend/resume for both save/restore domains (like sun
	 * cpr) and for migration.  Would be nice to know the difference if
	 * possible.  For save/restore where down time may be a long time, we
	 * may want to do more of the things that cpr does.  (i.e. notify user
	 * processes, shrink memory footprint for faster restore, etc.)
	 */
	xen_suspend_devices();
	SUSPEND_DEBUG("xenbus_suspend\n");
	xenbus_suspend();

	/*
	 * Look up the MFN of the start info page now; it is the argument
	 * passed to HYPERVISOR_suspend() further down.
	 */
	pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info);
	start_info_mfn = pfn_to_mfn(pfn);

	/*
	 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe
	 * wrt xenbus being suspended here?
	 */
	mutex_enter(&cpu_lock);

	/*
	 * Suspend must be done on vcpu 0, as no context for other CPUs is
	 * saved.
	 *
	 * XXPV - add to taskq API ?
	 */
	thread_affinity_set(curthread, 0);
	kpreempt_disable();

	SUSPEND_DEBUG("xen_start_migrate\n");
	xen_start_migrate();
	if (ncpus > 1)
		suspend_cpus();

	/*
	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
	 * any holder would have dropped it to get through suspend_cpus().
	 */
	mutex_enter(&ec_lock);

	/*
	 * From here on in, we can't take locks.
	 */
	SUSPEND_DEBUG("ec_suspend\n");
	ec_suspend();
	SUSPEND_DEBUG("gnttab_suspend\n");
	gnttab_suspend();

	/* Interrupts stay disabled until intr_restore(flags) after resume. */
	flags = intr_clear();

	xpv_time_suspend();

	/*
	 * Currently, the hypervisor incorrectly fails to bring back
	 * powered-down VCPUs.  Thus we need to record any powered-down VCPUs
	 * to prevent any attempts to operate on them.  But we have to do this
	 * *after* the very first time we do ec_suspend().
	 */
	for (i = 1; i < ncpus; i++) {
		if (cpu[i] == NULL)
			continue;

		if (cpu_get_state(cpu[i]) == P_POWEROFF)
			CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i);
	}

	/*
	 * The dom0 save/migrate code doesn't automatically translate
	 * these into PFNs, but expects them to be, so we do it here.
	 * We don't use mfn_to_pfn() because so many OS services have
	 * been disabled at this point.
	 */
	xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn];
	xen_info->console.domU.mfn =
	    mfn_to_pfn_mapping[xen_info->console.domU.mfn];

	/* Event upcalls must be masked on this CPU before suspending. */
	if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) {
		prom_printf("xen_suspend_domain(): "
		    "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	/* Remove the mapping of the shared info page. */
	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    0, UVMF_INVLPG)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_update_va_mapping() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	SUSPEND_DEBUG("HYPERVISOR_suspend\n");

	/*
	 * At this point we suspend and sometime later resume.
	 */
	if (HYPERVISOR_suspend(start_info_mfn)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_suspend() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	/*
	 * Point HYPERVISOR_shared_info to its new value.
	 */
	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE,
	    UVMF_INVLPG))
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);

	/* The domain must come back with the same number of pages. */
	if (xen_info->nr_pages != mfn_count) {
		prom_printf("xen_suspend_domain(): number of pages"
		    " changed, was 0x%lx, now 0x%lx\n", mfn_count,
		    xen_info->nr_pages);
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	xpv_time_resume();

	cached_max_mfn = 0;

	SUSPEND_DEBUG("gnttab_resume\n");
	gnttab_resume();

	/* XXPV: add a note that this must be lockless. */
	SUSPEND_DEBUG("ec_resume\n");
	ec_resume();

	intr_restore(flags);

	if (ncpus > 1)
		resume_cpus();

	mutex_exit(&ec_lock);
	xen_end_migrate();
	mutex_exit(&cpu_lock);

	/*
	 * Now we can take locks again.
	 */

	/*
	 * Force the tick value used for tv_nsec in hres_tick() to be up to
	 * date. rtcsync() will reset the hrestime value appropriately.
	 */
	hres_last_tick = xpv_gethrtime();

	/*
	 * XXPV: we need to have resumed the CPUs since this takes locks, but
	 * can remote CPUs see bad state? Presumably yes. Should probably nest
	 * taking of todlock inside of cpu_lock, or vice versa, then provide an
	 * unlocked version.  Probably need to call clkinitf to reset cpu freq
	 * and re-calibrate if we migrated to a different speed cpu.  Also need
	 * to make a (re)init_cpu_info call to update processor info structs
	 * and device tree info.  That remains to be written at the moment.
	 */
	rtcsync();

	rebuild_mfn_list();

	SUSPEND_DEBUG("xenbus_resume\n");
	xenbus_resume();
	SUSPEND_DEBUG("xenbus_resume_devices\n");
	xen_resume_devices();

	/* Undo the CPU 0 binding and preemption disable taken before suspend. */
	thread_affinity_clear(curthread);
	kpreempt_enable();

	SUSPEND_DEBUG("finished xen_suspend_domain\n");

	/*
	 * We have restarted our suspended domain, update the hypervisor
	 * details. NB: This must be done at the end of this function,
	 * since we need the domain to be completely resumed before
	 * these functions will work correctly.
	 */
	xen_set_version(XENVER_CURRENT_IDX);

	/*
	 * We can check and report a warning, but we don't stop the
	 * process.
	 */
	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0)
		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
		    "but need at least version v3.0.4",
		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
		    XENVER_CURRENT(xv_ver));

	cmn_err(CE_NOTE, "domain restore/migrate completed");
}
    639 
    640 /*ARGSUSED*/
    641 int
    642 xen_debug_handler(void *arg)
    643 {
    644 	debug_enter("External debug event received");
    645 
    646 	/*
    647 	 * If we've not got KMDB loaded, output some stuff difficult to capture
    648 	 * from a domain core.
    649 	 */
    650 	if (!(boothowto & RB_DEBUG)) {
    651 		shared_info_t *si = HYPERVISOR_shared_info;
    652 		int i;
    653 
    654 		prom_printf("evtchn_pending [ ");
    655 		for (i = 0; i < 8; i++)
    656 			prom_printf("%lx ", si->evtchn_pending[i]);
    657 		prom_printf("]\nevtchn_mask [ ");
    658 		for (i = 0; i < 8; i++)
    659 			prom_printf("%lx ", si->evtchn_mask[i]);
    660 		prom_printf("]\n");
    661 
    662 		for (i = 0; i < ncpus; i++) {
    663 			vcpu_info_t *vcpu = &si->vcpu_info[i];
    664 			if (cpu[i] == NULL)
    665 				continue;
    666 			prom_printf("CPU%d pending %d mask %d sel %lx\n",
    667 			    i, vcpu->evtchn_upcall_pending,
    668 			    vcpu->evtchn_upcall_mask,
    669 			    vcpu->evtchn_pending_sel);
    670 		}
    671 	}
    672 
    673 	return (0);
    674 }
    675 
    676 /*ARGSUSED*/
    677 static void
    678 xen_sysrq_handler(struct xenbus_watch *watch, const char **vec,
    679     unsigned int len)
    680 {
    681 	xenbus_transaction_t xbt;
    682 	char key = '\0';
    683 	int ret;
    684 
    685 retry:
    686 	if (xenbus_transaction_start(&xbt)) {
    687 		cmn_err(CE_WARN, "failed to start sysrq transaction");
    688 		return;
    689 	}
    690 
    691 	if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) {
    692 		/*
    693 		 * ENOENT happens in response to our own xenbus_rm.
    694 		 * XXPV - this happens spuriously on boot?
    695 		 */
    696 		if (ret != ENOENT)
    697 			cmn_err(CE_WARN, "failed to read sysrq: %d", ret);
    698 		goto out;
    699 	}
    700 
    701 	if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) {
    702 		cmn_err(CE_WARN, "failed to reset sysrq: %d", ret);
    703 		goto out;
    704 	}
    705 
    706 	if (xenbus_transaction_end(xbt, 0) == EAGAIN)
    707 		goto retry;
    708 
    709 	/*
    710 	 * Somewhat arbitrary - on Linux this means 'reboot'. We could just
    711 	 * accept any key, but this might increase the risk of sending a
    712 	 * harmless sysrq to the wrong domain...
    713 	 */
    714 	if (key == 'b')
    715 		(void) xen_debug_handler(NULL);
    716 	else
    717 		cmn_err(CE_WARN, "Ignored sysrq %c", key);
    718 	return;
    719 
    720 out:
    721 	(void) xenbus_transaction_end(xbt, 1);
    722 }
    723 
/*
 * Task queue on which xen_shutdown() runs; created by xen_late_startup()
 * and fed by xen_shutdown_handler().
 */
taskq_t *xen_shutdown_tq;

/*
 * Internal codes for the requests read from the "control/shutdown" xenstore
 * node.  The valid codes are 0 .. SHUTDOWN_MAX - 1.
 */
#define	SHUTDOWN_INVALID	-1
#define	SHUTDOWN_POWEROFF	0
#define	SHUTDOWN_REBOOT		1
#define	SHUTDOWN_SUSPEND	2
#define	SHUTDOWN_HALT		3
#define	SHUTDOWN_MAX		4

/* Seconds xen_shutdown() allows before xen_dirty_shutdown() is invoked. */
#define	SHUTDOWN_TIMEOUT_SECS (60 * 5)

/*
 * Request names, indexed by shutdown code.  Not referenced in the visible
 * portion of this file.
 */
static const char *cmd_strings[SHUTDOWN_MAX] = {
	"poweroff",
	"reboot",
	"suspend",
	"halt"
};
    741 
    742 static void
    743 xen_dirty_shutdown(void *arg)
    744 {
    745 	int cmd = (uintptr_t)arg;
    746 
    747 	cmn_err(CE_WARN, "Externally requested shutdown failed or "
    748 	    "timed out.\nShutting down.\n");
    749 
    750 	switch (cmd) {
    751 	case SHUTDOWN_HALT:
    752 	case SHUTDOWN_POWEROFF:
    753 		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
    754 		break;
    755 	case SHUTDOWN_REBOOT:
    756 		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
    757 		break;
    758 	}
    759 }
    760 
    761 static void
    762 xen_shutdown(void *arg)
    763 {
    764 	int cmd = (uintptr_t)arg;
    765 	proc_t *initpp;
    766 
    767 	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);
    768 
    769 	if (cmd == SHUTDOWN_SUSPEND) {
    770 		xen_suspend_domain();
    771 		return;
    772 	}
    773 
    774 	switch (cmd) {
    775 	case SHUTDOWN_POWEROFF:
    776 		force_shutdown_method = AD_POWEROFF;
    777 		break;
    778 	case SHUTDOWN_HALT:
    779 		force_shutdown_method = AD_HALT;
    780 		break;
    781 	case SHUTDOWN_REBOOT:
    782 		force_shutdown_method = AD_BOOT;
    783 		break;
    784 	}
    785 
    786 	/*
    787 	 * If we're still booting and init(1) isn't set up yet, simply halt.
    788 	 */
    789 	mutex_enter(&pidlock);
    790 	initpp = prfind(P_INITPID);
    791 	mutex_exit(&pidlock);
    792 	if (initpp == NULL) {
    793 		extern void halt(char *);
    794 		halt("Power off the System");   /* just in case */
    795 	}
    796 
    797 	/*
    798 	 * else, graceful shutdown with inittab and all getting involved
    799 	 */
    800 	psignal(initpp, SIGPWR);
    801 
    802 	(void) timeout(xen_dirty_shutdown, arg,
    803 	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));
    804 }
    805 
    806 /*ARGSUSED*/
    807 static void
    808 xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
    809 	unsigned int len)
    810 {
    811 	char *str;
    812 	xenbus_transaction_t xbt;
    813 	int err, shutdown_code = SHUTDOWN_INVALID;
    814 	unsigned int slen;
    815 
    816 again:
    817 	err = xenbus_transaction_start(&xbt);
    818 	if (err)
    819 		return;
    820 	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
    821 		(void) xenbus_transaction_end(xbt, 1);
    822 		return;
    823 	}
    824 
    825 	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);
    826 
    827 	/*
    828 	 * If this is a watch fired from our write below, check out early to
    829 	 * avoid an infinite loop.
    830 	 */
    831 	if (strcmp(str, "") == 0) {
    832 		(void) xenbus_transaction_end(xbt, 0);
    833 		kmem_free(str, slen);
    834 		return;
    835 	} else if (strcmp(str, "poweroff") == 0) {
    836 		shutdown_code = SHUTDOWN_POWEROFF;
    837 	} else if (strcmp(str, "reboot") == 0) {
    838 		shutdown_code = SHUTDOWN_REBOOT;
    839 	} else if (strcmp(str, "suspend") == 0) {
    840 		shutdown_code = SHUTDOWN_SUSPEND;
    841 	} else if (strcmp(str, "halt") == 0) {
    842 		shutdown_code = SHUTDOWN_HALT;
    843 	} else {
    844 		printf("Ignoring shutdown request: %s\n", str);
    845 	}
    846 
    847 	/*
    848 	 * XXPV	Should we check the value of xenbus_write() too, or are all
    849 	 *	errors automatically folded into xenbus_transaction_end() ??
    850 	 */
    851 	(void) xenbus_write(xbt, "control", "shutdown", "");
    852 	err = xenbus_transaction_end(xbt, 0);
    853 	if (err == EAGAIN) {
    854 		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
    855 		kmem_free(str, slen);
    856 		goto again;
    857 	}
    858 
    859 	kmem_free(str, slen);
    860 	if (shutdown_code != SHUTDOWN_INVALID) {
    861 		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
    862 		    (void *)(intptr_t)shutdown_code, 0);
    863 	}
    864 }
    865 
    866 static struct xenbus_watch shutdown_watch;
    867 static struct xenbus_watch sysrq_watch;
    868 
    869 void
    870 xen_late_startup(void)
    871 {
    872 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
    873 		xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
    874 		    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
    875 		shutdown_watch.node = "control/shutdown";
    876 		shutdown_watch.callback = xen_shutdown_handler;
    877 		if (register_xenbus_watch(&shutdown_watch))
    878 			cmn_err(CE_WARN, "Failed to set shutdown watcher");
    879 
    880 		sysrq_watch.node = "control/sysrq";
    881 		sysrq_watch.callback = xen_sysrq_handler;
    882 		if (register_xenbus_watch(&sysrq_watch))
    883 			cmn_err(CE_WARN, "Failed to set sysrq watcher");
    884 	}
    885 	balloon_init(xen_info->nr_pages);
    886 }
    887 
    888 #ifdef DEBUG
    889 #define	XEN_PRINTF_BUFSIZE	1024
    890 
    891 char xen_printf_buffer[XEN_PRINTF_BUFSIZE];
    892 
    893 /*
    894  * Printf function that calls hypervisor directly.  For DomU it only
    895  * works when running on a xen hypervisor built with debug on.  Works
    896  * always since no I/O ring interaction is needed.
    897  */
    898 /*PRINTFLIKE1*/
    899 void
    900 xen_printf(const char *fmt, ...)
    901 {
    902 	va_list	ap;
    903 
    904 	va_start(ap, fmt);
    905 	(void) vsnprintf(xen_printf_buffer, XEN_PRINTF_BUFSIZE, fmt, ap);
    906 	va_end(ap);
    907 
    908 	(void) HYPERVISOR_console_io(CONSOLEIO_write,
    909 	    strlen(xen_printf_buffer), xen_printf_buffer);
    910 }
    911 #else
    912 void
    913 xen_printf(const char *fmt, ...)
    914 {
    915 }
    916 #endif	/* DEBUG */
    917 
    918 void
    919 startup_xen_version(void)
    920 {
    921 	xen_set_version(XENVER_BOOT_IDX);
    922 	if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0)
    923 		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
    924 		    "but need at least version v3.0.4",
    925 		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
    926 		    XENVER_CURRENT(xv_ver));
    927 	xen_pte_workaround();
    928 }
    929 
    930 int xen_mca_simulate_mc_physinfo_failure = 0;
    931 
    932 void
    933 startup_xen_mca(void)
    934 {
    935 	if (!DOMAIN_IS_INITDOMAIN(xen_info))
    936 		return;
    937 
    938 	xen_phys_ncpus = 0;
    939 	xen_phys_cpus = NULL;
    940 
    941 	if (xen_mca_simulate_mc_physinfo_failure ||
    942 	    xen_get_mc_physcpuinfo(NULL, &xen_phys_ncpus) != 0) {
    943 		cmn_err(CE_WARN,
    944 		    "%sxen_get_mc_physinfo failure during xen MCA startup: "
    945 		    "there will be no machine check support",
    946 		    xen_mca_simulate_mc_physinfo_failure ? "(simulated) " : "");
    947 		return;
    948 	}
    949 
    950 	xen_phys_cpus = kmem_alloc(xen_phys_ncpus *
    951 	    sizeof (xen_mc_logical_cpu_t), KM_NOSLEEP);
    952 
    953 	if (xen_phys_cpus == NULL) {
    954 		cmn_err(CE_WARN,
    955 		    "xen_get_mc_physinfo failure: can't allocate CPU array");
    956 		return;
    957 	}
    958 
    959 	if (xen_get_mc_physcpuinfo(xen_phys_cpus, &xen_phys_ncpus) != 0) {
    960 		cmn_err(CE_WARN, "xen_get_mc_physinfo failure: no "
    961 		    "physical CPU info");
    962 		kmem_free(xen_phys_cpus,
    963 		    xen_phys_ncpus * sizeof (xen_mc_logical_cpu_t));
    964 		xen_phys_ncpus = 0;
    965 		xen_phys_cpus = NULL;
    966 	}
    967 
    968 	if (xen_physinfo_debug) {
    969 		xen_mc_logical_cpu_t *xcp;
    970 		unsigned i;
    971 
    972 		cmn_err(CE_NOTE, "xvm mca: %u physical cpus:\n",
    973 		    xen_phys_ncpus);
    974 		for (i = 0; i < xen_phys_ncpus; i++) {
    975 			xcp = &xen_phys_cpus[i];
    976 			cmn_err(CE_NOTE, "cpu%u: (%u, %u, %u) apid %u",
    977 			    xcp->mc_cpunr, xcp->mc_chipid, xcp->mc_coreid,
    978 			    xcp->mc_threadid, xcp->mc_apicid);
    979 		}
    980 	}
    981 }
    982 
    983 /*
    984  * Miscellaneous hypercall wrappers with slightly more verbose diagnostics.
    985  */
    986 
    987 void
    988 xen_set_gdt(ulong_t *frame_list, int entries)
    989 {
    990 	int err;
    991 	if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) {
    992 		/*
    993 		 * X_EINVAL:	reserved entry or bad frames
    994 		 * X_EFAULT:	bad address
    995 		 */
    996 		panic("xen_set_gdt(%p, %d): error %d",
    997 		    (void *)frame_list, entries, -(int)err);
    998 	}
    999 }
   1000 
   1001 void
   1002 xen_set_ldt(user_desc_t *ldt, uint_t nsels)
   1003 {
   1004 	struct mmuext_op	op;
   1005 	long			err;
   1006 
   1007 	op.cmd = MMUEXT_SET_LDT;
   1008 	op.arg1.linear_addr = (uintptr_t)ldt;
   1009 	op.arg2.nr_ents = nsels;
   1010 
   1011 	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) {
   1012 		panic("xen_set_ldt(%p, %d): error %d",
   1013 		    (void *)ldt, nsels, -(int)err);
   1014 	}
   1015 }
   1016 
   1017 void
   1018 xen_stack_switch(ulong_t ss, ulong_t esp)
   1019 {
   1020 	long err;
   1021 
   1022 	if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) {
   1023 		/*
   1024 		 * X_EPERM:	bad selector
   1025 		 */
   1026 		panic("xen_stack_switch(%lx, %lx): error %d", ss, esp,
   1027 		    -(int)err);
   1028 	}
   1029 }
   1030 
   1031 long
   1032 xen_set_trap_table(trap_info_t *table)
   1033 {
   1034 	long err;
   1035 
   1036 	if ((err = HYPERVISOR_set_trap_table(table)) != 0) {
   1037 		/*
   1038 		 * X_EFAULT:	bad address
   1039 		 * X_EPERM:	bad selector
   1040 		 */
   1041 		panic("xen_set_trap_table(%p): error %d", (void *)table,
   1042 		    -(int)err);
   1043 	}
   1044 	return (err);
   1045 }
   1046 
   1047 #if defined(__amd64)
   1048 void
   1049 xen_set_segment_base(int reg, ulong_t value)
   1050 {
   1051 	long err;
   1052 
   1053 	if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) {
   1054 		/*
   1055 		 * X_EFAULT:	bad address
   1056 		 * X_EINVAL:	bad type
   1057 		 */
   1058 		panic("xen_set_segment_base(%d, %lx): error %d",
   1059 		    reg, value, -(int)err);
   1060 	}
   1061 }
   1062 #endif	/* __amd64 */
   1063 
   1064 /*
   1065  * Translate a hypervisor errcode to a Solaris error code.
   1066  */
   1067 int
   1068 xen_xlate_errcode(int error)
   1069 {
   1070 	switch (-error) {
   1071 
   1072 	/*
   1073 	 * Translate hypervisor errno's into native errno's
   1074 	 */
   1075 
   1076 #define	CASE(num)	case X_##num: error = num; break
   1077 
   1078 	CASE(EPERM);	CASE(ENOENT);	CASE(ESRCH);
   1079 	CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
   1080 	CASE(E2BIG);	CASE(ENOMEM);	CASE(EACCES);
   1081 	CASE(EFAULT);	CASE(EBUSY);	CASE(EEXIST);
   1082 	CASE(ENODEV);	CASE(EISDIR);	CASE(EINVAL);
   1083 	CASE(ENOSPC);	CASE(ESPIPE);	CASE(EROFS);
   1084 	CASE(ENOSYS);	CASE(ENOTEMPTY); CASE(EISCONN);
   1085 	CASE(ENODATA);	CASE(EAGAIN);
   1086 
   1087 #undef CASE
   1088 
   1089 	default:
   1090 		panic("xen_xlate_errcode: unknown error %d", error);
   1091 	}
   1092 
   1093 	return (error);
   1094 }
   1095 
   1096 /*
   1097  * Raise PS_IOPL on current vcpu to user level.
   1098  * Caller responsible for preventing kernel preemption.
   1099  */
   1100 void
   1101 xen_enable_user_iopl(void)
   1102 {
   1103 	physdev_set_iopl_t set_iopl;
   1104 	set_iopl.iopl = 3;		/* user ring 3 */
   1105 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
   1106 }
   1107 
   1108 /*
   1109  * Drop PS_IOPL on current vcpu to kernel level
   1110  */
   1111 void
   1112 xen_disable_user_iopl(void)
   1113 {
   1114 	physdev_set_iopl_t set_iopl;
   1115 	set_iopl.iopl = 1;		/* kernel pseudo ring 1 */
   1116 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
   1117 }
   1118 
   1119 int
   1120 xen_gdt_setprot(cpu_t *cp, uint_t prot)
   1121 {
   1122 	int err;
   1123 #if defined(__amd64)
   1124 	int pt_bits = PT_VALID;
   1125 	if (prot & PROT_WRITE)
   1126 		pt_bits |= PT_WRITABLE;
   1127 #endif
   1128 
   1129 	if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt,
   1130 	    MMU_PAGESIZE, prot)) != 0)
   1131 		goto done;
   1132 
   1133 #if defined(__amd64)
   1134 	err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits);
   1135 #endif
   1136 
   1137 done:
   1138 	if (err) {
   1139 		cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d",
   1140 		    cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only",
   1141 		    err);
   1142 	}
   1143 
   1144 	return (err);
   1145 }
   1146 
   1147 int
   1148 xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot)
   1149 {
   1150 	int err;
   1151 	caddr_t	lva = (caddr_t)ldt;
   1152 #if defined(__amd64)
   1153 	int pt_bits = PT_VALID;
   1154 	pgcnt_t npgs;
   1155 	if (prot & PROT_WRITE)
   1156 		pt_bits |= PT_WRITABLE;
   1157 #endif	/* __amd64 */
   1158 
   1159 	if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0)
   1160 		goto done;
   1161 
   1162 #if defined(__amd64)
   1163 
   1164 	ASSERT(IS_P2ALIGNED(lsize, PAGESIZE));
   1165 	npgs = mmu_btop(lsize);
   1166 	while (npgs--) {
   1167 		if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva),
   1168 		    pt_bits)) != 0)
   1169 			break;
   1170 		lva += PAGESIZE;
   1171 	}
   1172 #endif	/* __amd64 */
   1173 
   1174 done:
   1175 	if (err) {
   1176 		cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d",
   1177 		    (void *)lva,
   1178 		    (prot & PROT_WRITE) ? "writable" : "read-only", err);
   1179 	}
   1180 
   1181 	return (err);
   1182 }
   1183 
   1184 int
   1185 xen_get_mc_physcpuinfo(xen_mc_logical_cpu_t *log_cpus, uint_t *ncpus)
   1186 {
   1187 	xen_mc_t xmc;
   1188 	struct xen_mc_physcpuinfo *cpi = &xmc.u.mc_physcpuinfo;
   1189 
   1190 	cpi->ncpus = *ncpus;
   1191 	/*LINTED: constant in conditional context*/
   1192 	set_xen_guest_handle(cpi->info, log_cpus);
   1193 
   1194 	if (HYPERVISOR_mca(XEN_MC_physcpuinfo, &xmc) != 0)
   1195 		return (-1);
   1196 
   1197 	*ncpus = cpi->ncpus;
   1198 	return (0);
   1199 }
   1200 
   1201 void
   1202 print_panic(const char *str)
   1203 {
   1204 	xen_printf(str);
   1205 }
   1206 
   1207 /*
   1208  * Interfaces to iterate over real cpu information, but only that info
   1209  * which we choose to expose here.  These are of interest to dom0
   1210  * only (and the backing hypercall should not work for domu).
   1211  */
   1212 
   1213 xen_mc_lcpu_cookie_t
   1214 xen_physcpu_next(xen_mc_lcpu_cookie_t cookie)
   1215 {
   1216 	xen_mc_logical_cpu_t *xcp = (xen_mc_logical_cpu_t *)cookie;
   1217 
   1218 	if (!DOMAIN_IS_INITDOMAIN(xen_info))
   1219 		return (NULL);
   1220 
   1221 	if (cookie == NULL)
   1222 		return ((xen_mc_lcpu_cookie_t)xen_phys_cpus);
   1223 
   1224 	if (xcp == xen_phys_cpus + xen_phys_ncpus - 1)
   1225 		return (NULL);
   1226 	else
   1227 		return ((xen_mc_lcpu_cookie_t)++xcp);
   1228 }
   1229 
   1230 #define	COOKIE2XCP(c) ((xen_mc_logical_cpu_t *)(c))
   1231 
   1232 const char *
   1233 xen_physcpu_vendorstr(xen_mc_lcpu_cookie_t cookie)
   1234 {
   1235 	xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie);
   1236 
   1237 	return ((const char *)&xcp->mc_vendorid[0]);
   1238 }
   1239 
   1240 int
   1241 xen_physcpu_family(xen_mc_lcpu_cookie_t cookie)
   1242 {
   1243 	return (COOKIE2XCP(cookie)->mc_family);
   1244 }
   1245 
   1246 int
   1247 xen_physcpu_model(xen_mc_lcpu_cookie_t cookie)
   1248 {
   1249 	return (COOKIE2XCP(cookie)->mc_model);
   1250 }
   1251 
   1252 int
   1253 xen_physcpu_stepping(xen_mc_lcpu_cookie_t cookie)
   1254 {
   1255 	return (COOKIE2XCP(cookie)->mc_step);
   1256 }
   1257 
   1258 id_t
   1259 xen_physcpu_chipid(xen_mc_lcpu_cookie_t cookie)
   1260 {
   1261 	return (COOKIE2XCP(cookie)->mc_chipid);
   1262 }
   1263 
   1264 id_t
   1265 xen_physcpu_coreid(xen_mc_lcpu_cookie_t cookie)
   1266 {
   1267 	return (COOKIE2XCP(cookie)->mc_coreid);
   1268 }
   1269 
   1270 id_t
   1271 xen_physcpu_strandid(xen_mc_lcpu_cookie_t cookie)
   1272 {
   1273 	return (COOKIE2XCP(cookie)->mc_threadid);
   1274 }
   1275 
   1276 id_t
   1277 xen_physcpu_initial_apicid(xen_mc_lcpu_cookie_t cookie)
   1278 {
   1279 	return (COOKIE2XCP(cookie)->mc_clusterid);
   1280 }
   1281 
   1282 id_t
   1283 xen_physcpu_logical_id(xen_mc_lcpu_cookie_t cookie)
   1284 {
   1285 	return (COOKIE2XCP(cookie)->mc_cpunr);
   1286 }
   1287 
   1288 boolean_t
   1289 xen_physcpu_is_cmt(xen_mc_lcpu_cookie_t cookie)
   1290 {
   1291 	return (COOKIE2XCP(cookie)->mc_nthreads > 1);
   1292 }
   1293 
   1294 uint64_t
   1295 xen_physcpu_mcg_cap(xen_mc_lcpu_cookie_t cookie)
   1296 {
   1297 	xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie);
   1298 
   1299 	/*
   1300 	 * Need to #define the indices, or search through the array.
   1301 	 */
   1302 	return (xcp->mc_msrvalues[0].value);
   1303 }
   1304 
   1305 int
   1306 xen_map_gref(uint_t cmd, gnttab_map_grant_ref_t *mapop, uint_t count,
   1307     boolean_t uvaddr)
   1308 {
   1309 	long rc;
   1310 	uint_t i;
   1311 
   1312 	ASSERT(cmd == GNTTABOP_map_grant_ref);
   1313 
   1314 #if !defined(_BOOT)
   1315 	if (uvaddr == B_FALSE) {
   1316 		for (i = 0; i < count; ++i) {
   1317 			mapop[i].flags |= (PT_FOREIGN <<_GNTMAP_guest_avail0);
   1318 		}
   1319 	}
   1320 #endif
   1321 
   1322 	rc = HYPERVISOR_grant_table_op(cmd, mapop, count);
   1323 
   1324 	return (rc);
   1325 }
   1326 
   1327 static int
   1328 xpv_get_physinfo(xen_sysctl_physinfo_t *pi)
   1329 {
   1330 	xen_sysctl_t op;
   1331 	struct sp { void *p; } *sp = (struct sp *)&op.u.physinfo.cpu_to_node;
   1332 	int ret;
   1333 
   1334 	bzero(&op, sizeof (op));
   1335 	op.cmd = XEN_SYSCTL_physinfo;
   1336 	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
   1337 	/*LINTED: constant in conditional context*/
   1338 	set_xen_guest_handle(*sp, NULL);
   1339 
   1340 	ret = HYPERVISOR_sysctl(&op);
   1341 
   1342 	if (ret != 0)
   1343 		return (xen_xlate_errcode(ret));
   1344 
   1345 	bcopy(&op.u.physinfo, pi, sizeof (op.u.physinfo));
   1346 	return (0);
   1347 }
   1348 
   1349 /*
   1350  * On dom0, we can determine the number of physical cpus on the machine.
   1351  * This number is important when figuring out what workarounds are
   1352  * appropriate, so compute it now.
   1353  */
   1354 uint_t
   1355 xpv_nr_phys_cpus(void)
   1356 {
   1357 	static uint_t nphyscpus = 0;
   1358 
   1359 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
   1360 
   1361 	if (nphyscpus == 0) {
   1362 		xen_sysctl_physinfo_t pi;
   1363 		int ret;
   1364 
   1365 		if ((ret = xpv_get_physinfo(&pi)) != 0)
   1366 			panic("xpv_get_physinfo() failed: %d\n", ret);
   1367 		nphyscpus = pi.nr_cpus;
   1368 	}
   1369 	return (nphyscpus);
   1370 }
   1371 
   1372 pgcnt_t
   1373 xpv_nr_phys_pages(void)
   1374 {
   1375 	xen_sysctl_physinfo_t pi;
   1376 	int ret;
   1377 
   1378 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
   1379 
   1380 	if ((ret = xpv_get_physinfo(&pi)) != 0)
   1381 		panic("xpv_get_physinfo() failed: %d\n", ret);
   1382 
   1383 	return ((pgcnt_t)pi.total_pages);
   1384 }
   1385 
   1386 uint64_t
   1387 xpv_cpu_khz(void)
   1388 {
   1389 	xen_sysctl_physinfo_t pi;
   1390 	int ret;
   1391 
   1392 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
   1393 
   1394 	if ((ret = xpv_get_physinfo(&pi)) != 0)
   1395 		panic("xpv_get_physinfo() failed: %d\n", ret);
   1396 	return ((uint64_t)pi.cpu_khz);
   1397 }
   1398