Home | History | Annotate | Download | only in psm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #define	PSMI_1_6
     28 
     29 #include <sys/mutex.h>
     30 #include <sys/types.h>
     31 #include <sys/time.h>
     32 #include <sys/clock.h>
     33 #include <sys/machlock.h>
     34 #include <sys/smp_impldefs.h>
     35 #include <sys/uadmin.h>
     36 #include <sys/promif.h>
     37 #include <sys/psm.h>
     38 #include <sys/psm_common.h>
     39 #include <sys/atomic.h>
     40 #include <sys/apic.h>
     41 #include <sys/archsystm.h>
     42 #include <sys/mach_intr.h>
     43 #include <sys/hypervisor.h>
     44 #include <sys/evtchn_impl.h>
     45 #include <sys/modctl.h>
     46 #include <sys/trap.h>
     47 #include <sys/panic.h>
     48 #include <sys/sysmacros.h>
     49 #include <sys/pci_intr_lib.h>
     50 #include <vm/hat_i86.h>
     51 
     52 #include <xen/public/vcpu.h>
     53 #include <xen/public/physdev.h>
     54 
     55 
     56 /*
     57  * Global Data
     58  */
     59 
/* Bitmask of XEN_PSM_VERBOSE_*_FLAG bits that enable debug output */
int xen_psm_verbose = 0;

/* As of now we don't support x2apic in xVM */
volatile uint32_t *apicadr = NULL;	/* dummy, so common code will link */
int apic_error = 0;
int apic_verbose = 0;
cpuset_t apic_cpumask;
int apic_forceload = 0;
/* One entry per group of APIC_VECTOR_PER_IPL vectors */
uchar_t apic_vectortoipl[APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL] = {
	3, 4, 5, 5, 6, 6, 9, 10, 11, 12, 13, 14, 15, 15
};
uchar_t apic_ipltopri[MAXIPL + 1];
uchar_t apic_ipls[APIC_AVAIL_VECTOR];
/* Set to 1 by xen_psm_picinit() when it runs in dom0 */
uint_t apic_picinit_called;
apic_cpus_info_t *apic_cpus;
/* Binding policy consulted by xen_psm_bind_intr() */
int xen_psm_intr_policy = INTR_ROUND_ROBIN_WITH_AFFINITY;
/* use to make sure only one cpu handles the nmi */
static lock_t xen_psm_nmi_lock;
int xen_psm_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
int xen_psm_panic_on_nmi = 0;		/* non-zero - panic on NMI */
int xen_psm_num_nmis = 0;		/* bumped on every NMI handler entry */

cpuset_t xen_psm_cpus_online;	/* online cpus */
int xen_psm_ncpus = 1;		/* cpu count */
int xen_psm_next_bind_cpu;	/* next cpu to bind an interrupt to */

/* MSI probe state: 0 - not yet checked, 1 - supported, -1 - not supported */
int xen_support_msi = 0;

/* Clock irq cached by xen_psm_get_clockirq(); INVALID_IRQ until first bound */
static int xen_clock_irq = INVALID_IRQ;
     89 
     90 /* flag definitions for xen_psm_verbose */
     91 #define	XEN_PSM_VERBOSE_IRQ_FLAG		0x00000001
     92 #define	XEN_PSM_VERBOSE_POWEROFF_FLAG		0x00000002
     93 #define	XEN_PSM_VERBOSE_POWEROFF_PAUSE_FLAG	0x00000004
     94 
     95 #define	XEN_PSM_VERBOSE_IRQ(fmt) \
     96 	if (xen_psm_verbose & XEN_PSM_VERBOSE_IRQ_FLAG) \
     97 		cmn_err fmt;
     98 
     99 #define	XEN_PSM_VERBOSE_POWEROFF(fmt) \
    100 	if (xen_psm_verbose & XEN_PSM_VERBOSE_POWEROFF_FLAG) \
    101 		prom_printf fmt;
    102 
/*
 * Dummy apic array to point common routines at that want to do some apic
 * manipulation.  Xen doesn't allow guest apic access so we point at these
 * memory locations to fake out those who want to do apic fiddling.
 */
uint32_t xen_psm_dummy_apic[APIC_IRR_REG + 1];

/* PSM info structure; xen_psm_probe() hands its p_mach_idstring to the apic */
static struct psm_info xen_psm_info;
/* Forward declaration: xen_psm_intr_exit() calls this ahead of its definition */
static void xen_psm_setspl(int);

/*
 * MSI and MSI-X vector allocators called by xen_intr_ops() for
 * PSM_INTR_OP_ALLOC_VECTORS.  They are only declared here.
 */
int
apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
    int behavior);
int
apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
    int behavior);
    119 
    120 /*
    121  * Local support routines
    122  */
    123 
    124 /*
    125  * Select vcpu to bind xen virtual device interrupt to.
    126  */
    127 /*ARGSUSED*/
    128 int
    129 xen_psm_bind_intr(int irq)
    130 {
    131 	int bind_cpu;
    132 	apic_irq_t *irqptr;
    133 
    134 	bind_cpu = IRQ_UNBOUND;
    135 	if (xen_psm_intr_policy == INTR_LOWEST_PRIORITY)
    136 		return (bind_cpu);
    137 	if (irq <= APIC_MAX_VECTOR)
    138 		irqptr = apic_irq_table[irq];
    139 	else
    140 		irqptr = NULL;
    141 	if (irqptr && (irqptr->airq_cpu != IRQ_UNBOUND))
    142 		bind_cpu = irqptr->airq_cpu & ~IRQ_USER_BOUND;
    143 	if (bind_cpu != IRQ_UNBOUND) {
    144 		if (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu))
    145 			bind_cpu = 0;
    146 		goto done;
    147 	}
    148 	if (xen_psm_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
    149 		do {
    150 			bind_cpu = xen_psm_next_bind_cpu++;
    151 			if (xen_psm_next_bind_cpu >= xen_psm_ncpus)
    152 				xen_psm_next_bind_cpu = 0;
    153 		} while (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu));
    154 	} else {
    155 		bind_cpu = 0;
    156 	}
    157 done:
    158 	return (bind_cpu);
    159 }
    160 
    161 /*
    162  * Autoconfiguration Routines
    163  */
    164 
    165 static int
    166 xen_psm_probe(void)
    167 {
    168 	int ret = PSM_SUCCESS;
    169 
    170 	if (DOMAIN_IS_INITDOMAIN(xen_info))
    171 		ret = apic_probe_common(xen_psm_info.p_mach_idstring);
    172 	return (ret);
    173 }
    174 
    175 static void
    176 xen_psm_softinit(void)
    177 {
    178 	/* LINTED logical expression always true: op "||" */
    179 	ASSERT((1 << EVTCHN_SHIFT) == NBBY * sizeof (ulong_t));
    180 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, 0);
    181 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
    182 		apic_init_common();
    183 	}
    184 }
    185 
    186 #define	XEN_NSEC_PER_TICK	10 /* XXX - assume we have a 100 Mhz clock */
    187 
    188 /*ARGSUSED*/
    189 static int
    190 xen_psm_clkinit(int hertz)
    191 {
    192 	extern enum tod_fault_type tod_fault(enum tod_fault_type, int);
    193 	extern int dosynctodr;
    194 
    195 	/*
    196 	 * domU cannot set the TOD hardware, fault the TOD clock now to
    197 	 * indicate that and turn off attempts to sync TOD hardware
    198 	 * with the hires timer.
    199 	 */
    200 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
    201 		mutex_enter(&tod_lock);
    202 		(void) tod_fault(TOD_RDONLY, 0);
    203 		dosynctodr = 0;
    204 		mutex_exit(&tod_lock);
    205 	}
    206 	/*
    207 	 * The hypervisor provides a timer based on the local APIC timer.
    208 	 * The interface supports requests of nanosecond resolution.
    209 	 * A common frequency of the apic clock is 100 Mhz which
    210 	 * gives a resolution of 10 nsec per tick.  What we would really like
    211 	 * is a way to get the ns per tick value from xen.
    212 	 * XXPV - This is an assumption that needs checking and may change
    213 	 */
    214 	return (XEN_NSEC_PER_TICK);
    215 }
    216 
    217 static void
    218 xen_psm_hrtimeinit(void)
    219 {
    220 	extern int gethrtime_hires;
    221 	gethrtime_hires = 1;
    222 }
    223 
    224 /* xen_psm NMI handler */
    225 /*ARGSUSED*/
    226 static void
    227 xen_psm_nmi_intr(caddr_t arg, struct regs *rp)
    228 {
    229 	xen_psm_num_nmis++;
    230 
    231 	if (!lock_try(&xen_psm_nmi_lock))
    232 		return;
    233 
    234 	if (xen_psm_kmdb_on_nmi && psm_debugger()) {
    235 		debug_enter("NMI received: entering kmdb\n");
    236 	} else if (xen_psm_panic_on_nmi) {
    237 		/* Keep panic from entering kmdb. */
    238 		nopanicdebug = 1;
    239 		panic("NMI received\n");
    240 	} else {
    241 		/*
    242 		 * prom_printf is the best shot we have of something which is
    243 		 * problem free from high level/NMI type of interrupts
    244 		 */
    245 		prom_printf("NMI received\n");
    246 	}
    247 
    248 	lock_clear(&xen_psm_nmi_lock);
    249 }
    250 
    251 static void
    252 xen_psm_picinit()
    253 {
    254 	int cpu, irqno;
    255 	cpuset_t cpus;
    256 
    257 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
    258 		/* set a flag so we know we have run xen_psm_picinit() */
    259 		apic_picinit_called = 1;
    260 		LOCK_INIT_CLEAR(&apic_ioapic_lock);
    261 
    262 		/* XXPV - do we need to do this? */
    263 		picsetup();	 /* initialise the 8259 */
    264 
    265 		/* enable apic mode if imcr present */
    266 		/* XXPV - do we need to do this either? */
    267 		if (apic_imcrp) {
    268 			outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
    269 			outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_APIC);
    270 		}
    271 
    272 		ioapic_init_intr(IOAPIC_NOMASK);
    273 		/*
    274 		 * We never called xen_psm_addspl() when the SCI
    275 		 * interrupt was added because that happened before the
    276 		 * PSM module was loaded.  Fix that up here by doing
    277 		 * any missed operations (e.g. bind to CPU)
    278 		 */
    279 		if ((irqno = apic_sci_vect) > 0) {
    280 			if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
    281 				CPUSET_ZERO(cpus);
    282 				CPUSET_OR(cpus, xen_psm_cpus_online);
    283 			} else {
    284 				CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
    285 			}
    286 			ec_set_irq_affinity(irqno, cpus);
    287 			apic_irq_table[irqno]->airq_temp_cpu =
    288 			    (uchar_t)(cpu & ~IRQ_USER_BOUND);
    289 			ec_enable_irq(irqno);
    290 		}
    291 	}
    292 
    293 	/* add nmi handler - least priority nmi handler */
    294 	LOCK_INIT_CLEAR(&xen_psm_nmi_lock);
    295 
    296 	if (!psm_add_nmintr(0, (avfunc) xen_psm_nmi_intr,
    297 	    "xVM_psm NMI handler", (caddr_t)NULL))
    298 		cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler");
    299 }
    300 
    301 
    302 /*
    303  * generates an interprocessor interrupt to another CPU
    304  */
    305 static void
    306 xen_psm_send_ipi(int cpun, int ipl)
    307 {
    308 	ulong_t flag = intr_clear();
    309 
    310 	ec_send_ipi(ipl, cpun);
    311 	intr_restore(flag);
    312 }
    313 
    314 /*ARGSUSED*/
    315 static int
    316 xen_psm_addspl(int irqno, int ipl, int min_ipl, int max_ipl)
    317 {
    318 	int cpu, ret;
    319 	cpuset_t cpus;
    320 
    321 	/*
    322 	 * We are called at splhi() so we can't call anything that might end
    323 	 * up trying to context switch.
    324 	 */
    325 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
    326 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
    327 		/*
    328 		 * Priority/affinity/enable for PIRQ's is set in ec_setup_pirq()
    329 		 */
    330 		ret = apic_addspl_common(irqno, ipl, min_ipl, max_ipl);
    331 	} else {
    332 		/*
    333 		 * Set priority/affinity/enable for non PIRQs
    334 		 */
    335 		ret = ec_set_irq_priority(irqno, ipl);
    336 		ASSERT(ret == 0);
    337 		if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
    338 			CPUSET_ZERO(cpus);
    339 			CPUSET_OR(cpus, xen_psm_cpus_online);
    340 		} else {
    341 			CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
    342 		}
    343 		ec_set_irq_affinity(irqno, cpus);
    344 		ec_enable_irq(irqno);
    345 	}
    346 	return (ret);
    347 }
    348 
    349 /*
    350  * Acquire ownership of this irq on this cpu
    351  */
    352 void
    353 xen_psm_acquire_irq(int irq)
    354 {
    355 	ulong_t flags;
    356 	int cpuid;
    357 
    358 	/*
    359 	 * If the irq is currently being serviced by another cpu
    360 	 * we busy-wait for the other cpu to finish.  Take any
    361 	 * pending interrupts before retrying.
    362 	 */
    363 	do {
    364 		flags = intr_clear();
    365 		cpuid = ec_block_irq(irq);
    366 		intr_restore(flags);
    367 	} while (cpuid != CPU->cpu_id);
    368 }
    369 
    370 /*ARGSUSED*/
    371 static int
    372 xen_psm_delspl(int irqno, int ipl, int min_ipl, int max_ipl)
    373 {
    374 	apic_irq_t *irqptr;
    375 	int err = PSM_SUCCESS;
    376 
    377 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
    378 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
    379 		irqptr = apic_irq_table[irqno];
    380 		/*
    381 		 * unbind if no more sharers of this irq/evtchn
    382 		 */
    383 		if (irqptr->airq_share == 1) {
    384 			xen_psm_acquire_irq(irqno);
    385 			ec_unbind_irq(irqno);
    386 		}
    387 		err = apic_delspl_common(irqno, ipl, min_ipl, max_ipl);
    388 		/*
    389 		 * If still in use reset priority
    390 		 */
    391 		if (!err && irqptr->airq_share != 0) {
    392 			err = ec_set_irq_priority(irqno, max_ipl);
    393 			return (err);
    394 		}
    395 	} else {
    396 		xen_psm_acquire_irq(irqno);
    397 		ec_unbind_irq(irqno);
    398 	}
    399 	return (err);
    400 }
    401 
    402 static processorid_t
    403 xen_psm_get_next_processorid(processorid_t id)
    404 {
    405 	if (id == -1)
    406 		return (0);
    407 
    408 	for (id++; id < NCPU; id++) {
    409 		switch (-HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL)) {
    410 		case 0:		/* yeah, that one's there */
    411 			return (id);
    412 		default:
    413 		case X_EINVAL:	/* out of range */
    414 			return (-1);
    415 		case X_ENOENT:	/* not present in the domain */
    416 			/*
    417 			 * It's not clear that we -need- to keep looking
    418 			 * at this point, if, e.g., we can guarantee
    419 			 * the hypervisor always keeps a contiguous range
    420 			 * of vcpus around this is equivalent to "out of range".
    421 			 *
    422 			 * But it would be sad to miss a vcpu we're
    423 			 * supposed to be using ..
    424 			 */
    425 			break;
    426 		}
    427 	}
    428 
    429 	return (-1);
    430 }
    431 
    432 /*
    433  * XXPV - undo the start cpu op change; return to ignoring this value
    434  *	- also tweak error handling in main startup loop
    435  */
    436 /*ARGSUSED*/
    437 static int
    438 xen_psm_cpu_start(processorid_t id, caddr_t arg)
    439 {
    440 	int ret;
    441 
    442 	ASSERT(id > 0);
    443 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, id);
    444 	ec_bind_cpu_ipis(id);
    445 	(void) ec_bind_virq_to_irq(VIRQ_TIMER, id);
    446 	if ((ret = xen_vcpu_up(id)) == 0)
    447 		xen_psm_ncpus++;
    448 	else
    449 		ret = EINVAL;
    450 	return (ret);
    451 }
    452 
    453 /*
    454  * Allocate an irq for inter cpu signaling
    455  */
    456 /*ARGSUSED*/
    457 static int
    458 xen_psm_get_ipivect(int ipl, int type)
    459 {
    460 	return (ec_bind_ipi_to_irq(ipl, 0));
    461 }
    462 
    463 /*ARGSUSED*/
    464 static int
    465 xen_psm_get_clockirq(int ipl)
    466 {
    467 	if (xen_clock_irq != INVALID_IRQ)
    468 		return (xen_clock_irq);
    469 
    470 	xen_clock_irq = ec_bind_virq_to_irq(VIRQ_TIMER, 0);
    471 	return (xen_clock_irq);
    472 }
    473 
    474 /*ARGSUSED*/
    475 static void
    476 xen_psm_shutdown(int cmd, int fcn)
    477 {
    478 	XEN_PSM_VERBOSE_POWEROFF(("xen_psm_shutdown(%d,%d);\n", cmd, fcn));
    479 
    480 	switch (cmd) {
    481 	case A_SHUTDOWN:
    482 		switch (fcn) {
    483 		case AD_BOOT:
    484 		case AD_IBOOT:
    485 			(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
    486 			break;
    487 		case AD_POWEROFF:
    488 			/* fall through if domU or if poweroff fails */
    489 			if (DOMAIN_IS_INITDOMAIN(xen_info))
    490 				if (apic_enable_acpi)
    491 					(void) acpi_poweroff();
    492 			/* FALLTHRU */
    493 		case AD_HALT:
    494 		default:
    495 			(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
    496 			break;
    497 		}
    498 		break;
    499 	case A_REBOOT:
    500 		(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
    501 		break;
    502 	default:
    503 		return;
    504 	}
    505 }
    506 
    507 
    508 static int
    509 xen_psm_translate_irq(dev_info_t *dip, int irqno)
    510 {
    511 	if (dip == NULL) {
    512 		XEN_PSM_VERBOSE_IRQ((CE_CONT, "!xen_psm: irqno = %d"
    513 		    " dip = NULL\n", irqno));
    514 		return (irqno);
    515 	}
    516 	return (irqno);
    517 }
    518 
    519 /*
    520  * xen_psm_intr_enter() acks the event that triggered the interrupt and
    521  * returns the new priority level,
    522  */
    523 /*ARGSUSED*/
    524 static int
    525 xen_psm_intr_enter(int ipl, int *vector)
    526 {
    527 	int newipl;
    528 	uint_t intno;
    529 	cpu_t *cpu = CPU;
    530 
    531 	intno = (*vector);
    532 
    533 	ASSERT(intno < NR_IRQS);
    534 	ASSERT(cpu->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
    535 
    536 	if (!ec_is_edge_pirq(intno))
    537 		ec_clear_irq(intno);
    538 
    539 	newipl = autovect[intno].avh_hi_pri;
    540 	if (newipl == 0) {
    541 		/*
    542 		 * (newipl == 0) means we have no service routines for this
    543 		 * vector.  We will treat this as a spurious interrupt.
    544 		 * We have cleared the pending bit already, clear the event
    545 		 * mask and return a spurious interrupt.  This case can happen
    546 		 * when an interrupt delivery is racing with the removal of
    547 		 * of the service routine for that interrupt.
    548 		 */
    549 		ec_unmask_irq(intno);
    550 		newipl = -1;	/* flag spurious interrupt */
    551 	} else if (newipl <= cpu->cpu_pri) {
    552 		/*
    553 		 * (newipl <= cpu->cpu_pri) means that we must be trying to
    554 		 * service a vector that was shared with a higher priority
    555 		 * isr.  The higher priority handler has been removed and
    556 		 * we need to service this int.  We can't return a lower
    557 		 * priority than current cpu priority.  Just synthesize a
    558 		 * priority to return that should be acceptable.
    559 		 * It should never happen that we synthesize a priority that
    560 		 * moves us from low-priority to high-priority that would make
    561 		 * a us incorrectly run on the high priority stack.
    562 		 */
    563 		newipl = cpu->cpu_pri + 1;	/* synthetic priority */
    564 		ASSERT(newipl != LOCK_LEVEL + 1);
    565 	}
    566 	return (newipl);
    567 }
    568 
    569 
    570 /*
    571  * xen_psm_intr_exit() restores the old interrupt
    572  * priority level after processing an interrupt.
    573  * It is called with interrupts disabled, and does not enable interrupts.
    574  */
    575 /* ARGSUSED */
    576 static void
    577 xen_psm_intr_exit(int ipl, int vector)
    578 {
    579 	ec_try_unmask_irq(vector);
    580 	xen_psm_setspl(ipl);
    581 }
    582 
    583 intr_exit_fn_t
    584 psm_intr_exit_fn(void)
    585 {
    586 	return (xen_psm_intr_exit);
    587 }
    588 
    589 /*
    590  * Check if new ipl level allows delivery of previously unserviced events
    591  */
    592 static void
    593 xen_psm_setspl(int ipl)
    594 {
    595 	struct cpu *cpu = CPU;
    596 	volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
    597 	uint16_t pending;
    598 
    599 	ASSERT(vci->evtchn_upcall_mask != 0);
    600 
    601 	/*
    602 	 * If new ipl level will enable any pending interrupts, setup so the
    603 	 * upcoming sti will cause us to get an upcall.
    604 	 */
    605 	pending = cpu->cpu_m.mcpu_intr_pending & ~((1 << (ipl + 1)) - 1);
    606 	if (pending) {
    607 		int i;
    608 		ulong_t pending_sels = 0;
    609 		volatile ulong_t *selp;
    610 		struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
    611 
    612 		for (i = bsrw_insn(pending); i > ipl; i--)
    613 			pending_sels |= cpe->pending_sel[i];
    614 		ASSERT(pending_sels);
    615 		selp = (volatile ulong_t *)&vci->evtchn_pending_sel;
    616 		atomic_or_ulong(selp, pending_sels);
    617 		vci->evtchn_upcall_pending = 1;
    618 	}
    619 }
    620 
    621 /*
    622  * This function provides external interface to the nexus for all
    623  * functionality related to the new DDI interrupt framework.
    624  *
    625  * Input:
    626  * dip     - pointer to the dev_info structure of the requested device
    627  * hdlp    - pointer to the internal interrupt handle structure for the
    628  *	     requested interrupt
    629  * intr_op - opcode for this call
    630  * result  - pointer to the integer that will hold the result to be
    631  *	     passed back if return value is PSM_SUCCESS
    632  *
    633  * Output:
    634  * return value is either PSM_SUCCESS or PSM_FAILURE
    635  */
    636 int
    637 xen_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
    638     psm_intr_op_t intr_op, int *result)
    639 {
    640 	int		cap;
    641 	int		err;
    642 	int		new_priority;
    643 	apic_irq_t	*irqp;
    644 	struct intrspec *ispec;
    645 
    646 	DDI_INTR_IMPLDBG((CE_CONT, "xen_intr_ops: dip: %p hdlp: %p "
    647 	    "intr_op: %x\n", (void *)dip, (void *)hdlp, intr_op));
    648 
    649 	switch (intr_op) {
    650 	case PSM_INTR_OP_CHECK_MSI:
    651 		/*
    652 		 * Till PCI passthru is supported, only dom0 has MSI/MSIX
    653 		 */
    654 		if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
    655 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
    656 			    DDI_INTR_TYPE_MSIX);
    657 			break;
    658 		}
    659 		/*
    660 		 * Check MSI/X is supported or not at APIC level and
    661 		 * masked off the MSI/X bits in hdlp->ih_type if not
    662 		 * supported before return.  If MSI/X is supported,
    663 		 * leave the ih_type unchanged and return.
    664 		 *
    665 		 * hdlp->ih_type passed in from the nexus has all the
    666 		 * interrupt types supported by the device.
    667 		 */
    668 		if (xen_support_msi == 0) {
    669 			/*
    670 			 * if xen_support_msi is not set, call
    671 			 * apic_check_msi_support() to check whether msi
    672 			 * is supported first
    673 			 */
    674 			if (apic_check_msi_support() == PSM_SUCCESS)
    675 				xen_support_msi = 1;
    676 			else
    677 				xen_support_msi = -1;
    678 		}
    679 		if (xen_support_msi == 1)
    680 			*result = hdlp->ih_type;
    681 		else
    682 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
    683 			    DDI_INTR_TYPE_MSIX);
    684 		break;
    685 	case PSM_INTR_OP_ALLOC_VECTORS:
    686 		if (hdlp->ih_type == DDI_INTR_TYPE_MSI)
    687 			*result = apic_alloc_msi_vectors(dip, hdlp->ih_inum,
    688 			    hdlp->ih_scratch1, hdlp->ih_pri,
    689 			    (int)(uintptr_t)hdlp->ih_scratch2);
    690 		else
    691 			*result = apic_alloc_msix_vectors(dip, hdlp->ih_inum,
    692 			    hdlp->ih_scratch1, hdlp->ih_pri,
    693 			    (int)(uintptr_t)hdlp->ih_scratch2);
    694 		break;
    695 	case PSM_INTR_OP_FREE_VECTORS:
    696 		apic_free_vectors(dip, hdlp->ih_inum, hdlp->ih_scratch1,
    697 		    hdlp->ih_pri, hdlp->ih_type);
    698 		break;
    699 	case PSM_INTR_OP_NAVAIL_VECTORS:
    700 		/*
    701 		 * XXPV - maybe we should make this be:
    702 		 * min(APIC_VECTOR_PER_IPL, count of all avail vectors);
    703 		 */
    704 		if (DOMAIN_IS_INITDOMAIN(xen_info))
    705 			*result = APIC_VECTOR_PER_IPL;
    706 		else
    707 			*result = 1;
    708 		break;
    709 	case PSM_INTR_OP_XLATE_VECTOR:
    710 		ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
    711 		if (ispec->intrspec_vec >= PIRQ_BASE &&
    712 		    ispec->intrspec_vec < NR_PIRQS &&
    713 		    DOMAIN_IS_INITDOMAIN(xen_info)) {
    714 			*result = apic_introp_xlate(dip, ispec, hdlp->ih_type);
    715 		} else {
    716 			*result = ispec->intrspec_vec;
    717 		}
    718 		break;
    719 	case PSM_INTR_OP_GET_PENDING:
    720 		/* XXPV - is this enough for dom0 or do we need to ref ioapic */
    721 		*result = ec_pending_irq(hdlp->ih_vector);
    722 		break;
    723 	case PSM_INTR_OP_CLEAR_MASK:
    724 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
    725 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
    726 			return (PSM_FAILURE);
    727 		ec_enable_irq(hdlp->ih_vector);
    728 		break;
    729 	case PSM_INTR_OP_SET_MASK:
    730 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
    731 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
    732 			return (PSM_FAILURE);
    733 		ec_disable_irq(hdlp->ih_vector);
    734 		break;
    735 	case PSM_INTR_OP_GET_CAP:
    736 		cap = DDI_INTR_FLAG_PENDING | DDI_INTR_FLAG_EDGE;
    737 		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
    738 			cap |= DDI_INTR_FLAG_MASKABLE;
    739 		*result = cap;
    740 		break;
    741 	case PSM_INTR_OP_GET_SHARED:
    742 		if (DOMAIN_IS_INITDOMAIN(xen_info)) {
    743 			if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
    744 				return (PSM_FAILURE);
    745 			ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
    746 			if ((irqp = apic_find_irq(dip, ispec, hdlp->ih_type))
    747 			    == NULL)
    748 				return (PSM_FAILURE);
    749 			*result = (irqp->airq_share > 1) ? 1: 0;
    750 		} else {
    751 			return (PSM_FAILURE);
    752 		}
    753 		break;
    754 	case PSM_INTR_OP_SET_PRI:
    755 		new_priority = *(int *)result;
    756 		err = ec_set_irq_priority(hdlp->ih_vector, new_priority);
    757 		if (err != 0)
    758 			return (PSM_FAILURE);
    759 		break;
    760 	case PSM_INTR_OP_GET_INTR:
    761 		if (!DOMAIN_IS_INITDOMAIN(xen_info))
    762 			return (PSM_FAILURE);
    763 		/*
    764 		 * The interrupt handle given here has been allocated
    765 		 * specifically for this command, and ih_private carries
    766 		 * a pointer to a apic_get_intr_t.
    767 		 */
    768 		if (apic_get_vector_intr_info(
    769 		    hdlp->ih_vector, hdlp->ih_private) != PSM_SUCCESS)
    770 			return (PSM_FAILURE);
    771 		break;
    772 	case PSM_INTR_OP_SET_CAP:
    773 		/* FALLTHRU */
    774 	default:
    775 		return (PSM_FAILURE);
    776 	}
    777 	return (PSM_SUCCESS);
    778 }
    779 
    780 static void
    781 xen_psm_rebind_irq(int irq)
    782 {
    783 	cpuset_t ncpu;
    784 	processorid_t newcpu;
    785 	apic_irq_t *irqptr;
    786 
    787 	newcpu = xen_psm_bind_intr(irq);
    788 	if (newcpu == IRQ_UNBOUND) {
    789 		CPUSET_ZERO(ncpu);
    790 		CPUSET_OR(ncpu, xen_psm_cpus_online);
    791 	} else {
    792 		CPUSET_ONLY(ncpu, newcpu & ~IRQ_USER_BOUND);
    793 	}
    794 	ec_set_irq_affinity(irq, ncpu);
    795 	if (irq <= APIC_MAX_VECTOR) {
    796 		irqptr = apic_irq_table[irq];
    797 		ASSERT(irqptr != NULL);
    798 		irqptr->airq_temp_cpu = (uchar_t)newcpu;
    799 	}
    800 }
    801 
    802 /*
    803  * Disable all device interrupts for the given cpu.
    804  * High priority interrupts are not disabled and will still be serviced.
    805  */
    806 static int
    807 xen_psm_disable_intr(processorid_t cpun)
    808 {
    809 	int irq;
    810 
    811 	/*
    812 	 * Can't offline VCPU 0 on this hypervisor.  There's no reason
    813 	 * anyone would want to given that the CPUs are virtual. Also note
    814 	 * that the hypervisor requires suspend/resume to be on VCPU 0.
    815 	 */
    816 	if (cpun == 0)
    817 		return (PSM_FAILURE);
    818 
    819 	CPUSET_ATOMIC_DEL(xen_psm_cpus_online, cpun);
    820 	for (irq = 0; irq < NR_IRQS; irq++) {
    821 		if (!ec_irq_needs_rebind(irq, cpun))
    822 			continue;
    823 		xen_psm_rebind_irq(irq);
    824 	}
    825 	return (PSM_SUCCESS);
    826 }
    827 
    828 static void
    829 xen_psm_enable_intr(processorid_t cpun)
    830 {
    831 	int irq;
    832 
    833 	if (cpun == 0)
    834 		return;
    835 
    836 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, cpun);
    837 
    838 	/*
    839 	 * Rebalance device interrupts among online processors
    840 	 */
    841 	for (irq = 0; irq < NR_IRQS; irq++) {
    842 		if (!ec_irq_rebindable(irq))
    843 			continue;
    844 		xen_psm_rebind_irq(irq);
    845 	}
    846 
    847 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
    848 		apic_cpus[cpun].aci_status |= APIC_CPU_INTR_ENABLE;
    849 	}
    850 }
    851 
    852 static int
    853 xen_psm_post_cpu_start()
    854 {
    855 	processorid_t cpun;
    856 
    857 	cpun = psm_get_cpu_id();
    858 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
    859 		/*
    860 		 * Non-virtualized environments can call psm_post_cpu_start
    861 		 * from Suspend/Resume with the APIC_CPU_INTR_ENABLE bit set.
    862 		 * xen_psm_post_cpu_start() is only called from boot.
    863 		 */
    864 		apic_cpus[cpun].aci_status |= APIC_CPU_ONLINE;
    865 	}
    866 	return (PSM_SUCCESS);
    867 }
    868 
    869 /*
    870  * This function will reprogram the timer.
    871  *
    872  * When in oneshot mode the argument is the absolute time in future at which to
    873  * generate the interrupt.
    874  *
    875  * When in periodic mode, the argument is the interval at which the
    876  * interrupts should be generated. There is no need to support the periodic
    877  * mode timer change at this time.
    878  *
    879  * Note that we must be careful to convert from hrtime to Xen system time (see
    880  * xpv_timestamp.c).
    881  */
    882 static void
    883 xen_psm_timer_reprogram(hrtime_t timer_req)
    884 {
    885 	hrtime_t now, timer_new, time_delta, xen_time;
    886 	ulong_t flags;
    887 
    888 	flags = intr_clear();
    889 	/*
    890 	 * We should be called from high PIL context (CBE_HIGH_PIL),
    891 	 * so kpreempt is disabled.
    892 	 */
    893 
    894 	now = xpv_gethrtime();
    895 	xen_time = xpv_getsystime();
    896 	if (timer_req <= now) {
    897 		/*
    898 		 * requested to generate an interrupt in the past
    899 		 * generate an interrupt as soon as possible
    900 		 */
    901 		time_delta = XEN_NSEC_PER_TICK;
    902 	} else
    903 		time_delta = timer_req - now;
    904 
    905 	timer_new = xen_time + time_delta;
    906 	if (HYPERVISOR_set_timer_op(timer_new) != 0)
    907 		panic("can't set hypervisor timer?");
    908 	intr_restore(flags);
    909 }
    910 
    911 /*
    912  * This function will enable timer interrupts.
    913  */
    914 static void
    915 xen_psm_timer_enable(void)
    916 {
    917 	ec_unmask_irq(xen_clock_irq);
    918 }
    919 
    920 /*
    921  * This function will disable timer interrupts on the current cpu.
    922  */
    923 static void
    924 xen_psm_timer_disable(void)
    925 {
    926 	(void) ec_block_irq(xen_clock_irq);
    927 	/*
    928 	 * If the clock irq is pending on this cpu then we need to
    929 	 * clear the pending interrupt.
    930 	 */
    931 	ec_unpend_irq(xen_clock_irq);
    932 }
    933 
    934 /*
    935  *
    936  * The following functions are in the platform specific file so that they
    937  * can be different functions depending on whether we are running on
    938  * bare metal or a hypervisor.
    939  */
    940 
    941 /*
    942  * Allocate a free vector for irq at ipl.
    943  */
    944 /* ARGSUSED */
    945 uchar_t
    946 apic_allocate_vector(int ipl, int irq, int pri)
    947 {
    948 	physdev_irq_t irq_op;
    949 	uchar_t vector;
    950 	int rc;
    951 
    952 	irq_op.irq = irq;
    953 
    954 	if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
    955 	    != 0)
    956 		panic("Hypervisor alloc vector failed err: %d", -rc);
    957 	vector = irq_op.vector;
    958 	/*
    959 	 * No need to worry about vector colliding with our reserved vectors
    960 	 * e.g. T_FASTTRAP, xen can differentiate between hardware and software
    961 	 * generated traps and handle them properly.
    962 	 */
    963 	apic_vector_to_irq[vector] = (uchar_t)irq;
    964 	return (vector);
    965 }
    966 
/*
 * Mark vector as not being used by any irq.
 *
 * Only the local apic_vector_to_irq[] slot is reset to APIC_RESV_IRQ; no
 * hypervisor call is made here.
 */
void
apic_free_vector(uchar_t vector)
{
	apic_vector_to_irq[vector] = APIC_RESV_IRQ;
}
    973 
    974 /*
    975  * This function returns the no. of vectors available for the pri.
    976  * dip is not used at this moment.  If we really don't need that,
    977  * it will be removed.  Since priority is not limited by hardware
    978  * when running on the hypervisor we simply return the maximum no.
    979  * of available contiguous vectors.
    980  */
    981 /*ARGSUSED*/
    982 int
    983 apic_navail_vector(dev_info_t *dip, int pri)
    984 {
    985 	int	lowest, highest, i, navail, count;
    986 
    987 	DDI_INTR_IMPLDBG((CE_CONT, "apic_navail_vector: dip: %p, pri: %x\n",
    988 	    (void *)dip, pri));
    989 
    990 	highest = APIC_MAX_VECTOR;
    991 	lowest = APIC_BASE_VECT;
    992 	navail = count = 0;
    993 
    994 	/* It has to be contiguous */
    995 	for (i = lowest; i < highest; i++) {
    996 		count = 0;
    997 		while ((apic_vector_to_irq[i] == APIC_RESV_IRQ) &&
    998 		    (i < highest)) {
    999 			count++;
   1000 			i++;
   1001 		}
   1002 		if (count > navail)
   1003 			navail = count;
   1004 	}
   1005 	return (navail);
   1006 }
   1007 
/* bus/devfn pairs already handed to the hypervisor; allocated on first use */
static physdev_manage_pci_t *managed_devlist;
/* number of valid entries in managed_devlist */
static int mdev_cnt;
/* capacity of managed_devlist, in entries */
static int mdev_size = 128;
/* pirq recorded for each MSI/MSI-X vector when the vector is allocated */
static uchar_t	msi_vector_to_pirq[APIC_MAX_VECTOR+1];
   1012 
   1013 /*
   1014  * Add devfn on given bus to devices managed by hypervisor
   1015  */
   1016 static int
   1017 xen_manage_device(uint8_t bus, uint8_t devfn)
   1018 {
   1019 	physdev_manage_pci_t manage_pci, *newlist;
   1020 	int rc, i, oldsize;
   1021 
   1022 	/*
   1023 	 * Check if bus/devfn already managed.  If so just return success.
   1024 	 */
   1025 	if (managed_devlist == NULL) {
   1026 		managed_devlist = kmem_alloc(sizeof (physdev_manage_pci_t) *
   1027 		    mdev_size, KM_NOSLEEP);
   1028 		if (managed_devlist == NULL) {
   1029 			cmn_err(CE_WARN,
   1030 			    "Can't alloc space for managed device list");
   1031 			return (0);
   1032 		}
   1033 	};
   1034 	for (i = 0; i < mdev_cnt; i++) {
   1035 		if (managed_devlist[i].bus == bus &&
   1036 		    managed_devlist[i].devfn == devfn)
   1037 			return (1); /* device already managed */
   1038 	}
   1039 	manage_pci.bus = bus;
   1040 	manage_pci.devfn = devfn;
   1041 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, &manage_pci);
   1042 	if (rc < 0) {
   1043 		cmn_err(CE_WARN,
   1044 		    "hypervisor add pci device call failed bus:0x%x"
   1045 		    " devfn:0x%x", bus, devfn);
   1046 		return (0);
   1047 	}
   1048 	/*
   1049 	 * Add device to the managed device list
   1050 	 */
   1051 	if (i == mdev_size) {
   1052 		/*
   1053 		 * grow the managed device list
   1054 		 */
   1055 		oldsize = mdev_size * sizeof (physdev_manage_pci_t);
   1056 		mdev_size *= 2;
   1057 		newlist = kmem_alloc(sizeof (physdev_manage_pci_t) * mdev_size,
   1058 		    KM_NOSLEEP);
   1059 		if (newlist == NULL) {
   1060 			cmn_err(CE_WARN, "Can't grow managed device list");
   1061 			return (0);
   1062 		}
   1063 		bcopy(managed_devlist, newlist, oldsize);
   1064 		kmem_free(managed_devlist, oldsize);
   1065 		managed_devlist = newlist;
   1066 	}
   1067 	managed_devlist[i].bus = bus;
   1068 	managed_devlist[i].devfn = devfn;
   1069 	mdev_cnt++;
   1070 	return (1);
   1071 }
   1072 
   1073 /*
   1074  * allocate an apic irq struct for an MSI interrupt
   1075  */
   1076 static int
   1077 msi_allocate_irq(int irq)
   1078 {
   1079 	apic_irq_t *irqptr = apic_irq_table[irq];
   1080 
   1081 	if (irqptr == NULL) {
   1082 		irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_NOSLEEP);
   1083 		if (irqptr == NULL) {
   1084 			cmn_err(CE_WARN, "xpv_psm: NO memory to allocate IRQ");
   1085 			return (-1);
   1086 		}
   1087 		apic_irq_table[irq] = irqptr;
   1088 	} else {
   1089 		if (irq == APIC_RESV_IRQ && irqptr->airq_mps_intr_index == 0)
   1090 			irqptr->airq_mps_intr_index = FREE_INDEX;
   1091 		if (irqptr->airq_mps_intr_index != FREE_INDEX) {
   1092 			cmn_err(CE_WARN, "xpv_psm: MSI IRQ already in use");
   1093 			return (-1);
   1094 		}
   1095 	}
   1096 	irqptr->airq_mps_intr_index = FREE_INDEX;
   1097 	return (irq);
   1098 }
   1099 
   1100 /*
   1101  * read MSI/MSIX vector out of config space
   1102  */
   1103 static uchar_t
   1104 xpv_psm_get_msi_vector(dev_info_t *dip, int type, int entry)
   1105 {
   1106 	uint64_t		msi_data = 0;
   1107 	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip);
   1108 	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(dip);
   1109 	ushort_t		msi_ctrl;
   1110 	uchar_t			vector;
   1111 
   1112 	ASSERT((handle != NULL) && (cap_ptr != 0));
   1113 	if (type == DDI_INTR_TYPE_MSI) {
   1114 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
   1115 		/*
   1116 		 * Get vector
   1117 		 */
   1118 		if (msi_ctrl &  PCI_MSI_64BIT_MASK) {
   1119 			msi_data = pci_config_get16(handle,
   1120 			    cap_ptr + PCI_MSI_64BIT_DATA);
   1121 		} else {
   1122 			msi_data = pci_config_get16(handle,
   1123 			    cap_ptr + PCI_MSI_32BIT_DATA);
   1124 		}
   1125 	} else if (type == DDI_INTR_TYPE_MSIX) {
   1126 		uintptr_t	off;
   1127 		ddi_intr_msix_t	*msix_p = i_ddi_get_msix(dip);
   1128 
   1129 		/* Offset into the given entry in the MSI-X table */
   1130 		off = (uintptr_t)msix_p->msix_tbl_addr +
   1131 		    (entry  * PCI_MSIX_VECTOR_SIZE);
   1132 
   1133 		msi_data = ddi_get32(msix_p->msix_tbl_hdl,
   1134 		    (uint32_t *)(off + PCI_MSIX_DATA_OFFSET));
   1135 	}
   1136 	vector = msi_data & 0xff;
   1137 	return (vector);
   1138 }
   1139 
   1140 
   1141 static void
   1142 get_busdevfn(dev_info_t *dip, int *busp, int *devfnp)
   1143 {
   1144 	pci_regspec_t *regspec;
   1145 	int reglen;
   1146 
   1147 	/*
   1148 	 * Get device reg spec, first word has PCI bus and
   1149 	 * device/function info we need.
   1150 	 */
   1151 	if (ddi_getlongprop(DDI_DEV_T_NONE, dip, DDI_PROP_DONTPASS, "reg",
   1152 	    (caddr_t)&regspec, &reglen) != DDI_SUCCESS) {
   1153 		cmn_err(CE_WARN,
   1154 		    "get_busdevfn() failed to get regspec.");
   1155 		return;
   1156 	}
   1157 	/*
   1158 	 * get PCI bus # from reg spec for device
   1159 	 */
   1160 	*busp = PCI_REG_BUS_G(regspec[0].pci_phys_hi);
   1161 	/*
   1162 	 * get combined device/function from reg spec for device.
   1163 	 */
   1164 	*devfnp = (regspec[0].pci_phys_hi & (PCI_REG_FUNC_M | PCI_REG_DEV_M)) >>
   1165 	    PCI_REG_FUNC_SHIFT;
   1166 
   1167 	kmem_free(regspec, reglen);
   1168 }
   1169 
   1170 /*
   1171  * This function allocates "count" MSI vector(s) for the given "dip/pri/type"
   1172  */
   1173 int
   1174 apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
   1175     int behavior)
   1176 {
   1177 	int	rcount, i, rc, irqno;
   1178 	uchar_t	vector, cpu;
   1179 	major_t	major;
   1180 	apic_irq_t	*irqptr;
   1181 	physdev_map_pirq_t map_irq;
   1182 	int busnum, devfn;
   1183 
   1184 	DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: dip=0x%p "
   1185 	    "inum=0x%x  pri=0x%x count=0x%x behavior=%d\n",
   1186 	    (void *)dip, inum, pri, count, behavior));
   1187 
   1188 	if (count > 1) {
   1189 		if (behavior == DDI_INTR_ALLOC_STRICT &&
   1190 		    apic_multi_msi_enable == 0)
   1191 			return (0);
   1192 		if (apic_multi_msi_enable == 0)
   1193 			count = 1;
   1194 	}
   1195 
   1196 	if ((rcount = apic_navail_vector(dip, pri)) > count)
   1197 		rcount = count;
   1198 	else if (rcount == 0 || (rcount < count &&
   1199 	    behavior == DDI_INTR_ALLOC_STRICT))
   1200 		return (0);
   1201 
   1202 	/* if not ISP2, then round it down */
   1203 	if (!ISP2(rcount))
   1204 		rcount = 1 << (highbit(rcount) - 1);
   1205 
   1206 	/*
   1207 	 * get PCI bus #  and devfn from reg spec for device
   1208 	 */
   1209 	get_busdevfn(dip, &busnum, &devfn);
   1210 
   1211 	/*
   1212 	 * Tell xen about this pci device
   1213 	 */
   1214 	if (!xen_manage_device(busnum, devfn))
   1215 		return (0);
   1216 
   1217 	mutex_enter(&airq_mutex);
   1218 
   1219 	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
   1220 	for (i = 0; i < rcount; i++) {
   1221 		/*
   1222 		 * use PHYSDEVOP_map_pirq to have xen map MSI to a pirq
   1223 		 */
   1224 		map_irq.domid = DOMID_SELF;
   1225 		map_irq.type = MAP_PIRQ_TYPE_MSI;
   1226 		map_irq.index = -1; /* hypervisor auto allocates vector */
   1227 		map_irq.pirq = -1;
   1228 		map_irq.bus = busnum;
   1229 		map_irq.devfn = devfn;
   1230 		map_irq.entry_nr = 0;
   1231 		map_irq.table_base = 0;
   1232 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
   1233 		irqno = map_irq.pirq;
   1234 		if (rc < 0) {
   1235 			mutex_exit(&airq_mutex);
   1236 			cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
   1237 			return (0);
   1238 		}
   1239 		if (irqno < 0) {
   1240 			mutex_exit(&airq_mutex);
   1241 			cmn_err(CE_NOTE,
   1242 			    "!hypervisor not configured for MSI support");
   1243 			xen_support_msi = -1;
   1244 			return (0);
   1245 		}
   1246 		if (msi_allocate_irq(irqno) < 0) {
   1247 			mutex_exit(&airq_mutex);
   1248 			return (0);
   1249 		}
   1250 		/*
   1251 		 * Find out what vector the hypervisor assigned
   1252 		 */
   1253 		vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSI, 0);
   1254 		apic_max_device_irq = max(irqno, apic_max_device_irq);
   1255 		apic_min_device_irq = min(irqno, apic_min_device_irq);
   1256 		irqptr = apic_irq_table[irqno];
   1257 		ASSERT(irqptr != NULL);
   1258 #ifdef	DEBUG
   1259 		if (apic_vector_to_irq[vector] != APIC_RESV_IRQ)
   1260 			DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: "
   1261 			    "apic_vector_to_irq is not APIC_RESV_IRQ\n"));
   1262 #endif
   1263 		apic_vector_to_irq[vector] = (uchar_t)irqno;
   1264 		msi_vector_to_pirq[vector] = (uchar_t)irqno;
   1265 
   1266 		irqptr->airq_vector = vector;
   1267 		irqptr->airq_ioapicindex = (uchar_t)inum;	/* start */
   1268 		irqptr->airq_intin_no = (uchar_t)rcount;
   1269 		irqptr->airq_ipl = pri;
   1270 		irqptr->airq_origirq = (uchar_t)(inum + i);
   1271 		irqptr->airq_share_id = 0;
   1272 		irqptr->airq_mps_intr_index = MSI_INDEX;
   1273 		irqptr->airq_dip = dip;
   1274 		irqptr->airq_major = major;
   1275 		if (i == 0) /* they all bind to the same cpu */
   1276 			cpu = irqptr->airq_cpu = xen_psm_bind_intr(irqno);
   1277 		else
   1278 			irqptr->airq_cpu = cpu;
   1279 		DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: irq=0x%x "
   1280 		    "dip=0x%p vector=0x%x origirq=0x%x pri=0x%x\n", irqno,
   1281 		    (void *)irqptr->airq_dip, irqptr->airq_vector,
   1282 		    irqptr->airq_origirq, pri));
   1283 	}
   1284 	mutex_exit(&airq_mutex);
   1285 	return (rcount);
   1286 }
   1287 
   1288 /*
   1289  * This function allocates "count" MSI-X vector(s) for the given "dip/pri/type"
   1290  */
   1291 int
   1292 apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
   1293     int behavior)
   1294 {
   1295 	int	rcount, i, rc;
   1296 	major_t	major;
   1297 	physdev_map_pirq_t map_irq;
   1298 	int busnum, devfn;
   1299 	ddi_intr_msix_t *msix_p = i_ddi_get_msix(dip);
   1300 	uint64_t table_base;
   1301 	pfn_t pfnum;
   1302 
   1303 	if (msix_p == NULL) {
   1304 		msix_p = pci_msix_init(dip);
   1305 		if (msix_p != NULL) {
   1306 			i_ddi_set_msix(dip, msix_p);
   1307 		} else {
   1308 			cmn_err(CE_WARN, "apic_alloc_msix_vectors()"
   1309 			    " msix_init failed");
   1310 			return (0);
   1311 		}
   1312 	}
   1313 	/*
   1314 	 * Hypervisor wants PCI config space address of msix table base
   1315 	 */
   1316 	pfnum = hat_getpfnum(kas.a_hat, (caddr_t)msix_p->msix_tbl_addr) &
   1317 	    ~PFN_IS_FOREIGN_MFN;
   1318 	table_base = (uint64_t)((pfnum << PAGESHIFT) - msix_p->msix_tbl_offset |
   1319 	    ((uintptr_t)msix_p->msix_tbl_addr & PAGEOFFSET));
   1320 	/*
   1321 	 * get PCI bus #  and devfn from reg spec for device
   1322 	 */
   1323 	get_busdevfn(dip, &busnum, &devfn);
   1324 
   1325 	/*
   1326 	 * Tell xen about this pci device
   1327 	 */
   1328 	if (!xen_manage_device(busnum, devfn))
   1329 		return (0);
   1330 	mutex_enter(&airq_mutex);
   1331 
   1332 	if ((rcount = apic_navail_vector(dip, pri)) > count)
   1333 		rcount = count;
   1334 	else if (rcount == 0 || (rcount < count &&
   1335 	    behavior == DDI_INTR_ALLOC_STRICT)) {
   1336 		rcount = 0;
   1337 		goto out;
   1338 	}
   1339 
   1340 	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
   1341 	for (i = 0; i < rcount; i++) {
   1342 		int irqno;
   1343 		uchar_t	vector;
   1344 		apic_irq_t	*irqptr;
   1345 
   1346 		/*
   1347 		 * use PHYSDEVOP_map_pirq to have xen map MSI-X to a pirq
   1348 		 */
   1349 		map_irq.domid = DOMID_SELF;
   1350 		map_irq.type = MAP_PIRQ_TYPE_MSI;
   1351 		map_irq.index = -1; /* hypervisor auto allocates vector */
   1352 		map_irq.pirq = -1;
   1353 		map_irq.bus = busnum;
   1354 		map_irq.devfn = devfn;
   1355 		map_irq.entry_nr = i;
   1356 		map_irq.table_base = table_base;
   1357 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
   1358 		irqno = map_irq.pirq;
   1359 		if (rc < 0) {
   1360 			mutex_exit(&airq_mutex);
   1361 			cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
   1362 			return (0);
   1363 		}
   1364 		if (irqno < 0) {
   1365 			mutex_exit(&airq_mutex);
   1366 			cmn_err(CE_NOTE,
   1367 			    "!hypervisor not configured for MSI support");
   1368 			xen_support_msi = -1;
   1369 			return (0);
   1370 		}
   1371 		/*
   1372 		 * Find out what vector the hypervisor assigned
   1373 		 */
   1374 		vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSIX, i);
   1375 		if (msi_allocate_irq(irqno) < 0) {
   1376 			mutex_exit(&airq_mutex);
   1377 			return (0);
   1378 		}
   1379 		apic_vector_to_irq[vector] = (uchar_t)irqno;
   1380 		msi_vector_to_pirq[vector] = (uchar_t)irqno;
   1381 		apic_max_device_irq = max(irqno, apic_max_device_irq);
   1382 		apic_min_device_irq = min(irqno, apic_min_device_irq);
   1383 		irqptr = apic_irq_table[irqno];
   1384 		ASSERT(irqptr != NULL);
   1385 		irqptr->airq_vector = (uchar_t)vector;
   1386 		irqptr->airq_ipl = pri;
   1387 		irqptr->airq_origirq = (uchar_t)(inum + i);
   1388 		irqptr->airq_share_id = 0;
   1389 		irqptr->airq_mps_intr_index = MSIX_INDEX;
   1390 		irqptr->airq_dip = dip;
   1391 		irqptr->airq_major = major;
   1392 		irqptr->airq_cpu = IRQ_UNBOUND; /* will be bound when addspl */
   1393 	}
   1394 out:
   1395 	mutex_exit(&airq_mutex);
   1396 	return (rcount);
   1397 }
   1398 
   1399 
   1400 /*
   1401  * This finds the apic_irq_t associated with the dip, ispec and type.
   1402  * The entry should have already been freed, but it can not have been
   1403  * reused yet since the hypervisor can not have reassigned the pirq since
   1404  * we have not freed that yet.
   1405  */
   1406 static apic_irq_t *
   1407 msi_find_irq(dev_info_t *dip, struct intrspec *ispec)
   1408 {
   1409 	apic_irq_t	*irqp;
   1410 	int i;
   1411 
   1412 	for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
   1413 		if ((irqp = apic_irq_table[i]) == NULL)
   1414 			continue;
   1415 		if ((irqp->airq_dip == dip) &&
   1416 		    (irqp->airq_origirq == ispec->intrspec_vec) &&
   1417 		    (irqp->airq_ipl == ispec->intrspec_pri)) {
   1418 			return (irqp);
   1419 		}
   1420 	}
   1421 	return (NULL);
   1422 }
   1423 
   1424 void
   1425 apic_free_vectors(dev_info_t *dip, int inum, int count, int pri, int type)
   1426 {
   1427 	int i, rc;
   1428 	physdev_unmap_pirq_t unmap_pirq;
   1429 	apic_irq_t *irqptr;
   1430 	struct intrspec ispec;
   1431 
   1432 	DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: dip: %p inum: %x "
   1433 	    "count: %x pri: %x type: %x\n",
   1434 	    (void *)dip, inum, count, pri, type));
   1435 
   1436 	/* for MSI/X only */
   1437 	if (!DDI_INTR_IS_MSI_OR_MSIX(type))
   1438 		return;
   1439 
   1440 	for (i = 0; i < count; i++) {
   1441 		DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: inum=0x%x "
   1442 		    "pri=0x%x count=0x%x\n", inum, pri, count));
   1443 		ispec.intrspec_vec = inum + i;
   1444 		ispec.intrspec_pri = pri;
   1445 		if ((irqptr = msi_find_irq(dip, &ispec)) == NULL) {
   1446 			cmn_err(CE_WARN,
   1447 			    "couldn't find irq %s,%s dip: 0x%p vec: %x pri: %x",
   1448 			    ddi_get_name(dip), ddi_get_name_addr(dip),
   1449 			    (void *)dip, inum + i, pri);
   1450 			continue;
   1451 		}
   1452 		/*
   1453 		 * use PHYSDEVOP_unmap_pirq to have xen unmap MSI from a pirq
   1454 		 */
   1455 		unmap_pirq.domid = DOMID_SELF;
   1456 		unmap_pirq.pirq = msi_vector_to_pirq[irqptr->airq_vector];
   1457 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_pirq);
   1458 		if (rc < 0) {
   1459 			cmn_err(CE_WARN, "unmap pirq failed");
   1460 			return;
   1461 		}
   1462 		irqptr->airq_mps_intr_index = FREE_INDEX;
   1463 		apic_vector_to_irq[irqptr->airq_vector] = APIC_RESV_IRQ;
   1464 	}
   1465 }
   1466 
   1467 /*
   1468  * The hypervisor doesn't permit access to local apics directly
   1469  */
   1470 /* ARGSUSED */
   1471 uint32_t *
   1472 mapin_apic(uint32_t addr, size_t len, int flags)
   1473 {
   1474 	/*
   1475 	 * Return a pointer to a memory area to fake out the
   1476 	 * probe code that wants to read apic registers.
   1477 	 * The dummy values will end up being ignored by xen
   1478 	 * later on when they are used anyway.
   1479 	 */
   1480 	xen_psm_dummy_apic[APIC_VERS_REG] = APIC_INTEGRATED_VERS;
   1481 	return (xen_psm_dummy_apic);
   1482 }
   1483 
/*
 * Stand-in for mapping an I/O apic.  No mapping is made and all arguments
 * are ignored; the value returned is a non-NULL placeholder that must never
 * be dereferenced.
 */
/* ARGSUSED */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	/*
	 * Return non-null here to fake out configure code that calls this.
	 * The i86xpv platform will not reference through the returned value.
	 */
	return ((uint32_t *)0x1);
}
   1494 
/*
 * Counterpart of mapin_apic().  mapin_apic() creates no mapping, so there
 * is nothing to undo; both arguments are ignored.
 */
/* ARGSUSED */
void
mapout_apic(caddr_t addr, size_t len)
{
}
   1500 
/*
 * Counterpart of mapin_ioapic().  mapin_ioapic() creates no mapping, so
 * there is nothing to undo; both arguments are ignored.
 */
/* ARGSUSED */
void
mapout_ioapic(caddr_t addr, size_t len)
{
}
   1506 
   1507 uint32_t
   1508 ioapic_read(int apic_ix, uint32_t reg)
   1509 {
   1510 	physdev_apic_t apic;
   1511 
   1512 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
   1513 	apic.reg = reg;
   1514 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic))
   1515 		panic("read ioapic %d reg %d failed", apic_ix, reg);
   1516 	return (apic.value);
   1517 }
   1518 
   1519 void
   1520 ioapic_write(int apic_ix, uint32_t reg, uint32_t value)
   1521 {
   1522 	physdev_apic_t apic;
   1523 
   1524 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
   1525 	apic.reg = reg;
   1526 	apic.value = value;
   1527 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
   1528 		panic("write ioapic %d reg %d failed", apic_ix, reg);
   1529 }
   1530 
   1531 /*
   1532  * This function was added as part of x2APIC support in pcplusmp.
   1533  */
   1534 void
   1535 ioapic_write_eoi(int apic_ix, uint32_t value)
   1536 {
   1537 	physdev_apic_t apic;
   1538 
   1539 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
   1540 	apic.reg = APIC_IO_EOI;
   1541 	apic.value = value;
   1542 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
   1543 		panic("write ioapic reg : APIC_IO_EOI %d failed", apic_ix);
   1544 }
   1545 
   1546 /*
   1547  * This function was added as part of x2APIC support in pcplusmp to resolve
   1548  * undefined symbol in xpv_psm.
   1549  */
   1550 void
   1551 x2apic_update_psm()
   1552 {
   1553 }
   1554 
/*
 * This function was added as part of x2APIC support in pcplusmp to resolve
 * undefined symbol in xpv_psm.
 *
 * It intentionally does nothing.
 * NOTE(review): the empty parameter list is left as-is in case the symbol
 * is used elsewhere as a generic no-op through differently-typed function
 * pointers - confirm before converting it to (void).
 */
void
apic_ret()
{
}
   1563 
/*
 * Call rebind to do the actual programming.
 *
 * p is interpreted according to deferred: when deferred is set it is a
 * struct ioapic_reprogram_data * whose irqp member supplies the irq entry,
 * otherwise it is the apic_irq_t * itself.
 *
 * Returns the result of the last apic_rebind() call; 0 is treated as
 * success, in which case the irq has also been bound to an event channel.
 */
int
apic_setup_io_intr(void *p, int irq, boolean_t deferred)
{
	apic_irq_t *irqptr;
	struct ioapic_reprogram_data *drep = NULL;
	int rv, cpu;
	cpuset_t cpus;

	if (deferred) {
		drep = (struct ioapic_reprogram_data *)p;
		ASSERT(drep != NULL);
		irqptr = drep->irqp;
	} else {
		irqptr = (apic_irq_t *)p;
	}
	ASSERT(irqptr != NULL);
	/*
	 * Set cpu based on xen idea of online cpu's not apic tables.
	 * Note that xen ignores/sets to its own preferred value the
	 * target cpu field when programming ioapic anyway.
	 */
	if (irqptr->airq_mps_intr_index == MSI_INDEX)
		cpu = irqptr->airq_cpu; /* MSI cpus are already set */
	else {
		cpu = xen_psm_bind_intr(irq);
		irqptr->airq_cpu = cpu;
	}
	/*
	 * Build the cpu set handed to ec_setup_pirq() below: every online
	 * cpu for an unbound interrupt, otherwise just the chosen cpu with
	 * the IRQ_USER_BOUND flag stripped.
	 */
	if (cpu == IRQ_UNBOUND) {
		CPUSET_ZERO(cpus);
		CPUSET_OR(cpus, xen_psm_cpus_online);
	} else {
		CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
	}
	rv = apic_rebind(irqptr, cpu, drep);
	if (rv) {
		/* CPU is not up or interrupt is disabled. Fall back to 0 */
		cpu = 0;
		irqptr->airq_cpu = cpu;
		rv = apic_rebind(irqptr, cpu, drep);
		/* note: cpus is not recomputed for the fallback */
	}
	/*
	 * If rebind successful bind the irq to an event channel
	 */
	if (rv == 0) {
		ec_setup_pirq(irq, irqptr->airq_ipl, &cpus);
		/* record a cpu taken from the set as the temporary binding */
		CPUSET_FIND(cpus, cpu);
		apic_irq_table[irq]->airq_temp_cpu = cpu & ~IRQ_USER_BOUND;
	}
	return (rv);
}
   1617 
/*
 * Allocate a new vector for the given irq
 *
 * The vector passed in is ignored; a fresh one is obtained through
 * apic_allocate_vector() (with ipl and pri both 0) and returned.
 */
/* ARGSUSED */
uchar_t
apic_modify_vector(uchar_t vector, int irq)
{
	return (apic_allocate_vector(0, irq, 0));
}
   1627 
   1628 /*
   1629  * The rest of the file is just generic psm module boilerplate
   1630  */
   1631 
/*
 * Operations vector for this PSM; it is referenced from xen_psm_info below.
 * Each slot is labelled with the psm_ops member it fills.  Slots holding a
 * cast NULL are left unset by this module.
 */
static struct psm_ops xen_psm_ops = {
	xen_psm_probe,				/* psm_probe		*/

	xen_psm_softinit,			/* psm_init		*/
	xen_psm_picinit,			/* psm_picinit		*/
	xen_psm_intr_enter,			/* psm_intr_enter	*/
	xen_psm_intr_exit,			/* psm_intr_exit	*/
	xen_psm_setspl,				/* psm_setspl		*/
	xen_psm_addspl,				/* psm_addspl		*/
	xen_psm_delspl,				/* psm_delspl		*/
	xen_psm_disable_intr,			/* psm_disable_intr	*/
	xen_psm_enable_intr,			/* psm_enable_intr	*/
	(int (*)(int))NULL,			/* psm_softlvl_to_irq	*/
	(void (*)(int))NULL,			/* psm_set_softintr	*/
	(void (*)(processorid_t))NULL,		/* psm_set_idlecpu	*/
	(void (*)(processorid_t))NULL,		/* psm_unset_idlecpu	*/

	xen_psm_clkinit,			/* psm_clkinit		*/
	xen_psm_get_clockirq,			/* psm_get_clockirq	*/
	xen_psm_hrtimeinit,			/* psm_hrtimeinit	*/
	xpv_gethrtime,				/* psm_gethrtime	*/

	xen_psm_get_next_processorid,		/* psm_get_next_processorid */
	xen_psm_cpu_start,			/* psm_cpu_start	*/
	xen_psm_post_cpu_start,			/* psm_post_cpu_start	*/
	xen_psm_shutdown,			/* psm_shutdown		*/
	xen_psm_get_ipivect,			/* psm_get_ipivect	*/
	xen_psm_send_ipi,			/* psm_send_ipi		*/

	xen_psm_translate_irq,			/* psm_translate_irq	*/

	(void (*)(int, char *))NULL,		/* psm_notify_error	*/
	(void (*)(int msg))NULL,		/* psm_notify_func	*/
	xen_psm_timer_reprogram,		/* psm_timer_reprogram	*/
	xen_psm_timer_enable,			/* psm_timer_enable	*/
	xen_psm_timer_disable,			/* psm_timer_disable	*/
	(void (*)(void *arg))NULL,		/* psm_post_cyclic_setup */
	(void (*)(int, int))NULL,		/* psm_preshutdown	*/
	xen_intr_ops,			/* Advanced DDI Interrupt framework */
	(int (*)(psm_state_request_t *))NULL	/* psm_state		*/
};
   1673 
/*
 * Module description passed to psm_mod_init(), psm_mod_fini() and
 * psm_mod_info() by the entry points below.
 * NOTE(review): the version given here is PSM_INFO_VER01_5 although the
 * file defines PSMI_1_6 - confirm that this pairing is intended.
 */
static struct psm_info xen_psm_info = {
	PSM_INFO_VER01_5,	/* version				*/
	PSM_OWN_EXCLUSIVE,	/* ownership				*/
	&xen_psm_ops,		/* operation				*/
	"xVM_psm",		/* machine name				*/
	"platform module"	/* machine descriptions			*/
};
   1681 
/* Handle shared by _init(), _fini() and _info(); filled in via psm_mod_init() */
static void *xen_psm_hdlp;

/*
 * Module load entry point: hands xen_psm_info to psm_mod_init() and returns
 * its result.
 */
int
_init(void)
{
	return (psm_mod_init(&xen_psm_hdlp, &xen_psm_info));
}
   1689 
/*
 * Module unload entry point: defers to psm_mod_fini() and returns its
 * result.
 */
int
_fini(void)
{
	return (psm_mod_fini(&xen_psm_hdlp, &xen_psm_info));
}
   1695 
/*
 * Module information entry point: defers to psm_mod_info(), which is given
 * modinfop, and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (psm_mod_info(&xen_psm_hdlp, &xen_psm_info, modinfop));
}
   1701