Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/cpuvar.h>
     28 #include <sys/cpu_event.h>
     29 #include <sys/regset.h>
     30 #include <sys/psw.h>
     31 #include <sys/types.h>
     32 #include <sys/thread.h>
     33 #include <sys/systm.h>
     34 #include <sys/segments.h>
     35 #include <sys/pcb.h>
     36 #include <sys/trap.h>
     37 #include <sys/ftrace.h>
     38 #include <sys/traptrace.h>
     39 #include <sys/clock.h>
     40 #include <sys/panic.h>
     41 #include <sys/disp.h>
     42 #include <vm/seg_kp.h>
     43 #include <sys/stack.h>
     44 #include <sys/sysmacros.h>
     45 #include <sys/cmn_err.h>
     46 #include <sys/kstat.h>
     47 #include <sys/smp_impldefs.h>
     48 #include <sys/pool_pset.h>
     49 #include <sys/zone.h>
     50 #include <sys/bitmap.h>
     51 #include <sys/archsystm.h>
     52 #include <sys/machsystm.h>
     53 #include <sys/ontrap.h>
     54 #include <sys/x86_archext.h>
     55 #include <sys/promif.h>
     56 #include <vm/hat_i86.h>
     57 #if defined(__xpv)
     58 #include <sys/hypervisor.h>
     59 #endif
     60 
     61 
#if defined(__xpv) && defined(DEBUG)

/*
 * This panic message is intended as an aid to interrupt debugging.
 *
 * The associated assertion tests the condition of enabling
 * events when events are already enabled.  The implication
 * is that whatever code the programmer thought was protected
 * by having events disabled until the second enable happened
 * really wasn't protected at all.
 *
 * These variables are only defined when both __xpv and DEBUG are defined.
 */

int stistipanic = 1;	/* controls the debug panic check */
const char *stistimsg = "stisti";
/*
 * NOTE(review): presumably the counterpart of lastcli[] below, i.e. the
 * last place events were enabled on each cpu -- confirm against the code
 * that writes it.
 */
ulong_t laststi[NCPU];

/*
 * This variable tracks the last place events were disabled on each cpu.
 * It assists in debugging when assertions that interrupts are enabled trip.
 */
ulong_t lastcli[NCPU];

#endif
     85 
     86 /*
     87  * Set cpu's base SPL level to the highest active interrupt level
     88  */
     89 void
     90 set_base_spl(void)
     91 {
     92 	struct cpu *cpu = CPU;
     93 	uint16_t active = (uint16_t)cpu->cpu_intr_actv;
     94 
     95 	cpu->cpu_base_spl = active == 0 ? 0 : bsrw_insn(active);
     96 }
     97 
/*
 * Do all the work necessary to set up the cpu and thread structures
 * to dispatch a high-level interrupt.
 *
 * cpu    - the cpu taking the interrupt
 * pil    - priority level of the new interrupt (must be > LOCK_LEVEL)
 * oldpil - priority level that was in effect when the interrupt arrived
 * rp     - saved registers of the interrupted context
 *
 * Returns 0 if we're -not- already on the high-level interrupt stack,
 * (and *must* switch to it), non-zero if we are already on that stack.
 * The value is derived from the high-level bits of cpu_intr_actv as they
 * were just before this interrupt's own bit was set.
 *
 * Called with interrupts masked.
 * The 'pil' is already set to the appropriate level for rp->r_trapno.
 */
static int
hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, struct regs *rp)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	uint_t mask;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	ASSERT(pil > LOCK_LEVEL);

	/*
	 * At CBE_HIGH_PIL, record the interrupted pil and pc for the
	 * profiling fields.  Exactly one of the kernel/user pc fields is
	 * set to the interrupted pc, the other is zeroed, depending on
	 * whether the interrupted context was in user mode.
	 */
	if (pil == CBE_HIGH_PIL) {
		cpu->cpu_profile_pil = oldpil;
		if (USERMODE(rp->r_cs)) {
			cpu->cpu_profile_pc = 0;
			cpu->cpu_profile_upc = rp->r_pc;
			cpu->cpu_cpcprofile_pc = 0;
			cpu->cpu_cpcprofile_upc = rp->r_pc;
		} else {
			cpu->cpu_profile_pc = rp->r_pc;
			cpu->cpu_profile_upc = 0;
			cpu->cpu_cpcprofile_pc = rp->r_pc;
			cpu->cpu_cpcprofile_upc = 0;
		}
	}

	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
	if (mask != 0) {
		int nestpil;

		/*
		 * We have interrupted another high-level interrupt.
		 * Load starting timestamp, compute interval, update
		 * cumulative counter.
		 */
		nestpil = bsrw_insn((uint16_t)mask);
		ASSERT(nestpil < pil);
		/* pil_high_start[] is indexed from the first high level. */
		intrtime = now -
		    mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)];
		mcpu->intrstat[nestpil][0] += intrtime;
		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
		/*
		 * Another high-level interrupt is active below this one, so
		 * there is no need to check for an interrupt thread.  That
		 * will be done by the lowest priority high-level interrupt
		 * active.
		 */
	} else {
		kthread_t *t = cpu->cpu_thread;

		/*
		 * See if we are interrupting a low-level interrupt thread.
		 * If so, account for its time slice only if its time stamp
		 * is non-zero.
		 */
		if ((t->t_flag & T_INTR_THREAD) != 0 && t->t_intr_start != 0) {
			intrtime = now - t->t_intr_start;
			mcpu->intrstat[t->t_pil][0] += intrtime;
			cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
			/* Timing stops until the epilog restarts it. */
			t->t_intr_start = 0;
		}
	}

	/*
	 * Store starting timestamp in CPU structure for this PIL.
	 */
	mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] = now;

	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);

	if (pil == 15) {
		/*
		 * To support reentrant level 15 interrupts, we maintain a
		 * recursion count in the top half of cpu_intr_actv.  Only
		 * when this count hits zero do we clear the PIL 15 bit from
		 * the lower half of cpu_intr_actv.
		 *
		 * NOTE(review): the "+ 1" addresses the second uint16_t of
		 * cpu_intr_actv; this is the top half only if the field is
		 * 32 bits wide and stored little-endian -- confirm.
		 */
		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;
		(*refcntp)++;
	}

	/* Snapshot the active set before marking this pil active. */
	mask = cpu->cpu_intr_actv;

	cpu->cpu_intr_actv |= (1 << pil);

	return (mask & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
}
    194 
/*
 * Does most of the work of returning from a high level interrupt.
 *
 * cpu    - the cpu that handled the interrupt
 * pil    - priority level of the interrupt that just finished
 * oldpil - priority level to restore
 * vecnum - interrupt vector, passed through to (*setlvlx)()
 *
 * Returns 0 if there are no more high level interrupts (in which
 * case we must switch back to the interrupted thread stack) or
 * non-zero if there are more (in which case we should stay on it).
 *
 * Called with interrupts masked
 */
static int
hilevel_intr_epilog(struct cpu *cpu, uint_t pil, uint_t oldpil, uint_t vecnum)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	uint_t mask;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	ASSERT(mcpu->mcpu_pri == pil);

	/* Per-level interrupt counter; slot 0 corresponds to level 1. */
	cpu->cpu_stats.sys.intr[pil - 1]++;

	ASSERT(cpu->cpu_intr_actv & (1 << pil));

	if (pil == 15) {
		/*
		 * To support reentrant level 15 interrupts, we maintain a
		 * recursion count in the top half of cpu_intr_actv.  Only
		 * when this count hits zero do we clear the PIL 15 bit from
		 * the lower half of cpu_intr_actv.
		 */
		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;

		ASSERT(*refcntp > 0);

		if (--(*refcntp) == 0)
			cpu->cpu_intr_actv &= ~(1 << pil);
	} else {
		cpu->cpu_intr_actv &= ~(1 << pil);
	}

	ASSERT(mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] != 0);

	/* Charge the time since this pil's start stamp to this pil. */
	intrtime = now - mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)];
	mcpu->intrstat[pil][0] += intrtime;
	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;

	/*
	 * Check for lower-pil nested high-level interrupt beneath
	 * current one.  If so, place a starting timestamp in its
	 * pil_high_start entry.
	 */
	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
	if (mask != 0) {
		int nestpil;

		/*
		 * find PIL of nested interrupt
		 */
		nestpil = bsrw_insn((uint16_t)mask);
		ASSERT(nestpil < pil);
		mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)] = now;
		/*
		 * (Another high-level interrupt is active below this one,
		 * so there is no need to check for an interrupt
		 * thread.  That will be done by the lowest priority
		 * high-level interrupt active.)
		 */
	} else {
		/*
		 * Check to see if there is a low-level interrupt active.
		 * If so, place a starting timestamp in the thread
		 * structure.
		 */
		kthread_t *t = cpu->cpu_thread;

		if (t->t_flag & T_INTR_THREAD)
			t->t_intr_start = now;
	}

	/* Restore the previous priority level. */
	mcpu->mcpu_pri = oldpil;
	(void) (*setlvlx)(oldpil, vecnum);

	return (cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
}
    279 
/*
 * Set up the cpu, thread and interrupt thread structures for
 * executing an interrupt thread.  The new stack pointer of the
 * interrupt thread (which *must* be switched to) is returned.
 *
 * cpu      - the cpu taking the interrupt
 * stackptr - stack pointer of the interrupted thread; saved in its t_sp
 * pil      - priority level of the interrupt (must be > 0)
 *
 * On return the interrupt thread taken from cpu->cpu_intr_thread is the
 * cpu's current thread and the interrupted thread is linked beneath it
 * through t_intr.
 */
static caddr_t
intr_thread_prolog(struct cpu *cpu, caddr_t stackptr, uint_t pil)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	kthread_t *t, *volatile it;
	hrtime_t now = tsc_read();

	ASSERT(pil > 0);
	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
	cpu->cpu_intr_actv |= (1 << pil);

	/*
	 * Get set to run an interrupt thread.
	 * There should always be an interrupt thread, since we
	 * allocate one for each level on each CPU.
	 *
	 * t_intr_start could be zero due to cpu_intr_swtch_enter.
	 */
	t = cpu->cpu_thread;
	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
		/*
		 * The interrupted thread is itself an interrupt thread:
		 * charge its elapsed time to its own pil and stop its clock.
		 */
		hrtime_t intrtime = now - t->t_intr_start;
		mcpu->intrstat[t->t_pil][0] += intrtime;
		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
		t->t_intr_start = 0;
	}

	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);

	t->t_sp = (uintptr_t)stackptr;	/* mark stack in curthread for resume */

	/*
	 * unlink the interrupt thread off the cpu
	 *
	 * Note that the code in kcpc_overflow_intr -relies- on the
	 * ordering of events here - in particular that t->t_lwp of
	 * the interrupt thread is set to the pinned thread *before*
	 * curthread is changed.
	 */
	it = cpu->cpu_intr_thread;
	cpu->cpu_intr_thread = it->t_link;
	it->t_intr = t;
	it->t_lwp = t->t_lwp;

	/*
	 * (threads on the interrupt thread free list could have state
	 * preset to TS_ONPROC, but it helps in debugging if
	 * they're TS_FREE.)
	 */
	it->t_state = TS_ONPROC;

	cpu->cpu_thread = it;		/* new curthread on this cpu */
	it->t_pil = (uchar_t)pil;
	it->t_pri = intr_pri + (pri_t)pil;
	it->t_intr_start = now;		/* start the new thread's clock */

	return (it->t_stk);
}
    342 
    343 
#ifdef DEBUG
/*
 * DEBUG-only counter, incremented by intr_thread_epilog() each time it
 * finds that the interrupt thread no longer has a pinned thread beneath it.
 */
int intr_thread_cnt;
#endif
    347 
/*
 * Finish a low-level (interrupt thread) interrupt: account for the time
 * the interrupt thread ran, clear its pil from cpu_intr_actv, return the
 * interrupt thread to the cpu's free list and either resume the pinned
 * thread or, if there is none, call swtch().
 *
 * cpu    - the cpu that handled the interrupt
 * vec    - interrupt vector, passed through to (*setlvlx)()
 * oldpil - priority level in effect before the interrupt
 *
 * Does not return if the interrupt thread has no pinned thread beneath it.
 *
 * Called with interrupts disabled
 *
 * NOTE(review): the comment in cpu_intr_swtch_enter() says the timestamp
 * has been zeroed by the time swtch() is reached, and the intr_get_time()
 * theory comment says intrstat[pil][1] is set equal to [0] when the
 * interrupt completes.  Neither it->t_intr_start = 0 nor an update of
 * intrstat[pil][1] is visible in this function -- confirm whether that is
 * done elsewhere or is missing here.
 */
static void
intr_thread_epilog(struct cpu *cpu, uint_t vec, uint_t oldpil)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	kthread_t *t;
	kthread_t *it = cpu->cpu_thread;	/* curthread */
	uint_t pil, basespl;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	pil = it->t_pil;
	/* Per-level interrupt counter; slot 0 corresponds to level 1. */
	cpu->cpu_stats.sys.intr[pil - 1]++;

	ASSERT(it->t_intr_start != 0);
	intrtime = now - it->t_intr_start;
	mcpu->intrstat[pil][0] += intrtime;
	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;

	ASSERT(cpu->cpu_intr_actv & (1 << pil));
	cpu->cpu_intr_actv &= ~(1 << pil);

	/*
	 * If there is still an interrupted thread underneath this one
	 * then the interrupt was never blocked and the return is
	 * fairly simple.  Otherwise it isn't.
	 */
	if ((t = it->t_intr) == NULL) {
		/*
		 * The interrupted thread is no longer pinned underneath
		 * the interrupt thread.  This means the interrupt must
		 * have blocked, and the interrupted thread has been
		 * unpinned, and has probably been running around the
		 * system for a while.
		 *
		 * Since there is no longer a thread under this one, put
		 * this interrupt thread back on the CPU's free list and
		 * resume the idle thread which will dispatch the next
		 * thread to run.
		 */
#ifdef DEBUG
		intr_thread_cnt++;
#endif
		cpu->cpu_stats.sys.intrblk++;
		/*
		 * Set CPU's base SPL based on active interrupts bitmask
		 */
		set_base_spl();
		basespl = cpu->cpu_base_spl;
		mcpu->mcpu_pri = basespl;
		(*setlvlx)(basespl, vec);
		(void) splhigh();
		sti();
		it->t_state = TS_FREE;
		/*
		 * Return interrupt thread to pool
		 */
		it->t_link = cpu->cpu_intr_thread;
		cpu->cpu_intr_thread = it;
		swtch();
		panic("intr_thread_epilog: swtch returned");
		/*NOTREACHED*/
	}

	/*
	 * Return interrupt thread to the pool
	 */
	it->t_link = cpu->cpu_intr_thread;
	cpu->cpu_intr_thread = it;
	it->t_state = TS_FREE;

	/* Never drop below the cpu's base SPL when restoring priority. */
	basespl = cpu->cpu_base_spl;
	pil = MAX(oldpil, basespl);
	mcpu->mcpu_pri = pil;
	(*setlvlx)(pil, vec);
	/* Restart the pinned thread's clock and make it curthread again. */
	t->t_intr_start = now;
	cpu->cpu_thread = t;
}
    428 
    429 /*
    430  * intr_get_time() is a resource for interrupt handlers to determine how
    431  * much time has been spent handling the current interrupt. Such a function
    432  * is needed because higher level interrupts can arrive during the
    433  * processing of an interrupt.  intr_get_time() only returns time spent in the
    434  * current interrupt handler.
    435  *
    436  * The caller must be calling from an interrupt handler running at a pil
    437  * below or at lock level. Timings are not provided for high-level
    438  * interrupts.
    439  *
    440  * The first time intr_get_time() is called while handling an interrupt,
    441  * it returns the time since the interrupt handler was invoked. Subsequent
    442  * calls will return the time since the prior call to intr_get_time(). Time
    443  * is returned as ticks. Use scalehrtimef() to convert ticks to nsec.
    444  *
    445  * Theory Of Intrstat[][]:
    446  *
    447  * uint64_t intrstat[pil][0..1] is an array indexed by pil level, with two
    448  * uint64_ts per pil.
    449  *
    450  * intrstat[pil][0] is a cumulative count of the number of ticks spent
    451  * handling all interrupts at the specified pil on this CPU. It is
    452  * exported via kstats to the user.
    453  *
    454  * intrstat[pil][1] is always a count of ticks less than or equal to the
    455  * value in [0]. The difference between [1] and [0] is the value returned
    456  * by a call to intr_get_time(). At the start of interrupt processing,
    457  * [0] and [1] will be equal (or nearly so). As the interrupt consumes
    458  * time, [0] will increase, but [1] will remain the same. A call to
    459  * intr_get_time() will return the difference, then update [1] to be the
    460  * same as [0]. Future calls will return the time since the last call.
    461  * Finally, when the interrupt completes, [1] is updated to the same as [0].
    462  *
    463  * Implementation:
    464  *
    465  * intr_get_time() works much like a higher level interrupt arriving. It
    466  * "checkpoints" the timing information by incrementing intrstat[pil][0]
    467  * to include elapsed running time, and by setting t_intr_start to rdtsc.
    468  * It then sets the return value to intrstat[pil][0] - intrstat[pil][1],
    469  * and updates intrstat[pil][1] to be the same as the new value of
    470  * intrstat[pil][0].
    471  *
    472  * In the normal handling of interrupts, after an interrupt handler returns
    473  * and the code in intr_thread() updates intrstat[pil][0], it then sets
    474  * intrstat[pil][1] to the new value of intrstat[pil][0]. When [0] == [1],
    475  * the timings are reset, i.e. intr_get_time() will return [0] - [1] which
    476  * is 0.
    477  *
    478  * Whenever interrupts arrive on a CPU which is handling a lower pil
    479  * interrupt, they update the lower pil's [0] to show time spent in the
    480  * handler that they've interrupted. This results in a growing discrepancy
    481  * between [0] and [1], which is returned the next time intr_get_time() is
    482  * called. Time spent in the higher-pil interrupt will not be returned in
    483  * the next intr_get_time() call from the original interrupt, because
    484  * the higher-pil interrupt's time is accumulated in intrstat[higherpil][].
    485  */
    486 uint64_t
    487 intr_get_time(void)
    488 {
    489 	struct cpu *cpu;
    490 	struct machcpu *mcpu;
    491 	kthread_t *t;
    492 	uint64_t time, delta, ret;
    493 	uint_t pil;
    494 
    495 	cli();
    496 	cpu = CPU;
    497 	mcpu = &cpu->cpu_m;
    498 	t = cpu->cpu_thread;
    499 	pil = t->t_pil;
    500 	ASSERT((cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK) == 0);
    501 	ASSERT(t->t_flag & T_INTR_THREAD);
    502 	ASSERT(pil != 0);
    503 	ASSERT(t->t_intr_start != 0);
    504 
    505 	time = tsc_read();
    506 	delta = time - t->t_intr_start;
    507 	t->t_intr_start = time;
    508 
    509 	time = mcpu->intrstat[pil][0] + delta;
    510 	ret = time - mcpu->intrstat[pil][1];
    511 	mcpu->intrstat[pil][0] = time;
    512 	mcpu->intrstat[pil][1] = time;
    513 	cpu->cpu_intracct[cpu->cpu_mstate] += delta;
    514 
    515 	sti();
    516 	return (ret);
    517 }
    518 
    519 static caddr_t
    520 dosoftint_prolog(
    521 	struct cpu *cpu,
    522 	caddr_t stackptr,
    523 	uint32_t st_pending,
    524 	uint_t oldpil)
    525 {
    526 	kthread_t *t, *volatile it;
    527 	struct machcpu *mcpu = &cpu->cpu_m;
    528 	uint_t pil;
    529 	hrtime_t now;
    530 
    531 top:
    532 	ASSERT(st_pending == mcpu->mcpu_softinfo.st_pending);
    533 
    534 	pil = bsrw_insn((uint16_t)st_pending);
    535 	if (pil <= oldpil || pil <= cpu->cpu_base_spl)
    536 		return (0);
    537 
    538 	/*
    539 	 * XX64	Sigh.
    540 	 *
    541 	 * This is a transliteration of the i386 assembler code for
    542 	 * soft interrupts.  One question is "why does this need
    543 	 * to be atomic?"  One possible race is -other- processors
    544 	 * posting soft interrupts to us in set_pending() i.e. the
    545 	 * CPU might get preempted just after the address computation,
    546 	 * but just before the atomic transaction, so another CPU would
    547 	 * actually set the original CPU's st_pending bit.  However,
    548 	 * it looks like it would be simpler to disable preemption there.
    549 	 * Are there other races for which preemption control doesn't work?
    550 	 *
    551 	 * The i386 assembler version -also- checks to see if the bit
    552 	 * being cleared was actually set; if it wasn't, it rechecks
    553 	 * for more.  This seems a bit strange, as the only code that
    554 	 * ever clears the bit is -this- code running with interrupts
    555 	 * disabled on -this- CPU.  This code would probably be cheaper:
    556 	 *
    557 	 * atomic_and_32((uint32_t *)&mcpu->mcpu_softinfo.st_pending,
    558 	 *   ~(1 << pil));
    559 	 *
    560 	 * and t->t_preempt--/++ around set_pending() even cheaper,
    561 	 * but at this point, correctness is critical, so we slavishly
    562 	 * emulate the i386 port.
    563 	 */
    564 	if (atomic_btr32((uint32_t *)
    565 	    &mcpu->mcpu_softinfo.st_pending, pil) == 0) {
    566 		st_pending = mcpu->mcpu_softinfo.st_pending;
    567 		goto top;
    568 	}
    569 
    570 	mcpu->mcpu_pri = pil;
    571 	(*setspl)(pil);
    572 
    573 	now = tsc_read();
    574 
    575 	/*
    576 	 * Get set to run interrupt thread.
    577 	 * There should always be an interrupt thread since we
    578 	 * allocate one for each level on the CPU.
    579 	 */
    580 	it = cpu->cpu_intr_thread;
    581 	cpu->cpu_intr_thread = it->t_link;
    582 
    583 	/* t_intr_start could be zero due to cpu_intr_swtch_enter. */
    584 	t = cpu->cpu_thread;
    585 	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
    586 		hrtime_t intrtime = now - t->t_intr_start;
    587 		mcpu->intrstat[pil][0] += intrtime;
    588 		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
    589 		t->t_intr_start = 0;
    590 	}
    591 
    592 	/*
    593 	 * Note that the code in kcpc_overflow_intr -relies- on the
    594 	 * ordering of events here - in particular that t->t_lwp of
    595 	 * the interrupt thread is set to the pinned thread *before*
    596 	 * curthread is changed.
    597 	 */
    598 	it->t_lwp = t->t_lwp;
    599 	it->t_state = TS_ONPROC;
    600 
    601 	/*
    602 	 * Push interrupted thread onto list from new thread.
    603 	 * Set the new thread as the current one.
    604 	 * Set interrupted thread's T_SP because if it is the idle thread,
    605 	 * resume() may use that stack between threads.
    606 	 */
    607 
    608 	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);
    609 	t->t_sp = (uintptr_t)stackptr;
    610 
    611 	it->t_intr = t;
    612 	cpu->cpu_thread = it;
    613 
    614 	/*
    615 	 * Set bit for this pil in CPU's interrupt active bitmask.
    616 	 */
    617 	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
    618 	cpu->cpu_intr_actv |= (1 << pil);
    619 
    620 	/*
    621 	 * Initialize thread priority level from intr_pri
    622 	 */
    623 	it->t_pil = (uchar_t)pil;
    624 	it->t_pri = (pri_t)pil + intr_pri;
    625 	it->t_intr_start = now;
    626 
    627 	return (it->t_stk);
    628 }
    629 
/*
 * Finish a soft interrupt: account for the time the interrupt thread ran,
 * clear its pil from cpu_intr_actv, return the interrupt thread to the
 * cpu's free list and either resume the pinned thread or, if there is
 * none, call swtch().
 *
 * cpu    - the cpu that ran the soft interrupt
 * oldpil - priority level in effect before the soft interrupt
 *
 * Does not return if the interrupt thread has no pinned thread beneath it.
 */
static void
dosoftint_epilog(struct cpu *cpu, uint_t oldpil)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	kthread_t *t, *it;
	uint_t pil, basespl;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	it = cpu->cpu_thread;
	pil = it->t_pil;

	/* Per-level interrupt counter; slot 0 corresponds to level 1. */
	cpu->cpu_stats.sys.intr[pil - 1]++;

	ASSERT(cpu->cpu_intr_actv & (1 << pil));
	cpu->cpu_intr_actv &= ~(1 << pil);
	intrtime = now - it->t_intr_start;
	mcpu->intrstat[pil][0] += intrtime;
	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;

	/*
	 * If there is still an interrupted thread underneath this one
	 * then the interrupt was never blocked and the return is
	 * fairly simple.  Otherwise it isn't.
	 */
	if ((t = it->t_intr) == NULL) {
		/*
		 * Put thread back on the interrupt thread list.
		 * This was an interrupt thread, so set CPU's base SPL.
		 */
		set_base_spl();
		it->t_state = TS_FREE;
		it->t_link = cpu->cpu_intr_thread;
		cpu->cpu_intr_thread = it;
		(void) splhigh();
		sti();
		swtch();
		/*NOTREACHED*/
		panic("dosoftint_epilog: swtch returned");
	}
	/* Return the interrupt thread to the pool and resume the pinned one. */
	it->t_link = cpu->cpu_intr_thread;
	cpu->cpu_intr_thread = it;
	it->t_state = TS_FREE;
	cpu->cpu_thread = t;
	/* Only an interrupt thread has its clock restarted. */
	if (t->t_flag & T_INTR_THREAD)
		t->t_intr_start = now;
	/* Never drop below the cpu's base SPL when restoring priority. */
	basespl = cpu->cpu_base_spl;
	pil = MAX(oldpil, basespl);
	mcpu->mcpu_pri = pil;
	(*setspl)(pil);
}
    681 
    682 
    683 /*
    684  * Make the interrupted thread 'to' be runnable.
    685  *
    686  * Since t->t_sp has already been saved, t->t_pc is all
    687  * that needs to be set in this function.
    688  *
    689  * Returns the interrupt level of the interrupt thread.
    690  */
    691 int
    692 intr_passivate(
    693 	kthread_t *it,		/* interrupt thread */
    694 	kthread_t *t)		/* interrupted thread */
    695 {
    696 	extern void _sys_rtt();
    697 
    698 	ASSERT(it->t_flag & T_INTR_THREAD);
    699 	ASSERT(SA(t->t_sp) == t->t_sp);
    700 
    701 	t->t_pc = (uintptr_t)_sys_rtt;
    702 	return (it->t_pil);
    703 }
    704 
    705 /*
    706  * Create interrupt kstats for this CPU.
    707  */
    708 void
    709 cpu_create_intrstat(cpu_t *cp)
    710 {
    711 	int		i;
    712 	kstat_t		*intr_ksp;
    713 	kstat_named_t	*knp;
    714 	char		name[KSTAT_STRLEN];
    715 	zoneid_t	zoneid;
    716 
    717 	ASSERT(MUTEX_HELD(&cpu_lock));
    718 
    719 	if (pool_pset_enabled())
    720 		zoneid = GLOBAL_ZONEID;
    721 	else
    722 		zoneid = ALL_ZONES;
    723 
    724 	intr_ksp = kstat_create_zone("cpu", cp->cpu_id, "intrstat", "misc",
    725 	    KSTAT_TYPE_NAMED, PIL_MAX * 2, NULL, zoneid);
    726 
    727 	/*
    728 	 * Initialize each PIL's named kstat
    729 	 */
    730 	if (intr_ksp != NULL) {
    731 		intr_ksp->ks_update = cpu_kstat_intrstat_update;
    732 		knp = (kstat_named_t *)intr_ksp->ks_data;
    733 		intr_ksp->ks_private = cp;
    734 		for (i = 0; i < PIL_MAX; i++) {
    735 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-time",
    736 			    i + 1);
    737 			kstat_named_init(&knp[i * 2], name, KSTAT_DATA_UINT64);
    738 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-count",
    739 			    i + 1);
    740 			kstat_named_init(&knp[(i * 2) + 1], name,
    741 			    KSTAT_DATA_UINT64);
    742 		}
    743 		kstat_install(intr_ksp);
    744 	}
    745 }
    746 
    747 /*
    748  * Delete interrupt kstats for this CPU.
    749  */
    750 void
    751 cpu_delete_intrstat(cpu_t *cp)
    752 {
    753 	kstat_delete_byname_zone("cpu", cp->cpu_id, "intrstat", ALL_ZONES);
    754 }
    755 
    756 /*
    757  * Convert interrupt statistics from CPU ticks to nanoseconds and
    758  * update kstat.
    759  */
    760 int
    761 cpu_kstat_intrstat_update(kstat_t *ksp, int rw)
    762 {
    763 	kstat_named_t	*knp = ksp->ks_data;
    764 	cpu_t		*cpup = (cpu_t *)ksp->ks_private;
    765 	int		i;
    766 	hrtime_t	hrt;
    767 
    768 	if (rw == KSTAT_WRITE)
    769 		return (EACCES);
    770 
    771 	for (i = 0; i < PIL_MAX; i++) {
    772 		hrt = (hrtime_t)cpup->cpu_m.intrstat[i + 1][0];
    773 		scalehrtimef(&hrt);
    774 		knp[i * 2].value.ui64 = (uint64_t)hrt;
    775 		knp[(i * 2) + 1].value.ui64 = cpup->cpu_stats.sys.intr[i];
    776 	}
    777 
    778 	return (0);
    779 }
    780 
/*
 * An interrupt thread is ending a time slice, so compute the interval it
 * ran for and update the statistic for its PIL.
 *
 * t - the interrupt thread; its t_pil must be in the range 1..LOCK_LEVEL.
 *
 * If the thread's timestamp is zero nothing is accounted.
 */
void
cpu_intr_swtch_enter(kthread_id_t t)
{
	uint64_t	interval;
	uint64_t	start;
	cpu_t		*cpu;

	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);

	/*
	 * We could be here with a zero timestamp. This could happen if:
	 * an interrupt thread which no longer has a pinned thread underneath
	 * it (i.e. it blocked at some point in its past) has finished running
	 * its handler. intr_thread() updated the interrupt statistic for its
	 * PIL and zeroed its timestamp. Since there was no pinned thread to
	 * return to, swtch() gets called and we end up here.
	 *
	 * Note that we use atomic ops below (cas64 and atomic_add_64), which
	 * we don't use in the functions above, because we're not called
	 * with interrupts blocked, but the epilog/prolog functions are.
	 */
	if (t->t_intr_start) {
		/*
		 * Retry until the timestamp is swapped to zero without
		 * having changed since it was sampled, so the interval
		 * always matches the start value that was consumed.
		 */
		do {
			start = t->t_intr_start;
			interval = tsc_read() - start;
		} while (cas64(&t->t_intr_start, start, 0) != start);
		cpu = CPU;
		/*
		 * NOTE(review): this is a plain read-modify-write, unlike
		 * the atomic_add_64() used for cpu_intracct just below --
		 * confirm that is intended.
		 */
		cpu->cpu_m.intrstat[t->t_pil][0] += interval;

		atomic_add_64((uint64_t *)&cpu->cpu_intracct[cpu->cpu_mstate],
		    interval);
	} else
		ASSERT(t->t_intr == NULL);
}
    820 
    821 /*
    822  * An interrupt thread is returning from swtch(). Place a starting timestamp
    823  * in its thread structure.
    824  */
    825 void
    826 cpu_intr_swtch_exit(kthread_id_t t)
    827 {
    828 	uint64_t ts;
    829 
    830 	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
    831 	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);
    832 
    833 	do {
    834 		ts = t->t_intr_start;
    835 	} while (cas64(&t->t_intr_start, ts, tsc_read()) != ts);
    836 }
    837 
/*
 * Dispatch a hilevel interrupt (one above LOCK_LEVEL)
 *
 * vector - interrupt vector handed to av_dispatch_autovect()
 * arg2   - unused
 *
 * Interrupts are enabled with sti() around the handler dispatch and
 * disabled again with cli() before returning.
 */
/*ARGSUSED*/
static void
dispatch_hilevel(uint_t vector, uint_t arg2)
{
	sti();
	av_dispatch_autovect(vector);
	cli();
}
    849 
/*
 * Dispatch a soft interrupt
 *
 * oldpil - priority level in effect before the soft interrupt; handed to
 *          dosoftint_epilog()
 * arg2   - unused
 *
 * The handlers dispatched are those for the current thread's t_pil.
 * Interrupts are enabled with sti() around the handler dispatch and
 * disabled again with cli() before the epilog runs.
 */
/*ARGSUSED*/
static void
dispatch_softint(uint_t oldpil, uint_t arg2)
{
	struct cpu *cpu = CPU;

	sti();
	av_dispatch_softvect((int)cpu->cpu_thread->t_pil);
	cli();

	/*
	 * Must run softint_epilog() on the interrupt thread stack, since
	 * there may not be a return from it if the interrupt thread blocked.
	 */
	dosoftint_epilog(cpu, oldpil);
}
    869 
/*
 * Dispatch a normal interrupt
 *
 * vector - interrupt vector handed to av_dispatch_autovect()
 * oldipl - priority level in effect before the interrupt; handed to
 *          intr_thread_epilog()
 *
 * Interrupts are enabled with sti() around the handler dispatch and
 * disabled again with cli() before the epilog runs.
 */
static void
dispatch_hardint(uint_t vector, uint_t oldipl)
{
	struct cpu *cpu = CPU;

	sti();
	av_dispatch_autovect(vector);
	cli();

	/*
	 * Must run intr_thread_epilog() on the interrupt thread stack, since
	 * there may not be a return from it if the interrupt thread blocked.
	 */
	intr_thread_epilog(cpu, vector, oldipl);
}
    888 
    889 /*
    890  * Deliver any softints the current interrupt priority allows.
    891  * Called with interrupts disabled.
    892  */
    893 void
    894 dosoftint(struct regs *regs)
    895 {
    896 	struct cpu *cpu = CPU;
    897 	int oldipl;
    898 	caddr_t newsp;
    899 
    900 	while (cpu->cpu_softinfo.st_pending) {
    901 		oldipl = cpu->cpu_pri;
    902 		newsp = dosoftint_prolog(cpu, (caddr_t)regs,
    903 		    cpu->cpu_softinfo.st_pending, oldipl);
    904 		/*
    905 		 * If returned stack pointer is NULL, priority is too high
    906 		 * to run any of the pending softints now.
    907 		 * Break out and they will be run later.
    908 		 */
    909 		if (newsp == NULL)
    910 			break;
    911 		switch_sp_and_call(newsp, dispatch_softint, oldipl, 0);
    912 	}
    913 }
    914 
    915 /*
    916  * Interrupt service routine, called with interrupts disabled.
    917  */
    918 /*ARGSUSED*/
    919 void
    920 do_interrupt(struct regs *rp, trap_trace_rec_t *ttp)
    921 {
    922 	struct cpu *cpu = CPU;
    923 	int newipl, oldipl = cpu->cpu_pri;
    924 	uint_t vector;
    925 	caddr_t newsp;
    926 
    927 #ifdef TRAPTRACE
    928 	ttp->ttr_marker = TT_INTERRUPT;
    929 	ttp->ttr_ipl = 0xff;
    930 	ttp->ttr_pri = oldipl;
    931 	ttp->ttr_spl = cpu->cpu_base_spl;
    932 	ttp->ttr_vector = 0xff;
    933 #endif	/* TRAPTRACE */
    934 
    935 	cpu_idle_exit(CPU_IDLE_CB_FLAG_INTR);
    936 
    937 	++*(uint16_t *)&cpu->cpu_m.mcpu_istamp;
    938 
    939 	/*
    940 	 * If it's a softint go do it now.
    941 	 */
    942 	if (rp->r_trapno == T_SOFTINT) {
    943 		dosoftint(rp);
    944 		ASSERT(!interrupts_enabled());
    945 		return;
    946 	}
    947 
    948 	/*
    949 	 * Raise the interrupt priority.
    950 	 */
    951 	newipl = (*setlvl)(oldipl, (int *)&rp->r_trapno);
    952 #ifdef TRAPTRACE
    953 	ttp->ttr_ipl = newipl;
    954 #endif	/* TRAPTRACE */
    955 
    956 	/*
    957 	 * Bail if it is a spurious interrupt
    958 	 */
    959 	if (newipl == -1)
    960 		return;
    961 	cpu->cpu_pri = newipl;
    962 	vector = rp->r_trapno;
    963 #ifdef TRAPTRACE
    964 	ttp->ttr_vector = vector;
    965 #endif	/* TRAPTRACE */
    966 	if (newipl > LOCK_LEVEL) {
    967 		/*
    968 		 * High priority interrupts run on this cpu's interrupt stack.
    969 		 */
    970 		if (hilevel_intr_prolog(cpu, newipl, oldipl, rp) == 0) {
    971 			newsp = cpu->cpu_intr_stack;
    972 			switch_sp_and_call(newsp, dispatch_hilevel, vector, 0);
    973 		} else { /* already on the interrupt stack */
    974 			dispatch_hilevel(vector, 0);
    975 		}
    976 		(void) hilevel_intr_epilog(cpu, newipl, oldipl, vector);
    977 	} else {
    978 		/*
    979 		 * Run this interrupt in a separate thread.
    980 		 */
    981 		newsp = intr_thread_prolog(cpu, (caddr_t)rp, newipl);
    982 		switch_sp_and_call(newsp, dispatch_hardint, vector, oldipl);
    983 	}
    984 
    985 #if !defined(__xpv)
    986 	/*
    987 	 * Deliver any pending soft interrupts.
    988 	 */
    989 	if (cpu->cpu_softinfo.st_pending)
    990 		dosoftint(rp);
    991 #endif	/* !__xpv */
    992 }
    993 
    994 
    995 /*
    996  * Common tasks always done by _sys_rtt, called with interrupts disabled.
    997  * Returns 1 if returning to userland, 0 if returning to system mode.
    998  */
    999 int
   1000 sys_rtt_common(struct regs *rp)
   1001 {
   1002 	kthread_t *tp;
   1003 	extern void mutex_exit_critical_start();
   1004 	extern long mutex_exit_critical_size;
   1005 	extern void mutex_owner_running_critical_start();
   1006 	extern long mutex_owner_running_critical_size;
   1007 
   1008 loop:
   1009 
   1010 	/*
   1011 	 * Check if returning to user
   1012 	 */
   1013 	tp = CPU->cpu_thread;
   1014 	if (USERMODE(rp->r_cs)) {
   1015 		/*
   1016 		 * Check if AST pending.
   1017 		 */
   1018 		if (tp->t_astflag) {
   1019 			/*
   1020 			 * Let trap() handle the AST
   1021 			 */
   1022 			sti();
   1023 			rp->r_trapno = T_AST;
   1024 			trap(rp, (caddr_t)0, CPU->cpu_id);
   1025 			cli();
   1026 			goto loop;
   1027 		}
   1028 
   1029 #if defined(__amd64)
   1030 		/*
   1031 		 * We are done if segment registers do not need updating.
   1032 		 */
   1033 		if (tp->t_lwp->lwp_pcb.pcb_rupdate == 0)
   1034 			return (1);
   1035 
   1036 		if (update_sregs(rp, tp->t_lwp)) {
   1037 			/*
   1038 			 * 1 or more of the selectors is bad.
   1039 			 * Deliver a SIGSEGV.
   1040 			 */
   1041 			proc_t *p = ttoproc(tp);
   1042 
   1043 			sti();
   1044 			mutex_enter(&p->p_lock);
   1045 			tp->t_lwp->lwp_cursig = SIGSEGV;
   1046 			mutex_exit(&p->p_lock);
   1047 			psig();
   1048 			tp->t_sig_check = 1;
   1049 			cli();
   1050 		}
   1051 		tp->t_lwp->lwp_pcb.pcb_rupdate = 0;
   1052 
   1053 #endif	/* __amd64 */
   1054 		return (1);
   1055 	}
   1056 
   1057 	/*
   1058 	 * Here if we are returning to supervisor mode.
   1059 	 * Check for a kernel preemption request.
   1060 	 */
   1061 	if (CPU->cpu_kprunrun && (rp->r_ps & PS_IE)) {
   1062 
   1063 		/*
   1064 		 * Do nothing if already in kpreempt
   1065 		 */
   1066 		if (!tp->t_preempt_lk) {
   1067 			tp->t_preempt_lk = 1;
   1068 			sti();
   1069 			kpreempt(1); /* asynchronous kpreempt call */
   1070 			cli();
   1071 			tp->t_preempt_lk = 0;
   1072 		}
   1073 	}
   1074 
   1075 	/*
   1076 	 * If we interrupted the mutex_exit() critical region we must
   1077 	 * reset the PC back to the beginning to prevent missed wakeups
   1078 	 * See the comments in mutex_exit() for details.
   1079 	 */
   1080 	if ((uintptr_t)rp->r_pc - (uintptr_t)mutex_exit_critical_start <
   1081 	    mutex_exit_critical_size) {
   1082 		rp->r_pc = (greg_t)mutex_exit_critical_start;
   1083 	}
   1084 
   1085 	/*
   1086 	 * If we interrupted the mutex_owner_running() critical region we
   1087 	 * must reset the PC back to the beginning to prevent dereferencing
   1088 	 * of a freed thread pointer. See the comments in mutex_owner_running
   1089 	 * for details.
   1090 	 */
   1091 	if ((uintptr_t)rp->r_pc -
   1092 	    (uintptr_t)mutex_owner_running_critical_start <
   1093 	    mutex_owner_running_critical_size) {
   1094 		rp->r_pc = (greg_t)mutex_owner_running_critical_start;
   1095 	}
   1096 
   1097 	return (0);
   1098 }
   1099 
   1100 void
   1101 send_dirint(int cpuid, int int_level)
   1102 {
   1103 	(*send_dirintf)(cpuid, int_level);
   1104 }
   1105 
   1106 /*
   1107  * do_splx routine, takes new ipl to set
   1108  * returns the old ipl.
   1109  * We are careful not to set priority lower than CPU->cpu_base_pri,
   1110  * even though it seems we're raising the priority, it could be set
   1111  * higher at any time by an interrupt routine, so we must block interrupts
   1112  * and look at CPU->cpu_base_pri
   1113  */
   1114 int
   1115 do_splx(int newpri)
   1116 {
   1117 	ulong_t	flag;
   1118 	cpu_t	*cpu;
   1119 	int	curpri, basepri;
   1120 
   1121 	flag = intr_clear();
   1122 	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
   1123 	curpri = cpu->cpu_m.mcpu_pri;
   1124 	basepri = cpu->cpu_base_spl;
   1125 	if (newpri < basepri)
   1126 		newpri = basepri;
   1127 	cpu->cpu_m.mcpu_pri = newpri;
   1128 	(*setspl)(newpri);
   1129 	/*
   1130 	 * If we are going to reenable interrupts see if new priority level
   1131 	 * allows pending softint delivery.
   1132 	 */
   1133 	if ((flag & PS_IE) &&
   1134 	    bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri)
   1135 		fakesoftint();
   1136 	ASSERT(!interrupts_enabled());
   1137 	intr_restore(flag);
   1138 	return (curpri);
   1139 }
   1140 
   1141 /*
   1142  * Common spl raise routine, takes new ipl to set
   1143  * returns the old ipl, will not lower ipl.
   1144  */
   1145 int
   1146 splr(int newpri)
   1147 {
   1148 	ulong_t	flag;
   1149 	cpu_t	*cpu;
   1150 	int	curpri, basepri;
   1151 
   1152 	flag = intr_clear();
   1153 	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
   1154 	curpri = cpu->cpu_m.mcpu_pri;
   1155 	/*
   1156 	 * Only do something if new priority is larger
   1157 	 */
   1158 	if (newpri > curpri) {
   1159 		basepri = cpu->cpu_base_spl;
   1160 		if (newpri < basepri)
   1161 			newpri = basepri;
   1162 		cpu->cpu_m.mcpu_pri = newpri;
   1163 		(*setspl)(newpri);
   1164 		/*
   1165 		 * See if new priority level allows pending softint delivery
   1166 		 */
   1167 		if ((flag & PS_IE) &&
   1168 		    bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri)
   1169 			fakesoftint();
   1170 	}
   1171 	intr_restore(flag);
   1172 	return (curpri);
   1173 }
   1174 
   1175 int
   1176 getpil(void)
   1177 {
   1178 	return (CPU->cpu_m.mcpu_pri);
   1179 }
   1180 
   1181 int
   1182 spl_xcall(void)
   1183 {
   1184 	return (splr(ipltospl(XCALL_PIL)));
   1185 }
   1186 
   1187 int
   1188 interrupts_enabled(void)
   1189 {
   1190 	ulong_t	flag;
   1191 
   1192 	flag = getflags();
   1193 	return ((flag & PS_IE) == PS_IE);
   1194 }
   1195 
   1196 #ifdef DEBUG
   1197 void
   1198 assert_ints_enabled(void)
   1199 {
   1200 	ASSERT(!interrupts_unleashed || interrupts_enabled());
   1201 }
   1202 #endif	/* DEBUG */
   1203