Home | History | Annotate | Download | only in cpupm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /*
     26  * Copyright (c) 2009, Intel Corporation.
     27  * All rights reserved.
     28  */
     29 
     30 #include <sys/x86_archext.h>
     31 #include <sys/machsystm.h>
     32 #include <sys/x_call.h>
     33 #include <sys/stat.h>
     34 #include <sys/acpi/acpi.h>
     35 #include <sys/acpica.h>
     36 #include <sys/cpu_acpi.h>
     37 #include <sys/cpu_idle.h>
     38 #include <sys/cpupm.h>
     39 #include <sys/cpu_event.h>
     40 #include <sys/hpet.h>
     41 #include <sys/archsystm.h>
     42 #include <vm/hat_i86.h>
     43 #include <sys/dtrace.h>
     44 #include <sys/sdt.h>
     45 #include <sys/callb.h>
     46 
/*
 * Selector values for the `timer' argument of cstate_use_timer():
 * CSTATE_USING_HPET makes it call hpet.use_hpet_timer() and
 * CSTATE_USING_LAT makes it call hpet.use_lapic_timer().
 */
#define	CSTATE_USING_HPET		1
#define	CSTATE_USING_LAT		2

/* Routines that are defined outside of this file. */
extern void cpu_idle_adaptive(void);
extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
    cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);

/* Forward declarations of the file-local routines. */
static int cpu_idle_init(cpu_t *);
static void cpu_idle_fini(cpu_t *);
static void cpu_idle_stop(cpu_t *);
static boolean_t cpu_deep_idle_callb(void *arg, int code);
static boolean_t cpu_idle_cpr_callb(void *arg, int code);
static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);

static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);

/*
 * cpu_cstate_arat: B_TRUE once cpuid_arat_supported() has reported an
 * always-running local APIC timer.
 * cpu_cstate_hpet: B_TRUE once the HPET proxy has been installed for use
 * as the timer during deep C-states.
 * Both flags are only ever set by cpu_deep_cstates_supported().
 */
static boolean_t cpu_cstate_arat = B_FALSE;
static boolean_t cpu_cstate_hpet = B_FALSE;
     69 
/*
 * Interfaces for modules implementing Intel's deep c-state.
 *
 * The table carries a descriptive name followed by the init, fini and stop
 * entry points defined in this file.  The NULL entry between fini and stop
 * is a slot this file does not implement -- NOTE(review): confirm which
 * hook that slot represents against the cpupm_state_ops_t definition.
 */
cpupm_state_ops_t cpu_idle_ops = {
	"Generic ACPI C-state Support",
	cpu_idle_init,
	cpu_idle_fini,
	NULL,
	cpu_idle_stop
};
     80 
/*
 * cpu_idle_callb_mutex is held while the two callback ids below are
 * registered or deleted and while the callbacks themselves run and update
 * cpu_idle_cfg_state.
 */
static kmutex_t		cpu_idle_callb_mutex;
static callb_id_t	cpu_deep_idle_callb_id;
static callb_id_t	cpu_idle_cpr_callb_id;
/* CPU_IDLE_DEEP_CFG is set here while deep idle is administratively off. */
static uint_t		cpu_idle_cfg_state;

/* Installed as ks_lock on every c-state kstat created by cpu_idle_init(). */
static kmutex_t cpu_idle_mutex;

/*
 * Named kstat data shared by all c-state kstats; cpu_idle_kstat_update()
 * refills it from the cpu_acpi_cstate_t attached to the kstat being read.
 */
cpu_idle_kstat_t cpu_idle_kstat = {
	{ "address_space_id",	KSTAT_DATA_STRING },
	{ "latency",		KSTAT_DATA_UINT32 },
	{ "power",		KSTAT_DATA_UINT32 },
};
     93 
     94 /*
     95  * kstat update function of the c-state info
     96  */
     97 static int
     98 cpu_idle_kstat_update(kstat_t *ksp, int flag)
     99 {
    100 	cpu_acpi_cstate_t *cstate = ksp->ks_private;
    101 
    102 	if (flag == KSTAT_WRITE) {
    103 		return (EACCES);
    104 	}
    105 
    106 	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
    107 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
    108 		"FFixedHW");
    109 	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
    110 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
    111 		"SystemIO");
    112 	} else {
    113 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
    114 		"Unsupported");
    115 	}
    116 
    117 	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
    118 	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
    119 
    120 	return (0);
    121 }
    122 
    123 /*
    124  * Used during configuration callbacks to manage implementation specific
    125  * details of the hardware timer used during Deep C-state.
    126  */
    127 boolean_t
    128 cstate_timer_callback(int code)
    129 {
    130 	if (cpu_cstate_arat) {
    131 		return (B_TRUE);
    132 	} else if (cpu_cstate_hpet) {
    133 		return (hpet.callback(code));
    134 	}
    135 	return (B_FALSE);
    136 }
    137 
    138 /*
    139  * Some Local APIC Timers do not work during Deep C-states.
    140  * The Deep C-state idle function uses this function to ensure it is using a
    141  * hardware timer that works during Deep C-states.  This function also
    142  * switches the timer back to the LACPI Timer after Deep C-state.
    143  */
    144 static boolean_t
    145 cstate_use_timer(hrtime_t *lapic_expire, int timer)
    146 {
    147 	if (cpu_cstate_arat)
    148 		return (B_TRUE);
    149 
    150 	/*
    151 	 * We have to return B_FALSE if no arat or hpet support
    152 	 */
    153 	if (!cpu_cstate_hpet)
    154 		return (B_FALSE);
    155 
    156 	switch (timer) {
    157 	case CSTATE_USING_HPET:
    158 		return (hpet.use_hpet_timer(lapic_expire));
    159 	case CSTATE_USING_LAT:
    160 		hpet.use_lapic_timer(*lapic_expire);
    161 		return (B_TRUE);
    162 	default:
    163 		return (B_FALSE);
    164 	}
    165 }
    166 
    167 /*
    168  * c-state wakeup function.
    169  * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
    170  * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
    171  */
    172 void
    173 cstate_wakeup(cpu_t *cp, int bound)
    174 {
    175 	struct machcpu	*mcpu = &(cp->cpu_m);
    176 	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
    177 	cpupart_t	*cpu_part;
    178 	uint_t		cpu_found;
    179 	processorid_t	cpu_sid;
    180 
    181 	cpu_part = cp->cpu_part;
    182 	cpu_sid = cp->cpu_seqid;
    183 	/*
    184 	 * Clear the halted bit for that CPU since it will be woken up
    185 	 * in a moment.
    186 	 */
    187 	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
    188 		/*
    189 		 * Clear the halted bit for that CPU since it will be
    190 		 * poked in a moment.
    191 		 */
    192 		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
    193 
    194 		/*
    195 		 * We may find the current CPU present in the halted cpuset
    196 		 * if we're in the context of an interrupt that occurred
    197 		 * before we had a chance to clear our bit in cpu_idle().
    198 		 * Waking ourself is obviously unnecessary, since if
    199 		 * we're here, we're not halted.
    200 		 */
    201 		if (cp != CPU) {
    202 			/*
    203 			 * Use correct wakeup mechanism
    204 			 */
    205 			if ((mcpu_mwait != NULL) &&
    206 			    (*mcpu_mwait == MWAIT_HALTED))
    207 				MWAIT_WAKEUP(cp);
    208 			else
    209 				poke_cpu(cp->cpu_id);
    210 		}
    211 		return;
    212 	} else {
    213 		/*
    214 		 * This cpu isn't halted, but it's idle or undergoing a
    215 		 * context switch. No need to awaken anyone else.
    216 		 */
    217 		if (cp->cpu_thread == cp->cpu_idle_thread ||
    218 		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
    219 			return;
    220 	}
    221 
    222 	/*
    223 	 * No need to wake up other CPUs if the thread we just enqueued
    224 	 * is bound.
    225 	 */
    226 	if (bound)
    227 		return;
    228 
    229 
    230 	/*
    231 	 * See if there's any other halted CPUs. If there are, then
    232 	 * select one, and awaken it.
    233 	 * It's possible that after we find a CPU, somebody else
    234 	 * will awaken it before we get the chance.
    235 	 * In that case, look again.
    236 	 */
    237 	do {
    238 		cpu_found = bitset_find(&cpu_part->cp_haltset);
    239 		if (cpu_found == (uint_t)-1)
    240 			return;
    241 
    242 	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
    243 	    cpu_found) < 0);
    244 
    245 	/*
    246 	 * Must use correct wakeup mechanism to avoid lost wakeup of
    247 	 * alternate cpu.
    248 	 */
    249 	if (cpu_found != CPU->cpu_seqid) {
    250 		mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
    251 		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
    252 			MWAIT_WAKEUP(cpu_seq[cpu_found]);
    253 		else
    254 			poke_cpu(cpu_seq[cpu_found]->cpu_id);
    255 	}
    256 }
    257 
    258 /*
    259  * Function called by CPU idle notification framework to check whether CPU
    260  * has been awakened. It will be called with interrupt disabled.
    261  * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle
    262  * notification framework.
    263  */
    264 static void
    265 acpi_cpu_mwait_check_wakeup(void *arg)
    266 {
    267 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
    268 
    269 	ASSERT(arg != NULL);
    270 	if (*mcpu_mwait != MWAIT_HALTED) {
    271 		/*
    272 		 * CPU has been awakened, notify CPU idle notification system.
    273 		 */
    274 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
    275 	} else {
    276 		/*
    277 		 * Toggle interrupt flag to detect pending interrupts.
    278 		 * If interrupt happened, do_interrupt() will notify CPU idle
    279 		 * notification framework so no need to call cpu_idle_exit()
    280 		 * here.
    281 		 */
    282 		sti();
    283 		SMT_PAUSE();
    284 		cli();
    285 	}
    286 }
    287 
    288 static void
    289 acpi_cpu_mwait_ipi_check_wakeup(void *arg)
    290 {
    291 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
    292 
    293 	ASSERT(arg != NULL);
    294 	if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
    295 		/*
    296 		 * CPU has been awakened, notify CPU idle notification system.
    297 		 */
    298 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
    299 	} else {
    300 		/*
    301 		 * Toggle interrupt flag to detect pending interrupts.
    302 		 * If interrupt happened, do_interrupt() will notify CPU idle
    303 		 * notification framework so no need to call cpu_idle_exit()
    304 		 * here.
    305 		 */
    306 		sti();
    307 		SMT_PAUSE();
    308 		cli();
    309 	}
    310 }
    311 
    312 /*ARGSUSED*/
    313 static void
    314 acpi_cpu_check_wakeup(void *arg)
    315 {
    316 	/*
    317 	 * Toggle interrupt flag to detect pending interrupts.
    318 	 * If interrupt happened, do_interrupt() will notify CPU idle
    319 	 * notification framework so no need to call cpu_idle_exit() here.
    320 	 */
    321 	sti();
    322 	SMT_PAUSE();
    323 	cli();
    324 }
    325 
/*
 * enter deep c-state handler
 *
 * Puts the current CPU into the ACPI C-state described by `cstate'.
 * The entry method depends on cstate->cs_addrspace_id: MONITOR/MWAIT for
 * ACPI_ADR_SPACE_FIXED_HARDWARE, port reads for ACPI_ADR_SPACE_SYSTEM_IO.
 * If no timer that survives deep C-states can be programmed, the CPU
 * falls back to an MWAIT-based C1 instead.
 *
 * The function returns early, without idling, when work is already
 * available or when another CPU has already cleared our haltset bit.
 * Every path that disables interrupts re-enables them before returning.
 */
static void
acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
{
	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
	cpu_t			*cpup = CPU;
	processorid_t		cpu_sid = cpup->cpu_seqid;
	cpupart_t		*cp = cpup->cpu_part;
	hrtime_t		lapic_expire;
	uint8_t			type = cstate->cs_addrspace_id;
	uint32_t		cs_type = cstate->cs_type;
	int			hset_update = 1;
	boolean_t		using_timer;
	cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;

	/*
	 * Set our mcpu_mwait here, so we can tell if anyone tries to
	 * wake us between now and when we call mwait.  No other cpu will
	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
	 *
	 * The value written also selects the matching wakeup check routine
	 * that is later handed to cpu_idle_enter().
	 */
	if (mcpu_mwait) {
		if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
			*mcpu_mwait = MWAIT_WAKEUP_IPI;
			check_func = &acpi_cpu_mwait_ipi_check_wakeup;
		} else {
			*mcpu_mwait = MWAIT_HALTED;
			check_func = &acpi_cpu_mwait_check_wakeup;
		}
	}

	/*
	 * If this CPU is online, and there are multiple CPUs
	 * in the system, then we should note our halting
	 * by adding ourselves to the partition's halted CPU
	 * bitmap. This allows other CPUs to find/awaken us when
	 * work becomes available.
	 */
	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
		hset_update = 0;

	/*
	 * Add ourselves to the partition's halted CPUs bitmask
	 * and set our HALTED flag, if necessary.
	 *
	 * When a thread becomes runnable, it is placed on the queue
	 * and then the halted cpuset is checked to determine who
	 * (if anyone) should be awakened. We therefore need to first
	 * add ourselves to the halted cpuset, and then check if there
	 * is any work available.
	 *
	 * Note that memory barriers after updating the HALTED flag
	 * are not necessary since an atomic operation (updating the bitmap)
	 * immediately follows. On x86 the atomic operation acts as a
	 * memory barrier for the update of cpu_disp_flags.
	 */
	if (hset_update) {
		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
	}

	/*
	 * Check to make sure there's really nothing to do.
	 * Work destined for this CPU may become available after
	 * this check. We'll be notified through the clearing of our
	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
	 *
	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
	 */
	if (disp_anywork()) {
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	/*
	 * We're on our way to being halted.
	 *
	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
	 * Try to program the HPET hardware to substitute for this CPU's
	 * LAPIC timer.
	 * cstate_use_timer() could disable the LAPIC Timer.  Make sure
	 * to start the LAPIC Timer again before leaving this function.
	 *
	 * Disable interrupts here so we will awaken immediately after halting
	 * if someone tries to poke us between now and the time we actually
	 * halt.
	 */
	cli();
	using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);

	/*
	 * We check for the presence of our bit after disabling interrupts.
	 * If it's cleared, we'll return. If the bit is cleared after
	 * we check then the cstate_wakeup() will pop us out of the halted
	 * state.
	 *
	 * This means that the ordering of the cstate_wakeup() and the clearing
	 * of the bit by cpu_wakeup is important.
	 * cpu_wakeup() must clear our cp_haltset bit, and then call
	 * cstate_wakeup().
	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
	 */
	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		return;
	}

	/*
	 * The check for anything locally runnable is here for performance
	 * and isn't needed for correctness. disp_nrunnable ought to be
	 * in our cache still, so it's inexpensive to check, and if there
	 * is anything runnable we won't have to wait for the poke.
	 */
	if (cpup->cpu_disp->disp_nrunnable != 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (using_timer == B_FALSE) {

		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();

		/*
		 * We are currently unable to program the HPET to act as this
		 * CPU's proxy LAPIC timer.  This CPU cannot enter C2 or deeper
		 * because no timer is set to wake it up while its LAPIC timer
		 * stalls in deep C-States.
		 * Enter C1 instead.
		 *
		 * cstate_wakeup() will wake this CPU with an IPI which
		 * works with MWAIT.
		 *
		 * Masking out MWAIT_WAKEUP_IPI lets the same test accept
		 * either value written to mcpu_mwait at the top of this
		 * function.
		 * NOTE(review): mcpu_mwait is dereferenced here without the
		 * NULL check used above -- presumably deep C-states are only
		 * enabled on CPUs that have an mwait word; confirm.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
			if (cpu_idle_enter(IDLE_STATE_C1, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) ==
				    MWAIT_HALTED) {
					i86_mwait(0, 0);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}

		/*
		 * We're no longer halted
		 */
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
		/*
		 * We're on our way to being halted.
		 * To avoid a lost wakeup, arm the monitor before checking
		 * if another cpu wrote to mcpu_mwait to wake us up.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if (*mcpu_mwait == MWAIT_HALTED) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				/*
				 * Re-check after cpu_idle_enter(): a wakeup
				 * may have been written in the meantime.
				 */
				if (*mcpu_mwait == MWAIT_HALTED) {
					i86_mwait(cstate->cs_address, 1);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
		uint32_t value;
		ACPI_TABLE_FADT *gbl_FADT;

		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
					/*
					 * Read the c-state's I/O port, then
					 * the FADT PM timer block; both read
					 * values are discarded.
					 * NOTE(review): presumably the first
					 * read triggers the C-state entry and
					 * the second is the customary dummy
					 * read -- confirm against the ACPI
					 * specification.
					 */
					(void) cpu_acpi_read_port(
					    cstate->cs_address, &value, 8);
					acpica_get_global_FADT(&gbl_FADT);
					(void) cpu_acpi_read_port(
					    gbl_FADT->XPmTimerBlock.Address,
					    &value, 32);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	}

	/*
	 * The LAPIC timer may have stopped in deep c-state.
	 * Reprogram this CPU's LAPIC here before enabling interrupts.
	 */
	(void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
	sti();

	/*
	 * We're no longer halted
	 */
	if (hset_update) {
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
	}
}
    546 
    547 /*
    548  * Idle the present CPU, deep c-state is supported
    549  */
    550 void
    551 cpu_acpi_idle(void)
    552 {
    553 	cpu_t *cp = CPU;
    554 	cpu_acpi_handle_t handle;
    555 	cma_c_state_t *cs_data;
    556 	cpu_acpi_cstate_t *cstates;
    557 	hrtime_t start, end;
    558 	int cpu_max_cstates;
    559 	uint32_t cs_indx;
    560 	uint16_t cs_type;
    561 
    562 	cpupm_mach_state_t *mach_state =
    563 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
    564 	handle = mach_state->ms_acpi_handle;
    565 	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
    566 
    567 	cs_data = mach_state->ms_cstate.cma_state.cstate;
    568 	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
    569 	ASSERT(cstates != NULL);
    570 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
    571 	if (cpu_max_cstates > CPU_MAX_CSTATES)
    572 		cpu_max_cstates = CPU_MAX_CSTATES;
    573 	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
    574 		(*non_deep_idle_cpu)();
    575 		return;
    576 	}
    577 
    578 	start = gethrtime_unscaled();
    579 
    580 	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);
    581 
    582 	cs_type = cstates[cs_indx].cs_type;
    583 
    584 	switch (cs_type) {
    585 	default:
    586 		/* FALLTHROUGH */
    587 	case CPU_ACPI_C1:
    588 		(*non_deep_idle_cpu)();
    589 		break;
    590 
    591 	case CPU_ACPI_C2:
    592 		acpi_cpu_cstate(&cstates[cs_indx]);
    593 		break;
    594 
    595 	case CPU_ACPI_C3:
    596 		/*
    597 		 * All supported Intel processors maintain cache coherency
    598 		 * during C3.  Currently when entering C3 processors flush
    599 		 * core caches to higher level shared cache. The shared cache
    600 		 * maintains state and supports probes during C3.
    601 		 * Consequently there is no need to handle cache coherency
    602 		 * and Bus Master activity here with the cache flush, BM_RLD
    603 		 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
    604 		 * in section 8.1.4 of the ACPI Specification 4.0.
    605 		 */
    606 		acpi_cpu_cstate(&cstates[cs_indx]);
    607 		break;
    608 	}
    609 
    610 	end = gethrtime_unscaled();
    611 
    612 	/*
    613 	 * Update statistics
    614 	 */
    615 	cpupm_wakeup_cstate_data(cs_data, end);
    616 }
    617 
    618 boolean_t
    619 cpu_deep_cstates_supported(void)
    620 {
    621 	extern int	idle_cpu_no_deep_c;
    622 
    623 	if (idle_cpu_no_deep_c)
    624 		return (B_FALSE);
    625 
    626 	if (!cpuid_deep_cstates_supported())
    627 		return (B_FALSE);
    628 
    629 	if (cpuid_arat_supported()) {
    630 		cpu_cstate_arat = B_TRUE;
    631 		return (B_TRUE);
    632 	}
    633 
    634 	if ((hpet.supported == HPET_FULL_SUPPORT) &&
    635 	    hpet.install_proxy()) {
    636 		cpu_cstate_hpet = B_TRUE;
    637 		return (B_TRUE);
    638 	}
    639 
    640 	return (B_FALSE);
    641 }
    642 
    643 /*
    644  * Validate that this processor supports deep cstate and if so,
    645  * get the c-state data from ACPI and cache it.
    646  */
    647 static int
    648 cpu_idle_init(cpu_t *cp)
    649 {
    650 	cpupm_mach_state_t *mach_state =
    651 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
    652 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
    653 	cpu_acpi_cstate_t *cstate;
    654 	char name[KSTAT_STRLEN];
    655 	int cpu_max_cstates, i;
    656 	int ret;
    657 
    658 	/*
    659 	 * Cache the C-state specific ACPI data.
    660 	 */
    661 	if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
    662 		if (ret < 0)
    663 			cmn_err(CE_NOTE,
    664 			    "!Support for CPU deep idle states is being "
    665 			    "disabled due to errors parsing ACPI C-state "
    666 			    "objects exported by BIOS.");
    667 		cpu_idle_fini(cp);
    668 		return (-1);
    669 	}
    670 
    671 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
    672 
    673 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
    674 
    675 	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
    676 		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
    677 		/*
    678 		 * Allocate, initialize and install cstate kstat
    679 		 */
    680 		cstate->cs_ksp = kstat_create("cstate", CPU->cpu_id,
    681 		    name, "misc",
    682 		    KSTAT_TYPE_NAMED,
    683 		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
    684 		    KSTAT_FLAG_VIRTUAL);
    685 
    686 		if (cstate->cs_ksp == NULL) {
    687 			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
    688 		} else {
    689 			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
    690 			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
    691 			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
    692 			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
    693 			cstate->cs_ksp->ks_private = cstate;
    694 			kstat_install(cstate->cs_ksp);
    695 			cstate++;
    696 		}
    697 	}
    698 
    699 	cpupm_alloc_domains(cp, CPUPM_C_STATES);
    700 	cpupm_alloc_ms_cstate(cp);
    701 
    702 	if (cpu_deep_cstates_supported()) {
    703 		uint32_t value;
    704 
    705 		mutex_enter(&cpu_idle_callb_mutex);
    706 		if (cpu_deep_idle_callb_id == (callb_id_t)0)
    707 			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
    708 			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
    709 		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
    710 			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
    711 			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
    712 		mutex_exit(&cpu_idle_callb_mutex);
    713 
    714 
    715 		/*
    716 		 * All supported CPUs (Nehalem and later) will remain in C3
    717 		 * during Bus Master activity.
    718 		 * All CPUs set ACPI_BITREG_BUS_MASTER_RLD to 0 here if it
    719 		 * is not already 0 before enabling Deeper C-states.
    720 		 */
    721 		cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
    722 		if (value & 1)
    723 			cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
    724 	}
    725 
    726 	return (0);
    727 }
    728 
    729 /*
    730  * Free resources allocated by cpu_idle_init().
    731  */
    732 static void
    733 cpu_idle_fini(cpu_t *cp)
    734 {
    735 	cpupm_mach_state_t *mach_state =
    736 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
    737 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
    738 	cpu_acpi_cstate_t *cstate;
    739 	uint_t	cpu_max_cstates, i;
    740 
    741 	/*
    742 	 * idle cpu points back to the generic one
    743 	 */
    744 	idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
    745 	disp_enq_thread = non_deep_idle_disp_enq_thread;
    746 
    747 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
    748 	if (cstate) {
    749 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
    750 
    751 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
    752 			if (cstate->cs_ksp != NULL)
    753 				kstat_delete(cstate->cs_ksp);
    754 			cstate++;
    755 		}
    756 	}
    757 
    758 	cpupm_free_ms_cstate(cp);
    759 	cpupm_free_domains(&cpupm_cstate_domains);
    760 	cpu_acpi_free_cstate_data(handle);
    761 
    762 	mutex_enter(&cpu_idle_callb_mutex);
    763 	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
    764 		(void) callb_delete(cpu_deep_idle_callb_id);
    765 		cpu_deep_idle_callb_id = (callb_id_t)0;
    766 	}
    767 	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
    768 		(void) callb_delete(cpu_idle_cpr_callb_id);
    769 		cpu_idle_cpr_callb_id = (callb_id_t)0;
    770 	}
    771 	mutex_exit(&cpu_idle_callb_mutex);
    772 }
    773 
    774 static void
    775 cpu_idle_stop(cpu_t *cp)
    776 {
    777 	cpupm_mach_state_t *mach_state =
    778 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
    779 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
    780 	cpu_acpi_cstate_t *cstate;
    781 	uint_t cpu_max_cstates, i;
    782 
    783 	/*
    784 	 * place the CPUs in a safe place so that we can disable
    785 	 * deep c-state on them.
    786 	 */
    787 	pause_cpus(NULL);
    788 	cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
    789 	start_cpus();
    790 
    791 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
    792 	if (cstate) {
    793 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
    794 
    795 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
    796 			if (cstate->cs_ksp != NULL)
    797 				kstat_delete(cstate->cs_ksp);
    798 			cstate++;
    799 		}
    800 	}
    801 	cpupm_free_ms_cstate(cp);
    802 	cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
    803 	cpu_acpi_free_cstate_data(handle);
    804 }
    805 
    806 /*ARGSUSED*/
    807 static boolean_t
    808 cpu_deep_idle_callb(void *arg, int code)
    809 {
    810 	boolean_t rslt = B_TRUE;
    811 
    812 	mutex_enter(&cpu_idle_callb_mutex);
    813 	switch (code) {
    814 	case PM_DEFAULT_CPU_DEEP_IDLE:
    815 		/*
    816 		 * Default policy is same as enable
    817 		 */
    818 		/*FALLTHROUGH*/
    819 	case PM_ENABLE_CPU_DEEP_IDLE:
    820 		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
    821 			break;
    822 
    823 		if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
    824 			disp_enq_thread = cstate_wakeup;
    825 			idle_cpu = cpu_idle_adaptive;
    826 			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
    827 		} else {
    828 			rslt = B_FALSE;
    829 		}
    830 		break;
    831 
    832 	case PM_DISABLE_CPU_DEEP_IDLE:
    833 		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
    834 			break;
    835 
    836 		idle_cpu = non_deep_idle_cpu;
    837 		if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
    838 			disp_enq_thread = non_deep_idle_disp_enq_thread;
    839 			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
    840 		}
    841 		break;
    842 
    843 	default:
    844 		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
    845 		    code);
    846 		break;
    847 	}
    848 	mutex_exit(&cpu_idle_callb_mutex);
    849 	return (rslt);
    850 }
    851 
    852 /*ARGSUSED*/
    853 static boolean_t
    854 cpu_idle_cpr_callb(void *arg, int code)
    855 {
    856 	boolean_t rslt = B_TRUE;
    857 
    858 	mutex_enter(&cpu_idle_callb_mutex);
    859 	switch (code) {
    860 	case CB_CODE_CPR_RESUME:
    861 		if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
    862 			/*
    863 			 * Do not enable dispatcher hooks if disabled by user.
    864 			 */
    865 			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
    866 				break;
    867 
    868 			disp_enq_thread = cstate_wakeup;
    869 			idle_cpu = cpu_idle_adaptive;
    870 		} else {
    871 			rslt = B_FALSE;
    872 		}
    873 		break;
    874 
    875 	case CB_CODE_CPR_CHKPT:
    876 		idle_cpu = non_deep_idle_cpu;
    877 		disp_enq_thread = non_deep_idle_disp_enq_thread;
    878 		(void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
    879 		break;
    880 
    881 	default:
    882 		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
    883 		break;
    884 	}
    885 	mutex_exit(&cpu_idle_callb_mutex);
    886 	return (rslt);
    887 }
    888 
    889 /*
    890  * handle _CST notification
    891  */
    892 void
    893 cpuidle_cstate_instance(cpu_t *cp)
    894 {
    895 #ifndef	__xpv
    896 	cpupm_mach_state_t	*mach_state =
    897 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
    898 	cpu_acpi_handle_t	handle;
    899 	struct machcpu		*mcpu;
    900 	cpuset_t 		dom_cpu_set;
    901 	kmutex_t		*pm_lock;
    902 	int			result = 0;
    903 	processorid_t		cpu_id;
    904 
    905 	if (mach_state == NULL) {
    906 		return;
    907 	}
    908 
    909 	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
    910 	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
    911 	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;
    912 
    913 	/*
    914 	 * Do for all the CPU's in the domain
    915 	 */
    916 	mutex_enter(pm_lock);
    917 	do {
    918 		CPUSET_FIND(dom_cpu_set, cpu_id);
    919 		if (cpu_id == CPUSET_NOTINSET)
    920 			break;
    921 
    922 		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
    923 		cp = cpu[cpu_id];
    924 		mach_state = (cpupm_mach_state_t *)
    925 		    cp->cpu_m.mcpu_pm_mach_state;
    926 		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
    927 			mutex_exit(pm_lock);
    928 			return;
    929 		}
    930 		handle = mach_state->ms_acpi_handle;
    931 		ASSERT(handle != NULL);
    932 
    933 		/*
    934 		 * re-evaluate cstate object
    935 		 */
    936 		if (cpu_acpi_cache_cstate_data(handle) != 0) {
    937 			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
    938 			    " object Instance: %d", cpu_id);
    939 		}
    940 		mutex_enter(&cpu_lock);
    941 		mcpu = &(cp->cpu_m);
    942 		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
    943 		if (mcpu->max_cstates > CPU_ACPI_C1) {
    944 			(void) cstate_timer_callback(
    945 			    CST_EVENT_MULTIPLE_CSTATES);
    946 			disp_enq_thread = cstate_wakeup;
    947 			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
    948 		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
    949 			disp_enq_thread = non_deep_idle_disp_enq_thread;
    950 			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
    951 			(void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
    952 		}
    953 		mutex_exit(&cpu_lock);
    954 
    955 		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
    956 	} while (result < 0);
    957 	mutex_exit(pm_lock);
    958 #endif
    959 }
    960 
    961 /*
    962  * handle the number or the type of available processor power states change
    963  */
    964 void
    965 cpuidle_manage_cstates(void *ctx)
    966 {
    967 	cpu_t			*cp = ctx;
    968 	cpupm_mach_state_t	*mach_state =
    969 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
    970 	boolean_t		is_ready;
    971 
    972 	if (mach_state == NULL) {
    973 		return;
    974 	}
    975 
    976 	/*
    977 	 * We currently refuse to power manage if the CPU is not ready to
    978 	 * take cross calls (cross calls fail silently if CPU is not ready
    979 	 * for it).
    980 	 *
    981 	 * Additionally, for x86 platforms we cannot power manage an instance,
    982 	 * until it has been initialized.
    983 	 */
    984 	is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
    985 	if (!is_ready)
    986 		return;
    987 
    988 	cpuidle_cstate_instance(cp);
    989 }
    990