Home | History | Annotate | Download | only in cpupm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /*
     26  * Copyright (c) 2009, Intel Corporation.
     27  * All rights reserved.
     28  */
     29 
     30 #include <sys/cpu_pm.h>
     31 #include <sys/x86_archext.h>
     32 #include <sys/sdt.h>
     33 #include <sys/spl.h>
     34 #include <sys/machsystm.h>
     35 #include <sys/archsystm.h>
     36 #include <sys/hpet.h>
     37 #include <sys/acpi/acpi.h>
     38 #include <sys/acpica.h>
     39 #include <sys/cpupm.h>
     40 #include <sys/cpu_idle.h>
     41 #include <sys/cpu_acpi.h>
     42 #include <sys/cpupm_throttle.h>
     43 #include <sys/dtrace.h>
     44 #include <sys/note.h>
     45 
/*
 * This callback is used to build the PPM CPU domains once
 * a CPU device has been started. The callback is initialized
 * by the PPM driver to point to a routine that will build the
 * domains.
 */
void (*cpupm_ppm_alloc_pstate_domains)(cpu_t *);

/*
 * This callback is used to remove a CPU from the PPM CPU domains
 * when the CPU driver is detached. The callback is initialized
 * by the PPM driver to point to a routine that will remove the CPU
 * from the domains.
 */
void (*cpupm_ppm_free_pstate_domains)(cpu_t *);

/*
 * This callback is used to redefine the topspeed for a CPU device.
 * Since all CPUs in a domain should have identical properties, this
 * callback is initialized by the PPM driver to point to a routine
 * that will redefine the topspeed for all devices in a CPU domain.
 * This callback is exercised whenever an ACPI _PPC change notification
 * is received by the CPU driver.
 */
void (*cpupm_redefine_topspeed)(void *);
     71 
/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to set a new topspeed for a CPU.
 */
void (*cpupm_set_topspeed_callb)(void *, int);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
 */
int (*cpupm_get_topspeed_callb)(void *);
     83 
/*
 * Forward declarations for the file-local notification helpers that are
 * defined further down in this file.
 */
static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
static void cpupm_free_notify_handlers(cpu_t *);
static void cpupm_power_manage_notifications(void *);

/*
 * Until proven otherwise, all power states are manageable.
 * Bits are cleared by cpupm_disable() and tested by cpupm_is_enabled().
 */
static uint32_t cpupm_enabled = CPUPM_ALL_STATES;

/*
 * Heads of the pm_next-linked lists of P-, T- and C-state power domains.
 * cpupm_alloc_domains() links new domains at the head of the matching
 * list and cpupm_free_domains() releases a whole list.
 */
cpupm_state_domains_t *cpupm_pstate_domains = NULL;
cpupm_state_domains_t *cpupm_tstate_domains = NULL;
cpupm_state_domains_t *cpupm_cstate_domains = NULL;
     96 
/*
 * c-state tunables
 *
 * cpupm_cs_sample_interval is the length of time we wait before
 * recalculating c-state statistics.  When a CPU goes idle it checks
 * to see if it has been longer than cpupm_cs_sample_interval since it last
 * calculated which C-state to go to.
 *
 * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle
 * divided by time spent in the idle state transitions.
 * A value of 10 means the CPU will not spend more than 1/10 of its time
 * in idle latency.  The worst case performance will be 90% of non Deep C-state
 * kernel.
 *
 * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state
 * before it is worth going there.  Expressed as a multiple of latency.
 *
 * cpupm_C2_idle_pct_tunable and cpupm_C3_idle_pct_tunable are idle
 * percentages: when the idle percentage of the last sample is below the
 * tunable, cpupm_next_cstate() will not select a C2 (respectively C3)
 * entry.
 */
uint32_t cpupm_cs_sample_interval = 100*1000*1000;	/* 100 milliseconds */
uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
uint16_t cpupm_C2_idle_pct_tunable = 70;	/* min idle % for C2 */
uint16_t cpupm_C3_idle_pct_tunable = 80;	/* min idle % for C3 */
    119 
    120 #ifndef __xpv
    121 extern boolean_t cpupm_intel_init(cpu_t *);
    122 extern boolean_t cpupm_amd_init(cpu_t *);
    123 
    124 typedef struct cpupm_vendor {
    125 	boolean_t	(*cpuv_init)(cpu_t *);
    126 } cpupm_vendor_t;
    127 
    128 /*
    129  * Table of supported vendors.
    130  */
    131 static cpupm_vendor_t cpupm_vendors[] = {
    132 	cpupm_intel_init,
    133 	cpupm_amd_init,
    134 	NULL
    135 };
    136 #endif
    137 
    138 /*
    139  * Initialize the machine.
    140  * See if a module exists for managing power for this CPU.
    141  */
    142 /*ARGSUSED*/
    143 void
    144 cpupm_init(cpu_t *cp)
    145 {
    146 #ifndef __xpv
    147 	cpupm_vendor_t *vendors;
    148 	cpupm_mach_state_t *mach_state;
    149 	struct machcpu *mcpu = &(cp->cpu_m);
    150 	static boolean_t first = B_TRUE;
    151 	int *speeds;
    152 	uint_t nspeeds;
    153 	int ret;
    154 
    155 	mach_state = cp->cpu_m.mcpu_pm_mach_state =
    156 	    kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
    157 	mach_state->ms_caps = CPUPM_NO_STATES;
    158 	mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
    159 
    160 	mach_state->ms_acpi_handle = cpu_acpi_init(cp);
    161 	if (mach_state->ms_acpi_handle == NULL) {
    162 		cpupm_fini(cp);
    163 		cmn_err(CE_WARN, "!cpupm_init: processor %d: "
    164 		    "unable to get ACPI handle", cp->cpu_id);
    165 		cmn_err(CE_NOTE, "!CPU power management will not function.");
    166 		CPUPM_DISABLE();
    167 		first = B_FALSE;
    168 		return;
    169 	}
    170 
    171 	/*
    172 	 * Loop through the CPU management module table and see if
    173 	 * any of the modules implement CPU power management
    174 	 * for this CPU.
    175 	 */
    176 	for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
    177 		if (vendors->cpuv_init(cp))
    178 			break;
    179 	}
    180 
    181 	/*
    182 	 * Nope, we can't power manage this CPU.
    183 	 */
    184 	if (vendors == NULL) {
    185 		cpupm_fini(cp);
    186 		CPUPM_DISABLE();
    187 		first = B_FALSE;
    188 		return;
    189 	}
    190 
    191 	/*
    192 	 * If P-state support exists for this system, then initialize it.
    193 	 */
    194 	if (mach_state->ms_pstate.cma_ops != NULL) {
    195 		ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
    196 		if (ret != 0) {
    197 			mach_state->ms_pstate.cma_ops = NULL;
    198 			cpupm_disable(CPUPM_P_STATES);
    199 		} else {
    200 			nspeeds = cpupm_get_speeds(cp, &speeds);
    201 			if (nspeeds == 0) {
    202 				cmn_err(CE_NOTE, "!cpupm_init: processor %d:"
    203 				    " no speeds to manage", cp->cpu_id);
    204 			} else {
    205 				cpupm_set_supp_freqs(cp, speeds, nspeeds);
    206 				cpupm_free_speeds(speeds, nspeeds);
    207 				mach_state->ms_caps |= CPUPM_P_STATES;
    208 			}
    209 		}
    210 	} else {
    211 		cpupm_disable(CPUPM_P_STATES);
    212 	}
    213 
    214 	if (mach_state->ms_tstate.cma_ops != NULL) {
    215 		ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
    216 		if (ret != 0) {
    217 			mach_state->ms_tstate.cma_ops = NULL;
    218 			cpupm_disable(CPUPM_T_STATES);
    219 		} else {
    220 			mach_state->ms_caps |= CPUPM_T_STATES;
    221 		}
    222 	} else {
    223 		cpupm_disable(CPUPM_T_STATES);
    224 	}
    225 
    226 	/*
    227 	 * If C-states support exists for this system, then initialize it.
    228 	 */
    229 	if (mach_state->ms_cstate.cma_ops != NULL) {
    230 		ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
    231 		if (ret != 0) {
    232 			mach_state->ms_cstate.cma_ops = NULL;
    233 			mcpu->max_cstates = CPU_ACPI_C1;
    234 			cpupm_disable(CPUPM_C_STATES);
    235 			idle_cpu = non_deep_idle_cpu;
    236 			disp_enq_thread = non_deep_idle_disp_enq_thread;
    237 		} else if (cpu_deep_cstates_supported()) {
    238 			mcpu->max_cstates = cpu_acpi_get_max_cstates(
    239 			    mach_state->ms_acpi_handle);
    240 			if (mcpu->max_cstates > CPU_ACPI_C1) {
    241 				(void) cstate_timer_callback(
    242 				    CST_EVENT_MULTIPLE_CSTATES);
    243 				CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
    244 				mcpu->mcpu_idle_type = CPU_ACPI_C1;
    245 				disp_enq_thread = cstate_wakeup;
    246 			} else {
    247 				(void) cstate_timer_callback(
    248 				    CST_EVENT_ONE_CSTATE);
    249 			}
    250 			mach_state->ms_caps |= CPUPM_C_STATES;
    251 		} else {
    252 			mcpu->max_cstates = CPU_ACPI_C1;
    253 			idle_cpu = non_deep_idle_cpu;
    254 			disp_enq_thread = non_deep_idle_disp_enq_thread;
    255 		}
    256 	} else {
    257 		cpupm_disable(CPUPM_C_STATES);
    258 	}
    259 
    260 
    261 	if (mach_state->ms_caps == CPUPM_NO_STATES) {
    262 		cpupm_fini(cp);
    263 		CPUPM_DISABLE();
    264 		first = B_FALSE;
    265 		return;
    266 	}
    267 
    268 	if ((mach_state->ms_caps & CPUPM_T_STATES) ||
    269 	    (mach_state->ms_caps & CPUPM_P_STATES) ||
    270 	    (mach_state->ms_caps & CPUPM_C_STATES)) {
    271 		if (first) {
    272 			acpica_write_cpupm_capabilities(
    273 			    mach_state->ms_caps & CPUPM_P_STATES,
    274 			    mach_state->ms_caps & CPUPM_C_STATES);
    275 		}
    276 		if (mach_state->ms_caps & CPUPM_T_STATES) {
    277 			cpupm_throttle_manage_notification(cp);
    278 		}
    279 		if (mach_state->ms_caps & CPUPM_C_STATES) {
    280 			cpuidle_manage_cstates(cp);
    281 		}
    282 		if (mach_state->ms_caps & CPUPM_P_STATES) {
    283 			cpupm_power_manage_notifications(cp);
    284 		}
    285 		cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
    286 	}
    287 	first = B_FALSE;
    288 #endif
    289 }
    290 
    291 /*
    292  * Free any resources allocated during cpupm initialization or cpupm start.
    293  */
    294 /*ARGSUSED*/
    295 void
    296 cpupm_free(cpu_t *cp, boolean_t cpupm_stop)
    297 {
    298 #ifndef __xpv
    299 	cpupm_mach_state_t *mach_state =
    300 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
    301 
    302 	if (mach_state == NULL)
    303 		return;
    304 
    305 	if (mach_state->ms_pstate.cma_ops != NULL) {
    306 		if (cpupm_stop)
    307 			mach_state->ms_pstate.cma_ops->cpus_stop(cp);
    308 		else
    309 			mach_state->ms_pstate.cma_ops->cpus_fini(cp);
    310 		mach_state->ms_pstate.cma_ops = NULL;
    311 	}
    312 
    313 	if (mach_state->ms_tstate.cma_ops != NULL) {
    314 		if (cpupm_stop)
    315 			mach_state->ms_tstate.cma_ops->cpus_stop(cp);
    316 		else
    317 			mach_state->ms_tstate.cma_ops->cpus_fini(cp);
    318 		mach_state->ms_tstate.cma_ops = NULL;
    319 	}
    320 
    321 	if (mach_state->ms_cstate.cma_ops != NULL) {
    322 		if (cpupm_stop)
    323 			mach_state->ms_cstate.cma_ops->cpus_stop(cp);
    324 		else
    325 			mach_state->ms_cstate.cma_ops->cpus_fini(cp);
    326 
    327 		mach_state->ms_cstate.cma_ops = NULL;
    328 	}
    329 
    330 	cpupm_free_notify_handlers(cp);
    331 
    332 	if (mach_state->ms_acpi_handle != NULL) {
    333 		cpu_acpi_fini(mach_state->ms_acpi_handle);
    334 		mach_state->ms_acpi_handle = NULL;
    335 	}
    336 
    337 	mutex_destroy(&mach_state->ms_lock);
    338 	kmem_free(mach_state, sizeof (cpupm_mach_state_t));
    339 	cp->cpu_m.mcpu_pm_mach_state = NULL;
    340 #endif
    341 }
    342 
    343 void
    344 cpupm_fini(cpu_t *cp)
    345 {
    346 	/*
    347 	 * call (*cpus_fini)() ops to release the cpupm resource
    348 	 * in the P/C/T-state driver
    349 	 */
    350 	cpupm_free(cp, B_FALSE);
    351 }
    352 
    353 void
    354 cpupm_start(cpu_t *cp)
    355 {
    356 	cpupm_init(cp);
    357 }
    358 
    359 void
    360 cpupm_stop(cpu_t *cp)
    361 {
    362 	/*
    363 	 * call (*cpus_stop)() ops to reclaim the cpupm resource
    364 	 * in the P/C/T-state driver
    365 	 */
    366 	cpupm_free(cp, B_TRUE);
    367 }
    368 
    369 /*
    370  * If A CPU has started and at least one power state is manageable,
    371  * then the CPU is ready for power management.
    372  */
    373 boolean_t
    374 cpupm_is_ready(cpu_t *cp)
    375 {
    376 #ifndef __xpv
    377 	cpupm_mach_state_t *mach_state =
    378 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
    379 	uint32_t cpupm_caps = mach_state->ms_caps;
    380 
    381 	if (cpupm_enabled == CPUPM_NO_STATES)
    382 		return (B_FALSE);
    383 
    384 	if ((cpupm_caps & CPUPM_T_STATES) ||
    385 	    (cpupm_caps & CPUPM_P_STATES) ||
    386 	    (cpupm_caps & CPUPM_C_STATES))
    387 
    388 		return (B_TRUE);
    389 	return (B_FALSE);
    390 #else
    391 	_NOTE(ARGUNUSED(cp));
    392 	return (B_FALSE);
    393 #endif
    394 }
    395 
    396 boolean_t
    397 cpupm_is_enabled(uint32_t state)
    398 {
    399 	return ((cpupm_enabled & state) == state);
    400 }
    401 
    402 /*
    403  * By default, all states are enabled.
    404  */
    405 void
    406 cpupm_disable(uint32_t state)
    407 {
    408 
    409 	if (state & CPUPM_P_STATES) {
    410 		cpupm_free_domains(&cpupm_pstate_domains);
    411 	}
    412 	if (state & CPUPM_T_STATES) {
    413 		cpupm_free_domains(&cpupm_tstate_domains);
    414 	}
    415 	if (state & CPUPM_C_STATES) {
    416 		cpupm_free_domains(&cpupm_cstate_domains);
    417 	}
    418 	cpupm_enabled &= ~state;
    419 }
    420 
    421 /*
    422  * Allocate power domains for C,P and T States
    423  */
    424 void
    425 cpupm_alloc_domains(cpu_t *cp, int state)
    426 {
    427 	cpupm_mach_state_t *mach_state =
    428 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
    429 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
    430 	cpupm_state_domains_t **dom_ptr;
    431 	cpupm_state_domains_t *dptr;
    432 	cpupm_state_domains_t **mach_dom_state_ptr;
    433 	uint32_t domain;
    434 	uint32_t type;
    435 
    436 	switch (state) {
    437 	case CPUPM_P_STATES:
    438 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
    439 			domain = CPU_ACPI_PSD(handle).sd_domain;
    440 			type = CPU_ACPI_PSD(handle).sd_type;
    441 		} else {
    442 			mutex_enter(&cpu_lock);
    443 			domain = cpuid_get_chipid(cp);
    444 			mutex_exit(&cpu_lock);
    445 			type = CPU_ACPI_HW_ALL;
    446 		}
    447 		dom_ptr = &cpupm_pstate_domains;
    448 		mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
    449 		break;
    450 	case CPUPM_T_STATES:
    451 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
    452 			domain = CPU_ACPI_TSD(handle).sd_domain;
    453 			type = CPU_ACPI_TSD(handle).sd_type;
    454 		} else {
    455 			mutex_enter(&cpu_lock);
    456 			domain = cpuid_get_chipid(cp);
    457 			mutex_exit(&cpu_lock);
    458 			type = CPU_ACPI_HW_ALL;
    459 		}
    460 		dom_ptr = &cpupm_tstate_domains;
    461 		mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
    462 		break;
    463 	case CPUPM_C_STATES:
    464 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
    465 			domain = CPU_ACPI_CSD(handle).sd_domain;
    466 			type = CPU_ACPI_CSD(handle).sd_type;
    467 		} else {
    468 			mutex_enter(&cpu_lock);
    469 			domain = cpuid_get_coreid(cp);
    470 			mutex_exit(&cpu_lock);
    471 			type = CPU_ACPI_HW_ALL;
    472 		}
    473 		dom_ptr = &cpupm_cstate_domains;
    474 		mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
    475 		break;
    476 	default:
    477 		return;
    478 	}
    479 
    480 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
    481 		if (dptr->pm_domain == domain)
    482 			break;
    483 	}
    484 
    485 	/* new domain is created and linked at the head */
    486 	if (dptr == NULL) {
    487 		dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
    488 		dptr->pm_domain = domain;
    489 		dptr->pm_type = type;
    490 		dptr->pm_next = *dom_ptr;
    491 		mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
    492 		    (void *)ipltospl(DISP_LEVEL));
    493 		CPUSET_ZERO(dptr->pm_cpus);
    494 		*dom_ptr = dptr;
    495 	}
    496 	CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
    497 	*mach_dom_state_ptr = dptr;
    498 }
    499 
    500 /*
    501  * Free C, P or T state power domains
    502  */
    503 void
    504 cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
    505 {
    506 	cpupm_state_domains_t *this_domain, *next_domain;
    507 
    508 	this_domain = *dom_ptr;
    509 	while (this_domain != NULL) {
    510 		next_domain = this_domain->pm_next;
    511 		mutex_destroy(&this_domain->pm_lock);
    512 		kmem_free((void *)this_domain,
    513 		    sizeof (cpupm_state_domains_t));
    514 		this_domain = next_domain;
    515 	}
    516 	*dom_ptr = NULL;
    517 }
    518 
    519 /*
    520  * Remove CPU from C, P or T state power domains
    521  */
    522 void
    523 cpupm_remove_domains(cpu_t *cp, int state, cpupm_state_domains_t **dom_ptr)
    524 {
    525 	cpupm_mach_state_t *mach_state =
    526 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
    527 	cpupm_state_domains_t *dptr;
    528 	uint32_t pm_domain;
    529 
    530 	ASSERT(mach_state);
    531 
    532 	switch (state) {
    533 	case CPUPM_P_STATES:
    534 		pm_domain = mach_state->ms_pstate.cma_domain->pm_domain;
    535 		break;
    536 	case CPUPM_T_STATES:
    537 		pm_domain = mach_state->ms_tstate.cma_domain->pm_domain;
    538 		break;
    539 	case CPUPM_C_STATES:
    540 		pm_domain = mach_state->ms_cstate.cma_domain->pm_domain;
    541 		break;
    542 	default:
    543 		return;
    544 	}
    545 
    546 	/*
    547 	 * Find the CPU C, P or T state power domain
    548 	 */
    549 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
    550 		if (dptr->pm_domain == pm_domain)
    551 			break;
    552 	}
    553 
    554 	/*
    555 	 * return if no matched domain found
    556 	 */
    557 	if (dptr == NULL)
    558 		return;
    559 
    560 	/*
    561 	 * We found one matched power domain, remove CPU from its cpuset.
    562 	 * pm_lock(spin lock) here to avoid the race conditions between
    563 	 * event change notification and cpu remove.
    564 	 */
    565 	mutex_enter(&dptr->pm_lock);
    566 	if (CPU_IN_SET(dptr->pm_cpus, cp->cpu_id))
    567 		CPUSET_DEL(dptr->pm_cpus, cp->cpu_id);
    568 	mutex_exit(&dptr->pm_lock);
    569 }
    570 
    571 void
    572 cpupm_alloc_ms_cstate(cpu_t *cp)
    573 {
    574 	cpupm_mach_state_t *mach_state;
    575 	cpupm_mach_acpi_state_t *ms_cstate;
    576 
    577 	mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
    578 	ms_cstate = &mach_state->ms_cstate;
    579 	ASSERT(ms_cstate->cma_state.cstate == NULL);
    580 	ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
    581 	    KM_SLEEP);
    582 	ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
    583 }
    584 
    585 void
    586 cpupm_free_ms_cstate(cpu_t *cp)
    587 {
    588 	cpupm_mach_state_t *mach_state =
    589 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
    590 	cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
    591 
    592 	if (ms_cstate->cma_state.cstate != NULL) {
    593 		kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
    594 		ms_cstate->cma_state.cstate = NULL;
    595 	}
    596 }
    597 
    598 void
    599 cpupm_state_change(cpu_t *cp, int level, int state)
    600 {
    601 	cpupm_mach_state_t	*mach_state =
    602 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
    603 	cpupm_state_ops_t	*state_ops;
    604 	cpupm_state_domains_t  	*state_domain;
    605 	cpuset_t		set;
    606 
    607 	DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
    608 
    609 	if (mach_state == NULL) {
    610 		return;
    611 	}
    612 
    613 	switch (state) {
    614 	case CPUPM_P_STATES:
    615 		state_ops = mach_state->ms_pstate.cma_ops;
    616 		state_domain = mach_state->ms_pstate.cma_domain;
    617 		break;
    618 	case CPUPM_T_STATES:
    619 		state_ops = mach_state->ms_tstate.cma_ops;
    620 		state_domain = mach_state->ms_tstate.cma_domain;
    621 		break;
    622 	default:
    623 		break;
    624 	}
    625 
    626 	switch (state_domain->pm_type) {
    627 	case CPU_ACPI_SW_ANY:
    628 		/*
    629 		 * A request on any CPU in the domain transitions the domain
    630 		 */
    631 		CPUSET_ONLY(set, cp->cpu_id);
    632 		state_ops->cpus_change(set, level);
    633 		break;
    634 	case CPU_ACPI_SW_ALL:
    635 		/*
    636 		 * All CPUs in the domain must request the transition
    637 		 */
    638 	case CPU_ACPI_HW_ALL:
    639 		/*
    640 		 * P/T-state transitions are coordinated by the hardware
    641 		 * For now, request the transition on all CPUs in the domain,
    642 		 * but looking ahead we can probably be smarter about this.
    643 		 */
    644 		mutex_enter(&state_domain->pm_lock);
    645 		state_ops->cpus_change(state_domain->pm_cpus, level);
    646 		mutex_exit(&state_domain->pm_lock);
    647 		break;
    648 	default:
    649 		cmn_err(CE_NOTE, "Unknown domain coordination type: %d",
    650 		    state_domain->pm_type);
    651 	}
    652 }
    653 
    654 /*
    655  * CPU PM interfaces exposed to the CPU power manager
    656  */
    657 /*ARGSUSED*/
    658 id_t
    659 cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
    660 {
    661 	cpupm_mach_state_t	*mach_state =
    662 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
    663 
    664 	if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
    665 	    !cpupm_is_enabled(CPUPM_C_STATES))) {
    666 		return (CPUPM_NO_DOMAIN);
    667 	}
    668 	if (type == CPUPM_DTYPE_ACTIVE) {
    669 		/*
    670 		 * Return P-State domain for the specified CPU
    671 		 */
    672 		if (mach_state->ms_pstate.cma_domain) {
    673 			return (mach_state->ms_pstate.cma_domain->pm_domain);
    674 		}
    675 	} else if (type == CPUPM_DTYPE_IDLE) {
    676 		/*
    677 		 * Return C-State domain for the specified CPU
    678 		 */
    679 		if (mach_state->ms_cstate.cma_domain) {
    680 			return (mach_state->ms_cstate.cma_domain->pm_domain);
    681 		}
    682 	}
    683 	return (CPUPM_NO_DOMAIN);
    684 }
    685 
    686 /*ARGSUSED*/
    687 uint_t
    688 cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
    689     cpupm_state_t *states)
    690 {
    691 	int	*speeds;
    692 	uint_t	nspeeds, i;
    693 
    694 	/*
    695 	 * Idle domain support unimplemented
    696 	 */
    697 	if (type != CPUPM_DTYPE_ACTIVE) {
    698 		return (0);
    699 	}
    700 	nspeeds = cpupm_get_speeds(cp, &speeds);
    701 
    702 	/*
    703 	 * If the caller passes NULL for states, just return the
    704 	 * number of states.
    705 	 */
    706 	if (states != NULL) {
    707 		for (i = 0; i < nspeeds; i++) {
    708 			states[i].cps_speed = speeds[i];
    709 			states[i].cps_handle = (cpupm_handle_t)i;
    710 		}
    711 	}
    712 	cpupm_free_speeds(speeds, nspeeds);
    713 	return (nspeeds);
    714 }
    715 
    716 /*ARGSUSED*/
    717 int
    718 cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
    719 {
    720 	if (!cpupm_is_ready(cp))
    721 		return (-1);
    722 
    723 	cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
    724 
    725 	return (0);
    726 }
    727 
    728 /*ARGSUSED*/
    729 /*
    730  * Note: It is the responsibility of the users of
    731  * cpupm_get_speeds() to free the memory allocated
    732  * for speeds using cpupm_free_speeds()
    733  */
    734 uint_t
    735 cpupm_get_speeds(cpu_t *cp, int **speeds)
    736 {
    737 #ifndef __xpv
    738 	cpupm_mach_state_t *mach_state =
    739 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
    740 	return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
    741 #else
    742 	return (0);
    743 #endif
    744 }
    745 
    746 /*ARGSUSED*/
    747 void
    748 cpupm_free_speeds(int *speeds, uint_t nspeeds)
    749 {
    750 #ifndef __xpv
    751 	cpu_acpi_free_speeds(speeds, nspeeds);
    752 #endif
    753 }
    754 
    755 /*
    756  * All CPU instances have been initialized successfully.
    757  */
    758 boolean_t
    759 cpupm_power_ready(cpu_t *cp)
    760 {
    761 	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready(cp));
    762 }
    763 
    764 /*
    765  * All CPU instances have been initialized successfully.
    766  */
    767 boolean_t
    768 cpupm_throttle_ready(cpu_t *cp)
    769 {
    770 	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready(cp));
    771 }
    772 
    773 /*
    774  * All CPU instances have been initialized successfully.
    775  */
    776 boolean_t
    777 cpupm_cstate_ready(cpu_t *cp)
    778 {
    779 	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready(cp));
    780 }
    781 
    782 void
    783 cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
    784 {
    785 	cpu_t *cp = ctx;
    786 	cpupm_mach_state_t *mach_state =
    787 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
    788 	cpupm_notification_t *entry;
    789 
    790 	mutex_enter(&mach_state->ms_lock);
    791 	for (entry =  mach_state->ms_handlers; entry != NULL;
    792 	    entry = entry->nq_next) {
    793 		entry->nq_handler(obj, val, entry->nq_ctx);
    794 	}
    795 	mutex_exit(&mach_state->ms_lock);
    796 }
    797 
    798 /*ARGSUSED*/
    799 void
    800 cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
    801 {
    802 #ifndef __xpv
    803 	cpupm_mach_state_t *mach_state =
    804 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
    805 	cpupm_notification_t *entry;
    806 
    807 	entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
    808 	entry->nq_handler = handler;
    809 	entry->nq_ctx = ctx;
    810 	mutex_enter(&mach_state->ms_lock);
    811 	if (mach_state->ms_handlers == NULL) {
    812 		entry->nq_next = NULL;
    813 		mach_state->ms_handlers = entry;
    814 		cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
    815 		    cpupm_notify_handler, cp);
    816 
    817 	} else {
    818 		entry->nq_next = mach_state->ms_handlers;
    819 		mach_state->ms_handlers = entry;
    820 	}
    821 	mutex_exit(&mach_state->ms_lock);
    822 #endif
    823 }
    824 
    825 /*ARGSUSED*/
    826 static void
    827 cpupm_free_notify_handlers(cpu_t *cp)
    828 {
    829 #ifndef __xpv
    830 	cpupm_mach_state_t *mach_state =
    831 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
    832 	cpupm_notification_t *entry;
    833 	cpupm_notification_t *next;
    834 
    835 	mutex_enter(&mach_state->ms_lock);
    836 	if (mach_state->ms_handlers == NULL) {
    837 		mutex_exit(&mach_state->ms_lock);
    838 		return;
    839 	}
    840 	if (mach_state->ms_acpi_handle != NULL) {
    841 		cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
    842 		    cpupm_notify_handler);
    843 	}
    844 	entry = mach_state->ms_handlers;
    845 	while (entry != NULL) {
    846 		next = entry->nq_next;
    847 		kmem_free(entry, sizeof (cpupm_notification_t));
    848 		entry = next;
    849 	}
    850 	mach_state->ms_handlers = NULL;
    851 	mutex_exit(&mach_state->ms_lock);
    852 #endif
    853 }
    854 
    855 /*
    856  * Get the current max speed from the ACPI _PPC object
    857  */
    858 /*ARGSUSED*/
    859 int
    860 cpupm_get_top_speed(cpu_t *cp)
    861 {
    862 #ifndef __xpv
    863 	cpupm_mach_state_t 	*mach_state;
    864 	cpu_acpi_handle_t 	handle;
    865 	int 			plat_level;
    866 	uint_t			nspeeds;
    867 	int			max_level;
    868 
    869 	mach_state =
    870 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
    871 	handle = mach_state->ms_acpi_handle;
    872 
    873 	cpu_acpi_cache_ppc(handle);
    874 	plat_level = CPU_ACPI_PPC(handle);
    875 
    876 	nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
    877 
    878 	max_level = nspeeds - 1;
    879 	if ((plat_level < 0) || (plat_level > max_level)) {
    880 		cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
    881 		    "_PPC out of range %d", cp->cpu_id, plat_level);
    882 		plat_level = 0;
    883 	}
    884 
    885 	return (plat_level);
    886 #else
    887 	return (0);
    888 #endif
    889 }
    890 
    891 /*
    892  * This notification handler is called whenever the ACPI _PPC
    893  * object changes. The _PPC is a sort of governor on power levels.
    894  * It sets an upper threshold on which, _PSS defined, power levels
    895  * are usuable. The _PPC value is dynamic and may change as properties
    896  * (i.e., thermal or AC source) of the system change.
    897  */
    898 
    899 static void
    900 cpupm_power_manage_notifications(void *ctx)
    901 {
    902 	cpu_t			*cp = ctx;
    903 	int			top_speed;
    904 
    905 	top_speed = cpupm_get_top_speed(cp);
    906 	cpupm_redefine_max_activepwr_state(cp, top_speed);
    907 }
    908 
    909 /* ARGSUSED */
    910 static void
    911 cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
    912 {
    913 #ifndef __xpv
    914 
    915 	cpu_t *cp = ctx;
    916 	cpupm_mach_state_t *mach_state =
    917 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
    918 
    919 	if (mach_state == NULL)
    920 		return;
    921 
    922 	/*
    923 	 * Currently, we handle _TPC,_CST and _PPC change notifications.
    924 	 */
    925 	if (val == CPUPM_TPC_CHANGE_NOTIFICATION &&
    926 	    mach_state->ms_caps & CPUPM_T_STATES) {
    927 		cpupm_throttle_manage_notification(ctx);
    928 	} else if (val == CPUPM_CST_CHANGE_NOTIFICATION &&
    929 	    mach_state->ms_caps & CPUPM_C_STATES) {
    930 		cpuidle_manage_cstates(ctx);
    931 	} else if (val == CPUPM_PPC_CHANGE_NOTIFICATION &&
    932 	    mach_state->ms_caps & CPUPM_P_STATES) {
    933 		cpupm_power_manage_notifications(ctx);
    934 	}
    935 #endif
    936 }
    937 
    938 /*
    939  * Update cpupm cstate data each time CPU exits idle.
    940  */
    941 void
    942 cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
    943 {
    944 	cs_data->cs_idle_exit = end;
    945 }
    946 
    947 /*
    948  * Determine next cstate based on cpupm data.
    949  * Update cpupm cstate data each time CPU goes idle.
    950  * Do as much as possible in the idle state bookkeeping function because the
    951  * performance impact while idle is minimal compared to in the wakeup function
    952  * when there is real work to do.
    953  */
    954 uint32_t
    955 cpupm_next_cstate(cma_c_state_t *cs_data, cpu_acpi_cstate_t *cstates,
    956     uint32_t cs_count, hrtime_t start)
    957 {
    958 	hrtime_t duration;
    959 	hrtime_t ave_interval;
    960 	hrtime_t ave_idle_time;
    961 	uint32_t i, smpl_cnt;
    962 
    963 	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
    964 	scalehrtime(&duration);
    965 	cs_data->cs_idle += duration;
    966 	cs_data->cs_idle_enter = start;
    967 
    968 	smpl_cnt = ++cs_data->cs_cnt;
    969 	cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
    970 	scalehrtime(&cs_data->cs_smpl_len);
    971 	if (cs_data->cs_smpl_len > cpupm_cs_sample_interval) {
    972 		cs_data->cs_smpl_idle = cs_data->cs_idle;
    973 		cs_data->cs_idle = 0;
    974 		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
    975 		    cs_data->cs_smpl_len);
    976 
    977 		cs_data->cs_smpl_start = start;
    978 		cs_data->cs_cnt = 0;
    979 
    980 		/*
    981 		 * Strand level C-state policy
    982 		 * The cpu_acpi_cstate_t *cstates array is not required to
    983 		 * have an entry for both CPU_ACPI_C2 and CPU_ACPI_C3.
    984 		 * There are cs_count entries in the cstates array.
    985 		 * cs_data->cs_next_cstate contains the index of the next
    986 		 * C-state this CPU should enter.
    987 		 */
    988 		ASSERT(cstates[0].cs_type == CPU_ACPI_C1);
    989 
    990 		/*
    991 		 * Will CPU be idle long enough to save power?
    992 		 */
    993 		ave_idle_time = (cs_data->cs_smpl_idle / smpl_cnt) / 1000;
    994 		for (i = 1; i < cs_count; ++i) {
    995 			if (ave_idle_time < (cstates[i].cs_latency *
    996 			    cpupm_cs_idle_save_tunable)) {
    997 				cs_count = i;
    998 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
    999 				    CPU, int, i);
   1000 			}
   1001 		}
   1002 
   1003 		/*
   1004 		 * Wakeup often (even when non-idle time is very short)?
   1005 		 * Some producer/consumer type loads fall into this category.
   1006 		 */
   1007 		ave_interval = (cs_data->cs_smpl_len / smpl_cnt) / 1000;
   1008 		for (i = 1; i < cs_count; ++i) {
   1009 			if (ave_interval <= (cstates[i].cs_latency *
   1010 			    cpupm_cs_idle_cost_tunable)) {
   1011 				cs_count = i;
   1012 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
   1013 				    CPU, int, (CPU_MAX_CSTATES + i));
   1014 			}
   1015 		}
   1016 
   1017 		/*
   1018 		 * Idle percent
   1019 		 */
   1020 		for (i = 1; i < cs_count; ++i) {
   1021 			switch (cstates[i].cs_type) {
   1022 			case CPU_ACPI_C2:
   1023 				if (cs_data->cs_smpl_idle_pct <
   1024 				    cpupm_C2_idle_pct_tunable) {
   1025 					cs_count = i;
   1026 					DTRACE_PROBE2(cpupm__next__cstate,
   1027 					    cpu_t *, CPU, int,
   1028 					    ((2 * CPU_MAX_CSTATES) + i));
   1029 				}
   1030 				break;
   1031 
   1032 			case CPU_ACPI_C3:
   1033 				if (cs_data->cs_smpl_idle_pct <
   1034 				    cpupm_C3_idle_pct_tunable) {
   1035 					cs_count = i;
   1036 					DTRACE_PROBE2(cpupm__next__cstate,
   1037 					    cpu_t *, CPU, int,
   1038 					    ((2 * CPU_MAX_CSTATES) + i));
   1039 				}
   1040 				break;
   1041 			}
   1042 		}
   1043 
   1044 		cs_data->cs_next_cstate = cs_count - 1;
   1045 	}
   1046 
   1047 	return (cs_data->cs_next_cstate);
   1048 }
   1049