Home | History | Annotate | Download | only in disp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/types.h>
     29 #include <sys/systm.h>
     30 #include <sys/cmn_err.h>
     31 #include <sys/cpuvar.h>
     32 #include <sys/thread.h>
     33 #include <sys/disp.h>
     34 #include <sys/kmem.h>
     35 #include <sys/debug.h>
     36 #include <sys/cpupart.h>
     37 #include <sys/pset.h>
     38 #include <sys/var.h>
     39 #include <sys/cyclic.h>
     40 #include <sys/lgrp.h>
     41 #include <sys/pghw.h>
     42 #include <sys/loadavg.h>
     43 #include <sys/class.h>
     44 #include <sys/fss.h>
     45 #include <sys/pool.h>
     46 #include <sys/pool_pset.h>
     47 #include <sys/policy.h>
     48 
     49 /*
     50  * Calling pool_lock() protects the pools configuration, which includes
     51  * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
     52  * partitions from being created or destroyed while the lock is held.
     53  * The lock ordering with respect to related locks is:
     54  *
     55  *    pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
     56  *
     57  * Blocking memory allocations may be made while holding "pool_lock"
     58  * or cpu_lock.
     59  */
     60 
/*
 * The cp_default partition is allocated statically, but its lgroup load average
 * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
 * saves some memory since the space allocated reflects the actual number of
 * lgroups supported by the platform. The lgrp facility provides a temporary
 * space to hold lpl information during system bootstrap.
 */

/* head of the circular, doubly-linked list of partitions */
cpupart_t		*cp_list_head;
/* the default partition; always present, set up by cpupart_initialize_default */
cpupart_t		cp_default;
/* NOTE(review): presumably the machine-dependent part of cp_default -- confirm */
struct mach_cpupart	cp_default_mach;
/* starting point for the search for a free partition ID in cpupart_create */
static cpupartid_t	cp_id_next;
/* number of partitions on the list, including cp_default */
uint_t			cp_numparts;
/* number of partitions that currently own at least one CPU */
uint_t			cp_numparts_nonempty;
     75 
/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 *
 * A value of zero at the time cpupart_initialize_default() runs means
 * "not tuned"; that routine then sets the limit to max_ncpus * 2 + 1.
 */
static uint_t		cp_max_numparts;
     85 
/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets.  A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue.  The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set).  Hence, the
 * numbering for processor sets and CPU partitions is identical.  This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 *
 * PSTOCP maps a processor set ID to a partition ID (PS_NONE becomes
 * CP_DEFAULT); CPTOPS is the inverse (CP_DEFAULT becomes PS_NONE).  Any
 * other value is passed through with only a cast.  Both macros evaluate
 * their argument more than once.
 */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
    102 
    103 
/* forward declaration: used by cpupart_move_cpu() before its definition */
static int cpupart_unbind_threads(cpupart_t *, boolean_t);
    105 
    106 /*
    107  * Find a CPU partition given a processor set ID.
    108  */
    109 static cpupart_t *
    110 cpupart_find_all(psetid_t psid)
    111 {
    112 	cpupart_t *cp;
    113 	cpupartid_t cpid = PSTOCP(psid);
    114 
    115 	ASSERT(MUTEX_HELD(&cpu_lock));
    116 
    117 	/* default partition not visible as a processor set */
    118 	if (psid == CP_DEFAULT)
    119 		return (NULL);
    120 
    121 	if (psid == PS_MYID)
    122 		return (curthread->t_cpupart);
    123 
    124 	cp = cp_list_head;
    125 	do {
    126 		if (cp->cp_id == cpid)
    127 			return (cp);
    128 		cp = cp->cp_next;
    129 	} while (cp != cp_list_head);
    130 	return (NULL);
    131 }
    132 
    133 /*
    134  * Find a CPU partition given a processor set ID if the processor set
    135  * should be visible from the calling zone.
    136  */
    137 cpupart_t *
    138 cpupart_find(psetid_t psid)
    139 {
    140 	cpupart_t *cp;
    141 
    142 	ASSERT(MUTEX_HELD(&cpu_lock));
    143 	cp = cpupart_find_all(psid);
    144 	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
    145 	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
    146 			return (NULL);
    147 	return (cp);
    148 }
    149 
    150 static int
    151 cpupart_kstat_update(kstat_t *ksp, int rw)
    152 {
    153 	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
    154 	cpupart_kstat_t *cpksp = ksp->ks_data;
    155 
    156 	if (rw == KSTAT_WRITE)
    157 		return (EACCES);
    158 
    159 	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
    160 	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
    161 	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
    162 	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
    163 	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
    164 	    (16 - FSHIFT);
    165 	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
    166 	    (16 - FSHIFT);
    167 	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
    168 	    (16 - FSHIFT);
    169 	return (0);
    170 }
    171 
    172 static void
    173 cpupart_kstat_create(cpupart_t *cp)
    174 {
    175 	kstat_t *ksp;
    176 	zoneid_t zoneid;
    177 
    178 	ASSERT(MUTEX_HELD(&cpu_lock));
    179 
    180 	/*
    181 	 * We have a bit of a chicken-egg problem since this code will
    182 	 * get called to create the kstats for CP_DEFAULT before the
    183 	 * pools framework gets initialized.  We circumvent the problem
    184 	 * by special-casing cp_default.
    185 	 */
    186 	if (cp != &cp_default && pool_pset_enabled())
    187 		zoneid = GLOBAL_ZONEID;
    188 	else
    189 		zoneid = ALL_ZONES;
    190 	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
    191 	    KSTAT_TYPE_NAMED,
    192 	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
    193 	if (ksp != NULL) {
    194 		cpupart_kstat_t *cpksp = ksp->ks_data;
    195 
    196 		kstat_named_init(&cpksp->cpk_updates, "updates",
    197 		    KSTAT_DATA_UINT64);
    198 		kstat_named_init(&cpksp->cpk_runnable, "runnable",
    199 		    KSTAT_DATA_UINT64);
    200 		kstat_named_init(&cpksp->cpk_waiting, "waiting",
    201 		    KSTAT_DATA_UINT64);
    202 		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
    203 		    KSTAT_DATA_UINT32);
    204 		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
    205 		    KSTAT_DATA_UINT32);
    206 		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
    207 		    KSTAT_DATA_UINT32);
    208 		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
    209 		    KSTAT_DATA_UINT32);
    210 
    211 		ksp->ks_update = cpupart_kstat_update;
    212 		ksp->ks_private = cp;
    213 
    214 		kstat_install(ksp);
    215 	}
    216 	cp->cp_kstat = ksp;
    217 }
    218 
    219 /*
    220  * Initialize the default partition and kpreempt disp queue.
    221  */
    222 void
    223 cpupart_initialize_default(void)
    224 {
    225 	lgrp_id_t i;
    226 
    227 	cp_list_head = &cp_default;
    228 	cp_default.cp_next = &cp_default;
    229 	cp_default.cp_prev = &cp_default;
    230 	cp_default.cp_id = CP_DEFAULT;
    231 	cp_default.cp_kp_queue.disp_maxrunpri = -1;
    232 	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
    233 	cp_default.cp_kp_queue.disp_cpu = NULL;
    234 	cp_default.cp_gen = 0;
    235 	cp_default.cp_loadavg.lg_cur = 0;
    236 	cp_default.cp_loadavg.lg_len = 0;
    237 	cp_default.cp_loadavg.lg_total = 0;
    238 	for (i = 0; i < S_LOADAVG_SZ; i++) {
    239 		cp_default.cp_loadavg.lg_loads[i] = 0;
    240 	}
    241 	CPUSET_ZERO(cp_default.cp_mach->mc_haltset);
    242 	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
    243 	cp_id_next = CP_DEFAULT + 1;
    244 	cpupart_kstat_create(&cp_default);
    245 	cp_numparts = 1;
    246 	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
    247 		cp_max_numparts = max_ncpus * 2 + 1;
    248 	/*
    249 	 * Allocate space for cp_default list of lgrploads
    250 	 */
    251 	cp_default.cp_nlgrploads = lgrp_plat_max_lgrps();
    252 	cp_default.cp_lgrploads = kmem_zalloc(sizeof (lpl_t) *
    253 	    cp_default.cp_nlgrploads, KM_SLEEP);
    254 
    255 	/*
    256 	 * The initial lpl topology is created in a special lpl list
    257 	 * lpl_bootstrap. It should be copied to cp_default.
    258 	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
    259 	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
    260 	 */
    261 	lpl_topo_bootstrap(cp_default.cp_lgrploads,
    262 	    cp_default.cp_nlgrploads);
    263 
    264 	for (i = 0; i < cp_default.cp_nlgrploads; i++) {
    265 		cp_default.cp_lgrploads[i].lpl_lgrpid = i;
    266 	}
    267 	cp_default.cp_attr = PSET_NOESCAPE;
    268 	cp_numparts_nonempty = 1;
    269 	/*
    270 	 * Set t0's home
    271 	 */
    272 	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
    273 
    274 	bitset_init(&cp_default.cp_cmt_pgs);
    275 }
    276 
    277 
    278 static int
    279 cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
    280 {
    281 	cpupart_t *oldpp;
    282 	cpu_t	*ncp, *newlist;
    283 	kthread_t *t;
    284 	int	move_threads = 1;
    285 	lgrp_id_t lgrpid;
    286 	proc_t 	*p;
    287 	int lgrp_diff_lpl;
    288 	lpl_t	*cpu_lpl;
    289 	int	ret;
    290 	boolean_t unbind_all_threads = (forced != 0);
    291 
    292 	ASSERT(MUTEX_HELD(&cpu_lock));
    293 	ASSERT(newpp != NULL);
    294 
    295 	oldpp = cp->cpu_part;
    296 	ASSERT(oldpp != NULL);
    297 	ASSERT(oldpp->cp_ncpus > 0);
    298 
    299 	if (newpp == oldpp) {
    300 		/*
    301 		 * Don't need to do anything.
    302 		 */
    303 		return (0);
    304 	}
    305 
    306 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);
    307 
    308 	if (!disp_bound_partition(cp, 0)) {
    309 		/*
    310 		 * Don't need to move threads if there are no threads in
    311 		 * the partition.  Note that threads can't enter the
    312 		 * partition while we're holding cpu_lock.
    313 		 */
    314 		move_threads = 0;
    315 	} else if (oldpp->cp_ncpus == 1) {
    316 		/*
    317 		 * The last CPU is removed from a partition which has threads
    318 		 * running in it. Some of these threads may be bound to this
    319 		 * CPU.
    320 		 *
    321 		 * Attempt to unbind threads from the CPU and from the processor
    322 		 * set. Note that no threads should be bound to this CPU since
    323 		 * cpupart_move_threads will refuse to move bound threads to
    324 		 * other CPUs.
    325 		 */
    326 		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
    327 		(void) cpupart_unbind_threads(oldpp, B_FALSE);
    328 
    329 		if (!disp_bound_partition(cp, 0)) {
    330 			/*
    331 			 * No bound threads in this partition any more
    332 			 */
    333 			move_threads = 0;
    334 		} else {
    335 			/*
    336 			 * There are still threads bound to the partition
    337 			 */
    338 			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
    339 			return (EBUSY);
    340 		}
    341 	}
    342 
    343 	/*
    344 	 * If forced flag is set unbind any threads from this CPU.
    345 	 * Otherwise unbind soft-bound threads only.
    346 	 */
    347 	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
    348 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
    349 		return (ret);
    350 	}
    351 
    352 	/*
    353 	 * Stop further threads weak binding to this cpu.
    354 	 */
    355 	cpu_inmotion = cp;
    356 	membar_enter();
    357 
    358 	/*
    359 	 * Notify the Processor Groups subsystem that the CPU
    360 	 * will be moving cpu partitions. This is done before
    361 	 * CPUs are paused to provide an opportunity for any
    362 	 * needed memory allocations.
    363 	 */
    364 	pg_cpupart_out(cp, oldpp);
    365 	pg_cpupart_in(cp, newpp);
    366 
    367 again:
    368 	if (move_threads) {
    369 		int loop_count;
    370 		/*
    371 		 * Check for threads strong or weak bound to this CPU.
    372 		 */
    373 		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
    374 			if (loop_count >= 5) {
    375 				cpu_state_change_notify(cp->cpu_id,
    376 				    CPU_CPUPART_IN);
    377 				pg_cpupart_out(cp, newpp);
    378 				pg_cpupart_in(cp, oldpp);
    379 				cpu_inmotion = NULL;
    380 				return (EBUSY);	/* some threads still bound */
    381 			}
    382 			delay(1);
    383 		}
    384 	}
    385 
    386 	/*
    387 	 * Before we actually start changing data structures, notify
    388 	 * the cyclic subsystem that we want to move this CPU out of its
    389 	 * partition.
    390 	 */
    391 	if (!cyclic_move_out(cp)) {
    392 		/*
    393 		 * This CPU must be the last CPU in a processor set with
    394 		 * a bound cyclic.
    395 		 */
    396 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
    397 		pg_cpupart_out(cp, newpp);
    398 		pg_cpupart_in(cp, oldpp);
    399 		cpu_inmotion = NULL;
    400 		return (EBUSY);
    401 	}
    402 
    403 	pause_cpus(cp);
    404 
    405 	if (move_threads) {
    406 		/*
    407 		 * The thread on cpu before the pause thread may have read
    408 		 * cpu_inmotion before we raised the barrier above.  Check
    409 		 * again.
    410 		 */
    411 		if (disp_bound_threads(cp, 1)) {
    412 			start_cpus();
    413 			goto again;
    414 		}
    415 
    416 	}
    417 
    418 	/*
    419 	 * Now that CPUs are paused, let the PG subsystem perform
    420 	 * any necessary data structure updates.
    421 	 */
    422 	pg_cpupart_move(cp, oldpp, newpp);
    423 
    424 	/* save this cpu's lgroup -- it'll be the same in the new partition */
    425 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
    426 
    427 	cpu_lpl = cp->cpu_lpl;
    428 	/*
    429 	 * let the lgroup framework know cp has left the partition
    430 	 */
    431 	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);
    432 
    433 	/* move out of old partition */
    434 	oldpp->cp_ncpus--;
    435 	if (oldpp->cp_ncpus > 0) {
    436 
    437 		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
    438 		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
    439 		if (oldpp->cp_cpulist == cp) {
    440 			oldpp->cp_cpulist = ncp;
    441 		}
    442 	} else {
    443 		ncp = oldpp->cp_cpulist = NULL;
    444 		cp_numparts_nonempty--;
    445 		ASSERT(cp_numparts_nonempty != 0);
    446 	}
    447 	oldpp->cp_gen++;
    448 
    449 	/* move into new partition */
    450 	newlist = newpp->cp_cpulist;
    451 	if (newlist == NULL) {
    452 		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
    453 		cp_numparts_nonempty++;
    454 		ASSERT(cp_numparts_nonempty != 0);
    455 	} else {
    456 		cp->cpu_next_part = newlist;
    457 		cp->cpu_prev_part = newlist->cpu_prev_part;
    458 		newlist->cpu_prev_part->cpu_next_part = cp;
    459 		newlist->cpu_prev_part = cp;
    460 	}
    461 	cp->cpu_part = newpp;
    462 	newpp->cp_ncpus++;
    463 	newpp->cp_gen++;
    464 
    465 	ASSERT(CPUSET_ISNULL(newpp->cp_mach->mc_haltset));
    466 	ASSERT(CPUSET_ISNULL(oldpp->cp_mach->mc_haltset));
    467 
    468 	/*
    469 	 * let the lgroup framework know cp has entered the partition
    470 	 */
    471 	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
    472 
    473 	/*
    474 	 * If necessary, move threads off processor.
    475 	 */
    476 	if (move_threads) {
    477 		ASSERT(ncp != NULL);
    478 
    479 		/*
    480 		 * Walk thru the active process list to look for
    481 		 * threads that need to have a new home lgroup,
    482 		 * or the last CPU they run on is the same CPU
    483 		 * being moved out of the partition.
    484 		 */
    485 
    486 		for (p = practive; p != NULL; p = p->p_next) {
    487 
    488 			t = p->p_tlist;
    489 
    490 			if (t == NULL)
    491 				continue;
    492 
    493 			lgrp_diff_lpl = 0;
    494 
    495 			do {
    496 
    497 				ASSERT(t->t_lpl != NULL);
    498 
    499 				/*
    500 				 * Update the count of how many threads are
    501 				 * in this CPU's lgroup but have a different lpl
    502 				 */
    503 
    504 				if (t->t_lpl != cpu_lpl &&
    505 				    t->t_lpl->lpl_lgrpid == lgrpid)
    506 					lgrp_diff_lpl++;
    507 				/*
    508 				 * If the lgroup that t is assigned to no
    509 				 * longer has any CPUs in t's partition,
    510 				 * we'll have to choose a new lgroup for t.
    511 				 */
    512 
    513 				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
    514 				    t->t_cpupart)) {
    515 					lgrp_move_thread(t,
    516 					    lgrp_choose(t, t->t_cpupart), 0);
    517 				}
    518 
    519 				/*
    520 				 * make sure lpl points to our own partition
    521 				 */
    522 				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
    523 				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
    524 					t->t_cpupart->cp_nlgrploads));
    525 
    526 				ASSERT(t->t_lpl->lpl_ncpu > 0);
    527 
    528 				/* Update CPU last ran on if it was this CPU */
    529 				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
    530 				    t->t_bound_cpu != cp) {
    531 					t->t_cpu = disp_lowpri_cpu(ncp,
    532 					    t->t_lpl, t->t_pri, NULL);
    533 				}
    534 				t = t->t_forw;
    535 			} while (t != p->p_tlist);
    536 
    537 			/*
    538 			 * Didn't find any threads in the same lgroup as this
    539 			 * CPU with a different lpl, so remove the lgroup from
    540 			 * the process lgroup bitmask.
    541 			 */
    542 
    543 			if (lgrp_diff_lpl)
    544 				klgrpset_del(p->p_lgrpset, lgrpid);
    545 		}
    546 
    547 		/*
    548 		 * Walk thread list looking for threads that need to be
    549 		 * rehomed, since there are some threads that are not in
    550 		 * their process's p_tlist.
    551 		 */
    552 
    553 		t = curthread;
    554 
    555 		do {
    556 			ASSERT(t != NULL && t->t_lpl != NULL);
    557 
    558 			/*
    559 			 * If the lgroup that t is assigned to no
    560 			 * longer has any CPUs in t's partition,
    561 			 * we'll have to choose a new lgroup for t.
    562 			 * Also, choose best lgroup for home when
    563 			 * thread has specified lgroup affinities,
    564 			 * since there may be an lgroup with more
    565 			 * affinity available after moving CPUs
    566 			 * around.
    567 			 */
    568 			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
    569 			    t->t_cpupart) || t->t_lgrp_affinity) {
    570 				lgrp_move_thread(t,
    571 				    lgrp_choose(t, t->t_cpupart), 1);
    572 			}
    573 
    574 			/* make sure lpl points to our own partition */
    575 			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
    576 			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
    577 				t->t_cpupart->cp_nlgrploads));
    578 
    579 			ASSERT(t->t_lpl->lpl_ncpu > 0);
    580 
    581 			/* Update CPU last ran on if it was this CPU */
    582 			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
    583 			    t->t_bound_cpu != cp) {
    584 				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
    585 				    t->t_pri, NULL);
    586 			}
    587 
    588 			t = t->t_next;
    589 		} while (t != curthread);
    590 
    591 		/*
    592 		 * Clear off the CPU's run queue, and the kp queue if the
    593 		 * partition is now empty.
    594 		 */
    595 		disp_cpu_inactive(cp);
    596 
    597 		/*
    598 		 * Make cp switch to a thread from the new partition.
    599 		 */
    600 		cp->cpu_runrun = 1;
    601 		cp->cpu_kprunrun = 1;
    602 	}
    603 
    604 	cpu_inmotion = NULL;
    605 	start_cpus();
    606 
    607 	/*
    608 	 * Let anyone interested know that cpu has been added to the set.
    609 	 */
    610 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
    611 
    612 	/*
    613 	 * Now let the cyclic subsystem know that it can reshuffle cyclics
    614 	 * bound to the new processor set.
    615 	 */
    616 	cyclic_move_in(cp);
    617 
    618 	return (0);
    619 }
    620 
    621 /*
    622  * Check if thread can be moved to a new cpu partition.  Called by
    623  * cpupart_move_thread() and pset_bind_start().
    624  */
    625 int
    626 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
    627 {
    628 	ASSERT(MUTEX_HELD(&cpu_lock));
    629 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
    630 	ASSERT(cp != NULL);
    631 	ASSERT(THREAD_LOCK_HELD(tp));
    632 
    633 	/*
    634 	 * CPU-bound threads can't be moved.
    635 	 */
    636 	if (!ignore) {
    637 		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
    638 		    tp->t_weakbound_cpu;
    639 		if (boundcpu != NULL && boundcpu->cpu_part != cp)
    640 			return (EBUSY);
    641 	}
    642 	return (0);
    643 }
    644 
    645 /*
    646  * Move thread to new partition.  If ignore is non-zero, then CPU
    647  * bindings should be ignored (this is used when destroying a
    648  * partition).
    649  */
    650 static int
    651 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    652     void *projbuf, void *zonebuf)
    653 {
    654 	cpupart_t *oldpp = tp->t_cpupart;
    655 	int ret;
    656 
    657 	ASSERT(MUTEX_HELD(&cpu_lock));
    658 	ASSERT(MUTEX_HELD(&pidlock));
    659 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
    660 	ASSERT(newpp != NULL);
    661 
    662 	if (newpp->cp_cpulist == NULL)
    663 		return (EINVAL);
    664 
    665 	/*
    666 	 * Check for errors first.
    667 	 */
    668 	thread_lock(tp);
    669 	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
    670 		thread_unlock(tp);
    671 		return (ret);
    672 	}
    673 
    674 	/* move the thread */
    675 	if (oldpp != newpp) {
    676 		/*
    677 		 * Make the thread switch to the new partition.
    678 		 */
    679 		tp->t_cpupart = newpp;
    680 		ASSERT(tp->t_lpl != NULL);
    681 		/*
    682 		 * Leave the thread on the same lgroup if possible; otherwise
    683 		 * choose a new lgroup for it.  In either case, update its
    684 		 * t_lpl.
    685 		 */
    686 		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
    687 		    tp->t_lgrp_affinity == NULL) {
    688 			/*
    689 			 * The thread's lgroup has CPUs in the thread's new
    690 			 * partition, so the thread can stay assigned to the
    691 			 * same lgroup.  Update its t_lpl to point to the
    692 			 * lpl_t for its lgroup in its new partition.
    693 			 */
    694 			lgrp_move_thread(tp, &tp->t_cpupart->\
    695 			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
    696 		} else {
    697 			/*
    698 			 * The thread's lgroup has no cpus in its new
    699 			 * partition or it has specified lgroup affinities,
    700 			 * so choose the best lgroup for the thread and
    701 			 * assign it to that lgroup.
    702 			 */
    703 			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
    704 			    1);
    705 		}
    706 		/*
    707 		 * make sure lpl points to our own partition
    708 		 */
    709 		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
    710 		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
    711 			tp->t_cpupart->cp_nlgrploads));
    712 
    713 		ASSERT(tp->t_lpl->lpl_ncpu > 0);
    714 
    715 		if (tp->t_state == TS_ONPROC) {
    716 			cpu_surrender(tp);
    717 		} else if (tp->t_state == TS_RUN) {
    718 			(void) dispdeq(tp);
    719 			setbackdq(tp);
    720 		}
    721 	}
    722 
    723 	/*
    724 	 * Our binding has changed; set TP_CHANGEBIND.
    725 	 */
    726 	tp->t_proc_flag |= TP_CHANGEBIND;
    727 	aston(tp);
    728 
    729 	thread_unlock(tp);
    730 	fss_changepset(tp, newpp, projbuf, zonebuf);
    731 
    732 	return (0);		/* success */
    733 }
    734 
    735 
    736 /*
    737  * This function binds a thread to a partition.  Must be called with the
    738  * p_lock of the containing process held (to keep the thread from going
    739  * away), and thus also with cpu_lock held (since cpu_lock must be
    740  * acquired before p_lock).  If ignore is non-zero, then CPU bindings
    741  * should be ignored (this is used when destroying a partition).
    742  */
    743 int
    744 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    745     void *zonebuf)
    746 {
    747 	cpupart_t	*newpp;
    748 
    749 	ASSERT(pool_lock_held());
    750 	ASSERT(MUTEX_HELD(&cpu_lock));
    751 	ASSERT(MUTEX_HELD(&pidlock));
    752 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
    753 
    754 	if (psid == PS_NONE)
    755 		newpp = &cp_default;
    756 	else {
    757 		newpp = cpupart_find(psid);
    758 		if (newpp == NULL) {
    759 			return (EINVAL);
    760 		}
    761 	}
    762 	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
    763 }
    764 
    765 
    766 /*
    767  * Create a new partition.  On MP systems, this also allocates a
    768  * kpreempt disp queue for that partition.
    769  */
    770 int
    771 cpupart_create(psetid_t *psid)
    772 {
    773 	cpupart_t	*pp;
    774 	lgrp_id_t	i;
    775 
    776 	ASSERT(pool_lock_held());
    777 
    778 	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
    779 	pp->cp_mach = kmem_zalloc(sizeof (struct mach_cpupart), KM_SLEEP);
    780 	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
    781 	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
    782 	    KM_SLEEP);
    783 
    784 	mutex_enter(&cpu_lock);
    785 	if (cp_numparts == cp_max_numparts) {
    786 		mutex_exit(&cpu_lock);
    787 		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
    788 		pp->cp_lgrploads = NULL;
    789 		kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
    790 		kmem_free(pp, sizeof (cpupart_t));
    791 		return (ENOMEM);
    792 	}
    793 	cp_numparts++;
    794 	/* find the next free partition ID */
    795 	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
    796 		cp_id_next++;
    797 	pp->cp_id = cp_id_next++;
    798 	pp->cp_ncpus = 0;
    799 	pp->cp_cpulist = NULL;
    800 	pp->cp_attr = 0;
    801 	klgrpset_clear(pp->cp_lgrpset);
    802 	pp->cp_kp_queue.disp_maxrunpri = -1;
    803 	pp->cp_kp_queue.disp_max_unbound_pri = -1;
    804 	pp->cp_kp_queue.disp_cpu = NULL;
    805 	pp->cp_gen = 0;
    806 	CPUSET_ZERO(pp->cp_mach->mc_haltset);
    807 	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
    808 	*psid = CPTOPS(pp->cp_id);
    809 	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
    810 	cpupart_kstat_create(pp);
    811 	for (i = 0; i < pp->cp_nlgrploads; i++) {
    812 		pp->cp_lgrploads[i].lpl_lgrpid = i;
    813 	}
    814 	bitset_init(&pp->cp_cmt_pgs);
    815 
    816 	/*
    817 	 * Pause all CPUs while changing the partition list, to make sure
    818 	 * the clock thread (which traverses the list without holding
    819 	 * cpu_lock) isn't running.
    820 	 */
    821 	pause_cpus(NULL);
    822 	pp->cp_next = cp_list_head;
    823 	pp->cp_prev = cp_list_head->cp_prev;
    824 	cp_list_head->cp_prev->cp_next = pp;
    825 	cp_list_head->cp_prev = pp;
    826 	start_cpus();
    827 	mutex_exit(&cpu_lock);
    828 
    829 	return (0);
    830 }
    831 
    832 /*
    833  * Move threads from specified partition to cp_default. If `force' is specified,
    834  * move all threads, otherwise move only soft-bound threads.
    835  */
    836 static int
    837 cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
    838 {
    839 	void 	*projbuf, *zonebuf;
    840 	kthread_t *t;
    841 	proc_t	*p;
    842 	int	err = 0;
    843 	psetid_t psid = pp->cp_id;
    844 
    845 	ASSERT(pool_lock_held());
    846 	ASSERT(MUTEX_HELD(&cpu_lock));
    847 
    848 	if (pp == NULL || pp == &cp_default) {
    849 		return (EINVAL);
    850 	}
    851 
    852 	/*
    853 	 * Pre-allocate enough buffers for FSS for all active projects and
    854 	 * for all active zones on the system.  Unused buffers will be
    855 	 * freed later by fss_freebuf().
    856 	 */
    857 	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
    858 	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
    859 
    860 	mutex_enter(&pidlock);
    861 	t = curthread;
    862 	do {
    863 		if (t->t_bind_pset == psid) {
    864 again:			p = ttoproc(t);
    865 			mutex_enter(&p->p_lock);
    866 			if (ttoproc(t) != p) {
    867 				/*
    868 				 * lwp_exit has changed this thread's process
    869 				 * pointer before we grabbed its p_lock.
    870 				 */
    871 				mutex_exit(&p->p_lock);
    872 				goto again;
    873 			}
    874 
    875 			/*
    876 			 * Can only unbind threads which have revocable binding
    877 			 * unless force unbinding requested.
    878 			 */
    879 			if (unbind_all || TB_PSET_IS_SOFT(t)) {
    880 				err = cpupart_bind_thread(t, PS_NONE, 1,
    881 				    projbuf, zonebuf);
    882 				if (err) {
    883 					mutex_exit(&p->p_lock);
    884 					mutex_exit(&pidlock);
    885 					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
    886 					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
    887 					return (err);
    888 				}
    889 				t->t_bind_pset = PS_NONE;
    890 			}
    891 			mutex_exit(&p->p_lock);
    892 		}
    893 		t = t->t_next;
    894 	} while (t != curthread);
    895 
    896 	mutex_exit(&pidlock);
    897 	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
    898 	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
    899 	return (err);
    900 }
    901 
    902 /*
    903  * Destroy a partition.
    904  */
    905 int
    906 cpupart_destroy(psetid_t psid)
    907 {
    908 	cpu_t	*cp, *first_cp;
    909 	cpupart_t *pp, *newpp;
    910 	int	err = 0;
    911 
    912 	ASSERT(pool_lock_held());
    913 	mutex_enter(&cpu_lock);
    914 
    915 	pp = cpupart_find(psid);
    916 	if (pp == NULL || pp == &cp_default) {
    917 		mutex_exit(&cpu_lock);
    918 		return (EINVAL);
    919 	}
    920 
    921 	/*
    922 	 * Unbind all the threads currently bound to the partition.
    923 	 */
    924 	err = cpupart_unbind_threads(pp, B_TRUE);
    925 	if (err) {
    926 		mutex_exit(&cpu_lock);
    927 		return (err);
    928 	}
    929 
    930 	newpp = &cp_default;
    931 	while ((cp = pp->cp_cpulist) != NULL) {
    932 		if (err = cpupart_move_cpu(cp, newpp, 0)) {
    933 			mutex_exit(&cpu_lock);
    934 			return (err);
    935 		}
    936 	}
    937 
    938 	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
    939 	ASSERT(CPUSET_ISNULL(pp->cp_mach->mc_haltset));
    940 
    941 	/*
    942 	 * Teardown the partition's group of active CMT PGs now that
    943 	 * all of the CPUs have left.
    944 	 */
    945 	bitset_fini(&pp->cp_cmt_pgs);
    946 
    947 	/*
    948 	 * Reset the pointers in any offline processors so they won't
    949 	 * try to rejoin the destroyed partition when they're turned
    950 	 * online.
    951 	 */
    952 	first_cp = cp = CPU;
    953 	do {
    954 		if (cp->cpu_part == pp) {
    955 			ASSERT(cp->cpu_flags & CPU_OFFLINE);
    956 			cp->cpu_part = newpp;
    957 		}
    958 		cp = cp->cpu_next;
    959 	} while (cp != first_cp);
    960 
    961 	/*
    962 	 * Pause all CPUs while changing the partition list, to make sure
    963 	 * the clock thread (which traverses the list without holding
    964 	 * cpu_lock) isn't running.
    965 	 */
    966 	pause_cpus(NULL);
    967 	pp->cp_prev->cp_next = pp->cp_next;
    968 	pp->cp_next->cp_prev = pp->cp_prev;
    969 	if (cp_list_head == pp)
    970 		cp_list_head = pp->cp_next;
    971 	start_cpus();
    972 
    973 	if (cp_id_next > pp->cp_id)
    974 		cp_id_next = pp->cp_id;
    975 
    976 	if (pp->cp_kstat)
    977 		kstat_delete(pp->cp_kstat);
    978 
    979 	cp_numparts--;
    980 
    981 	disp_kp_free(&pp->cp_kp_queue);
    982 	kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
    983 	pp->cp_lgrploads = NULL;
    984 	kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
    985 	kmem_free(pp, sizeof (cpupart_t));
    986 	mutex_exit(&cpu_lock);
    987 
    988 	return (err);
    989 }
    990 
    991 
    992 /*
    993  * Return the ID of the partition to which the specified processor belongs.
    994  */
    995 psetid_t
    996 cpupart_query_cpu(cpu_t *cp)
    997 {
    998 	ASSERT(MUTEX_HELD(&cpu_lock));
    999 
   1000 	return (CPTOPS(cp->cpu_part->cp_id));
   1001 }
   1002 
   1003 
   1004 /*
   1005  * Attach a processor to an existing partition.
   1006  */
   1007 int
   1008 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
   1009 {
   1010 	cpupart_t	*pp;
   1011 	int		err;
   1012 
   1013 	ASSERT(pool_lock_held());
   1014 	ASSERT(MUTEX_HELD(&cpu_lock));
   1015 
   1016 	pp = cpupart_find(psid);
   1017 	if (pp == NULL)
   1018 		return (EINVAL);
   1019 	if (cp->cpu_flags & CPU_OFFLINE)
   1020 		return (EINVAL);
   1021 
   1022 	err = cpupart_move_cpu(cp, pp, forced);
   1023 	return (err);
   1024 }
   1025 
   1026 /*
   1027  * Get a list of cpus belonging to the partition.  If numcpus is NULL,
   1028  * this just checks for a valid partition.  If numcpus is non-NULL but
   1029  * cpulist is NULL, the current number of cpus is stored in *numcpus.
   1030  * If both are non-NULL, the current number of cpus is stored in *numcpus,
   1031  * and a list of those cpus up to the size originally in *numcpus is
   1032  * stored in cpulist[].  Also, store the processor set id in *psid.
   1033  * This is useful in case the processor set id passed in was PS_MYID.
   1034  */
   1035 int
   1036 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
   1037 {
   1038 	cpupart_t	*pp;
   1039 	uint_t		ncpus;
   1040 	cpu_t		*c;
   1041 	int		i;
   1042 
   1043 	mutex_enter(&cpu_lock);
   1044 	pp = cpupart_find(*psid);
   1045 	if (pp == NULL) {
   1046 		mutex_exit(&cpu_lock);
   1047 		return (EINVAL);
   1048 	}
   1049 	*psid = CPTOPS(pp->cp_id);
   1050 	ncpus = pp->cp_ncpus;
   1051 	if (numcpus) {
   1052 		if (ncpus > *numcpus) {
   1053 			/*
   1054 			 * Only copy as many cpus as were passed in, but
   1055 			 * pass back the real number.
   1056 			 */
   1057 			uint_t t = ncpus;
   1058 			ncpus = *numcpus;
   1059 			*numcpus = t;
   1060 		} else
   1061 			*numcpus = ncpus;
   1062 
   1063 		if (cpulist) {
   1064 			c = pp->cp_cpulist;
   1065 			for (i = 0; i < ncpus; i++) {
   1066 				ASSERT(c != NULL);
   1067 				cpulist[i] = c->cpu_id;
   1068 				c = c->cpu_next_part;
   1069 			}
   1070 		}
   1071 	}
   1072 	mutex_exit(&cpu_lock);
   1073 	return (0);
   1074 }
   1075 
   1076 /*
   1077  * Reallocate kpreempt queues for each CPU partition.  Called from
   1078  * disp_setup when a new scheduling class is loaded that increases the
   1079  * number of priorities in the system.
   1080  */
   1081 void
   1082 cpupart_kpqalloc(pri_t npri)
   1083 {
   1084 	cpupart_t *cpp;
   1085 
   1086 	ASSERT(MUTEX_HELD(&cpu_lock));
   1087 	cpp = cp_list_head;
   1088 	do {
   1089 		disp_kp_alloc(&cpp->cp_kp_queue, npri);
   1090 		cpp = cpp->cp_next;
   1091 	} while (cpp != cp_list_head);
   1092 }
   1093 
   1094 int
   1095 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
   1096 {
   1097 	cpupart_t *cp;
   1098 	int i;
   1099 
   1100 	ASSERT(nelem >= 0);
   1101 	ASSERT(nelem <= LOADAVG_NSTATS);
   1102 	ASSERT(MUTEX_HELD(&cpu_lock));
   1103 
   1104 	cp = cpupart_find(psid);
   1105 	if (cp == NULL)
   1106 		return (EINVAL);
   1107 	for (i = 0; i < nelem; i++)
   1108 		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
   1109 
   1110 	return (0);
   1111 }
   1112 
   1113 
   1114 uint_t
   1115 cpupart_list(psetid_t *list, uint_t nelem, int flag)
   1116 {
   1117 	uint_t numpart = 0;
   1118 	cpupart_t *cp;
   1119 
   1120 	ASSERT(MUTEX_HELD(&cpu_lock));
   1121 	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
   1122 
   1123 	if (list != NULL) {
   1124 		cp = cp_list_head;
   1125 		do {
   1126 			if (((flag == CP_ALL) && (cp != &cp_default)) ||
   1127 			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
   1128 				if (numpart == nelem)
   1129 					break;
   1130 				list[