Home | History | Annotate | Download | only in disp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/systm.h>
     29 #include <sys/types.h>
     30 #include <sys/param.h>
     31 #include <sys/thread.h>
     32 #include <sys/cpuvar.h>
     33 #include <sys/cpupart.h>
     34 #include <sys/kmem.h>
     35 #include <sys/cmn_err.h>
     36 #include <sys/kstat.h>
     37 #include <sys/processor.h>
     38 #include <sys/disp.h>
     39 #include <sys/group.h>
     40 #include <sys/pghw.h>
     41 #include <sys/bitset.h>
     42 #include <sys/lgrp.h>
     43 #include <sys/cmt.h>
     44 
     45 /*
     46  * CMT scheduler / dispatcher support
     47  *
     48  * This file implements CMT scheduler support using Processor Groups.
     49  * The CMT processor group class creates and maintains the CMT class
     50  * specific processor group pg_cmt_t.
     51  *
     52  * ---------------------------- <-- pg_cmt_t *
     53  * | pghw_t                   |
     54  * ----------------------------
     55  * | CMT class specific data  |
     56  * | - hierarchy linkage      |
     57  * | - CMT load balancing data|
     58  * | - active CPU group/bitset|
     59  * ----------------------------
     60  *
     61  * The scheduler/dispatcher leverages knowledge of the performance
     62  * relevant CMT sharing relationships existing between cpus to implement
     63  * optimized affinity and load balancing policies.
     64  *
     65  * Load balancing policy seeks to improve performance by minimizing
     66  * contention over shared processor resources / facilities, while the
     67  * affinity policies seek to improve cache and TLB utilization.
     68  *
     69  * The CMT PGs created by this class are already arranged into a
     70  * hierarchy (which is done in the pghw layer). To implement the top-down
     71  * CMT load balancing algorithm, the CMT PGs additionally maintain
     72  * parent, child and sibling hierarchy relationships.
     73  * Parent PGs always contain a superset of their children's resources,
     74  * each PG can have at most one parent, and siblings are the group of PGs
     75  * sharing the same parent.
     76  *
     77  * On NUMA systems, the CMT load balancing algorithm balances across the
     78  * CMT PGs within their respective lgroups. On UMA based systems, there
     79  * exists a top level group of PGs to balance across. On NUMA systems multiple
     80  * top level groups are instantiated, where the top level balancing begins by
     81  * balancing across the CMT PGs within their respective (per lgroup) top level
     82  * groups.
     83  */
     84 
     85 typedef struct cmt_lgrp {
     86 	group_t		cl_pgs;		/* Top level group of active CMT PGs */
     87 	int		cl_npgs;	/* # of top level PGs in the lgroup */
     88 	lgrp_handle_t	cl_hand;	/* lgroup's platform handle */
     89 	struct cmt_lgrp *cl_next;	/* next cmt_lgrp */
     90 } cmt_lgrp_t;
     91 
     92 static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
     93 static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
     94 						/* used for null_proc_lpa */
     95 
     96 static int		is_cpu0 = 1; /* true if this is boot CPU context */
     97 
     98 /*
     99  * Set this to non-zero to disable CMT scheduling
    100  * This must be done via kmdb -d, as /etc/system will be too late
    101  */
    102 static int		cmt_sched_disabled = 0;
    103 
    104 static pg_cid_t		pg_cmt_class_id;		/* PG class id */
    105 
    106 static pg_t		*pg_cmt_alloc();
    107 static void		pg_cmt_free(pg_t *);
    108 static void		pg_cmt_cpu_init(cpu_t *);
    109 static void		pg_cmt_cpu_fini(cpu_t *);
    110 static void		pg_cmt_cpu_active(cpu_t *);
    111 static void		pg_cmt_cpu_inactive(cpu_t *);
    112 static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
    113 static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
    114 static void		pg_cmt_hier_pack(pg_cmt_t **, int);
    115 static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
    116 static int		pg_cmt_hw(pghw_type_t);
    117 static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
    118 static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
    119 
    120 /*
    121  * Macro to test if PG is managed by the CMT PG class
    122  */
    123 #define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
    124 
    125 /*
    126  * CMT PG ops
    127  */
    128 struct pg_ops pg_ops_cmt = {
    129 	pg_cmt_alloc,
    130 	pg_cmt_free,
    131 	pg_cmt_cpu_init,
    132 	pg_cmt_cpu_fini,
    133 	pg_cmt_cpu_active,
    134 	pg_cmt_cpu_inactive,
    135 	pg_cmt_cpupart_in,
    136 	NULL,			/* cpupart_out */
    137 	pg_cmt_cpupart_move,
    138 	pg_cmt_cpu_belongs,
    139 };
    140 
    141 /*
    142  * Initialize the CMT PG class
    143  */
    144 void
    145 pg_cmt_class_init(void)
    146 {
    147 	if (cmt_sched_disabled)
    148 		return;
    149 
    150 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
    151 }
    152 
    153 /*
    154  * Called to indicate a new CPU has started up so
    155  * that either t0 or the slave startup thread can
    156  * be accounted for.
    157  */
    158 void
    159 pg_cmt_cpu_startup(cpu_t *cp)
    160 {
    161 	PG_NRUN_UPDATE(cp, 1);
    162 }
    163 
    164 /*
    165  * Adjust the CMT load in the CMT PGs in which the CPU belongs
    166  * Note that "n" can be positive in the case of increasing
    167  * load, or negative in the case of decreasing load.
    168  */
    169 void
    170 pg_cmt_load(cpu_t *cp, int n)
    171 {
    172 	pg_cmt_t	*pg;
    173 
    174 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
    175 	while (pg != NULL) {
    176 		ASSERT(IS_CMT_PG(pg));
    177 		atomic_add_32(&pg->cmt_nrunning, n);
    178 		pg = pg->cmt_parent;
    179 	}
    180 }
    181 
    182 /*
    183  * Return non-zero if thread can migrate between "from" and "to"
    184  * without a performance penalty
    185  */
    186 int
    187 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
    188 {
    189 	if (from->cpu_physid->cpu_cacheid ==
    190 	    to->cpu_physid->cpu_cacheid)
    191 		return (1);
    192 	return (0);
    193 }
    194 
    195 /*
    196  * CMT class specific PG allocation
    197  */
    198 static pg_t *
    199 pg_cmt_alloc(void)
    200 {
    201 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
    202 }
    203 
    204 /*
    205  * Class specific PG de-allocation
    206  */
    207 static void
    208 pg_cmt_free(pg_t *pg)
    209 {
    210 	ASSERT(pg != NULL);
    211 	ASSERT(IS_CMT_PG(pg));
    212 
    213 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
    214 }
    215 
    216 /*
    217  * Return 1 if CMT scheduling policies should be impelmented
    218  * for the specified hardware sharing relationship.
    219  */
    220 static int
    221 pg_cmt_hw(pghw_type_t hw)
    222 {
    223 	return (pg_plat_cmt_load_bal_hw(hw) ||
    224 	    pg_plat_cmt_affinity_hw(hw));
    225 }
    226 
    227 /*
    228  * CMT class callback for a new CPU entering the system
    229  */
    230 static void
    231 pg_cmt_cpu_init(cpu_t *cp)
    232 {
    233 	pg_cmt_t	*pg;
    234 	group_t		*cmt_pgs;
    235 	int		level, max_level, nlevels;
    236 	pghw_type_t	hw;
    237 	pg_t		*pg_cache = NULL;
    238 	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
    239 	lgrp_handle_t	lgrp_handle;
    240 	cmt_lgrp_t	*lgrp;
    241 
    242 	ASSERT(MUTEX_HELD(&cpu_lock));
    243 
    244 	/*
    245 	 * A new CPU is coming into the system.
    246 	 * Interrogate the platform to see if the CPU
    247 	 * has any performance relevant CMT sharing
    248 	 * relationships
    249 	 */
    250 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
    251 	cp->cpu_pg->cmt_lineage = NULL;
    252 
    253 	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
    254 	max_level = nlevels = 0;
    255 	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
    256 
    257 		/*
    258 		 * We're only interested in CMT hw sharing relationships
    259 		 */
    260 		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
    261 			continue;
    262 
    263 		/*
    264 		 * Find (or create) the PG associated with
    265 		 * the hw sharing relationship in which cp
    266 		 * belongs.
    267 		 *
    268 		 * Determine if a suitable PG already
    269 		 * exists, or if one needs to be created.
    270 		 */
    271 		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
    272 		if (pg == NULL) {
    273 			/*
    274 			 * Create a new one.
    275 			 * Initialize the common...
    276 			 */
    277 			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
    278 
    279 			/* ... physical ... */
    280 			pghw_init((pghw_t *)pg, cp, hw);
    281 
    282 			/*
    283 			 * ... and CMT specific portions of the
    284 			 * structure.
    285 			 */
    286 			bitset_init(&pg->cmt_cpus_actv_set);
    287 			group_create(&pg->cmt_cpus_actv);
    288 		} else {
    289 			ASSERT(IS_CMT_PG(pg));
    290 		}
    291 
    292 		/* Add the CPU to the PG */
    293 		pg_cpu_add((pg_t *)pg, cp);
    294 
    295 		/*
    296 		 * Ensure capacity of the active CPUs group/bitset
    297 		 */
    298 		group_expand(&pg->cmt_cpus_actv,
    299 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
    300 
    301 		if (cp->cpu_seqid >=
    302 		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
    303 			bitset_resize(&pg->cmt_cpus_actv_set,
    304 			    cp->cpu_seqid + 1);
    305 		}
    306 
    307 		/*
    308 		 * Build a lineage of CMT PGs for load balancing
    309 		 */
    310 		if (pg_plat_cmt_load_bal_hw(hw)) {
    311 			level = pghw_level(hw);
    312 			cpu_cmt_hier[level] = pg;
    313 			if (level > max_level)
    314 				max_level = level;
    315 			nlevels++;
    316 		}
    317 
    318 		/* Cache this for later */
    319 		if (hw == PGHW_CACHE)
    320 			pg_cache = (pg_t *)pg;
    321 	}
    322 
    323 	/*
    324 	 * Pack out any gaps in the constructed lineage.
    325 	 * Gaps may exist where the architecture knows
    326 	 * about a hardware sharing relationship, but such a
    327 	 * relationship either isn't relevant for load
    328 	 * balancing or doesn't exist between CPUs on the system.
    329 	 */
    330 	pg_cmt_hier_pack(cpu_cmt_hier, max_level + 1);
    331 
    332 	/*
    333 	 * For each of the PGs int the CPU's lineage:
    334 	 *	- Add an entry in the CPU sorted CMT PG group
    335 	 *	  which is used for top down CMT load balancing
    336 	 *	- Tie the PG into the CMT hierarchy by connecting
    337 	 *	  it to it's parent and siblings.
    338 	 */
    339 	group_expand(cmt_pgs, nlevels);
    340 
    341 	/*
    342 	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
    343 	 */
    344 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
    345 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
    346 	if (lgrp == NULL)
    347 		lgrp = pg_cmt_lgrp_create(lgrp_handle);
    348 
    349 	for (level = 0; level < nlevels; level++) {
    350 		uint_t		children;
    351 		int		err;
    352 
    353 		pg = cpu_cmt_hier[level];
    354 		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
    355 		ASSERT(err == 0);
    356 
    357 		if (level == 0)
    358 			cp->cpu_pg->cmt_lineage = (pg_t *)pg;
    359 
    360 		if (pg->cmt_siblings != NULL) {
    361 			/* Already initialized */
    362 			ASSERT(pg->cmt_parent == NULL ||
    363 			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
    364 			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
    365 			    ((pg->cmt_parent != NULL) &&
    366 			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
    367 			continue;
    368 		}
    369 
    370 		if ((level + 1) == nlevels) {
    371 			pg->cmt_parent = NULL;
    372 			pg->cmt_siblings = &lgrp->cl_pgs;
    373 			children = ++lgrp->cl_npgs;
    374 		} else {
    375 			pg->cmt_parent = cpu_cmt_hier[level + 1];
    376 
    377 			/*
    378 			 * A good parent keeps track of their children.
    379 			 * The parent's children group is also the PG's
    380 			 * siblings.
    381 			 */
    382 			if (pg->cmt_parent->cmt_children == NULL) {
    383 				pg->cmt_parent->cmt_children =
    384 				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
    385 				group_create(pg->cmt_parent->cmt_children);
    386 			}
    387 			pg->cmt_siblings = pg->cmt_parent->cmt_children;
    388 			children = ++pg->cmt_parent->cmt_nchildren;
    389 		}
    390 		pg->cmt_hint = 0;
    391 		group_expand(pg->cmt_siblings, children);
    392 	}
    393 
    394 	/*
    395 	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
    396 	 * for fast lookups later.
    397 	 */
    398 	if (cp->cpu_physid) {
    399 		cp->cpu_physid->cpu_chipid =
    400 		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
    401 		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
    402 
    403 		/*
    404 		 * If this cpu has a PG representing shared cache, then set
    405 		 * cpu_cacheid to that PG's logical id
    406 		 */
    407 		if (pg_cache)
    408 			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
    409 	}
    410 
    411 	/* CPU0 only initialization */
    412 	if (is_cpu0) {
    413 		pg_cmt_cpu_startup(cp);
    414 		is_cpu0 = 0;
    415 		cpu0_lgrp = lgrp;
    416 	}
    417 
    418 }
    419 
    420 /*
    421  * Class callback when a CPU is leaving the system (deletion)
    422  */
    423 static void
    424 pg_cmt_cpu_fini(cpu_t *cp)
    425 {
    426 	group_iter_t	i;
    427 	pg_cmt_t	*pg;
    428 	group_t		*pgs, *cmt_pgs;
    429 	lgrp_handle_t	lgrp_handle;
    430 	cmt_lgrp_t	*lgrp;
    431 
    432 	pgs = &cp->cpu_pg->pgs;
    433 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
    434 
    435 	/*
    436 	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
    437 	 */
    438 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
    439 
    440 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
    441 	if (lgrp == NULL) {
    442 		/*
    443 		 * This is a bit of a special case.
    444 		 * The only way this can happen is if the CPU's lgrp
    445 		 * handle changed out from underneath us, which is what
    446 		 * happens with null_proc_lpa on starcat systems.
    447 		 *
    448 		 * Use the initial boot CPU lgrp, since this is what
    449 		 * we need to tear down.
    450 		 */
    451 		lgrp = cpu0_lgrp;
    452 	}
    453 
    454 	/*
    455 	 * First, clean up anything load balancing specific for each of
    456 	 * the CPU's PGs that participated in CMT load balancing
    457 	 */
    458 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
    459 	while (pg != NULL) {
    460 
    461 		/*
    462 		 * Remove the PG from the CPU's load balancing lineage
    463 		 */
    464 		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
    465 
    466 		/*
    467 		 * If it's about to become empty, destroy it's children
    468 		 * group, and remove it's reference from it's siblings.
    469 		 * This is done here (rather than below) to avoid removing
    470 		 * our reference from a PG that we just eliminated.
    471 		 */
    472 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
    473 			if (pg->cmt_children != NULL)
    474 				group_destroy(pg->cmt_children);
    475 			if (pg->cmt_siblings != NULL) {
    476 				if (pg->cmt_siblings == &lgrp->cl_pgs)
    477 					lgrp->cl_npgs--;
    478 				else
    479 					pg->cmt_parent->cmt_nchildren--;
    480 			}
    481 		}
    482 		pg = pg->cmt_parent;
    483 	}
    484 
    485 	ASSERT(GROUP_SIZE(cmt_pgs) == 0);
    486 
    487 	/*
    488 	 * Now that the load balancing lineage updates have happened,
    489 	 * remove the CPU from all it's PGs (destroying any that become
    490 	 * empty).
    491 	 */
    492 	group_iter_init(&i);
    493 	while ((pg = group_iterate(pgs, &i)) != NULL) {
    494 		if (IS_CMT_PG(pg) == 0)
    495 			continue;
    496 
    497 		pg_cpu_delete((pg_t *)pg, cp);
    498 		/*
    499 		 * Deleting the CPU from the PG changes the CPU's
    500 		 * PG group over which we are actively iterating
    501 		 * Re-initialize the iteration
    502 		 */
    503 		group_iter_init(&i);
    504 
    505 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
    506 
    507 			/*
    508 			 * The PG has become zero sized, so destroy it.
    509 			 */
    510 			group_destroy(&pg->cmt_cpus_actv);
    511 			bitset_fini(&pg->cmt_cpus_actv_set);
    512 			pghw_fini((pghw_t *)pg);
    513 
    514 			pg_destroy((pg_t *)pg);
    515 		}
    516 	}
    517 }
    518 
    519 /*
    520  * Class callback when a CPU is entering a cpu partition
    521  */
    522 static void
    523 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
    524 {
    525 	group_t		*pgs;
    526 	pg_t		*pg;
    527 	group_iter_t	i;
    528 
    529 	ASSERT(MUTEX_HELD(&cpu_lock));
    530 
    531 	pgs = &cp->cpu_pg->pgs;
    532 
    533 	/*
    534 	 * Ensure that the new partition's PG bitset
    535 	 * is large enough for all CMT PG's to which cp
    536 	 * belongs
    537 	 */
    538 	group_iter_init(&i);
    539 	while ((pg = group_iterate(pgs, &i)) != NULL) {
    540 		if (IS_CMT_PG(pg) == 0)
    541 			continue;
    542 
    543 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
    544 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
    545 	}
    546 }
    547 
    548 /*
    549  * Class callback when a CPU is actually moving partitions
    550  */
    551 static void
    552 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
    553 {
    554 	cpu_t		*cpp;
    555 	group_t		*pgs;
    556 	pg_t		*pg;
    557 	group_iter_t	pg_iter;
    558 	pg_cpu_itr_t	cpu_iter;
    559 	boolean_t	found;
    560 
    561 	ASSERT(MUTEX_HELD(&cpu_lock));
    562 
    563 	pgs = &cp->cpu_pg->pgs;
    564 	group_iter_init(&pg_iter);
    565 
    566 	/*
    567 	 * Iterate over the CPUs CMT PGs
    568 	 */
    569 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
    570 
    571 		if (IS_CMT_PG(pg) == 0)
    572 			continue;
    573 
    574 		/*
    575 		 * Add the PG to the bitset in the new partition.
    576 		 */
    577 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
    578 
    579 		/*
    580 		 * Remove the PG from the bitset in the old partition
    581 		 * if the last of the PG's CPUs have left.
    582 		 */
    583 		found = B_FALSE;
    584 		PG_CPU_ITR_INIT(pg, cpu_iter);
    585 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
    586 			if (cpp == cp)
    587 				continue;
    588 			if (CPU_ACTIVE(cpp) &&
    589 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
    590 				found = B_TRUE;
    591 				break;
    592 			}
    593 		}
    594 		if (!found)
    595 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
    596 	}
    597 }
    598 
    599 /*
    600  * Class callback when a CPU becomes active (online)
    601  *
    602  * This is called in a context where CPUs are paused
    603  */
    604 static void
    605 pg_cmt_cpu_active(cpu_t *cp)
    606 {
    607 	int		err;
    608 	group_iter_t	i;
    609 	pg_cmt_t	*pg;
    610 	group_t		*pgs;
    611 
    612 	ASSERT(MUTEX_HELD(&cpu_lock));
    613 
    614 	pgs = &cp->cpu_pg->pgs;
    615 	group_iter_init(&i);
    616 
    617 	/*
    618 	 * Iterate over the CPU's PGs
    619 	 */
    620 	while ((pg = group_iterate(pgs, &i)) != NULL) {
    621 
    622 		if (IS_CMT_PG(pg) == 0)
    623 			continue;
    624 
    625 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
    626 		ASSERT(err == 0);
    627 
    628 		/*
    629 		 * If this is the first active CPU in the PG, and it
    630 		 * represents a hardware sharing relationship over which
    631 		 * CMT load balancing is performed, add it as a candidate
    632 		 * for balancing with it's siblings.
    633 		 */
    634 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
    635 		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
    636 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
    637 			ASSERT(err == 0);
    638 		}
    639 
    640 		/*
    641 		 * Notate the CPU in the PGs active CPU bitset.
    642 		 * Also notate the PG as being active in it's associated
    643 		 * partition
    644 		 */
    645 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
    646 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
    647 	}
    648 }
    649 
    650 /*
    651  * Class callback when a CPU goes inactive (offline)
    652  *
    653  * This is called in a context where CPUs are paused
    654  */
    655 static void
    656 pg_cmt_cpu_inactive(cpu_t *cp)
    657 {
    658 	int		err;
    659 	group_t		*pgs;
    660 	pg_cmt_t	*pg;
    661 	cpu_t		*cpp;
    662 	group_iter_t	i;
    663 	pg_cpu_itr_t	cpu_itr;
    664 	boolean_t	found;
    665 
    666 	ASSERT(MUTEX_HELD(&cpu_lock));
    667 
    668 	pgs = &cp->cpu_pg->pgs;
    669 	group_iter_init(&i);
    670 
    671 	while ((pg = group_iterate(pgs, &i)) != NULL) {
    672 
    673 		if (IS_CMT_PG(pg) == 0)
    674 			continue;
    675 
    676 		/*
    677 		 * Remove the CPU from the CMT PGs active CPU group
    678 		 * bitmap
    679 		 */
    680 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
    681 		ASSERT(err == 0);
    682 
    683 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
    684 
    685 		/*
    686 		 * If there are no more active CPUs in this PG over which
    687 		 * load was balanced, remove it as a balancing candidate.
    688 		 */
    689 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
    690 		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
    691 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
    692 			ASSERT(err == 0);
    693 		}
    694 
    695 		/*
    696 		 * Assert the number of active CPUs does not exceed
    697 		 * the total number of CPUs in the PG
    698 		 */
    699 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
    700 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
    701 
    702 		/*
    703 		 * Update the PG bitset in the CPU's old partition
    704 		 */
    705 		found = B_FALSE;
    706 		PG_CPU_ITR_INIT(pg, cpu_itr);
    707 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
    708 			if (cpp == cp)
    709 				continue;
    710 			if (CPU_ACTIVE(cpp) &&
    711 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
    712 				found = B_TRUE;
    713 				break;
    714 			}
    715 		}
    716 		if (!found) {
    717 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
    718 			    ((pg_t *)pg)->pg_id);
    719 		}
    720 	}
    721 }
    722 
    723 /*
    724  * Return non-zero if the CPU belongs in the given PG
    725  */
    726 static int
    727 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
    728 {
    729 	cpu_t	*pg_cpu;
    730 
    731 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
    732 
    733 	ASSERT(pg_cpu != NULL);
    734 
    735 	/*
    736 	 * The CPU belongs if, given the nature of the hardware sharing
    737 	 * relationship represented by the PG, the CPU has that
    738 	 * relationship with some other CPU already in the PG
    739 	 */
    740 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
    741 		return (1);
    742 
    743 	return (0);
    744 }
    745 
    746 /*
    747  * Pack the CPUs CMT hierarchy
    748  * The hierarchy order is preserved
    749  */
    750 static void
    751 pg_cmt_hier_pack(pg_cmt_t *hier[], int sz)
    752 {
    753 	int	i, j;
    754 
    755 	for (i = 0; i < sz; i++) {
    756 		if (hier[i] != NULL)
    757 			continue;
    758 
    759 		for (j = i; j < sz; j++) {
    760 			if (hier[j] != NULL) {
    761 				hier[i] = hier[j];
    762 				hier[j] = NULL;
    763 				break;
    764 			}
    765 		}
    766 		if (j == sz)
    767 			break;
    768 	}
    769 }
    770 
    771 /*
    772  * Return a cmt_lgrp_t * given an lgroup handle.
    773  */
    774 static cmt_lgrp_t *
    775 pg_cmt_find_lgrp(lgrp_handle_t hand)
    776 {
    777 	cmt_lgrp_t	*lgrp;
    778 
    779 	ASSERT(MUTEX_HELD(&cpu_lock));
    780 
    781 	lgrp = cmt_lgrps;
    782 	while (lgrp != NULL) {
    783 		if (lgrp->cl_hand == hand)
    784 			break;
    785 		lgrp = lgrp->cl_next;
    786 	}
    787 	return (lgrp);
    788 }
    789 
    790 /*
    791  * Create a cmt_lgrp_t with the specified handle.
    792  */
    793 static cmt_lgrp_t *
    794 pg_cmt_lgrp_create(lgrp_handle_t hand)
    795 {
    796 	cmt_lgrp_t	*lgrp;
    797 
    798 	ASSERT(MUTEX_HELD(&cpu_lock));
    799 
    800 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
    801 
    802 	lgrp->cl_hand = hand;
    803 	lgrp->cl_npgs = 0;
    804 	lgrp->cl_next = cmt_lgrps;
    805 	cmt_lgrps = lgrp;
    806 	group_create(&lgrp->cl_pgs);
    807 
    808 	return (lgrp);
    809 }
    810