Home | History | Annotate | Download | only in disp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 
     30 #include <sys/types.h>
     31 #include <sys/param.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/signal.h>
     34 #include <sys/user.h>
     35 #include <sys/systm.h>
     36 #include <sys/sysinfo.h>
     37 #include <sys/var.h>
     38 #include <sys/errno.h>
     39 #include <sys/cmn_err.h>
     40 #include <sys/debug.h>
     41 #include <sys/inline.h>
     42 #include <sys/disp.h>
     43 #include <sys/class.h>
     44 #include <sys/bitmap.h>
     45 #include <sys/kmem.h>
     46 #include <sys/cpuvar.h>
     47 #include <sys/vtrace.h>
     48 #include <sys/tnf.h>
     49 #include <sys/cpupart.h>
     50 #include <sys/lgrp.h>
     51 #include <sys/pg.h>
     52 #include <sys/cmt.h>
     53 #include <sys/bitset.h>
     54 #include <sys/schedctl.h>
     55 #include <sys/atomic.h>
     56 #include <sys/dtrace.h>
     57 #include <sys/sdt.h>
     58 #include <sys/archsystm.h>
     59 
     60 #include <vm/as.h>
     61 
     62 #define	BOUND_CPU	0x1
     63 #define	BOUND_PARTITION	0x2
     64 #define	BOUND_INTR	0x4
     65 
     66 /* Dispatch queue allocation structure and functions */
/*
 * Bookkeeping for replacing one dispatcher's queue array and active-queue
 * bitmap.  Filled in by disp_dq_alloc()/disp_dq_assign() and consumed by
 * disp_dq_free().
 */
struct disp_queue_info {
	disp_t	*dp;		/* dispatcher whose queues are replaced */
	dispq_t *olddispq;	/* previous dp->disp_q, to be freed */
	dispq_t *newdispq;	/* newly allocated queue array */
	ulong_t	*olddqactmap;	/* previous dp->disp_qactmap, to be freed */
	ulong_t	*newdqactmap;	/* newly allocated active-queue bitmap */
	int	oldnglobpris;	/* previous dp->disp_npri; sizes the old data */
};
     75 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
     76     disp_t *dp);
     77 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
     78 static void	disp_dq_free(struct disp_queue_info *dptr);
     79 
     80 /* platform-specific routine to call when processor is idle */
     81 static void	generic_idle_cpu();
     82 void		(*idle_cpu)() = generic_idle_cpu;
     83 
     84 /* routines invoked when a CPU enters/exits the idle loop */
     85 static void	idle_enter();
     86 static void	idle_exit();
     87 
     88 /* platform-specific routine to call when thread is enqueued */
     89 static void	generic_enq_thread(cpu_t *, int);
     90 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
     91 
     92 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
     93 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
     94 pri_t	intr_pri;		/* interrupt thread priority base level */
     95 
     96 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
     97 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
     98 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
     99 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
    100 int	nswapped;		/* total number of swapped threads */
    101 void	disp_swapped_enq(kthread_t *tp);
    102 static void	disp_swapped_setrun(kthread_t *tp);
    103 static void	cpu_resched(cpu_t *cp, pri_t tpri);
    104 
    105 /*
    106  * If this is set, only interrupt threads will cause kernel preemptions.
    107  * This is done by changing the value of kpreemptpri.  kpreemptpri
    108  * will either be the max sysclass pri + 1 or the min interrupt pri.
    109  */
    110 int	only_intr_kpreempt;
    111 
    112 extern void set_idle_cpu(int cpun);
    113 extern void unset_idle_cpu(int cpun);
    114 static void setkpdq(kthread_t *tp, int borf);
    115 #define	SETKP_BACK	0
    116 #define	SETKP_FRONT	1
    117 /*
    118  * Parameter that determines how recently a thread must have run
    119  * on the CPU to be considered loosely-bound to that CPU to reduce
    120  * cold cache effects.  The interval is in hertz.
    121  */
    122 #define	RECHOOSE_INTERVAL 3
    123 int	rechoose_interval = RECHOOSE_INTERVAL;
    124 static cpu_t	*cpu_choose(kthread_t *, pri_t);
    125 
/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * be sitting on a run queue before it can be stolen by another CPU,
 * in order to reduce migrations.
 *
 * nosteal_nsec should be set to an appropriate value by the platform
 * code in cmp_set_nosteal_interval().  It is set to NOSTEAL_UNINITIALIZED
 * here to indicate that it is uninitialized.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 */
    137 #define	NOSTEAL_UNINITIALIZED	(-1)
    138 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
    139 extern void cmp_set_nosteal_interval(void);
    140 
    141 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
    142 
    143 disp_lock_t	transition_lock;	/* lock on transitioning threads */
    144 disp_lock_t	stop_lock;		/* lock on stopped threads */
    145 
    146 static void	cpu_dispqalloc(int numpris);
    147 
    148 /*
    149  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
    150  * a thread because it was sitting on its run queue for a very short
    151  * period of time.
    152  */
    153 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
    154 
    155 static kthread_t	*disp_getwork(cpu_t *to);
    156 static kthread_t	*disp_getbest(disp_t *from);
    157 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
    158 
    159 void	swtch_to(kthread_t *);
    160 
    161 /*
    162  * dispatcher and scheduler initialization
    163  */
    164 
    165 /*
    166  * disp_setup - Common code to calculate and allocate dispatcher
    167  *		variables and structures based on the maximum priority.
    168  */
    169 static void
    170 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
    171 {
    172 	pri_t	newnglobpris;
    173 
    174 	ASSERT(MUTEX_HELD(&cpu_lock));
    175 
    176 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
    177 
    178 	if (newnglobpris > oldnglobpris) {
    179 		/*
    180 		 * Allocate new kp queues for each CPU partition.
    181 		 */
    182 		cpupart_kpqalloc(newnglobpris);
    183 
    184 		/*
    185 		 * Allocate new dispatch queues for each CPU.
    186 		 */
    187 		cpu_dispqalloc(newnglobpris);
    188 
    189 		/*
    190 		 * compute new interrupt thread base priority
    191 		 */
    192 		intr_pri = maxglobpri;
    193 		if (only_intr_kpreempt) {
    194 			kpreemptpri = intr_pri + 1;
    195 			if (kpqpri == KPQPRI)
    196 				kpqpri = kpreemptpri;
    197 		}
    198 		v.v_nglobpris = newnglobpris;
    199 	}
    200 }
    201 
    202 /*
    203  * dispinit - Called to initialize all loaded classes and the
    204  *	      dispatcher framework.
    205  */
    206 void
    207 dispinit(void)
    208 {
    209 	id_t	cid;
    210 	pri_t	maxglobpri;
    211 	pri_t	cl_maxglobpri;
    212 
    213 	maxglobpri = -1;
    214 
    215 	/*
    216 	 * Initialize transition lock, which will always be set.
    217 	 */
    218 	DISP_LOCK_INIT(&transition_lock);
    219 	disp_lock_enter_high(&transition_lock);
    220 	DISP_LOCK_INIT(&stop_lock);
    221 
    222 	mutex_enter(&cpu_lock);
    223 	CPU->cpu_disp->disp_maxrunpri = -1;
    224 	CPU->cpu_disp->disp_max_unbound_pri = -1;
    225 
    226 	/*
    227 	 * Initialize the default CPU partition.
    228 	 */
    229 	cpupart_initialize_default();
    230 	/*
    231 	 * Call the class specific initialization functions for
    232 	 * all pre-installed schedulers.
    233 	 *
    234 	 * We pass the size of a class specific parameter
    235 	 * buffer to each of the initialization functions
    236 	 * to try to catch problems with backward compatibility
    237 	 * of class modules.
    238 	 *
    239 	 * For example a new class module running on an old system
    240 	 * which didn't provide sufficiently large parameter buffers
    241 	 * would be bad news. Class initialization modules can check for
    242 	 * this and take action if they detect a problem.
    243 	 */
    244 
    245 	for (cid = 0; cid < nclass; cid++) {
    246 		sclass_t	*sc;
    247 
    248 		sc = &sclass[cid];
    249 		if (SCHED_INSTALLED(sc)) {
    250 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
    251 			    &sc->cl_funcs);
    252 			if (cl_maxglobpri > maxglobpri)
    253 				maxglobpri = cl_maxglobpri;
    254 		}
    255 	}
    256 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
    257 	if (kpqpri == KPQPRI)
    258 		kpqpri = kpreemptpri;
    259 
    260 	ASSERT(maxglobpri >= 0);
    261 	disp_setup(maxglobpri, 0);
    262 
    263 	mutex_exit(&cpu_lock);
    264 
    265 	/*
    266 	 * Platform specific sticky scheduler setup.
    267 	 */
    268 	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
    269 		cmp_set_nosteal_interval();
    270 
    271 	/*
    272 	 * Get the default class ID; this may be later modified via
    273 	 * dispadmin(1M).  This will load the class (normally TS) and that will
    274 	 * call disp_add(), which is why we had to drop cpu_lock first.
    275 	 */
    276 	if (getcid(defaultclass, &defaultcid) != 0) {
    277 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
    278 		    defaultclass);
    279 	}
    280 }
    281 
    282 /*
    283  * disp_add - Called with class pointer to initialize the dispatcher
    284  *	      for a newly loaded class.
    285  */
    286 void
    287 disp_add(sclass_t *clp)
    288 {
    289 	pri_t	maxglobpri;
    290 	pri_t	cl_maxglobpri;
    291 
    292 	mutex_enter(&cpu_lock);
    293 	/*
    294 	 * Initialize the scheduler class.
    295 	 */
    296 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
    297 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
    298 	if (cl_maxglobpri > maxglobpri)
    299 		maxglobpri = cl_maxglobpri;
    300 
    301 	/*
    302 	 * Save old queue information.  Since we're initializing a
    303 	 * new scheduling class which has just been loaded, then
    304 	 * the size of the dispq may have changed.  We need to handle
    305 	 * that here.
    306 	 */
    307 	disp_setup(maxglobpri, v.v_nglobpris);
    308 
    309 	mutex_exit(&cpu_lock);
    310 }
    311 
    312 
    313 /*
    314  * For each CPU, allocate new dispatch queues
    315  * with the stated number of priorities.
    316  */
    317 static void
    318 cpu_dispqalloc(int numpris)
    319 {
    320 	cpu_t	*cpup;
    321 	struct disp_queue_info	*disp_mem;
    322 	int i, num;
    323 
    324 	ASSERT(MUTEX_HELD(&cpu_lock));
    325 
    326 	disp_mem = kmem_zalloc(NCPU *
    327 	    sizeof (struct disp_queue_info), KM_SLEEP);
    328 
    329 	/*
    330 	 * This routine must allocate all of the memory before stopping
    331 	 * the cpus because it must not sleep in kmem_alloc while the
    332 	 * CPUs are stopped.  Locks they hold will not be freed until they
    333 	 * are restarted.
    334 	 */
    335 	i = 0;
    336 	cpup = cpu_list;
    337 	do {
    338 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
    339 		i++;
    340 		cpup = cpup->cpu_next;
    341 	} while (cpup != cpu_list);
    342 	num = i;
    343 
    344 	pause_cpus(NULL);
    345 	for (i = 0; i < num; i++)
    346 		disp_dq_assign(&disp_mem[i], numpris);
    347 	start_cpus();
    348 
    349 	/*
    350 	 * I must free all of the memory after starting the cpus because
    351 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
    352 	 */
    353 	for (i = 0; i < num; i++)
    354 		disp_dq_free(&disp_mem[i]);
    355 
    356 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
    357 }
    358 
    359 static void
    360 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
    361 {
    362 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
    363 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
    364 	    sizeof (long), KM_SLEEP);
    365 	dptr->dp = dp;
    366 }
    367 
    368 static void
    369 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
    370 {
    371 	disp_t	*dp;
    372 
    373 	dp = dptr->dp;
    374 	dptr->olddispq = dp->disp_q;
    375 	dptr->olddqactmap = dp->disp_qactmap;
    376 	dptr->oldnglobpris = dp->disp_npri;
    377 
    378 	ASSERT(dptr->oldnglobpris < numpris);
    379 
    380 	if (dptr->olddispq != NULL) {
    381 		/*
    382 		 * Use kcopy because bcopy is platform-specific
    383 		 * and could block while we might have paused the cpus.
    384 		 */
    385 		(void) kcopy(dptr->olddispq, dptr->newdispq,
    386 		    dptr->oldnglobpris * sizeof (dispq_t));
    387 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
    388 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
    389 		    sizeof (long));
    390 	}
    391 	dp->disp_q = dptr->newdispq;
    392 	dp->disp_qactmap = dptr->newdqactmap;
    393 	dp->disp_q_limit = &dptr->newdispq[numpris];
    394 	dp->disp_npri = numpris;
    395 }
    396 
    397 static void
    398 disp_dq_free(struct disp_queue_info *dptr)
    399 {
    400 	if (dptr->olddispq != NULL)
    401 		kmem_free(dptr->olddispq,
    402 		    dptr->oldnglobpris * sizeof (dispq_t));
    403 	if (dptr->olddqactmap != NULL)
    404 		kmem_free(dptr->olddqactmap,
    405 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
    406 }
    407 
    408 /*
    409  * For a newly created CPU, initialize the dispatch queue.
    410  * This is called before the CPU is known through cpu[] or on any lists.
    411  */
    412 void
    413 disp_cpu_init(cpu_t *cp)
    414 {
    415 	disp_t	*dp;
    416 	dispq_t	*newdispq;
    417 	ulong_t	*newdqactmap;
    418 
    419 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
    420 
    421 	if (cp == cpu0_disp.disp_cpu)
    422 		dp = &cpu0_disp;
    423 	else
    424 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
    425 	bzero(dp, sizeof (disp_t));
    426 	cp->cpu_disp = dp;
    427 	dp->disp_cpu = cp;
    428 	dp->disp_maxrunpri = -1;
    429 	dp->disp_max_unbound_pri = -1;
    430 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
    431 	/*
    432 	 * Allocate memory for the dispatcher queue headers
    433 	 * and the active queue bitmap.
    434 	 */
    435 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
    436 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
    437 	    sizeof (long), KM_SLEEP);
    438 	dp->disp_q = newdispq;
    439 	dp->disp_qactmap = newdqactmap;
    440 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
    441 	dp->disp_npri = v.v_nglobpris;
    442 }
    443 
    444 void
    445 disp_cpu_fini(cpu_t *cp)
    446 {
    447 	ASSERT(MUTEX_HELD(&cpu_lock));
    448 
    449 	disp_kp_free(cp->cpu_disp);
    450 	if (cp->cpu_disp != &cpu0_disp)
    451 		kmem_free(cp->cpu_disp, sizeof (disp_t));
    452 }
    453 
    454 /*
    455  * Allocate new, larger kpreempt dispatch queue to replace the old one.
    456  */
    457 void
    458 disp_kp_alloc(disp_t *dq, pri_t npri)
    459 {
    460 	struct disp_queue_info	mem_info;
    461 
    462 	if (npri > dq->disp_npri) {
    463 		/*
    464 		 * Allocate memory for the new array.
    465 		 */
    466 		disp_dq_alloc(&mem_info, npri, dq);
    467 
    468 		/*
    469 		 * We need to copy the old structures to the new
    470 		 * and free the old.
    471 		 */
    472 		disp_dq_assign(&mem_info, npri);
    473 		disp_dq_free(&mem_info);
    474 	}
    475 }
    476 
    477 /*
    478  * Free dispatch queue.
    479  * Used for the kpreempt queues for a removed CPU partition and
    480  * for the per-CPU queues of deleted CPUs.
    481  */
    482 void
    483 disp_kp_free(disp_t *dq)
    484 {
    485 	struct disp_queue_info	mem_info;
    486 
    487 	mem_info.olddispq = dq->disp_q;
    488 	mem_info.olddqactmap = dq->disp_qactmap;
    489 	mem_info.oldnglobpris = dq->disp_npri;
    490 	disp_dq_free(&mem_info);
    491 }
    492 
    493 /*
    494  * End dispatcher and scheduler initialization.
    495  */
    496 
    497 /*
    498  * See if there's anything to do other than remain idle.
    499  * Return non-zero if there is.
    500  *
    501  * This function must be called with high spl, or with
    502  * kernel preemption disabled to prevent the partition's
    503  * active cpu list from changing while being traversed.
    504  *
    505  */
    506 int
    507 disp_anywork(void)
    508 {
    509 	cpu_t   *cp = CPU;
    510 	cpu_t   *ocp;
    511 
    512 	if (cp->cpu_disp->disp_nrunnable != 0)
    513 		return (1);
    514 
    515 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
    516 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
    517 			return (1);
    518 
    519 		/*
    520 		 * Work can be taken from another CPU if:
    521 		 *	- There is unbound work on the run queue
    522 		 *	- That work isn't a thread undergoing a
    523 		 *	- context switch on an otherwise empty queue.
    524 		 *	- The CPU isn't running the idle loop.
    525 		 */
    526 		for (ocp = cp->cpu_next_part; ocp != cp;
    527 		    ocp = ocp->cpu_next_part) {
    528 			ASSERT(CPU_ACTIVE(ocp));
    529 
    530 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
    531 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
    532 			    ocp->cpu_disp->disp_nrunnable == 1) &&
    533 			    ocp->cpu_dispatch_pri != -1)
    534 				return (1);
    535 		}
    536 	}
    537 	return (0);
    538 }
    539 
    540 /*
    541  * Called when CPU enters the idle loop
    542  */
    543 static void
    544 idle_enter()
    545 {
    546 	cpu_t		*cp = CPU;
    547 
    548 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
    549 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
    550 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
    551 }
    552 
    553 /*
    554  * Called when CPU exits the idle loop
    555  */
    556 static void
    557 idle_exit()
    558 {
    559 	cpu_t		*cp = CPU;
    560 
    561 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
    562 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
    563 }
    564 
/*
 * Idle loop.
 *
 * Loops forever.  While nothing is runnable on this CPU's own dispatch
 * queue it either tries to obtain a thread via disp_getwork() or calls
 * the platform idle routine (*idle_cpu)().  Whenever work is found it
 * calls idle_exit() and switches away with swtch()/swtch_to(); when
 * control comes back it calls idle_enter() again.
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			/* nothing queued locally: stay idle */
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			/* work on our own queue: let swtch() pick it */
			idle_exit();
			swtch();
		} else {
			/* an offline CPU does not look for work elsewhere */
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				/*
				 * No work found.  If a chosen level is
				 * still recorded, clear it once the
				 * partition's kp queue is seen empty.
				 */
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			/* got a thread from disp_getwork(): run it */
			idle_exit();
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}
    638 
    639 
/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues (via CL_PREEMPT()).
 * If panicking (panicstr set), do nothing at all.
 */
void
preempt()
{
	kthread_t 	*t = curthread;
	klwp_t 		*lwp = ttolwp(curthread);	/* may be NULL */

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * this thread has already been chosen to be run on
		 * another CPU. Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		/*
		 * Account for the switch (per-lwp nivcsw and per-CPU
		 * inv_swtch counters), then let the scheduling class
		 * place the thread before switching away.
		 */
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();		/* clears CPU->cpu_runrun via disp() */
	}
}
    682 
    683 extern kthread_t *thread_unpin();
    684 
/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 *
 * Candidates are considered in this order:
 *   1. the partition's kp queue, while its best priority is at least as
 *      high as the best on this CPU's own queue and the CPU is not
 *      offline;
 *   2. this CPU's own dispatch queue;
 *   3. if the own queue is empty, whatever disp_getwork() yields, and
 *      failing that the CPU's idle thread.
 *
 * Returns the chosen thread (never NULL).
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		/* only return the thread if disp_ratify() accepts it */
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				/* nothing obtainable: run the idle thread */
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			/* offline CPU with an empty queue: idle thread */
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	/* Take the first thread on the highest-priority non-empty queue. */
	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If nrunnable != 0, we just took
		 * the last runnable thread off the highest queue, so
		 * recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			/* max_unbound_pri may not exceed the new maximum */
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		/* queue still has entries: just unlink tp */
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	/*
	 * A NULL return from disp_ratify() means the choice is not kept;
	 * start the selection over from the top.
	 */
	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}
    826 
/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 *
 *	Two cases are handled: a thread with t_intr set resumes the thread
 *	obtained from thread_unpin(); any other thread asks disp() for the
 *	next thread and resumes it unless disp() picked the caller again.
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		/*
		 * Sanity check: re-test the unlocked observation under
		 * the thread lock and assert that it no longer holds.
		 */
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			/*
			 * Adjust the PG run count when switching away
			 * from, or to, the idle thread.
			 */
			if (t == cp->cpu_idle_thread) {
				PG_NRUN_UPDATE(cp, 1);
			} else if (next == cp->cpu_idle_thread) {
				PG_NRUN_UPDATE(cp, -1);
			}

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = gethrtime_unscaled();
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			/* record the time of this switch (lbolt) */
			cp->cpu_last_swtch = t->t_disp_time = lbolt;
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			/* disp() chose the caller again: keep running */
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}
    925 
/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
 *	Does not return to the caller (see the comment at the end).
 */
void
swtch_from_zombie()
{
	kthread_t	*next;
	cpu_t		*cpu = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	ASSERT(curthread->t_state == TS_ZOMB);

	next = disp();			/* returns with spl high */
	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
	ASSERT(next != curthread);
	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	/* switching to the idle thread: adjust the PG run count */
	if (next == cpu->cpu_idle_thread)
		PG_NRUN_UPDATE(cpu, -1);

	/* restore mstate of the thread we are switching to */
	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume_from_zombie(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we certainly will not
	 * return here
	 */
}
    965 
    966 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
    967 
    968 /*
    969  * search_disp_queues()
    970  *	Search the given dispatch queues for thread tp.
    971  *	Return 1 if tp is found, otherwise return 0.
    972  */
    973 static int
    974 search_disp_queues(disp_t *dp, kthread_t *tp)
    975 {
    976 	dispq_t		*dq;
    977 	dispq_t		*eq;
    978 
    979 	disp_lock_enter_high(&dp->disp_lock);
    980 
    981 	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
    982 		kthread_t	*rp;
    983 
    984 		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
    985 
    986 		for (rp = dq->dq_first; rp; rp = rp->t_link)
    987 			if (tp == rp) {
    988 				disp_lock_exit_high(&dp->disp_lock);
    989 				return (1);
    990 			}
    991 	}
    992 	disp_lock_exit_high(&dp->disp_lock);
    993 
    994 	return (0);
    995 }
    996 
    997 /*
    998  * thread_on_queue()
    999  *	Search all per-CPU dispatch queues and all partition-wide kpreempt
   1000  *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
   1001  */
   1002 static int
   1003 thread_on_queue(kthread_t *tp)
   1004 {
   1005 	cpu_t		*cp;
   1006 	struct cpupart	*part;
   1007 
   1008 	ASSERT(getpil() >= DISP_LEVEL);
   1009 
   1010 	/*
   1011 	 * Search the per-CPU dispatch queues for tp.
   1012 	 */
   1013 	cp = CPU;
   1014 	do {
   1015 		if (search_disp_queues(cp->cpu_disp, tp))
   1016 			return (1);
   1017 	} while ((cp = cp->cpu_next_onln) != CPU);
   1018 
   1019 	/*
   1020 	 * Search the partition-wide kpreempt queues for tp.
   1021 	 */
   1022 	part = CPU->cpu_part;
   1023 	do {
   1024 		if (search_disp_queues(&part->cp_kp_queue, tp))
   1025 			return (1);
   1026 	} while ((part = part->cp_next) != CPU->cpu_part);
   1027 
   1028 	return (0);
   1029 }
   1030 
   1031 #else
   1032 
   1033 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
   1034 
#endif	/* DEBUG && (DISP_DEBUG || lint) */
   1036 
   1037 /*
   1038  * like swtch(), but switch to a specified thread taken from another CPU.
   1039  *	called with spl high..
   1040  */
   1041 void
   1042 swtch_to(kthread_t *next)
   1043 {
   1044 	cpu_t			*cp = CPU;
   1045 
   1046 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
   1047 
   1048 	/*
   1049 	 * Update context switch statistics.
   1050 	 */
   1051 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
   1052 
   1053 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
   1054 
   1055 	if (curthread == cp->cpu_idle_thread)
   1056 		PG_NRUN_UPDATE(cp, 1);
   1057 
   1058 	/* OK to steal anything left on run queue */
   1059 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
   1060 
   1061 	/* record last execution time */
   1062 	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
   1063 
   1064 	/*
   1065 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
   1066 	 * won't have set its t_waitrq.  Since we now finally know that we're
   1067 	 * switching away from this thread, set its t_waitrq if it is on a run
   1068 	 * queue.
   1069 	 */
   1070 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
   1071 		curthread->t_waitrq = gethrtime_unscaled();
   1072 	}
   1073 
   1074 	/* restore next thread to previously running microstate */
   1075 	restore_mstate(next);
   1076 
   1077 	if (dtrace_vtime_active)
   1078 		dtrace_vtime_switch(next);
   1079 
   1080 	resume(next);
   1081 	/*
   1082 	 * The TR_RESUME_END and TR_SWTCH_END trace points
   1083 	 * appear at the end of resume(), because we may not
   1084 	 * return here
   1085 	 */
   1086 }
   1087 
   1088 
   1089 
   1090 #define	CPU_IDLING(pri)	((pri) == -1)
   1091 
   1092 static void
   1093 cpu_resched(cpu_t *cp, pri_t tpri)
   1094 {
   1095 	int	call_poke_cpu = 0;
   1096 	pri_t   cpupri = cp->cpu_dispatch_pri;
   1097 
   1098 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
   1099 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
   1100 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
   1101 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
   1102 			cp->cpu_runrun = 1;
   1103 			aston(cp->cpu_dispthread);
   1104 			if (tpri < kpreemptpri && cp != CPU)
   1105 				call_poke_cpu = 1;
   1106 		}
   1107 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
   1108 			cp->cpu_kprunrun = 1;
   1109 			if (cp != CPU)
   1110 				call_poke_cpu = 1;
   1111 		}
   1112 	}
   1113 
   1114 	/*
   1115 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
   1116 	 */
   1117 	membar_enter();
   1118 
   1119 	if (call_poke_cpu)
   1120 		poke_cpu(cp->cpu_id);
   1121 }
   1122 
   1123 /*
   1124  * Perform multi-level CMT load balancing of running threads.
   1125  * tp is the thread being enqueued
   1126  * cp is the hint CPU (chosen by cpu_choose()).
   1127  */
   1128 static cpu_t *
   1129 cmt_balance(kthread_t *tp, cpu_t *cp)
   1130 {
   1131 	int		hint, i, cpu, nsiblings;
   1132 	int		self = 0;
   1133 	group_t		*cmt_pgs, *siblings;
   1134 	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
   1135 	int		pg_nrun, tpg_nrun;
   1136 	int		level = 0;
   1137 	cpu_t		*newcp;
   1138 
   1139 	ASSERT(THREAD_LOCK_HELD(tp));
   1140 
   1141 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
   1142 
   1143 	if (GROUP_SIZE(cmt_pgs) == 0)
   1144 		return (cp);	/* nothing to do */
   1145 
   1146 	if (tp == curthread)
   1147 		sel