Home | History | Annotate | Download | only in disp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 
     30 #include <sys/types.h>
     31 #include <sys/param.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/signal.h>
     34 #include <sys/user.h>
     35 #include <sys/systm.h>
     36 #include <sys/sysinfo.h>
     37 #include <sys/var.h>
     38 #include <sys/errno.h>
     39 #include <sys/cmn_err.h>
     40 #include <sys/debug.h>
     41 #include <sys/inline.h>
     42 #include <sys/disp.h>
     43 #include <sys/class.h>
     44 #include <sys/bitmap.h>
     45 #include <sys/kmem.h>
     46 #include <sys/cpuvar.h>
     47 #include <sys/vtrace.h>
     48 #include <sys/tnf.h>
     49 #include <sys/cpupart.h>
     50 #include <sys/lgrp.h>
     51 #include <sys/pg.h>
     52 #include <sys/cmt.h>
     53 #include <sys/bitset.h>
     54 #include <sys/schedctl.h>
     55 #include <sys/atomic.h>
     56 #include <sys/dtrace.h>
     57 #include <sys/sdt.h>
     58 #include <sys/archsystm.h>
     59 
     60 #include <vm/as.h>
     61 
     62 #define	BOUND_CPU	0x1
     63 #define	BOUND_PARTITION	0x2
     64 #define	BOUND_INTR	0x4
     65 
     66 /*
         * Dispatch queue allocation structure and functions.
         *
         * A disp_queue_info carries one dispatch queue resize through its
         * three phases: disp_dq_alloc() fills in dp and the new arrays,
         * disp_dq_assign() records the old arrays and installs the new ones,
         * and disp_dq_free() releases the old arrays.
         */
     67 struct disp_queue_info {
     68 	disp_t	*dp;		/* dispatch queue being resized */
     69 	dispq_t *olddispq;	/* previous queue array, freed after the swap */
     70 	dispq_t *newdispq;	/* replacement queue array */
     71 	ulong_t	*olddqactmap;	/* previous active-queue bitmap */
     72 	ulong_t	*newdqactmap;	/* replacement active-queue bitmap */
     73 	int	oldnglobpris;	/* priority count the old arrays were sized for */
     74 };
     75 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
     76     disp_t *dp);
     77 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
     78 static void	disp_dq_free(struct disp_queue_info *dptr);
     79 
     80 /*
         * Platform-specific routine to call when processor is idle.  idle()
         * always calls through the idle_cpu pointer, which starts out as
         * generic_idle_cpu.
         */
     81 static void	generic_idle_cpu();
     82 void		(*idle_cpu)() = generic_idle_cpu;
     83 
     84 /* routines invoked when a CPU enters/exits the idle loop */
     85 static void	idle_enter();
     86 static void	idle_exit();
     87 
     88 /*
         * Platform-specific routine to call when thread is enqueued; the
         * pointer starts out as generic_enq_thread.
         */
     89 static void	generic_enq_thread(cpu_t *, int);
     90 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
     91 
     92 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
     93 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
     94 pri_t	intr_pri;		/* interrupt thread priority base level */
     95 
        /*
         * kpqpri keeps the placeholder value KPQPRI until dispinit() or
         * disp_setup() runs; if it still holds KPQPRI at that point it is
         * replaced with kpreemptpri.
         */
     96 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
     97 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
     98 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
     99 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
    100 int	nswapped;		/* total number of swapped threads */
    101 void	disp_swapped_enq(kthread_t *tp);
    102 static void	disp_swapped_setrun(kthread_t *tp);
    103 static void	cpu_resched(cpu_t *cp, pri_t tpri);
    104 
    105 /*
    106  * If this is set, only interrupt threads will cause kernel preemptions.
    107  * This is done by changing the value of kpreemptpri.  kpreemptpri
    108  * will either be the max sysclass pri + 1 or the min interrupt pri.
         * disp_setup() applies the change: when this flag is non-zero it sets
         * kpreemptpri to intr_pri + 1.
    109  */
    110 int	only_intr_kpreempt;
    111 
    112 extern void set_idle_cpu(int cpun);
    113 extern void unset_idle_cpu(int cpun);
    114 static void setkpdq(kthread_t *tp, int borf);
        /* presumably the values for setkpdq()'s borf argument - TODO confirm */
    115 #define	SETKP_BACK	0
    116 #define	SETKP_FRONT	1
    117 /*
    118  * Parameter that determines how recently a thread must have run
    119  * on the CPU to be considered loosely-bound to that CPU to reduce
    120  * cold cache effects.  The interval is in hertz.
         *
         * NOTE(review): "hertz" here presumably means clock ticks; confirm
         * against the code that reads rechoose_interval.
    121  */
    122 #define	RECHOOSE_INTERVAL 3
    123 int	rechoose_interval = RECHOOSE_INTERVAL;
    124 
    125 /*
    126  * Parameter that determines how long a thread must be sitting on a
    127  * run queue before it can be stolen by another CPU, to reduce
    128  * migrations.  The interval is in nanoseconds.
    129  *
    130  * nosteal_nsec should be set by the platform code in
    131  * cmp_set_nosteal_interval() to an appropriate value.  It is set to
    132  * NOSTEAL_UNINITIALIZED here to indicate that it is uninitialized;
         * dispinit() calls cmp_set_nosteal_interval() only in that case.
    133  * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
    134  *
    135  */
    136 #define	NOSTEAL_UNINITIALIZED	(-1)
    137 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
    138 extern void cmp_set_nosteal_interval(void);
    139 
    140 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
    141 
    142 disp_lock_t	transition_lock;	/* lock on transitioning threads */
    143 disp_lock_t	stop_lock;		/* lock on stopped threads */
    144 
    145 static void	cpu_dispqalloc(int numpris);
    146 
    147 /*
    148  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
    149  * a thread because it was sitting on its run queue for a very short
    150  * period of time.  It is a sentinel pointer value, never a real thread,
         * so callers must compare against it before dereferencing the result.
    151  */
    152 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
    153 
    154 static kthread_t	*disp_getwork(cpu_t *to);
    155 static kthread_t	*disp_getbest(disp_t *from);
    156 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
    157 
    158 void	swtch_to(kthread_t *);
    159 
    160 /*
    161  * dispatcher and scheduler initialization
    162  */
    163 
    164 /*
    165  * disp_setup - Common code to calculate and allocate dispatcher
    166  *		variables and structures based on the maximum priority.
    167  */
    168 static void
    169 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
    170 {
    171 	pri_t	newnglobpris;
    172 
    173 	ASSERT(MUTEX_HELD(&cpu_lock));
    174 
    175 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
    176 
    177 	if (newnglobpris > oldnglobpris) {
    178 		/*
    179 		 * Allocate new kp queues for each CPU partition.
    180 		 */
    181 		cpupart_kpqalloc(newnglobpris);
    182 
    183 		/*
    184 		 * Allocate new dispatch queues for each CPU.
    185 		 */
    186 		cpu_dispqalloc(newnglobpris);
    187 
    188 		/*
    189 		 * compute new interrupt thread base priority
    190 		 */
    191 		intr_pri = maxglobpri;
    192 		if (only_intr_kpreempt) {
    193 			kpreemptpri = intr_pri + 1;
    194 			if (kpqpri == KPQPRI)
    195 				kpqpri = kpreemptpri;
    196 		}
    197 		v.v_nglobpris = newnglobpris;
    198 	}
    199 }
    200 
    201 /*
    202  * dispinit - Called to initialize all loaded classes and the
    203  *	      dispatcher framework.
         *
         * Steps, in order:
         *   - initialize transition_lock (and leave it held) and stop_lock;
         *   - under cpu_lock, mark the current CPU's dispatch queue empty,
         *     set up the default CPU partition, run cl_init() for every
         *     installed scheduling class, and size the dispatch queues for
         *     the highest global priority any class reported;
         *   - after dropping cpu_lock, let the platform pick nosteal_nsec if
         *     it is still uninitialized, and resolve the default class ID.
         *
         * Panics if the default scheduling class cannot be loaded.
    204  */
    205 void
    206 dispinit(void)
    207 {
    208 	id_t	cid;
    209 	pri_t	maxglobpri;
    210 	pri_t	cl_maxglobpri;
    211 
    212 	maxglobpri = -1;	/* no class has reported a priority yet */
    213 
    214 	/*
    215 	 * Initialize transition lock, which will always be set.
    216 	 */
    217 	DISP_LOCK_INIT(&transition_lock);
    218 	disp_lock_enter_high(&transition_lock);
    219 	DISP_LOCK_INIT(&stop_lock);
    220 
    221 	mutex_enter(&cpu_lock);
    222 	CPU->cpu_disp->disp_maxrunpri = -1;
    223 	CPU->cpu_disp->disp_max_unbound_pri = -1;
    224 
    225 	/*
    226 	 * Initialize the default CPU partition.
    227 	 */
    228 	cpupart_initialize_default();
    229 	/*
    230 	 * Call the class specific initialization functions for
    231 	 * all pre-installed schedulers.
    232 	 *
    233 	 * We pass the size of a class specific parameter
    234 	 * buffer to each of the initialization functions
    235 	 * to try to catch problems with backward compatibility
    236 	 * of class modules.
    237 	 *
    238 	 * For example a new class module running on an old system
    239 	 * which didn't provide sufficiently large parameter buffers
    240 	 * would be bad news. Class initialization modules can check for
    241 	 * this and take action if they detect a problem.
    242 	 */
    243 
    244 	for (cid = 0; cid < nclass; cid++) {
    245 		sclass_t	*sc;
    246 
    247 		sc = &sclass[cid];
    248 		if (SCHED_INSTALLED(sc)) {
    249 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
    250 			    &sc->cl_funcs);
        			/* track the largest priority any class reports */
    251 			if (cl_maxglobpri > maxglobpri)
    252 				maxglobpri = cl_maxglobpri;
    253 		}
    254 	}
    255 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
        	/* keep a kpqpri override; otherwise default to kpreemptpri */
    256 	if (kpqpri == KPQPRI)
    257 		kpqpri = kpreemptpri;
    258 
        	/* at least one installed class must have reported a priority */
    259 	ASSERT(maxglobpri >= 0);
        	/* old size 0: this is the first sizing of the queues */
    260 	disp_setup(maxglobpri, 0);
    261 
    262 	mutex_exit(&cpu_lock);
    263 
    264 	/*
    265 	 * Platform specific sticky scheduler setup.
    266 	 */
    267 	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
    268 		cmp_set_nosteal_interval();
    269 
    270 	/*
    271 	 * Get the default class ID; this may be later modified via
    272 	 * dispadmin(1M).  This will load the class (normally TS) and that will
    273 	 * call disp_add(), which is why we had to drop cpu_lock first.
    274 	 */
    275 	if (getcid(defaultclass, &defaultcid) != 0) {
    276 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
    277 		    defaultclass);
    278 	}
    279 }
    280 
    281 /*
    282  * disp_add - Called with class pointer to initialize the dispatcher
    283  *	      for a newly loaded class.
    284  */
    285 void
    286 disp_add(sclass_t *clp)
    287 {
    288 	pri_t	maxglobpri;
    289 	pri_t	cl_maxglobpri;
    290 
    291 	mutex_enter(&cpu_lock);
    292 	/*
    293 	 * Initialize the scheduler class.
    294 	 */
    295 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
    296 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
    297 	if (cl_maxglobpri > maxglobpri)
    298 		maxglobpri = cl_maxglobpri;
    299 
    300 	/*
    301 	 * Save old queue information.  Since we're initializing a
    302 	 * new scheduling class which has just been loaded, then
    303 	 * the size of the dispq may have changed.  We need to handle
    304 	 * that here.
    305 	 */
    306 	disp_setup(maxglobpri, v.v_nglobpris);
    307 
    308 	mutex_exit(&cpu_lock);
    309 }
    310 
    311 
    312 /*
    313  * For each CPU, allocate new dispatch queues
    314  * with the stated number of priorities.
         *
         * The work is done in three passes over a scratch array holding one
         * disp_queue_info per CPU on cpu_list: allocate every new queue,
         * install them all while the other CPUs are paused, then free the
         * old queues once the CPUs are running again.  The caller must hold
         * cpu_lock.
    315  */
    316 static void
    317 cpu_dispqalloc(int numpris)
    318 {
    319 	cpu_t	*cpup;
    320 	struct disp_queue_info	*disp_mem;
    321 	int i, num;
    322 
    323 	ASSERT(MUTEX_HELD(&cpu_lock));
    324 
        	/* scratch array, sized for the maximum possible CPU count */
    325 	disp_mem = kmem_zalloc(NCPU *
    326 	    sizeof (struct disp_queue_info), KM_SLEEP);
    327 
    328 	/*
    329 	 * This routine must allocate all of the memory before stopping
    330 	 * the cpus because it must not sleep in kmem_alloc while the
    331 	 * CPUs are stopped.  Locks they hold will not be freed until they
    332 	 * are restarted.
    333 	 */
    334 	i = 0;
    335 	cpup = cpu_list;
        	/* cpu_list is circular: stop when the walk returns to its head */
    336 	do {
    337 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
    338 		i++;
    339 		cpup = cpup->cpu_next;
    340 	} while (cpup != cpu_list);
    341 	num = i;	/* number of CPUs actually visited */
    342 
        	/* swap the queues in while no other CPU can be using them */
    343 	pause_cpus(NULL);
    344 	for (i = 0; i < num; i++)
    345 		disp_dq_assign(&disp_mem[i], numpris);
    346 	start_cpus();
    347 
    348 	/*
    349 	 * I must free all of the memory after starting the cpus because
    350 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
    351 	 */
    352 	for (i = 0; i < num; i++)
    353 		disp_dq_free(&disp_mem[i]);
    354 
    355 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
    356 }
    357 
    358 static void
    359 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
    360 {
    361 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
    362 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
    363 	    sizeof (long), KM_SLEEP);
    364 	dptr->dp = dp;
    365 }
    366 
        /*
         * Install the arrays prepared by disp_dq_alloc() into dptr->dp.
         *
         * The dispatch queue's current arrays and priority count are saved in
         * dptr (for disp_dq_free() to release later), any existing contents
         * are copied into the new arrays, and the disp_t is pointed at the
         * new arrays with disp_npri set to numpris.  numpris must be larger
         * than the current priority count.  This may run while the CPUs are
         * paused (see cpu_dispqalloc()), so it must not block.
         */
    367 static void
    368 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
    369 {
    370 	disp_t	*dp;
    371 
    372 	dp = dptr->dp;
        	/* remember what is being replaced so it can be freed later */
    373 	dptr->olddispq = dp->disp_q;
    374 	dptr->olddqactmap = dp->disp_qactmap;
    375 	dptr->oldnglobpris = dp->disp_npri;
    376 
    377 	ASSERT(dptr->oldnglobpris < numpris);
    378 
        	/* a disp_t that never had queues has nothing to carry over */
    379 	if (dptr->olddispq != NULL) {
    380 		/*
    381 		 * Use kcopy because bcopy is platform-specific
    382 		 * and could block while we might have paused the cpus.
    383 		 */
    384 		(void) kcopy(dptr->olddispq, dptr->newdispq,
    385 		    dptr->oldnglobpris * sizeof (dispq_t));
    386 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
    387 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
    388 		    sizeof (long));
    389 	}
    390 	dp->disp_q = dptr->newdispq;
    391 	dp->disp_qactmap = dptr->newdqactmap;
    392 	dp->disp_q_limit = &dptr->newdispq[numpris];	/* one past the end */
    393 	dp->disp_npri = numpris;
    394 }
    395 
    396 static void
    397 disp_dq_free(struct disp_queue_info *dptr)
    398 {
    399 	if (dptr->olddispq != NULL)
    400 		kmem_free(dptr->olddispq,
    401 		    dptr->oldnglobpris * sizeof (dispq_t));
    402 	if (dptr->olddqactmap != NULL)
    403 		kmem_free(dptr->olddqactmap,
    404 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
    405 }
    406 
    407 /*
    408  * For a newly created CPU, initialize the dispatch queue.
    409  * This is called before the CPU is known through cpu[] or on any lists.
    410  */
    411 void
    412 disp_cpu_init(cpu_t *cp)
    413 {
    414 	disp_t	*dp;
    415 	dispq_t	*newdispq;
    416 	ulong_t	*newdqactmap;
    417 
    418 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
    419 
    420 	if (cp == cpu0_disp.disp_cpu)
    421 		dp = &cpu0_disp;
    422 	else
    423 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
    424 	bzero(dp, sizeof (disp_t));
    425 	cp->cpu_disp = dp;
    426 	dp->disp_cpu = cp;
    427 	dp->disp_maxrunpri = -1;
    428 	dp->disp_max_unbound_pri = -1;
    429 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
    430 	/*
    431 	 * Allocate memory for the dispatcher queue headers
    432 	 * and the active queue bitmap.
    433 	 */
    434 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
    435 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
    436 	    sizeof (long), KM_SLEEP);
    437 	dp->disp_q = newdispq;
    438 	dp->disp_qactmap = newdqactmap;
    439 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
    440 	dp->disp_npri = v.v_nglobpris;
    441 }
    442 
    443 void
    444 disp_cpu_fini(cpu_t *cp)
    445 {
    446 	ASSERT(MUTEX_HELD(&cpu_lock));
    447 
    448 	disp_kp_free(cp->cpu_disp);
    449 	if (cp->cpu_disp != &cpu0_disp)
    450 		kmem_free(cp->cpu_disp, sizeof (disp_t));
    451 }
    452 
    453 /*
    454  * Allocate new, larger kpreempt dispatch queue to replace the old one.
    455  */
    456 void
    457 disp_kp_alloc(disp_t *dq, pri_t npri)
    458 {
    459 	struct disp_queue_info	mem_info;
    460 
    461 	if (npri > dq->disp_npri) {
    462 		/*
    463 		 * Allocate memory for the new array.
    464 		 */
    465 		disp_dq_alloc(&mem_info, npri, dq);
    466 
    467 		/*
    468 		 * We need to copy the old structures to the new
    469 		 * and free the old.
    470 		 */
    471 		disp_dq_assign(&mem_info, npri);
    472 		disp_dq_free(&mem_info);
    473 	}
    474 }
    475 
    476 /*
    477  * Free dispatch queue.
    478  * Used for the kpreempt queues for a removed CPU partition and
    479  * for the per-CPU queues of deleted CPUs.
    480  */
    481 void
    482 disp_kp_free(disp_t *dq)
    483 {
    484 	struct disp_queue_info	mem_info;
    485 
    486 	mem_info.olddispq = dq->disp_q;
    487 	mem_info.olddqactmap = dq->disp_qactmap;
    488 	mem_info.oldnglobpris = dq->disp_npri;
    489 	disp_dq_free(&mem_info);
    490 }
    491 
    492 /*
    493  * End dispatcher and scheduler initialization.
    494  */
    495 
    496 /*
    497  * See if there's anything to do other than remain idle.
    498  * Return non-zero if there is.
    499  *
    500  * This function must be called with high spl, or with
    501  * kernel preemption disabled to prevent the partition's
    502  * active cpu list from changing while being traversed.
    503  *
    504  * This is essentially a simpler version of disp_getwork()
    505  * to be called by CPUs preparing to "halt".
         *
         * No dispatcher locks are taken here; the fields examined are read
         * without synchronization.  An offline CPU always gets 0.
    506  */
    507 int
    508 disp_anywork(void)
    509 {
    510 	cpu_t		*cp = CPU;
    511 	cpu_t		*ocp;
        	/* volatile: re-read the local count on every pass of the walk */
    512 	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;
    513 
    514 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
        		/* anything on the partition-wide queue counts as work */
    515 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
    516 			return (1);
    517 
        		/* visit every other CPU in this CPU's partition */
    518 		for (ocp = cp->cpu_next_part; ocp != cp;
    519 		    ocp = ocp->cpu_next_part) {
    520 			ASSERT(CPU_ACTIVE(ocp));
    521 
    522 			/*
    523 			 * Something has appeared on the local run queue.
    524 			 */
    525 			if (*local_nrunnable > 0)
    526 				return (1);
    527 			/*
    528 			 * If we encounter another idle CPU that will
    529 			 * soon be trolling around through disp_anywork()
    530 			 * terminate our walk here and let this other CPU
    531 			 * patrol the next part of the list.
    532 			 */
    533 			if (ocp->cpu_dispatch_pri == -1 &&
    534 			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
    535 				return (0);
    536 			/*
    537 			 * Work can be taken from another CPU if:
    538 			 *	- There is unbound work on the run queue
    539 			 *	- That work isn't a thread undergoing a
    540 			 *	  context switch on an otherwise empty queue.
    541 			 *	- The CPU isn't running the idle loop.
    542 			 */
    543 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
    544 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
    545 			    ocp->cpu_disp->disp_nrunnable == 1) &&
    546 			    ocp->cpu_dispatch_pri != -1)
    547 				return (1);
    548 		}
    549 	}
    550 	return (0);
    551 }
    552 
    553 /*
    554  * Called when CPU enters the idle loop
    555  */
    556 static void
    557 idle_enter()
    558 {
    559 	cpu_t		*cp = CPU;
    560 
    561 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
    562 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
    563 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
    564 }
    565 
    566 /*
    567  * Called when CPU exits the idle loop
    568  */
    569 static void
    570 idle_exit()
    571 {
    572 	cpu_t		*cp = CPU;
    573 
    574 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
    575 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
    576 }
    577 
    578 /*
    579  * Idle loop.
         *
         * Body of the idle thread; it never returns.  Each pass either waits
         * in the platform idle routine (*idle_cpu) or switches to a runnable
         * thread.  Every switch away is bracketed by idle_exit() before and
         * idle_enter() after, so idle accounting covers only the time spent
         * actually idling.
         *
         * While ncpus == 1 only the local run queue is consulted.  After
         * that, a CPU with nothing runnable locally also tries to take work
         * from elsewhere via disp_getwork(), unless it is offline.
    580  */
    581 void
    582 idle()
    583 {
    584 	struct cpu	*cp = CPU;		/* pointer to this CPU */
    585 	kthread_t	*t;			/* taken thread */
    586 
    587 	idle_enter();
    588 
    589 	/*
    590 	 * Uniprocessor version of idle loop.
    591 	 * Do this until notified that we're on an actual multiprocessor.
    592 	 */
    593 	while (ncpus == 1) {
    594 		if (cp->cpu_disp->disp_nrunnable == 0) {
    595 			(*idle_cpu)();
    596 			continue;
    597 		}
    598 		idle_exit();
    599 		swtch();
    600 
    601 		idle_enter(); /* returned from swtch */
    602 	}
    603 
    604 	/*
    605 	 * Multiprocessor idle loop.
    606 	 */
    607 	for (;;) {
    608 		/*
    609 		 * If CPU is completely quiesced by p_online(2), just wait
    610 		 * here with minimal bus traffic until put online.
    611 		 */
    612 		while (cp->cpu_flags & CPU_QUIESCED)
    613 			(*idle_cpu)();
    614 
    615 		if (cp->cpu_disp->disp_nrunnable != 0) {
        			/* local work is available: let swtch() pick it */
    616 			idle_exit();
    617 			swtch();
    618 		} else {
        			/* an offline CPU does not take work from elsewhere */
    619 			if (cp->cpu_flags & CPU_OFFLINE)
    620 				continue;
    621 			if ((t = disp_getwork(cp)) == NULL) {
        				/*
        				 * Nothing to take.  Reset cpu_chosen_level
        				 * if the partition's kp queue is empty too.
        				 */
    622 				if (cp->cpu_chosen_level != -1) {
    623 					disp_t *dp = cp->cpu_disp;
    624 					disp_t *kpq;
    625 
    626 					disp_lock_enter(&dp->disp_lock);
    627 					/*
    628 					 * Set kpq under lock to prevent
    629 					 * migration between partitions.
    630 					 */
    631 					kpq = &cp->cpu_part->cp_kp_queue;
    632 					if (kpq->disp_maxrunpri == -1)
    633 						cp->cpu_chosen_level = -1;
    634 					disp_lock_exit(&dp->disp_lock);
    635 				}
    636 				(*idle_cpu)();
    637 				continue;
    638 			}
    639 			/*
    640 			 * If there was a thread but we couldn't steal
    641 			 * it, then keep trying.
    642 			 */
    643 			if (t == T_DONTSTEAL)
    644 				continue;
        			/* t is a real thread: run it directly */
    645 			idle_exit();
    646 			swtch_to(t);
    647 		}
    648 		idle_enter(); /* returned from swtch/swtch_to */
    649 	}
    650 }
    651 
    652 
    653 /*
    654  * Preempt the currently running thread in favor of the highest
    655  * priority thread.  The class of the current thread controls
    656  * where it goes on the dispatcher queues. If panicking, turn
    657  * preemption off.
         *
         * If the current thread is no longer TS_ONPROC on this CPU's dispatch
         * queue, only cpu_kprunrun is cleared and no switch is made here.
         * Otherwise the involuntary context switch is counted, the thread is
         * put in transition, the class preempt routine (CL_PREEMPT) is run,
         * and swtch() is called.
    658  */
    659 void
    660 preempt()
    661 {
    662 	kthread_t 	*t = curthread;
    663 	klwp_t 		*lwp = ttolwp(curthread);
    664 
        	/* preemption is turned off while panicking */
    665 	if (panicstr)
    666 		return;
    667 
    668 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
    669 
    670 	thread_lock(t);
    671 
    672 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
    673 		/*
    674 		 * this thread has already been chosen to be run on
    675 		 * another CPU. Clear kprunrun on this CPU since we're
    676 		 * already headed for swtch().
    677 		 */
    678 		CPU->cpu_kprunrun = 0;
    679 		thread_unlock_nopreempt(t);
    680 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
    681 	} else {
        		/* lwp is checked for NULL: not every thread has one */
    682 		if (lwp != NULL)
    683 			lwp->lwp_ru.nivcsw++;
    684 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
    685 		THREAD_TRANSITION(t);
    686 		CL_PREEMPT(t);
    687 		DTRACE_SCHED(preempt);
    688 		thread_unlock_nopreempt(t);
    689 
    690 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
    691 
    692 		swtch();		/* clears CPU->cpu_runrun via disp() */
    693 	}
    694 }
    695 
    696 extern kthread_t *thread_unpin();
    697 
    698 /*
    699  * disp() - find the highest priority thread for this processor to run, and
    700  * set it in TS_ONPROC state so that resume() can be called to run it.
         *
         * Selection order:
         *   1. The partition's kp queue, while its best priority is at least
         *      this CPU's best priority and the CPU is not offline.
         *   2. The head of this CPU's highest-priority non-empty queue.
         *   3. If the local queue is empty: a thread obtained through
         *      disp_getwork() (skipped when the CPU is offline), else the
         *      CPU's idle thread.
         *
         * A locally dequeued thread must also pass disp_ratify(); when that
         * returns NULL the whole selection is repeated.  Callers in this
         * file rely on disp() returning with spl high.
    701  */
    702 static kthread_t *
    703 disp()
    704 {
    705 	cpu_t		*cpup;
    706 	disp_t		*dp;
    707 	kthread_t	*tp;
    708 	dispq_t		*dq;
    709 	int		maxrunword;
    710 	pri_t		pri;
    711 	disp_t		*kpq;
    712 
    713 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
    714 
    715 	cpup = CPU;
    716 	/*
    717 	 * Find the highest priority loaded, runnable thread.
    718 	 */
    719 	dp = cpup->cpu_disp;
    720 
    721 reschedule:
    722 	/*
    723 	 * If there is more important work on the global queue with a better
    724 	 * priority than the maximum on this CPU, take it now.
    725 	 */
    726 	kpq = &cpup->cpu_part->cp_kp_queue;
    727 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
    728 	    pri >= dp->disp_maxrunpri &&
    729 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
    730 	    (tp = disp_getbest(kpq)) != NULL) {
    731 		if (disp_ratify(tp, kpq) != NULL) {
    732 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
    733 			    "disp_end:tid %p", tp);
    734 			return (tp);
    735 		}
    736 	}
    737 
    738 	disp_lock_enter(&dp->disp_lock);
    739 	pri = dp->disp_maxrunpri;
    740 
    741 	/*
    742 	 * If there is nothing to run, look at what's runnable on other queues.
    743 	 * Choose the idle thread if the CPU is quiesced.
    744 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
    745 	 * interrupt threads, which will be the only threads on the CPU's own
    746 	 * queue, but cannot run threads from other queues.
    747 	 */
    748 	if (pri == -1) {
    749 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
    750 			disp_lock_exit(&dp->disp_lock);
        			/* no thread, or one that may not be stolen: idle */
    751 			if ((tp = disp_getwork(cpup)) == NULL ||
    752 			    tp == T_DONTSTEAL) {
    753 				tp = cpup->cpu_idle_thread;
    754 				(void) splhigh();
    755 				THREAD_ONPROC(tp, cpup);
    756 				cpup->cpu_dispthread = tp;
    757 				cpup->cpu_dispatch_pri = -1;
    758 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
    759 				cpup->cpu_chosen_level = -1;
    760 			}
    761 		} else {
        			/* offline: go straight to the idle thread */
    762 			disp_lock_exit_high(&dp->disp_lock);
    763 			tp = cpup->cpu_idle_thread;
    764 			THREAD_ONPROC(tp, cpup);
    765 			cpup->cpu_dispthread = tp;
    766 			cpup->cpu_dispatch_pri = -1;
    767 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
    768 			cpup->cpu_chosen_level = -1;
    769 		}
    770 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
    771 		    "disp_end:tid %p", tp);
    772 		return (tp);
    773 	}
    774 
        	/* head of the highest-priority non-empty local queue */
    775 	dq = &dp->disp_q[pri];
    776 	tp = dq->dq_first;
    777 
    778 	ASSERT(tp != NULL);
    779 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
    780 
    781 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
    782 
    783 	/*
    784 	 * Found it so remove it from queue.
    785 	 */
    786 	dp->disp_nrunnable--;
    787 	dq->dq_sruncnt--;
    788 	if ((dq->dq_first = tp->t_link) == NULL) {
    789 		ulong_t	*dqactmap = dp->disp_qactmap;
    790 
    791 		ASSERT(dq->dq_sruncnt == 0);
    792 		dq->dq_last = NULL;
    793 
    794 		/*
    795 		 * The queue is empty, so the corresponding bit needs to be
    796 		 * turned off in dqactmap.  If nrunnable != 0 we just took
    797 		 * the last runnable thread off the highest queue, so
    798 		 * recompute disp_maxrunpri.
    799 		 */
    800 		maxrunword = pri >> BT_ULSHIFT;
    801 		dqactmap[maxrunword] &= ~BT_BIW(pri);
    802 
    803 		if (dp->disp_nrunnable == 0) {
    804 			dp->disp_max_unbound_pri = -1;
    805 			dp->disp_maxrunpri = -1;
    806 		} else {
    807 			int ipri;
    808 
    809 			ipri = bt_gethighbit(dqactmap, maxrunword);
    810 			dp->disp_maxrunpri = ipri;
        			/* the unbound maximum can never exceed the overall maximum */
    811 			if (ipri < dp->disp_max_unbound_pri)
    812 				dp->disp_max_unbound_pri = ipri;
    813 		}
    814 	} else {
        		/* more threads remain at this priority; just unlink tp */
    815 		tp->t_link = NULL;
    816 	}
    817 
    818 	/*
    819 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
    820 	 * out this thread before we have a chance to run it.
    821 	 * While running, it is protected against swapping by t_lock.
    822 	 */
    823 	tp->t_schedflag |= TS_DONT_SWAP;
    824 	cpup->cpu_dispthread = tp;		/* protected by spl only */
    825 	cpup->cpu_dispatch_pri = pri;
    826 	ASSERT(pri == DISP_PRIO(tp));
    827 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
    828 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
    829 
    830 	ASSERT(tp != NULL);
    831 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
    832 	    "disp_end:tid %p", tp);
    833 
        	/* the choice was rejected: start the selection over */
    834 	if (disp_ratify(tp, kpq) == NULL)
    835 		goto reschedule;
    836 
    837 	return (tp);
    838 }
    839 
    840 /*
    841  * swtch()
    842  *	Find best runnable thread and run it.
    843  *	Called with the current thread already switched to a new state,
    844  *	on a sleep queue, run queue, stopped, and not zombied.
    845  *	May be called at any spl level less than or equal to LOCK_LEVEL.
    846  *	Always drops spl to the base level (spl0()).
         *
         *	Two paths:
         *	- The current thread has an interrupted thread underneath it
         *	  (t_intr != NULL): that thread is unpinned and resumed with
         *	  resume_from_intr(); disp() is not consulted.
         *	- Otherwise disp() picks the next thread.  If it is a different
         *	  thread, switch accounting is updated and resume() is called;
         *	  if disp() hands back the current thread, it simply keeps
         *	  running.
    847  */
    848 void
    849 swtch()
    850 {
    851 	kthread_t	*t = curthread;
    852 	kthread_t	*next;
    853 	cpu_t		*cp;
    854 
    855 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
    856 
    857 	if (t->t_flag & T_INTR_THREAD)
    858 		cpu_intr_swtch_enter(t);
    859 
    860 	if (t->t_intr != NULL) {
    861 		/*
    862 		 * We are an interrupt thread.  Setup and return
    863 		 * the interrupted thread to be resumed.
    864 		 */
    865 		(void) splhigh();	/* block other scheduler action */
    866 		cp = CPU;		/* now protected against migration */
    867 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
    868 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
    869 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
    870 		next = thread_unpin();
    871 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
    872 		resume_from_intr(next);
    873 	} else {
    874 #ifdef	DEBUG
        		/*
        		 * Sanity check: re-test the same condition under the
        		 * thread lock; the ASSERT fires if it still holds.
        		 */
    875 		if (t->t_state == TS_ONPROC &&
    876 		    t->t_disp_queue->disp_cpu == CPU &&
    877 		    t->t_preempt == 0) {
    878 			thread_lock(t);
    879 			ASSERT(t->t_state != TS_ONPROC ||
    880 			    t->t_disp_queue->disp_cpu != CPU ||
    881 			    t->t_preempt != 0);	/* cannot migrate */
    882 			thread_unlock_nopreempt(t);
    883 		}
    884 #endif	/* DEBUG */
    885 		cp = CPU;
    886 		next = disp();		/* returns with spl high */
    887 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
    888 
    889 		/* OK to steal anything left on run queue */
    890 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
    891 
    892 		if (next != t) {
    893 			hrtime_t now;
    894 
    895 			now = gethrtime_unscaled();
    896 			pg_ev_thread_swtch(cp, now, t, next);
    897 
    898 			/*
    899 			 * If t was previously in the TS_ONPROC state,
    900 			 * setfrontdq and setbackdq won't have set its t_waitrq.
    901 			 * Since we now finally know that we're switching away
    902 			 * from this thread, set its t_waitrq if it is on a run
    903 			 * queue.
    904 			 */
    905 			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
    906 				t->t_waitrq = now;
    907 			}
    908 
    909 			/*
    910 			 * restore mstate of thread that we are switching to
    911 			 */
    912 			restore_mstate(next);
    913 
    914 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
        			/* stamp both the CPU and the outgoing thread */
    915 			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
    916 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
    917 
    918 			if (dtrace_vtime_active)
    919 				dtrace_vtime_switch(next);
    920 
    921 			resume(next);
    922 			/*
    923 			 * The TR_RESUME_END and TR_SWTCH_END trace points
    924 			 * appear at the end of resume(), because we may not
    925 			 * return here
    926 			 */
    927 		} else {
        			/* disp() chose the current thread: no switch */
    928 			if (t->t_flag & T_INTR_THREAD)
    929 				cpu_intr_swtch_exit(t);
    930 
    931 			pg_ev_thread_remain(cp, t);
    932 
    933 			DTRACE_SCHED(remain__cpu);
    934 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
    935 			(void) spl0();
    936 		}
    937 	}
    938 }
    939 
    940 /*
    941  * swtch_from_zombie()
    942  *	Special case of swtch(), which allows checks for TS_ZOMB to be
    943  *	eliminated from normal resume.
    944  *	Find best runnable thread and run it.
    945  *	Called with the current thread zombied.
    946  *	Zombies cannot migrate, so CPU references are safe.
         *
         *	Unlike swtch() there is no "same thread" case: disp() must
         *	return a different thread (asserted), and control passes to
         *	it through resume_from_zombie().
    947  */
    948 void
    949 swtch_from_zombie()
    950 {
    951 	kthread_t	*next;
    952 	cpu_t		*cpu = CPU;
    953 
    954 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
    955 
    956 	ASSERT(curthread->t_state == TS_ZOMB);
    957 
    958 	next = disp();			/* returns with spl high */
    959 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
    960 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
        	/* a zombie must never be chosen to run again */
    961 	ASSERT(next != curthread);
    962 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
    963 
    964 	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
    965 
        	/* restore mstate of the thread being switched to */
    966 	restore_mstate(next);
    967 
    968 	if (dtrace_vtime_active)
    969 		dtrace_vtime_switch(next);
    970 
    971 	resume_from_zombie(next);
    972 	/*
    973 	 * The TR_RESUME_END and TR_SWTCH_END trace points
    974 	 * appear at the end of resume(), because we certainly will not
    975 	 * return here
    976 	 */
    977 }
    978 
    979 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
    980 
    981 /*
    982  * search_disp_queues()
    983  *	Search the given dispatch queues for thread tp.
    984  *	Return 1 if tp is found, otherwise return 0.
    985  */
    986 static int
    987 search_disp_queues(disp_t *dp, kthread_t *tp)
    988 {
    989 	dispq_t		*dq;
    990 	dispq_t		*eq;
    991 
    992 	disp_lock_enter_high(&dp->disp_lock);
    993 
    994 	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
    995 		kthread_t	*rp;
    996 
    997 		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
    998 
    999 		for (rp = dq->dq_first; rp; rp = rp->t_link)
   1000 			if (tp == rp) {
   1001 				disp_lock_exit_high(&dp->disp_lock);
   1002 				return (1);
   1003 			}
   1004 	}
   1005 	disp_lock_exit_high(&dp->disp_lock);
   1006 
   1007 	return (0);
   1008 }
   1009 
   1010 /*
   1011  * thread_on_queue()
   1012  *	Search all per-CPU dispatch queues and all partition-wide kpreempt
   1013  *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
   1014  */
   1015 static int
   1016 thread_on_queue(kthread_t *tp)
   1017 {
   1018 	cpu_t		*cp;
   1019 	struct cpupart	*part;
   1020 
   1021 	ASSERT(getpil() >= DISP_LEVEL);
   1022 
   1023 	/*
   1024 	 * Search the per-CPU dispatch queues for tp.
   1025 	 */
   1026 	cp = CPU;
   1027 	do {
   1028 		if (search_disp_queues(cp->cpu_disp, tp))
   1029 			return (1);
   1030 	} while ((cp = cp->cpu_next_onln) != CPU);
   1031 
   1032 	/*
   1033 	 * Search the partition-wide kpreempt queues for tp.
   1034 	 */
   1035 	part = CPU->cpu_part;
   1036 	do {
   1037 		if (search_disp_queues(&part->cp_kp_queue, tp))
   1038 			return (1);
   1039 	} while ((part = part->cp_next) != CPU->cpu_part);
   1040 
   1041 	return (0);
   1042 }
   1043 
   1044 #else
   1045 
   1046 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
   1047 
   1048 #endif  /* DEBUG */
   1049 
   1050 /*
   1051  * like swtch(), but switch to a specified thread taken from another CPU.
   1052  *	called with spl high..
   1053  */
   1054 void
   1055 swtch_to(kthread_t *next)
   1056 {
   1057 	cpu_t			*cp = CPU;
   1058 	hrtime_t		now;
   1059 
   1060 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
   1061 
   1062 	/*
   1063 	 * Update context switch statistics.
   1064 	 */
   1065 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
   1066 
   1067 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
   1068 
   1069 	now = gethrtime_unscaled();
   1070 	pg_ev_thread_swtch(cp, now, curthread, next);
   1071 
   1072 	/* OK to steal anything left on run queue */
   1073 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
   1074 
   1075 	/* record last execution time */
   1076 	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
   1077 
   1078 	/*
   1079 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
   1080 	 * won't have set its t_waitrq.  Since we now finally know that we're
   1081 	 * switching away from this thread, set its t_waitrq if it is on a run
   1082 	 * queue.
   1083 	 */
   1084 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
   1085 		curthread->t_waitrq = now;
   1086 	}
   1087 
   1088 	/* restore next thread to previously running microstate */
   1089 	restore_mstate(next);
   1090 
   1091 	if (dtrace_vtime_active)
   1092 		dtrace_vtime_switch(next);
   1093 
   1094 	resume(next);
   1095 	/*
   1096 	 * The TR_RESUME_END and TR_SWTCH_END trace points
   1097 	 * appear at the end of resume(), because we may not
   1098 	 * return here
   1099 	 */
   1100 }
   1101 
   1102 #define	CPU_IDLING(pri)	((pri) == -1)
   1103 
   1104 static void
   1105 cpu_resched(cpu_t *cp, pri_t tpri)
   1106 {
   1107 	int	call_poke_cpu = 0;
   1108 	pri_t   cpupri = cp->cpu_dispatch_pri;
   1109 
   1110 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
   1111 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
   1112 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
   1113 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
   1114 			cp->cpu_runrun = 1;
   1115 			aston(cp->cpu_dispthread);
   1116 			if (tpri < kpreemptpri && cp != CPU)
   1117 				call_poke_cpu = 1;
   1118 		}
   1119 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
   1120 			cp->cpu_kprunrun = 1;
   1121 			if (cp != CPU)
   1122 				call_poke_cpu = 1;
   1123 		}
   1124 	}
   1125 
   1126 	/*
   1127 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
   1128 	 */
   1129 	membar_enter();
   1130 
   1131 	if (call_poke_cpu)
   1132 		poke_cpu(cp->cpu_id);
   1133 }
   1134 
   1135 /*
   1136  * setbackdq() keeps runqs balanced such that the difference in length
   1137  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
   1138  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
   1139  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
   1140  * try to keep runqs perfectly balanced regardless of the thread priority.
   1141  */
   1142 #define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
   1143 #define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
   1144 #define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
   1145 
   /*
    * Macro that evaluates to true if it is likely that the thread has cache
    * warmth. This is based on the amount of time that has elapsed since the
    * thread last ran. If that amount of time is no more than
    * "rechoose_interval" ticks, then we decide that the thread has enough
    * cache warmth to warrant some affinity for t->t_cpu. The current thread
    * is always considered warm.
    *
    * The argument is parenthesized at each use so that any pointer-valued
    * expression may be passed; note that it is evaluated more than once.
    */
   #define	THREAD_HAS_CACHE_WARMTH(thread)	\
   	(((thread) == curthread) ||	\
   	((ddi_get_lbolt() - (thread)->t_disp_time) <= rechoose_interval))
   1156 /*
   1157  * Put the specified thread on the back of the dispatcher
   1158  * queue corresponding to its current priority.
   1159  *
   1160  * Called with the thread in transition, onproc or stopped state
   1161  * and locked (transition implies locked) and at high spl.
   1162  * Returns with the thread in TS_RUN state and still locked.
   1163  */
   1164 void
   1165 setbackdq(kthread_t *tp)
   1166 {
   1167 	dispq_t	*dq;
   1168 	disp_t		*dp;
   1169 	cpu_t		*cp;
   1170 	pri_t		tpri;
   1171 	int		bound;
   1172 	boolean_t	self;
   1173 
   1174 	ASSERT(THREAD_LOCK_HELD(tp));
   1175 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
   1176 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
   1177 
   1178 	/*
   1179 	 * If thread is "swapped" or on the swap queue don't
   1180 	 * queue it, but wake sched.
   1181 	 */
   1182 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
   1183 		disp_swapped_setrun(tp);
   1184 		return;
   1185 	}
   1186 
   1187 	self = (tp == curthread);
   1188 
   1189 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
   1190 		bound = 1;
   1191 	else
   1192 		bound = 0;
   1193 
   1194 	tpri = DISP_PRIO(tp);
   1195 	if (ncpus == 1)
   1196 		cp = tp->t_cpu;
   1197 	else if (!bound) {
   1198 		if (tpri >= kpqpri) {
   1199 			setkpdq(tp, SETKP_BACK);
   1200 			return;
   1201 		}
   1202 
   1203 		/*
   1204 		 * We'll generally let this thread continue to run where
   1205 		 * it last ran...but will consider migration if:
   1206 		 * - We thread probably doesn't have much cache warmth.
   1207 		 * - The CPU where it last ran is the target of an offline
   1208 		 *   request.
   1209 		 * - The thread last ran outside it's home lgroup.
   1210 		 */
   1211 		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
   1212 		    (tp->t_cpu == cpu_inmotion)) {
   1213 			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
   1214 		} else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
   1215 			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
   1216 			    self ? tp->t_cpu : NULL);
   1217 		} else {
   1218 			cp = tp->t_cpu;
   1219 		}
   1220 
   1221 		if (tp->t_cpupart == cp->cpu_part) {
   1222 			int	qlen;
   1223 
   1224 			/*
   1225 			 * Perform any CMT load balancing
   1226 			 */
   1227 			cp = cmt_balance(tp, cp);
   1228 
   1229 			/*
   1230 			 * Balance across the run queues
   1231 			 */
   1232 			qlen = RUNQ_LEN(cp, tpri);
   1233 			if (tpri >= RUNQ_MATCH_PRI &&
   1234 			    !(tp->t_schedflag & TS_RUNQMATCH))
   1235 				qlen -= RUNQ_MAX_DIFF;
   1236 			if (qlen > 0) {
   1237 				cpu_t *newcp;
   1238 
   1239 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
   1240 					newcp = cp->cpu_next_part;
   1241 				} else if ((newcp = cp->cpu_next_lpl) == cp) {
   1242 					newcp = cp->cpu_next_part;
   1243 				}
   1244 
   1245 				if (RUNQ_LEN(newcp, tpri) < qlen) {
   1246 					DTRACE_PROBE3(runq__balance,
   1247 					    kthread_t *, tp,
   1248 					    cpu_t *, cp, cpu_t *, newcp);
   1249 					cp = newcp;
   1250 				}
   1251 			}
   1252 		} else {
   1253 			/*
   1254 			 * Migrate to a cpu in the new partition.
   1255 			 */
   1256 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
   1257 			    tp->t_lpl, tp->t_pri, NULL);
   1258 		}
   1259 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
   1260 	} else {
   1261 		/*
   1262 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
   1263 		 * a short time until weak binding that existed when the
   1264 		 * strong binding was established has dropped) so we must
   1265 		 * favour weak binding over strong.
   1266 		 */
   1267 		cp = tp->t_weakbound_cpu ?
   1268 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
   1269 	}
   1270 	/*
   1271 	 * A thread that is ONPROC may be temporarily placed on the run queue
   1272 	 * but then chosen to run again by disp.  If the thread we're placing on
   1273 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
   1274 	 * replacement process is actually scheduled in swtch().  In this
   1275 	 * situation, curthread is the only thread that could be in the ONPROC
   1276 	 * state.
   1277 	 */
   1278 	if ((!self) && (tp->t_waitrq == 0)) {
   1279 		hrtime_t curtime;
   1280 
   1281 		curtime = gethrtime_unscaled();
   1282 		(void) cpu_update_pct(tp, curtime);
   1283 		tp->t_waitrq = curtime;
   1284 	} else {
   1285 		(void) cpu_update_pct(tp, gethrtime_unscaled());
   1286 	}
   1287 
   1288 	dp = cp->cpu_disp;
   1289 	disp_lock_enter_high(&dp->disp_lock);
   1290 
   1291 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
   1292 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
   1293 	    tpri, cp, tp);
   1294 
   1295 #ifndef NPROBE
   1296 	/* Kernel probe */
   1297 	if (tnf_tracing_active)
   1298 		tnf_thread_queue(tp, cp, tpri);
   1299 #endif /* NPROBE */
   1300 
   1301 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
   1302 
   1303 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
   1304 	tp->t_disp_queue = dp;
   1305 	tp->t_link = NULL;
   1306 
   1307 	dq = &dp->disp_q[tpri];
   1308 	dp->disp_nrunnable++;
   1309 	if (!bound)
   1310 		dp->disp_steal = 0;
   1311 	membar_enter();
   1312 
   1313 	if (dq->dq_sruncnt++ != 0) {
   1314 		ASSERT(dq->dq_first != NULL);
   1315 		dq->dq_last->t_link = tp;
   1316 		dq->dq_last = tp;
   1317 	} else {
   1318 		ASSERT(dq->dq_first == NULL);
   1319 		ASSERT(dq->dq_last == NULL);
   1320 		dq->dq_first = dq->dq_last = tp;
   1321 		BT_SET(dp->disp_qactmap, tpri);
   1322 		if (tpri > dp->disp_maxrunpri) {
   1323 			dp->disp_maxrunpri = tpri;
   1324 			membar_enter();
   1325 			cpu_resched(cp, tpri);
   1326 		}
   1327 	}
   1328 
   1329 	if (!bound && tpri > dp->disp_max_unbound_pri) {
   1330 		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
   1331 			/*
   1332 			 * If there are no other unbound threads on the
   1333 			 * run queue, don't allow other CPUs to steal
   1334 			 * this thread while we are in the middle of a
   1335 			 * context switch. We may just switch to it
   1336 			 * again right away. CPU_DISP_DONTSTEAL is cleared
   1337 			 * in swtch and swtch_to.
   1338 			 */
   1339 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
   1340 		}
   1341 		dp->disp_max_unbound_pri = tpri;
   1342 	}
   1343 	(*disp_enq_thread)(cp, bound);
   1344 }
   1345 
   1346 /*
   1347  * Put the specified thread on the front of the dispatcher
   1348  * queue corresponding to its current priority.
   1349  *
   1350  * Called with the thread in transition, onproc or stopped state
   1351  * and locked (transition implies locked) and at high spl.
   1352  * Returns with the thread in TS_RUN state and still locked.
   1353  */
   1354 void
   1355 setfrontdq(kthread_t *tp)
   1356 {
   1357 	disp_t		*dp;
   1358 	dispq_t		*dq;
   1359 	cpu_t		*cp;
   1360 	pri_t		tpri;
   1361 	int		bound;
   1362 
   1363 	ASSERT(THREAD_LOCK_HELD(tp));
   1364 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
   1365 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
   1366 
   1367 	/*
   1368 	 * If thread is "swapped" or on the swap queue don't
   1369 	 * queue it, but wake sched.
   1370 	 */
   1371 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
   1372 		disp_swapped_setrun(tp);
   1373 		return;
   1374 	}
   1375 
   1376 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
   1377 		bound = 1;
   1378 	else
   1379 		bound = 0;
   1380 
   1381 	tpri = DISP_PRIO(tp);
   1382 	if (ncpus == 1)
   1383 		cp = tp->t_cpu;
   1384 	else if (!bound) {
   1385 		if (tpri >= kpqpri) {
   1386 			setkpdq(tp, SETKP_FRONT);
   1387 			return;
   1388 		}
   1389 		cp = tp->t_cpu;
   1390 		if (tp->t_cpupart == cp->cpu_part) {
   1391 			/*
   1392 			 * We'll generally let this thread continue to run
   1393 			 * where it last ran, but will consider migration if:
   1394 			 * - The thread last ran outside it's home lgroup.
   1395 			 * - The CPU where it last ran is the target of an
   1396 			 *   offline request (a thread_nomigrate() on the in
   1397 			 *   motion CPU relies on this when forcing a preempt).
   1398 			 * - The thread isn't the highest priority thread where
   1399 			 *   it last ran, and it is considered not likely to
   1400 			 *   have significant cache warmth.
   1401 			 */
   1402 			if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
   1403 			    (cp == cpu_inmotion)) {
   1404 				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
   1405 				    (tp == curthread) ? cp : NULL);
   1406 			} else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
   1407 			    (!THREAD_HAS_CACHE_WARMTH(tp))) {
   1408 				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
   1409 				    NULL);
   1410 			}
   1411 		} else {
   1412 			/*
   1413 			 * Migrate to a cpu in the new partition.
   1414 			 */
   1415 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
   1416 			    tp->t_lpl, tp->t_pri, NULL);
   1417 		}
   1418 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
   1419 	} else {
   1420 		/*
   1421 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
   1422 		 * a short time until weak binding that existed when the
   1423 		 * strong binding was established has dropped) so we must
   1424 		 * favour weak binding over strong.
   1425 		 */
   1426 		cp = tp->t_weakbound_cpu ?
   1427 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
   1428 	}
   1429 
   1430 	/*
   1431 	 * A thread that is ONPROC may be temporarily placed on the run queue
   1432 	 * but then chosen to run again by disp.  If the thread we're placing on
   1433 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
   1434 	 * replacement process is actually scheduled in swtch().  In this
   1435 	 * situation, curthread is the only thread that could be in the ONPROC
   1436 	 * state.
   1437 	 */
   1438 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
   1439 		hrtime_t curtime;
   1440 
   1441 		curtime = gethrtime_unscaled();
   1442 		(void) cpu_update_pct(tp, curtime);
   1443 		tp->t_waitrq = curtime;
   1444 	} else {
   1445 		(void) cpu_update_pct(tp, gethrtime_unscaled());
   1446 	}
   1447 
   1448 	dp = cp->cpu_disp;
   1449 	disp_lock_enter_high(&dp->disp_lock);
   1450 
   1451 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
   1452 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
   1453 
   1454 #ifndef NPROBE
   1455 	/* Kernel probe */
   1456 	if (tnf_tracing_active)
   1457 		tnf_thread_queue(tp, cp, tpri);
   1458 #endif /* NPROBE */
   1459 
   1460 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
   1461 
   1462 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
   1463 	tp->t_disp_queue = dp;
   1464 
   1465 	dq = &dp->disp_q[tpri];
   1466 	dp->disp_nrunnable++;
   1467 	if (!bound)
   1468 		dp->disp_steal = 0;
   1469 	membar_enter();
   1470 
   1471 	if (dq->dq_sruncnt++ != 0) {
   1472 		ASSERT(dq->dq_last != NULL);
   1473 		tp->t_link = dq->dq_first;
   1474 		dq->dq_first = tp;
   1475 	} else {
   1476 		ASSERT(dq->dq_last == NULL);
   1477 		ASSERT(dq->dq_first == NULL);
   1478 		tp->t_link = NULL;
   1479 		dq->dq_first = dq->dq_last = tp;
   1480 		BT_SET(dp->disp_qactmap, tpri);
   1481 		if (tpri > dp->disp_maxrunpri) {
   1482 			dp->disp_maxrunpri = tpri;
   1483 			membar_enter();
   1484 			cpu_resched(cp, tpri);
   1485 		}
   1486 	}
   1487 
   1488 	if (!bound && tpri > dp->disp_max_unbound_pri) {
   1489 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
   1490 		    cp == CPU) {
   1491 			/*
   1492 			 * If there are no other unbound threads on the
   1493 			 * run queue, don't allow other CPUs to steal
   1494 			 * this thread while we are in the middle of a
   1495 			 * context switch. We may just switch to it
   1496 			 * again right away. CPU_DISP_DONTSTEAL is cleared
   1497 			 * in swtch and swtch_to.
   1498 			 */
   1499 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
   1500 		}
   1501 		dp->disp_max_unbound_pri = tpri;
   1502 	}
   1503 	(*disp_enq_thread)(cp, bound);
   1504 }
   1505 
   1506 /*
   1507  * Put a high-priority unbound thread on the kp queue
   1508  */
   1509 static void
   1510 setkpdq(kthread_t *tp, int borf)
   1511 {
   1512 	dispq_t	*dq;
   1513 	disp_t	*dp;
   1514 	cpu_t	*cp;
   1515 	pri_t	tpri;
   1516 
   1517 	tpri = DISP_PRIO(tp);
   1518 
   1519 	dp = &tp->t_cpupart->cp_kp_queue;
   1520 	disp_lock_enter_high(&dp->disp_lock);
   1521 
   1522 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
   1523 
   1524 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
   1525 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
   1526 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
   1527 	tp->t_disp_queue = dp;
   1528 	dp->disp_nrunnable++;
   1529 	dq = &dp->disp_q[tpri];
   1530 
   1531 	if (dq->dq_sruncnt++ != 0) {
   1532 		if (borf == SETKP_BACK) {
   1533 			ASSERT(dq->dq_first != NULL);
   1534 			tp->t_link = NULL;
   1535 			dq->dq_last->t_link = tp;
   1536 			dq->dq_last = tp;
   1537 		} else {
   1538 			ASSERT(dq->dq_last != NULL);
   1539 			tp->t_link = dq->dq_first;
   1540 			dq->dq_first = tp;
   1541 		}
   1542 	} else {
   1543 		if (borf == SETKP_BACK) {
   1544 			ASSERT(dq->dq_first == NULL);
   1545 			ASSERT(dq->dq_last == NULL);
   1546 			dq->dq_first = dq->dq_last = tp;
   1547 		} else {
   1548 			ASSERT(dq->dq_last == NULL);
   1549 			ASSERT(dq->dq_first == NULL);
   1550 			tp->t_link = NULL;
   1551 			dq->dq_first = dq->dq_last = tp;
   1552 		}
   1553 		BT_SET(dp->disp_qactmap, tpri);
   1554 		if (tpri > dp->disp_max_unbound_pri)
   1555 			dp->disp_max_unbound_pri = tpri;
   1556 		if (tpri > dp->disp_maxrunpri) {
   1557 			dp->disp_maxrunpri = tpri;
   1558 			membar_enter();
   1559 		}
   1560 	}
   1561 
   1562 	cp = tp->t_cpu;
   1563 	if (tp->t_cpupart != cp->cpu_part) {
   1564 		/* migrate to a cpu in the new partition */
   1565 		cp = tp->t_cpupart->cp_cpulist;
   1566 	}
   1567 	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
   1568 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
   1569 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
   1570 
   1571 #ifndef NPROBE
   1572 	/* Kernel probe */
   1573 	if (tnf_tracing_active)
   1574 		tnf_thread_queue(tp, cp, tpri);
   1575 #endif /* NPROBE */
   1576 
   1577 	if (cp->cpu_chosen_level < tpri)
   1578 		cp->cpu_chosen_level = tpri;
   1579 	cpu_resched(cp, tpri);
   1580 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
   1581 	(*disp_enq_thread)(cp, 0);
   1582 }
   1583 
   1584 /*
   1585  * Remove a thread from the dispatcher queue if it is on it.
   1586  * It is not an error if it is not found but we return whether
   1587  * or not it was found in case the caller wants to check.
   1588  */
   1589 int
   1590 dispdeq(kthread_t *tp)
   1591 {
   1592 	disp_t		*dp;
   1593 	dispq_t		*dq;
   1594 	kthread_t	*rp;
   1595 	kthread_t	*trp;
   1596 	kthread_t	**ptp;
   1597 	int		tpri;
   1598 
   1599 	ASSERT(THREAD_LOCK_HELD(tp));
   1600 
   1601 	if (tp->t_state != TS_RUN)
   1602 		return (0);
   1603 
   1604 	/*
   1605 	 * The thread is "swapped" or is on the swap queue and
   1606 	 * hence no longer on the run queue, so return true.
   1607 	 */
   1608 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
   1609 		return (1);
   1610 
   1611 	tpri = DISP_PRIO(tp);
   1612 	dp = tp->t_disp_queue;
   1613 	ASSERT(tpri < dp->disp_npri);
   1614 	dq = &dp->disp_q[tpri];
   1615 	ptp = &dq->dq_first;
   1616 	rp = *ptp;
   1617 	trp = NULL;
   1618 
   1619 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
   1620 
   1621 	/*
   1622 	 * Search for thread in queue.
   1623 	 * Double links would simplify this at the expense of disp/setrun.
   1624 	 */
   1625 	while (rp != tp && rp != NULL) {
   1626 		trp = rp;
   1627 		ptp = &trp->t_link;
   1628 		rp = trp->t_link;
   1629 	}
   1630 
   1631 	if (rp == NULL) {
   1632 		panic("dispdeq: thread not on queue");
   1633 	}
   1634 
   1635 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
   1636 
   1637 	/*
   1638 	 * Found it so remove it from queue.
   1639 	 */
   1640 	if ((*ptp = rp->t_link) == NULL)
   1641 		dq->dq_last = trp;
   1642 
   1643 	dp->disp_nrunnable--;
   1644 	if (--dq->dq_sruncnt == 0) {
   1645 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
   1646 		if (dp->disp_nrunnable == 0) {
   1647 			dp->disp_max_unbound_pri = -1;
   1648 			dp->disp_maxrunpri = -1;
   1649 		} else if (tpri == dp->disp_maxrunpri) {
   1650 			int ipri;
   1651 
   1652 			ipri = bt_gethighbit(dp->disp_qactmap,
   1653 			    dp->disp_maxrunpri >> BT_ULSHIFT);
   1654 			if (ipri < dp->disp_max_unbound_pri)
   1655 				dp->disp_max_unbound_pri = ipri;
   1656 			dp->disp_maxrunpri = ipri;
   1657 		}
   1658 	}
   1659 	tp->t_link = NULL;
   1660 	THREAD_TRANSITION(tp);		/* put in intermediate state */
   1661 	return (1);
   1662 }
   1663 
   1664 
   1665 /*
   1666  * dq_sruninc and dq_srundec are public functions for
   1667  * incrementing/decrementing the sruncnts when a thread on
   1668  * a dispatcher queue is made schedulable/unschedulable by
   1669  * resetting the TS_LOAD flag.
   1670  *
   1671  * The caller MUST have the thread lock and therefore the dispatcher
   1672  * queue lock so that the operation which changes
   1673  * the flag, the operation that checks the status of the thread to
   1674  * determine if it's on a disp queue AND the call to this function
   1675  * are one atomic operation with respect to interrupts.
   1676  */
   1677 
   1678 /*
   1679  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
   1680  */
   1681 void
   1682 dq_sruninc(kthread_t *t)
   1683 {
   1684 	ASSERT(t->t_state == TS_RUN);
   1685 	ASSERT(t->t_schedflag & TS_LOAD);
   1686 
   1687 	THREAD_TRANSITION(t);
   1688 	setfrontdq(t);
   1689 }
   1690 
   1691 /*
   1692  * See comment on calling conventions above.
   1693  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
   1694  */
   1695 void
   1696 dq_srundec(kthread_t *t)
   1697 {
   1698 	ASSERT(t->t_schedflag & TS_LOAD);
   1699 
   1700 	(void) dispdeq(t);
   1701 	disp_swapped_enq(t);
   1702 }
   1703 
   1704 /*
   1705  * Change the dispatcher lock of thread to the "swapped_lock"
   1706  * and return with thread lock still held.
   1707  *
   1708  * Called with thread_lock held, in transition state, and at high spl.
   1709  */
   1710 void
   1711 disp_swapped_enq(kthread_t *tp)
   1712 {
   1713 	ASSERT(THREAD_LOCK_HELD(tp));
   1714 	ASSERT(tp->t_schedflag & TS_LOAD);
   1715 
   1716 	switch (tp->t_state) {
   1717 	case TS_RUN:
   1718 		disp_lock_enter_high(&swapped_lock);
   1719 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
   1720 		break;
   1721 	case TS_ONPROC:
   1722 		disp_lock_enter_high(&swapped_lock);
   1723 		THREAD_TRANSITION(tp);
   1724 		wake_sched_sec = 1;		/* tell clock to wake sched */
   1725 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
   1726 		break;
   1727 	default:
   1728 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
   1729 	}
   1730 }
   1731 
   1732 /*
   1733  * This routine is called by setbackdq/setfrontdq if the thread is
   1734  * not loaded or loaded and on the swap queue.
   1735  *
   1736  * Thread state TS_SLEEP implies that a swapped thread
   1737  * has been woken up and needs to be swapped in by the swapper.
   1738  *
   1739  * Thread state TS_RUN, it implies that the priority of a swapped
   1740  * thread is being increased by scheduling class (e.g. ts_update).
   1741  */
   1742 static void
   1743 disp_swapped_setrun(kthread_t *tp)
   1744 {
   1745 	ASSERT(THREAD_LOCK_HELD(tp));
   1746 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
   1747 
   1748 	switch (tp->t_state) {
   1749 	case TS_SLEEP:
   1750 		disp_lock_enter_high(&swapped_lock);
   1751 		/*
   1752 		 * Wakeup sched immediately (i.e., next tick) if the
   1753 		 * thread priority is above maxclsyspri.
   1754 		 */
   1755 		if (DISP_PRIO(tp) > maxclsyspri)
   1756 			wake_sched = 1;
   1757 		else
   1758 			wake_sched_sec = 1;
   1759 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
   1760 		break;
   1761 	case TS_RUN:				/* called from ts_update */
   1762 		break;
   1763 	default:
   1764 		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
   1765 	}
   1766 }
   1767 
   1768 
   1769 /*
   1770  *	Make a thread give up its processor.  Find the processor on
   1771  *	which this thread is executing, and have that processor
   1772  *	preempt.
   1773  */
   1774 void
   1775 cpu_surrender(kthread_t *tp)
   1776 {
   1777 	cpu_t	*cpup;
   1778 	int	max_pri;
   1779 	int	max_run_pri;
   1780 	klwp_t	*lwp;
   1781 
   1782 	ASSERT(THREAD_LOCK_HELD(tp));
   1783 
   1784 	if (tp->t_state != TS_ONPROC)
   1785 		return;
   1786 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
   1787 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
   1788 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
   1789 	if (max_pri < max_run_pri)
   1790 		max_pri = max_run_pri;
   1791 
   1792 	cpup->cpu_runrun = 1;
   1793 	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
   1794 		cpup->cpu_kprunrun = 1;
   1795 	}
   1796 
   1797 	/*
   1798 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
   1799 	 */
   1800 	membar_enter();
   1801 
   1802 	DTRACE_SCHED1(surrender, kthread_t *, tp);
   1803 
   1804 	/*
   1805 	 * Make the target thread take an excursion through trap()
   1806 	 * to do preempt() (unless we're already in trap or post_syscall,
   1807 	 * calling cpu_surrender via CL_TRAPRET).
   1808 	 */
   1809 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
   1810 	    lwp->lwp_state != LWP_USER) {
   1811 		aston(tp);
   1812 		if (cpup != CPU)
   1813 			poke_cpu(cpup->cpu_id);
   1814 	}
   1815 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
   1816 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
   1817 }
   1818 
   1819 
   1820 /*
   1821  * Commit to and ratify a scheduling decision
   1822  */
   1823 /*ARGSUSED*/
   1824 static kthread_t *
   1825 disp_ratify(kthread_t *tp, disp_t *kpq)
   1826 {
   1827 	pri_t	tpri, maxpri;
   1828 	pri_t	maxkpri;
   1829 	cpu_t	*cpup;
   1830 
   1831 	ASSERT(tp != NULL);
   1832 	/*
   1833 	 * Commit to, then ratify scheduling decision
   1834 	 */
   1835 	cpup = CPU;
   1836 	if (cpup->cpu_runrun != 0)
   1837 		cpup->cpu_runrun = 0;
   1838 	if (cpup->cpu_kprunrun != 0)
   1839 		cpup->cpu_kprunrun = 0;
   1840 	if (cpup->cpu_chosen_level != -1)
   1841 		cpup->cpu_chosen_level = -1;
   1842 	membar_enter();
   1843 	tpri = DISP_PRIO(tp);
   1844 	maxpri = cpup->cpu_disp->disp_maxrunpri;
   1845 	maxkpri = kpq->disp_maxrunpri;
   1846 	if (maxpri < maxkpri)
   1847 		maxpri = maxkpri;
   1848 	if (tpri < maxpri) {
   1849 		/*
   1850 		 * should have done better
   1851 		 * put this one back and indicate to try again
   1852 		 */
   1853 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
   1854 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
   1855 		thread_lock_high(tp);
   1856 		THREAD_TRANSITION(tp);
   1857 		setfrontdq(tp);
   1858 		thread_unlock_nopreempt(tp);
   1859 
   1860 		tp = NULL;
   1861 	}
   1862 	return (tp);
   1863 }
   1864 
   1865 /*
   1866  * See if there is any work on the dispatcher queue for other CPUs.
   1867  * If there is, dequeue the best thread and return.
   1868  */
   1869 static kthread_t *
   1870 disp_getwork(cpu_t *cp)
   1871 {
   1872 	cpu_t		*ocp;		/* other CPU */
   1873 	cpu_t		*ocp_start;
   1874 	cpu_t		*tcp;		/* target local CPU */
   1875 	kthread_t	*tp;
   1876 	kthread_t	*retval = NULL;
   1877 	pri_t		maxpri;
   1878 	disp_t		*kpq;		/* kp queue for this partition */
   1879 	lpl_t		*lpl, *lpl_leaf;
   1880 	int		leafidx, startidx;
   1881 	hrtime_t	stealtime;
   1882 	lgrp_id_t	local_id;
   1883 
   1884 	maxpri = -1;
   1885 	tcp = NULL;
   1886 
   1887 	kpq = &cp->cpu_part->cp_kp_queue;
   1888 	while (kpq->disp_maxrunpri >= 0) {
   1889 		/*
   1890 		 * Try to take a thread from the kp_queue.
   1891 		 */
   1892 		tp = (disp_getbest(kpq));
   1893 		if (tp)
   1894 			return (disp_ratify(tp, kpq));
   1895 	}
   1896 
   1897 	kpreempt_disable();		/* protect the cpu_active list */
   1898 
   1899 	/*
   1900 	 * Try to find something to do on another CPU's run queue.
   1901 	 * Loop through all other CPUs looking for the one with the highest
   1902 	 * priority unbound thread.
   1903 	 *
   1904 	 * On NUMA machines, the partition's CPUs are consulted in order of
   1905 	 * distance from the current CPU. This way, the first available
   1906 	 * work found is also the closest, and will suffer the least
   1907 	 * from being migrated.
   1908 	 */
   1909 	lpl = lpl_leaf = cp->cpu_lpl;
   1910 	local_id = lpl_leaf->lpl_lgrpid;
   1911 	leafidx = startidx = 0;
   1912 
   1913 	/*
   1914 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
   1915 	 * broader levels of locality
   1916 	 */
   1917 	do {
   1918 		/* This loop iterates over the lpl's leaves */
   1919 		do {
   1920 			if (lpl_leaf != cp->cpu_lpl)
   1921 				ocp = lpl_leaf->lpl_cpus;
   1922 			else
   1923 				ocp = cp->cpu_next_lpl;
   1924 
   1925 			/* This loop iterates over the CPUs in the leaf */
   1926 			ocp_start = ocp;
   1927 			do {
   1928 				pri_t pri;
   1929 
   1930 				ASSERT(CPU_ACTIVE(ocp));
   1931 
   1932 				/*
   1933 				 * End our stroll around this lpl if:
   1934 				 *
   1935 				 * - Something became runnable on the local
   1936 				 *   queue...which also ends our stroll around
   1937 				 *   the partition.
   1938 				 *
   1939 				 * - We happen across another idle CPU.
   1940 				 *   Since it is patrolling the next portion
   1941 				 *   of the lpl's list (assuming it's not
   1942 				 *   halted, or busy servicing an interrupt),
   1943 				 *   move to the next higher level of locality.
   1944 				 */
   1945 				if (cp->cpu_disp->disp_nrunnable != 0) {
   1946 					kpreempt_enable();
   1947 					return (NULL);
   1948 				}
   1949 				if (ocp->cpu_dispatch_pri == -1) {
   1950 					if (ocp->cpu_disp_flags &
   1951 					    CPU_DISP_HALTED ||
   1952 					    ocp->cpu_intr_actv != 0)
   1953 						continue;
   1954 					else
   1955 						goto next_level;
   1956 				}
   1957 
   1958 				/*
   1959 				 * If there's only one thread and the CPU
   1960 				 * is in the middle of a context switch,
   1961 				 * or it's currently running the idle thread,
   1962 				 * don't steal it.
   1963 				 */
   1964 				if ((ocp->cpu_disp_flags &
   1965 				    CPU_DISP_DONTSTEAL) &&
   1966 				    ocp->cpu_disp->disp_nrunnable == 1)
   1967 					continue;
   1968 
   1969 				pri = ocp->cpu_disp->disp_max_unbound_pri;
   1970 				if (pri > maxpri) {
   1971 					/*
   1972 					 * Don't steal threads that we attempted
   1973 					 * to steal recently until they're ready
   1974 					 * to be stolen again.
   1975 					 */
   1976 					stealtime = ocp->cpu_disp->disp_steal;
   1977 					if (stealtime == 0 ||
   1978 					    stealtime - gethrtime() <= 0) {
   1979 						maxpri = pri;
   1980 						tcp = ocp;
   1981 					} else {
   1982 						/*
   1983 						 * Don't update tcp, just set
   1984 						 * the retval to T_DONTSTEAL, so
   1985 						 * that if no acceptable CPUs
   1986 						 * are found the return value
   1987 						 * will be T_DONTSTEAL rather
   1988 						 * then NULL.
   1989 						 */
   1990 						retval = T_DONTSTEAL;
   1991 					}
   1992 				}
   1993 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
   1994 
   1995 			/*
   1996 			 * Iterate to the next leaf lpl in the resource set
   1997 			 * at this level of locality. If we hit the end of
   1998 			 * the set, wrap back around to the beginning.
   1999 			 *
   2000 			 * Note: This iteration is NULL terminated for a reason
   2001 			 * see lpl_topo_bootstrap() in lgrp.c for details.
   2002 			 */
   2003 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
   2004 				leafidx = 0;
   2005 				lpl_leaf = lpl->lpl_rset[leafidx];
   2006 			}
   2007 		} while (leafidx != startidx);
   2008 
   2009 next_level:
   2010 		/*
   2011 		 * Expand the search to include farther away CPUs (next
   2012 		 * locality level). The closer CPUs that have already been
   2013 		 * checked will be checked again. In doing so, idle CPUs
   2014 		 * will tend to be more aggresive about stealing from CPUs
   2015 		 * that are closer (since the closer CPUs will be considered
   2016 		 * more often).
   2017 		 * Begin at this level with the CPUs local leaf lpl.
   2018 		 */
   2019 		if ((lpl = lpl->lpl_parent) != NULL) {
   2020 			leafidx = startidx = lpl->lpl_id2rset[local_id];
   2021 			lpl_leaf = lpl->lpl_rset[leafidx];
   2022 		}
   2023 	} while (!tcp && lpl);
   2024 
   2025 	kpreempt_enable();
   2026 
   2027 	/*
   2028 	 * If another queue looks good, and there is still nothing on
   2029 	 * the local queue, try to transfer one or more threads
   2030 	 * from it to our queue.
   2031 	 */
   2032 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
   2033 		tp = disp_getbest(tcp->cpu_disp);
   2034 		if (tp == NULL || tp == T_DONTSTEAL)
   2035 			return (tp);
   2036 		return (disp_ratify(tp, kpq));
   2037 	}
   2038 	return (retval);
   2039 }
   2040 
   2041 
   2042 /*
   2043  * disp_fix_unbound_pri()
   2044  *	Determines the maximum priority of unbound threads on the queue.
   2045  *	The priority is kept for the queue, but is only increased, never
   2046  *	reduced unless some CPU is looking for something on that queue.
   2047  *
   2048  *	The priority argument is the known upper limit.
   2049  *
   2050  *	Perhaps this should be kept accurately, but that probably means
   2051  *	separate bitmaps for bound and unbound threads.  Since only idled
   2052  *	CPUs will have to do this recalculation, it seems better this way.
   2053  */
   2054 static void
   2055 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
   2056 {
   2057 	kthread_t	*tp;
   2058 	dispq_t		*dq;
   2059 	ulong_t		*dqactmap = dp->disp_qactmap;
   2060 	ulong_t		mapword;
   2061 	int		wx;
   2062 
   2063 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
   2064 
   2065 	ASSERT(pri >= 0);			/* checked by caller */
   2066 
   2067 	/*
   2068 	 * Start the search at the next lowest priority below the supplied
   2069 	 * priority.  This depends on the bitmap implementation.
   2070 	 */
   2071 	do {
   2072 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
   2073 
   2074 		/*
   2075 		 * Form mask for all lower priorities in the word.
   2076 		 */
   2077 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
   2078 
   2079 		/*
   2080 		 * Get next lower active priority.
   2081 		 */
   2082 		if (mapword != 0) {
   2083 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
   2084 		} else if (wx > 0) {
   2085 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
   2086 			if (pri < 0)
   2087 				break;
   2088 		} else {
   2089 			pri = -1;
   2090 			break;
   2091 		}
   2092 
   2093 		/*
   2094 		 * Search the queue for unbound, runnable threads.
   2095 		 */
   2096 		dq = &dp->disp_q[pri];
   2097 		tp = dq->dq_first;
   2098 
   2099 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
   2100 			tp = tp->t_link;
   2101 		}
   2102 
   2103 		/*
   2104 		 * If a thread was found, set the priority and return.
   2105 		 */
   2106 	} while (tp == NULL);
   2107 
   2108 	/*
   2109 	 * pri holds the maximum unbound thread priority or -1.
   2110 	 */
   2111 	if (dp->disp_max_unbound_pri != pri)
   2112 		dp->disp_max_unbound_pri = pri;
   2113 }
   2114 
   2115 /*
   2116  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
   2117  * 	check if the CPU to which is was previously bound should have
   2118  * 	its disp_max_unbound_pri increased.
   2119  */
   2120 void
   2121 disp_adjust_unbound_pri(kthread_t *tp)
   2122 {
   2123 	disp_t *dp;
   2124 	pri_t tpri;
   2125 
   2126 	ASSERT(THREAD_LOCK_HELD(tp));
   2127 
   2128 	/*
   2129 	 * Don't do anything if the thread is not bound, or
   2130 	 * currently not runnable or swapped out.
   2131 	 */
   2132 	if (tp->t_bound_cpu == NULL ||
   2133 	    tp->t_state != TS_RUN ||
   2134 	    tp->t_schedflag & TS_ON_SWAPQ)
   2135 		return;
   2136 
   2137 	tpri = DISP_PRIO(tp);
   2138 	dp = tp->t_bound_cpu->cpu_disp;
   2139 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
   2140 	if (tpri > dp->disp_max_unbound_pri)
   2141 		dp->disp_max_unbound_pri = tpri;
   2142 }
   2143 
   2144 /*
   2145  * disp_getbest()
   2146  *   De-queue the highest priority unbound runnable thread.
   2147  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
   2148  *   Returns NULL if nothing found.
   2149  *   Returns T_DONTSTEAL if the thread was not stealable.
   2150  *   so that the caller will try again later.
   2151  *
   2152  *   Passed a pointer to a dispatch queue not associated with this CPU, and
   2153  *   its type.
   2154  */
   2155 static kthread_t *
   2156 disp_getbest(disp_t *dp)
   2157 {
   2158 	kthread_t	*tp;
   2159 	dispq_t		*dq;
   2160 	pri_t		pri;
   2161 	cpu_t		*cp, *tcp;
   2162 	boolean_t	allbound;
   2163 
   2164 	disp_lock_enter(&dp->disp_lock);
   2165 
   2166 	/*
   2167 	 * If there is nothing to run, or the CPU is in the middle of a
   2168 	 * context switch of the only thread, return NULL.
   2169 	 */
   2170 	tcp = dp->disp_cpu;
   2171 	cp = CPU;
   2172 	pri = dp->disp_max_unbound_pri;
   2173 	if (pri == -1 ||
   2174 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
   2175 	    tcp->cpu_disp->disp_nrunnable == 1)) {
   2176 		disp_lock_exit_nopreempt(&dp->disp_lock);
   2177 		return (NULL);
   2178 	}
   2179 
   2180 	dq = &dp->disp_q[pri];
   2181 
   2182 
   2183 	/*
   2184 	 * Assume that all threads are bound on this queue, and change it
   2185 	 * later when we find out that it is not the case.
   2186 	 */
   2187 	allbound = B_TRUE;
   2188 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
   2189 		hrtime_t now, nosteal, rqtime;
   2190 
   2191 		/*
   2192 		 * Skip over bound threads which could be here even
   2193 		 * though disp_max_unbound_pri indicated this level.
   2194 		 */
   2195 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
   2196 			continue;
   2197 
   2198 		/*
   2199 		 * We've got some unbound threads on this queue, so turn
   2200 		 * the allbound flag off now.
   2201 		 */
   2202 		allbound = B_FALSE;
   2203 
   2204 		/*
   2205 		 * The thread is a candidate for stealing from its run queue. We
   2206 		 * don't want to steal threads that became runnable just a
   2207 		 * moment ago. This improves CPU affinity for threads that get
   2208 		 * preempted for short periods of time and go back on the run
   2209 		 * queue.
   2210 		 *
   2211 		 * We want to let it stay on its run queue if it was only placed
   2212 		 * there recently and it was running on the same CPU before that
   2213 		 * to preserve its cache investment. For the thread to remain on
   2214 		 * its run queue, ALL of the following conditions must be
   2215 		 * satisfied:
   2216 		 *
   2217 		 * - the disp queue should not be the kernel preemption queue
   2218 		 * - delayed idle stealing should not be disabled
   2219 		 * - nosteal_nsec should be non-zero
   2220 		 * - it should run with user priority
   2221 		 * - it should be on the run queue of the CPU where it was
   2222 		 *   running before being placed on the run queue
   2223 		 * - it should be the only thread on the run queue (to prevent
   2224 		 *   extra scheduling latency for other threads)
   2225 		 * - it should sit on the run queue for less than per-chip
   2226 		 *   nosteal interval or global nosteal interval
   2227 		 * - in case of CPUs with shared cache it should sit in a run
   2228 		 *   queue of a CPU from a different chip
   2229 		 *
   2230 		 * The checks are arranged so that the ones that are faster are
   2231 		 * placed earlier.
   2232 		 */
   2233 		if (tcp == NULL ||
   2234 		    pri >= minclsyspri ||
   2235 		    tp->t_cpu != tcp)
   2236 			break;
   2237 
   2238 		/*
   2239 		 * Steal immediately if, due to CMT processor architecture
   2240 		 * migraiton between cp and tcp would incur no performance
   2241 		 * penalty.
   2242 		 */
   2243 		if (pg_cmt_can_migrate(cp, tcp))
   2244 			break;
   2245 
   2246 		nosteal = nosteal_nsec;
   2247 		if (nosteal == 0)
   2248 			break;
   2249 
   2250 		/*
   2251 		 * Calculate time spent sitting on run queue
   2252 		 */
   2253 		now = gethrtime_unscaled();
   2254 		rqtime = now - tp->t_waitrq;
   2255 		scalehrtime(&rqtime);
   2256 
   2257 		/*
   2258 		 * Steal immediately if the time spent on this run queue is more
   2259 		 * than allowed nosteal delay.
   2260 		 *
   2261 		 * Negative rqtime check is needed here to avoid infinite
   2262 		 * stealing delays caused by unlikely but not impossible
   2263 		 * drifts between CPU times on different CPUs.
   2264 		 */
   2265 		if (rqtime > nosteal || rqtime < 0)
   2266 			break;
   2267 
   2268 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
   2269 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
   2270 		scalehrtime(&now);
   2271 		/*
   2272 		 * Calculate when this thread becomes stealable
   2273 		 */
   2274 		now += (nosteal - rqtime);
   2275 
   2276 		/*
   2277 		 * Calculate time when some thread becomes stealable
   2278 		 */
   2279 		if (now < dp->disp_steal)
   2280 			dp->disp_steal = now;
   2281 	}
   2282 
   2283 	/*
   2284 	 * If there were no unbound threads on this queue, find the queue
   2285 	 * where they are and then return later. The value of
   2286 	 * disp_max_unbound_pri is not always accurate because it isn't
   2287 	 * reduced until another idle CPU looks for work.
   2288 	 */
   2289 	if (allbound)
   2290 		disp_fix_unbound_pri(dp, pri);
   2291 
   2292 	/*
   2293 	 * If we reached the end of the queue and found no unbound threads
   2294 	 * then return NULL so that other CPUs will be considered.  If there
   2295 	 * are unbound threads but they cannot yet be stolen, then
   2296 	 * return T_DONTSTEAL and try again later.
   2297 	 */
   2298 	if (tp == NULL) {
   2299 		disp_lock_exit_nopreempt(&dp->disp_lock);
   2300 		return (allbound ? NULL : T_DONTSTEAL);
   2301 	}
   2302 
   2303 	/*
   2304 	 * Found a runnable, unbound thread, so remove it from queue.
   2305 	 * dispdeq() requires that we have the thread locked, and we do,
   2306 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
   2307 	 * put the thread in transition state, thereby dropping the dispq
   2308 	 * lock.
   2309 	 */
   2310 
   2311 #ifdef DEBUG
   2312 	{
   2313 		int	thread_was_on_queue;
   2314 
   2315 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
   2316 		ASSERT(thread_was_on_queue);
   2317 	}
   2318 
   2319 #else /* DEBUG */
   2320 	(void) dispdeq(tp);			/* drops disp_lock */
   2321 #endif /* DEBUG */
   2322 
   2323 	/*
   2324 	 * Reset the disp_queue steal time - we do not know what is the smallest
   2325 	 * value across the queue is.
   2326 	 */
   2327 	dp->disp_steal = 0;
   2328 
   2329 	tp->t_schedflag |= TS_DONT_SWAP;
   2330 
   2331 	/*
   2332 	 * Setup thread to run on the current CPU.
   2333 	 */
   2334 	tp->t_disp_queue = cp->cpu_disp;
   2335 
   2336 	cp->cpu_dispthread = tp;		/* protected by spl only */
   2337 	cp->cpu_dispatch_pri = pri;
   2338 
   2339 	/*
   2340 	 * There can be a memory synchronization race between disp_getbest()
   2341 	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
   2342 	 * to preempt the current thread to run the enqueued thread while
   2343 	 * disp_getbest() and disp_ratify() are changing the current thread
   2344 	 * to the stolen thread. This may lead to a situation where
   2345 	 * cpu_resched() tries to preempt the wrong thread and the
   2346 	 * stolen thread continues to run on the CPU which has been tagged
   2347 	 * for preemption.
   2348 	 * Later the clock thread gets enqueued but doesn't get to run on the
   2349 	 * CPU causing the system to hang.
   2350 	 *
   2351 	 * To avoid this, grabbing and dropping the disp_lock (which does
   2352 	 * a memory barrier) is needed to synchronize the execution of
   2353 	 * cpu_resched() with disp_getbest() and disp_ratify() and
   2354 	 * synchronize the memory read and written by cpu_resched(),
   2355 	 * disp_getbest(), and disp_ratify() with each other.
   2356 	 *  (see CR#6482861 for more details).
   2357 	 */
   2358 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
   2359 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
   2360 
   2361 	ASSERT(pri == DISP_PRIO(tp));
   2362 
   2363 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
   2364 
   2365 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
   2366 
   2367 	/*
   2368 	 * Return with spl high so that swtch() won't need to raise it.
   2369 	 * The disp_lock was dropped by dispdeq().
   2370 	 */
   2371 
   2372 	return (tp);
   2373 }
   2374 
   2375 /*
   2376  * disp_bound_common() - common routine for higher level functions
   2377  *	that check for bound threads under certain conditions.
   2378  *	If 'threadlistsafe' is set then there is no need to acquire
   2379  *	pidlock to stop the thread list from changing (eg, if
   2380  *	disp_bound_* is called with cpus paused).
   2381  */
   2382 static int
   2383 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
   2384 {
   2385 	int		found = 0;
   2386 	kthread_t	*tp;
   2387 
   2388 	ASSERT(flag);
   2389 
   2390 	if (!threadlistsafe)
   2391 		mutex_enter(&pidlock);
   2392 	tp = curthread;		/* faster than allthreads */
   2393 	do {
   2394 		if (tp->t_state != TS_FREE) {
   2395 			/*
   2396 			 * If an interrupt thread is busy, but the
   2397 			 * caller doesn't care (i.e. BOUND_INTR is off),
   2398 			 * then just ignore it and continue through.
   2399 			 */
   2400 			if ((tp->t_flag & T_INTR_THREAD) &&
   2401 			    !(flag & BOUND_INTR))
   2402 				continue;
   2403 
   2404 			/*
   2405 			 * Skip the idle thread for the CPU
   2406 			 * we're about to set offline.
   2407 			 */
   2408 			if (tp == cp->cpu_idle_thread)
   2409 				continue;
   2410 
   2411 			/*
   2412 			 * Skip the pause thread for the CPU
   2413 			 * we're about to set offline.
   2414 			 */
   2415 			if (tp == cp->cpu_pause_thread)
   2416 				continue;
   2417 
   2418 			if ((flag & BOUND_CPU) &&
   2419 			    (tp->t_bound_cpu == cp ||
   2420 			    tp->t_bind_cpu == cp->cpu_id ||
   2421 			    tp->t_weakbound_cpu == cp)) {
   2422 				found = 1;
   2423 				break;
   2424 			}
   2425 
   2426 			if ((flag & BOUND_PARTITION) &&
   2427 			    (tp->t_cpupart == cp->cpu_part)) {
   2428 				found = 1;
   2429 				break;
   2430 			}
   2431 		}
   2432 	} while ((tp = tp->t_next) != curthread && found == 0);
   2433 	if (!threadlistsafe)
   2434 		mutex_exit(&pidlock);
   2435 	return (found);
   2436 }
   2437 
   2438 /*
   2439  * disp_bound_threads - return nonzero if threads are bound to the processor.
   2440  *	Called infrequently.  Keep this simple.
   2441  *	Includes threads that are asleep or stopped but not onproc.
   2442  */
   2443 int
   2444 disp_bound_threads(cpu_t *cp, int threadlistsafe)
   2445 {
   2446 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
   2447 }
   2448 
   2449 /*
   2450  * disp_bound_anythreads - return nonzero if _any_ threads are bound
   2451  * to the given processor, including interrupt threads.
   2452  */
   2453 int
   2454 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
   2455 {
   2456 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
   2457 }
   2458 
   2459 /*
   2460  * disp_bound_partition - return nonzero if threads are bound to the same
   2461  * partition as the processor.
   2462  *	Called infrequently.  Keep this simple.
   2463  *	Includes threads that are asleep or stopped but not onproc.
   2464  */
   2465 int
   2466 disp_bound_partition(cpu_t *cp, int threadlistsafe)
   2467 {
   2468 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
   2469 }
   2470 
   2471 /*
   2472  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
   2473  * threads to other CPUs.
   2474  */
   2475 void
   2476 disp_cpu_inactive(cpu_t *cp)
   2477 {
   2478 	kthread_t	*tp;
   2479 	disp_t		*dp = cp->cpu_disp;
   2480 	dispq_t		*dq;
   2481 	pri_t		pri;
   2482 	int		wasonq;
   2483 
   2484 	disp_lock_enter(&dp->disp_lock);
   2485 	while ((pri = dp->disp_max_unbound_pri) != -1) {
   2486 		dq = &dp->disp_q[pri];
   2487 		tp = dq->dq_first;
   2488 
   2489 		/*
   2490 		 * Skip over bound threads.
   2491 		 */
   2492 		while (tp != NULL && tp->t_bound_cpu != NULL) {
   2493 			tp = tp->t_link;
   2494 		}
   2495 
   2496 		if (tp == NULL) {
   2497 			/* disp_max_unbound_pri must be inaccurate, so fix it */
   2498 			disp_fix_unbound_pri(dp, pri);
   2499 			continue;
   2500 		}
   2501 
   2502 		wasonq = dispdeq(tp);		/* drops disp_lock */
   2503 		ASSERT(wasonq);
   2504 		ASSERT(tp->t_weakbound_cpu == NULL);
   2505 
   2506 		setbackdq(tp);
   2507 		/*
   2508 		 * Called from cpu_offline:
   2509 		 *
   2510 		 * cp has already been removed from the list of active cpus
   2511 		 * and tp->t_cpu has been changed so there is no risk of
   2512 		 * tp ending up back on cp.
   2513 		 *
   2514 		 * Called from cpupart_move_cpu:
   2515 		 *
   2516 		 * The cpu has moved to a new cpupart.  Any threads that
   2517 		 * were on it's dispatch queues before the move remain
   2518 		 * in the old partition and can't run in the new partition.
   2519 		 */
   2520 		ASSERT(tp->t_cpu != cp);
   2521 		thread_unlock(tp);
   2522 
   2523 		disp_lock_enter(&dp->disp_lock);
   2524 	}
   2525 	disp_lock_exit(&dp->disp_lock);
   2526 }
   2527 
   2528 /*
   2529  * disp_lowpri_cpu - find CPU running the lowest priority thread.
   2530  *	The hint passed in is used as a starting point so we don't favor
   2531  *	CPU 0 or any other CPU.  The caller should pass in the most recently
   2532  *	used CPU for the thread.
   2533  *
   2534  *	The lgroup and priority are used to determine the best CPU to run on
   2535  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
   2536  *	the thread priority will indicate whether the thread will actually run
   2537  *	there.  To pick the best CPU, the CPUs inside and outside of the given
   2538  *	lgroup which are running the lowest priority threads are found.  The
   2539  *	remote CPU is chosen only if the thread will not run locally on a CPU
   2540  *	within the lgroup, but will run on the remote CPU. If the thread
   2541  *	cannot immediately run on any CPU, the best local CPU will be chosen.
   2542  *
   2543  *	The lpl specified also identifies the cpu partition from which
   2544  *	disp_lowpri_cpu should select a CPU.
   2545  *
   2546  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
   2547  *      behalf of the current thread. (curthread is looking for a new cpu)
   2548  *      In this case, cpu_dispatch_pri for this thread's cpu should be
   2549  *      ignored.
   2550  *
   2551  *      If a cpu is the target of an offline request then try to avoid it.
   2552  *
   2553  *	This function must be called at either high SPL, or with preemption
   2554  *	disabled, so that the "hint" CPU cannot be removed from the online
   2555  *	CPU list while we are traversing it.
   2556  */
   2557 cpu_t *
   2558 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
   2559 {
   2560 	cpu_t	*bestcpu;
   2561 	cpu_t	*besthomecpu;
   2562 	cpu_t   *cp, *cpstart;
   2563 
   2564 	pri_t   bestpri;
   2565 	pri_t   cpupri;
   2566 
   2567 	klgrpset_t	done;
   2568 	klgrpset_t	cur_set;
   2569 
   2570 	lpl_t		*lpl_iter, *lpl_leaf;
   2571 	int		i;
   2572 
   2573 	/*
   2574 	 * Scan for a CPU currently running the lowest priority thread.
   2575 	 * Cannot get cpu_lock here because it is adaptive.
   2576 	 * We do not require lock on CPU list.
   2577 	 */
   2578 	ASSERT(hint != NULL);
   2579 	ASSERT(lpl != NULL);
   2580 	ASSERT(lpl->lpl_ncpu > 0);
   2581 
   2582 	/*
   2583 	 * First examine local CPUs. Note that it's possible the hint CPU
   2584 	 * passed in in remote to the specified home lgroup. If our priority
   2585 	 * isn't sufficient enough such that we can run immediately at home,
   2586 	 * then examine CPUs remote to our home lgroup.
   2587 	 * We would like to give preference to CPUs closest to "home".
   2588 	 * If we can't find a CPU where we'll run at a given level
   2589 	 * of locality, we expand our search to include the next level.
   2590 	 */
   2591 	bestcpu = besthomecpu = NULL;
   2592 	klgrpset_clear(done);
   2593 	/* start with lpl we were passed */
   2594 
   2595 	lpl_iter = lpl;
   2596 
   2597 	do {
   2598 
   2599 		bestpri = SHRT_MAX;
   2600 		klgrpset_clear(cur_set);
   2601 
   2602 		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
   2603 			lpl_leaf = lpl_iter->lpl_rset[i];
   2604 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
   2605 				continue;
   2606 
   2607 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
   2608 
   2609 			if (hint->cpu_lpl == lpl_leaf)
   2610 				cp = cpstart = hint;
   2611 			else
   2612 				cp = cpstart = lpl_leaf->lpl_cpus;
   2613 
   2614 			do {
   2615 				if (cp == curcpu)
   2616 					cpupri = -1;
   2617 				else if (cp == cpu_inmotion)
   2618 					cpupri = SHRT_MAX;
   2619 				else
   2620 					cpupri = cp->cpu_dispatch_pri;
   2621 				if (cp->cpu_disp->disp_maxrunpri > cpupri)
   2622 					cpupri = cp->cpu_disp->disp_maxrunpri;
   2623 				if (cp->cpu_chosen_level > cpupri)
   2624 					cpupri = cp->cpu_chosen_level;
   2625 				if (cpupri < bestpri) {
   2626 					if (CPU_IDLING(cpupri)) {
   2627 						ASSERT((cp->cpu_flags &
   2628 						    CPU_QUIESCED) == 0);
   2629 						return (cp);
   2630 					}
   2631 					bestcpu = cp;
   2632 					bestpri = cpupri;
   2633 				}
   2634 			} while ((cp = cp->cpu_next_lpl) != cpstart);
   2635 		}
   2636 
   2637 		if (bestcpu && (tpri > bestpri)) {
   2638 			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
   2639 			return (bestcpu);
   2640 		}
   2641 		if (besthomecpu == NULL)
   2642 			besthomecpu = bestcpu;
   2643 		/*
   2644 		 * Add the lgrps we just considered to the "done" set
   2645 		 */
   2646 		klgrpset_or(done, cur_set);
   2647 
   2648 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
   2649 
   2650 	/*
   2651 	 * The specified priority isn't high enough to run immediately
   2652 	 * anywhere, so just return the best CPU from the home lgroup.
   2653 	 */
   2654 	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
   2655 	return (besthomecpu);
   2656 }
   2657 
   2658 /*
   2659  * This routine provides the generic idle cpu function for all processors.
   2660  * If a processor has some specific code to execute when idle (say, to stop
   2661  * the pipeline and save power) then that routine should be defined in the
   2662  * processors specific code (module_xx.c) and the global variable idle_cpu
   2663  * set to that function.
   2664  */
   2665 static void
   2666 generic_idle_cpu(void)
   2667 {
   2668 }
   2669 
   2670 /*ARGSUSED*/
   2671 static void
   2672 generic_enq_thread(cpu_t *cpu, int bound)
   2673 {
   2674 }
   2675