1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 30 #include <sys/types.h> 31 #include <sys/param.h> 32 #include <sys/sysmacros.h> 33 #include <sys/signal.h> 34 #include <sys/user.h> 35 #include <sys/systm.h> 36 #include <sys/sysinfo.h> 37 #include <sys/var.h> 38 #include <sys/errno.h> 39 #include <sys/cmn_err.h> 40 #include <sys/debug.h> 41 #include <sys/inline.h> 42 #include <sys/disp.h> 43 #include <sys/class.h> 44 #include <sys/bitmap.h> 45 #include <sys/kmem.h> 46 #include <sys/cpuvar.h> 47 #include <sys/vtrace.h> 48 #include <sys/tnf.h> 49 #include <sys/cpupart.h> 50 #include <sys/lgrp.h> 51 #include <sys/pg.h> 52 #include <sys/cmt.h> 53 #include <sys/bitset.h> 54 #include <sys/schedctl.h> 55 #include <sys/atomic.h> 56 #include <sys/dtrace.h> 57 #include <sys/sdt.h> 58 #include <sys/archsystm.h> 59 60 #include <vm/as.h> 61 62 #define BOUND_CPU 0x1 63 #define BOUND_PARTITION 0x2 64 #define BOUND_INTR 0x4 65 66 /* Dispatch queue allocation structure and functions */ 67 struct disp_queue_info { 68 disp_t *dp; 69 dispq_t *olddispq; 70 dispq_t *newdispq; 71 ulong_t *olddqactmap; 72 ulong_t *newdqactmap; 73 int oldnglobpris; 74 }; 75 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris, 76 disp_t *dp); 77 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris); 78 static void disp_dq_free(struct disp_queue_info *dptr); 79 80 /* platform-specific routine to call when processor is idle */ 81 static void generic_idle_cpu(); 82 void (*idle_cpu)() = generic_idle_cpu; 83 84 /* routines invoked when a CPU enters/exits the idle loop */ 85 static void idle_enter(); 86 static void idle_exit(); 87 88 /* platform-specific routine to call when thread is enqueued */ 89 static void generic_enq_thread(cpu_t *, int); 90 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread; 91 92 pri_t kpreemptpri; /* priority where kernel preemption applies */ 93 pri_t upreemptpri = 0; /* priority where normal preemption applies */ 94 pri_t intr_pri; /* interrupt thread priority base level */ 95 96 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */ 97 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */ 98 disp_t cpu0_disp; /* boot CPU's dispatch queue */ 99 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */ 100 int nswapped; /* total number of swapped threads */ 101 void disp_swapped_enq(kthread_t *tp); 102 static void disp_swapped_setrun(kthread_t *tp); 103 static void cpu_resched(cpu_t *cp, pri_t tpri); 104 105 /* 106 * If this is set, only interrupt threads will cause kernel preemptions. 107 * This is done by changing the value of kpreemptpri. kpreemptpri 108 * will either be the max sysclass pri + 1 or the min interrupt pri. 109 */ 110 int only_intr_kpreempt; 111 112 extern void set_idle_cpu(int cpun); 113 extern void unset_idle_cpu(int cpun); 114 static void setkpdq(kthread_t *tp, int borf); 115 #define SETKP_BACK 0 116 #define SETKP_FRONT 1 117 /* 118 * Parameter that determines how recently a thread must have run 119 * on the CPU to be considered loosely-bound to that CPU to reduce 120 * cold cache effects. The interval is in hertz. 121 */ 122 #define RECHOOSE_INTERVAL 3 123 int rechoose_interval = RECHOOSE_INTERVAL; 124 static cpu_t *cpu_choose(kthread_t *, pri_t); 125 126 /* 127 * Parameter that determines how long (in nanoseconds) a thread must 128 * be sitting on a run queue before it can be stolen by another CPU 129 * to reduce migrations. The interval is in nanoseconds. 130 * 131 * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval() 132 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED 133 * here indicating it is uninitiallized. 134 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'. 135 * 136 */ 137 #define NOSTEAL_UNINITIALIZED (-1) 138 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED; 139 extern void cmp_set_nosteal_interval(void); 140 141 id_t defaultcid; /* system "default" class; see dispadmin(1M) */ 142 143 disp_lock_t transition_lock; /* lock on transitioning threads */ 144 disp_lock_t stop_lock; /* lock on stopped threads */ 145 146 static void cpu_dispqalloc(int numpris); 147 148 /* 149 * This gets returned by disp_getwork/disp_getbest if we couldn't steal 150 * a thread because it was sitting on its run queue for a very short 151 * period of time. 152 */ 153 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */ 154 155 static kthread_t *disp_getwork(cpu_t *to); 156 static kthread_t *disp_getbest(disp_t *from); 157 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq); 158 159 void swtch_to(kthread_t *); 160 161 /* 162 * dispatcher and scheduler initialization 163 */ 164 165 /* 166 * disp_setup - Common code to calculate and allocate dispatcher 167 * variables and structures based on the maximum priority. 168 */ 169 static void 170 disp_setup(pri_t maxglobpri, pri_t oldnglobpris) 171 { 172 pri_t newnglobpris; 173 174 ASSERT(MUTEX_HELD(&cpu_lock)); 175 176 newnglobpris = maxglobpri + 1 + LOCK_LEVEL; 177 178 if (newnglobpris > oldnglobpris) { 179 /* 180 * Allocate new kp queues for each CPU partition. 181 */ 182 cpupart_kpqalloc(newnglobpris); 183 184 /* 185 * Allocate new dispatch queues for each CPU. 186 */ 187 cpu_dispqalloc(newnglobpris); 188 189 /* 190 * compute new interrupt thread base priority 191 */ 192 intr_pri = maxglobpri; 193 if (only_intr_kpreempt) { 194 kpreemptpri = intr_pri + 1; 195 if (kpqpri == KPQPRI) 196 kpqpri = kpreemptpri; 197 } 198 v.v_nglobpris = newnglobpris; 199 } 200 } 201 202 /* 203 * dispinit - Called to initialize all loaded classes and the 204 * dispatcher framework. 205 */ 206 void 207 dispinit(void) 208 { 209 id_t cid; 210 pri_t maxglobpri; 211 pri_t cl_maxglobpri; 212 213 maxglobpri = -1; 214 215 /* 216 * Initialize transition lock, which will always be set. 217 */ 218 DISP_LOCK_INIT(&transition_lock); 219 disp_lock_enter_high(&transition_lock); 220 DISP_LOCK_INIT(&stop_lock); 221 222 mutex_enter(&cpu_lock); 223 CPU->cpu_disp->disp_maxrunpri = -1; 224 CPU->cpu_disp->disp_max_unbound_pri = -1; 225 226 /* 227 * Initialize the default CPU partition. 228 */ 229 cpupart_initialize_default(); 230 /* 231 * Call the class specific initialization functions for 232 * all pre-installed schedulers. 233 * 234 * We pass the size of a class specific parameter 235 * buffer to each of the initialization functions 236 * to try to catch problems with backward compatibility 237 * of class modules. 238 * 239 * For example a new class module running on an old system 240 * which didn't provide sufficiently large parameter buffers 241 * would be bad news. Class initialization modules can check for 242 * this and take action if they detect a problem. 243 */ 244 245 for (cid = 0; cid < nclass; cid++) { 246 sclass_t *sc; 247 248 sc = &sclass[cid]; 249 if (SCHED_INSTALLED(sc)) { 250 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ, 251 &sc->cl_funcs); 252 if (cl_maxglobpri > maxglobpri) 253 maxglobpri = cl_maxglobpri; 254 } 255 } 256 kpreemptpri = (pri_t)v.v_maxsyspri + 1; 257 if (kpqpri == KPQPRI) 258 kpqpri = kpreemptpri; 259 260 ASSERT(maxglobpri >= 0); 261 disp_setup(maxglobpri, 0); 262 263 mutex_exit(&cpu_lock); 264 265 /* 266 * Platform specific sticky scheduler setup. 267 */ 268 if (nosteal_nsec == NOSTEAL_UNINITIALIZED) 269 cmp_set_nosteal_interval(); 270 271 /* 272 * Get the default class ID; this may be later modified via 273 * dispadmin(1M). This will load the class (normally TS) and that will 274 * call disp_add(), which is why we had to drop cpu_lock first. 275 */ 276 if (getcid(defaultclass, &defaultcid) != 0) { 277 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'", 278 defaultclass); 279 } 280 } 281 282 /* 283 * disp_add - Called with class pointer to initialize the dispatcher 284 * for a newly loaded class. 285 */ 286 void 287 disp_add(sclass_t *clp) 288 { 289 pri_t maxglobpri; 290 pri_t cl_maxglobpri; 291 292 mutex_enter(&cpu_lock); 293 /* 294 * Initialize the scheduler class. 295 */ 296 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1); 297 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs); 298 if (cl_maxglobpri > maxglobpri) 299 maxglobpri = cl_maxglobpri; 300 301 /* 302 * Save old queue information. Since we're initializing a 303 * new scheduling class which has just been loaded, then 304 * the size of the dispq may have changed. We need to handle 305 * that here. 306 */ 307 disp_setup(maxglobpri, v.v_nglobpris); 308 309 mutex_exit(&cpu_lock); 310 } 311 312 313 /* 314 * For each CPU, allocate new dispatch queues 315 * with the stated number of priorities. 316 */ 317 static void 318 cpu_dispqalloc(int numpris) 319 { 320 cpu_t *cpup; 321 struct disp_queue_info *disp_mem; 322 int i, num; 323 324 ASSERT(MUTEX_HELD(&cpu_lock)); 325 326 disp_mem = kmem_zalloc(NCPU * 327 sizeof (struct disp_queue_info), KM_SLEEP); 328 329 /* 330 * This routine must allocate all of the memory before stopping 331 * the cpus because it must not sleep in kmem_alloc while the 332 * CPUs are stopped. Locks they hold will not be freed until they 333 * are restarted. 334 */ 335 i = 0; 336 cpup = cpu_list; 337 do { 338 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp); 339 i++; 340 cpup = cpup->cpu_next; 341 } while (cpup != cpu_list); 342 num = i; 343 344 pause_cpus(NULL); 345 for (i = 0; i < num; i++) 346 disp_dq_assign(&disp_mem[i], numpris); 347 start_cpus(); 348 349 /* 350 * I must free all of the memory after starting the cpus because 351 * I can not risk sleeping in kmem_free while the cpus are stopped. 352 */ 353 for (i = 0; i < num; i++) 354 disp_dq_free(&disp_mem[i]); 355 356 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info)); 357 } 358 359 static void 360 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp) 361 { 362 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP); 363 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) * 364 sizeof (long), KM_SLEEP); 365 dptr->dp = dp; 366 } 367 368 static void 369 disp_dq_assign(struct disp_queue_info *dptr, int numpris) 370 { 371 disp_t *dp; 372 373 dp = dptr->dp; 374 dptr->olddispq = dp->disp_q; 375 dptr->olddqactmap = dp->disp_qactmap; 376 dptr->oldnglobpris = dp->disp_npri; 377 378 ASSERT(dptr->oldnglobpris < numpris); 379 380 if (dptr->olddispq != NULL) { 381 /* 382 * Use kcopy because bcopy is platform-specific 383 * and could block while we might have paused the cpus. 384 */ 385 (void) kcopy(dptr->olddispq, dptr->newdispq, 386 dptr->oldnglobpris * sizeof (dispq_t)); 387 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap, 388 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * 389 sizeof (long)); 390 } 391 dp->disp_q = dptr->newdispq; 392 dp->disp_qactmap = dptr->newdqactmap; 393 dp->disp_q_limit = &dptr->newdispq[numpris]; 394 dp->disp_npri = numpris; 395 } 396 397 static void 398 disp_dq_free(struct disp_queue_info *dptr) 399 { 400 if (dptr->olddispq != NULL) 401 kmem_free(dptr->olddispq, 402 dptr->oldnglobpris * sizeof (dispq_t)); 403 if (dptr->olddqactmap != NULL) 404 kmem_free(dptr->olddqactmap, 405 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long)); 406 } 407 408 /* 409 * For a newly created CPU, initialize the dispatch queue. 410 * This is called before the CPU is known through cpu[] or on any lists. 411 */ 412 void 413 disp_cpu_init(cpu_t *cp) 414 { 415 disp_t *dp; 416 dispq_t *newdispq; 417 ulong_t *newdqactmap; 418 419 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */ 420 421 if (cp == cpu0_disp.disp_cpu) 422 dp = &cpu0_disp; 423 else 424 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP); 425 bzero(dp, sizeof (disp_t)); 426 cp->cpu_disp = dp; 427 dp->disp_cpu = cp; 428 dp->disp_maxrunpri = -1; 429 dp->disp_max_unbound_pri = -1; 430 DISP_LOCK_INIT(&cp->cpu_thread_lock); 431 /* 432 * Allocate memory for the dispatcher queue headers 433 * and the active queue bitmap. 434 */ 435 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP); 436 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) * 437 sizeof (long), KM_SLEEP); 438 dp->disp_q = newdispq; 439 dp->disp_qactmap = newdqactmap; 440 dp->disp_q_limit = &newdispq[v.v_nglobpris]; 441 dp->disp_npri = v.v_nglobpris; 442 } 443 444 void 445 disp_cpu_fini(cpu_t *cp) 446 { 447 ASSERT(MUTEX_HELD(&cpu_lock)); 448 449 disp_kp_free(cp->cpu_disp); 450 if (cp->cpu_disp != &cpu0_disp) 451 kmem_free(cp->cpu_disp, sizeof (disp_t)); 452 } 453 454 /* 455 * Allocate new, larger kpreempt dispatch queue to replace the old one. 456 */ 457 void 458 disp_kp_alloc(disp_t *dq, pri_t npri) 459 { 460 struct disp_queue_info mem_info; 461 462 if (npri > dq->disp_npri) { 463 /* 464 * Allocate memory for the new array. 465 */ 466 disp_dq_alloc(&mem_info, npri, dq); 467 468 /* 469 * We need to copy the old structures to the new 470 * and free the old. 471 */ 472 disp_dq_assign(&mem_info, npri); 473 disp_dq_free(&mem_info); 474 } 475 } 476 477 /* 478 * Free dispatch queue. 479 * Used for the kpreempt queues for a removed CPU partition and 480 * for the per-CPU queues of deleted CPUs. 481 */ 482 void 483 disp_kp_free(disp_t *dq) 484 { 485 struct disp_queue_info mem_info; 486 487 mem_info.olddispq = dq->disp_q; 488 mem_info.olddqactmap = dq->disp_qactmap; 489 mem_info.oldnglobpris = dq->disp_npri; 490 disp_dq_free(&mem_info); 491 } 492 493 /* 494 * End dispatcher and scheduler initialization. 495 */ 496 497 /* 498 * See if there's anything to do other than remain idle. 499 * Return non-zero if there is. 500 * 501 * This function must be called with high spl, or with 502 * kernel preemption disabled to prevent the partition's 503 * active cpu list from changing while being traversed. 504 * 505 */ 506 int 507 disp_anywork(void) 508 { 509 cpu_t *cp = CPU; 510 cpu_t *ocp; 511 512 if (cp->cpu_disp->disp_nrunnable != 0) 513 return (1); 514 515 if (!(cp->cpu_flags & CPU_OFFLINE)) { 516 if (CP_MAXRUNPRI(cp->cpu_part) >= 0) 517 return (1); 518 519 /* 520 * Work can be taken from another CPU if: 521 * - There is unbound work on the run queue 522 * - That work isn't a thread undergoing a 523 * - context switch on an otherwise empty queue. 524 * - The CPU isn't running the idle loop. 525 */ 526 for (ocp = cp->cpu_next_part; ocp != cp; 527 ocp = ocp->cpu_next_part) { 528 ASSERT(CPU_ACTIVE(ocp)); 529 530 if (ocp->cpu_disp->disp_max_unbound_pri != -1 && 531 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) && 532 ocp->cpu_disp->disp_nrunnable == 1) && 533 ocp->cpu_dispatch_pri != -1) 534 return (1); 535 } 536 } 537 return (0); 538 } 539 540 /* 541 * Called when CPU enters the idle loop 542 */ 543 static void 544 idle_enter() 545 { 546 cpu_t *cp = CPU; 547 548 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled()); 549 CPU_STATS_ADDQ(cp, sys, idlethread, 1); 550 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */ 551 } 552 553 /* 554 * Called when CPU exits the idle loop 555 */ 556 static void 557 idle_exit() 558 { 559 cpu_t *cp = CPU; 560 561 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled()); 562 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */ 563 } 564 565 /* 566 * Idle loop. 567 */ 568 void 569 idle() 570 { 571 struct cpu *cp = CPU; /* pointer to this CPU */ 572 kthread_t *t; /* taken thread */ 573 574 idle_enter(); 575 576 /* 577 * Uniprocessor version of idle loop. 578 * Do this until notified that we're on an actual multiprocessor. 579 */ 580 while (ncpus == 1) { 581 if (cp->cpu_disp->disp_nrunnable == 0) { 582 (*idle_cpu)(); 583 continue; 584 } 585 idle_exit(); 586 swtch(); 587 588 idle_enter(); /* returned from swtch */ 589 } 590 591 /* 592 * Multiprocessor idle loop. 593 */ 594 for (;;) { 595 /* 596 * If CPU is completely quiesced by p_online(2), just wait 597 * here with minimal bus traffic until put online. 598 */ 599 while (cp->cpu_flags & CPU_QUIESCED) 600 (*idle_cpu)(); 601 602 if (cp->cpu_disp->disp_nrunnable != 0) { 603 idle_exit(); 604 swtch(); 605 } else { 606 if (cp->cpu_flags & CPU_OFFLINE) 607 continue; 608 if ((t = disp_getwork(cp)) == NULL) { 609 if (cp->cpu_chosen_level != -1) { 610 disp_t *dp = cp->cpu_disp; 611 disp_t *kpq; 612 613 disp_lock_enter(&dp->disp_lock); 614 /* 615 * Set kpq under lock to prevent 616 * migration between partitions. 617 */ 618 kpq = &cp->cpu_part->cp_kp_queue; 619 if (kpq->disp_maxrunpri == -1) 620 cp->cpu_chosen_level = -1; 621 disp_lock_exit(&dp->disp_lock); 622 } 623 (*idle_cpu)(); 624 continue; 625 } 626 /* 627 * If there was a thread but we couldn't steal 628 * it, then keep trying. 629 */ 630 if (t == T_DONTSTEAL) 631 continue; 632 idle_exit(); 633 swtch_to(t); 634 } 635 idle_enter(); /* returned from swtch/swtch_to */ 636 } 637 } 638 639 640 /* 641 * Preempt the currently running thread in favor of the highest 642 * priority thread. The class of the current thread controls 643 * where it goes on the dispatcher queues. If panicking, turn 644 * preemption off. 645 */ 646 void 647 preempt() 648 { 649 kthread_t *t = curthread; 650 klwp_t *lwp = ttolwp(curthread); 651 652 if (panicstr) 653 return; 654 655 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start"); 656 657 thread_lock(t); 658 659 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) { 660 /* 661 * this thread has already been chosen to be run on 662 * another CPU. Clear kprunrun on this CPU since we're 663 * already headed for swtch(). 664 */ 665 CPU->cpu_kprunrun = 0; 666 thread_unlock_nopreempt(t); 667 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end"); 668 } else { 669 if (lwp != NULL) 670 lwp->lwp_ru.nivcsw++; 671 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1); 672 THREAD_TRANSITION(t); 673 CL_PREEMPT(t); 674 DTRACE_SCHED(preempt); 675 thread_unlock_nopreempt(t); 676 677 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end"); 678 679 swtch(); /* clears CPU->cpu_runrun via disp() */ 680 } 681 } 682 683 extern kthread_t *thread_unpin(); 684 685 /* 686 * disp() - find the highest priority thread for this processor to run, and 687 * set it in TS_ONPROC state so that resume() can be called to run it. 688 */ 689 static kthread_t * 690 disp() 691 { 692 cpu_t *cpup; 693 disp_t *dp; 694 kthread_t *tp; 695 dispq_t *dq; 696 int maxrunword; 697 pri_t pri; 698 disp_t *kpq; 699 700 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start"); 701 702 cpup = CPU; 703 /* 704 * Find the highest priority loaded, runnable thread. 705 */ 706 dp = cpup->cpu_disp; 707 708 reschedule: 709 /* 710 * If there is more important work on the global queue with a better 711 * priority than the maximum on this CPU, take it now. 712 */ 713 kpq = &cpup->cpu_part->cp_kp_queue; 714 while ((pri = kpq->disp_maxrunpri) >= 0 && 715 pri >= dp->disp_maxrunpri && 716 (cpup->cpu_flags & CPU_OFFLINE) == 0 && 717 (tp = disp_getbest(kpq)) != NULL) { 718 if (disp_ratify(tp, kpq) != NULL) { 719 TRACE_1(TR_FAC_DISP, TR_DISP_END, 720 "disp_end:tid %p", tp); 721 return (tp); 722 } 723 } 724 725 disp_lock_enter(&dp->disp_lock); 726 pri = dp->disp_maxrunpri; 727 728 /* 729 * If there is nothing to run, look at what's runnable on other queues. 730 * Choose the idle thread if the CPU is quiesced. 731 * Note that CPUs that have the CPU_OFFLINE flag set can still run 732 * interrupt threads, which will be the only threads on the CPU's own 733 * queue, but cannot run threads from other queues. 734 */ 735 if (pri == -1) { 736 if (!(cpup->cpu_flags & CPU_OFFLINE)) { 737 disp_lock_exit(&dp->disp_lock); 738 if ((tp = disp_getwork(cpup)) == NULL || 739 tp == T_DONTSTEAL) { 740 tp = cpup->cpu_idle_thread; 741 (void) splhigh(); 742 THREAD_ONPROC(tp, cpup); 743 cpup->cpu_dispthread = tp; 744 cpup->cpu_dispatch_pri = -1; 745 cpup->cpu_runrun = cpup->cpu_kprunrun = 0; 746 cpup->cpu_chosen_level = -1; 747 } 748 } else { 749 disp_lock_exit_high(&dp->disp_lock); 750 tp = cpup->cpu_idle_thread; 751 THREAD_ONPROC(tp, cpup); 752 cpup->cpu_dispthread = tp; 753 cpup->cpu_dispatch_pri = -1; 754 cpup->cpu_runrun = cpup->cpu_kprunrun = 0; 755 cpup->cpu_chosen_level = -1; 756 } 757 TRACE_1(TR_FAC_DISP, TR_DISP_END, 758 "disp_end:tid %p", tp); 759 return (tp); 760 } 761 762 dq = &dp->disp_q[pri]; 763 tp = dq->dq_first; 764 765 ASSERT(tp != NULL); 766 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */ 767 768 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp); 769 770 /* 771 * Found it so remove it from queue. 772 */ 773 dp->disp_nrunnable--; 774 dq->dq_sruncnt--; 775 if ((dq->dq_first = tp->t_link) == NULL) { 776 ulong_t *dqactmap = dp->disp_qactmap; 777 778 ASSERT(dq->dq_sruncnt == 0); 779 dq->dq_last = NULL; 780 781 /* 782 * The queue is empty, so the corresponding bit needs to be 783 * turned off in dqactmap. If nrunnable != 0 just took the 784 * last runnable thread off the 785 * highest queue, so recompute disp_maxrunpri. 786 */ 787 maxrunword = pri >> BT_ULSHIFT; 788 dqactmap[maxrunword] &= ~BT_BIW(pri); 789 790 if (dp->disp_nrunnable == 0) { 791 dp->disp_max_unbound_pri = -1; 792 dp->disp_maxrunpri = -1; 793 } else { 794 int ipri; 795 796 ipri = bt_gethighbit(dqactmap, maxrunword); 797 dp->disp_maxrunpri = ipri; 798 if (ipri < dp->disp_max_unbound_pri) 799 dp->disp_max_unbound_pri = ipri; 800 } 801 } else { 802 tp->t_link = NULL; 803 } 804 805 /* 806 * Set TS_DONT_SWAP flag to prevent another processor from swapping 807 * out this thread before we have a chance to run it. 808 * While running, it is protected against swapping by t_lock. 809 */ 810 tp->t_schedflag |= TS_DONT_SWAP; 811 cpup->cpu_dispthread = tp; /* protected by spl only */ 812 cpup->cpu_dispatch_pri = pri; 813 ASSERT(pri == DISP_PRIO(tp)); 814 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */ 815 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */ 816 817 ASSERT(tp != NULL); 818 TRACE_1(TR_FAC_DISP, TR_DISP_END, 819 "disp_end:tid %p", tp); 820 821 if (disp_ratify(tp, kpq) == NULL) 822 goto reschedule; 823 824 return (tp); 825 } 826 827 /* 828 * swtch() 829 * Find best runnable thread and run it. 830 * Called with the current thread already switched to a new state, 831 * on a sleep queue, run queue, stopped, and not zombied. 832 * May be called at any spl level less than or equal to LOCK_LEVEL. 833 * Always drops spl to the base level (spl0()). 834 */ 835 void 836 swtch() 837 { 838 kthread_t *t = curthread; 839 kthread_t *next; 840 cpu_t *cp; 841 842 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 843 844 if (t->t_flag & T_INTR_THREAD) 845 cpu_intr_swtch_enter(t); 846 847 if (t->t_intr != NULL) { 848 /* 849 * We are an interrupt thread. Setup and return 850 * the interrupted thread to be resumed. 851 */ 852 (void) splhigh(); /* block other scheduler action */ 853 cp = CPU; /* now protected against migration */ 854 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */ 855 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 856 CPU_STATS_ADDQ(cp, sys, intrblk, 1); 857 next = thread_unpin(); 858 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 859 resume_from_intr(next); 860 } else { 861 #ifdef DEBUG 862 if (t->t_state == TS_ONPROC && 863 t->t_disp_queue->disp_cpu == CPU && 864 t->t_preempt == 0) { 865 thread_lock(t); 866 ASSERT(t->t_state != TS_ONPROC || 867 t->t_disp_queue->disp_cpu != CPU || 868 t->t_preempt != 0); /* cannot migrate */ 869 thread_unlock_nopreempt(t); 870 } 871 #endif /* DEBUG */ 872 cp = CPU; 873 next = disp(); /* returns with spl high */ 874 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */ 875 876 /* OK to steal anything left on run queue */ 877 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; 878 879 if (next != t) { 880 if (t == cp->cpu_idle_thread) { 881 PG_NRUN_UPDATE(cp, 1); 882 } else if (next == cp->cpu_idle_thread) { 883 PG_NRUN_UPDATE(cp, -1); 884 } 885 886 /* 887 * If t was previously in the TS_ONPROC state, 888 * setfrontdq and setbackdq won't have set its t_waitrq. 889 * Since we now finally know that we're switching away 890 * from this thread, set its t_waitrq if it is on a run 891 * queue. 892 */ 893 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) { 894 t->t_waitrq = gethrtime_unscaled(); 895 } 896 897 /* 898 * restore mstate of thread that we are switching to 899 */ 900 restore_mstate(next); 901 902 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 903 cp->cpu_last_swtch = t->t_disp_time = lbolt; 904 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 905 906 if (dtrace_vtime_active) 907 dtrace_vtime_switch(next); 908 909 resume(next); 910 /* 911 * The TR_RESUME_END and TR_SWTCH_END trace points 912 * appear at the end of resume(), because we may not 913 * return here 914 */ 915 } else { 916 if (t->t_flag & T_INTR_THREAD) 917 cpu_intr_swtch_exit(t); 918 919 DTRACE_SCHED(remain__cpu); 920 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end"); 921 (void) spl0(); 922 } 923 } 924 } 925 926 /* 927 * swtch_from_zombie() 928 * Special case of swtch(), which allows checks for TS_ZOMB to be 929 * eliminated from normal resume. 930 * Find best runnable thread and run it. 931 * Called with the current thread zombied. 932 * Zombies cannot migrate, so CPU references are safe. 933 */ 934 void 935 swtch_from_zombie() 936 { 937 kthread_t *next; 938 cpu_t *cpu = CPU; 939 940 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 941 942 ASSERT(curthread->t_state == TS_ZOMB); 943 944 next = disp(); /* returns with spl high */ 945 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */ 946 CPU_STATS_ADDQ(CPU, sys, pswitch, 1); 947 ASSERT(next != curthread); 948 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 949 950 if (next == cpu->cpu_idle_thread) 951 PG_NRUN_UPDATE(cpu, -1); 952 953 restore_mstate(next); 954 955 if (dtrace_vtime_active) 956 dtrace_vtime_switch(next); 957 958 resume_from_zombie(next); 959 /* 960 * The TR_RESUME_END and TR_SWTCH_END trace points 961 * appear at the end of resume(), because we certainly will not 962 * return here 963 */ 964 } 965 966 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint)) 967 968 /* 969 * search_disp_queues() 970 * Search the given dispatch queues for thread tp. 971 * Return 1 if tp is found, otherwise return 0. 972 */ 973 static int 974 search_disp_queues(disp_t *dp, kthread_t *tp) 975 { 976 dispq_t *dq; 977 dispq_t *eq; 978 979 disp_lock_enter_high(&dp->disp_lock); 980 981 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) { 982 kthread_t *rp; 983 984 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL); 985 986 for (rp = dq->dq_first; rp; rp = rp->t_link) 987 if (tp == rp) { 988 disp_lock_exit_high(&dp->disp_lock); 989 return (1); 990 } 991 } 992 disp_lock_exit_high(&dp->disp_lock); 993 994 return (0); 995 } 996 997 /* 998 * thread_on_queue() 999 * Search all per-CPU dispatch queues and all partition-wide kpreempt 1000 * queues for thread tp. Return 1 if tp is found, otherwise return 0. 1001 */ 1002 static int 1003 thread_on_queue(kthread_t *tp) 1004 { 1005 cpu_t *cp; 1006 struct cpupart *part; 1007 1008 ASSERT(getpil() >= DISP_LEVEL); 1009 1010 /* 1011 * Search the per-CPU dispatch queues for tp. 1012 */ 1013 cp = CPU; 1014 do { 1015 if (search_disp_queues(cp->cpu_disp, tp)) 1016 return (1); 1017 } while ((cp = cp->cpu_next_onln) != CPU); 1018 1019 /* 1020 * Search the partition-wide kpreempt queues for tp. 1021 */ 1022 part = CPU->cpu_part; 1023 do { 1024 if (search_disp_queues(&part->cp_kp_queue, tp)) 1025 return (1); 1026 } while ((part = part->cp_next) != CPU->cpu_part); 1027 1028 return (0); 1029 } 1030 1031 #else 1032 1033 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */ 1034 1035 #endif /* DEBUG */ 1036 1037 /* 1038 * like swtch(), but switch to a specified thread taken from another CPU. 1039 * called with spl high.. 1040 */ 1041 void 1042 swtch_to(kthread_t *next) 1043 { 1044 cpu_t *cp = CPU; 1045 1046 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 1047 1048 /* 1049 * Update context switch statistics. 1050 */ 1051 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 1052 1053 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 1054 1055 if (curthread == cp->cpu_idle_thread) 1056 PG_NRUN_UPDATE(cp, 1); 1057 1058 /* OK to steal anything left on run queue */ 1059 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; 1060 1061 /* record last execution time */ 1062 cp->cpu_last_swtch = curthread->t_disp_time = lbolt; 1063 1064 /* 1065 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq 1066 * won't have set its t_waitrq. Since we now finally know that we're 1067 * switching away from this thread, set its t_waitrq if it is on a run 1068 * queue. 1069 */ 1070 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) { 1071 curthread->t_waitrq = gethrtime_unscaled(); 1072 } 1073 1074 /* restore next thread to previously running microstate */ 1075 restore_mstate(next); 1076 1077 if (dtrace_vtime_active) 1078 dtrace_vtime_switch(next); 1079 1080 resume(next); 1081 /* 1082 * The TR_RESUME_END and TR_SWTCH_END trace points 1083 * appear at the end of resume(), because we may not 1084 * return here 1085 */ 1086 } 1087 1088 1089 1090 #define CPU_IDLING(pri) ((pri) == -1) 1091 1092 static void 1093 cpu_resched(cpu_t *cp, pri_t tpri) 1094 { 1095 int call_poke_cpu = 0; 1096 pri_t cpupri = cp->cpu_dispatch_pri; 1097 1098 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) { 1099 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED, 1100 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri); 1101 if (tpri >= upreemptpri && cp->cpu_runrun == 0) { 1102 cp->cpu_runrun = 1; 1103 aston(cp->cpu_dispthread); 1104 if (tpri < kpreemptpri && cp != CPU) 1105 call_poke_cpu = 1; 1106 } 1107 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) { 1108 cp->cpu_kprunrun = 1; 1109 if (cp != CPU) 1110 call_poke_cpu = 1; 1111 } 1112 } 1113 1114 /* 1115 * Propagate cpu_runrun, and cpu_kprunrun to global visibility. 1116 */ 1117 membar_enter(); 1118 1119 if (call_poke_cpu) 1120 poke_cpu(cp->cpu_id); 1121 } 1122 1123 /* 1124 * Perform multi-level CMT load balancing of running threads. 1125 * tp is the thread being enqueued 1126 * cp is the hint CPU (chosen by cpu_choose()). 1127 */ 1128 static cpu_t * 1129 cmt_balance(kthread_t *tp, cpu_t *cp) 1130 { 1131 int hint, i, cpu, nsiblings; 1132 int self = 0; 1133 group_t *cmt_pgs, *siblings; 1134 pg_cmt_t *pg, *pg_tmp, *tpg = NULL; 1135 int pg_nrun, tpg_nrun; 1136 int level = 0; 1137 cpu_t *newcp; 1138 1139 ASSERT(THREAD_LOCK_HELD(tp)); 1140 1141 cmt_pgs = &cp->cpu_pg->cmt_pgs; 1142 1143 if (GROUP_SIZE(cmt_pgs) == 0) 1144 return (cp); /* nothing to do */ 1145 1146 if (tp == curthread) 1147 sel