1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/sysmacros.h> 32 #include <sys/cred.h> 33 #include <sys/proc.h> 34 #include <sys/strsubr.h> 35 #include <sys/priocntl.h> 36 #include <sys/class.h> 37 #include <sys/disp.h> 38 #include <sys/procset.h> 39 #include <sys/debug.h> 40 #include <sys/kmem.h> 41 #include <sys/errno.h> 42 #include <sys/systm.h> 43 #include <sys/schedctl.h> 44 #include <sys/vmsystm.h> 45 #include <sys/atomic.h> 46 #include <sys/project.h> 47 #include <sys/modctl.h> 48 #include <sys/fss.h> 49 #include <sys/fsspriocntl.h> 50 #include <sys/cpupart.h> 51 #include <sys/zone.h> 52 #include <vm/rm.h> 53 #include <vm/seg_kmem.h> 54 #include <sys/tnf_probe.h> 55 #include <sys/policy.h> 56 #include <sys/sdt.h> 57 #include <sys/cpucaps.h> 58 59 /* 60 * FSS Data Structures: 61 * 62 * fsszone 63 * ----- ----- 64 * ----- | | | | 65 * | |-------->| |<------->| |<---->... 66 * | | ----- ----- 67 * | | ^ ^ ^ 68 * | |--- | \ \ 69 * ----- | | \ \ 70 * fsspset | | \ \ 71 * | | \ \ 72 * | ----- ----- ----- 73 * -->| |<--->| |<--->| | 74 * | | | | | | 75 * ----- ----- ----- 76 * fssproj 77 * 78 * 79 * That is, fsspsets contain a list of fsszone's that are currently active in 80 * the pset, and a list of fssproj's, corresponding to projects with runnable 81 * threads on the pset. fssproj's in turn point to the fsszone which they 82 * are a member of. 83 * 84 * An fssproj_t is removed when there are no threads in it. 85 * 86 * An fsszone_t is removed when there are no projects with threads in it. 87 * 88 * Projects in a zone compete with each other for cpu time, receiving cpu 89 * allocation within a zone proportional to fssproj->fssp_shares 90 * (project.cpu-shares); at a higher level zones compete with each other, 91 * receiving allocation in a pset proportional to fsszone->fssz_shares 92 * (zone.cpu-shares). See fss_decay_usage() for the precise formula. 93 */ 94 95 static pri_t fss_init(id_t, int, classfuncs_t **); 96 97 static struct sclass fss = { 98 "FSS", 99 fss_init, 100 0 101 }; 102 103 extern struct mod_ops mod_schedops; 104 105 /* 106 * Module linkage information for the kernel. 107 */ 108 static struct modlsched modlsched = { 109 &mod_schedops, "fair share scheduling class", &fss 110 }; 111 112 static struct modlinkage modlinkage = { 113 MODREV_1, (void *)&modlsched, NULL 114 }; 115 116 #define FSS_MAXUPRI 60 117 118 /* 119 * The fssproc_t structures are kept in an array of circular doubly linked 120 * lists. A hash on the thread pointer is used to determine which list each 121 * thread should be placed in. Each list has a dummy "head" which is never 122 * removed, so the list is never empty. fss_update traverses these lists to 123 * update the priorities of threads that have been waiting on the run queue. 124 */ 125 #define FSS_LISTS 16 /* number of lists, must be power of 2 */ 126 #define FSS_LIST_HASH(t) (((uintptr_t)(t) >> 9) & (FSS_LISTS - 1)) 127 #define FSS_LIST_NEXT(i) (((i) + 1) & (FSS_LISTS - 1)) 128 129 #define FSS_LIST_INSERT(fssproc) \ 130 { \ 131 int index = FSS_LIST_HASH(fssproc->fss_tp); \ 132 kmutex_t *lockp = &fss_listlock[index]; \ 133 fssproc_t *headp = &fss_listhead[index]; \ 134 mutex_enter(lockp); \ 135 fssproc->fss_next = headp->fss_next; \ 136 fssproc->fss_prev = headp; \ 137 headp->fss_next->fss_prev = fssproc; \ 138 headp->fss_next = fssproc; \ 139 mutex_exit(lockp); \ 140 } 141 142 #define FSS_LIST_DELETE(fssproc) \ 143 { \ 144 int index = FSS_LIST_HASH(fssproc->fss_tp); \ 145 kmutex_t *lockp = &fss_listlock[index]; \ 146 mutex_enter(lockp); \ 147 fssproc->fss_prev->fss_next = fssproc->fss_next; \ 148 fssproc->fss_next->fss_prev = fssproc->fss_prev; \ 149 mutex_exit(lockp); \ 150 } 151 152 #define FSS_TICK_COST 1000 /* tick cost for threads with nice level = 0 */ 153 154 /* 155 * Decay rate percentages are based on n/128 rather than n/100 so that 156 * calculations can avoid having to do an integer divide by 100 (divide 157 * by FSS_DECAY_BASE == 128 optimizes to an arithmetic shift). 158 * 159 * FSS_DECAY_MIN = 83/128 ~= 65% 160 * FSS_DECAY_MAX = 108/128 ~= 85% 161 * FSS_DECAY_USG = 96/128 ~= 75% 162 */ 163 #define FSS_DECAY_MIN 83 /* fsspri decay pct for threads w/ nice -20 */ 164 #define FSS_DECAY_MAX 108 /* fsspri decay pct for threads w/ nice +19 */ 165 #define FSS_DECAY_USG 96 /* fssusage decay pct for projects */ 166 #define FSS_DECAY_BASE 128 /* base for decay percentages above */ 167 168 #define FSS_NICE_MIN 0 169 #define FSS_NICE_MAX (2 * NZERO - 1) 170 #define FSS_NICE_RANGE (FSS_NICE_MAX - FSS_NICE_MIN + 1) 171 172 static int fss_nice_tick[FSS_NICE_RANGE]; 173 static int fss_nice_decay[FSS_NICE_RANGE]; 174 175 static pri_t fss_maxupri = FSS_MAXUPRI; /* maximum FSS user priority */ 176 static pri_t fss_maxumdpri; /* maximum user mode fss priority */ 177 static pri_t fss_maxglobpri; /* maximum global priority used by fss class */ 178 static pri_t fss_minglobpri; /* minimum global priority */ 179 180 static fssproc_t fss_listhead[FSS_LISTS]; 181 static kmutex_t fss_listlock[FSS_LISTS]; 182 183 static fsspset_t *fsspsets; 184 static kmutex_t fsspsets_lock; /* protects fsspsets */ 185 186 static id_t fss_cid; 187 188 static time_t fss_minrun = 2; /* t_pri becomes 59 within 2 secs */ 189 static time_t fss_minslp = 2; /* min time on sleep queue for hardswap */ 190 static int fss_quantum = 11; 191 192 static void fss_newpri(fssproc_t *); 193 static void fss_update(void *); 194 static int fss_update_list(int); 195 static void fss_change_priority(kthread_t *, fssproc_t *); 196 197 static int fss_admin(caddr_t, cred_t *); 198 static int fss_getclinfo(void *); 199 static int fss_parmsin(void *); 200 static int fss_parmsout(void *, pc_vaparms_t *); 201 static int fss_vaparmsin(void *, pc_vaparms_t *); 202 static int fss_vaparmsout(void *, pc_vaparms_t *); 203 static int fss_getclpri(pcpri_t *); 204 static int fss_alloc(void **, int); 205 static void fss_free(void *); 206 207 static int fss_enterclass(kthread_t *, id_t, void *, cred_t *, void *); 208 static void fss_exitclass(void *); 209 static int fss_canexit(kthread_t *, cred_t *); 210 static int fss_fork(kthread_t *, kthread_t *, void *); 211 static void fss_forkret(kthread_t *, kthread_t *); 212 static void fss_parmsget(kthread_t *, void *); 213 static int fss_parmsset(kthread_t *, void *, id_t, cred_t *); 214 static void fss_stop(kthread_t *, int, int); 215 static void fss_exit(kthread_t *); 216 static void fss_active(kthread_t *); 217 static void fss_inactive(kthread_t *); 218 static pri_t fss_swapin(kthread_t *, int); 219 static pri_t fss_swapout(kthread_t *, int); 220 static void fss_trapret(kthread_t *); 221 static void fss_preempt(kthread_t *); 222 static void fss_setrun(kthread_t *); 223 static void fss_sleep(kthread_t *); 224 static void fss_tick(kthread_t *); 225 static void fss_wakeup(kthread_t *); 226 static int fss_donice(kthread_t *, cred_t *, int, int *); 227 static int fss_doprio(kthread_t *, cred_t *, int, int *); 228 static pri_t fss_globpri(kthread_t *); 229 static void fss_yield(kthread_t *); 230 static void fss_nullsys(); 231 232 static struct classfuncs fss_classfuncs = { 233 /* class functions */ 234 fss_admin, 235 fss_getclinfo, 236 fss_parmsin, 237 fss_parmsout, 238 fss_vaparmsin, 239 fss_vaparmsout, 240 fss_getclpri, 241 fss_alloc, 242 fss_free, 243 244 /* thread functions */ 245 fss_enterclass, 246 fss_exitclass, 247 fss_canexit, 248 fss_fork, 249 fss_forkret, 250 fss_parmsget, 251 fss_parmsset, 252 fss_stop, 253 fss_exit, 254 fss_active, 255 fss_inactive, 256 fss_swapin, 257 fss_swapout, 258 fss_trapret, 259 fss_preempt, 260 fss_setrun, 261 fss_sleep, 262 fss_tick, 263 fss_wakeup, 264 fss_donice, 265 fss_globpri, 266 fss_nullsys, /* set_process_group */ 267 fss_yield, 268 fss_doprio, 269 }; 270 271 int 272 _init() 273 { 274 return (mod_install(&modlinkage)); 275 } 276 277 int 278 _fini() 279 { 280 return (EBUSY); 281 } 282 283 int 284 _info(struct modinfo *modinfop) 285 { 286 return (mod_info(&modlinkage, modinfop)); 287 } 288 289 /*ARGSUSED*/ 290 static int 291 fss_project_walker(kproject_t *kpj, void *buf) 292 { 293 return (0); 294 } 295 296 void * 297 fss_allocbuf(int op, int type) 298 { 299 fssbuf_t *fssbuf; 300 void **fsslist; 301 int cnt; 302 int i; 303 size_t size; 304 305 ASSERT(op == FSS_NPSET_BUF || op == FSS_NPROJ_BUF || op == FSS_ONE_BUF); 306 ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE); 307 ASSERT(MUTEX_HELD(&cpu_lock)); 308 309 fssbuf = kmem_zalloc(sizeof (fssbuf_t), KM_SLEEP); 310 switch (op) { 311 case FSS_NPSET_BUF: 312 cnt = cpupart_list(NULL, 0, CP_NONEMPTY); 313 break; 314 case FSS_NPROJ_BUF: 315 cnt = project_walk_all(ALL_ZONES, fss_project_walker, NULL); 316 break; 317 case FSS_ONE_BUF: 318 cnt = 1; 319 break; 320 } 321 322 switch (type) { 323 case FSS_ALLOC_PROJ: 324 size = sizeof (fssproj_t); 325 break; 326 case FSS_ALLOC_ZONE: 327 size = sizeof (fsszone_t); 328 break; 329 } 330 fsslist = kmem_zalloc(cnt * sizeof (void *), KM_SLEEP); 331 fssbuf->fssb_size = cnt; 332 fssbuf->fssb_list = fsslist; 333 for (i = 0; i < cnt; i++) 334 fsslist[i] = kmem_zalloc(size, KM_SLEEP); 335 return (fssbuf); 336 } 337 338 void 339 fss_freebuf(fssbuf_t *fssbuf, int type) 340 { 341 void **fsslist; 342 int i; 343 size_t size; 344 345 ASSERT(fssbuf != NULL); 346 ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE); 347 fsslist = fssbuf->fssb_list; 348 349 switch (type) { 350 case FSS_ALLOC_PROJ: 351 size = sizeof (fssproj_t); 352 break; 353 case FSS_ALLOC_ZONE: 354 size = sizeof (fsszone_t); 355 break; 356 } 357 358 for (i = 0; i < fssbuf->fssb_size; i++) { 359 if (fsslist[i] != NULL) 360 kmem_free(fsslist[i], size); 361 } 362 kmem_free(fsslist, sizeof (void *) * fssbuf->fssb_size); 363 kmem_free(fssbuf, sizeof (fssbuf_t)); 364 } 365 366 static fsspset_t * 367 fss_find_fsspset(cpupart_t *cpupart) 368 { 369 int i; 370 fsspset_t *fsspset = NULL; 371 int found = 0; 372 373 ASSERT(cpupart != NULL); 374 ASSERT(MUTEX_HELD(&fsspsets_lock)); 375 376 /* 377 * Search for the cpupart pointer in the array of fsspsets. 378 */ 379 for (i = 0; i < max_ncpus; i++) { 380 fsspset = &fsspsets[i]; 381 if (fsspset->fssps_cpupart == cpupart) { 382 ASSERT(fsspset->fssps_nproj > 0); 383 found = 1; 384 break; 385 } 386 } 387 if (found == 0) { 388 /* 389 * If we didn't find anything, then use the first 390 * available slot in the fsspsets array. 391 */ 392 for (i = 0; i < max_ncpus; i++) { 393 fsspset = &fsspsets[i]; 394 if (fsspset->fssps_cpupart == NULL) { 395 ASSERT(fsspset->fssps_nproj == 0); 396 found = 1; 397 break; 398 } 399 } 400 fsspset->fssps_cpupart = cpupart; 401 } 402 ASSERT(found == 1); 403 return (fsspset); 404 } 405 406 static void 407 fss_del_fsspset(fsspset_t *fsspset) 408 { 409 ASSERT(MUTEX_HELD(&fsspsets_lock)); 410 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 411 ASSERT(fsspset->fssps_nproj == 0); 412 ASSERT(fsspset->fssps_list == NULL); 413 ASSERT(fsspset->fssps_zones == NULL); 414 fsspset->fssps_cpupart = NULL; 415 fsspset->fssps_maxfsspri = 0; 416 fsspset->fssps_shares = 0; 417 } 418 419 /* 420 * The following routine returns a pointer to the fsszone structure which 421 * belongs to zone "zone" and cpu partition fsspset, if such structure exists. 422 */ 423 static fsszone_t * 424 fss_find_fsszone(fsspset_t *fsspset, zone_t *zone) 425 { 426 fsszone_t *fsszone; 427 428 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 429 430 if (fsspset->fssps_list != NULL) { 431 /* 432 * There are projects/zones active on this cpu partition 433 * already. Try to find our zone among them. 434 */ 435 fsszone = fsspset->fssps_zones; 436 do { 437 if (fsszone->fssz_zone == zone) { 438 return (fsszone); 439 } 440 fsszone = fsszone->fssz_next; 441 } while (fsszone != fsspset->fssps_zones); 442 } 443 return (NULL); 444 } 445 446 /* 447 * The following routine links new fsszone structure into doubly linked list of 448 * zones active on the specified cpu partition. 449 */ 450 static void 451 fss_insert_fsszone(fsspset_t *fsspset, zone_t *zone, fsszone_t *fsszone) 452 { 453 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 454 455 fsszone->fssz_zone = zone; 456 fsszone->fssz_rshares = zone->zone_shares; 457 458 if (fsspset->fssps_zones == NULL) { 459 /* 460 * This will be the first fsszone for this fsspset 461 */ 462 fsszone->fssz_next = fsszone->fssz_prev = fsszone; 463 fsspset->fssps_zones = fsszone; 464 } else { 465 /* 466 * Insert this fsszone to the doubly linked list. 467 */ 468 fsszone_t *fssz_head = fsspset->fssps_zones; 469 470 fsszone->fssz_next = fssz_head; 471 fsszone->fssz_prev = fssz_head->fssz_prev; 472 fssz_head->fssz_prev->fssz_next = fsszone; 473 fssz_head->fssz_prev = fsszone; 474 fsspset->fssps_zones = fsszone; 475 } 476 } 477 478 /* 479 * The following routine removes a single fsszone structure from the doubly 480 * linked list of zones active on the specified cpu partition. Note that 481 * global fsspsets_lock must be held in case this fsszone structure is the last 482 * on the above mentioned list. Also note that the fsszone structure is not 483 * freed here, it is the responsibility of the caller to call kmem_free for it. 484 */ 485 static void 486 fss_remove_fsszone(fsspset_t *fsspset, fsszone_t *fsszone) 487 { 488 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 489 ASSERT(fsszone->fssz_nproj == 0); 490 ASSERT(fsszone->fssz_shares == 0); 491 ASSERT(fsszone->fssz_runnable == 0); 492 493 if (fsszone->fssz_next != fsszone) { 494 /* 495 * This is not the last zone in the list. 496 */ 497 fsszone->fssz_prev->fssz_next = fsszone->fssz_next; 498 fsszone->fssz_next->fssz_prev = fsszone->fssz_prev; 499 if (fsspset->fssps_zones == fsszone) 500 fsspset->fssps_zones = fsszone->fssz_next; 501 } else { 502 /* 503 * This was the last zone active in this cpu partition. 504 */ 505 fsspset->fssps_zones = NULL; 506 } 507 } 508 509 /* 510 * The following routine returns a pointer to the fssproj structure 511 * which belongs to project kpj and cpu partition fsspset, if such structure 512 * exists. 513 */ 514 static fssproj_t * 515 fss_find_fssproj(fsspset_t *fsspset, kproject_t *kpj) 516 { 517 fssproj_t *fssproj; 518 519 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 520 521 if (fsspset->fssps_list != NULL) { 522 /* 523 * There are projects running on this cpu partition already. 524 * Try to find our project among them. 525 */ 526 fssproj = fsspset->fssps_list; 527 do { 528 if (fssproj->fssp_proj == kpj) { 529 ASSERT(fssproj->fssp_pset == fsspset); 530 return (fssproj); 531 } 532 fssproj = fssproj->fssp_next; 533 } while (fssproj != fsspset->fssps_list); 534 } 535 return (NULL); 536 } 537 538 /* 539 * The following routine links new fssproj structure into doubly linked list 540 * of projects running on the specified cpu partition. 541 */ 542 static void 543 fss_insert_fssproj(fsspset_t *fsspset, kproject_t *kpj, fsszone_t *fsszone, 544 fssproj_t *fssproj) 545 { 546 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 547 548 fssproj->fssp_pset = fsspset; 549 fssproj->fssp_proj = kpj; 550 fssproj->fssp_shares = kpj->kpj_shares; 551 552 fsspset->fssps_nproj++; 553 554 if (fsspset->fssps_list == NULL) { 555 /* 556 * This will be the first fssproj for this fsspset 557 */ 558 fssproj->fssp_next = fssproj->fssp_prev = fssproj; 559 fsspset->fssps_list = fssproj; 560 } else { 561 /* 562 * Insert this fssproj to the doubly linked list. 563 */ 564 fssproj_t *fssp_head = fsspset->fssps_list; 565 566 fssproj->fssp_next = fssp_head; 567 fssproj->fssp_prev = fssp_head->fssp_prev; 568 fssp_head->fssp_prev->fssp_next = fssproj; 569 fssp_head->fssp_prev = fssproj; 570 fsspset->fssps_list = fssproj; 571 } 572 fssproj->fssp_fsszone = fsszone; 573 fsszone->fssz_nproj++; 574 ASSERT(fsszone->fssz_nproj != 0); 575 } 576 577 /* 578 * The following routine removes a single fssproj structure from the doubly 579 * linked list of projects running on the specified cpu partition. Note that 580 * global fsspsets_lock must be held in case if this fssproj structure is the 581 * last on the above mentioned list. Also note that the fssproj structure is 582 * not freed here, it is the responsibility of the caller to call kmem_free 583 * for it. 584 */ 585 static void 586 fss_remove_fssproj(fsspset_t *fsspset, fssproj_t *fssproj) 587 { 588 fsszone_t *fsszone; 589 590 ASSERT(MUTEX_HELD(&fsspsets_lock)); 591 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 592 ASSERT(fssproj->fssp_runnable == 0); 593 594 fsspset->fssps_nproj--; 595 596 fsszone = fssproj->fssp_fsszone; 597 fsszone->fssz_nproj--; 598 599 if (fssproj->fssp_next != fssproj) { 600 /* 601 * This is not the last part in the list. 602 */ 603 fssproj->fssp_prev->fssp_next = fssproj->fssp_next; 604 fssproj->fssp_next->fssp_prev = fssproj->fssp_prev; 605 if (fsspset->fssps_list == fssproj) 606 fsspset->fssps_list = fssproj->fssp_next; 607 if (fsszone->fssz_nproj == 0) 608 fss_remove_fsszone(fsspset, fsszone); 609 } else { 610 /* 611 * This was the last project part running 612 * at this cpu partition. 613 */ 614 fsspset->fssps_list = NULL; 615 ASSERT(fsspset->fssps_nproj == 0); 616 ASSERT(fsszone->fssz_nproj == 0); 617 fss_remove_fsszone(fsspset, fsszone); 618 fss_del_fsspset(fsspset); 619 } 620 } 621 622 static void 623 fss_inactive(kthread_t *t) 624 { 625 fssproc_t *fssproc; 626 fssproj_t *fssproj; 627 fsspset_t *fsspset; 628 fsszone_t *fsszone; 629 630 ASSERT(THREAD_LOCK_HELD(t)); 631 fssproc = FSSPROC(t); 632 fssproj = FSSPROC2FSSPROJ(fssproc); 633 if (fssproj == NULL) /* if this thread already exited */ 634 return; 635 fsspset = FSSPROJ2FSSPSET(fssproj); 636 fsszone = fssproj->fssp_fsszone; 637 disp_lock_enter_high(&fsspset->fssps_displock); 638 ASSERT(fssproj->fssp_runnable > 0); 639 if (--fssproj->fssp_runnable == 0) { 640 fsszone->fssz_shares -= fssproj->fssp_shares; 641 if (--fsszone->fssz_runnable == 0) 642 fsspset->fssps_shares -= fsszone->fssz_rshares; 643 } 644 ASSERT(fssproc->fss_runnable == 1); 645 fssproc->fss_runnable = 0; 646 disp_lock_exit_high(&fsspset->fssps_displock); 647 } 648 649 static void 650 fss_active(kthread_t *t) 651 { 652 fssproc_t *fssproc; 653 fssproj_t *fssproj; 654 fsspset_t *fsspset; 655 fsszone_t *fsszone; 656 657 ASSERT(THREAD_LOCK_HELD(t)); 658 fssproc = FSSPROC(t); 659 fssproj = FSSPROC2FSSPROJ(fssproc); 660 if (fssproj == NULL) /* if this thread already exited */ 661 return; 662 fsspset = FSSPROJ2FSSPSET(fssproj); 663 fsszone = fssproj->fssp_fsszone; 664 disp_lock_enter_high(&fsspset->fssps_displock); 665 if (++fssproj->fssp_runnable == 1) { 666 fsszone->fssz_shares += fssproj->fssp_shares; 667 if (++fsszone->fssz_runnable == 1) 668 fsspset->fssps_shares += fsszone->fssz_rshares; 669 } 670 ASSERT(fssproc->fss_runnable == 0); 671 fssproc->fss_runnable = 1; 672 disp_lock_exit_high(&fsspset->fssps_displock); 673 } 674 675 /* 676 * Fair share scheduler initialization. Called by dispinit() at boot time. 677 * We can ignore clparmsz argument since we know that the smallest possible 678 * parameter buffer is big enough for us. 679 */ 680 /*ARGSUSED*/ 681 static pri_t 682 fss_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp) 683 { 684 int i; 685 686 ASSERT(MUTEX_HELD(&cpu_lock)); 687 688 fss_cid = cid; 689 fss_maxumdpri = minclsyspri - 1; 690 fss_maxglobpri = minclsyspri; 691 fss_minglobpri = 0; 692 fsspsets = kmem_zalloc(sizeof (fsspset_t) * max_ncpus, KM_SLEEP); 693 694 /* 695 * Initialize the fssproc hash table. 696 */ 697 for (i = 0; i < FSS_LISTS; i++) 698 fss_listhead[i].fss_next = fss_listhead[i].fss_prev = 699 &fss_listhead[i]; 700 701 *clfuncspp = &fss_classfuncs; 702 703 /* 704 * Fill in fss_nice_tick and fss_nice_decay arrays: 705 * The cost of a tick is lower at positive nice values (so that it 706 * will not increase its project's usage as much as normal) with 50% 707 * drop at the maximum level and 50% increase at the minimum level. 708 * The fsspri decay is slower at positive nice values. fsspri values 709 * of processes with negative nice levels must decay faster to receive 710 * time slices more frequently than normal. 711 */ 712 for (i = 0; i < FSS_NICE_RANGE; i++) { 713 fss_nice_tick[i] = (FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2) 714 - i)) / FSS_NICE_RANGE; 715 fss_nice_decay[i] = FSS_DECAY_MIN + 716 ((FSS_DECAY_MAX - FSS_DECAY_MIN) * i) / 717 (FSS_NICE_RANGE - 1); 718 } 719 720 return (fss_maxglobpri); 721 } 722 723 /* 724 * Calculate the new cpupri based on the usage, the number of shares and 725 * the number of active threads. Reset the tick counter for this thread. 726 */ 727 static void 728 fss_newpri(fssproc_t *fssproc) 729 { 730 kthread_t *tp; 731 fssproj_t *fssproj; 732 fsspset_t *fsspset; 733 fsszone_t *fsszone; 734 fsspri_t fsspri, maxfsspri; 735 pri_t invpri; 736 uint32_t ticks; 737 738 tp = fssproc->fss_tp; 739 ASSERT(tp != NULL); 740 741 if (tp->t_cid != fss_cid) 742 return; 743 744 ASSERT(THREAD_LOCK_HELD(tp)); 745 746 fssproj = FSSPROC2FSSPROJ(fssproc); 747 fsszone = FSSPROJ2FSSZONE(fssproj); 748 if (fssproj == NULL) 749 /* 750 * No need to change priority of exited threads. 751 */ 752 return; 753 754 fsspset = FSSPROJ2FSSPSET(fssproj); 755 disp_lock_enter_high(&fsspset->fssps_displock); 756 757 if (fssproj->fssp_shares == 0 || fsszone->fssz_rshares == 0) { 758 /* 759 * Special case: threads with no shares. 760 */ 761 fssproc->fss_umdpri = fss_minglobpri; 762 fssproc->fss_ticks = 0; 763 disp_lock_exit_high(&fsspset->fssps_displock); 764 return; 765 } 766 767 /* 768 * fsspri += shusage * nrunnable * ticks 769 */ 770 ticks = fssproc->fss_ticks; 771 fssproc->fss_ticks = 0; 772 fsspri = fssproc->fss_fsspri; 773 fsspri += fssproj->fssp_shusage * fssproj->fssp_runnable * ticks; 774 fssproc->fss_fsspri = fsspri; 775 776 if (fsspri < fss_maxumdpri) 777 fsspri = fss_maxumdpri; /* so that maxfsspri is != 0 */ 778 779 /* 780 * The general priority formula: 781 * 782 * (fsspri * umdprirange) 783 * pri = maxumdpri - ------------------------ 784 * maxfsspri 785 * 786 * If this thread's fsspri is greater than the previous largest 787 * fsspri, then record it as the new high and priority for this 788 * thread will be one (the lowest priority assigned to a thread 789 * that has non-zero shares). 790 * Note that this formula cannot produce out of bounds priority 791 * values; if it is changed, additional checks may need to be 792 * added. 793 */ 794 maxfsspri = fsspset->fssps_maxfsspri; 795 if (fsspri >= maxfsspri) { 796 fsspset->fssps_maxfsspri = fsspri; 797 disp_lock_exit_high(&fsspset->fssps_displock); 798 fssproc->fss_umdpri = 1; 799 } else { 800 disp_lock_exit_high(&fsspset->fssps_displock); 801 invpri = (fsspri * (fss_maxumdpri - 1)) / maxfsspri; 802 fssproc->fss_umdpri = fss_maxumdpri - invpri; 803 } 804 } 805 806 /* 807 * Decays usages of all running projects and resets their tick counters. 808 * Called once per second from fss_update() after updating priorities. 809 */ 810 static void 811 fss_decay_usage() 812 { 813 uint32_t zone_ext_shares, zone_int_shares; 814 uint32_t kpj_shares, pset_shares; 815 fsspset_t *fsspset; 816 fssproj_t *fssproj; 817 fsszone_t *fsszone; 818 fsspri_t maxfsspri; 819 int psetid; 820 821 mutex_enter(&fsspsets_lock); 822 /* 823 * Go through all active processor sets and decay usages of projects 824 * running on them. 825 */ 826 for (psetid = 0; psetid < max_ncpus; psetid++) { 827 fsspset = &fsspsets[psetid]; 828 mutex_enter(&fsspset->fssps_lock); 829 830 if (fsspset->fssps_cpupart == NULL || 831 (fssproj = fsspset->fssps_list) == NULL) { 832 mutex_exit(&fsspset->fssps_lock); 833 continue; 834 } 835 836 /* 837 * Decay maxfsspri for this cpu partition with the 838 * fastest possible decay rate. 839 */ 840 disp_lock_enter(&fsspset->fssps_displock); 841 842 maxfsspri = (fsspset->fssps_maxfsspri * 843 fss_nice_decay[NZERO]) / FSS_DECAY_BASE; 844 if (maxfsspri < fss_maxumdpri) 845 maxfsspri = fss_maxumdpri; 846 fsspset->fssps_maxfsspri = maxfsspri; 847 848 do { 849 /* 850 * Decay usage for each project running on 851 * this cpu partition. 852 */ 853 fssproj->fssp_usage = 854 (fssproj->fssp_usage * FSS_DECAY_USG) / 855 FSS_DECAY_BASE + fssproj->fssp_ticks; 856 fssproj->fssp_ticks = 0; 857 858 fsszone = fssproj->fssp_fsszone; 859 /* 860 * Readjust the project's number of shares if it has 861 * changed since we checked it last time. 862 */ 863 kpj_shares = fssproj->fssp_proj->kpj_shares; 864 if (fssproj->fssp_shares != kpj_shares) { 865 if (fssproj->fssp_runnable != 0) { 866 fsszone->fssz_shares -= 867 fssproj->fssp_shares; 868 fsszone->fssz_shares += kpj_shares; 869 } 870 fssproj->fssp_shares = kpj_shares; 871 } 872 873 /* 874 * Readjust the zone's number of shares if it 875 * has changed since we checked it last time. 876 */ 877 zone_ext_shares = fsszone->fssz_zone->zone_shares; 878 if (fsszone->fssz_rshares != zone_ext_shares) { 879 if (fsszone->fssz_runnable != 0) { 880 fsspset->fssps_shares -= 881 fsszone->fssz_rshares; 882 fsspset->fssps_shares += 883 zone_ext_shares; 884 } 885 fsszone->fssz_rshares = zone_ext_shares; 886 } 887 zone_int_shares = fsszone->fssz_shares; 888 pset_shares = fsspset->fssps_shares; 889 /* 890 * Calculate fssp_shusage value to be used 891 * for fsspri increments for the next second. 892 */ 893 if (kpj_shares == 0 || zone_ext_shares == 0) { 894 fssproj->fssp_shusage = 0; 895 } else if (FSSPROJ2KPROJ(fssproj) == proj0p) { 896 /* 897 * Project 0 in the global zone has 50% 898 * of its zone. 899 */ 900 fssproj->fssp_shusage = (fssproj->fssp_usage * 901 zone_int_shares * zone_int_shares) / 902 (zone_ext_shares * zone_ext_shares); 903 } else { 904 /* 905 * Thread's priority is based on its project's 906 * normalized usage (shusage) value which gets 907 * calculated this way: 908 * 909 * pset_shares^2 zone_int_shares^2 910 * usage * ------------- * ------------------ 911 * kpj_shares^2 zone_ext_shares^2 912 * 913 * Where zone_int_shares is the sum of shares 914 * of all active projects within the zone (and 915 * the pset), and zone_ext_shares is the number 916 * of zone shares (ie, zone.cpu-shares). 917 * 918 * If there is only one zone active on the pset 919 * the above reduces to: 920 * 921 * zone_int_shares^2 922 * shusage = usage * --------------------- 923 * kpj_shares^2 924 * 925 * If there's only one project active in the 926 * zone this formula reduces to: 927 * 928 * pset_shares^2 929 * shusage = usage * ---------------------- 930 * zone_ext_shares^2 931 */ 932 fssproj->fssp_shusage = fssproj->fssp_usage * 933 pset_shares * zone_int_shares; 934 fssproj->fssp_shusage /= 935 kpj_shares * zone_ext_shares; 936 fssproj->fssp_shusage *= 937 pset_shares * zone_int_shares; 938 fssproj->fssp_shusage /= 939 kpj_shares * zone_ext_shares; 940 } 941 fssproj = fssproj->fssp_next; 942 } while (fssproj != fsspset->fssps_list); 943 944 disp_lock_exit(&fsspset->fssps_displock); 945 mutex_exit(&fsspset->fssps_lock); 946 } 947 mutex_exit(&fsspsets_lock); 948 } 949 950 static void 951 fss_change_priority(kthread_t *t, fssproc_t *fssproc) 952 { 953 pri_t new_pri; 954 955 ASSERT(THREAD_LOCK_HELD(t)); 956 new_pri = fssproc->fss_umdpri; 957 ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri); 958 959 t->t_cpri = fssproc->fss_upri; 960 fssproc->fss_flags &= ~FSSRESTORE; 961 if (t == curthread || t->t_state == TS_ONPROC) { 962 /* 963 * curthread is always onproc 964 */ 965 cpu_t *cp = t->t_disp_queue->disp_cpu; 966 THREAD_CHANGE_PRI(t, new_pri); 967 if (t == cp->cpu_dispthread) 968 cp->cpu_dispatch_pri = DISP_PRIO(t); 969 if (DISP_MUST_SURRENDER(t)) { 970 fssproc->fss_flags |= FSSBACKQ; 971 cpu_surrender(t); 972 } else { 973 fssproc->fss_timeleft = fss_quantum; 974 } 975 } else { 976 /* 977 * When the priority of a thread is changed, it may be 978 * necessary to adjust its position on a sleep queue or 979 * dispatch queue. The function thread_change_pri accomplishes 980 * this. 981 */ 982 if (thread_change_pri(t, new_pri, 0)) { 983 /* 984 * The thread was on a run queue. 985 */ 986 fssproc->fss_timeleft = fss_quantum; 987 } else { 988 fssproc->fss_flags |= FSSBACKQ; 989 } 990 } 991 } 992 993 /* 994 * Update priorities of all fair-sharing threads that are currently runnable 995 * at a user mode priority based on the number of shares and current usage. 996 * Called once per second via timeout which we reset here. 997 * 998 * There are several lists of fair-sharing threads broken up by a hash on the 999 * thread pointer. Each list has its own lock. This avoids blocking all 1000 * fss_enterclass, fss_fork, and fss_exitclass operations while fss_update runs. 1001 * fss_update traverses each list in turn. 1002 */ 1003 static void 1004 fss_update(void *arg) 1005 { 1006 int i; 1007 int new_marker = -1; 1008 static int fss_update_marker; 1009 1010 /* 1011 * Decay and update usages for all projects. 1012 */ 1013 fss_decay_usage(); 1014 1015 /* 1016 * Start with the fss_update_marker list, then do the rest. 1017 */ 1018 i = fss_update_marker; 1019 1020 /* 1021 * Go around all threads, set new priorities and decay 1022 * per-thread CPU usages. 1023 */ 1024 do { 1025 /* 1026 * If this is the first list after the current marker to have 1027 * threads with priorities updates, advance the marker to this 1028 * list for the next time fss_update runs. 1029 */ 1030 if (fss_update_list(i) && 1031 new_marker == -1 && i != fss_update_marker) 1032 new_marker = i; 1033 } while ((i = FSS_LIST_NEXT(i)) != fss_update_marker); 1034 1035 /* 1036 * Advance marker for the next fss_update call 1037 */ 1038 if (new_marker != -1) 1039 fss_update_marker = new_marker; 1040 1041 (void) timeout(fss_update, arg, hz); 1042 } 1043 1044 /* 1045 * Updates priority for a list of threads. Returns 1 if the priority of one 1046 * of the threads was actually updated, 0 if none were for various reasons 1047 * (thread is no longer in the FSS class, is not runnable, has the preemption 1048 * control no-preempt bit set, etc.) 1049 */ 1050 static int 1051 fss_update_list(int i) 1052 { 1053 fssproc_t *fssproc; 1054 fssproj_t *fssproj; 1055 fsspri_t fsspri; 1056 kthread_t *t; 1057 int updated = 0; 1058 1059 mutex_enter(&fss_listlock[i]); 1060 for (fssproc = fss_listhead[i].fss_next; fssproc != &fss_listhead[i]; 1061 fssproc = fssproc->fss_next) { 1062 t = fssproc->fss_tp; 1063 /* 1064 * Lock the thread and verify the state. 1065 */ 1066 thread_lock(t); 1067 /* 1068 * Skip the thread if it is no longer in the FSS class or 1069 * is running with kernel mode priority. 1070 */ 1071 if (t->t_cid != fss_cid) 1072 goto next; 1073 if ((fssproc->fss_flags & FSSKPRI) != 0) 1074 goto next; 1075 1076 fssproj = FSSPROC2FSSPROJ(fssproc); 1077 if (fssproj == NULL) 1078 goto next; 1079 if (fssproj->fssp_shares != 0) { 1080 /* 1081 * Decay fsspri value. 1082 */ 1083 fsspri = fssproc->fss_fsspri; 1084 fsspri = (fsspri * fss_nice_decay[fssproc->fss_nice]) / 1085 FSS_DECAY_BASE; 1086 fssproc->fss_fsspri = fsspri; 1087 } 1088 1089 if (t->t_schedctl && schedctl_get_nopreempt(t)) 1090 goto next; 1091 if (t->t_state != TS_RUN && t->t_state != TS_WAIT) { 1092 /* 1093 * Make next syscall/trap call fss_trapret 1094 */ 1095 t->t_trapret = 1; 1096 aston(t); 1097 goto next; 1098 } 1099 fss_newpri(fssproc); 1100 updated = 1; 1101 1102 /* 1103 * Only dequeue the thread if it needs to be moved; otherwise 1104 * it should just round-robin here. 1105 */ 1106 if (t->t_pri != fssproc->fss_umdpri) 1107 fss_change_priority(t, fssproc); 1108 next: 1109 thread_unlock(t); 1110 } 1111 mutex_exit(&fss_listlock[i]); 1112 return (updated); 1113 } 1114 1115 /*ARGSUSED*/ 1116 static int 1117 fss_admin(caddr_t uaddr