Home | History | Annotate | Download | only in disp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     28 
     29 #include <sys/types.h>
     30 #include <sys/param.h>
     31 #include <sys/sysmacros.h>
     32 #include <sys/cred.h>
     33 #include <sys/proc.h>
     34 #include <sys/strsubr.h>
     35 #include <sys/priocntl.h>
     36 #include <sys/class.h>
     37 #include <sys/disp.h>
     38 #include <sys/procset.h>
     39 #include <sys/debug.h>
     40 #include <sys/kmem.h>
     41 #include <sys/errno.h>
     42 #include <sys/systm.h>
     43 #include <sys/schedctl.h>
     44 #include <sys/vmsystm.h>
     45 #include <sys/atomic.h>
     46 #include <sys/project.h>
     47 #include <sys/modctl.h>
     48 #include <sys/fss.h>
     49 #include <sys/fsspriocntl.h>
     50 #include <sys/cpupart.h>
     51 #include <sys/zone.h>
     52 #include <vm/rm.h>
     53 #include <vm/seg_kmem.h>
     54 #include <sys/tnf_probe.h>
     55 #include <sys/policy.h>
     56 #include <sys/sdt.h>
     57 #include <sys/cpucaps.h>
     58 
     59 /*
     60  * FSS Data Structures:
     61  *
     62  *                 fsszone
     63  *                  -----           -----
     64  *  -----          |     |         |     |
     65  * |     |-------->|     |<------->|     |<---->...
     66  * |     |          -----           -----
     67  * |     |          ^    ^            ^
     68  * |     |---       |     \            \
     69  *  -----    |      |      \            \
     70  * fsspset   |      |       \            \
     71  *           |      |        \            \
     72  *           |    -----       -----       -----
     73  *            -->|     |<--->|     |<--->|     |
     74  *               |     |     |     |     |     |
     75  *                -----       -----       -----
     76  *               fssproj
     77  *
     78  *
     79  * That is, fsspsets contain a list of fsszone's that are currently active in
     80  * the pset, and a list of fssproj's, corresponding to projects with runnable
     81  * threads on the pset.  fssproj's in turn point to the fsszone which they
     82  * are a member of.
     83  *
     84  * An fssproj_t is removed when there are no threads in it.
     85  *
     86  * An fsszone_t is removed when there are no projects with threads in it.
     87  *
     88  * Projects in a zone compete with each other for cpu time, receiving cpu
     89  * allocation within a zone proportional to fssproj->fssp_shares
     90  * (project.cpu-shares); at a higher level zones compete with each other,
     91  * receiving allocation in a pset proportional to fsszone->fssz_shares
     92  * (zone.cpu-shares).  See fss_decay_usage() for the precise formula.
     93  */
     94 
/* Class initialization routine; defined later in this file. */
static pri_t fss_init(id_t, int, classfuncs_t **);

/*
 * Scheduling class descriptor for FSS.  It is referenced by the modlsched
 * linkage structure below and names fss_init() as the class's init routine.
 * NOTE(review): the meaning of the trailing 0 member is defined by
 * struct sclass, which is not visible here -- confirm against its header.
 */
static struct sclass fss = {
	"FSS",
	fss_init,
	0
};
    102 
/* Module operations vector for scheduling classes; defined elsewhere. */
extern struct mod_ops mod_schedops;

/*
 * Module linkage information for the kernel.
 *
 * modlsched binds the "fss" class descriptor to mod_schedops; modlinkage
 * wraps it and is the object handed to mod_install() and mod_info() by
 * _init() and _info() below.
 */
static struct modlsched modlsched = {
	&mod_schedops, "fair share scheduling class", &fss
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlsched, NULL
};
    115 
/* Initial value of fss_maxupri, the maximum FSS user priority. */
#define	FSS_MAXUPRI	60
    117 
    118 /*
    119  * The fssproc_t structures are kept in an array of circular doubly linked
    120  * lists.  A hash on the thread pointer is used to determine which list each
    121  * thread should be placed in.  Each list has a dummy "head" which is never
    122  * removed, so the list is never empty.  fss_update traverses these lists to
    123  * update the priorities of threads that have been waiting on the run queue.
    124  */
    125 #define	FSS_LISTS		16 /* number of lists, must be power of 2 */
    126 #define	FSS_LIST_HASH(t)	(((uintptr_t)(t) >> 9) & (FSS_LISTS - 1))
    127 #define	FSS_LIST_NEXT(i)	(((i) + 1) & (FSS_LISTS - 1))
    128 
    129 #define	FSS_LIST_INSERT(fssproc)				\
    130 {								\
    131 	int index = FSS_LIST_HASH(fssproc->fss_tp);		\
    132 	kmutex_t *lockp = &fss_listlock[index];			\
    133 	fssproc_t *headp = &fss_listhead[index];		\
    134 	mutex_enter(lockp);					\
    135 	fssproc->fss_next = headp->fss_next;			\
    136 	fssproc->fss_prev = headp;				\
    137 	headp->fss_next->fss_prev = fssproc;			\
    138 	headp->fss_next = fssproc;				\
    139 	mutex_exit(lockp);					\
    140 }
    141 
    142 #define	FSS_LIST_DELETE(fssproc)				\
    143 {								\
    144 	int index = FSS_LIST_HASH(fssproc->fss_tp);		\
    145 	kmutex_t *lockp = &fss_listlock[index];			\
    146 	mutex_enter(lockp);					\
    147 	fssproc->fss_prev->fss_next = fssproc->fss_next;	\
    148 	fssproc->fss_next->fss_prev = fssproc->fss_prev;	\
    149 	mutex_exit(lockp);					\
    150 }
    151 
#define	FSS_TICK_COST	1000	/* tick cost for threads with nice level = 0 */

/*
 * Decay rate percentages are based on n/128 rather than n/100 so that
 * calculations can avoid having to do an integer divide by 100 (divide
 * by FSS_DECAY_BASE == 128 optimizes to an arithmetic shift).
 *
 * FSS_DECAY_MIN	=  83/128 ~= 65%
 * FSS_DECAY_MAX	= 108/128 ~= 84%
 * FSS_DECAY_USG	=  96/128  = 75%
 */
#define	FSS_DECAY_MIN	83	/* fsspri decay pct for threads w/ nice -20 */
#define	FSS_DECAY_MAX	108	/* fsspri decay pct for threads w/ nice +19 */
#define	FSS_DECAY_USG	96	/* fssusage decay pct for projects */
#define	FSS_DECAY_BASE	128	/* base for decay percentages above */

/*
 * Bounds of the index used for the fss_nice_tick[] and fss_nice_decay[]
 * tables: 2 * NZERO slots, numbered from 0.
 */
#define	FSS_NICE_MIN	0
#define	FSS_NICE_MAX	(2 * NZERO - 1)
#define	FSS_NICE_RANGE	(FSS_NICE_MAX - FSS_NICE_MIN + 1)
    171 
/*
 * Per-nice-level tables, indexed FSS_NICE_MIN..FSS_NICE_MAX and filled in by
 * fss_init(): the cost charged for one tick, and the fsspri decay percentage
 * expressed in units of 1/FSS_DECAY_BASE.
 */
static int	fss_nice_tick[FSS_NICE_RANGE];
static int	fss_nice_decay[FSS_NICE_RANGE];

/* Priority limits; all but fss_maxupri are assigned in fss_init(). */
static pri_t	fss_maxupri = FSS_MAXUPRI; /* maximum FSS user priority */
static pri_t	fss_maxumdpri; /* maximum user mode fss priority */
static pri_t	fss_maxglobpri;	/* maximum global priority used by fss class */
static pri_t	fss_minglobpri;	/* minimum global priority */

/* Dummy heads and per-list locks of the fssproc hash lists. */
static fssproc_t fss_listhead[FSS_LISTS];
static kmutex_t	fss_listlock[FSS_LISTS];

/* Array of max_ncpus entries, allocated (zeroed) in fss_init(). */
static fsspset_t *fsspsets;
static kmutex_t fsspsets_lock;	/* protects fsspsets */

/* Class id handed to fss_init(); fss_newpri() compares t_cid against it. */
static id_t	fss_cid;

static time_t	fss_minrun = 2;	/* t_pri becomes 59 within 2 secs */
static time_t	fss_minslp = 2;	/* min time on sleep queue for hardswap */
/*
 * NOTE(review): presumably the time quantum given to FSS threads; its unit
 * and its users are not visible in this part of the file -- confirm.
 */
static int	fss_quantum = 11;
    191 
/* Internal helpers. */
static void	fss_newpri(fssproc_t *);
static void	fss_update(void *);
static int	fss_update_list(int);
static void	fss_change_priority(kthread_t *, fssproc_t *);

/* Entries for the "class functions" section of fss_classfuncs. */
static int	fss_admin(caddr_t, cred_t *);
static int	fss_getclinfo(void *);
static int	fss_parmsin(void *);
static int	fss_parmsout(void *, pc_vaparms_t *);
static int	fss_vaparmsin(void *, pc_vaparms_t *);
static int	fss_vaparmsout(void *, pc_vaparms_t *);
static int	fss_getclpri(pcpri_t *);
static int	fss_alloc(void **, int);
static void	fss_free(void *);

/* Entries for the "thread functions" section of fss_classfuncs. */
static int	fss_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
static void	fss_exitclass(void *);
static int	fss_canexit(kthread_t *, cred_t *);
static int	fss_fork(kthread_t *, kthread_t *, void *);
static void	fss_forkret(kthread_t *, kthread_t *);
static void	fss_parmsget(kthread_t *, void *);
static int	fss_parmsset(kthread_t *, void *, id_t, cred_t *);
static void	fss_stop(kthread_t *, int, int);
static void	fss_exit(kthread_t *);
static void	fss_active(kthread_t *);
static void	fss_inactive(kthread_t *);
static pri_t	fss_swapin(kthread_t *, int);
static pri_t	fss_swapout(kthread_t *, int);
static void	fss_trapret(kthread_t *);
static void	fss_preempt(kthread_t *);
static void	fss_setrun(kthread_t *);
static void	fss_sleep(kthread_t *);
static void	fss_tick(kthread_t *);
static void	fss_wakeup(kthread_t *);
static int	fss_donice(kthread_t *, cred_t *, int, int *);
static int	fss_doprio(kthread_t *, cred_t *, int, int *);
static pri_t	fss_globpri(kthread_t *);
static void	fss_yield(kthread_t *);
/* Declared without a parameter list; used for the set_process_group slot. */
static void	fss_nullsys();
    231 
/*
 * Operations vector for the FSS class.  fss_init() returns its address to
 * the caller through the clfuncspp argument.  The initializers are
 * positional, so their order must match the member order of
 * struct classfuncs (declared outside this file).
 */
static struct classfuncs fss_classfuncs = {
	/* class functions */
	fss_admin,
	fss_getclinfo,
	fss_parmsin,
	fss_parmsout,
	fss_vaparmsin,
	fss_vaparmsout,
	fss_getclpri,
	fss_alloc,
	fss_free,

	/* thread functions */
	fss_enterclass,
	fss_exitclass,
	fss_canexit,
	fss_fork,
	fss_forkret,
	fss_parmsget,
	fss_parmsset,
	fss_stop,
	fss_exit,
	fss_active,
	fss_inactive,
	fss_swapin,
	fss_swapout,
	fss_trapret,
	fss_preempt,
	fss_setrun,
	fss_sleep,
	fss_tick,
	fss_wakeup,
	fss_donice,
	fss_globpri,
	fss_nullsys,	/* set_process_group */
	fss_yield,
	fss_doprio,
};
    270 
    271 int
    272 _init()
    273 {
    274 	return (mod_install(&modlinkage));
    275 }
    276 
    277 int
    278 _fini()
    279 {
    280 	return (EBUSY);
    281 }
    282 
    283 int
    284 _info(struct modinfo *modinfop)
    285 {
    286 	return (mod_info(&modlinkage, modinfop));
    287 }
    288 
/*
 * Callback handed to project_walk_all() by fss_allocbuf().  It ignores both
 * arguments and always returns 0; fss_allocbuf() only uses the value that
 * project_walk_all() itself returns.
 * NOTE(review): presumably that value is the number of projects visited --
 * confirm against project_walk_all().
 */
/*ARGSUSED*/
static int
fss_project_walker(kproject_t *kpj, void *buf)
{
	return (0);
}
    295 
    296 void *
    297 fss_allocbuf(int op, int type)
    298 {
    299 	fssbuf_t *fssbuf;
    300 	void **fsslist;
    301 	int cnt;
    302 	int i;
    303 	size_t size;
    304 
    305 	ASSERT(op == FSS_NPSET_BUF || op == FSS_NPROJ_BUF || op == FSS_ONE_BUF);
    306 	ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE);
    307 	ASSERT(MUTEX_HELD(&cpu_lock));
    308 
    309 	fssbuf = kmem_zalloc(sizeof (fssbuf_t), KM_SLEEP);
    310 	switch (op) {
    311 	case FSS_NPSET_BUF:
    312 		cnt = cpupart_list(NULL, 0, CP_NONEMPTY);
    313 		break;
    314 	case FSS_NPROJ_BUF:
    315 		cnt = project_walk_all(ALL_ZONES, fss_project_walker, NULL);
    316 		break;
    317 	case FSS_ONE_BUF:
    318 		cnt = 1;
    319 		break;
    320 	}
    321 
    322 	switch (type) {
    323 	case FSS_ALLOC_PROJ:
    324 		size = sizeof (fssproj_t);
    325 		break;
    326 	case FSS_ALLOC_ZONE:
    327 		size = sizeof (fsszone_t);
    328 		break;
    329 	}
    330 	fsslist = kmem_zalloc(cnt * sizeof (void *), KM_SLEEP);
    331 	fssbuf->fssb_size = cnt;
    332 	fssbuf->fssb_list = fsslist;
    333 	for (i = 0; i < cnt; i++)
    334 		fsslist[i] = kmem_zalloc(size, KM_SLEEP);
    335 	return (fssbuf);
    336 }
    337 
    338 void
    339 fss_freebuf(fssbuf_t *fssbuf, int type)
    340 {
    341 	void **fsslist;
    342 	int i;
    343 	size_t size;
    344 
    345 	ASSERT(fssbuf != NULL);
    346 	ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE);
    347 	fsslist = fssbuf->fssb_list;
    348 
    349 	switch (type) {
    350 	case FSS_ALLOC_PROJ:
    351 		size = sizeof (fssproj_t);
    352 		break;
    353 	case FSS_ALLOC_ZONE:
    354 		size = sizeof (fsszone_t);
    355 		break;
    356 	}
    357 
    358 	for (i = 0; i < fssbuf->fssb_size; i++) {
    359 		if (fsslist[i] != NULL)
    360 			kmem_free(fsslist[i], size);
    361 	}
    362 	kmem_free(fsslist, sizeof (void *) * fssbuf->fssb_size);
    363 	kmem_free(fssbuf, sizeof (fssbuf_t));
    364 }
    365 
    366 static fsspset_t *
    367 fss_find_fsspset(cpupart_t *cpupart)
    368 {
    369 	int i;
    370 	fsspset_t *fsspset = NULL;
    371 	int found = 0;
    372 
    373 	ASSERT(cpupart != NULL);
    374 	ASSERT(MUTEX_HELD(&fsspsets_lock));
    375 
    376 	/*
    377 	 * Search for the cpupart pointer in the array of fsspsets.
    378 	 */
    379 	for (i = 0; i < max_ncpus; i++) {
    380 		fsspset = &fsspsets[i];
    381 		if (fsspset->fssps_cpupart == cpupart) {
    382 			ASSERT(fsspset->fssps_nproj > 0);
    383 			found = 1;
    384 			break;
    385 		}
    386 	}
    387 	if (found == 0) {
    388 		/*
    389 		 * If we didn't find anything, then use the first
    390 		 * available slot in the fsspsets array.
    391 		 */
    392 		for (i = 0; i < max_ncpus; i++) {
    393 			fsspset = &fsspsets[i];
    394 			if (fsspset->fssps_cpupart == NULL) {
    395 				ASSERT(fsspset->fssps_nproj == 0);
    396 				found = 1;
    397 				break;
    398 			}
    399 		}
    400 		fsspset->fssps_cpupart = cpupart;
    401 	}
    402 	ASSERT(found == 1);
    403 	return (fsspset);
    404 }
    405 
    406 static void
    407 fss_del_fsspset(fsspset_t *fsspset)
    408 {
    409 	ASSERT(MUTEX_HELD(&fsspsets_lock));
    410 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    411 	ASSERT(fsspset->fssps_nproj == 0);
    412 	ASSERT(fsspset->fssps_list == NULL);
    413 	ASSERT(fsspset->fssps_zones == NULL);
    414 	fsspset->fssps_cpupart = NULL;
    415 	fsspset->fssps_maxfsspri = 0;
    416 	fsspset->fssps_shares = 0;
    417 }
    418 
    419 /*
    420  * The following routine returns a pointer to the fsszone structure which
    421  * belongs to zone "zone" and cpu partition fsspset, if such structure exists.
    422  */
    423 static fsszone_t *
    424 fss_find_fsszone(fsspset_t *fsspset, zone_t *zone)
    425 {
    426 	fsszone_t *fsszone;
    427 
    428 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    429 
    430 	if (fsspset->fssps_list != NULL) {
    431 		/*
    432 		 * There are projects/zones active on this cpu partition
    433 		 * already.  Try to find our zone among them.
    434 		 */
    435 		fsszone = fsspset->fssps_zones;
    436 		do {
    437 			if (fsszone->fssz_zone == zone) {
    438 				return (fsszone);
    439 			}
    440 			fsszone = fsszone->fssz_next;
    441 		} while (fsszone != fsspset->fssps_zones);
    442 	}
    443 	return (NULL);
    444 }
    445 
    446 /*
    447  * The following routine links new fsszone structure into doubly linked list of
    448  * zones active on the specified cpu partition.
    449  */
    450 static void
    451 fss_insert_fsszone(fsspset_t *fsspset, zone_t *zone, fsszone_t *fsszone)
    452 {
    453 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    454 
    455 	fsszone->fssz_zone = zone;
    456 	fsszone->fssz_rshares = zone->zone_shares;
    457 
    458 	if (fsspset->fssps_zones == NULL) {
    459 		/*
    460 		 * This will be the first fsszone for this fsspset
    461 		 */
    462 		fsszone->fssz_next = fsszone->fssz_prev = fsszone;
    463 		fsspset->fssps_zones = fsszone;
    464 	} else {
    465 		/*
    466 		 * Insert this fsszone to the doubly linked list.
    467 		 */
    468 		fsszone_t *fssz_head = fsspset->fssps_zones;
    469 
    470 		fsszone->fssz_next = fssz_head;
    471 		fsszone->fssz_prev = fssz_head->fssz_prev;
    472 		fssz_head->fssz_prev->fssz_next = fsszone;
    473 		fssz_head->fssz_prev = fsszone;
    474 		fsspset->fssps_zones = fsszone;
    475 	}
    476 }
    477 
    478 /*
    479  * The following routine removes a single fsszone structure from the doubly
    480  * linked list of zones active on the specified cpu partition.  Note that
    481  * global fsspsets_lock must be held in case this fsszone structure is the last
    482  * on the above mentioned list.  Also note that the fsszone structure is not
    483  * freed here, it is the responsibility of the caller to call kmem_free for it.
    484  */
    485 static void
    486 fss_remove_fsszone(fsspset_t *fsspset, fsszone_t *fsszone)
    487 {
    488 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    489 	ASSERT(fsszone->fssz_nproj == 0);
    490 	ASSERT(fsszone->fssz_shares == 0);
    491 	ASSERT(fsszone->fssz_runnable == 0);
    492 
    493 	if (fsszone->fssz_next != fsszone) {
    494 		/*
    495 		 * This is not the last zone in the list.
    496 		 */
    497 		fsszone->fssz_prev->fssz_next = fsszone->fssz_next;
    498 		fsszone->fssz_next->fssz_prev = fsszone->fssz_prev;
    499 		if (fsspset->fssps_zones == fsszone)
    500 			fsspset->fssps_zones = fsszone->fssz_next;
    501 	} else {
    502 		/*
    503 		 * This was the last zone active in this cpu partition.
    504 		 */
    505 		fsspset->fssps_zones = NULL;
    506 	}
    507 }
    508 
    509 /*
    510  * The following routine returns a pointer to the fssproj structure
    511  * which belongs to project kpj and cpu partition fsspset, if such structure
    512  * exists.
    513  */
    514 static fssproj_t *
    515 fss_find_fssproj(fsspset_t *fsspset, kproject_t *kpj)
    516 {
    517 	fssproj_t *fssproj;
    518 
    519 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    520 
    521 	if (fsspset->fssps_list != NULL) {
    522 		/*
    523 		 * There are projects running on this cpu partition already.
    524 		 * Try to find our project among them.
    525 		 */
    526 		fssproj = fsspset->fssps_list;
    527 		do {
    528 			if (fssproj->fssp_proj == kpj) {
    529 				ASSERT(fssproj->fssp_pset == fsspset);
    530 				return (fssproj);
    531 			}
    532 			fssproj = fssproj->fssp_next;
    533 		} while (fssproj != fsspset->fssps_list);
    534 	}
    535 	return (NULL);
    536 }
    537 
    538 /*
    539  * The following routine links new fssproj structure into doubly linked list
    540  * of projects running on the specified cpu partition.
    541  */
    542 static void
    543 fss_insert_fssproj(fsspset_t *fsspset, kproject_t *kpj, fsszone_t *fsszone,
    544     fssproj_t *fssproj)
    545 {
    546 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    547 
    548 	fssproj->fssp_pset = fsspset;
    549 	fssproj->fssp_proj = kpj;
    550 	fssproj->fssp_shares = kpj->kpj_shares;
    551 
    552 	fsspset->fssps_nproj++;
    553 
    554 	if (fsspset->fssps_list == NULL) {
    555 		/*
    556 		 * This will be the first fssproj for this fsspset
    557 		 */
    558 		fssproj->fssp_next = fssproj->fssp_prev = fssproj;
    559 		fsspset->fssps_list = fssproj;
    560 	} else {
    561 		/*
    562 		 * Insert this fssproj to the doubly linked list.
    563 		 */
    564 		fssproj_t *fssp_head = fsspset->fssps_list;
    565 
    566 		fssproj->fssp_next = fssp_head;
    567 		fssproj->fssp_prev = fssp_head->fssp_prev;
    568 		fssp_head->fssp_prev->fssp_next = fssproj;
    569 		fssp_head->fssp_prev = fssproj;
    570 		fsspset->fssps_list = fssproj;
    571 	}
    572 	fssproj->fssp_fsszone = fsszone;
    573 	fsszone->fssz_nproj++;
    574 	ASSERT(fsszone->fssz_nproj != 0);
    575 }
    576 
    577 /*
    578  * The following routine removes a single fssproj structure from the doubly
    579  * linked list of projects running on the specified cpu partition.  Note that
    580  * global fsspsets_lock must be held in case if this fssproj structure is the
    581  * last on the above mentioned list.  Also note that the fssproj structure is
    582  * not freed here, it is the responsibility of the caller to call kmem_free
    583  * for it.
    584  */
    585 static void
    586 fss_remove_fssproj(fsspset_t *fsspset, fssproj_t *fssproj)
    587 {
    588 	fsszone_t *fsszone;
    589 
    590 	ASSERT(MUTEX_HELD(&fsspsets_lock));
    591 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    592 	ASSERT(fssproj->fssp_runnable == 0);
    593 
    594 	fsspset->fssps_nproj--;
    595 
    596 	fsszone = fssproj->fssp_fsszone;
    597 	fsszone->fssz_nproj--;
    598 
    599 	if (fssproj->fssp_next != fssproj) {
    600 		/*
    601 		 * This is not the last part in the list.
    602 		 */
    603 		fssproj->fssp_prev->fssp_next = fssproj->fssp_next;
    604 		fssproj->fssp_next->fssp_prev = fssproj->fssp_prev;
    605 		if (fsspset->fssps_list == fssproj)
    606 			fsspset->fssps_list = fssproj->fssp_next;
    607 		if (fsszone->fssz_nproj == 0)
    608 			fss_remove_fsszone(fsspset, fsszone);
    609 	} else {
    610 		/*
    611 		 * This was the last project part running
    612 		 * at this cpu partition.
    613 		 */
    614 		fsspset->fssps_list = NULL;
    615 		ASSERT(fsspset->fssps_nproj == 0);
    616 		ASSERT(fsszone->fssz_nproj == 0);
    617 		fss_remove_fsszone(fsspset, fsszone);
    618 		fss_del_fsspset(fsspset);
    619 	}
    620 }
    621 
    622 static void
    623 fss_inactive(kthread_t *t)
    624 {
    625 	fssproc_t *fssproc;
    626 	fssproj_t *fssproj;
    627 	fsspset_t *fsspset;
    628 	fsszone_t *fsszone;
    629 
    630 	ASSERT(THREAD_LOCK_HELD(t));
    631 	fssproc = FSSPROC(t);
    632 	fssproj = FSSPROC2FSSPROJ(fssproc);
    633 	if (fssproj == NULL)	/* if this thread already exited */
    634 		return;
    635 	fsspset = FSSPROJ2FSSPSET(fssproj);
    636 	fsszone = fssproj->fssp_fsszone;
    637 	disp_lock_enter_high(&fsspset->fssps_displock);
    638 	ASSERT(fssproj->fssp_runnable > 0);
    639 	if (--fssproj->fssp_runnable == 0) {
    640 		fsszone->fssz_shares -= fssproj->fssp_shares;
    641 		if (--fsszone->fssz_runnable == 0)
    642 			fsspset->fssps_shares -= fsszone->fssz_rshares;
    643 	}
    644 	ASSERT(fssproc->fss_runnable == 1);
    645 	fssproc->fss_runnable = 0;
    646 	disp_lock_exit_high(&fsspset->fssps_displock);
    647 }
    648 
    649 static void
    650 fss_active(kthread_t *t)
    651 {
    652 	fssproc_t *fssproc;
    653 	fssproj_t *fssproj;
    654 	fsspset_t *fsspset;
    655 	fsszone_t *fsszone;
    656 
    657 	ASSERT(THREAD_LOCK_HELD(t));
    658 	fssproc = FSSPROC(t);
    659 	fssproj = FSSPROC2FSSPROJ(fssproc);
    660 	if (fssproj == NULL)	/* if this thread already exited */
    661 		return;
    662 	fsspset = FSSPROJ2FSSPSET(fssproj);
    663 	fsszone = fssproj->fssp_fsszone;
    664 	disp_lock_enter_high(&fsspset->fssps_displock);
    665 	if (++fssproj->fssp_runnable == 1) {
    666 		fsszone->fssz_shares += fssproj->fssp_shares;
    667 		if (++fsszone->fssz_runnable == 1)
    668 			fsspset->fssps_shares += fsszone->fssz_rshares;
    669 	}
    670 	ASSERT(fssproc->fss_runnable == 0);
    671 	fssproc->fss_runnable = 1;
    672 	disp_lock_exit_high(&fsspset->fssps_displock);
    673 }
    674 
    675 /*
    676  * Fair share scheduler initialization. Called by dispinit() at boot time.
    677  * We can ignore clparmsz argument since we know that the smallest possible
    678  * parameter buffer is big enough for us.
    679  */
    680 /*ARGSUSED*/
    681 static pri_t
    682 fss_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
    683 {
    684 	int i;
    685 
    686 	ASSERT(MUTEX_HELD(&cpu_lock));
    687 
    688 	fss_cid = cid;
    689 	fss_maxumdpri = minclsyspri - 1;
    690 	fss_maxglobpri = minclsyspri;
    691 	fss_minglobpri = 0;
    692 	fsspsets = kmem_zalloc(sizeof (fsspset_t) * max_ncpus, KM_SLEEP);
    693 
    694 	/*
    695 	 * Initialize the fssproc hash table.
    696 	 */
    697 	for (i = 0; i < FSS_LISTS; i++)
    698 		fss_listhead[i].fss_next = fss_listhead[i].fss_prev =
    699 		    &fss_listhead[i];
    700 
    701 	*clfuncspp = &fss_classfuncs;
    702 
    703 	/*
    704 	 * Fill in fss_nice_tick and fss_nice_decay arrays:
    705 	 * The cost of a tick is lower at positive nice values (so that it
    706 	 * will not increase its project's usage as much as normal) with 50%
    707 	 * drop at the maximum level and 50% increase at the minimum level.
    708 	 * The fsspri decay is slower at positive nice values.  fsspri values
    709 	 * of processes with negative nice levels must decay faster to receive
    710 	 * time slices more frequently than normal.
    711 	 */
    712 	for (i = 0; i < FSS_NICE_RANGE; i++) {
    713 		fss_nice_tick[i] = (FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2)
    714 		    - i)) / FSS_NICE_RANGE;
    715 		fss_nice_decay[i] = FSS_DECAY_MIN +
    716 		    ((FSS_DECAY_MAX - FSS_DECAY_MIN) * i) /
    717 		    (FSS_NICE_RANGE - 1);
    718 	}
    719 
    720 	return (fss_maxglobpri);
    721 }
    722 
    723 /*
    724  * Calculate the new cpupri based on the usage, the number of shares and
    725  * the number of active threads.  Reset the tick counter for this thread.
    726  */
    727 static void
    728 fss_newpri(fssproc_t *fssproc)
    729 {
    730 	kthread_t *tp;
    731 	fssproj_t *fssproj;
    732 	fsspset_t *fsspset;
    733 	fsszone_t *fsszone;
    734 	fsspri_t fsspri, maxfsspri;
    735 	pri_t invpri;
    736 	uint32_t ticks;
    737 
    738 	tp = fssproc->fss_tp;
    739 	ASSERT(tp != NULL);
    740 
    741 	if (tp->t_cid != fss_cid)
    742 		return;
    743 
    744 	ASSERT(THREAD_LOCK_HELD(tp));
    745 
    746 	fssproj = FSSPROC2FSSPROJ(fssproc);
    747 	fsszone = FSSPROJ2FSSZONE(fssproj);
    748 	if (fssproj == NULL)
    749 		/*
    750 		 * No need to change priority of exited threads.
    751 		 */
    752 		return;
    753 
    754 	fsspset = FSSPROJ2FSSPSET(fssproj);
    755 	disp_lock_enter_high(&fsspset->fssps_displock);
    756 
    757 	if (fssproj->fssp_shares == 0 || fsszone->fssz_rshares == 0) {
    758 		/*
    759 		 * Special case: threads with no shares.
    760 		 */
    761 		fssproc->fss_umdpri = fss_minglobpri;
    762 		fssproc->fss_ticks = 0;
    763 		disp_lock_exit_high(&fsspset->fssps_displock);
    764 		return;
    765 	}
    766 
    767 	/*
    768 	 * fsspri += shusage * nrunnable * ticks
    769 	 */
    770 	ticks = fssproc->fss_ticks;
    771 	fssproc->fss_ticks = 0;
    772 	fsspri = fssproc->fss_fsspri;
    773 	fsspri += fssproj->fssp_shusage * fssproj->fssp_runnable * ticks;
    774 	fssproc->fss_fsspri = fsspri;
    775 
    776 	if (fsspri < fss_maxumdpri)
    777 		fsspri = fss_maxumdpri;	/* so that maxfsspri is != 0 */
    778 
    779 	/*
    780 	 * The general priority formula:
    781 	 *
    782 	 *			(fsspri * umdprirange)
    783 	 *   pri = maxumdpri - ------------------------
    784 	 *				maxfsspri
    785 	 *
    786 	 * If this thread's fsspri is greater than the previous largest
    787 	 * fsspri, then record it as the new high and priority for this
    788 	 * thread will be one (the lowest priority assigned to a thread
    789 	 * that has non-zero shares).
    790 	 * Note that this formula cannot produce out of bounds priority
    791 	 * values; if it is changed, additional checks may need  to  be
    792 	 * added.
    793 	 */
    794 	maxfsspri = fsspset->fssps_maxfsspri;
    795 	if (fsspri >= maxfsspri) {
    796 		fsspset->fssps_maxfsspri = fsspri;
    797 		disp_lock_exit_high(&fsspset->fssps_displock);
    798 		fssproc->fss_umdpri = 1;
    799 	} else {
    800 		disp_lock_exit_high(&fsspset->fssps_displock);
    801 		invpri = (fsspri * (fss_maxumdpri - 1)) / maxfsspri;
    802 		fssproc->fss_umdpri = fss_maxumdpri - invpri;
    803 	}
    804 }
    805 
    806 /*
    807  * Decays usages of all running projects and resets their tick counters.
    808  * Called once per second from fss_update() after updating priorities.
    809  */
    810 static void
    811 fss_decay_usage()
    812 {
    813 	uint32_t zone_ext_shares, zone_int_shares;
    814 	uint32_t kpj_shares, pset_shares;
    815 	fsspset_t *fsspset;
    816 	fssproj_t *fssproj;
    817 	fsszone_t *fsszone;
    818 	fsspri_t maxfsspri;
    819 	int psetid;
    820 
    821 	mutex_enter(&fsspsets_lock);
    822 	/*
    823 	 * Go through all active processor sets and decay usages of projects
    824 	 * running on them.
    825 	 */
    826 	for (psetid = 0; psetid < max_ncpus; psetid++) {
    827 		fsspset = &fsspsets[psetid];
    828 		mutex_enter(&fsspset->fssps_lock);
    829 
    830 		if (fsspset->fssps_cpupart == NULL ||
    831 		    (fssproj = fsspset->fssps_list) == NULL) {
    832 			mutex_exit(&fsspset->fssps_lock);
    833 			continue;
    834 		}
    835 
    836 		/*
    837 		 * Decay maxfsspri for this cpu partition with the
    838 		 * fastest possible decay rate.
    839 		 */
    840 		disp_lock_enter(&fsspset->fssps_displock);
    841 
    842 		maxfsspri = (fsspset->fssps_maxfsspri *
    843 		    fss_nice_decay[NZERO]) / FSS_DECAY_BASE;
    844 		if (maxfsspri < fss_maxumdpri)
    845 			maxfsspri = fss_maxumdpri;
    846 		fsspset->fssps_maxfsspri = maxfsspri;
    847 
    848 		do {
    849 			/*
    850 			 * Decay usage for each project running on
    851 			 * this cpu partition.
    852 			 */
    853 			fssproj->fssp_usage =
    854 			    (fssproj->fssp_usage * FSS_DECAY_USG) /
    855 			    FSS_DECAY_BASE + fssproj->fssp_ticks;
    856 			fssproj->fssp_ticks = 0;
    857 
    858 			fsszone = fssproj->fssp_fsszone;
    859 			/*
    860 			 * Readjust the project's number of shares if it has
    861 			 * changed since we checked it last time.
    862 			 */
    863 			kpj_shares = fssproj->fssp_proj->kpj_shares;
    864 			if (fssproj->fssp_shares != kpj_shares) {
    865 				if (fssproj->fssp_runnable != 0) {
    866 					fsszone->fssz_shares -=
    867 					    fssproj->fssp_shares;
    868 					fsszone->fssz_shares += kpj_shares;
    869 				}
    870 				fssproj->fssp_shares = kpj_shares;
    871 			}
    872 
    873 			/*
    874 			 * Readjust the zone's number of shares if it
    875 			 * has changed since we checked it last time.
    876 			 */
    877 			zone_ext_shares = fsszone->fssz_zone->zone_shares;
    878 			if (fsszone->fssz_rshares != zone_ext_shares) {
    879 				if (fsszone->fssz_runnable != 0) {
    880 					fsspset->fssps_shares -=
    881 					    fsszone->fssz_rshares;
    882 					fsspset->fssps_shares +=
    883 					    zone_ext_shares;
    884 				}
    885 				fsszone->fssz_rshares = zone_ext_shares;
    886 			}
    887 			zone_int_shares = fsszone->fssz_shares;
    888 			pset_shares = fsspset->fssps_shares;
    889 			/*
    890 			 * Calculate fssp_shusage value to be used
    891 			 * for fsspri increments for the next second.
    892 			 */
    893 			if (kpj_shares == 0 || zone_ext_shares == 0) {
    894 				fssproj->fssp_shusage = 0;
    895 			} else if (FSSPROJ2KPROJ(fssproj) == proj0p) {
    896 				/*
    897 				 * Project 0 in the global zone has 50%
    898 				 * of its zone.
    899 				 */
    900 				fssproj->fssp_shusage = (fssproj->fssp_usage *
    901 				    zone_int_shares * zone_int_shares) /
    902 				    (zone_ext_shares * zone_ext_shares);
    903 			} else {
    904 				/*
    905 				 * Thread's priority is based on its project's
    906 				 * normalized usage (shusage) value which gets
    907 				 * calculated this way:
    908 				 *
    909 				 *	   pset_shares^2    zone_int_shares^2
    910 				 * usage * ------------- * ------------------
    911 				 *	   kpj_shares^2	    zone_ext_shares^2
    912 				 *
    913 				 * Where zone_int_shares is the sum of shares
    914 				 * of all active projects within the zone (and
    915 				 * the pset), and zone_ext_shares is the number
    916 				 * of zone shares (ie, zone.cpu-shares).
    917 				 *
    918 				 * If there is only one zone active on the pset
    919 				 * the above reduces to:
    920 				 *
    921 				 * 			zone_int_shares^2
    922 				 * shusage = usage * ---------------------
    923 				 * 			kpj_shares^2
    924 				 *
    925 				 * If there's only one project active in the
    926 				 * zone this formula reduces to:
    927 				 *
    928 				 *			pset_shares^2
    929 				 * shusage = usage * ----------------------
    930 				 *			zone_ext_shares^2
    931 				 */
    932 				fssproj->fssp_shusage = fssproj->fssp_usage *
    933 				    pset_shares * zone_int_shares;
    934 				fssproj->fssp_shusage /=
    935 				    kpj_shares * zone_ext_shares;
    936 				fssproj->fssp_shusage *=
    937 				    pset_shares * zone_int_shares;
    938 				fssproj->fssp_shusage /=
    939 				    kpj_shares * zone_ext_shares;
    940 			}
    941 			fssproj = fssproj->fssp_next;
    942 		} while (fssproj != fsspset->fssps_list);
    943 
    944 		disp_lock_exit(&fsspset->fssps_displock);
    945 		mutex_exit(&fsspset->fssps_lock);
    946 	}
    947 	mutex_exit(&fsspsets_lock);
    948 }
    949 
    950 static void
    951 fss_change_priority(kthread_t *t, fssproc_t *fssproc)
    952 {
    953 	pri_t new_pri;
    954 
    955 	ASSERT(THREAD_LOCK_HELD(t));
    956 	new_pri = fssproc->fss_umdpri;
    957 	ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri);
    958 
    959 	t->t_cpri = fssproc->fss_upri;
    960 	fssproc->fss_flags &= ~FSSRESTORE;
    961 	if (t == curthread || t->t_state == TS_ONPROC) {
    962 		/*
    963 		 * curthread is always onproc
    964 		 */
    965 		cpu_t *cp = t->t_disp_queue->disp_cpu;
    966 		THREAD_CHANGE_PRI(t, new_pri);
    967 		if (t == cp->cpu_dispthread)
    968 			cp->cpu_dispatch_pri = DISP_PRIO(t);
    969 		if (DISP_MUST_SURRENDER(t)) {
    970 			fssproc->fss_flags |= FSSBACKQ;
    971 			cpu_surrender(t);
    972 		} else {
    973 			fssproc->fss_timeleft = fss_quantum;
    974 		}
    975 	} else {
    976 		/*
    977 		 * When the priority of a thread is changed, it may be
    978 		 * necessary to adjust its position on a sleep queue or
    979 		 * dispatch queue.  The function thread_change_pri accomplishes
    980 		 * this.
    981 		 */
    982 		if (thread_change_pri(t, new_pri, 0)) {
    983 			/*
    984 			 * The thread was on a run queue.
    985 			 */
    986 			fssproc->fss_timeleft = fss_quantum;
    987 		} else {
    988 			fssproc->fss_flags |= FSSBACKQ;
    989 		}
    990 	}
    991 }
    992 
    993 /*
    994  * Update priorities of all fair-sharing threads that are currently runnable
    995  * at a user mode priority based on the number of shares and current usage.
    996  * Called once per second via timeout which we reset here.
    997  *
    998  * There are several lists of fair-sharing threads broken up by a hash on the
    999  * thread pointer.  Each list has its own lock.  This avoids blocking all
   1000  * fss_enterclass, fss_fork, and fss_exitclass operations while fss_update runs.
   1001  * fss_update traverses each list in turn.
   1002  */
   1003 static void
   1004 fss_update(void *arg)
   1005 {
   1006 	int i;
   1007 	int new_marker = -1;
   1008 	static int fss_update_marker;
   1009 
   1010 	/*
   1011 	 * Decay and update usages for all projects.
   1012 	 */
   1013 	fss_decay_usage();
   1014 
   1015 	/*
   1016 	 * Start with the fss_update_marker list, then do the rest.
   1017 	 */
   1018 	i = fss_update_marker;
   1019 
   1020 	/*
   1021 	 * Go around all threads, set new priorities and decay
   1022 	 * per-thread CPU usages.
   1023 	 */
   1024 	do {
   1025 		/*
   1026 		 * If this is the first list after the current marker to have
   1027 		 * threads with priorities updates, advance the marker to this
   1028 		 * list for the next time fss_update runs.
   1029 		 */
   1030 		if (fss_update_list(i) &&
   1031 		    new_marker == -1 && i != fss_update_marker)
   1032 			new_marker = i;
   1033 	} while ((i = FSS_LIST_NEXT(i)) != fss_update_marker);
   1034 
   1035 	/*
   1036 	 * Advance marker for the next fss_update call
   1037 	 */
   1038 	if (new_marker != -1)
   1039 		fss_update_marker = new_marker;
   1040 
   1041 	(void) timeout(fss_update, arg, hz);
   1042 }
   1043 
   1044 /*
   1045  * Updates priority for a list of threads.  Returns 1 if the priority of one
   1046  * of the threads was actually updated, 0 if none were for various reasons
   1047  * (thread is no longer in the FSS class, is not runnable, has the preemption
   1048  * control no-preempt bit set, etc.)
   1049  */
   1050 static int
   1051 fss_update_list(int i)
   1052 {
   1053 	fssproc_t *fssproc;
   1054 	fssproj_t *fssproj;
   1055 	fsspri_t fsspri;
   1056 	kthread_t *t;
   1057 	int updated = 0;
   1058 
   1059 	mutex_enter(&fss_listlock[i]);
   1060 	for (fssproc = fss_listhead[i].fss_next; fssproc != &fss_listhead[i];
   1061 	    fssproc = fssproc->fss_next) {
   1062 		t = fssproc->fss_tp;
   1063 		/*
   1064 		 * Lock the thread and verify the state.
   1065 		 */
   1066 		thread_lock(t);
   1067 		/*
   1068 		 * Skip the thread if it is no longer in the FSS class or
   1069 		 * is running with kernel mode priority.
   1070 		 */
   1071 		if (t->t_cid != fss_cid)
   1072 			goto next;
   1073 		if ((fssproc->fss_flags & FSSKPRI) != 0)
   1074 			goto next;
   1075 
   1076 		fssproj = FSSPROC2FSSPROJ(fssproc);
   1077 		if (fssproj == NULL)
   1078 			goto next;
   1079 		if (fssproj->fssp_shares != 0) {
   1080 			/*
   1081 			 * Decay fsspri value.
   1082 			 */
   1083 			fsspri = fssproc->fss_fsspri;
   1084 			fsspri = (fsspri * fss_nice_decay[fssproc->fss_nice]) /
   1085 			    FSS_DECAY_BASE;
   1086 			fssproc->fss_fsspri = fsspri;
   1087 		}
   1088 
   1089 		if (t->t_schedctl && schedctl_get_nopreempt(t))
   1090 			goto next;
   1091 		if (t->t_state != TS_RUN && t->t_state != TS_WAIT) {
   1092 			/*
   1093 			 * Make next syscall/trap call fss_trapret
   1094 			 */
   1095 			t->t_trapret = 1;
   1096 			aston(t);
   1097 			goto next;
   1098 		}
   1099 		fss_newpri(fssproc);
   1100 		updated = 1;
   1101 
   1102 		/*
   1103 		 * Only dequeue the thread if it needs to be moved; otherwise
   1104 		 * it should just round-robin here.
   1105 		 */
   1106 		if (t->t_pri != fssproc->fss_umdpri)
   1107 			fss_change_priority(t, fssproc);
   1108 next:
   1109 		thread_unlock(t);
   1110 	}
   1111 	mutex_exit(&fss_listlock[i]);
   1112 	return (updated);
   1113 }
   1114 
   1115 /*ARGSUSED*/
   1116 static int
   1117 fss_admin(caddr_t uaddr