Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/pool.h>
     28 #include <sys/pool_impl.h>
     29 #include <sys/pool_pset.h>
     30 #include <sys/id_space.h>
     31 #include <sys/mutex.h>
     32 #include <sys/nvpair.h>
     33 #include <sys/cpuvar.h>
     34 #include <sys/errno.h>
     35 #include <sys/cmn_err.h>
     36 #include <sys/systm.h>
     37 #include <sys/proc.h>
     38 #include <sys/fss.h>
     39 #include <sys/class.h>
     40 #include <sys/exacct.h>
     41 #include <sys/utsname.h>
     42 #include <sys/procset.h>
     43 #include <sys/atomic.h>
     44 #include <sys/zone.h>
     45 #include <sys/policy.h>
     46 #include <sys/schedctl.h>
     47 
     48 /*
     49  * RESOURCE POOLS
     50  *
     51  * The resource pools facility brings together process-bindable resources into
     52  * a common abstraction called a pool. Processor sets and other entities can
     53  * be configured, grouped, and labelled such that workload components can be
     54  * associated with a subset of a system's total resources.
     55  *
     56  * When disabled, the pools facility is "invisible".  All processes belong
     57  * to the same pool (pool_default), and processor sets can be managed through
     58  * the old pset() system call.  When enabled, processor sets can only be
     59  * managed via the pools facility.  New pools can be created and associated
     60  * with processor sets.  Processes can be bound to pools which have non-empty
     61  * resource sets.
     62  *
     63  * Locking: pool_lock() protects global pools state and must be called
     64  * before modifying the configuration, or when taking a snapshot of the
     65  * configuration.  If pool_lock_intr() is used, the operation may be
     66  * interrupted by a signal or a request.
     67  *
     68  * To prevent processes from being rebound between pools while they are
     69  * in the middle of an operation which affects resource set bindings, such
     70  * operations must be surrounded by calls to pool_barrier_enter() and
     71  * pool_barrier_exit().  This mechanism guarantees that such processes will
     72  * be stopped either at the beginning or at the end of the barrier so that
     73  * the rebind operation can atomically bind the process and its threads
     74  * to new resource sets, and then let the process run again.
     75  *
     76  * Lock ordering with respect to other locks is as follows:
     77  *
     78  * 	pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
     79  *
     80  * Most static and global variables defined in this file are protected
     81  * by calling pool_lock().
     82  *
     83  * The operation that binds tasks and projects to pools is atomic.  That is,
     84  * either all processes in a given task or a project will be bound to a
     85  * new pool, or (in case of an error) they will be all left bound to the
     86  * old pool. Processes in a given task or a given project can only be bound to
     87  * different pools if they were rebound individually one by one as single
     88  * processes.  Threads or LWPs of the same process do not have pool bindings,
     89  * and are bound to the same resource sets associated with the resource pool
     90  * of that process.
     91  *
     92  * The following picture shows one possible pool configuration with three
     93  * pools and three processor sets.  Note that processor set "foo" is not
     94  * associated with any pools and therefore cannot have any processes
     95  * bound to it.  Two pools (default and foo) are associated with the
     96  * same processor set (default).  Also, note that processes in Task 2
     97  * are bound to different pools.
     98  *
     99  *
    100  *							       Processor Sets
    101  *								+---------+
    102  *		       +--------------+========================>| default |
    103  *		      a|	      |				+---------+
    104  *		      s|	      |				    ||
    105  *		      s|	      |				+---------+
    106  *		      o|	      |				|   foo   |
    107  *		      c|	      |				+---------+
    108  *		      i|	      |				    ||
    109  *		      a|	      |				+---------+
    110  *		      t|	      |			+------>|   bar   |
    111  *		      e|	      |			|	+---------+
    112  *                    d|              |                 |
    113  *                     |              |                 |
    114  *	       +---------+      +---------+      +---------+
    115  *     Pools   | default |======|   foo   |======|   bar   |
    116  *	       +---------+      +---------+      +---------+
    117  *	           @  @            @              @ @   @
    118  *                b|  |            |              | |   |
    119  *                o|  |            |              | |   |
    120  *                u|  +-----+      |      +-------+ |   +---+
    121  *                n|        |      |      |         |       |
    122  *            ....d|........|......|......|.........|.......|....
    123  *            :    |   ::   |      |      |    ::   |       |   :
    124  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
    125  *  Processes :  | p | :: | p |  | p |  | p |  :: | p |...| p | :
    126  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
    127  *            :........::......................::...............:
    128  *              Task 1            Task 2              Task N
    129  *                 |                 |                  |
    130  *                 |                 |                  |
    131  *                 |  +-----------+  |             +-----------+
    132  *                 +--| Project 1 |--+             | Project N |
    133  *                    +-----------+                +-----------+
    134  *
    135  * This is just an illustration of relationships between processes, tasks,
    136  * projects, pools, and processor sets. New types of resource sets will be
    137  * added in the future.
    138  */
    139 
    140 pool_t		*pool_default;	/* default pool which always exists */
    141 int		pool_count;	/* number of pools created on this system */
    142 int		pool_state;	/* pools state -- enabled/disabled */
    143 void		*pool_buf;	/* pre-commit snapshot of the pools state */
    144 size_t		pool_bufsz;	/* size of pool_buf */
    145 static hrtime_t	pool_pool_mod;	/* last modification time for pools */
    146 static hrtime_t	pool_sys_mod;	/* last modification time for system */
    147 static nvlist_t	*pool_sys_prop;	/* system properties */
    148 static id_space_t *pool_ids;	/* pool ID space */
    149 static list_t	pool_list;	/* doubly-linked list of pools */
    150 static kmutex_t		pool_mutex;		/* protects pool_busy_* */
    151 static kcondvar_t	pool_busy_cv;		/* waiting for "pool_lock" */
    152 static kthread_t	*pool_busy_thread;	/* thread holding "pool_lock" */
    153 static kmutex_t		pool_barrier_lock;	/* synch. with pool_barrier_* */
    154 static kcondvar_t	pool_barrier_cv;	/* synch. with pool_barrier_* */
    155 static int		pool_barrier_count;	/* synch. with pool_barrier_* */
    156 
    157 /*
    158  * Boot-time pool initialization.
    159  */
    160 void
    161 pool_init(void)
    162 {
    163 	pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);
    164 
    165 	/*
    166 	 * Initialize default pool.
    167 	 */
    168 	pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
    169 	pool_default->pool_id = POOL_DEFAULT;
    170 	list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
    171 	list_insert_head(&pool_list, pool_default);
    172 
    173 	/*
    174 	 * Initialize plugins for resource sets.
    175 	 */
    176 	pool_pset_init();
    177 	pool_count = 1;
    178 	p0.p_pool = pool_default;
    179 	global_zone->zone_pool = pool_default;
    180 	pool_default->pool_ref = 1;
    181 }
    182 
    183 /*
    184  * Synchronization routines.
    185  *
    186  * pool_lock is only called from syscall-level routines (processor_bind(),
    187  * pset_*(), and /dev/pool ioctls).  The pool "lock" may be held for long
    188  * periods of time, including across sleeping operations, so we allow its
    189  * acquisition to be interruptible.
    190  *
    191  * The current thread that owns the "lock" is stored in the variable
    192  * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
    193  */
    194 void
    195 pool_lock(void)
    196 {
    197 	mutex_enter(&pool_mutex);
    198 	ASSERT(!pool_lock_held());
    199 	while (pool_busy_thread != NULL)
    200 		cv_wait(&pool_busy_cv, &pool_mutex);
    201 	pool_busy_thread = curthread;
    202 	mutex_exit(&pool_mutex);
    203 }
    204 
    205 int
    206 pool_lock_intr(void)
    207 {
    208 	mutex_enter(&pool_mutex);
    209 	ASSERT(!pool_lock_held());
    210 	while (pool_busy_thread != NULL) {
    211 		if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
    212 			cv_signal(&pool_busy_cv);
    213 			mutex_exit(&pool_mutex);
    214 			return (1);
    215 		}
    216 	}
    217 	pool_busy_thread = curthread;
    218 	mutex_exit(&pool_mutex);
    219 	return (0);
    220 }
    221 
    222 int
    223 pool_lock_held(void)
    224 {
    225 	return (pool_busy_thread == curthread);
    226 }
    227 
    228 void
    229 pool_unlock(void)
    230 {
    231 	mutex_enter(&pool_mutex);
    232 	ASSERT(pool_lock_held());
    233 	pool_busy_thread = NULL;
    234 	cv_signal(&pool_busy_cv);
    235 	mutex_exit(&pool_mutex);
    236 }
    237 
    238 /*
    239  * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
    240  * with pool_do_bind().
    241  *
    242  * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
    243  * operations which modify pool or pset associations.  They can be called
    244  * while the process is multi-threaded.  In the common case, when current
    245  * process is not being rebound (PBWAIT flag is not set), these functions
    246  * will be just incrementing and decrementing reference counts.
    247  */
    248 void
    249 pool_barrier_enter(void)
    250 {
    251 	proc_t *p = curproc;
    252 
    253 	ASSERT(MUTEX_HELD(&p->p_lock));
    254 	while (p->p_poolflag & PBWAIT)
    255 		cv_wait(&p->p_poolcv, &p->p_lock);
    256 	p->p_poolcnt++;
    257 }
    258 
    259 void
    260 pool_barrier_exit(void)
    261 {
    262 	proc_t *p = curproc;
    263 
    264 	ASSERT(MUTEX_HELD(&p->p_lock));
    265 	ASSERT(p->p_poolcnt > 0);
    266 	p->p_poolcnt--;
    267 	if (p->p_poolflag & PBWAIT) {
    268 		mutex_enter(&pool_barrier_lock);
    269 		ASSERT(pool_barrier_count > 0);
    270 		pool_barrier_count--;
    271 		if (pool_barrier_count == 0)
    272 			cv_signal(&pool_barrier_cv);
    273 		mutex_exit(&pool_barrier_lock);
    274 		while (p->p_poolflag & PBWAIT)
    275 			cv_wait(&p->p_poolcv, &p->p_lock);
    276 	}
    277 }
    278 
    279 /*
    280  * Enable pools facility.
    281  */
    282 static int
    283 pool_enable(void)
    284 {
    285 	int ret;
    286 
    287 	ASSERT(pool_lock_held());
    288 	ASSERT(pool_count == 1);
    289 
    290 	ret = pool_pset_enable();
    291 	if (ret != 0)
    292 		return (ret);
    293 	(void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP);
    294 	(void) nvlist_add_string(pool_sys_prop, "system.name",
    295 	    "default");
    296 	(void) nvlist_add_string(pool_sys_prop, "system.comment", "");
    297 	(void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
    298 	(void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
    299 	(void) nvlist_add_string(pool_sys_prop, "system.poold.objectives",
    300 	    "wt-load");
    301 
    302 	(void) nvlist_alloc(&pool_default->pool_props,
    303 	    NV_UNIQUE_NAME, KM_SLEEP);
    304 	(void) nvlist_add_string(pool_default->pool_props,
    305 	    "pool.name", "pool_default");
    306 	(void) nvlist_add_string(pool_default->pool_props, "pool.comment", "");
    307 	(void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1);
    308 	(void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1);
    309 	(void) nvlist_add_int64(pool_default->pool_props,
    310 	    "pool.importance", 1);
    311 	(void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id",
    312 	    pool_default->pool_id);
    313 
    314 	pool_sys_mod = pool_pool_mod = gethrtime();
    315 
    316 	return (ret);
    317 }
    318 
    319 /*
    320  * Disable pools facility.
    321  */
    322 static int
    323 pool_disable(void)
    324 {
    325 	int ret;
    326 
    327 	ASSERT(pool_lock_held());
    328 
    329 	if (pool_count > 1)	/* must destroy all pools first */
    330 		return (EBUSY);
    331 
    332 	ret = pool_pset_disable();
    333 	if (ret != 0)
    334 		return (ret);
    335 	if (pool_sys_prop != NULL) {
    336 		nvlist_free(pool_sys_prop);
    337 		pool_sys_prop = NULL;
    338 	}
    339 	if (pool_default->pool_props != NULL) {
    340 		nvlist_free(pool_default->pool_props);
    341 		pool_default->pool_props = NULL;
    342 	}
    343 	return (0);
    344 }
    345 
    346 pool_t *
    347 pool_lookup_pool_by_name(char *name)
    348 {
    349 	pool_t *pool = pool_default;
    350 	char *p;
    351 
    352 	ASSERT(pool_lock_held());
    353 	for (pool = list_head(&pool_list); pool;
    354 	    pool = list_next(&pool_list, pool)) {
    355 		if (nvlist_lookup_string(pool->pool_props,
    356 		    "pool.name", &p) == 0 && strcmp(name, p) == 0)
    357 			return (pool);
    358 	}
    359 	return (NULL);
    360 }
    361 
    362 pool_t *
    363 pool_lookup_pool_by_id(poolid_t poolid)
    364 {
    365 	pool_t *pool = pool_default;
    366 
    367 	ASSERT(pool_lock_held());
    368 	for (pool = list_head(&pool_list); pool;
    369 	    pool = list_next(&pool_list, pool)) {
    370 		if (pool->pool_id == poolid)
    371 			return (pool);
    372 	}
    373 	return (NULL);
    374 }
    375 
    376 /*
    377  * Create new pool, associate it with default resource sets, and give
    378  * it a temporary name.
    379  */
    380 static int
    381 pool_pool_create(poolid_t *poolid)
    382 {
    383 	pool_t *pool;
    384 	char pool_name[40];
    385 
    386 	ASSERT(pool_lock_held());
    387 
    388 	pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
    389 	pool->pool_id = *poolid = id_alloc(pool_ids);
    390 	pool->pool_pset = pool_pset_default;
    391 	pool_pset_default->pset_npools++;
    392 	list_insert_tail(&pool_list, pool);
    393 	(void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP);
    394 	(void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id);
    395 	(void) nvlist_add_byte(pool->pool_props, "pool.default", 0);
    396 	pool_pool_mod = gethrtime();
    397 	(void) snprintf(pool_name, sizeof (pool_name), "pool_%lld",
    398 	    pool_pool_mod);
    399 	(void) nvlist_add_string(pool->pool_props, "pool.name", pool_name);
    400 	pool_count++;
    401 	return (0);
    402 }
    403 
    404 struct destroy_zone_arg {
    405 	pool_t *old;
    406 	pool_t *new;
    407 };
    408 
    409 /*
    410  * Update pool pointers for zones that are currently bound to pool "old"
    411  * to be bound to pool "new".
    412  */
    413 static int
    414 pool_destroy_zone_cb(zone_t *zone, void *arg)
    415 {
    416 	struct destroy_zone_arg *dza = arg;
    417 
    418 	ASSERT(pool_lock_held());
    419 	ASSERT(MUTEX_HELD(&cpu_lock));
    420 
    421 	if (zone_pool_get(zone) == dza->old)
    422 		zone_pool_set(zone, dza->new);
    423 	return (0);
    424 }
    425 
    426 /*
    427  * Destroy specified pool, and rebind all processes in it
    428  * to the default pool.
    429  */
    430 static int
    431 pool_pool_destroy(poolid_t poolid)
    432 {
    433 	pool_t *pool;
    434 	int ret;
    435 
    436 	ASSERT(pool_lock_held());
    437 
    438 	if (poolid == POOL_DEFAULT)
    439 		return (EINVAL);
    440 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
    441 		return (ESRCH);
    442 	ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL);
    443 	if (ret == 0) {
    444 		struct destroy_zone_arg dzarg;
    445 
    446 		dzarg.old = pool;
    447 		dzarg.new = pool_default;
    448 		mutex_enter(&cpu_lock);
    449 		ret = zone_walk(pool_destroy_zone_cb, &dzarg);
    450 		mutex_exit(&cpu_lock);
    451 		ASSERT(ret == 0);
    452 		ASSERT(pool->pool_ref == 0);
    453 		(void) nvlist_free(pool->pool_props);
    454 		id_free(pool_ids, pool->pool_id);
    455 		pool->pool_pset->pset_npools--;
    456 		list_remove(&pool_list, pool);
    457 		pool_count--;
    458 		pool_pool_mod = gethrtime();
    459 		kmem_free(pool, sizeof (pool_t));
    460 	}
    461 	return (ret);
    462 }
    463 
    464 /*
    465  * Create new pool or resource set.
    466  */
    467 int
    468 pool_create(int class, int subclass, id_t *id)
    469 {
    470 	int ret;
    471 
    472 	ASSERT(pool_lock_held());
    473 	if (pool_state == POOL_DISABLED)
    474 		return (ENOTACTIVE);
    475 	switch (class) {
    476 	case PEC_POOL:
    477 		ret = pool_pool_create((poolid_t *)id);
    478 		break;
    479 	case PEC_RES_COMP:
    480 		switch (subclass) {
    481 		case PREC_PSET:
    482 			ret = pool_pset_create((psetid_t *)id);
    483 			break;
    484 		default:
    485 			ret = EINVAL;
    486 		}
    487 		break;
    488 	case PEC_RES_AGG:
    489 		ret = ENOTSUP;
    490 		break;
    491 	default:
    492 		ret = EINVAL;
    493 	}
    494 	return (ret);
    495 }
    496 
    497 /*
    498  * Destroy an existing pool or resource set.
    499  */
    500 int
    501 pool_destroy(int class, int subclass, id_t id)
    502 {
    503 	int ret;
    504 
    505 	ASSERT(pool_lock_held());
    506 	if (pool_state == POOL_DISABLED)
    507 		return (ENOTACTIVE);
    508 	switch (class) {
    509 	case PEC_POOL:
    510 		ret = pool_pool_destroy((poolid_t)id);
    511 		break;
    512 	case PEC_RES_COMP:
    513 		switch (subclass) {
    514 		case PREC_PSET:
    515 			ret = pool_pset_destroy((psetid_t)id);
    516 			break;
    517 		default:
    518 			ret = EINVAL;
    519 		}
    520 		break;
    521 	case PEC_RES_AGG:
    522 		ret = ENOTSUP;
    523 		break;
    524 	default:
    525 		ret = EINVAL;
    526 	}
    527 	return (ret);
    528 }
    529 
    530 /*
    531  * Enable or disable pools.
    532  */
    533 int
    534 pool_status(int status)
    535 {
    536 	int ret = 0;
    537 
    538 	ASSERT(pool_lock_held());
    539 
    540 	if (pool_state == status)
    541 		return (0);
    542 	switch (status) {
    543 	case POOL_ENABLED:
    544 		ret = pool_enable();
    545 		if (ret != 0)
    546 			return (ret);
    547 		pool_state = POOL_ENABLED;
    548 		break;
    549 	case POOL_DISABLED:
    550 		ret = pool_disable();
    551 		if (ret != 0)
    552 			return (ret);
    553 		pool_state = POOL_DISABLED;
    554 		break;
    555 	default:
    556 		ret = EINVAL;
    557 	}
    558 	return (ret);
    559 }
    560 
    561 /*
    562  * Associate pool with resource set.
    563  */
    564 int
    565 pool_assoc(poolid_t poolid, int idtype, id_t id)
    566 {
    567 	int ret;
    568 
    569 	ASSERT(pool_lock_held());
    570 	if (pool_state == POOL_DISABLED)
    571 		return (ENOTACTIVE);
    572 	switch (idtype) {
    573 	case PREC_PSET:
    574 		ret = pool_pset_assoc(poolid, (psetid_t)id);
    575 		break;
    576 	default:
    577 		ret = EINVAL;
    578 	}
    579 	if (ret == 0)
    580 		pool_pool_mod = gethrtime();
    581 	return (ret);
    582 }
    583 
    584 /*
    585  * Disassociate resource set from pool.
    586  */
    587 int
    588 pool_dissoc(poolid_t poolid, int idtype)
    589 {
    590 	int ret;
    591 
    592 	ASSERT(pool_lock_held());
    593 	if (pool_state == POOL_DISABLED)
    594 		return (ENOTACTIVE);
    595 	switch (idtype) {
    596 	case PREC_PSET:
    597 		ret = pool_pset_assoc(poolid, PS_NONE);
    598 		break;
    599 	default:
    600 		ret = EINVAL;
    601 	}
    602 	if (ret == 0)
    603 		pool_pool_mod = gethrtime();
    604 	return (ret);
    605 }
    606 
    607 /*
    608  * Transfer specified quantity of resources between resource sets.
    609  */
    610 /*ARGSUSED*/
    611 int
    612 pool_transfer(int type, id_t src, id_t dst, uint64_t qty)
    613 {
    614 	int ret = EINVAL;
    615 	return (ret);
    616 }
    617 
    618 /*
    619  * Transfer resources specified by their IDs between resource sets.
    620  */
    621 int
    622 pool_xtransfer(int type, id_t src, id_t dst, uint_t size, id_t *ids)
    623 {
    624 	int ret;
    625 
    626 	ASSERT(pool_lock_held());
    627 	if (pool_state == POOL_DISABLED)
    628 		return (ENOTACTIVE);
    629 	switch (type) {
    630 	case PREC_PSET:
    631 		ret = pool_pset_xtransfer((psetid_t)src, (psetid_t)dst,
    632 		    size, ids);
    633 		break;
    634 	default:
    635 		ret = EINVAL;
    636 	}
    637 	return (ret);
    638 }
    639 
    640 /*
    641  * Bind processes to pools.
    642  */
    643 int
    644 pool_bind(poolid_t poolid, idtype_t idtype, id_t id)
    645 {
    646 	pool_t *pool;
    647 
    648 	ASSERT(pool_lock_held());
    649 
    650 	if (pool_state == POOL_DISABLED)
    651 		return (ENOTACTIVE);
    652 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
    653 		return (ESRCH);
    654 
    655 	switch (idtype) {
    656 	case P_PID:
    657 	case P_TASKID:
    658 	case P_PROJID:
    659 	case P_ZONEID:
    660 		break;
    661 	default:
    662 		return (EINVAL);
    663 	}
    664 	return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL));
    665 }
    666 
    667 /*
    668  * Query pool binding of the specifed process.
    669  */
    670 int
    671 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid)
    672 {
    673 	proc_t *p;
    674 
    675 	if (idtype != P_PID)
    676 		return (ENOTSUP);
    677 	if (id == P_MYID)
    678 		id = curproc->p_pid;
    679 
    680 	ASSERT(pool_lock_held());
    681 
    682 	mutex_enter(&pidlock);
    683 	if ((p = prfind((pid_t)id)) == NULL) {
    684 		mutex_exit(&pidlock);
    685 		return (ESRCH);
    686 	}
    687 	mutex_enter(&p->p_lock);
    688 	/*
    689 	 * In local zones, lie about pool bindings of processes from
    690 	 * the global zone.
    691 	 */
    692 	if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) {
    693 		pool_t *pool;
    694 
    695 		pool = zone_pool_get(curproc->p_zone);
    696 		*poolid = pool->pool_id;
    697 	} else {
    698 		*poolid = p->p_pool->pool_id;
    699 	}
    700 	mutex_exit(&p->p_lock);
    701 	mutex_exit(&pidlock);
    702 	return (0);
    703 }
    704 
    705 static ea_object_t *
    706 pool_system_pack(void)
    707 {
    708 	ea_object_t *eo_system;
    709 	size_t bufsz = 0;
    710 	char *buf = NULL;
    711 
    712 	ASSERT(pool_lock_held());
    713 
    714 	eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM);
    715 	(void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t),
    716 	    EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64);
    717 	if (INGLOBALZONE(curproc))
    718 		(void) ea_attach_item(eo_system, &pool_pool_mod,
    719 		    sizeof (hrtime_t),
    720 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
    721 	else
    722 		(void) ea_attach_item(eo_system,
    723 		    &curproc->p_zone->zone_pool_mod,
    724 		    sizeof (hrtime_t),
    725 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
    726 	(void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t),
    727 	    EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64);
    728 	(void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t),
    729 	    EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64);
    730 	(void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
    731 	(void) ea_attach_item(eo_system, buf, bufsz,
    732 	    EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW);
    733 	kmem_free(buf, bufsz);
    734 	return (eo_system);
    735 }
    736 
    737 /*
    738  * Pack information about pools and attach it to specified exacct group.
    739  */
    740 static int
    741 pool_pool_pack(ea_object_t *eo_system)
    742 {
    743 	ea_object_t *eo_pool;
    744 	pool_t *pool;
    745 	size_t bufsz;
    746 	char *buf;
    747 	pool_t *myzonepool;
    748 
    749 	ASSERT(pool_lock_held());
    750 	myzonepool = zone_pool_get(curproc->p_zone);
    751 	for (pool = list_head(&pool_list); pool;
    752 	    pool = list_next(&pool_list, pool)) {
    753 		if (!INGLOBALZONE(curproc) && myzonepool != pool)
    754 			continue;
    755 		bufsz = 0;
    756 		buf = NULL;
    757 		eo_pool = ea_alloc_group(EXT_GROUP |
    758 		    EXC_LOCAL | EXD_GROUP_POOL);
    759 		(void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t),
    760 		    EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32);
    761 		(void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id,
    762 		    sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32);
    763 		(void) nvlist_pack(pool->pool_props, &buf, &bufsz,
    764 		    NV_ENCODE_NATIVE, 0);
    765 		(void) ea_attach_item(eo_pool, buf, bufsz,
    766 		    EXC_LOCAL | EXD_POOL_PROP | EXT_RAW);
    767 		kmem_free(buf, bufsz);
    768 		(void) ea_attach_to_group(eo_system, eo_pool);
    769 	}
    770 	return (0);
    771 }
    772 
    773 /*
    774  * Pack the whole pool configuration in the specified buffer.
    775  */
    776 int
    777 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize)
    778 {
    779 	ea_object_t *eo_system;
    780 	size_t ksize;
    781 	int ret = 0;
    782 
    783 	ASSERT(pool_lock_held());
    784 
    785 	eo_system = pool_system_pack();		/* 1. pack system */
    786 	(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
    787 	(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
    788 	ksize = ea_pack_object(eo_system, NULL, 0);
    789 	if (kbuf == NULL || kbufsz == 0)
    790 		*asize = ksize;
    791 	else if (ksize > kbufsz)
    792 		ret = ENOMEM;
    793 	else
    794 		*asize = ea_pack_object(eo_system, kbuf, kbufsz);
    795 	ea_free_object(eo_system, EUP_ALLOC);
    796 	return (ret);
    797 }
    798 
    799 /*
    800  * Start/end the commit transaction.  If commit transaction is currently
    801  * in progress, then all POOL_QUERY ioctls will return pools configuration
    802  * at the beginning of transaction.
    803  */
    804 int
    805 pool_commit(int state)
    806 {
    807 	ea_object_t *eo_system;
    808 	int ret = 0;
    809 
    810 	ASSERT(pool_lock_held());
    811 
    812 	if (pool_state == POOL_DISABLED)
    813 		return (ENOTACTIVE);
    814 	switch (state) {
    815 	case 1:
    816 		/*
    817 		 * Beginning commit transation.
    818 		 */
    819 		if (pool_buf != NULL)		/* transaction in progress */
    820 			return (EBUSY);
    821 		eo_system = pool_system_pack();		/* 1. pack system */
    822 		(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
    823 		(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
    824 		pool_bufsz = ea_pack_object(eo_system, NULL, 0);
    825 		pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP);
    826 		pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz);
    827 		ea_free_object(eo_system, EUP_ALLOC);
    828 		break;
    829 	case 0:
    830 		/*
    831 		 * Finishing commit transaction.
    832 		 */
    833 		if (pool_buf != NULL) {
    834 			kmem_free(pool_buf, pool_bufsz);
    835 			pool_buf = NULL;
    836 			pool_bufsz = 0;
    837 		}
    838 		break;
    839 	default:
    840 		ret = EINVAL;
    841 	}
    842 	return (ret);
    843 }
    844 
    845 /*
    846  * Check is the specified property is special
    847  */
    848 static pool_property_t *
    849 pool_property_find(char *name, pool_property_t *list)
    850 {
    851 	pool_property_t *prop;
    852 
    853 	for (prop = list; prop->pp_name != NULL; prop++)
    854 		if (strcmp(prop->pp_name, name) == 0)
    855 			return (prop);
    856 	return (NULL);
    857 }
    858 
    859 static pool_property_t pool_prop_sys[] = {
    860 	{ "system.name",		DATA_TYPE_STRING,	PP_RDWR },
    861 	{ "system.comment",		DATA_TYPE_STRING,	PP_RDWR },
    862 	{ "system.version",		DATA_TYPE_UINT64,	PP_READ },
    863 	{ "system.bind-default",	DATA_TYPE_BYTE,		PP_RDWR },
    864 	{ "system.allocate-method",	DATA_TYPE_STRING,
    865 	    PP_RDWR | PP_OPTIONAL },
    866 	{ "system.poold.log-level",	DATA_TYPE_STRING,
    867 	    PP_RDWR | PP_OPTIONAL },
    868 	{ "system.poold.log-location",	DATA_TYPE_STRING,
    869 	    PP_RDWR | PP_OPTIONAL },
    870 	{ "system.poold.monitor-interval",	DATA_TYPE_UINT64,
    871 	    PP_RDWR | PP_OPTIONAL },
    872 	{ "system.poold.history-file",	DATA_TYPE_STRING,
    873 	    PP_RDWR | PP_OPTIONAL },
    874 	{ "system.poold.objectives",	DATA_TYPE_STRING,
    875 	    PP_RDWR | PP_OPTIONAL },
    876 	{ NULL,				0,			0 }
    877 };
    878 
    879 static pool_property_t pool_prop_pool[] = {
    880 	{ "pool.sys_id",		DATA_TYPE_UINT64,	PP_READ },
    881 	{ "pool.name",			DATA_TYPE_STRING,	PP_RDWR },
    882 	{ "pool.default",		DATA_TYPE_BYTE,		PP_READ },
    883 	{ "pool.active",		DATA_TYPE_BYTE,		PP_RDWR },
    884 	{ "pool.importance",		DATA_TYPE_INT64,	PP_RDWR },
    885 	{ "pool.comment",		DATA_TYPE_STRING,	PP_RDWR },
    886 	{ "pool.scheduler",		DATA_TYPE_STRING,
    887 	    PP_RDWR | PP_OPTIONAL },
    888 	{ NULL,				0,			0 }
    889 };
    890 
    891 /*
    892  * Common routine to put new property on the specified list
    893  */
    894 int
    895 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props)
    896 {
    897 	pool_property_t *prop;
    898 
    899 	if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) {
    900 		/*
    901 		 * No read-only properties or properties with bad types
    902 		 */
    903 		if (!(prop->pp_perm & PP_WRITE) ||
    904 		    prop->pp_type != nvpair_type(pair))
    905 			return (EINVAL);
    906 	}
    907 	return (nvlist_add_nvpair(nvlist, pair));
    908 }
    909 
    910 /*
    911  * Common routine to remove property from the given list
    912  */
    913 int
    914 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props)
    915 {
    916 	pool_property_t *prop;
    917 
    918 	if ((prop = pool_property_find(name, props)) != NULL) {
    919 		if (!(prop->pp_perm & PP_OPTIONAL))
    920 			return (EINVAL);
    921 	}
    922 	return (nvlist_remove_all(nvlist, name));
    923 }
    924 
    925 static int
    926 pool_system_propput(nvpair_t *pair)
    927 {
    928 	int ret;
    929 
    930 	ASSERT(pool_lock_held());
    931 	ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
    932 	if (ret == 0)
    933 		pool_sys_mod = gethrtime();
    934 	return (ret);
    935 }
    936 
    937 static int
    938 pool_system_proprm(char *name)
    939 {
    940 	int ret;
    941 
    942 	ASSERT(pool_lock_held());
    943 	ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
    944 	if (ret == 0)
    945 		pool_sys_mod = gethrtime();
    946 	return (ret);
    947 }
    948 
    949 static int
    950 pool_pool_propput(poolid_t poolid, nvpair_t *pair)
    951 {
    952 	pool_t *pool;
    953 	int ret;
    954 
    955 	ASSERT(pool_lock_held());
    956 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
    957 		return (ESRCH);
    958 	ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
    959 	if (ret == 0)
    960 		pool_pool_mod = gethrtime();
    961 	return (ret);
    962 }
    963 
    964 static int
    965 pool_pool_proprm(poolid_t poolid, char *name)
    966 {
    967 	int ret;
    968 	pool_t *pool;
    969 
    970 	ASSERT(pool_lock_held());
    971 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
    972 		return (ESRCH);
    973 	ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
    974 	if (ret == 0)
    975 		pool_pool_mod = gethrtime();
    976 	return (ret);
    977 }
    978 
    979 int
    980 pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
    981 {
    982 	int ret;
    983 
    984 	ASSERT(pool_lock_held());
    985 	if (pool_state == POOL_DISABLED)
    986 		return (ENOTACTIVE);
    987 	switch (class) {
    988 	case PEC_SYSTEM:
    989 		ret = pool_system_propput(pair);
    990 		break;
    991 	case PEC_POOL:
    992 		ret = pool_pool_propput((poolid_t)id, pair);
    993 		break;
    994 	case PEC_RES_COMP:
    995 		switch (subclass) {
    996 		case PREC_PSET:
    997 			ret = pool_pset_propput((psetid_t)id, pair);
    998 			break;
    999 		default:
   1000 			ret = EINVAL;
   1001 		}
   1002 		break;
   1003 	case PEC_RES_AGG:
   1004 		ret = ENOTSUP;
   1005 		break;
   1006 	case PEC_COMP:
   1007 		switch (subclass) {
   1008 		case PCEC_CPU:
   1009 			ret = pool_cpu_propput((processorid_t)id, pair);
   1010 			break;
   1011 		default:
   1012 			ret = EINVAL;
   1013 		}
   1014 		break;
   1015 	default:
   1016 		ret = EINVAL;
   1017 	}
   1018 	return (ret);
   1019 }
   1020 
   1021 int
   1022 pool_proprm(int class, int subclass, id_t id, char *name)
   1023 {
   1024 	int ret;
   1025 
   1026 	ASSERT(pool_lock_held());
   1027 	if (pool_state == POOL_DISABLED)
   1028 		return (ENOTACTIVE);
   1029 	switch (class) {
   1030 	case PEC_SYSTEM:
   1031 		ret = pool_system_proprm(name);
   1032 		break;
   1033 	case PEC_POOL:
   1034 		ret = pool_pool_proprm((poolid_t)id, name);
   1035 		break;
   1036 	case PEC_RES_COMP:
   1037 		switch (subclass) {
   1038 		case PREC_PSET:
   1039 			ret = pool_pset_proprm((psetid_t)id, name);
   1040 			break;
   1041 		default:
   1042 			ret = EINVAL;
   1043 		}
   1044 		break;
   1045 	case PEC_RES_AGG:
   1046 		ret = ENOTSUP;
   1047 		break;
   1048 	case PEC_COMP:
   1049 		switch (subclass) {
   1050 		case PCEC_CPU:
   1051 			ret = pool_cpu_proprm((processorid_t)id, name);
   1052 			break;
   1053 		default:
   1054 			ret = EINVAL;
   1055 		}
   1056 		break;
   1057 	default:
   1058 		ret = EINVAL;
   1059 	}
   1060 	return (ret);
   1061 }
   1062 
   1063 int
   1064 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
   1065 {
   1066 	int ret;
   1067 	nvlist_t *nvl;
   1068 
   1069 	ASSERT(pool_lock_held());
   1070 	if (pool_state == POOL_DISABLED)
   1071 		return (ENOTACTIVE);
   1072 
   1073 	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
   1074 
   1075 	switch (class) {
   1076 	case PEC_SYSTEM:
   1077 	case PEC_POOL:
   1078 		ret = EINVAL;
   1079 		break;
   1080 	case PEC_RES_COMP:
   1081 		switch (subclass) {
   1082 		case PREC_PSET:
   1083 			ret = pool_pset_propget((psetid_t)id, name, nvl);
   1084 			break;
   1085 		default:
   1086 			ret = EINVAL;
   1087 		}
   1088 		break;
   1089 	case PEC_RES_AGG:
   1090 		ret = ENOTSUP;
   1091 		break;
   1092 	case PEC_COMP:
   1093 		switch (subclass) {
   1094 		case PCEC_CPU:
   1095 			ret = pool_cpu_propget((processorid_t)id, name, nvl);
   1096 			break;
   1097 		default:
   1098 			ret = EINVAL;
   1099 		}
   1100 		break;
   1101 	default:
   1102 		ret = EINVAL;
   1103 	}
   1104 	if (ret == 0)
   1105 		*nvlp = nvl;
   1106 	else
   1107 		nvlist_free(nvl);
   1108 	return (ret);
   1109 }
   1110 
   1111 /*
   1112  * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
   1113  * in case of failure in pool_do_bind().
   1114  */
   1115 static void
   1116 pool_bind_wake(proc_t *p)
   1117 {
   1118 	ASSERT(pool_lock_held());
   1119 
   1120 	mutex_enter(&p->p_lock);
   1121 	ASSERT(p->p_poolflag & PBWAIT);
   1122 	if (p->p_poolcnt > 0) {
   1123 		mutex_enter(&pool_barrier_lock);
   1124 		pool_barrier_count -= p->p_poolcnt;
   1125 		mutex_exit(&pool_barrier_lock);
   1126 	}
   1127 	p->p_poolflag &= ~PBWAIT;
   1128 	cv_signal(&p->p_poolcv);
   1129 	mutex_exit(&p->p_lock);
   1130 }
   1131 
   1132 static void
   1133 pool_bind_wakeall(proc_t **procs)
   1134 {
   1135 	proc_t *p, **pp;
   1136 
   1137 	ASSERT(pool_lock_held());
   1138 	for (pp = procs; (p = *pp) != NULL; pp++)
   1139 		pool_bind_wake(p);
   1140 }
   1141 
   1142 /*
   1143  * Return the scheduling class for this pool, or
   1144  * 	POOL_CLASS_UNSET if not set
   1145  * 	POOL_CLASS_INVAL if set to an invalid class ID.
   1146  */
   1147 id_t
   1148 pool_get_class(pool_t *pool)
   1149 {
   1150 	char *name;
   1151 	id_t cid;
   1152 
   1153 	ASSERT(pool_lock_held());
   1154 
   1155 	if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
   1156 	    &name) == 0) {
   1157 		if (getcidbyname(name, &cid) == 0)
   1158 			return (cid);
   1159 		else
   1160 			return (POOL_CLASS_INVAL);
   1161 	}
   1162 	return (POOL_CLASS_UNSET);
   1163 }
   1164 
   1165 /*
   1166  * Move process to the new scheduling class.
   1167  */
   1168 static void
   1169 pool_change_class(proc_t *p, id_t cid)
   1170 {
   1171 	kthread_t *t;
   1172 	void *cldata;
   1173 	id_t oldcid;
   1174 	void **bufs;
   1175 	void **buf;
   1176 	int nlwp;
   1177 	int ret;
   1178 	int i;
   1179 
   1180 	/*
   1181 	 * Do not move kernel processes (such as zsched).
   1182 	 */
   1183 	if (p->p_flag & SSYS)
   1184 		return;
   1185 	/*
   1186 	 * This process is in the pool barrier, so it can't possibly be
   1187 	 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
   1188 	 * (for possible agent LWP which doesn't use pool barrier) as
   1189 	 * our upper bound.
   1190 	 */
   1191 	nlwp = p->p_lwpcnt + p->p_zombcnt + 1;
   1192 
   1193 	/*
   1194 	 * Pre-allocate scheduling class specific buffers before
   1195 	 * grabbing p_lock.
   1196 	 */
   1197 	bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
   1198 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
   1199 		ret = CL_ALLOC(buf, cid, KM_SLEEP);
   1200 		ASSERT(ret == 0);
   1201 	}
   1202 
   1203 	/*
   1204 	 * Move threads one by one to the new scheduling class.
   1205 	 * This never fails because we have all the right
   1206 	 * privileges here.
   1207 	 */
   1208 	mutex_enter(&p->p_lock);
   1209 	ASSERT(p->p_poolflag & PBWAIT);
   1210 	buf = bufs;
   1211 	t = p->p_tlist;
   1212 	ASSERT(t != NULL);
   1213 	do {
   1214 		if (t->t_cid != cid) {
   1215 			oldcid = t->t_cid;
   1216 			cldata = t->t_cldata;
   1217 			ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
   1218 			ASSERT(ret == 0);
   1219 			CL_EXITCLASS(oldcid, cldata);
   1220 			schedctl_set_cidpri(t);
   1221 			*buf++ = NULL;
   1222 		}
   1223 	} while ((t = t->t_forw) != p->p_tlist);
   1224 	mutex_exit(&p->p_lock);
   1225 	/*
   1226 	 * Free unused scheduling class specific buffers.
   1227 	 */
   1228 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
   1229 		if (*buf != NULL) {
   1230 			CL_FREE(cid, *buf);
   1231 			*buf = NULL;
   1232 		}
   1233 	}
   1234 	kmem_free(bufs, nlwp * sizeof (void *));
   1235 }
   1236 
/*
 * The meat of the bind operation.  The steps in pool_do_bind are:
 *
 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
 *    such processes to an array.  For any interesting process that has
 *    threads inside the pool barrier set, increment a counter by the
 *    count of such threads.  Once PBWAIT is set on a process, that process
 *    will not disappear.
 *
 * 2) Wait for the counter from step 1 to drop to zero.  Any process which
 *    calls pool_barrier_exit() and notices that PBWAIT has been set on it
 *    will decrement that counter before going to sleep, and the process
 *    calling pool_barrier_exit() which does the final decrement will wake us.
 *
 * 3) For each interesting process, perform a calculation on it to see if
 *    the bind will actually succeed.  This uses the following three
 *    resource-set-specific functions:
 *
 *    - int set_bind_start(procs, pool)
 *
 *      Determine whether the given array of processes can be bound to the
 *      resource set associated with the given pool.  If it can, take and hold
 *      any locks necessary to ensure that the operation will succeed, and
 *      make any necessary reservations in the target resource set.  If it
 *      can't, return failure with no reservations made and no new locks held.
 *
 *    - void set_bind_abort(procs, pool)
 *
 *      set_bind_start() has completed successfully, but another resource set's
 *      set_bind_start() has failed, and we haven't begun the bind yet.  Undo
 *      any reservations made and drop any locks acquired by our
 *      set_bind_start().
 *
 *    - void set_bind_finish(void)
 *
 *      The bind has completed successfully.  The processes have been released,
 *      and the reservation acquired in set_bind_start() has been depleted as
 *      the processes have finished their bindings.  Drop any locks acquired by
 *      set_bind_start().
 *
 * 4) If we've decided that we can proceed with the bind, iterate through
 *    the list of interesting processes, grab the necessary locks (which
 *    may differ per resource set), perform the bind, and ASSERT that it
 *    succeeds.  Once a process has been rebound, it can be awakened.
 *
 * The operations from step 4 must be kept in sync with anything which might
 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
 * are thus located in the same source files as the associated bind operations.
 *
 * Arguments:
 *	pool		pool the selected processes are to be bound to
 *	idtype, id	select the processes of interest (passed to
 *			setprocset()); an id of P_MYID is resolved with
 *			getmyid()
 *	flags		POOL_BIND_PSET requests processor set rebinding
 *
 * Returns 0 on success, or:
 *	EINVAL	the pool's scheduling class is invalid, or a P_PID/P_TASKID
 *		request matched a process outside the global zone
 *	ESRCH	the zone or project was not found, or no process was left
 *		to rebind (not an error for P_POOLID and P_ZONEID)
 *	EBUSY	the zone's status is past ZONE_IS_RUNNING
 *	EAGAIN	the wait for the pool barrier was interrupted
 *	or the error returned by pset_bind_start().
 *
 * The caller must hold the pool lock.
 */
int
pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
{
	extern uint_t nproc;
	klwp_t *lwp = ttolwp(curthread);
	proc_t **pp, **procs;
	proc_t *prstart;
	int procs_count = 0;
	kproject_t *kpj;
	procset_t set;
	zone_t *zone;
	int procs_size;
	int rv = 0;
	proc_t *p;
	id_t cid = -1;

	ASSERT(pool_lock_held());

	if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
		return (EINVAL);

	/*
	 * The zone reference obtained here is dropped either right below
	 * (EBUSY) or at the "out" label.
	 */
	if (idtype == P_ZONEID) {
		zone = zone_find_by_id(id);
		if (zone == NULL)
			return (ESRCH);
		if (zone_status_get(zone) > ZONE_IS_RUNNING) {
			zone_rele(zone);
			return (EBUSY);
		}
	}

	/*
	 * The project hold and its kpj_poolbind mutex are both released at
	 * the "out" label.
	 */
	if (idtype == P_PROJID) {
		kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
		if (kpj == NULL)
			return (ESRCH);
		mutex_enter(&kpj->kpj_poolbind);
	}

	if (idtype == P_PID) {
		/*
		 * Fast-path for a single process case.
		 */
		procs_size = 2;	/* procs is NULL-terminated */
		procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP);
		mutex_enter(&pidlock);
	} else {
		/*
		 * We will need enough slots for proc_t pointers for as many as
		 * twice the number of currently running processes (assuming
		 * that each one could be in fork() creating a new child).
		 */
		for (;;) {
			procs_size = nproc * 2;
			procs = kmem_zalloc(procs_size * sizeof (proc_t *),
			    KM_SLEEP);
			mutex_enter(&pidlock);

			if (nproc * 2 <= procs_size)
				break;
			/*
			 * If nproc has changed, try again.
			 */
			mutex_exit(&pidlock);
			kmem_free(procs, procs_size * sizeof (proc_t *));
		}
	}

	/* pidlock is held from here until the end of the first scan. */
	if (id == P_MYID)
		id = getmyid(idtype);
	setprocset(&set, POP_AND, idtype, id, P_ALL, 0);

	/*
	 * Do a first scan, and select target processes.
	 */
	if (idtype == P_PID)
		prstart = prfind(id);
	else
		prstart = practive;
	for (p = prstart, pp = procs; p != NULL; p = p->p_next) {
		mutex_enter(&p->p_lock);
		/*
		 * Skip processes that don't match our (id, idtype) set or
		 * on the way of becoming zombies.  Skip kernel processes
		 * from the global zone.
		 */
		if (procinset(p, &set) == 0 ||
		    p->p_poolflag & PEXITED ||
		    ((p->p_flag & SSYS) && INGLOBALZONE(p))) {
			mutex_exit(&p->p_lock);
			continue;
		}
		if (!INGLOBALZONE(p)) {
			switch (idtype) {
			case P_PID:
			case P_TASKID:
				/*
				 * Can't bind processes or tasks
				 * in local zones to pools.
				 */
				mutex_exit(&p->p_lock);
				mutex_exit(&pidlock);
				pool_bind_wakeall(procs);
				rv = EINVAL;
				goto out;
			case P_PROJID:
				/*
				 * Only projects in the global
				 * zone can be rebound.
				 */
				mutex_exit(&p->p_lock);
				continue;
			case P_POOLID:
				/*
				 * When rebinding pools, processes can be
				 * in different zones.
				 */
				break;
			}
		}

		p->p_poolflag |= PBWAIT;
		/*
		 * If some threads in this process are inside the pool
		 * barrier, add them to pool_barrier_count, as we have
		 * to wait for all of them to exit the barrier.
		 */
		if (p->p_poolcnt > 0) {
			mutex_enter(&pool_barrier_lock);
			pool_barrier_count += p->p_poolcnt;
			mutex_exit(&pool_barrier_lock);
		}
		ASSERT(pp < &procs[procs_size]);
		*pp++ = p;
		procs_count++;
		mutex_exit(&p->p_lock);

		/*
		 * We just found our process, so if we're only rebinding a
		 * single process then get out of this loop.
		 */
		if (idtype == P_PID)
			break;
	}
	*pp = NULL;	/* cap off the end of the array */
	mutex_exit(&pidlock);

	/*
	 * Wait for relevant processes to stop before they try to enter the
	 * barrier or at the exit from the barrier.  Make sure that we do
	 * not get stopped here while we're holding pool_lock.  If we were
	 * requested to stop, or got a signal then return EAGAIN to let the
	 * library know that it needs to retry.
	 */
	mutex_enter(&pool_barrier_lock);
	lwp->lwp_nostop++;
	while (pool_barrier_count > 0) {
		(void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock);
		if (pool_barrier_count > 0) {
			/*
			 * We either got a signal or were requested to
			 * stop by /proc.  Bail out with EAGAIN.  If we were
			 * requested to stop, we'll stop in post_syscall()
			 * on our way back to userland.
			 */
			mutex_exit(&pool_barrier_lock);
			pool_bind_wakeall(procs);
			lwp->lwp_nostop--;
			rv = EAGAIN;
			goto out;
		}
	}
	lwp->lwp_nostop--;
	mutex_exit(&pool_barrier_lock);

	/*
	 * Single process case: only re-check for exit, then skip the
	 * second scan entirely.
	 */
	if (idtype == P_PID) {
		if ((p = *procs) == NULL)
			goto skip;
		mutex_enter(&p->p_lock);
		/* Drop the process if it is exiting */
		if (p->p_poolflag & PEXITED) {
			mutex_exit(&p->p_lock);
			pool_bind_wake(p);
			procs_count--;
		} else
			mutex_exit(&p->p_lock);
		goto skip;
	}

	/*
	 * Do another run, and drop processes that were inside the barrier
	 * in exit(), but when they have dropped to pool_barrier_exit
	 * they have become of no interest to us.  Pick up child processes that
	 * were created by fork() but didn't exist during our first scan.
	 * Their parents are now stopped at pool_barrier_exit in cfork().
	 */
	mutex_enter(&pidlock);
	for (pp = procs; (p = *pp) != NULL; pp++) {
		mutex_enter(&p->p_lock);
		if (p->p_poolflag & PEXITED) {
			ASSERT(p->p_lwpcnt == 0);
			mutex_exit(&p->p_lock);
			pool_bind_wake(p);
			/* flip w/last non-NULL slot */
			*pp = procs[procs_count - 1];
			procs[procs_count - 1] = NULL;
			procs_count--;
			pp--;			/* try this slot again */
			continue;
		} else
			mutex_exit(&p->p_lock);
		/*
		 * Look at the child and check if it should be rebound also.
		 * We're holding pidlock, so it is safe to reference p_child.
		 */
		if ((p = p->p_child) == NULL)
			continue;

		mutex_enter(&p->p_lock);

		/*
		 * Skip system processes and make sure that the child is in
		 * the same task/project/pool/zone as the parent.
		 */
		if ((!INGLOBALZONE(p) && idtype != P_ZONEID &&
		    idtype != P_POOLID) || p->p_flag & SSYS) {
			mutex_exit(&p->p_lock);
			continue;
		}

		/*
		 * If the child process has been already created by fork(), has
		 * not exited, and has not been added to the list already,
		 * then add it now.  We will hit this process again (since we
		 * stick it at the end of the procs list) but it will be
		 * ignored because it will have the PBWAIT flag set.
		 */
		if (procinset(p, &set) &&
		    !(p->p_poolflag & PEXITED) &&
		    !(p->p_poolflag & PBWAIT)) {
			ASSERT(p->p_child == NULL); /* no child of a child */
			procs[procs_count] = p;
			procs[procs_count + 1] = NULL;
			procs_count++;
			p->p_poolflag |= PBWAIT;
		}
		mutex_exit(&p->p_lock);
	}
	mutex_exit(&pidlock);
skip:
	/*
	 * If there's no processes to rebind then return ESRCH, unless
	 * we're associating a pool with new resource set, destroying it,
	 * or binding a zone to a pool.
	 */
	if (procs_count == 0) {
		if (idtype == P_POOLID || idtype == P_ZONEID)
			rv = 0;
		else
			rv = ESRCH;
		goto out;
	}

#ifdef DEBUG
	/*
	 * All processes in the array should have PBWAIT set, and none
	 * should be in the critical section. Thus, although p_poolflag
	 * and p_poolcnt are protected by p_lock, their ASSERTions below
	 * should be stable without it. procinset(), however, ASSERTs that
	 * the p_lock is held upon entry.
	 */
	for (pp = procs; (p = *pp) != NULL; pp++) {
		int in_set;

		mutex_enter(&p->p_lock);
		in_set = procinset(p, &set);
		mutex_exit(&p->p_lock);

		ASSERT(in_set);
		ASSERT(p->p_poolflag & PBWAIT);
		ASSERT(p->p_poolcnt == 0);
	}
#endif

	/*
	 * Do the check if processor set rebinding is going to succeed or not.
	 */
	if ((flags & POOL_BIND_PSET) &&
	    (rv = pset_bind_start(procs, pool)) != 0) {
		pool_bind_wakeall(procs);
		goto out;
	}

	/*
	 * At this point, all bind operations should succeed.
	 */
	for (pp = procs; (p = *pp) != NULL; pp++) {
		if (flags & POOL_BIND_PSET) {
			psetid_t psetid = pool->pool_pset->pset_id;
			void *zonebuf;
			void *projbuf;

			/*
			 * Pre-allocate one buffer for FSS (per-project
			 * buffer for a new pset) in case this is the
			 * first thread from its current project getting
			 * bound to this processor set.
			 */
			projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ);
			zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE);

			mutex_enter(&pidlock);
			mutex_enter(&p->p_lock);
			pool_pset_bind(p, psetid, projbuf, zonebuf);
			mutex_exit(&p->p_lock);
			mutex_exit(&pidlock);
			/*
			 * Free the buffers pre-allocated above if they
			 * weren't actually used.
			 */
			fss_freebuf(projbuf, FSS_ALLOC_PROJ);
			fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
		}
		/*
		 * Now let's change the scheduling class of this
		 * process if our target pool has it defined.
		 */
		if (cid != POOL_CLASS_UNSET)
			pool_change_class(p, cid);

		/*
		 * It is safe to reference p_pool here without holding
		 * p_lock because it cannot change underneath of us.
		 * We're holding pool_lock here, so nobody else can be
		 * moving this process between pools.  If process "p"
		 * would be exiting, we're guaranteed that it would be blocked
		 * at pool_barrier_enter() in exit().  Otherwise, it would've
		 * been skipped by one of our scans of the practive list
		 * as a process with PEXITED flag set.
		 */
		if (p->p_pool != pool) {
			ASSERT(p->p_pool->pool_ref > 0);
			atomic_add_32(&p->p_pool->pool_ref, -1);
			p->p_pool = pool;
			atomic_add_32(&p->p_pool->pool_ref, 1);
		}
		/*
		 * Okay, we've tortured this guy enough.
		 * Let this poor process go now.
		 */
		pool_bind_wake(p);
	}
	if (flags & POOL_BIND_PSET)
		pset_bind_finish();

	/*
	 * Common exit path: drop the project or zone reference taken at the
	 * top of the function and free the process array.  For zones, the
	 * zone's pool is only switched when the bind succeeded, while
	 * zone_pool_mod is refreshed either way.
	 */
out:	switch (idtype) {
	case P_PROJID:
		ASSERT(kpj != NULL);
		mutex_exit(&kpj->kpj_poolbind);
		project_rele(kpj);
		break;
	case P_ZONEID:
		if (rv == 0) {
			mutex_enter(&cpu_lock);
			zone_pool_set(zone, pool);
			mutex_exit(&cpu_lock);
		}
		zone->zone_pool_mod = gethrtime();
		zone_rele(zone);
		break;
	}

	kmem_free(procs, procs_size * sizeof (proc_t *));
	ASSERT(pool_barrier_count == 0);
	return (rv);
}
   1661