Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Contracts
     28  * ---------
     29  *
     30  * Contracts are a primitive which enrich the relationships between
     31  * processes and system resources.  The primary purpose of contracts is
     32  * to provide a means for the system to negotiate the departure from a
     33  * binding relationship (e.g. pages locked in memory or a thread bound
     34  * to processor), but they can also be used as a purely asynchronous
     35  * error reporting mechanism as they are with process contracts.
     36  *
     37  * More information on how one interfaces with contracts and what
     38  * contracts can do for you can be found in:
     39  *   PSARC 2003/193 Solaris Contracts
     40  *   PSARC 2004/460 Contracts addendum
     41  *
     42  * This file contains the core contracts framework.  By itself it is
     43  * useless: it depends on the contracts filesystem (ctfs) to provide an
     44  * interface to user processes and individual contract types to
     45  * implement the process/resource relationships.
     46  *
     47  * Data structure overview
     48  * -----------------------
     49  *
     50  * A contract is represented by a contract_t, which itself points to an
     51  * encapsulating contract-type specific contract object.  A contract_t
     52  * contains the contract's static identity (including its terms), its
     53  * linkage to various bookkeeping structures, the contract-specific
     54  * event queue, and a reference count.
     55  *
     56  * A contract template is represented by a ct_template_t, which, like a
     57  * contract, points to an encapsulating contract-type specific template
     58  * object.  A ct_template_t contains the template's terms.
     59  *
     60  * An event queue is represented by a ct_equeue_t, and consists of a
     61  * list of events, a list of listeners, and a list of listeners who are
     62  * waiting for new events (affectionately referred to as "tail
     63  * listeners").  There are three queue types, defined by ct_listnum_t
     64  * (an enum).  An event may be on one of each type of queue
     65  * simultaneously; the list linkage used by a queue is determined by
     66  * its type.
     67  *
     68  * An event is represented by a ct_kevent_t, which contains mostly
     69  * static event data (e.g. id, payload).  It also has an array of
     70  * ct_member_t structures, each of which contains a list_node_t and
     71  * represents the event's linkage in a specific event queue.
     72  *
     73  * Each open of an event endpoint results in the creation of a new
     74  * listener, represented by a ct_listener_t.  In addition to linkage
     75  * into the aforementioned lists in the event_queue, a ct_listener_t
     76  * contains a pointer to the ct_kevent_t it is currently positioned at
     77  * as well as a set of status flags and other administrative data.
     78  *
     79  * Each process has a list of contracts it owns, p_ct_held; a pointer
     80  * to the process contract it is a member of, p_ct_process; the linkage
     81  * for that membership, p_ct_member; and an array of event queue
     82  * structures representing the process bundle queues.
     83  *
     84  * Each LWP has an array of its active templates, lwp_ct_active; and
     85  * the most recently created contracts, lwp_ct_latest.
     86  *
     87  * A process contract has a list of member processes and a list of
     88  * inherited contracts.
     89  *
     90  * There is a system-wide list of all contracts, as well as per-type
     91  * lists of contracts.
     92  *
     93  * Lock ordering overview
     94  * ----------------------
     95  *
     96  * Locks at the top are taken first:
     97  *
     98  *                   ct_evtlock
     99  *                   regent ct_lock
    100  *                   member ct_lock
    101  *                   pidlock
    102  *                   p_lock
    103  *    contract ctq_lock         contract_lock
    104  *    pbundle ctq_lock
    105  *    cte_lock
    106  *                   ct_reflock
    107  *
    108  * contract_lock and ctq_lock/cte_lock are not currently taken at the
    109  * same time.
    110  *
    111  * Reference counting and locking
    112  * ------------------------------
    113  *
    114  * A contract has a reference count, protected by ct_reflock.
    115  * (ct_reflock is also used in a couple other places where atomic
    116  * access to a variable is needed in an innermost context).  A process
    117  * maintains a hold on each contract it owns.  A process contract has a
    118  * hold on each contract it has inherited.  Each event has a hold on
    119  * the contract which generated it.  Process contract templates have
    120  * holds on the contracts referred to by their transfer terms.  CTFS
    121  * contract directory nodes have holds on contracts.  Lastly, various
    122  * code paths may temporarily take holds on contracts to prevent them
    123  * from disappearing while other processing is going on.  It is
    124  * important to note that the global contract lists do not hold
    125  * references on contracts; a contract is removed from these structures
    126  * atomically with the release of its last reference.
    127  *
    128  * At a given point in time, a contract can either be owned by a
    129  * process, inherited by a regent process contract, or orphaned.  A
    130  * contract_t's owner and regent pointers, ct_owner and ct_regent, are
    131  * protected by its ct_lock.  The linkage in the holder's (holder =
    132  * owner or regent) list of contracts, ct_ctlist, is protected by
    133  * whatever lock protects the holder's data structure.  In order for
    134  * these two directions to remain consistent, changing the holder of a
    135  * contract requires that both locks be held.
    136  *
    137  * Events also have reference counts.  There is one hold on an event
    138  * per queue it is present on, in addition to those needed for the
    139  * usual sundry reasons.  Individual listeners are associated with
    140  * specific queues, and increase a queue-specific reference count
    141  * stored in the ct_member_t structure.
    142  *
    143  * The dynamic contents of an event (reference count and flags) are
    144  * protected by its cte_lock, while the contents of the embedded
    145  * ct_member_t structures are protected by the locks of the queues they
    146  * are linked into.  A ct_listener_t's contents are also protected by
    147  * its event queue's ctq_lock.
    148  *
    149  * Resource controls
    150  * -----------------
    151  *
    152  * Control:      project.max-contracts (rc_project_contract)
    153  * Description:  Maximum number of contracts allowed a project.
    154  *
    155  *   When a contract is created, the project's allocation is tested and
    156  *   (assuming success) increased.  When the last reference to a
    157  *   contract is released, the creating project's allocation is
    158  *   decreased.
    159  */
    160 
    161 #include <sys/mutex.h>
    162 #include <sys/debug.h>
    163 #include <sys/types.h>
    164 #include <sys/param.h>
    165 #include <sys/kmem.h>
    166 #include <sys/thread.h>
    167 #include <sys/id_space.h>
    168 #include <sys/avl.h>
    169 #include <sys/list.h>
    170 #include <sys/sysmacros.h>
    171 #include <sys/proc.h>
    172 #include <sys/ctfs.h>
    173 #include <sys/contract_impl.h>
    174 #include <sys/contract/process_impl.h>
    175 #include <sys/dditypes.h>
    176 #include <sys/contract/device_impl.h>
    177 #include <sys/systm.h>
    178 #include <sys/atomic.h>
    179 #include <sys/cmn_err.h>
    180 #include <sys/model.h>
    181 #include <sys/policy.h>
    182 #include <sys/zone.h>
    183 #include <sys/task.h>
    184 #include <sys/ddi.h>
    185 #include <sys/sunddi.h>
    186 
/*
 * Handle for the project.max-contracts resource control; tested in
 * contract_ctor.  Defined outside this file.
 */
extern rctl_hndl_t rc_project_contract;

/* ID space from which contract IDs are allocated (see contract_init). */
static id_space_t	*contract_ids;
/* System-wide tree of all contracts, ordered by contract_compar. */
static avl_tree_t	contract_avl;
/*
 * Protects contract_avl; also held while the per-project contract
 * count is tested and adjusted.
 */
static kmutex_t		contract_lock;

/*
 * Contract type table.  ct_types initially points at a static array
 * with CTT_MAXTYPE slots, and ct_ntypes records that slot count.
 */
int			ct_ntypes = CTT_MAXTYPE;
static ct_type_t	*ct_types_static[CTT_MAXTYPE];
ct_type_t		**ct_types = ct_types_static;
/* NOTE(review): not referenced in this part of the file; presumably a debug knob. */
int			ct_debug;

/* Forward declarations of the file-local event queue routines. */
static void cte_queue_create(ct_equeue_t *, ct_listnum_t, int, int);
static void cte_queue_destroy(ct_equeue_t *);
static void cte_queue_drain(ct_equeue_t *, int);
static void cte_trim(ct_equeue_t *, contract_t *);
static void cte_copy(ct_equeue_t *, ct_equeue_t *);
    203 
    204 /*
    205  * contract_compar
    206  *
    207  * A contract comparator which sorts on contract ID.
    208  */
    209 int
    210 contract_compar(const void *x, const void *y)
    211 {
    212 	const contract_t *ct1 = x;
    213 	const contract_t *ct2 = y;
    214 
    215 	if (ct1->ct_id < ct2->ct_id)
    216 		return (-1);
    217 	if (ct1->ct_id > ct2->ct_id)
    218 		return (1);
    219 	return (0);
    220 }
    221 
    222 /*
    223  * contract_init
    224  *
    225  * Initializes the contract subsystem, the specific contract types, and
    226  * process 0.
    227  */
    228 void
    229 contract_init(void)
    230 {
    231 	/*
    232 	 * Initialize contract subsystem.
    233 	 */
    234 	contract_ids = id_space_create("contracts", 1, INT_MAX);
    235 	avl_create(&contract_avl, contract_compar, sizeof (contract_t),
    236 	    offsetof(contract_t, ct_ctavl));
    237 	mutex_init(&contract_lock, NULL, MUTEX_DEFAULT, NULL);
    238 
    239 	/*
    240 	 * Initialize contract types.
    241 	 */
    242 	contract_process_init();
    243 	contract_device_init();
    244 
    245 	/*
    246 	 * Initialize p0/lwp0 contract state.
    247 	 */
    248 	avl_create(&p0.p_ct_held, contract_compar, sizeof (contract_t),
    249 	    offsetof(contract_t, ct_ctlist));
    250 }
    251 
    252 /*
    253  * contract_dtor
    254  *
    255  * Performs basic destruction of the common portions of a contract.
    256  * Called from the failure path of contract_ctor and from
    257  * contract_rele.
    258  */
    259 static void
    260 contract_dtor(contract_t *ct)
    261 {
    262 	cte_queue_destroy(&ct->ct_events);
    263 	list_destroy(&ct->ct_vnodes);
    264 	mutex_destroy(&ct->ct_reflock);
    265 	mutex_destroy(&ct->ct_lock);
    266 	mutex_destroy(&ct->ct_evtlock);
    267 }
    268 
    269 /*
    270  * contract_ctor
    271  *
    272  * Called by a contract type to initialize a contract.  Fails if the
    273  * max-contract resource control would have been exceeded.  After a
    274  * successful call to contract_ctor, the contract is unlocked and
    275  * visible in all namespaces; any type-specific initialization should
    276  * be completed before calling contract_ctor.  Returns 0 on success.
    277  *
    278  * Because not all callers can tolerate failure, a 0 value for canfail
    279  * instructs contract_ctor to ignore the project.max-contracts resource
    280  * control.  Obviously, this "out" should only be employed by callers
    281  * who are sufficiently constrained in other ways (e.g. newproc).
    282  */
    283 int
    284 contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,
    285     ctflags_t flags, proc_t *author, int canfail)
    286 {
    287 	avl_index_t where;
    288 	klwp_t *curlwp = ttolwp(curthread);
    289 
    290 	ASSERT(author == curproc);
    291 
    292 	mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);
    293 	mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL);
    294 	mutex_init(&ct->ct_evtlock, NULL, MUTEX_DEFAULT, NULL);
    295 	ct->ct_id = id_alloc(contract_ids);
    296 
    297 	cte_queue_create(&ct->ct_events, CTEL_CONTRACT, 20, 0);
    298 	list_create(&ct->ct_vnodes, sizeof (contract_vnode_t),
    299 	    offsetof(contract_vnode_t, ctv_node));
    300 
    301 	/*
    302 	 * Instance data
    303 	 */
    304 	ct->ct_ref = 2;		/* one for the holder, one for "latest" */
    305 	ct->ct_cuid = crgetuid(CRED());
    306 	ct->ct_type = type;
    307 	ct->ct_data = data;
    308 	gethrestime(&ct->ct_ctime);
    309 	ct->ct_state = CTS_OWNED;
    310 	ct->ct_flags = flags;
    311 	ct->ct_regent = author->p_ct_process ?
    312 	    &author->p_ct_process->conp_contract : NULL;
    313 	ct->ct_ev_info = tmpl->ctmpl_ev_info;
    314 	ct->ct_ev_crit = tmpl->ctmpl_ev_crit;
    315 	ct->ct_cookie = tmpl->ctmpl_cookie;
    316 	ct->ct_owner = author;
    317 	ct->ct_ntime.ctm_total = -1;
    318 	ct->ct_qtime.ctm_total = -1;
    319 	ct->ct_nevent = NULL;
    320 
    321 	/*
    322 	 * Test project.max-contracts.
    323 	 */
    324 	mutex_enter(&author->p_lock);
    325 	mutex_enter(&contract_lock);
    326 	if (canfail && rctl_test(rc_project_contract,
    327 	    author->p_task->tk_proj->kpj_rctls, author, 1,
    328 	    RCA_SAFE) & RCT_DENY) {
    329 		id_free(contract_ids, ct->ct_id);
    330 		mutex_exit(&contract_lock);
    331 		mutex_exit(&author->p_lock);
    332 		ct->ct_events.ctq_flags |= CTQ_DEAD;
    333 		contract_dtor(ct);
    334 		return (1);
    335 	}
    336 	ct->ct_proj = author->p_task->tk_proj;
    337 	ct->ct_proj->kpj_data.kpd_contract++;
    338 	(void) project_hold(ct->ct_proj);
    339 	mutex_exit(&contract_lock);
    340 
    341 	/*
    342 	 * Insert into holder's avl of contracts.
    343 	 * We use an avl not because order is important, but because
    344 	 * readdir of /proc/contracts requires we be able to use a
    345 	 * scalar as an index into the process's list of contracts
    346 	 */
    347 	ct->ct_zoneid = author->p_zone->zone_id;
    348 	ct->ct_czuniqid = ct->ct_mzuniqid = author->p_zone->zone_uniqid;
    349 	VERIFY(avl_find(&author->p_ct_held, ct, &where) == NULL);
    350 	avl_insert(&author->p_ct_held, ct, where);
    351 	mutex_exit(&author->p_lock);
    352 
    353 	/*
    354 	 * Insert into global contract AVL
    355 	 */
    356 	mutex_enter(&contract_lock);
    357 	VERIFY(avl_find(&contract_avl, ct, &where) == NULL);
    358 	avl_insert(&contract_avl, ct, where);
    359 	mutex_exit(&contract_lock);
    360 
    361 	/*
    362 	 * Insert into type AVL
    363 	 */
    364 	mutex_enter(&type->ct_type_lock);
    365 	VERIFY(avl_find(&type->ct_type_avl, ct, &where) == NULL);
    366 	avl_insert(&type->ct_type_avl, ct, where);
    367 	type->ct_type_timestruc = ct->ct_ctime;
    368 	mutex_exit(&type->ct_type_lock);
    369 
    370 	if (curlwp->lwp_ct_latest[type->ct_type_index])
    371 		contract_rele(curlwp->lwp_ct_latest[type->ct_type_index]);
    372 	curlwp->lwp_ct_latest[type->ct_type_index] = ct;
    373 
    374 	return (0);
    375 }
    376 
    377 /*
    378  * contract_rele
    379  *
    380  * Releases a reference to a contract.  If the caller had the last
    381  * reference, the contract is removed from all namespaces, its
    382  * allocation against the max-contracts resource control is released,
    383  * and the contract type's free entry point is invoked for any
    384  * type-specific deconstruction and to (presumably) free the object.
    385  */
    386 void
    387 contract_rele(contract_t *ct)
    388 {
    389 	uint64_t nref;
    390 
    391 	mutex_enter(&ct->ct_reflock);
    392 	ASSERT(ct->ct_ref > 0);
    393 	nref = --ct->ct_ref;
    394 	mutex_exit(&ct->ct_reflock);
    395 	if (nref == 0) {
    396 		/*
    397 		 * ct_owner is cleared when it drops its reference.
    398 		 */
    399 		ASSERT(ct->ct_owner == NULL);
    400 		ASSERT(ct->ct_evcnt == 0);
    401 
    402 		/*
    403 		 * Remove from global contract AVL
    404 		 */
    405 		mutex_enter(&contract_lock);
    406 		avl_remove(&contract_avl, ct);
    407 		mutex_exit(&contract_lock);
    408 
    409 		/*
    410 		 * Remove from type AVL
    411 		 */
    412 		mutex_enter(&ct->ct_type->ct_type_lock);
    413 		avl_remove(&ct->ct_type->ct_type_avl, ct);
    414 		mutex_exit(&ct->ct_type->ct_type_lock);
    415 
    416 		/*
    417 		 * Release the contract's ID
    418 		 */
    419 		id_free(contract_ids, ct->ct_id);
    420 
    421 		/*
    422 		 * Release project hold
    423 		 */
    424 		mutex_enter(&contract_lock);
    425 		ct->ct_proj->kpj_data.kpd_contract--;
    426 		project_rele(ct->ct_proj);
    427 		mutex_exit(&contract_lock);
    428 
    429 		/*
    430 		 * Free the contract
    431 		 */
    432 		contract_dtor(ct);
    433 		ct->ct_type->ct_type_ops->contop_free(ct);
    434 	}
    435 }
    436 
    437 /*
    438  * contract_hold
    439  *
    440  * Adds a reference to a contract
    441  */
    442 void
    443 contract_hold(contract_t *ct)
    444 {
    445 	mutex_enter(&ct->ct_reflock);
    446 	ASSERT(ct->ct_ref < UINT64_MAX);
    447 	ct->ct_ref++;
    448 	mutex_exit(&ct->ct_reflock);
    449 }
    450 
    451 /*
    452  * contract_getzuniqid
    453  *
    454  * Get a contract's zone unique ID.  Needed because 64-bit reads and
    455  * writes aren't atomic on x86.  Since there are contexts where we are
    456  * unable to take ct_lock, we instead use ct_reflock; in actuality any
    457  * lock would do.
    458  */
    459 uint64_t
    460 contract_getzuniqid(contract_t *ct)
    461 {
    462 	uint64_t zuniqid;
    463 
    464 	mutex_enter(&ct->ct_reflock);
    465 	zuniqid = ct->ct_mzuniqid;
    466 	mutex_exit(&ct->ct_reflock);
    467 
    468 	return (zuniqid);
    469 }
    470 
    471 /*
    472  * contract_setzuniqid
    473  *
    474  * Sets a contract's zone unique ID.   See contract_getzuniqid.
    475  */
    476 void
    477 contract_setzuniqid(contract_t *ct, uint64_t zuniqid)
    478 {
    479 	mutex_enter(&ct->ct_reflock);
    480 	ct->ct_mzuniqid = zuniqid;
    481 	mutex_exit(&ct->ct_reflock);
    482 }
    483 
    484 /*
    485  * contract_abandon
    486  *
    487  * Abandons the specified contract.  If "explicit" is clear, the
    488  * contract was implicitly abandoned (by process exit) and should be
    489  * inherited if its terms allow it and its owner was a member of a
    490  * regent contract.  Otherwise, the contract type's abandon entry point
    491  * is invoked to either destroy or orphan the contract.
    492  */
    493 int
    494 contract_abandon(contract_t *ct, proc_t *p, int explicit)
    495 {
    496 	ct_equeue_t *q = NULL;
    497 	contract_t *parent = &p->p_ct_process->conp_contract;
    498 	int inherit = 0;
    499 
    500 	ASSERT(p == curproc);
    501 
    502 	mutex_enter(&ct->ct_lock);
    503 
    504 	/*
    505 	 * Multiple contract locks are taken contract -> subcontract.
    506 	 * Check if the contract will be inherited so we can acquire
    507 	 * all the necessary locks before making sensitive changes.
    508 	 */
    509 	if (!explicit && (ct->ct_flags & CTF_INHERIT) &&
    510 	    contract_process_accept(parent)) {
    511 		mutex_exit(&ct->ct_lock);
    512 		mutex_enter(&parent->ct_lock);
    513 		mutex_enter(&ct->ct_lock);
    514 		inherit = 1;
    515 	}
    516 
    517 	if (ct->ct_owner != p) {
    518 		mutex_exit(&ct->ct_lock);
    519 		if (inherit)
    520 			mutex_exit(&parent->ct_lock);
    521 		return (EINVAL);
    522 	}
    523 
    524 	mutex_enter(&p->p_lock);
    525 	if (explicit)
    526 		avl_remove(&p->p_ct_held, ct);
    527 	ct->ct_owner = NULL;
    528 	mutex_exit(&p->p_lock);
    529 
    530 	/*
    531 	 * Since we can't call cte_trim with the contract lock held,
    532 	 * we grab the queue pointer here.
    533 	 */
    534 	if (p->p_ct_equeue)
    535 		q = p->p_ct_equeue[ct->ct_type->ct_type_index];
    536 
    537 	/*
    538 	 * contop_abandon may destroy the contract so we rely on it to
    539 	 * drop ct_lock.  We retain a reference on the contract so that
    540 	 * the cte_trim which follows functions properly.  Even though
    541 	 * cte_trim doesn't dereference the contract pointer, it is
    542 	 * still necessary to retain a reference to the contract so
    543 	 * that we don't trim events which are sent by a subsequently
    544 	 * allocated contract infortuitously located at the same address.
    545 	 */
    546 	contract_hold(ct);
    547 
    548 	if (inherit) {
    549 		ct->ct_state = CTS_INHERITED;
    550 		ASSERT(ct->ct_regent == parent);
    551 		contract_process_take(parent, ct);
    552 
    553 		/*
    554 		 * We are handing off the process's reference to the
    555 		 * parent contract.  For this reason, the order in
    556 		 * which we drop the contract locks is also important.
    557 		 */
    558 		mutex_exit(&ct->ct_lock);
    559 		mutex_exit(&parent->ct_lock);
    560 	} else {
    561 		ct->ct_regent = NULL;
    562 		ct->ct_type->ct_type_ops->contop_abandon(ct);
    563 	}
    564 
    565 	/*
    566 	 * ct_lock has been dropped; we can safely trim the event
    567 	 * queue now.
    568 	 */
    569 	if (q) {
    570 		mutex_enter(&q->ctq_lock);
    571 		cte_trim(q, ct);
    572 		mutex_exit(&q->ctq_lock);
    573 	}
    574 
    575 	contract_rele(ct);
    576 
    577 	return (0);
    578 }
    579 
    580 int
    581 contract_newct(contract_t *ct)
    582 {
    583 	return (ct->ct_type->ct_type_ops->contop_newct(ct));
    584 }
    585 
    586 /*
    587  * contract_adopt
    588  *
    589  * Adopts a contract.  After a successful call to this routine, the
    590  * previously inherited contract will belong to the calling process,
    591  * and its events will have been appended to its new owner's process
    592  * bundle queue.
    593  */
    594 int
    595 contract_adopt(contract_t *ct, proc_t *p)
    596 {
    597 	avl_index_t where;
    598 	ct_equeue_t *q;
    599 	contract_t *parent;
    600 
    601 	ASSERT(p == curproc);
    602 
    603 	/*
    604 	 * Ensure the process has an event queue.  Checked by ASSERTs
    605 	 * below.
    606 	 */
    607 	(void) contract_type_pbundle(ct->ct_type, p);
    608 
    609 	mutex_enter(&ct->ct_lock);
    610 	parent = ct->ct_regent;
    611 	if (ct->ct_state != CTS_INHERITED ||
    612 	    &p->p_ct_process->conp_contract != parent ||
    613 	    p->p_zone->zone_uniqid != ct->ct_czuniqid) {
    614 		mutex_exit(&ct->ct_lock);
    615 		return (EINVAL);
    616 	}
    617 
    618 	/*
    619 	 * Multiple contract locks are taken contract -> subcontract.
    620 	 */
    621 	mutex_exit(&ct->ct_lock);
    622 	mutex_enter(&parent->ct_lock);
    623 	mutex_enter(&ct->ct_lock);
    624 
    625 	/*
    626 	 * It is possible that the contract was adopted by someone else
    627 	 * while its lock was dropped.  It isn't possible for the
    628 	 * contract to have been inherited by a different regent
    629 	 * contract.
    630 	 */
    631 	if (ct->ct_state != CTS_INHERITED) {
    632 		mutex_exit(&parent->ct_lock);
    633 		mutex_exit(&ct->ct_lock);
    634 		return (EBUSY);
    635 	}
    636 	ASSERT(ct->ct_regent == parent);
    637 
    638 	ct->ct_state = CTS_OWNED;
    639 
    640 	contract_process_adopt(ct, p);
    641 
    642 	mutex_enter(&p->p_lock);
    643 	ct->ct_owner = p;
    644 	VERIFY(avl_find(&p->p_ct_held, ct, &where) == NULL);
    645 	avl_insert(&p->p_ct_held, ct, where);
    646 	mutex_exit(&p->p_lock);
    647 
    648 	ASSERT(ct->ct_owner->p_ct_equeue);
    649 	ASSERT(ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]);
    650 	q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index];
    651 	cte_copy(&ct->ct_events, q);
    652 	mutex_exit(&ct->ct_lock);
    653 
    654 	return (0);
    655 }
    656 
    657 /*
    658  * contract_ack
    659  *
    660  * Acknowledges receipt of a critical event.
    661  */
    662 int
    663 contract_ack(contract_t *ct, uint64_t evid, int ack)
    664 {
    665 	ct_kevent_t *ev;
    666 	list_t *queue = &ct->ct_events.ctq_events;
    667 	int error = ESRCH;
    668 	int nego = 0;
    669 	uint_t evtype;
    670 
    671 	ASSERT(ack == CT_ACK || ack == CT_NACK);
    672 
    673 	mutex_enter(&ct->ct_lock);
    674 	mutex_enter(&ct->ct_events.ctq_lock);
    675 	/*
    676 	 * We are probably ACKing something near the head of the queue.
    677 	 */
    678 	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
    679 		if (ev->cte_id == evid) {
    680 			if (ev->cte_flags & CTE_NEG)
    681 				nego = 1;
    682 			else if (ack == CT_NACK)
    683 				break;
    684 			if ((ev->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
    685 				ev->cte_flags |= CTE_ACK;
    686 				ct->ct_evcnt--;
    687 				evtype = ev->cte_type;
    688 				error = 0;
    689 			}
    690 			break;
    691 		}
    692 	}
    693 	mutex_exit(&ct->ct_events.ctq_lock);
    694 	mutex_exit(&ct->ct_lock);
    695 
    696 	/*
    697 	 * Not all critical events are negotiation events, however
    698 	 * every negotiation event is a critical event. NEGEND events
    699 	 * are critical events but are not negotiation events
    700 	 */
    701 	if (error || !nego)
    702 		return (error);
    703 
    704 	if (ack == CT_ACK)
    705 		error = ct->ct_type->ct_type_ops->contop_ack(ct, evtype, evid);
    706 	else
    707 		error = ct->ct_type->ct_type_ops->contop_nack(ct, evtype, evid);
    708 
    709 	return (error);
    710 }
    711 
    712 /*ARGSUSED*/
    713 int
    714 contract_ack_inval(contract_t *ct, uint_t evtype, uint64_t evid)
    715 {
    716 	cmn_err(CE_PANIC, "contract_ack_inval: unsupported call: ctid: %u",
    717 	    ct->ct_id);
    718 	return (ENOSYS);
    719 }
    720 
    721 /*ARGSUSED*/
    722 int
    723 contract_qack_inval(contract_t *ct, uint_t evtype, uint64_t evid)
    724 {
    725 	cmn_err(CE_PANIC, "contract_ack_inval: unsupported call: ctid: %u",
    726 	    ct->ct_id);
    727 	return (ENOSYS);
    728 }
    729 
    730 /*ARGSUSED*/
    731 int
    732 contract_qack_notsup(contract_t *ct, uint_t evtype, uint64_t evid)
    733 {
    734 	return (ERANGE);
    735 }
    736 
    737 /*
    738  * contract_qack
    739  *
    740  * Asks that negotiations be extended by another time quantum
    741  */
    742 int
    743 contract_qack(contract_t *ct, uint64_t evid)
    744 {
    745 	ct_kevent_t *ev;
    746 	list_t *queue = &ct->ct_events.ctq_events;
    747 	int nego = 0;
    748 	uint_t evtype;
    749 
    750 	mutex_enter(&ct->ct_lock);
    751 	mutex_enter(&ct->ct_events.ctq_lock);
    752 
    753 	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
    754 		if (ev->cte_id == evid) {
    755 			if ((ev->cte_flags & (CTE_NEG | CTE_ACK)) == CTE_NEG) {
    756 				evtype = ev->cte_type;
    757 				nego = 1;
    758 			}
    759 			break;
    760 		}
    761 	}
    762 	mutex_exit(&ct->ct_events.ctq_lock);
    763 	mutex_exit(&ct->ct_lock);
    764 
    765 	/*
    766 	 * Only a negotiated event (which is by definition also a critical
    767 	 * event) which has not yet been acknowledged can provide
    768 	 * time quanta to a negotiating owner process.
    769 	 */
    770 	if (!nego)
    771 		return (ESRCH);
    772 
    773 	return (ct->ct_type->ct_type_ops->contop_qack(ct, evtype, evid));
    774 }
    775 
    776 /*
    777  * contract_orphan
    778  *
    779  * Icky-poo.  This is a process-contract special, used to ACK all
    780  * critical messages when a contract is orphaned.
    781  */
    782 void
    783 contract_orphan(contract_t *ct)
    784 {
    785 	ct_kevent_t *ev;
    786 	list_t *queue = &ct->ct_events.ctq_events;
    787 
    788 	ASSERT(MUTEX_HELD(&ct->ct_lock));
    789 	ASSERT(ct->ct_state != CTS_ORPHAN);
    790 
    791 	mutex_enter(&ct->ct_events.ctq_lock);
    792 	ct->ct_state = CTS_ORPHAN;
    793 	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
    794 		if ((ev->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
    795 			ev->cte_flags |= CTE_ACK;
    796 			ct->ct_evcnt--;
    797 		}
    798 	}
    799 	mutex_exit(&ct->ct_events.ctq_lock);
    800 
    801 	ASSERT(ct->ct_evcnt == 0);
    802 }
    803 
    804 /*
    805  * contract_destroy
    806  *
    807  * Explicit contract destruction.  Called when contract is empty.
    808  * The contract will actually stick around until all of its events are
    809  * removed from the bundle and and process bundle queues, and all fds
    810  * which refer to it are closed.  See contract_dtor if you are looking
    811  * for what destroys the contract structure.
    812  */
    813 void
    814 contract_destroy(contract_t *ct)
    815 {
    816 	ASSERT(MUTEX_HELD(&ct->ct_lock));
    817 	ASSERT(ct->ct_state != CTS_DEAD);
    818 	ASSERT(ct->ct_owner == NULL);
    819 
    820 	ct->ct_state = CTS_DEAD;
    821 	cte_queue_drain(&ct->ct_events, 1);
    822 	mutex_exit(&ct->ct_lock);
    823 	mutex_enter(&ct->ct_type->ct_type_events.ctq_lock);
    824 	cte_trim(&ct->ct_type->ct_type_events, ct);
    825 	mutex_exit(&ct->ct_type->ct_type_events.ctq_lock);
    826 	mutex_enter(&ct->ct_lock);
    827 	ct->ct_type->ct_type_ops->contop_destroy(ct);
    828 	mutex_exit(&ct->ct_lock);
    829 	contract_rele(ct);
    830 }
    831 
    832 /*
    833  * contract_vnode_get
    834  *
    835  * Obtains the contract directory vnode for this contract, if there is
    836  * one.  The caller must VN_RELE the vnode when they are through using
    837  * it.
    838  */
    839 vnode_t *
    840 contract_vnode_get(contract_t *ct, vfs_t *vfsp)
    841 {
    842 	contract_vnode_t *ctv;
    843 	vnode_t *vp = NULL;
    844 
    845 	mutex_enter(&ct->ct_lock);
    846 	for (ctv = list_head(&ct->ct_vnodes); ctv != NULL;
    847 	    ctv = list_next(&ct->ct_vnodes, ctv))
    848 		if (ctv->ctv_vnode->v_vfsp == vfsp) {
    849 			vp = ctv->ctv_vnode;
    850 			VN_HOLD(vp);
    851 			break;
    852 		}
    853 	mutex_exit(&ct->ct_lock);
    854 	return (vp);
    855 }
    856 
    857 /*
    858  * contract_vnode_set
    859  *
    860  * Sets the contract directory vnode for this contract.  We don't hold
    861  * a reference on the vnode because we don't want to prevent it from
    862  * being freed.  The vnode's inactive entry point will take care of
    863  * notifying us when it should be removed.
    864  */
    865 void
    866 contract_vnode_set(contract_t *ct, contract_vnode_t *ctv, vnode_t *vnode)
    867 {
    868 	mutex_enter(&ct->ct_lock);
    869 	ctv->ctv_vnode = vnode;
    870 	list_insert_head(&ct->ct_vnodes, ctv);
    871 	mutex_exit(&ct->ct_lock);
    872 }
    873 
    874 /*
    875  * contract_vnode_clear
    876  *
    877  * Removes this vnode as the contract directory vnode for this
    878  * contract.  Called from a contract directory's inactive entry point,
    879  * this may return 0 indicating that the vnode gained another reference
    880  * because of a simultaneous call to contract_vnode_get.
    881  */
    882 int
    883 contract_vnode_clear(contract_t *ct, contract_vnode_t *ctv)
    884 {
    885 	vnode_t *vp = ctv->ctv_vnode;
    886 	int result;
    887 
    888 	mutex_enter(&ct->ct_lock);
    889 	mutex_enter(&vp->v_lock);
    890 	if (vp->v_count == 1) {
    891 		list_remove(&ct->ct_vnodes, ctv);
    892 		result = 1;
    893 	} else {
    894 		vp->v_count--;
    895 		result = 0;
    896 	}
    897 	mutex_exit(&vp->v_lock);
    898 	mutex_exit(&ct->ct_lock);
    899 
    900 	return (result);
    901 }
    902 
    903 /*
    904  * contract_exit
    905  *
    906  * Abandons all contracts held by process p, and drains process p's
    907  * bundle queues.  Called on process exit.
    908  */
    909 void
    910 contract_exit(proc_t *p)
    911 {
    912 	contract_t *ct;
    913 	void *cookie = NULL;
    914 	int i;
    915 
    916 	ASSERT(p == curproc);
    917 
    918 	/*
    919 	 * Abandon held contracts.  contract_abandon knows enough not
    920 	 * to remove the contract from the list a second time.  We are
    921 	 * exiting, so no locks are needed here.  But because
    922 	 * contract_abandon will take p_lock, we need to make sure we
    923 	 * aren't holding it.
    924 	 */
    925 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
    926 	while ((ct = avl_destroy_nodes(&p->p_ct_held, &cookie)) != NULL)
    927 		VERIFY(contract_abandon(ct, p, 0) == 0);
    928 
    929 	/*
    930 	 * Drain pbundles.  Because a process bundle queue could have
    931 	 * been passed to another process, they may not be freed right
    932 	 * away.
    933 	 */
    934 	if (p->p_ct_equeue) {
    935 		for (i = 0; i < CTT_MAXTYPE; i++)
    936 			if (p->p_ct_equeue[i])
    937 				cte_queue_drain(p->p_ct_equeue[i], 0);
    938 		kmem_free(p->p_ct_equeue, CTT_MAXTYPE * sizeof (ct_equeue_t *));
    939 	}
    940 }
    941 
    942 static int
    943 get_time_left(struct ct_time *t)
    944 {
    945 	clock_t ticks_elapsed;
    946 	int secs_elapsed;
    947 
    948 	if (t->ctm_total == -1)
    949 		return (-1);
    950 
    951 	ticks_elapsed = ddi_get_lbolt() - t->ctm_start;
    952 	secs_elapsed = t->ctm_total - (drv_hztousec(ticks_elapsed)/MICROSEC);
    953 	return (secs_elapsed > 0 ? secs_elapsed : 0);
    954 }
    955 
    956 /*
    957  * contract_status_common
    958  *
    959  * Populates a ct_status structure.  Used by contract types in their
    960  * status entry points and ctfs when only common information is
    961  * requested.
    962  */
    963 void
    964 contract_status_common(contract_t *ct, zone_t *zone, void *status,
    965     model_t model)
    966 {
    967 	STRUCT_HANDLE(ct_status, lstatus);
    968 
    969 	STRUCT_SET_HANDLE(lstatus, model, status);
    970 	ASSERT(MUTEX_HELD(&ct->ct_lock));
    971 	if (zone->zone_uniqid == GLOBAL_ZONEUNIQID ||
    972 	    zone->zone_uniqid == ct->ct_czuniqid) {
    973 		zone_t *czone;
    974 		zoneid_t zoneid = -1;
    975 
    976 		/*
    977 		 * Contracts don't have holds on the zones they were
    978 		 * created by.  If the contract's zone no longer
    979 		 * exists, we say its zoneid is -1.
    980 		 */
    981 		if (zone->zone_uniqid == ct->ct_czuniqid ||
    982 		    ct->ct_czuniqid == GLOBAL_ZONEUNIQID) {
    983 			zoneid = ct->ct_zoneid;
    984 		} else if ((czone = zone_find_by_id(ct->ct_zoneid)) != NULL) {
    985 			if (czone->zone_uniqid == ct->ct_mzuniqid)
    986 				zoneid = ct->ct_zoneid;
    987 			zone_rele(czone);
    988 		}
    989 
    990 		STRUCT_FSET(lstatus, ctst_zoneid, zoneid);
    991 		STRUCT_FSET(lstatus, ctst_holder,
    992 		    (ct->ct_state == CTS_OWNED) ? ct->ct_owner->p_pid :
    993 		    (ct->ct_state == CTS_INHERITED) ? ct->ct_regent->ct_id : 0);
    994 		STRUCT_FSET(lstatus, ctst_state, ct->ct_state);
    995 	} else {
    996 		/*
    997 		 * We are looking at a contract which was created by a
    998 		 * process outside of our zone.  We provide fake zone,
    999 		 * holder, and state information.
   1000 		 */
   1001 
   1002 		STRUCT_FSET(lstatus, ctst_zoneid, zone->zone_id);
   1003 		/*
   1004 		 * Since "zone" can't disappear until the calling ctfs
   1005 		 * is unmounted, zone_zsched must be valid.
   1006 		 */
   1007 		STRUCT_FSET(lstatus, ctst_holder, (ct->ct_state < CTS_ORPHAN) ?
   1008 		    zone->zone_zsched->p_pid : 0);
   1009 		STRUCT_FSET(lstatus, ctst_state, (ct->ct_state < CTS_ORPHAN) ?
   1010 		    CTS_OWNED : ct->ct_state);
   1011 	}
   1012 	STRUCT_FSET(lstatus, ctst_nevents, ct->ct_evcnt);
   1013 	STRUCT_FSET(lstatus, ctst_ntime, get_time_left(&ct->ct_ntime));
   1014 	STRUCT_FSET(lstatus, ctst_qtime, get_time_left(&ct->ct_qtime));
   1015 	STRUCT_FSET(lstatus, ctst_nevid,
   1016 	    ct->ct_nevent ? ct->ct_nevent->cte_id : 0);
   1017 	STRUCT_FSET(lstatus, ctst_critical, ct->ct_ev_crit);
   1018 	STRUCT_FSET(lstatus, ctst_informative, ct->ct_ev_info);
   1019 	STRUCT_FSET(lstatus, ctst_cookie, ct->ct_cookie);
   1020 	STRUCT_FSET(lstatus, ctst_type, ct->ct_type->ct_type_index);
   1021 	STRUCT_FSET(lstatus, ctst_id, ct->ct_id);
   1022 }
   1023 
   1024 /*
   1025  * contract_checkcred
   1026  *
   1027  * Determines if the specified contract is owned by a process with the
   1028  * same effective uid as the specified credential.  The caller must
   1029  * ensure that the uid spaces are the same.  Returns 1 on success.
   1030  */
   1031 static int
   1032 contract_checkcred(contract_t *ct, const cred_t *cr)
   1033 {
   1034 	proc_t *p;
   1035 	int fail = 1;
   1036 
   1037 	mutex_enter(&ct->ct_lock);
   1038 	if ((p = ct->ct_owner) != NULL) {
   1039 		mutex_enter(&p->p_crlock);
   1040 		fail = crgetuid(cr) != crgetuid(p->p_cred);
   1041 		mutex_exit(&p->p_crlock);
   1042 	}
   1043 	mutex_exit(&ct->ct_lock);
   1044 
   1045 	return (!fail);
   1046 }
   1047 
   1048 /*
   1049  * contract_owned
   1050  *
   1051  * Determines if the specified credential can view an event generated
   1052  * by the specified contract.  If locked is set, the contract's ct_lock
   1053  * is held and the caller will need to do additional work to determine
   1054  * if they truly can see the event.  Returns 1 on success.
   1055  */
   1056 int
   1057 contract_owned(contract_t *ct, const cred_t *cr, int locked)
   1058 {
   1059 	int owner, cmatch, zmatch;
   1060 	uint64_t zuniqid, mzuniqid;
   1061 	uid_t euid;
   1062 
   1063 	ASSERT(locked || MUTEX_NOT_HELD(&ct->ct_lock));
   1064 
   1065 	zuniqid = curproc->p_zone->zone_uniqid;
   1066 	mzuniqid = contract_getzuniqid(ct);
   1067 	euid = crgetuid(cr);
   1068 
   1069 	/*
   1070 	 * owner: we own the contract
   1071 	 * cmatch: we are in the creator's (and holder's) zone and our
   1072 	 *   uid matches the creator's or holder's
   1073 	 * zmatch: we are in the effective zone of a contract created
   1074 	 *   in the global zone, and our uid matches that of the
   1075 	 *   virtualized holder's (zsched/kcred)
   1076 	 */
   1077 	owner = (ct->ct_owner == curproc);
   1078 	cmatch = (zuniqid == ct->ct_czuniqid) &&
   1079 	    ((ct->ct_cuid == euid) || (!locked && contract_checkcred(ct, cr)));
   1080 	zmatch = (ct->ct_czuniqid != mzuniqid) && (zuniqid == mzuniqid) &&
   1081 	    (crgetuid(kcred) == euid);
   1082 
   1083 	return (owner || cmatch || zmatch);
   1084 }
   1085 
   1086 
   1087 /*
   1088  * contract_type_init
   1089  *
   1090  * Called by contract types to register themselves with the contracts
   1091  * framework.
   1092  */
   1093 ct_type_t *
   1094 contract_type_init(ct_typeid_t type, const char *name, contops_t *ops,
   1095     ct_f_default_t *dfault)
   1096 {
   1097 	ct_type_t *result;
   1098 
   1099 	ASSERT(type < CTT_MAXTYPE);
   1100 
   1101 	result = kmem_alloc(sizeof (ct_type_t), KM_SLEEP);
   1102 
   1103 	mutex_init(&result->ct_type_lock, NULL, MUTEX_DEFAULT, NULL);
   1104 	avl_create(&result->ct_type_avl, contract_compar, sizeof (contract_t),
   1105 	    offsetof(contract_t, ct_cttavl));
   1106 	cte_queue_create(&result->ct_type_events, CTEL_BUNDLE, 20, 0);
   1107 	result->ct_type_name = name;
   1108 	result->ct_type_ops = ops;
   1109 	result->ct_type_default = dfault;
   1110 	result->ct_type_evid = 0;
   1111 	gethrestime(&result->ct_type_timestruc);
   1112 	result->ct_type_index = type;
   1113 
   1114 	ct_types[type] = result;
   1115 
   1116 	return (result);
   1117 }
   1118 
   1119 /*
   1120  * contract_type_count
   1121  *
   1122  * Obtains the number of contracts of a particular type.
   1123  */
   1124 int
   1125 contract_type_count(ct_type_t *type)
   1126 {
   1127 	ulong_t count;
   1128 
   1129 	mutex_enter(&type->ct_type_lock);
   1130 	count = avl_numnodes(&type->ct_type_avl);
   1131 	mutex_exit(&type->ct_type_lock);
   1132 
   1133 	return (count);
   1134 }
   1135 
   1136 /*
   1137  * contract_type_max
   1138  *
   1139  * Obtains the maximum contract id of of a particular type.
   1140  */
   1141 ctid_t
   1142 contract_type_max(ct_type_t *type)
   1143 {
   1144 	contract_t *ct;
   1145 	ctid_t res;
   1146 
   1147 	mutex_enter(&type->ct_type_lock);
   1148 	ct = avl_last(&type->ct_type_avl);
   1149 	res = ct ? ct->ct_id : -1;
   1150 	mutex_exit(&type->ct_type_lock);
   1151 
   1152 	return (res);
   1153 }
   1154 
   1155 /*
   1156  * contract_max
   1157  *
   1158  * Obtains the maximum contract id.
   1159  */
   1160 ctid_t
   1161 contract_max(void)
   1162 {
   1163 	contract_t *ct;
   1164 	ctid_t res;
   1165 
   1166 	mutex_enter(&contract_lock);
   1167 	ct = avl_last(&contract_avl);
   1168 	res = ct ? ct->ct_id : -1;
   1169 	mutex_exit(&contract_lock);
   1170 
   1171 	return (res);
   1172 }
   1173 
   1174 /*
   1175  * contract_lookup_common
   1176  *
   1177  * Common code for contract_lookup and contract_type_lookup.  Takes a
   1178  * pointer to an AVL tree to search in.  Should be called with the
   1179  * appropriate tree-protecting lock held (unfortunately unassertable).
   1180  */
   1181 static ctid_t
   1182 contract_lookup_common(avl_tree_t *tree, uint64_t zuniqid, ctid_t current)
   1183 {
   1184 	contract_t template, *ct;
   1185 	avl_index_t where;
   1186 	ctid_t res;
   1187 
   1188 	template.ct_id = current;
   1189 	ct = avl_find(tree, &template, &where);
   1190 	if (ct == NULL)
   1191 		ct = avl_nearest(tree, where, AVL_AFTER);
   1192 	if (zuniqid != GLOBAL_ZONEUNIQID)
   1193 		while (ct && (contract_getzuniqid(ct) != zuniqid))
   1194 			ct = AVL_NEXT(tree, ct);
   1195 	res = ct ? ct->ct_id : -1;
   1196 
   1197 	return (res);
   1198 }
   1199 
   1200 /*
   1201  * contract_type_lookup
   1202  *
   1203  * Returns the next type contract after the specified id, visible from
   1204  * the specified zone.
   1205  */
   1206 ctid_t
   1207 contract_type_lookup(ct_type_t *type, uint64_t zuniqid, ctid_t current)
   1208 {
   1209 	ctid_t res;
   1210 
   1211 	mutex_enter(&type->ct_type_lock);
   1212 	res = contract_lookup_common(&type->ct_type_avl, zuniqid, current);
   1213 	mutex_exit(&type->ct_type_lock);
   1214 
   1215 	return (res);
   1216 }
   1217 
   1218 /*
   1219  * contract_lookup
   1220  *
   1221  * Returns the next contract after the specified id, visible from the
   1222  * specified zone.
   1223  */
   1224 ctid_t
   1225 contract_lookup(uint64_t zuniqid, ctid_t current)
   1226 {
   1227 	ctid_t res;
   1228 
   1229 	mutex_enter(&contract_lock);
   1230 	res = contract_lookup_common(&contract_avl, zuniqid, current);
   1231 	mutex_exit(&contract_lock);
   1232 
   1233 	return (res);
   1234 }
   1235 
   1236 /*
   1237  * contract_plookup
   1238  *
   1239  * Returns the next contract held by process p after the specified id,
   1240  * visible from the specified zone.  Made complicated by the fact that
   1241  * contracts visible in a zone but held by processes outside of the
   1242  * zone need to appear as being held by zsched to zone members.
   1243  */
   1244 ctid_t
   1245 contract_plookup(proc_t *p, ctid_t current, uint64_t zuniqid)
   1246 {
   1247 	contract_t template, *ct;
   1248 	avl_index_t where;
   1249 	ctid_t res;
   1250 
   1251 	template.ct_id = current;
   1252 	if (zuniqid != GLOBAL_ZONEUNIQID &&
   1253 	    (p->p_flag & (SSYS|SZONETOP)) == (SSYS|SZONETOP)) {
   1254 		/* This is inelegant. */
   1255 		mutex_enter(&contract_lock);
   1256 		ct = avl_find(&contract_avl, &template, &where);
   1257 		if (ct == NULL)
   1258 			ct = avl_nearest(&contract_avl, where, AVL_AFTER);
   1259 		while (ct && !(ct->ct_state < CTS_ORPHAN &&
   1260 		    contract_getzuniqid(ct) == zuniqid &&
   1261 		    ct->ct_czuniqid == GLOBAL_ZONEUNIQID))
   1262 			ct = AVL_NEXT(&contract_avl, ct);
   1263 		res = ct ? ct->ct_id : -1;
   1264 		mutex_exit(&contract_lock);
   1265 	} else {
   1266 		mutex_enter(&p->p_lock);
   1267 		ct = avl_find(&p->p_ct_held, &template, &where);
   1268 		if (ct == NULL)
   1269 			ct = avl_nearest(&p->p_ct_held, where, AVL_AFTER);
   1270 		res = ct ? ct->ct_id : -1;
   1271 		mutex_exit(&p->p_lock);
   1272 	}
   1273 
   1274 	return (res);
   1275 }
   1276 
   1277 /*
   1278  * contract_ptr_common
   1279  *
   1280  * Common code for contract_ptr and contract_type_ptr.  Takes a pointer
   1281  * to an AVL tree to search in.  Should be called with the appropriate
   1282  * tree-protecting lock held (unfortunately unassertable).
   1283  */
   1284 static contract_t *
   1285 contract_ptr_common(avl_tree_t *tree, ctid_t id, uint64_t zuniqid)
   1286 {
   1287 	contract_t template, *ct;
   1288 
   1289 	template.ct_id = id;
   1290 	ct = avl_find(tree, &template, NULL);
   1291 	if (ct == NULL || (zuniqid != GLOBAL_ZONEUNIQID &&
   1292 	    contract_getzuniqid(ct) != zuniqid)) {
   1293 		return (NULL);
   1294 	}
   1295 
   1296 	/*
   1297 	 * Check to see if a thread is in the window in contract_rele
   1298 	 * between dropping the reference count and removing the
   1299 	 * contract from the type AVL.
   1300 	 */
   1301 	mutex_enter(&ct->ct_reflock);
   1302 	if (ct->ct_ref) {
   1303 		ct->ct_ref++;
   1304 		mutex_exit(&ct->ct_reflock);
   1305 	} else {
   1306 		mutex_exit(&ct->ct_reflock);
   1307 		ct = NULL;
   1308 	}
   1309 
   1310 	return (ct);
   1311 }
   1312 
   1313 /*
   1314  * contract_type_ptr
   1315  *
   1316  * Returns a pointer to the contract with the specified id.  The
   1317  * contract is held, so the caller needs to release the reference when
   1318  * it is through with the contract.
   1319  */
   1320 contract_t *
   1321 contract_type_ptr(ct_type_t *type, ctid_t id, uint64_t zuniqid)
   1322 {
   1323 	contract_t *ct;
   1324 
   1325 	mutex_enter(&type->ct_type_lock);
   1326 	ct = contract_ptr_common(&type->ct_type_avl, id, zuniqid);
   1327 	mutex_exit(&type->ct_type_lock);
   1328 
   1329 	return (ct);
   1330 }
   1331 
   1332 /*
   1333  * contract_ptr
   1334  *
   1335  * Returns a pointer to the contract with the specified id.  The
   1336  * contract is held, so the caller needs to release the reference when
   1337  * it is through with the contract.
   1338  */
   1339 contract_t *
   1340 contract_ptr(ctid_t id, uint64_t zuniqid)
   1341 {
   1342 	contract_t *ct;
   1343 
   1344 	mutex_enter(&contract_lock);
   1345 	ct = contract_ptr_common(&contract_avl, id, zuniqid);
   1346 	mutex_exit(&contract_lock);
   1347 
   1348 	return (ct);
   1349 }
   1350 
   1351 /*
   1352  * contract_type_time
   1353  *
   1354  * Obtains the last time a contract of a particular type was created.
   1355  */
   1356 void
   1357 contract_type_time(ct_type_t *type, timestruc_t *time)
   1358 {
   1359 	mutex_enter(&type->ct_type_lock);
   1360 	*time = type->ct_type_timestruc;
   1361 	mutex_exit(&type->ct_type_lock);
   1362 }
   1363 
   1364 /*
   1365  * contract_type_bundle
   1366  *
   1367  * Obtains a type's bundle queue.
   1368  */
   1369 ct_equeue_t *
   1370 contract_type_bundle(ct_type_t *type)
   1371 {
   1372 	return (&type->ct_type_events);
   1373 }
   1374 
   1375 /*
   1376  * contract_type_pbundle
   1377  *
   1378  * Obtain's a process's bundle queue.  If one doesn't exist, one is
   1379  * created.  Often used simply to ensure that a bundle queue is
   1380  * allocated.
   1381  */
   1382 ct_equeue_t *
   1383 contract_type_pbundle(ct_type_t *type, proc_t *pp)
   1384 {
   1385 	/*
   1386 	 * If there isn't an array of bundle queues, allocate one.
   1387 	 */
   1388 	if (pp->p_ct_equeue == NULL) {
   1389 		size_t size = CTT_MAXTYPE * sizeof (ct_equeue_t *);
   1390 		ct_equeue_t **qa = kmem_zalloc(size, KM_SLEEP);
   1391 
   1392 		mutex_enter(&pp->p_lock);
   1393 		if (pp->p_ct_equeue)
   1394 			kmem_free(qa, size);
   1395 		else
   1396 			pp->p_ct_equeue = qa;
   1397 		mutex_exit(&pp->p_lock);
   1398 	}
   1399 
   1400 	/*
   1401 	 * If there isn't a bundle queue of the required type, allocate
   1402 	 * one.
   1403 	 */
   1404 	if (pp->p_ct_equeue[type->ct_type_index] == NULL) {
   1405 		ct_equeue_t *q = kmem_zalloc(sizeof (ct_equeue_t), KM_SLEEP);
   1406 		cte_queue_create(q, CTEL_PBUNDLE, 20, 1);
   1407 
   1408 		mutex_enter(&pp->p_lock);
   1409 		if (pp->p_ct_equeue[type->ct_type_index])
   1410 			cte_queue_drain(q, 0);
   1411 		else
   1412 			pp->p_ct_equeue[type->ct_type_index] = q;
   1413 		mutex_exit(&pp->p_lock);
   1414 	}
   1415 
   1416 	return (pp->p_ct_equeue[type->ct_type_index]);
   1417 }
   1418 
   1419 /*
   1420  * ctparam_copyin
   1421  *
   1422  * copyin a ct_param_t for CT_TSET or CT_TGET commands.
   1423  * If ctparam_copyout() is not called after ctparam_copyin(), then
   1424  * the caller must kmem_free() the buffer pointed by kparam->ctpm_kbuf.
   1425  *
   1426  * The copyin/out of ct_param_t is not done in ctmpl_set() and ctmpl_get()
   1427  * because prctioctl() calls ctmpl_set() and ctmpl_get() while holding a
   1428  * process lock.
   1429  */
   1430 int
   1431 ctparam_copyin(const void *uaddr, ct_kparam_t *kparam, int flag, int cmd)
   1432 {
   1433 	uint32_t size;
   1434 	void *ubuf;
   1435 	ct_param_t *param = &kparam->param;
   1436 	STRUCT_DECL(ct_param, uarg);
   1437 
   1438 	STRUCT_INIT(uarg, flag);
   1439 	if (copyin(uaddr, STRUCT_BUF(uarg), STRUCT_SIZE(uarg)))
   1440 		return (EFAULT);
   1441 	size = STRUCT_FGET(uarg, ctpm_size);
   1442 	ubuf = STRUCT_FGETP(uarg, ctpm_value);
   1443 
   1444 	if (size > CT_PARAM_MAX_SIZE || size == 0)
   1445 		return (EINVAL);
   1446 
   1447 	kparam->ctpm_kbuf = kmem_alloc(size, KM_SLEEP);
   1448 	if (cmd == CT_TSET) {
   1449 		if (copyin(ubuf, kparam->ctpm_kbuf, size)) {
   1450 			kmem_free(kparam->ctpm_kbuf, size);
   1451 			return (EFAULT);
   1452 		}
   1453 	}
   1454 	param->ctpm_id = STRUCT_FGET(uarg, ctpm_id);
   1455 	param->ctpm_size = size;
   1456 	param->ctpm_value = ubuf;
   1457 	kparam->ret_size = 0;
   1458 
   1459 	return (0);
   1460 }
   1461 
   1462 /*
   1463  * ctparam_copyout
   1464  *
   1465  * copyout a ct_kparam_t and frees the buffer pointed by the member
   1466  * ctpm_kbuf of ct_kparam_t
   1467  */
   1468 int
   1469 ctparam_copyout(ct_kparam_t *kparam, void *uaddr, int flag)
   1470 {
   1471 	int r = 0;
   1472 	ct_param_t *param = &kparam->param;
   1473 	STRUCT_DECL(ct_param, uarg);
   1474 
   1475 	STRUCT_INIT(uarg, flag);
   1476 
   1477 	STRUCT_FSET(uarg, ctpm_id, param->ctpm_id);
   1478 	STRUCT_FSET(uarg, ctpm_size, kparam->ret_size);
   1479 	STRUCT_FSETP(uarg, ctpm_value, param->ctpm_value);
   1480 	if (copyout(STRUCT_BUF(uarg), uaddr, STRUCT_SIZE(uarg))) {
   1481 		r = EFAULT;
   1482 		goto error;
   1483 	}
   1484 	if (copyout(kparam->ctpm_kbuf, param->ctpm_value,
   1485 	    MIN(kparam->ret_size, param->ctpm_size))) {
   1486 		r = EFAULT;
   1487 	}
   1488 
   1489 error:
   1490 	kmem_free(kparam->ctpm_kbuf, param->ctpm_size);
   1491 
   1492 	return (r);
   1493 }
   1494 
   1495 /*
   1496  * ctmpl_free
   1497  *
   1498  * Frees a template.
   1499  */
   1500 void
   1501 ctmpl_free(ct_template_t *template)
   1502 {
   1503 	mutex_destroy(&template->ctmpl_lock);
   1504 	template->ctmpl_ops->ctop_free(template);
   1505 }
   1506 
   1507 /*
   1508  * ctmpl_dup
   1509  *
   1510  * Creates a copy of a template.
   1511  */
   1512 ct_template_t *
   1513 ctmpl_dup(ct_template_t *template)
   1514 {
   1515 	ct_template_t *new;
   1516 
   1517 	if (template == NULL)
   1518 		return (NULL);
   1519 
   1520 	new = template->ctmpl_ops->ctop_dup(template);
   1521 	/*
   1522 	 * ctmpl_lock was taken by ctop_dup's call to ctmpl_copy and
   1523 	 * should have remain held until now.
   1524 	 */
   1525 	mutex_exit(&template->ctmpl_lock);
   1526 
   1527 	return (new);
   1528 }
   1529 
   1530 /*
   1531  * ctmpl_set
   1532  *
   1533  * Sets the requested terms of a template.
   1534  */
   1535 int
   1536 ctmpl_set(ct_template_t *template, ct_kparam_t *kparam, const cred_t *cr)
   1537 {
   1538 	int result = 0;
   1539 	ct_param_t *param = &kparam->param;
   1540 	uint64_t param_value;
   1541 
   1542 	if (param->ctpm_id == CTP_COOKIE ||
   1543 	    param->ctpm_id == CTP_EV_INFO ||
   1544 	    param->ctpm_id == CTP_EV_CRITICAL) {
   1545 		if (param->ctpm_size < sizeof (uint64_t)) {
   1546 			return (EINVAL);
   1547 		} else {
   1548 			param_value = *(uint64_t *)kparam->ctpm_kbuf;
   1549 		}
   1550 	}
   1551 
   1552 	mutex_enter(&template->ctmpl_lock);
   1553 	switch (param->ctpm_id) {
   1554 	case CTP_COOKIE:
   1555 		template->ctmpl_cookie = param_value;
   1556 		break;
   1557 	case CTP_EV_INFO:
   1558 		if (param_value & ~(uint64_t)template->ctmpl_ops->allevents)
   1559 			result = EINVAL;
   1560 		else
   1561 			template->ctmpl_ev_info = param_value;
   1562 		break;
   1563 	case CTP_EV_CRITICAL:
   1564 		if (param_value & ~(uint64_t)template->ctmpl_ops->allevents) {
   1565 			result = EINVAL;
   1566 			break;
   1567 		} else if ((~template->ctmpl_ev_crit & param_value) == 0) {
   1568 			/*
   1569 			 * Assume that a pure reduction of the critical
   1570 			 * set is allowed by the contract type.
   1571 			 */
   1572 			template->ctmpl_ev_crit = param_value;
   1573 			break;
   1574 		}
   1575 		/*
   1576 		 * There may be restrictions on what we can make
   1577 		 * critical, so we defer to the judgement of the
   1578 		 * contract type.
   1579 		 */
   1580 		/* FALLTHROUGH */
   1581 	default:
   1582 		result = template->ctmpl_ops->ctop_set(template, kparam, cr);
   1583 	}
   1584 	mutex_exit(&template->ctmpl_lock);
   1585 
   1586 	return (result);
   1587 }
   1588 
   1589 /*
   1590  * ctmpl_get
   1591  *
   1592  * Obtains the requested terms from a template.
   1593  *
   1594  * If the term requested is a variable-sized term and the buffer
   1595  * provided is too small for the data, we truncate the data and return
   1596  * the buffer size necessary to fit the term in kparam->ret_size. If the
   1597  * term requested is fix-sized (uint64_t) and the buffer provided is too
   1598  * small, we return EINVAL.  This should never happen if you're using
   1599  * libcontract(3LIB), only if you call ioctl with a hand constructed
   1600  * ct_param_t argument.
   1601  *
   1602  * Currently, only contract specific parameters have variable-sized
   1603  * parameters.
   1604  */
   1605 int
   1606 ctmpl_get(ct_template_t *template, ct_kparam_t *kparam)
   1607 {
   1608 	int result = 0;
   1609 	ct_param_t *param = &kparam->param;
   1610 	uint64_t *param_value;
   1611 
   1612 	if (param->ctpm_id == CTP_COOKIE ||
   1613 	    param->ctpm_id == CTP_EV_INFO ||
   1614 	    param->ctpm_id == CTP_EV_CRITICAL) {
   1615 		if (param->ctpm_size < sizeof (uint64_t)) {
   1616 			return (EINVAL);
   1617 		} else {
   1618 			param_value = kparam->ctpm_kbuf;
   1619 			kparam->ret_size = sizeof (uint64_t);
   1620 		}
   1621 	}
   1622 
   1623 	mutex_enter(&template->ctmpl_lock);
   1624 	switch (param->ctpm_id) {
   1625 	case CTP_COOKIE:
   1626 		*param_value = template->ctmpl_cookie;
   1627 		break;
   1628 	case CTP_EV_INFO:
   1629 		*param_value = template->ctmpl_ev_info;
   1630 		break;
   1631 	case CTP_EV_CRITICAL:
   1632 		*param_value = template->ctmpl_ev_crit;
   1633 		break;
   1634 	default:
   1635 		result = template->ctmpl_ops->ctop_get(template, kparam);
   1636 	}
   1637 	mutex_exit(&template->ctmpl_lock);
   1638 
   1639 	return (result);
   1640 }
   1641 
   1642 /*
   1643  * ctmpl_makecurrent
   1644  *
   1645  * Used by ctmpl_activate and ctmpl_clear to set the current thread's
   1646  * active template.  Frees the old active template, if there was one.
   1647  */
   1648 static void
   1649 ctmpl_makecurrent(ct_template_t *template, ct_template_t *new)
   1650 {
   1651 	klwp_t *curlwp = ttolwp(curthread);
   1652 	proc_t *p = curproc;
   1653 	ct_template_t *old;
   1654 
   1655 	mutex_enter(&p->p_lock);
   1656 	old = curlwp->lwp_ct_active[template->ctmpl_type->ct_type_index];
   1657 	curlwp->lwp_ct_active[template->ctmpl_type->ct_type_index] = new;
   1658 	mutex_exit(&p->p_lock);
   1659 
   1660 	if (old)
   1661 		ctmpl_free(old);
   1662 }
   1663 
   1664 /*
   1665  * ctmpl_activate
   1666  *
   1667  * Copy the specified template as the current thread's activate
   1668  * template of that type.
   1669  */
   1670 void
   1671 ctmpl_activate(ct_template_t *template)
   1672 {
   1673 	ctmpl_makecurrent(template, ctmpl_dup(template));
   1674 }
   1675 
   1676 /*
   1677  * ctmpl_clear
   1678  *
   1679  * Clears the current thread's activate template of the same type as
   1680  * the specified template.
   1681  */
   1682 void
   1683 ctmpl_clear(ct_template_t *template)
   1684 {
   1685 	ctmpl_makecurrent(template, NULL);
   1686 }
   1687 
   1688 /*
   1689  * ctmpl_create
   1690  *
   1691  * Creates a new contract using the specified template.
   1692  */
   1693 int
   1694 ctmpl_create(ct_template_t *template, ctid_t *ctidp)
   1695 {
   1696 	return (template->ctmpl_ops->ctop_create(template, ctidp));
   1697 }
   1698 
   1699 /*
   1700  * ctmpl_init
   1701  *
   1702  * Initializes the common portion of a new contract template.
   1703  */
   1704 void
   1705 ctmpl_init(ct_template_t *new, ctmplops_t *ops, ct_type_t *type, void *data)
   1706 {
   1707 	mutex_init(&new->ctmpl_lock, NULL, MUTEX_DEFAULT, NULL);
   1708 	new->ctmpl_ops = ops;
   1709 	new->ctmpl_type = type;
   1710 	new->ctmpl_data = data;
   1711 	new->ctmpl_ev_info = new->ctmpl_ev_crit = 0;
   1712 	new->ctmpl_cookie = 0;
   1713 }
   1714 
   1715 /*
   1716  * ctmpl_copy
   1717  *
   1718  * Copies the common portions of a contract template.  Intended for use
   1719  * by a contract type's ctop_dup template op.  Returns with the old
   1720  * template's lock held, which will should remain held until the
   1721  * template op returns (it is dropped by ctmpl_dup).
   1722  */
   1723 void
   1724 ctmpl_copy(ct_template_t *new, ct_template_t *old)
   1725 {
   1726 	mutex_init(&new->ctmpl_lock, NULL, MUTEX_DEFAULT, NULL);
   1727 	mutex_enter(&old->ctmpl_lock);
   1728 	new->ctmpl_ops = old->ctmpl_ops;
   1729 	new->ctmpl_type = old->ctmpl_type;
   1730 	new->ctmpl_ev_crit = old->ctmpl_ev_crit;
   1731 	new->ctmpl_ev_info = old->ctmpl_ev_info;
   1732 	new->ctmpl_cookie = old->ctmpl_cookie;
   1733 }
   1734 
   1735 /*
   1736  * ctmpl_create_inval
   1737  *
   1738  * Returns EINVAL.  Provided for the convenience of those contract
   1739  * types which don't support ct_tmpl_create(3contract) and would
   1740  * otherwise need to create their own stub for the ctop_create template
   1741  * op.
   1742  */
   1743 /*ARGSUSED*/
   1744 int
   1745 ctmpl_create_inval(ct_template_t *template, ctid_t *ctidp)
   1746 {
   1747 	return (EINVAL);
   1748 }
   1749 
   1750 
   1751 /*
   1752  * cte_queue_create
   1753  *
   1754  * Initializes a queue of a particular type.  If dynamic is set, the
   1755  * queue is to be freed when its last listener is removed after being
   1756  * drained.
   1757  */
   1758 static void
   1759 cte_queue_create(ct_equeue_t *q, ct_listnum_t list, int maxinf, int dynamic)
   1760 {
   1761 	mutex_init(&q->ctq_lock, NULL, MUTEX_DEFAULT, NULL);
   1762 	q->ctq_listno = list;
   1763 	list_create(&q->ctq_events, sizeof (ct_kevent_t),
   1764 	    offsetof(ct_kevent_t, cte_nodes[list].ctm_node));
   1765 	list_create(&q->ctq_listeners, sizeof (ct_listener_t),
   1766 	    offsetof(ct_listener_t, ctl_allnode));
   1767 	list_create(&q->ctq_tail, sizeof (ct_listener_t),
   1768 	    offsetof(ct_listener_t, ctl_tailnode));
   1769 	gethrestime(&q->ctq_atime);
   1770 	q->ctq_nlisteners = 0;
   1771 	q->ctq_nreliable = 0;
   1772 	q->ctq_ninf = 0;
   1773 	q->ctq_max = maxinf;
   1774 
   1775 	/*
   1776 	 * Bundle queues and contract queues are embedded in other
   1777 	 * structures and are implicitly referenced counted by virtue
   1778 	 * of their vnodes' indirect hold on their contracts.  Process
   1779 	 * bundle queues are dynamically allocated and may persist
   1780 	 * after the death of the process, so they must be explicitly
   1781 	 * reference counted.
   1782 	 */
   1783 	q->ctq_flags = dynamic ? CTQ_REFFED : 0;
   1784 }
   1785 
   1786 /*
   1787  * cte_queue_destroy
   1788  *
   1789  * Destroys the specified queue.  The queue is freed if referenced
   1790  * counted.
   1791  */
   1792 static void
   1793 cte_queue_destroy(ct_equeue_t *q)
   1794 {
   1795 	ASSERT(q->ctq_flags & CTQ_DEAD);
   1796 	ASSERT(q->ctq_nlisteners == 0);
   1797 	ASSERT(q->ctq_nreliable == 0);
   1798 	list_destroy(&q->ctq_events);
   1799 	list_destroy(&q->ctq_listeners);
   1800 	list_destroy(&q->ctq_tail);
   1801 	mutex_destroy(&q->ctq_lock);
   1802 	if (q->ctq_flags & CTQ_REFFED)
   1803 		kmem_free(q, sizeof (ct_equeue_t));
   1804 }
   1805 
   1806 /*
   1807  * cte_hold
   1808  *
   1809  * Takes a hold on the specified event.
   1810  */
   1811 static void
   1812 cte_hold(ct_kevent_t *e)
   1813 {
   1814 	mutex_enter(&e->cte_lock);
   1815 	ASSERT(e->cte_refs > 0);
   1816 	e->cte_refs++;
   1817 	mutex_exit(&e->cte_lock);
   1818 }
   1819 
   1820 /*
   1821  * cte_rele
   1822  *
   1823  * Releases a hold on the specified event.  If the caller had the last
   1824  * reference, frees the event and releases its hold on the contract
   1825  * that generated it.
   1826  */
   1827 static void
   1828 cte_rele(ct_kevent_t *e)
   1829 {
   1830 	mutex_enter(&e->cte_lock);
   1831 	ASSERT(e->cte_refs > 0);
   1832 	if (--e->cte_refs) {
   1833 		mutex_exit(&e->cte_lock);
   1834 		return;
   1835 	}
   1836 
   1837 	contract_rele(e->cte_contract);
   1838 
   1839 	mutex_destroy(&e->cte_lock);
   1840 	if (e->cte_data)
   1841 		nvlist_free(e->cte_data);
   1842 	if (e->cte_gdata)
   1843 		nvlist_free(e->cte_gdata);
   1844 	kmem_free(e, sizeof (ct_kevent_t));
   1845 }
   1846 
   1847 /*
   1848  * cte_qrele
   1849  *
   1850  * Remove this listener's hold on the specified event, removing and
   1851  * releasing the queue's hold on the event if appropriate.
   1852  */
   1853 static void
   1854 cte_qrele(ct_equeue_t *q, ct_listener_t *l, ct_kevent_t *e)
   1855 {
   1856 	ct_member_t *member = &e->cte_nodes[q->ctq_listno];
   1857 
   1858 	ASSERT(MUTEX_HELD(&q->ctq_lock));
   1859 
   1860 	if (l->ctl_flags & CTLF_RELIABLE)
   1861 		member->ctm_nreliable--;
   1862 	if ((--member->ctm_refs == 0) && member->ctm_trimmed) {
   1863 		member->ctm_trimmed = 0;
   1864 		list_remove(&q->ctq_events, e);
   1865 		cte_rele(e);
   1866 	}
   1867 }
   1868 
   1869 /*
   1870  * cte_qmove
   1871  *
   1872  * Move this listener to the specified event in the queue.
   1873  */
   1874 static ct_kevent_t *
   1875 cte_qmove(ct_equeue_t *q, ct_listener_t *l, ct_kevent_t *e)
   1876 {
   1877 	ct_kevent_t *olde;
   1878 
   1879 	ASSERT(MUTEX_HELD(&q->ctq_lock));
   1880 	ASSERT(l->ctl_equeue == q);
   1881 
   1882 	if ((olde = l->ctl_position) == NULL)
   1883 		list_remove(&q->ctq_tail, l);
   1884 
   1885 	while (e != NULL && e->cte_nodes[q->ctq_listno].ctm_trimmed)
   1886 		e = list_next(&q->ctq_events, e);
   1887 
   1888 	if (e != NULL) {
   1889 		e->cte_nodes[q->ctq_listno].ctm_refs++;
   1890 		if (l->ctl_flags & CTLF_RELIABLE)
   1891 			e->cte_nodes[q->ctq_listno].ctm_nreliable++;
   1892 	} else {
   1893 		list_insert_tail(&q->ctq_tail, l);
   1894 	}
   1895 
   1896 	l->ctl_position = e;
   1897 	if (olde)
   1898 		cte_qrele(q, l, olde);
   1899 
   1900 	return (e);
   1901 }
   1902 
   1903 /*
   1904  * cte_checkcred
   1905  *
   1906  * Determines if the specified event's contract is owned by a process
   1907  * with the same effective uid as the specified credential.  Called
   1908  * after a failed call to contract_owned with locked set.  Because it
   1909  * drops the queue lock, its caller (cte_qreadable) needs to make sure
   1910  * we're still in the same place after we return.  Returns 1 on
   1911  * success.
   1912  */
   1913 static int
   1914 cte_checkcred(ct_equeue_t *q, ct_kevent_t *e, const cred_t *cr)
   1915 {
   1916 	int result;
   1917 	contract_t *ct = e->cte_contract;
   1918 
   1919 	cte_hold(e);
   1920 	mutex_exit(&q->ctq_lock);
   1921 	result = curproc->p_zone->zone_uniqid == ct->ct_czuniqid &&
   1922 	    contract_checkcred(ct, cr);
   1923 	mutex_enter(&q->ctq_lock);
   1924 	cte_rele(e);
   1925 
   1926 	return (result);
   1927 }
   1928 
   1929 /*
   1930  * cte_qreadable
   1931  *
   1932  * Ensures that the listener is pointing to a valid event that the
   1933  * caller has the credentials to read.  Returns 0 if we can read the
   1934  * event we're pointing to.
   1935  */
   1936 static int
   1937 cte_qreadable(ct_equeue_t *q, ct_listener_t *l, const cred_t *cr,
   1938     uint64_t zuniqid, int crit)
   1939 {
   1940 	ct_kevent_t *e, *next;
   1941 	contract_t *ct;
   1942 
   1943 	ASSERT(MUTEX_HELD(&q->ctq_lock));
   1944 	ASSERT(l->ctl_equeue == q);
   1945 
   1946 	if (l->ctl_flags & CTLF_COPYOUT)
   1947 		return (1);
   1948 
   1949 	next = l->ctl_position;
   1950 	while (e = cte_qmove(q, l, next)) {
   1951 		ct = e->cte_contract;
   1952 		/*
   1953 		 * Check obvious things first.  If we are looking for a
   1954 		 * critical message, is this one?  If we aren't in the
   1955 		 * global zone, is this message meant for us?
   1956 		 */
   1957 		if ((crit && (e->cte_flags & (CTE_INFO | CTE_ACK))) ||
   1958 		    (cr != NULL && zuniqid != GLOBAL_ZONEUNIQID &&
   1959 		    zuniqid != contract_getzuniqid(ct))) {
   1960 
   1961 			next = list_next(&q->ctq_events, e);
   1962 
   1963 		/*
   1964 		 * Next, see if our effective uid equals that of owner
   1965 		 * or author of the contract.  Since we are holding the
   1966 		 * queue lock, contract_owned can't always check if we
   1967 		 * have the same effective uid as the contract's
   1968 		 * owner.  If it comes to that, it fails and we take
   1969 		 * the slow(er) path.
   1970 		 */
   1971 		} else if (cr != NULL && !contract_owned(ct, cr, B_TRUE)) {
   1972 
   1973 			/*
   1974 			 * At this point we either don't have any claim
   1975 			 * to this contract or we match the effective
   1976 			 * uid of the owner but couldn't tell.  We
   1977 			 * first test for a NULL holder so that events
   1978 			 * from orphans and inherited contracts avoid
   1979 			 * the penalty phase.
   1980 			 */
   1981 			if (e->cte_contract->ct_owner == NULL &&
   1982 			    !secpolicy_contract_observer_choice(cr))
   1983 				next = list_next(&q->ctq_events, e);
   1984 
   1985 			/*
   1986 			 * cte_checkcred will juggle locks to see if we
   1987 			 * have the same uid as the event's contract's
   1988 			 * current owner.  If it succeeds, we have to
   1989 			 * make sure we are in the same point in the
   1990 			 * queue.
   1991 			 */
   1992 			else if (cte_checkcred(q, e, cr) &&
   1993 			    l->ctl_position == e)
   1994 				break;
   1995 
   1996 			/*
   1997 			 * cte_checkcred failed; see if we're in the
   1998 			 * same place.
   1999 			 */
   2000 			else if (l->ctl_position == e)
   2001 				if (secpolicy_contract_observer_choice(cr))
   2002 					break;
   2003 				else
   2004 					next = list_next(&q->ctq_events, e);
   2005 
   2006 			/*
   2007 			 * cte_checkcred failed, and our position was
   2008 			 * changed.  Start from there.
   2009 			 */
   2010 			else
   2011 				next = l->ctl_position;
   2012 		} else {
   2013 			break;
   2014 		}
   2015 	}
   2016 
   2017 	/*
   2018 	 * We check for CTLF_COPYOUT again in case we dropped the queue
   2019 	 * lock in cte_checkcred.
   2020 	 */
   2021 	return ((l->ctl_flags & CTLF_COPYOUT) || (l->ctl_position == NULL));
   2022 }
   2023 
   2024 /*
   2025  * cte_qwakeup
   2026  *
   2027  * Wakes up any waiting listeners and points them at the specified event.
   2028  */
   2029 static void
   2030 cte_qwakeup(ct_equeue_t *q, ct_kevent_t *e)
   2031 {
   2032 	ct_listener_t *l;
   2033 
   2034 	ASSERT(MUTEX_HELD(&q->ctq_lock));
   2035 
   2036 	while (l = list_head(&q->ctq_tail)) {
   2037 		list_remove(&q->ctq_tail, l);
   2038 		e->cte_nodes[q->ctq_listno].ctm_refs++;
   2039 		if (l->ctl_flags & CTLF_RELIABLE)
   2040 			e->cte_nodes[q->ctq_listno].ctm_nreliable++;
   2041 		l->ctl_position = e;
   2042 		cv_signal(&l->ctl_cv);
   2043 		pollwakeup(&l->ctl_pollhead, POLLIN);
   2044 	}
   2045 }
   2046 
   2047 /*
   2048  * cte_copy
   2049  *
   2050  * Copies events from the specified contract event queue to the
   2051  * end of the specified process bundle queue.  Only called from
   2052  * contract_adopt.
   2053  *
   2054  * We copy to the end of the target queue instead of mixing the events
   2055  * in their proper order because otherwise the act of adopting a
   2056  * contract would require a process to reset all process bundle
   2057  * listeners it needed to see the new events.  This would, in turn,
   2058  * require the process to keep track of which preexisting events had
   2059  * already been processed.
   2060  */
   2061 static void
   2062 cte_copy(ct_equeue_t *q, ct_equeue_t *newq)
   2063 {
   2064 	ct_kevent_t *e, *first = NULL;
   2065 
   2066 	ASSERT(q->ctq_listno == CTEL_CONTRACT);
   2067 	ASSERT(newq->ctq_listno == CTEL_PBUNDLE);
   2068 
   2069 	mutex_enter(&q->ctq_lock);
   2070 	mutex_enter(&newq->ctq_lock);
   2071 
   2072 	/*
   2073 	 * For now, only copy critical events.
   2074 	 */
   2075 	for (e = list_head(&q->ctq_events); e != NULL;
   2076 	    e = list_next(&q->ctq_events, e)) {
   2077 		if ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
   2078 			if (first == NULL)
   2079 				first = e;
   2080 			list_insert_tail(&newq->ctq_events, e);
   2081 			cte_hold(e);
   2082 		}
   2083 	}
   2084 
   2085 	mutex_exit(&q->ctq_lock);
   2086 
   2087 	if (first)
   2088 		cte_qwakeup(newq, first);
   2089 
   2090 	mutex_exit(&newq->ctq_lock);
   2091 }
   2092 
   2093 /*
   2094  * cte_trim
   2095  *
   2096  * Trims unneeded events from an event queue.  Algorithm works as
   2097  * follows:
   2098  *
   2099  *   Removes all informative and acknowledged critical events until the
   2100  *   first referenced event is found.
   2101  *
   2102  *   If a contract is specified, removes all events (regardless of
   2103  *   acknowledgement) generated by that contract until the first event
   2104  *   referenced by a reliable listener is found.  Reference events are
   2105  *   removed by marking them "trimmed".  Such events will be removed
   2106  *   when the last reference is dropped and will be skipped by future
   2107  *   listeners.
   2108  *
   2109  * This is pretty basic.  Ideally this should remove from the middle of
   2110  * the list (i.e. beyond the first referenced event), and even
   2111  * referenced events.
   2112  */
   2113 static void
   2114 cte_trim(ct_equeue_t *q, contract_t *ct)
   2115 {
   2116 	ct_kevent_t *e, *next;
   2117 	int flags, stopper;
   2118 	int start = 1;
   2119 
   2120 	ASSERT(MUTEX_HELD(&q->ctq_lock));
   2121 
   2122 	for (e = list_head(&q->ctq_events); e != NULL; e = next) {
   2123 		next = list_next(&q->ctq_events, e);
   2124 		flags = e->cte_flags;
   2125 		stopper = (q->ctq_listno != CTEL_PBUNDLE) &&
   2126 		    (e->cte_nodes[q->ctq_listno].ctm_nreliable > 0);
   2127 		if (e->cte_nodes[q->ctq_listno].ctm_refs == 0) {
   2128 			if ((start && (flags & (CTE_INFO | CTE_ACK))) ||
   2129 			    (e->cte_contract == ct)) {
   2130 				/*
   2131 				 * Toss informative and ACKed critical messages.
   2132 				 */
   2133 				list_remove(&q->ctq_events, e);
   2134 				cte_rele(e);
   2135 			}
   2136 		} else if ((e->cte_contract == ct) && !stopper) {
   2137 			ASSERT(q->ctq_nlisteners != 0);
   2138 			e->cte_nodes[q->ctq_listno].ctm_trimmed = 1;
   2139 		} else if (ct && !stopper) {
   2140 			start = 0;
   2141 		} else {
   2142 			/*
   2143 			 * Don't free messages past the first reader.
   2144 			 */
   2145 			break;
   2146 		}
   2147 	}
   2148 }
   2149 
   2150 /*
   2151  * cte_queue_drain
   2152  *
   2153  * Drain all events from the specified queue, and mark it dead.  If
   2154  * "ack" is set, acknowledge any critical events we find along the
   2155  * way.
   2156  */
   2157 static void
   2158 cte_queue_drain(ct_equeue_t *q, int ack)
   2159 {
   2160 	ct_kevent_t *e, *next;
   2161 	ct_listener_t *l;
   2162 
   2163 	mutex_enter(&q->ctq_lock);
   2164 
   2165 	for (e = list_head(&q->ctq_events); e != NULL; e = next) {
   2166 		next = list_next(&q->ctq_events, e);
   2167 		if (ack && ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0)) {
   2168 			/*
   2169 			 * Make sure critical messages are eventually
   2170 			 * removed from the bundle queues.
   2171 			 */
   2172 			mutex_enter(&e->cte_lock);
   2173 			e->cte_flags |= CTE_ACK;
   2174 			mutex_exit(&e->cte_lock);
   2175 			ASSERT(MUTEX_HELD(&e->cte_contract->ct_lock));
   2176 			e->cte_contract->ct_evcnt--;
   2177 		}
   2178 		list_remove(&q->ctq_events, e);
   2179 		e->cte_nodes[q->ctq_listno].ctm_refs = 0;
   2180 		e->cte_nodes[q->ctq_listno].ctm_nreliable = 0;
   2181 		e->cte_nodes[q->ctq_listno].ctm_trimmed = 0;
   2182 		cte_rele(e);
   2183 	}
   2184 
   2185 	/*
   2186 	 * This is necessary only because of CTEL_PBUNDLE listeners;
   2187 	 * the events they point to can move from one pbundle to
   2188 	 * another.  Fortunately, this only happens if the contract is
   2189 	 * inherited, which (in turn) only happens if the process
   2190 	 * exits, which means it's an all-or-nothing deal.  If this
   2191 	 * wasn't the case, we would instead need to keep track of
   2192 	 * listeners on a per-event basis, not just a per-queue basis.
   2193 	 * This would have the side benefit of letting us clean up
   2194 	 * trimmed events sooner (i.e. immediately), but would
   2195 	 * unfortunately make events even bigger than they already
   2196 	 * are.
   2197 	 */
   2198 	for (l = list_head(&q->ctq_listeners); l;
   2199 	    l = list_next(&q->ctq_listeners, l)) {
   2200 		l->ctl_flags |= CTLF_DEAD;
   2201 		if (l->ctl_position) {
   2202 			l->ctl_position = NULL;
   2203 			list_insert_tail(&q->ctq_tail, l);
   2204 		}
   2205 		cv_broadcast(&l->ctl_cv);
   2206 	}
   2207 
   2208 	/*
   2209 	 * Disallow events.
   2210 	 */
   2211 	q->ctq_flags |= CTQ_DEAD;
   2212 
   2213 	/*
   2214 	 * If we represent the last reference to a reference counted
   2215 	 * process bundle queue, free it.
   2216 	 */
   2217 	if ((q->ctq_flags & CTQ_REFFED) && (q->ctq_nlisteners == 0))
   2218 		cte_queue_destroy(q);
   2219 	else
   2220 		mutex_exit(&q->ctq_lock);
   2221 }
   2222 
   2223 /*
   2224  * cte_publish
   2225  *
   2226  * Publishes an event to a specific queue.  Only called by
   2227  * cte_publish_all.
   2228  */
   2229 static void
   2230 cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp)
   2231 {
   2232 	ASSERT(MUTEX_HELD(&q->ctq_lock));
   2233 
   2234 	q->ctq_atime = *tsp;
   2235 
   2236 	/*
   2237 	 * Don't publish if the event is informative and there aren't
   2238 	 * any listeners, or if the queue has been shut down.
   2239 	 */
   2240 	if (((q->ctq_nlisteners == 0) && (e->cte_flags & (CTE_INFO|CTE_ACK))) ||
   2241 	    (q->ctq_flags & CTQ_DEAD)) {
   2242 		mutex_exit(&q->ctq_lock);
   2243 		cte_rele(e);
   2244 		return;
   2245 	}
   2246 
   2247 	/*
   2248 	 * Enqueue event
   2249 	 */
   2250 	list_insert_tail(&q->ctq_events, e);
   2251 
   2252 	/*
   2253 	 * Check for waiting listeners
   2254 	 */
   2255 	cte_qwakeup(q, e);
   2256 
   2257 	/*
   2258 	 * Trim unnecessary events from the queue.
   2259 	 */
   2260 	cte_trim(q, NULL);
   2261 	mutex_exit(&q->ctq_lock);
   2262 }
   2263 
   2264 /*
   2265  * cte_publish_all
   2266  *
   2267  * Publish an event to all necessary event queues.  The event, e, must
   2268  * be zallocated by the caller, and the event's flags and type must be
   2269  * set.  The rest of the event's fields are initialized here.
   2270  */
   2271 uint64_t
   2272 cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata)
   2273 {
   2274 	ct_equeue_t *q;
   2275 	timespec_t ts;
   2276 	uint64_t evid;
   2277 	ct_kevent_t *negev;
   2278 	int negend;
   2279 
   2280 	e->cte_contract = ct;
   2281 	e->cte_data = data;
   2282 	e->cte_gdata = gdata;
   2283 	e->cte_refs = 3;
   2284 	evid = e->cte_id = atomic_add_64_nv(&ct->ct_type->ct_type_evid, 1);
   2285 	contract_hold(ct);
   2286 
   2287 	/*
   2288 	 * For a negotiation event we set the ct->ct_nevent field of the
   2289 	 * contract for the duration of the negotiation
   2290 	 */
   2291 	negend = 0;
   2292 	if (e->cte_flags & CTE_NEG) {
   2293 		cte_hold(e);
   2294 		ct->ct_nevent = e;
   2295 	} else if (e->cte_type == CT_EV_NEGEND) {
   2296 		negend = 1;
   2297 	}
   2298 
   2299 	gethrestime(&ts);
   2300 
   2301 	/*
   2302 	 * ct_evtlock simply (and only) ensures that two events sent
   2303 	 * from the same contract are delivered to all queues in the
   2304 	 * same order.
   2305 	 */
   2306 	mutex_enter(&ct->ct_evtlock);
   2307 
   2308 	/*
   2309 	 * CTEL_CONTRACT - First deliver to the contract queue, acking
   2310 	 * the event if the contract has been orphaned.
   2311 	 */
   2312 	mutex_enter(&ct->ct_lock);
   2313 	mutex_enter(&ct->ct_events.ctq_lock);
   2314 	if ((e->cte_flags & CTE_INFO) == 0) {
   2315 		if (ct->ct_state >= CTS_ORPHAN)
   2316 			e->cte_flags |= CTE_ACK;
   2317 		else
   2318 			ct->ct_evcnt++;
   2319 	}
   2320 	mutex_exit(&ct->ct_lock);
   2321 	cte_publish(&ct->ct_events, e, &ts);
   2322 
   2323 	/*
   2324 	 * CTEL_BUNDLE - Next deliver to the contract type's bundle
   2325 	 * queue.
   2326 	 */
   2327 	mutex_enter(&ct->ct_type->ct_type_events.ctq_lock);
   2328 	cte_publish(&ct->ct_type->ct_type_events, e, &ts);
   2329 
   2330 	/*
   2331 	 * CTEL_PBUNDLE - Finally, if the contract has an owner,
   2332 	 * deliver to the owner's process bundle queue.
   2333 	 */
   2334 	mutex_enter(&ct->ct_lock);
   2335 	if (ct->ct_owner) {
   2336 		/*
   2337 		 * proc_exit doesn't free event queues until it has
   2338 		 * abandoned all contracts.
   2339 		 */
   2340 		ASSERT(ct->ct_owner->p_ct_equeue);
   2341 		ASSERT(ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]);
   2342 		q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index];
   2343 		mutex_enter(&q->ctq_lock);
   2344 		mutex_exit(&ct->ct_lock);
   2345 		cte_publish(q, e, &ts);
   2346 	} else {
   2347 		mutex_exit(&ct->ct_lock);
   2348 		cte_rele(e);
   2349 	}
   2350 
   2351 	if (negend) {
   2352 		mutex_enter(&ct->ct_lock);
   2353 		negev = ct->ct_nevent;
   2354 		ct->ct_nevent = NULL;
   2355 		cte_rele(negev);
   2356 		mutex_exit(&ct->ct_lock);
   2357 	}
   2358 
   2359 	mutex_exit(&ct->ct_evtlock);
   2360 
   2361 	return (evid);
   2362 }
   2363 
   2364 /*
   2365  * cte_add_listener
   2366  *
   2367  * Add a new listener to an event queue.
   2368  */
   2369 void
   2370 cte_add_listener(ct_equeue_t *q, ct_listener_t *l)
   2371 {
   2372 	cv_init(&l->ctl_cv, NULL, CV_DEFAULT, NULL);
   2373 	l->ctl_equeue = q;
   2374 	l->ctl_position = NULL;
   2375 	l->ctl_flags = 0;
   2376 
   2377 	mutex_enter(&q->ctq_lock);
   2378 	list_insert_head(&q->ctq_tail, l);
   2379 	list_insert_head(&q->ctq_listeners, l);
   2380 	q->ctq_nlisteners++;
   2381 	mutex_exit(&q->ctq_lock);
   2382 }
   2383 
   2384 /*
   2385  * cte_remove_listener
   2386  *
   2387  * Remove a listener from an event queue.  No other queue activities
   2388  * (e.g. cte_get event) may be in progress at this endpoint when this
   2389  * is called.
   2390  */
   2391 void
   2392 cte_remove_listener(ct_listener_t *l)
   2393 {
   2394 	ct_equeue_t *q = l->ctl_equeue;
   2395 	ct_kevent_t *e;
   2396 
   2397 	mutex_enter(&q->ctq_lock);
   2398 
   2399 	ASSERT((l->ctl_flags & (CTLF_COPYOUT|CTLF_RESET)) == 0);
   2400 
   2401 	if ((e = l->ctl_position) != NULL)
   2402 		cte_qrele(q, l, e);
   2403 	else
   2404 		list_remove(&q->ctq_tail, l);
   2405 	l->ctl_position = NULL;
   2406 
   2407 	q->ctq_nlisteners--;
   2408 	list_remove(&q->ctq_listeners, l);
   2409 
   2410 	if (l->ctl_flags & CTLF_RELIABLE)
   2411 		q->ctq_nreliable--;
   2412 
   2413 	/*
   2414 	 * If we are a the last listener of a dead reference counted
   2415 	 * queue (i.e. a process bundle) we free it.  Otherwise we just
   2416 	 * trim any events which may have been kept around for our
   2417 	 * benefit.
   2418 	 */
   2419 	if ((q->ctq_flags & CTQ_REFFED) && (q->ctq_flags & CTQ_DEAD) &&
   2420 	    (q->ctq_nlisteners == 0)) {
   2421 		cte_queue_destroy(q);
   2422 	} else {
   2423 		cte_trim(q, NULL);
   2424 		mutex_exit(&q->ctq_lock);
   2425 	}
   2426 }
   2427 
   2428 /*
   2429  * cte_reset_listener
   2430  *
   2431  * Moves a listener's queue pointer to the beginning of the queue.
   2432  */
   2433 void
   2434 cte_reset_listener(ct_listener_t *l)
   2435 {
   2436 	ct_equeue_t *q = l->ctl_equeue;
   2437 
   2438 	mutex_enter(&q->ctq_lock);
   2439 
   2440 	/*
   2441 	 * We allow an asynchronous reset because it doesn't make a
   2442 	 * whole lot of sense to make reset block or fail.  We already
   2443 	 * have most of the mechanism needed thanks to queue trimming,
   2444 	 * so implementing it isn't a big deal.
   2445 	 */
   2446 	if (l->ctl_flags & CTLF_COPYOUT)
   2447 		l->ctl_flags |= CTLF_RESET;
   2448 
   2449 	(void) cte_qmove(q, l, list_head(&q->ctq_events));
   2450 
   2451 	/*
   2452 	 * Inform blocked readers.
   2453 	 */
   2454 	cv_broadcast(&l->ctl_cv);
   2455 	pollwakeup(&l->ctl_pollhead, POLLIN);
   2456 	mutex_exit(&q->ctq_lock);
   2457 }
   2458 
   2459 /*
   2460  * cte_next_event
   2461  *
   2462  * Moves the event pointer for the specified listener to the next event
   2463  * on the queue.  To avoid races, this movement only occurs if the
   2464  * specified event id matches that of the current event.  This is used
   2465  * primarily to skip events that have been read but whose extended data
   2466  * haven't been copied out.
   2467  */
   2468 int
   2469 cte_next_event(ct_listener_t *l, uint64_t id)
   2470 {
   2471 	ct_equeue_t *q = l->ctl_equeue;
   2472 	ct_kevent_t *old;
   2473 
   2474 	mutex_enter(&q->ctq_lock);
   2475 
   2476 	if (l->ctl_flags & CTLF_COPYOUT)
   2477 		l->ctl_flags |= CTLF_RESET;
   2478 
   2479 	if (((old = l->ctl_position) != NULL) && (old->cte_id == id))
   2480 		(void) cte_qmove(q, l, list_next(&q->ctq_events, old));
   2481 
   2482 	mutex_exit(&q->ctq_lock);
   2483 
   2484 	return (0);
   2485 }
   2486 
   2487 /*
   2488  * cte_get_event
   2489  *
   2490  * Reads an event from an event endpoint.  If "nonblock" is clear, we
   2491  * block until a suitable event is ready.  If "crit" is set, we only
   2492  * read critical events.  Note that while "cr" is the caller's cred,
   2493  * "zuniqid" is the unique id of the zone the calling contract
   2494  * filesystem was mounted in.
   2495  */
   2496 int
   2497 cte_get_event(ct_listener_t *l, int nonblock, void *uaddr, const cred_t *cr,
   2498     uint64_t zuniqid, int crit)
   2499 {
   2500 	ct_equeue_t *q = l->ctl_equeue;
   2501 	ct_kevent_t *temp;
   2502 	int result = 0;
   2503 	int partial = 0;
   2504 	size_t size, gsize, len;
   2505 	model_t mdl = get_udatamodel();
   2506 	STRUCT_DECL(ct_event, ev);
   2507 	STRUCT_INIT(ev, mdl);
   2508 
   2509 	/*
   2510 	 * cte_qreadable checks for CTLF_COPYOUT as well as ensures
   2511 	 * that there exists, and we are pointing to, an appropriate
   2512 	 * event.  It may temporarily drop ctq_lock, but that doesn't
   2513 	 * really matter to us.
   2514 	 */
   2515 	mutex_enter(&q->ctq_lock);
   2516 	while (cte_qreadable(q, l, cr, zuniqid, crit)) {
   2517 		if (nonblock) {
   2518 			result = EAGAIN;
   2519 			goto error;
   2520 		}
   2521 		if (q->ctq_flags & CTQ_DEAD) {
   2522 			result = EIDRM;
   2523 			goto error;
   2524 		}
   2525 		result = cv_wait_sig(&l->ctl_cv, &q->ctq_lock);
   2526 		if (result == 0) {
   2527 			result = EINTR;
   2528 			goto error;
   2529 		}
   2530 	}
   2531 	temp = l->ctl_position;
   2532 	cte_hold(temp);
   2533 	l->ctl_flags |= CTLF_COPYOUT;
   2534 	mutex_exit(&q->ctq_lock);
   2535 
   2536 	/*
   2537 	 * We now have an event.  Copy in the user event structure to
   2538 	 * see how much space we have to work with.
   2539 	 */
   2540 	result = copyin(uaddr, STRUCT_BUF(ev), STRUCT_SIZE(ev));
   2541 	if (result)
   2542 		goto copyerr;
   2543 
   2544 	/*
   2545 	 * Determine what data we have and what the user should be
   2546 	 * allowed to see.
   2547 	 */
   2548 	size = gsize = 0;
   2549 	if (temp->cte_data) {
   2550 		VERIFY(nvlist_size(temp->cte_data, &size,
   2551 		    NV_ENCODE_NATIVE) == 0);
   2552 		ASSERT(size != 0);
   2553 	}
   2554 	if (zuniqid == GLOBAL_ZONEUNIQID && temp->cte_gdata) {
   2555 		VERIFY(nvlist_size(temp->cte_gdata, &gsize,
   2556 		    NV_ENCODE_NATIVE) == 0);
   2557 		ASSERT(gsize != 0);
   2558 	}
   2559 
   2560 	/*
   2561 	 * If we have enough space, copy out the extended event data.
   2562 	 */
   2563 	len = size + gsize;
   2564 	if (len) {
   2565 		if (STRUCT_FGET(ev, ctev_nbytes) >= len) {
   2566 			char *buf = kmem_alloc(len, KM_SLEEP);
   2567 
   2568 			if (size)
   2569 				VERIFY(nvlist_pack(temp->cte_data, &buf, &size,
   2570 				    NV_ENCODE_NATIVE, KM_SLEEP) == 0);
   2571 			if (gsize) {
   2572 				char *tmp = buf + size;
   2573 
   2574 				VERIFY(nvlist_pack(temp->cte_gdata, &tmp,
   2575 				    &gsize, NV_ENCODE_NATIVE, KM_SLEEP) == 0);
   2576 			}
   2577 
   2578 			/* This shouldn't have changed */
   2579 			ASSERT(size + gsize == len);
   2580 			result = copyout(buf, STRUCT_FGETP(ev, ctev_buffer),
   2581 			    len);
   2582 			kmem_free(buf, len);
   2583 			if (result)
   2584 				goto copyerr;
   2585 		} else {
   2586 			partial = 1;
   2587 		}
   2588 	}
   2589 
   2590 	/*
   2591 	 * Copy out the common event data.
   2592 	 */
   2593 	STRUCT_FSET(ev, ctev_id, temp->cte_contract->ct_id);
   2594 	STRUCT_FSET(ev, ctev_evid, temp->cte_id);
   2595 	STRUCT_FSET(ev, ctev_cttype,
   2596 	    temp->cte_contract->ct_type->ct_type_index);
   2597 	STRUCT_FSET(ev, ctev_flags, temp->cte_flags &
   2598 	    (CTE_ACK|CTE_INFO|CTE_NEG));
   2599 	STRUCT_FSET(ev, ctev_type, temp->cte_type);
   2600 	STRUCT_FSET(ev, ctev_nbytes, len);
   2601 	STRUCT_FSET(ev, ctev_goffset, size);
   2602 	result = copyout(STRUCT_BUF(ev), uaddr, STRUCT_SIZE(ev));
   2603 
   2604 copyerr:
   2605 	/*
   2606 	 * Only move our location in the queue if all copyouts were
   2607 	 * successful, the caller provided enough space for the entire
   2608 	 * event, and our endpoint wasn't reset or otherwise moved by
   2609 	 * another thread.
   2610 	 */
   2611 	mutex_enter(&q->ctq_lock);
   2612 	if (result)
   2613 		result = EFAULT;
   2614 	else if (!partial && ((l->ctl_flags & CTLF_RESET) == 0) &&
   2615 	    (l->ctl_position == temp))
   2616 		(void) cte_qmove(q, l, list_next(&q->ctq_events, temp));
   2617 	l->ctl_flags &= ~(CTLF_COPYOUT|CTLF_RESET);
   2618 	/*
   2619 	 * Signal any readers blocked on our CTLF_COPYOUT.
   2620 	 */
   2621 	cv_signal(&l->ctl_cv);
   2622 	cte_rele(temp);
   2623 
   2624 error:
   2625 	mutex_exit(&q->ctq_lock);
   2626 	return (result);
   2627 }
   2628 
   2629 /*
   2630  * cte_set_reliable
   2631  *
   2632  * Requests that events be reliably delivered to an event endpoint.
   2633  * Unread informative and acknowledged critical events will not be
   2634  * removed from the queue until this listener reads or skips them.
   2635  * Because a listener could maliciously request reliable delivery and
   2636  * then do nothing, this requires that PRIV_CONTRACT_EVENT be in the
   2637  * caller's effective set.
   2638  */
   2639 int
   2640 cte_set_reliable(ct_listener_t *l, const cred_t *cr)
   2641 {
   2642 	ct_equeue_t *q = l->ctl_equeue;
   2643 	int error;
   2644 
   2645 	if ((error = secpolicy_contract_event(cr)) != 0)
   2646 		return (error);
   2647 
   2648 	mutex_enter(&q->ctq_lock);
   2649 	if ((l->ctl_flags & CTLF_RELIABLE) == 0) {
   2650 		l->ctl_flags |= CTLF_RELIABLE;
   2651 		q->ctq_nreliable++;
   2652 		if (l->ctl_position != NULL)
   2653 			l->ctl_position->cte_nodes[q->ctq_listno].
   2654 			    ctm_nreliable++;
   2655 	}
   2656 	mutex_exit(&q->ctq_lock);
   2657 
   2658 	return (0);
   2659 }
   2660