Home | History | Annotate | Download | only in syscall
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     28 /*	  All Rights Reserved	*/
     29 
     30 #include <sys/param.h>
     31 #include <sys/types.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/systm.h>
     34 #include <sys/cred.h>
     35 #include <sys/user.h>
     36 #include <sys/errno.h>
     37 #include <sys/file.h>
     38 #include <sys/proc.h>
     39 #include <sys/prsystm.h>
     40 #include <sys/kmem.h>
     41 #include <sys/sobject.h>
     42 #include <sys/fault.h>
     43 #include <sys/procfs.h>
     44 #include <sys/watchpoint.h>
     45 #include <sys/time.h>
     46 #include <sys/cmn_err.h>
     47 #include <sys/machlock.h>
     48 #include <sys/debug.h>
     49 #include <sys/synch.h>
     50 #include <sys/synch32.h>
     51 #include <sys/mman.h>
     52 #include <sys/class.h>
     53 #include <sys/schedctl.h>
     54 #include <sys/sleepq.h>
     55 #include <sys/policy.h>
     56 #include <sys/tnf_probe.h>
     57 #include <sys/lwpchan_impl.h>
     58 #include <sys/turnstile.h>
     59 #include <sys/atomic.h>
     60 #include <sys/lwp_timer_impl.h>
     61 #include <sys/lwp_upimutex_impl.h>
     62 #include <vm/as.h>
     63 #include <sys/sdt.h>
     64 
     65 static kthread_t *lwpsobj_owner(caddr_t);
     66 static void lwp_unsleep(kthread_t *t);
     67 static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
     68 static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);
     69 static void lwp_mutex_unregister(void *uaddr);
     70 static void set_owner_pid(lwp_mutex_t *, uintptr_t, pid_t);
     71 static int iswanted(kthread_t *, lwpchan_t *);
     72 
     73 extern int lwp_cond_signal(lwp_cond_t *cv);
     74 
     75 /*
     76  * Maximum number of user prio inheritance locks that can be held by a thread.
     77  * Used to limit kmem for each thread. This is a per-thread limit that
     78  * can be administered on a system wide basis (using /etc/system).
     79  *
     80  * Also, when a limit, say maxlwps is added for numbers of lwps within a
     81  * process, the per-thread limit automatically becomes a process-wide limit
     82  * of maximum number of held upi locks within a process:
     83  *      maxheldupimx = maxnestupimx * maxlwps;
     84  */
     85 static uint32_t maxnestupimx = 2000;
     86 
     87 /*
     88  * The sobj_ops vector exports a set of functions needed when a thread
     89  * is asleep on a synchronization object of this type.
     90  */
     91 static sobj_ops_t lwp_sobj_ops = {
     92 	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
     93 };
     94 
     95 static kthread_t *lwpsobj_pi_owner(upimutex_t *up);
     96 
     97 static sobj_ops_t lwp_sobj_pi_ops = {
     98 	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
     99 	turnstile_change_pri
    100 };
    101 
    102 static sleepq_head_t	lwpsleepq[NSLEEPQ];
    103 upib_t			upimutextab[UPIMUTEX_TABSIZE];
    104 
    105 #define	LWPCHAN_LOCK_SHIFT	10	/* 1024 locks for each pool */
    106 #define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)
    107 
    108 /*
    109  * We know that both lc_wchan and lc_wchan0 are addresses that most
    110  * likely are 8-byte aligned, so we shift off the low-order 3 bits.
    111  * 'pool' is either 0 or 1.
    112  */
    113 #define	LWPCHAN_LOCK_HASH(X, pool) \
    114 	(((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \
    115 	(LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0))
    116 
    117 static kmutex_t		lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
    118 
    119 /*
    120  * Is this a POSIX threads user-level lock requiring priority inheritance?
    121  */
    122 #define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)
    123 
    124 static sleepq_head_t *
    125 lwpsqhash(lwpchan_t *lwpchan)
    126 {
    127 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
    128 	return (&lwpsleepq[SQHASHINDEX(x)]);
    129 }
    130 
    131 /*
    132  * Lock an lwpchan.
    133  * Keep this in sync with lwpchan_unlock(), below.
    134  */
    135 static void
    136 lwpchan_lock(lwpchan_t *lwpchan, int pool)
    137 {
    138 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
    139 	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
    140 }
    141 
    142 /*
    143  * Unlock an lwpchan.
    144  * Keep this in sync with lwpchan_lock(), above.
    145  */
    146 static void
    147 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
    148 {
    149 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
    150 	mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
    151 }
    152 
    153 /*
    154  * Delete mappings from the lwpchan cache for pages that are being
    155  * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
    156  * all mappings within the range are deleted from the lwpchan cache.
    157  */
    158 void
    159 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
    160 {
    161 	lwpchan_data_t *lcp;
    162 	lwpchan_hashbucket_t *hashbucket;
    163 	lwpchan_hashbucket_t *endbucket;
    164 	lwpchan_entry_t *ent;
    165 	lwpchan_entry_t **prev;
    166 	caddr_t addr;
    167 
    168 	mutex_enter(&p->p_lcp_lock);
    169 	lcp = p->p_lcp;
    170 	hashbucket = lcp->lwpchan_cache;
    171 	endbucket = hashbucket + lcp->lwpchan_size;
    172 	for (; hashbucket < endbucket; hashbucket++) {
    173 		if (hashbucket->lwpchan_chain == NULL)
    174 			continue;
    175 		mutex_enter(&hashbucket->lwpchan_lock);
    176 		prev = &hashbucket->lwpchan_chain;
    177 		/* check entire chain */
    178 		while ((ent = *prev) != NULL) {
    179 			addr = ent->lwpchan_addr;
    180 			if (start <= addr && addr < end) {
    181 				*prev = ent->lwpchan_next;
    182 				/*
    183 				 * We do this only for the obsolete type
    184 				 * USYNC_PROCESS_ROBUST.  Otherwise robust
    185 				 * locks do not draw ELOCKUNMAPPED or
    186 				 * EOWNERDEAD due to being unmapped.
    187 				 */
    188 				if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
    189 				    (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
    190 					lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
    191 				/*
    192 				 * If there is a user-level robust lock
    193 				 * registration, mark it as invalid.
    194 				 */
    195 				if ((addr = ent->lwpchan_uaddr) != NULL)
    196 					lwp_mutex_unregister(addr);
    197 				kmem_free(ent, sizeof (*ent));
    198 				atomic_add_32(&lcp->lwpchan_entries, -1);
    199 			} else {
    200 				prev = &ent->lwpchan_next;
    201 			}
    202 		}
    203 		mutex_exit(&hashbucket->lwpchan_lock);
    204 	}
    205 	mutex_exit(&p->p_lcp_lock);
    206 }
    207 
    208 /*
    209  * Given an lwpchan cache pointer and a process virtual address,
    210  * return a pointer to the corresponding lwpchan hash bucket.
    211  */
    212 static lwpchan_hashbucket_t *
    213 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
    214 {
    215 	uint_t i;
    216 
    217 	/*
    218 	 * All user-level sync object addresses are 8-byte aligned.
    219 	 * Ignore the lowest 3 bits of the address and use the
    220 	 * higher-order 2*lwpchan_bits bits for the hash index.
    221 	 */
    222 	addr >>= 3;
    223 	i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
    224 	return (lcp->lwpchan_cache + i);
    225 }
    226 
    227 /*
    228  * (Re)allocate the per-process lwpchan cache.
    229  */
    230 static void
    231 lwpchan_alloc_cache(proc_t *p, uint_t bits)
    232 {
    233 	lwpchan_data_t *lcp;
    234 	lwpchan_data_t *old_lcp;
    235 	lwpchan_hashbucket_t *hashbucket;
    236 	lwpchan_hashbucket_t *endbucket;
    237 	lwpchan_hashbucket_t *newbucket;
    238 	lwpchan_entry_t *ent;
    239 	lwpchan_entry_t *next;
    240 	uint_t count;
    241 
    242 	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
    243 
    244 	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
    245 	lcp->lwpchan_bits = bits;
    246 	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
    247 	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
    248 	lcp->lwpchan_entries = 0;
    249 	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
    250 	    sizeof (lwpchan_hashbucket_t), KM_SLEEP);
    251 	lcp->lwpchan_next_data = NULL;
    252 
    253 	mutex_enter(&p->p_lcp_lock);
    254 	if ((old_lcp = p->p_lcp) != NULL) {
    255 		if (old_lcp->lwpchan_bits >= bits) {
    256 			/* someone beat us to it */
    257 			mutex_exit(&p->p_lcp_lock);
    258 			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
    259 			    sizeof (lwpchan_hashbucket_t));
    260 			kmem_free(lcp, sizeof (lwpchan_data_t));
    261 			return;
    262 		}
    263 		/*
    264 		 * Acquire all of the old hash table locks.
    265 		 */
    266 		hashbucket = old_lcp->lwpchan_cache;
    267 		endbucket = hashbucket + old_lcp->lwpchan_size;
    268 		for (; hashbucket < endbucket; hashbucket++)
    269 			mutex_enter(&hashbucket->lwpchan_lock);
    270 		/*
    271 		 * Move all of the old hash table entries to the
    272 		 * new hash table.  The new hash table has not yet
    273 		 * been installed so we don't need any of its locks.
    274 		 */
    275 		count = 0;
    276 		hashbucket = old_lcp->lwpchan_cache;
    277 		for (; hashbucket < endbucket; hashbucket++) {
    278 			ent = hashbucket->lwpchan_chain;
    279 			while (ent != NULL) {
    280 				next = ent->lwpchan_next;
    281 				newbucket = lwpchan_bucket(lcp,
    282 				    (uintptr_t)ent->lwpchan_addr);
    283 				ent->lwpchan_next = newbucket->lwpchan_chain;
    284 				newbucket->lwpchan_chain = ent;
    285 				ent = next;
    286 				count++;
    287 			}
    288 			hashbucket->lwpchan_chain = NULL;
    289 		}
    290 		lcp->lwpchan_entries = count;
    291 	}
    292 
    293 	/*
    294 	 * Retire the old hash table.  We can't actually kmem_free() it
    295 	 * now because someone may still have a pointer to it.  Instead,
    296 	 * we link it onto the new hash table's list of retired hash tables.
    297 	 * The new hash table is double the size of the previous one, so
    298 	 * the total size of all retired hash tables is less than the size
    299 	 * of the new one.  exit() and exec() free the retired hash tables
    300 	 * (see lwpchan_destroy_cache(), below).
    301 	 */
    302 	lcp->lwpchan_next_data = old_lcp;
    303 
    304 	/*
    305 	 * As soon as we store the new lcp, future locking operations will
    306 	 * use it.  Therefore, we must ensure that all the state we've just
    307 	 * established reaches global visibility before the new lcp does.
    308 	 */
    309 	membar_producer();
    310 	p->p_lcp = lcp;
    311 
    312 	if (old_lcp != NULL) {
    313 		/*
    314 		 * Release all of the old hash table locks.
    315 		 */
    316 		hashbucket = old_lcp->lwpchan_cache;
    317 		for (; hashbucket < endbucket; hashbucket++)
    318 			mutex_exit(&hashbucket->lwpchan_lock);
    319 	}
    320 	mutex_exit(&p->p_lcp_lock);
    321 }
    322 
    323 /*
    324  * Deallocate the lwpchan cache, and any dynamically allocated mappings.
    325  * Called when the process exits or execs.  All lwps except one have
    326  * exited so we need no locks here.
    327  */
    328 void
    329 lwpchan_destroy_cache(int exec)
    330 {
    331 	proc_t *p = curproc;
    332 	lwpchan_hashbucket_t *hashbucket;
    333 	lwpchan_hashbucket_t *endbucket;
    334 	lwpchan_data_t *lcp;
    335 	lwpchan_entry_t *ent;
    336 	lwpchan_entry_t *next;
    337 	uint16_t lockflg;
    338 
    339 	lcp = p->p_lcp;
    340 	p->p_lcp = NULL;
    341 
    342 	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
    343 	hashbucket = lcp->lwpchan_cache;
    344 	endbucket = hashbucket + lcp->lwpchan_size;
    345 	for (; hashbucket < endbucket; hashbucket++) {
    346 		ent = hashbucket->lwpchan_chain;
    347 		hashbucket->lwpchan_chain = NULL;
    348 		while (ent != NULL) {
    349 			next = ent->lwpchan_next;
    350 			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
    351 			    (ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
    352 			    == (USYNC_PROCESS | LOCK_ROBUST))
    353 				lwp_mutex_cleanup(ent, lockflg);
    354 			kmem_free(ent, sizeof (*ent));
    355 			ent = next;
    356 		}
    357 	}
    358 
    359 	while (lcp != NULL) {
    360 		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
    361 		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
    362 		    sizeof (lwpchan_hashbucket_t));
    363 		kmem_free(lcp, sizeof (lwpchan_data_t));
    364 		lcp = next_lcp;
    365 	}
    366 }
    367 
    368 /*
    369  * Return zero when there is an entry in the lwpchan cache for the
    370  * given process virtual address and non-zero when there is not.
    371  * The returned non-zero value is the current length of the
    372  * hash chain plus one.  The caller holds the hash bucket lock.
    373  */
    374 static uint_t
    375 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
    376 	lwpchan_hashbucket_t *hashbucket)
    377 {
    378 	lwpchan_entry_t *ent;
    379 	uint_t count = 1;
    380 
    381 	for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
    382 		if (ent->lwpchan_addr == addr) {
    383 			if (ent->lwpchan_type != type ||
    384 			    ent->lwpchan_pool != pool) {
    385 				/*
    386 				 * This shouldn't happen, but might if the
    387 				 * process reuses its memory for different
    388 				 * types of sync objects.  We test first
    389 				 * to avoid grabbing the memory cache line.
    390 				 */
    391 				ent->lwpchan_type = (uint16_t)type;
    392 				ent->lwpchan_pool = (uint16_t)pool;
    393 			}
    394 			*lwpchan = ent->lwpchan_lwpchan;
    395 			return (0);
    396 		}
    397 		count++;
    398 	}
    399 	return (count);
    400 }
    401 
    402 /*
    403  * Return the cached lwpchan mapping if cached, otherwise insert
    404  * a virtual address to lwpchan mapping into the cache.
    405  */
    406 static int
    407 lwpchan_get_mapping(struct as *as, caddr_t addr, caddr_t uaddr,
    408 	int type, lwpchan_t *lwpchan, int pool)
    409 {
    410 	proc_t *p = curproc;
    411 	lwpchan_data_t *lcp;
    412 	lwpchan_hashbucket_t *hashbucket;
    413 	lwpchan_entry_t *ent;
    414 	memid_t	memid;
    415 	uint_t count;
    416 	uint_t bits;
    417 
    418 top:
    419 	/* initialize the lwpchan cache, if necesary */
    420 	if ((lcp = p->p_lcp) == NULL) {
    421 		lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
    422 		goto top;
    423 	}
    424 	hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
    425 	mutex_enter(&hashbucket->lwpchan_lock);
    426 	if (lcp != p->p_lcp) {
    427 		/* someone resized the lwpchan cache; start over */
    428 		mutex_exit(&hashbucket->lwpchan_lock);
    429 		goto top;
    430 	}
    431 	if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
    432 		/* it's in the cache */
    433 		mutex_exit(&hashbucket->lwpchan_lock);
    434 		return (1);
    435 	}
    436 	mutex_exit(&hashbucket->lwpchan_lock);
    437 	if (as_getmemid(as, addr, &memid) != 0)
    438 		return (0);
    439 	lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
    440 	lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
    441 	ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
    442 	mutex_enter(&hashbucket->lwpchan_lock);
    443 	if (lcp != p->p_lcp) {
    444 		/* someone resized the lwpchan cache; start over */
    445 		mutex_exit(&hashbucket->lwpchan_lock);
    446 		kmem_free(ent, sizeof (*ent));
    447 		goto top;
    448 	}
    449 	count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
    450 	if (count == 0) {
    451 		/* someone else added this entry to the cache */
    452 		mutex_exit(&hashbucket->lwpchan_lock);
    453 		kmem_free(ent, sizeof (*ent));
    454 		return (1);
    455 	}
    456 	if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
    457 	    (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
    458 		/* hash chain too long; reallocate the hash table */
    459 		mutex_exit(&hashbucket->lwpchan_lock);
    460 		kmem_free(ent, sizeof (*ent));
    461 		lwpchan_alloc_cache(p, bits + 1);
    462 		goto top;
    463 	}
    464 	ent->lwpchan_addr = addr;
    465 	ent->lwpchan_uaddr = uaddr;
    466 	ent->lwpchan_type = (uint16_t)type;
    467 	ent->lwpchan_pool = (uint16_t)pool;
    468 	ent->lwpchan_lwpchan = *lwpchan;
    469 	ent->lwpchan_next = hashbucket->lwpchan_chain;
    470 	hashbucket->lwpchan_chain = ent;
    471 	atomic_add_32(&lcp->lwpchan_entries, 1);
    472 	mutex_exit(&hashbucket->lwpchan_lock);
    473 	return (1);
    474 }
    475 
    476 /*
    477  * Return a unique pair of identifiers that corresponds to a
    478  * synchronization object's virtual address.  Process-shared
    479  * sync objects usually get vnode/offset from as_getmemid().
    480  */
    481 static int
    482 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
    483 {
    484 	/*
    485 	 * If the lwp synch object is defined to be process-private,
    486 	 * we just make the first field of the lwpchan be 'as' and
    487 	 * the second field be the synch object's virtual address.
    488 	 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
    489 	 * The lwpchan cache is used only for process-shared objects.
    490 	 */
    491 	if (!(type & USYNC_PROCESS)) {
    492 		lwpchan->lc_wchan0 = (caddr_t)as;
    493 		lwpchan->lc_wchan = addr;
    494 		return (1);
    495 	}
    496 
    497 	return (lwpchan_get_mapping(as, addr, NULL, type, lwpchan, pool));
    498 }
    499 
    500 static void
    501 lwp_block(lwpchan_t *lwpchan)
    502 {
    503 	kthread_t *t = curthread;
    504 	klwp_t *lwp = ttolwp(t);
    505 	sleepq_head_t *sqh;
    506 
    507 	thread_lock(t);
    508 	t->t_flag |= T_WAKEABLE;
    509 	t->t_lwpchan = *lwpchan;
    510 	t->t_sobj_ops = &lwp_sobj_ops;
    511 	t->t_release = 0;
    512 	sqh = lwpsqhash(lwpchan);
    513 	disp_lock_enter_high(&sqh->sq_lock);
    514 	CL_SLEEP(t);
    515 	DTRACE_SCHED(sleep);
    516 	THREAD_SLEEP(t, &sqh->sq_lock);
    517 	sleepq_insert(&sqh->sq_queue, t);
    518 	thread_unlock(t);
    519 	lwp->lwp_asleep = 1;
    520 	lwp->lwp_sysabort = 0;
    521 	lwp->lwp_ru.nvcsw++;
    522 	(void) new_mstate(curthread, LMS_SLEEP);
    523 }
    524 
    525 static kthread_t *
    526 lwpsobj_pi_owner(upimutex_t *up)
    527 {
    528 	return (up->upi_owner);
    529 }
    530 
    531 static struct upimutex *
    532 upi_get(upib_t *upibp, lwpchan_t *lcp)
    533 {
    534 	struct upimutex *upip;
    535 
    536 	for (upip = upibp->upib_first; upip != NULL;
    537 	    upip = upip->upi_nextchain) {
    538 		if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
    539 		    upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
    540 			break;
    541 	}
    542 	return (upip);
    543 }
    544 
    545 static void
    546 upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
    547 {
    548 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
    549 
    550 	/*
    551 	 * Insert upimutex at front of list. Maybe a bit unfair
    552 	 * but assume that not many lwpchans hash to the same
    553 	 * upimutextab bucket, i.e. the list of upimutexes from
    554 	 * upib_first is not too long.
    555 	 */
    556 	upimutex->upi_nextchain = upibp->upib_first;
    557 	upibp->upib_first = upimutex;
    558 }
    559 
    560 static void
    561 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
    562 {
    563 	struct upimutex **prev;
    564 
    565 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
    566 
    567 	prev = &upibp->upib_first;
    568 	while (*prev != upimutex) {
    569 		prev = &(*prev)->upi_nextchain;
    570 	}
    571 	*prev = upimutex->upi_nextchain;
    572 	upimutex->upi_nextchain = NULL;
    573 }
    574 
    575 /*
    576  * Add upimutex to chain of upimutexes held by curthread.
    577  * Returns number of upimutexes held by curthread.
    578  */
    579 static uint32_t
    580 upi_mylist_add(struct upimutex *upimutex)
    581 {
    582 	kthread_t *t = curthread;
    583 
    584 	/*
    585 	 * Insert upimutex at front of list of upimutexes owned by t. This
    586 	 * would match typical LIFO order in which nested locks are acquired
    587 	 * and released.
    588 	 */
    589 	upimutex->upi_nextowned = t->t_upimutex;
    590 	t->t_upimutex = upimutex;
    591 	t->t_nupinest++;
    592 	ASSERT(t->t_nupinest > 0);
    593 	return (t->t_nupinest);
    594 }
    595 
    596 /*
    597  * Delete upimutex from list of upimutexes owned by curthread.
    598  */
    599 static void
    600 upi_mylist_del(struct upimutex *upimutex)
    601 {
    602 	kthread_t *t = curthread;
    603 	struct upimutex **prev;
    604 
    605 	/*
    606 	 * Since the order in which nested locks are acquired and released,
    607 	 * is typically LIFO, and typical nesting levels are not too deep, the
    608 	 * following should not be expensive in the general case.
    609 	 */
    610 	prev = &t->t_upimutex;
    611 	while (*prev != upimutex) {
    612 		prev = &(*prev)->upi_nextowned;
    613 	}
    614 	*prev = upimutex->upi_nextowned;
    615 	upimutex->upi_nextowned = NULL;
    616 	ASSERT(t->t_nupinest > 0);
    617 	t->t_nupinest--;
    618 }
    619 
    620 /*
    621  * Returns true if upimutex is owned. Should be called only when upim points
    622  * to kmem which cannot disappear from underneath.
    623  */
    624 static int
    625 upi_owned(upimutex_t *upim)
    626 {
    627 	return (upim->upi_owner == curthread);
    628 }
    629 
    630 /*
    631  * Returns pointer to kernel object (upimutex_t *) if lp is owned.
    632  */
    633 static struct upimutex *
    634 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
    635 {
    636 	lwpchan_t lwpchan;
    637 	upib_t *upibp;
    638 	struct upimutex *upimutex;
    639 
    640 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
    641 	    &lwpchan, LWPCHAN_MPPOOL))
    642 		return (NULL);
    643 
    644 	upibp = &UPI_CHAIN(lwpchan);
    645 	mutex_enter(&upibp->upib_lock);
    646 	upimutex = upi_get(upibp, &lwpchan);
    647 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
    648 		mutex_exit(&upibp->upib_lock);
    649 		return (NULL);
    650 	}
    651 	mutex_exit(&upibp->upib_lock);
    652 	return (upimutex);
    653 }
    654 
    655 /*
    656  * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
    657  * no lock hand-off occurrs.
    658  */
    659 static void
    660 upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
    661 {
    662 	turnstile_t *ts;
    663 	upib_t *upibp;
    664 	kthread_t *newowner;
    665 
    666 	upi_mylist_del(upimutex);
    667 	upibp = upimutex->upi_upibp;
    668 	mutex_enter(&upibp->upib_lock);
    669 	if (upimutex->upi_waiter != 0) { /* if waiters */
    670 		ts = turnstile_lookup(upimutex);
    671 		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
    672 			/* hand-off lock to highest prio waiter */
    673 			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
    674 			upimutex->upi_owner = newowner;
    675 			if (ts->ts_waiters == 1)
    676 				upimutex->upi_waiter = 0;
    677 			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
    678 			mutex_exit(&upibp->upib_lock);
    679 			return;
    680 		} else if (ts != NULL) {
    681 			/* LOCK_NOTRECOVERABLE: wakeup all */
    682 			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
    683 		} else {
    684 			/*
    685 			 * Misleading w bit. Waiters might have been
    686 			 * interrupted. No need to clear the w bit (upimutex
    687 			 * will soon be freed). Re-calculate PI from existing
    688 			 * waiters.
    689 			 */
    690 			turnstile_exit(upimutex);
    691 			turnstile_pi_recalc();
    692 		}
    693 	}
    694 	/*
    695 	 * no waiters, or LOCK_NOTRECOVERABLE.
    696 	 * remove from the bucket chain of upi mutexes.
    697 	 * de-allocate kernel memory (upimutex).
    698 	 */
    699 	upi_chain_del(upimutex->upi_upibp, upimutex);
    700 	mutex_exit(&upibp->upib_lock);
    701 	kmem_free(upimutex, sizeof (upimutex_t));
    702 }
    703 
    704 static int
    705 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
    706 {
    707 	label_t ljb;
    708 	int error = 0;
    709 	lwpchan_t lwpchan;
    710 	uint16_t flag;
    711 	upib_t *upibp;
    712 	volatile struct upimutex *upimutex = NULL;
    713 	turnstile_t *ts;
    714 	uint32_t nupinest;
    715 	volatile int upilocked = 0;
    716 
    717 	if (on_fault(&ljb)) {
    718 		if (upilocked)
    719 			upimutex_unlock((upimutex_t *)upimutex, 0);
    720 		error = EFAULT;
    721 		goto out;
    722 	}
    723 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
    724 	    &lwpchan, LWPCHAN_MPPOOL)) {
    725 		error = EFAULT;
    726 		goto out;
    727 	}
    728 	upibp = &UPI_CHAIN(lwpchan);
    729 retry:
    730 	mutex_enter(&upibp->upib_lock);
    731 	upimutex = upi_get(upibp, &lwpchan);
    732 	if (upimutex == NULL)  {
    733 		/* lock available since lwpchan has no upimutex */
    734 		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
    735 		upi_chain_add(upibp, (upimutex_t *)upimutex);
    736 		upimutex->upi_owner = curthread; /* grab lock */
    737 		upimutex->upi_upibp = upibp;
    738 		upimutex->upi_vaddr = lp;
    739 		upimutex->upi_lwpchan = lwpchan;
    740 		mutex_exit(&upibp->upib_lock);
    741 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
    742 		upilocked = 1;
    743 		fuword16_noerr(&lp->mutex_flag, &flag);
    744 		if (nupinest > maxnestupimx &&
    745 		    secpolicy_resource(CRED()) != 0) {
    746 			upimutex_unlock((upimutex_t *)upimutex, flag);
    747 			error = ENOMEM;
    748 			goto out;
    749 		}
    750 		if (flag & LOCK_NOTRECOVERABLE) {
    751 			/*
    752 			 * Since the setting of LOCK_NOTRECOVERABLE
    753 			 * was done under the high-level upi mutex,
    754 			 * in lwp_upimutex_unlock(), this flag needs to
    755 			 * be checked while holding the upi mutex.
    756 			 * If set, this thread should return without
    757 			 * the lock held, and with the right error code.
    758 			 */
    759 			upimutex_unlock((upimutex_t *)upimutex, flag);
    760 			upilocked = 0;
    761 			error = ENOTRECOVERABLE;
    762 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
    763 			if (flag & LOCK_OWNERDEAD)
    764 				error = EOWNERDEAD;
    765 			else if (type & USYNC_PROCESS_ROBUST)
    766 				error = ELOCKUNMAPPED;
    767 			else
    768 				error = EOWNERDEAD;
    769 		}
    770 		goto out;
    771 	}
    772 	/*
    773 	 * If a upimutex object exists, it must have an owner.
    774 	 * This is due to lock hand-off, and release of upimutex when no
    775 	 * waiters are present at unlock time,
    776 	 */
    777 	ASSERT(upimutex->upi_owner != NULL);
    778 	if (upimutex->upi_owner == curthread) {
    779 		/*
    780 		 * The user wrapper can check if the mutex type is
    781 		 * ERRORCHECK: if not, it should stall at user-level.
    782 		 * If so, it should return the error code.
    783 		 */
    784 		mutex_exit(&upibp->upib_lock);
    785 		error = EDEADLK;
    786 		goto out;
    787 	}
    788 	if (try == UPIMUTEX_TRY) {
    789 		mutex_exit(&upibp->upib_lock);
    790 		error = EBUSY;
    791 		goto out;
    792 	}
    793 	/*
    794 	 * Block for the lock.
    795 	 */
    796 	if ((error = lwptp->lwpt_time_error) != 0) {
    797 		/*
    798 		 * The SUSV3 Posix spec is very clear that we
    799 		 * should get no error from validating the
    800 		 * timer until we would actually sleep.
    801 		 */
    802 		mutex_exit(&upibp->upib_lock);
    803 		goto out;
    804 	}
    805 	if (lwptp->lwpt_tsp != NULL) {
    806 		/*
    807 		 * Unlike the protocol for other lwp timedwait operations,
    808 		 * we must drop t_delay_lock before going to sleep in
    809 		 * turnstile_block() for a upi mutex.
    810 		 * See the comments below and in turnstile.c
    811 		 */
    812 		mutex_enter(&curthread->t_delay_lock);
    813 		(void) lwp_timer_enqueue(lwptp);
    814 		mutex_exit(&curthread->t_delay_lock);
    815 	}
    816 	/*
    817 	 * Now, set the waiter bit and block for the lock in turnstile_block().
    818 	 * No need to preserve the previous wbit since a lock try is not
    819 	 * attempted after setting the wait bit. Wait bit is set under
    820 	 * the upib_lock, which is not released until the turnstile lock
    821 	 * is acquired. Say, the upimutex is L:
    822 	 *
    823 	 * 1. upib_lock is held so the waiter does not have to retry L after
    824 	 *    setting the wait bit: since the owner has to grab the upib_lock
    825 	 *    to unlock L, it will certainly see the wait bit set.
    826 	 * 2. upib_lock is not released until the turnstile lock is acquired.
    827 	 *    This is the key to preventing a missed wake-up. Otherwise, the
    828 	 *    owner could acquire the upib_lock, and the tc_lock, to call
    829 	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
    830 	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
    831 	 *    find this waiter, resulting in the missed wakeup.
    832 	 * 3. The upib_lock, being a kernel mutex, cannot be released while
    833 	 *    holding the tc_lock (since mutex_exit() could need to acquire
    834 	 *    the same tc_lock)...and so is held when calling turnstile_block().
    835 	 *    The address of upib_lock is passed to turnstile_block() which
    836 	 *    releases it after releasing all turnstile locks, and before going
    837 	 *    to sleep in swtch().
    838 	 * 4. The waiter value cannot be a count of waiters, because a waiter
    839 	 *    can be interrupted. The interrupt occurs under the tc_lock, at
    840 	 *    which point, the upib_lock cannot be locked, to decrement waiter
    841 	 *    count. So, just treat the waiter state as a bit, not a count.
    842 	 */
    843 	ts = turnstile_lookup((upimutex_t *)upimutex);
    844 	upimutex->upi_waiter = 1;
    845 	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
    846 	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
    847 	/*
    848 	 * Hand-off implies that we wakeup holding the lock, except when:
    849 	 *	- deadlock is detected
    850 	 *	- lock is not recoverable
    851 	 *	- we got an interrupt or timeout
    852 	 * If we wake up due to an interrupt or timeout, we may
    853 	 * or may not be holding the lock due to mutex hand-off.
    854 	 * Use lwp_upimutex_owned() to check if we do hold the lock.
    855 	 */
    856 	if (error != 0) {
    857 		if ((error == EINTR || error == ETIME) &&
    858 		    (upimutex = lwp_upimutex_owned(lp, type))) {
    859 			/*
    860 			 * Unlock and return - the re-startable syscall will
    861 			 * try the lock again if we got EINTR.
    862 			 */
    863 			(void) upi_mylist_add((upimutex_t *)upimutex);
    864 			upimutex_unlock((upimutex_t *)upimutex, 0);
    865 		}
    866 		/*
    867 		 * The only other possible error is EDEADLK.  If so, upimutex
    868 		 * is valid, since its owner is deadlocked with curthread.
    869 		 */
    870 		ASSERT(error == EINTR || error == ETIME ||
    871 		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
    872 		ASSERT(!lwp_upimutex_owned(lp, type));
    873 		goto out;
    874 	}
    875 	if (lwp_upimutex_owned(lp, type)) {
    876 		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
    877 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
    878 		upilocked = 1;
    879 	}
    880 	/*
    881 	 * Now, need to read the user-level lp->mutex_flag to do the following:
    882 	 *
    883 	 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
    884 	 *   should be returned.
    885 	 * - if lock isn't held, check if ENOTRECOVERABLE should
    886 	 *   be returned.
    887 	 *
    888 	 * Now, either lp->mutex_flag is readable or it's not. If not
    889 	 * readable, the on_fault path will cause a return with EFAULT
    890 	 * as it should.  If it is readable, the state of the flag
    891 	 * encodes the robustness state of the lock:
    892 	 *
    893 	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
    894 	 * or LOCK_UNMAPPED setting will influence the return code
    895 	 * appropriately.  If the upimutex is not locked here, this
    896 	 * could be due to a spurious wake-up or a NOTRECOVERABLE
    897 	 * event.  The flag's setting can be used to distinguish
    898 	 * between these two events.
    899 	 */
    900 	fuword16_noerr(&lp->mutex_flag, &flag);
    901 	if (upilocked) {
    902 		/*
    903 		 * If the thread wakes up from turnstile_block with the lock
    904 		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
    905 		 * since it would not have been handed-off the lock.
    906 		 * So, no need to check for this case.
    907 		 */
    908 		if (nupinest > maxnestupimx &&
    909 		    secpolicy_resource(CRED()) != 0) {
    910 			upimutex_unlock((upimutex_t *)upimutex, flag);
    911 			upilocked = 0;
    912 			error = ENOMEM;
    913 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
    914 			if (flag & LOCK_OWNERDEAD)
    915 				error = EOWNERDEAD;
    916 			else if (type & USYNC_PROCESS_ROBUST)
    917 				error = ELOCKUNMAPPED;
    918 			else
    919 				error = EOWNERDEAD;
    920 		}
    921 	} else {
    922 		/*
    923 		 * Wake-up without the upimutex held. Either this is a
    924 		 * spurious wake-up (due to signals, forkall(), whatever), or
    925 		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
    926 		 * of the mutex flag can be used to distinguish between the
    927 		 * two events.
    928 		 */
    929 		if (flag & LOCK_NOTRECOVERABLE) {
    930 			error = ENOTRECOVERABLE;
    931 		} else {
    932 			/*
    933 			 * Here, the flag could be set to LOCK_OWNERDEAD or
    934 			 * not. In both cases, this is a spurious wakeup,
    935 			 * since the upi lock is not held, but the thread
    936 			 * has returned from turnstile_block().
    937 			 *
    938 			 * The user flag could be LOCK_OWNERDEAD if, at the
    939 			 * same time as curthread having been woken up
    940 			 * spuriously, the owner (say Tdead) has died, marked
    941 			 * the mutex flag accordingly, and handed off the lock
    942 			 * to some other waiter (say Tnew). curthread just
    943 			 * happened to read the flag while Tnew has yet to deal
    944 			 * with the owner-dead event.
    945 			 *
    946 			 * In this event, curthread should retry the lock.
    947 			 * If Tnew is able to cleanup the lock, curthread
    948 			 * will eventually get the lock with a zero error code,
    949 			 * If Tnew is unable to cleanup, its eventual call to
    950 			 * unlock the lock will result in the mutex flag being
    951 			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
    952 			 * all waiters, including curthread, which will then
    953 			 * eventually return ENOTRECOVERABLE due to the above
    954 			 * check.
    955 			 *
    956 			 * Of course, if the user-flag is not set with
    957 			 * LOCK_OWNERDEAD, retrying is the thing to do, since
    958 			 * this is definitely a spurious wakeup.
    959 			 */
    960 			goto retry;
    961 		}
    962 	}
    963 
    964 out:
    965 	no_fault();
    966 	return (error);
    967 }
    968 
    969 
    970 static int
    971 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
    972 {
    973 	label_t ljb;
    974 	int error = 0;
    975 	lwpchan_t lwpchan;
    976 	uint16_t flag;
    977 	upib_t *upibp;
    978 	volatile struct upimutex *upimutex = NULL;
    979 	volatile int upilocked = 0;
    980 
    981 	if (on_fault(&ljb)) {
    982 		if (upilocked)
    983 			upimutex_unlock((upimutex_t *)upimutex, 0);
    984 		error = EFAULT;
    985 		goto out;
    986 	}
    987 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
    988 	    &lwpchan, LWPCHAN_MPPOOL)) {
    989 		error = EFAULT;
    990 		goto out;
    991 	}
    992 	upibp = &UPI_CHAIN(lwpchan);
    993 	mutex_enter(&upibp->upib_lock);
    994 	upimutex = upi_get(upibp, &lwpchan);
    995 	/*
    996 	 * If the lock is not held, or the owner is not curthread, return
    997 	 * error. The user-level wrapper can return this error or stall,
    998 	 * depending on whether mutex is of ERRORCHECK type or not.
    999 	 */
   1000 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
   1001 		mutex_exit(&upibp->upib_lock);
   1002 		error = EPERM;
   1003 		goto out;
   1004 	}
   1005 	mutex_exit(&upibp->upib_lock); /* release for user memory access */
   1006 	upilocked = 1;
   1007 	fuword16_noerr(&lp->mutex_flag, &flag);
   1008 	if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
   1009 		/*
   1010 		 * transition mutex to the LOCK_NOTRECOVERABLE state.
   1011 		 */
   1012 		flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
   1013 		flag |= LOCK_NOTRECOVERABLE;
   1014 		suword16_noerr(&lp->mutex_flag, flag);
   1015 	}
   1016 	set_owner_pid(lp, 0, 0);
   1017 	upimutex_unlock((upimutex_t *)upimutex, flag);
   1018 	upilocked = 0;
   1019 out:
   1020 	no_fault();
   1021 	return (error);
   1022 }
   1023 
   1024 /*
   1025  * Set the owner and ownerpid fields of a user-level mutex.
   1026  */
   1027 static void
   1028 set_owner_pid(lwp_mutex_t *lp, uintptr_t owner, pid_t pid)
   1029 {
   1030 	union {
   1031 		uint64_t word64;
   1032 		uint32_t word32[2];
   1033 	} un;
   1034 
   1035 	un.word64 = (uint64_t)owner;
   1036 
   1037 	suword32_noerr(&lp->mutex_ownerpid, pid);
   1038 #if defined(_LP64)
   1039 	if (((uintptr_t)lp & (_LONG_LONG_ALIGNMENT - 1)) == 0) { /* aligned */
   1040 		suword64_noerr(&lp->mutex_owner, un.word64);
   1041 		return;
   1042 	}
   1043 #endif
   1044 	/* mutex is unaligned or we are running on a 32-bit kernel */
   1045 	suword32_noerr((uint32_t *)&lp->mutex_owner, un.word32[0]);
   1046 	suword32_noerr((uint32_t *)&lp->mutex_owner + 1, un.word32[1]);
   1047 }
   1048 
   1049 /*
   1050  * Clear the contents of a user-level mutex; return the flags.
   1051  * Used only by upi_dead() and lwp_mutex_cleanup(), below.
   1052  */
   1053 static uint16_t
   1054 lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
   1055 {
   1056 	uint16_t flag;
   1057 
   1058 	fuword16_noerr(&lp->mutex_flag, &flag);
   1059 	if ((flag &
   1060 	    (LOCK_OWNERDEAD | LOCK_UNMAPPED | LOCK_NOTRECOVERABLE)) == 0) {
   1061 		flag |= lockflg;
   1062 		suword16_noerr(&lp->mutex_flag, flag);
   1063 	}
   1064 	set_owner_pid(lp, 0, 0);
   1065 	suword8_noerr(&lp->mutex_rcount, 0);
   1066 
   1067 	return (flag);
   1068 }
   1069 
   1070 /*
   1071  * Mark user mutex state, corresponding to kernel upimutex,
   1072  * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
   1073  */
   1074 static int
   1075 upi_dead(upimutex_t *upip, uint16_t lockflg)
   1076 {
   1077 	label_t ljb;
   1078 	int error = 0;
   1079 	lwp_mutex_t *lp;
   1080 
   1081 	if (on_fault(&ljb)) {
   1082 		error = EFAULT;
   1083 		goto out;
   1084 	}
   1085 
   1086 	lp = upip->upi_vaddr;
   1087 	(void) lwp_clear_mutex(lp, lockflg);
   1088 	suword8_noerr(&lp->mutex_lockw, 0);
   1089 out:
   1090 	no_fault();
   1091 	return (error);
   1092 }
   1093 
   1094 /*
   1095  * Unlock all upimutexes held by curthread, since curthread is dying.
   1096  * For each upimutex, attempt to mark its corresponding user mutex object as
   1097  * dead.
   1098  */
   1099 void
   1100 upimutex_cleanup()
   1101 {
   1102 	kthread_t *t = curthread;
   1103 	uint16_t lockflg = (ttoproc(t)->p_proc_flag & P_PR_EXEC)?
   1104 	    LOCK_UNMAPPED : LOCK_OWNERDEAD;
   1105 	struct upimutex *upip;
   1106 
   1107 	while ((upip = t->t_upimutex) != NULL) {
   1108 		if (upi_dead(upip, lockflg) != 0) {
   1109 			/*
   1110 			 * If the user object associated with this upimutex is
   1111 			 * unmapped, unlock upimutex with the
   1112 			 * LOCK_NOTRECOVERABLE flag, so that all waiters are
   1113 			 * woken up. Since user object is unmapped, it could
   1114 			 * not be marked as dead or notrecoverable.
   1115 			 * The waiters will now all wake up and return
   1116 			 * ENOTRECOVERABLE, since they would find that the lock
   1117 			 * has not been handed-off to them.
   1118 			 * See lwp_upimutex_lock().
   1119 			 */
   1120 			upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
   1121 		} else {
   1122 			/*
   1123 			 * The user object has been updated as dead.
   1124 			 * Unlock the upimutex: if no waiters, upip kmem will
   1125 			 * be freed. If there is a waiter, the lock will be
   1126 			 * handed off. If exit() is in progress, each existing
   1127 			 * waiter will successively get the lock, as owners
   1128 			 * die, and each new owner will call this routine as
   1129 			 * it dies. The last owner will free kmem, since
   1130 			 * it will find the upimutex has no waiters. So,
   1131 			 * eventually, the kmem is guaranteed to be freed.
   1132 			 */
   1133 			upimutex_unlock(upip, 0);
   1134 		}
   1135 		/*
   1136 		 * Note that the call to upimutex_unlock() above will delete
   1137 		 * upimutex from the t_upimutexes chain. And so the
   1138 		 * while loop will eventually terminate.
   1139 		 */
   1140 	}
   1141 }
   1142 
   1143 int
   1144 lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp, uintptr_t owner)
   1145 {
   1146 	kthread_t *t = curthread;
   1147 	klwp_t *lwp = ttolwp(t);
   1148 	proc_t *p = ttoproc(t);
   1149 	lwp_timer_t lwpt;
   1150 	caddr_t timedwait;
   1151 	int error = 0;
   1152 	int time_error;
   1153 	clock_t tim = -1;
   1154 	uchar_t waiters;
   1155 	volatile int locked = 0;
   1156 	volatile int watched = 0;
   1157 	label_t ljb;
   1158 	volatile uint8_t type = 0;
   1159 	lwpchan_t lwpchan;
   1160 	sleepq_head_t *sqh;
   1161 	uint16_t flag;
   1162 	int imm_timeout = 0;
   1163 
   1164 	if ((caddr_t)lp >= p->p_as->a_userlimit)
   1165 		return (set_errno(EFAULT));
   1166 
   1167 	/*
   1168 	 * Put the lwp in an orderly state for debugging,
   1169 	 * in case we are stopped while sleeping, below.
   1170 	 */
   1171 	prstop(PR_REQUESTED, 0);
   1172 
   1173 	timedwait = (caddr_t)tsp;
   1174 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
   1175 	    lwpt.lwpt_imm_timeout) {
   1176 		imm_timeout = 1;
   1177 		timedwait = NULL;
   1178 	}
   1179 
   1180 	/*
   1181 	 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
   1182 	 * this micro state is really a run state. If the thread indeed blocks,
   1183 	 * this state becomes valid. If not, the state is converted back to
   1184 	 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
   1185 	 * when blocking.
   1186 	 */
   1187 	(void) new_mstate(t, LMS_USER_LOCK);
   1188 	if (on_fault(&ljb)) {
   1189 		if (locked)
   1190 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
   1191 		error = EFAULT;
   1192 		goto out;
   1193 	}
   1194 	/*
   1195 	 * Force Copy-on-write if necessary and ensure that the
   1196 	 * synchronization object resides in read/write memory.
   1197 	 * Cause an EFAULT return now if this is not so.
   1198 	 */
   1199 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
   1200 	suword8_noerr(&lp->mutex_type, type);
   1201 	if (UPIMUTEX(type)) {
   1202 		no_fault();
   1203 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
   1204 		if (error == 0 || error == EOWNERDEAD || error == ELOCKUNMAPPED)
   1205 			set_owner_pid(lp, owner,
   1206 			    (type & USYNC_PROCESS)? p->p_pid : 0);
   1207 		if (tsp && !time_error)	/* copyout the residual time left */
   1208 			error = lwp_timer_copyout(&lwpt, error);
   1209 		if (error)
   1210 			return (set_errno(error));
   1211 		return (0);
   1212 	}
   1213 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
   1214 	    &lwpchan, LWPCHAN_MPPOOL)) {
   1215 		error = EFAULT;
   1216 		goto out;
   1217 	}
   1218 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
   1219 	locked = 1;
   1220 	if (type & LOCK_ROBUST) {
   1221 		fuword16_noerr(&lp->mutex_flag, &flag);
   1222 		if (flag & LOCK_NOTRECOVERABLE) {
   1223 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
   1224 			error = ENOTRECOVERABLE;
   1225 			goto out;
   1226 		}
   1227 	}
   1228 	fuword8_noerr(&lp->mutex_waiters, &waiters);
   1229 	suword8_noerr(&lp->mutex_waiters, 1);
   1230 
   1231 	/*
   1232 	 * If watchpoints are set, they need to be restored, since
   1233 	 * atomic accesses of memory such as the call to ulock_try()
   1234 	 * below cannot be watched.
   1235 	 */
   1236 
   1237 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   1238 
   1239 	while (!ulock_try(&lp->mutex_lockw)) {
   1240 		if (time_error) {
   1241 			/*
   1242 			 * The SUSV3 Posix spec is very clear that we
   1243 			 * should get no error from validating the
   1244 			 * timer until we would actually sleep.
   1245 			 */
   1246 			error = time_error;
   1247 			break;
   1248 		}
   1249 
   1250 		if (watched) {
   1251 			watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   1252 			watched = 0;
   1253 		}
   1254 
   1255 		if (timedwait) {
   1256 			/*
   1257 			 * If we successfully queue the timeout,
   1258 			 * then don't drop t_delay_lock until
   1259 			 * we are on the sleep queue (below).
   1260 			 */
   1261 			mutex_enter(&t->t_delay_lock);
   1262 			if (lwp_timer_enqueue(&lwpt) != 0) {
   1263 				mutex_exit(&t->t_delay_lock);
   1264 				imm_timeout = 1;
   1265 				timedwait = NULL;
   1266 			}
   1267 		}
   1268 		lwp_block(&lwpchan);
   1269 		/*
   1270 		 * Nothing should happen to cause the lwp to go to
   1271 		 * sleep again until after it returns from swtch().
   1272 		 */
   1273 		if (timedwait)
   1274 			mutex_exit(&t->t_delay_lock);
   1275 		locked = 0;
   1276 		lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
   1277 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
   1278 			setrun(t);
   1279 		swtch();
   1280 		t->t_flag &= ~T_WAKEABLE;
   1281 		if (timedwait)
   1282 			tim = lwp_timer_dequeue(&lwpt);
   1283 		setallwatch();
   1284 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
   1285 			error = EINTR;
   1286 		else if (imm_timeout || (timedwait && tim == -1))
   1287 			error = ETIME;
   1288 		if (error) {
   1289 			lwp->lwp_asleep = 0;
   1290 			lwp->lwp_sysabort = 0;
   1291 			watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
   1292 			    S_WRITE);
   1293 
   1294 			/*
   1295 			 * Need to re-compute waiters bit. The waiters field in
   1296 			 * the lock is not reliable. Either of two things could
   1297 			 * have occurred: no lwp may have called lwp_release()
   1298 			 * for me but I have woken up due to a signal or
   1299 			 * timeout.  In this case, the waiter bit is incorrect
   1300 			 * since it is still set to 1, set above.
   1301 			 * OR an lwp_release() did occur for some other lwp on
   1302 			 * the same lwpchan. In this case, the waiter bit is
   1303 			 * correct.  But which event occurred, one can't tell.
   1304 			 * So, recompute.
   1305 			 */
   1306 			lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
   1307 			locked = 1;
   1308 			sqh = lwpsqhash(&lwpchan);
   1309 			disp_lock_enter(&sqh->sq_lock);
   1310 			waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
   1311 			disp_lock_exit(&sqh->sq_lock);
   1312 			break;
   1313 		}
   1314 		lwp->lwp_asleep = 0;
   1315 		watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
   1316 		    S_WRITE);
   1317 		lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
   1318 		locked = 1;
   1319 		fuword8_noerr(&lp->mutex_waiters, &waiters);
   1320 		suword8_noerr(&lp->mutex_waiters, 1);
   1321 		if (type & LOCK_ROBUST) {
   1322 			fuword16_noerr(&lp->mutex_flag, &flag);
   1323 			if (flag & LOCK_NOTRECOVERABLE) {
   1324 				error = ENOTRECOVERABLE;
   1325 				break;
   1326 			}
   1327 		}
   1328 	}
   1329 
   1330 	if (t->t_mstate == LMS_USER_LOCK)
   1331 		(void) new_mstate(t, LMS_SYSTEM);
   1332 
   1333 	if (error == 0) {
   1334 		set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
   1335 		if (type & LOCK_ROBUST) {
   1336 			fuword16_noerr(&lp->mutex_flag, &flag);
   1337 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
   1338 				if (flag & LOCK_OWNERDEAD)
   1339 					error = EOWNERDEAD;
   1340 				else if (type & USYNC_PROCESS_ROBUST)
   1341 					error = ELOCKUNMAPPED;
   1342 				else
   1343 					error = EOWNERDEAD;
   1344 			}
   1345 		}
   1346 	}
   1347 	suword8_noerr(&lp->mutex_waiters, waiters);
   1348 	locked = 0;
   1349 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
   1350 out:
   1351 	no_fault();
   1352 	if (watched)
   1353 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   1354 	if (tsp && !time_error)		/* copyout the residual time left */
   1355 		error = lwp_timer_copyout(&lwpt, error);
   1356 	if (error)
   1357 		return (set_errno(error));
   1358 	return (0);
   1359 }
   1360 
   1361 /*
   1362  * Obsolete lwp_mutex_lock() interface, no longer called from libc.
   1363  * libc now calls lwp_mutex_timedlock(lp, NULL, NULL).
   1364  * This system call trap continues to exist solely for the benefit
   1365  * of old statically-linked binaries from Solaris 9 and before.
   1366  * It should be removed from the system when we no longer care
   1367  * about such applications.
   1368  */
   1369 int
   1370 lwp_mutex_lock(lwp_mutex_t *lp)
   1371 {
   1372 	return (lwp_mutex_timedlock(lp, NULL, NULL));
   1373 }
   1374 
   1375 static int
   1376 iswanted(kthread_t *t, lwpchan_t *lwpchan)
   1377 {
   1378 	/*
   1379 	 * The caller holds the dispatcher lock on the sleep queue.
   1380 	 */
   1381 	while (t != NULL) {
   1382 		if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
   1383 		    t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
   1384 			return (1);
   1385 		t = t->t_link;
   1386 	}
   1387 	return (0);
   1388 }
   1389 
   1390 /*
   1391  * Return the highest priority thread sleeping on this lwpchan.
   1392  */
   1393 static kthread_t *
   1394 lwp_queue_waiter(lwpchan_t *lwpchan)
   1395 {
   1396 	sleepq_head_t *sqh;
   1397 	kthread_t *tp;
   1398 
   1399 	sqh = lwpsqhash(lwpchan);
   1400 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
   1401 	for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
   1402 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
   1403 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
   1404 			break;
   1405 	}
   1406 	disp_lock_exit(&sqh->sq_lock);
   1407 	return (tp);
   1408 }
   1409 
   1410 static int
   1411 lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
   1412 {
   1413 	sleepq_head_t *sqh;
   1414 	kthread_t *tp;
   1415 	kthread_t **tpp;
   1416 
   1417 	sqh = lwpsqhash(lwpchan);
   1418 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
   1419 	tpp = &sqh->sq_queue.sq_first;
   1420 	while ((tp = *tpp) != NULL) {
   1421 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
   1422 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
   1423 			/*
   1424 			 * The following is typically false. It could be true
   1425 			 * only if lwp_release() is called from
   1426 			 * lwp_mutex_wakeup() after reading the waiters field
   1427 			 * from memory in which the lwp lock used to be, but has
   1428 			 * since been re-used to hold a lwp cv or lwp semaphore.
   1429 			 * The thread "tp" found to match the lwp lock's wchan
   1430 			 * is actually sleeping for the cv or semaphore which
   1431 			 * now has the same wchan. In this case, lwp_release()
   1432 			 * should return failure.
   1433 			 */
   1434 			if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
   1435 				ASSERT(sync_type == 0);
   1436 				/*
   1437 				 * assert that this can happen only for mutexes
   1438 				 * i.e. sync_type == 0, for correctly written
   1439 				 * user programs.
   1440 				 */
   1441 				disp_lock_exit(&sqh->sq_lock);
   1442 				return (0);
   1443 			}
   1444 			*waiters = iswanted(tp->t_link, lwpchan);
   1445 			sleepq_unlink(tpp, tp);
   1446 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
   1447 			tp->t_wchan0 = NULL;
   1448 			tp->t_wchan = NULL;
   1449 			tp->t_sobj_ops = NULL;
   1450 			tp->t_release = 1;
   1451 			THREAD_TRANSITION(tp);	/* drops sleepq lock */
   1452 			CL_WAKEUP(tp);
   1453 			thread_unlock(tp);	/* drop run queue lock */
   1454 			return (1);
   1455 		}
   1456 		tpp = &tp->t_link;
   1457 	}
   1458 	*waiters = 0;
   1459 	disp_lock_exit(&sqh->sq_lock);
   1460 	return (0);
   1461 }
   1462 
   1463 static void
   1464 lwp_release_all(lwpchan_t *lwpchan)
   1465 {
   1466 	sleepq_head_t	*sqh;
   1467 	kthread_t *tp;
   1468 	kthread_t **tpp;
   1469 
   1470 	sqh = lwpsqhash(lwpchan);
   1471 	disp_lock_enter(&sqh->sq_lock);		/* lock sleep q queue */
   1472 	tpp = &sqh->sq_queue.sq_first;
   1473 	while ((tp = *tpp) != NULL) {
   1474 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
   1475 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
   1476 			sleepq_unlink(tpp, tp);
   1477 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
   1478 			tp->t_wchan0 = NULL;
   1479 			tp->t_wchan = NULL;
   1480 			tp->t_sobj_ops = NULL;
   1481 			CL_WAKEUP(tp);
   1482 			thread_unlock_high(tp);	/* release run queue lock */
   1483 		} else {
   1484 			tpp = &tp->t_link;
   1485 		}
   1486 	}
   1487 	disp_lock_exit(&sqh->sq_lock);		/* drop sleep q lock */
   1488 }
   1489 
   1490 /*
   1491  * unblock a lwp that is trying to acquire this mutex. the blocked
   1492  * lwp resumes and retries to acquire the lock.
   1493  */
   1494 int
   1495 lwp_mutex_wakeup(lwp_mutex_t *lp, int release_all)
   1496 {
   1497 	proc_t *p = ttoproc(curthread);
   1498 	lwpchan_t lwpchan;
   1499 	uchar_t waiters;
   1500 	volatile int locked = 0;
   1501 	volatile int watched = 0;
   1502 	volatile uint8_t type = 0;
   1503 	label_t ljb;
   1504 	int error = 0;
   1505 
   1506 	if ((caddr_t)lp >= p->p_as->a_userlimit)
   1507 		return (set_errno(EFAULT));
   1508 
   1509 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   1510 
   1511 	if (on_fault(&ljb)) {
   1512 		if (locked)
   1513 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
   1514 		error = EFAULT;
   1515 		goto out;
   1516 	}
   1517 	/*
   1518 	 * Force Copy-on-write if necessary and ensure that the
   1519 	 * synchronization object resides in read/write memory.
   1520 	 * Cause an EFAULT return now if this is not so.
   1521 	 */
   1522 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
   1523 	suword8_noerr(&lp->mutex_type, type);
   1524 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
   1525 	    &lwpchan, LWPCHAN_MPPOOL)) {
   1526 		error = EFAULT;
   1527 		goto out;
   1528 	}
   1529 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
   1530 	locked = 1;
   1531 	/*
   1532 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
   1533 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
   1534 	 * may fail.  If it fails, do not write into the waiter bit.
   1535 	 * The call to lwp_release() might fail due to one of three reasons:
   1536 	 *
   1537 	 * 	1. due to the thread which set the waiter bit not actually
   1538 	 *	   sleeping since it got the lock on the re-try. The waiter
   1539 	 *	   bit will then be correctly updated by that thread. This
   1540 	 *	   window may be closed by reading the wait bit again here
   1541 	 *	   and not calling lwp_release() at all if it is zero.
   1542 	 *	2. the thread which set the waiter bit and went to sleep
   1543 	 *	   was woken up by a signal. This time, the waiter recomputes
   1544 	 *	   the wait bit in the return with EINTR code.
   1545 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
   1546 	 *	   memory that has been re-used after the lock was dropped.
   1547 	 *	   In this case, writing into the waiter bit would cause data
   1548 	 *	   corruption.
   1549 	 */
   1550 	if (release_all)
   1551 		lwp_release_all(&lwpchan);
   1552 	else if (lwp_release(&lwpchan, &waiters, 0))
   1553 		suword8_noerr(&lp->mutex_waiters, waiters);
   1554 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
   1555 out:
   1556 	no_fault();
   1557 	if (watched)
   1558 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   1559 	if (error)
   1560 		return (set_errno(error));
   1561 	return (0);
   1562 }
   1563 
   1564 /*
   1565  * lwp_cond_wait() has four arguments, a pointer to a condition variable,
   1566  * a pointer to a mutex, a pointer to a timespec for a timed wait and
   1567  * a flag telling the kernel whether or not to honor the kernel/user
   1568  * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
   1569  * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
   1570  * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
   1571  * it is used an an in/out parameter.  On entry, it contains the relative
   1572  * time until timeout.  On exit, we copyout the residual time left to it.
   1573  */
   1574 int
   1575 lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
   1576 {
   1577 	kthread_t *t = curthread;
   1578 	klwp_t *lwp = ttolwp(t);
   1579 	proc_t *p = ttoproc(t);
   1580 	lwp_timer_t lwpt;
   1581 	lwpchan_t cv_lwpchan;
   1582 	lwpchan_t m_lwpchan;
   1583 	caddr_t timedwait;
   1584 	volatile uint16_t type = 0;
   1585 	volatile uint8_t mtype = 0;
   1586 	uchar_t waiters;
   1587 	volatile int error;
   1588 	clock_t tim = -1;
   1589 	volatile int locked = 0;
   1590 	volatile int m_locked = 0;
   1591 	volatile int cvwatched = 0;
   1592 	volatile int mpwatched = 0;
   1593 	label_t ljb;
   1594 	volatile int no_lwpchan = 1;
   1595 	int imm_timeout = 0;
   1596 	int imm_unpark = 0;
   1597 
   1598 	if ((caddr_t)cv >= p->p_as->a_userlimit ||
   1599 	    (caddr_t)mp >= p->p_as->a_userlimit)
   1600 		return (set_errno(EFAULT));
   1601 
   1602 	/*
   1603 	 * Put the lwp in an orderly state for debugging,
   1604 	 * in case we are stopped while sleeping, below.
   1605 	 */
   1606 	prstop(PR_REQUESTED, 0);
   1607 
   1608 	timedwait = (caddr_t)tsp;
   1609 	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
   1610 		return (set_errno(error));
   1611 	if (lwpt.lwpt_imm_timeout) {
   1612 		imm_timeout = 1;
   1613 		timedwait = NULL;
   1614 	}
   1615 
   1616 	(void) new_mstate(t, LMS_USER_LOCK);
   1617 
   1618 	if (on_fault(&ljb)) {
   1619 		if (no_lwpchan) {
   1620 			error = EFAULT;
   1621 			goto out;
   1622 		}
   1623 		if (m_locked) {
   1624 			m_locked = 0;
   1625 			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
   1626 		}
   1627 		if (locked) {
   1628 			locked = 0;
   1629 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
   1630 		}
   1631 		/*
   1632 		 * set up another on_fault() for a possible fault
   1633 		 * on the user lock accessed at "efault"
   1634 		 */
   1635 		if (on_fault(&ljb)) {
   1636 			if (m_locked) {
   1637 				m_locked = 0;
   1638 				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
   1639 			}
   1640 			goto out;
   1641 		}
   1642 		error = EFAULT;
   1643 		goto efault;
   1644 	}
   1645 
   1646 	/*
   1647 	 * Force Copy-on-write if necessary and ensure that the
   1648 	 * synchronization object resides in read/write memory.
   1649 	 * Cause an EFAULT return now if this is not so.
   1650 	 */
   1651 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
   1652 	suword8_noerr(&mp->mutex_type, mtype);
   1653 	if (UPIMUTEX(mtype) == 0) {
   1654 		/* convert user level mutex, "mp", to a unique lwpchan */
   1655 		/* check if mtype is ok to use below, instead of type from cv */
   1656 		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
   1657 		    &m_lwpchan, LWPCHAN_MPPOOL)) {
   1658 			error = EFAULT;
   1659 			goto out;
   1660 		}
   1661 	}
   1662 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
   1663 	suword16_noerr(&cv->cond_type, type);
   1664 	/* convert user level condition variable, "cv", to a unique lwpchan */
   1665 	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
   1666 	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
   1667 		error = EFAULT;
   1668 		goto out;
   1669 	}
   1670 	no_lwpchan = 0;
   1671 	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
   1672 	if (UPIMUTEX(mtype) == 0)
   1673 		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
   1674 		    S_WRITE);
   1675 
   1676 	/*
   1677 	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
   1678 	 * with respect to a possible wakeup which is a result of either
   1679 	 * an lwp_cond_signal() or an lwp_cond_broadcast().
   1680 	 *
   1681 	 * What's misleading, is that the lwp is put to sleep after the
   1682 	 * condition variable's mutex is released.  This is OK as long as
   1683 	 * the release operation is also done while holding lwpchan_lock.
   1684 	 * The lwp is then put to sleep when the possibility of pagefaulting
   1685 	 * or sleeping is completely eliminated.
   1686 	 */
   1687 	lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
   1688 	locked = 1;
   1689 	if (UPIMUTEX(mtype) == 0) {
   1690 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
   1691 		m_locked = 1;
   1692 		suword8_noerr(&cv->cond_waiters_kernel, 1);
   1693 		/*
   1694 		 * unlock the condition variable's mutex. (pagefaults are
   1695 		 * possible here.)
   1696 		 */
   1697 		set_owner_pid(mp, 0, 0);
   1698 		ulock_clear(&mp->mutex_lockw);
   1699 		fuword8_noerr(&mp->mutex_waiters, &waiters);
   1700 		if (waiters != 0) {
   1701 			/*
   1702 			 * Given the locking of lwpchan_lock around the release
   1703 			 * of the mutex and checking for waiters, the following
   1704 			 * call to lwp_release() can fail ONLY if the lock
   1705 			 * acquirer is interrupted after setting the waiter bit,
   1706 			 * calling lwp_block() and releasing lwpchan_lock.
   1707 			 * In this case, it could get pulled off the lwp sleep
   1708 			 * q (via setrun()) before the following call to
   1709 			 * lwp_release() occurs. In this case, the lock
   1710 			 * requestor will update the waiter bit correctly by
   1711 			 * re-evaluating it.
   1712 			 */
   1713 			if (lwp_release(&m_lwpchan, &waiters, 0))
   1714 				suword8_noerr(&mp->mutex_waiters, waiters);
   1715 		}
   1716 		m_locked = 0;
   1717 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
   1718 	} else {
   1719 		suword8_noerr(&cv->cond_waiters_kernel, 1);
   1720 		error = lwp_upimutex_unlock(mp, mtype);
   1721 		if (error) {	/* if the upimutex unlock failed */
   1722 			locked = 0;
   1723 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
   1724 			goto out;
   1725 		}
   1726 	}
   1727 	no_fault();
   1728 
   1729 	if (mpwatched) {
   1730 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
   1731 		mpwatched = 0;
   1732 	}
   1733 	if (cvwatched) {
   1734 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
   1735 		cvwatched = 0;
   1736 	}
   1737 
   1738 	if (check_park && (!schedctl_is_park() || t->t_unpark)) {
   1739 		/*
   1740 		 * We received a signal at user-level before calling here
   1741 		 * or another thread wants us to return immediately
   1742 		 * with EINTR.  See lwp_unpark().
   1743 		 */
   1744 		imm_unpark = 1;
   1745 		t->t_unpark = 0;
   1746 		timedwait = NULL;
   1747 	} else if (timedwait) {
   1748 		/*
   1749 		 * If we successfully queue the timeout,
   1750 		 * then don't drop t_delay_lock until
   1751 		 * we are on the sleep queue (below).
   1752 		 */
   1753 		mutex_enter(&t->t_delay_lock);
   1754 		if (lwp_timer_enqueue(&lwpt) != 0) {
   1755 			mutex_exit(&t->t_delay_lock);
   1756 			imm_timeout = 1;
   1757 			timedwait = NULL;
   1758 		}
   1759 	}
   1760 	t->t_flag |= T_WAITCVSEM;
   1761 	lwp_block(&cv_lwpchan);
   1762 	/*
   1763 	 * Nothing should happen to cause the lwp to go to sleep
   1764 	 * until after it returns from swtch().
   1765 	 */
   1766 	if (timedwait)
   1767 		mutex_exit(&t->t_delay_lock);
   1768 	locked = 0;
   1769 	lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
   1770 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
   1771 	    (imm_timeout | imm_unpark))
   1772 		setrun(t);
   1773 	swtch();
   1774 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
   1775 	if (timedwait)
   1776 		tim = lwp_timer_dequeue(&lwpt);
   1777 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
   1778 	    MUSTRETURN(p, t) || imm_unpark)
   1779 		error = EINTR;
   1780 	else if (imm_timeout || (timedwait && tim == -1))
   1781 		error = ETIME;
   1782 	lwp->lwp_asleep = 0;
   1783 	lwp->lwp_sysabort = 0;
   1784 	setallwatch();
   1785 
   1786 	if (t->t_mstate == LMS_USER_LOCK)
   1787 		(void) new_mstate(t, LMS_SYSTEM);
   1788 
   1789 	if (tsp && check_park)		/* copyout the residual time left */
   1790 		error = lwp_timer_copyout(&lwpt, error);
   1791 
   1792 	/* the mutex is reacquired by the caller on return to user level */
   1793 	if (error) {
   1794 		/*
   1795 		 * If we were concurrently lwp_cond_signal()d and we
   1796 		 * received a UNIX signal or got a timeout, then perform
   1797 		 * another lwp_cond_signal() to avoid consuming the wakeup.
   1798 		 */
   1799 		if (t->t_release)
   1800 			(void) lwp_cond_signal(cv);
   1801 		return (set_errno(error));
   1802 	}
   1803 	return (0);
   1804 
   1805 efault:
   1806 	/*
   1807 	 * make sure that the user level lock is dropped before
   1808 	 * returning to caller, since the caller always re-acquires it.
   1809 	 */
   1810 	if (UPIMUTEX(mtype) == 0) {
   1811 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
   1812 		m_locked = 1;
   1813 		set_owner_pid(mp, 0, 0);
   1814 		ulock_clear(&mp->mutex_lockw);
   1815 		fuword8_noerr(&mp->mutex_waiters, &waiters);
   1816 		if (waiters != 0) {
   1817 			/*
   1818 			 * See comment above on lock clearing and lwp_release()
   1819 			 * success/failure.
   1820 			 */
   1821 			if (lwp_release(&m_lwpchan, &waiters, 0))
   1822 				suword8_noerr(&mp->mutex_waiters, waiters);
   1823 		}
   1824 		m_locked = 0;
   1825 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
   1826 	} else {
   1827 		(void) lwp_upimutex_unlock(mp, mtype);
   1828 	}
   1829 out:
   1830 	no_fault();
   1831 	if (mpwatched)
   1832 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
   1833 	if (cvwatched)
   1834 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
   1835 	if (t->t_mstate == LMS_USER_LOCK)
   1836 		(void) new_mstate(t, LMS_SYSTEM);
   1837 	return (set_errno(error));
   1838 }
   1839 
   1840 /*
   1841  * wakeup one lwp that's blocked on this condition variable.
   1842  */
   1843 int
   1844 lwp_cond_signal(lwp_cond_t *cv)
   1845 {
   1846 	proc_t *p = ttoproc(curthread);
   1847 	lwpchan_t lwpchan;
   1848 	uchar_t waiters;
   1849 	volatile uint16_t type = 0;
   1850 	volatile int locked = 0;
   1851 	volatile int watched = 0;
   1852 	label_t ljb;
   1853 	int error = 0;
   1854 
   1855 	if ((caddr_t)cv >= p->p_as->a_userlimit)
   1856 		return (set_errno(EFAULT));
   1857 
   1858 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
   1859 
   1860 	if (on_fault(&ljb)) {
   1861 		if (locked)
   1862 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   1863 		error = EFAULT;
   1864 		goto out;
   1865 	}
   1866 	/*
   1867 	 * Force Copy-on-write if necessary and ensure that the
   1868 	 * synchronization object resides in read/write memory.
   1869 	 * Cause an EFAULT return now if this is not so.
   1870 	 */
   1871 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
   1872 	suword16_noerr(&cv->cond_type, type);
   1873 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
   1874 	    &lwpchan, LWPCHAN_CVPOOL)) {
   1875 		error = EFAULT;
   1876 		goto out;
   1877 	}
   1878 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
   1879 	locked = 1;
   1880 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
   1881 	if (waiters != 0) {
   1882 		/*
   1883 		 * The following call to lwp_release() might fail but it is
   1884 		 * OK to write into the waiters bit below, since the memory
   1885 		 * could not have been re-used or unmapped (for correctly
   1886 		 * written user programs) as in the case of lwp_mutex_wakeup().
   1887 		 * For an incorrect program, we should not care about data
   1888 		 * corruption since this is just one instance of other places
   1889 		 * where corruption can occur for such a program. Of course
   1890 		 * if the memory is unmapped, normal fault recovery occurs.
   1891 		 */
   1892 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
   1893 		suword8_noerr(&cv->cond_waiters_kernel, waiters);
   1894 	}
   1895 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   1896 out:
   1897 	no_fault();
   1898 	if (watched)
   1899 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
   1900 	if (error)
   1901 		return (set_errno(error));
   1902 	return (0);
   1903 }
   1904 
   1905 /*
   1906  * wakeup every lwp that's blocked on this condition variable.
   1907  */
   1908 int
   1909 lwp_cond_broadcast(lwp_cond_t *cv)
   1910 {
   1911 	proc_t *p = ttoproc(curthread);
   1912 	lwpchan_t lwpchan;
   1913 	volatile uint16_t type = 0;
   1914 	volatile int locked = 0;
   1915 	volatile int watched = 0;
   1916 	label_t ljb;
   1917 	uchar_t waiters;
   1918 	int error = 0;
   1919 
   1920 	if ((caddr_t)cv >= p->p_as->a_userlimit)
   1921 		return (set_errno(EFAULT));
   1922 
   1923 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
   1924 
   1925 	if (on_fault(&ljb)) {
   1926 		if (locked)
   1927 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   1928 		error = EFAULT;
   1929 		goto out;
   1930 	}
   1931 	/*
   1932 	 * Force Copy-on-write if necessary and ensure that the
   1933 	 * synchronization object resides in read/write memory.
   1934 	 * Cause an EFAULT return now if this is not so.
   1935 	 */
   1936 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
   1937 	suword16_noerr(&cv->cond_type, type);
   1938 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
   1939 	    &lwpchan, LWPCHAN_CVPOOL)) {
   1940 		error = EFAULT;
   1941 		goto out;
   1942 	}
   1943 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
   1944 	locked = 1;
   1945 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
   1946 	if (waiters != 0) {
   1947 		lwp_release_all(&lwpchan);
   1948 		suword8_noerr(&cv->cond_waiters_kernel, 0);
   1949 	}
   1950 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   1951 out:
   1952 	no_fault();
   1953 	if (watched)
   1954 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
   1955 	if (error)
   1956 		return (set_errno(error));
   1957 	return (0);
   1958 }
   1959 
   1960 int
   1961 lwp_sema_trywait(lwp_sema_t *sp)
   1962 {
   1963 	kthread_t *t = curthread;
   1964 	proc_t *p = ttoproc(t);
   1965 	label_t ljb;
   1966 	volatile int locked = 0;
   1967 	volatile int watched = 0;
   1968 	volatile uint16_t type = 0;
   1969 	int count;
   1970 	lwpchan_t lwpchan;
   1971 	uchar_t waiters;
   1972 	int error = 0;
   1973 
   1974 	if ((caddr_t)sp >= p->p_as->a_userlimit)
   1975 		return (set_errno(EFAULT));
   1976 
   1977 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
   1978 
   1979 	if (on_fault(&ljb)) {
   1980 		if (locked)
   1981 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   1982 		error = EFAULT;
   1983 		goto out;
   1984 	}
   1985 	/*
   1986 	 * Force Copy-on-write if necessary and ensure that the
   1987 	 * synchronization object resides in read/write memory.
   1988 	 * Cause an EFAULT return now if this is not so.
   1989 	 */
   1990 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
   1991 	suword16_noerr((void *)&sp->sema_type, type);
   1992 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
   1993 	    &lwpchan, LWPCHAN_CVPOOL)) {
   1994 		error = EFAULT;
   1995 		goto out;
   1996 	}
   1997 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
   1998 	locked = 1;
   1999 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
   2000 	if (count == 0)
   2001 		error = EBUSY;
   2002 	else
   2003 		suword32_noerr((void *)&sp->sema_count, --count);
   2004 	if (count != 0) {
   2005 		fuword8_noerr(&sp->sema_waiters, &waiters);
   2006 		if (waiters != 0) {
   2007 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
   2008 			suword8_noerr(&sp->sema_waiters, waiters);
   2009 		}
   2010 	}
   2011 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   2012 out:
   2013 	no_fault();
   2014 	if (watched)
   2015 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
   2016 	if (error)
   2017 		return (set_errno(error));
   2018 	return (0);
   2019 }
   2020 
   2021 /*
   2022  * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
   2023  */
   2024 int
   2025 lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
   2026 {
   2027 	kthread_t *t = curthread;
   2028 	klwp_t *lwp = ttolwp(t);
   2029 	proc_t *p = ttoproc(t);
   2030 	lwp_timer_t lwpt;
   2031 	caddr_t timedwait;
   2032 	clock_t tim = -1;
   2033 	label_t ljb;
   2034 	volatile int locked = 0;
   2035 	volatile int watched = 0;
   2036 	volatile uint16_t type = 0;
   2037 	int count;
   2038 	lwpchan_t lwpchan;
   2039 	uchar_t waiters;
   2040 	int error = 0;
   2041 	int time_error;
   2042 	int imm_timeout = 0;
   2043 	int imm_unpark = 0;
   2044 
   2045 	if ((caddr_t)sp >= p->p_as->a_userlimit)
   2046 		return (set_errno(EFAULT));
   2047 
   2048 	/*
   2049 	 * Put the lwp in an orderly state for debugging,
   2050 	 * in case we are stopped while sleeping, below.
   2051 	 */
   2052 	prstop(PR_REQUESTED, 0);
   2053 
   2054 	timedwait = (caddr_t)tsp;
   2055 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
   2056 	    lwpt.lwpt_imm_timeout) {
   2057 		imm_timeout = 1;
   2058 		timedwait = NULL;
   2059 	}
   2060 
   2061 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
   2062 
   2063 	if (on_fault(&ljb)) {
   2064 		if (locked)
   2065 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   2066 		error = EFAULT;
   2067 		goto out;
   2068 	}
   2069 	/*
   2070 	 * Force Copy-on-write if necessary and ensure that the
   2071 	 * synchronization object resides in read/write memory.
   2072 	 * Cause an EFAULT return now if this is not so.
   2073 	 */
   2074 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
   2075 	suword16_noerr((void *)&sp->sema_type, type);
   2076 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
   2077 	    &lwpchan, LWPCHAN_CVPOOL)) {
   2078 		error = EFAULT;
   2079 		goto out;
   2080 	}
   2081 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
   2082 	locked = 1;
   2083 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
   2084 	while (error == 0 && count == 0) {
   2085 		if (time_error) {
   2086 			/*
   2087 			 * The SUSV3 Posix spec is very clear that we
   2088 			 * should get no error from validating the
   2089 			 * timer until we would actually sleep.
   2090 			 */
   2091 			error = time_error;
   2092 			break;
   2093 		}
   2094 		suword8_noerr(&sp->sema_waiters, 1);
   2095 		if (watched)
   2096 			watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
   2097 		if (check_park && (!schedctl_is_park() || t->t_unpark)) {
   2098 			/*
   2099 			 * We received a signal at user-level before calling
   2100 			 * here or another thread wants us to return
   2101 			 * immediately with EINTR.  See lwp_unpark().
   2102 			 */
   2103 			imm_unpark = 1;
   2104 			t->t_unpark = 0;
   2105 			timedwait = NULL;
   2106 		} else if (timedwait) {
   2107 			/*
   2108 			 * If we successfully queue the timeout,
   2109 			 * then don't drop t_delay_lock until
   2110 			 * we are on the sleep queue (below).
   2111 			 */
   2112 			mutex_enter(&t->t_delay_lock);
   2113 			if (lwp_timer_enqueue(&lwpt) != 0) {
   2114 				mutex_exit(&t->t_delay_lock);
   2115 				imm_timeout = 1;
   2116 				timedwait = NULL;
   2117 			}
   2118 		}
   2119 		t->t_flag |= T_WAITCVSEM;
   2120 		lwp_block(&lwpchan);
   2121 		/*
   2122 		 * Nothing should happen to cause the lwp to sleep
   2123 		 * again until after it returns from swtch().
   2124 		 */
   2125 		if (timedwait)
   2126 			mutex_exit(&t->t_delay_lock);
   2127 		locked = 0;
   2128 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   2129 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
   2130 		    (imm_timeout | imm_unpark))
   2131 			setrun(t);
   2132 		swtch();
   2133 		t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
   2134 		if (timedwait)
   2135 			tim = lwp_timer_dequeue(&lwpt);
   2136 		setallwatch();
   2137 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
   2138 		    MUSTRETURN(p, t) || imm_unpark)
   2139 			error = EINTR;
   2140 		else if (imm_timeout || (timedwait && tim == -1))
   2141 			error = ETIME;
   2142 		lwp->lwp_asleep = 0;
   2143 		lwp->lwp_sysabort = 0;
   2144 		watched = watch_disable_addr((caddr_t)sp,
   2145 		    sizeof (*sp), S_WRITE);
   2146 		lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
   2147 		locked = 1;
   2148 		fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
   2149 	}
   2150 	if (error == 0)
   2151 		suword32_noerr((void *)&sp->sema_count, --count);
   2152 	if (count != 0) {
   2153 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
   2154 		suword8_noerr(&sp->sema_waiters, waiters);
   2155 	}
   2156 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   2157 out:
   2158 	no_fault();
   2159 	if (watched)
   2160 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
   2161 	if (tsp && check_park && !time_error)
   2162 		error = lwp_timer_copyout(&lwpt, error);
   2163 	if (error)
   2164 		return (set_errno(error));
   2165 	return (0);
   2166 }
   2167 
   2168 /*
   2169  * Obsolete lwp_sema_wait() interface, no longer called from libc.
   2170  * libc now calls lwp_sema_timedwait().
   2171  * This system call trap exists solely for the benefit of old
   2172  * statically linked applications from Solaris 9 and before.
   2173  * It should be removed when we no longer care about such applications.
   2174  */
   2175 int
   2176 lwp_sema_wait(lwp_sema_t *sp)
   2177 {
   2178 	return (lwp_sema_timedwait(sp, NULL, 0));
   2179 }
   2180 
   2181 int
   2182 lwp_sema_post(lwp_sema_t *sp)
   2183 {
   2184 	proc_t *p = ttoproc(curthread);
   2185 	label_t ljb;
   2186 	volatile int locked = 0;
   2187 	volatile int watched = 0;
   2188 	volatile uint16_t type = 0;
   2189 	int count;
   2190 	lwpchan_t lwpchan;
   2191 	uchar_t waiters;
   2192 	int error = 0;
   2193 
   2194 	if ((caddr_t)sp >= p->p_as->a_userlimit)
   2195 		return (set_errno(EFAULT));
   2196 
   2197 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
   2198 
   2199 	if (on_fault(&ljb)) {
   2200 		if (locked)
   2201 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   2202 		error = EFAULT;
   2203 		goto out;
   2204 	}
   2205 	/*
   2206 	 * Force Copy-on-write if necessary and ensure that the
   2207 	 * synchronization object resides in read/write memory.
   2208 	 * Cause an EFAULT return now if this is not so.
   2209 	 */
   2210 	fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
   2211 	suword16_noerr(&sp->sema_type, type);
   2212 	if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
   2213 	    &lwpchan, LWPCHAN_CVPOOL)) {
   2214 		error = EFAULT;
   2215 		goto out;
   2216 	}
   2217 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
   2218 	locked = 1;
   2219 	fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
   2220 	if (count == _SEM_VALUE_MAX)
   2221 		error = EOVERFLOW;
   2222 	else
   2223 		suword32_noerr(&sp->sema_count, ++count);
   2224 	if (count == 1) {
   2225 		fuword8_noerr(&sp->sema_waiters, &waiters);
   2226 		if (waiters) {
   2227 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
   2228 			suword8_noerr(&sp->sema_waiters, waiters);
   2229 		}
   2230 	}
   2231 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   2232 out:
   2233 	no_fault();
   2234 	if (watched)
   2235 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
   2236 	if (error)
   2237 		return (set_errno(error));
   2238 	return (0);
   2239 }
   2240 
/*
 * Bits kept in a thread's t_writer field while it waits for a rwlock.
 * lwp_rwlock_lock() sets TRW_WANT_WRITE before a would-be writer blocks;
 * lwp_rwlock_release() sets TRW_LOCK_GRANTED on every thread it wakes so
 * the woken thread can tell that it was handed the lock.
 */
#define	TRW_WANT_WRITE		0x1
#define	TRW_LOCK_GRANTED	0x2

/*
 * Values of the 'rd_wr' argument of lwp_rwlock_lock().  TRY_FLAG may be
 * OR-ed into READ_LOCK or WRITE_LOCK; lwp_rwlock_lock() strips it off and
 * then returns EBUSY instead of blocking when the lock is not available.
 */
#define	READ_LOCK		0
#define	WRITE_LOCK		1
#define	TRY_FLAG		0x10
#define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
#define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)
   2249 
   2250 /*
   2251  * Release one writer or one or more readers. Compute the rwstate word to
   2252  * reflect the new state of the queue. For a safe hand-off we copy the new
   2253  * rwstate value back to userland before we wake any of the new lock holders.
   2254  *
   2255  * Note that sleepq_insert() implements a prioritized FIFO (with writers
   2256  * being given precedence over readers of the same priority).
   2257  *
   2258  * If the first thread is a reader we scan the queue releasing all readers
   2259  * until we hit a writer or the end of the queue. If the first thread is a
   2260  * writer we still need to check for another writer.
   2261  */
   2262 void
   2263 lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
   2264 {
   2265 	sleepq_head_t *sqh;
   2266 	kthread_t *tp;
   2267 	kthread_t **tpp;
   2268 	kthread_t *tpnext;
   2269 	kthread_t *wakelist = NULL;
   2270 	uint32_t rwstate = 0;
   2271 	int wcount = 0;
   2272 	int rcount = 0;
   2273 
   2274 	sqh = lwpsqhash(lwpchan);
   2275 	disp_lock_enter(&sqh->sq_lock);
   2276 	tpp = &sqh->sq_queue.sq_first;
   2277 	while ((tp = *tpp) != NULL) {
   2278 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
   2279 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
   2280 			if (tp->t_writer & TRW_WANT_WRITE) {
   2281 				if ((wcount++ == 0) && (rcount == 0)) {
   2282 					rwstate |= URW_WRITE_LOCKED;
   2283 
   2284 					/* Just one writer to wake. */
   2285 					sleepq_unlink(tpp, tp);
   2286 					wakelist = tp;
   2287 
   2288 					/* tpp already set for next thread. */
   2289 					continue;
   2290 				} else {
   2291 					rwstate |= URW_HAS_WAITERS;
   2292 					/* We need look no further. */
   2293 					break;
   2294 				}
   2295 			} else {
   2296 				rcount++;
   2297 				if (wcount == 0) {
   2298 					rwstate++;
   2299 
   2300 					/* Add reader to wake list. */
   2301 					sleepq_unlink(tpp, tp);
   2302 					tp->t_link = wakelist;
   2303 					wakelist = tp;
   2304 
   2305 					/* tpp already set for next thread. */
   2306 					continue;
   2307 				} else {
   2308 					rwstate |= URW_HAS_WAITERS;
   2309 					/* We need look no further. */
   2310 					break;
   2311 				}
   2312 			}
   2313 		}
   2314 		tpp = &tp->t_link;
   2315 	}
   2316 
   2317 	/* Copy the new rwstate back to userland. */
   2318 	suword32_noerr(&rw->rwlock_readers, rwstate);
   2319 
   2320 	/* Wake the new lock holder(s) up. */
   2321 	tp = wakelist;
   2322 	while (tp != NULL) {
   2323 		DTRACE_SCHED1(wakeup, kthread_t *, tp);
   2324 		tp->t_wchan0 = NULL;
   2325 		tp->t_wchan = NULL;
   2326 		tp->t_sobj_ops = NULL;
   2327 		tp->t_writer |= TRW_LOCK_GRANTED;
   2328 		tpnext = tp->t_link;
   2329 		tp->t_link = NULL;
   2330 		CL_WAKEUP(tp);
   2331 		thread_unlock_high(tp);
   2332 		tp = tpnext;
   2333 	}
   2334 
   2335 	disp_lock_exit(&sqh->sq_lock);
   2336 }
   2337 
   2338 /*
   2339  * We enter here holding the user-level mutex, which we must release before
   2340  * returning or blocking. Based on lwp_cond_wait().
   2341  */
   2342 static int
   2343 lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
   2344 {
   2345 	lwp_mutex_t *mp = NULL;
   2346 	kthread_t *t = curthread;
   2347 	kthread_t *tp;
   2348 	klwp_t *lwp = ttolwp(t);
   2349 	proc_t *p = ttoproc(t);
   2350 	lwp_timer_t lwpt;
   2351 	lwpchan_t lwpchan;
   2352 	lwpchan_t mlwpchan;
   2353 	caddr_t timedwait;
   2354 	volatile uint16_t type = 0;
   2355 	volatile uint8_t mtype = 0;
   2356 	uchar_t mwaiters;
   2357 	volatile int error = 0;
   2358 	int time_error;
   2359 	clock_t tim = -1;
   2360 	volatile int locked = 0;
   2361 	volatile int mlocked = 0;
   2362 	volatile int watched = 0;
   2363 	volatile int mwatched = 0;
   2364 	label_t ljb;
   2365 	volatile int no_lwpchan = 1;
   2366 	int imm_timeout = 0;
   2367 	int try_flag;
   2368 	uint32_t rwstate;
   2369 	int acquired = 0;
   2370 
   2371 	/* We only check rw because the mutex is included in it. */
   2372 	if ((caddr_t)rw >= p->p_as->a_userlimit)
   2373 		return (set_errno(EFAULT));
   2374 
   2375 	/*
   2376 	 * Put the lwp in an orderly state for debugging,
   2377 	 * in case we are stopped while sleeping, below.
   2378 	 */
   2379 	prstop(PR_REQUESTED, 0);
   2380 
   2381 	/* We must only report this error if we are about to sleep (later). */
   2382 	timedwait = (caddr_t)tsp;
   2383 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
   2384 	    lwpt.lwpt_imm_timeout) {
   2385 		imm_timeout = 1;
   2386 		timedwait = NULL;
   2387 	}
   2388 
   2389 	(void) new_mstate(t, LMS_USER_LOCK);
   2390 
   2391 	if (on_fault(&ljb)) {
   2392 		if (no_lwpchan) {
   2393 			error = EFAULT;
   2394 			goto out_nodrop;
   2395 		}
   2396 		if (mlocked) {
   2397 			mlocked = 0;
   2398 			lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
   2399 		}
   2400 		if (locked) {
   2401 			locked = 0;
   2402 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   2403 		}
   2404 		/*
   2405 		 * Set up another on_fault() for a possible fault
   2406 		 * on the user lock accessed at "out_drop".
   2407 		 */
   2408 		if (on_fault(&ljb)) {
   2409 			if (mlocked) {
   2410 				mlocked = 0;
   2411 				lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
   2412 			}
   2413 			error = EFAULT;
   2414 			goto out_nodrop;
   2415 		}
   2416 		error = EFAULT;
   2417 		goto out_nodrop;
   2418 	}
   2419 
   2420 	/* Process rd_wr (including sanity check). */
   2421 	try_flag = (rd_wr & TRY_FLAG);
   2422 	rd_wr &= ~TRY_FLAG;
   2423 	if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
   2424 		error = EINVAL;
   2425 		goto out_nodrop;
   2426 	}
   2427 
   2428 	/*
   2429 	 * Force Copy-on-write if necessary and ensure that the
   2430 	 * synchronization object resides in read/write memory.
   2431 	 * Cause an EFAULT return now if this is not so.
   2432 	 */
   2433 	mp = &rw->mutex;
   2434 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
   2435 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
   2436 	suword8_noerr(&mp->mutex_type, mtype);
   2437 	suword16_noerr(&rw->rwlock_type, type);
   2438 
   2439 	/* We can only continue for simple USYNC_PROCESS locks. */
   2440 	if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
   2441 		error = EINVAL;
   2442 		goto out_nodrop;
   2443 	}
   2444 
   2445 	/* Convert user level mutex, "mp", to a unique lwpchan. */
   2446 	if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
   2447 	    &mlwpchan, LWPCHAN_MPPOOL)) {
   2448 		error = EFAULT;
   2449 		goto out_nodrop;
   2450 	}
   2451 
   2452 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
   2453 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
   2454 	    &lwpchan, LWPCHAN_CVPOOL)) {
   2455 		error = EFAULT;
   2456 		goto out_nodrop;
   2457 	}
   2458 
   2459 	no_lwpchan = 0;
   2460 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
   2461 	mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
   2462 
   2463 	/*
   2464 	 * lwpchan_lock() ensures that the calling LWP is put to sleep
   2465 	 * atomically with respect to a possible wakeup which is a result
   2466 	 * of lwp_rwlock_unlock().
   2467 	 *
   2468 	 * What's misleading is that the LWP is put to sleep after the
   2469 	 * rwlock's mutex is released. This is OK as long as the release
   2470 	 * operation is also done while holding mlwpchan. The LWP is then
   2471 	 * put to sleep when the possibility of pagefaulting or sleeping
   2472 	 * has been completely eliminated.
   2473 	 */
   2474 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
   2475 	locked = 1;
   2476 	lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
   2477 	mlocked = 1;
   2478 
   2479 	/*
   2480 	 * Fetch the current rwlock state.
   2481 	 *
   2482 	 * The possibility of spurious wake-ups or killed waiters means
   2483 	 * rwstate's URW_HAS_WAITERS bit may indicate false positives.
   2484 	 * We only fix these if they are important to us.
   2485 	 *
   2486 	 * Although various error states can be observed here (e.g. the lock
   2487 	 * is not held, but there are waiters) we assume these are applicaton
   2488 	 * errors and so we take no corrective action.
   2489 	 */
   2490 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
   2491 	/*
   2492 	 * We cannot legitimately get here from user-level
   2493 	 * without URW_HAS_WAITERS being set.
   2494 	 * Set it now to guard against user-level error.
   2495 	 */
   2496 	rwstate |= URW_HAS_WAITERS;
   2497 
   2498 	/*
   2499 	 * We can try only if the lock isn't held by a writer.
   2500 	 */
   2501 	if (!(rwstate & URW_WRITE_LOCKED)) {
   2502 		tp = lwp_queue_waiter(&lwpchan);
   2503 		if (tp == NULL) {
   2504 			/*
   2505 			 * Hmmm, rwstate indicates waiters but there are
   2506 			 * none queued. This could just be the result of a
   2507 			 * spurious wakeup, so let's ignore it.
   2508 			 *
   2509 			 * We now have a chance to acquire the lock
   2510 			 * uncontended, but this is the last chance for
   2511 			 * a writer to acquire the lock without blocking.
   2512 			 */
   2513 			if (rd_wr == READ_LOCK) {
   2514 				rwstate++;
   2515 				acquired = 1;
   2516 			} else if ((rwstate & URW_READERS_MASK) == 0) {
   2517 				rwstate |= URW_WRITE_LOCKED;
   2518 				acquired = 1;
   2519 			}
   2520 		} else if (rd_wr == READ_LOCK) {
   2521 			/*
   2522 			 * This is the last chance for a reader to acquire
   2523 			 * the lock now, but it can only do so if there is
   2524 			 * no writer of equal or greater priority at the
   2525 			 * head of the queue .
   2526 			 *
   2527 			 * It is also just possible that there is a reader
   2528 			 * at the head of the queue. This may be the result
   2529 			 * of a spurious wakeup or an application failure.
   2530 			 * In this case we only acquire the lock if we have
   2531 			 * equal or greater priority. It is not our job to
   2532 			 * release spurious waiters.
   2533 			 */
   2534 			pri_t our_pri = DISP_PRIO(t);
   2535 			pri_t his_pri = DISP_PRIO(tp);
   2536 
   2537 			if ((our_pri > his_pri) || ((our_pri == his_pri) &&
   2538 			    !(tp->t_writer & TRW_WANT_WRITE))) {
   2539 				rwstate++;
   2540 				acquired = 1;
   2541 			}
   2542 		}
   2543 	}
   2544 
   2545 	if (acquired || try_flag || time_error) {
   2546 		/*
   2547 		 * We're not going to block this time.
   2548 		 */
   2549 		suword32_noerr(&rw->rwlock_readers, rwstate);
   2550 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   2551 		locked = 0;
   2552 
   2553 		if (acquired) {
   2554 			/*
   2555 			 * Got the lock!
   2556 			 */
   2557 			error = 0;
   2558 
   2559 		} else if (try_flag) {
   2560 			/*
   2561 			 * We didn't get the lock and we're about to block.
   2562 			 * If we're doing a trylock, return EBUSY instead.
   2563 			 */
   2564 			error = EBUSY;
   2565 
   2566 		} else if (time_error) {
   2567 			/*
   2568 			 * The SUSV3 POSIX spec is very clear that we should
   2569 			 * get no error from validating the timer (above)
   2570 			 * until we would actually sleep.
   2571 			 */
   2572 			error = time_error;
   2573 		}
   2574 
   2575 		goto out_drop;
   2576 	}
   2577 
   2578 	/*
   2579 	 * We're about to block, so indicate what kind of waiter we are.
   2580 	 */
   2581 	t->t_writer = 0;
   2582 	if (rd_wr == WRITE_LOCK)
   2583 		t->t_writer = TRW_WANT_WRITE;
   2584 	suword32_noerr(&rw->rwlock_readers, rwstate);
   2585 
   2586 	/*
   2587 	 * Unlock the rwlock's mutex (pagefaults are possible here).
   2588 	 */
   2589 	set_owner_pid(mp, 0, 0);
   2590 	ulock_clear(&mp->mutex_lockw);
   2591 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
   2592 	if (mwaiters != 0) {
   2593 		/*
   2594 		 * Given the locking of mlwpchan around the release of
   2595 		 * the mutex and checking for waiters, the following
   2596 		 * call to lwp_release() can fail ONLY if the lock
   2597 		 * acquirer is interrupted after setting the waiter bit,
   2598 		 * calling lwp_block() and releasing mlwpchan.
   2599 		 * In this case, it could get pulled off the LWP sleep
   2600 		 * queue (via setrun()) before the following call to
   2601 		 * lwp_release() occurs, and the lock requestor will
   2602 		 * update the waiter bit correctly by re-evaluating it.
   2603 		 */
   2604 		if (lwp_release(&mlwpchan, &mwaiters, 0))
   2605 			suword8_noerr(&mp->mutex_waiters, mwaiters);
   2606 	}
   2607 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
   2608 	mlocked = 0;
   2609 	no_fault();
   2610 
   2611 	if (mwatched) {
   2612 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
   2613 		mwatched = 0;
   2614 	}
   2615 	if (watched) {
   2616 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
   2617 		watched = 0;
   2618 	}
   2619 
   2620 	if (timedwait) {
   2621 		/*
   2622 		 * If we successfully queue the timeout,
   2623 		 * then don't drop t_delay_lock until
   2624 		 * we are on the sleep queue (below).
   2625 		 */
   2626 		mutex_enter(&t->t_delay_lock);
   2627 		if (lwp_timer_enqueue(&lwpt) != 0) {
   2628 			mutex_exit(&t->t_delay_lock);
   2629 			imm_timeout = 1;
   2630 			timedwait = NULL;
   2631 		}
   2632 	}
   2633 	t->t_flag |= T_WAITCVSEM;
   2634 	lwp_block(&lwpchan);
   2635 
   2636 	/*
   2637 	 * Nothing should happen to cause the LWp to go to sleep until after
   2638 	 * it returns from swtch().
   2639 	 */
   2640 	if (timedwait)
   2641 		mutex_exit(&t->t_delay_lock);
   2642 	locked = 0;
   2643 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   2644 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
   2645 		setrun(t);
   2646 	swtch();
   2647 
   2648 	/*
   2649 	 * We're back, but we need to work out why. Were we interrupted? Did
   2650 	 * we timeout? Were we granted the lock?
   2651 	 */
   2652 	error = EAGAIN;
   2653 	acquired = (t->t_writer & TRW_LOCK_GRANTED);
   2654 	t->t_writer = 0;
   2655 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
   2656 	if (timedwait)
   2657 		tim = lwp_timer_dequeue(&lwpt);
   2658 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
   2659 		error = EINTR;
   2660 	else if (imm_timeout || (timedwait && tim == -1))
   2661 		error = ETIME;
   2662 	lwp->lwp_asleep = 0;
   2663 	lwp->lwp_sysabort = 0;
   2664 	setallwatch();
   2665 
   2666 	/*
   2667 	 * If we were granted the lock we don't care about EINTR or ETIME.
   2668 	 */
   2669 	if (acquired)
   2670 		error = 0;
   2671 
   2672 	if (t->t_mstate == LMS_USER_LOCK)
   2673 		(void) new_mstate(t, LMS_SYSTEM);
   2674 
   2675 	if (error)
   2676 		return (set_errno(error));
   2677 	return (0);
   2678 
   2679 out_drop:
   2680 	/*
   2681 	 * Make sure that the user level lock is dropped before returning
   2682 	 * to the caller.
   2683 	 */
   2684 	if (!mlocked) {
   2685 		lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
   2686 		mlocked = 1;
   2687 	}
   2688 	set_owner_pid(mp, 0, 0);
   2689 	ulock_clear(&mp->mutex_lockw);
   2690 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
   2691 	if (mwaiters != 0) {
   2692 		/*
   2693 		 * See comment above on lock clearing and lwp_release()
   2694 		 * success/failure.
   2695 		 */
   2696 		if (lwp_release(&mlwpchan, &mwaiters, 0))
   2697 			suword8_noerr(&mp->mutex_waiters, mwaiters);
   2698 	}
   2699 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
   2700 	mlocked = 0;
   2701 
   2702 out_nodrop:
   2703 	no_fault();
   2704 	if (mwatched)
   2705 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
   2706 	if (watched)
   2707 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
   2708 	if (t->t_mstate == LMS_USER_LOCK)
   2709 		(void) new_mstate(t, LMS_SYSTEM);
   2710 	if (error)
   2711 		return (set_errno(error));
   2712 	return (0);
   2713 }
   2714 
   2715 /*
   2716  * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
   2717  * we never drop the lock.
   2718  */
   2719 static int
   2720 lwp_rwlock_unlock(lwp_rwlock_t *rw)
   2721 {
   2722 	kthread_t *t = curthread;
   2723 	proc_t *p = ttoproc(t);
   2724 	lwpchan_t lwpchan;
   2725 	volatile uint16_t type = 0;
   2726 	volatile int error = 0;
   2727 	volatile int locked = 0;
   2728 	volatile int watched = 0;
   2729 	label_t ljb;
   2730 	volatile int no_lwpchan = 1;
   2731 	uint32_t rwstate;
   2732 
   2733 	/* We only check rw because the mutex is included in it. */
   2734 	if ((caddr_t)rw >= p->p_as->a_userlimit)
   2735 		return (set_errno(EFAULT));
   2736 
   2737 	if (on_fault(&ljb)) {
   2738 		if (no_lwpchan) {
   2739 			error = EFAULT;
   2740 			goto out_nodrop;
   2741 		}
   2742 		if (locked) {
   2743 			locked = 0;
   2744 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   2745 		}
   2746 		error = EFAULT;
   2747 		goto out_nodrop;
   2748 	}
   2749 
   2750 	/*
   2751 	 * Force Copy-on-write if necessary and ensure that the
   2752 	 * synchronization object resides in read/write memory.
   2753 	 * Cause an EFAULT return now if this is not so.
   2754 	 */
   2755 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
   2756 	suword16_noerr(&rw->rwlock_type, type);
   2757 
   2758 	/* We can only continue for simple USYNC_PROCESS locks. */
   2759 	if (type != USYNC_PROCESS) {
   2760 		error = EINVAL;
   2761 		goto out_nodrop;
   2762 	}
   2763 
   2764 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
   2765 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
   2766 	    &lwpchan, LWPCHAN_CVPOOL)) {
   2767 		error = EFAULT;
   2768 		goto out_nodrop;
   2769 	}
   2770 
   2771 	no_lwpchan = 0;
   2772 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
   2773 
   2774 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
   2775 	locked = 1;
   2776 
   2777 	/*
   2778 	 * We can resolve multiple readers (except the last reader) here.
   2779 	 * For the last reader or a writer we need lwp_rwlock_release(),
   2780 	 * to which we also delegate the task of copying the new rwstate
   2781 	 * back to userland (see the comment there).
   2782 	 */
   2783 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
   2784 	if (rwstate & URW_WRITE_LOCKED)
   2785 		lwp_rwlock_release(&lwpchan, rw);
   2786 	else if ((rwstate & URW_READERS_MASK) > 0) {
   2787 		rwstate--;
   2788 		if ((rwstate & URW_READERS_MASK) == 0)
   2789 			lwp_rwlock_release(&lwpchan, rw);
   2790 		else
   2791 			suword32_noerr(&rw->rwlock_readers, rwstate);
   2792 	}
   2793 
   2794 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
   2795 	locked = 0;
   2796 	error = 0;
   2797 
   2798 out_nodrop:
   2799 	no_fault();
   2800 	if (watched)
   2801 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
   2802 	if (error)
   2803 		return (set_errno(error));
   2804 	return (0);
   2805 }
   2806 
   2807 int
   2808 lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
   2809 {
   2810 	switch (subcode) {
   2811 	case 0:
   2812 		return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
   2813 	case 1:
   2814 		return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
   2815 	case 2:
   2816 		return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
   2817 	case 3:
   2818 		return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
   2819 	case 4:
   2820 		return (lwp_rwlock_unlock(rwlp));
   2821 	}
   2822 	return (set_errno(EINVAL));
   2823 }
   2824 
   2825 /*
   2826  * Return the owner of the user-level s-object.
   2827  * Since we can't really do this, return NULL.
   2828  */
   2829 /* ARGSUSED */
   2830 static kthread_t *
   2831 lwpsobj_owner(caddr_t sobj)
   2832 {
   2833 	return ((kthread_t *)NULL);
   2834 }
   2835 
   2836 /*
   2837  * Wake up a thread asleep on a user-level synchronization
   2838  * object.
   2839  */
   2840 static void
   2841 lwp_unsleep(kthread_t *t)
   2842 {
   2843 	ASSERT(THREAD_LOCK_HELD(t));
   2844 	if (t->t_wchan0 != NULL) {
   2845 		sleepq_head_t *sqh;
   2846 		sleepq_t *sqp = t->t_sleepq;
   2847 
   2848 		if (sqp != NULL) {
   2849 			sqh = lwpsqhash(&t->t_lwpchan);
   2850 			ASSERT(&sqh->sq_queue == sqp);
   2851 			sleepq_unsleep(t);
   2852 			disp_lock_exit_high(&sqh->sq_lock);
   2853 			CL_SETRUN(t);
   2854 			return;
   2855 		}
   2856 	}
   2857 	panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
   2858 }
   2859 
   2860 /*
   2861  * Change the priority of a thread asleep on a user-level
   2862  * synchronization object. To maintain proper priority order,
   2863  * we:
   2864  *	o dequeue the thread.
   2865  *	o change its priority.
   2866  *	o re-enqueue the thread.
   2867  * Assumption: the thread is locked on entry.
   2868  */
   2869 static void
   2870 lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
   2871 {
   2872 	ASSERT(THREAD_LOCK_HELD(t));
   2873 	if (t->t_wchan0 != NULL) {
   2874 		sleepq_t   *sqp = t->t_sleepq;
   2875 
   2876 		sleepq_dequeue(t);
   2877 		*t_prip = pri;
   2878 		sleepq_insert(sqp, t);
   2879 	} else
   2880 		panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
   2881 }
   2882 
   2883 /*
   2884  * Clean up a left-over process-shared robust mutex
   2885  */
   2886 static void
   2887 lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
   2888 {
   2889 	uint16_t flag;
   2890 	uchar_t waiters;
   2891 	label_t ljb;
   2892 	pid_t owner_pid;
   2893 	lwp_mutex_t *lp;
   2894 	volatile int locked = 0;
   2895 	volatile int watched = 0;
   2896 	volatile struct upimutex *upimutex = NULL;
   2897 	volatile int upilocked = 0;
   2898 
   2899 	if ((ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
   2900 	    != (USYNC_PROCESS | LOCK_ROBUST))
   2901 		return;
   2902 
   2903 	lp = (lwp_mutex_t *)ent->lwpchan_addr;
   2904 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   2905 	if (on_fault(&ljb)) {
   2906 		if (locked)
   2907 			lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
   2908 		if (upilocked)
   2909 			upimutex_unlock((upimutex_t *)upimutex, 0);
   2910 		goto out;
   2911 	}
   2912 
   2913 	fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
   2914 
   2915 	if (UPIMUTEX(ent->lwpchan_type)) {
   2916 		lwpchan_t lwpchan = ent->lwpchan_lwpchan;
   2917 		upib_t *upibp = &UPI_CHAIN(lwpchan);
   2918 
   2919 		if (owner_pid != curproc->p_pid)
   2920 			goto out;
   2921 		mutex_enter(&upibp->upib_lock);
   2922 		upimutex = upi_get(upibp, &lwpchan);
   2923 		if (upimutex == NULL || upimutex->upi_owner != curthread) {
   2924 			mutex_exit(&upibp->upib_lock);
   2925 			goto out;
   2926 		}
   2927 		mutex_exit(&upibp->upib_lock);
   2928 		upilocked = 1;
   2929 		flag = lwp_clear_mutex(lp, lockflg);
   2930 		suword8_noerr(&lp->mutex_lockw, 0);
   2931 		upimutex_unlock((upimutex_t *)upimutex, flag);
   2932 	} else {
   2933 		lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
   2934 		locked = 1;
   2935 		/*
   2936 		 * Clear the spinners count because one of our
   2937 		 * threads could have been spinning for this lock
   2938 		 * at user level when the process was suddenly killed.
   2939 		 * There is no harm in this since user-level libc code
   2940 		 * will adapt to the sudden change in the spinner count.
   2941 		 */
   2942 		suword8_noerr(&lp->mutex_spinners, 0);
   2943 		if (owner_pid != curproc->p_pid) {
   2944 			/*
   2945 			 * We are not the owner.  There may or may not be one.
   2946 			 * If there are waiters, we wake up one or all of them.
   2947 			 * It doesn't hurt to wake them up in error since
   2948 			 * they will just retry the lock and go to sleep
   2949 			 * again if necessary.
   2950 			 */
   2951 			fuword8_noerr(&lp->mutex_waiters, &waiters);
   2952 			if (waiters != 0) {	/* there are waiters */
   2953 				fuword16_noerr(&lp->mutex_flag, &flag);
   2954 				if (flag & LOCK_NOTRECOVERABLE) {
   2955 					lwp_release_all(&ent->lwpchan_lwpchan);
   2956 					suword8_noerr(&lp->mutex_waiters, 0);
   2957 				} else if (lwp_release(&ent->lwpchan_lwpchan,
   2958 				    &waiters, 0)) {
   2959 					suword8_noerr(&lp->mutex_waiters,
   2960 					    waiters);
   2961 				}
   2962 			}
   2963 		} else {
   2964 			/*
   2965 			 * We are the owner.  Release it.
   2966 			 */
   2967 			(void) lwp_clear_mutex(lp, lockflg);
   2968 			ulock_clear(&lp->mutex_lockw);
   2969 			fuword8_noerr(&lp->mutex_waiters, &waiters);
   2970 			if (waiters &&
   2971 			    lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
   2972 				suword8_noerr(&lp->mutex_waiters, waiters);
   2973 		}
   2974 		lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
   2975 	}
   2976 out:
   2977 	no_fault();
   2978 	if (watched)
   2979 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   2980 }
   2981 
   2982 /*
   2983  * Register a process-shared robust mutex in the lwpchan cache.
   2984  */
   2985 int
   2986 lwp_mutex_register(lwp_mutex_t *lp, caddr_t uaddr)
   2987 {
   2988 	int error = 0;
   2989 	volatile int watched;
   2990 	label_t ljb;
   2991 	uint8_t type;
   2992 	lwpchan_t lwpchan;
   2993 
   2994 	if ((caddr_t)lp >= (caddr_t)USERLIMIT)
   2995 		return (set_errno(EFAULT));
   2996 
   2997 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   2998 
   2999 	if (on_fault(&ljb)) {
   3000 		error = EFAULT;
   3001 	} else {
   3002 		/*
   3003 		 * Force Copy-on-write if necessary and ensure that the
   3004 		 * synchronization object resides in read/write memory.
   3005 		 * Cause an EFAULT return now if this is not so.
   3006 		 */
   3007 		fuword8_noerr(&lp->mutex_type, &type);
   3008 		suword8_noerr(&lp->mutex_type, type);
   3009 		if ((type & (USYNC_PROCESS|LOCK_ROBUST))
   3010 		    != (USYNC_PROCESS|LOCK_ROBUST)) {
   3011 			error = EINVAL;
   3012 		} else if (!lwpchan_get_mapping(curproc->p_as, (caddr_t)lp,
   3013 		    uaddr, type, &lwpchan, LWPCHAN_MPPOOL)) {
   3014 			error = EFAULT;
   3015 		}
   3016 	}
   3017 	no_fault();
   3018 	if (watched)
   3019 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   3020 	if (error)
   3021 		return (set_errno(error));
   3022 	return (0);
   3023 }
   3024 
   3025 /*
   3026  * There is a user-level robust lock registration in libc.
   3027  * Mark it as invalid by storing -1 into the location of the pointer.
   3028  */
   3029 static void
   3030 lwp_mutex_unregister(void *uaddr)
   3031 {
   3032 	if (get_udatamodel() == DATAMODEL_NATIVE) {
   3033 		(void) sulword(uaddr, (ulong_t)-1);
   3034 #ifdef _SYSCALL32_IMPL
   3035 	} else {
   3036 		(void) suword32(uaddr, (uint32_t)-1);
   3037 #endif
   3038 	}
   3039 }
   3040 
   3041 int
   3042 lwp_mutex_trylock(lwp_mutex_t *lp, uintptr_t owner)
   3043 {
   3044 	kthread_t *t = curthread;
   3045 	proc_t *p = ttoproc(t);
   3046 	int error = 0;
   3047 	volatile int locked = 0;
   3048 	volatile int watched = 0;
   3049 	label_t ljb;
   3050 	volatile uint8_t type = 0;
   3051 	uint16_t flag;
   3052 	lwpchan_t lwpchan;
   3053 
   3054 	if ((caddr_t)lp >= p->p_as->a_userlimit)
   3055 		return (set_errno(EFAULT));
   3056 
   3057 	(void) new_mstate(t, LMS_USER_LOCK);
   3058 
   3059 	if (on_fault(&ljb)) {
   3060 		if (locked)
   3061 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
   3062 		error = EFAULT;
   3063 		goto out;
   3064 	}
   3065 	/*
   3066 	 * Force Copy-on-write if necessary and ensure that the
   3067 	 * synchronization object resides in read/write memory.
   3068 	 * Cause an EFAULT return now if this is not so.
   3069 	 */
   3070 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
   3071 	suword8_noerr(&lp->mutex_type, type);
   3072 	if (UPIMUTEX(type)) {
   3073 		no_fault();
   3074 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
   3075 		if (error == 0 || error == EOWNERDEAD || error == ELOCKUNMAPPED)
   3076 			set_owner_pid(lp, owner,
   3077 			    (type & USYNC_PROCESS)? p->p_pid : 0);
   3078 		if (error)
   3079 			return (set_errno(error));
   3080 		return (0);
   3081 	}
   3082 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
   3083 	    &lwpchan, LWPCHAN_MPPOOL)) {
   3084 		error = EFAULT;
   3085 		goto out;
   3086 	}
   3087 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
   3088 	locked = 1;
   3089 	if (type & LOCK_ROBUST) {
   3090 		fuword16_noerr(&lp->mutex_flag, &flag);
   3091 		if (flag & LOCK_NOTRECOVERABLE) {
   3092 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
   3093 			error =  ENOTRECOVERABLE;
   3094 			goto out;
   3095 		}
   3096 	}
   3097 
   3098 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   3099 
   3100 	if (!ulock_try(&lp->mutex_lockw))
   3101 		error = EBUSY;
   3102 	else {
   3103 		set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
   3104 		if (type & LOCK_ROBUST) {
   3105 			fuword16_noerr(&lp->mutex_flag, &flag);
   3106 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
   3107 				if (flag & LOCK_OWNERDEAD)
   3108 					error = EOWNERDEAD;
   3109 				else if (type & USYNC_PROCESS_ROBUST)
   3110 					error = ELOCKUNMAPPED;
   3111 				else
   3112 					error = EOWNERDEAD;
   3113 			}
   3114 		}
   3115 	}
   3116 	locked = 0;
   3117 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
   3118 out:
   3119 
   3120 	if (t->t_mstate == LMS_USER_LOCK)
   3121 		(void) new_mstate(t, LMS_SYSTEM);
   3122 
   3123 	no_fault();
   3124 	if (watched)
   3125 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   3126 	if (error)
   3127 		return (set_errno(error));
   3128 	return (0);
   3129 }
   3130 
   3131 /*
   3132  * unlock the mutex and unblock lwps that is trying to acquire this mutex.
   3133  * the blocked lwp resumes and retries to acquire the lock.
   3134  */
   3135 int
   3136 lwp_mutex_unlock(lwp_mutex_t *lp)
   3137 {
   3138 	proc_t *p = ttoproc(curthread);
   3139 	lwpchan_t lwpchan;
   3140 	uchar_t waiters;
   3141 	volatile int locked = 0;
   3142 	volatile int watched = 0;
   3143 	volatile uint8_t type = 0;
   3144 	label_t ljb;
   3145 	uint16_t flag;
   3146 	int error = 0;
   3147 
   3148 	if ((caddr_t)lp >= p->p_as->a_userlimit)
   3149 		return (set_errno(EFAULT));
   3150 
   3151 	if (on_fault(&ljb)) {
   3152 		if (locked)
   3153 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
   3154 		error = EFAULT;
   3155 		goto out;
   3156 	}
   3157 
   3158 	/*
   3159 	 * Force Copy-on-write if necessary and ensure that the
   3160 	 * synchronization object resides in read/write memory.
   3161 	 * Cause an EFAULT return now if this is not so.
   3162 	 */
   3163 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
   3164 	suword8_noerr(&lp->mutex_type, type);
   3165 
   3166 	if (UPIMUTEX(type)) {
   3167 		no_fault();
   3168 		error = lwp_upimutex_unlock(lp, type);
   3169 		if (error)
   3170 			return (set_errno(error));
   3171 		return (0);
   3172 	}
   3173 
   3174 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   3175 
   3176 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
   3177 	    &lwpchan, LWPCHAN_MPPOOL)) {
   3178 		error = EFAULT;
   3179 		goto out;
   3180 	}
   3181 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
   3182 	locked = 1;
   3183 	if (type & LOCK_ROBUST) {
   3184 		fuword16_noerr(&lp->mutex_flag, &flag);
   3185 		if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
   3186 			flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
   3187 			flag |= LOCK_NOTRECOVERABLE;
   3188 			suword16_noerr(&lp->mutex_flag, flag);
   3189 		}
   3190 	}
   3191 	set_owner_pid(lp, 0, 0);
   3192 	ulock_clear(&lp->mutex_lockw);
   3193 	/*
   3194 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
   3195 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
   3196 	 * may fail.  If it fails, do not write into the waiter bit.
   3197 	 * The call to lwp_release() might fail due to one of three reasons:
   3198 	 *
   3199 	 * 	1. due to the thread which set the waiter bit not actually
   3200 	 *	   sleeping since it got the lock on the re-try. The waiter
   3201 	 *	   bit will then be correctly updated by that thread. This
   3202 	 *	   window may be closed by reading the wait bit again here
   3203 	 *	   and not calling lwp_release() at all if it is zero.
   3204 	 *	2. the thread which set the waiter bit and went to sleep
   3205 	 *	   was woken up by a signal. This time, the waiter recomputes
   3206 	 *	   the wait bit in the return with EINTR code.
   3207 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
   3208 	 *	   memory that has been re-used after the lock was dropped.
   3209 	 *	   In this case, writing into the waiter bit would cause data
   3210 	 *	   corruption.
   3211 	 */
   3212 	fuword8_noerr(&lp->mutex_waiters, &waiters);
   3213 	if (waiters) {
   3214 		if ((type & LOCK_ROBUST) &&
   3215 		    (flag & LOCK_NOTRECOVERABLE)) {
   3216 			lwp_release_all(&lwpchan);
   3217 			suword8_noerr(&lp->mutex_waiters, 0);
   3218 		} else if (lwp_release(&lwpchan, &waiters, 0)) {
   3219 			suword8_noerr(&lp->mutex_waiters, waiters);
   3220 		}
   3221 	}
   3222 
   3223 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
   3224 out:
   3225 	no_fault();
   3226 	if (watched)
   3227 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
   3228 	if (error)
   3229 		return (set_errno(error));
   3230 	return (0);
   3231 }
   3232