Home | History | Annotate | Download | only in syscall
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * Inter-Process Communication Semaphore Facility.
     31  *
     32  * See os/ipc.c for a description of common IPC functionality.
     33  *
     34  * Resource controls
     35  * -----------------
     36  *
     37  * Control:      zone.max-sem-ids (rc_zone_semmni)
     38  * Description:  Maximum number of semaphore ids allowed a zone.
     39  *
     40  *   When semget() is used to allocate a semaphore set, one id is
     41  *   allocated.  If the id allocation doesn't succeed, semget() fails
     42  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
     43  *   the id is deallocated.
     44  *
     45  * Control:      project.max-sem-ids (rc_project_semmni)
     46  * Description:  Maximum number of semaphore ids allowed a project.
     47  *
     48  *   When semget() is used to allocate a semaphore set, one id is
     49  *   allocated.  If the id allocation doesn't succeed, semget() fails
     50  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
     51  *   the id is deallocated.
     52  *
     53  * Control:      process.max-sem-nsems (rc_process_semmsl)
     54  * Description:  Maximum number of semaphores allowed per semaphore set.
     55  *
     56  *   When semget() is used to allocate a semaphore set, the size of the
     57  *   set is compared with this limit.  If the number of semaphores
     58  *   exceeds the limit, semget() fails and errno is set to EINVAL.
     59  *
     60  * Control:      process.max-sem-ops (rc_process_semopm)
     61  * Description:  Maximum number of semaphore operations allowed per
     62  *               semop call.
     63  *
     64  *   When semget() successfully allocates a semaphore set, the minimum
     65  *   enforced value of this limit is used to initialize the
     66  *   "system-imposed maximum" number of operations a semop() call for
     67  *   this set can perform.
     68  *
     69  * Undo structures
     70  * ---------------
     71  *
     72  * Removing the undo structure tunables involved a serious redesign of
     73  * how they were implemented.  There is now one undo structure for
     74  * every process/semaphore array combination (lazily allocated, of
     75  * course), and each is equal in size to the semaphore it corresponds
     76  * to.  To avoid scalability and performance problems, the undo
     77  * structures are stored in two places: a per-process AVL tree sorted
     78  * by ksemid pointer (p_semacct, protected by p_lock) and an unsorted
     79  * per-semaphore linked list (sem_undos, protected by the semaphore's
     80  * ID lock).  The former is used by semop, where a lookup is performed
     81  * once and cached if SEM_UNDO is specified for any of the operations,
     82  * and at process exit where the undoable operations are rolled back.
     83  * The latter is used when removing the semaphore, so the undo
     84  * structures can be removed from the appropriate processes' trees.
     85  *
     86  * The undo structure itself contains pointers to the ksemid and proc
     87  * to which it corresponds, a list node, an AVL node, and an array of
     88  * adjust-on-exit (AOE) values.  When an undo structure is allocated it
     89  * is immediately added to both the process's tree and the semaphore's
     90  * list.  Lastly, the reference count on the semaphore is increased.
     91  *
     92  * Avoiding a lock ordering violation between p_lock and the ID lock,
     93  * wont to occur when there is a race between a process exiting and the
     94  * removal of a semaphore, mandates the delicate dance that exists
     95  * between semexit and sem_rmid.
     96  *
 * sem_rmid, holding the ID lock, unlinks each undo structure from the
 * semaphore's list, then takes the appropriate process's p_lock and
 * checks to see if p_semacct is NULL.  If it is, the process is
 * exiting and semexit has taken over that undo structure, so sem_rmid
 * leaves it alone and continues to the next.  Otherwise, it removes
 * the undo structure from the AVL tree, frees it, and releases the
 * hold that the undo structure had on the semaphore.
    103  *
    104  * The important other half of this is semexit, which will immediately
    105  * take p_lock, obtain the AVL pointer, clear p_semacct, and drop
    106  * p_lock.  From this point on it is semexit's responsibility to clean
    107  * up all undo structures found in the tree -- a coexecuting sem_rmid
    108  * will see the NULL p_semacct and skip that undo structure.  It walks
    109  * the AVL tree (using avl_destroy_nodes) and for each undo structure
    110  * takes the appropriate semaphore's ID lock (always legal since the
    111  * undo structure has a hold on the semaphore), updates all semaphores
    112  * with non-zero AOE values, and removes the structure from the
    113  * semaphore's list.  It then drops the structure's reference on the
    114  * semaphore, drops the ID lock, and frees the undo structure.
    115  */
    116 
    117 #include <sys/types.h>
    118 #include <sys/t_lock.h>
    119 #include <sys/param.h>
    120 #include <sys/systm.h>
    121 #include <sys/sysmacros.h>
    122 #include <sys/cred.h>
    123 #include <sys/vmem.h>
    124 #include <sys/kmem.h>
    125 #include <sys/errno.h>
    126 #include <sys/time.h>
    127 #include <sys/ipc.h>
    128 #include <sys/ipc_impl.h>
    129 #include <sys/sem.h>
    130 #include <sys/sem_impl.h>
    131 #include <sys/user.h>
    132 #include <sys/proc.h>
    133 #include <sys/cpuvar.h>
    134 #include <sys/debug.h>
    135 #include <sys/var.h>
    136 #include <sys/cmn_err.h>
    137 #include <sys/modctl.h>
    138 #include <sys/syscall.h>
    139 #include <sys/avl.h>
    140 #include <sys/list.h>
    141 #include <sys/zone.h>
    142 
    143 #include <c2/audit.h>
    144 
/*
 * Resource control handles, defined outside this file.  They back the
 * zone.max-sem-ids, project.max-sem-ids, process.max-sem-nsems and
 * process.max-sem-ops controls described at the top of this file.
 */
extern rctl_hndl_t rc_zone_semmni;
extern rctl_hndl_t rc_project_semmni;
extern rctl_hndl_t rc_process_semmsl;
extern rctl_hndl_t rc_process_semopm;
/* IPC service handle for semaphore ids; created by ipcs_create() in _init(). */
static ipc_service_t *sem_svc;
/* Zone key created in _init(); its callback there is sem_remove_zone(). */
static zone_key_t sem_zone_key;
    151 
/*
 * The following tunables are obsolete.  Though for compatibility we
 * still read and interpret seminfo_semmsl, seminfo_semopm and
 * seminfo_semmni (see os/project.c and os/rctl_proc.c), the preferred
 * mechanism for administering the IPC Semaphore facility is through
 * the resource controls described at the top of this file.
 *
 * The variables are non-static, so they remain visible outside this
 * file; the values below are only their initial settings.
 */
int seminfo_semaem = 16384;	/* (obsolete) */
int seminfo_semmap = 10;	/* (obsolete) */
int seminfo_semmni = 10;	/* (obsolete) */
int seminfo_semmns = 60;	/* (obsolete) */
int seminfo_semmnu = 30;	/* (obsolete) */
int seminfo_semmsl = 25;	/* (obsolete) */
int seminfo_semopm = 10;	/* (obsolete) */
int seminfo_semume = 10;	/* (obsolete) */
int seminfo_semusz = 96;	/* (obsolete) */
int seminfo_semvmx = 32767;	/* (obsolete) */
    169 
    170 #define	SEM_MAXUCOPS	4096	/* max # of unchecked ops per semop call */
    171 #define	SEM_UNDOSZ(n)	(sizeof (struct sem_undo) + (n - 1) * sizeof (int))
    172 
/* Forward declarations for routines referenced before they are defined. */
static int semsys(int opcode, uintptr_t a0, uintptr_t a1,
    uintptr_t a2, uintptr_t a3);
static void sem_dtor(kipc_perm_t *);
static void sem_rmid(kipc_perm_t *);
static void sem_remove_zone(zoneid_t, void *);

/*
 * System call table entry whose handler is semsys().
 * NOTE(review): the leading 5 is presumably the argument count, matching
 * the five parameters of semsys() -- confirm against struct sysent.
 */
static struct sysent ipcsem_sysent = {
	5,
	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
	semsys
};

/*
 * Module linkage information for the kernel.
 */
static struct modlsys modlsys = {
	&mod_syscallops, "System V semaphore facility", &ipcsem_sysent
};

#ifdef _SYSCALL32_IMPL
/* Second linkage entry using mod_syscallops32; shares the same sysent. */
static struct modlsys modlsys32 = {
	&mod_syscallops32, "32-bit System V semaphore facility", &ipcsem_sysent
};
#endif

/* NULL-terminated list of the linkage structures above. */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
    206 
    207 
    208 int
    209 _init(void)
    210 {
    211 	int result;
    212 
    213 	sem_svc = ipcs_create("semids", rc_project_semmni, rc_zone_semmni,
    214 	    sizeof (ksemid_t), sem_dtor, sem_rmid, AT_IPC_SEM,
    215 	    offsetof(ipc_rqty_t, ipcq_semmni));
    216 	zone_key_create(&sem_zone_key, NULL, sem_remove_zone, NULL);
    217 
    218 	if ((result = mod_install(&modlinkage)) == 0)
    219 		return (0);
    220 
    221 	(void) zone_key_delete(sem_zone_key);
    222 	ipcs_destroy(sem_svc);
    223 
    224 	return (result);
    225 }
    226 
/*
 * _fini - module unload entry point.
 *
 * Always returns EBUSY and performs no teardown.
 * NOTE(review): presumably this is how unloading is refused, in line
 * with the SE_NOUNLOAD flag in ipcsem_sysent -- confirm.
 */
int
_fini(void)
{
	return (EBUSY);
}
    232 
    233 int
    234 _info(struct modinfo *modinfop)
    235 {
    236 	return (mod_info(&modlinkage, modinfop));
    237 }
    238 
    239 static void
    240 sem_dtor(kipc_perm_t *perm)
    241 {
    242 	ksemid_t *sp = (ksemid_t *)perm;
    243 
    244 	kmem_free(sp->sem_base,
    245 	    P2ROUNDUP(sp->sem_nsems * sizeof (struct sem), 64));
    246 	list_destroy(&sp->sem_undos);
    247 }
    248 
    249 /*
    250  * sem_undo_add - Create or update adjust on exit entry.
    251  */
    252 static int
    253 sem_undo_add(short val, ushort_t num, struct sem_undo *undo)
    254 {
    255 	int newval = undo->un_aoe[num] - val;
    256 
    257 	if (newval > USHRT_MAX || newval < -USHRT_MAX)
    258 		return (ERANGE);
    259 	undo->un_aoe[num] = newval;
    260 
    261 	return (0);
    262 }
    263 
    264 /*
    265  * sem_undo_clear - clears all undo entries for specified semaphores
    266  *
    267  * Used when semaphores are reset by SETVAL or SETALL.
    268  */
    269 static void
    270 sem_undo_clear(ksemid_t *sp, ushort_t low, ushort_t high)
    271 {
    272 	struct sem_undo *undo;
    273 	int i;
    274 
    275 	ASSERT(low <= high);
    276 	ASSERT(high < sp->sem_nsems);
    277 
    278 	for (undo = list_head(&sp->sem_undos); undo;
    279 	    undo = list_next(&sp->sem_undos, undo))
    280 		for (i = low; i <= high; i++)
    281 			undo->un_aoe[i] = 0;
    282 }
    283 
    284 /*
    285  * sem_rollback - roll back work done so far if unable to complete operation
    286  */
    287 static void
    288 sem_rollback(ksemid_t *sp, struct sembuf *op, int n, struct sem_undo *undo)
    289 {
    290 	struct sem *semp;	/* semaphore ptr */
    291 
    292 	for (op += n - 1; n--; op--) {
    293 		if (op->sem_op == 0)
    294 			continue;
    295 		semp = &sp->sem_base[op->sem_num];
    296 		semp->semval -= op->sem_op;
    297 		if (op->sem_flg & SEM_UNDO) {
    298 			ASSERT(undo != NULL);
    299 			(void) sem_undo_add(-op->sem_op, op->sem_num, undo);
    300 		}
    301 	}
    302 }
    303 
    304 static void
    305 sem_rmid(kipc_perm_t *perm)
    306 {
    307 	ksemid_t *sp = (ksemid_t *)perm;
    308 	struct sem *semp;
    309 	struct sem_undo *undo;
    310 	size_t size = SEM_UNDOSZ(sp->sem_nsems);
    311 	int i;
    312 
    313 	/*LINTED*/
    314 	while (undo = list_head(&sp->sem_undos)) {
    315 		list_remove(&sp->sem_undos, undo);
    316 		mutex_enter(&undo->un_proc->p_lock);
    317 		if (undo->un_proc->p_semacct == NULL) {
    318 			mutex_exit(&undo->un_proc->p_lock);
    319 			continue;
    320 		}
    321 		avl_remove(undo->un_proc->p_semacct, undo);
    322 		mutex_exit(&undo->un_proc->p_lock);
    323 		kmem_free(undo, size);
    324 		ipc_rele_locked(sem_svc, (kipc_perm_t *)sp);
    325 	}
    326 
    327 	for (i = 0; i < sp->sem_nsems; i++) {
    328 		semp = &sp->sem_base[i];
    329 		semp->semval = semp->sempid = 0;
    330 		if (semp->semncnt) {
    331 			cv_broadcast(&semp->semncnt_cv);
    332 			semp->semncnt = 0;
    333 		}
    334 		if (semp->semzcnt) {
    335 			cv_broadcast(&semp->semzcnt_cv);
    336 			semp->semzcnt = 0;
    337 		}
    338 	}
    339 }
    340 
    341 /*
    342  * semctl - Semctl system call.
    343  */
    344 static int
    345 semctl(int semid, uint_t semnum, int cmd, uintptr_t arg)
    346 {
    347 	ksemid_t		*sp;	/* ptr to semaphore header */
    348 	struct sem		*p;	/* ptr to semaphore */
    349 	unsigned int		i;	/* loop control */
    350 	ushort_t		*vals, *vp;
    351 	size_t			vsize = 0;
    352 	int			error = 0;
    353 	int			retval = 0;
    354 	struct cred		*cr;
    355 	kmutex_t		*lock;
    356 	model_t			mdl = get_udatamodel();
    357 	STRUCT_DECL(semid_ds, sid);
    358 	struct semid_ds64	ds64;
    359 
    360 	STRUCT_INIT(sid, mdl);
    361 	cr = CRED();
    362 
    363 	/*
    364 	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
    365 	 */
    366 	switch (cmd) {
    367 	case IPC_SET:
    368 		if (copyin((void *)arg, STRUCT_BUF(sid), STRUCT_SIZE(sid)))
    369 			return (set_errno(EFAULT));
    370 		break;
    371 
    372 	case IPC_SET64:
    373 		if (copyin((void *)arg, &ds64, sizeof (struct semid_ds64)))
    374 			return (set_errno(EFAULT));
    375 		break;
    376 
    377 	case SETALL:
    378 		if ((lock = ipc_lookup(sem_svc, semid,
    379 		    (kipc_perm_t **)&sp)) == NULL)
    380 			return (set_errno(EINVAL));
    381 		vsize = sp->sem_nsems * sizeof (*vals);
    382 		mutex_exit(lock);
    383 
    384 		/* allocate space to hold all semaphore values */
    385 		vals = kmem_alloc(vsize, KM_SLEEP);
    386 
    387 		if (copyin((void *)arg, vals, vsize)) {
    388 			kmem_free(vals, vsize);
    389 			return (set_errno(EFAULT));
    390 		}
    391 		break;
    392 
    393 	case IPC_RMID:
    394 		if (error = ipc_rmid(sem_svc, semid, cr))
    395 			return (set_errno(error));
    396 		return (0);
    397 	}
    398 
    399 	if ((lock = ipc_lookup(sem_svc, semid, (kipc_perm_t **)&sp)) == NULL) {
    400 		if (vsize != 0)
    401 			kmem_free(vals, vsize);
    402 		return (set_errno(EINVAL));
    403 	}
    404 	switch (cmd) {
    405 	/* Set ownership and permissions. */
    406 	case IPC_SET:
    407 
    408 		if (error = ipcperm_set(sem_svc, cr, &sp->sem_perm,
    409 		    &STRUCT_BUF(sid)->sem_perm, mdl)) {
    410 			mutex_exit(lock);
    411 			return (set_errno(error));
    412 		}
    413 		sp->sem_ctime = gethrestime_sec();
    414 		mutex_exit(lock);
    415 		return (0);
    416 
    417 	/* Get semaphore data structure. */
    418 	case IPC_STAT:
    419 
    420 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    421 			mutex_exit(lock);
    422 			return (set_errno(error));
    423 		}
    424 
    425 		ipcperm_stat(&STRUCT_BUF(sid)->sem_perm, &sp->sem_perm, mdl);
    426 		STRUCT_FSETP(sid, sem_base, NULL);	/* kernel addr */
    427 		STRUCT_FSET(sid, sem_nsems, sp->sem_nsems);
    428 		STRUCT_FSET(sid, sem_otime, sp->sem_otime);
    429 		STRUCT_FSET(sid, sem_ctime, sp->sem_ctime);
    430 		STRUCT_FSET(sid, sem_binary, sp->sem_binary);
    431 		mutex_exit(lock);
    432 
    433 		if (copyout(STRUCT_BUF(sid), (void *)arg, STRUCT_SIZE(sid)))
    434 			return (set_errno(EFAULT));
    435 		return (0);
    436 
    437 	case IPC_SET64:
    438 
    439 		if (error = ipcperm_set64(sem_svc, cr, &sp->sem_perm,
    440 		    &ds64.semx_perm)) {
    441 			mutex_exit(lock);
    442 			return (set_errno(error));
    443 		}
    444 		sp->sem_ctime = gethrestime_sec();
    445 		mutex_exit(lock);
    446 		return (0);
    447 
    448 	case IPC_STAT64:
    449 
    450 		ipcperm_stat64(&ds64.semx_perm, &sp->sem_perm);
    451 		ds64.semx_nsems = sp->sem_nsems;
    452 		ds64.semx_otime = sp->sem_otime;
    453 		ds64.semx_ctime = sp->sem_ctime;
    454 
    455 		mutex_exit(lock);
    456 		if (copyout(&ds64, (void *)arg, sizeof (struct semid_ds64)))
    457 			return (set_errno(EFAULT));
    458 
    459 		return (0);
    460 
    461 	/* Get # of processes sleeping for greater semval. */
    462 	case GETNCNT:
    463 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    464 			mutex_exit(lock);
    465 			return (set_errno(error));
    466 		}
    467 		if (semnum >= sp->sem_nsems) {
    468 			mutex_exit(lock);
    469 			return (set_errno(EINVAL));
    470 		}
    471 		retval = sp->sem_base[semnum].semncnt;
    472 		mutex_exit(lock);
    473 		return (retval);
    474 
    475 	/* Get pid of last process to operate on semaphore. */
    476 	case GETPID:
    477 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    478 			mutex_exit(lock);
    479 			return (set_errno(error));
    480 		}
    481 		if (semnum >= sp->sem_nsems) {
    482 			mutex_exit(lock);
    483 			return (set_errno(EINVAL));
    484 		}
    485 		retval = sp->sem_base[semnum].sempid;
    486 		mutex_exit(lock);
    487 		return (retval);
    488 
    489 	/* Get semval of one semaphore. */
    490 	case GETVAL:
    491 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    492 			mutex_exit(lock);
    493 			return (set_errno(error));
    494 		}
    495 		if (semnum >= sp->sem_nsems) {
    496 			mutex_exit(lock);
    497 			return (set_errno(EINVAL));
    498 		}
    499 		retval = sp->sem_base[semnum].semval;
    500 		mutex_exit(lock);
    501 		return (retval);
    502 
    503 	/* Get all semvals in set. */
    504 	case GETALL:
    505 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    506 			mutex_exit(lock);
    507 			return (set_errno(error));
    508 		}
    509 
    510 		/* allocate space to hold all semaphore values */
    511 		vsize = sp->sem_nsems * sizeof (*vals);
    512 		vals = vp = kmem_alloc(vsize, KM_SLEEP);
    513 
    514 		for (i = sp->sem_nsems, p = sp->sem_base; i--; p++, vp++)
    515 			bcopy(&p->semval, vp, sizeof (p->semval));
    516 
    517 		mutex_exit(lock);
    518 
    519 		if (copyout((void *)vals, (void *)arg, vsize)) {
    520 			kmem_free(vals, vsize);
    521 			return (set_errno(EFAULT));
    522 		}
    523 
    524 		kmem_free(vals, vsize);
    525 		return (0);
    526 
    527 	/* Get # of processes sleeping for semval to become zero. */
    528 	case GETZCNT:
    529 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    530 			mutex_exit(lock);
    531 			return (set_errno(error));
    532 		}
    533 		if (semnum >= sp->sem_nsems) {
    534 			mutex_exit(lock);
    535 			return (set_errno(EINVAL));
    536 		}
    537 		retval = sp->sem_base[semnum].semzcnt;
    538 		mutex_exit(lock);
    539 		return (retval);
    540 
    541 	/* Set semval of one semaphore. */
    542 	case SETVAL:
    543 		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
    544 			mutex_exit(lock);
    545 			return (set_errno(error));
    546 		}
    547 		if (semnum >= sp->sem_nsems) {
    548 			mutex_exit(lock);
    549 			return (set_errno(EINVAL));
    550 		}
    551 		if ((uint_t)arg > USHRT_MAX) {
    552 			mutex_exit(lock);
    553 			return (set_errno(ERANGE));
    554 		}
    555 		p = &sp->sem_base[semnum];
    556 		if ((p->semval = (ushort_t)arg) != 0) {
    557 			if (p->semncnt) {
    558 				cv_broadcast(&p->semncnt_cv);
    559 			}
    560 		} else if (p->semzcnt) {
    561 			cv_broadcast(&p->semzcnt_cv);
    562 		}
    563 		p->sempid = curproc->p_pid;
    564 		sem_undo_clear(sp, (ushort_t)semnum, (ushort_t)semnum);
    565 		mutex_exit(lock);
    566 		return (0);
    567 
    568 	/* Set semvals of all semaphores in set. */
    569 	case SETALL:
    570 		/* Check if semaphore set has been deleted and reallocated. */
    571 		if (sp->sem_nsems * sizeof (*vals) != vsize) {
    572 			error = set_errno(EINVAL);
    573 			goto seterr;
    574 		}
    575 		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
    576 			error = set_errno(error);
    577 			goto seterr;
    578 		}
    579 		sem_undo_clear(sp, 0, sp->sem_nsems - 1);
    580 		for (i = 0, p = sp->sem_base; i < sp->sem_nsems;
    581 		    (p++)->sempid = curproc->p_pid) {
    582 			if ((p->semval = vals[i++]) != 0) {
    583 				if (p->semncnt) {
    584 					cv_broadcast(&p->semncnt_cv);
    585 				}
    586 			} else if (p->semzcnt) {
    587 				cv_broadcast(&p->semzcnt_cv);
    588 			}
    589 		}
    590 seterr:
    591 		mutex_exit(lock);
    592 		kmem_free(vals, vsize);
    593 		return (error);
    594 
    595 	default:
    596 		mutex_exit(lock);
    597 		return (set_errno(EINVAL));
    598 	}
    599 
    600 	/* NOTREACHED */
    601 }
    602 
    603 /*
    604  * semexit - Called by exit() to clean up on process exit.
    605  */
    606 void
    607 semexit(proc_t *pp)
    608 {
    609 	avl_tree_t	*tree;
    610 	struct sem_undo	*undo;
    611 	void		*cookie = NULL;
    612 
    613 	mutex_enter(&pp->p_lock);
    614 	tree = pp->p_semacct;
    615 	pp->p_semacct = NULL;
    616 	mutex_exit(&pp->p_lock);
    617 
    618 	while (undo = avl_destroy_nodes(tree, &cookie)) {
    619 		ksemid_t *sp = undo->un_sp;
    620 		size_t size = SEM_UNDOSZ(sp->sem_nsems);
    621 		int i;
    622 
    623 		(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
    624 		if (!IPC_FREE(&sp->sem_perm)) {
    625 			for (i = 0; i < sp->sem_nsems; i++) {
    626 				int adj = undo->un_aoe[i];
    627 				if (adj) {
    628 					struct sem *semp = &sp->sem_base[i];
    629 					int v = (int)semp->semval + adj;
    630 
    631 					if (v < 0 || v > USHRT_MAX)
    632 						continue;
    633 					semp->semval = (ushort_t)v;
    634 					if (v == 0 && semp->semzcnt)
    635 						cv_broadcast(&semp->semzcnt_cv);
    636 					if (adj > 0 && semp->semncnt)
    637 						cv_broadcast(&semp->semncnt_cv);
    638 				}
    639 			}
    640 			list_remove(&sp->sem_undos, undo);
    641 		}
    642 		ipc_rele(sem_svc, (kipc_perm_t *)sp);
    643 		kmem_free(undo, size);
    644 	}
    645 
    646 	avl_destroy(tree);
    647 	kmem_free(tree, sizeof (avl_tree_t));
    648 }
    649 
/*
 * Remove all semaphores associated with a given zone.  Called by
 * zone_shutdown when the zone is halted.
 *
 * Registered as a callback with zone_key_create() in _init().  The work
 * is delegated to ipc_remove_zone() for the semaphore service; the arg
 * parameter is unused.
 */
/*ARGSUSED1*/
static void
sem_remove_zone(zoneid_t zoneid, void *arg)
{
	ipc_remove_zone(sem_svc, zoneid);
}
    660 
/*
 * semget - Semget system call.
 *
 * key		IPC key to look up or create
 * nsems	number of semaphores requested
 * semflg	flags, passed through to ipc_get() and ipc_commit_begin()
 *
 * Returns the semaphore set's id on success.  On failure returns the
 * result of set_errno(): the error reported by ipc_get() or
 * ipc_commit_begin(), or EINVAL when nsems is unacceptable (negative or
 * larger than an existing set; non-positive or denied by the
 * process.max-sem-nsems resource control for a new set).
 */
static int
semget(key_t key, int nsems, int semflg)
{
	ksemid_t	*sp;
	kmutex_t	*lock;
	int		id, error;
	proc_t		*pp = curproc;

top:
	/* On success ipc_get() hands back the entry and its lock. */
	if (error = ipc_get(sem_svc, key, semflg, (kipc_perm_t **)&sp, &lock))
		return (set_errno(error));

	if (!IPC_FREE(&sp->sem_perm)) {
		/*
		 * A semaphore with the requested key exists.
		 */
		if (!((nsems >= 0) && (nsems <= sp->sem_nsems))) {
			mutex_exit(lock);
			return (set_errno(EINVAL));
		}
	} else {
		/*
		 * This is a new semaphore set.  Finish initialization.
		 *
		 * NOTE(review): p_lock is released below without a visible
		 * mutex_enter, so ipc_get() presumably returns with it held
		 * in this case -- confirm in os/ipc.c.
		 */
		if (nsems <= 0 || (rctl_test(rc_process_semmsl, pp->p_rctls, pp,
		    nsems, RCA_SAFE) & RCT_DENY)) {
			mutex_exit(lock);
			mutex_exit(&pp->p_lock);
			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
			return (set_errno(EINVAL));
		}
		/* Drop both locks before the sleeping allocation below. */
		mutex_exit(lock);
		mutex_exit(&pp->p_lock);

		/*
		 * We round the allocation up to coherency granularity
		 * so that multiple semaphore allocations won't result
		 * in the false sharing of their sem structures.
		 */
		sp->sem_base =
		    kmem_zalloc(P2ROUNDUP(nsems * sizeof (struct sem), 64),
		    KM_SLEEP);
		sp->sem_binary = (nsems == 1);
		sp->sem_nsems = (ushort_t)nsems;
		sp->sem_ctime = gethrestime_sec();
		sp->sem_otime = 0;
		list_create(&sp->sem_undos, sizeof (struct sem_undo),
		    offsetof(struct sem_undo, un_list));

		/* EAGAIN from ipc_commit_begin() restarts the whole lookup. */
		if (error = ipc_commit_begin(sem_svc, key, semflg,
		    (kipc_perm_t *)sp)) {
			if (error == EAGAIN)
				goto top;
			return (set_errno(error));
		}
		sp->sem_maxops =
		    rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
		/*
		 * The semmsl control is tested a second time here, after
		 * the locks were dropped and the commit begun.
		 */
		if (rctl_test(rc_process_semmsl, pp->p_rctls, pp, nsems,
		    RCA_SAFE) & RCT_DENY) {
			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
			return (set_errno(EINVAL));
		}
		/* ipc_commit_end() returns the lock released at the end. */
		lock = ipc_commit_end(sem_svc, &sp->sem_perm);
	}
	if (audit_active)
		audit_ipcget(AT_IPC_SEM, (void *)sp);
	id = sp->sem_perm.ipc_id;
	mutex_exit(lock);
	return (id);
}
    734 
    735 /*
    736  * semids system call.
    737  */
    738 static int
    739 semids(int *buf, uint_t nids, uint_t *pnids)
    740 {
    741 	int error;
    742 
    743 	if (error = ipc_ids(sem_svc, buf, nids, pnids))
    744 		return (set_errno(error));
    745 
    746 	return (0);
    747 }
    748 
    749 
    750 /*
    751  * Helper function for semop - copies in the provided timespec and
    752  * computes the absolute future time after which we must return.
    753  */
    754 static int
    755 compute_timeout(timespec_t **tsp, timespec_t *ts, timespec_t *now,
    756 	timespec_t *timeout)
    757 {
    758 	model_t datamodel = get_udatamodel();
    759 
    760 	if (datamodel == DATAMODEL_NATIVE) {
    761 		if (copyin(timeout, ts, sizeof (timespec_t)))
    762 			return (EFAULT);
    763 	} else {
    764 		timespec32_t ts32;
    765 
    766 		if (copyin(timeout, &ts32, sizeof (timespec32_t)))
    767 			return (EFAULT);
    768 		TIMESPEC32_TO_TIMESPEC(ts, &ts32)
    769 	}
    770 
    771 	if (itimerspecfix(ts))
    772 		return (EINVAL);
    773 
    774 	/*
    775 	 * Convert the timespec value into absolute time.
    776 	 */
    777 	timespecadd(ts, now);
    778 	*tsp = ts;
    779 
    780 	return (0);
    781 }
    782 
    783 /*
    784  * Undo structure comparator.  We sort based on ksemid_t pointer.
    785  */
    786 static int
    787 sem_undo_compar(const void *x, const void *y)
    788 {
    789 	struct sem_undo *undo1 = (struct sem_undo *)x;
    790 	struct sem_undo *undo2 = (struct sem_undo *)y;
    791 
    792 	if (undo1->un_sp < undo2->un_sp)
    793 		return (-1);
    794 	if (undo1->un_sp > undo2->un_sp)
    795 		return (1);
    796 	return (0);
    797 }
    798 
/*
 * Helper function for semop - creates an undo structure and adds it to
 * the process's avl tree and the semaphore's list.
 *
 * pp		process the undo structure belongs to
 * sp		semaphore set the undo structure is for
 * lock		in: the held ID lock, which is dropped for the allocations;
 *		out: the lock returned by ipc_lock() when it is retaken
 * template	search key passed to avl_find() on the process's tree
 * un		out: the undo structure to use -- the one already in the
 *		tree if avl_find() locates it, otherwise the new one
 *
 * Returns 0 on success.  Returns EIDRM, with the lock retaken and *un
 * untouched, if the set was freed while the lock was dropped.
 */
static int
sem_undo_alloc(proc_t *pp, ksemid_t *sp, kmutex_t **lock,
    struct sem_undo *template, struct sem_undo **un)
{
	size_t size;
	struct sem_undo *undo;
	avl_tree_t *tree = NULL;
	avl_index_t where;

	/* The KM_SLEEP allocations below are done without the ID lock. */
	mutex_exit(*lock);

	size = SEM_UNDOSZ(sp->sem_nsems);
	undo = kmem_zalloc(size, KM_SLEEP);
	undo->un_proc = pp;
	undo->un_sp = sp;

	/*
	 * Unlocked peek: allocate a tree in case one is needed.  The
	 * decision is re-checked under p_lock below.
	 */
	if (pp->p_semacct == NULL)
		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

	*lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
	if (IPC_FREE(&sp->sem_perm)) {
		/* Set was removed while unlocked; discard what we built. */
		kmem_free(undo, size);
		if (tree)
			kmem_free(tree, sizeof (avl_tree_t));
		return (EIDRM);
	}

	mutex_enter(&pp->p_lock);
	if (tree) {
		if (pp->p_semacct == NULL) {
			avl_create(tree, sem_undo_compar,
			    sizeof (struct sem_undo),
			    offsetof(struct sem_undo, un_avl));
			pp->p_semacct = tree;
		} else {
			/* A tree appeared in the meantime; ours is surplus. */
			kmem_free(tree, sizeof (avl_tree_t));
		}
	}

	if (*un = avl_find(pp->p_semacct, template, &where)) {
		/* An undo structure already exists; the new one is unused. */
		mutex_exit(&pp->p_lock);
		kmem_free(undo, size);
	} else {
		/*
		 * Publish the new structure in the tree (under p_lock) and
		 * on the set's list, and take a hold on the set for it.
		 */
		*un = undo;
		avl_insert(pp->p_semacct, undo, where);
		mutex_exit(&pp->p_lock);
		list_insert_head(&sp->sem_undos, undo);
		ipc_hold(sem_svc, (kipc_perm_t *)sp);
	}


	return (0);
}
    856 
    857 /*
    858  * semop - Semop system call.
    859  */
    860 static int
    861 semop(int semid, struct sembuf *sops, size_t nsops, timespec_t *timeout)
    862 {
    863 	ksemid_t	*sp = NULL;
    864 	kmutex_t	*lock;
    865 	struct sembuf	*op;	/* ptr to operation */
    866 	int		i;	/* loop control */
    867 	struct sem	*semp;	/* ptr to semaphore */
    868 	int 		error = 0;
    869 	struct sembuf	*uops;	/* ptr to copy of user ops */
    870 	struct sembuf 	x_sem;	/* avoid kmem_alloc's */
    871 	timespec_t	now, ts, *tsp = NULL;
    872 	int		timecheck = 0;
    873 	int		cvres, needundo, mode;
    874 	struct sem_undo	*undo;
    875 	proc_t		*pp = curproc;
    876 	int		held = 0;
    877 
    878 	CPU_STATS_ADDQ(CPU, sys, sema, 1); /* bump semaphore op count */
    879 
    880 	/*
    881 	 * To avoid the cost of copying in 'timeout' in the common
    882 	 * case, we could only grab the time here and defer the copyin
    883 	 * and associated computations until we are about to block.
    884 	 *
    885 	 * The down side to this is that we would then have to spin
    886 	 * some goto top nonsense to avoid the copyin behind the semid
    887 	 * lock.  As a common use of timed semaphores is as an explicit
    888 	 * blocking mechanism, this could incur a greater penalty.
    889 	 *
    890 	 * If we eventually decide that this would be a wise route to
    891 	 * take, the deferrable functionality is completely contained
    892 	 * in 'compute_timeout', and the interface is defined such that
    893 	 * we can legally not validate 'timeout' if it is unused.
    894 	 */
    895 	if (timeout != NULL) {
    896 		timecheck = timechanged;
    897 		gethrestime(&now);
    898 		if (error = compute_timeout(&tsp, &ts, &now, timeout))
    899 			return (set_errno(error));
    900 	}
    901 
    902 	/*
    903 	 * Allocate space to hold the vector of semaphore ops.  If
    904 	 * there is only 1 operation we use a preallocated buffer on
    905 	 * the stack for speed.
    906 	 *
    907 	 * Since we don't want to allow the user to allocate an
    908 	 * arbitrary amount of kernel memory, we need to check against
    909 	 * the number of operations allowed by the semaphore.  We only
    910 	 * bother doing this if the number of operations is larger than
    911 	 * SEM_MAXUCOPS.
    912 	 */
    913 	if (nsops == 1)
    914 		uops = &x_sem;
    915 	else if (nsops == 0)
    916 		return (0);
    917 	else if (nsops <= SEM_MAXUCOPS)
    918 		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
    919 
    920 	if (nsops > SEM_MAXUCOPS) {
    921 		if ((lock = ipc_lookup(sem_svc, semid,
    922 		    (kipc_perm_t **)&sp)) == NULL)
    923 			return (set_errno(EFAULT));
    924 
    925 		if (nsops > sp->sem_maxops) {
    926 			mutex_exit(lock);
    927 			return (set_errno(E2BIG));
    928 		}
    929 		held = 1;
    930 		ipc_hold(sem_svc, (kipc_perm_t *)sp);
    931 		mutex_exit(lock);
    932 
    933 		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
    934 		if (copyin(sops, uops, nsops * sizeof (*op))) {
    935 			error = EFAULT;
    936 			(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
    937 			goto semoperr;
    938 		}
    939 
    940 		lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
    941 		if (IPC_FREE(&sp->sem_perm)) {
    942 			error = EIDRM;
    943 			goto semoperr;
    944 		}
    945 	} else {
    946 		/*
    947 		 * This could be interleaved with the above code, but
    948 		 * keeping them separate improves readability.
    949 		 */
    950 		if (copyin(sops, uops, nsops * sizeof (*op))) {
    951 			error = EFAULT;
    952 			goto semoperr_unlocked;
    953 		}
    954 
    955 		if ((lock = ipc_lookup(sem_svc, semid,
    956 		    (kipc_perm_t **)&sp)) == NULL) {
    957 			error = EINVAL;
    958 			goto semoperr_unlocked;
    959 		}
    960 
    961 		if (nsops > sp->sem_maxops) {
    962 			error = E2BIG;
    963 			goto semoperr;
    964 		}
    965 	}
    966 
    967 	/*
    968 	 * Scan all operations.  Verify that sem #s are in range and
    969 	 * this process is allowed the requested operations.  If any
    970 	 * operations are marked SEM_UNDO, find (or allocate) the undo
    971 	 * structure for this process and semaphore.
    972 	 */
    973 	needundo = 0;
    974 	mode = 0;
    975 	for (i = 0, op = uops; i++ < nsops; op++) {
    976 		mode |= op->sem_op ? SEM_A : SEM_R;
    977 		if (op->sem_num >= sp->sem_nsems) {
    978 			error = EFBIG;
    979 			goto semoperr;
    980 		}
    981 		if ((op->sem_flg & SEM_UNDO) && op->sem_op)
    982 			needundo = 1;
    983 	}
    984 	if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
    985 		goto semoperr;
    986 
    987 	if (needundo) {
    988 		struct sem_undo template;
    989 
    990 		template.un_sp = sp;
    991 		mutex_enter(&pp->p_lock);
    992 		if (pp->p_semacct)
    993 			undo = avl_find(pp->p_semacct, &template, NULL);
    994 		else
    995 			undo = NULL;
    996 		mutex_exit(&pp->p_lock);
    997 		if (undo == NULL) {
    998 			if (!held) {
    999 				held = 1;
   1000 				ipc_hold(sem_svc, (kipc_perm_t *)sp);
   1001 			}
   1002 			if (error = sem_undo_alloc(pp, sp, &lock, &template,
   1003 			    &undo))
   1004 				goto semoperr;
   1005 
   1006 			/* sem_undo_alloc unlocks the semaphore */
   1007 			if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
   1008 				goto semoperr;
   1009 		}
   1010 	}
   1011 
   1012 check:
   1013 	/*
   1014 	 * Loop waiting for the operations to be satisfied atomically.
   1015 	 * Actually, do the operations and undo them if a wait is needed
   1016 	 * or an error is detected.
   1017 	 */
   1018 	for (i = 0; i < nsops; i++) {
   1019 		op = &uops[i];
   1020 		semp = &sp->sem_base[op->sem_num];
   1021 
   1022 		/*
   1023 		 * Raise the semaphore (i.e. sema_v)
   1024 		 */
   1025 		if (op->sem_op > 0) {
   1026 			if (op->sem_op + (int)semp->semval > USHRT_MAX ||
   1027 			    ((op->sem_flg & SEM_UNDO) &&
   1028 			    (error = sem_undo_add(op->sem_op, op->sem_num,
   1029 			    undo)))) {
   1030 				if (i)
   1031 					sem_rollback(sp, uops, i, undo);
   1032 				if (error == 0)
   1033 					error = ERANGE;
   1034 				goto semoperr;
   1035 			}
   1036 			semp->semval += op->sem_op;
   1037 			/*
   1038 			 * If we are only incrementing the semaphore value
   1039 			 * by one on a binary semaphore, we can cv_signal.
   1040 			 */
   1041 			if (semp->semncnt) {
   1042 				if (op->sem_op == 1 && sp->sem_binary)
   1043 					cv_signal(&semp->semncnt_cv);
   1044 				else
   1045 					cv_broadcast(&semp->semncnt_cv);
   1046 			}
   1047 			if (semp->semzcnt && !semp->semval)
   1048 				cv_broadcast(&semp->semzcnt_cv);
   1049 			continue;
   1050 		}
   1051 
   1052 		/*
   1053 		 * Lower the semaphore (i.e. sema_p)
   1054 		 */
   1055 		if (op->sem_op < 0) {
   1056 			if (semp->semval >= (unsigned)(-op->sem_op)) {
   1057 				if ((op->sem_flg & SEM_UNDO) &&
   1058 				    (error = sem_undo_add(op->sem_op,
   1059 				    op->sem_num, undo))) {
   1060 					if (i)
   1061 						sem_rollback(sp, uops, i, undo);
   1062 					goto semoperr;
   1063 				}
   1064 				semp->semval += op->sem_op;
   1065 				if (semp->semzcnt && !semp->semval)
   1066 					cv_broadcast(&semp->semzcnt_cv);
   1067 				continue;
   1068 			}
   1069 			if (i)
   1070 				sem_rollback(sp, uops, i, undo);
   1071 			if (op->sem_flg & IPC_NOWAIT) {
   1072 				error = EAGAIN;
   1073 				goto semoperr;
   1074 			}
   1075 
   1076 			/*
   1077 			 * Mark the semaphore set as not a binary type
   1078 			 * if we are decrementing the value by more than 1.
   1079 			 *
   1080 			 * V operations will resort to cv_broadcast
   1081 			 * for this set because there are too many weird
   1082 			 * cases that have to be caught.
   1083 			 */
   1084 			if (op->sem_op < -1)
   1085 				sp->sem_binary = 0;
   1086 			if (!held) {
   1087 				held = 1;
   1088 				ipc_hold(sem_svc, (kipc_perm_t *)sp);
   1089 			}
   1090 			semp->semncnt++;
   1091 			cvres = cv_waituntil_sig(&semp->semncnt_cv, lock,
   1092 			    tsp, timecheck);
   1093 			lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
   1094 
   1095 			if (!IPC_FREE(&sp->sem_perm)) {
   1096 				ASSERT(semp->semncnt != 0);
   1097 				semp->semncnt--;
   1098 				if (cvres > 0)	/* normal wakeup */
   1099 					goto check;
   1100 			}
   1101 
   1102 			/* EINTR or EAGAIN overrides EIDRM */
   1103 			if (cvres == 0)
   1104 				error = EINTR;
   1105 			else if (cvres < 0)
   1106 				error = EAGAIN;
   1107 			else
   1108 				error = EIDRM;
   1109 			goto semoperr;
   1110 		}
   1111 
   1112 		/*
   1113 		 * Wait for zero value
   1114 		 */
   1115 		if (semp->semval) {
   1116 			if (i)
   1117 				sem_rollback(sp, uops, i, undo);
   1118 			if (op->sem_flg & IPC_NOWAIT) {
   1119 				error = EAGAIN;
   1120 				goto semoperr;
   1121 			}
   1122 
   1123 			if (!held) {
   1124 				held = 1;
   1125 				ipc_hold(sem_svc, (kipc_perm_t *)sp);
   1126 			}
   1127 			semp->semzcnt++;
   1128 			cvres = cv_waituntil_sig(&semp->semzcnt_cv, lock,
   1129 			    tsp, timecheck);
   1130 			lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
   1131 
   1132 			/*
   1133 			 * Don't touch semp if the semaphores have been removed.
   1134 			 */
   1135 			if (!IPC_FREE(&sp->sem_perm)) {
   1136 				ASSERT(semp->semzcnt != 0);
   1137 				semp->semzcnt--;
   1138 				if (cvres > 0)	/* normal wakeup */
   1139 					goto check;
   1140 			}
   1141 
   1142 			/* EINTR or EAGAIN overrides EIDRM */
   1143 			if (cvres == 0)
   1144 				error = EINTR;
   1145 			else if (cvres < 0)
   1146 				error = EAGAIN;
   1147 			else
   1148 				error = EIDRM;
   1149 			goto semoperr;
   1150 		}
   1151 	}
   1152 
   1153 	/* All operations succeeded.  Update sempid for accessed semaphores. */
   1154 	for (i = 0, op = uops; i++ < nsops;
   1155 	    sp->sem_base[(op++)->sem_num].sempid = pp->p_pid)
   1156 		;
   1157 	sp->sem_otime = gethrestime_sec();
   1158 	if (held)
   1159 		ipc_rele(sem_svc, (kipc_perm_t *)sp);
   1160 	else
   1161 		mutex_exit(lock);
   1162 
   1163 	/* Before leaving, deallocate the buffer that held the user semops */
   1164 	if (nsops != 1)
   1165 		kmem_free(uops, sizeof (*uops) * nsops);
   1166 	return (0);
   1167 
   1168 	/*
   1169 	 * Error return labels
   1170 	 */
   1171 semoperr:
   1172 	if (held)
   1173 		ipc_rele(sem_svc, (kipc_perm_t *)sp);
   1174 	else
   1175 		mutex_exit(lock);
   1176 
   1177 semoperr_unlocked:
   1178 
   1179 	/* Before leaving, deallocate the buffer that held the user semops */
   1180 	if (nsops != 1)
   1181 		kmem_free(uops, sizeof (*uops) * nsops);
   1182 	return (set_errno(error));
   1183 }
   1184 
   1185 /*
   1186  * semsys - System entry point for semctl, semget, and semop system calls.
   1187  */
   1188 static int
   1189 semsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4)
   1190 {
   1191 	int error;
   1192 
   1193 	switch (opcode) {
   1194 	case SEMCTL:
   1195 		error = semctl((int)a1, (uint_t)a2, (int)a3, a4);
   1196 		break;
   1197 	case SEMGET:
   1198 		error = semget((key_t)a1, (int)a2, (int)a3);
   1199 		break;
   1200 	case SEMOP:
   1201 		error = semop((int)a1, (struct sembuf *)a2, (size_t)a3, 0);
   1202 		break;
   1203 	case SEMIDS:
   1204 		error = semids((int *)a1, (uint_t)a2, (uint_t *)a3);
   1205 		break;
   1206 	case SEMTIMEDOP:
   1207 		error = semop((int)a1, (struct sembuf *)a2, (size_t)a3,
   1208 		    (timespec_t *)a4);
   1209 		break;
   1210 	default:
   1211 		error = set_errno(EINVAL);
   1212 		break;
   1213 	}
   1214 	return (error);
   1215 }
   1216