Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
     27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T	*/
     28 /*	  All Rights Reserved  	*/
     29 
     30 /*	Copyright (c) 1987, 1988 Microsoft Corporation	*/
     31 /*	  All Rights Reserved	*/
     32 
     33 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     34 
     35 #include <sys/param.h>
     36 #include <sys/types.h>
     37 #include <sys/sysmacros.h>
     38 #include <sys/systm.h>
     39 #include <sys/signal.h>
     40 #include <sys/errno.h>
     41 #include <sys/fault.h>
     42 #include <sys/syscall.h>
     43 #include <sys/cpuvar.h>
     44 #include <sys/sysi86.h>
     45 #include <sys/psw.h>
     46 #include <sys/cred.h>
     47 #include <sys/policy.h>
     48 #include <sys/thread.h>
     49 #include <sys/debug.h>
     50 #include <sys/ontrap.h>
     51 #include <sys/privregs.h>
     52 #include <sys/x86_archext.h>
     53 #include <sys/vmem.h>
     54 #include <sys/kmem.h>
     55 #include <sys/mman.h>
     56 #include <sys/archsystm.h>
     57 #include <vm/hat.h>
     58 #include <vm/as.h>
     59 #include <vm/seg.h>
     60 #include <vm/seg_kmem.h>
     61 #include <vm/faultcode.h>
     62 #include <sys/fp.h>
     63 #include <sys/cmn_err.h>
     64 #include <sys/segments.h>
     65 #include <sys/clock.h>
     66 #if defined(__xpv)
     67 #include <sys/hypervisor.h>
     68 #include <sys/note.h>
     69 #endif
     70 
     71 static void ldt_alloc(proc_t *, uint_t);
     72 static void ldt_free(proc_t *);
     73 static void ldt_dup(proc_t *, proc_t *);
     74 static void ldt_grow(proc_t *, uint_t);
     75 
     76 /*
     77  * sysi86 System Call
     78  */
     79 
     80 /* ARGSUSED */
     81 int
     82 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
     83 {
     84 	struct ssd ssd;
     85 	int error = 0;
     86 	int c;
     87 	proc_t *pp = curproc;
     88 
     89 	switch (cmd) {
     90 
     91 	/*
     92 	 * The SI86V86 subsystem call of the SYSI86 system call
     93 	 * supports only one subcode -- V86SC_IOPL.
     94 	 */
     95 	case SI86V86:
     96 		if (arg1 == V86SC_IOPL) {
     97 			struct regs *rp = lwptoregs(ttolwp(curthread));
     98 			greg_t oldpl = rp->r_ps & PS_IOPL;
     99 			greg_t newpl = arg2 & PS_IOPL;
    100 
    101 			/*
    102 			 * Must be privileged to run this system call
    103 			 * if giving more io privilege.
    104 			 */
    105 			if (newpl > oldpl && (error =
    106 			    secpolicy_sys_config(CRED(), B_FALSE)) != 0)
    107 				return (set_errno(error));
    108 #if defined(__xpv)
    109 			kpreempt_disable();
    110 			installctx(curthread, NULL, xen_disable_user_iopl,
    111 			    xen_enable_user_iopl, NULL, NULL,
    112 			    xen_disable_user_iopl, NULL);
    113 			xen_enable_user_iopl();
    114 			kpreempt_enable();
    115 #else
    116 			rp->r_ps ^= oldpl ^ newpl;
    117 #endif
    118 		} else
    119 			error = EINVAL;
    120 		break;
    121 
    122 	/*
    123 	 * Set a segment descriptor
    124 	 */
    125 	case SI86DSCR:
    126 		/*
    127 		 * There are considerable problems here manipulating
    128 		 * resources shared by many running lwps.  Get everyone
    129 		 * into a safe state before changing the LDT.
    130 		 */
    131 		if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
    132 			error = EINTR;
    133 			break;
    134 		}
    135 
    136 		if (get_udatamodel() == DATAMODEL_LP64) {
    137 			error = EINVAL;
    138 			break;
    139 		}
    140 
    141 		if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
    142 			error = EFAULT;
    143 			break;
    144 		}
    145 
    146 		error = setdscr(&ssd);
    147 
    148 		mutex_enter(&pp->p_lock);
    149 		if (curthread != pp->p_agenttp)
    150 			continuelwps(pp);
    151 		mutex_exit(&pp->p_lock);
    152 		break;
    153 
    154 	case SI86FPHW:
    155 		c = fp_kind & 0xff;
    156 		if (suword32((void *)arg1, c) == -1)
    157 			error = EFAULT;
    158 		break;
    159 
    160 	case SI86FPSTART:
    161 		/*
    162 		 * arg1 is the address of _fp_hw
    163 		 * arg2 is the desired x87 FCW value
    164 		 * arg3 is the desired SSE MXCSR value
    165 		 * a return value of one means SSE hardware, else none.
    166 		 */
    167 		c = fp_kind & 0xff;
    168 		if (suword32((void *)arg1, c) == -1) {
    169 			error = EFAULT;
    170 			break;
    171 		}
    172 		fpsetcw((uint16_t)arg2, (uint32_t)arg3);
    173 		return (fp_kind == __FP_SSE ? 1 : 0);
    174 
    175 	/* real time clock management commands */
    176 
    177 	case WTODC:
    178 		if ((error = secpolicy_settime(CRED())) == 0) {
    179 			timestruc_t ts;
    180 			mutex_enter(&tod_lock);
    181 			gethrestime(&ts);
    182 			tod_set(ts);
    183 			mutex_exit(&tod_lock);
    184 		}
    185 		break;
    186 
    187 /* Give some timezone playing room */
    188 #define	ONEWEEK	(7 * 24 * 60 * 60)
    189 
    190 	case SGMTL:
    191 		/*
    192 		 * Called from 32 bit land, negative values
    193 		 * are not sign extended, so we do that here
    194 		 * by casting it to an int and back.  We also
    195 		 * clamp the value to within reason and detect
    196 		 * when a 64 bit call overflows an int.
    197 		 */
    198 		if ((error = secpolicy_settime(CRED())) == 0) {
    199 			int newlag = (int)arg1;
    200 
    201 #ifdef _SYSCALL32_IMPL
    202 			if (get_udatamodel() == DATAMODEL_NATIVE &&
    203 			    (long)newlag != (long)arg1) {
    204 				error = EOVERFLOW;
    205 			} else
    206 #endif
    207 			if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
    208 				sgmtl(newlag);
    209 			else
    210 				error = EOVERFLOW;
    211 		}
    212 		break;
    213 
    214 	case GGMTL:
    215 		if (get_udatamodel() == DATAMODEL_NATIVE) {
    216 			if (sulword((void *)arg1, ggmtl()) == -1)
    217 				error = EFAULT;
    218 #ifdef _SYSCALL32_IMPL
    219 		} else {
    220 			time_t gmtl;
    221 
    222 			if ((gmtl = ggmtl()) > INT32_MAX) {
    223 				/*
    224 				 * Since gmt_lag can at most be
    225 				 * +/- 12 hours, something is
    226 				 * *seriously* messed up here.
    227 				 */
    228 				error = EOVERFLOW;
    229 			} else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
    230 				error = EFAULT;
    231 #endif
    232 		}
    233 		break;
    234 
    235 	case RTCSYNC:
    236 		if ((error = secpolicy_settime(CRED())) == 0)
    237 			rtcsync();
    238 		break;
    239 
    240 	/* END OF real time clock management commands */
    241 
    242 	default:
    243 		error = EINVAL;
    244 		break;
    245 	}
    246 	return (error == 0 ? 0 : set_errno(error));
    247 }
    248 
    249 void
    250 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
    251 {
    252 	ssd->bo = USEGD_GETBASE(usd);
    253 	ssd->ls = USEGD_GETLIMIT(usd);
    254 	ssd->sel = sel;
    255 
    256 	/*
    257 	 * set type, dpl and present bits.
    258 	 */
    259 	ssd->acc1 = usd->usd_type;
    260 	ssd->acc1 |= usd->usd_dpl << 5;
    261 	ssd->acc1 |= usd->usd_p << (5 + 2);
    262 
    263 	/*
    264 	 * set avl, DB and granularity bits.
    265 	 */
    266 	ssd->acc2 = usd->usd_avl;
    267 
    268 #if defined(__amd64)
    269 	ssd->acc2 |= usd->usd_long << 1;
    270 #else
    271 	ssd->acc2 |= usd->usd_reserved << 1;
    272 #endif
    273 
    274 	ssd->acc2 |= usd->usd_def32 << (1 + 1);
    275 	ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
    276 }
    277 
    278 static void
    279 ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
    280 {
    281 
    282 	ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);
    283 
    284 	USEGD_SETBASE(usd, ssd->bo);
    285 	USEGD_SETLIMIT(usd, ssd->ls);
    286 
    287 	/*
    288 	 * set type, dpl and present bits.
    289 	 */
    290 	usd->usd_type = ssd->acc1;
    291 	usd->usd_dpl = ssd->acc1 >> 5;
    292 	usd->usd_p = ssd->acc1 >> (5 + 2);
    293 
    294 	ASSERT(usd->usd_type >= SDT_MEMRO);
    295 	ASSERT(usd->usd_dpl == SEL_UPL);
    296 
    297 	/*
    298 	 * 64-bit code selectors are never allowed in the LDT.
    299 	 * Reserved bit is always 0 on 32-bit sytems.
    300 	 */
    301 #if defined(__amd64)
    302 	usd->usd_long = 0;
    303 #else
    304 	usd->usd_reserved = 0;
    305 #endif
    306 
    307 	/*
    308 	 * set avl, DB and granularity bits.
    309 	 */
    310 	usd->usd_avl = ssd->acc2;
    311 	usd->usd_def32 = ssd->acc2 >> (1 + 1);
    312 	usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
    313 }
    314 
    315 
    316 #if defined(__i386)
    317 
    318 static void
    319 ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
    320 {
    321 
    322 	ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);
    323 
    324 	sgd->sgd_looffset = ssd->bo;
    325 	sgd->sgd_hioffset = ssd->bo >> 16;
    326 
    327 	sgd->sgd_selector = ssd->ls;
    328 
    329 	/*
    330 	 * set type, dpl and present bits.
    331 	 */
    332 	sgd->sgd_type = ssd->acc1;
    333 	sgd->sgd_dpl = ssd->acc1 >> 5;
    334 	sgd->sgd_p = ssd->acc1 >> 7;
    335 	ASSERT(sgd->sgd_type == SDT_SYSCGT);
    336 	ASSERT(sgd->sgd_dpl == SEL_UPL);
    337 	sgd->sgd_stkcpy = 0;
    338 }
    339 
    340 #endif	/* __i386 */
    341 
    342 /*
    343  * Load LDT register with the current process's LDT.
    344  */
    345 static void
    346 ldt_load(void)
    347 {
    348 #if defined(__xpv)
    349 	xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc),
    350 	    curproc->p_ldtlimit + 1);
    351 #else
    352 	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc;
    353 	wr_ldtr(ULDT_SEL);
    354 #endif
    355 }
    356 
    357 /*
    358  * Store a NULL selector in the LDTR. All subsequent illegal references to
    359  * the LDT will result in a #gp.
    360  */
    361 void
    362 ldt_unload(void)
    363 {
    364 #if defined(__xpv)
    365 	xen_set_ldt(NULL, 0);
    366 #else
    367 	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
    368 	wr_ldtr(0);
    369 #endif
    370 }
    371 
    372 /*ARGSUSED*/
    373 static void
    374 ldt_savectx(proc_t *p)
    375 {
    376 	ASSERT(p->p_ldt != NULL);
    377 	ASSERT(p == curproc);
    378 
    379 #if defined(__amd64)
    380 	/*
    381 	 * The 64-bit kernel must be sure to clear any stale ldt
    382 	 * selectors when context switching away from a process that
    383 	 * has a private ldt. Consider the following example:
    384 	 *
    385 	 * 	Wine creats a ldt descriptor and points a segment register
    386 	 * 	to it.
    387 	 *
    388 	 *	We then context switch away from wine lwp to kernel
    389 	 *	thread and hit breakpoint in kernel with kmdb
    390 	 *
    391 	 *	When we continue and resume from kmdb we will #gp
    392 	 * 	fault since kmdb will have saved the stale ldt selector
    393 	 *	from wine and will try to restore it but we are no longer in
    394 	 *	the context of the wine process and do not have our
    395 	 *	ldtr register pointing to the private ldt.
    396 	 */
    397 	reset_sregs();
    398 #endif
    399 
    400 	ldt_unload();
    401 	cpu_fast_syscall_enable(NULL);
    402 }
    403 
    404 static void
    405 ldt_restorectx(proc_t *p)
    406 {
    407 	ASSERT(p->p_ldt != NULL);
    408 	ASSERT(p == curproc);
    409 
    410 	ldt_load();
    411 	cpu_fast_syscall_disable(NULL);
    412 }
    413 
    414 /*
    415  * When a process with a private LDT execs, fast syscalls must be enabled for
    416  * the new process image.
    417  */
    418 /* ARGSUSED */
    419 static void
    420 ldt_freectx(proc_t *p, int isexec)
    421 {
    422 	ASSERT(p->p_ldt);
    423 
    424 	if (isexec) {
    425 		kpreempt_disable();
    426 		cpu_fast_syscall_enable(NULL);
    427 		kpreempt_enable();
    428 	}
    429 
    430 	/*
    431 	 * ldt_free() will free the memory used by the private LDT, reset the
    432 	 * process's descriptor, and re-program the LDTR.
    433 	 */
    434 	ldt_free(p);
    435 }
    436 
    437 /*
    438  * Install ctx op that ensures syscall/sysenter are disabled.
    439  * See comments below.
    440  *
    441  * When a thread with a private LDT forks, the new process
    442  * must have the LDT context ops installed.
    443  */
    444 /* ARGSUSED */
    445 static void
    446 ldt_installctx(proc_t *p, proc_t *cp)
    447 {
    448 	proc_t		*targ = p;
    449 	kthread_t	*t;
    450 
    451 	/*
    452 	 * If this is a fork, operate on the child process.
    453 	 */
    454 	if (cp != NULL) {
    455 		targ = cp;
    456 		ldt_dup(p, cp);
    457 	}
    458 
    459 	/*
    460 	 * The process context ops expect the target process as their argument.
    461 	 */
    462 	ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
    463 	    ldt_installctx, ldt_savectx, ldt_freectx) == 0);
    464 
    465 	installpctx(targ, targ, ldt_savectx, ldt_restorectx,
    466 	    ldt_installctx, ldt_savectx, ldt_freectx);
    467 
    468 	/*
    469 	 * We've just disabled fast system call and return instructions; take
    470 	 * the slow path out to make sure we don't try to use one to return
    471 	 * back to user. We must set t_post_sys for every thread in the
    472 	 * process to make sure none of them escape out via fast return.
    473 	 */
    474 
    475 	mutex_enter(&targ->p_lock);
    476 	t = targ->p_tlist;
    477 	do {
    478 		t->t_post_sys = 1;
    479 	} while ((t = t->t_forw) != targ->p_tlist);
    480 	mutex_exit(&targ->p_lock);
    481 }
    482 
    483 int
    484 setdscr(struct ssd *ssd)
    485 {
    486 	ushort_t seli; 		/* selector index */
    487 	user_desc_t *ldp;	/* descriptor pointer */
    488 	user_desc_t ndesc;	/* new descriptor */
    489 	proc_t	*pp = ttoproc(curthread);
    490 	int	rc = 0;
    491 
    492 	/*
    493 	 * LDT segments: executable and data at DPL 3 only.
    494 	 */
    495 	if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
    496 		return (EINVAL);
    497 
    498 	/*
    499 	 * check the selector index.
    500 	 */
    501 	seli = SELTOIDX(ssd->sel);
    502 	if (seli >= MAXNLDT || seli < LDT_UDBASE)
    503 		return (EINVAL);
    504 
    505 	ndesc = null_udesc;
    506 	mutex_enter(&pp->p_ldtlock);
    507 
    508 	/*
    509 	 * If this is the first time for this process then setup a
    510 	 * private LDT for it.
    511 	 */
    512 	if (pp->p_ldt == NULL) {
    513 		ldt_alloc(pp, seli);
    514 
    515 		/*
    516 		 * Now that this process has a private LDT, the use of
    517 		 * the syscall/sysret and sysenter/sysexit instructions
    518 		 * is forbidden for this processes because they destroy
    519 		 * the contents of %cs and %ss segment registers.
    520 		 *
    521 		 * Explicity disable them here and add a context handler
    522 		 * to the process. Note that disabling
    523 		 * them here means we can't use sysret or sysexit on
    524 		 * the way out of this system call - so we force this
    525 		 * thread to take the slow path (which doesn't make use
    526 		 * of sysenter or sysexit) back out.
    527 		 */
    528 		kpreempt_disable();
    529 		ldt_installctx(pp, NULL);
    530 		cpu_fast_syscall_disable(NULL);
    531 		ASSERT(curthread->t_post_sys != 0);
    532 		kpreempt_enable();
    533 
    534 	} else if (seli > pp->p_ldtlimit) {
    535 
    536 		/*
    537 		 * Increase size of ldt to include seli.
    538 		 */
    539 		ldt_grow(pp, seli);
    540 	}
    541 
    542 	ASSERT(seli <= pp->p_ldtlimit);
    543 	ldp = &pp->p_ldt[seli];
    544 
    545 	/*
    546 	 * On the 64-bit kernel, this is where things get more subtle.
    547 	 * Recall that in the 64-bit kernel, when we enter the kernel we
    548 	 * deliberately -don't- reload the segment selectors we came in on
    549 	 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
    550 	 * and the underlying descriptors are essentially ignored by the
    551 	 * hardware in long mode - except for the base that we override with
    552 	 * the gsbase MSRs.
    553 	 *
    554 	 * However, there's one unfortunate issue with this rosy picture --
    555 	 * a descriptor that's not marked as 'present' will still generate
    556 	 * an #np when loading a segment register.
    557 	 *
    558 	 * Consider this case.  An lwp creates a harmless LDT entry, points
    559 	 * one of it's segment registers at it, then tells the kernel (here)
    560 	 * to delete it.  In the 32-bit kernel, the #np will happen on the
    561 	 * way back to userland where we reload the segment registers, and be
    562 	 * handled in kern_gpfault().  In the 64-bit kernel, the same thing
    563 	 * will happen in the normal case too.  However, if we're trying to
    564 	 * use a debugger that wants to save and restore the segment registers,
    565 	 * and the debugger things that we have valid segment registers, we
    566 	 * have the problem that the debugger will try and restore the
    567 	 * segment register that points at the now 'not present' descriptor
    568 	 * and will take a #np right there.
    569 	 *
    570 	 * We should obviously fix the debugger to be paranoid about
    571 	 * -not- restoring segment registers that point to bad descriptors;
    572 	 * however we can prevent the problem here if we check to see if any
    573 	 * of the segment registers are still pointing at the thing we're
    574 	 * destroying; if they are, return an error instead. (That also seems
    575 	 * a lot better failure mode than SIGKILL and a core file
    576 	 * from kern_gpfault() too.)
    577 	 */
    578 	if (SI86SSD_PRES(ssd) == 0) {
    579 		kthread_t *t;
    580 		int bad = 0;
    581 
    582 		/*
    583 		 * Look carefully at the segment registers of every lwp
    584 		 * in the process (they're all stopped by our caller).
    585 		 * If we're about to invalidate a descriptor that's still
    586 		 * being referenced by *any* of them, return an error,
    587 		 * rather than having them #gp on their way out of the kernel.
    588 		 */
    589 		ASSERT(pp->p_lwprcnt == 1);
    590 
    591 		mutex_enter(&pp->p_lock);
    592 		t = pp->p_tlist;
    593 		do {
    594 			klwp_t *lwp = ttolwp(t);
    595 			struct regs *rp = lwp->lwp_regs;
    596 #if defined(__amd64)
    597 			pcb_t *pcb = &lwp->lwp_pcb;
    598 #endif
    599 
    600 			if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
    601 				bad = 1;
    602 				break;
    603 			}
    604 
    605 #if defined(__amd64)
    606 			if (pcb->pcb_rupdate == 1) {
    607 				if (ssd->sel == pcb->pcb_ds ||
    608 				    ssd->sel == pcb->pcb_es ||
    609 				    ssd->sel == pcb->pcb_fs ||
    610 				    ssd->sel == pcb->pcb_gs) {
    611 					bad = 1;
    612 					break;
    613 				}
    614 			} else
    615 #endif
    616 			{
    617 				if (ssd->sel == rp->r_ds ||
    618 				    ssd->sel == rp->r_es ||
    619 				    ssd->sel == rp->r_fs ||
    620 				    ssd->sel == rp->r_gs) {
    621 					bad = 1;
    622 					break;
    623 				}
    624 			}
    625 
    626 		} while ((t = t->t_forw) != pp->p_tlist);
    627 		mutex_exit(&pp->p_lock);
    628 
    629 		if (bad) {
    630 			mutex_exit(&pp->p_ldtlock);
    631 			return (EBUSY);
    632 		}
    633 	}
    634 
    635 	/*
    636 	 * If acc1 is zero, clear the descriptor (including the 'present' bit)
    637 	 */
    638 	if (ssd->acc1 == 0) {
    639 		rc  = ldt_update_segd(ldp, &null_udesc);
    640 		mutex_exit(&pp->p_ldtlock);
    641 		return (rc);
    642 	}
    643 
    644 	/*
    645 	 * Check segment type, allow segment not present and
    646 	 * only user DPL (3).
    647 	 */
    648 	if (SI86SSD_DPL(ssd) != SEL_UPL) {
    649 		mutex_exit(&pp->p_ldtlock);
    650 		return (EINVAL);
    651 	}
    652 
    653 #if defined(__amd64)
    654 	/*
    655 	 * Do not allow 32-bit applications to create 64-bit mode code
    656 	 * segments.
    657 	 */
    658 	if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
    659 	    SI86SSD_ISLONG(ssd)) {
    660 		mutex_exit(&pp->p_ldtlock);
    661 		return (EINVAL);
    662 	}
    663 #endif /* __amd64 */
    664 
    665 	/*
    666 	 * Set up a code or data user segment descriptor.
    667 	 */
    668 	if (SI86SSD_ISUSEG(ssd)) {
    669 		ssd_to_usd(ssd, &ndesc);
    670 		rc = ldt_update_segd(ldp, &ndesc);
    671 		mutex_exit(&pp->p_ldtlock);
    672 		return (rc);
    673 	}
    674 
    675 #if defined(__i386)
    676 	/*
    677 	 * Allow a call gate only if the destination is in the LDT
    678 	 * and the system is running in 32-bit legacy mode.
    679 	 *
    680 	 * In long mode 32-bit call gates are redefined as 64-bit call
    681 	 * gates and the hw enforces that the target code selector
    682 	 * of the call gate must be 64-bit selector. A #gp fault is
    683 	 * generated if otherwise. Since we do not allow 32-bit processes
    684 	 * to switch themselves to 64-bits we never allow call gates
    685 	 * on 64-bit system system.
    686 	 */
    687 	if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) {
    688 
    689 
    690 		ssd_to_sgd(ssd, (gate_desc_t *)&ndesc);
    691 		rc = ldt_update_segd(ldp, &ndesc);
    692 		mutex_exit(&pp->p_ldtlock);
    693 		return (rc);
    694 	}
    695 #endif	/* __i386 */
    696 
    697 	mutex_exit(&pp->p_ldtlock);
    698 	return (EINVAL);
    699 }
    700 
    701 /*
    702  * Allocate new LDT for process just large enough to contain seli.
    703  * Note we allocate and grow LDT in PAGESIZE chunks. We do this
    704  * to simplify the implementation and because on the hypervisor it's
    705  * required, since the LDT must live on pages that have PROT_WRITE
    706  * removed and which are given to the hypervisor.
    707  */
    708 static void
    709 ldt_alloc(proc_t *pp, uint_t seli)
    710 {
    711 	user_desc_t	*ldt;
    712 	size_t		ldtsz;
    713 	uint_t		nsels;
    714 
    715 	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
    716 	ASSERT(pp->p_ldt == NULL);
    717 	ASSERT(pp->p_ldtlimit == 0);
    718 
    719 	/*
    720 	 * Allocate new LDT just large enough to contain seli.
    721 	 */
    722 	ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
    723 	nsels = ldtsz / sizeof (user_desc_t);
    724 	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
    725 
    726 	ldt = kmem_zalloc(ldtsz, KM_SLEEP);
    727 	ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
    728 
    729 #if defined(__xpv)
    730 	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
    731 		panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
    732 #endif
    733 
    734 	pp->p_ldt = ldt;
    735 	pp->p_ldtlimit = nsels - 1;
    736 	set_syssegd(&pp->p_ldt_desc, ldt, ldtsz - 1, SDT_SYSLDT, SEL_KPL);
    737 
    738 	if (pp == curproc) {
    739 		kpreempt_disable();
    740 		ldt_load();
    741 		kpreempt_enable();
    742 	}
    743 }
    744 
    745 static void
    746 ldt_free(proc_t *pp)
    747 {
    748 	user_desc_t	*ldt;
    749 	size_t		ldtsz;
    750 
    751 	ASSERT(pp->p_ldt != NULL);
    752 
    753 	mutex_enter(&pp->p_ldtlock);
    754 	ldt = pp->p_ldt;
    755 	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
    756 
    757 	ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));
    758 
    759 	pp->p_ldt = NULL;
    760 	pp->p_ldtlimit = 0;
    761 	pp->p_ldt_desc = null_sdesc;
    762 	mutex_exit(&pp->p_ldtlock);
    763 
    764 	if (pp == curproc) {
    765 		kpreempt_disable();
    766 		ldt_unload();
    767 		kpreempt_enable();
    768 	}
    769 
    770 #if defined(__xpv)
    771 	/*
    772 	 * We are not allowed to make the ldt writable until after
    773 	 * we tell the hypervisor to unload it.
    774 	 */
    775 	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
    776 		panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
    777 #endif
    778 
    779 	kmem_free(ldt, ldtsz);
    780 }
    781 
    782 /*
    783  * On fork copy new ldt for child.
    784  */
    785 static void
    786 ldt_dup(proc_t *pp, proc_t *cp)
    787 {
    788 	size_t	ldtsz;
    789 
    790 	ASSERT(pp->p_ldt != NULL);
    791 	ASSERT(cp != curproc);
    792 
    793 	/*
    794 	 * I assume the parent's ldt can't increase since we're in a fork.
    795 	 */
    796 	mutex_enter(&pp->p_ldtlock);
    797 	mutex_enter(&cp->p_ldtlock);
    798 
    799 	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
    800 
    801 	ldt_alloc(cp, pp->p_ldtlimit);
    802 
    803 #if defined(__xpv)
    804 	/*
    805 	 * Make child's ldt writable so it can be copied into from
    806 	 * parent's ldt. This works since ldt_alloc above did not load
    807 	 * the ldt since its for the child process. If we tried to make
    808 	 * an LDT writable that is loaded in hw the setprot operation
    809 	 * would fail.
    810 	 */
    811 	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
    812 		panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
    813 #endif
    814 
    815 	bcopy(pp->p_ldt, cp->p_ldt, ldtsz);
    816 
    817 #if defined(__xpv)
    818 	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
    819 		panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
    820 #endif
    821 	mutex_exit(&cp->p_ldtlock);
    822 	mutex_exit(&pp->p_ldtlock);
    823 
    824 }
    825 
    826 static void
    827 ldt_grow(proc_t *pp, uint_t seli)
    828 {
    829 	user_desc_t	*oldt, *nldt;
    830 	uint_t		nsels;
    831 	size_t		oldtsz, nldtsz;
    832 
    833 	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
    834 	ASSERT(pp->p_ldt != NULL);
    835 	ASSERT(pp->p_ldtlimit != 0);
    836 
    837 	/*
    838 	 * Allocate larger LDT just large enough to contain seli.
    839 	 */
    840 	nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
    841 	nsels = nldtsz / sizeof (user_desc_t);
    842 	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
    843 	ASSERT(nsels > pp->p_ldtlimit);
    844 
    845 	oldt = pp->p_ldt;
    846 	oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
    847 
    848 	nldt = kmem_zalloc(nldtsz, KM_SLEEP);
    849 	ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));
    850 
    851 	bcopy(oldt, nldt, oldtsz);
    852 
    853 	/*
    854 	 * unload old ldt.
    855 	 */
    856 	kpreempt_disable();
    857 	ldt_unload();
    858 	kpreempt_enable();
    859 
    860 #if defined(__xpv)
    861 
    862 	/*
    863 	 * Make old ldt writable and new ldt read only.
    864 	 */
    865 	if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
    866 		panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
    867 
    868 	if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
    869 		panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
    870 #endif
    871 
    872 	pp->p_ldt = nldt;
    873 	pp->p_ldtlimit = nsels - 1;
    874 
    875 	/*
    876 	 * write new ldt segment descriptor.
    877 	 */
    878 	set_syssegd(&pp->p_ldt_desc, nldt, nldtsz - 1, SDT_SYSLDT, SEL_KPL);
    879 
    880 	/*
    881 	 * load the new ldt.
    882 	 */
    883 	kpreempt_disable();
    884 	ldt_load();
    885 	kpreempt_enable();
    886 
    887 	kmem_free(oldt, oldtsz);
    888 }
    889