Home | History | Annotate | Download | only in ml
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/asm_linkage.h>
     27 #include <sys/asm_misc.h>
     28 #include <sys/regset.h>
     29 #include <sys/privregs.h>
     30 #include <sys/psw.h>
     31 #include <sys/machbrand.h>
     32 
     33 #if defined(__lint)
     34 
     35 #include <sys/types.h>
     36 #include <sys/thread.h>
     37 #include <sys/systm.h>
     38 
     39 #else	/* __lint */
     40 
     41 #include <sys/segments.h>
     42 #include <sys/pcb.h>
     43 #include <sys/trap.h>
     44 #include <sys/ftrace.h>
     45 #include <sys/traptrace.h>
     46 #include <sys/clock.h>
     47 #include <sys/model.h>
     48 #include <sys/panic.h>
     49 
     50 #if defined(__xpv)
     51 #include <sys/hypervisor.h>
     52 #endif
     53 
     54 #include "assym.h"
     55 
     56 #endif	/* __lint */
     57 
     58 /*
     59  * We implement five flavours of system call entry points
     60  *
     61  * -	syscall/sysretq		(amd64 generic)
     62  * -	syscall/sysretl		(i386 plus SYSC bit)
     63  * -	sysenter/sysexit	(i386 plus SEP bit)
     64  * -	int/iret		(i386 generic)
     65  * -	lcall/iret		(i386 generic)
     66  *
     67  * The current libc included in Solaris uses int/iret as the base unoptimized
     68  * kernel entry method. Older libc implementations and legacy binaries may use
     69  * the lcall call gate, so it must continue to be supported.
     70  *
     71  * System calls that use an lcall call gate are processed in trap() via a
     72  * segment-not-present trap, i.e. lcalls are extremely slow(!).
     73  *
     74  * The basic pattern used in the 32-bit SYSC handler at this point in time is
     75  * to have the bare minimum of assembler, and get to the C handlers as
     76  * quickly as possible.
     77  *
     78  * The 64-bit handler is much closer to the sparcv9 handler; that's
     79  * because of passing arguments in registers.  The 32-bit world still
     80  * passes arguments on the stack -- that makes that handler substantially
     81  * more complex.
     82  *
     83  * The two handlers share a few code fragments which are broken
     84  * out into preprocessor macros below.
     85  *
     86  * XX64	come back and speed all this up later.  The 32-bit stuff looks
     87  * especially easy to speed up the argument copying part ..
     88  *
     89  *
     90  * Notes about segment register usage (c.f. the 32-bit kernel)
     91  *
     92  * In the 32-bit kernel, segment registers are dutifully saved and
     93  * restored on all mode transitions because the kernel uses them directly.
     94  * When the processor is running in 64-bit mode, segment registers are
     95  * largely ignored.
     96  *
     97  * %cs and %ss
     98  *	controlled by the hardware mechanisms that make mode transitions
     99  *
    100  * The remaining segment registers have to either be pointing at a valid
    101  * descriptor i.e. with the 'present' bit set, or they can be NULL descriptors
    102  *
    103  * %ds and %es
    104  *	always ignored
    105  *
    106  * %fs and %gs
    107  *	fsbase and gsbase are used to control the place they really point at.
    108  *	The kernel only depends on %gs, and controls its own gsbase via swapgs
    109  *
    110  * Note that loading segment registers is still costly because the GDT
    111  * lookup still happens (this is because the hardware can't know that we're
    112  * not setting up these segment registers for a 32-bit program).  Thus we
    113  * avoid doing this in the syscall path, and defer them to lwp context switch
    114  * handlers, so the register values remain virtualized to the lwp.
    115  */
    116 
    117 #if defined(SYSCALLTRACE)
        /*
         * ORL_SYSCALLTRACE(r32): OR the global `syscalltrace' word into the
         * 32-bit register r32.  Without SYSCALLTRACE it expands to nothing,
         * so callers must not rely on it touching r32 or the flags.
         */
    118 #define	ORL_SYSCALLTRACE(r32)		\
    119 	orl	syscalltrace(%rip), r32
    120 #else
    121 #define	ORL_SYSCALLTRACE(r32)
    122 #endif
    123 
    124 /*
    125  * In the 32-bit kernel, we do absolutely nothing before getting into the
    126  * brand callback checks.  In 64-bit land, we do swapgs and then come here.
    127  * We assume that the %rsp- and %r15-stashing fields in the CPU structure
    128  * are still unused.
    129  *
    130  * Check if a brand_mach_ops callback is defined for the specified callback_id
    131  * type.  If so invoke it with the kernel's %gs value loaded and the following
    132  * data on the stack:
    133  *
    134  * stack:  --------------------------------------
    135  *      40 | user %gs				|
    136  *      32 | callback pointer			|
    137  *    | 24 | user (or interrupt) stack pointer	|
    138  *    | 16 | lwp pointer			|
    139  *    v  8 | userland return address		|
    140  *       0 | callback wrapper return addr	|
    141  *         --------------------------------------
    142  *
    143  * Since we're pushing the userland return address onto the kernel stack
    144  * we need to get that address without accessing the user's stack (since we
    145  * can't trust that data).  There are different ways to get the userland
    146  * return address depending on how the syscall trap was made:
    147  *
    148  * a) For sys_syscall and sys_syscall32 the return address is in %rcx.
    149  * b) For sys_sysenter the return address is in %rdx.
    150  * c) For sys_int80 and sys_syscall_int (int91), upon entry into the macro,
    151  *    the stack pointer points at the state saved when we took the interrupt:
    152  *	 ------------------------
    153  *    |  | user's %ss		|
    154  *    |  | user's %esp		|
    155  *    |  | EFLAGS register	|
    156  *    v  | user's %cs		|
    157  *       | user's %eip		|
    158  *	 ------------------------
    159  *
    160  * The 2nd parameter to the BRAND_CALLBACK macro is either the
    161  * BRAND_URET_FROM_REG or BRAND_URET_FROM_INTR_STACK macro.  These macros are
    162  * used to generate the proper code to get the userland return address for
    163  * each syscall entry point.
    164  */
    165 #define BRAND_URET_FROM_REG(rip_reg)					\
    166 	pushq	rip_reg			/* push the return address held in rip_reg */
    167 
    168 /*
    169  * The interrupt stack pointer we saved on entry to the BRAND_CALLBACK macro
    170  * is currently pointing at the user return address (%eip).
         *
         * This macro uses %r15 as scratch; BRAND_CALLBACK reloads %r15 from
         * CPU_RTMP_R15 after expanding it, so the caller's %r15 is not lost.
    171  */
    172 #define BRAND_URET_FROM_INTR_STACK()					\
    173 	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the intr. stack pointer	*/ ;\
    174 	pushq	(%r15)			/* push the return address	*/
    175 
    176 #define	BRAND_CALLBACK(callback_id, push_userland_ret)			    \
    177 	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer	*/ ;\
    178 	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15			*/ ;\
    179 	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer	*/ ;\
    180 	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack	*/ ;\
    181 	subq	$24, %rsp		/* slots: %gs, callback, sp	*/ ;\
    182 	pushq	%r14			/* save %r14			*/ ;\
    183 	movq	%gs:CPU_RTMP_RSP, %r14	/* fetch the saved stack ptr	*/ ;\
    184 	movq	%r14, 8(%rsp)		/* stash the user stack pointer	*/ ;\
    185 	popq	%r14			/* restore %r14			*/ ;\
    186 	movq	T_LWP(%r15), %r15	/* load the lwp pointer		*/ ;\
    187 	pushq	%r15			/* push the lwp pointer		*/ ;\
    188 	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer	*/ ;\
    189 	movq	P_BRAND(%r15), %r15	/* load the brand pointer	*/ ;\
    190 	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer	*/ ;\
    191 	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		   ;\
    192 	cmpq	$0, %r15		/* is a callback registered?	*/ ;\
    193 	je	1f			/* no: just undo and continue	*/ ;\
    194 	movq	%r15, 16(%rsp)		/* save the callback pointer	*/ ;\
    195 	push_userland_ret		/* push the return address	*/ ;\
    196 	SWAPGS				/* user gsbase			*/ ;\
    197 	mov	%gs, %r15		/* get %gs			*/ ;\
    198 	movq	%r15, 32(%rsp)		/* save %gs on stack		*/ ;\
    199 	SWAPGS				/* kernel gsbase		*/ ;\
    200 	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
    201 	call	*24(%rsp)		/* call callback		*/ ;\
    202 1:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
    203 	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer	*/
    204 
    205 #define	MSTATE_TRANSITION(from, to)		\
    206 	movl	$from, %edi;	/* 1st arg: from */	\
    207 	movl	$to, %esi;	/* 2nd arg: to */	\
    208 	call	syscall_mstate	/* caller-saved registers are not preserved */
    209 
    210 /*
    211  * Check to see if a simple (direct) return is possible i.e.
    212  *
    213  *	if (t->t_post_sys_ast | syscalltrace |
    214  *	    lwp->lwp_pcb.pcb_rupdate == 1)
    215  *		do full version	;
    216  *
    217  * Preconditions:
    218  * -	t is curthread
    219  * Postconditions:
    220  * -	condition code NE is set if post-sys is too complex
    221  * -	rtmp is zeroed if it isn't (we rely on this!)
    222  * -	ltmp is smashed
    223  */
    224 #define	CHECK_POSTSYS_NE(t, ltmp, rtmp)			\
    225 	movq	T_LWP(t), ltmp;	/* ltmp = lwp of thread t */	\
    226 	movzbl	PCB_RUPDATE(ltmp), rtmp; /* start with pcb_rupdate byte */ \
    227 	ORL_SYSCALLTRACE(rtmp);	/* (only if SYSCALLTRACE) */	\
    228 	orl	T_POST_SYS_AST(t), rtmp; /* fold in t_post_sys_ast */ \
    229 	cmpl	$0, rtmp	/* NE <=> some post-syscall work is pending */
    230 
    231 /*
    232  * Fix up the lwp, thread, and eflags for a successful return
    233  *
    234  * Preconditions:
    235  * -	zwreg contains zero
    236  */
    237 #define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
    238 	movb	$LWP_USER, LWP_STATE(lwp); /* lwp state: LWP_USER */ \
    239 	movw	zwreg, T_SYSNUM(t);	/* zwreg is zero: clear t_sysnum */ \
    240 	andb	$_CONST(0xff - PS_C), REGOFF_RFL(%rsp) /* byte mask: clear PS_C in saved flags */
    241 
    242 /*
    243  * ASSERT(lwptoregs(lwp) == rp);
    244  *
    245  * This may seem obvious, but very odd things happen if this
    246  * assertion is false
    247  *
    248  * Preconditions:
    249  *	(%rsp is ready for normal call sequence)
    250  * Postconditions (if assertion is true):
    251  *	%r11 is smashed
    252  *
    253  * ASSERT(rp->r_cs == descnum)
    254  *
    255  * The code selector is written into the regs structure when the
    256  * lwp stack is created.  We use this ASSERT to validate that
    257  * the regs structure really matches how we came in.
    258  *
    259  * Preconditions:
    260  *	(%rsp is ready for normal call sequence)
    261  * Postconditions (if assertion is true):
    262  *	-none-
    263  *
    264  * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0);
    265  *
    266  * If this is false, it meant that we returned to userland without
    267  * updating the segment registers as we were supposed to.
    268  *
    269  * Note that we must ensure no interrupts or other traps intervene
    270  * between entering privileged mode and performing the assertion,
    271  * otherwise we may perform a context switch on the thread, which
    272  * will end up setting pcb_rupdate to 1 again.
    273  */
    274 #if defined(DEBUG)
    275 
    276 #if !defined(__lint)
    277 
    278 __lwptoregs_msg:
        	/* panic format for ASSERT_LWPTOREGS: line, lwp, lwp's regs, rp */
    279 	.string	"syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]"
    280 
    281 __codesel_msg:
        	/* panic format for the rp->r_cs check described above */
    282 	.string	"syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld"
    283 
    284 __no_rupdate_msg:
        	/* panic format for ASSERT_NO_RUPDATE_PENDING: line, lwp */
    285 	.string	"syscall_asm_amd64.s:%d lwp %p, pcb_rupdate != 0"
    286 
    287 #endif	/* !__lint */
    288 
    289 #define	ASSERT_LWPTOREGS(lwp, rp)			\
    290 	movq	LWP_REGS(lwp), %r11;	/* %r11 = lwp's regs pointer */	\
    291 	cmpq	rp, %r11;				\
    292 	je	7f;		/* match: assertion holds */	\
    293 	leaq	__lwptoregs_msg(%rip), %rdi; /* panic(fmt, */	\
    294 	movl	$__LINE__, %esi;	/* line, */		\
    295 	movq	lwp, %rdx;		/* lwp, */		\
    296 	movq	%r11, %rcx;		/* lwp's regs, */	\
    297 	movq	rp, %r8;		/* rp) */		\
    298 	xorl	%eax, %eax;	/* varargs call: no vector args */ \
    299 	call	panic;					\
    300 7:
    301 
    302 #define	ASSERT_NO_RUPDATE_PENDING(lwp)			\
    303 	testb	$0x1, PCB_RUPDATE(lwp);	/* low bit of pcb_rupdate set? */ \
    304 	je	8f;		/* clear: assertion holds */	\
    305 	movq	lwp, %rdx;		/* 3rd arg: lwp */	\
    306 	leaq	__no_rupdate_msg(%rip), %rdi; /* 1st arg: format */ \
    307 	movl	$__LINE__, %esi;	/* 2nd arg: line */	\
    308 	xorl	%eax, %eax;	/* varargs call: no vector args */ \
    309 	call	panic;					\
    310 8:
    311 
    312 #else
    313 #define	ASSERT_LWPTOREGS(lwp, rp)	/* non-DEBUG: expands to nothing */
    314 #define	ASSERT_NO_RUPDATE_PENDING(lwp)	/* non-DEBUG: expands to nothing */
    315 #endif
    316 
    317 /*
    318  * Do the traptrace thing and restore any registers we used
    319  * in situ.  Assumes that %rsp is pointing at the base of
    320  * the struct regs, obviously ..
    321  */
    322 #ifdef TRAPTRACE
    323 #define	SYSCALL_TRAPTRACE(ttype)				\
    324 	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
    325 	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
    326 	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
    327 	movq	REGOFF_RAX(%rsp), %rax;	/* reload scratch regs from */ \
    328 	movq	REGOFF_RBX(%rsp), %rbx;	/* the saved struct regs */ \
    329 	movq	REGOFF_RCX(%rsp), %rcx;				\
    330 	movq	REGOFF_RDX(%rsp), %rdx;				\
    331 	movl	%eax, TTR_SYSNUM(%rdi);	/* low 32 bits of saved %rax */ \
    332 	movq	REGOFF_RDI(%rsp), %rdi	/* last: %rdi was the TRACE_* operand */
    333 
    334 #define	SYSCALL_TRAPTRACE32(ttype)				\
    335 	SYSCALL_TRAPTRACE(ttype);				\
    336 	/* paranoia: clean the top 32-bits of the registers */	\
    337 	orl	%eax, %eax;	/* value kept; 32-bit write zeroes the upper half */ \
    338 	orl	%ebx, %ebx;					\
    339 	orl	%ecx, %ecx;					\
    340 	orl	%edx, %edx;					\
    341 	orl	%edi, %edi
    342 #else	/* TRAPTRACE */
    343 #define	SYSCALL_TRAPTRACE(ttype)	/* no TRAPTRACE: expands to nothing */
    344 #define	SYSCALL_TRAPTRACE32(ttype)	/* no TRAPTRACE: expands to nothing */
    345 #endif	/* TRAPTRACE */
    346 
    347 /*
    348  * The 64-bit libc syscall wrapper does this:
    349  *
    350  * fn(<args>)
    351  * {
    352  *	movq	%rcx, %r10	-- because syscall smashes %rcx
    353  *	movl	$CODE, %eax
    354  *	syscall
    355  *	<error processing>
    356  * }
    357  *
    358  * Thus when we come into the kernel:
    359  *
    360  *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
    361  *	%rax is the syscall number
    362  *	%r12-%r15 contain caller state
    363  *
    364  * The syscall instruction arranges that:
    365  *
    366  *	%rcx contains the return %rip
    367  *	%r11d contains bottom 32-bits of %rflags
    368  *	%rflags is masked (as determined by the SFMASK msr)
    369  *	%cs is set to UCS_SEL (as determined by the STAR msr)
    370  *	%ss is set to UDS_SEL (as determined by the STAR msr)
    371  *	%rip is set to sys_syscall (as determined by the LSTAR msr)
    372  *
    373  * Or in other words, we have no registers available at all.
    374  * Only swapgs can save us!
    375  *
    376  * Under the hypervisor, the swapgs has happened already.  However, the
    377  * state of the world is very different from that we're familiar with.
    378  *
    379  * In particular, we have a stack structure like that for interrupt
    380  * gates, except that the %cs and %ss registers are modified for reasons
    381  * that are not entirely clear.  Critically, the %rcx/%r11 values do
    382  * *not* reflect the usage of those registers under a 'real' syscall[1];
    383  * the stack, therefore, looks like this:
    384  *
    385  *	0x0(rsp)	potentially junk %rcx
    386  *	0x8(rsp)	potentially junk %r11
    387  *	0x10(rsp)	user %rip
    388  *	0x18(rsp)	modified %cs
    389  *	0x20(rsp)	user %rflags
    390  *	0x28(rsp)	user %rsp
    391  *	0x30(rsp)	modified %ss
    392  *
    393  *
    394  * and before continuing on, we must load the %rip into %rcx and the
    395  * %rflags into %r11.
    396  *
    397  * [1] They used to, and we relied on it, but this was broken in 3.1.1.
    398  * Sigh.
    399  */
    400 #if defined(__xpv)
        /*
         * Rebuild the register state a native `syscall' would have left, from
         * the hypervisor-built frame described above.  %rsp is loaded last
         * because the other two loads are addressed through it.
         */
    401 #define	XPV_SYSCALL_PROD						\
    402 	movq	0x10(%rsp), %rcx;	/* user %rip */			\
    403 	movq	0x20(%rsp), %r11;	/* user %rflags */		\
    404 	movq	0x28(%rsp), %rsp	/* user %rsp */
    405 #else
    406 #define	XPV_SYSCALL_PROD /* nothing */
    407 #endif
    408 
    409 #if defined(__lint)
    410 
    411 /*ARGSUSED*/
    412 void
    413 sys_syscall()
    414 {}	/* lint-only stub; the real entry point is in the !__lint branch */
    415 
    416 void
    417 _allsyscalls()
    418 {}	/* lint-only stub */
    419 
    420 size_t _allsyscalls_size;	/* lint-only declaration */
    421 
    422 #else	/* __lint */
    423 
    424 	ENTRY_NP2(brand_sys_syscall,_allsyscalls)
        	/*
        	 * Branded entry: let the BRAND_CB_SYSCALL callback (if one is
        	 * registered) see the syscall first, then continue into the
        	 * common sys_syscall path below.
        	 */
    425 	SWAPGS				/* kernel gsbase */
    426 	XPV_SYSCALL_PROD
    427 	BRAND_CALLBACK(BRAND_CB_SYSCALL, BRAND_URET_FROM_REG(%rcx))
    428 	SWAPGS				/* user gsbase */
    429 
    430 #if defined(__xpv)
        	/* XPV_SYSCALL_PROD already ran above, so enter past the second one */
    431 	jmp	noprod_sys_syscall
    432 #endif
    433 
    434 	ALTENTRY(sys_syscall)
    435 	SWAPGS				/* kernel gsbase */
    436 	XPV_SYSCALL_PROD
    437 
    438 noprod_sys_syscall:
    439 
        	/* stash user %r15/%rsp per-CPU so %r15 can hold curthread */
    440 	movq	%r15, %gs:CPU_RTMP_R15
    441 	movq	%rsp, %gs:CPU_RTMP_RSP
    442 
    443 	movq	%gs:CPU_THREAD, %r15
    444 	movq	T_STACK(%r15), %rsp	/* switch from user to kernel stack */
    445 
    446 	ASSERT_UPCALL_MASK_IS_SET
    447 
    448 	movl	$UCS_SEL, REGOFF_CS(%rsp)
    449 	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
    450 	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
    451 	movl	$UDS_SEL, REGOFF_SS(%rsp)
    452 
    453 	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
    454 	movq	%rdi, REGOFF_RDI(%rsp)
    455 	movq	%rsi, REGOFF_RSI(%rsp)
    456 	movq	%rdx, REGOFF_RDX(%rsp)
    457 	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
    458 	movq	%r10, %rcx			/* arg[3] for direct calls */
    459 
    460 	movq	%r8, REGOFF_R8(%rsp)
    461 	movq	%r9, REGOFF_R9(%rsp)
    462 	movq	%rax, REGOFF_RAX(%rsp)
    463 	movq	%rbx, REGOFF_RBX(%rsp)
    464 
    465 	movq	%rbp, REGOFF_RBP(%rsp)
    466 	movq	%r10, REGOFF_R10(%rsp)
        	/* user %rsp and %r15 come back from the per-CPU stash slots */
    467 	movq	%gs:CPU_RTMP_RSP, %r11
    468 	movq	%r11, REGOFF_RSP(%rsp)
    469 	movq	%r12, REGOFF_R12(%rsp)
    470 
    471 	movq	%r13, REGOFF_R13(%rsp)
    472 	movq	%r14, REGOFF_R14(%rsp)
    473 	movq	%gs:CPU_RTMP_R15, %r10
    474 	movq	%r10, REGOFF_R15(%rsp)
    475 	movq	$0, REGOFF_SAVFP(%rsp)
    476 	movq	$0, REGOFF_SAVPC(%rsp)
    477 
    478 	/*
    479 	 * Copy these registers here in case we end up stopped with
    480 	 * someone (like, say, /proc) messing with our register state.
    481 	 * We don't -restore- them unless we have to in update_sregs.
    482 	 *
    483 	 * Since userland -can't- change fsbase or gsbase directly,
    484 	 * and capturing them involves two serializing instructions,
    485 	 * we don't bother to capture them here.
    486 	 */
    487 	xorl	%ebx, %ebx
    488 	movw	%ds, %bx
    489 	movq	%rbx, REGOFF_DS(%rsp)
    490 	movw	%es, %bx
    491 	movq	%rbx, REGOFF_ES(%rsp)
    492 	movw	%fs, %bx
    493 	movq	%rbx, REGOFF_FS(%rsp)
    494 	movw	%gs, %bx
    495 	movq	%rbx, REGOFF_GS(%rsp)
    496 
    497 	/*
    498 	 * Machine state saved in the regs structure on the stack
    499 	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
    500 	 * %eax is the syscall number
    501 	 * %rsp is the thread's stack, %r15 is curthread
    502 	 * REGOFF_RSP(%rsp) is the user's stack
    503 	 */
    504 
    505 	SYSCALL_TRAPTRACE($TT_SYSC64)
    506 
    507 	movq	%rsp, %rbp		/* %rbp = base of the saved regs */
    508 
    509 	movq	T_LWP(%r15), %r14	/* %r14 = curthread's lwp */
    510 	ASSERT_NO_RUPDATE_PENDING(%r14)
    511 	ENABLE_INTR_FLAGS
    512 
    513 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
    514 	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */
    515 
    516 	ASSERT_LWPTOREGS(%r14, %rsp)
    517 
    518 	movb	$LWP_SYS, LWP_STATE(%r14)
    519 	incq	LWP_RU_SYSC(%r14)
    520 	movb	$NORMALRETURN, LWP_EOSYS(%r14)
    521 
    522 	incq	%gs:CPU_STATS_SYS_SYSCALL
    523 
        	/* record the syscall number; divert if pre-syscall work is flagged */
    524 	movw	%ax, T_SYSNUM(%r15)
    525 	movzbl	T_PRE_SYS(%r15), %ebx
    526 	ORL_SYSCALLTRACE(%ebx)
    527 	testl	%ebx, %ebx
    528 	jne	_syscall_pre
    529 
    530 _syscall_invoke:
        	/* (re)load the six register args from the saved regs at %rbp */
    531 	movq	REGOFF_RDI(%rbp), %rdi
    532 	movq	REGOFF_RSI(%rbp), %rsi
    533 	movq	REGOFF_RDX(%rbp), %rdx
    534 	movq	REGOFF_RCX(%rbp), %rcx
    535 	movq	REGOFF_R8(%rbp), %r8
    536 	movq	REGOFF_R9(%rbp), %r9
    537 
        	/* unsigned range check, then %rbx = sysent + (sysnum << shift) */
    538 	cmpl	$NSYSCALL, %eax
    539 	jae	_syscall_ill
    540 	shll	$SYSENT_SIZE_SHIFT, %eax
    541 	leaq	sysent(%rax), %rbx
    542 
    543 	call	*SY_CALLC(%rbx)
    544 
        	/* keep the handler's %rax/%rdx in %r12/%r13 across later calls */
    545 	movq	%rax, %r12
    546 	movq	%rdx, %r13
    547 
    548 	/*
    549 	 * If the handler returns two ints, then we need to split the
    550 	 * 64-bit return value into two 32-bit values.
    551 	 */
    552 	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
    553 	je	5f
    554 	movq	%r12, %r13
    555 	shrq	$32, %r13	/* upper 32-bits into %edx */
    556 	movl	%r12d, %r12d	/* lower 32-bits into %eax */
    557 5:
    558 	/*
    559 	 * Optimistically assume that there's no post-syscall
    560 	 * work to do.  (This is to avoid having to call syscall_mstate()
    561 	 * with interrupts disabled)
    562 	 */
    563 	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
    564 
    565 	/*
    566 	 * We must protect ourselves from being descheduled here;
    567 	 * If we were, and we ended up on another cpu, or another
    568 	 * lwp got in ahead of us, it could change the segment
    569 	 * registers without us noticing before we return to userland.
    570 	 */
    571 	CLI(%r14)
    572 	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
    573 	jne	_syscall_post
        	/* %ebx is zero on this path, so %bx serves as the zero word */
    574 	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
    575 
    576 	movq	%r12, REGOFF_RAX(%rsp)
    577 	movq	%r13, REGOFF_RDX(%rsp)
    578 
    579 	/*
    580 	 * To get back to userland, we need the return %rip in %rcx and
    581 	 * the return %rfl in %r11d.  The sysretq instruction also arranges
    582 	 * to fix up %cs and %ss; everything else is our responsibility.
    583 	 */
    584 	movq	REGOFF_RDI(%rsp), %rdi
    585 	movq	REGOFF_RSI(%rsp), %rsi
    586 	movq	REGOFF_RDX(%rsp), %rdx
    587 	/* %rcx used to restore %rip value */
    588 
    589 	movq	REGOFF_R8(%rsp), %r8
    590 	movq	REGOFF_R9(%rsp), %r9
    591 	movq	REGOFF_RAX(%rsp), %rax
    592 	movq	REGOFF_RBX(%rsp), %rbx
    593 
    594 	movq	REGOFF_RBP(%rsp), %rbp
    595 	movq	REGOFF_R10(%rsp), %r10
    596 	/* %r11 used to restore %rfl value */
    597 	movq	REGOFF_R12(%rsp), %r12
    598 
    599 	movq	REGOFF_R13(%rsp), %r13
    600 	movq	REGOFF_R14(%rsp), %r14
    601 	movq	REGOFF_R15(%rsp), %r15
    602 
    603 	movq	REGOFF_RIP(%rsp), %rcx
    604 	movl	REGOFF_RFL(%rsp), %r11d
    605 
        	/* %rsp is changed last: every load above is addressed through it */
    606 #if defined(__xpv)
    607 	addq	$REGOFF_RIP, %rsp
    608 #else
    609 	movq	REGOFF_RSP(%rsp), %rsp
    610 #endif
    611 
    612         /*
    613          * There can be no instructions between the ALTENTRY below and
    614 	 * SYSRET or we could end up breaking brand support. See label usage
    615          * in sn1_brand_syscall_callback for an example.
    616          */
    617 	ASSERT_UPCALL_MASK_IS_SET
    618 #if defined(__xpv)
    619 	SYSRETQ
    620         ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
    621 
    622 	/*
    623 	 * We can only get here after executing a brand syscall
    624 	 * interposition callback handler and simply need to
    625 	 * "sysretq" back to userland. On the hypervisor this
    626 	 * involves the iret hypercall which requires us to construct
    627 	 * just enough of the stack needed for the hypercall.
    628 	 * (rip, cs, rflags, rsp, ss).
    629 	 */
    630 	movq    %rsp, %gs:CPU_RTMP_RSP		/* save user's rsp */
    631 	movq	%gs:CPU_THREAD, %r11
    632 	movq	T_STACK(%r11), %rsp
    633 
    634 	movq	%rcx, REGOFF_RIP(%rsp)
    635 	movl	$UCS_SEL, REGOFF_CS(%rsp)
    636 	movq	%gs:CPU_RTMP_RSP, %r11
    637 	movq	%r11, REGOFF_RSP(%rsp)
    638 	pushfq
    639 	popq	%r11				/* hypercall enables ints */
    640 	movq	%r11, REGOFF_RFL(%rsp)
    641 	movl	$UDS_SEL, REGOFF_SS(%rsp)
    642 	addq	$REGOFF_RIP, %rsp
    643 	/*
    644 	 * XXPV: see comment in SYSRETQ definition for future optimization
    645 	 *       we could take.
    646 	 */
    647 	ASSERT_UPCALL_MASK_IS_SET
    648 	SYSRETQ
    649 #else
    650         ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
    651 	SWAPGS				/* user gsbase */
    652 	SYSRETQ
    653 #endif
    654         /*NOTREACHED*/
    655         SET_SIZE(nopop_sys_syscall_swapgs_sysretq)
    656 
    657 _syscall_pre:
        	/*
        	 * Reached when t_pre_sys (or, with SYSCALLTRACE, syscalltrace) is
        	 * non-zero.  A non-zero return from pre_syscall() skips the handler
        	 * and is passed to post_syscall() as its first argument.
        	 */
    658 	call	pre_syscall
    659 	movl	%eax, %r12d
    660 	testl	%eax, %eax
    661 	jne	_syscall_post_call
    662 	/*
    663 	 * Didn't abort, so reload the syscall args and invoke the handler.
    664 	 */
    665 	movzwl	T_SYSNUM(%r15), %eax
    666 	jmp	_syscall_invoke
    667 
    668 _syscall_ill:
        	/* syscall number >= NSYSCALL: call nosys() and post-process its result */
    669 	call	nosys
    670 	movq	%rax, %r12
    671 	movq	%rdx, %r13
    672 	jmp	_syscall_post_call
    673 
    674 _syscall_post:
        	/* reached with interrupts disabled by the CLI on the fast path */
    675 	STI
    676 	/*
    677 	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
    678 	 * so that we can account for the extra work it takes us to finish.
    679 	 */
    680 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
    681 _syscall_post_call:
        	/* post_syscall(%r12, %r13), then leave via the _sys_rtt path */
    682 	movq	%r12, %rdi
    683 	movq	%r13, %rsi
    684 	call	post_syscall
    685 	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
    686 	jmp	_sys_rtt
    687 	SET_SIZE(sys_syscall)
    688 	SET_SIZE(brand_sys_syscall)
    689 
    690 #endif	/* __lint */
    691 
    692 #if defined(__lint)
    693 
    694 /*ARGSUSED*/
    695 void
    696 sys_syscall32()
    697 {}	/* lint-only stub; the real entry point is in the !__lint branch */
    698 
    699 #else	/* __lint */
    700 
    701 	ENTRY_NP(brand_sys_syscall32)
        	/*
        	 * Branded entry: let the BRAND_CB_SYSCALL32 callback (if one is
        	 * registered) see the syscall first, then continue into the
        	 * common sys_syscall32 path below.
        	 */
    702 	SWAPGS				/* kernel gsbase */
    703 	XPV_TRAP_POP
    704 	BRAND_CALLBACK(BRAND_CB_SYSCALL32, BRAND_URET_FROM_REG(%rcx))
    705 	SWAPGS				/* user gsbase */
    706 
    707 #if defined(__xpv)
        	/* XPV_TRAP_POP already ran above, so enter past the second one */
    708 	jmp	nopop_sys_syscall32
    709 #endif
    710 
    711 	ALTENTRY(sys_syscall32)
    712 	SWAPGS				/* kernel gsbase */
    713 
    714 #if defined(__xpv)
    715 	XPV_TRAP_POP
    716 nopop_sys_syscall32:
    717 #endif
    718 
    719 	movl	%esp, %r10d		/* user %esp, zero-extended into %r10 */
    720 	movq	%gs:CPU_THREAD, %r15
    721 	movq	T_STACK(%r15), %rsp	/* switch to the thread's kernel stack */
    722 	movl	%eax, %eax
    723 
    724 	movl	$U32CS_SEL, REGOFF_CS(%rsp)
    725 	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
    726 	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
    727 	movq	%r10, REGOFF_RSP(%rsp)
    728 	movl	$UDS_SEL, REGOFF_SS(%rsp)
    729 
    730 _syscall32_save:
        	/* 32-bit stores: only the low half of each saved slot is written */
    731 	movl	%edi, REGOFF_RDI(%rsp)
    732 	movl	%esi, REGOFF_RSI(%rsp)
    733 	movl	%ebp, REGOFF_RBP(%rsp)
    734 	movl	%ebx, REGOFF_RBX(%rsp)
    735 	movl	%edx, REGOFF_RDX(%rsp)
    736 	movl	%ecx, REGOFF_RCX(%rsp)
    737 	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
    738 	movq	$0, REGOFF_SAVFP(%rsp)
    739 	movq	$0, REGOFF_SAVPC(%rsp)
    740 
    741 	/*
    742 	 * Copy these registers here in case we end up stopped with
    743 	 * someone (like, say, /proc) messing with our register state.
    744 	 * We don't -restore- them unless we have to in update_sregs.
    745 	 *
    746 	 * Since userland -can't- change fsbase or gsbase directly,
    747 	 * we don't bother to capture them here.
    748 	 */
    749 	xorl	%ebx, %ebx
    750 	movw	%ds, %bx
    751 	movq	%rbx, REGOFF_DS(%rsp)
    752 	movw	%es, %bx
    753 	movq	%rbx, REGOFF_ES(%rsp)
    754 	movw	%fs, %bx
    755 	movq	%rbx, REGOFF_FS(%rsp)
    756 	movw	%gs, %bx
    757 	movq	%rbx, REGOFF_GS(%rsp)
    758 
    759 	/*
    760 	 * Application state saved in the regs structure on the stack
    761 	 * %eax is the syscall number
    762 	 * %rsp is the thread's stack, %r15 is curthread
    763 	 * REGOFF_RSP(%rsp) is the user's stack
    764 	 */
    765 
    766 	SYSCALL_TRAPTRACE32($TT_SYSC)
    767 
    768 	movq	%rsp, %rbp		/* %rbp = base of the saved regs */
    769 
    770 	movq	T_LWP(%r15), %r14	/* %r14 = curthread's lwp */
    771 	ASSERT_NO_RUPDATE_PENDING(%r14)
    772 
    773 	ENABLE_INTR_FLAGS
    774 
    775 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
    776 	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */
    777 
    778 	ASSERT_LWPTOREGS(%r14, %rsp)
    779 
    780 	incq	 %gs:CPU_STATS_SYS_SYSCALL
    781 
    782 	/*
    783 	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
    784 	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
    785 	 * more succinctly:
    786 	 *
    787 	 *	SA(MAXSYSARGS * sizeof (long)) == 64
    788 	 */
    789 #define	SYS_DROP	64			/* drop for args */
    790 	subq	$SYS_DROP, %rsp
    791 	movb	$LWP_SYS, LWP_STATE(%r14)
        	/* syscall_entry(curthread, pointer to the SYS_DROP-byte arg area) */
    792 	movq	%r15, %rdi
    793 	movq	%rsp, %rsi
    794 	call	syscall_entry
    795 
    796 	/*
    797 	 * Fetch the arguments copied onto the kernel stack and put
    798 	 * them in the right registers to invoke a C-style syscall handler.
    799 	 * %rax contains the pointer returned by syscall_entry(); the handler
        	 * address is fetched from its SY_CALLC member.
    800 	 *
    801 	 * Ideas for making all this go faster of course include simply
    802 	 * forcibly fetching 6 arguments from the user stack under lofault
    803 	 * protection, reverting to copyin_args only when watchpoints
    804 	 * are in effect.
    805 	 *
    806 	 * (If we do this, make sure that exec and libthread leave
    807 	 * enough space at the top of the stack to ensure that we'll
    808 	 * never do a fetch from an invalid page.)
    809 	 *
    810 	 * Lots of ideas here, but they won't really help with bringup B-)
    811 	 * Correctness can't wait, performance can wait a little longer ..
    812 	 */
    813 
    814 	movq	%rax, %rbx
    815 	movl	0(%rsp), %edi
    816 	movl	8(%rsp), %esi
    817 	movl	0x10(%rsp), %edx
    818 	movl	0x18(%rsp), %ecx
    819 	movl	0x20(%rsp), %r8d
    820 	movl	0x28(%rsp), %r9d
    821 
    822 	call	*SY_CALLC(%rbx)
    823 
    824 	movq	%rbp, %rsp	/* pop the args */
    825 
    826 	/*
    827 	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
    828 	 * On the 32-bit kernel, they always return that value in %eax:%edx
    829 	 * as required by the 32-bit ABI.
    830 	 *
    831 	 * Simulate the same behaviour by unconditionally splitting the
    832 	 * return value in the same way.
    833 	 */
    834 	movq	%rax, %r13
    835 	shrq	$32, %r13	/* upper 32-bits into %edx */
    836 	movl	%eax, %r12d	/* lower 32-bits into %eax */
    837 
    838 	/*
    839 	 * Optimistically assume that there's no post-syscall
    840 	 * work to do.  (This is to avoid having to call syscall_mstate()
    841 	 * with interrupts disabled)
    842 	 */
    843 	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
    844 
    845 	/*
    846 	 * We must protect ourselves from being descheduled here;
    847 	 * If we were, and we ended up on another cpu, or another
    848 	 * lwp got in ahead of us, it could change the segment
    849 	 * registers without us noticing before we return to userland.
    850 	 */
    851 	CLI(%r14)
    852 	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
    853 	jne	_full_syscall_postsys32
        	/* %ebx is zero on this path, so %bx serves as the zero word */
    854 	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
    855 
    856 	/*
    857 	 * To get back to userland, we need to put the return %rip in %rcx and
    858 	 * the return %rfl in %r11d.  The sysret instruction also arranges
    859 	 * to fix up %cs and %ss; everything else is our responsibility.
    860 	 */
    861 
    862 	movl	%r12d, %eax			/* %eax: rval1 */
    863 	movl	REGOFF_RBX(%rsp), %ebx
    864 	/* %ecx used for return pointer */
    865 	movl	%r13d, %edx			/* %edx: rval2 */
    866 	movl	REGOFF_RBP(%rsp), %ebp
    867 	movl	REGOFF_RSI(%rsp), %esi
    868 	movl	REGOFF_RDI(%rsp), %edi
    869 
    870 	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
    871 	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
        	/* %esp is loaded last: every load above is addressed through %rsp */
    872 	movl	REGOFF_RSP(%rsp), %esp
    873 
    874 	ASSERT_UPCALL_MASK_IS_SET
    875         ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
    876 	SWAPGS				/* user gsbase */
    877 	SYSRETL
    878         SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
    879 	/*NOTREACHED*/
    880 
    881 _full_syscall_postsys32:
    882 	STI
    883 	/*
    884 	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
    885 	 * so that we can account for the extra work it takes us to finish.
    886 	 */
    887 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
    888 	movq	%r15, %rdi
    889 	movq	%r12, %rsi			/* rval1 - %eax */
    890 	movq	%r13, %rdx			/* rval2 - %edx */
    891 	call	syscall_exit
    892 	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
    893 	jmp	_sys_rtt
    894 	SET_SIZE(sys_syscall32)
    895 	SET_SIZE(brand_sys_syscall32)
    896 
    897 #endif	/* __lint */
    898 
    899 /*
    900  * System call handler via the sysenter instruction
    901  * Used only for 32-bit system calls on the 64-bit kernel.
    902  *
    903  * The caller in userland has arranged that:
    904  *
    905  * -	%eax contains the syscall number
    906  * -	%ecx contains the user %esp
    907  * -	%edx contains the return %eip
    908  * -	the user stack contains the args to the syscall
    909  *
    910  * Hardware and (privileged) initialization code have arranged that by
    911  * the time the sysenter instructions completes:
    912  *
    913  * - %rip is pointing to sys_sysenter (below).
    914  * - %cs and %ss are set to kernel text and stack (data) selectors.
    915  * - %rsp is pointing at the lwp's stack
    916  * - interrupts have been disabled.
    917  *
    918  * Note that we are unable to return both "rvals" to userland with
    919  * this call, as %edx is used by the sysexit instruction.
    920  *
    921  * One final complication in this routine is its interaction with
    922  * single-stepping in a debugger.  For most of the system call mechanisms,
    923  * the CPU automatically clears the single-step flag before we enter the
    924  * kernel.  The sysenter mechanism does not clear the flag, so a user
    925  * single-stepping through a libc routine may suddenly find him/herself
    926  * single-stepping through the kernel.  To detect this, kmdb compares the
    927  * trap %pc to the [brand_]sys_sysenter addresses on each single-step trap.
    928  * If it finds that we have single-stepped to a sysenter entry point, it
    929  * explicitly clears the flag and executes the sys_sysenter routine.
    930  *
    931  * One final complication in this final complication is the fact that we
    932  * have two different entry points for sysenter: brand_sys_sysenter and
    933  * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
    934  * through the kernel with kmdb, we will eventually hit the instruction at
    935  * sys_sysenter.  kmdb cannot distinguish between that valid single-step
    936  * and the undesirable one mentioned above.  To avoid this situation, we
    937  * simply add a jump over the instruction at sys_sysenter to make it
    938  * impossible to single-step to it.
    939  */
    940 #if defined(__lint)
    941 
    942 void
    943 sys_sysenter()
    944 {}	/* lint-only stub; the real entry point is the assembly in the #else branch */
    945 
    946 #else	/* __lint */
    947 
    948 	ENTRY_NP(brand_sys_sysenter)
    949 	SWAPGS				/* kernel gsbase */
    950 	ALTENTRY(_brand_sys_sysenter_post_swapgs)
    951 	BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
    952 	/*
    953 	 * Jump over sys_sysenter to allow single-stepping as described
    954 	 * above.
    955 	 */
    956 	jmp	_sys_sysenter_post_swapgs
    957 
    958 	ALTENTRY(sys_sysenter)
    959 	SWAPGS				/* kernel gsbase */
    960 
    961 	ALTENTRY(_sys_sysenter_post_swapgs)
    962 	movq	%gs:CPU_THREAD, %r15	/* %r15 = curthread */
    963 
    964 	movl	$U32CS_SEL, REGOFF_CS(%rsp)
    965 	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
    966 	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
    967 	pushfq
    968 	popq	%r10				/* %r10 = current %rflags */
    969 	movl	$UDS_SEL, REGOFF_SS(%rsp)
    970 
    971 	/*
    972 	 * Set the interrupt flag before storing the flags to the
    973 	 * flags image on the stack so we can return to user with
    974 	 * interrupts enabled if we return via sys_rtt_syscall32
    975 	 */
    976 	orq	$PS_IE, %r10
    977 	movq	%r10, REGOFF_RFL(%rsp)
    978 
    979 	movl	%edi, REGOFF_RDI(%rsp)
    980 	movl	%esi, REGOFF_RSI(%rsp)
    981 	movl	%ebp, REGOFF_RBP(%rsp)
    982 	movl	%ebx, REGOFF_RBX(%rsp)
    983 	movl	%edx, REGOFF_RDX(%rsp)
    984 	movl	%ecx, REGOFF_RCX(%rsp)
    985 	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
    986 	movq	$0, REGOFF_SAVFP(%rsp)		/* zero the SAVFP slot */
    987 	movq	$0, REGOFF_SAVPC(%rsp)		/* zero the SAVPC slot */
    988 
    989 	/*
    990 	 * Copy these registers here in case we end up stopped with
    991 	 * someone (like, say, /proc) messing with our register state.
    992 	 * We don't -restore- them unless we have to in update_sregs.
    993 	 *
    994 	 * Since userland -can't- change fsbase or gsbase directly,
    995 	 * we don't bother to capture them here.
    996 	 */
    997 	xorl	%ebx, %ebx		/* clear %rbx so each selector is stored zero-extended */
    998 	movw	%ds, %bx
    999 	movq	%rbx, REGOFF_DS(%rsp)
   1000 	movw	%es, %bx
   1001 	movq	%rbx, REGOFF_ES(%rsp)
   1002 	movw	%fs, %bx
   1003 	movq	%rbx, REGOFF_FS(%rsp)
   1004 	movw	%gs, %bx
   1005 	movq	%rbx, REGOFF_GS(%rsp)
   1006 
   1007 	/*
   1008 	 * Application state saved in the regs structure on the stack
   1009 	 * %eax is the syscall number
   1010 	 * %rsp is the thread's stack, %r15 is curthread
   1011 	 * REGOFF_RSP(%rsp) is the user's stack
   1012 	 */
   1013 
   1014 	SYSCALL_TRAPTRACE($TT_SYSENTER)
   1015 
   1016 	movq	%rsp, %rbp		/* remember the regs pointer; used below to pop the args */
   1017 
   1018 	movq	T_LWP(%r15), %r14	/* %r14 = T_LWP field of curthread */
   1019 	ASSERT_NO_RUPDATE_PENDING(%r14)
   1020 
   1021 	ENABLE_INTR_FLAGS
   1022 
   1023 	/*
   1024 	 * Catch 64-bit process trying to issue sysenter instruction
   1025 	 * on Nocona based systems.
   1026 	 */
   1027 	movq	LWP_PROCP(%r14), %rax
   1028 	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
   1029 	je	7f			/* ILP32 process: carry on at 7: */
   1030 
   1031 	/*
   1032 	 * For a non-32-bit process, simulate a #ud, since that's what
   1033 	 * native hardware does.  The traptrace entry (above) will
   1034 	 * let you know what really happened.
   1035 	 */
   1036 	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
   1037 	movq	REGOFF_CS(%rsp), %rdi
   1038 	movq	%rdi, REGOFF_ERR(%rsp)	/* error slot = saved %cs value */
   1039 	movq	%rsp, %rdi		/* 1st arg: pointer to the saved regs */
   1040 	movq	REGOFF_RIP(%rsp), %rsi	/* 2nd arg: saved user %rip */
   1041 	movl	%gs:CPU_ID, %edx	/* 3rd arg: CPU_ID of this cpu */
   1042 	call	trap
   1043 	jmp	_sys_rtt
   1044 7:
   1045 
   1046 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
   1047 	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */
   1048 
   1049 	ASSERT_LWPTOREGS(%r14, %rsp)
   1050 
   1051 	incq	%gs:CPU_STATS_SYS_SYSCALL	/* count this syscall */
   1052 
   1053 	/*
   1054 	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
   1055 	 * placed into 64-bit (long) arg slots, plus one 64-bit
   1056 	 * (long) arg count, maintaining 16 byte alignment.
   1057 	 */
   1058 	subq	$SYS_DROP, %rsp
   1059 	movb	$LWP_SYS, LWP_STATE(%r14)
   1060 	movq	%r15, %rdi		/* 1st arg: curthread */
   1061 	movq	%rsp, %rsi		/* 2nd arg: the arg area just reserved */
   1062 	call	syscall_entry
   1063 
   1064 	/*
   1065 	 * Fetch the arguments copied onto the kernel stack and put
   1066 	 * them in the right registers to invoke a C-style syscall handler.
   1067 	 * %rax contains the handler address.
   1068 	 */
   1069 	movq	%rax, %rbx		/* keep the handler address across the arg loads */
   1070 	movl	0(%rsp), %edi		/* arg 0 */
   1071 	movl	8(%rsp), %esi		/* arg 1 */
   1072 	movl	0x10(%rsp), %edx	/* arg 2 */
   1073 	movl	0x18(%rsp), %ecx	/* arg 3 */
   1074 	movl	0x20(%rsp), %r8d	/* arg 4 */
   1075 	movl	0x28(%rsp), %r9d	/* arg 5 */
   1076 
   1077 	call	*SY_CALLC(%rbx)
   1078 
   1079 	movq	%rbp, %rsp	/* pop the args */
   1080 
   1081 	/*
   1082 	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
   1083 	 * On the 32-bit kernel, they always return that value in %eax:%edx
   1084 	 * as required by the 32-bit ABI.
   1085 	 *
   1086 	 * Simulate the same behaviour by unconditionally splitting the
   1087 	 * return value in the same way.
   1088 	 */
   1089 	movq	%rax, %r13
   1090 	shrq	$32, %r13	/* upper 32-bits into %edx */
   1091 	movl	%eax, %r12d	/* lower 32-bits into %eax */
   1092 
   1093 	/*
   1094 	 * Optimistically assume that there's no post-syscall
   1095 	 * work to do.  (This is to avoid having to call syscall_mstate()
   1096 	 * with interrupts disabled)
   1097 	 */
   1098 	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
   1099 
   1100 	/*
   1101 	 * We must protect ourselves from being descheduled here;
   1102 	 * If we were, and we ended up on another cpu, or another
   1103 	 * lwp got in ahead of us, it could change the segment
   1104 	 * registers without us noticing before we return to userland.
   1105 	 */
   1106 	cli
   1107 	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
   1108 	jne	_full_syscall_postsys32	/* slow path shared with sys_syscall32 */
   1109 	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
   1110 
   1111 	/*
   1112 	 * To get back to userland, load up the 32-bit registers and
   1113 	 * sysexit back where we came from.
   1114 	 */
   1115 
   1116 	/*
   1117 	 * Interrupts will be turned on by the 'sti' executed just before
   1118 	 * sysexit.  The following ensures that restoring the user's rflags
   1119 	 * doesn't enable interrupts too soon.
   1120 	 */
   1121 	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)
   1122 
   1123 	/*
   1124 	 * (There's no point in loading up %edx because the sysexit
   1125 	 * mechanism smashes it.)
   1126 	 */
   1127 	movl	%r12d, %eax		/* %eax: rval1 (low half of the handler's return) */
   1128 	movl	REGOFF_RBX(%rsp), %ebx
   1129 	movl	REGOFF_RBP(%rsp), %ebp
   1130 	movl	REGOFF_RSI(%rsp), %esi
   1131 	movl	REGOFF_RDI(%rsp), %edi
   1132 
   1133 	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
   1134 	pushq	REGOFF_RFL(%rsp)	/* saved rflags image, PS_IE cleared above */
   1135 	popfq
   1136 	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
   1137         ALTENTRY(sys_sysenter_swapgs_sysexit)
   1138 	swapgs				/* user gsbase */
   1139 	sti
   1140 	sysexit
   1141 	SET_SIZE(sys_sysenter_swapgs_sysexit)
   1142 	SET_SIZE(sys_sysenter)
   1143 	SET_SIZE(_sys_sysenter_post_swapgs)
   1144 	SET_SIZE(brand_sys_sysenter)
   1145 
   1146 #endif	/* __lint */
   1147 
   1148 #if defined(__lint)
   1149 /*
   1150  * System call via an int80.  This entry point is only used by the Linux
   1151  * application environment.  Unlike the other entry points, there is no
   1152  * default action to take if no callback is registered for this process.
   1153  */
   1154 void
   1155 sys_int80()
   1156 {}	/* lint-only stub; the real entry point is the assembly in the #else branch */
   1157 
   1158 #else	/* __lint */
   1159 
   1160 	ENTRY_NP(brand_sys_int80)
   1161 	SWAPGS				/* kernel gsbase */
   1162 	XPV_TRAP_POP
   1163 	BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK())
   1164 	SWAPGS				/* user gsbase */
   1165 #if defined(__xpv)
   1166 	jmp	nopop_int80		/* already popped above; skip sys_int80's XPV_TRAP_POP */
   1167 #endif
   1168 
   1169 	ENTRY_NP(sys_int80)
   1170 	/*
   1171 	 * We hit an int80, but this process isn't of a brand with an int80
   1172 	 * handler.  Bad process!  Make it look as if the INT failed.
   1173 	 * Modify %rip to point before the INT, push the expected error
   1174 	 * code and fake a GP fault. Note on 64-bit hypervisor we need
   1175 	 * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack
   1176 	 * because gptrap will pop them again with its own XPV_TRAP_POP.
   1177 	 */
   1178 #if defined(__xpv)
   1179 	XPV_TRAP_POP
   1180 nopop_int80:
   1181 #endif
   1182 	subq	$2, (%rsp)	/* int insn 2-bytes */
   1183 	pushq	$_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)	/* error code for the faked #GP */
   1184 #if defined(__xpv)
   1185 	push	%r11		/* re-push what XPV_TRAP_POP removed ... */
   1186 	push	%rcx		/* ... so gptrap's own XPV_TRAP_POP finds them */
   1187 #endif
   1188 	jmp	gptrap			/* GP fault */
   1189 	SET_SIZE(sys_int80)
   1190 	SET_SIZE(brand_sys_int80)
   1191 #endif	/* __lint */
   1192 
   1193 
   1194 /*
   1195  * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
   1196  * the generic i386 libc to do system calls. We do a small amount of setup
   1197  * before jumping into the existing sys_syscall32 path.
   1198  */
   1199 #if defined(__lint)
   1200 
   1201 /*ARGSUSED*/
   1202 void
   1203 sys_syscall_int()
   1204 {}	/* lint-only stub; the real entry point is the assembly in the #else branch */
   1205 
   1206 #else	/* __lint */
   1207 
   1208 	ENTRY_NP(brand_sys_syscall_int)
   1209 	SWAPGS				/* kernel gsbase */
   1210 	XPV_TRAP_POP
   1211 	BRAND_CALLBACK(BRAND_CB_INT91, BRAND_URET_FROM_INTR_STACK())
   1212 	SWAPGS				/* user gsbase */
   1213 
   1214 #if defined(__xpv)
   1215 	jmp	nopop_syscall_int	/* already popped above; skip the second XPV_TRAP_POP */
   1216 #endif
   1217 
   1218 	ALTENTRY(sys_syscall_int)
   1219 	SWAPGS				/* kernel gsbase */
   1220 
   1221 #if defined(__xpv)
   1222 	XPV_TRAP_POP
   1223 nopop_syscall_int:
   1224 #endif
   1225 
   1226 	movq	%gs:CPU_THREAD, %r15	/* %r15 = this cpu's current thread */
   1227 	movq	T_STACK(%r15), %rsp	/* switch to the stack in the thread's T_STACK field */
   1228 	movl	%eax, %eax		/* 32-bit self-move: clears the upper half of %rax */
   1229 	/*
   1230 	 * Set t_post_sys on this thread to force ourselves out via the slow
   1231 	 * path. It might be possible at some later date to optimize this out
   1232 	 * and use a faster return mechanism.
   1233 	 */
   1234 	movb	$1, T_POST_SYS(%r15)
   1235 	CLEAN_CS
   1236 	jmp	_syscall32_save		/* label not in view; presumably inside sys_syscall32 */
   1237 	/*
   1238 	 * There should be no instructions between this label and SWAPGS/IRET
   1239 	 * or we could end up breaking branded zone support. See the usage of
   1240 	 * this label in lx_brand_int80_callback and sn1_brand_int91_callback
   1241 	 * for examples.
   1242 	 */
   1243         ALTENTRY(sys_sysint_swapgs_iret)
   1244 	SWAPGS				/* user gsbase */
   1245 	IRET
   1246 	/*NOTREACHED*/
   1247 	SET_SIZE(sys_sysint_swapgs_iret)
   1248 	SET_SIZE(sys_syscall_int)
   1249 	SET_SIZE(brand_sys_syscall_int)
   1250 
   1251 #endif	/* __lint */
   1252 
   1253 /*
   1254  * Legacy 32-bit applications and old libc implementations do lcalls;
   1255  * we should never get here because the LDT entry containing the syscall
   1256  * segment descriptor has the "segment present" bit cleared, which means
   1257  * we end up processing those system calls in trap() via a not-present trap.
   1258  *
   1259  * We do it this way because a call gate unhelpfully does -nothing- to the
   1260  * interrupt flag bit, so an interrupt can run us just after the lcall
   1261  * completes, but just before the swapgs takes effect.   Thus the INTR_PUSH and
   1262  * INTR_POP paths would have to be slightly more complex to dance around
   1263  * this problem, and end up depending explicitly on the first
   1264  * instruction of this handler being either swapgs or cli.
   1265  */
   1266 
   1267 #if defined(__lint)
   1268 
   1269 /*ARGSUSED*/
   1270 void
   1271 sys_lcall32()
   1272 {}	/* lint-only stub; the real entry point is the assembly in the #else branch */
   1273 
   1274 #else	/* __lint */
   1275 
   1276 	ENTRY_NP(sys_lcall32)
   1277 	SWAPGS				/* kernel gsbase */
   1278 	pushq	$0			/* zero in the return-address slot of the frame built below */
   1279 	pushq	%rbp
   1280 	movq	%rsp, %rbp		/* %rbp -> { saved %rbp, 0 } */
   1281 	leaq	__lcall_panic_str(%rip), %rdi	/* 1st arg: panic message */
   1282 	xorl	%eax, %eax		/* %al = 0; presumably because panic() is variadic - confirm */
   1283 	call	panic			/* nothing follows the call; panic is not expected to return */
   1284 	SET_SIZE(sys_lcall32)
   1285 
   1286 __lcall_panic_str:
   1287 	.string	"sys_lcall32: shouldn't be here!"
   1288 
   1289 /*
   1290  * Declare a pointer-sized object holding the byte distance from
   1291  * _allsyscalls to this point, for the stack walkers that need this.
   1292  */
   1293 	.align	CPTRSIZE
   1294 	.globl	_allsyscalls_size
   1295 	.type	_allsyscalls_size, @object
   1296 _allsyscalls_size:
   1297 	.NWORD	. - _allsyscalls	/* _allsyscalls is defined outside this view */
   1298 	SET_SIZE(_allsyscalls_size)
   1299 
   1300 #endif	/* __lint */
   1301 
   1302 /*
   1303  * These are the thread context handlers for lwps using sysenter/sysexit.
   1304  */
   1305 
   1306 #if defined(__lint)
   1307 
   1308 /*ARGSUSED*/
   1309 void
   1310 sep_save(void *ksp)
   1311 {}	/* lint-only stub; the real routine is the assembly in the #else branch */
   1312 
   1313 /*ARGSUSED*/
   1314 void
   1315 sep_restore(void *ksp)
   1316 {}	/* lint-only stub; the real routine is the assembly in the #else branch */
   1317 
   1318 #else	/* __lint */
   1319 
   1320 	/*
   1321 	 * sep_save: run as we switch away from an lwp.  Writes zero to
   1322 	 * MSR_INTC_SEP_ESP so the stack-pointer-on-sysenter is NULL; an
   1323 	 * lwp that (somehow) didn't get sep_restore'd then cannot
   1324 	 * silently corrupt another (preempted) thread's stack.
   1325 	 */
   1326 	ENTRY_NP(sep_save)
   1327 	movl	$MSR_INTC_SEP_ESP, %ecx	/* wrmsr: MSR index */
   1328 	xorl	%eax, %eax		/* wrmsr: low 32 bits = 0 */
   1329 	xorl	%edx, %edx		/* wrmsr: high 32 bits = 0 */
   1330 	wrmsr
   1331 	ret
   1332 	SET_SIZE(sep_save)
   1333 
   1334 	/*
   1335 	 * sep_restore: on resume, load ksp (%rdi) into MSR_INTC_SEP_ESP.
   1336 	 */
   1337 	ENTRY_NP(sep_restore)
   1338 	movl	$MSR_INTC_SEP_ESP, %ecx	/* wrmsr: MSR index */
   1339 	movl	%edi, %eax		/* wrmsr: low 32 bits of ksp */
   1340 	movq	%rdi, %rdx
   1341 	shrq	$32, %rdx		/* wrmsr: high 32 bits of ksp */
   1342 	wrmsr
   1343 	ret
   1344 	SET_SIZE(sep_restore)
   1345 
   1346 #endif	/* __lint */
   1347