Home | History | Annotate | Download | only in ml
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/asm_linkage.h>
     27 #include <sys/asm_misc.h>
     28 #include <sys/regset.h>
     29 #include <sys/privregs.h>
     30 #include <sys/psw.h>
     31 #include <sys/machbrand.h>
     32 
     33 #if defined(__lint)
     34 
     35 #include <sys/types.h>
     36 #include <sys/thread.h>
     37 #include <sys/systm.h>
     38 
     39 #else	/* __lint */
     40 
     41 #include <sys/segments.h>
     42 #include <sys/pcb.h>
     43 #include <sys/trap.h>
     44 #include <sys/ftrace.h>
     45 #include <sys/traptrace.h>
     46 #include <sys/clock.h>
     47 #include <sys/model.h>
     48 #include <sys/panic.h>
     49 
     50 #if defined(__xpv)
     51 #include <sys/hypervisor.h>
     52 #endif
     53 
     54 #include "assym.h"
     55 
     56 #endif	/* __lint */
     57 
     58 /*
     59  * We implement five flavours of system call entry points
     60  *
     61  * -	syscall/sysretq		(amd64 generic)
     62  * -	syscall/sysretl		(i386 plus SYSC bit)
     63  * -	sysenter/sysexit	(i386 plus SEP bit)
     64  * -	int/iret		(i386 generic)
     65  * -	lcall/iret		(i386 generic)
     66  *
     67  * The current libc included in Solaris uses int/iret as the base unoptimized
     68  * kernel entry method. Older libc implementations and legacy binaries may use
     69  * the lcall call gate, so it must continue to be supported.
     70  *
     71  * System calls that use an lcall call gate are processed in trap() via a
     72  * segment-not-present trap, i.e. lcalls are extremely slow(!).
     73  *
     74  * The basic pattern used in the 32-bit SYSC handler at this point in time is
     75  * to have the bare minimum of assembler, and get to the C handlers as
     76  * quickly as possible.
     77  *
     78  * The 64-bit handler is much closer to the sparcv9 handler; that's
     79  * because of passing arguments in registers.  The 32-bit world still
     80  * passes arguments on the stack -- that makes that handler substantially
     81  * more complex.
     82  *
     83  * The two handlers share a few code fragments which are broken
     84  * out into preprocessor macros below.
     85  *
     86  * XX64	come back and speed all this up later.  The 32-bit stuff looks
     87  * especially easy to speed up the argument copying part ..
     88  *
     89  *
     90  * Notes about segment register usage (c.f. the 32-bit kernel)
     91  *
     92  * In the 32-bit kernel, segment registers are dutifully saved and
     93  * restored on all mode transitions because the kernel uses them directly.
     94  * When the processor is running in 64-bit mode, segment registers are
     95  * largely ignored.
     96  *
     97  * %cs and %ss
     98  *	controlled by the hardware mechanisms that make mode transitions
     99  *
    100  * The remaining segment registers have to either be pointing at a valid
    101  * descriptor i.e. with the 'present' bit set, or they can be NULL descriptors
    102  *
    103  * %ds and %es
    104  *	always ignored
    105  *
    106  * %fs and %gs
    107  *	fsbase and gsbase are used to control the place they really point at.
    108  *	The kernel only depends on %gs, and controls its own gsbase via swapgs
    109  *
    110  * Note that loading segment registers is still costly because the GDT
    111  * lookup still happens (this is because the hardware can't know that we're
    112  * not setting up these segment registers for a 32-bit program).  Thus we
    113  * avoid doing this in the syscall path, and defer them to lwp context switch
    114  * handlers, so the register values remain virtualized to the lwp.
    115  */
    116 
    117 #if defined(SYSCALLTRACE)
        /*
         * OR the 32-bit value of the syscalltrace variable into r32 (the
         * condition codes are modified).  When SYSCALLTRACE is not defined
         * the macro expands to nothing and r32 is left untouched.
         */
    118 #define	ORL_SYSCALLTRACE(r32)		\
    119 	orl	syscalltrace(%rip), r32
    120 #else
    121 #define	ORL_SYSCALLTRACE(r32)
    122 #endif
    123 
    124 /*
    125  * In the 32-bit kernel, we do absolutely nothing before getting into the
    126  * brand callback checks.  In 64-bit land, we do swapgs and then come here.
    127  * We assume that the %rsp- and %r15-stashing fields in the CPU structure
    128  * are still unused.
    129  *
    130  * Check if a brand_mach_ops callback is defined for the specified callback_id
    131  * type.  If so invoke it with the kernel's %gs value loaded and the following
    132  * data on the stack:
    133  *
    134  * stack:  --------------------------------------
    135  *      40 | user %gs				|
    136  *      32 | callback pointer			|
    137  *    | 24 | user stack pointer			|
    138  *    | 16 | lwp pointer			|
    139  *    v  8 | userland return address		|
    140  *       0 | callback wrapper return addr	|
    141  *         --------------------------------------
    142  *
         * The offsets above are relative to %rsp as the callback sees it on
         * entry, i.e. after the indirect call has pushed the wrapper return
         * address.
         *
         * Whether the callback slot is NULL (branch to local label 1) or the
         * callback returns, the macro finishes by reloading %r15 and %rsp from
         * CPU_RTMP_R15 and CPU_RTMP_RSP.
         *
         * Side effects:
         * -	CPU_RTMP_RSP and CPU_RTMP_R15 are overwritten
         * -	%r14 is borrowed briefly and restored (push/pop)
         * -	the condition codes are not preserved
         * -	the "userland return address" is read from the word at the
         *	user stack pointer
         * -	the kernel stack below T_STACK is used for the frame above
         * -	local label 1 is used
    143  */
    144 #define	BRAND_CALLBACK(callback_id)					    \
    145 	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer	*/ ;\
    146 	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15			*/ ;\
    147 	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer	*/ ;\
    148 	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack	*/ ;\
    149 	subq	$24, %rsp		/* save space for 3 pointers	*/ ;\
    150 	pushq	%r14			/* save %r14			*/ ;\
    151 	movq	%gs:CPU_RTMP_RSP, %r14					   ;\
    152 	movq	%r14, 8(%rsp)		/* stash the user stack pointer	*/ ;\
    153 	popq	%r14			/* restore %r14			*/ ;\
    154 	movq	T_LWP(%r15), %r15	/* load the lwp pointer		*/ ;\
    155 	pushq	%r15			/* push the lwp pointer		*/ ;\
    156 	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer	*/ ;\
    157 	movq	P_BRAND(%r15), %r15	/* load the brand pointer	*/ ;\
    158 	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer	*/ ;\
    159 	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		   ;\
    160 	cmpq	$0, %r15						   ;\
    161 	je	1f							   ;\
    162 	movq	%r15, 16(%rsp)		/* save the callback pointer	*/ ;\
    163 	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the user stack pointer	*/ ;\
    164 	pushq	(%r15)			/* push the return address	*/ ;\
    165 	SWAPGS				/* user gsbase			*/ ;\
    166 	mov	%gs, %r15		/* get %gs			*/ ;\
    167 	movq	%r15, 32(%rsp)		/* save %gs on stack		*/ ;\
    168 	SWAPGS				/* kernel gsbase		*/ ;\
    169 	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
    170 	call	*24(%rsp)		/* call callback		*/ ;\
    171 1:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
    172 	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer	*/
    173 
    174 #define	MSTATE_TRANSITION(from, to)					\
    175 	movl	$to, %esi	/* 2nd argument: state being entered */	;\
    176 	movl	$from, %edi	/* 1st argument: state being left */	;\
    177 	call	syscall_mstate	/* syscall_mstate(from, to) */
    178 
    179 /*
    180  * Check to see if a simple (direct) return is possible i.e.
    181  *
    182  *	if (t->t_post_sys_ast | syscalltrace |
    183  *	    lwp->lwp_pcb.pcb_rupdate == 1)
    184  *		do full version	;
    185  *
    186  * Preconditions:
    187  * -	t is curthread
    188  * Postconditions:
    189  * -	condition code NE is set if post-sys is too complex
    190  * -	rtmp is zeroed if it isn't (we rely on this!)
    191  * -	ltmp is smashed
    192  */
    193 #define	CHECK_POSTSYS_NE(t, ltmp, rtmp)					\
    194 	movq	T_LWP(t), ltmp		/* ltmp = lwp of thread t */	;\
    195 	movzbl	PCB_RUPDATE(ltmp), rtmp	/* start from pcb_rupdate */	;\
    196 	orl	T_POST_SYS_AST(t), rtmp	/* fold in t_post_sys_ast */	;\
    197 	ORL_SYSCALLTRACE(rtmp)		/* and syscalltrace, if built */ ;\
    198 	cmpl	$0, rtmp		/* NE iff anything was set */
    199 
    200 /*
    201  * Fix up the lwp, thread, and eflags for a successful return
    202  *
    203  * Preconditions:
    204  * -	zwreg contains zero
         * -	%rsp points at the base of the saved regs (REGOFF_RFL is
         *	addressed relative to it)
         *
         * Effects:
         * -	the lwp state byte is set to LWP_USER
         * -	the thread's T_SYSNUM field is cleared by a 16-bit store of zwreg
         * -	the PS_C bit is cleared in the low byte of the saved user flags
         *	(presumably the carry flag that reports a syscall error to
         *	userland -- confirm against sys/psw.h)
         *
         * The flag update is a byte-wide andb, so the mask is written as an
         * 8-bit constant (0xff - PS_C).  Its low byte is the same as that of
         * the 16-bit form (0xffff - PS_C), but it no longer depends on the
         * assembler silently truncating an over-wide immediate.
    205  */
    206 #define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
    207 	movb	$LWP_USER, LWP_STATE(lwp);		\
    208 	movw	zwreg, T_SYSNUM(t);			\
    209 	andb	$_CONST(0xff - PS_C), REGOFF_RFL(%rsp)
    210 
    211 /*
    212  * ASSERT(lwptoregs(lwp) == rp);
    213  *
    214  * This may seem obvious, but very odd things happen if this
    215  * assertion is false
    216  *
    217  * Preconditions:
    218  *	(%rsp is ready for normal call sequence)
    219  * Postconditions (if assertion is true):
    220  *	%r11 is smashed
    221  *
    222  * ASSERT(rp->r_cs == descnum)
    223  *
    224  * The code selector is written into the regs structure when the
    225  * lwp stack is created.  We use this ASSERT to validate that
    226  * the regs structure really matches how we came in.
    227  *
    228  * Preconditions:
    229  *	(%rsp is ready for normal call sequence)
    230  * Postconditions (if assertion is true):
    231  *	-none-
    232  *
    233  * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0);
    234  *
    235  * If this is false, it means that we returned to userland without
    236  * updating the segment registers as we were supposed to.
    237  *
    238  * Note that we must ensure no interrupts or other traps intervene
    239  * between entering privileged mode and performing the assertion,
    240  * otherwise we may perform a context switch on the thread, which
    241  * will end up setting pcb_rupdate to 1 again.
    242  */
    243 #if defined(DEBUG)
    244 
    245 #if !defined(__lint)
    246 
        /*
         * Format strings passed to panic() by the assertion macros below.
         *
         * NOTE(review): no macro in the visible part of this file refers to
         * __codesel_msg -- confirm whether it is still needed.
         */
    247 __lwptoregs_msg:
    248 	.string	"syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]"
    249 
    250 __codesel_msg:
    251 	.string	"syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld"
    252 
    253 __no_rupdate_msg:
    254 	.string	"syscall_asm_amd64.s:%d lwp %p, pcb_rupdate != 0"
    255 
    256 #endif	/* !__lint */
    257 
        /*
         * Panic unless LWP_REGS(lwp) equals rp.
         *
         * When the assertion holds only %r11 and the condition codes are
         * modified.  Otherwise panic() is called with the format string,
         * __LINE__, lwp, the value read from LWP_REGS(lwp) and rp; %eax is
         * zeroed first because the call passes no vector-register arguments.
         * Uses local label 7.
         */
    258 #define	ASSERT_LWPTOREGS(lwp, rp)			\
    259 	movq	LWP_REGS(lwp), %r11;			\
    260 	cmpq	rp, %r11;				\
    261 	je	7f;					\
    262 	leaq	__lwptoregs_msg(%rip), %rdi;		\
    263 	movl	$__LINE__, %esi;			\
    264 	movq	lwp, %rdx;				\
    265 	movq	%r11, %rcx;				\
    266 	movq	rp, %r8;				\
    267 	xorl	%eax, %eax;				\
    268 	call	panic;					\
    269 7:
    270 
        /*
         * Panic if bit 0 of PCB_RUPDATE(lwp) is set.
         *
         * When the assertion holds only the condition codes are modified.
         * Otherwise panic() is called with the format string, __LINE__ and
         * lwp.  Uses local label 8.
         */
    271 #define	ASSERT_NO_RUPDATE_PENDING(lwp)			\
    272 	testb	$0x1, PCB_RUPDATE(lwp);			\
    273 	je	8f;					\
    274 	movq	lwp, %rdx;				\
    275 	leaq	__no_rupdate_msg(%rip), %rdi;		\
    276 	movl	$__LINE__, %esi;			\
    277 	xorl	%eax, %eax;				\
    278 	call	panic;					\
    279 8:
    280 
    281 #else
        /* Without DEBUG both assertions expand to nothing. */
    282 #define	ASSERT_LWPTOREGS(lwp, rp)
    283 #define	ASSERT_NO_RUPDATE_PENDING(lwp)
    284 #endif
    285 
    286 /*
    287  * Do the traptrace thing and restore any registers we used
    288  * in situ.  Assumes that %rsp is pointing at the base of
    289  * the struct regs, obviously ..
         *
         * SYSCALL_TRAPTRACE hands %rdi, %rbx/%ebx and %rcx to the TRACE_*
         * macros (defined elsewhere; presumably %rdi is the trace record
         * pointer and the others are scratch -- confirm).  Afterwards %rax,
         * %rbx, %rcx and %rdx are reloaded from the saved regs, the reloaded
         * %eax is stored in the record's TTR_SYSNUM field, and finally %rdi
         * itself is reloaded.
         *
         * SYSCALL_TRAPTRACE32 does the same and then ORs each of %eax, %ebx,
         * %ecx, %edx and %edi with itself; being 32-bit operations these leave
         * the low halves unchanged and zero the upper halves of the 64-bit
         * registers.
         *
         * Both macros expand to nothing unless TRAPTRACE is defined.
    290  */
    291 #ifdef TRAPTRACE
    292 #define	SYSCALL_TRAPTRACE(ttype)				\
    293 	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
    294 	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
    295 	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
    296 	movq	REGOFF_RAX(%rsp), %rax;				\
    297 	movq	REGOFF_RBX(%rsp), %rbx;				\
    298 	movq	REGOFF_RCX(%rsp), %rcx;				\
    299 	movq	REGOFF_RDX(%rsp), %rdx;				\
    300 	movl	%eax, TTR_SYSNUM(%rdi);				\
    301 	movq	REGOFF_RDI(%rsp), %rdi
    302 
    303 #define	SYSCALL_TRAPTRACE32(ttype)				\
    304 	SYSCALL_TRAPTRACE(ttype);				\
    305 	/* paranoia: clean the top 32-bits of the registers */	\
    306 	orl	%eax, %eax;					\
    307 	orl	%ebx, %ebx;					\
    308 	orl	%ecx, %ecx;					\
    309 	orl	%edx, %edx;					\
    310 	orl	%edi, %edi
    311 #else	/* TRAPTRACE */
    312 #define	SYSCALL_TRAPTRACE(ttype)
    313 #define	SYSCALL_TRAPTRACE32(ttype)
    314 #endif	/* TRAPTRACE */
    315 
    316 /*
    317  * The 64-bit libc syscall wrapper does this:
    318  *
    319  * fn(<args>)
    320  * {
    321  *	movq	%rcx, %r10	-- because syscall smashes %rcx
    322  *	movl	$CODE, %eax
    323  *	syscall
    324  *	<error processing>
    325  * }
    326  *
    327  * Thus when we come into the kernel:
    328  *
    329  *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
    330  *	%rax is the syscall number
    331  *	%r12-%r15 contain caller state
    332  *
    333  * The syscall instruction arranges that:
    334  *
    335  *	%rcx contains the return %rip
    336  *	%r11d contains bottom 32-bits of %rflags
    337  *	%rflags is masked (as determined by the SFMASK msr)
    338  *	%cs is set to UCS_SEL (as determined by the STAR msr)
    339  *	%ss is set to UDS_SEL (as determined by the STAR msr)
    340  *	%rip is set to sys_syscall (as determined by the LSTAR msr)
    341  *
    342  * Or in other words, we have no registers available at all.
    343  * Only swapgs can save us!
    344  *
    345  * Under the hypervisor, the swapgs has happened already.  However, the
    346  * state of the world is very different from that we're familiar with.
    347  *
    348  * In particular, we have a stack structure like that for interrupt
    349  * gates, except that the %cs and %ss registers are modified for reasons
    350  * that are not entirely clear.  Critically, the %rcx/%r11 values do
    351  * *not* reflect the usage of those registers under a 'real' syscall[1];
    352  * the stack, therefore, looks like this:
    353  *
    354  *	0x0(rsp)	potentially junk %rcx
    355  *	0x8(rsp)	potentially junk %r11
    356  *	0x10(rsp)	user %rip
    357  *	0x18(rsp)	modified %cs
    358  *	0x20(rsp)	user %rflags
    359  *	0x28(rsp)	user %rsp
    360  *	0x30(rsp)	modified %ss
    361  *
    362  *
    363  * and before continuing on, we must load the %rip into %rcx and the
    364  * %rflags into %r11.
    365  *
    366  * [1] They used to, and we relied on it, but this was broken in 3.1.1.
    367  * Sigh.
    368  */
    369 #if defined(__xpv)
        /*
         * Hypervisor build: load the user %rip (frame offset 0x10) into %rcx
         * and the user %rflags (0x20) into %r11, then switch %rsp to the user
         * stack pointer stored at 0x28, so that the register state matches
         * what the code following a native syscall entry expects.  %rsp must
         * be loaded last because the other two loads are relative to it.
         */
    370 #define	XPV_SYSCALL_PROD						\
    371 	movq	0x10(%rsp), %rcx;					\
    372 	movq	0x20(%rsp), %r11;					\
    373 	movq	0x28(%rsp), %rsp
    374 #else
        /* Native build: the hardware already left %rip in %rcx, %rflags in %r11. */
    375 #define	XPV_SYSCALL_PROD /* nothing */
    376 #endif
    377 
    378 #if defined(__lint)
    379 
    380 /*ARGSUSED*/
    381 void
    382 sys_syscall()
    383 {}
    384 
    385 void
    386 _allsyscalls()
    387 {}
    388 
    389 size_t _allsyscalls_size;
    390 
    391 #else	/* __lint */
    392 
        /*
         * 64-bit system call entry (syscall/sysretq).
         *
         * brand_sys_syscall runs the BRAND_CB_SYSCALL brand callback, if one
         * is registered, and then continues into sys_syscall: natively it
         * swaps back to the user gsbase and falls through; under __xpv it
         * jumps to noprod_sys_syscall, skipping the SWAPGS/XPV_SYSCALL_PROD
         * that it has already executed itself.
         *
         * sys_syscall saves the user state into the regs structure addressed
         * from T_STACK(curthread), calls the handler found in sysent[], and
         * returns to userland either directly with sysretq or, when pre- or
         * post-syscall processing is required, through post_syscall and
         * _sys_rtt.
         *
         * Register roles once the state has been saved:
         *	%r15		curthread
         *	%r14		lwp pointer (T_LWP(%r15))
         *	%rbp, %rsp	base of the saved regs
         *	%rbx		sysent entry of the current call
         *	%r12, %r13	handler return values (rval1, rval2)
         */
    393 	ENTRY_NP2(brand_sys_syscall,_allsyscalls)
    394 	SWAPGS				/* kernel gsbase */
    395 	XPV_SYSCALL_PROD
    396 	BRAND_CALLBACK(BRAND_CB_SYSCALL)
    397 	SWAPGS				/* user gsbase */
    398 
    399 #if defined(__xpv)
    400 	jmp	noprod_sys_syscall
    401 #endif
    402 
    403 	ALTENTRY(sys_syscall)
    404 	SWAPGS				/* kernel gsbase */
    405 	XPV_SYSCALL_PROD
    406 
    407 noprod_sys_syscall:
    408 
        	/*
        	 * No general register is free yet: park the user %r15 and %rsp
        	 * in the per-CPU scratch slots so that %r15 can carry curthread
        	 * while we move onto the thread's kernel stack.
        	 */
    409 	movq	%r15, %gs:CPU_RTMP_R15
    410 	movq	%rsp, %gs:CPU_RTMP_RSP
    411 
    412 	movq	%gs:CPU_THREAD, %r15
    413 	movq	T_STACK(%r15), %rsp	/* switch from user to kernel stack */
    414 
    415 	ASSERT_UPCALL_MASK_IS_SET
    416 
    417 	movl	$UCS_SEL, REGOFF_CS(%rsp)
    418 	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
    419 	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
    420 	movl	$UDS_SEL, REGOFF_SS(%rsp)
    421 
        	/* the 32-bit self-move below clears the upper half of %rax */
    422 	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
    423 	movq	%rdi, REGOFF_RDI(%rsp)
    424 	movq	%rsi, REGOFF_RSI(%rsp)
    425 	movq	%rdx, REGOFF_RDX(%rsp)
    426 	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
    427 	movq	%r10, %rcx			/* arg[3] for direct calls */
    428 
    429 	movq	%r8, REGOFF_R8(%rsp)
    430 	movq	%r9, REGOFF_R9(%rsp)
    431 	movq	%rax, REGOFF_RAX(%rsp)
    432 	movq	%rbx, REGOFF_RBX(%rsp)
    433 
    434 	movq	%rbp, REGOFF_RBP(%rsp)
    435 	movq	%r10, REGOFF_R10(%rsp)
        	/*
        	 * %r11 (saved as the user flags above) and %r10 (saved just
        	 * above) are now free, so they are used to copy the parked
        	 * user %rsp and %r15 into the regs structure.
        	 */
    436 	movq	%gs:CPU_RTMP_RSP, %r11
    437 	movq	%r11, REGOFF_RSP(%rsp)
    438 	movq	%r12, REGOFF_R12(%rsp)
    439 
    440 	movq	%r13, REGOFF_R13(%rsp)
    441 	movq	%r14, REGOFF_R14(%rsp)
    442 	movq	%gs:CPU_RTMP_R15, %r10
    443 	movq	%r10, REGOFF_R15(%rsp)
        	/*
        	 * NOTE(review): zeroing the saved fp/pc slots presumably
        	 * terminates kernel stack walks at this frame -- confirm.
        	 */
    444 	movq	$0, REGOFF_SAVFP(%rsp)
    445 	movq	$0, REGOFF_SAVPC(%rsp)
    446 
    447 	/*
    448 	 * Copy these registers here in case we end up stopped with
    449 	 * someone (like, say, /proc) messing with our register state.
    450 	 * We don't -restore- them unless we have to in update_sregs.
    451 	 *
    452 	 * Since userland -can't- change fsbase or gsbase directly,
    453 	 * and capturing them involves two serializing instructions,
    454 	 * we don't bother to capture them here.
    455 	 */
    456 	xorl	%ebx, %ebx
    457 	movw	%ds, %bx
    458 	movq	%rbx, REGOFF_DS(%rsp)
    459 	movw	%es, %bx
    460 	movq	%rbx, REGOFF_ES(%rsp)
    461 	movw	%fs, %bx
    462 	movq	%rbx, REGOFF_FS(%rsp)
    463 	movw	%gs, %bx
    464 	movq	%rbx, REGOFF_GS(%rsp)
    465 
    466 	/*
    467 	 * Machine state saved in the regs structure on the stack
    468 	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
    469 	 * %eax is the syscall number
    470 	 * %rsp is the thread's stack, %r15 is curthread
    471 	 * REGOFF_RSP(%rsp) is the user's stack
    472 	 */
    473 
    474 	SYSCALL_TRAPTRACE($TT_SYSC64)
    475 
        	/* %rbp keeps the base of the saved regs across the calls below */
    476 	movq	%rsp, %rbp
    477 
    478 	movq	T_LWP(%r15), %r14
    479 	ASSERT_NO_RUPDATE_PENDING(%r14)
    480 	ENABLE_INTR_FLAGS
    481 
    482 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
    483 	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */
    484 
    485 	ASSERT_LWPTOREGS(%r14, %rsp)
    486 
    487 	movb	$LWP_SYS, LWP_STATE(%r14)
    488 	incq	LWP_RU_SYSC(%r14)
    489 	movb	$NORMALRETURN, LWP_EOSYS(%r14)
    490 
    491 	incq	%gs:CPU_STATS_SYS_SYSCALL
    492 
        	/*
        	 * Record the syscall number in the thread and take the slow
        	 * path if T_PRE_SYS (or, when built in, syscalltrace) is set.
        	 */
    493 	movw	%ax, T_SYSNUM(%r15)
    494 	movzbl	T_PRE_SYS(%r15), %ebx
    495 	ORL_SYSCALLTRACE(%ebx)
    496 	testl	%ebx, %ebx
    497 	jne	_syscall_pre
    498 
    499 _syscall_invoke:
        	/*
        	 * Reload the six argument registers from the saved regs; the
        	 * calls made since entry may have changed them.
        	 */
    500 	movq	REGOFF_RDI(%rbp), %rdi
    501 	movq	REGOFF_RSI(%rbp), %rsi
    502 	movq	REGOFF_RDX(%rbp), %rdx
    503 	movq	REGOFF_RCX(%rbp), %rcx
    504 	movq	REGOFF_R8(%rbp), %r8
    505 	movq	REGOFF_R9(%rbp), %r9
    506 
        	/*
        	 * Reject out-of-range syscall numbers (unsigned compare), then
        	 * scale the number by the sysent entry size to form the address
        	 * of the table entry in %rbx.
        	 */
    507 	cmpl	$NSYSCALL, %eax
    508 	jae	_syscall_ill
    509 	shll	$SYSENT_SIZE_SHIFT, %eax
    510 	leaq	sysent(%rax), %rbx
    511 
    512 	call	*SY_CALLC(%rbx)
    513 
        	/* keep the return values in registers that survive calls */
    514 	movq	%rax, %r12
    515 	movq	%rdx, %r13
    516 
    517 	/*
    518 	 * If the handler returns two ints, then we need to split the
    519 	 * 64-bit return value into two 32-bit values.
    520 	 */
    521 	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
    522 	je	5f
    523 	movq	%r12, %r13
    524 	shrq	$32, %r13	/* upper 32-bits into %edx */
    525 	movl	%r12d, %r12d	/* lower 32-bits into %eax */
    526 5:
    527 	/*
    528 	 * Optimistically assume that there's no post-syscall
    529 	 * work to do.  (This is to avoid having to call syscall_mstate()
    530 	 * with interrupts disabled)
    531 	 */
    532 	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
    533 
    534 	/*
    535 	 * We must protect ourselves from being descheduled here;
    536 	 * If we were, and we ended up on another cpu, or another
    537 	 * lwp got in ahead of us, it could change the segment
    538 	 * registers without us noticing before we return to userland.
    539 	 */
        	/*
        	 * CLI is handed %r14 (presumably as scratch -- confirm);
        	 * CHECK_POSTSYS_NE reloads the lwp pointer into it and leaves
        	 * %ebx zero on the simple path, which is why %bx can serve as
        	 * the zero register for SIMPLE_SYSCALL_POSTSYS.
        	 */
    540 	CLI(%r14)
    541 	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
    542 	jne	_syscall_post
    543 	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
    544 
        	/* publish the return values so the restore below picks them up */
    545 	movq	%r12, REGOFF_RAX(%rsp)
    546 	movq	%r13, REGOFF_RDX(%rsp)
    547 
    548 	/*
    549 	 * To get back to userland, we need the return %rip in %rcx and
    550 	 * the return %rfl in %r11d.  The sysretq instruction also arranges
    551 	 * to fix up %cs and %ss; everything else is our responsibility.
    552 	 */
    553 	movq	REGOFF_RDI(%rsp), %rdi
    554 	movq	REGOFF_RSI(%rsp), %rsi
    555 	movq	REGOFF_RDX(%rsp), %rdx
    556 	/* %rcx used to restore %rip value */
    557 
    558 	movq	REGOFF_R8(%rsp), %r8
    559 	movq	REGOFF_R9(%rsp), %r9
    560 	movq	REGOFF_RAX(%rsp), %rax
    561 	movq	REGOFF_RBX(%rsp), %rbx
    562 
    563 	movq	REGOFF_RBP(%rsp), %rbp
    564 	movq	REGOFF_R10(%rsp), %r10
    565 	/* %r11 used to restore %rfl value */
    566 	movq	REGOFF_R12(%rsp), %r12
    567 
    568 	movq	REGOFF_R13(%rsp), %r13
    569 	movq	REGOFF_R14(%rsp), %r14
    570 	movq	REGOFF_R15(%rsp), %r15
    571 
    572 	movq	REGOFF_RIP(%rsp), %rcx
    573 	movl	REGOFF_RFL(%rsp), %r11d
    574 
        	/*
        	 * Last use of the regs structure.  Natively switch back to the
        	 * user stack; under __xpv leave %rsp pointing at the saved %rip
        	 * slot instead.
        	 */
    575 #if defined(__xpv)
    576 	addq	$REGOFF_RIP, %rsp
    577 #else
    578 	movq	REGOFF_RSP(%rsp), %rsp
    579 #endif
    580 
    581         /*
    582          * There can be no instructions between the ALTENTRY below and
    583 	 * SYSRET or we could end up breaking brand support. See label usage
    584          * in sn1_brand_syscall_callback for an example.
    585          */
    586 	ASSERT_UPCALL_MASK_IS_SET
    587 #if defined(__xpv)
    588 	SYSRETQ
    589         ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
    590 
    591 	/*
    592 	 * We can only get here after executing a brand syscall
    593 	 * interposition callback handler and simply need to
    594 	 * "sysretq" back to userland. On the hypervisor this
    595 	 * involves the iret hypercall which requires us to construct
    596 	 * just enough of the stack needed for the hypercall.
    597 	 * (rip, cs, rflags, rsp, ss).
    598 	 */
    599 	movq    %rsp, %gs:CPU_RTMP_RSP		/* save user's rsp */
    600 	movq	%gs:CPU_THREAD, %r11
    601 	movq	T_STACK(%r11), %rsp
    602 
    603 	movq	%rcx, REGOFF_RIP(%rsp)
    604 	movl	$UCS_SEL, REGOFF_CS(%rsp)
    605 	movq	%gs:CPU_RTMP_RSP, %r11
    606 	movq	%r11, REGOFF_RSP(%rsp)
    607 	pushfq
    608 	popq	%r11				/* hypercall enables ints */
    609 	movq	%r11, REGOFF_RFL(%rsp)
    610 	movl	$UDS_SEL, REGOFF_SS(%rsp)
    611 	addq	$REGOFF_RIP, %rsp
    612 	/*
    613 	 * XXPV: see comment in SYSRETQ definition for future optimization
    614 	 *       we could take.
    615 	 */
    616 	ASSERT_UPCALL_MASK_IS_SET
    617 	SYSRETQ
    618 #else
    619         ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
    620 	SWAPGS				/* user gsbase */
    621 	SYSRETQ
    622 #endif
    623         /*NOTREACHED*/
    624         SET_SIZE(nopop_sys_syscall_swapgs_sysretq)
    625 
        	/*
        	 * Slow path taken when T_PRE_SYS or syscalltrace is set.  A
        	 * nonzero return from pre_syscall skips the handler and becomes
        	 * rval1 (%r12).
        	 *
        	 * NOTE(review): on that abort path %r13 still holds the value
        	 * it had on entry when it is passed to post_syscall as rval2 --
        	 * confirm that post_syscall ignores it in that case.
        	 */
    626 _syscall_pre:
    627 	call	pre_syscall
    628 	movl	%eax, %r12d
    629 	testl	%eax, %eax
    630 	jne	_syscall_post_call
    631 	/*
    632 	 * Didn't abort, so reload the syscall args and invoke the handler.
    633 	 */
    634 	movzwl	T_SYSNUM(%r15), %eax
    635 	jmp	_syscall_invoke
    636 
        	/* syscall number >= NSYSCALL: use whatever nosys returns */
    637 _syscall_ill:
    638 	call	nosys
    639 	movq	%rax, %r12
    640 	movq	%rdx, %r13
    641 	jmp	_syscall_post_call
    642 
    643 _syscall_post:
    644 	STI
    645 	/*
    646 	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
    647 	 * so that we can account for the extra work it takes us to finish.
    648 	 */
    649 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
    650 _syscall_post_call:
        	/* post_syscall(rval1, rval2), then leave through _sys_rtt */
    651 	movq	%r12, %rdi
    652 	movq	%r13, %rsi
    653 	call	post_syscall
    654 	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
    655 	jmp	_sys_rtt
    656 	SET_SIZE(sys_syscall)
    657 	SET_SIZE(brand_sys_syscall)
    658 
    659 #endif	/* __lint */
    660 
    661 #if defined(__lint)
    662 
    663 /*ARGSUSED*/
    664 void
    665 sys_syscall32()
    666 {}
    667 
    668 #else	/* __lint */
    669 
        /*
         * 32-bit system call entry via the syscall instruction
         * (syscall/sysretl).
         *
         * brand_sys_syscall32 runs the BRAND_CB_SYSCALL32 brand callback, if
         * one is registered, and then continues into sys_syscall32: natively
         * it swaps back to the user gsbase and falls through; under __xpv it
         * jumps to nopop_sys_syscall32, skipping the SWAPGS/XPV_TRAP_POP that
         * it has already executed itself.
         *
         * sys_syscall32 saves the 32-bit user registers into the regs
         * structure addressed from T_STACK(curthread), has syscall_entry
         * fill an on-stack argument array, calls the handler, and returns
         * either directly with sysretl or, when post-syscall processing is
         * required, through syscall_exit and _sys_rtt.
         *
         * Register roles once the state has been saved:
         *	%r15		curthread
         *	%r14		lwp pointer (T_LWP(%r15))
         *	%rbp		base of the saved regs
         *	%rbx		value returned by syscall_entry
         *	%r12d, %r13d	low/high halves of the handler return value
         */
    670 	ENTRY_NP(brand_sys_syscall32)
    671 	SWAPGS				/* kernel gsbase */
    672 	XPV_TRAP_POP
    673 	BRAND_CALLBACK(BRAND_CB_SYSCALL32)
    674 	SWAPGS				/* user gsbase */
    675 
    676 #if defined(__xpv)
    677 	jmp	nopop_sys_syscall32
    678 #endif
    679 
    680 	ALTENTRY(sys_syscall32)
    681 	SWAPGS				/* kernel gsbase */
    682 
    683 #if defined(__xpv)
    684 	XPV_TRAP_POP
    685 nopop_sys_syscall32:
    686 #endif
    687 
        	/*
        	 * %r10 and %r15 are overwritten here without being saved.
        	 * The 32-bit moves zero-extend the user %esp into %r10 and
        	 * clear the upper half of %rax.
        	 */
    688 	movl	%esp, %r10d
    689 	movq	%gs:CPU_THREAD, %r15
    690 	movq	T_STACK(%r15), %rsp
    691 	movl	%eax, %eax
    692 
    693 	movl	$U32CS_SEL, REGOFF_CS(%rsp)
    694 	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
    695 	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
    696 	movq	%r10, REGOFF_RSP(%rsp)
    697 	movl	$UDS_SEL, REGOFF_SS(%rsp)
    698 
    699 _syscall32_save:
        	/*
        	 * These are 32-bit stores: only the low half of each slot in
        	 * the regs structure is written here.
        	 */
    700 	movl	%edi, REGOFF_RDI(%rsp)
    701 	movl	%esi, REGOFF_RSI(%rsp)
    702 	movl	%ebp, REGOFF_RBP(%rsp)
    703 	movl	%ebx, REGOFF_RBX(%rsp)
    704 	movl	%edx, REGOFF_RDX(%rsp)
    705 	movl	%ecx, REGOFF_RCX(%rsp)
    706 	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
    707 	movq	$0, REGOFF_SAVFP(%rsp)
    708 	movq	$0, REGOFF_SAVPC(%rsp)
    709 
    710 	/*
    711 	 * Copy these registers here in case we end up stopped with
    712 	 * someone (like, say, /proc) messing with our register state.
    713 	 * We don't -restore- them unless we have to in update_sregs.
    714 	 *
    715 	 * Since userland -can't- change fsbase or gsbase directly,
    716 	 * we don't bother to capture them here.
    717 	 */
    718 	xorl	%ebx, %ebx
    719 	movw	%ds, %bx
    720 	movq	%rbx, REGOFF_DS(%rsp)
    721 	movw	%es, %bx
    722 	movq	%rbx, REGOFF_ES(%rsp)
    723 	movw	%fs, %bx
    724 	movq	%rbx, REGOFF_FS(%rsp)
    725 	movw	%gs, %bx
    726 	movq	%rbx, REGOFF_GS(%rsp)
    727 
    728 	/*
    729 	 * Application state saved in the regs structure on the stack
    730 	 * %eax is the syscall number
    731 	 * %rsp is the thread's stack, %r15 is curthread
    732 	 * REGOFF_RSP(%rsp) is the user's stack
    733 	 */
    734 
    735 	SYSCALL_TRAPTRACE32($TT_SYSC)
    736 
        	/* %rbp keeps the base of the saved regs across the calls below */
    737 	movq	%rsp, %rbp
    738 
    739 	movq	T_LWP(%r15), %r14
    740 	ASSERT_NO_RUPDATE_PENDING(%r14)
    741 
    742 	ENABLE_INTR_FLAGS
    743 
    744 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
    745 	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */
    746 
    747 	ASSERT_LWPTOREGS(%r14, %rsp)
    748 
    749 	incq	 %gs:CPU_STATS_SYS_SYSCALL
    750 
    751 	/*
    752 	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
    753 	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
    754 	 * more succinctly:
    755 	 *
    756 	 *	SA(MAXSYSARGS * sizeof (long)) == 64
    757 	 */
    758 #define	SYS_DROP	64			/* drop for args */
    759 	subq	$SYS_DROP, %rsp
    760 	movb	$LWP_SYS, LWP_STATE(%r14)
        	/* syscall_entry(curthread, pointer to the argument area) */
    761 	movq	%r15, %rdi
    762 	movq	%rsp, %rsi
    763 	call	syscall_entry
    764 
    765 	/*
    766 	 * Fetch the arguments copied onto the kernel stack and put
    767 	 * them in the right registers to invoke a C-style syscall handler.
    768 	 * %rax contains the handler address.
    769 	 *
    770 	 * Ideas for making all this go faster of course include simply
    771 	 * forcibly fetching 6 arguments from the user stack under lofault
    772 	 * protection, reverting to copyin_args only when watchpoints
    773 	 * are in effect.
    774 	 *
    775 	 * (If we do this, make sure that exec and libthread leave
    776 	 * enough space at the top of the stack to ensure that we'll
    777 	 * never do a fetch from an invalid page.)
    778 	 *
    779 	 * Lots of ideas here, but they won't really help with bringup B-)
    780 	 * Correctness can't wait, performance can wait a little longer ..
    781 	 */
    782 
        	/*
        	 * NOTE(review): the code below calls through SY_CALLC(%rbx), so
        	 * the value returned in %rax is used as a pointer to a sysent
        	 * entry rather than as the handler address itself -- the
        	 * comment above looks imprecise; confirm against syscall_entry.
        	 *
        	 * Each argument is loaded with a 32-bit move from its 8-byte
        	 * slot, which zero-extends it into the 64-bit register.
        	 */
    783 	movq	%rax, %rbx
    784 	movl	0(%rsp), %edi
    785 	movl	8(%rsp), %esi
    786 	movl	0x10(%rsp), %edx
    787 	movl	0x18(%rsp), %ecx
    788 	movl	0x20(%rsp), %r8d
    789 	movl	0x28(%rsp), %r9d
    790 
    791 	call	*SY_CALLC(%rbx)
    792 
    793 	movq	%rbp, %rsp	/* pop the args */
    794 
    795 	/*
    796 	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
    797 	 * On the 32-bit kernel, they always return that value in %eax:%edx
    798 	 * as required by the 32-bit ABI.
    799 	 *
    800 	 * Simulate the same behaviour by unconditionally splitting the
    801 	 * return value in the same way.
    802 	 */
    803 	movq	%rax, %r13
    804 	shrq	$32, %r13	/* upper 32-bits into %edx */
    805 	movl	%eax, %r12d	/* lower 32-bits into %eax */
    806 
    807 	/*
    808 	 * Optimistically assume that there's no post-syscall
    809 	 * work to do.  (This is to avoid having to call syscall_mstate()
    810 	 * with interrupts disabled)
    811 	 */
    812 	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
    813 
    814 	/*
    815 	 * We must protect ourselves from being descheduled here;
    816 	 * If we were, and we ended up on another cpu, or another
    817 	 * lwp got in ahead of us, it could change the segment
    818 	 * registers without us noticing before we return to userland.
    819 	 */
        	/*
        	 * CHECK_POSTSYS_NE reloads the lwp pointer into %r14 and leaves
        	 * %ebx zero on the simple path, which is why %bx can serve as
        	 * the zero register for SIMPLE_SYSCALL_POSTSYS.
        	 */
    820 	CLI(%r14)
    821 	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
    822 	jne	_full_syscall_postsys32
    823 	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
    824 
    825 	/*
    826 	 * To get back to userland, we need to put the return %rip in %rcx and
    827 	 * the return %rfl in %r11d.  The sysret instruction also arranges
    828 	 * to fix up %cs and %ss; everything else is our responsibility.
    829 	 */
    830 
    831 	movl	%r12d, %eax			/* %eax: rval1 */
    832 	movl	REGOFF_RBX(%rsp), %ebx
    833 	/* %ecx used for return pointer */
    834 	movl	%r13d, %edx			/* %edx: rval2 */
    835 	movl	REGOFF_RBP(%rsp), %ebp
    836 	movl	REGOFF_RSI(%rsp), %esi
    837 	movl	REGOFF_RDI(%rsp), %edi
    838 
    839 	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
    840 	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
        	/* last use of the regs structure: back onto the user stack */
    841 	movl	REGOFF_RSP(%rsp), %esp
    842 
    843 	ASSERT_UPCALL_MASK_IS_SET
    844         ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
    845 	SWAPGS				/* user gsbase */
    846 	SYSRETL
    847         SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
    848 	/*NOTREACHED*/
    849 
    850 _full_syscall_postsys32:
    851 	STI
    852 	/*
    853 	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
    854 	 * so that we can account for the extra work it takes us to finish.
    855 	 */
    856 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
        	/* syscall_exit(curthread, rval1, rval2), then leave via _sys_rtt */
    857 	movq	%r15, %rdi
    858 	movq	%r12, %rsi			/* rval1 - %eax */
    859 	movq	%r13, %rdx			/* rval2 - %edx */
    860 	call	syscall_exit
    861 	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
    862 	jmp	_sys_rtt
    863 	SET_SIZE(sys_syscall32)
    864 	SET_SIZE(brand_sys_syscall32)
    865 
    866 #endif	/* __lint */
    867 
    868 /*
    869  * System call handler via the sysenter instruction
    870  * Used only for 32-bit system calls on the 64-bit kernel.
    871  *
    872  * The caller in userland has arranged that:
    873  *
    874  * -	%eax contains the syscall number
    875  * -	%ecx contains the user %esp
    876  * -	%edx contains the return %eip
    877  * -	the user stack contains the args to the syscall
    878  *
    879  * Hardware and (privileged) initialization code have arranged that by
    880  * the time the sysenter instructions completes:
    881  *
    882  * - %rip is pointing to sys_sysenter (below).
    883  * - %cs and %ss are set to kernel text and stack (data) selectors.
    884  * - %rsp is pointing at the lwp's stack
    885  * - interrupts have been disabled.
    886  *
    887  * Note that we are unable to return both "rvals" to userland with
    888  * this call, as %edx is used by the sysexit instruction.
    889  *
    890  * One final complication in this routine is its interaction with
    891  * single-stepping in a debugger.  For most of the system call mechanisms,
    892  * the CPU automatically clears the single-step flag before we enter the
    893  * kernel.  The sysenter mechanism does not clear the flag, so a user
    894  * single-stepping through a libc routine may suddenly find him/herself
    895  * single-stepping through the kernel.  To detect this, kmdb compares the
    896  * trap %pc to the [brand_]sys_sysenter addresses on each single-step trap.
    897  * If it finds that we have single-stepped to a sysenter entry point, it
    898  * explicitly clears the flag and executes the sys_sysenter routine.
    899  *
    900  * One final complication in this final complication is the fact that we
    901  * have two different entry points for sysenter: brand_sys_sysenter and
    902  * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
    903  * through the kernel with kmdb, we will eventually hit the instruction at
    904  * sys_sysenter.  kmdb cannot distinguish between that valid single-step
    905  * and the undesirable one mentioned above.  To avoid this situation, we
    906  * simply add a jump over the instruction at sys_sysenter to make it
    907  * impossible to single-step to it.
    908  */
    909 #if defined(__lint)
    910 
    911 void
    912 sys_sysenter()
    913 {}
    914 
    915 #else	/* __lint */
    916 
    917 	ENTRY_NP(brand_sys_sysenter)	/* entry 1 of 2: runs BRAND_CALLBACK first */
    918 	SWAPGS				/* kernel gsbase */
    919 	ALTENTRY(_brand_sys_sysenter_post_swapgs)
    920 	BRAND_CALLBACK(BRAND_CB_SYSENTER)
    921 	/*
    922 	 * Jump over sys_sysenter to allow single-stepping as described
    923 	 * above.
    924 	 */
    925 	jmp	_sys_sysenter_post_swapgs	/* gsbase was already swapped above */
    926 
    927 	ALTENTRY(sys_sysenter)		/* entry 2 of 2: no brand callback */
    928 	SWAPGS				/* kernel gsbase */
    929 
    930 	ALTENTRY(_sys_sysenter_post_swapgs)
    931 	movq	%gs:CPU_THREAD, %r15		/* %r15 = curthread */
    932 
    933 	movl	$U32CS_SEL, REGOFF_CS(%rsp)
    934 	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
    935 	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
    936 	pushfq
    937 	popq	%r10				/* %r10 = current rflags */
    938 	movl	$UDS_SEL, REGOFF_SS(%rsp)
    939 
    940 	/*
    941 	 * Set the interrupt flag before storing the flags to the
    942 	 * flags image on the stack so we can return to user with
    943 	 * interrupts enabled if we return via sys_rtt_syscall32
    944 	 */
    945 	orq	$PS_IE, %r10
    946 	movq	%r10, REGOFF_RFL(%rsp)
    947 
    948 	movl	%edi, REGOFF_RDI(%rsp)
    949 	movl	%esi, REGOFF_RSI(%rsp)
    950 	movl	%ebp, REGOFF_RBP(%rsp)
    951 	movl	%ebx, REGOFF_RBX(%rsp)
    952 	movl	%edx, REGOFF_RDX(%rsp)
    953 	movl	%ecx, REGOFF_RCX(%rsp)
    954 	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
    955 	movq	$0, REGOFF_SAVFP(%rsp)
    956 	movq	$0, REGOFF_SAVPC(%rsp)
    957 
    958 	/*
    959 	 * Copy these registers here in case we end up stopped with
    960 	 * someone (like, say, /proc) messing with our register state.
    961 	 * We don't -restore- them unless we have to in update_sregs.
    962 	 *
    963 	 * Since userland -can't- change fsbase or gsbase directly,
    964 	 * we don't bother to capture them here.
    965 	 */
    966 	xorl	%ebx, %ebx		/* clear %rbx: movw below writes only %bx */
    967 	movw	%ds, %bx
    968 	movq	%rbx, REGOFF_DS(%rsp)
    969 	movw	%es, %bx
    970 	movq	%rbx, REGOFF_ES(%rsp)
    971 	movw	%fs, %bx
    972 	movq	%rbx, REGOFF_FS(%rsp)
    973 	movw	%gs, %bx
    974 	movq	%rbx, REGOFF_GS(%rsp)
    975 
    976 	/*
    977 	 * Application state saved in the regs structure on the stack
    978 	 * %eax is the syscall number
    979 	 * %rsp is the thread's stack, %r15 is curthread
    980 	 * REG_RSP(%rsp) is the user's stack
    981 	 */
    982 
    983 	SYSCALL_TRAPTRACE($TT_SYSENTER)
    984 
    985 	movq	%rsp, %rbp		/* %rbp = regs frame; reloaded into %rsp after the handler */
    986 
    987 	movq	T_LWP(%r15), %r14	/* %r14 = curthread's lwp */
    988 	ASSERT_NO_RUPDATE_PENDING(%r14)
    989 
    990 	ENABLE_INTR_FLAGS
    991 
    992 	/*
    993 	 * Catch 64-bit process trying to issue sysenter instruction
    994 	 * on Nocona based systems.
    995 	 */
    996 	movq	LWP_PROCP(%r14), %rax	/* %rax = the lwp's proc */
    997 	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
    998 	je	7f			/* 32-bit process: carry on at 7: */
    999 
   1000 	/*
   1001 	 * For a non-32-bit process, simulate a #ud, since that's what
   1002 	 * native hardware does.  The traptrace entry (above) will
   1003 	 * let you know what really happened.
   1004 	 */
   1005 	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
   1006 	movq	REGOFF_CS(%rsp), %rdi
   1007 	movq	%rdi, REGOFF_ERR(%rsp)	/* saved %cs image doubles as the error code */
   1008 	movq	%rsp, %rdi		/* arg0: saved register frame */
   1009 	movq	REGOFF_RIP(%rsp), %rsi	/* arg1: saved user %rip */
   1010 	movl	%gs:CPU_ID, %edx	/* arg2: this cpu's id */
   1011 	call	trap
   1012 	jmp	_sys_rtt
   1013 7:
   1014 
   1015 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
   1016 	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */
   1017 
   1018 	ASSERT_LWPTOREGS(%r14, %rsp)
   1019 
   1020 	incq	%gs:CPU_STATS_SYS_SYSCALL
   1021 
   1022 	/*
   1023 	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
   1024 	 * placed into 64-bit (long) arg slots, plus one 64-bit
   1025 	 * (long) arg count, maintaining 16 byte alignment.
   1026 	 */
   1027 	subq	$SYS_DROP, %rsp
   1028 	movb	$LWP_SYS, LWP_STATE(%r14)
   1029 	movq	%r15, %rdi		/* arg0: curthread */
   1030 	movq	%rsp, %rsi		/* arg1: the arg space reserved above */
   1031 	call	syscall_entry
   1032 
   1033 	/*
   1034 	 * Fetch the arguments copied onto the kernel stack and put
   1035 	 * them in the right registers to invoke a C-style syscall handler.
   1036 	 * %rax contains the handler address.
   1037 	 */
   1038 	movq	%rax, %rbx		/* %rbx = value returned by syscall_entry */
   1039 	movl	0(%rsp), %edi		/* args 0-5: low 32 bits of each 8-byte slot */
   1040 	movl	8(%rsp), %esi
   1041 	movl	0x10(%rsp), %edx
   1042 	movl	0x18(%rsp), %ecx
   1043 	movl	0x20(%rsp), %r8d
   1044 	movl	0x28(%rsp), %r9d
   1045 
   1046 	call	*SY_CALLC(%rbx)
   1047 
   1048 	movq	%rbp, %rsp	/* pop the args */
   1049 
   1050 	/*
   1051 	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
   1052 	 * On the 32-bit kernel, they always return that value in %eax:%edx
   1053 	 * as required by the 32-bit ABI.
   1054 	 *
   1055 	 * Simulate the same behaviour by unconditionally splitting the
   1056 	 * return value in the same way.
   1057 	 */
   1058 	movq	%rax, %r13
   1059 	shrq	$32, %r13	/* upper 32-bits into %edx */
   1060 	movl	%eax, %r12d	/* lower 32-bits into %eax */
   1061 
   1062 	/*
   1063 	 * Optimistically assume that there's no post-syscall
   1064 	 * work to do.  (This is to avoid having to call syscall_mstate()
   1065 	 * with interrupts disabled)
   1066 	 */
   1067 	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
   1068 
   1069 	/*
   1070 	 * We must protect ourselves from being descheduled here;
   1071 	 * If we were, and we ended up on another cpu, or another
   1072 	 * lwp got in ahead of us, it could change the segment
   1073 	 * registers without us noticing before we return to userland.
   1074 	 */
   1075 	cli
   1076 	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
   1077 	jne	_full_syscall_postsys32	/* post-syscall work pending: slow path */
   1078 	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
   1079 
   1080 	/*
   1081 	 * To get back to userland, load up the 32-bit registers and
   1082 	 * sysexit back where we came from.
   1083 	 */
   1084 
   1085 	/*
   1086 	 * Interrupts will be turned on by the 'sti' executed just before
   1087 	 * sysexit.  The following ensures that restoring the user's rflags
   1088 	 * doesn't enable interrupts too soon.
   1089 	 */
   1090 	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)
   1091 
   1092 	/*
   1093 	 * (There's no point in loading up %edx because the sysexit
   1094 	 * mechanism smashes it.)
   1095 	 */
   1096 	movl	%r12d, %eax		/* rval1 (low half of the handler's %rax) */
   1097 	movl	REGOFF_RBX(%rsp), %ebx
   1098 	movl	REGOFF_RBP(%rsp), %ebp
   1099 	movl	REGOFF_RSI(%rsp), %esi
   1100 	movl	REGOFF_RDI(%rsp), %edi
   1101 
   1102 	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
   1103 	pushq	REGOFF_RFL(%rsp)
   1104 	popfq				/* load saved rflags (PS_IE cleared above) */
   1105 	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
   1106         ALTENTRY(sys_sysenter_swapgs_sysexit)
   1107 	swapgs				/* user gsbase */
   1108 	sti
   1109 	sysexit
   1110 	SET_SIZE(sys_sysenter_swapgs_sysexit)
   1111 	SET_SIZE(sys_sysenter)
   1112 	SET_SIZE(_sys_sysenter_post_swapgs)
   1113 	SET_SIZE(brand_sys_sysenter)
   1114 
   1115 #endif	/* __lint */
   1116 
   1117 #if defined(__lint)
   1118 /*
   1119  * System call via an int80.  This entry point is only used by the Linux
   1120  * application environment.  Unlike the other entry points, there is no
   1121  * default action to take if no callback is registered for this process.
   1122  */
   1123 void
   1124 sys_int80()
   1125 {}
   1126 
   1127 #else	/* __lint */
   1128 
   1129 	ENTRY_NP(brand_sys_int80)
   1130 	SWAPGS				/* kernel gsbase */
   1131 	XPV_TRAP_POP
   1132 	BRAND_CALLBACK(BRAND_CB_INT80)
   1133 	SWAPGS				/* user gsbase */
   1134 #if defined(__xpv)
   1135 	jmp	nopop_int80		/* XPV_TRAP_POP was already done above */
   1136 #endif
   1137 
   1138 	ENTRY_NP(sys_int80)
   1139 	/*
   1140 	 * We hit an int80, but this process isn't of a brand with an int80
   1141 	 * handler.  Bad process!  Make it look as if the INT failed.
   1142 	 * Modify %rip to point before the INT, push the expected error
   1143 	 * code and fake a GP fault. Note on 64-bit hypervisor we need
   1144 	 * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack
   1145 	 * because gptrap will pop them again with its own XPV_TRAP_POP.
   1146 	 */
   1147 #if defined(__xpv)
   1148 	XPV_TRAP_POP
   1149 nopop_int80:
   1150 #endif
   1151 	subq	$2, (%rsp)	/* int insn 2-bytes */
   1152 	pushq	$_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)	/* faked error code */
   1153 #if defined(__xpv)
   1154 	push	%r11			/* re-push for gptrap's XPV_TRAP_POP */
   1155 	push	%rcx
   1156 #endif
   1157 	jmp	gptrap			/ GP fault
   1158 	SET_SIZE(sys_int80)
   1159 	SET_SIZE(brand_sys_int80)
   1160 #endif	/* __lint */
   1161 
   1162 
   1163 /*
   1164  * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
   1165  * the generic i386 libc to do system calls. We do a small amount of setup
   1166  * before jumping into the existing sys_syscall32 path.
   1167  */
   1168 #if defined(__lint)
   1169 
   1170 /*ARGSUSED*/
   1171 void
   1172 sys_syscall_int()
   1173 {}
   1174 
   1175 #else	/* __lint */
   1176 
   1177 	ENTRY_NP(brand_sys_syscall_int)
   1178 	SWAPGS				/* kernel gsbase */
   1179 	XPV_TRAP_POP
   1180 	BRAND_CALLBACK(BRAND_CB_INT91)
   1181 	SWAPGS				/* user gsbase */
   1182 
   1183 #if defined(__xpv)
   1184 	jmp	nopop_syscall_int	/* XPV_TRAP_POP was already done above */
   1185 #endif
   1186 
   1187 	ALTENTRY(sys_syscall_int)
   1188 	SWAPGS				/* kernel gsbase */
   1189 
   1190 #if defined(__xpv)
   1191 	XPV_TRAP_POP
   1192 nopop_syscall_int:
   1193 #endif
   1194 
   1195 	movq	%gs:CPU_THREAD, %r15	/* %r15 = curthread */
   1196 	movq	T_STACK(%r15), %rsp	/* %rsp = the thread's T_STACK value */
   1197 	movl	%eax, %eax		/* zero the upper 32 bits of %rax */
   1198 	/*
   1199 	 * Set t_post_sys on this thread to force ourselves out via the slow
   1200 	 * path. It might be possible at some later date to optimize this out
   1201 	 * and use a faster return mechanism.
   1202 	 */
   1203 	movb	$1, T_POST_SYS(%r15)
   1204 	CLEAN_CS
   1205 	jmp	_syscall32_save		/* join the sys_syscall32 path */
   1206 	/*
   1207 	 * There should be no instructions between this label and SWAPGS/IRET
   1208 	 * or we could end up breaking branded zone support. See the usage of
   1209 	 * this label in lx_brand_int80_callback and sn1_brand_int91_callback
   1210 	 * for examples.
   1211 	 */
   1212         ALTENTRY(sys_sysint_swapgs_iret)
   1213 	SWAPGS				/* user gsbase */
   1214 	IRET
   1215 	/*NOTREACHED*/
   1216 	SET_SIZE(sys_sysint_swapgs_iret)
   1217 	SET_SIZE(sys_syscall_int)
   1218 	SET_SIZE(brand_sys_syscall_int)
   1219 
   1220 #endif	/* __lint */
   1221 
   1222 /*
   1223  * Legacy 32-bit applications and old libc implementations do lcalls;
   1224  * we should never get here because the LDT entry containing the syscall
   1225  * segment descriptor has the "segment present" bit cleared, which means
   1226  * we end up processing those system calls in trap() via a not-present trap.
   1227  *
   1228  * We do it this way because a call gate unhelpfully does -nothing- to the
   1229  * interrupt flag bit, so an interrupt can run us just after the lcall
   1230  * completes, but just before the swapgs takes effect.   Thus the INTR_PUSH and
   1231  * INTR_POP paths would have to be slightly more complex to dance around
   1232  * this problem, and end up depending explicitly on the first
   1233  * instruction of this handler being either swapgs or cli.
   1234  */
   1235 
   1236 #if defined(__lint)
   1237 
   1238 /*ARGSUSED*/
   1239 void
   1240 sys_lcall32()
   1241 {}
   1242 
   1243 #else	/* __lint */
   1244 
   1245 	ENTRY_NP(sys_lcall32)
   1246 	SWAPGS				/* kernel gsbase */
   1247 	pushq	$0			/* presumably a null return pc to end the stack trace */
   1248 	pushq	%rbp
   1249 	movq	%rsp, %rbp		/* set up a frame for panic's stack walk */
   1250 	leaq	__lcall_panic_str(%rip), %rdi	/* arg0: panic message */
   1251 	xorl	%eax, %eax		/* no vector args (panic is presumably variadic) */
   1252 	call	panic
   1253 	SET_SIZE(sys_lcall32)
   1254 
   1255 __lcall_panic_str:
   1256 	.string	"sys_lcall32: shouldn't be here!"
   1257 
   1258 /*
   1259  * Declare a uintptr_t which covers the entire pc range of syscall
   1260  * handlers for the stack walkers that need this.
   1261  */
   1262 	.align	CPTRSIZE
   1263 	.globl	_allsyscalls_size
   1264 	.type	_allsyscalls_size, @object
   1265 _allsyscalls_size:
   1266 	.NWORD	. - _allsyscalls	/* byte distance from _allsyscalls to here */
   1267 	SET_SIZE(_allsyscalls_size)
   1268 
   1269 #endif	/* __lint */
   1270 
   1271 /*
   1272  * These are the thread context handlers for lwps using sysenter/sysexit.
   1273  */
   1274 
   1275 #if defined(__lint)
   1276 
   1277 /*ARGSUSED*/
   1278 void
   1279 sep_save(void *ksp)
   1280 {}
   1281 
   1282 /*ARGSUSED*/
   1283 void
   1284 sep_restore(void *ksp)
   1285 {}
   1286 
   1287 #else	/* __lint */
   1288 
   1289 	/*
   1290 	 * Switch-away handler: write zero to MSR_INTC_SEP_ESP so that the
   1291 	 * stack-pointer-on-sysenter is NULL.  An lwp that (somehow) runs
   1292 	 * without having been sep_restore'd then cannot silently
   1293 	 * corrupt the stack of another (preempted) thread.
   1294 	 */
   1295 	ENTRY_NP(sep_save)
   1296 	movl	$MSR_INTC_SEP_ESP, %ecx	/* wrmsr: %ecx selects the MSR */
   1297 	xorl	%eax, %eax		/* low 32 bits of the value = 0 */
   1298 	xorl	%edx, %edx		/* high 32 bits of the value = 0 */
   1299 	wrmsr
   1300 	ret
   1301 	SET_SIZE(sep_save)
   1302 
   1303 	/*
   1304 	 * Resume handler: load this cpu's MSR_INTC_SEP_ESP from %rdi (ksp).
   1305 	 */
   1306 	ENTRY_NP(sep_restore)
   1307 	movl	$MSR_INTC_SEP_ESP, %ecx	/* wrmsr: %ecx selects the MSR */
   1308 	movl	%edi, %eax		/* low 32 bits of ksp */
   1309 	movq	%rdi, %rdx
   1310 	shrq	$32, %rdx		/* high 32 bits of ksp */
   1311 	wrmsr
   1312 	ret
   1313 	SET_SIZE(sep_restore)
   1314 
   1315 #endif	/* __lint */
   1316