Home | History | Annotate | Download | only in amd64
      1 /*
      2  * ---------------------------------------------------------------------------
      3  * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
      4  *
      5  * LICENSE TERMS
      6  *
      7  * The free distribution and use of this software is allowed (with or without
      8  * changes) provided that:
      9  *
     10  *  1. source code distributions include the above copyright notice, this
     11  *     list of conditions and the following disclaimer;
     12  *
     13  *  2. binary distributions include the above copyright notice, this list
     14  *     of conditions and the following disclaimer in their documentation;
     15  *
     16  *  3. the name of the copyright holder is not used to endorse products
     17  *     built using this software without specific written permission.
     18  *
     19  * DISCLAIMER
     20  *
     21  * This software is provided 'as is' with no explicit or implied warranties
     22  * in respect of its properties, including, but not limited to, correctness
     23  * and/or fitness for purpose.
     24  * ---------------------------------------------------------------------------
     25  * Issue 20/12/2007
     26  *
     27  * I am grateful to Dag Arne Osvik for many discussions of the techniques that
     28  * can be used to optimise AES assembler code on AMD64/EM64T architectures.
     29  * Some of the techniques used in this implementation are the result of
     30  * suggestions made by him for which I am most grateful.
     31  *
     32  * An AES implementation for AMD64 processors using the YASM assembler.  This
     33  * implementation provides encryption and decryption only and hence requires key
     34  * scheduling support in C. It uses 8k bytes of tables but its encryption and
     35  * decryption performance is very close to that obtained using large tables.
     36  * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
     37  * which are as follows:
     38  *               ms windows  gnu/linux/opensolaris os
     39  *
     40  *   in_blk          rcx     rdi
     41  *   out_blk         rdx     rsi
     42  *   context (cx)     r8     rdx
     43  *
     44  *   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
     45  *   registers       rdi      -      on both
     46  *
     47  *   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
     48  *   registers        -      rdi     on both
     49  *
     50  * The convention used here is that for gnu/linux/opensolaris os.
     51  *
     52  * This code provides the standard AES block size (128 bits, 16 bytes) and the
     53  * three standard AES key sizes (128, 192 and 256 bits). It has the same call
     54  * interface as my C implementation.  It uses the Microsoft C AMD64 calling
     55  * conventions in which the three parameters are placed in  rcx, rdx and r8
     56  * respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
     57  *
     58  * OpenSolaris Note:
     59  * Modified to use GNU/Linux/Solaris calling conventions.
     60  * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
     61  *
     62  *     AES_RETURN aes_encrypt(const unsigned char in_blk[],
     63  *                   unsigned char out_blk[], const aes_encrypt_ctx cx[1])/
     64  *
     65  *     AES_RETURN aes_decrypt(const unsigned char in_blk[],
     66  *                   unsigned char out_blk[], const aes_decrypt_ctx cx[1])/
     67  *
     68  *     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
     69  *                                            const aes_encrypt_ctx cx[1])/
     70  *
     71  *     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
     72  *                                            const aes_decrypt_ctx cx[1])/
     73  *
     74  *     AES_RETURN aes_encrypt_key(const unsigned char key[],
     75  *                           unsigned int len, const aes_encrypt_ctx cx[1])/
     76  *
     77  *     AES_RETURN aes_decrypt_key(const unsigned char key[],
     78  *                           unsigned int len, const aes_decrypt_ctx cx[1])/
     79  *
     80  * where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
     81  * either bits or bytes.
     82  *
     83  * Comment in/out the following lines to obtain the desired subroutines. These
     84  * selections MUST match those in the C header file aesopt.h
     85  */
     86 #define	AES_REV_DKS	  /* define if key decryption schedule is reversed */
     87 
     88 #define	LAST_ROUND_TABLES /* define for the faster version using extra tables */
     89 
     90 /*
     91  * The encryption key schedule has the following in memory layout where N is the
     92  * number of rounds (10, 12 or 14):
     93  *
     94  * lo: | input key (round 0)  |  / each round is four 32-bit words
     95  *     | encryption round 1   |
     96  *     | encryption round 2   |
     97  *     ....
     98  *     | encryption round N-1 |
     99  * hi: | encryption round N   |
    100  *
    101  * The decryption key schedule is normally set up so that it has the same
    102  * layout as above by actually reversing the order of the encryption key
    103  * schedule in memory (this happens when AES_REV_DKS is set):
    104  *
    105  * lo: | decryption round 0   | =              | encryption round N   |
    106  *     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
    107  *     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
    108  *     ....                       ....
    109  *     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
    110  * hi: | decryption round N   | =              | input key (round 0)  |
    111  *
    112  * with rounds except the first and last modified using inv_mix_column()
    113  * But if AES_REV_DKS is NOT set the order of keys is left as it is for
    114  * encryption so that it has to be accessed in reverse when used for
    115  * decryption (although the inverse mix column modifications are done)
    116  *
    117  * lo: | decryption round 0   | =              | input key (round 0)  |
    118  *     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
    119  *     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
    120  *     ....                       ....
    121  *     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
    122  * hi: | decryption round N   | =              | encryption round N   |
    123  *
    124  * This layout is faster when the assembler key scheduling provided here
    125  * is used.
    126  *
    127  * End of user defines
    128  */
    129 
    130 /*
    131  * ---------------------------------------------------------------------------
    132  * OpenSolaris OS modifications
    133  *
    134  * This source originates from Brian Gladman file aes_amd64.asm
    135  * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
    136  * with these changes:
    137  *
    138  * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
    139  * !__GNUC__ ifdefs.  Also removed ENCRYPTION, DECRYPTION,
    140  * AES_128, AES_192, AES_256, AES_VAR ifdefs.
    141  *
    142  * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
    143  *
    144  * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
    145  *
    146  * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
    147  * (operands reversed, literals prefixed with "$", registers prefixed with "%",
    148  * and "[register+offset]", addressing changed to "offset(register)",
    149  * parenthesis in constant expressions "()" changed to square brackets "[]",
    150  * "." removed from  local (numeric) labels, and other changes.
    151  * Examples:
    152  * Intel/yasm/nasm Syntax	ATT/OpenSolaris Syntax
    153  * mov	rax,(4*20h)		mov	$[4*0x20],%rax
    154  * mov	rax,[ebx+20h]		mov	0x20(%ebx),%rax
    155  * lea	rax,[ebx+ecx]		lea	(%ebx,%ecx),%rax
    156  * sub	rax,[ebx+ecx*4-20h]	sub	-0x20(%ebx,%ecx,4),%rax
    157  *
    158  * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
    159  * /usr/include/sys/asm_linkage.h, lint(1B) guards, EXPORT DELETE START
    160  * and EXPORT DELETE END markers, and dummy C function definitions for lint.
    161  *
    162  * 6. Renamed functions and reordered parameters to match OpenSolaris:
    163  * Original Gladman interface:
    164  *	int aes_encrypt(const unsigned char *in,
    165  *		unsigned char *out, const aes_encrypt_ctx cx[1])/
    166  *	int aes_decrypt(const unsigned char *in,
    167  *		unsigned char *out, const aes_decrypt_ctx cx[1])/
    168  * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
    169  * and a union type, inf., containing inf.l, a uint32_t and
    170  * inf.b, a 4-element array of uint32_t.  Only b[0] in the array (aka "l") is
    171  * used and contains the key schedule length * 16 where key schedule length is
    172  * 10, 12, or 14 bytes.
    173  *
    174  * OpenSolaris OS interface:
    175  *	void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
    176  *		const uint32_t pt[4], uint32_t ct[4])/
    177  *	void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
    178  *		const uint32_t ct[4], uint32_t pt[4])/
    179  *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/
    180  *		 uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/
    181  * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
    182  * ct is crypto text, and MAX_AES_NR is 14.
    183  * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
    184  */
    185 
    186 #if defined(lint) || defined(__lint)
    187 
    188 #include <sys/types.h>
        /*
         * Empty C definition compiled only in the lint branch of the
         * "#if defined(lint) || defined(__lint)" test.  It supplies the C
         * prototype of the assembly routine of the same name that is defined
         * in the #else branch of that test.
         */
    189 /* ARGSUSED */
    190 void
    191 aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
    192 	uint32_t ct[4]) {
    193 }
        /*
         * Empty C definition compiled only in the lint branch of the
         * "#if defined(lint) || defined(__lint)" test.  It supplies the C
         * prototype of the assembly routine of the same name that is defined
         * in the #else branch; here the input block is ct and the output pt.
         */
    194 /* ARGSUSED */
    195 void
    196 aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
    197 	uint32_t pt[4]) {
    198 }
    199 
    200 
    201 #else
    202 
    203 #include <sys/asm_linkage.h>
    204 
        /*
         * KS_LENGTH: size of the key schedule array in 32-bit words.  In this
         * listing it is used only as 4*KS_LENGTH(kptr) by the GLADMAN_INTERFACE
         * code, i.e. as the byte offset of the field that follows the array.
         */
    205 #define	KS_LENGTH	60
    206 
        /*
         * Names for the 32-bit halves of the legacy registers.  No use of
         * these aliases is visible in this listing.
         */
    207 #define	raxd		eax
    208 #define	rdxd		edx
    209 #define	rcxd		ecx
    210 #define	rbxd		ebx
    211 #define	rsid		esi
    212 #define	rdid		edi
    213 
        /*
         * Names for the low bytes of the legacy registers.  No use of these
         * aliases is visible in this listing.
         */
    214 #define	raxb		al
    215 #define	rdxb		dl
    216 #define	rcxb		cl
    217 #define	rbxb		bl
    218 #define	rsib		sil
    219 #define	rdib		dil
    220 
    221 / finite field multiplies by {02}, {04} and {08}
        /*
         * Constant expressions evaluated by the assembler; square brackets
         * are its grouping operator.  x is shifted left by 1, 2 or 3 bits and,
         * for each bit shifted past bit 7, a correspondingly shifted copy of
         * the reduction constant 0x11b is XORed in (the masks 1, 2 and 4
         * select the multiples 0x11b, 2*0x11b and 4*0x11b).
         */
    222 
    223 #define	f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]]
    224 #define	f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]
    225 #define	f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]
    226 
    227 / finite field multiplies required in table generation
        /*
         * Multiples by {03}, {09}, {0b}, {0d} and {0e}, each formed by XORing
         * the power-of-two multiples above (and x itself where needed).
         */
    228 
    229 #define	f3(x) [[f2(x)] ^ [x]]
    230 #define	f9(x) [[f8(x)] ^ [x]]
    231 #define	fb(x) [[f8(x)] ^ [f2(x)] ^ [x]]
    232 #define	fd(x) [[f8(x)] ^ [f4(x)] ^ [x]]
    233 #define	fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]]
    234 
    235 / macros for expanding S-box data
        /*
         * Each macro turns one S-box value x into the 8 bytes of one table
         * entry (they are passed as the argument of enc_vals()/dec_vals()).
         *   u8: {02,01,01,03} multiples of x, written twice.
         *   v8: {0e,09,0d,0b} multiples of x, written twice except that the
         *       final byte is x itself (the byte addressed by tab_i).
         *   w8: x followed by three zero bytes, written twice.
         * Writing the 4-byte pattern twice lets a 32-bit load at byte offset
         * 0..3 of an entry return the pattern rotated by that many bytes.
         */
    236 
    237 #define	u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]
    238 #define	v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]
    239 #define	w8(x) [x], 0, 0, 0, [x], 0, 0, 0
    240 
        /*
         * enc_vals(x): emits the 256 forward S-box values in index order,
         * eight per .byte directive, each wrapped in the macro named by x
         * (u8 or w8 in this file).  With an 8-byte expander the output is
         * 256 * 8 = 2048 bytes.
         */
    241 #define	enc_vals(x)	\
    242    .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
    243    .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
    244    .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
    245    .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
    246    .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
    247    .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
    248    .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
    249    .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
    250    .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
    251    .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
    252    .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
    253    .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
    254    .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
    255    .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
    256    .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
    257    .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
    258    .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
    259    .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
    260    .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
    261    .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
    262    .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
    263    .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
    264    .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
    265    .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
    266    .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
    267    .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
    268    .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
    269    .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
    270    .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
    271    .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
    272    .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
    273    .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
    274 
        /*
         * dec_vals(x): emits the 256 inverse S-box values in index order,
         * eight per .byte directive, each wrapped in the macro named by x
         * (v8 or w8 in this file).  With an 8-byte expander the output is
         * 256 * 8 = 2048 bytes.
         */
    275 #define	dec_vals(x) \
    276    .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
    277    .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
    278    .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
    279    .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
    280    .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
    281    .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
    282    .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
    283    .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
    284    .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
    285    .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
    286    .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
    287    .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
    288    .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
    289    .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
    290    .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
    291    .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
    292    .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
    293    .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
    294    .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
    295    .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
    296    .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
    297    .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
    298    .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
    299    .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
    300    .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
    301    .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
    302    .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
    303    .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
    304    .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
    305    .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
    306    .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
    307    .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
    308 
        /*
         * Register roles and round-key addressing.
         *
         * fk_ref(x, y) / ik_ref(x, y) form the memory operand for 32-bit word
         * y (0..3) of the key for round index x, relative to kptr.  Both entry
         * points first subtract fofs/rofs from kptr, so the bias in the
         * displacement cancels.  The encrypt entry point, and the decrypt one
         * when AES_REV_DKS is defined, also add Nr*16 to kptr; round indices
         * then count backwards from the last round key (x = 0), hence the
         * -16*x term.  Without AES_REV_DKS, ik_ref indexes forward from the
         * start of the schedule (16*x).
         * The macro arguments are not parenthesised; every use in this file
         * passes plain integer literals.
         */
    309 #define	tptr	%rbp	/* table pointer */
    310 #define	kptr	%r8	/* key schedule pointer */
    311 #define	fofs	128	/* adjust offset in key schedule to keep |disp| < 128 */
    312 #define	fk_ref(x, y)	-16*x+fofs+4*y(kptr)
    313 
    314 #ifdef	AES_REV_DKS
    315 #define	rofs		128
    316 #define	ik_ref(x, y)	-16*x+rofs+4*y(kptr)
    317 
    318 #else
    319 #define	rofs		-128
    320 #define	ik_ref(x, y)	16*x+rofs+4*y(kptr)
    321 #endif	/* AES_REV_DKS */
    322 
        /*
         * Memory operands into the table at tptr; x is a register holding a
         * byte value and is scaled by 8 because each table entry is 8 bytes.
         * tab_0..tab_3 start at byte offsets 0, 3, 2 and 1 of the entry; since
         * an entry holds its 4-byte pattern twice, a 32-bit access there
         * reads the pattern rotated by that many bytes.
         * tab_f (offset 1) and tab_i (offset 7) are used with movzx in the
         * last-round macros built without LAST_ROUND_TABLES; those offsets
         * hold the plain value x in the u8 and v8 layouts respectively.
         */
    323 #define	tab_0(x)	(tptr,x,8)
    324 #define	tab_1(x)	3(tptr,x,8)
    325 #define	tab_2(x)	2(tptr,x,8)
    326 #define	tab_3(x)	1(tptr,x,8)
    327 #define	tab_f(x)	1(tptr,x,8)
    328 #define	tab_i(x)	7(tptr,x,8)
    329 
    330 	/* EXPORT DELETE START */
        /*
         * ff_rnd: one table-driven forward round.
         *   p1..p4	32-bit registers that accumulate the new state words
         *   round	round index handed to fk_ref() to select the key words
         * On entry the state is in %eax, %ebx, %ecx, %edx.  p1..p4 are first
         * loaded with the four round-key words; each state byte then indexes
         * the table at tptr (via %rsi/%rdi) and the 32-bit value read through
         * tab_0..tab_3 is XORed into one of p1..p4.  Finally p1..p4 are copied
         * back to %eax..%edx for the next round.
         * Clobbers %esi, %edi, p1..p4 and the flags.
         */
    331 #define	ff_rnd(p1, p2, p3, p4, round)	/* normal forward round */ \
    332 	mov	fk_ref(round,0), p1; \
    333 	mov	fk_ref(round,1), p2; \
    334 	mov	fk_ref(round,2), p3; \
    335 	mov	fk_ref(round,3), p4; \
    336  /* %eax: byte 0 -> p1, byte 1 -> p4, byte 2 -> p3, byte 3 -> p2 */ \
    337 	movzx	%al, %esi; \
    338 	movzx	%ah, %edi; \
    339 	shr	$16, %eax; \
    340 	xor	tab_0(%rsi), p1; \
    341 	xor	tab_1(%rdi), p4; \
    342 	movzx	%al, %esi; \
    343 	movzx	%ah, %edi; \
    344 	xor	tab_2(%rsi), p3; \
    345 	xor	tab_3(%rdi), p2; \
    346  /* %ebx: byte 0 -> p2, byte 1 -> p1, byte 2 -> p4, byte 3 -> p3 */ \
    347 	movzx	%bl, %esi; \
    348 	movzx	%bh, %edi; \
    349 	shr	$16, %ebx; \
    350 	xor	tab_0(%rsi), p2; \
    351 	xor	tab_1(%rdi), p1; \
    352 	movzx	%bl, %esi; \
    353 	movzx	%bh, %edi; \
    354 	xor	tab_2(%rsi), p4; \
    355 	xor	tab_3(%rdi), p3; \
    356  /* %ecx: byte 0 -> p3, byte 1 -> p2, byte 2 -> p1, byte 3 -> p4 */ \
    357 	movzx	%cl, %esi; \
    358 	movzx	%ch, %edi; \
    359 	shr	$16, %ecx; \
    360 	xor	tab_0(%rsi), p3; \
    361 	xor	tab_1(%rdi), p2; \
    362 	movzx	%cl, %esi; \
    363 	movzx	%ch, %edi; \
    364 	xor	tab_2(%rsi), p1; \
    365 	xor	tab_3(%rdi), p4; \
    366  /* %edx: byte 0 -> p4, byte 1 -> p3, byte 2 -> p2, byte 3 -> p1 */ \
    367 	movzx	%dl, %esi; \
    368 	movzx	%dh, %edi; \
    369 	shr	$16, %edx; \
    370 	xor	tab_0(%rsi), p4; \
    371 	xor	tab_1(%rdi), p3; \
    372 	movzx	%dl, %esi; \
    373 	movzx	%dh, %edi; \
    374 	xor	tab_2(%rsi), p2; \
    375 	xor	tab_3(%rdi), p1; \
    376  /* new state back into the registers the next round reads */ \
    377 	mov	p1, %eax; \
    378 	mov	p2, %ebx; \
    379 	mov	p3, %ecx; \
    380 	mov	p4, %edx
    381 
    382 #ifdef	LAST_ROUND_TABLES
    383 
        /*
         * fl_rnd (LAST_ROUND_TABLES variant): final forward round.
         * Same byte-to-accumulator pattern as ff_rnd, but tptr is first
         * advanced by 2048 bytes so that the lookups hit the second table
         * (the one built with w8 after enc_tab).  The result is left in
         * p1..p4; it is not copied back to %eax..%edx.
         * Clobbers tptr, %eax..%edx, %esi, %edi, p1..p4 and the flags.
         */
    384 #define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
    385 	add	$2048, tptr; \
    386 	mov	fk_ref(round,0), p1; \
    387 	mov	fk_ref(round,1), p2; \
    388 	mov	fk_ref(round,2), p3; \
    389 	mov	fk_ref(round,3), p4; \
    390  /* %eax: byte 0 -> p1, byte 1 -> p4, byte 2 -> p3, byte 3 -> p2 */ \
    391 	movzx	%al, %esi; \
    392 	movzx	%ah, %edi; \
    393 	shr	$16, %eax; \
    394 	xor	tab_0(%rsi), p1; \
    395 	xor	tab_1(%rdi), p4; \
    396 	movzx	%al, %esi; \
    397 	movzx	%ah, %edi; \
    398 	xor	tab_2(%rsi), p3; \
    399 	xor	tab_3(%rdi), p2; \
    400  /* %ebx: byte 0 -> p2, byte 1 -> p1, byte 2 -> p4, byte 3 -> p3 */ \
    401 	movzx	%bl, %esi; \
    402 	movzx	%bh, %edi; \
    403 	shr	$16, %ebx; \
    404 	xor	tab_0(%rsi), p2; \
    405 	xor	tab_1(%rdi), p1; \
    406 	movzx	%bl, %esi; \
    407 	movzx	%bh, %edi; \
    408 	xor	tab_2(%rsi), p4; \
    409 	xor	tab_3(%rdi), p3; \
    410  /* %ecx: byte 0 -> p3, byte 1 -> p2, byte 2 -> p1, byte 3 -> p4 */ \
    411 	movzx	%cl, %esi; \
    412 	movzx	%ch, %edi; \
    413 	shr	$16, %ecx; \
    414 	xor	tab_0(%rsi), p3; \
    415 	xor	tab_1(%rdi), p2; \
    416 	movzx	%cl, %esi; \
    417 	movzx	%ch, %edi; \
    418 	xor	tab_2(%rsi), p1; \
    419 	xor	tab_3(%rdi), p4; \
    420  /* %edx: byte 0 -> p4, byte 1 -> p3, byte 2 -> p2, byte 3 -> p1 */ \
    421 	movzx	%dl, %esi; \
    422 	movzx	%dh, %edi; \
    423 	shr	$16, %edx; \
    424 	xor	tab_0(%rsi), p4; \
    425 	xor	tab_1(%rdi), p3; \
    426 	movzx	%dl, %esi; \
    427 	movzx	%dh, %edi; \
    428 	xor	tab_2(%rsi), p2; \
    429 	xor	tab_3(%rdi), p1
    430 
    431 #else
    432 
        /*
         * fl_rnd (variant without LAST_ROUND_TABLES): final forward round.
         * Instead of a second table, each state byte fetches the value at
         * tab_f() from the main table, which is rotated left by 0, 8, 16 or
         * 24 bits and XORed into the accumulator.  The byte-to-accumulator
         * pattern matches ff_rnd.  The result is left in p1..p4.
         * Clobbers %eax..%edx, %esi, %edi, p1..p4 and the flags.
         * NOTE(review): the movzx loads from tab_f() carry no size suffix;
         * presumably they are byte loads -- confirm with the assembler used.
         */
    433 #define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
    434 	mov	fk_ref(round,0), p1; \
    435 	mov	fk_ref(round,1), p2; \
    436 	mov	fk_ref(round,2), p3; \
    437 	mov	fk_ref(round,3), p4; \
    438  /* %eax: byte 0 -> p1, byte 1 -> p4, byte 2 -> p3, byte 3 -> p2 */ \
    439 	movzx	%al, %esi; \
    440 	movzx	%ah, %edi; \
    441 	shr	$16, %eax; \
    442 	movzx	tab_f(%rsi), %esi; \
    443 	movzx	tab_f(%rdi), %edi; \
    444 	xor	%esi, p1; \
    445 	rol	$8, %edi; \
    446 	xor	%edi, p4; \
    447 	movzx	%al, %esi; \
    448 	movzx	%ah, %edi; \
    449 	movzx	tab_f(%rsi), %esi; \
    450 	movzx	tab_f(%rdi), %edi; \
    451 	rol	$16, %esi; \
    452 	rol	$24, %edi; \
    453 	xor	%esi, p3; \
    454 	xor	%edi, p2; \
    455  /* %ebx: byte 0 -> p2, byte 1 -> p1, byte 2 -> p4, byte 3 -> p3 */ \
    456 	movzx	%bl, %esi; \
    457 	movzx	%bh, %edi; \
    458 	shr	$16, %ebx; \
    459 	movzx	tab_f(%rsi), %esi; \
    460 	movzx	tab_f(%rdi), %edi; \
    461 	xor	%esi, p2; \
    462 	rol	$8, %edi; \
    463 	xor	%edi, p1; \
    464 	movzx	%bl, %esi; \
    465 	movzx	%bh, %edi; \
    466 	movzx	tab_f(%rsi), %esi; \
    467 	movzx	tab_f(%rdi), %edi; \
    468 	rol	$16, %esi; \
    469 	rol	$24, %edi; \
    470 	xor	%esi, p4; \
    471 	xor	%edi, p3; \
    472  /* %ecx: byte 0 -> p3, byte 1 -> p2, byte 2 -> p1, byte 3 -> p4 */ \
    473 	movzx	%cl, %esi; \
    474 	movzx	%ch, %edi; \
    475 	movzx	tab_f(%rsi), %esi; \
    476 	movzx	tab_f(%rdi), %edi; \
    477 	shr	$16, %ecx; \
    478 	xor	%esi, p3; \
    479 	rol	$8, %edi; \
    480 	xor	%edi, p2; \
    481 	movzx	%cl, %esi; \
    482 	movzx	%ch, %edi; \
    483 	movzx	tab_f(%rsi), %esi; \
    484 	movzx	tab_f(%rdi), %edi; \
    485 	rol	$16, %esi; \
    486 	rol	$24, %edi; \
    487 	xor	%esi, p1; \
    488 	xor	%edi, p4; \
    489  /* %edx: byte 0 -> p4, byte 1 -> p3, byte 2 -> p2, byte 3 -> p1 */ \
    490 	movzx	%dl, %esi; \
    491 	movzx	%dh, %edi; \
    492 	movzx	tab_f(%rsi), %esi; \
    493 	movzx	tab_f(%rdi), %edi; \
    494 	shr	$16, %edx; \
    495 	xor	%esi, p4; \
    496 	rol	$8, %edi; \
    497 	xor	%edi, p3; \
    498 	movzx	%dl, %esi; \
    499 	movzx	%dh, %edi; \
    500 	movzx	tab_f(%rsi), %esi; \
    501 	movzx	tab_f(%rdi), %edi; \
    502 	rol	$16, %esi; \
    503 	rol	$24, %edi; \
    504 	xor	%esi, p2; \
    505 	xor	%edi, p1
    506 
    507 #endif	/* LAST_ROUND_TABLES */
    508 
        /*
         * ii_rnd: one table-driven inverse round.
         *   p1..p4	32-bit registers that accumulate the new state words
         *   round	round index handed to ik_ref() to select the key words
         * Structure is the same as ff_rnd, with the key words taken through
         * ik_ref() and the bytes of each state word spread over p1..p4 in the
         * opposite rotation direction.  p1..p4 are copied back to %eax..%edx
         * at the end.
         * Clobbers %esi, %edi, p1..p4 and the flags.
         */
    509 #define	ii_rnd(p1, p2, p3, p4, round)	/* normal inverse round */ \
    510 	mov	ik_ref(round,0), p1; \
    511 	mov	ik_ref(round,1), p2; \
    512 	mov	ik_ref(round,2), p3; \
    513 	mov	ik_ref(round,3), p4; \
    514  /* %eax: byte 0 -> p1, byte 1 -> p2, byte 2 -> p3, byte 3 -> p4 */ \
    515 	movzx	%al, %esi; \
    516 	movzx	%ah, %edi; \
    517 	shr	$16, %eax; \
    518 	xor	tab_0(%rsi), p1; \
    519 	xor	tab_1(%rdi), p2; \
    520 	movzx	%al, %esi; \
    521 	movzx	%ah, %edi; \
    522 	xor	tab_2(%rsi), p3; \
    523 	xor	tab_3(%rdi), p4; \
    524  /* %ebx: byte 0 -> p2, byte 1 -> p3, byte 2 -> p4, byte 3 -> p1 */ \
    525 	movzx	%bl, %esi; \
    526 	movzx	%bh, %edi; \
    527 	shr	$16, %ebx; \
    528 	xor	tab_0(%rsi), p2; \
    529 	xor	tab_1(%rdi), p3; \
    530 	movzx	%bl, %esi; \
    531 	movzx	%bh, %edi; \
    532 	xor	tab_2(%rsi), p4; \
    533 	xor	tab_3(%rdi), p1; \
    534  /* %ecx: byte 0 -> p3, byte 1 -> p4, byte 2 -> p1, byte 3 -> p2 */ \
    535 	movzx	%cl, %esi; \
    536 	movzx	%ch, %edi; \
    537 	shr	$16, %ecx; \
    538 	xor	tab_0(%rsi), p3; \
    539 	xor	tab_1(%rdi), p4; \
    540 	movzx	%cl, %esi; \
    541 	movzx	%ch, %edi; \
    542 	xor	tab_2(%rsi), p1; \
    543 	xor	tab_3(%rdi), p2; \
    544  /* %edx: byte 0 -> p4, byte 1 -> p1, byte 2 -> p2, byte 3 -> p3 */ \
    545 	movzx	%dl, %esi; \
    546 	movzx	%dh, %edi; \
    547 	shr	$16, %edx; \
    548 	xor	tab_0(%rsi), p4; \
    549 	xor	tab_1(%rdi), p1; \
    550 	movzx	%dl, %esi; \
    551 	movzx	%dh, %edi; \
    552 	xor	tab_2(%rsi), p2; \
    553 	xor	tab_3(%rdi), p3; \
    554  /* new state back into the registers the next round reads */ \
    555 	mov	p1, %eax; \
    556 	mov	p2, %ebx; \
    557 	mov	p3, %ecx; \
    558 	mov	p4, %edx
    559 
    560 #ifdef	LAST_ROUND_TABLES
    561 
        /*
         * il_rnd (LAST_ROUND_TABLES variant): final inverse round.
         * Same byte-to-accumulator pattern as ii_rnd, but tptr is first
         * advanced by 2048 bytes so that the lookups hit the second table
         * (the one built with w8 after dec_tab).  The result is left in
         * p1..p4; it is not copied back to %eax..%edx.
         * Clobbers tptr, %eax..%edx, %esi, %edi, p1..p4 and the flags.
         */
    562 #define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
    563 	add	$2048, tptr; \
    564 	mov	ik_ref(round,0), p1; \
    565 	mov	ik_ref(round,1), p2; \
    566 	mov	ik_ref(round,2), p3; \
    567 	mov	ik_ref(round,3), p4; \
    568  /* %eax: byte 0 -> p1, byte 1 -> p2, byte 2 -> p3, byte 3 -> p4 */ \
    569 	movzx	%al, %esi; \
    570 	movzx	%ah, %edi; \
    571 	shr	$16, %eax; \
    572 	xor	tab_0(%rsi), p1; \
    573 	xor	tab_1(%rdi), p2; \
    574 	movzx	%al, %esi; \
    575 	movzx	%ah, %edi; \
    576 	xor	tab_2(%rsi), p3; \
    577 	xor	tab_3(%rdi), p4; \
    578  /* %ebx: byte 0 -> p2, byte 1 -> p3, byte 2 -> p4, byte 3 -> p1 */ \
    579 	movzx	%bl, %esi; \
    580 	movzx	%bh, %edi; \
    581 	shr	$16, %ebx; \
    582 	xor	tab_0(%rsi), p2; \
    583 	xor	tab_1(%rdi), p3; \
    584 	movzx	%bl, %esi; \
    585 	movzx	%bh, %edi; \
    586 	xor	tab_2(%rsi), p4; \
    587 	xor	tab_3(%rdi), p1; \
    588  /* %ecx: byte 0 -> p3, byte 1 -> p4, byte 2 -> p1, byte 3 -> p2 */ \
    589 	movzx	%cl, %esi; \
    590 	movzx	%ch, %edi; \
    591 	shr	$16, %ecx; \
    592 	xor	tab_0(%rsi), p3; \
    593 	xor	tab_1(%rdi), p4; \
    594 	movzx	%cl, %esi; \
    595 	movzx	%ch, %edi; \
    596 	xor	tab_2(%rsi), p1; \
    597 	xor	tab_3(%rdi), p2; \
    598  /* %edx: byte 0 -> p4, byte 1 -> p1, byte 2 -> p2, byte 3 -> p3 */ \
    599 	movzx	%dl, %esi; \
    600 	movzx	%dh, %edi; \
    601 	shr	$16, %edx; \
    602 	xor	tab_0(%rsi), p4; \
    603 	xor	tab_1(%rdi), p1; \
    604 	movzx	%dl, %esi; \
    605 	movzx	%dh, %edi; \
    606 	xor	tab_2(%rsi), p2; \
    607 	xor	tab_3(%rdi), p3
    608 
    609 #else
    610 
        /*
         * il_rnd (variant without LAST_ROUND_TABLES): final inverse round.
         * Instead of a second table, each state byte fetches the value at
         * tab_i() from the main table, which is rotated left by 0, 8, 16 or
         * 24 bits and XORed into the accumulator.  The byte-to-accumulator
         * pattern matches ii_rnd.  The result is left in p1..p4.
         * Clobbers %eax..%edx, %esi, %edi, p1..p4 and the flags.
         * NOTE(review): the movzx loads from tab_i() carry no size suffix;
         * presumably they are byte loads -- confirm with the assembler used.
         */
    611 #define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
    612 	mov	ik_ref(round,0), p1; \
    613 	mov	ik_ref(round,1), p2; \
    614 	mov	ik_ref(round,2), p3; \
    615 	mov	ik_ref(round,3), p4; \
    616  /* %eax: byte 0 -> p1, byte 1 -> p2, byte 2 -> p3, byte 3 -> p4 */ \
    617 	movzx	%al, %esi; \
    618 	movzx	%ah, %edi; \
    619 	movzx	tab_i(%rsi), %esi; \
    620 	movzx	tab_i(%rdi), %edi; \
    621 	shr	$16, %eax; \
    622 	xor	%esi, p1; \
    623 	rol	$8, %edi; \
    624 	xor	%edi, p2; \
    625 	movzx	%al, %esi; \
    626 	movzx	%ah, %edi; \
    627 	movzx	tab_i(%rsi), %esi; \
    628 	movzx	tab_i(%rdi), %edi; \
    629 	rol	$16, %esi; \
    630 	rol	$24, %edi; \
    631 	xor	%esi, p3; \
    632 	xor	%edi, p4; \
    633  /* %ebx: byte 0 -> p2, byte 1 -> p3, byte 2 -> p4, byte 3 -> p1 */ \
    634 	movzx	%bl, %esi; \
    635 	movzx	%bh, %edi; \
    636 	movzx	tab_i(%rsi), %esi; \
    637 	movzx	tab_i(%rdi), %edi; \
    638 	shr	$16, %ebx; \
    639 	xor	%esi, p2; \
    640 	rol	$8, %edi; \
    641 	xor	%edi, p3; \
    642 	movzx	%bl, %esi; \
    643 	movzx	%bh, %edi; \
    644 	movzx	tab_i(%rsi), %esi; \
    645 	movzx	tab_i(%rdi), %edi; \
    646 	rol	$16, %esi; \
    647 	rol	$24, %edi; \
    648 	xor	%esi, p4; \
    649 	xor	%edi, p1; \
    650  /* %ecx: byte 0 -> p3, byte 1 -> p4, byte 2 -> p1, byte 3 -> p2 */ \
    651 	movzx	%cl, %esi; \
    652 	movzx	%ch, %edi; \
    653 	movzx	tab_i(%rsi), %esi; \
    654 	movzx	tab_i(%rdi), %edi; \
    655 	shr	$16, %ecx; \
    656 	xor	%esi, p3; \
    657 	rol	$8, %edi; \
    658 	xor	%edi, p4; \
    659 	movzx	%cl, %esi; \
    660 	movzx	%ch, %edi; \
    661 	movzx	tab_i(%rsi), %esi; \
    662 	movzx	tab_i(%rdi), %edi; \
    663 	rol	$16, %esi; \
    664 	rol	$24, %edi; \
    665 	xor	%esi, p1; \
    666 	xor	%edi, p2; \
    667  /* %edx: byte 0 -> p4, byte 1 -> p1, byte 2 -> p2, byte 3 -> p3 */ \
    668 	movzx	%dl, %esi; \
    669 	movzx	%dh, %edi; \
    670 	movzx	tab_i(%rsi), %esi; \
    671 	movzx	tab_i(%rdi), %edi; \
    672 	shr	$16, %edx; \
    673 	xor	%esi, p4; \
    674 	rol	$8, %edi; \
    675 	xor	%edi, p1; \
    676 	movzx	%dl, %esi; \
    677 	movzx	%dh, %edi; \
    678 	movzx	tab_i(%rsi), %esi; \
    679 	movzx	tab_i(%rdi), %edi; \
    680 	rol	$16, %esi; \
    681 	rol	$24, %edi; \
    682 	xor	%esi, p2; \
    683 	xor	%edi, p3
    684 
    685 #endif	/* LAST_ROUND_TABLES */
    686 	/* EXPORT DELETE END */
    687 
    688 /*
    689  * OpenSolaris OS:
    690  * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
    691  *	const uint32_t pt[4], uint32_t ct[4])/
    692  *
    693  * Original interface:
    694  * int aes_encrypt(const unsigned char *in,
    695  *	unsigned char *out, const aes_encrypt_ctx cx[1])/
    696  */
        /*
         * enc_tab: forward lookup table, 256 entries of 8 bytes (2048 bytes)
         * generated by enc_vals(u8), starting on a 64-byte boundary.  When
         * LAST_ROUND_TABLES is defined a second 2048-byte table generated by
         * enc_vals(w8) follows immediately; fl_rnd reaches it by adding 2048
         * to tptr.
         */
    697 	.align	64
    698 enc_tab:
    699 	enc_vals(u8)
    700 #ifdef	LAST_ROUND_TABLES
    701 	/ Last Round Tables:
    702 	enc_vals(w8)
    703 #endif
    704 
    705 
        /*
         * aes_encrypt_amd64 -- encrypt one 16-byte block.
         *
         * In (OpenSolaris interface, the default build):
         *	%rdi = key schedule, %esi = Nr (rounds),
         *	%rdx = input block, %rcx = output block
         * Register use in the body:
         *	kptr (%r8)	biased key schedule pointer
         *	tptr (%rbp)	enc_tab
         *	%eax..%edx	the four 32-bit state words
         *	%r9d..%r12d	accumulators for the next state
         *	%rsi, %rdi	table indices inside the round macros
         * Stack: 32 bytes holding the output pointer and the saved %rbx,
         * %rbp and %r12, which are restored before returning.
         * If Nr * 16 is not 160, 192 or 224 the routine sets %rax to -1 and
         * returns without writing the output block; otherwise %rax is 0.
         */
    706 	ENTRY_NP(aes_encrypt_amd64)
    707 	/* EXPORT DELETE START */
    708 #ifdef	GLADMAN_INTERFACE
    709 	/ Original interface
    710 	sub	$[4*8], %rsp	/ gnu/linux/opensolaris binary interface
    711 	mov	%rsi, (%rsp)	/ output pointer (P2)
    712 	mov	%rdx, %r8	/ context (P3)
    713 
    714 	mov	%rbx, 1*8(%rsp)	/ P1: input pointer in rdi
    715 	mov	%rbp, 2*8(%rsp)	/ P2: output pointer in (rsp)
    716 	mov	%r12, 3*8(%rsp)	/ P3: context in r8
        	/* length field located right after the KS_LENGTH schedule words */
    717 	movzx	4*KS_LENGTH(kptr), %esi	/ Get byte key length * 16
    718 
    719 #else
    720 	/ OpenSolaris OS interface
    721 	sub	$[4*8], %rsp	/ Make room on stack to save registers
    722 	mov	%rcx, (%rsp)	/ Save output pointer (P4) on stack
    723 	mov	%rdi, %r8	/ context (P1)
    724 	mov	%rdx, %rdi	/ P3: save input pointer
        	/* %esi = Nr * 16: byte offset of the last round key in the schedule */
    725 	shl	$4, %esi	/ P2: esi byte key length * 16
    726 
    727 	mov	%rbx, 1*8(%rsp)	/ Save registers
    728 	mov	%rbp, 2*8(%rsp)
    729 	mov	%r12, 3*8(%rsp)
    730 	/ P1: context in r8
    731 	/ P2: byte key length * 16 in esi
    732 	/ P3: input pointer in rdi
    733 	/ P4: output pointer in (rsp)
    734 #endif	/* GLADMAN_INTERFACE */
    735 
    736 	lea	enc_tab(%rip), tptr
        	/* bias kptr by -fofs; fofs(kptr) and fk_ref() add it back */
    737 	sub	$fofs, kptr
    738 
    739 	/ Load input block into registers
    740 	mov	(%rdi), %eax
    741 	mov	1*4(%rdi), %ebx
    742 	mov	2*4(%rdi), %ecx
    743 	mov	3*4(%rdi), %edx
    744 
        	/* initial key addition: XOR with the first four schedule words */
    745 	xor	fofs(kptr), %eax
    746 	xor	fofs+4(kptr), %ebx
    747 	xor	fofs+8(kptr), %ecx
    748 	xor	fofs+12(kptr), %edx
    749 
        	/* kptr += Nr*16, so fk_ref(r, ...) is the key r rounds before the last */
    750 	lea	(kptr,%rsi), kptr
    751 	/ Jump based on byte key length * 16:
    752 	cmp	$[10*16], %esi
    753 	je	3f
    754 	cmp	$[12*16], %esi
    755 	je	2f
    756 	cmp	$[14*16], %esi
    757 	je	1f
    758 	mov	$-1, %rax	/ error
    759 	jmp	4f
    760 
    761 	/ Perform normal forward rounds
        	/* 14 rounds enter at 1, 12 rounds at 2, 10 rounds at 3; all fall through */
    762 1:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
    763 	ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
    764 2:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
    765 	ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
    766 3:	ff_rnd(%r9d, %r10d, %r11d, %r12d,  9)
    767 	ff_rnd(%r9d, %r10d, %r11d, %r12d,  8)
    768 	ff_rnd(%r9d, %r10d, %r11d, %r12d,  7)
    769 	ff_rnd(%r9d, %r10d, %r11d, %r12d,  6)
    770 	ff_rnd(%r9d, %r10d, %r11d, %r12d,  5)
    771 	ff_rnd(%r9d, %r10d, %r11d, %r12d,  4)
    772 	ff_rnd(%r9d, %r10d, %r11d, %r12d,  3)
    773 	ff_rnd(%r9d, %r10d, %r11d, %r12d,  2)
    774 	ff_rnd(%r9d, %r10d, %r11d, %r12d,  1)
    775 	fl_rnd(%r9d, %r10d, %r11d, %r12d,  0)
    776 
    777 	/ Copy results
        	/* fl_rnd leaves the result in %r9d..%r12d; %rbx = saved output pointer */
    778 	mov	(%rsp), %rbx
    779 	mov	%r9d, (%rbx)
    780 	mov	%r10d, 4(%rbx)
    781 	mov	%r11d, 8(%rbx)
    782 	mov	%r12d, 12(%rbx)
        	/* %rax = 0 on the normal path */
    783 	xor	%rax, %rax
    784 4:	/ Restore registers
    785 	mov	1*8(%rsp), %rbx
    786 	mov	2*8(%rsp), %rbp
    787 	mov	3*8(%rsp), %r12
    788 	add	$[4*8], %rsp
    789 	/* EXPORT DELETE END */
    790 	ret
    791 
    792 	SET_SIZE(aes_encrypt_amd64)
    793 
    794 /*
    795  * OpenSolaris OS:
    796  * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
    797  *	const uint32_t ct[4], uint32_t pt[4])/
    798  *
    799  * Original interface:
    800  * int aes_decrypt(const unsigned char *in,
    801  *	unsigned char *out, const aes_decrypt_ctx cx[1])/
    802  */
        /*
         * dec_tab: inverse lookup table, 256 entries of 8 bytes (2048 bytes)
         * generated by dec_vals(v8), starting on a 64-byte boundary.  When
         * LAST_ROUND_TABLES is defined a second 2048-byte table generated by
         * dec_vals(w8) follows immediately; il_rnd reaches it by adding 2048
         * to tptr.
         */
    803 	.align	64
    804 dec_tab:
    805 	dec_vals(v8)
    806 #ifdef	LAST_ROUND_TABLES
    807 	/ Last Round Tables:
    808 	dec_vals(w8)
    809 #endif
    810 
    811 
        /*
         * aes_decrypt_amd64 -- decrypt one 16-byte block.
         *
         * In (OpenSolaris interface, the default build):
         *	%rdi = key schedule, %esi = Nr (rounds),
         *	%rdx = input block, %rcx = output block
         * Register use in the body:
         *	kptr (%r8)	biased key schedule pointer
         *	tptr (%rbp)	dec_tab
         *	%eax..%edx	the four 32-bit state words
         *	%r9d..%r12d	accumulators for the next state
         *	%rsi, %rdi	table indices inside the round macros
         * Stack: 32 bytes holding the output pointer and the saved %rbx,
         * %rbp and %r12, which are restored before returning.
         * If Nr * 16 is not 160, 192 or 224 the routine sets %rax to -1 and
         * returns without writing the output block; otherwise %rax is 0.
         */
    812 	ENTRY_NP(aes_decrypt_amd64)
    813 	/* EXPORT DELETE START */
    814 #ifdef	GLADMAN_INTERFACE
    815 	/ Original interface
    816 	sub	$[4*8], %rsp	/ gnu/linux/opensolaris binary interface
    817 	mov	%rsi, (%rsp)	/ output pointer (P2)
    818 	mov	%rdx, %r8	/ context (P3)
    819 
    820 	mov	%rbx, 1*8(%rsp)	/ P1: input pointer in rdi
    821 	mov	%rbp, 2*8(%rsp)	/ P2: output pointer in (rsp)
    822 	mov	%r12, 3*8(%rsp)	/ P3: context in r8
        	/* length field located right after the KS_LENGTH schedule words */
    823 	movzx	4*KS_LENGTH(kptr), %esi	/ Get byte key length * 16
    824 
    825 #else
    826 	/ OpenSolaris OS interface
    827 	sub	$[4*8], %rsp	/ Make room on stack to save registers
    828 	mov	%rcx, (%rsp)	/ Save output pointer (P4) on stack
    829 	mov	%rdi, %r8	/ context (P1)
    830 	mov	%rdx, %rdi	/ P3: save input pointer
        	/* %esi = Nr * 16: byte offset of the last round key in the schedule */
    831 	shl	$4, %esi	/ P2: esi byte key length * 16
    832 
    833 	mov	%rbx, 1*8(%rsp)	/ Save registers
    834 	mov	%rbp, 2*8(%rsp)
    835 	mov	%r12, 3*8(%rsp)
    836 	/ P1: context in r8
    837 	/ P2: byte key length * 16 in esi
    838 	/ P3: input pointer in rdi
    839 	/ P4: output pointer in (rsp)
    840 #endif	/* GLADMAN_INTERFACE */
    841 
    842 	lea	dec_tab(%rip), tptr
        	/* bias kptr by -rofs; rofs(...) and ik_ref() add it back */
    843 	sub	$rofs, kptr
    844 
    845 	/ Load input block into registers
    846 	mov	(%rdi), %eax
    847 	mov	1*4(%rdi), %ebx
    848 	mov	2*4(%rdi), %ecx
    849 	mov	3*4(%rdi), %edx
    850 
        	/*
        	 * Pick the key for the initial addition (addressed through %rdi):
        	 * with AES_REV_DKS it is the first four schedule words and kptr is
        	 * advanced by Nr*16 for ik_ref(); otherwise it is the four words at
        	 * byte offset Nr*16 and kptr stays at the (biased) schedule start.
        	 */
    851 #ifdef AES_REV_DKS
    852 	mov	kptr, %rdi
    853 	lea	(kptr,%rsi), kptr
    854 #else
    855 	lea	(kptr,%rsi), %rdi
    856 #endif
    857 
    858 	xor	rofs(%rdi), %eax
    859 	xor	rofs+4(%rdi), %ebx
    860 	xor	rofs+8(%rdi), %ecx
    861 	xor	rofs+12(%rdi), %edx
    862 
    863 	/ Jump based on byte key length * 16:
    864 	cmp	$[10*16], %esi
    865 	je	3f
    866 	cmp	$[12*16], %esi
    867 	je	2f
    868 	cmp	$[14*16], %esi
    869 	je	1f
    870 	mov	$-1, %rax	/ error
    871 	jmp	4f
    872 
    873 	/ Perform normal inverse rounds
        	/* 14 rounds enter at 1, 12 rounds at 2, 10 rounds at 3; all fall through */
    874 1:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
    875 	ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
    876 2:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
    877 	ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
    878 3:	ii_rnd(%r9d, %r10d, %r11d, %r12d,  9)
    879 	ii_rnd(%r9d, %r10d, %r11d, %r12d,  8)
    880 	ii_rnd(%r9d, %r10d, %r11d, %r12d,  7)
    881 	ii_rnd(%r9d, %r10d, %r11d, %r12d,  6)
    882 	ii_rnd(%r9d, %r10d, %r11d, %r12d,  5)
    883 	ii_rnd(%r9d, %r10d, %r11d, %r12d,  4)
    884 	ii_rnd(%r9d, %r10d, %r11d, %r12d,  3)
    885 	ii_rnd(%r9d, %r10d, %r11d, %r12d,  2)
    886 	ii_rnd(%r9d, %r10d, %r11d, %r12d,  1)
    887 	il_rnd(%r9d, %r10d, %r11d, %r12d,  0)
    888 
    889 	/ Copy results
        	/* il_rnd leaves the result in %r9d..%r12d; %rbx = saved output pointer */
    890 	mov	(%rsp), %rbx
    891 	mov	%r9d, (%rbx)
    892 	mov	%r10d, 4(%rbx)
    893 	mov	%r11d, 8(%rbx)
    894 	mov	%r12d, 12(%rbx)
        	/* %rax = 0 on the normal path */
    895 	xor	%rax, %rax
    896 4:	/ Restore registers
    897 	mov	1*8(%rsp), %rbx
    898 	mov	2*8(%rsp), %rbp
    899 	mov	3*8(%rsp), %r12
    900 	add	$[4*8], %rsp
    901 	/* EXPORT DELETE END */
    902 	ret
    903 
    904 	SET_SIZE(aes_decrypt_amd64)
    905 #endif	/* lint || __lint */
    906