Home | History | Annotate | Download | only in amd64
      1 #!/usr/bin/perl -w
      2 #
      3 # MD5 optimized for AMD64.
      4 #
      5 # Author: Marc Bevand <bevand_m (at) epita.fr>
      6 # Licence: I hereby disclaim the copyright on this code and place it
      7 # in the public domain.
      8 #
      9 
     10 #
     11 # The following is Marc Bevand's MD5 implementation optimized for
     12 # AMD64.  It has been lifted intact, except for changing the comment
     13 # character and adding comments.
     14 #
     15 # typedef struct {
     16 #	uint32_t state[4];	/* state (ABCD) */
     17 #	uint32_t count[2];	/* number of bits, modulo 2^64 (lsb first) */
     18 #	union	{
     19 #		uint8_t		buf8[64];	/* undigested input */
     20 #		uint32_t	buf32[16];	/* realigned input */
     21 #		} buf_un;
     22 #	} MD5_CTX;
     23 #
     24 # void md5_block_asm_host_order(MD5_CTX *ctx, const void *inpp,
     25 #        unsigned int input_length_in_blocks);
     26 #
     27 # Registers used:
     28 #	rax  A		r8  old A
     29 #	rbx  B		r9  old B
     30 #	rcx  C		r10 tmp
     31 #	rdx  D		r11 tmp
     32 #	rsi  ptr	r12 tmp
     33 #	rdi  end	r13 -
     34 #	rbp  -		r14 old C
     35 #	rsp  stack	r15 old D
     36 #
     37 
     38 use strict;
     39 my $code;
     40 
     41 
     42 # round1_step() does:
     43 #   dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
     44 #   %r10d = X[k_next]
     45 #   %r11d = z' (copy of z for the next step)
     46 # Each round1_step() takes about 5.3 clocks (9 instructions, 1.7 IPC)
     47 sub round1_step
     48 {
     49     my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
     50     $code .= "	mov	0*4(%rsi),	%r10d		/* (NEXT STEP) X[0] */\n" if ($pos == -1);
     51     $code .= "	mov	%edx,		%r11d		/* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
     52 
     53     $code .= <<EOF;
     54 	xor	$y,		%r11d		/* y ^ ... */
     55 	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
     56 	and	$x,		%r11d		/* x & ... */
     57 	xor	$z,		%r11d		/* z ^ ... */
     58 	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
     59 	add	%r11d,		$dst		/* dst += ... */
     60 	rol	\$$s,		$dst		/* dst <<< s */
     61 	mov	$y,		%r11d		/* (NEXT STEP) z' = $y */
     62 	add	$x,		$dst		/* dst += x */
     63 EOF
     64 }
     65 
     66 # round2_step() does:
     67 #   dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
     68 #   %r10d = X[k_next]
     69 #   %r11d = z' (copy of z for the next step)
     70 #   %r12d = z' (copy of z for the next step)
     71 # Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC)
     72 sub round2_step
     73 {
     74     my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
     75     $code .= "	mov	1*4(%rsi),	%r10d		/* (NEXT STEP) X[1] */\n" if ($pos == -1);
     76     $code .= "	mov	%edx,		%r11d		/* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
     77     $code .= "	mov	%edx,		%r12d		/* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
     78 
     79     $code .= <<EOF;
     80 	not	%r11d				/* not z */
     81 	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
     82 	and	$x,		%r12d		/* x & z */
     83 	and	$y,		%r11d		/* y & (not z) */
     84 	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
     85 	or	%r11d,		%r12d		/* (y & (not z)) | (x & z) */
     86 	mov	$y,		%r11d		/* (NEXT STEP) z' = $y */
     87 	add	%r12d,		$dst		/* dst += ... */
     88 	mov	$y,		%r12d		/* (NEXT STEP) z' = $y */
     89 	rol	\$$s,		$dst		/* dst <<< s */
     90 	add	$x,		$dst		/* dst += x */
     91 EOF
     92 }
     93 
     94 # round3_step() does:
     95 #   dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
     96 #   %r10d = X[k_next]
     97 #   %r11d = y' (copy of y for the next step)
     98 # Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC)
     99 sub round3_step
    100 {
    101     my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
    102     $code .= "	mov	5*4(%rsi),	%r10d		/* (NEXT STEP) X[5] */\n" if ($pos == -1);
    103     $code .= "	mov	%ecx,		%r11d		/* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
    104 
    105     $code .= <<EOF;
    106 	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
    107 	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
    108 	xor	$z,		%r11d		/* z ^ ... */
    109 	xor	$x,		%r11d		/* x ^ ... */
    110 	add	%r11d,		$dst		/* dst += ... */
    111 	rol	\$$s,		$dst		/* dst <<< s */
    112 	mov	$x,		%r11d		/* (NEXT STEP) y' = $x */
    113 	add	$x,		$dst		/* dst += x */
    114 EOF
    115 }
    116 
    117 # round4_step() does:
    118 #   dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
    119 #   %r10d = X[k_next]
    120 #   %r11d = not z' (copy of not z for the next step)
    121 # Each round4_step() takes about 5.2 clocks (9 instructions, 1.7 IPC)
    122 sub round4_step
    123 {
    124     my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
    125     $code .= "	mov	0*4(%rsi),	%r10d		/* (NEXT STEP) X[0] */\n" if ($pos == -1);
    126     $code .= "	mov	\$0xffffffff,	%r11d\n" if ($pos == -1);
    127     $code .= "	xor	%edx,		%r11d		/* (NEXT STEP) not z' = not %edx*/\n"
    128     if ($pos == -1);
    129 
    130     $code .= <<EOF;
    131 	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
    132 	or	$x,		%r11d		/* x | ... */
    133 	xor	$y,		%r11d		/* y ^ ... */
    134 	add	%r11d,		$dst		/* dst += ... */
    135 	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
    136 	mov	\$0xffffffff,	%r11d
    137 	rol	\$$s,		$dst		/* dst <<< s */
    138 	xor	$y,		%r11d		/* (NEXT STEP) not z' = not $y */
    139 	add	$x,		$dst		/* dst += x */
    140 EOF
    141 }
    142 
    143 
    144 #
    145 # Execution begins here.
    146 #
    147 
    148 my $output = shift;
    149 open STDOUT,">$output" or die "can't open $output: $!";
    150 
    151 $code .= <<EOF;
    152 #if defined(lint) || defined(__lint)
    153 #include <sys/md5.h>
    154 
    155 /* ARGSUSED */
    156 void md5_block_asm_host_order(MD5_CTX *ctx, const void *inpp,
    157     unsigned int input_length_in_blocks)
    158 {
    159 }
    160 
    161 #else
    162 #include <sys/asm_linkage.h>
    163 
    164 	ENTRY_NP(md5_block_asm_host_order)
    165 	push	%rbp
    166 	push	%rbx
    167 	push	%r12
    168 	push	%r13
    169 	push	%r14
    170 	push	%r15
    171 
    172 	/ rdi = arg #1 (ctx, MD5_CTX pointer)
    173 	/ rsi = arg #2 (ptr, data pointer)
    174 	/ rdx = arg #3 (nbr, number of 64-byte blocks to process)
    175 	mov	%rdi,		%rbp	/ rbp = ctx
    176 	shl	\$6,		%rdx	/ rdx = nbr in bytes
    177 	lea	(%rsi,%rdx),	%rdi	/ rdi = end
    178 	mov	0*4(%rbp),	%eax	/ eax = ctx->A
    179 	mov	1*4(%rbp),	%ebx	/ ebx = ctx->B
    180 	mov	2*4(%rbp),	%ecx	/ ecx = ctx->C
    181 	mov	3*4(%rbp),	%edx	/ edx = ctx->D
    182 	push	%rbp			/ save ctx
    183 	/ end is 'rdi'
    184 	/ ptr is 'rsi'
    185 	/ A is 'eax'
    186 	/ B is 'ebx'
    187 	/ C is 'ecx'
    188 	/ D is 'edx'
    189 
    190 	cmp	%rdi,		%rsi		/ cmp end with ptr
    191 	je	1f				/ jmp if ptr == end
    192 
    193 	/ BEGIN of loop over 64-byte blocks
    194 2:	/ save old values of A, B, C, D
    195 	mov	%eax,		%r8d
    196 	mov	%ebx,		%r9d
    197 	mov	%ecx,		%r14d
    198 	mov	%edx,		%r15d
    199 EOF
    200 round1_step(-1,'%eax','%ebx','%ecx','%edx', '1','0xd76aa478', '7');
    201 round1_step( 0,'%edx','%eax','%ebx','%ecx', '2','0xe8c7b756','12');
    202 round1_step( 0,'%ecx','%edx','%eax','%ebx', '3','0x242070db','17');
    203 round1_step( 0,'%ebx','%ecx','%edx','%eax', '4','0xc1bdceee','22');
    204 round1_step( 0,'%eax','%ebx','%ecx','%edx', '5','0xf57c0faf', '7');
    205 round1_step( 0,'%edx','%eax','%ebx','%ecx', '6','0x4787c62a','12');
    206 round1_step( 0,'%ecx','%edx','%eax','%ebx', '7','0xa8304613','17');
    207 round1_step( 0,'%ebx','%ecx','%edx','%eax', '8','0xfd469501','22');
    208 round1_step( 0,'%eax','%ebx','%ecx','%edx', '9','0x698098d8', '7');
    209 round1_step( 0,'%edx','%eax','%ebx','%ecx','10','0x8b44f7af','12');
    210 round1_step( 0,'%ecx','%edx','%eax','%ebx','11','0xffff5bb1','17');
    211 round1_step( 0,'%ebx','%ecx','%edx','%eax','12','0x895cd7be','22');
    212 round1_step( 0,'%eax','%ebx','%ecx','%edx','13','0x6b901122', '7');
    213 round1_step( 0,'%edx','%eax','%ebx','%ecx','14','0xfd987193','12');
    214 round1_step( 0,'%ecx','%edx','%eax','%ebx','15','0xa679438e','17');
    215 round1_step( 1,'%ebx','%ecx','%edx','%eax', '0','0x49b40821','22');
    216 
    217 round2_step(-1,'%eax','%ebx','%ecx','%edx', '6','0xf61e2562', '5');
    218 round2_step( 0,'%edx','%eax','%ebx','%ecx','11','0xc040b340', '9');
    219 round2_step( 0,'%ecx','%edx','%eax','%ebx', '0','0x265e5a51','14');
    220 round2_step( 0,'%ebx','%ecx','%edx','%eax', '5','0xe9b6c7aa','20');
    221 round2_step( 0,'%eax','%ebx','%ecx','%edx','10','0xd62f105d', '5');
    222 round2_step( 0,'%edx','%eax','%ebx','%ecx','15', '0x2441453', '9');
    223 round2_step( 0,'%ecx','%edx','%eax','%ebx', '4','0xd8a1e681','14');
    224 round2_step( 0,'%ebx','%ecx','%edx','%eax', '9','0xe7d3fbc8','20');
    225 round2_step( 0,'%eax','%ebx','%ecx','%edx','14','0x21e1cde6', '5');
    226 round2_step( 0,'%edx','%eax','%ebx','%ecx', '3','0xc33707d6', '9');
    227 round2_step( 0,'%ecx','%edx','%eax','%ebx', '8','0xf4d50d87','14');
    228 round2_step( 0,'%ebx','%ecx','%edx','%eax','13','0x455a14ed','20');
    229 round2_step( 0,'%eax','%ebx','%ecx','%edx', '2','0xa9e3e905', '5');
    230 round2_step( 0,'%edx','%eax','%ebx','%ecx', '7','0xfcefa3f8', '9');
    231 round2_step( 0,'%ecx','%edx','%eax','%ebx','12','0x676f02d9','14');
    232 round2_step( 1,'%ebx','%ecx','%edx','%eax', '0','0x8d2a4c8a','20');
    233 
    234 round3_step(-1,'%eax','%ebx','%ecx','%edx', '8','0xfffa3942', '4');
    235 round3_step( 0,'%edx','%eax','%ebx','%ecx','11','0x8771f681','11');
    236 round3_step( 0,'%ecx','%edx','%eax','%ebx','14','0x6d9d6122','16');
    237 round3_step( 0,'%ebx','%ecx','%edx','%eax', '1','0xfde5380c','23');
    238 round3_step( 0,'%eax','%ebx','%ecx','%edx', '4','0xa4beea44', '4');
    239 round3_step( 0,'%edx','%eax','%ebx','%ecx', '7','0x4bdecfa9','11');
    240 round3_step( 0,'%ecx','%edx','%eax','%ebx','10','0xf6bb4b60','16');
    241 round3_step( 0,'%ebx','%ecx','%edx','%eax','13','0xbebfbc70','23');
    242 round3_step( 0,'%eax','%ebx','%ecx','%edx', '0','0x289b7ec6', '4');
    243 round3_step( 0,'%edx','%eax','%ebx','%ecx', '3','0xeaa127fa','11');
    244 round3_step( 0,'%ecx','%edx','%eax','%ebx', '6','0xd4ef3085','16');
    245 round3_step( 0,'%ebx','%ecx','%edx','%eax', '9', '0x4881d05','23');
    246 round3_step( 0,'%eax','%ebx','%ecx','%edx','12','0xd9d4d039', '4');
    247 round3_step( 0,'%edx','%eax','%ebx','%ecx','15','0xe6db99e5','11');
    248 round3_step( 0,'%ecx','%edx','%eax','%ebx', '2','0x1fa27cf8','16');
    249 round3_step( 1,'%ebx','%ecx','%edx','%eax', '0','0xc4ac5665','23');
    250 
    251 round4_step(-1,'%eax','%ebx','%ecx','%edx', '7','0xf4292244', '6');
    252 round4_step( 0,'%edx','%eax','%ebx','%ecx','14','0x432aff97','10');
    253 round4_step( 0,'%ecx','%edx','%eax','%ebx', '5','0xab9423a7','15');
    254 round4_step( 0,'%ebx','%ecx','%edx','%eax','12','0xfc93a039','21');
    255 round4_step( 0,'%eax','%ebx','%ecx','%edx', '3','0x655b59c3', '6');
    256 round4_step( 0,'%edx','%eax','%ebx','%ecx','10','0x8f0ccc92','10');
    257 round4_step( 0,'%ecx','%edx','%eax','%ebx', '1','0xffeff47d','15');
    258 round4_step( 0,'%ebx','%ecx','%edx','%eax', '8','0x85845dd1','21');
    259 round4_step( 0,'%eax','%ebx','%ecx','%edx','15','0x6fa87e4f', '6');
    260 round4_step( 0,'%edx','%eax','%ebx','%ecx', '6','0xfe2ce6e0','10');
    261 round4_step( 0,'%ecx','%edx','%eax','%ebx','13','0xa3014314','15');
    262 round4_step( 0,'%ebx','%ecx','%edx','%eax', '4','0x4e0811a1','21');
    263 round4_step( 0,'%eax','%ebx','%ecx','%edx','11','0xf7537e82', '6');
    264 round4_step( 0,'%edx','%eax','%ebx','%ecx', '2','0xbd3af235','10');
    265 round4_step( 0,'%ecx','%edx','%eax','%ebx', '9','0x2ad7d2bb','15');
    266 round4_step( 1,'%ebx','%ecx','%edx','%eax', '0','0xeb86d391','21');
    267 $code .= <<EOF;
    268 	/ add old values of A, B, C, D
    269 	add	%r8d,	%eax
    270 	add	%r9d,	%ebx
    271 	add	%r14d,	%ecx
    272 	add	%r15d,	%edx
    273 
    274 	/ loop control
    275 	add	\$64,		%rsi		/ ptr += 64
    276 	cmp	%rdi,		%rsi		/ cmp end with ptr
    277 	jb	2b				/ jmp if ptr < end
    278 	/ END of loop over 64-byte blocks
    279 
    280 1:	pop	%rbp				/ restore ctx
    281 	mov	%eax,		0*4(%rbp)	/ ctx->A = A
    282 	mov	%ebx,		1*4(%rbp)	/ ctx->B = B
    283 	mov	%ecx,		2*4(%rbp)	/ ctx->C = C
    284 	mov	%edx,		3*4(%rbp)	/ ctx->D = D
    285 
    286 	pop	%r15
    287 	pop	%r14
    288 	pop	%r13
    289 	pop	%r12
    290 	pop	%rbx
    291 	pop	%rbp
    292 	ret
    293 	SET_SIZE(md5_block_asm_host_order)
    294 
    295 #endif /* lint || __lint */
    296 EOF
    297 
    298 print $code;
    299