Home | History | Annotate | Download | only in amd64
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # sha1_block procedure for x86_64.
     11 #
     12 # It was brought to my attention that on EM64T compiler-generated code
     13 # was far behind 32-bit assembler implementation. This is unlike on
     14 # Opteron where compiler-generated code was only 15% behind 32-bit
     15 # assembler, which originally made it hard to motivate the effort.
     16 # There was suggestion to mechanically translate 32-bit code, but I
     17 # dismissed it, reasoning that x86_64 offers enough register bank
     18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
     19 # implementation:-) However! While 64-bit code does perform better
     20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
     21 # x86_64 does offer larger *addressable* bank, but out-of-order core
     22 # reaches for even more registers through dynamic aliasing, and EM64T
     23 # core must have managed to run-time optimize even 32-bit code just as
     24 # good as 64-bit one. Performance improvement is summarized in the
     25 # following table:
     26 #
     27 #		gcc 3.4		32-bit asm	cycles/byte
     28 # Opteron	+45%		+20%		6.8
     29 # Xeon P4	+65%		+0%		9.9
     30 # Core2		+60%		+10%		7.0
     31 
     32 #
     33 # OpenSolaris OS modifications
     34 #
     35 # Sun elects to use this software under the BSD license.
     36 #
     37 # This source originates from OpenSSL file sha1-x86_64.pl at
     38 # ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
     39 # (presumably for future OpenSSL release 0.9.8h), with these changes:
     40 #
     41 # 1. Added perl "use strict" and declared variables.
     42 #
     43 # 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
     44 # /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
     45 #
     46 # 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) assemblers).
     47 #
     48 
     49 use strict;
     50 my ($code, $ctx, $inp, $num, $xi, $t0, $t1, $i, @V, $A, $B, $C, $D, $E, $T);
     51 my $output = shift;
     52 open STDOUT,">$output";
     53 
     54 
     55 #
     56 # void sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks);
     57 #
     58 
     59 # Arguments:
     60 $ctx="%rdi";	# 1st arg
     61 $inp="%rsi";	# 2nd arg
     62 $num="%rdx";	# 3rd arg
     63 
     64 # reassign arguments in order to produce more compact code
     65 $ctx="%r8";
     66 $inp="%r9";
     67 $num="%r10";
     68 
     69 # Temporaries:
     70 $xi="%eax";
     71 $t0="%ebx";
     72 $t1="%ecx";
     73 # State information from SHA-1 context:
     74 $A="%edx";
     75 $B="%esi";
     76 $C="%edi";
     77 $D="%ebp";
     78 $E="%r11d";
     79 # Temporary:
     80 $T="%r12d";
     81 
     82 @V=($A,$B,$C,$D,$E,$T);
     83 
     84 sub PROLOGUE {
     85 my $func=shift;
     86 $code.=<<___;
     87 ENTRY_NP($func)
     88 	push	%rbx
     89 	push	%rbp
     90 	push	%r12
     91 	mov	%rsp,%rax
     92 	mov	%rdi,$ctx	# reassigned argument
     93 	sub	\$`8+16*4`,%rsp
     94 	mov	%rsi,$inp	# reassigned argument
     95 	and	\$-64,%rsp
     96 	mov	%rdx,$num	# reassigned argument
     97 	mov	%rax,`16*4`(%rsp)
     98 
     99 	mov	0($ctx),$A
    100 	mov	4($ctx),$B
    101 	mov	8($ctx),$C
    102 	mov	12($ctx),$D
    103 	mov	16($ctx),$E
    104 ___
    105 }
    106 
    107 sub EPILOGUE {
    108 my $func=shift;
    109 $code.=<<___;
    110 	mov	`16*4`(%rsp),%rsp
    111 	pop	%r12
    112 	pop	%rbp
    113 	pop	%rbx
    114 	ret
    115 SET_SIZE($func)
    116 ___
    117 }
    118 
    119 sub BODY_00_19 {
    120 my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
    121 my $j=$i+1;
    122 $code.=<<___ if ($i==0);
    123 	mov	`4*$i`($inp),$xi	
    124 	`"bswap	$xi"	if(!defined($host))`
    125 	mov	$xi,`4*$i`(%rsp)
    126 ___
    127 $code.=<<___ if ($i<15);
    128 	lea	0x5a827999($xi,$e),$f
    129 	mov	$c,$t0
    130 	mov	`4*$j`($inp),$xi
    131 	mov	$a,$e
    132 	xor	$d,$t0
    133 	`"bswap	$xi"	if(!defined($host))`	
    134 	rol	\$5,$e
    135 	and	$b,$t0
    136 	mov	$xi,`4*$j`(%rsp)
    137 	add	$e,$f
    138 	xor	$d,$t0
    139 	rol	\$30,$b
    140 	add	$t0,$f
    141 ___
    142 $code.=<<___ if ($i>=15);
    143 	lea	0x5a827999($xi,$e),$f
    144 	mov	`4*($j%16)`(%rsp),$xi
    145 	mov	$c,$t0
    146 	mov	$a,$e
    147 	xor	`4*(($j+2)%16)`(%rsp),$xi
    148 	xor	$d,$t0
    149 	rol	\$5,$e
    150 	xor	`4*(($j+8)%16)`(%rsp),$xi
    151 	and	$b,$t0
    152 	add	$e,$f
    153 	xor	`4*(($j+13)%16)`(%rsp),$xi
    154 	xor	$d,$t0
    155 	rol	\$30,$b
    156 	add	$t0,$f
    157 	rol	\$1,$xi
    158 	mov	$xi,`4*($j%16)`(%rsp)
    159 ___
    160 }
    161 
    162 sub BODY_20_39 {
    163 my ($i,$a,$b,$c,$d,$e,$f)=@_;
    164 my $j=$i+1;
    165 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
    166 $code.=<<___ if ($i<79);
    167 	lea	$K($xi,$e),$f
    168 	mov	`4*($j%16)`(%rsp),$xi
    169 	mov	$c,$t0
    170 	mov	$a,$e
    171 	xor	`4*(($j+2)%16)`(%rsp),$xi
    172 	xor	$b,$t0
    173 	rol	\$5,$e
    174 	xor	`4*(($j+8)%16)`(%rsp),$xi
    175 	xor	$d,$t0
    176 	add	$e,$f
    177 	xor	`4*(($j+13)%16)`(%rsp),$xi
    178 	rol	\$30,$b
    179 	add	$t0,$f
    180 	rol	\$1,$xi
    181 ___
    182 $code.=<<___ if ($i<76);
    183 	mov	$xi,`4*($j%16)`(%rsp)
    184 ___
    185 $code.=<<___ if ($i==79);
    186 	lea	$K($xi,$e),$f
    187 	mov	$c,$t0
    188 	mov	$a,$e
    189 	xor	$b,$t0
    190 	rol	\$5,$e
    191 	xor	$d,$t0
    192 	add	$e,$f
    193 	rol	\$30,$b
    194 	add	$t0,$f
    195 ___
    196 }
    197 
    198 sub BODY_40_59 {
    199 my ($i,$a,$b,$c,$d,$e,$f)=@_;
    200 my $j=$i+1;
    201 $code.=<<___;
    202 	lea	0x8f1bbcdc($xi,$e),$f
    203 	mov	`4*($j%16)`(%rsp),$xi
    204 	mov	$b,$t0
    205 	mov	$b,$t1
    206 	xor	`4*(($j+2)%16)`(%rsp),$xi
    207 	mov	$a,$e
    208 	and	$c,$t0
    209 	xor	`4*(($j+8)%16)`(%rsp),$xi
    210 	or	$c,$t1
    211 	rol	\$5,$e
    212 	xor	`4*(($j+13)%16)`(%rsp),$xi
    213 	and	$d,$t1
    214 	add	$e,$f
    215 	rol	\$1,$xi
    216 	or	$t1,$t0
    217 	rol	\$30,$b
    218 	mov	$xi,`4*($j%16)`(%rsp)
    219 	add	$t0,$f
    220 ___
    221 }
    222 
    223 
    224 #
    225 # Execution begins here
    226 #
    227 
    228 $code=<<___;
    229 #if defined(lint) || defined(__lint)
    230 #include <sys/stdint.h>
    231 #include <sys/sha1.h>
    232 
    233 /* ARGSUSED */
    234 void
    235 sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks)
    236 {
    237 }
    238 
    239 #else
    240 #include <sys/asm_linkage.h>
    241 ___
    242 
    243 
    244 &PROLOGUE("sha1_block_data_order");
    245 $code.=".align	4\n.Lloop:\n";
    246 for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
    247 for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    248 for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    249 for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    250 $code.=<<___;
    251 	/ Update and save state information in SHA-1 context
    252 	add	0($ctx),$E
    253 	add	4($ctx),$T
    254 	add	8($ctx),$A
    255 	add	12($ctx),$B
    256 	add	16($ctx),$C
    257 	mov	$E,0($ctx)
    258 	mov	$T,4($ctx)
    259 	mov	$A,8($ctx)
    260 	mov	$B,12($ctx)
    261 	mov	$C,16($ctx)
    262 
    263 	xchg	$E,$A	# mov	$E,$A
    264 	xchg	$T,$B	# mov	$T,$B
    265 	xchg	$E,$C	# mov	$A,$C
    266 	xchg	$T,$D	# mov	$B,$D
    267 			# mov	$C,$E
    268 	lea	`16*4`($inp),$inp
    269 	sub	\$1,$num
    270 	jnz	.Lloop
    271 ___
    272 &EPILOGUE("sha1_block_data_order");
    273 $code.=<<___;
    274 .asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    275 
    276 #endif /* lint || __lint */
    277 ___
    278 
    279 ####################################################################
    280 
    281 $code =~ s/\`([^\`]*)\`/eval $1/gem;
    282 print $code;
    283 close STDOUT;
    284