diff options
author | Brian Smith <brian@briansmith.org> | 2016-12-22 16:23:46 -1000 |
---|---|---|
committer | Adam Langley <alangley@gmail.com> | 2017-01-13 01:19:01 +0000 |
commit | 18a37a4211c6cb81b2e28fa619b713e5b8a9a56c (patch) | |
tree | 33e0b017bbb1586a520e02f8f108183e317e4d12 | |
parent | ac153bded307d773b833888fbed968777bcd8962 (diff) | |
download | boringssl-18a37a4211c6cb81b2e28fa619b713e5b8a9a56c.zip boringssl-18a37a4211c6cb81b2e28fa619b713e5b8a9a56c.tar.gz boringssl-18a37a4211c6cb81b2e28fa619b713e5b8a9a56c.tar.bz2 |
Remove unused "pure" MMX x86 GCM implementation.
BoringSSL will always use the SSE version so this is all dead code.
Change-Id: I0f3b51ee29144b5c83d2553c92bebae901b6366f
Reviewed-on: https://boringssl-review.googlesource.com/13023
Reviewed-by: Adam Langley <alangley@gmail.com>
-rw-r--r-- | crypto/modes/asm/ghash-x86.pl | 151 |
1 files changed, 3 insertions, 148 deletions
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl index f174f7f..182c29a 100644 --- a/crypto/modes/asm/ghash-x86.pl +++ b/crypto/modes/asm/ghash-x86.pl @@ -265,155 +265,10 @@ if (!$x86only) {{{ if (!$sse2) {{ # pure-MMX "May" version... -$S=12; # shift factor for rem_4bit - -&function_begin_B("_mmx_gmult_4bit_inner"); -# MMX version performs 3.5 times better on P4 (see comment in non-MMX -# routine for further details), 100% better on Opteron, ~70% better -# on Core2 and PIII... In other words effort is considered to be well -# spent... Since initial release the loop was unrolled in order to -# "liberate" register previously used as loop counter. Instead it's -# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'. -# The path involves move of Z.lo from MMX to integer register, -# effective address calculation and finally merge of value to Z.hi. -# Reference to rem_4bit is scheduled so late that I had to >>4 -# rem_4bit elements. This resulted in 20-45% procent improvement -# on contemporary µ-archs. -{ - my $cnt; - my $rem_4bit = "eax"; - my @rem = ($Zhh,$Zll); - my $nhi = $Zhl; - my $nlo = $Zlh; - - my ($Zlo,$Zhi) = ("mm0","mm1"); - my $tmp = "mm2"; - - &xor ($nlo,$nlo); # avoid partial register stalls on PIII - &mov ($nhi,$Zll); - &mov (&LB($nlo),&LB($nhi)); - &shl (&LB($nlo),4); - &and ($nhi,0xf0); - &movq ($Zlo,&QWP(8,$Htbl,$nlo)); - &movq ($Zhi,&QWP(0,$Htbl,$nlo)); - &movd ($rem[0],$Zlo); - - for ($cnt=28;$cnt>=-2;$cnt--) { - my $odd = $cnt&1; - my $nix = $odd ? 
$nlo : $nhi; - - &shl (&LB($nlo),4) if ($odd); - &psrlq ($Zlo,4); - &movq ($tmp,$Zhi); - &psrlq ($Zhi,4); - &pxor ($Zlo,&QWP(8,$Htbl,$nix)); - &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0); - &psllq ($tmp,60); - &and ($nhi,0xf0) if ($odd); - &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28); - &and ($rem[0],0xf); - &pxor ($Zhi,&QWP(0,$Htbl,$nix)); - &mov ($nhi,$nlo) if (!$odd && $cnt>=0); - &movd ($rem[1],$Zlo); - &pxor ($Zlo,$tmp); - - push (@rem,shift(@rem)); # "rotate" registers - } - - &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem] - - &psrlq ($Zlo,32); # lower part of Zlo is already there - &movd ($Zhl,$Zhi); - &psrlq ($Zhi,32); - &movd ($Zlh,$Zlo); - &movd ($Zhh,$Zhi); - &shl ($inp,4); # compensate for rem_4bit[i] being >>4 - - &bswap ($Zll); - &bswap ($Zhl); - &bswap ($Zlh); - &xor ($Zhh,$inp); - &bswap ($Zhh); - - &ret (); -} -&function_end_B("_mmx_gmult_4bit_inner"); - -&function_begin("gcm_gmult_4bit_mmx"); - &mov ($inp,&wparam(0)); # load Xi - &mov ($Htbl,&wparam(1)); # load Htable + # This code was removed since SSE2 is required for BoringSSL. The + # outer structure of the code was retained to minimize future merge + # conflicts. - &call (&label("pic_point")); - &set_label("pic_point"); - &blindpop("eax"); - &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); - - &movz ($Zll,&BP(15,$inp)); - - &call ("_mmx_gmult_4bit_inner"); - - &mov ($inp,&wparam(0)); # load Xi - &emms (); - &mov (&DWP(12,$inp),$Zll); - &mov (&DWP(4,$inp),$Zhl); - &mov (&DWP(8,$inp),$Zlh); - &mov (&DWP(0,$inp),$Zhh); -&function_end("gcm_gmult_4bit_mmx"); - -# Streamed version performs 20% better on P4, 7% on Opteron, -# 10% on Core2 and PIII... 
-&function_begin("gcm_ghash_4bit_mmx"); - &mov ($Zhh,&wparam(0)); # load Xi - &mov ($Htbl,&wparam(1)); # load Htable - &mov ($inp,&wparam(2)); # load in - &mov ($Zlh,&wparam(3)); # load len - - &call (&label("pic_point")); - &set_label("pic_point"); - &blindpop("eax"); - &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); - - &add ($Zlh,$inp); - &mov (&wparam(3),$Zlh); # len to point at the end of input - &stack_push(4+1); # +1 for stack alignment - - &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] - &mov ($Zhl,&DWP(4,$Zhh)); - &mov ($Zlh,&DWP(8,$Zhh)); - &mov ($Zhh,&DWP(0,$Zhh)); - &jmp (&label("mmx_outer_loop")); - - &set_label("mmx_outer_loop",16); - &xor ($Zll,&DWP(12,$inp)); - &xor ($Zhl,&DWP(4,$inp)); - &xor ($Zlh,&DWP(8,$inp)); - &xor ($Zhh,&DWP(0,$inp)); - &mov (&wparam(2),$inp); - &mov (&DWP(12,"esp"),$Zll); - &mov (&DWP(4,"esp"),$Zhl); - &mov (&DWP(8,"esp"),$Zlh); - &mov (&DWP(0,"esp"),$Zhh); - - &mov ($inp,"esp"); - &shr ($Zll,24); - - &call ("_mmx_gmult_4bit_inner"); - - &mov ($inp,&wparam(2)); - &lea ($inp,&DWP(16,$inp)); - &cmp ($inp,&wparam(3)); - &jb (&label("mmx_outer_loop")); - - &mov ($inp,&wparam(0)); # load Xi - &emms (); - &mov (&DWP(12,$inp),$Zll); - &mov (&DWP(4,$inp),$Zhl); - &mov (&DWP(8,$inp),$Zlh); - &mov (&DWP(0,$inp),$Zhh); - - &stack_pop(4+1); -&function_end("gcm_ghash_4bit_mmx"); - }} else {{ # "June" MMX version... # ... has slower "April" gcm_gmult_4bit_mmx with folded # loop. This is done to conserve code size... |