aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrian Smith <brian@briansmith.org>2016-12-22 16:23:46 -1000
committerAdam Langley <alangley@gmail.com>2017-01-13 01:19:01 +0000
commit18a37a4211c6cb81b2e28fa619b713e5b8a9a56c (patch)
tree33e0b017bbb1586a520e02f8f108183e317e4d12
parentac153bded307d773b833888fbed968777bcd8962 (diff)
downloadboringssl-18a37a4211c6cb81b2e28fa619b713e5b8a9a56c.zip
boringssl-18a37a4211c6cb81b2e28fa619b713e5b8a9a56c.tar.gz
boringssl-18a37a4211c6cb81b2e28fa619b713e5b8a9a56c.tar.bz2
Remove unused "pure" MMX x86 GCM implementation.
BoringSSL will always use the SSE version so this is all dead code. Change-Id: I0f3b51ee29144b5c83d2553c92bebae901b6366f Reviewed-on: https://boringssl-review.googlesource.com/13023 Reviewed-by: Adam Langley <alangley@gmail.com>
-rw-r--r--crypto/modes/asm/ghash-x86.pl151
1 files changed, 3 insertions, 148 deletions
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl
index f174f7f..182c29a 100644
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@@ -265,155 +265,10 @@ if (!$x86only) {{{
if (!$sse2) {{ # pure-MMX "May" version...
-$S=12; # shift factor for rem_4bit
-
-&function_begin_B("_mmx_gmult_4bit_inner");
-# MMX version performs 3.5 times better on P4 (see comment in non-MMX
-# routine for further details), 100% better on Opteron, ~70% better
-# on Core2 and PIII... In other words effort is considered to be well
-# spent... Since initial release the loop was unrolled in order to
-# "liberate" register previously used as loop counter. Instead it's
-# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
-# The path involves move of Z.lo from MMX to integer register,
-# effective address calculation and finally merge of value to Z.hi.
-# Reference to rem_4bit is scheduled so late that I had to >>4
-# rem_4bit elements. This resulted in 20-45% procent improvement
-# on contemporary ยต-archs.
-{
- my $cnt;
- my $rem_4bit = "eax";
- my @rem = ($Zhh,$Zll);
- my $nhi = $Zhl;
- my $nlo = $Zlh;
-
- my ($Zlo,$Zhi) = ("mm0","mm1");
- my $tmp = "mm2";
-
- &xor ($nlo,$nlo); # avoid partial register stalls on PIII
- &mov ($nhi,$Zll);
- &mov (&LB($nlo),&LB($nhi));
- &shl (&LB($nlo),4);
- &and ($nhi,0xf0);
- &movq ($Zlo,&QWP(8,$Htbl,$nlo));
- &movq ($Zhi,&QWP(0,$Htbl,$nlo));
- &movd ($rem[0],$Zlo);
-
- for ($cnt=28;$cnt>=-2;$cnt--) {
- my $odd = $cnt&1;
- my $nix = $odd ? $nlo : $nhi;
-
- &shl (&LB($nlo),4) if ($odd);
- &psrlq ($Zlo,4);
- &movq ($tmp,$Zhi);
- &psrlq ($Zhi,4);
- &pxor ($Zlo,&QWP(8,$Htbl,$nix));
- &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
- &psllq ($tmp,60);
- &and ($nhi,0xf0) if ($odd);
- &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
- &and ($rem[0],0xf);
- &pxor ($Zhi,&QWP(0,$Htbl,$nix));
- &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
- &movd ($rem[1],$Zlo);
- &pxor ($Zlo,$tmp);
-
- push (@rem,shift(@rem)); # "rotate" registers
- }
-
- &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
-
- &psrlq ($Zlo,32); # lower part of Zlo is already there
- &movd ($Zhl,$Zhi);
- &psrlq ($Zhi,32);
- &movd ($Zlh,$Zlo);
- &movd ($Zhh,$Zhi);
- &shl ($inp,4); # compensate for rem_4bit[i] being >>4
-
- &bswap ($Zll);
- &bswap ($Zhl);
- &bswap ($Zlh);
- &xor ($Zhh,$inp);
- &bswap ($Zhh);
-
- &ret ();
-}
-&function_end_B("_mmx_gmult_4bit_inner");
-
-&function_begin("gcm_gmult_4bit_mmx");
- &mov ($inp,&wparam(0)); # load Xi
- &mov ($Htbl,&wparam(1)); # load Htable
+ # This code was removed since SSE2 is required for BoringSSL. The
+ # outer structure of the code was retained to minimize future merge
+ # conflicts.
- &call (&label("pic_point"));
- &set_label("pic_point");
- &blindpop("eax");
- &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
-
- &movz ($Zll,&BP(15,$inp));
-
- &call ("_mmx_gmult_4bit_inner");
-
- &mov ($inp,&wparam(0)); # load Xi
- &emms ();
- &mov (&DWP(12,$inp),$Zll);
- &mov (&DWP(4,$inp),$Zhl);
- &mov (&DWP(8,$inp),$Zlh);
- &mov (&DWP(0,$inp),$Zhh);
-&function_end("gcm_gmult_4bit_mmx");
-
-# Streamed version performs 20% better on P4, 7% on Opteron,
-# 10% on Core2 and PIII...
-&function_begin("gcm_ghash_4bit_mmx");
- &mov ($Zhh,&wparam(0)); # load Xi
- &mov ($Htbl,&wparam(1)); # load Htable
- &mov ($inp,&wparam(2)); # load in
- &mov ($Zlh,&wparam(3)); # load len
-
- &call (&label("pic_point"));
- &set_label("pic_point");
- &blindpop("eax");
- &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
-
- &add ($Zlh,$inp);
- &mov (&wparam(3),$Zlh); # len to point at the end of input
- &stack_push(4+1); # +1 for stack alignment
-
- &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
- &mov ($Zhl,&DWP(4,$Zhh));
- &mov ($Zlh,&DWP(8,$Zhh));
- &mov ($Zhh,&DWP(0,$Zhh));
- &jmp (&label("mmx_outer_loop"));
-
- &set_label("mmx_outer_loop",16);
- &xor ($Zll,&DWP(12,$inp));
- &xor ($Zhl,&DWP(4,$inp));
- &xor ($Zlh,&DWP(8,$inp));
- &xor ($Zhh,&DWP(0,$inp));
- &mov (&wparam(2),$inp);
- &mov (&DWP(12,"esp"),$Zll);
- &mov (&DWP(4,"esp"),$Zhl);
- &mov (&DWP(8,"esp"),$Zlh);
- &mov (&DWP(0,"esp"),$Zhh);
-
- &mov ($inp,"esp");
- &shr ($Zll,24);
-
- &call ("_mmx_gmult_4bit_inner");
-
- &mov ($inp,&wparam(2));
- &lea ($inp,&DWP(16,$inp));
- &cmp ($inp,&wparam(3));
- &jb (&label("mmx_outer_loop"));
-
- &mov ($inp,&wparam(0)); # load Xi
- &emms ();
- &mov (&DWP(12,$inp),$Zll);
- &mov (&DWP(4,$inp),$Zhl);
- &mov (&DWP(8,$inp),$Zlh);
- &mov (&DWP(0,$inp),$Zhh);
-
- &stack_pop(4+1);
-&function_end("gcm_ghash_4bit_mmx");
-
}} else {{ # "June" MMX version...
# ... has slower "April" gcm_gmult_4bit_mmx with folded
# loop. This is done to conserve code size...