diff options
Diffstat (limited to 'crypto/bn/asm')
-rw-r--r-- | crypto/bn/asm/armv4-gf2m.pl | 10 | ||||
-rw-r--r-- | crypto/bn/asm/c64xplus-gf2m.pl | 12 | ||||
-rw-r--r-- | crypto/bn/asm/ia64.S | 2 | ||||
-rw-r--r-- | crypto/bn/asm/s390x-gf2m.pl | 6 | ||||
-rw-r--r-- | crypto/bn/asm/x86-gf2m.pl | 16 | ||||
-rw-r--r-- | crypto/bn/asm/x86_64-gcc.c | 2 | ||||
-rw-r--r-- | crypto/bn/asm/x86_64-gf2m.pl | 16 |
7 files changed, 32 insertions, 32 deletions
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl index f05461a..a0b018c 100644 --- a/crypto/bn/asm/armv4-gf2m.pl +++ b/crypto/bn/asm/armv4-gf2m.pl @@ -27,7 +27,7 @@ # referred below, which improves ECDH and ECDSA verify benchmarks # by 18-40%. # -# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software # Polynomial Multiplication on ARM Processors using the NEON Engine. # # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf @@ -148,7 +148,7 @@ ___ ################ # void bn_GF2m_mul_2x2(BN_ULONG *r, # BN_ULONG a1,BN_ULONG a0, -# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 +# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 { $code.=<<___; .global bn_GF2m_mul_2x2 @@ -171,7 +171,7 @@ $code.=<<___; mov $mask,#7<<2 sub sp,sp,#32 @ allocate tab[8] - bl mul_1x1_ialu @ a1·b1 + bl mul_1x1_ialu @ a1·b1 str $lo,[$ret,#8] str $hi,[$ret,#12] @@ -181,13 +181,13 @@ $code.=<<___; eor r2,r2,$a eor $b,$b,r3 eor $a,$a,r2 - bl mul_1x1_ialu @ a0·b0 + bl mul_1x1_ialu @ a0·b0 str $lo,[$ret] str $hi,[$ret,#4] eor $a,$a,r2 eor $b,$b,r3 - bl mul_1x1_ialu @ (a1+a0)·(b1+b0) + bl mul_1x1_ialu @ (a1+a0)·(b1+b0) ___ @r=map("r$_",(6..9)); $code.=<<___; diff --git a/crypto/bn/asm/c64xplus-gf2m.pl b/crypto/bn/asm/c64xplus-gf2m.pl index e4aa4e4..c79f46f 100644 --- a/crypto/bn/asm/c64xplus-gf2m.pl +++ b/crypto/bn/asm/c64xplus-gf2m.pl @@ -120,26 +120,26 @@ _bn_GF2m_mul_2x2: .asmfunc MVK 0xFF,$xFF ___ - &mul_1x1_upper($a0,$b0); # a0·b0 + &mul_1x1_upper($a0,$b0); # a0·b0 $code.=<<___; || MV $b1,$B MV $a1,$A ___ - &mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1 + &mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1 $code.=<<___; || XOR $b0,$b1,$B XOR $a0,$a1,$A ___ - &mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1) + &mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1) $code.=<<___; XOR A28,A31,A29 -|| XOR B28,B31,B29 ; a0·b0+a1·b1 +|| XOR B28,B31,B29 ; a0·b0+a1·b1 ___ - &mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1) + &mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1) $code.=<<___; || BNOP B3 XOR A29,A30,A30 -|| XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1 +|| XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1 XOR B28,A30,A30 || STW A28,*${rp}[0] XOR B30,A31,A31 diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S index 951abc5..c0cee82 100644 --- a/crypto/bn/asm/ia64.S +++ b/crypto/bn/asm/ia64.S @@ -568,7 +568,7 @@ bn_sqr_comba8: // I've estimated this routine to run in ~120 ticks, but in reality // (i.e. according to ar.itc) it takes ~160 ticks. Are those extra // cycles consumed for instructions fetch? Or did I misinterpret some -// clause in Itanium µ-architecture manual? Comments are welcomed and +// clause in Itanium µ-architecture manual? Comments are welcomed and // highly appreciated. // // On Itanium 2 it takes ~190 ticks. This is because of stalls on diff --git a/crypto/bn/asm/s390x-gf2m.pl b/crypto/bn/asm/s390x-gf2m.pl index cd9f13e..9d18d40 100644 --- a/crypto/bn/asm/s390x-gf2m.pl +++ b/crypto/bn/asm/s390x-gf2m.pl @@ -172,19 +172,19 @@ ___ if ($SIZE_T==8) { my @r=map("%r$_",(6..9)); $code.=<<___; - bras $ra,_mul_1x1 # a1·b1 + bras $ra,_mul_1x1 # a1·b1 stmg $lo,$hi,16($rp) lg $a,`$stdframe+128+4*$SIZE_T`($sp) lg $b,`$stdframe+128+6*$SIZE_T`($sp) - bras $ra,_mul_1x1 # a0·b0 + bras $ra,_mul_1x1 # a0·b0 stmg $lo,$hi,0($rp) lg $a,`$stdframe+128+3*$SIZE_T`($sp) lg $b,`$stdframe+128+5*$SIZE_T`($sp) xg $a,`$stdframe+128+4*$SIZE_T`($sp) xg $b,`$stdframe+128+6*$SIZE_T`($sp) - bras $ra,_mul_1x1 # (a0+a1)·(b0+b1) + bras $ra,_mul_1x1 # (a0+a1)·(b0+b1) lmg @r[0],@r[3],0($rp) xgr $lo,$hi diff --git a/crypto/bn/asm/x86-gf2m.pl b/crypto/bn/asm/x86-gf2m.pl index 808a1e5..b579530 100644 --- a/crypto/bn/asm/x86-gf2m.pl +++ b/crypto/bn/asm/x86-gf2m.pl @@ -14,7 +14,7 @@ # the time being... Except that it has three code paths: pure integer # code suitable for any x86 CPU, MMX code suitable for PIII and later # and PCLMULQDQ suitable for Westmere and later. Improvement varies -# from one benchmark and µ-arch to another. Below are interval values +# from one benchmark and µ-arch to another. Below are interval values # for 163- and 571-bit ECDH benchmarks relative to compiler-generated # code: # @@ -226,22 +226,22 @@ if ($sse2) { &push ("edi"); &mov ($a,&wparam(1)); &mov ($b,&wparam(3)); - &call ("_mul_1x1_mmx"); # a1·b1 + &call ("_mul_1x1_mmx"); # a1·b1 &movq ("mm7",$R); &mov ($a,&wparam(2)); &mov ($b,&wparam(4)); - &call ("_mul_1x1_mmx"); # a0·b0 + &call ("_mul_1x1_mmx"); # a0·b0 &movq ("mm6",$R); &mov ($a,&wparam(1)); &mov ($b,&wparam(3)); &xor ($a,&wparam(2)); &xor ($b,&wparam(4)); - &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) + &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) &pxor ($R,"mm7"); &mov ($a,&wparam(0)); - &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 + &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 &movq ($A,$R); &psllq ($R,32); @@ -266,13 +266,13 @@ if ($sse2) { &mov ($a,&wparam(1)); &mov ($b,&wparam(3)); - &call ("_mul_1x1_ialu"); # a1·b1 + &call ("_mul_1x1_ialu"); # a1·b1 &mov (&DWP(8,"esp"),$lo); &mov (&DWP(12,"esp"),$hi); &mov ($a,&wparam(2)); &mov ($b,&wparam(4)); - &call ("_mul_1x1_ialu"); # a0·b0 + &call ("_mul_1x1_ialu"); # a0·b0 &mov (&DWP(0,"esp"),$lo); &mov (&DWP(4,"esp"),$hi); @@ -280,7 +280,7 @@ if ($sse2) { &mov ($b,&wparam(3)); &xor ($a,&wparam(2)); &xor ($b,&wparam(4)); - &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) + &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) &mov ("ebp",&wparam(0)); @r=("ebx","ecx","edi","esi"); diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c index d548886..d77dc43 100644 --- a/crypto/bn/asm/x86_64-gcc.c +++ b/crypto/bn/asm/x86_64-gcc.c @@ -65,7 +65,7 @@ # undef mul_add /*- - * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; + * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; * "g"(0) let the compiler to decide where does it * want to keep the value of zero; */ diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl index 226c66c..42bbec2 100644 --- a/crypto/bn/asm/x86_64-gf2m.pl +++ b/crypto/bn/asm/x86_64-gf2m.pl @@ -13,7 +13,7 @@ # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for # the time being... Except that it has two code paths: code suitable # for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and -# later. Improvement varies from one benchmark and µ-arch to another. +# later. Improvement varies from one benchmark and µ-arch to another. # Vanilla code path is at most 20% faster than compiler-generated code # [not very impressive], while PCLMULQDQ - whole 85%-160% better on # 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that @@ -184,13 +184,13 @@ ___ $code.=<<___; movdqa %xmm0,%xmm4 movdqa %xmm1,%xmm5 - pclmulqdq \$0,%xmm1,%xmm0 # a1·b1 + pclmulqdq \$0,%xmm1,%xmm0 # a1·b1 pxor %xmm2,%xmm4 pxor %xmm3,%xmm5 - pclmulqdq \$0,%xmm3,%xmm2 # a0·b0 - pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1) + pclmulqdq \$0,%xmm3,%xmm2 # a0·b0 + pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1) xorps %xmm0,%xmm4 - xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1 + xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1 movdqa %xmm4,%xmm5 pslldq \$8,%xmm4 psrldq \$8,%xmm5 @@ -225,13 +225,13 @@ $code.=<<___; mov \$0xf,$mask mov $a1,$a mov $b1,$b - call _mul_1x1 # a1·b1 + call _mul_1x1 # a1·b1 mov $lo,16(%rsp) mov $hi,24(%rsp) mov 48(%rsp),$a mov 64(%rsp),$b - call _mul_1x1 # a0·b0 + call _mul_1x1 # a0·b0 mov $lo,0(%rsp) mov $hi,8(%rsp) @@ -239,7 +239,7 @@ $code.=<<___; mov 56(%rsp),$b xor 48(%rsp),$a xor 64(%rsp),$b - call _mul_1x1 # (a0+a1)·(b0+b1) + call _mul_1x1 # (a0+a1)·(b0+b1) ___ @r=("%rbx","%rcx","%rdi","%rsi"); $code.=<<___; |