about | summary | refs | log | tree | commit | diff
path: root/crypto/bn/asm/x86_64-mont.pl
diff options
context:
space:
mode:
author Andy Polyakov <appro@openssl.org> 2007-06-17 17:10:03 +0000
committer Andy Polyakov <appro@openssl.org> 2007-06-17 17:10:03 +0000
commit 7d9cf7c0bbc17a2c00339e660c83ebf1a4f9061a (patch)
tree 004fe317e4795d576c92746d7e954c2db6a5d3af /crypto/bn/asm/x86_64-mont.pl
parent 55525742f4c2bf416013fc3a75ec642775d97f80 (diff)
download openssl-7d9cf7c0bbc17a2c00339e660c83ebf1a4f9061a.zip
openssl-7d9cf7c0bbc17a2c00339e660c83ebf1a4f9061a.tar.gz
openssl-7d9cf7c0bbc17a2c00339e660c83ebf1a4f9061a.tar.bz2
Eliminate conditional final subtraction in Montgomery assembler modules.
Diffstat (limited to 'crypto/bn/asm/x86_64-mont.pl')
-rwxr-xr-x crypto/bn/asm/x86_64-mont.pl 55
1 files changed, 28 insertions, 27 deletions
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index bc3fa83..6701bf2 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -59,6 +59,7 @@ bn_mul_mont:
neg %rax
lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2))
and \$-1024,%rsp # minimize TLB usage
+
mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp
mov %rdx,$bp # $bp reassigned, remember?
@@ -166,22 +167,38 @@ bn_mul_mont:
cmp $num,$i
jl .Louter
- xor $i,$i # i=0
+ mov -8($np,$num,8),%rax # np[num-1]
+ lea (%rsp),$ap # borrow ap for tp
+ shr \$62,%rax # check for boundary condition
+ jz .Lcopy
+
+ mov ($ap),%rax # tp[0]
lea -1($num),$j # j=num-1
- cmp \$0,%rdx # %rdx still holds upmost overflow bit
- jnz .Lsub # CF is cleared by compare with 0
- mov (%rsp,$j,8),%rax
- cmp ($np,$j,8),%rax # tp[num-1]-np[num-1]
- jae .Lsub # if taken CF was cleared by above cmp
-.align 4
-.Lcopy:
- mov (%rsp,$j,8),%rax
+ xor $i,$i # i=0 and clear CF!
+ jmp .Lsub
+.align 16
+.Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ dec $j # doesn't affect CF!
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++
+ jge .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ and %rax,$ap
+ not %rax
+ mov $rp,$np
+ and %rax,$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+.align 16
+.Lcopy: # copy or in-place refresh
+ mov ($ap,$j,8),%rax
mov %rax,($rp,$j,8) # rp[i]=tp[i]
mov $i,(%rsp,$j,8) # zap temporary vector
dec $j
jge .Lcopy
-.align 4
-.Lexit:
+
mov 8(%rsp,$num,8),%rsp # restore %rsp
mov \$1,%rax
pop %r15
@@ -191,22 +208,6 @@ bn_mul_mont:
pop %rbp
pop %rbx
ret
-
-.align 16
-.Lsub: mov (%rsp,$i,8),%rax
- sbb ($np,$i,8),%rax
- mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[j]
- lea 1($i),$i # i++
- dec $j # doesn't affect CF!
- jge .Lsub
- lea -1($num),$j # j=num-1
- sbb \$0,%rdx
- jc .Lcopy # tp was less than np
-.align 4
-.Lzap: mov $i,(%rsp,$j,8) # zap temporary vector
- dec $j
- jge .Lzap
- jmp .Lexit
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___