aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/alpha
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/alpha')
-rw-r--r-- sysdeps/alpha/addmul_1.s 23
-rw-r--r-- sysdeps/alpha/alphaev5/add_n.s 175
-rw-r--r-- sysdeps/alpha/alphaev5/lshift.s 30
-rw-r--r-- sysdeps/alpha/alphaev5/rshift.s 28
-rw-r--r-- sysdeps/alpha/alphaev5/sub_n.s 148
-rw-r--r-- sysdeps/alpha/lshift.s 14
-rw-r--r-- sysdeps/alpha/mul_1.s 2
-rw-r--r-- sysdeps/alpha/rshift.s 16
-rw-r--r-- sysdeps/alpha/submul_1.s 23
-rw-r--r-- sysdeps/alpha/udiv_qrnnd.S 34
10 files changed, 323 insertions, 170 deletions
diff --git a/sysdeps/alpha/addmul_1.s b/sysdeps/alpha/addmul_1.s
index 46d277d..8b168cb 100644
--- a/sysdeps/alpha/addmul_1.s
+++ b/sysdeps/alpha/addmul_1.s
@@ -26,16 +26,7 @@
# size r18
# s2_limb r19
- # This code runs at 42 cycles/limb on the 21064.
-
- # To improve performance for long multiplications, we would use
- # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
- # these instructions without slowing down the general code: 1. We can
- # only have two prefetches in operation at any time in the Alpha
- # architecture. 2. There will seldom be any special alignment
- # between RES_PTR and S1_PTR. Maybe we can simply divide the current
- # loop into an inner and outer loop, having the inner loop handle
- # exactly one prefetch block?
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
.set noreorder
.set noat
@@ -52,7 +43,7 @@ __mpn_addmul_1:
mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
umulh $2,$19,$0 # $0 = prod_high
- beq $18,Lend1 # jump if size was == 1
+ beq $18,.Lend1 # jump if size was == 1
ldq $2,0($17) # $2 = s1_limb
addq $17,8,$17 # s1_ptr++
subq $18,1,$18 # size--
@@ -60,10 +51,10 @@ __mpn_addmul_1:
cmpult $3,$5,$4
stq $3,0($16)
addq $16,8,$16 # res_ptr++
- beq $18,Lend2 # jump if size was == 2
+ beq $18,.Lend2 # jump if size was == 2
.align 3
-Loop: mulq $2,$19,$3 # $3 = prod_low
+.Loop: mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
subq $18,1,$18 # size--
@@ -77,9 +68,9 @@ Loop: mulq $2,$19,$3 # $3 = prod_low
stq $3,0($16)
addq $16,8,$16 # res_ptr++
addq $5,$0,$0 # combine carries
- bne $18,Loop
+ bne $18,.Loop
-Lend2: mulq $2,$19,$3 # $3 = prod_low
+.Lend2: mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
umulh $2,$19,$4 # $4 = cy_limb
@@ -91,7 +82,7 @@ Lend2: mulq $2,$19,$3 # $3 = prod_low
addq $5,$0,$0 # combine carries
addq $4,$0,$0 # cy_limb = prod_high + cy
ret $31,($26),1
-Lend1: addq $5,$3,$3
+.Lend1: addq $5,$3,$3
cmpult $3,$5,$5
stq $3,0($16)
addq $0,$5,$0
diff --git a/sysdeps/alpha/alphaev5/add_n.s b/sysdeps/alpha/alphaev5/add_n.s
index 2aaf041..66cf82b 100644
--- a/sysdeps/alpha/alphaev5/add_n.s
+++ b/sysdeps/alpha/alphaev5/add_n.s
@@ -35,84 +35,113 @@
__mpn_add_n:
.frame $30,0,$26,0
- ldq $3,0($17)
- ldq $4,0($18)
-
- subq $19,1,$19
- and $19,4-1,$2 # number of limbs in first loop
- bis $31,$31,$0
- beq $2,.L0 # if multiple of 4 limbs, skip first loop
-
- subq $19,$2,$19
-
-.Loop0: subq $2,1,$2
+ or $31,$31,$25 # clear cy
+ subq $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldq $0,0($18)
+ ldq $1,8($18)
+ ldq $4,0($17)
ldq $5,8($17)
- addq $4,$0,$4
- ldq $6,8($18)
- cmpult $4,$0,$1
- addq $3,$4,$4
- cmpult $4,$3,$0
- stq $4,0($16)
- or $0,$1,$0
-
- addq $17,8,$17
- addq $18,8,$18
- bis $5,$5,$3
- bis $6,$6,$4
- addq $16,8,$16
- bne $2,.Loop0
-
-.L0: beq $19,.Lend
-
+ addq $17,32,$17 # update s1_ptr
+ ldq $2,16($18)
+ addq $0,$4,$20 # 1st main add
+ ldq $3,24($18)
+ subq $19,4,$19 # decr loop cnt
+ ldq $6,-16($17)
+ cmpult $20,$0,$25 # compute cy from last add
+ ldq $7,-8($17)
+ addq $1,$25,$28 # cy add
+ addq $18,32,$18 # update s2_ptr
+ addq $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
.align 4
-.Loop: subq $19,4,$19
- unop
-
- ldq $6,8($18)
- addq $4,$0,$0
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldq $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldq $1,8($18)
+ addq $2,$25,$28 # cy add
+ ldq $4,0($17)
+ addq $28,$6,$22 # 3rd main add
ldq $5,8($17)
- cmpult $0,$4,$1
- ldq $4,16($18)
- addq $3,$0,$20
- cmpult $20,$3,$0
- ldq $3,16($17)
- or $0,$1,$0
- addq $6,$0,$0
- cmpult $0,$6,$1
- ldq $6,24($18)
- addq $5,$0,$21
- cmpult $21,$5,$0
- ldq $5,24($17)
- or $0,$1,$0
- addq $4,$0,$0
- cmpult $0,$4,$1
- ldq $4,32($18)
- addq $3,$0,$22
- cmpult $22,$3,$0
- ldq $3,32($17)
- or $0,$1,$0
- addq $6,$0,$0
- cmpult $0,$6,$1
- addq $5,$0,$23
- cmpult $23,$5,$0
- or $0,$1,$0
-
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
stq $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
stq $21,8($16)
- stq $22,16($16)
- stq $23,24($16)
-
- addq $17,32,$17
- addq $18,32,$18
- addq $16,32,$16
- bne $19,.Loop
+ addq $3,$25,$28 # cy add
+ addq $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ addq $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,32,$16 # update res_ptr
+ addq $0,$25,$28 # cy add
+ ldq $2,16($18)
+ addq $4,$28,$20 # 1st main add
+ ldq $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldq $6,-16($17)
+ cmpult $20,$28,$25 # compute cy from last add
+ ldq $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subq $19,4,$19 # decr loop cnt
+ stq $22,-16($16)
+ addq $1,$25,$28 # cy add
+ stq $23,-8($16)
+ addq $5,$28,$21 # 2nd main add
+ addq $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addq $2,$25,$28 # cy add
+ addq $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stq $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stq $21,8($16)
+ addq $3,$25,$28 # cy add
+ addq $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,32,$16 # update res_ptr
+ stq $22,-16($16)
+ stq $23,-8($16)
+.Lend2: addq $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldq $0,0($18)
+ ldq $4,0($17)
+ subq $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addq $0,$25,$28 # cy add
+ ldq $0,8($18)
+ addq $4,$28,$20 # main add
+ ldq $4,8($17)
+ addq $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addq $17,8,$17
+ stq $20,0($16)
+ cmpult $20,$28,$25 # compute cy from last add
+ subq $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,8,$16
+ bne $19,.Loop0
+.Lend0: addq $0,$25,$28 # cy add
+ addq $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $20,$28,$25 # compute cy from last add
+ stq $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
-.Lend: addq $4,$0,$4
- cmpult $4,$0,$1
- addq $3,$4,$4
- cmpult $4,$3,$0
- stq $4,0($16)
- or $0,$1,$0
+.Lret: or $25,$31,$0 # return cy
ret $31,($26),1
-
.end __mpn_add_n
diff --git a/sysdeps/alpha/alphaev5/lshift.s b/sysdeps/alpha/alphaev5/lshift.s
index fdb0895..392b424 100644
--- a/sysdeps/alpha/alphaev5/lshift.s
+++ b/sysdeps/alpha/alphaev5/lshift.s
@@ -25,7 +25,7 @@
# size r18
# cnt r19
- # This code runs at 4.25 cycles/limb on the EV5.
+ # This code runs at 3.25 cycles/limb on the EV5.
.set noreorder
.set noat
@@ -44,11 +44,11 @@ __mpn_lshift:
and $18,4-1,$28 # number of limbs in first loop
srl $4,$20,$0 # compute function result
- beq $28,L0
+ beq $28,.L0
subq $18,$28,$18
.align 3
-Loop0: ldq $3,-16($17)
+.Loop0: ldq $3,-16($17)
subq $16,8,$16
sll $4,$19,$5
subq $17,8,$17
@@ -57,17 +57,17 @@ Loop0: ldq $3,-16($17)
or $3,$3,$4
or $5,$6,$8
stq $8,0($16)
- bne $28,Loop0
+ bne $28,.Loop0
-L0: sll $4,$19,$24
- beq $18,Lend
+.L0: sll $4,$19,$24
+ beq $18,.Lend
# warm up phase 1
ldq $1,-16($17)
subq $18,4,$18
ldq $2,-24($17)
ldq $3,-32($17)
ldq $4,-40($17)
- beq $18,Lcool1
+ beq $18,.Lend1
# warm up phase 2
srl $1,$20,$7
sll $1,$19,$21
@@ -84,10 +84,10 @@ L0: sll $4,$19,$24
sll $4,$19,$24
ldq $4,-72($17)
subq $18,4,$18
- beq $18,Lcool1
+ beq $18,.Lend2
.align 4
# main loop
-Loop: stq $7,-8($16)
+.Loop: stq $7,-8($16)
or $5,$22,$5
stq $8,-16($16)
or $6,$23,$6
@@ -113,16 +113,14 @@ Loop: stq $7,-8($16)
subq $16,32,$16
srl $4,$20,$6
- ldq $3,-96($17
+ ldq $3,-96($17)
sll $4,$19,$24
ldq $4,-104($17)
subq $17,32,$17
- bne $18,Loop
- unop
- unop
+ bne $18,.Loop
# cool down phase 2/1
-Lcool1: stq $7,-8($16)
+.Lend2: stq $7,-8($16)
or $5,$22,$5
stq $8,-16($16)
or $6,$23,$6
@@ -150,7 +148,7 @@ Lcool1: stq $7,-8($16)
ret $31,($26),1
# cool down phase 1/1
-Lcool1: srl $1,$20,$7
+.Lend1: srl $1,$20,$7
sll $1,$19,$21
srl $2,$20,$8
sll $2,$19,$22
@@ -170,6 +168,6 @@ Lcool1: srl $1,$20,$7
stq $24,-40($16)
ret $31,($26),1
-Lend stq $24,-8($16)
+.Lend: stq $24,-8($16)
ret $31,($26),1
.end __mpn_lshift
diff --git a/sysdeps/alpha/alphaev5/rshift.s b/sysdeps/alpha/alphaev5/rshift.s
index 1da9960..d20dde3 100644
--- a/sysdeps/alpha/alphaev5/rshift.s
+++ b/sysdeps/alpha/alphaev5/rshift.s
@@ -25,7 +25,7 @@
# size r18
# cnt r19
- # This code runs at 4.25 cycles/limb on the EV5.
+ # This code runs at 3.25 cycles/limb on the EV5.
.set noreorder
.set noat
@@ -42,11 +42,11 @@ __mpn_rshift:
and $18,4-1,$28 # number of limbs in first loop
sll $4,$20,$0 # compute function result
- beq $28,L0
+ beq $28,.L0
subq $18,$28,$18
.align 3
-Loop0: ldq $3,8($17)
+.Loop0: ldq $3,8($17)
addq $16,8,$16
srl $4,$19,$5
addq $17,8,$17
@@ -55,17 +55,17 @@ Loop0: ldq $3,8($17)
or $3,$3,$4
or $5,$6,$8
stq $8,-8($16)
- bne $28,Loop0
+ bne $28,.Loop0
-L0: srl $4,$19,$24
- beq $18,Lend
+.L0: srl $4,$19,$24
+ beq $18,.Lend
# warm up phase 1
ldq $1,8($17)
subq $18,4,$18
ldq $2,16($17)
ldq $3,24($17)
ldq $4,32($17)
- beq $18,Lcool1
+ beq $18,.Lend1
# warm up phase 2
sll $1,$20,$7
srl $1,$19,$21
@@ -82,10 +82,10 @@ L0: srl $4,$19,$24
srl $4,$19,$24
ldq $4,64($17)
subq $18,4,$18
- beq $18,Lcool2
+ beq $18,.Lend2
.align 4
# main loop
-Loop: stq $7,0($16)
+.Loop: stq $7,0($16)
or $5,$22,$5
stq $8,8($16)
or $6,$23,$6
@@ -116,11 +116,9 @@ Loop: stq $7,0($16)
ldq $4,96($17)
addq $17,32,$17
- bne $18,Loop
- unop
- unop
+ bne $18,.Loop
# cool down phase 2/1
-Lcool2: stq $7,0($16)
+.Lend2: stq $7,0($16)
or $5,$22,$5
stq $8,8($16)
or $6,$23,$6
@@ -148,7 +146,7 @@ Lcool2: stq $7,0($16)
ret $31,($26),1
# cool down phase 1/1
-Lcool1: sll $1,$20,$7
+.Lend1: sll $1,$20,$7
srl $1,$19,$21
sll $2,$20,$8
srl $2,$19,$22
@@ -168,6 +166,6 @@ Lcool1: sll $1,$20,$7
stq $24,32($16)
ret $31,($26),1
-Lend: stq $24,0($16)
+.Lend: stq $24,0($16)
ret $31,($26),1
.end __mpn_rshift
diff --git a/sysdeps/alpha/alphaev5/sub_n.s b/sysdeps/alpha/alphaev5/sub_n.s
new file mode 100644
index 0000000..c9f3a4e
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/sub_n.s
@@ -0,0 +1,148 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+ # res[] = s1[] - s2[] for size limbs; the final borrow (cy) is returned in $0
+ or $31,$31,$25 # clear cy
+ subq $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldq $0,0($18) # s2 limb 0
+ ldq $1,8($18) # s2 limb 1
+ ldq $4,0($17) # s1 limb 0
+ ldq $5,8($17) # s1 limb 1
+ addq $17,32,$17 # update s1_ptr
+ ldq $2,16($18) # s2 limb 2
+ subq $4,$0,$20 # 1st main sub
+ ldq $3,24($18) # s2 limb 3
+ subq $19,4,$19 # decr loop cnt
+ ldq $6,-16($17) # s1 limb 2 (s1_ptr already advanced)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldq $7,-8($17) # s1 limb 3 (s1_ptr already advanced)
+ addq $1,$25,$28 # cy add
+ addq $18,32,$18 # update s2_ptr
+ subq $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy (borrow) from last sub
+ ldq $0,0($18)
+ or $8,$25,$25 # combine cy from the add and the sub
+ ldq $1,8($18)
+ addq $2,$25,$28 # cy add
+ ldq $4,0($17)
+ subq $6,$28,$22 # 3rd main sub
+ ldq $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy (borrow) from last sub
+ stq $20,0($16)
+ or $8,$25,$25 # combine cy from the add and the sub
+ stq $21,8($16)
+ addq $3,$25,$28 # cy add
+ subq $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy (borrow) from last sub
+ addq $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the add and the sub
+ addq $16,32,$16 # update res_ptr
+ addq $0,$25,$28 # cy add
+ ldq $2,16($18)
+ subq $4,$28,$20 # 1st main sub
+ ldq $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldq $6,-16($17)
+ cmpult $4,$20,$25 # compute cy (borrow) from last sub
+ ldq $7,-8($17)
+ or $8,$25,$25 # combine cy from the add and the sub
+ subq $19,4,$19 # decr loop cnt
+ stq $22,-16($16)
+ addq $1,$25,$28 # cy add
+ stq $23,-8($16)
+ subq $5,$28,$21 # 2nd main sub
+ addq $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy (borrow) from last sub
+ or $8,$25,$25 # combine cy from the add and the sub
+ addq $2,$25,$28 # cy add
+ subq $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy (borrow) from last sub
+ stq $20,0($16)
+ or $8,$25,$25 # combine cy from the add and the sub
+ stq $21,8($16)
+ addq $3,$25,$28 # cy add
+ subq $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy (borrow) from last sub
+ or $8,$25,$25 # combine cy from the add and the sub
+ addq $16,32,$16 # update res_ptr
+ stq $22,-16($16)
+ stq $23,-8($16)
+.Lend2: addq $19,4,$19 # restore loop cnt
+ beq $19,.Lret # no limbs remain, just return cy
+ # Start software pipeline for 2nd loop
+ ldq $0,0($18) # next s2 limb
+ ldq $4,0($17) # next s1 limb
+ subq $19,1,$19 # decr loop cnt
+ beq $19,.Lend0 # only one limb remains, skip 2nd loop
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addq $0,$25,$28 # cy add
+ ldq $0,8($18)
+ subq $4,$28,$20 # main sub
+ ldq $1,8($17) # next s1 limb; $4 is still needed for the cy compare below
+ addq $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addq $17,8,$17
+ stq $20,0($16)
+ cmpult $4,$20,$25 # compute cy (borrow) from last sub
+ subq $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the add and the sub
+ addq $16,8,$16
+ or $1,$31,$4 # move the s1 limb loaded above into $4
+ bne $19,.Loop0
+.Lend0: addq $0,$25,$28 # cy add
+ subq $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy (borrow) from last sub
+ stq $20,0($16)
+ or $8,$25,$25 # combine cy from the add and the sub
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
diff --git a/sysdeps/alpha/lshift.s b/sysdeps/alpha/lshift.s
index c284349..aa8417b 100644
--- a/sysdeps/alpha/lshift.s
+++ b/sysdeps/alpha/lshift.s
@@ -53,11 +53,11 @@ __mpn_lshift:
and $18,4-1,$20 # number of limbs in first loop
srl $4,$7,$0 # compute function result
- beq $20,L0
+ beq $20,.L0
subq $18,$20,$18
.align 3
-Loop0:
+.Loop0:
ldq $3,-8($17)
subq $16,8,$16
subq $17,8,$17
@@ -67,12 +67,12 @@ Loop0:
bis $3,$3,$4
bis $5,$6,$8
stq $8,0($16)
- bne $20,Loop0
+ bne $20,.Loop0
-L0: beq $18,Lend
+.L0: beq $18,.Lend
.align 3
-Loop: ldq $3,-8($17)
+.Loop: ldq $3,-8($17)
subq $16,32,$16
subq $18,4,$18
sll $4,$19,$5
@@ -100,9 +100,9 @@ Loop: ldq $3,-8($17)
bis $1,$2,$8
stq $8,0($16)
- bgt $18,Loop
+ bgt $18,.Loop
-Lend: sll $4,$19,$8
+.Lend: sll $4,$19,$8
stq $8,-8($16)
ret $31,($26),1
.end __mpn_lshift
diff --git a/sysdeps/alpha/mul_1.s b/sysdeps/alpha/mul_1.s
index 3ef194d..58a63df 100644
--- a/sysdeps/alpha/mul_1.s
+++ b/sysdeps/alpha/mul_1.s
@@ -1,7 +1,7 @@
# Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
# the result in a second limb vector.
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
diff --git a/sysdeps/alpha/rshift.s b/sysdeps/alpha/rshift.s
index 74eab04..037b776 100644
--- a/sysdeps/alpha/rshift.s
+++ b/sysdeps/alpha/rshift.s
@@ -34,7 +34,7 @@
# 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
# 2. Only aligned instruction pairs can be paired.
# 3. The store buffer or silo might not be able to deal with the bandwidth.
-
+
.set noreorder
.set noat
.text
@@ -51,11 +51,11 @@ __mpn_rshift:
and $18,4-1,$20 # number of limbs in first loop
sll $4,$7,$0 # compute function result
- beq $20,L0
+ beq $20,.L0
subq $18,$20,$18
.align 3
-Loop0:
+.Loop0:
ldq $3,0($17)
addq $16,8,$16
addq $17,8,$17
@@ -65,12 +65,12 @@ Loop0:
bis $3,$3,$4
bis $5,$6,$8
stq $8,-8($16)
- bne $20,Loop0
+ bne $20,.Loop0
-L0: beq $18,Lend
+.L0: beq $18,.Lend
.align 3
-Loop: ldq $3,0($17)
+.Loop: ldq $3,0($17)
addq $16,32,$16
subq $18,4,$18
srl $4,$19,$5
@@ -98,9 +98,9 @@ Loop: ldq $3,0($17)
bis $1,$2,$8
stq $8,-8($16)
- bgt $18,Loop
+ bgt $18,.Loop
-Lend: srl $4,$19,$8
+.Lend: srl $4,$19,$8
stq $8,0($16)
ret $31,($26),1
.end __mpn_rshift
diff --git a/sysdeps/alpha/submul_1.s b/sysdeps/alpha/submul_1.s
index acaa11c..292b2c1 100644
--- a/sysdeps/alpha/submul_1.s
+++ b/sysdeps/alpha/submul_1.s
@@ -26,16 +26,7 @@
# size r18
# s2_limb r19
- # This code runs at 42 cycles/limb on the 21064.
-
- # To improve performance for long multiplications, we would use
- # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
- # these instructions without slowing down the general code: 1. We can
- # only have two prefetches in operation at any time in the Alpha
- # architecture. 2. There will seldom be any special alignment
- # between RES_PTR and S1_PTR. Maybe we can simply divide the current
- # loop into an inner and outer loop, having the inner loop handle
- # exactly one prefetch block?
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
.set noreorder
.set noat
@@ -52,7 +43,7 @@ __mpn_submul_1:
mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
umulh $2,$19,$0 # $0 = prod_high
- beq $18,Lend1 # jump if size was == 1
+ beq $18,.Lend1 # jump if size was == 1
ldq $2,0($17) # $2 = s1_limb
addq $17,8,$17 # s1_ptr++
subq $18,1,$18 # size--
@@ -60,10 +51,10 @@ __mpn_submul_1:
cmpult $5,$3,$4
stq $3,0($16)
addq $16,8,$16 # res_ptr++
- beq $18,Lend2 # jump if size was == 2
+ beq $18,.Lend2 # jump if size was == 2
.align 3
-Loop: mulq $2,$19,$3 # $3 = prod_low
+.Loop: mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
subq $18,1,$18 # size--
@@ -77,9 +68,9 @@ Loop: mulq $2,$19,$3 # $3 = prod_low
stq $3,0($16)
addq $16,8,$16 # res_ptr++
addq $5,$0,$0 # combine carries
- bne $18,Loop
+ bne $18,.Loop
-Lend2: mulq $2,$19,$3 # $3 = prod_low
+.Lend2: mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
umulh $2,$19,$4 # $4 = cy_limb
@@ -91,7 +82,7 @@ Lend2: mulq $2,$19,$3 # $3 = prod_low
addq $5,$0,$0 # combine carries
addq $4,$0,$0 # cy_limb = prod_high + cy
ret $31,($26),1
-Lend1: subq $5,$3,$3
+.Lend1: subq $5,$3,$3
cmpult $5,$3,$5
stq $3,0($16)
addq $0,$5,$0
diff --git a/sysdeps/alpha/udiv_qrnnd.S b/sysdeps/alpha/udiv_qrnnd.S
index bafafd6..ce590ed 100644
--- a/sysdeps/alpha/udiv_qrnnd.S
+++ b/sysdeps/alpha/udiv_qrnnd.S
@@ -1,6 +1,6 @@
# Alpha 21064 __udiv_qrnnd
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
@@ -21,13 +21,11 @@
.set noreorder
.set noat
-
.text
- .align 3
- .globl __udiv_qrnnd
- .ent __udiv_qrnnd 0
+ .align 3
+ .globl __udiv_qrnnd
+ .ent __udiv_qrnnd
__udiv_qrnnd:
-__udiv_qrnnd..ng:
.frame $30,0,$26,0
.prologue 0
#define cnt $2
@@ -39,9 +37,9 @@ __udiv_qrnnd..ng:
#define qb $20
ldiq cnt,16
- blt d,Largedivisor
+ blt d,.Largedivisor
-Loop1: cmplt n0,0,tmp
+.Loop1: cmplt n0,0,tmp
addq n1,n1,n1
bis n1,tmp,n1
addq n0,n0,n0
@@ -74,12 +72,12 @@ Loop1: cmplt n0,0,tmp
cmovne qb,tmp,n1
bis n0,qb,n0
subq cnt,1,cnt
- bgt cnt,Loop1
+ bgt cnt,.Loop1
stq n1,0(rem_ptr)
bis $31,n0,$0
ret $31,($26),1
-Largedivisor:
+.Largedivisor:
and n0,1,$4
srl n0,1,n0
@@ -91,7 +89,7 @@ Largedivisor:
srl d,1,$5
addq $5,$6,$5
-Loop2: cmplt n0,0,tmp
+.Loop2: cmplt n0,0,tmp
addq n1,n1,n1
bis n1,tmp,n1
addq n0,n0,n0
@@ -124,27 +122,27 @@ Loop2: cmplt n0,0,tmp
cmovne qb,tmp,n1
bis n0,qb,n0
subq cnt,1,cnt
- bgt cnt,Loop2
+ bgt cnt,.Loop2
addq n1,n1,n1
addq $4,n1,n1
- bne $6,Odd
+ bne $6,.LOdd
stq n1,0(rem_ptr)
bis $31,n0,$0
ret $31,($26),1
-Odd:
+.LOdd:
/* q' in n0. r' in n1 */
addq n1,n0,n1
cmpult n1,n0,tmp # tmp := carry from addq
- beq tmp,LLp6
+ beq tmp,.LLp6
addq n0,1,n0
subq n1,d,n1
-LLp6: cmpult n1,d,tmp
- bne tmp,LLp7
+.LLp6: cmpult n1,d,tmp
+ bne tmp,.LLp7
addq n0,1,n0
subq n1,d,n1
-LLp7:
+.LLp7:
stq n1,0(rem_ptr)
bis $31,n0,$0
ret $31,($26),1