diff options
Diffstat (limited to 'sysdeps/alpha')
-rw-r--r-- | sysdeps/alpha/addmul_1.s | 23 | ||||
-rw-r--r-- | sysdeps/alpha/alphaev5/add_n.s | 175 | ||||
-rw-r--r-- | sysdeps/alpha/alphaev5/lshift.s | 30 | ||||
-rw-r--r-- | sysdeps/alpha/alphaev5/rshift.s | 28 | ||||
-rw-r--r-- | sysdeps/alpha/alphaev5/sub_n.s | 148 | ||||
-rw-r--r-- | sysdeps/alpha/lshift.s | 14 | ||||
-rw-r--r-- | sysdeps/alpha/mul_1.s | 2 | ||||
-rw-r--r-- | sysdeps/alpha/rshift.s | 16 | ||||
-rw-r--r-- | sysdeps/alpha/submul_1.s | 23 | ||||
-rw-r--r-- | sysdeps/alpha/udiv_qrnnd.S | 34 |
10 files changed, 323 insertions, 170 deletions
diff --git a/sysdeps/alpha/addmul_1.s b/sysdeps/alpha/addmul_1.s index 46d277d..8b168cb 100644 --- a/sysdeps/alpha/addmul_1.s +++ b/sysdeps/alpha/addmul_1.s @@ -26,16 +26,7 @@ # size r18 # s2_limb r19 - # This code runs at 42 cycles/limb on the 21064. - - # To improve performance for long multiplications, we would use - # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use - # these instructions without slowing down the general code: 1. We can - # only have two prefetches in operation at any time in the Alpha - # architecture. 2. There will seldom be any special alignment - # between RES_PTR and S1_PTR. Maybe we can simply divide the current - # loop into an inner and outer loop, having the inner loop handle - # exactly one prefetch block? + # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5. .set noreorder .set noat @@ -52,7 +43,7 @@ __mpn_addmul_1: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr umulh $2,$19,$0 # $0 = prod_high - beq $18,Lend1 # jump if size was == 1 + beq $18,.Lend1 # jump if size was == 1 ldq $2,0($17) # $2 = s1_limb addq $17,8,$17 # s1_ptr++ subq $18,1,$18 # size-- @@ -60,10 +51,10 @@ __mpn_addmul_1: cmpult $3,$5,$4 stq $3,0($16) addq $16,8,$16 # res_ptr++ - beq $18,Lend2 # jump if size was == 2 + beq $18,.Lend2 # jump if size was == 2 .align 3 -Loop: mulq $2,$19,$3 # $3 = prod_low +.Loop: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr addq $4,$0,$0 # cy_limb = cy_limb + 'cy' subq $18,1,$18 # size-- @@ -77,9 +68,9 @@ Loop: mulq $2,$19,$3 # $3 = prod_low stq $3,0($16) addq $16,8,$16 # res_ptr++ addq $5,$0,$0 # combine carries - bne $18,Loop + bne $18,.Loop -Lend2: mulq $2,$19,$3 # $3 = prod_low +.Lend2: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr addq $4,$0,$0 # cy_limb = cy_limb + 'cy' umulh $2,$19,$4 # $4 = cy_limb @@ -91,7 +82,7 @@ Lend2: mulq $2,$19,$3 # $3 = prod_low addq $5,$0,$0 # combine carries addq $4,$0,$0 # cy_limb = prod_high + cy ret $31,($26),1 -Lend1: addq $5,$3,$3 +.Lend1: addq $5,$3,$3 cmpult $3,$5,$5 stq $3,0($16) addq $0,$5,$0 diff --git a/sysdeps/alpha/alphaev5/add_n.s b/sysdeps/alpha/alphaev5/add_n.s index 2aaf041..66cf82b 100644 --- a/sysdeps/alpha/alphaev5/add_n.s +++ b/sysdeps/alpha/alphaev5/add_n.s @@ -35,84 +35,113 @@ __mpn_add_n: .frame $30,0,$26,0 - ldq $3,0($17) - ldq $4,0($18) - - subq $19,1,$19 - and $19,4-1,$2 # number of limbs in first loop - bis $31,$31,$0 - beq $2,.L0 # if multiple of 4 limbs, skip first loop - - subq $19,$2,$19 - -.Loop0: subq $2,1,$2 + or $31,$31,$25 # clear cy + subq $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldq $0,0($18) + ldq $1,8($18) + ldq $4,0($17) ldq $5,8($17) - addq $4,$0,$4 - ldq $6,8($18) - cmpult $4,$0,$1 - addq $3,$4,$4 - cmpult $4,$3,$0 - stq $4,0($16) - or $0,$1,$0 - - addq $17,8,$17 - addq $18,8,$18 - bis $5,$5,$3 - bis $6,$6,$4 - addq $16,8,$16 - bne $2,.Loop0 - -.L0: beq $19,.Lend - + addq $17,32,$17 # update s1_ptr + ldq $2,16($18) + addq $0,$4,$20 # 1st main add + ldq $3,24($18) + subq $19,4,$19 # decr loop cnt + ldq $6,-16($17) + cmpult $20,$0,$25 # compute cy from last add + ldq $7,-8($17) + addq $1,$25,$28 # cy add + addq $18,32,$18 # update s2_ptr + addq $5,$28,$21 # 2nd main add + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline .align 4 -.Loop: subq $19,4,$19 - unop - - ldq $6,8($18) - addq $4,$0,$0 +.Loop: cmpult $21,$28,$25 # compute cy from last add + ldq $0,0($18) + or $8,$25,$25 # combine cy from the two adds + ldq $1,8($18) + addq $2,$25,$28 # cy add + ldq $4,0($17) + addq $28,$6,$22 # 3rd main add ldq $5,8($17) - cmpult $0,$4,$1 - ldq $4,16($18) - addq $3,$0,$20 - cmpult $20,$3,$0 - ldq $3,16($17) - or $0,$1,$0 - addq $6,$0,$0 - cmpult $0,$6,$1 - ldq $6,24($18) - addq $5,$0,$21 - cmpult $21,$5,$0 - ldq $5,24($17) - or $0,$1,$0 - addq $4,$0,$0 - cmpult $0,$4,$1 - ldq $4,32($18) - addq $3,$0,$22 - cmpult $22,$3,$0 - ldq $3,32($17) - or $0,$1,$0 - addq $6,$0,$0 - cmpult $0,$6,$1 - addq $5,$0,$23 - cmpult $23,$5,$0 - or $0,$1,$0 - + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds stq $21,8($16) - stq $22,16($16) - stq $23,24($16) - - addq $17,32,$17 - addq $18,32,$18 - addq $16,32,$16 - bne $19,.Loop + addq $3,$25,$28 # cy add + addq $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + addq $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + addq $0,$25,$28 # cy add + ldq $2,16($18) + addq $4,$28,$20 # 1st main add + ldq $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldq $6,-16($17) + cmpult $20,$28,$25 # compute cy from last add + ldq $7,-8($17) + or $8,$25,$25 # combine cy from the two adds + subq $19,4,$19 # decr loop cnt + stq $22,-16($16) + addq $1,$25,$28 # cy add + stq $23,-8($16) + addq $5,$28,$21 # 2nd main add + addq $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $21,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $2,$25,$28 # cy add + addq $28,$6,$22 # 3rd main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + stq $21,8($16) + addq $3,$25,$28 # cy add + addq $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + stq $22,-16($16) + stq $23,-8($16) +.Lend2: addq $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldq $0,0($18) + ldq $4,0($17) + subq $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addq $0,$25,$28 # cy add + ldq $0,8($18) + addq $4,$28,$20 # main add + ldq $4,8($17) + addq $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addq $17,8,$17 + stq $20,0($16) + cmpult $20,$28,$25 # compute cy from last add + subq $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two adds + addq $16,8,$16 + bne $19,.Loop0 +.Lend0: addq $0,$25,$28 # cy add + addq $4,$28,$20 # main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $20,$28,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds -.Lend: addq $4,$0,$4 - cmpult $4,$0,$1 - addq $3,$4,$4 - cmpult $4,$3,$0 - stq $4,0($16) - or $0,$1,$0 +.Lret: or $25,$31,$0 # return cy ret $31,($26),1 - .end __mpn_add_n diff --git a/sysdeps/alpha/alphaev5/lshift.s b/sysdeps/alpha/alphaev5/lshift.s index fdb0895..392b424 100644 --- a/sysdeps/alpha/alphaev5/lshift.s +++ b/sysdeps/alpha/alphaev5/lshift.s @@ -25,7 +25,7 @@ # size r18 # cnt r19 - # This code runs at 4.25 cycles/limb on the EV5. + # This code runs at 3.25 cycles/limb on the EV5. .set noreorder .set noat @@ -44,11 +44,11 @@ __mpn_lshift: and $18,4-1,$28 # number of limbs in first loop srl $4,$20,$0 # compute function result - beq $28,L0 + beq $28,.L0 subq $18,$28,$18 .align 3 -Loop0: ldq $3,-16($17) +.Loop0: ldq $3,-16($17) subq $16,8,$16 sll $4,$19,$5 subq $17,8,$17 @@ -57,17 +57,17 @@ Loop0: ldq $3,-16($17) or $3,$3,$4 or $5,$6,$8 stq $8,0($16) - bne $28,Loop0 + bne $28,.Loop0 -L0: sll $4,$19,$24 - beq $18,Lend +.L0: sll $4,$19,$24 + beq $18,.Lend # warm up phase 1 ldq $1,-16($17) subq $18,4,$18 ldq $2,-24($17) ldq $3,-32($17) ldq $4,-40($17) - beq $18,Lcool1 + beq $18,.Lend1 # warm up phase 2 srl $1,$20,$7 sll $1,$19,$21 @@ -84,10 +84,10 @@ L0: sll $4,$19,$24 sll $4,$19,$24 ldq $4,-72($17) subq $18,4,$18 - beq $18,Lcool1 + beq $18,.Lend2 .align 4 # main loop -Loop: stq $7,-8($16) +.Loop: stq $7,-8($16) or $5,$22,$5 stq $8,-16($16) or $6,$23,$6 @@ -113,16 +113,14 @@ Loop: stq $7,-8($16) subq $16,32,$16 srl $4,$20,$6 - ldq $3,-96($17 + ldq $3,-96($17) sll $4,$19,$24 ldq $4,-104($17) subq $17,32,$17 - bne $18,Loop - unop - unop + bne $18,.Loop # cool down phase 2/1 -Lcool1: stq $7,-8($16) +.Lend2: stq $7,-8($16) or $5,$22,$5 stq $8,-16($16) or $6,$23,$6 @@ -150,7 +148,7 @@ Lcool1: stq $7,-8($16) ret $31,($26),1 # cool down phase 1/1 -Lcool1: srl $1,$20,$7 +.Lend1: srl $1,$20,$7 sll $1,$19,$21 srl $2,$20,$8 sll $2,$19,$22 @@ -170,6 +168,6 @@ Lcool1: srl $1,$20,$7 stq $24,-40($16) ret $31,($26),1 -Lend stq $24,-8($16) +.Lend: stq $24,-8($16) ret $31,($26),1 .end __mpn_lshift diff --git a/sysdeps/alpha/alphaev5/rshift.s b/sysdeps/alpha/alphaev5/rshift.s index 1da9960..d20dde3 100644 --- a/sysdeps/alpha/alphaev5/rshift.s +++ b/sysdeps/alpha/alphaev5/rshift.s @@ -25,7 +25,7 @@ # size r18 # cnt r19 - # This code runs at 4.25 cycles/limb on the EV5. + # This code runs at 3.25 cycles/limb on the EV5. .set noreorder .set noat @@ -42,11 +42,11 @@ __mpn_rshift: and $18,4-1,$28 # number of limbs in first loop sll $4,$20,$0 # compute function result - beq $28,L0 + beq $28,.L0 subq $18,$28,$18 .align 3 -Loop0: ldq $3,8($17) +.Loop0: ldq $3,8($17) addq $16,8,$16 srl $4,$19,$5 addq $17,8,$17 @@ -55,17 +55,17 @@ Loop0: ldq $3,8($17) or $3,$3,$4 or $5,$6,$8 stq $8,-8($16) - bne $28,Loop0 + bne $28,.Loop0 -L0: srl $4,$19,$24 - beq $18,Lend +.L0: srl $4,$19,$24 + beq $18,.Lend # warm up phase 1 ldq $1,8($17) subq $18,4,$18 ldq $2,16($17) ldq $3,24($17) ldq $4,32($17) - beq $18,Lcool1 + beq $18,.Lend1 # warm up phase 2 sll $1,$20,$7 srl $1,$19,$21 @@ -82,10 +82,10 @@ L0: srl $4,$19,$24 srl $4,$19,$24 ldq $4,64($17) subq $18,4,$18 - beq $18,Lcool2 + beq $18,.Lend2 .align 4 # main loop -Loop: stq $7,0($16) +.Loop: stq $7,0($16) or $5,$22,$5 stq $8,8($16) or $6,$23,$6 @@ -116,11 +116,9 @@ Loop: stq $7,0($16) ldq $4,96($17) addq $17,32,$17 - bne $18,Loop - unop - unop + bne $18,.Loop # cool down phase 2/1 -Lcool2: stq $7,0($16) +.Lend2: stq $7,0($16) or $5,$22,$5 stq $8,8($16) or $6,$23,$6 @@ -148,7 +146,7 @@ Lcool2: stq $7,0($16) ret $31,($26),1 # cool down phase 1/1 -Lcool1: sll $1,$20,$7 +.Lend1: sll $1,$20,$7 srl $1,$19,$21 sll $2,$20,$8 srl $2,$19,$22 @@ -168,6 +166,6 @@ Lcool1: sll $1,$20,$7 stq $24,32($16) ret $31,($26),1 -Lend: stq $24,0($16) +.Lend: stq $24,0($16) ret $31,($26),1 .end __mpn_rshift diff --git a/sysdeps/alpha/alphaev5/sub_n.s b/sysdeps/alpha/alphaev5/sub_n.s new file mode 100644 index 0000000..c9f3a4e --- /dev/null +++ b/sysdeps/alpha/alphaev5/sub_n.s @@ -0,0 +1,148 @@ + # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .frame $30,0,$26,0 + + or $31,$31,$25 # clear cy + subq $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldq $0,0($18) + ldq $1,8($18) + ldq $4,0($17) + ldq $5,8($17) + addq $17,32,$17 # update s1_ptr + ldq $2,16($18) + subq $4,$0,$20 # 1st main sub + ldq $3,24($18) + subq $19,4,$19 # decr loop cnt + ldq $6,-16($17) + cmpult $4,$20,$25 # compute cy from last sub + ldq $7,-8($17) + addq $1,$25,$28 # cy add + addq $18,32,$18 # update s2_ptr + subq $5,$28,$21 # 2nd main sub + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline + .align 4 +.Loop: cmpult $5,$21,$25 # compute cy from last add + ldq $0,0($18) + or $8,$25,$25 # combine cy from the two adds + ldq $1,8($18) + addq $2,$25,$28 # cy add + ldq $4,0($17) + subq $6,$28,$22 # 3rd main sub + ldq $5,8($17) + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + stq $21,8($16) + addq $3,$25,$28 # cy add + subq $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + addq $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + addq $0,$25,$28 # cy add + ldq $2,16($18) + subq $4,$28,$20 # 1st main sub + ldq $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldq $6,-16($17) + cmpult $4,$20,$25 # compute cy from last add + ldq $7,-8($17) + or $8,$25,$25 # combine cy from the two adds + subq $19,4,$19 # decr loop cnt + stq $22,-16($16) + addq $1,$25,$28 # cy add + stq $23,-8($16) + subq $5,$28,$21 # 2nd main sub + addq $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $5,$21,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $2,$25,$28 # cy add + subq $6,$28,$22 # 3rd main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + stq $21,8($16) + addq $3,$25,$28 # cy add + subq $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + stq $22,-16($16) + stq $23,-8($16) +.Lend2: addq $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldq $0,0($18) + ldq $4,0($17) + subq $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addq $0,$25,$28 # cy add + ldq $0,8($18) + subq $4,$28,$20 # main sub + ldq $1,8($17) + addq $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addq $17,8,$17 + stq $20,0($16) + cmpult $4,$20,$25 # compute cy from last add + subq $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two adds + addq $16,8,$16 + or $1,$31,$4 + bne $19,.Loop0 +.Lend0: addq $0,$25,$28 # cy add + subq $4,$28,$20 # main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $4,$20,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + +.Lret: or $25,$31,$0 # return cy + ret $31,($26),1 + .end __mpn_sub_n diff --git a/sysdeps/alpha/lshift.s b/sysdeps/alpha/lshift.s index c284349..aa8417b 100644 --- a/sysdeps/alpha/lshift.s +++ b/sysdeps/alpha/lshift.s @@ -53,11 +53,11 @@ __mpn_lshift: and $18,4-1,$20 # number of limbs in first loop srl $4,$7,$0 # compute function result - beq $20,L0 + beq $20,.L0 subq $18,$20,$18 .align 3 -Loop0: +.Loop0: ldq $3,-8($17) subq $16,8,$16 subq $17,8,$17 @@ -67,12 +67,12 @@ Loop0: bis $3,$3,$4 bis $5,$6,$8 stq $8,0($16) - bne $20,Loop0 + bne $20,.Loop0 -L0: beq $18,Lend +.L0: beq $18,.Lend .align 3 -Loop: ldq $3,-8($17) +.Loop: ldq $3,-8($17) subq $16,32,$16 subq $18,4,$18 sll $4,$19,$5 @@ -100,9 +100,9 @@ Loop: ldq $3,-8($17) bis $1,$2,$8 stq $8,0($16) - bgt $18,Loop + bgt $18,.Loop -Lend: sll $4,$19,$8 +.Lend: sll $4,$19,$8 stq $8,-8($16) ret $31,($26),1 .end __mpn_lshift diff --git a/sysdeps/alpha/mul_1.s b/sysdeps/alpha/mul_1.s index 3ef194d..58a63df 100644 --- a/sysdeps/alpha/mul_1.s +++ b/sysdeps/alpha/mul_1.s @@ -1,7 +1,7 @@ # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store # the result in a second limb vector. - # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. # This file is part of the GNU MP Library. diff --git a/sysdeps/alpha/rshift.s b/sysdeps/alpha/rshift.s index 74eab04..037b776 100644 --- a/sysdeps/alpha/rshift.s +++ b/sysdeps/alpha/rshift.s @@ -34,7 +34,7 @@ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. # 2. Only aligned instruction pairs can be paired. # 3. The store buffer or silo might not be able to deal with the bandwidth. - + .set noreorder .set noat .text @@ -51,11 +51,11 @@ __mpn_rshift: and $18,4-1,$20 # number of limbs in first loop sll $4,$7,$0 # compute function result - beq $20,L0 + beq $20,.L0 subq $18,$20,$18 .align 3 -Loop0: +.Loop0: ldq $3,0($17) addq $16,8,$16 addq $17,8,$17 @@ -65,12 +65,12 @@ Loop0: bis $3,$3,$4 bis $5,$6,$8 stq $8,-8($16) - bne $20,Loop0 + bne $20,.Loop0 -L0: beq $18,Lend +.L0: beq $18,.Lend .align 3 -Loop: ldq $3,0($17) +.Loop: ldq $3,0($17) addq $16,32,$16 subq $18,4,$18 srl $4,$19,$5 @@ -98,9 +98,9 @@ Loop: ldq $3,0($17) bis $1,$2,$8 stq $8,-8($16) - bgt $18,Loop + bgt $18,.Loop -Lend: srl $4,$19,$8 +.Lend: srl $4,$19,$8 stq $8,0($16) ret $31,($26),1 .end __mpn_rshift diff --git a/sysdeps/alpha/submul_1.s b/sysdeps/alpha/submul_1.s index acaa11c..292b2c1 100644 --- a/sysdeps/alpha/submul_1.s +++ b/sysdeps/alpha/submul_1.s @@ -26,16 +26,7 @@ # size r18 # s2_limb r19 - # This code runs at 42 cycles/limb on the 21064. - - # To improve performance for long multiplications, we would use - # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use - # these instructions without slowing down the general code: 1. We can - # only have two prefetches in operation at any time in the Alpha - # architecture. 2. There will seldom be any special alignment - # between RES_PTR and S1_PTR. Maybe we can simply divide the current - # loop into an inner and outer loop, having the inner loop handle - # exactly one prefetch block? + # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5. .set noreorder .set noat @@ -52,7 +43,7 @@ __mpn_submul_1: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr umulh $2,$19,$0 # $0 = prod_high - beq $18,Lend1 # jump if size was == 1 + beq $18,.Lend1 # jump if size was == 1 ldq $2,0($17) # $2 = s1_limb addq $17,8,$17 # s1_ptr++ subq $18,1,$18 # size-- @@ -60,10 +51,10 @@ __mpn_submul_1: cmpult $5,$3,$4 stq $3,0($16) addq $16,8,$16 # res_ptr++ - beq $18,Lend2 # jump if size was == 2 + beq $18,.Lend2 # jump if size was == 2 .align 3 -Loop: mulq $2,$19,$3 # $3 = prod_low +.Loop: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr addq $4,$0,$0 # cy_limb = cy_limb + 'cy' subq $18,1,$18 # size-- @@ -77,9 +68,9 @@ Loop: mulq $2,$19,$3 # $3 = prod_low stq $3,0($16) addq $16,8,$16 # res_ptr++ addq $5,$0,$0 # combine carries - bne $18,Loop + bne $18,.Loop -Lend2: mulq $2,$19,$3 # $3 = prod_low +.Lend2: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr addq $4,$0,$0 # cy_limb = cy_limb + 'cy' umulh $2,$19,$4 # $4 = cy_limb @@ -91,7 +82,7 @@ Lend2: mulq $2,$19,$3 # $3 = prod_low addq $5,$0,$0 # combine carries addq $4,$0,$0 # cy_limb = prod_high + cy ret $31,($26),1 -Lend1: subq $5,$3,$3 +.Lend1: subq $5,$3,$3 cmpult $5,$3,$5 stq $3,0($16) addq $0,$5,$0 diff --git a/sysdeps/alpha/udiv_qrnnd.S b/sysdeps/alpha/udiv_qrnnd.S index bafafd6..ce590ed 100644 --- a/sysdeps/alpha/udiv_qrnnd.S +++ b/sysdeps/alpha/udiv_qrnnd.S @@ -1,6 +1,6 @@ # Alpha 21064 __udiv_qrnnd - # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. # This file is part of the GNU MP Library. @@ -21,13 +21,11 @@ .set noreorder .set noat - .text - .align 3 - .globl __udiv_qrnnd - .ent __udiv_qrnnd 0 + .align 3 + .globl __udiv_qrnnd + .ent __udiv_qrnnd __udiv_qrnnd: -__udiv_qrnnd..ng: .frame $30,0,$26,0 .prologue 0 #define cnt $2 @@ -39,9 +37,9 @@ __udiv_qrnnd..ng: #define qb $20 ldiq cnt,16 - blt d,Largedivisor + blt d,.Largedivisor -Loop1: cmplt n0,0,tmp +.Loop1: cmplt n0,0,tmp addq n1,n1,n1 bis n1,tmp,n1 addq n0,n0,n0 @@ -74,12 +72,12 @@ Loop1: cmplt n0,0,tmp cmovne qb,tmp,n1 bis n0,qb,n0 subq cnt,1,cnt - bgt cnt,Loop1 + bgt cnt,.Loop1 stq n1,0(rem_ptr) bis $31,n0,$0 ret $31,($26),1 -Largedivisor: +.Largedivisor: and n0,1,$4 srl n0,1,n0 @@ -91,7 +89,7 @@ Largedivisor: srl d,1,$5 addq $5,$6,$5 -Loop2: cmplt n0,0,tmp +.Loop2: cmplt n0,0,tmp addq n1,n1,n1 bis n1,tmp,n1 addq n0,n0,n0 @@ -124,27 +122,27 @@ Loop2: cmplt n0,0,tmp cmovne qb,tmp,n1 bis n0,qb,n0 subq cnt,1,cnt - bgt cnt,Loop2 + bgt cnt,.Loop2 addq n1,n1,n1 addq $4,n1,n1 - bne $6,Odd + bne $6,.LOdd stq n1,0(rem_ptr) bis $31,n0,$0 ret $31,($26),1 -Odd: +.LOdd: /* q' in n0. r' in n1 */ addq n1,n0,n1 cmpult n1,n0,tmp # tmp := carry from addq - beq tmp,LLp6 + beq tmp,.LLp6 addq n0,1,n0 subq n1,d,n1 -LLp6: cmpult n1,d,tmp - bne tmp,LLp7 +.LLp6: cmpult n1,d,tmp + bne tmp,.LLp7 addq n0,1,n0 subq n1,d,n1 -LLp7: +.LLp7: stq n1,0(rem_ptr) bis $31,n0,$0 ret $31,($26),1 |