author     Richard Henderson <rth@redhat.com>    2004-03-27 00:32:28 +0000
committer  Richard Henderson <rth@redhat.com>    2004-03-27 00:32:28 +0000
commit     fdc715a1f07b7070d2fd547daa56fc0fcdb59e22 (patch)
tree       d64607f2f6414304ab95fac7edb807b865940f1b /sysdeps
parent     4ba541a978e7fcf28af4e2930f4136acd5fbcb22 (diff)
2004-03-26 Richard Henderson <rth@redhat.com>
* sysdeps/alpha/Makefile <gnulib> (sysdep_routines): Merge divrem
variable, add unsigned variants.
* sysdeps/alpha/divrem.h: Remove file.
* sysdeps/alpha/div_libc.h: New file.
* sysdeps/alpha/divl.S: Rewrite from scratch.
* sysdeps/alpha/reml.S: Likewise.
* sysdeps/alpha/divq.S: Likewise.
* sysdeps/alpha/remq.S: Likewise.
* sysdeps/alpha/divlu.S: New file.
* sysdeps/alpha/remlu.S: New file.
* sysdeps/alpha/divqu.S: New file.
* sysdeps/alpha/remqu.S: New file.
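
The new routines drop divrem.h's pure shift-and-subtract algorithm in favor of the FPU divider, which gives an exact answer for virtually all operands; a slow path merely refines an estimate. As the comments in the new files note, these are not normal C functions: arguments arrive in registers t10 and t11, the result goes in t12, and the return address is in t9. The following is a minimal C sketch of the unsigned 64-bit strategy; soft_divqu is only an illustrative stand-in for the hand-written __divqu, and it assumes y != 0 (the real routine raises an integer divide-by-zero trap via PAL_gentrap):

#include <stdint.h>

/* Sketch of the strategy in divqu.S, not the glibc implementation.  */
uint64_t
soft_divqu (uint64_t x, uint64_t y)
{
  /* Convert both operands to double and divide.  The assembly's cvtqt is
     a signed conversion, so a dividend with bit 63 set is compensated by
     adding 2**64 ($x_is_neg); C's uint64_t-to-double conversion already
     behaves that way.  The divisor needs no 53-bit check: on the exact
     path the dividend is below 2**53, so an inexactly rounded (huge)
     divisor just drives the quotient below 1, and truncation yields the
     correct 0.  */
  double fq = (double) x / (double) y;
  if (fq >= 0x1p64)                 /* guard the C conversion; the
                                       hardware cvttq/c just truncates */
    fq = 0x1.fffffffffffffp63;      /* largest double below 2**64 */
  uint64_t q = (uint64_t) fq;       /* cvttq/c: truncate toward zero */

  /* The result is exact whenever the dividend fits in 53 bits
     (srl X, 53 in the assembly).  */
  if ((x >> 53) == 0)
    return q;

  /* Otherwise q is a close estimate.  The assembly refines it with a
     scaled shift-and-subtract loop (sketched after the diff below); the
     step-by-one loops here only show the invariant, using __int128 to
     sidestep 64-bit wraparound.  */
  while ((__int128) q * y > x)
    q--;
  while ((__int128) (q + 1) * y <= x)
    q++;
  return q;
}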
Diffstat (limited to 'sysdeps')
-rw-r--r--  sysdeps/alpha/Makefile    |   4
-rw-r--r--  sysdeps/alpha/div_libc.h  | 113
-rw-r--r--  sysdeps/alpha/divl.S      |  79
-rw-r--r--  sysdeps/alpha/divlu.S     |   4
-rw-r--r--  sysdeps/alpha/divq.S      | 269
-rw-r--r--  sysdeps/alpha/divqu.S     | 244
-rw-r--r--  sysdeps/alpha/divrem.h    | 225
-rw-r--r--  sysdeps/alpha/reml.S      |  84
-rw-r--r--  sysdeps/alpha/remlu.S     |   4
-rw-r--r--  sysdeps/alpha/remq.S      | 265
-rw-r--r--  sysdeps/alpha/remqu.S     | 251
11 files changed, 1294 insertions(+), 248 deletions(-)
diff --git a/sysdeps/alpha/Makefile b/sysdeps/alpha/Makefile
index ce8f9b3..1e74d82 100644
--- a/sysdeps/alpha/Makefile
+++ b/sysdeps/alpha/Makefile
@@ -26,7 +26,7 @@ sysdep_routines += _mcount
 endif
 
 ifeq ($(subdir),gnulib)
-sysdep_routines += $(divrem)
+sysdep_routines += divl divlu divq divqu reml remlu remq remqu
 endif
 
 ifeq ($(subdir),string)
@@ -38,8 +38,6 @@ ifeq ($(subdir),elf)
 CFLAGS-rtld.c = -mbuild-constants
 endif
 
-divrem := divl divq reml remq
-
 # For now, build everything with full IEEE math support.
 # TODO: build separate libm and libm-ieee.
 sysdep-CFLAGS += -mieee
diff --git a/sysdeps/alpha/div_libc.h b/sysdeps/alpha/div_libc.h
new file mode 100644
index 0000000..9856643
--- /dev/null
+++ b/sysdeps/alpha/div_libc.h
@@ -0,0 +1,113 @@
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+/* Common bits for implementing software divide.  */
+
+#include <sysdep.h>
+#ifdef __linux__
+# include <asm/gentrap.h>
+# include <asm/pal.h>
+#else
+# include <machine/pal.h>
+#endif
+
+/* These are not normal C functions.  Argument registers are t10 and t11;
+   the result goes in t12; the return address is in t9.  Only t12 and AT
+   may be clobbered.  */
+#define X       t10
+#define Y       t11
+#define RV      t12
+#define RA      t9
+
+/* None of these functions should use implicit anything.  */
+        .set    nomacro
+        .set    noat
+
+/* Code fragment to invoke _mcount for profiling.  This should be invoked
+   directly after allocation of the stack frame.  */
+.macro CALL_MCOUNT
+#ifdef PROF
+        stq     ra, 0(sp)
+        stq     pv, 8(sp)
+        stq     gp, 16(sp)
+        cfi_rel_offset (ra, 0)
+        cfi_rel_offset (pv, 8)
+        cfi_rel_offset (gp, 16)
+        br      AT, 1f
+        .set    macro
+1:      ldgp    gp, 0(AT)
+        mov     RA, ra
+        lda     AT, _mcount
+        jsr     AT, (AT), _mcount
+        .set    nomacro
+        ldq     ra, 0(sp)
+        ldq     pv, 8(sp)
+        ldq     gp, 16(sp)
+        cfi_restore (ra)
+        cfi_restore (pv)
+        cfi_restore (gp)
+        /* Realign subsequent code with what we'd have without this
+           macro at all.  This means aligned with one arithmetic insn
+           used within the bundle.  */
+        .align  4
+        nop
+#endif
+.endm
+
+/* In order to make the below work, all top-level divide routines must
+   use the same frame size.  */
+#define FRAME   48
+
+/* Code fragment to generate an integer divide-by-zero fault.  When
+   building libc.so, we arrange for there to be one copy of this code
+   placed late in the dso, such that all branches are forward.  When
+   building libc.a, we use multiple copies to avoid having an out of
+   range branch.  Users should jump to DIVBYZERO.  */
+
+.macro DO_DIVBYZERO
+#ifdef PIC
+#define DIVBYZERO __divbyzero
+        .section .gnu.linkonce.t.divbyzero, "ax", @progbits
+        .globl  __divbyzero
+        .type   __divbyzero, @function
+        .usepv  __divbyzero, no
+        .hidden __divbyzero
+#else
+#define DIVBYZERO $divbyzero
+#endif
+
+        .align  4
+DIVBYZERO:
+        cfi_startproc
+        cfi_return_column (RA)
+        cfi_def_cfa_offset (FRAME)
+
+        mov     a0, RV
+        unop
+        lda     a0, GEN_INTDIV
+        call_pal PAL_gentrap
+
+        mov     RV, a0
+        clr     RV
+        lda     sp, FRAME(sp)
+        cfi_def_cfa_offset (0)
+        ret     $31, (RA), 1
+
+        cfi_endproc
+        .size   DIVBYZERO, .-DIVBYZERO
+.endm
diff --git a/sysdeps/alpha/divl.S b/sysdeps/alpha/divl.S
index fdf053f..33fa118 100644
--- a/sysdeps/alpha/divl.S
+++ b/sysdeps/alpha/divl.S
@@ -1,6 +1,75 @@
-#define IS_REM 0
-#define SIZE 4
-#define UFUNC_NAME __divlu
-#define SFUNC_NAME __divl
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+/* 32-bit signed int divide.  This is not a normal C function.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
+   be clobbered.
+
+   The FPU can handle all input values except zero.  Whee!  */
+
+#ifndef EXTEND
+#define EXTEND(S,D)     sextl S, D
+#endif
+
+        .text
+        .align  4
+        .globl  __divl
+        .type   __divl, @function
+        .usepv  __divl, no
+
+        cfi_startproc
+        cfi_return_column (RA)
+__divl:
+        lda     sp, -FRAME(sp)
+        cfi_def_cfa_offset (FRAME)
+        CALL_MCOUNT
+        stt     $f0, 0(sp)
+        stt     $f1, 8(sp)
+        beq     Y, DIVBYZERO
+        cfi_rel_offset ($f0, 0)
+        cfi_rel_offset ($f1, 8)
+
+        EXTEND  (X, RV)
+        EXTEND  (Y, AT)
+        stq     RV, 16(sp)
+        stq     AT, 24(sp)
+
+        ldt     $f0, 16(sp)
+        ldt     $f1, 24(sp)
+        cvtqt   $f0, $f0
+        cvtqt   $f1, $f1
+
+        divt/c  $f0, $f1, $f0
+        cvttq/c $f0, $f0
+        stt     $f0, 16(sp)
+        ldt     $f0, 0(sp)
+
+        ldt     $f1, 8(sp)
+        ldl     RV, 16(sp)
+        lda     sp, FRAME(sp)
+        cfi_restore ($f0)
+        cfi_restore ($f1)
+        cfi_def_cfa_offset (0)
+        ret     $31, (RA), 1
+
+        cfi_endproc
+        .size   __divl, .-__divl
+
+        DO_DIVBYZERO
diff --git a/sysdeps/alpha/divlu.S b/sysdeps/alpha/divlu.S
new file mode 100644
index 0000000..5c54bb5
--- /dev/null
+++ b/sysdeps/alpha/divlu.S
@@ -0,0 +1,4 @@
+#define UNSIGNED
+#define EXTEND(S,D)     zapnot S, 15, D
+#define __divl __divlu
+#include <divl.S>
diff --git a/sysdeps/alpha/divq.S b/sysdeps/alpha/divq.S
index 8c88af9..464536d 100644
--- a/sysdeps/alpha/divq.S
+++ b/sysdeps/alpha/divq.S
@@ -1,6 +1,265 @@
-#define IS_REM 0
-#define SIZE 8
-#define UFUNC_NAME __divqu
-#define SFUNC_NAME __divq
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+
+/* 64-bit signed long divide.  These are not normal C functions.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
+   be clobbered.
+
+   Theory of operation here is that we can use the FPU divider for virtually
+   all operands that we see: all dividend values between -2**53 and 2**53-1
+   can be computed directly.  Note that divisor values need not be checked
+   against that range because the rounded fp value will be close enough such
+   that the quotient is < 1, which will properly be truncated to zero when we
+   convert back to integer.
+
+   When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotient as an estimate from which we begin refining
+   an exact integral value.  This reduces the number of iterations in the
+   shift-and-subtract loop significantly.  */
+
+        .text
+        .align  4
+        .globl  __divq
+        .type   __divq, @function
+        .usepv  __divq, no
+
+        cfi_startproc
+        cfi_return_column (RA)
+__divq:
+        lda     sp, -FRAME(sp)
+        cfi_def_cfa_offset (FRAME)
+        CALL_MCOUNT
+
+        /* Get the fp divide insn issued as quickly as possible.  After
+           that's done, we have at least 22 cycles until its results are
+           ready -- all the time in the world to figure out how we're
+           going to use the results.  */
+        stq     X, 16(sp)
+        stq     Y, 24(sp)
+        beq     Y, DIVBYZERO
+
+        stt     $f0, 0(sp)
+        stt     $f1, 8(sp)
+        cfi_rel_offset ($f0, 0)
+        cfi_rel_offset ($f1, 8)
+        ldt     $f0, 16(sp)
+        ldt     $f1, 24(sp)
+
+        cvtqt   $f0, $f0
+        cvtqt   $f1, $f1
+        divt/c  $f0, $f1, $f0
+
+        /* Check to see if X fit in the double as an exact value.  */
+        sll     X, (64-53), AT
+        ldt     $f1, 8(sp)
+        sra     AT, (64-53), AT
+        cmpeq   X, AT, AT
+        beq     AT, $x_big
+
+        /* If we get here, we're expecting exact results from the division.
+           Do nothing else besides convert and clean up.  */
+        cvttq/c $f0, $f0
+        stt     $f0, 16(sp)
+
+        ldq     RV, 16(sp)
+        ldt     $f0, 0(sp)
+        cfi_restore ($f1)
+        cfi_remember_state
+        cfi_restore ($f0)
+        cfi_def_cfa_offset (0)
+        lda     sp, FRAME(sp)
+        ret     $31, (RA), 1
+
+        .align  4
+        cfi_restore_state
+$x_big:
+        /* If we get here, X is large enough that we don't expect exact
+           results, and neither X nor Y got mis-translated for the fp
+           division.  Our task is to take the fp result, figure out how
+           far it's off from the correct result and compute a fixup.  */
+        stq     t0, 16(sp)
+        stq     t1, 24(sp)
+        stq     t2, 32(sp)
+        stq     t5, 40(sp)
+        cfi_rel_offset (t0, 16)
+        cfi_rel_offset (t1, 24)
+        cfi_rel_offset (t2, 32)
+        cfi_rel_offset (t5, 40)
+
+#define Q       RV              /* quotient */
+#define R       t0              /* remainder */
+#define SY      t1              /* scaled Y */
+#define S       t2              /* scalar */
+#define QY      t3              /* Q*Y */
+
+        /* The fixup code below can only handle unsigned values.  */
+        or      X, Y, AT
+        mov     $31, t5
+        blt     AT, $fix_sign_in
+$fix_sign_in_ret1:
+        cvttq/c $f0, $f0
+
+        stt     $f0, 8(sp)
+        ldq     Q, 8(sp)
+$fix_sign_in_ret2:
+        mulq    Q, Y, QY
+        stq     t4, 8(sp)
+
+        ldt     $f0, 0(sp)
+        unop
+        cfi_rel_offset (t4, 8)
+        cfi_restore ($f0)
+        stq     t3, 0(sp)
+        unop
+        cfi_rel_offset (t3, 0)
+
+        subq    QY, X, R
+        mov     Y, SY
+        mov     1, S
+        bgt     R, $q_high
+
+$q_high_ret:
+        subq    X, QY, R
+        mov     Y, SY
+        mov     1, S
+        bgt     R, $q_low
+
+$q_low_ret:
+        ldq     t0, 16(sp)
+        ldq     t1, 24(sp)
+        ldq     t2, 32(sp)
+        bne     t5, $fix_sign_out
+
+$fix_sign_out_ret:
+        ldq     t3, 0(sp)
+        ldq     t4, 8(sp)
+        ldq     t5, 40(sp)
+        lda     sp, FRAME(sp)
+        cfi_remember_state
+        cfi_restore (t0)
+        cfi_restore (t1)
+        cfi_restore (t2)
+        cfi_restore (t3)
+        cfi_restore (t4)
+        cfi_restore (t5)
+        cfi_def_cfa_offset (0)
+        ret     $31, (RA), 1
+
+        .align  4
+        cfi_restore_state
+        /* The quotient that we computed was too large.  We need to reduce
+           it by S such that Y*S >= R.  Obviously the closer we get to the
+           correct value the better, but overshooting high is ok, as we'll
+           fix that up later.  */
+0:
+        addq    SY, SY, SY
+        addq    S, S, S
+$q_high:
+        cmpult  SY, R, AT
+        bne     AT, 0b
+
+        subq    Q, S, Q
+        unop
+        subq    QY, SY, QY
+        br      $q_high_ret
+
+        .align  4
+        /* The quotient that we computed was too small.  Divide Y by the
+           current remainder (R) and add that to the existing quotient (Q).
+           The expectation, of course, is that R is much smaller than X.  */
+        /* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
+           already have a copy of Y in SY and the value 1 in S.  */
+0:
+        addq    SY, SY, SY
+        addq    S, S, S
+$q_low:
+        cmpult  SY, R, AT
+        bne     AT, 0b
+
+        /* Shift-down and subtract loop.  Each iteration compares our scaled
+           Y (SY) with the remainder (R); if SY <= R then X is divisible by
+           Y's scalar (S) so add it to the quotient (Q).  */
+2:      addq    Q, S, t3
+        srl     S, 1, S
+        cmpule  SY, R, AT
+        subq    R, SY, t4
+
+        cmovne  AT, t3, Q
+        cmovne  AT, t4, R
+        srl     SY, 1, SY
+        bne     S, 2b
+
+        br      $q_low_ret
+
+        .align  4
+$fix_sign_in:
+        /* If we got here, then X|Y is negative.  Need to adjust everything
+           such that we're doing unsigned division in the fixup loop.  */
+        /* T5 records the changes we had to make:
+           bit 0: set if result should be negative.
+           bit 2: set if X was negated.
+           bit 3: set if Y was negated.
+        */
+        xor     X, Y, AT
+        cmplt   AT, 0, t5
+        cmplt   X, 0, AT
+        negq    X, t0
+
+        s4addq  AT, t5, t5
+        cmovne  AT, t0, X
+        cmplt   Y, 0, AT
+        negq    Y, t0
+
+        s8addq  AT, t5, t5
+        cmovne  AT, t0, Y
+        unop
+        blbc    t5, $fix_sign_in_ret1
+
+        cvttq/c $f0, $f0
+        stt     $f0, 8(sp)
+        ldq     Q, 8(sp)
+        unop
+
+        negq    Q, Q
+        br      $fix_sign_in_ret2
+
+        .align  4
+$fix_sign_out:
+        /* Now we get to undo what we did above.  */
+        /* ??? Is this really faster than just increasing the size of
+           the stack frame and storing X and Y in memory?  */
+        and     t5, 8, AT
+        negq    Y, t4
+        cmovne  AT, t4, Y
+
+        and     t5, 4, AT
+        negq    X, t4
+        cmovne  AT, t4, X
+
+        negq    RV, t4
+        cmovlbs t5, t4, RV
+
+        br      $fix_sign_out_ret
+
+        cfi_endproc
+        .size   __divq, .-__divq
+
+        DO_DIVBYZERO
diff --git a/sysdeps/alpha/divqu.S b/sysdeps/alpha/divqu.S
new file mode 100644
index 0000000..6ff6c03
--- /dev/null
+++ b/sysdeps/alpha/divqu.S
@@ -0,0 +1,244 @@
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+
+/* 64-bit unsigned long divide.  These are not normal C functions.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may be
+   clobbered.
+
+   Theory of operation here is that we can use the FPU divider for virtually
+   all operands that we see: all dividend values between -2**53 and 2**53-1
+   can be computed directly.  Note that divisor values need not be checked
+   against that range because the rounded fp value will be close enough such
+   that the quotient is < 1, which will properly be truncated to zero when we
+   convert back to integer.
+
+   When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotient as an estimate from which we begin refining
+   an exact integral value.  This reduces the number of iterations in the
+   shift-and-subtract loop significantly.  */
+
+        .text
+        .align  4
+        .globl  __divqu
+        .type   __divqu, @function
+        .usepv  __divqu, no
+
+        cfi_startproc
+        cfi_return_column (RA)
+__divqu:
+        lda     sp, -FRAME(sp)
+        cfi_def_cfa_offset (FRAME)
+        CALL_MCOUNT
+
+        /* Get the fp divide insn issued as quickly as possible.  After
+           that's done, we have at least 22 cycles until its results are
+           ready -- all the time in the world to figure out how we're
+           going to use the results.  */
+        stq     X, 16(sp)
+        stq     Y, 24(sp)
+        beq     Y, DIVBYZERO
+
+        stt     $f0, 0(sp)
+        stt     $f1, 8(sp)
+        cfi_rel_offset ($f0, 0)
+        cfi_rel_offset ($f1, 8)
+        ldt     $f0, 16(sp)
+        ldt     $f1, 24(sp)
+
+        cvtqt   $f0, $f0
+        cvtqt   $f1, $f1
+        blt     X, $x_is_neg
+        divt/c  $f0, $f1, $f0
+
+        /* Check to see if Y was mis-converted as signed value.  */
+        ldt     $f1, 8(sp)
+        unop
+        nop
+        blt     Y, $y_is_neg
+
+        /* Check to see if X fit in the double as an exact value.  */
+        srl     X, 53, AT
+        bne     AT, $x_big
+
+        /* If we get here, we're expecting exact results from the division.
+           Do nothing else besides convert and clean up.  */
+        cvttq/c $f0, $f0
+        stt     $f0, 16(sp)
+
+        ldq     RV, 16(sp)
+        ldt     $f0, 0(sp)
+        cfi_remember_state
+        cfi_restore ($f0)
+        cfi_restore ($f1)
+        cfi_def_cfa_offset (0)
+        lda     sp, FRAME(sp)
+        ret     $31, (RA), 1
+
+        .align  4
+        cfi_restore_state
+$x_is_neg:
+        /* If we get here, X is so big that bit 63 is set, which made the
+           conversion come out negative.  Fix it up lest we not even get
+           a good estimate.  */
+        ldah    AT, 0x5f80      /* 2**64 as float.  */
+        stt     $f2, 24(sp)
+        cfi_rel_offset ($f2, 24)
+        stl     AT, 16(sp)
+        lds     $f2, 16(sp)
+
+        addt    $f0, $f2, $f0
+        unop
+        divt/c  $f0, $f1, $f0
+        unop
+
+        /* Ok, we've now got the divide issued.  Continue with other
+           checks.  */
+        ldt     $f1, 8(sp)
+        unop
+        ldt     $f2, 24(sp)
+        blt     Y, $y_is_neg
+        cfi_restore ($f1)
+        cfi_restore ($f2)
+        cfi_remember_state     /* for y_is_neg */
+
+        .align  4
+$x_big:
+        /* If we get here, X is large enough that we don't expect exact
+           results, and neither X nor Y got mis-translated for the fp
+           division.  Our task is to take the fp result, figure out how
+           far it's off from the correct result and compute a fixup.  */
+        stq     t0, 16(sp)
+        stq     t1, 24(sp)
+        stq     t2, 32(sp)
+        stq     t3, 40(sp)
+        cfi_rel_offset (t0, 16)
+        cfi_rel_offset (t1, 24)
+        cfi_rel_offset (t2, 32)
+        cfi_rel_offset (t3, 40)
+
+#define Q       RV              /* quotient */
+#define R       t0              /* remainder */
+#define SY      t1              /* scaled Y */
+#define S       t2              /* scalar */
+#define QY      t3              /* Q*Y */
+
+        cvttq/c $f0, $f0
+        stt     $f0, 8(sp)
+        ldq     Q, 8(sp)
+        mulq    Q, Y, QY
+
+        stq     t4, 8(sp)
+        unop
+        ldt     $f0, 0(sp)
+        unop
+        cfi_rel_offset (t4, 8)
+        cfi_restore ($f0)
+
+        subq    QY, X, R
+        mov     Y, SY
+        mov     1, S
+        bgt     R, $q_high
+
+$q_high_ret:
+        subq    X, QY, R
+        mov     Y, SY
+        mov     1, S
+        bgt     R, $q_low
+
+$q_low_ret:
+        ldq     t4, 8(sp)
+        ldq     t0, 16(sp)
+        ldq     t1, 24(sp)
+        ldq     t2, 32(sp)
+
+        ldq     t3, 40(sp)
+        lda     sp, FRAME(sp)
+        cfi_remember_state
+        cfi_restore (t0)
+        cfi_restore (t1)
+        cfi_restore (t2)
+        cfi_restore (t3)
+        cfi_restore (t4)
+        cfi_def_cfa_offset (0)
+        ret     $31, (RA), 1
+
+        .align  4
+        cfi_restore_state
+        /* The quotient that we computed was too large.  We need to reduce
+           it by S such that Y*S >= R.  Obviously the closer we get to the
+           correct value the better, but overshooting high is ok, as we'll
+           fix that up later.  */
+0:
+        addq    SY, SY, SY
+        addq    S, S, S
+$q_high:
+        cmpult  SY, R, AT
+        bne     AT, 0b
+
+        subq    Q, S, Q
+        unop
+        subq    QY, SY, QY
+        br      $q_high_ret
+
+        .align  4
+        /* The quotient that we computed was too small.  Divide Y by the
+           current remainder (R) and add that to the existing quotient (Q).
+           The expectation, of course, is that R is much smaller than X.  */
+        /* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
+           already have a copy of Y in SY and the value 1 in S.  */
+0:
+        addq    SY, SY, SY
+        addq    S, S, S
+$q_low:
+        cmpult  SY, R, AT
+        bne     AT, 0b
+
+        /* Shift-down and subtract loop.  Each iteration compares our scaled
+           Y (SY) with the remainder (R); if SY <= R then X is divisible by
+           Y's scalar (S) so add it to the quotient (Q).  */
+2:      addq    Q, S, t3
+        srl     S, 1, S
+        cmpule  SY, R, AT
+        subq    R, SY, t4
+
+        cmovne  AT, t3, Q
+        cmovne  AT, t4, R
+        srl     SY, 1, SY
+        bne     S, 2b
+
+        br      $q_low_ret
+
+        .align  4
+        cfi_restore_state
+$y_is_neg:
+        /* If we get here, Y is so big that bit 63 is set.  The results
+           from the divide will be completely wrong.  Fortunately, the
+           quotient must be either 0 or 1, so just compute it directly.  */
+        cmpult  Y, X, RV
+        ldt     $f0, 0(sp)
+        lda     sp, FRAME(sp)
+        cfi_restore ($f0)
+        cfi_def_cfa_offset (0)
+        ret     $31, (RA), 1
+
+        cfi_endproc
+        .size   __divqu, .-__divqu
+
+        DO_DIVBYZERO
diff --git a/sysdeps/alpha/divrem.h b/sysdeps/alpha/divrem.h
deleted file mode 100644
index 032308d..0000000
--- a/sysdeps/alpha/divrem.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/* Copyright (C) 1996,97,2002 Free Software Foundation, Inc.
-   Contributed by David Mosberger (davidm@cs.arizona.edu).
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/* The current Alpha chips don't provide hardware for integer
-   division.  The C compiler expects the functions
-
-        __divqu: 64-bit unsigned long divide
-        __remqu: 64-bit unsigned long remainder
-        __divqs/__remqs: signed 64-bit
-        __divlu/__remlu: unsigned 32-bit
-        __divls/__remls: signed 32-bit
-
-   These are not normal C functions: instead of the normal calling
-   sequence, these expect their arguments in registers t10 and t11, and
-   return the result in t12 (aka pv).  Register AT may be clobbered
-   (assembly temporary), anything else must be saved.  */
-
-#include <sysdep.h>
-
-#ifdef __linux__
-# include <asm/gentrap.h>
-# include <asm/pal.h>
-#else
-# include <machine/pal.h>
-#endif
-
-#define mask            v0
-#define divisor         t0
-#define compare         AT
-#define tmp1            t2
-#define tmp2            t3
-#define retaddr         t9
-#define arg1            t10
-#define arg2            t11
-#define result          t12
-
-#if IS_REM
-# define DIV_ONLY(x,y...)
-# define REM_ONLY(x,y...)       x,##y
-# define modulus                result
-# define quotient               t1
-# define GETSIGN(x)             mov arg1, x
-# define STACK                  32
-#else
-# define DIV_ONLY(x,y...)       x,##y
-# define REM_ONLY(x,y...)
-# define modulus                t1
-# define quotient               result
-# define GETSIGN(x)             xor arg1, arg2, x
-# define STACK                  48
-#endif
-
-#if SIZE == 8
-# define LONGIFY(x,y)           mov x,y
-# define SLONGIFY(x,y)          mov x,y
-# define _SLONGIFY(x)
-# define NEG(x,y)               negq x,y
-#else
-# define LONGIFY(x,y)           zapnot x,15,y
-# define SLONGIFY(x,y)          sextl x,y
-# define _SLONGIFY(x)           sextl x,x
-# define NEG(x,y)               negl x,y
-#endif
-
-        .set noreorder
-        .set noat
-
-        .ent UFUNC_NAME
-        .globl UFUNC_NAME
-
-        .align 3
-UFUNC_NAME:
-$udiv_entry:
-        lda     sp, -STACK(sp)
-        .frame  sp, STACK, retaddr, 0
-#ifdef PROF
-        stq     ra, 0(sp)
-        stq     pv, 8(sp)
-        stq     gp, 16(sp)
-
-        br      AT, 1f
-1:      ldgp    gp, 0(AT)
-
-        mov     retaddr, ra
-        lda     AT, _mcount
-        jsr     AT, (AT), _mcount
-
-        ldq     ra, 0(sp)
-        ldq     pv, 8(sp)
-        ldq     gp, 16(sp)
-#endif
-        .prologue 0
-
-$udiv:
-        stq     t0, 0(sp)
-        LONGIFY (arg2, divisor)
-        stq     t1, 8(sp)
-        LONGIFY (arg1, modulus)
-        stq     v0, 16(sp)
-        clr     quotient
-        stq     tmp1, 24(sp)
-        ldiq    mask, 1
-        DIV_ONLY(stq tmp2,32(sp))
-
-        beq     divisor, $divbyzero
-
-        .align 3
-#if SIZE == 8
-        /* Shift divisor left.  */
-1:      cmpult  divisor, modulus, compare
-        blt     divisor, 2f
-        addq    divisor, divisor, divisor
-        addq    mask, mask, mask
-        bne     compare, 1b
-        unop
-2:
-#else
-        /* Shift divisor left using 3-bit shifts as we can't overflow.
-           This results in looping three times less here, but up to
-           two more times later.  Thus using a large shift isn't worth it.  */
-1:      cmpult  divisor, modulus, compare
-        s8addq  divisor, zero, divisor
-        s8addq  mask, zero, mask
-        bne     compare, 1b
-#endif
-
-        /* Now go back to the right.  */
-3:      DIV_ONLY(addq quotient, mask, tmp2)
-        srl     mask, 1, mask
-        cmpule  divisor, modulus, compare
-        subq    modulus, divisor, tmp1
-        DIV_ONLY(cmovne compare, tmp2, quotient)
-        srl     divisor, 1, divisor
-        cmovne  compare, tmp1, modulus
-        bne     mask, 3b
-
-$done:  ldq     t0, 0(sp)
-        ldq     t1, 8(sp)
-        ldq     v0, 16(sp)
-        ldq     tmp1, 24(sp)
-        DIV_ONLY(ldq tmp2, 32(sp))
-        lda     sp, STACK(sp)
-        ret     zero, (retaddr), 1
-
-$divbyzero:
-        mov     a0, tmp1
-        ldiq    a0, GEN_INTDIV
-        call_pal PAL_gentrap
-        mov     tmp1, a0
-        clr     result          /* If trap returns, return zero.  */
-        br      $done
-
-        .end UFUNC_NAME
-
-        .ent SFUNC_NAME
-        .globl SFUNC_NAME
-
-        .align 3
-SFUNC_NAME:
-        lda     sp, -STACK(sp)
-        .frame  sp, STACK, retaddr, 0
-#ifdef PROF
-        stq     ra, 0(sp)
-        stq     pv, 8(sp)
-        stq     gp, 16(sp)
-
-        br      AT, 1f
-1:      ldgp    gp, 0(AT)
-
-        mov     retaddr, ra
-        jsr     AT, _mcount
-
-        ldq     ra, 0(sp)
-        ldq     pv, 8(sp)
-        ldq     gp, 16(sp)
-#endif
-        .prologue 0
-
-        or      arg1, arg2, AT
-        _SLONGIFY(AT)
-        bge     AT, $udiv       /* don't need to mess with signs */
-
-        /* Save originals and find absolute values.  */
-        stq     arg1, 0(sp)
-        NEG     (arg1, AT)
-        stq     arg2, 8(sp)
-        cmovge  AT, AT, arg1
-        stq     retaddr, 16(sp)
-        NEG     (arg2, AT)
-        stq     tmp1, 24(sp)
-        cmovge  AT, AT, arg2
-
-        /* Do the unsigned division.  */
-        bsr     retaddr, $udiv_entry
-
-        /* Restore originals and adjust the sign of the result.  */
-        ldq     arg1, 0(sp)
-        ldq     arg2, 8(sp)
-        GETSIGN (AT)
-        NEG     (result, tmp1)
-        _SLONGIFY(AT)
-        ldq     retaddr, 16(sp)
-        cmovlt  AT, tmp1, result
-        ldq     tmp1, 24(sp)
-
-        lda     sp, STACK(sp)
-        ret     zero, (retaddr), 1
-
-        .end SFUNC_NAME
diff --git a/sysdeps/alpha/reml.S b/sysdeps/alpha/reml.S
index 8c00365..c4eb426 100644
--- a/sysdeps/alpha/reml.S
+++ b/sysdeps/alpha/reml.S
@@ -1,6 +1,80 @@
-#define IS_REM 1
-#define SIZE 4
-#define UFUNC_NAME __remlu
-#define SFUNC_NAME __reml
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   Contributed by Richard Henderson <rth@twiddle.net>
+   This file is part of the GNU C Library.
 
-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+/* 32-bit signed int remainder.  This is not a normal C function.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
+   be clobbered.
+
+   The FPU can handle the division for all input values except zero.
+   All we have to do is compute the remainder via multiply-and-subtract.  */
+
+#ifndef EXTEND
+#define EXTEND(S,D)     sextl S, D
+#endif
+
+        .text
+        .align  4
+        .globl  __reml
+        .type   __reml, @function
+        .usepv  __reml, no
+
+        cfi_startproc
+        cfi_return_column (RA)
+__reml:
+        lda     sp, -FRAME(sp)
+        cfi_def_cfa_offset (FRAME)
+        CALL_MCOUNT
+        stt     $f0, 0(sp)
+        stt     $f1, 8(sp)
+        beq     Y, DIVBYZERO
+        cfi_rel_offset ($f0, 0)
+        cfi_rel_offset ($f1, 8)
+
+        EXTEND  (X, RV)
+        EXTEND  (Y, AT)
+        stq     RV, 16(sp)
+        stq     AT, 24(sp)
+
+        ldt     $f0, 16(sp)
+        ldt     $f1, 24(sp)
+        cvtqt   $f0, $f0
+        cvtqt   $f1, $f1
+
+        divt/c  $f0, $f1, $f0
+        cvttq/c $f0, $f0
+        stt     $f0, 16(sp)
+        ldq     RV, 16(sp)
+
+        ldt     $f0, 0(sp)
+        mull    RV, Y, RV
+        ldt     $f1, 8(sp)
+        lda     sp, FRAME(sp)
+        cfi_restore ($f0)
+        cfi_restore ($f1)
+        cfi_def_cfa_offset (0)
+
+        subl    X, RV, RV
+        ret     $31, (RA), 1
+
+        cfi_endproc
+        .size   __reml, .-__reml
+
+        DO_DIVBYZERO
diff --git a/sysdeps/alpha/remlu.S b/sysdeps/alpha/remlu.S
new file mode 100644
index 0000000..f8691e1
--- /dev/null
+++ b/sysdeps/alpha/remlu.S
@@ -0,0 +1,4 @@
+#define UNSIGNED
+#define EXTEND(S,D)     zapnot S, 15, D
+#define __reml __remlu
+#include <reml.S>
diff --git a/sysdeps/alpha/remq.S b/sysdeps/alpha/remq.S
index cd1064a..ce527d1 100644
--- a/sysdeps/alpha/remq.S
+++ b/sysdeps/alpha/remq.S
@@ -1,6 +1,261 @@
-#define IS_REM 1
-#define SIZE 8
-#define UFUNC_NAME __remqu
-#define SFUNC_NAME __remq
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+
+/* 64-bit signed long remainder.  These are not normal C functions.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
+   be clobbered.
+
+   Theory of operation here is that we can use the FPU divider for virtually
+   all operands that we see: all dividend values between -2**53 and 2**53-1
+   can be computed directly.  Note that divisor values need not be checked
+   against that range because the rounded fp value will be close enough such
+   that the quotient is < 1, which will properly be truncated to zero when we
+   convert back to integer.
+
+   When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotient as an estimate from which we begin refining
+   an exact integral value.  This reduces the number of iterations in the
+   shift-and-subtract loop significantly.  */
+
+        .text
+        .align  4
+        .globl  __remq
+        .type   __remq, @function
+        .usepv  __remq, no
+
+        cfi_startproc
+        cfi_return_column (RA)
+__remq:
+        lda     sp, -FRAME(sp)
+        cfi_def_cfa_offset (FRAME)
+        CALL_MCOUNT
+
+        /* Get the fp divide insn issued as quickly as possible.  After
+           that's done, we have at least 22 cycles until its results are
+           ready -- all the time in the world to figure out how we're
+           going to use the results.  */
+        stq     X, 16(sp)
+        stq     Y, 24(sp)
+        beq     Y, DIVBYZERO
+
+        stt     $f0, 0(sp)
+        stt     $f1, 8(sp)
+        cfi_rel_offset ($f0, 0)
+        cfi_rel_offset ($f1, 8)
+        ldt     $f0, 16(sp)
+        ldt     $f1, 24(sp)
+
+        cvtqt   $f0, $f0
+        cvtqt   $f1, $f1
+        divt/c  $f0, $f1, $f0
+
+        /* Check to see if X fit in the double as an exact value.  */
+        sll     X, (64-53), AT
+        ldt     $f1, 8(sp)
+        sra     AT, (64-53), AT
+        cmpeq   X, AT, AT
+        beq     AT, $x_big
+
+        /* If we get here, we're expecting exact results from the division.
+           Do nothing else besides convert, compute remainder, clean up.  */
+        cvttq/c $f0, $f0
+        stt     $f0, 16(sp)
+
+        ldq     AT, 16(sp)
+        mulq    AT, Y, AT
+        ldt     $f0, 0(sp)
+        cfi_restore ($f1)
+        cfi_remember_state
+        cfi_restore ($f0)
+        cfi_def_cfa_offset (0)
+        lda     sp, FRAME(sp)
+
+        subq    X, AT, RV
+        ret     $31, (RA), 1
+
+        .align  4
+        cfi_restore_state
+$x_big:
+        /* If we get here, X is large enough that we don't expect exact
+           results, and neither X nor Y got mis-translated for the fp
+           division.  Our task is to take the fp result, figure out how
+           far it's off from the correct result and compute a fixup.  */
+        stq     t0, 16(sp)
+        stq     t1, 24(sp)
+        stq     t2, 32(sp)
+        stq     t5, 40(sp)
+        cfi_rel_offset (t0, 16)
+        cfi_rel_offset (t1, 24)
+        cfi_rel_offset (t2, 32)
+        cfi_rel_offset (t5, 40)
+
+#define Q       t0              /* quotient */
+#define R       RV              /* remainder */
+#define SY      t1              /* scaled Y */
+#define S       t2              /* scalar */
+#define QY      t3              /* Q*Y */
+
+        /* The fixup code below can only handle unsigned values.  */
+        or      X, Y, AT
+        mov     $31, t5
+        blt     AT, $fix_sign_in
+$fix_sign_in_ret1:
+        cvttq/c $f0, $f0
+
+        stt     $f0, 8(sp)
+        ldq     Q, 8(sp)
+$fix_sign_in_ret2:
+        mulq    Q, Y, QY
+        stq     t4, 8(sp)
+
+        ldt     $f0, 0(sp)
+        unop
+        cfi_rel_offset (t4, 8)
+        cfi_restore ($f0)
+        stq     t3, 0(sp)
+        unop
+        cfi_rel_offset (t3, 0)
+
+        subq    QY, X, R
+        mov     Y, SY
+        mov     1, S
+        bgt     R, $q_high
+
+$q_high_ret:
+        subq    X, QY, R
+        mov     Y, SY
+        mov     1, S
+        bgt     R, $q_low
+
+$q_low_ret:
+        ldq     t0, 16(sp)
+        ldq     t1, 24(sp)
+        ldq     t2, 32(sp)
+        bne     t5, $fix_sign_out
+
+$fix_sign_out_ret:
+        ldq     t3, 0(sp)
+        ldq     t4, 8(sp)
+        ldq     t5, 40(sp)
+        lda     sp, FRAME(sp)
+        cfi_remember_state
+        cfi_restore (t0)
+        cfi_restore (t1)
+        cfi_restore (t2)
+        cfi_restore (t3)
+        cfi_restore (t4)
+        cfi_restore (t5)
+        cfi_def_cfa_offset (0)
+        ret     $31, (RA), 1
+
+        .align  4
+        cfi_restore_state
+        /* The quotient that we computed was too large.  We need to reduce
+           it by S such that Y*S >= R.  Obviously the closer we get to the
+           correct value the better, but overshooting high is ok, as we'll
+           fix that up later.  */
+0:
+        addq    SY, SY, SY
+        addq    S, S, S
+$q_high:
+        cmpult  SY, R, AT
+        bne     AT, 0b
+
+        subq    Q, S, Q
+        unop
+        subq    QY, SY, QY
+        br      $q_high_ret
+
+        .align  4
+        /* The quotient that we computed was too small.  Divide Y by the
+           current remainder (R) and add that to the existing quotient (Q).
+           The expectation, of course, is that R is much smaller than X.  */
+        /* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
+           already have a copy of Y in SY and the value 1 in S.  */
+0:
+        addq    SY, SY, SY
+        addq    S, S, S
+$q_low:
+        cmpult  SY, R, AT
+        bne     AT, 0b
+
+        /* Shift-down and subtract loop.  Each iteration compares our scaled
+           Y (SY) with the remainder (R); if SY <= R then X is divisible by
+           Y's scalar (S) so add it to the quotient (Q).  */
+2:      addq    Q, S, t3
+        srl     S, 1, S
+        cmpule  SY, R, AT
+        subq    R, SY, t4
+
+        cmovne  AT, t3, Q
+        cmovne  AT, t4, R
+        srl     SY, 1, SY
+        bne     S, 2b
+
+        br      $q_low_ret
+
+        .align  4
+$fix_sign_in:
+        /* If we got here, then X|Y is negative.  Need to adjust everything
+           such that we're doing unsigned division in the fixup loop.  */
+        /* T5 records the changes we had to make:
+           bit 0: set if X was negated.  Note that the sign of the
+                  remainder follows the sign of the dividend.
+           bit 2: set if Y was negated.
+        */
+        xor     X, Y, t1
+        cmplt   X, 0, t5
+        negq    X, t0
+        cmovne  t5, t0, X
+
+        cmplt   Y, 0, AT
+        negq    Y, t0
+        s4addq  AT, t5, t5
+        cmovne  AT, t0, Y
+
+        bge     t1, $fix_sign_in_ret1
+        cvttq/c $f0, $f0
+        stt     $f0, 8(sp)
+        ldq     Q, 8(sp)
+
+        negq    Q, Q
+        br      $fix_sign_in_ret2
+
+        .align  4
+$fix_sign_out:
+        /* Now we get to undo what we did above.  */
+        /* ??? Is this really faster than just increasing the size of
+           the stack frame and storing X and Y in memory?  */
+        and     t5, 4, AT
+        negq    Y, t4
+        cmovne  AT, t4, Y
+
+        negq    X, t4
+        cmovlbs t5, t4, X
+        negq    RV, t4
+        cmovlbs t5, t4, RV
+
+        br      $fix_sign_out_ret
+
+        cfi_endproc
+        .size   __remq, .-__remq
+
+        DO_DIVBYZERO
diff --git a/sysdeps/alpha/remqu.S b/sysdeps/alpha/remqu.S
new file mode 100644
index 0000000..1a1dcad
--- /dev/null
+++ b/sysdeps/alpha/remqu.S
@@ -0,0 +1,251 @@
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+
+/* 64-bit unsigned long remainder.  These are not normal C functions.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may be
+   clobbered.
+
+   Theory of operation here is that we can use the FPU divider for virtually
+   all operands that we see: all dividend values between -2**53 and 2**53-1
+   can be computed directly.  Note that divisor values need not be checked
+   against that range because the rounded fp value will be close enough such
+   that the quotient is < 1, which will properly be truncated to zero when we
+   convert back to integer.
+
+   When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotient as an estimate from which we begin refining
+   an exact integral value.  This reduces the number of iterations in the
+   shift-and-subtract loop significantly.  */
+
+        .text
+        .align  4
+        .globl  __remqu
+        .type   __remqu, @function
+        .usepv  __remqu, no
+
+        cfi_startproc
+        cfi_return_column (RA)
+__remqu:
+        lda     sp, -FRAME(sp)
+        cfi_def_cfa_offset (FRAME)
+        CALL_MCOUNT
+
+        /* Get the fp divide insn issued as quickly as possible.  After
+           that's done, we have at least 22 cycles until its results are
+           ready -- all the time in the world to figure out how we're
+           going to use the results.  */
+        stq     X, 16(sp)
+        stq     Y, 24(sp)
+        beq     Y, DIVBYZERO
+
+        stt     $f0, 0(sp)
+        stt     $f1, 8(sp)
+        cfi_rel_offset ($f0, 0)
+        cfi_rel_offset ($f1, 8)
+        ldt     $f0, 16(sp)
+        ldt     $f1, 24(sp)
+
+        cvtqt   $f0, $f0
+        cvtqt   $f1, $f1
+        blt     X, $x_is_neg
+        divt/c  $f0, $f1, $f0
+
+        /* Check to see if Y was mis-converted as signed value.  */
+        ldt     $f1, 8(sp)
+        unop
+        nop
+        blt     Y, $y_is_neg
+
+        /* Check to see if X fit in the double as an exact value.  */
+        srl     X, 53, AT
+        bne     AT, $x_big
+
+        /* If we get here, we're expecting exact results from the division.
+           Do nothing else besides convert, compute remainder, clean up.  */
+        cvttq/c $f0, $f0
+        stt     $f0, 16(sp)
+
+        ldq     AT, 16(sp)
+        mulq    AT, Y, AT
+        ldt     $f0, 0(sp)
+        lda     sp, FRAME(sp)
+        cfi_remember_state
+        cfi_restore ($f0)
+        cfi_restore ($f1)
+        cfi_def_cfa_offset (0)
+
+        subq    X, AT, RV
+        ret     $31, (RA), 1
+
+        .align  4
+        cfi_restore_state
+$x_is_neg:
+        /* If we get here, X is so big that bit 63 is set, which made the
+           conversion come out negative.  Fix it up lest we not even get
+           a good estimate.  */
+        ldah    AT, 0x5f80      /* 2**64 as float.  */
+        stt     $f2, 24(sp)
+        cfi_rel_offset ($f2, 24)
+        stl     AT, 16(sp)
+        lds     $f2, 16(sp)
+
+        addt    $f0, $f2, $f0
+        unop
+        divt/c  $f0, $f1, $f0
+        unop
+
+        /* Ok, we've now got the divide issued.  Continue with other
+           checks.  */
+        ldt     $f1, 8(sp)
+        unop
+        ldt     $f2, 24(sp)
+        blt     Y, $y_is_neg
+        cfi_restore ($f1)
+        cfi_restore ($f2)
+        cfi_remember_state     /* for y_is_neg */
+
+        .align  4
+$x_big:
+        /* If we get here, X is large enough that we don't expect exact
+           results, and neither X nor Y got mis-translated for the fp
+           division.  Our task is to take the fp result, figure out how
+           far it's off from the correct result and compute a fixup.  */
+        stq     t0, 16(sp)
+        stq     t1, 24(sp)
+        stq     t2, 32(sp)
+        stq     t3, 40(sp)
+        cfi_rel_offset (t0, 16)
+        cfi_rel_offset (t1, 24)
+        cfi_rel_offset (t2, 32)
+        cfi_rel_offset (t3, 40)
+
+#define Q       t0              /* quotient */
+#define R       RV              /* remainder */
+#define SY      t1              /* scaled Y */
+#define S       t2              /* scalar */
+#define QY      t3              /* Q*Y */
+
+        cvttq/c $f0, $f0
+        stt     $f0, 8(sp)
+        ldq     Q, 8(sp)
+        mulq    Q, Y, QY
+
+        stq     t4, 8(sp)
+        unop
+        ldt     $f0, 0(sp)
+        unop
+        cfi_rel_offset (t4, 8)
+        cfi_restore ($f0)
+
+        subq    QY, X, R
+        mov     Y, SY
+        mov     1, S
+        bgt     R, $q_high
+
+$q_high_ret:
+        subq    X, QY, R
+        mov     Y, SY
+        mov     1, S
+        bgt     R, $q_low
+
+$q_low_ret:
+        ldq     t4, 8(sp)
+        ldq     t0, 16(sp)
+        ldq     t1, 24(sp)
+        ldq     t2, 32(sp)
+
+        ldq     t3, 40(sp)
+        lda     sp, FRAME(sp)
+        cfi_remember_state
+        cfi_restore (t0)
+        cfi_restore (t1)
+        cfi_restore (t2)
+        cfi_restore (t3)
+        cfi_restore (t4)
+        cfi_def_cfa_offset (0)
+        ret     $31, (RA), 1
+
+        .align  4
+        cfi_restore_state
+        /* The quotient that we computed was too large.  We need to reduce
+           it by S such that Y*S >= R.  Obviously the closer we get to the
+           correct value the better, but overshooting high is ok, as we'll
+           fix that up later.  */
+0:
+        addq    SY, SY, SY
+        addq    S, S, S
+$q_high:
+        cmpult  SY, R, AT
+        bne     AT, 0b
+
+        subq    Q, S, Q
+        unop
+        subq    QY, SY, QY
+        br      $q_high_ret
+
+        .align  4
+        /* The quotient that we computed was too small.  Divide Y by the
+           current remainder (R) and add that to the existing quotient (Q).
+           The expectation, of course, is that R is much smaller than X.  */
+        /* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
+           already have a copy of Y in SY and the value 1 in S.  */
+0:
+        addq    SY, SY, SY
+        addq    S, S, S
+$q_low:
+        cmpult  SY, R, AT
+        bne     AT, 0b
+
+        /* Shift-down and subtract loop.  Each iteration compares our scaled
+           Y (SY) with the remainder (R); if SY <= R then X is divisible by
+           Y's scalar (S) so add it to the quotient (Q).  */
+2:      addq    Q, S, t3
+        srl     S, 1, S
+        cmpule  SY, R, AT
+        subq    R, SY, t4
+
+        cmovne  AT, t3, Q
+        cmovne  AT, t4, R
+        srl     SY, 1, SY
+        bne     S, 2b
+
+        br      $q_low_ret
+
+        .align  4
+        cfi_restore_state
+$y_is_neg:
+        /* If we get here, Y is so big that bit 63 is set.  The results
+           from the divide will be completely wrong.  Fortunately, the
+           quotient must be either 0 or 1, so the remainder must be X
+           or X-Y, so just compute it directly.  */
+        cmpult  Y, X, AT
+        subq    X, Y, RV
+        ldt     $f0, 0(sp)
+        cmoveq  AT, X, RV
+
+        lda     sp, FRAME(sp)
+        cfi_restore ($f0)
+        cfi_def_cfa_offset (0)
+        ret     $31, (RA), 1
+
+        cfi_endproc
+        .size   __remqu, .-__remqu
+
+        DO_DIVBYZERO
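
The $q_high/$q_low fragments shared by divq.S, divqu.S, remq.S, and remqu.S are the refinement step described in the "theory of operation" comments above. Below is a C sketch of that loop structure; refine_quotient is an illustrative name, and the sketch assumes (as the FP estimate guarantees) that q starts within a small multiple of y of the true quotient, so the doubling cannot wrap:

#include <stdint.h>

/* Fix up an estimated quotient q for x / y (y != 0), mirroring the
   register aliases used in the assembly: Q, R (remainder), SY (scaled
   Y), S (scalar), QY (Q*Y).  */
uint64_t
refine_quotient (uint64_t x, uint64_t y, uint64_t q)
{
  uint64_t qy = q * y;             /* mulq Q, Y, QY */
  uint64_t r, sy, s;

  /* $q_high: the estimate was too large.  Scale S up until Y*S covers
     the overshoot, then subtract; overshooting low here is fine, the
     next loop repairs it.  */
  if (qy > x)
    {
      r = qy - x;
      for (sy = y, s = 1; sy < r; sy += sy)
        s += s;                    /* S and SY double in lockstep */
      q -= s;
      qy -= sy;
    }

  /* $q_low: the estimate is now too small (or exact).  Shift up until
     Y*S >= R...  */
  r = x - qy;
  for (sy = y, s = 1; sy < r; sy += sy)
    s += s;

  /* ...then shift back down, adding S to Q whenever SY still fits in
     the remainder (the cmovne pairs in the assembly).  */
  while (s)
    {
      if (sy <= r)
        {
          q += s;
          r -= sy;
        }
      sy >>= 1;
      s >>= 1;
    }
  return q;                        /* the final remainder is r */
}

remq.S and remqu.S run the same loops with the Q and R aliases mapped onto different registers, so that the remainder rather than the quotient ends up in the result register t12.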
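divq.S and remq.S keep the fixup loop unsigned by negating operands on entry and recording what they did in t5 ($fix_sign_in), then undoing it on exit ($fix_sign_out). A C model of that bookkeeping for the divide case, assuming two's-complement wraparound as on the hardware; divq_sign_fixup is a hypothetical name, and plain C unsigned division stands in for the estimate-plus-refinement core:

#include <stdint.h>

int64_t
divq_sign_fixup (int64_t x, int64_t y)
{
  unsigned flags = 0;
  uint64_t ux = (uint64_t) x, uy = (uint64_t) y;

  if ((x | y) < 0)                /* or X, Y, AT; blt AT, $fix_sign_in */
    {
      flags |= (x ^ y) < 0;       /* bit 0: quotient must be negated */
      if (x < 0) { ux = -ux; flags |= 4; }   /* bit 2: X was negated */
      if (y < 0) { uy = -uy; flags |= 8; }   /* bit 3: Y was negated */
    }

  uint64_t q = ux / uy;           /* stand-in for the unsigned core */

  /* $fix_sign_out: bits 2 and 3 tell the assembly to restore the
     caller's X and Y, which live in fixed registers there; bit 0
     negates the result (negq RV, t4; cmovlbs t5, t4, RV).  */
  return (int64_t) ((flags & 1) ? 0 - q : q);
}

The remainder routines use a different rule: remq.S negates the result when X was negated, since the sign of a truncated remainder follows the dividend.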