diff options
author | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2019-11-13 12:32:17 +0000 |
---|---|---|
committer | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2019-11-27 09:37:57 -0300 |
commit | 5d9b7b9fa734c5381e0295c85c0e40520d9f6063 (patch) | |
tree | d8f050c206fdbe03167d31f8ff3916e2d52d8dc1 /sysdeps | |
parent | bfdb731438206b0f70fe7afa890681155c30b419 (diff) | |
download | glibc-5d9b7b9fa734c5381e0295c85c0e40520d9f6063.zip glibc-5d9b7b9fa734c5381e0295c85c0e40520d9f6063.tar.gz glibc-5d9b7b9fa734c5381e0295c85c0e40520d9f6063.tar.bz2 |
Remove 32 bit sparc v7 support
The patch is straighforward:
- The sparc32 v8 implementations are moved as the generic ones.
- A configure test is added to check for either __sparc_v8__ or
__sparc_v9__.
- The triple names are simplified and sparc implies sparcv8.
The idea is to keep support on sparcv8 architectures that does support
CAS instructions, such as LEON3/LEON4.
Checked on a sparcv9-linux-gnu and sparc64-linux-gnu.
Tested-by: Andreas Larsson <andreas@gaisler.com>
Diffstat (limited to 'sysdeps')
24 files changed, 410 insertions, 2702 deletions
diff --git a/sysdeps/sparc/preconfigure b/sysdeps/sparc/preconfigure index de86749..5628746 100644 --- a/sysdeps/sparc/preconfigure +++ b/sysdeps/sparc/preconfigure @@ -1,24 +1,10 @@ # preconfigure fragment for sparc. case "$machine" in -sparc | sparcv[67]) +sparc | sparcv8 | supersparc | hypersparc) base_machine=sparc machine=sparc/sparc32 ;; -sparcv8 | supersparc | hypersparc) - base_machine=sparc machine=sparc/sparc32/sparcv8 ;; -sparcv8plus | sparcv8plusa | sparcv9) +sparcv8plus* | sparcv9*) base_machine=sparc machine=sparc/sparc32/sparcv9 ;; -sparcv8plusb | sparcv9b) - base_machine=sparc machine=sparc/sparc32/sparcv9/sparcv9b ;; -sparcv9v) - base_machine=sparc machine=sparc/sparc32/sparcv9/sparcv9v ;; -sparcv9v2) - base_machine=sparc machine=sparc/sparc32/sparcv9/sparcv9v2 ;; -sparc64) +sparc64*) base_machine=sparc machine=sparc/sparc64 ;; -sparc64b) - base_machine=sparc machine=sparc/sparc64/sparcv9b ;; -sparc64v) - base_machine=sparc machine=sparc/sparc64/sparcv9v ;; -sparc64v2) - base_machine=sparc machine=sparc/sparc64/sparcv9v2 ;; esac diff --git a/sysdeps/sparc/sparc32/Makefile b/sysdeps/sparc/sparc32/Makefile index fe40f5c..008cb81 100644 --- a/sysdeps/sparc/sparc32/Makefile +++ b/sysdeps/sparc/sparc32/Makefile @@ -19,35 +19,8 @@ ifeq ($(subdir),gnulib) sysdep_routines = dotmul umul $(divrem) alloca endif # gnulib -# We distribute these files, even though they are generated, -# so as to avoid the need for a functioning m4 to build the library. divrem := sdiv udiv rem urem -+divrem-NAME-sdiv := div -+divrem-NAME-udiv := udiv -+divrem-NAME-rem := rem -+divrem-NAME-urem := urem -+divrem-NAME = $(+divrem-NAME-$(basename $(notdir $@))) -+divrem-OP-div := div -+divrem-OP-udiv := div -+divrem-OP-rem := rem -+divrem-OP-urem := rem -+divrem-S-div := true -+divrem-S-rem := true -+divrem-S-udiv := false -+divrem-S-urem := false -$(divrem:%=$(sysdep_dir)/sparc/sparc32/%.S): $(sysdep_dir)/sparc/sparc32/divrem.m4 - (echo -n "define(NAME,\`.$(+divrem-NAME)')"; \ - echo -n " define(OP,\`$(+divrem-OP-$(+divrem-NAME))')"; \ - echo -n " define(S,\`$(+divrem-S-$(+divrem-NAME))')"; \ - echo " /* This file is generated from divrem.m4; DO NOT EDIT! */"; \ - cat $<) | $(M4) > $@-tmp -# Make it unwritable so noone will edit it by mistake. - -chmod a-w $@-tmp - mv -f $@-tmp $@ - -sysdep-realclean := $(sysdep-realclean) $(divrem:%=sysdeps/sparc/sparc32/%.S) - # libgcc __divdi3 and __moddi3 uses .udiv and since it is also exported by # libc.so linker will create PLTs for the symbol. To avoid it we strong alias # the exported libc one to __wrap_.udiv and use linker option --wrap to make any diff --git a/sysdeps/sparc/sparc32/addmul_1.S b/sysdeps/sparc/sparc32/addmul_1.S index 1028f19..4708393 100644 --- a/sysdeps/sparc/sparc32/addmul_1.S +++ b/sysdeps/sparc/sparc32/addmul_1.S @@ -1,146 +1,118 @@ -! SPARC __mpn_addmul_1 -- Multiply a limb vector with a limb and add -! the result to a second limb vector. -! +! SPARC v8 __mpn_addmul_1 -- Multiply a limb vector with a limb and +! add the result to a second limb vector. + ! Copyright (C) 1992-2019 Free Software Foundation, Inc. -! + ! This file is part of the GNU MP Library. -! + ! The GNU MP Library is free software; you can redistribute it and/or modify ! it under the terms of the GNU Lesser General Public License as published by ! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. -! + ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! + ! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, ! see <https://www.gnu.org/licenses/>. ! INPUT PARAMETERS -! RES_PTR o0 -! S1_PTR o1 -! SIZE o2 -! S2_LIMB o3 +! res_ptr o0 +! s1_ptr o1 +! size o2 +! s2_limb o3 #include <sysdep.h> ENTRY(__mpn_addmul_1) - ! Make S1_PTR and RES_PTR point at the end of their blocks - ! and put (- 4 x SIZE) in index/loop counter. - sll %o2,2,%o2 - add %o0,%o2,%o4 ! RES_PTR in o4 since o0 is retval - add %o1,%o2,%o1 - sub %g0,%o2,%o2 + ld [%o1+0],%o4 ! 1 + sll %o2,4,%g1 + orcc %g0,%g0,%g2 + mov %o7,%g4 ! Save return address register + and %g1,(4-1)<<4,%g1 +1: call 2f + add %o7,3f-1b,%g3 +2: jmp %g3+%g1 + mov %g4,%o7 ! Restore return address register - cmp %o3,0xfff - bgu LOC(large) + .align 4 +3: +LOC(00): + add %o0,-4,%o0 + b LOC(loop00) /* 4, 8, 12, ... */ + add %o1,-4,%o1 + nop +LOC(01): + b LOC(loop01) /* 1, 5, 9, ... */ + nop + nop + nop +LOC(10): + add %o0,-12,%o0 /* 2, 6, 10, ... */ + b LOC(loop10) + add %o1,4,%o1 + nop +LOC(11): + add %o0,-8,%o0 /* 3, 7, 11, ... */ + b LOC(loop11) + add %o1,-8,%o1 nop - ld [%o1+%o2],%o5 - mov 0,%o0 - b LOC(0) - add %o4,-4,%o4 -LOC(loop0): - addcc %o5,%g1,%g1 - ld [%o1+%o2],%o5 - addx %o0,%g0,%o0 - st %g1,[%o4+%o2] -LOC(0): wr %g0,%o3,%y - sra %o5,31,%g2 - and %o3,%g2,%g2 - andcc %g1,0,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,0,%g1 - sra %g1,20,%g4 - sll %g1,12,%g1 - rd %y,%g3 - srl %g3,20,%g3 - or %g1,%g3,%g1 - - addcc %g1,%o0,%g1 - addx %g2,%g4,%o0 ! add sign-compensation and cy to hi limb - addcc %o2,4,%o2 ! loop counter - bne LOC(loop0) - ld [%o4+%o2],%o5 - - addcc %o5,%g1,%g1 - addx %o0,%g0,%o0 - retl - st %g1,[%o4+%o2] - - -LOC(large): - ld [%o1+%o2],%o5 - mov 0,%o0 - sra %o3,31,%g4 ! g4 = mask of ones iff S2_LIMB < 0 - b LOC(1) - add %o4,-4,%o4 LOC(loop): - addcc %o5,%g3,%g3 - ld [%o1+%o2],%o5 - addx %o0,%g0,%o0 - st %g3,[%o4+%o2] -LOC(1): wr %g0,%o5,%y - and %o5,%g4,%g2 - andcc %g0,%g0,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%g0,%g1 - rd %y,%g3 - addcc %g3,%o0,%g3 - addx %g2,%g1,%o0 - addcc %o2,4,%o2 - bne LOC(loop) - ld [%o4+%o2],%o5 + addcc %g3,%g2,%g3 ! 1 + ld [%o1+4],%o4 ! 2 + rd %y,%g2 ! 1 + addx %g0,%g2,%g2 + ld [%o0+0],%g1 ! 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] ! 1 +LOC(loop00): + umul %o4,%o3,%g3 ! 2 + ld [%o0+4],%g1 ! 2 + addxcc %g3,%g2,%g3 ! 2 + ld [%o1+8],%o4 ! 3 + rd %y,%g2 ! 2 + addx %g0,%g2,%g2 + nop + addcc %g1,%g3,%g3 + st %g3,[%o0+4] ! 2 +LOC(loop11): + umul %o4,%o3,%g3 ! 3 + addxcc %g3,%g2,%g3 ! 3 + ld [%o1+12],%o4 ! 4 + rd %y,%g2 ! 3 + add %o1,16,%o1 + addx %g0,%g2,%g2 + ld [%o0+8],%g1 ! 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+8] ! 3 +LOC(loop10): + umul %o4,%o3,%g3 ! 4 + addxcc %g3,%g2,%g3 ! 4 + ld [%o1+0],%o4 ! 1 + rd %y,%g2 ! 4 + addx %g0,%g2,%g2 + ld [%o0+12],%g1 ! 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+12] ! 4 + add %o0,16,%o0 + addx %g0,%g2,%g2 +LOC(loop01): + addcc %o2,-4,%o2 + bg LOC(loop) + umul %o4,%o3,%g3 ! 1 - addcc %o5,%g3,%g3 - addx %o0,%g0,%o0 + addcc %g3,%g2,%g3 ! 4 + rd %y,%g2 ! 4 + addx %g0,%g2,%g2 + ld [%o0+0],%g1 ! 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] ! 4 retl - st %g3,[%o4+%o2] + addx %g0,%g2,%o0 END(__mpn_addmul_1) diff --git a/sysdeps/sparc/sparc32/configure b/sysdeps/sparc/sparc32/configure new file mode 100644 index 0000000..d7f16b1 --- /dev/null +++ b/sysdeps/sparc/sparc32/configure @@ -0,0 +1,162 @@ +# This file is generated from configure.ac by Autoconf. DO NOT EDIT! + # Local configure fragment for sysdeps/sparc/sparc32 + +# Test if compiler targets at least sparcv8. + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 +$as_echo_n "checking for grep that handles long lines and -e... " >&6; } +if ${ac_cv_path_GREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -z "$GREP"; then + ac_path_GREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_prog in grep ggrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_GREP" || continue +# Check for GNU ac_path_GREP and select it if it is found. + # Check for GNU $ac_path_GREP +case `"$ac_path_GREP" --version 2>&1` in +*GNU*) + ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'GREP' >> "conftest.nl" + "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_GREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_GREP="$ac_path_GREP" + ac_path_GREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_GREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_GREP"; then + as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_GREP=$GREP +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 +$as_echo "$ac_cv_path_GREP" >&6; } + GREP="$ac_cv_path_GREP" + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 +$as_echo_n "checking for egrep... " >&6; } +if ${ac_cv_path_EGREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 + then ac_cv_path_EGREP="$GREP -E" + else + if test -z "$EGREP"; then + ac_path_EGREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_prog in egrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_EGREP" || continue +# Check for GNU ac_path_EGREP and select it if it is found. + # Check for GNU $ac_path_EGREP +case `"$ac_path_EGREP" --version 2>&1` in +*GNU*) + ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'EGREP' >> "conftest.nl" + "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_EGREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_EGREP="$ac_path_EGREP" + ac_path_EGREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_EGREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_EGREP"; then + as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_EGREP=$EGREP +fi + + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 +$as_echo "$ac_cv_path_EGREP" >&6; } + EGREP="$ac_cv_path_EGREP" + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for at least sparcv8 support" >&5 +$as_echo_n "checking for at least sparcv8 support... " >&6; } +if ${libc_cv_sparcv8+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#if defined (__sparc_v8__) || defined (__sparc_v9__) + yes + #endif + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "yes" >/dev/null 2>&1; then : + libc_cv_sparcv8=yes +else + libc_cv_sparcv8=no +fi +rm -f conftest* + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_sparcv8" >&5 +$as_echo "$libc_cv_sparcv8" >&6; } +if test $libc_cv_sparcv8 = no; then + as_fn_error $? "no support for pre-v8 sparc" "$LINENO" 5 +fi diff --git a/sysdeps/sparc/sparc32/configure.ac b/sysdeps/sparc/sparc32/configure.ac new file mode 100644 index 0000000..2e336f5 --- /dev/null +++ b/sysdeps/sparc/sparc32/configure.ac @@ -0,0 +1,13 @@ +GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. +# Local configure fragment for sysdeps/sparc/sparc32 + +# Test if compiler targets at least sparcv8. +AC_CACHE_CHECK([for at least sparcv8 support], + [libc_cv_sparcv8], + [AC_EGREP_CPP(yes,[#if defined (__sparc_v8__) || defined (__sparc_v9__) + yes + #endif + ], libc_cv_sparcv8=yes, libc_cv_sparcv8=no)]) +if test $libc_cv_sparcv8 = no; then + AC_MSG_ERROR([no support for pre-v8 sparc]) +fi diff --git a/sysdeps/sparc/sparc32/divrem.m4 b/sysdeps/sparc/sparc32/divrem.m4 deleted file mode 100644 index c08c530..0000000 --- a/sysdeps/sparc/sparc32/divrem.m4 +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * NAME name of function to generate - * OP OP=div => %o0 / %o1; OP=rem => %o0 % %o1 - * S S=true => signed; S=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top `decade' of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - -define(N, `4')dnl -define(WORDSIZE, `32')dnl -define(TOPBITS, eval(WORDSIZE - N*((WORDSIZE-1)/N)))dnl -dnl -define(dividend, `%o0')dnl -define(divisor, `%o1')dnl -define(Q, `%o2')dnl -define(R, `%o3')dnl -define(ITER, `%o4')dnl -define(V, `%o5')dnl -dnl -dnl m4 reminder: ifelse(a,b,c,d) => if a is b, then c, else d -define(T, `%g1')dnl -define(SC, `%g2')dnl -ifelse(S, `true', `define(SIGN, `%g3')')dnl - -dnl -dnl This is the recursive definition for developing quotient digits. -dnl -dnl Parameters: -dnl $1 the current depth, 1 <= $1 <= N -dnl $2 the current accumulation of quotient bits -dnl N max depth -dnl -dnl We add a new bit to $2 and either recurse or insert the bits in -dnl the quotient. R, Q, and V are inputs and outputs as defined above; -dnl the condition codes are expected to reflect the input R, and are -dnl modified to reflect the output R. -dnl -define(DEVELOP_QUOTIENT_BITS, -` ! depth $1, accumulated bits $2 - bl LOC($1.eval(2**N+$2)) - srl V,1,V - ! remainder is positive - subcc R,V,R - ifelse($1, N, - ` b 9f - add Q, ($2*2+1), Q -', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2+1)')') -LOC($1.eval(2**N+$2)): - ! remainder is negative - addcc R,V,R - ifelse($1, N, - ` b 9f - add Q, ($2*2-1), Q -', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2-1)')') -ifelse($1, 1, `9:')')dnl - -#include <sysdep.h> -#include <sys/trap.h> - -ENTRY(NAME) -ifelse(S, `true', -` ! compute sign of result; if neither is negative, no problem - orcc divisor, dividend, %g0 ! either negative? - bge 2f ! no, go do the divide -ifelse(OP, `div', -` xor divisor, dividend, SIGN ! compute sign in any case', -` mov dividend, SIGN ! sign of remainder matches dividend') - tst divisor - bge 1f - tst dividend - ! divisor is definitely negative; dividend might also be negative - bge 2f ! if dividend not negative... - sub %g0, divisor, divisor ! in any case, make divisor nonneg -1: ! dividend is negative, divisor is nonnegative - sub %g0, dividend, dividend ! make dividend nonnegative -2: -') - ! Ready to divide. Compute size of quotient; scale comparand. - orcc divisor, %g0, V - bne 1f - mov dividend, R - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta ST_DIV0 - retl - clr %o0 - -1: - cmp R, V ! if divisor exceeds dividend, done - blu LOC(got_result) ! (and algorithm fails otherwise) - clr Q - sethi %hi(1 << (WORDSIZE - TOPBITS - 1)), T - cmp R, T - blu LOC(not_really_big) - clr ITER - - ! `Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R.' - 1: - cmp V, T - bgeu 3f - mov 1, SC - sll V, N, V - b 1b - add ITER, 1, ITER - - ! Now compute SC. - 2: addcc V, V, V - bcc LOC(not_too_big) - add SC, 1, SC - - ! We get here if the divisor overflowed while shifting. - ! This means that R has the high-order bit set. - ! Restore V and subtract from R. - sll T, TOPBITS, T ! high order bit - srl V, 1, V ! rest of V - add V, T, V - b LOC(do_single_div) - sub SC, 1, SC - - LOC(not_too_big): - 3: cmp V, R - blu 2b - nop - be LOC(do_single_div) - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! V > R: went too far: back up 1 step - ! srl V, 1, V - ! dec SC - ! do single-bit divide steps - ! - ! We have to be careful here. We know that R >= V, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if R >= 0. Because both R and V may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - LOC(do_single_div): - subcc SC, 1, SC - bl LOC(end_regular_divide) - nop - sub R, V, R - mov 1, Q - b LOC(end_single_divloop) - nop - LOC(single_divloop): - sll Q, 1, Q - bl 1f - srl V, 1, V - ! R >= 0 - sub R, V, R - b 2f - add Q, 1, Q - 1: ! R < 0 - add R, V, R - sub Q, 1, Q - 2: - LOC(end_single_divloop): - subcc SC, 1, SC - bge LOC(single_divloop) - tst R - b,a LOC(end_regular_divide) - -LOC(not_really_big): -1: - sll V, N, V - cmp V, R - bleu 1b - addcc ITER, 1, ITER - be LOC(got_result) - sub ITER, 1, ITER - - tst R ! set up for initial iteration -LOC(divloop): - sll Q, N, Q - DEVELOP_QUOTIENT_BITS(1, 0) -LOC(end_regular_divide): - subcc ITER, 1, ITER - bge LOC(divloop) - tst R - bl,a LOC(got_result) - ! non-restoring fixup here (one instruction only!) -ifelse(OP, `div', -` sub Q, 1, Q -', ` add R, divisor, R -') - -LOC(got_result): -ifelse(S, `true', -` ! check to see if answer should be < 0 - tst SIGN - bl,a 1f - ifelse(OP, `div', `sub %g0, Q, Q', `sub %g0, R, R') -1:') - retl - ifelse(OP, `div', `mov Q, %o0', `mov R, %o0') - -END(NAME) -ifelse(OP, `div', ifelse(S, `false', `strong_alias (.udiv, __wrap_.udiv) -'))dnl diff --git a/sysdeps/sparc/sparc32/dotmul.S b/sysdeps/sparc/sparc32/dotmul.S index d497ca6..9b20cc3 100644 --- a/sysdeps/sparc/sparc32/dotmul.S +++ b/sysdeps/sparc/sparc32/dotmul.S @@ -1,127 +1,13 @@ /* - * Signed multiply, from Appendix E of the Sparc Version 8 - * Architecture Manual. - */ - -/* - * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of - * the 64-bit product). - * - * This code optimizes short (less than 13-bit) multiplies. + * Sparc v8 has multiply. */ #include <sysdep.h> - ENTRY(.mul) - mov %o0, %y ! multiplier -> Y - andncc %o0, 0xfff, %g0 ! test bits 12..31 - be LOC(mul_shortway) ! if zero, can do it the short way - andcc %g0, %g0, %o4 ! zero the partial product and clear N and V - - /* - * Long multiply. 32 steps, followed by a final shift step. - */ - mulscc %o4, %o1, %o4 ! 1 - mulscc %o4, %o1, %o4 ! 2 - mulscc %o4, %o1, %o4 ! 3 - mulscc %o4, %o1, %o4 ! 4 - mulscc %o4, %o1, %o4 ! 5 - mulscc %o4, %o1, %o4 ! 6 - mulscc %o4, %o1, %o4 ! 7 - mulscc %o4, %o1, %o4 ! 8 - mulscc %o4, %o1, %o4 ! 9 - mulscc %o4, %o1, %o4 ! 10 - mulscc %o4, %o1, %o4 ! 11 - mulscc %o4, %o1, %o4 ! 12 - mulscc %o4, %o1, %o4 ! 13 - mulscc %o4, %o1, %o4 ! 14 - mulscc %o4, %o1, %o4 ! 15 - mulscc %o4, %o1, %o4 ! 16 - mulscc %o4, %o1, %o4 ! 17 - mulscc %o4, %o1, %o4 ! 18 - mulscc %o4, %o1, %o4 ! 19 - mulscc %o4, %o1, %o4 ! 20 - mulscc %o4, %o1, %o4 ! 21 - mulscc %o4, %o1, %o4 ! 22 - mulscc %o4, %o1, %o4 ! 23 - mulscc %o4, %o1, %o4 ! 24 - mulscc %o4, %o1, %o4 ! 25 - mulscc %o4, %o1, %o4 ! 26 - mulscc %o4, %o1, %o4 ! 27 - mulscc %o4, %o1, %o4 ! 28 - mulscc %o4, %o1, %o4 ! 29 - mulscc %o4, %o1, %o4 ! 30 - mulscc %o4, %o1, %o4 ! 31 - mulscc %o4, %o1, %o4 ! 32 - mulscc %o4, %g0, %o4 ! final shift - - ! If %o0 was negative, the result is - ! (%o0 * %o1) + (%o1 << 32)) - ! We fix that here. - -#if 0 - tst %o0 - bge 1f - rd %y, %o0 - - ! %o0 was indeed negative; fix upper 32 bits of result by subtracting - ! %o1 (i.e., return %o4 - %o1 in %o1). - retl - sub %o4, %o1, %o1 - -1: - retl - mov %o4, %o1 -#else - /* Faster code adapted from tege@sics.se's code for umul.S. */ - sra %o0, 31, %o2 ! make mask from sign bit - and %o1, %o2, %o2 ! %o2 = 0 or %o1, depending on sign of %o0 - rd %y, %o0 ! get lower half of product - retl - sub %o4, %o2, %o1 ! subtract compensation - ! and put upper half in place -#endif - -LOC(mul_shortway): - /* - * Short multiply. 12 steps, followed by a final shift step. - * The resulting bits are off by 12 and (32-12) = 20 bit positions, - * but there is no problem with %o0 being negative (unlike above). - */ - mulscc %o4, %o1, %o4 ! 1 - mulscc %o4, %o1, %o4 ! 2 - mulscc %o4, %o1, %o4 ! 3 - mulscc %o4, %o1, %o4 ! 4 - mulscc %o4, %o1, %o4 ! 5 - mulscc %o4, %o1, %o4 ! 6 - mulscc %o4, %o1, %o4 ! 7 - mulscc %o4, %o1, %o4 ! 8 - mulscc %o4, %o1, %o4 ! 9 - mulscc %o4, %o1, %o4 ! 10 - mulscc %o4, %o1, %o4 ! 11 - mulscc %o4, %o1, %o4 ! 12 - mulscc %o4, %g0, %o4 ! final shift - - /* - * %o4 has 20 of the bits that should be in the low part of the - * result; %y has the bottom 12 (as %y's top 12). That is: - * - * %o4 %y - * +----------------+----------------+ - * | -12- | -20- | -12- | -20- | - * +------(---------+------)---------+ - * --hi-- ----low-part---- - * - * The upper 12 bits of %o4 should be sign-extended to form the - * high part of the product (i.e., highpart = %o4 >> 20). - */ - rd %y, %o5 - sll %o4, 12, %o0 ! shift middle bits left 12 - srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left - or %o5, %o0, %o0 ! construct low part of result + smul %o0, %o1, %o0 retl - sra %o4, 20, %o1 ! ... and extract high part of result + rd %y, %o1 END(.mul) diff --git a/sysdeps/sparc/sparc32/mul_1.S b/sysdeps/sparc/sparc32/mul_1.S index acce57e..86619c7 100644 --- a/sysdeps/sparc/sparc32/mul_1.S +++ b/sysdeps/sparc/sparc32/mul_1.S @@ -1,198 +1,102 @@ -! SPARC __mpn_mul_1 -- Multiply a limb vector with a limb and store -! the result in a second limb vector. -! +! SPARC v8 __mpn_mul_1 -- Multiply a limb vector with a single limb and +! store the product in a second limb vector. + ! Copyright (C) 1992-2019 Free Software Foundation, Inc. -! + ! This file is part of the GNU MP Library. -! + ! The GNU MP Library is free software; you can redistribute it and/or modify ! it under the terms of the GNU Lesser General Public License as published by ! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. -! + ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! + ! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, ! see <https://www.gnu.org/licenses/>. ! INPUT PARAMETERS -! RES_PTR o0 -! S1_PTR o1 -! SIZE o2 -! S2_LIMB o3 - -! ADD CODE FOR SMALL MULTIPLIERS! -!1: ld -! st -! -!2: ld ,a -! addxcc a,a,x -! st x, -! -!3_unrolled: -! ld ,a -! addxcc a,a,x1 ! 2a + cy -! addx %g0,%g0,x2 -! addcc a,x1,x ! 3a + c -! st x, -! -! ld ,a -! addxcc a,a,y1 -! addx %g0,%g0,y2 -! addcc a,y1,x -! st x, -! -!4_unrolled: -! ld ,a -! srl a,2,x1 ! 4a -! addxcc y2,x1,x -! sll a,30,x2 -! st x, -! -! ld ,a -! srl a,2,y1 -! addxcc x2,y1,y -! sll a,30,y2 -! st x, -! -!5_unrolled: -! ld ,a -! srl a,2,x1 ! 4a -! addxcc a,x1,x ! 5a + c -! sll a,30,x2 -! addx %g0,x2,x2 -! st x, -! -! ld ,a -! srl a,2,y1 -! addxcc a,y1,x -! sll a,30,y2 -! addx %g0,y2,y2 -! st x, -! -!8_unrolled: -! ld ,a -! srl a,3,x1 ! 8a -! addxcc y2,x1,x -! sll a,29,x2 -! st x, -! -! ld ,a -! srl a,3,y1 -! addxcc x2,y1,y -! sll a,29,y2 -! st x, +! res_ptr o0 +! s1_ptr o1 +! size o2 +! s2_limb o3 #include <sysdep.h> ENTRY(__mpn_mul_1) - ! Make S1_PTR and RES_PTR point at the end of their blocks - ! and put (- 4 x SIZE) in index/loop counter. - sll %o2,2,%o2 - add %o0,%o2,%o4 ! RES_PTR in o4 since o0 is retval - add %o1,%o2,%o1 - sub %g0,%o2,%o2 + sll %o2,4,%g1 + mov %o7,%g4 ! Save return address register + and %g1,(4-1)<<4,%g1 +1: call 2f + add %o7,3f-1b,%g3 +2: mov %g4,%o7 ! Restore return address register + jmp %g3+%g1 + ld [%o1+0],%o4 ! 1 - cmp %o3,0xfff - bgu LOC(large) + .align 4 +3: +LOC(00): + add %o0,-4,%o0 + add %o1,-4,%o1 + b LOC(loop00) /* 4, 8, 12, ... */ + orcc %g0,%g0,%g2 +LOC(01): + b LOC(loop01) /* 1, 5, 9, ... */ + orcc %g0,%g0,%g2 nop + nop +LOC(10): + add %o0,-12,%o0 /* 2, 6, 10, ... */ + add %o1,4,%o1 + b LOC(loop10) + orcc %g0,%g0,%g2 + nop +LOC(11): + add %o0,-8,%o0 /* 3, 7, 11, ... */ + add %o1,-8,%o1 + b LOC(loop11) + orcc %g0,%g0,%g2 - ld [%o1+%o2],%o5 - mov 0,%o0 - b LOC(0) - add %o4,-4,%o4 -LOC(loop0): - st %g1,[%o4+%o2] -LOC(0): wr %g0,%o3,%y - sra %o5,31,%g2 - and %o3,%g2,%g2 - andcc %g1,0,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,0,%g1 - sra %g1,20,%g4 - sll %g1,12,%g1 - rd %y,%g3 - srl %g3,20,%g3 - or %g1,%g3,%g1 - - addcc %g1,%o0,%g1 - addx %g2,%g4,%o0 ! add sign-compensation and cy to hi limb - addcc %o2,4,%o2 ! loop counter - bne,a LOC(loop0) - ld [%o1+%o2],%o5 - - retl - st %g1,[%o4+%o2] - - -LOC(large): - ld [%o1+%o2],%o5 - mov 0,%o0 - sra %o3,31,%g4 ! g4 = mask of ones iff S2_LIMB < 0 - b LOC(1) - add %o4,-4,%o4 LOC(loop): - st %g3,[%o4+%o2] -LOC(1): wr %g0,%o5,%y - and %o5,%g4,%g2 ! g2 = S1_LIMB iff S2_LIMB < 0, else 0 - andcc %g0,%g0,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%g0,%g1 - rd %y,%g3 - addcc %g3,%o0,%g3 - addx %g2,%g1,%o0 ! add sign-compensation and cy to hi limb - addcc %o2,4,%o2 ! loop counter - bne,a LOC(loop) - ld [%o1+%o2],%o5 + addcc %g3,%g2,%g3 ! 1 + ld [%o1+4],%o4 ! 2 + st %g3,[%o0+0] ! 1 + rd %y,%g2 ! 1 +LOC(loop00): + umul %o4,%o3,%g3 ! 2 + addxcc %g3,%g2,%g3 ! 2 + ld [%o1+8],%o4 ! 3 + st %g3,[%o0+4] ! 2 + rd %y,%g2 ! 2 +LOC(loop11): + umul %o4,%o3,%g3 ! 3 + addxcc %g3,%g2,%g3 ! 3 + ld [%o1+12],%o4 ! 4 + add %o1,16,%o1 + st %g3,[%o0+8] ! 3 + rd %y,%g2 ! 3 +LOC(loop10): + umul %o4,%o3,%g3 ! 4 + addxcc %g3,%g2,%g3 ! 4 + ld [%o1+0],%o4 ! 1 + st %g3,[%o0+12] ! 4 + add %o0,16,%o0 + rd %y,%g2 ! 4 + addx %g0,%g2,%g2 +LOC(loop01): + addcc %o2,-4,%o2 + bg LOC(loop) + umul %o4,%o3,%g3 ! 1 + addcc %g3,%g2,%g3 ! 4 + st %g3,[%o0+0] ! 4 + rd %y,%g2 ! 4 retl - st %g3,[%o4+%o2] + addx %g0,%g2,%o0 END(__mpn_mul_1) diff --git a/sysdeps/sparc/sparc32/rem.S b/sysdeps/sparc/sparc32/rem.S index 79e09a9..a2694e6 100644 --- a/sysdeps/sparc/sparc32/rem.S +++ b/sysdeps/sparc/sparc32/rem.S @@ -1,363 +1,21 @@ - /* This file is generated from divrem.m4; DO NOT EDIT! */ /* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. + * Sparc v8 has divide. */ -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .rem name of function to generate - * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1 - * true true=true => signed; true=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - - #include <sysdep.h> -#include <sys/trap.h> ENTRY(.rem) - ! compute sign of result; if neither is negative, no problem - orcc %o1, %o0, %g0 ! either negative? - bge 2f ! no, go do the divide - mov %o0, %g3 ! sign of remainder matches %o0 - tst %o1 - bge 1f - tst %o0 - ! %o1 is definitely negative; %o0 might also be negative - bge 2f ! if %o0 not negative... - sub %g0, %o1, %o1 ! in any case, make %o1 nonneg -1: ! %o0 is negative, %o1 is nonnegative - sub %g0, %o0, %o0 ! make %o0 nonnegative -2: - - ! Ready to divide. Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta ST_DIV0 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu LOC(got_result) ! (and algorithm fails otherwise) - clr %o2 - sethi %hi(1 << (32 - 4 - 1)), %g1 - cmp %o3, %g1 - blu LOC(not_really_big) - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g2 - sll %o5, 4, %o5 - b 1b - add %o4, 1, %o4 - - ! Now compute %g2. - 2: addcc %o5, %o5, %o5 - bcc LOC(not_too_big) - add %g2, 1, %g2 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - b LOC(do_single_div) - sub %g2, 1, %g2 - - LOC(not_too_big): - 3: cmp %o5, %o3 - blu 2b - nop - be LOC(do_single_div) - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g2 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - LOC(do_single_div): - subcc %g2, 1, %g2 - bl LOC(end_regular_divide) - nop - sub %o3, %o5, %o3 - mov 1, %o2 - b LOC(end_single_divloop) - nop - LOC(single_divloop): - sll %o2, 1, %o2 - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - LOC(end_single_divloop): - subcc %g2, 1, %g2 - bge LOC(single_divloop) - tst %o3 - b,a LOC(end_regular_divide) - -LOC(not_really_big): -1: - sll %o5, 4, %o5 - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - be LOC(got_result) - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -LOC(divloop): - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl LOC(1.16) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl LOC(2.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl LOC(3.19) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl LOC(4.23) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - -LOC(4.23): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - - -LOC(3.19): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl LOC(4.21) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -LOC(4.21): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - - -LOC(2.17): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl LOC(3.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 3 - bl LOC(4.19) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -LOC(4.19): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - -LOC(3.17): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl LOC(4.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -LOC(4.17): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - -LOC(1.16): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl LOC(2.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl LOC(3.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl LOC(4.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -LOC(4.15): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - -LOC(3.15): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl LOC(4.13) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -LOC(4.13): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - -LOC(2.15): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl LOC(3.13) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl LOC(4.11) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -LOC(4.11): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - -LOC(3.13): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -7 - bl LOC(4.9) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -LOC(4.9): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - -9: -LOC(end_regular_divide): - subcc %o4, 1, %o4 - bge LOC(divloop) - tst %o3 - bl,a LOC(got_result) - ! non-restoring fixup here (one instruction only!) - add %o3, %o1, %o3 - -LOC(got_result): - ! check to see if answer should be < 0 - tst %g3 - bl,a 1f - sub %g0, %o3, %o3 -1: + sra %o0, 31, %o2 + wr %o2, 0, %y + nop + nop + nop + sdivcc %o0, %o1, %o2 + bvs,a 1f + xnor %o2, %g0, %o2 +1: smul %o2, %o1, %o2 retl - mov %o3, %o0 + sub %o0, %o2, %o0 END(.rem) diff --git a/sysdeps/sparc/sparc32/sdiv.S b/sysdeps/sparc/sparc32/sdiv.S index ab29718..bfc4acf 100644 --- a/sysdeps/sparc/sparc32/sdiv.S +++ b/sysdeps/sparc/sparc32/sdiv.S @@ -1,363 +1,20 @@ - /* This file is generated from divrem.m4; DO NOT EDIT! */ /* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. + * Sparc v8 has divide. */ -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .div name of function to generate - * div div=div => %o0 / %o1; div=rem => %o0 % %o1 - * true true=true => signed; true=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - - #include <sysdep.h> -#include <sys/trap.h> ENTRY(.div) - ! compute sign of result; if neither is negative, no problem - orcc %o1, %o0, %g0 ! either negative? - bge 2f ! no, go do the divide - xor %o1, %o0, %g3 ! compute sign in any case - tst %o1 - bge 1f - tst %o0 - ! %o1 is definitely negative; %o0 might also be negative - bge 2f ! if %o0 not negative... - sub %g0, %o1, %o1 ! in any case, make %o1 nonneg -1: ! %o0 is negative, %o1 is nonnegative - sub %g0, %o0, %o0 ! make %o0 nonnegative -2: - - ! Ready to divide. Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta ST_DIV0 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu LOC(got_result) ! (and algorithm fails otherwise) - clr %o2 - sethi %hi(1 << (32 - 4 - 1)), %g1 - cmp %o3, %g1 - blu LOC(not_really_big) - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g2 - sll %o5, 4, %o5 - b 1b - add %o4, 1, %o4 - - ! Now compute %g2. - 2: addcc %o5, %o5, %o5 - bcc LOC(not_too_big) - add %g2, 1, %g2 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - b LOC(do_single_div) - sub %g2, 1, %g2 - - LOC(not_too_big): - 3: cmp %o5, %o3 - blu 2b - nop - be LOC(do_single_div) - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g2 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - LOC(do_single_div): - subcc %g2, 1, %g2 - bl LOC(end_regular_divide) - nop - sub %o3, %o5, %o3 - mov 1, %o2 - b LOC(end_single_divloop) - nop - LOC(single_divloop): - sll %o2, 1, %o2 - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - LOC(end_single_divloop): - subcc %g2, 1, %g2 - bge LOC(single_divloop) - tst %o3 - b,a LOC(end_regular_divide) - -LOC(not_really_big): -1: - sll %o5, 4, %o5 - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - be LOC(got_result) - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -LOC(divloop): - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl LOC(1.16) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl LOC(2.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl LOC(3.19) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl LOC(4.23) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - -LOC(4.23): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - - -LOC(3.19): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl LOC(4.21) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -LOC(4.21): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - - -LOC(2.17): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl LOC(3.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 3 - bl LOC(4.19) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -LOC(4.19): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - -LOC(3.17): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl LOC(4.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -LOC(4.17): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - -LOC(1.16): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl LOC(2.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl LOC(3.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl LOC(4.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -LOC(4.15): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - -LOC(3.15): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl LOC(4.13) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -LOC(4.13): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - -LOC(2.15): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl LOC(3.13) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl LOC(4.11) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -LOC(4.11): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - -LOC(3.13): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -7 - bl LOC(4.9) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -LOC(4.9): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - -9: -LOC(end_regular_divide): - subcc %o4, 1, %o4 - bge LOC(divloop) - tst %o3 - bl,a LOC(got_result) - ! non-restoring fixup here (one instruction only!) - sub %o2, 1, %o2 - -LOC(got_result): - ! check to see if answer should be < 0 - tst %g3 - bl,a 1f - sub %g0, %o2, %o2 -1: - retl - mov %o2, %o0 + sra %o0, 31, %o2 + wr %o2, 0, %y + nop + nop + nop + sdivcc %o0, %o1, %o0 + bvs,a 1f + xnor %o0, %g0, %o0 +1: retl + nop END(.div) diff --git a/sysdeps/sparc/sparc32/sparcv8/Makefile b/sysdeps/sparc/sparc32/sparcv8/Makefile deleted file mode 100644 index 2ff9853..0000000 --- a/sysdeps/sparc/sparc32/sparcv8/Makefile +++ /dev/null @@ -1 +0,0 @@ -sysdep-CFLAGS += -mcpu=v8 diff --git a/sysdeps/sparc/sparc32/sparcv8/addmul_1.S b/sysdeps/sparc/sparc32/sparcv8/addmul_1.S deleted file mode 100644 index 4708393..0000000 --- a/sysdeps/sparc/sparc32/sparcv8/addmul_1.S +++ /dev/null @@ -1,118 +0,0 @@ -! SPARC v8 __mpn_addmul_1 -- Multiply a limb vector with a limb and -! add the result to a second limb vector. - -! Copyright (C) 1992-2019 Free Software Foundation, Inc. - -! This file is part of the GNU MP Library. - -! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Lesser General Public License as published by -! the Free Software Foundation; either version 2.1 of the License, or (at your -! option) any later version. - -! The GNU MP Library is distributed in the hope that it will be useful, but -! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -! License for more details. - -! You should have received a copy of the GNU Lesser General Public License -! along with the GNU MP Library; see the file COPYING.LIB. If not, -! see <https://www.gnu.org/licenses/>. - - -! INPUT PARAMETERS -! res_ptr o0 -! s1_ptr o1 -! size o2 -! s2_limb o3 - -#include <sysdep.h> - -ENTRY(__mpn_addmul_1) - ld [%o1+0],%o4 ! 1 - sll %o2,4,%g1 - orcc %g0,%g0,%g2 - mov %o7,%g4 ! Save return address register - and %g1,(4-1)<<4,%g1 -1: call 2f - add %o7,3f-1b,%g3 -2: jmp %g3+%g1 - mov %g4,%o7 ! Restore return address register - - .align 4 -3: -LOC(00): - add %o0,-4,%o0 - b LOC(loop00) /* 4, 8, 12, ... */ - add %o1,-4,%o1 - nop -LOC(01): - b LOC(loop01) /* 1, 5, 9, ... */ - nop - nop - nop -LOC(10): - add %o0,-12,%o0 /* 2, 6, 10, ... */ - b LOC(loop10) - add %o1,4,%o1 - nop -LOC(11): - add %o0,-8,%o0 /* 3, 7, 11, ... */ - b LOC(loop11) - add %o1,-8,%o1 - nop - -LOC(loop): - addcc %g3,%g2,%g3 ! 1 - ld [%o1+4],%o4 ! 2 - rd %y,%g2 ! 1 - addx %g0,%g2,%g2 - ld [%o0+0],%g1 ! 2 - addcc %g1,%g3,%g3 - st %g3,[%o0+0] ! 1 -LOC(loop00): - umul %o4,%o3,%g3 ! 2 - ld [%o0+4],%g1 ! 2 - addxcc %g3,%g2,%g3 ! 2 - ld [%o1+8],%o4 ! 3 - rd %y,%g2 ! 2 - addx %g0,%g2,%g2 - nop - addcc %g1,%g3,%g3 - st %g3,[%o0+4] ! 2 -LOC(loop11): - umul %o4,%o3,%g3 ! 3 - addxcc %g3,%g2,%g3 ! 3 - ld [%o1+12],%o4 ! 4 - rd %y,%g2 ! 3 - add %o1,16,%o1 - addx %g0,%g2,%g2 - ld [%o0+8],%g1 ! 2 - addcc %g1,%g3,%g3 - st %g3,[%o0+8] ! 3 -LOC(loop10): - umul %o4,%o3,%g3 ! 4 - addxcc %g3,%g2,%g3 ! 4 - ld [%o1+0],%o4 ! 1 - rd %y,%g2 ! 4 - addx %g0,%g2,%g2 - ld [%o0+12],%g1 ! 2 - addcc %g1,%g3,%g3 - st %g3,[%o0+12] ! 4 - add %o0,16,%o0 - addx %g0,%g2,%g2 -LOC(loop01): - addcc %o2,-4,%o2 - bg LOC(loop) - umul %o4,%o3,%g3 ! 1 - - addcc %g3,%g2,%g3 ! 4 - rd %y,%g2 ! 4 - addx %g0,%g2,%g2 - ld [%o0+0],%g1 ! 2 - addcc %g1,%g3,%g3 - st %g3,[%o0+0] ! 4 - retl - addx %g0,%g2,%o0 - -END(__mpn_addmul_1) diff --git a/sysdeps/sparc/sparc32/sparcv8/dotmul.S b/sysdeps/sparc/sparc32/sparcv8/dotmul.S deleted file mode 100644 index 9b20cc3..0000000 --- a/sysdeps/sparc/sparc32/sparcv8/dotmul.S +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Sparc v8 has multiply. - */ - -#include <sysdep.h> - -ENTRY(.mul) - - smul %o0, %o1, %o0 - retl - rd %y, %o1 - -END(.mul) diff --git a/sysdeps/sparc/sparc32/sparcv8/mul_1.S b/sysdeps/sparc/sparc32/sparcv8/mul_1.S deleted file mode 100644 index 86619c7..0000000 --- a/sysdeps/sparc/sparc32/sparcv8/mul_1.S +++ /dev/null @@ -1,102 +0,0 @@ -! SPARC v8 __mpn_mul_1 -- Multiply a limb vector with a single limb and -! store the product in a second limb vector. - -! Copyright (C) 1992-2019 Free Software Foundation, Inc. - -! This file is part of the GNU MP Library. - -! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Lesser General Public License as published by -! the Free Software Foundation; either version 2.1 of the License, or (at your -! option) any later version. - -! The GNU MP Library is distributed in the hope that it will be useful, but -! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -! License for more details. - -! You should have received a copy of the GNU Lesser General Public License -! along with the GNU MP Library; see the file COPYING.LIB. If not, -! see <https://www.gnu.org/licenses/>. - - -! INPUT PARAMETERS -! res_ptr o0 -! s1_ptr o1 -! size o2 -! s2_limb o3 - -#include <sysdep.h> - -ENTRY(__mpn_mul_1) - sll %o2,4,%g1 - mov %o7,%g4 ! Save return address register - and %g1,(4-1)<<4,%g1 -1: call 2f - add %o7,3f-1b,%g3 -2: mov %g4,%o7 ! Restore return address register - jmp %g3+%g1 - ld [%o1+0],%o4 ! 1 - - .align 4 -3: -LOC(00): - add %o0,-4,%o0 - add %o1,-4,%o1 - b LOC(loop00) /* 4, 8, 12, ... */ - orcc %g0,%g0,%g2 -LOC(01): - b LOC(loop01) /* 1, 5, 9, ... */ - orcc %g0,%g0,%g2 - nop - nop -LOC(10): - add %o0,-12,%o0 /* 2, 6, 10, ... */ - add %o1,4,%o1 - b LOC(loop10) - orcc %g0,%g0,%g2 - nop -LOC(11): - add %o0,-8,%o0 /* 3, 7, 11, ... */ - add %o1,-8,%o1 - b LOC(loop11) - orcc %g0,%g0,%g2 - -LOC(loop): - addcc %g3,%g2,%g3 ! 1 - ld [%o1+4],%o4 ! 2 - st %g3,[%o0+0] ! 1 - rd %y,%g2 ! 1 -LOC(loop00): - umul %o4,%o3,%g3 ! 2 - addxcc %g3,%g2,%g3 ! 2 - ld [%o1+8],%o4 ! 3 - st %g3,[%o0+4] ! 2 - rd %y,%g2 ! 2 -LOC(loop11): - umul %o4,%o3,%g3 ! 3 - addxcc %g3,%g2,%g3 ! 3 - ld [%o1+12],%o4 ! 4 - add %o1,16,%o1 - st %g3,[%o0+8] ! 3 - rd %y,%g2 ! 3 -LOC(loop10): - umul %o4,%o3,%g3 ! 4 - addxcc %g3,%g2,%g3 ! 4 - ld [%o1+0],%o4 ! 1 - st %g3,[%o0+12] ! 4 - add %o0,16,%o0 - rd %y,%g2 ! 4 - addx %g0,%g2,%g2 -LOC(loop01): - addcc %o2,-4,%o2 - bg LOC(loop) - umul %o4,%o3,%g3 ! 1 - - addcc %g3,%g2,%g3 ! 4 - st %g3,[%o0+0] ! 4 - rd %y,%g2 ! 4 - retl - addx %g0,%g2,%o0 - -END(__mpn_mul_1) diff --git a/sysdeps/sparc/sparc32/sparcv8/rem.S b/sysdeps/sparc/sparc32/sparcv8/rem.S deleted file mode 100644 index a2694e6..0000000 --- a/sysdeps/sparc/sparc32/sparcv8/rem.S +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Sparc v8 has divide. - */ - -#include <sysdep.h> - -ENTRY(.rem) - - sra %o0, 31, %o2 - wr %o2, 0, %y - nop - nop - nop - sdivcc %o0, %o1, %o2 - bvs,a 1f - xnor %o2, %g0, %o2 -1: smul %o2, %o1, %o2 - retl - sub %o0, %o2, %o0 - -END(.rem) diff --git a/sysdeps/sparc/sparc32/sparcv8/sdiv.S b/sysdeps/sparc/sparc32/sparcv8/sdiv.S deleted file mode 100644 index bfc4acf..0000000 --- a/sysdeps/sparc/sparc32/sparcv8/sdiv.S +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Sparc v8 has divide. - */ - -#include <sysdep.h> - -ENTRY(.div) - - sra %o0, 31, %o2 - wr %o2, 0, %y - nop - nop - nop - sdivcc %o0, %o1, %o0 - bvs,a 1f - xnor %o0, %g0, %o0 -1: retl - nop - -END(.div) diff --git a/sysdeps/sparc/sparc32/sparcv8/submul_1.S b/sysdeps/sparc/sparc32/sparcv8/submul_1.S deleted file mode 100644 index 11eef05..0000000 --- a/sysdeps/sparc/sparc32/sparcv8/submul_1.S +++ /dev/null @@ -1,57 +0,0 @@ -! SPARC v8 __mpn_submul_1 -- Multiply a limb vector with a limb and -! subtract the result from a second limb vector. - -! Copyright (C) 1992-2019 Free Software Foundation, Inc. - -! This file is part of the GNU MP Library. - -! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Lesser General Public License as published by -! the Free Software Foundation; either version 2.1 of the License, or (at your -! option) any later version. - -! The GNU MP Library is distributed in the hope that it will be useful, but -! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -! License for more details. - -! You should have received a copy of the GNU Lesser General Public License -! along with the GNU MP Library; see the file COPYING.LIB. If not, -! see <https://www.gnu.org/licenses/>. - - -! INPUT PARAMETERS -! res_ptr o0 -! s1_ptr o1 -! size o2 -! s2_limb o3 - -#include <sysdep.h> - -ENTRY(__mpn_submul_1) - sub %g0,%o2,%o2 ! negate ... - sll %o2,2,%o2 ! ... and scale size - sub %o1,%o2,%o1 ! o1 is offset s1_ptr - sub %o0,%o2,%g1 ! g1 is offset res_ptr - - mov 0,%o0 ! clear cy_limb - -LOC(loop): - ld [%o1+%o2],%o4 - ld [%g1+%o2],%g2 - umul %o4,%o3,%o5 - rd %y,%g3 - addcc %o5,%o0,%o5 - addx %g3,0,%o0 - subcc %g2,%o5,%g2 - addx %o0,0,%o0 - st %g2,[%g1+%o2] - - addcc %o2,4,%o2 - bne LOC(loop) - nop - - retl - nop - -END(__mpn_submul_1) diff --git a/sysdeps/sparc/sparc32/sparcv8/udiv.S b/sysdeps/sparc/sparc32/sparcv8/udiv.S deleted file mode 100644 index e9cab4e..0000000 --- a/sysdeps/sparc/sparc32/sparcv8/udiv.S +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Sparc v8 has divide. - */ - -#include <sysdep.h> - -ENTRY(.udiv) - - wr %g0, 0, %y - nop - nop - retl - udiv %o0, %o1, %o0 - -END(.udiv) -strong_alias (.udiv, __wrap_.udiv) diff --git a/sysdeps/sparc/sparc32/sparcv8/umul.S b/sysdeps/sparc/sparc32/sparcv8/umul.S deleted file mode 100644 index cec454a..0000000 --- a/sysdeps/sparc/sparc32/sparcv8/umul.S +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Sparc v8 has multiply. - */ - -#include <sysdep.h> - -ENTRY(.umul) - - umul %o0, %o1, %o0 - retl - rd %y, %o1 - -END(.umul) diff --git a/sysdeps/sparc/sparc32/sparcv8/urem.S b/sysdeps/sparc/sparc32/sparcv8/urem.S deleted file mode 100644 index cc2689d..0000000 --- a/sysdeps/sparc/sparc32/sparcv8/urem.S +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Sparc v8 has divide. - */ - -#include <sysdep.h> - -ENTRY(.urem) - - wr %g0, 0, %y - nop - nop - nop - udiv %o0, %o1, %o2 - umul %o2, %o1, %o2 - retl - sub %o0, %o2, %o0 - -END(.urem) diff --git a/sysdeps/sparc/sparc32/submul_1.S b/sysdeps/sparc/sparc32/submul_1.S index 920422a..11eef05 100644 --- a/sysdeps/sparc/sparc32/submul_1.S +++ b/sysdeps/sparc/sparc32/submul_1.S @@ -1,146 +1,57 @@ -! SPARC __mpn_submul_1 -- Multiply a limb vector with a limb and subtract -! the result from a second limb vector. -! +! SPARC v8 __mpn_submul_1 -- Multiply a limb vector with a limb and +! subtract the result from a second limb vector. + ! Copyright (C) 1992-2019 Free Software Foundation, Inc. -! + ! This file is part of the GNU MP Library. -! + ! The GNU MP Library is free software; you can redistribute it and/or modify ! it under the terms of the GNU Lesser General Public License as published by ! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. -! + ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! + ! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, ! see <https://www.gnu.org/licenses/>. ! INPUT PARAMETERS -! RES_PTR o0 -! S1_PTR o1 -! SIZE o2 -! S2_LIMB o3 +! res_ptr o0 +! s1_ptr o1 +! size o2 +! s2_limb o3 #include <sysdep.h> ENTRY(__mpn_submul_1) - ! Make S1_PTR and RES_PTR point at the end of their blocks - ! and put (- 4 x SIZE) in index/loop counter. - sll %o2,2,%o2 - add %o0,%o2,%o4 ! RES_PTR in o4 since o0 is retval - add %o1,%o2,%o1 - sub %g0,%o2,%o2 - - cmp %o3,0xfff - bgu LOC(large) - nop + sub %g0,%o2,%o2 ! negate ... + sll %o2,2,%o2 ! ... and scale size + sub %o1,%o2,%o1 ! o1 is offset s1_ptr + sub %o0,%o2,%g1 ! g1 is offset res_ptr - ld [%o1+%o2],%o5 - mov 0,%o0 - b LOC(0) - add %o4,-4,%o4 -LOC(loop0): - subcc %o5,%g1,%g1 - ld [%o1+%o2],%o5 - addx %o0,%g0,%o0 - st %g1,[%o4+%o2] -LOC(0): wr %g0,%o3,%y - sra %o5,31,%g2 - and %o3,%g2,%g2 - andcc %g1,0,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,%o5,%g1 - mulscc %g1,0,%g1 - sra %g1,20,%g4 - sll %g1,12,%g1 - rd %y,%g3 - srl %g3,20,%g3 - or %g1,%g3,%g1 + mov 0,%o0 ! clear cy_limb - addcc %g1,%o0,%g1 - addx %g2,%g4,%o0 ! add sign-compensation and cy to hi limb - addcc %o2,4,%o2 ! loop counter - bne LOC(loop0) - ld [%o4+%o2],%o5 - - subcc %o5,%g1,%g1 - addx %o0,%g0,%o0 - retl - st %g1,[%o4+%o2] - - -LOC(large): - ld [%o1+%o2],%o5 - mov 0,%o0 - sra %o3,31,%g4 ! g4 = mask of ones iff S2_LIMB < 0 - b LOC(1) - add %o4,-4,%o4 LOC(loop): - subcc %o5,%g3,%g3 - ld [%o1+%o2],%o5 - addx %o0,%g0,%o0 - st %g3,[%o4+%o2] -LOC(1): wr %g0,%o5,%y - and %o5,%g4,%g2 - andcc %g0,%g0,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%o3,%g1 - mulscc %g1,%g0,%g1 + ld [%o1+%o2],%o4 + ld [%g1+%o2],%g2 + umul %o4,%o3,%o5 rd %y,%g3 - addcc %g3,%o0,%g3 - addx %g2,%g1,%o0 + addcc %o5,%o0,%o5 + addx %g3,0,%o0 + subcc %g2,%o5,%g2 + addx %o0,0,%o0 + st %g2,[%g1+%o2] + addcc %o2,4,%o2 bne LOC(loop) - ld [%o4+%o2],%o5 + nop - subcc %o5,%g3,%g3 - addx %o0,%g0,%o0 retl - st %g3,[%o4+%o2] + nop END(__mpn_submul_1) diff --git a/sysdeps/sparc/sparc32/udiv.S b/sysdeps/sparc/sparc32/udiv.S index 1db6796..e9cab4e 100644 --- a/sysdeps/sparc/sparc32/udiv.S +++ b/sysdeps/sparc/sparc32/udiv.S @@ -1,347 +1,16 @@ - /* This file is generated from divrem.m4; DO NOT EDIT! */ /* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. + * Sparc v8 has divide. */ -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .udiv name of function to generate - * div div=div => %o0 / %o1; div=rem => %o0 % %o1 - * false false=true => signed; false=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - - #include <sysdep.h> -#include <sys/trap.h> ENTRY(.udiv) - ! Ready to divide. Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta ST_DIV0 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu LOC(got_result) ! (and algorithm fails otherwise) - clr %o2 - sethi %hi(1 << (32 - 4 - 1)), %g1 - cmp %o3, %g1 - blu LOC(not_really_big) - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g2 - sll %o5, 4, %o5 - b 1b - add %o4, 1, %o4 - - ! Now compute %g2. - 2: addcc %o5, %o5, %o5 - bcc LOC(not_too_big) - add %g2, 1, %g2 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - b LOC(do_single_div) - sub %g2, 1, %g2 - - LOC(not_too_big): - 3: cmp %o5, %o3 - blu 2b - nop - be LOC(do_single_div) - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g2 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - LOC(do_single_div): - subcc %g2, 1, %g2 - bl LOC(end_regular_divide) - nop - sub %o3, %o5, %o3 - mov 1, %o2 - b LOC(end_single_divloop) - nop - LOC(single_divloop): - sll %o2, 1, %o2 - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - LOC(end_single_divloop): - subcc %g2, 1, %g2 - bge LOC(single_divloop) - tst %o3 - b,a LOC(end_regular_divide) - -LOC(not_really_big): -1: - sll %o5, 4, %o5 - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - be LOC(got_result) - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -LOC(divloop): - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl LOC(1.16) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl LOC(2.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl LOC(3.19) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl LOC(4.23) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - -LOC(4.23): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - - -LOC(3.19): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl LOC(4.21) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -LOC(4.21): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - - -LOC(2.17): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl LOC(3.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 3 - bl LOC(4.19) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -LOC(4.19): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - -LOC(3.17): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl LOC(4.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -LOC(4.17): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - -LOC(1.16): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl LOC(2.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl LOC(3.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl LOC(4.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -LOC(4.15): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - -LOC(3.15): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl LOC(4.13) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -LOC(4.13): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - -LOC(2.15): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl LOC(3.13) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl LOC(4.11) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -LOC(4.11): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - -LOC(3.13): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -7 - bl LOC(4.9) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -LOC(4.9): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - -9: -LOC(end_regular_divide): - subcc %o4, 1, %o4 - bge LOC(divloop) - tst %o3 - bl,a LOC(got_result) - ! non-restoring fixup here (one instruction only!) - sub %o2, 1, %o2 - - -LOC(got_result): - + wr %g0, 0, %y + nop + nop retl - mov %o2, %o0 + udiv %o0, %o1, %o0 END(.udiv) strong_alias (.udiv, __wrap_.udiv) diff --git a/sysdeps/sparc/sparc32/umul.S b/sysdeps/sparc/sparc32/umul.S index 096554a..cec454a 100644 --- a/sysdeps/sparc/sparc32/umul.S +++ b/sysdeps/sparc/sparc32/umul.S @@ -1,155 +1,13 @@ /* - * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the - * upper 32 bits of the 64-bit product). - * - * This code optimizes short (less than 13-bit) multiplies. Short - * multiplies require 25 instruction cycles, and long ones require - * 45 instruction cycles. - * - * On return, overflow has occurred (%o1 is not zero) if and only if - * the Z condition code is clear, allowing, e.g., the following: - * - * call .umul - * nop - * bnz overflow (or tnz) + * Sparc v8 has multiply. */ #include <sysdep.h> ENTRY(.umul) - or %o0, %o1, %o4 - mov %o0, %y ! multiplier -> Y - andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args - be LOC(mul_shortway) ! if zero, can do it the short way - andcc %g0, %g0, %o4 ! zero the partial product; clear N & V - /* - * Long multiply. 32 steps, followed by a final shift step. - */ - mulscc %o4, %o1, %o4 ! 1 - mulscc %o4, %o1, %o4 ! 2 - mulscc %o4, %o1, %o4 ! 3 - mulscc %o4, %o1, %o4 ! 4 - mulscc %o4, %o1, %o4 ! 5 - mulscc %o4, %o1, %o4 ! 6 - mulscc %o4, %o1, %o4 ! 7 - mulscc %o4, %o1, %o4 ! 8 - mulscc %o4, %o1, %o4 ! 9 - mulscc %o4, %o1, %o4 ! 10 - mulscc %o4, %o1, %o4 ! 11 - mulscc %o4, %o1, %o4 ! 12 - mulscc %o4, %o1, %o4 ! 13 - mulscc %o4, %o1, %o4 ! 14 - mulscc %o4, %o1, %o4 ! 15 - mulscc %o4, %o1, %o4 ! 16 - mulscc %o4, %o1, %o4 ! 17 - mulscc %o4, %o1, %o4 ! 18 - mulscc %o4, %o1, %o4 ! 19 - mulscc %o4, %o1, %o4 ! 20 - mulscc %o4, %o1, %o4 ! 21 - mulscc %o4, %o1, %o4 ! 22 - mulscc %o4, %o1, %o4 ! 23 - mulscc %o4, %o1, %o4 ! 24 - mulscc %o4, %o1, %o4 ! 25 - mulscc %o4, %o1, %o4 ! 26 - mulscc %o4, %o1, %o4 ! 27 - mulscc %o4, %o1, %o4 ! 28 - mulscc %o4, %o1, %o4 ! 29 - mulscc %o4, %o1, %o4 ! 30 - mulscc %o4, %o1, %o4 ! 31 - mulscc %o4, %o1, %o4 ! 32 - mulscc %o4, %g0, %o4 ! final shift - - /* - * Normally, with the shift-and-add approach, if both numbers are - * positive you get the correct result. With 32-bit two's-complement - * numbers, -x is represented as - * - * x 32 - * ( 2 - ------ ) mod 2 * 2 - * 32 - * 2 - * - * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s, - * we can treat this as if the radix point were just to the left - * of the sign bit (multiply by 2^32), and get - * - * -x = (2 - x) mod 2 - * - * Then, ignoring the `mod 2's for convenience: - * - * x * y = xy - * -x * y = 2y - xy - * x * -y = 2x - xy - * -x * -y = 4 - 2x - 2y + xy - * - * For signed multiplies, we subtract (x << 32) from the partial - * product to fix this problem for negative multipliers (see mul.s). - * Because of the way the shift into the partial product is calculated - * (N xor V), this term is automatically removed for the multiplicand, - * so we don't have to adjust. - * - * But for unsigned multiplies, the high order bit wasn't a sign bit, - * and the correction is wrong. So for unsigned multiplies where the - * high order bit is one, we end up with xy - (y << 32). To fix it - * we add y << 32. - */ -#if 0 - tst %o1 - bl,a 1f ! if %o1 < 0 (high order bit = 1), - add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half) -1: rd %y, %o0 ! get lower half of product - retl - addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0 -#else - /* Faster code from tege@sics.se. */ - sra %o1, 31, %o2 ! make mask from sign bit - and %o0, %o2, %o2 ! %o2 = 0 or %o0, depending on sign of %o1 - rd %y, %o0 ! get lower half of product - retl - addcc %o4, %o2, %o1 ! add compensation and put upper half in place -#endif - -LOC(mul_shortway): - /* - * Short multiply. 12 steps, followed by a final shift step. - * The resulting bits are off by 12 and (32-12) = 20 bit positions, - * but there is no problem with %o0 being negative (unlike above), - * and overflow is impossible (the answer is at most 24 bits long). - */ - mulscc %o4, %o1, %o4 ! 1 - mulscc %o4, %o1, %o4 ! 2 - mulscc %o4, %o1, %o4 ! 3 - mulscc %o4, %o1, %o4 ! 4 - mulscc %o4, %o1, %o4 ! 5 - mulscc %o4, %o1, %o4 ! 6 - mulscc %o4, %o1, %o4 ! 7 - mulscc %o4, %o1, %o4 ! 8 - mulscc %o4, %o1, %o4 ! 9 - mulscc %o4, %o1, %o4 ! 10 - mulscc %o4, %o1, %o4 ! 11 - mulscc %o4, %o1, %o4 ! 12 - mulscc %o4, %g0, %o4 ! final shift - - /* - * %o4 has 20 of the bits that should be in the result; %y has - * the bottom 12 (as %y's top 12). That is: - * - * %o4 %y - * +----------------+----------------+ - * | -12- | -20- | -12- | -20- | - * +------(---------+------)---------+ - * -----result----- - * - * The 12 bits of %o4 left of the `result' area are all zero; - * in fact, all top 20 bits of %o4 are zero. - */ - - rd %y, %o5 - sll %o4, 12, %o0 ! shift middle bits left 12 - srl %o5, 20, %o5 ! shift low bits right 20 - or %o5, %o0, %o0 + umul %o0, %o1, %o0 retl - addcc %g0, %g0, %o1 ! %o1 = zero, and set Z + rd %y, %o1 END(.umul) diff --git a/sysdeps/sparc/sparc32/urem.S b/sysdeps/sparc/sparc32/urem.S index 83fb4c2..cc2689d 100644 --- a/sysdeps/sparc/sparc32/urem.S +++ b/sysdeps/sparc/sparc32/urem.S @@ -1,346 +1,18 @@ - /* This file is generated from divrem.m4; DO NOT EDIT! */ /* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. + * Sparc v8 has divide. */ -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .urem name of function to generate - * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1 - * false false=true => signed; false=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - - #include <sysdep.h> -#include <sys/trap.h> ENTRY(.urem) - ! Ready to divide. Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta ST_DIV0 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu LOC(got_result) ! (and algorithm fails otherwise) - clr %o2 - sethi %hi(1 << (32 - 4 - 1)), %g1 - cmp %o3, %g1 - blu LOC(not_really_big) - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g2 - sll %o5, 4, %o5 - b 1b - add %o4, 1, %o4 - - ! Now compute %g2. - 2: addcc %o5, %o5, %o5 - bcc LOC(not_too_big) - add %g2, 1, %g2 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - b LOC(do_single_div) - sub %g2, 1, %g2 - - LOC(not_too_big): - 3: cmp %o5, %o3 - blu 2b - nop - be LOC(do_single_div) - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g2 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - LOC(do_single_div): - subcc %g2, 1, %g2 - bl LOC(end_regular_divide) - nop - sub %o3, %o5, %o3 - mov 1, %o2 - b LOC(end_single_divloop) - nop - LOC(single_divloop): - sll %o2, 1, %o2 - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - LOC(end_single_divloop): - subcc %g2, 1, %g2 - bge LOC(single_divloop) - tst %o3 - b,a LOC(end_regular_divide) - -LOC(not_really_big): -1: - sll %o5, 4, %o5 - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - be LOC(got_result) - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -LOC(divloop): - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl LOC(1.16) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl LOC(2.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl LOC(3.19) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl LOC(4.23) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - -LOC(4.23): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - - -LOC(3.19): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl LOC(4.21) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -LOC(4.21): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - - -LOC(2.17): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl LOC(3.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 3 - bl LOC(4.19) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -LOC(4.19): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - -LOC(3.17): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl LOC(4.17) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -LOC(4.17): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - -LOC(1.16): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl LOC(2.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl LOC(3.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl LOC(4.15) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -LOC(4.15): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - -LOC(3.15): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl LOC(4.13) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -LOC(4.13): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - -LOC(2.15): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl LOC(3.13) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl LOC(4.11) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -LOC(4.11): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - -LOC(3.13): - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -7 - bl LOC(4.9) - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -LOC(4.9): - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - -9: -LOC(end_regular_divide): - subcc %o4, 1, %o4 - bge LOC(divloop) - tst %o3 - bl,a LOC(got_result) - ! non-restoring fixup here (one instruction only!) - add %o3, %o1, %o3 - - -LOC(got_result): - + wr %g0, 0, %y + nop + nop + nop + udiv %o0, %o1, %o2 + umul %o2, %o1, %o2 retl - mov %o3, %o0 + sub %o0, %o2, %o0 END(.urem) |