author    | Rainer Orth <ro@CeBiTec.Uni-Bielefeld.DE> | 2011-11-02 15:03:19 +0000
committer | Rainer Orth <ro@gcc.gnu.org> | 2011-11-02 15:03:19 +0000
commit    | 45b86625d7edd2278c0cdcf335e007a47671813f (patch)
tree      | bfbde9a54f663fb7556b9dacd07709ef97c1961c /libgcc/config/xtensa
parent    | 5f73c6ccf0d9c54714b162b9d314594311747de9 (diff)
download  | gcc-45b86625d7edd2278c0cdcf335e007a47671813f.zip, gcc-45b86625d7edd2278c0cdcf335e007a47671813f.tar.gz, gcc-45b86625d7edd2278c0cdcf335e007a47671813f.tar.bz2
Move libgcc1 to toplevel libgcc
gcc:
* Makefile.in (LIB1ASMSRC): Don't export.
(libgcc.mvars): Don't emit LIB1ASMFUNCS, LIB1ASMSRC.
* config/arm/arm.c: Update lib1funcs.asm filename.
* config/arm/linux-eabi.h: Likewise.
* config/arm/bpabi-v6m.S, config/arm/bpabi.S,
config/arm/ieee754-df.S, config/arm/ieee754-sf.S: Move to
../libgcc/config/arm.
* config/arm/lib1funcs.asm: Move to ../libgcc/config/arm/lib1funcs.S.
* config/arm/t-arm (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/arm/t-arm-elf (LIB1ASMFUNCS): Remove.
* config/arm/t-bpabi: Likewise.
* config/arm/t-linux (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/arm/t-linux-eabi (LIB1ASMFUNCS): Remove.
* config/arm/t-strongarm-elf: Likewise.
* config/arm/t-symbian: Likewise.
* config/arm/t-vxworks: Likewise.
* config/arm/t-wince-pe: Likewise.
* config/avr/libgcc.S: Move to ../libgcc/config/avr.
* config/avr/t-avr (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/bfin/lib1funcs.asm: Move to
../libgcc/config/bfin/lib1funcs.S.
* config/bfin/t-bfin: Remove.
* config/bfin/t-bfin-elf (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/bfin/t-bfin-linux: Likewise.
* config/bfin/t-bfin-uclinux: Likewise.
* config/c6x/lib1funcs.asm: Move to
../libgcc/config/c6x/lib1funcs.S.
* config/c6x/t-c6x-elf (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/fr30/lib1funcs.asm: Move to
../libgcc/config/fr30/lib1funcs.S.
* config/fr30/t-fr30 (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/frv/lib1funcs.asm: Move to
../libgcc/config/frv/lib1funcs.S.
* config/frv/t-frv (CROSS_LIBGCC1, LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/h8300/fixunssfsi.c: Update lib1funcs.asm filename.
* config/h8300/lib1funcs.asm: Move to
../libgcc/config/h8300/lib1funcs.S.
* config/h8300/t-h8300 (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/i386/cygwin.asm: Move to ../libgcc/config/i386/cygwin.S.
* config/i386/t-cygming (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/i386/t-interix: Likewise.
* config/ia64/lib1funcs.asm: Move to
../libgcc/config/ia64/lib1funcs.S.
* config/ia64/t-hpux (LIB1ASMFUNCS, LIBGCC1_TEST): Remove.
* config/ia64/t-ia64 (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/iq2000/t-iq2000 (LIBGCC1, CROSS_LIBGCC1): Remove.
* config/m32c/m32c.c: Update m32c-lib1.S filename.
* config/m32c/m32c-lib1.S: Move to ../libgcc/config/m32c/lib1funcs.S.
* config/m32c/t-m32c (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/m32r/t-linux (CROSS_LIBGCC1, LIBGCC1, LIBGCC1_TEST): Remove.
* config/m68k/lb1sf68.asm: Move to ../libgcc/config/m68k/lb1sf68.S.
	* config/m68k/t-floatlib (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/mcore/lib1.asm: Move to ../libgcc/config/mcore/lib1funcs.S.
* config/mcore/t-mcore (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/mep/mep-lib1.asm: Move to ../libgcc/config/mep/lib1funcs.S.
* config/mep/t-mep (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/mips/mips16.S: Move to ../libgcc/config/mips.
* config/mips/t-libgcc-mips16: Remove.
* config/mips/t-sr71k (LIBGCC1, CROSS_LIBGCC1): Remove.
* config/pa/milli64.S: Move to ../libgcc/config/pa.
* config/pa/t-linux (LIB1ASMFUNCS, LIB1ASMSRC): Remove.
* config/pa/t-linux64: Likewise.
* config/picochip/libgccExtras/fake_libgcc.asm: Move to
../libgcc/config/picochip/lib1funcs.S.
* config/picochip/t-picochip (LIB1ASMFUNCS, LIB1ASMSRC): Remove.
* config/sh/lib1funcs.asm: Move to ../libgcc/config/sh/lib1funcs.S.
* config/sh/lib1funcs.h: Move to ../libgcc/config/sh.
* config/sh/sh.h: Update lib1funcs.asm filename.
* config/sh/t-linux (LIB1ASMFUNCS_CACHE): Remove.
* config/sh/t-netbsd: Likewise.
* config/sh/t-sh (LIB1ASMSRC, LIB1ASMFUNCS, LIB1ASMFUNCS_CACHE):
Remove.
* config/sh/t-sh64 (LIB1ASMFUNCS): Remove.
* config/sparc/lb1spc.asm: Move to ../libgcc/config/sparc/lb1spc.S.
* config/sparc/lb1spl.asm: Remove.
* config/sparc/t-elf (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/sparc/t-leon: Likewise.
* config/spu/t-spu-elf (LIBGCC1, CROSS_LIBGCC1): Remove.
* config/v850/lib1funcs.asm: Move to ../libgcc/config/v850/lib1funcs.S.
	* config/v850/t-v850 (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/vax/lib1funcs.asm: Move to ../libgcc/config/vax/lib1funcs.S.
* config/vax/t-linux: Remove.
* config/xtensa/ieee754-df.S, config/xtensa/ieee754-sf.S: Move to
../libgcc/config/xtensa.
* config/xtensa/lib1funcs.asm: Move to
../libgcc/config/xtensa/lib1funcs.S.
* config/xtensa/t-xtensa (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config.gcc (bfin*-rtems*): Remove bfin/t-bfin from tmake_file.
(bfin*-*): Likewise.
(mips64*-*-linux*, mipsisa64*-*-linux*): Remove
mips/t-libgcc-mips16 from tmake_file.
(mips*-*-linux*): Likewise.
(mips*-sde-elf*): Likewise.
(mipsisa32-*-elf*, mipsisa32el-*-elf*, mipsisa32r2-*-elf*)
(mipsisa32r2el-*-elf*, mipsisa64-*-elf*, mipsisa64el-*-elf*)
(mipsisa64r2-*-elf*, mipsisa64r2el-*-elf*): Likewise.
(mipsisa64sb1-*-elf*, mipsisa64sb1el-*-elf*): Likewise.
(mips-*-elf*, mipsel-*-elf*): Likewise.
(mips64-*-elf*, mips64el-*-elf*): Likewise.
(mips64orion-*-elf*, mips64orionel-*-elf*): Likewise.
(mips*-*-rtems*): Likewise.
(mipstx39-*-elf*, mipstx39el-*-elf*): Likewise.
(vax-*-linux*): Remove vax/t-linux from tmake_file.
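The two variables that dominate the gcc-side removals above, LIB1ASMSRC and LIB1ASMFUNCS, are ordinary make assignments in the per-target fragments; the same assignments reappear, pointing at the renamed *.S sources, in the libgcc fragments listed in the next section. A minimal sketch of what such a fragment contained before the move (target and function names are illustrative, not the exact set for any one port):

    # gcc/config/<cpu>/t-<cpu>  (pre-move; illustrative)
    LIB1ASMSRC = <cpu>/lib1funcs.asm        # single assembly source holding the libgcc1 routines
    LIB1ASMFUNCS = _mulsi3 _divsi3 _modsi3  # routines to build, one object per name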
libgcc:
* Makefile.in ($(lib1asmfuncs-o), $(lib1asmfuncs-s-o)): Use
$(srcdir) to refer to $(LIB1ASMSRC).
Use $<.
* config/arm/bpabi-v6m.S, config/arm/bpabi.S,
config/arm/ieee754-df.S, config/arm/ieee754-sf.S,
config/arm/lib1funcs.S: New files.
* config/arm/libunwind.S [!__symbian__]: Use lib1funcs.S.
* config/arm/t-arm: New file.
* config/arm/t-bpabi (LIB1ASMFUNCS): Set.
* config/arm/t-elf, config/arm/t-linux, config/arm/t-linux-eabi,
config/arm/t-strongarm-elf: New files.
* config/arm/t-symbian (LIB1ASMFUNCS): Set.
* config/arm/t-vxworks, config/arm/t-wince-pe: New files.
* config/avr/lib1funcs.S: New file.
* config/avr/t-avr (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/bfin/lib1funcs.S, config/bfin/t-bfin: New files.
* config/c6x/lib1funcs.S: New file.
* config/c6x/t-elf (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/fr30/lib1funcs.S, config/fr30/t-fr30: New files.
* config/frv/lib1funcs.S: New file.
* config/frv/t-frv (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/h8300/lib1funcs.S, config/h8300/t-h8300: New files.
* config/i386/cygwin.S, config/i386/t-chkstk: New files.
* config/ia64/__divxf3.asm: Rename to ...
* config/ia64/__divxf3.S: ... this.
Adapt lib1funcs.asm filename.
* config/ia64/_fixtfdi.asm: Rename to ...
* config/ia64/_fixtfdi.S: ... this.
Adapt lib1funcs.asm filename.
* config/ia64/_fixunstfdi.asm: Rename to ...
* config/ia64/_fixunstfdi.S: ... this.
Adapt lib1funcs.asm filename.
* config/ia64/_floatditf.asm: Rename to ...
* config/ia64/_floatditf.S: ... this.
Adapt lib1funcs.asm filename.
* config/ia64/lib1funcs.S: New file.
* config/ia64/t-hpux (LIB1ASMFUNCS): Set.
* config/ia64/t-ia64 (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/ia64/t-softfp-compat (libgcc1-tf-compats): Adapt suffix.
* config/m32c/lib1funcs.S, config/m32c/t-m32c: New files.
* config/m68k/lb1sf68.S, config/m68k/t-floatlib: New files.
* config/mcore/lib1funcs.S, config/mcore/t-mcore: New files.
* config/mep/lib1funcs.S: New file.
* config/mep/t-mep (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/mips/mips16.S: New file.
* config/mips/t-mips16 (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/pa/milli64.S: New file.
* config/pa/t-linux, config/pa/t-linux64: New files.
* config/picochip/lib1funcs.S: New file.
* config/picochip/t-picochip (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/sh/lib1funcs.S, config/sh/lib1funcs.h: New files.
* config/sh/t-linux (LIB1ASMFUNCS_CACHE): Set.
* config/sh/t-netbsd: New file.
* config/sh/t-sh (LIB1ASMSRC, LIB1ASMFUNCS, LIB1ASMFUNCS_CACHE): Set.
Use $(srcdir) to refer to lib1funcs.S, adapt filename.
* config/sh/t-sh64: New file.
* config/sparc/lb1spc.S: New file.
* config/sparc/t-softmul (LIB1ASMSRC): Adapt sparc/lb1spc.asm
filename.
* config/v850/lib1funcs.S, config/v850/t-v850: New files.
* config/vax/lib1funcs.S, config/vax/t-linux: New files.
* config/xtensa/ieee754-df.S, config/xtensa/ieee754-sf.S,
config/xtensa/lib1funcs.S: New files.
* config/xtensa/t-xtensa (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config.host (arm-wrs-vxworks): Add arm/t-arm, arm/t-vxworks to
tmake_file.
(arm*-*-freebsd*): Add arm/t-arm, arm/t-strongarm-elf to tmake_file.
(arm*-*-netbsdelf*): Add arm/t-arm to tmake_file.
(arm*-*-linux*): Likewise.
Add arm/t-elf, arm/t-bpabi, arm/t-linux-eabi to tmake_file for
arm*-*-linux-*eabi, add arm/t-linux otherwise.
(arm*-*-uclinux*): Add arm/t-arm, arm/t-elf to tmake_file.
(arm*-*-ecos-elf): Likewise.
(arm*-*-eabi*, arm*-*-symbianelf*): Likewise.
(arm*-*-rtems*): Likewise.
(arm*-*-elf): Likewise.
(arm*-wince-pe*): Add arm/t-arm, arm/t-wince-pe to tmake_file.
	(avr-*-rtems*): Add avr/t-avr to tmake_file.
(bfin*-elf*): Add bfin/t-bfin to tmake_file.
(bfin*-uclinux*): Likewise.
(bfin*-linux-uclibc*): Likewise.
(bfin*-rtems*): Likewise.
(bfin*-*): Likewise.
(fido-*-elf): Merge into m68k-*-elf*.
	(fr30-*-elf): Add fr30/t-fr30 to tmake_file.
(frv-*-*linux*): Add frv/t-frv to tmake_file.
(h8300-*-rtems*): Add h8300/t-h8300 to tmake_file.
(h8300-*-elf*): Likewise.
(hppa*64*-*-linux*): Add pa/t-linux, pa/t-linux64 to tmake_file.
(hppa*-*-linux*): Add pa/t-linux to tmake_file.
(i[34567]86-*-cygwin*): Add i386/t-chkstk to tmake_file.
(i[34567]86-*-mingw*): Likewise.
(x86_64-*-mingw*): Likewise.
(i[34567]86-*-interix3*): Likewise.
(ia64*-*-hpux*): Add ia64/t-ia64, ia64/t-hpux to tmake_file.
(ia64-hp-*vms*): Add ia64/t-ia64 to tmake_file.
(m68k-*-elf*): Also handle fido-*-elf.
Add m68k/t-floatlib to tmake_file.
(m68k-*-uclinux*): Add m68k/t-floatlib to tmake_file.
(m68k-*-linux*): Likewise.
(m68k-*-rtems*): Likewise.
(mcore-*-elf): Add mcore/t-mcore to tmake_file.
(sh-*-elf*, sh[12346l]*-*-elf*): Add sh/t-sh64 to tmake_file for
sh64*-*-*.
(sh-*-linux*, sh[2346lbe]*-*-linux*): Add sh/t-sh to tmake_file.
Add sh/t-sh64 to tmake_file for sh64*-*-linux*.
(sh-*-netbsdelf*, shl*-*-netbsdelf*, sh5-*-netbsd*)
(sh5l*-*-netbsd*, sh64-*-netbsd*, sh64l*-*-netbsd*): Add sh/t-sh,
sh/t-netbsd to tmake_file.
Add sh/t-sh64 to tmake_file for sh5*-*-netbsd*, sh64*-netbsd*.
(sh-*-rtems*): Add sh/t-sh to tmake_file.
(sh-wrs-vxworks): Likewise.
(sparc-*-linux*): Add sparc/t-softmul to tmake_file except for
*-leon[3-9]*.
(v850*-*-*): Add v850/t-v850 to tmake_file.
(vax-*-linux*): Add vax/t-linux to tmake_file.
(m32c-*-elf*, m32c-*-rtems*): Add m32c/t-m32c to tmake_file.
From-SVN: r180773
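The libgcc Makefile.in entry near the top of the libgcc section describes how those fragments are now consumed: each name in LIB1ASMFUNCS is compiled from the single LIB1ASMSRC file, located via $(srcdir) and passed to the compiler through the automatic variable $<. A hedged sketch of the shape of those rules follows (not the verbatim recipe; flags and the shared-library variant are omitted, and recipe lines are tab-indented in real make):

    # One object per function, selected by a -DL$* define that matches the
    # #ifdef L_<name> guards inside lib1funcs.S / ieee754-*.S.
    $(lib1asmfuncs-o): %$(objext): $(srcdir)/$(LIB1ASMSRC)
    	$(gcc_compile) -DL$* -xassembler-with-cpp -c $<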
Diffstat (limited to 'libgcc/config/xtensa')
-rw-r--r-- | libgcc/config/xtensa/ieee754-df.S | 2388
-rw-r--r-- | libgcc/config/xtensa/ieee754-sf.S | 1757
-rw-r--r-- | libgcc/config/xtensa/lib1funcs.S | 845
-rw-r--r-- | libgcc/config/xtensa/t-xtensa | 12
4 files changed, 5002 insertions, 0 deletions
diff --git a/libgcc/config/xtensa/ieee754-df.S b/libgcc/config/xtensa/ieee754-df.S
new file mode 100644
index 0000000..9b46889
--- /dev/null
+++ b/libgcc/config/xtensa/ieee754-df.S
@@ -0,0 +1,2388 @@
[New 2388-line file: IEEE-754 double-precision functions for Xtensa, contributed by Bob Wilson at Tensilica, licensed GPLv3 with the GCC Runtime Library Exception. It provides __negdf2, __adddf3/__subdf3, __muldf3, __divdf3, the comparisons __eqdf2/__nedf2, __gtdf2, __gedf2, __ltdf2, __ledf2 and __unorddf2, the conversions __fixdfsi, __fixdfdi, __fixunsdfsi, __fixunsdfdi, __floatsidf, __floatunsidf, __floatdidf and __floatundidf, plus __truncdfsf2 and __extendsfdf2, each guarded by an L_* macro so that one routine is assembled per object. The full assembly listing is not reflowed here.]

diff --git a/libgcc/config/xtensa/ieee754-sf.S b/libgcc/config/xtensa/ieee754-sf.S
new file mode 100644
index 0000000..d75be0e
--- /dev/null
+++ b/libgcc/config/xtensa/ieee754-sf.S
@@ -0,0 +1,1757 @@
[New 1757-line file: the corresponding IEEE-754 single-precision functions for Xtensa (__negsf2, __addsf3, and so on); the extract ends partway through this file.]
*/ + mov a2, a3 + leaf_return + +.Ladd_opposite_signs: + /* Operand signs differ. Do a subtraction. */ + slli a7, a6, 8 + xor a3, a3, a7 + j .Lsub_same_sign + + .align 4 + .global __addsf3 + .type __addsf3, @function +__addsf3: + leaf_entry sp, 16 + movi a6, 0x7f800000 + + /* Check if the two operands have the same sign. */ + xor a7, a2, a3 + bltz a7, .Ladd_opposite_signs + +.Ladd_same_sign: + /* Check if either exponent == 0x7f8 (i.e., NaN or Infinity). */ + ball a2, a6, .Ladd_xnan_or_inf + ball a3, a6, .Ladd_ynan_or_inf + + /* Compare the exponents. The smaller operand will be shifted + right by the exponent difference and added to the larger + one. */ + extui a7, a2, 23, 9 + extui a8, a3, 23, 9 + bltu a7, a8, .Ladd_shiftx + +.Ladd_shifty: + /* Check if the smaller (or equal) exponent is zero. */ + bnone a3, a6, .Ladd_yexpzero + + /* Replace y sign/exponent with 0x008. */ + or a3, a3, a6 + slli a3, a3, 8 + srli a3, a3, 8 + +.Ladd_yexpdiff: + /* Compute the exponent difference. */ + sub a10, a7, a8 + + /* Exponent difference > 32 -- just return the bigger value. */ + bgeui a10, 32, 1f + + /* Shift y right by the exponent difference. Any bits that are + shifted out of y are saved in a9 for rounding the result. */ + ssr a10 + movi a9, 0 + src a9, a3, a9 + srl a3, a3 + + /* Do the addition. */ + add a2, a2, a3 + + /* Check if the add overflowed into the exponent. */ + extui a10, a2, 23, 9 + beq a10, a7, .Ladd_round + mov a8, a7 + j .Ladd_carry + +.Ladd_yexpzero: + /* y is a subnormal value. Replace its sign/exponent with zero, + i.e., no implicit "1.0", and increment the apparent exponent + because subnormals behave as if they had the minimum (nonzero) + exponent. Test for the case when both exponents are zero. */ + slli a3, a3, 9 + srli a3, a3, 9 + bnone a2, a6, .Ladd_bothexpzero + addi a8, a8, 1 + j .Ladd_yexpdiff + +.Ladd_bothexpzero: + /* Both exponents are zero. Handle this as a special case. There + is no need to shift or round, and the normal code for handling + a carry into the exponent field will not work because it + assumes there is an implicit "1.0" that needs to be added. */ + add a2, a2, a3 +1: leaf_return + +.Ladd_xexpzero: + /* Same as "yexpzero" except skip handling the case when both + exponents are zero. */ + slli a2, a2, 9 + srli a2, a2, 9 + addi a7, a7, 1 + j .Ladd_xexpdiff + +.Ladd_shiftx: + /* Same thing as the "shifty" code, but with x and y swapped. Also, + because the exponent difference is always nonzero in this version, + the shift sequence can use SLL and skip loading a constant zero. */ + bnone a2, a6, .Ladd_xexpzero + + or a2, a2, a6 + slli a2, a2, 8 + srli a2, a2, 8 + +.Ladd_xexpdiff: + sub a10, a8, a7 + bgeui a10, 32, .Ladd_returny + + ssr a10 + sll a9, a2 + srl a2, a2 + + add a2, a2, a3 + + /* Check if the add overflowed into the exponent. */ + extui a10, a2, 23, 9 + bne a10, a8, .Ladd_carry + +.Ladd_round: + /* Round up if the leftover fraction is >= 1/2. */ + bgez a9, 1f + addi a2, a2, 1 + + /* Check if the leftover fraction is exactly 1/2. */ + slli a9, a9, 1 + beqz a9, .Ladd_exactlyhalf +1: leaf_return + +.Ladd_returny: + mov a2, a3 + leaf_return + +.Ladd_carry: + /* The addition has overflowed into the exponent field, so the + value needs to be renormalized. The mantissa of the result + can be recovered by subtracting the original exponent and + adding 0x800000 (which is the explicit "1.0" for the + mantissa of the non-shifted operand -- the "1.0" for the + shifted operand was already added). The mantissa can then + be shifted right by one bit. 
The explicit "1.0" of the + shifted mantissa then needs to be replaced by the exponent, + incremented by one to account for the normalizing shift. + It is faster to combine these operations: do the shift first + and combine the additions and subtractions. If x is the + original exponent, the result is: + shifted mantissa - (x << 22) + (1 << 22) + (x << 23) + or: + shifted mantissa + ((x + 1) << 22) + Note that the exponent is incremented here by leaving the + explicit "1.0" of the mantissa in the exponent field. */ + + /* Shift x right by one bit. Save the lsb. */ + mov a10, a2 + srli a2, a2, 1 + + /* See explanation above. The original exponent is in a8. */ + addi a8, a8, 1 + slli a8, a8, 22 + add a2, a2, a8 + + /* Return an Infinity if the exponent overflowed. */ + ball a2, a6, .Ladd_infinity + + /* Same thing as the "round" code except the msb of the leftover + fraction is bit 0 of a10, with the rest of the fraction in a9. */ + bbci.l a10, 0, 1f + addi a2, a2, 1 + beqz a9, .Ladd_exactlyhalf +1: leaf_return + +.Ladd_infinity: + /* Clear the mantissa. */ + srli a2, a2, 23 + slli a2, a2, 23 + + /* The sign bit may have been lost in a carry-out. Put it back. */ + slli a8, a8, 1 + or a2, a2, a8 + leaf_return + +.Ladd_exactlyhalf: + /* Round down to the nearest even value. */ + srli a2, a2, 1 + slli a2, a2, 1 + leaf_return + + + /* Subtraction */ +__subsf3_aux: + + /* Handle NaNs and Infinities. (This code is placed before the + start of the function just to keep it in range of the limited + branch displacements.) */ + +.Lsub_xnan_or_inf: + /* If y is neither Infinity nor NaN, return x. */ + bnall a3, a6, 1f + /* Both x and y are either NaN or Inf, so the result is NaN. */ + movi a4, 0x400000 /* make it a quiet NaN */ + or a2, a2, a4 +1: leaf_return + +.Lsub_ynan_or_inf: + /* Negate y and return it. */ + slli a7, a6, 8 + xor a2, a3, a7 + leaf_return + +.Lsub_opposite_signs: + /* Operand signs differ. Do an addition. */ + slli a7, a6, 8 + xor a3, a3, a7 + j .Ladd_same_sign + + .align 4 + .global __subsf3 + .type __subsf3, @function +__subsf3: + leaf_entry sp, 16 + movi a6, 0x7f800000 + + /* Check if the two operands have the same sign. */ + xor a7, a2, a3 + bltz a7, .Lsub_opposite_signs + +.Lsub_same_sign: + /* Check if either exponent == 0x7f8 (i.e., NaN or Infinity). */ + ball a2, a6, .Lsub_xnan_or_inf + ball a3, a6, .Lsub_ynan_or_inf + + /* Compare the operands. In contrast to addition, the entire + value matters here. */ + extui a7, a2, 23, 8 + extui a8, a3, 23, 8 + bltu a2, a3, .Lsub_xsmaller + +.Lsub_ysmaller: + /* Check if the smaller (or equal) exponent is zero. */ + bnone a3, a6, .Lsub_yexpzero + + /* Replace y sign/exponent with 0x008. */ + or a3, a3, a6 + slli a3, a3, 8 + srli a3, a3, 8 + +.Lsub_yexpdiff: + /* Compute the exponent difference. */ + sub a10, a7, a8 + + /* Exponent difference > 32 -- just return the bigger value. */ + bgeui a10, 32, 1f + + /* Shift y right by the exponent difference. Any bits that are + shifted out of y are saved in a9 for rounding the result. */ + ssr a10 + movi a9, 0 + src a9, a3, a9 + srl a3, a3 + + sub a2, a2, a3 + + /* Subtract the leftover bits in a9 from zero and propagate any + borrow from a2. */ + neg a9, a9 + addi a10, a2, -1 + movnez a2, a10, a9 + + /* Check if the subtract underflowed into the exponent. */ + extui a10, a2, 23, 8 + beq a10, a7, .Lsub_round + j .Lsub_borrow + +.Lsub_yexpzero: + /* Return zero if the inputs are equal. 
(For the non-subnormal + case, subtracting the "1.0" will cause a borrow from the exponent + and this case can be detected when handling the borrow.) */ + beq a2, a3, .Lsub_return_zero + + /* y is a subnormal value. Replace its sign/exponent with zero, + i.e., no implicit "1.0". Unless x is also a subnormal, increment + y's apparent exponent because subnormals behave as if they had + the minimum (nonzero) exponent. */ + slli a3, a3, 9 + srli a3, a3, 9 + bnone a2, a6, .Lsub_yexpdiff + addi a8, a8, 1 + j .Lsub_yexpdiff + +.Lsub_returny: + /* Negate and return y. */ + slli a7, a6, 8 + xor a2, a3, a7 +1: leaf_return + +.Lsub_xsmaller: + /* Same thing as the "ysmaller" code, but with x and y swapped and + with y negated. */ + bnone a2, a6, .Lsub_xexpzero + + or a2, a2, a6 + slli a2, a2, 8 + srli a2, a2, 8 + +.Lsub_xexpdiff: + sub a10, a8, a7 + bgeui a10, 32, .Lsub_returny + + ssr a10 + movi a9, 0 + src a9, a2, a9 + srl a2, a2 + + /* Negate y. */ + slli a11, a6, 8 + xor a3, a3, a11 + + sub a2, a3, a2 + + neg a9, a9 + addi a10, a2, -1 + movnez a2, a10, a9 + + /* Check if the subtract underflowed into the exponent. */ + extui a10, a2, 23, 8 + bne a10, a8, .Lsub_borrow + +.Lsub_round: + /* Round up if the leftover fraction is >= 1/2. */ + bgez a9, 1f + addi a2, a2, 1 + + /* Check if the leftover fraction is exactly 1/2. */ + slli a9, a9, 1 + beqz a9, .Lsub_exactlyhalf +1: leaf_return + +.Lsub_xexpzero: + /* Same as "yexpzero". */ + beq a2, a3, .Lsub_return_zero + slli a2, a2, 9 + srli a2, a2, 9 + bnone a3, a6, .Lsub_xexpdiff + addi a7, a7, 1 + j .Lsub_xexpdiff + +.Lsub_return_zero: + movi a2, 0 + leaf_return + +.Lsub_borrow: + /* The subtraction has underflowed into the exponent field, so the + value needs to be renormalized. Shift the mantissa left as + needed to remove any leading zeros and adjust the exponent + accordingly. If the exponent is not large enough to remove + all the leading zeros, the result will be a subnormal value. */ + + slli a8, a2, 9 + beqz a8, .Lsub_xzero + do_nsau a6, a8, a7, a11 + srli a8, a8, 9 + bge a6, a10, .Lsub_subnormal + addi a6, a6, 1 + +.Lsub_normalize_shift: + /* Shift the mantissa (a8/a9) left by a6. */ + ssl a6 + src a8, a8, a9 + sll a9, a9 + + /* Combine the shifted mantissa with the sign and exponent, + decrementing the exponent by a6. (The exponent has already + been decremented by one due to the borrow from the subtraction, + but adding the mantissa will increment the exponent by one.) */ + srli a2, a2, 23 + sub a2, a2, a6 + slli a2, a2, 23 + add a2, a2, a8 + j .Lsub_round + +.Lsub_exactlyhalf: + /* Round down to the nearest even value. */ + srli a2, a2, 1 + slli a2, a2, 1 + leaf_return + +.Lsub_xzero: + /* If there was a borrow from the exponent, and the mantissa and + guard digits are all zero, then the inputs were equal and the + result should be zero. */ + beqz a9, .Lsub_return_zero + + /* Only the guard digit is nonzero. Shift by min(24, a10). */ + addi a11, a10, -24 + movi a6, 24 + movltz a6, a10, a11 + j .Lsub_normalize_shift + +.Lsub_subnormal: + /* The exponent is too small to shift away all the leading zeros. + Set a6 to the current exponent (which has already been + decremented by the borrow) so that the exponent of the result + will be zero. Do not add 1 to a6 in this case, because: (1) + adding the mantissa will not increment the exponent, so there is + no need to subtract anything extra from the exponent to + compensate, and (2) the effective exponent of a subnormal is 1 + not 0 so the shift amount must be 1 smaller than normal. 
*/ + mov a6, a10 + j .Lsub_normalize_shift + +#endif /* L_addsubsf3 */ + +#ifdef L_mulsf3 + + /* Multiplication */ +#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16 +#define XCHAL_NO_MUL 1 +#endif + +__mulsf3_aux: + + /* Handle unusual cases (zeros, subnormals, NaNs and Infinities). + (This code is placed before the start of the function just to + keep it in range of the limited branch displacements.) */ + +.Lmul_xexpzero: + /* Clear the sign bit of x. */ + slli a2, a2, 1 + srli a2, a2, 1 + + /* If x is zero, return zero. */ + beqz a2, .Lmul_return_zero + + /* Normalize x. Adjust the exponent in a8. */ + do_nsau a10, a2, a11, a12 + addi a10, a10, -8 + ssl a10 + sll a2, a2 + movi a8, 1 + sub a8, a8, a10 + j .Lmul_xnormalized + +.Lmul_yexpzero: + /* Clear the sign bit of y. */ + slli a3, a3, 1 + srli a3, a3, 1 + + /* If y is zero, return zero. */ + beqz a3, .Lmul_return_zero + + /* Normalize y. Adjust the exponent in a9. */ + do_nsau a10, a3, a11, a12 + addi a10, a10, -8 + ssl a10 + sll a3, a3 + movi a9, 1 + sub a9, a9, a10 + j .Lmul_ynormalized + +.Lmul_return_zero: + /* Return zero with the appropriate sign bit. */ + srli a2, a7, 31 + slli a2, a2, 31 + j .Lmul_done + +.Lmul_xnan_or_inf: + /* If y is zero, return NaN. */ + slli a8, a3, 1 + bnez a8, 1f + movi a4, 0x400000 /* make it a quiet NaN */ + or a2, a2, a4 + j .Lmul_done +1: + /* If y is NaN, return y. */ + bnall a3, a6, .Lmul_returnx + slli a8, a3, 9 + beqz a8, .Lmul_returnx + +.Lmul_returny: + mov a2, a3 + +.Lmul_returnx: + /* Set the sign bit and return. */ + extui a7, a7, 31, 1 + slli a2, a2, 1 + ssai 1 + src a2, a7, a2 + j .Lmul_done + +.Lmul_ynan_or_inf: + /* If x is zero, return NaN. */ + slli a8, a2, 1 + bnez a8, .Lmul_returny + movi a7, 0x400000 /* make it a quiet NaN */ + or a2, a3, a7 + j .Lmul_done + + .align 4 + .global __mulsf3 + .type __mulsf3, @function +__mulsf3: +#if __XTENSA_CALL0_ABI__ + leaf_entry sp, 32 + addi sp, sp, -32 + s32i a12, sp, 16 + s32i a13, sp, 20 + s32i a14, sp, 24 + s32i a15, sp, 28 +#elif XCHAL_NO_MUL + /* This is not really a leaf function; allocate enough stack space + to allow CALL12s to a helper function. */ + leaf_entry sp, 64 +#else + leaf_entry sp, 32 +#endif + movi a6, 0x7f800000 + + /* Get the sign of the result. */ + xor a7, a2, a3 + + /* Check for NaN and infinity. */ + ball a2, a6, .Lmul_xnan_or_inf + ball a3, a6, .Lmul_ynan_or_inf + + /* Extract the exponents. */ + extui a8, a2, 23, 8 + extui a9, a3, 23, 8 + + beqz a8, .Lmul_xexpzero +.Lmul_xnormalized: + beqz a9, .Lmul_yexpzero +.Lmul_ynormalized: + + /* Add the exponents. */ + add a8, a8, a9 + + /* Replace sign/exponent fields with explicit "1.0". */ + movi a10, 0xffffff + or a2, a2, a6 + and a2, a2, a10 + or a3, a3, a6 + and a3, a3, a10 + + /* Multiply 32x32 to 64 bits. The result ends up in a2/a6. */ + +#if XCHAL_HAVE_MUL32_HIGH + + mull a6, a2, a3 + muluh a2, a2, a3 + +#else + + /* Break the inputs into 16-bit chunks and compute 4 32-bit partial + products. These partial products are: + + 0 xl * yl + + 1 xl * yh + 2 xh * yl + + 3 xh * yh + + If using the Mul16 or Mul32 multiplier options, these input + chunks must be stored in separate registers. For Mac16, the + UMUL.AA.* opcodes can specify that the inputs come from either + half of the registers, so there is no need to shift them out + ahead of time. If there is no multiply hardware, the 16-bit + chunks can be extracted when setting up the arguments to the + separate multiply function. 
*/ + +#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL + /* Calling a separate multiply function will clobber a0 and requires + use of a8 as a temporary, so save those values now. (The function + uses a custom ABI so nothing else needs to be saved.) */ + s32i a0, sp, 0 + s32i a8, sp, 4 +#endif + +#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 + +#define a2h a4 +#define a3h a5 + + /* Get the high halves of the inputs into registers. */ + srli a2h, a2, 16 + srli a3h, a3, 16 + +#define a2l a2 +#define a3l a3 + +#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16 + /* Clear the high halves of the inputs. This does not matter + for MUL16 because the high bits are ignored. */ + extui a2, a2, 0, 16 + extui a3, a3, 0, 16 +#endif +#endif /* MUL16 || MUL32 */ + + +#if XCHAL_HAVE_MUL16 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mul16u dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MUL32 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mull dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MAC16 + +/* The preprocessor insists on inserting a space when concatenating after + a period in the definition of do_mul below. These macros are a workaround + using underscores instead of periods when doing the concatenation. */ +#define umul_aa_ll umul.aa.ll +#define umul_aa_lh umul.aa.lh +#define umul_aa_hl umul.aa.hl +#define umul_aa_hh umul.aa.hh + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + umul_aa_ ## xhalf ## yhalf xreg, yreg; \ + rsr dst, ACCLO + +#else /* no multiply hardware */ + +#define set_arg_l(dst, src) \ + extui dst, src, 0, 16 +#define set_arg_h(dst, src) \ + srli dst, src, 16 + +#if __XTENSA_CALL0_ABI__ +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a13, xreg); \ + set_arg_ ## yhalf (a14, yreg); \ + call0 .Lmul_mulsi3; \ + mov dst, a12 +#else +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a14, xreg); \ + set_arg_ ## yhalf (a15, yreg); \ + call12 .Lmul_mulsi3; \ + mov dst, a14 +#endif /* __XTENSA_CALL0_ABI__ */ + +#endif /* no multiply hardware */ + + /* Add pp1 and pp2 into a6 with carry-out in a9. */ + do_mul(a6, a2, l, a3, h) /* pp 1 */ + do_mul(a11, a2, h, a3, l) /* pp 2 */ + movi a9, 0 + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + /* Shift the high half of a9/a6 into position in a9. Note that + this value can be safely incremented without any carry-outs. */ + ssai 16 + src a9, a9, a6 + + /* Compute the low word into a6. */ + do_mul(a11, a2, l, a3, l) /* pp 0 */ + sll a6, a6 + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + /* Compute the high word into a2. */ + do_mul(a2, a2, h, a3, h) /* pp 3 */ + add a2, a2, a9 + +#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL + /* Restore values saved on the stack during the multiplication. */ + l32i a0, sp, 0 + l32i a8, sp, 4 +#endif +#endif /* ! XCHAL_HAVE_MUL32_HIGH */ + + /* Shift left by 9 bits, unless there was a carry-out from the + multiply, in which case, shift by 8 bits and increment the + exponent. */ + movi a4, 9 + srli a5, a2, 24 - 9 + beqz a5, 1f + addi a4, a4, -1 + addi a8, a8, 1 +1: ssl a4 + src a2, a2, a6 + sll a6, a6 + + /* Subtract the extra bias from the exponent sum (plus one to account + for the explicit "1.0" of the mantissa that will be added to the + exponent in the final result). */ + movi a4, 0x80 + sub a8, a8, a4 + + /* Check for over/underflow. The value in a8 is one less than the + final exponent, so values in the range 0..fd are OK here. */ + movi a4, 0xfe + bgeu a8, a4, .Lmul_overflow + +.Lmul_round: + /* Round. 
*/ + bgez a6, .Lmul_rounded + addi a2, a2, 1 + slli a6, a6, 1 + beqz a6, .Lmul_exactlyhalf + +.Lmul_rounded: + /* Add the exponent to the mantissa. */ + slli a8, a8, 23 + add a2, a2, a8 + +.Lmul_addsign: + /* Add the sign bit. */ + srli a7, a7, 31 + slli a7, a7, 31 + or a2, a2, a7 + +.Lmul_done: +#if __XTENSA_CALL0_ABI__ + l32i a12, sp, 16 + l32i a13, sp, 20 + l32i a14, sp, 24 + l32i a15, sp, 28 + addi sp, sp, 32 +#endif + leaf_return + +.Lmul_exactlyhalf: + /* Round down to the nearest even value. */ + srli a2, a2, 1 + slli a2, a2, 1 + j .Lmul_rounded + +.Lmul_overflow: + bltz a8, .Lmul_underflow + /* Return +/- Infinity. */ + movi a8, 0xff + slli a2, a8, 23 + j .Lmul_addsign + +.Lmul_underflow: + /* Create a subnormal value, where the exponent field contains zero, + but the effective exponent is 1. The value of a8 is one less than + the actual exponent, so just negate it to get the shift amount. */ + neg a8, a8 + mov a9, a6 + ssr a8 + bgeui a8, 32, .Lmul_flush_to_zero + + /* Shift a2 right. Any bits that are shifted out of a2 are saved + in a6 (combined with the shifted-out bits currently in a6) for + rounding the result. */ + sll a6, a2 + srl a2, a2 + + /* Set the exponent to zero. */ + movi a8, 0 + + /* Pack any nonzero bits shifted out into a6. */ + beqz a9, .Lmul_round + movi a9, 1 + or a6, a6, a9 + j .Lmul_round + +.Lmul_flush_to_zero: + /* Return zero with the appropriate sign bit. */ + srli a2, a7, 31 + slli a2, a2, 31 + j .Lmul_done + +#if XCHAL_NO_MUL + + /* For Xtensa processors with no multiply hardware, this simplified + version of _mulsi3 is used for multiplying 16-bit chunks of + the floating-point mantissas. When using CALL0, this function + uses a custom ABI: the inputs are passed in a13 and a14, the + result is returned in a12, and a8 and a15 are clobbered. */ + .align 4 +.Lmul_mulsi3: + leaf_entry sp, 16 + .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2 + movi \dst, 0 +1: add \tmp1, \src2, \dst + extui \tmp2, \src1, 0, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx2 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 1, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx4 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 2, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx8 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 3, 1 + movnez \dst, \tmp1, \tmp2 + + srli \src1, \src1, 4 + slli \src2, \src2, 4 + bnez \src1, 1b + .endm +#if __XTENSA_CALL0_ABI__ + mul_mulsi3_body a12, a13, a14, a15, a8 +#else + /* The result will be written into a2, so save that argument in a4. */ + mov a4, a2 + mul_mulsi3_body a2, a4, a3, a5, a6 +#endif + leaf_return +#endif /* XCHAL_NO_MUL */ +#endif /* L_mulsf3 */ + +#ifdef L_divsf3 + + /* Division */ +__divsf3_aux: + + /* Handle unusual cases (zeros, subnormals, NaNs and Infinities). + (This code is placed before the start of the function just to + keep it in range of the limited branch displacements.) */ + +.Ldiv_yexpzero: + /* Clear the sign bit of y. */ + slli a3, a3, 1 + srli a3, a3, 1 + + /* Check for division by zero. */ + beqz a3, .Ldiv_yzero + + /* Normalize y. Adjust the exponent in a9. */ + do_nsau a10, a3, a4, a5 + addi a10, a10, -8 + ssl a10 + sll a3, a3 + movi a9, 1 + sub a9, a9, a10 + j .Ldiv_ynormalized + +.Ldiv_yzero: + /* y is zero. Return NaN if x is also zero; otherwise, infinity. */ + slli a4, a2, 1 + srli a4, a4, 1 + srli a2, a7, 31 + slli a2, a2, 31 + or a2, a2, a6 + bnez a4, 1f + movi a4, 0x400000 /* make it a quiet NaN */ + or a2, a2, a4 +1: leaf_return + +.Ldiv_xexpzero: + /* Clear the sign bit of x. 
*/ + slli a2, a2, 1 + srli a2, a2, 1 + + /* If x is zero, return zero. */ + beqz a2, .Ldiv_return_zero + + /* Normalize x. Adjust the exponent in a8. */ + do_nsau a10, a2, a4, a5 + addi a10, a10, -8 + ssl a10 + sll a2, a2 + movi a8, 1 + sub a8, a8, a10 + j .Ldiv_xnormalized + +.Ldiv_return_zero: + /* Return zero with the appropriate sign bit. */ + srli a2, a7, 31 + slli a2, a2, 31 + leaf_return + +.Ldiv_xnan_or_inf: + /* Set the sign bit of the result. */ + srli a7, a3, 31 + slli a7, a7, 31 + xor a2, a2, a7 + /* If y is NaN or Inf, return NaN. */ + bnall a3, a6, 1f + movi a4, 0x400000 /* make it a quiet NaN */ + or a2, a2, a4 +1: leaf_return + +.Ldiv_ynan_or_inf: + /* If y is Infinity, return zero. */ + slli a8, a3, 9 + beqz a8, .Ldiv_return_zero + /* y is NaN; return it. */ + mov a2, a3 + leaf_return + + .align 4 + .global __divsf3 + .type __divsf3, @function +__divsf3: + leaf_entry sp, 16 + movi a6, 0x7f800000 + + /* Get the sign of the result. */ + xor a7, a2, a3 + + /* Check for NaN and infinity. */ + ball a2, a6, .Ldiv_xnan_or_inf + ball a3, a6, .Ldiv_ynan_or_inf + + /* Extract the exponents. */ + extui a8, a2, 23, 8 + extui a9, a3, 23, 8 + + beqz a9, .Ldiv_yexpzero +.Ldiv_ynormalized: + beqz a8, .Ldiv_xexpzero +.Ldiv_xnormalized: + + /* Subtract the exponents. */ + sub a8, a8, a9 + + /* Replace sign/exponent fields with explicit "1.0". */ + movi a10, 0xffffff + or a2, a2, a6 + and a2, a2, a10 + or a3, a3, a6 + and a3, a3, a10 + + /* The first digit of the mantissa division must be a one. + Shift x (and adjust the exponent) as needed to make this true. */ + bltu a3, a2, 1f + slli a2, a2, 1 + addi a8, a8, -1 +1: + /* Do the first subtraction and shift. */ + sub a2, a2, a3 + slli a2, a2, 1 + + /* Put the quotient into a10. */ + movi a10, 1 + + /* Divide one bit at a time for 23 bits. */ + movi a9, 23 +#if XCHAL_HAVE_LOOPS + loop a9, .Ldiv_loopend +#endif +.Ldiv_loop: + /* Shift the quotient << 1. */ + slli a10, a10, 1 + + /* Is this digit a 0 or 1? */ + bltu a2, a3, 1f + + /* Output a 1 and subtract. */ + addi a10, a10, 1 + sub a2, a2, a3 + + /* Shift the dividend << 1. */ +1: slli a2, a2, 1 + +#if !XCHAL_HAVE_LOOPS + addi a9, a9, -1 + bnez a9, .Ldiv_loop +#endif +.Ldiv_loopend: + + /* Add the exponent bias (less one to account for the explicit "1.0" + of the mantissa that will be added to the exponent in the final + result). */ + addi a8, a8, 0x7e + + /* Check for over/underflow. The value in a8 is one less than the + final exponent, so values in the range 0..fd are OK here. */ + movi a4, 0xfe + bgeu a8, a4, .Ldiv_overflow + +.Ldiv_round: + /* Round. The remainder (<< 1) is in a2. */ + bltu a2, a3, .Ldiv_rounded + addi a10, a10, 1 + beq a2, a3, .Ldiv_exactlyhalf + +.Ldiv_rounded: + /* Add the exponent to the mantissa. */ + slli a8, a8, 23 + add a2, a10, a8 + +.Ldiv_addsign: + /* Add the sign bit. */ + srli a7, a7, 31 + slli a7, a7, 31 + or a2, a2, a7 + leaf_return + +.Ldiv_overflow: + bltz a8, .Ldiv_underflow + /* Return +/- Infinity. */ + addi a8, a4, 1 /* 0xff */ + slli a2, a8, 23 + j .Ldiv_addsign + +.Ldiv_exactlyhalf: + /* Remainder is exactly half the divisor. Round even. */ + srli a10, a10, 1 + slli a10, a10, 1 + j .Ldiv_rounded + +.Ldiv_underflow: + /* Create a subnormal value, where the exponent field contains zero, + but the effective exponent is 1. The value of a8 is one less than + the actual exponent, so just negate it to get the shift amount. */ + neg a8, a8 + ssr a8 + bgeui a8, 32, .Ldiv_flush_to_zero + + /* Shift a10 right. 
Any bits that are shifted out of a10 are + saved in a6 for rounding the result. */ + sll a6, a10 + srl a10, a10 + + /* Set the exponent to zero. */ + movi a8, 0 + + /* Pack any nonzero remainder (in a2) into a6. */ + beqz a2, 1f + movi a9, 1 + or a6, a6, a9 + + /* Round a10 based on the bits shifted out into a6. */ +1: bgez a6, .Ldiv_rounded + addi a10, a10, 1 + slli a6, a6, 1 + bnez a6, .Ldiv_rounded + srli a10, a10, 1 + slli a10, a10, 1 + j .Ldiv_rounded + +.Ldiv_flush_to_zero: + /* Return zero with the appropriate sign bit. */ + srli a2, a7, 31 + slli a2, a2, 31 + leaf_return + +#endif /* L_divsf3 */ + +#ifdef L_cmpsf2 + + /* Equal and Not Equal */ + + .align 4 + .global __eqsf2 + .global __nesf2 + .set __nesf2, __eqsf2 + .type __eqsf2, @function +__eqsf2: + leaf_entry sp, 16 + bne a2, a3, 4f + + /* The values are equal but NaN != NaN. Check the exponent. */ + movi a6, 0x7f800000 + ball a2, a6, 3f + + /* Equal. */ + movi a2, 0 + leaf_return + + /* Not equal. */ +2: movi a2, 1 + leaf_return + + /* Check if the mantissas are nonzero. */ +3: slli a7, a2, 9 + j 5f + + /* Check if x and y are zero with different signs. */ +4: or a7, a2, a3 + slli a7, a7, 1 + + /* Equal if a7 == 0, where a7 is either abs(x | y) or the mantissa + or x when exponent(x) = 0x7f8 and x == y. */ +5: movi a2, 0 + movi a3, 1 + movnez a2, a3, a7 + leaf_return + + + /* Greater Than */ + + .align 4 + .global __gtsf2 + .type __gtsf2, @function +__gtsf2: + leaf_entry sp, 16 + movi a6, 0x7f800000 + ball a2, a6, 2f +1: bnall a3, a6, .Lle_cmp + + /* Check if y is a NaN. */ + slli a7, a3, 9 + beqz a7, .Lle_cmp + movi a2, 0 + leaf_return + + /* Check if x is a NaN. */ +2: slli a7, a2, 9 + beqz a7, 1b + movi a2, 0 + leaf_return + + + /* Less Than or Equal */ + + .align 4 + .global __lesf2 + .type __lesf2, @function +__lesf2: + leaf_entry sp, 16 + movi a6, 0x7f800000 + ball a2, a6, 2f +1: bnall a3, a6, .Lle_cmp + + /* Check if y is a NaN. */ + slli a7, a3, 9 + beqz a7, .Lle_cmp + movi a2, 1 + leaf_return + + /* Check if x is a NaN. */ +2: slli a7, a2, 9 + beqz a7, 1b + movi a2, 1 + leaf_return + +.Lle_cmp: + /* Check if x and y have different signs. */ + xor a7, a2, a3 + bltz a7, .Lle_diff_signs + + /* Check if x is negative. */ + bltz a2, .Lle_xneg + + /* Check if x <= y. */ + bltu a3, a2, 5f +4: movi a2, 0 + leaf_return + +.Lle_xneg: + /* Check if y <= x. */ + bgeu a2, a3, 4b +5: movi a2, 1 + leaf_return + +.Lle_diff_signs: + bltz a2, 4b + + /* Check if both x and y are zero. */ + or a7, a2, a3 + slli a7, a7, 1 + movi a2, 1 + movi a3, 0 + moveqz a2, a3, a7 + leaf_return + + + /* Greater Than or Equal */ + + .align 4 + .global __gesf2 + .type __gesf2, @function +__gesf2: + leaf_entry sp, 16 + movi a6, 0x7f800000 + ball a2, a6, 2f +1: bnall a3, a6, .Llt_cmp + + /* Check if y is a NaN. */ + slli a7, a3, 9 + beqz a7, .Llt_cmp + movi a2, -1 + leaf_return + + /* Check if x is a NaN. */ +2: slli a7, a2, 9 + beqz a7, 1b + movi a2, -1 + leaf_return + + + /* Less Than */ + + .align 4 + .global __ltsf2 + .type __ltsf2, @function +__ltsf2: + leaf_entry sp, 16 + movi a6, 0x7f800000 + ball a2, a6, 2f +1: bnall a3, a6, .Llt_cmp + + /* Check if y is a NaN. */ + slli a7, a3, 9 + beqz a7, .Llt_cmp + movi a2, 0 + leaf_return + + /* Check if x is a NaN. */ +2: slli a7, a2, 9 + beqz a7, 1b + movi a2, 0 + leaf_return + +.Llt_cmp: + /* Check if x and y have different signs. */ + xor a7, a2, a3 + bltz a7, .Llt_diff_signs + + /* Check if x is negative. */ + bltz a2, .Llt_xneg + + /* Check if x < y. 
*/ + bgeu a2, a3, 5f +4: movi a2, -1 + leaf_return + +.Llt_xneg: + /* Check if y < x. */ + bltu a3, a2, 4b +5: movi a2, 0 + leaf_return + +.Llt_diff_signs: + bgez a2, 5b + + /* Check if both x and y are nonzero. */ + or a7, a2, a3 + slli a7, a7, 1 + movi a2, 0 + movi a3, -1 + movnez a2, a3, a7 + leaf_return + + + /* Unordered */ + + .align 4 + .global __unordsf2 + .type __unordsf2, @function +__unordsf2: + leaf_entry sp, 16 + movi a6, 0x7f800000 + ball a2, a6, 3f +1: ball a3, a6, 4f +2: movi a2, 0 + leaf_return + +3: slli a7, a2, 9 + beqz a7, 1b + movi a2, 1 + leaf_return + +4: slli a7, a3, 9 + beqz a7, 2b + movi a2, 1 + leaf_return + +#endif /* L_cmpsf2 */ + +#ifdef L_fixsfsi + + .align 4 + .global __fixsfsi + .type __fixsfsi, @function +__fixsfsi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7f800000 + ball a2, a6, .Lfixsfsi_nan_or_inf + + /* Extract the exponent and check if 0 < (exp - 0x7e) < 32. */ + extui a4, a2, 23, 8 + addi a4, a4, -0x7e + bgei a4, 32, .Lfixsfsi_maxint + blti a4, 1, .Lfixsfsi_zero + + /* Add explicit "1.0" and shift << 8. */ + or a7, a2, a6 + slli a5, a7, 8 + + /* Shift back to the right, based on the exponent. */ + ssl a4 /* shift by 32 - a4 */ + srl a5, a5 + + /* Negate the result if sign != 0. */ + neg a2, a5 + movgez a2, a5, a7 + leaf_return + +.Lfixsfsi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, a2, 9 + beqz a4, .Lfixsfsi_maxint + + /* Translate NaN to +maxint. */ + movi a2, 0 + +.Lfixsfsi_maxint: + slli a4, a6, 8 /* 0x80000000 */ + addi a5, a4, -1 /* 0x7fffffff */ + movgez a4, a5, a2 + mov a2, a4 + leaf_return + +.Lfixsfsi_zero: + movi a2, 0 + leaf_return + +#endif /* L_fixsfsi */ + +#ifdef L_fixsfdi + + .align 4 + .global __fixsfdi + .type __fixsfdi, @function +__fixsfdi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7f800000 + ball a2, a6, .Lfixsfdi_nan_or_inf + + /* Extract the exponent and check if 0 < (exp - 0x7e) < 64. */ + extui a4, a2, 23, 8 + addi a4, a4, -0x7e + bgei a4, 64, .Lfixsfdi_maxint + blti a4, 1, .Lfixsfdi_zero + + /* Add explicit "1.0" and shift << 8. */ + or a7, a2, a6 + slli xh, a7, 8 + + /* Shift back to the right, based on the exponent. */ + ssl a4 /* shift by 64 - a4 */ + bgei a4, 32, .Lfixsfdi_smallshift + srl xl, xh + movi xh, 0 + +.Lfixsfdi_shifted: + /* Negate the result if sign != 0. */ + bgez a7, 1f + neg xl, xl + neg xh, xh + beqz xl, 1f + addi xh, xh, -1 +1: leaf_return + +.Lfixsfdi_smallshift: + movi xl, 0 + sll xl, xh + srl xh, xh + j .Lfixsfdi_shifted + +.Lfixsfdi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, a2, 9 + beqz a4, .Lfixsfdi_maxint + + /* Translate NaN to +maxint. */ + movi a2, 0 + +.Lfixsfdi_maxint: + slli a7, a6, 8 /* 0x80000000 */ + bgez a2, 1f + mov xh, a7 + movi xl, 0 + leaf_return + +1: addi xh, a7, -1 /* 0x7fffffff */ + movi xl, -1 + leaf_return + +.Lfixsfdi_zero: + movi xh, 0 + movi xl, 0 + leaf_return + +#endif /* L_fixsfdi */ + +#ifdef L_fixunssfsi + + .align 4 + .global __fixunssfsi + .type __fixunssfsi, @function +__fixunssfsi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7f800000 + ball a2, a6, .Lfixunssfsi_nan_or_inf + + /* Extract the exponent and check if 0 <= (exp - 0x7f) < 32. */ + extui a4, a2, 23, 8 + addi a4, a4, -0x7f + bgei a4, 32, .Lfixunssfsi_maxint + bltz a4, .Lfixunssfsi_zero + + /* Add explicit "1.0" and shift << 8. */ + or a7, a2, a6 + slli a5, a7, 8 + + /* Shift back to the right, based on the exponent. 
*/ + addi a4, a4, 1 + beqi a4, 32, .Lfixunssfsi_bigexp + ssl a4 /* shift by 32 - a4 */ + srl a5, a5 + + /* Negate the result if sign != 0. */ + neg a2, a5 + movgez a2, a5, a7 + leaf_return + +.Lfixunssfsi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, a2, 9 + beqz a4, .Lfixunssfsi_maxint + + /* Translate NaN to 0xffffffff. */ + movi a2, -1 + leaf_return + +.Lfixunssfsi_maxint: + slli a4, a6, 8 /* 0x80000000 */ + movi a5, -1 /* 0xffffffff */ + movgez a4, a5, a2 + mov a2, a4 + leaf_return + +.Lfixunssfsi_zero: + movi a2, 0 + leaf_return + +.Lfixunssfsi_bigexp: + /* Handle unsigned maximum exponent case. */ + bltz a2, 1f + mov a2, a5 /* no shift needed */ + leaf_return + + /* Return 0x80000000 if negative. */ +1: slli a2, a6, 8 + leaf_return + +#endif /* L_fixunssfsi */ + +#ifdef L_fixunssfdi + + .align 4 + .global __fixunssfdi + .type __fixunssfdi, @function +__fixunssfdi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7f800000 + ball a2, a6, .Lfixunssfdi_nan_or_inf + + /* Extract the exponent and check if 0 <= (exp - 0x7f) < 64. */ + extui a4, a2, 23, 8 + addi a4, a4, -0x7f + bgei a4, 64, .Lfixunssfdi_maxint + bltz a4, .Lfixunssfdi_zero + + /* Add explicit "1.0" and shift << 8. */ + or a7, a2, a6 + slli xh, a7, 8 + + /* Shift back to the right, based on the exponent. */ + addi a4, a4, 1 + beqi a4, 64, .Lfixunssfdi_bigexp + ssl a4 /* shift by 64 - a4 */ + bgei a4, 32, .Lfixunssfdi_smallshift + srl xl, xh + movi xh, 0 + +.Lfixunssfdi_shifted: + /* Negate the result if sign != 0. */ + bgez a7, 1f + neg xl, xl + neg xh, xh + beqz xl, 1f + addi xh, xh, -1 +1: leaf_return + +.Lfixunssfdi_smallshift: + movi xl, 0 + src xl, xh, xl + srl xh, xh + j .Lfixunssfdi_shifted + +.Lfixunssfdi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, a2, 9 + beqz a4, .Lfixunssfdi_maxint + + /* Translate NaN to 0xffffffff.... */ +1: movi xh, -1 + movi xl, -1 + leaf_return + +.Lfixunssfdi_maxint: + bgez a2, 1b +2: slli xh, a6, 8 /* 0x80000000 */ + movi xl, 0 + leaf_return + +.Lfixunssfdi_zero: + movi xh, 0 + movi xl, 0 + leaf_return + +.Lfixunssfdi_bigexp: + /* Handle unsigned maximum exponent case. */ + bltz a7, 2b + movi xl, 0 + leaf_return /* no shift needed */ + +#endif /* L_fixunssfdi */ + +#ifdef L_floatsisf + + .align 4 + .global __floatunsisf + .type __floatunsisf, @function +__floatunsisf: + leaf_entry sp, 16 + beqz a2, .Lfloatsisf_return + + /* Set the sign to zero and jump to the floatsisf code. */ + movi a7, 0 + j .Lfloatsisf_normalize + + .align 4 + .global __floatsisf + .type __floatsisf, @function +__floatsisf: + leaf_entry sp, 16 + + /* Check for zero. */ + beqz a2, .Lfloatsisf_return + + /* Save the sign. */ + extui a7, a2, 31, 1 + + /* Get the absolute value. */ +#if XCHAL_HAVE_ABS + abs a2, a2 +#else + neg a4, a2 + movltz a2, a4, a2 +#endif + +.Lfloatsisf_normalize: + /* Normalize with the first 1 bit in the msb. */ + do_nsau a4, a2, a5, a6 + ssl a4 + sll a5, a2 + + /* Shift the mantissa into position, with rounding bits in a6. */ + srli a2, a5, 8 + slli a6, a5, (32 - 8) + + /* Set the exponent. */ + movi a5, 0x9d /* 0x7e + 31 */ + sub a5, a5, a4 + slli a5, a5, 23 + add a2, a2, a5 + + /* Add the sign. */ + slli a7, a7, 31 + or a2, a2, a7 + + /* Round up if the leftover fraction is >= 1/2. */ + bgez a6, .Lfloatsisf_return + addi a2, a2, 1 /* Overflow to the exponent is OK. */ + + /* Check if the leftover fraction is exactly 1/2. 
*/ + slli a6, a6, 1 + beqz a6, .Lfloatsisf_exactlyhalf + +.Lfloatsisf_return: + leaf_return + +.Lfloatsisf_exactlyhalf: + /* Round down to the nearest even value. */ + srli a2, a2, 1 + slli a2, a2, 1 + leaf_return + +#endif /* L_floatsisf */ + +#ifdef L_floatdisf + + .align 4 + .global __floatundisf + .type __floatundisf, @function +__floatundisf: + leaf_entry sp, 16 + + /* Check for zero. */ + or a4, xh, xl + beqz a4, 2f + + /* Set the sign to zero and jump to the floatdisf code. */ + movi a7, 0 + j .Lfloatdisf_normalize + + .align 4 + .global __floatdisf + .type __floatdisf, @function +__floatdisf: + leaf_entry sp, 16 + + /* Check for zero. */ + or a4, xh, xl + beqz a4, 2f + + /* Save the sign. */ + extui a7, xh, 31, 1 + + /* Get the absolute value. */ + bgez xh, .Lfloatdisf_normalize + neg xl, xl + neg xh, xh + beqz xl, .Lfloatdisf_normalize + addi xh, xh, -1 + +.Lfloatdisf_normalize: + /* Normalize with the first 1 bit in the msb of xh. */ + beqz xh, .Lfloatdisf_bigshift + do_nsau a4, xh, a5, a6 + ssl a4 + src xh, xh, xl + sll xl, xl + +.Lfloatdisf_shifted: + /* Shift the mantissa into position, with rounding bits in a6. */ + ssai 8 + sll a5, xl + src a6, xh, xl + srl xh, xh + beqz a5, 1f + movi a5, 1 + or a6, a6, a5 +1: + /* Set the exponent. */ + movi a5, 0xbd /* 0x7e + 63 */ + sub a5, a5, a4 + slli a5, a5, 23 + add a2, xh, a5 + + /* Add the sign. */ + slli a7, a7, 31 + or a2, a2, a7 + + /* Round up if the leftover fraction is >= 1/2. */ + bgez a6, 2f + addi a2, a2, 1 /* Overflow to the exponent is OK. */ + + /* Check if the leftover fraction is exactly 1/2. */ + slli a6, a6, 1 + beqz a6, .Lfloatdisf_exactlyhalf +2: leaf_return + +.Lfloatdisf_bigshift: + /* xh is zero. Normalize with first 1 bit of xl in the msb of xh. */ + do_nsau a4, xl, a5, a6 + ssl a4 + sll xh, xl + movi xl, 0 + addi a4, a4, 32 + j .Lfloatdisf_shifted + +.Lfloatdisf_exactlyhalf: + /* Round down to the nearest even value. */ + srli a2, a2, 1 + slli a2, a2, 1 + leaf_return + +#endif /* L_floatdisf */ diff --git a/libgcc/config/xtensa/lib1funcs.S b/libgcc/config/xtensa/lib1funcs.S new file mode 100644 index 0000000..071b917 --- /dev/null +++ b/libgcc/config/xtensa/lib1funcs.S @@ -0,0 +1,845 @@ +/* Assembly functions for the Xtensa version of libgcc1. + Copyright (C) 2001, 2002, 2003, 2005, 2006, 2007, 2009 + Free Software Foundation, Inc. + Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +#include "xtensa-config.h" + +/* Define macros for the ABS and ADDX* instructions to handle cases + where they are not included in the Xtensa processor configuration. 
*/ + + .macro do_abs dst, src, tmp +#if XCHAL_HAVE_ABS + abs \dst, \src +#else + neg \tmp, \src + movgez \tmp, \src, \src + mov \dst, \tmp +#endif + .endm + + .macro do_addx2 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx2 \dst, \as, \at +#else + slli \tmp, \as, 1 + add \dst, \tmp, \at +#endif + .endm + + .macro do_addx4 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx4 \dst, \as, \at +#else + slli \tmp, \as, 2 + add \dst, \tmp, \at +#endif + .endm + + .macro do_addx8 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx8 \dst, \as, \at +#else + slli \tmp, \as, 3 + add \dst, \tmp, \at +#endif + .endm + +/* Define macros for leaf function entry and return, supporting either the + standard register windowed ABI or the non-windowed call0 ABI. These + macros do not allocate any extra stack space, so they only work for + leaf functions that do not need to spill anything to the stack. */ + + .macro leaf_entry reg, size +#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__ + entry \reg, \size +#else + /* do nothing */ +#endif + .endm + + .macro leaf_return +#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__ + retw +#else + ret +#endif + .endm + + +#ifdef L_mulsi3 + .align 4 + .global __mulsi3 + .type __mulsi3, @function +__mulsi3: + leaf_entry sp, 16 + +#if XCHAL_HAVE_MUL32 + mull a2, a2, a3 + +#elif XCHAL_HAVE_MUL16 + or a4, a2, a3 + srai a4, a4, 16 + bnez a4, .LMUL16 + mul16u a2, a2, a3 + leaf_return +.LMUL16: + srai a4, a2, 16 + srai a5, a3, 16 + mul16u a7, a4, a3 + mul16u a6, a5, a2 + mul16u a4, a2, a3 + add a7, a7, a6 + slli a7, a7, 16 + add a2, a7, a4 + +#elif XCHAL_HAVE_MAC16 + mul.aa.hl a2, a3 + mula.aa.lh a2, a3 + rsr a5, ACCLO + umul.aa.ll a2, a3 + rsr a4, ACCLO + slli a5, a5, 16 + add a2, a4, a5 + +#else /* !MUL32 && !MUL16 && !MAC16 */ + + /* Multiply one bit at a time, but unroll the loop 4x to better + exploit the addx instructions and avoid overhead. + Peel the first iteration to save a cycle on init. */ + + /* Avoid negative numbers. */ + xor a5, a2, a3 /* Top bit is 1 if one input is negative. */ + do_abs a3, a3, a6 + do_abs a2, a2, a6 + + /* Swap so the second argument is smaller. */ + sub a7, a2, a3 + mov a4, a3 + movgez a4, a2, a7 /* a4 = max (a2, a3) */ + movltz a3, a2, a7 /* a3 = min (a2, a3) */ + + movi a2, 0 + extui a6, a3, 0, 1 + movnez a2, a4, a6 + + do_addx2 a7, a4, a2, a7 + extui a6, a3, 1, 1 + movnez a2, a7, a6 + + do_addx4 a7, a4, a2, a7 + extui a6, a3, 2, 1 + movnez a2, a7, a6 + + do_addx8 a7, a4, a2, a7 + extui a6, a3, 3, 1 + movnez a2, a7, a6 + + bgeui a3, 16, .Lmult_main_loop + neg a3, a2 + movltz a2, a3, a5 + leaf_return + + .align 4 +.Lmult_main_loop: + srli a3, a3, 4 + slli a4, a4, 4 + + add a7, a4, a2 + extui a6, a3, 0, 1 + movnez a2, a7, a6 + + do_addx2 a7, a4, a2, a7 + extui a6, a3, 1, 1 + movnez a2, a7, a6 + + do_addx4 a7, a4, a2, a7 + extui a6, a3, 2, 1 + movnez a2, a7, a6 + + do_addx8 a7, a4, a2, a7 + extui a6, a3, 3, 1 + movnez a2, a7, a6 + + bgeui a3, 16, .Lmult_main_loop + + neg a3, a2 + movltz a2, a3, a5 + +#endif /* !MUL32 && !MUL16 && !MAC16 */ + + leaf_return + .size __mulsi3, . 
- __mulsi3 + +#endif /* L_mulsi3 */ + + +#ifdef L_umulsidi3 + +#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16 +#define XCHAL_NO_MUL 1 +#endif + + .align 4 + .global __umulsidi3 + .type __umulsidi3, @function +__umulsidi3: +#if __XTENSA_CALL0_ABI__ + leaf_entry sp, 32 + addi sp, sp, -32 + s32i a12, sp, 16 + s32i a13, sp, 20 + s32i a14, sp, 24 + s32i a15, sp, 28 +#elif XCHAL_NO_MUL + /* This is not really a leaf function; allocate enough stack space + to allow CALL12s to a helper function. */ + leaf_entry sp, 48 +#else + leaf_entry sp, 16 +#endif + +#ifdef __XTENSA_EB__ +#define wh a2 +#define wl a3 +#else +#define wh a3 +#define wl a2 +#endif /* __XTENSA_EB__ */ + + /* This code is taken from the mulsf3 routine in ieee754-sf.S. + See more comments there. */ + +#if XCHAL_HAVE_MUL32_HIGH + mull a6, a2, a3 + muluh wh, a2, a3 + mov wl, a6 + +#else /* ! MUL32_HIGH */ + +#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL + /* a0 and a8 will be clobbered by calling the multiply function + but a8 is not used here and need not be saved. */ + s32i a0, sp, 0 +#endif + +#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 + +#define a2h a4 +#define a3h a5 + + /* Get the high halves of the inputs into registers. */ + srli a2h, a2, 16 + srli a3h, a3, 16 + +#define a2l a2 +#define a3l a3 + +#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16 + /* Clear the high halves of the inputs. This does not matter + for MUL16 because the high bits are ignored. */ + extui a2, a2, 0, 16 + extui a3, a3, 0, 16 +#endif +#endif /* MUL16 || MUL32 */ + + +#if XCHAL_HAVE_MUL16 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mul16u dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MUL32 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mull dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MAC16 + +/* The preprocessor insists on inserting a space when concatenating after + a period in the definition of do_mul below. These macros are a workaround + using underscores instead of periods when doing the concatenation. */ +#define umul_aa_ll umul.aa.ll +#define umul_aa_lh umul.aa.lh +#define umul_aa_hl umul.aa.hl +#define umul_aa_hh umul.aa.hh + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + umul_aa_ ## xhalf ## yhalf xreg, yreg; \ + rsr dst, ACCLO + +#else /* no multiply hardware */ + +#define set_arg_l(dst, src) \ + extui dst, src, 0, 16 +#define set_arg_h(dst, src) \ + srli dst, src, 16 + +#if __XTENSA_CALL0_ABI__ +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a13, xreg); \ + set_arg_ ## yhalf (a14, yreg); \ + call0 .Lmul_mulsi3; \ + mov dst, a12 +#else +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a14, xreg); \ + set_arg_ ## yhalf (a15, yreg); \ + call12 .Lmul_mulsi3; \ + mov dst, a14 +#endif /* __XTENSA_CALL0_ABI__ */ + +#endif /* no multiply hardware */ + + /* Add pp1 and pp2 into a6 with carry-out in a9. */ + do_mul(a6, a2, l, a3, h) /* pp 1 */ + do_mul(a11, a2, h, a3, l) /* pp 2 */ + movi a9, 0 + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + /* Shift the high half of a9/a6 into position in a9. Note that + this value can be safely incremented without any carry-outs. */ + ssai 16 + src a9, a9, a6 + + /* Compute the low word into a6. */ + do_mul(a11, a2, l, a3, l) /* pp 0 */ + sll a6, a6 + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + /* Compute the high word into wh. */ + do_mul(wh, a2, h, a3, h) /* pp 3 */ + add wh, wh, a9 + mov wl, a6 + +#endif /* !MUL32_HIGH */ + +#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL + /* Restore the original return address. 
*/ + l32i a0, sp, 0 +#endif +#if __XTENSA_CALL0_ABI__ + l32i a12, sp, 16 + l32i a13, sp, 20 + l32i a14, sp, 24 + l32i a15, sp, 28 + addi sp, sp, 32 +#endif + leaf_return + +#if XCHAL_NO_MUL + + /* For Xtensa processors with no multiply hardware, this simplified + version of _mulsi3 is used for multiplying 16-bit chunks of + the floating-point mantissas. When using CALL0, this function + uses a custom ABI: the inputs are passed in a13 and a14, the + result is returned in a12, and a8 and a15 are clobbered. */ + .align 4 +.Lmul_mulsi3: + leaf_entry sp, 16 + .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2 + movi \dst, 0 +1: add \tmp1, \src2, \dst + extui \tmp2, \src1, 0, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx2 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 1, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx4 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 2, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx8 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 3, 1 + movnez \dst, \tmp1, \tmp2 + + srli \src1, \src1, 4 + slli \src2, \src2, 4 + bnez \src1, 1b + .endm +#if __XTENSA_CALL0_ABI__ + mul_mulsi3_body a12, a13, a14, a15, a8 +#else + /* The result will be written into a2, so save that argument in a4. */ + mov a4, a2 + mul_mulsi3_body a2, a4, a3, a5, a6 +#endif + leaf_return +#endif /* XCHAL_NO_MUL */ + + .size __umulsidi3, . - __umulsidi3 + +#endif /* L_umulsidi3 */ + + +/* Define a macro for the NSAU (unsigned normalize shift amount) + instruction, which computes the number of leading zero bits, + to handle cases where it is not included in the Xtensa processor + configuration. */ + + .macro do_nsau cnt, val, tmp, a +#if XCHAL_HAVE_NSA + nsau \cnt, \val +#else + mov \a, \val + movi \cnt, 0 + extui \tmp, \a, 16, 16 + bnez \tmp, 0f + movi \cnt, 16 + slli \a, \a, 16 +0: + extui \tmp, \a, 24, 8 + bnez \tmp, 1f + addi \cnt, \cnt, 8 + slli \a, \a, 8 +1: + movi \tmp, __nsau_data + extui \a, \a, 24, 8 + add \tmp, \tmp, \a + l8ui \tmp, \tmp, 0 + add \cnt, \cnt, \tmp +#endif /* !XCHAL_HAVE_NSA */ + .endm + +#ifdef L_clz + .section .rodata + .align 4 + .global __nsau_data + .type __nsau_data, @object +__nsau_data: +#if !XCHAL_HAVE_NSA + .byte 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 + .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 + .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +#endif /* !XCHAL_HAVE_NSA */ + .size __nsau_data, . - __nsau_data + .hidden __nsau_data +#endif /* L_clz */ + + +#ifdef L_clzsi2 + .align 4 + .global __clzsi2 + .type __clzsi2, @function +__clzsi2: + leaf_entry sp, 16 + do_nsau a2, a2, a3, a4 + leaf_return + .size __clzsi2, . 
- __clzsi2 + +#endif /* L_clzsi2 */ + + +#ifdef L_ctzsi2 + .align 4 + .global __ctzsi2 + .type __ctzsi2, @function +__ctzsi2: + leaf_entry sp, 16 + neg a3, a2 + and a3, a3, a2 + do_nsau a2, a3, a4, a5 + neg a2, a2 + addi a2, a2, 31 + leaf_return + .size __ctzsi2, . - __ctzsi2 + +#endif /* L_ctzsi2 */ + + +#ifdef L_ffssi2 + .align 4 + .global __ffssi2 + .type __ffssi2, @function +__ffssi2: + leaf_entry sp, 16 + neg a3, a2 + and a3, a3, a2 + do_nsau a2, a3, a4, a5 + neg a2, a2 + addi a2, a2, 32 + leaf_return + .size __ffssi2, . - __ffssi2 + +#endif /* L_ffssi2 */ + + +#ifdef L_udivsi3 + .align 4 + .global __udivsi3 + .type __udivsi3, @function +__udivsi3: + leaf_entry sp, 16 +#if XCHAL_HAVE_DIV32 + quou a2, a2, a3 +#else + bltui a3, 2, .Lle_one /* check if the divisor <= 1 */ + + mov a6, a2 /* keep dividend in a6 */ + do_nsau a5, a6, a2, a7 /* dividend_shift = nsau (dividend) */ + do_nsau a4, a3, a2, a7 /* divisor_shift = nsau (divisor) */ + bgeu a5, a4, .Lspecial + + sub a4, a4, a5 /* count = divisor_shift - dividend_shift */ + ssl a4 + sll a3, a3 /* divisor <<= count */ + movi a2, 0 /* quotient = 0 */ + + /* test-subtract-and-shift loop; one quotient bit on each iteration */ +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lloopend +#endif /* XCHAL_HAVE_LOOPS */ +.Lloop: + bltu a6, a3, .Lzerobit + sub a6, a6, a3 + addi a2, a2, 1 +.Lzerobit: + slli a2, a2, 1 + srli a3, a3, 1 +#if !XCHAL_HAVE_LOOPS + addi a4, a4, -1 + bnez a4, .Lloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lloopend: + + bltu a6, a3, .Lreturn + addi a2, a2, 1 /* increment quotient if dividend >= divisor */ +.Lreturn: + leaf_return + +.Lle_one: + beqz a3, .Lerror /* if divisor == 1, return the dividend */ + leaf_return + +.Lspecial: + /* return dividend >= divisor */ + bltu a6, a3, .Lreturn0 + movi a2, 1 + leaf_return + +.Lerror: + /* Divide by zero: Use an illegal instruction to force an exception. + The subsequent "DIV0" string can be recognized by the exception + handler to identify the real cause of the exception. */ + ill + .ascii "DIV0" + +.Lreturn0: + movi a2, 0 +#endif /* XCHAL_HAVE_DIV32 */ + leaf_return + .size __udivsi3, . - __udivsi3 + +#endif /* L_udivsi3 */ + + +#ifdef L_divsi3 + .align 4 + .global __divsi3 + .type __divsi3, @function +__divsi3: + leaf_entry sp, 16 +#if XCHAL_HAVE_DIV32 + quos a2, a2, a3 +#else + xor a7, a2, a3 /* sign = dividend ^ divisor */ + do_abs a6, a2, a4 /* udividend = abs (dividend) */ + do_abs a3, a3, a4 /* udivisor = abs (divisor) */ + bltui a3, 2, .Lle_one /* check if udivisor <= 1 */ + do_nsau a5, a6, a2, a8 /* udividend_shift = nsau (udividend) */ + do_nsau a4, a3, a2, a8 /* udivisor_shift = nsau (udivisor) */ + bgeu a5, a4, .Lspecial + + sub a4, a4, a5 /* count = udivisor_shift - udividend_shift */ + ssl a4 + sll a3, a3 /* udivisor <<= count */ + movi a2, 0 /* quotient = 0 */ + + /* test-subtract-and-shift loop; one quotient bit on each iteration */ +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lloopend +#endif /* XCHAL_HAVE_LOOPS */ +.Lloop: + bltu a6, a3, .Lzerobit + sub a6, a6, a3 + addi a2, a2, 1 +.Lzerobit: + slli a2, a2, 1 + srli a3, a3, 1 +#if !XCHAL_HAVE_LOOPS + addi a4, a4, -1 + bnez a4, .Lloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lloopend: + + bltu a6, a3, .Lreturn + addi a2, a2, 1 /* increment if udividend >= udivisor */ +.Lreturn: + neg a5, a2 + movltz a2, a5, a7 /* return (sign < 0) ? -quotient : quotient */ + leaf_return + +.Lle_one: + beqz a3, .Lerror + neg a2, a6 /* if udivisor == 1, then return... */ + movgez a2, a6, a7 /* (sign < 0) ? 
+
+
+#ifdef L_divsi3
+	.align	4
+	.global	__divsi3
+	.type	__divsi3, @function
+__divsi3:
+	leaf_entry sp, 16
+#if XCHAL_HAVE_DIV32
+	quos	a2, a2, a3
+#else
+	xor	a7, a2, a3	/* sign = dividend ^ divisor */
+	do_abs	a6, a2, a4	/* udividend = abs (dividend) */
+	do_abs	a3, a3, a4	/* udivisor = abs (divisor) */
+	bltui	a3, 2, .Lle_one	/* check if udivisor <= 1 */
+	do_nsau	a5, a6, a2, a8	/* udividend_shift = nsau (udividend) */
+	do_nsau	a4, a3, a2, a8	/* udivisor_shift = nsau (udivisor) */
+	bgeu	a5, a4, .Lspecial
+
+	sub	a4, a4, a5	/* count = udivisor_shift - udividend_shift */
+	ssl	a4
+	sll	a3, a3		/* udivisor <<= count */
+	movi	a2, 0		/* quotient = 0 */
+
+	/* test-subtract-and-shift loop; one quotient bit on each iteration */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, .Lloopend
+#endif /* XCHAL_HAVE_LOOPS */
+.Lloop:
+	bltu	a6, a3, .Lzerobit
+	sub	a6, a6, a3
+	addi	a2, a2, 1
+.Lzerobit:
+	slli	a2, a2, 1
+	srli	a3, a3, 1
+#if !XCHAL_HAVE_LOOPS
+	addi	a4, a4, -1
+	bnez	a4, .Lloop
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lloopend:
+
+	bltu	a6, a3, .Lreturn
+	addi	a2, a2, 1	/* increment if udividend >= udivisor */
+.Lreturn:
+	neg	a5, a2
+	movltz	a2, a5, a7	/* return (sign < 0) ? -quotient : quotient */
+	leaf_return
+
+.Lle_one:
+	beqz	a3, .Lerror
+	neg	a2, a6		/* if udivisor == 1, then return... */
+	movgez	a2, a6, a7	/* (sign < 0) ? -udividend : udividend */
+	leaf_return
+
+.Lspecial:
+	bltu	a6, a3, .Lreturn0 /* if dividend < divisor, return 0 */
+	movi	a2, 1
+	movi	a4, -1
+	movltz	a2, a4, a7	/* else return (sign < 0) ? -1 : 1 */
+	leaf_return
+
+.Lerror:
+	/* Divide by zero: Use an illegal instruction to force an exception.
+	   The subsequent "DIV0" string can be recognized by the exception
+	   handler to identify the real cause of the exception.  */
+	ill
+	.ascii	"DIV0"
+
+.Lreturn0:
+	movi	a2, 0
+#endif /* XCHAL_HAVE_DIV32 */
+	leaf_return
+	.size	__divsi3, . - __divsi3
+
+#endif /* L_divsi3 */
+
+
+#ifdef L_umodsi3
+	.align	4
+	.global	__umodsi3
+	.type	__umodsi3, @function
+__umodsi3:
+	leaf_entry sp, 16
+#if XCHAL_HAVE_DIV32
+	remu	a2, a2, a3
+#else
+	bltui	a3, 2, .Lle_one	/* check if the divisor is <= 1 */
+
+	do_nsau	a5, a2, a6, a7	/* dividend_shift = nsau (dividend) */
+	do_nsau	a4, a3, a6, a7	/* divisor_shift = nsau (divisor) */
+	bgeu	a5, a4, .Lspecial
+
+	sub	a4, a4, a5	/* count = divisor_shift - dividend_shift */
+	ssl	a4
+	sll	a3, a3		/* divisor <<= count */
+
+	/* test-subtract-and-shift loop */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, .Lloopend
+#endif /* XCHAL_HAVE_LOOPS */
+.Lloop:
+	bltu	a2, a3, .Lzerobit
+	sub	a2, a2, a3
+.Lzerobit:
+	srli	a3, a3, 1
+#if !XCHAL_HAVE_LOOPS
+	addi	a4, a4, -1
+	bnez	a4, .Lloop
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lloopend:
+
+.Lspecial:
+	bltu	a2, a3, .Lreturn
+	sub	a2, a2, a3	/* subtract once more if dividend >= divisor */
+.Lreturn:
+	leaf_return
+
+.Lle_one:
+	bnez	a3, .Lreturn0
+
+	/* Divide by zero: Use an illegal instruction to force an exception.
+	   The subsequent "DIV0" string can be recognized by the exception
+	   handler to identify the real cause of the exception.  */
+	ill
+	.ascii	"DIV0"
+
+.Lreturn0:
+	movi	a2, 0
+#endif /* XCHAL_HAVE_DIV32 */
+	leaf_return
+	.size	__umodsi3, . - __umodsi3
+
+#endif /* L_umodsi3 */
+
+
+#ifdef L_modsi3
+	.align	4
+	.global	__modsi3
+	.type	__modsi3, @function
+__modsi3:
+	leaf_entry sp, 16
+#if XCHAL_HAVE_DIV32
+	rems	a2, a2, a3
+#else
+	mov	a7, a2		/* save original (signed) dividend */
+	do_abs	a2, a2, a4	/* udividend = abs (dividend) */
+	do_abs	a3, a3, a4	/* udivisor = abs (divisor) */
+	bltui	a3, 2, .Lle_one	/* check if udivisor <= 1 */
+	do_nsau	a5, a2, a6, a8	/* udividend_shift = nsau (udividend) */
+	do_nsau	a4, a3, a6, a8	/* udivisor_shift = nsau (udivisor) */
+	bgeu	a5, a4, .Lspecial
+
+	sub	a4, a4, a5	/* count = udivisor_shift - udividend_shift */
+	ssl	a4
+	sll	a3, a3		/* udivisor <<= count */
+
+	/* test-subtract-and-shift loop */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, .Lloopend
+#endif /* XCHAL_HAVE_LOOPS */
+.Lloop:
+	bltu	a2, a3, .Lzerobit
+	sub	a2, a2, a3
+.Lzerobit:
+	srli	a3, a3, 1
+#if !XCHAL_HAVE_LOOPS
+	addi	a4, a4, -1
+	bnez	a4, .Lloop
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lloopend:
+
+.Lspecial:
+	bltu	a2, a3, .Lreturn
+	sub	a2, a2, a3	/* subtract again if udividend >= udivisor */
+.Lreturn:
+	bgez	a7, .Lpositive
+	neg	a2, a2		/* if (dividend < 0), return -udividend */
+.Lpositive:
+	leaf_return
+
+.Lle_one:
+	bnez	a3, .Lreturn0
+
+	/* Divide by zero: Use an illegal instruction to force an exception.
+	   The subsequent "DIV0" string can be recognized by the exception
+	   handler to identify the real cause of the exception.  */
+	ill
+	.ascii	"DIV0"
+
+.Lreturn0:
+	movi	a2, 0
+#endif /* XCHAL_HAVE_DIV32 */
+	leaf_return
+	.size	__modsi3, . - __modsi3
+
+#endif /* L_modsi3 */
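Both signed routines above follow the same pattern: take magnitudes with do_abs, run the same shift-and-subtract loop as the unsigned versions, then patch the sign back in. The quotient is negated when the operand signs differ (the xor/movltz pair in __divsi3), while the remainder takes the sign of the dividend (the bgez/neg pair in __modsi3). A minimal C sketch of just that sign handling, assuming two's-complement 32-bit int; sdiv32/smod32 are illustrative names, the `/` and `%` operators stand in for the inlined unsigned loops, and divisor == 0 traps in the assembly:

    int sdiv32 (int dividend, int divisor)
    {
      /* Magnitudes, computed in unsigned arithmetic so that INT_MIN is
         handled the same way as do_abs.  */
      unsigned int ud = dividend < 0 ? - (unsigned int) dividend : (unsigned int) dividend;
      unsigned int uv = divisor  < 0 ? - (unsigned int) divisor  : (unsigned int) divisor;
      unsigned int q = ud / uv;     /* stands in for the shift-and-subtract loop */
      /* The quotient is negative exactly when the operand signs differ.  */
      return ((dividend ^ divisor) < 0) ? - (int) q : (int) q;
    }

    int smod32 (int dividend, int divisor)
    {
      unsigned int ud = dividend < 0 ? - (unsigned int) dividend : (unsigned int) dividend;
      unsigned int uv = divisor  < 0 ? - (unsigned int) divisor  : (unsigned int) divisor;
      unsigned int r = ud % uv;     /* stands in for the remainder loop */
      /* The remainder takes the sign of the dividend.  */
      return dividend < 0 ? - (int) r : (int) r;
    }

For example, sdiv32 (-7, 2) yields -3 and smod32 (-7, 2) yields -1, the truncating semantics C requires of these libgcc helpers.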
+
+
+#ifdef __XTENSA_EB__
+#define uh a2
+#define ul a3
+#else
+#define uh a3
+#define ul a2
+#endif /* __XTENSA_EB__ */
+
+
+#ifdef L_ashldi3
+	.align	4
+	.global	__ashldi3
+	.type	__ashldi3, @function
+__ashldi3:
+	leaf_entry sp, 16
+	ssl	a4
+	bgei	a4, 32, .Llow_only
+	src	uh, uh, ul
+	sll	ul, ul
+	leaf_return
+
+.Llow_only:
+	sll	uh, ul
+	movi	ul, 0
+	leaf_return
+	.size	__ashldi3, . - __ashldi3
+
+#endif /* L_ashldi3 */
+
+
+#ifdef L_ashrdi3
+	.align	4
+	.global	__ashrdi3
+	.type	__ashrdi3, @function
+__ashrdi3:
+	leaf_entry sp, 16
+	ssr	a4
+	bgei	a4, 32, .Lhigh_only
+	src	ul, uh, ul
+	sra	uh, uh
+	leaf_return
+
+.Lhigh_only:
+	sra	ul, uh
+	srai	uh, uh, 31
+	leaf_return
+	.size	__ashrdi3, . - __ashrdi3
+
+#endif /* L_ashrdi3 */
+
+
+#ifdef L_lshrdi3
+	.align	4
+	.global	__lshrdi3
+	.type	__lshrdi3, @function
+__lshrdi3:
+	leaf_entry sp, 16
+	ssr	a4
+	bgei	a4, 32, .Lhigh_only1
+	src	ul, uh, ul
+	srl	uh, uh
+	leaf_return
+
+.Lhigh_only1:
+	srl	ul, uh
+	movi	uh, 0
+	leaf_return
+	.size	__lshrdi3, . - __lshrdi3
+
+#endif /* L_lshrdi3 */
+
+
+#include "ieee754-df.S"
+#include "ieee754-sf.S"
diff --git a/libgcc/config/xtensa/t-xtensa b/libgcc/config/xtensa/t-xtensa
index 7d9e9db..5bcc094 100644
--- a/libgcc/config/xtensa/t-xtensa
+++ b/libgcc/config/xtensa/t-xtensa
@@ -1,2 +1,14 @@
+LIB1ASMSRC = xtensa/lib1funcs.S
+LIB1ASMFUNCS = _mulsi3 _divsi3 _modsi3 _udivsi3 _umodsi3 \
+	_umulsidi3 _clz _clzsi2 _ctzsi2 _ffssi2 \
+	_ashldi3 _ashrdi3 _lshrdi3 \
+	_negsf2 _addsubsf3 _mulsf3 _divsf3 _cmpsf2 _fixsfsi _fixsfdi \
+	_fixunssfsi _fixunssfdi _floatsisf _floatunsisf \
+	_floatdisf _floatundisf \
+	_negdf2 _addsubdf3 _muldf3 _divdf3 _cmpdf2 _fixdfsi _fixdfdi \
+	_fixunsdfsi _fixunsdfdi _floatsidf _floatunsidf \
+	_floatdidf _floatundidf \
+	_truncdfsf2 _extendsfdf2
+
 LIB2ADDEH = $(srcdir)/config/xtensa/unwind-dw2-xtensa.c \
   $(srcdir)/unwind-dw2-fde.c $(srcdir)/unwind-sjlj.c $(srcdir)/unwind-c.c
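The new t-xtensa fragment works the usual libgcc1 way: lib1funcs.S is assembled once per name in LIB1ASMFUNCS with the corresponding L_<name> macro defined, which is why each routine above sits in its own #ifdef block. The double-word shift helpers (__ashldi3 and friends) treat the 64-bit value as a high/low pair of 32-bit words whose register assignment depends on __XTENSA_EB__. A rough C picture of the __ashldi3 case, assuming a 32-bit unsigned int and a shift count below 64 (shl64 is an illustrative name, not part of the patch):

    unsigned long long shl64 (unsigned long long x, unsigned int n)
    {
      unsigned int ul = (unsigned int) x;           /* low word  */
      unsigned int uh = (unsigned int) (x >> 32);   /* high word */

      if (n < 32)
        {
          /* Funnel shift, as done by ssl/src: the high result takes bits
             from both words.  Guard n == 0 to avoid a 32-bit shift in C.  */
          uh = n ? (uh << n) | (ul >> (32 - n)) : uh;
          ul <<= n;
        }
      else
        {
          /* Shift of 32 or more: only the low word contributes
             (the .Llow_only path).  */
          uh = ul << (n - 32);
          ul = 0;
        }
      return ((unsigned long long) uh << 32) | ul;
    }

The arithmetic and logical right shifts follow the same split, differing only in whether the vacated high word is filled with copies of the sign bit (srai uh, uh, 31) or with zero.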