aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJ"orn Rennecke <joern.rennecke@st.com>2006-03-23 21:39:32 +0000
committerJoern Rennecke <amylaar@gcc.gnu.org>2006-03-23 21:39:32 +0000
commitb368d6b8dff358b7f6b197ba4c95477a41645c23 (patch)
tree6eb8d5d19c35a219b954adb42735d22b79b838cd
parenta57aee2ab63245bf7678a0cb18997ceb78da8ffc (diff)
downloadgcc-b368d6b8dff358b7f6b197ba4c95477a41645c23.zip
gcc-b368d6b8dff358b7f6b197ba4c95477a41645c23.tar.gz
gcc-b368d6b8dff358b7f6b197ba4c95477a41645c23.tar.bz2
divtab-sh4.c, [...]: New files.
2006-03-23 J"orn Rennecke <joern.rennecke@st.com> * config/sh/divtab-sh4.c, config/sh/divcost-analysis: New files. * config/sh/lib1funcs.asm (div_table): Add !__SH5__ variant. * config/sh/t-sh (LIB1ASMFUNCS): Add _div_table. * config/sh/sh.opt (mdiv=): Amend description. * config/sh/sh.h (TARGET_DIVIDE_CALL_DIV1): New macro. (TARGET_DIVIDE_CALL_FP, TARGET_DIVIDE_CALL_TABLE): Likewise. (sh_divide_strategy_e): Add new members SH_DIV_CALL_DIV1, SH_DIV_CALL_FP, SH_DIV_CALL_TABLE and SH_DIV_INTRINSIC. (OVERRIDE_OPTIONS): Also process sh_div_str for TARGET_SH1. Calculate sh_divsi3_libfunc using TARGET_DIVIDE_* macros. * config/sh/sh.md (udivsi3_i4_int, divsi3_i4_int): New patterns. (udivsi3, divsi3): Use them. Check TARGET_DIVIDE_CALL_TABLE / TARGET_DIVIDE_CALL_FP. From-SVN: r112331
-rw-r--r--gcc/ChangeLog16
-rw-r--r--gcc/config/sh/divcost-analysis76
-rw-r--r--gcc/config/sh/divtab-sh4.c90
-rw-r--r--gcc/config/sh/lib1funcs.asm633
-rw-r--r--gcc/config/sh/sh.h60
-rw-r--r--gcc/config/sh/sh.md39
-rw-r--r--gcc/config/sh/sh.opt4
-rw-r--r--gcc/config/sh/t-sh1
8 files changed, 902 insertions, 17 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 5cb8291..9079462 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,19 @@
+2006-03-23 J"orn Rennecke <joern.rennecke@st.com>
+
+ * config/sh/divtab-sh4.c, config/sh/divcost-analysis: New files.
+ * config/sh/lib1funcs.asm (div_table): Add !__SH5__ variant.
+ * config/sh/t-sh (LIB1ASMFUNCS): Add _div_table.
+ * config/sh/sh.opt (mdiv=): Amend description.
+ * config/sh/sh.h (TARGET_DIVIDE_CALL_DIV1): New macro.
+ (TARGET_DIVIDE_CALL_FP, TARGET_DIVIDE_CALL_TABLE): Likewise.
+ (sh_divide_strategy_e): Add new members SH_DIV_CALL_DIV1,
+ SH_DIV_CALL_FP, SH_DIV_CALL_TABLE and SH_DIV_INTRINSIC.
+ (OVERRIDE_OPTIONS): Also process sh_div_str for TARGET_SH1.
+ Calculate sh_divsi3_libfunc using TARGET_DIVIDE_* macros.
+ * config/sh/sh.md (udivsi3_i4_int, divsi3_i4_int): New patterns.
+ (udivsi3, divsi3): Use them. Check TARGET_DIVIDE_CALL_TABLE /
+ TARGET_DIVIDE_CALL_FP.
+
2006-03-23 Maxim Kuvyrkov <mkuvyrkov@ispras.ru>
* haifa-sched.c (choose_ready): Fix type of the local variable.
diff --git a/gcc/config/sh/divcost-analysis b/gcc/config/sh/divcost-analysis
new file mode 100644
index 0000000..541e313
--- /dev/null
+++ b/gcc/config/sh/divcost-analysis
@@ -0,0 +1,76 @@
+Analysis of cycle costs for SH4:
+
+-> udiv_le128: 5
+-> udiv_ge64k: 6
+-> udiv udiv_25: 10
+-> pos_divisor: 3
+-> pos_result linear: 5
+-> pos_result - -: 5
+-> div_le128: 7
+-> div_ge64k: 9
+sdivsi3 -> udiv_25 13
+udiv_25 -> div_ge64k_end: 15
+div_ge64k_end -> rts: 13
+div_le128 -> div_le128_2: 2, r1 latency 3
+udiv_le128 -> div_le128_2: 2, r1 latency 3
+(u)div_le128 -> div_by_1: 9
+(u)div_le128 -> rts: 17
+div_by_1(_neg) -> rts: 4
+div_ge64k -> div_r8: 2
+div_ge64k -> div_ge64k_2: 3
+udiv_ge64k -> udiv_r8: 3
+udiv_ge64k -> div_ge64k_2: 3 + LS
+(u)div_ge64k -> div_ge64k_end: 13
+div_r8 -> div_r8_2: 2
+udiv_r8 -> div_r8_2: 2 + LS
+(u)div_r8 -> rts: 21
+
+-> - + neg_result: 5
+-> + - neg_result: 5
+-> div_le128_neg: 7
+-> div_ge64k_neg: 9
+-> div_r8_neg: 11
+-> <64k div_ge64k_neg_end: 28
+-> >=64k div_ge64k_neg_end: 22
+div_ge64k_neg_end ft -> rts: 14
+div_r8_neg_end -> rts: 4
+div_r8_neg -> div_r8_neg_end: 18
+div_le128_neg -> div_by_1_neg: 4
+div_le128_neg -> rts 18
+
+ absolute divisor range:
+ 1 [2..128] [129..64K) [64K..|dividend|/256] >=64K,>|dividend/256|
+udiv 18 22 38 32 30
+sdiv pos: 20 24 41 35 32
+sdiv neg: 15 25 42 36 33
+
+
+fp-based:
+
+unsigned: 42 + 3 + 3 (lingering ftrc latency + sts fpul,rx) at caller's site
+signed: 33 + 3 + 3 (lingering ftrc latency + sts fpul,rx) at caller's site
+
+call-div1: divisor range:
+ [1..64K) >= 64K
+unsigned: 63 58
+signed: 76 76
+
+SFUNC_STATIC call overhead:
+mov.l 0f,r1
+bsrf r1
+
+SFUNC_GOT call overhead - current:
+mov.l 0f,r1
+mova 0f,r0
+mov.l 1f,r2
+add r1,r0
+mov.l @(r0,r2),r0
+jmp @r0
+; 3 cycles worse than SFUNC_STATIC
+
+SFUNC_GOT call overhead - improved assembler:
+mov.l 0f,r1
+mova 0f,r0
+mov.l @(r0,r1),r0
+jmp @r0
+; 2 cycles worse than SFUNC_STATIC
diff --git a/gcc/config/sh/divtab-sh4.c b/gcc/config/sh/divtab-sh4.c
new file mode 100644
index 0000000..e7de6c4
--- /dev/null
+++ b/gcc/config/sh/divtab-sh4.c
@@ -0,0 +1,90 @@
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file. (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING. If not, write to
+the Free Software Foundation, 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA. */
+
+/* Calculate division table for SH2..4 integer division
+ Contributed by Joern Rennecke
+ joern.rennecke@superh.com */
+
+#include <stdio.h>
+#include <math.h>
+
+/* Print the SH4 division lookup tables (div_table_clz, div_table_ix and
+   the normalized inverses) as assembler source on stdout.  Returns 0.  */
+int
+main (void)
+{
+  int i, j;
+  double q, r, err, max_err = 0, max_s_err = 0;
+
+  puts ("/* This table has been generated by divtab-sh4.c. */");
+  puts ("\t.balign 4");
+  puts ("LOCAL(div_table_clz):");
+  /* Output some dummy number for 1/0.  */
+  printf ("\t.byte\t%d\n", 0);
+  for (i = 1; i <= 128; i++)
+    {
+      int n = 0;  /* Counts the values i, 2i, 4i, ... that are <= 128.  */
+      if (i == 128)
+        puts ("\
+/* Lookup table translating positive divisor to index into table of\n\
+ normalized inverse. N.B. the '0' entry is also the last entry of the\n\
+ previous table, and causes an unaligned access for division by zero. */\n\
+LOCAL(div_table_ix):");
+      for (j = i; j <= 128; j += j)
+        n++;
+      printf ("\t.byte\t%d\n", n - 7);
+    }
+  for (i = 1; i <= 128; i++)
+    {
+      for (j = i; j < 128; j += j)  /* Double I until it is in [128, 254].  */
+        ;
+      printf ("\t.byte\t%d\n", j * 2 - 96*4);
+    }
+  puts ("\
+/* 1/64 .. 1/127, normalized. There is an implicit leading 1 in bit 32. */\n\
+ .balign 4\n\
+LOCAL(zero_l):");
+  for (i = 64; i < 128; i++)
+    {
+      if (i == 96)
+        puts ("LOCAL(div_table_inv):");  /* Label that lib1funcs.asm loads.  */
+      q = 4.*(1<<30)*128/i;
+      r = ceil (q);
+      /* Every R lies in (2^32, 2^33]; only its low 32 bits are emitted.
+         Reduce with fmod first: casting an out-of-range double to unsigned
+         is undefined.  For 64, R is 2^33 (implicit part %10, not %01), but
+         the entry is 0 either way, so it effectively is a 1/128 entry.  */
+      printf ("\t.long\t0x%X\n", (unsigned) fmod (r, 4294967296.0));
+      err = r - q;
+      if (err > max_err)
+        max_err = err;
+      err = err * i / 128;
+      if (err > max_s_err)
+        max_s_err = err;
+    }
+  printf ("\t/* maximum error: %f scaled: %f*/\n", max_err, max_s_err);
+  return 0;
+}
diff --git a/gcc/config/sh/lib1funcs.asm b/gcc/config/sh/lib1funcs.asm
index 5333450..7dfe73e 100644
--- a/gcc/config/sh/lib1funcs.asm
+++ b/gcc/config/sh/lib1funcs.asm
@@ -1,5 +1,5 @@
/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- 2004, 2005
+ 2004, 2005, 2006
Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it
@@ -3019,8 +3019,8 @@ GLOBAL(GCC_pop_shmedia_regs_nofpu):
#endif /* __SH5__ == 32 */
#endif /* L_push_pop_shmedia_regs */
-#if __SH5__
#ifdef L_div_table
+#if __SH5__
#if defined(__pic__) && defined(__SHMEDIA__)
.global GLOBAL(sdivsi3)
FUNC(GLOBAL(sdivsi3))
@@ -3247,5 +3247,632 @@ GLOBAL(div_table):
.word 17738
.word 17136
.word 16639
+
+#elif defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__)
+/* This code uses shld, and thus is not suitable for SH1 / SH2. */
+
+/* Signed / unsigned division without use of FPU, optimized for SH4.
+ Uses a lookup table for divisors in the range -128 .. +128, and
+ div1 with case distinction for larger divisors in three more ranges.
+ The code is lumped together with the table to allow the use of mova. */
+#ifdef __LITTLE_ENDIAN__
+#define L_LSB 0
+#define L_LSWMSB 1
+#define L_MSWLSB 2
+#else
+#define L_LSB 3
+#define L_LSWMSB 2
+#define L_MSWLSB 1
+#endif
+
+ .balign 4
+ .global GLOBAL(udivsi3_i4i)
+ FUNC(GLOBAL(udivsi3_i4i))
+GLOBAL(udivsi3_i4i):  /* Unsigned divide: r4 = dividend, r5 = divisor, quotient left in r0. */
+ mov.w LOCAL(c128_w), r1  /* r1 = 128, the upper bound of the table-lookup range */
+ div0u
+ mov r4,r0
+ shlr8 r0  /* r0 = dividend >> 8 */
+ cmp/hi r1,r5  /* T = divisor > 128 (unsigned) */
+ extu.w r5,r1  /* r1 = low 16 bits of divisor */
+ bf LOCAL(udiv_le128)  /* divisor <= 128: use the inverse table */
+ cmp/eq r5,r1  /* T = divisor fits in 16 bits */
+ bf LOCAL(udiv_ge64k)  /* divisor >= 64K */
+ shlr r0
+ mov r5,r1  /* keep the unshifted divisor for the save below */
+ shll16 r5
+ mov.l r4,@-r15  /* save dividend; restored before rts */
+ div1 r5,r0
+ mov.l r1,@-r15  /* save original divisor; restored before rts */
+ div1 r5,r0
+ div1 r5,r0
+ bra LOCAL(udiv_25)  /* continue in the div1 sequence shared with sdivsi3_i4i */
+ div1 r5,r0  /* delay slot */
+
+LOCAL(div_le128):  /* signed entry; sdivsi3_i4i has already pushed r4 and r5 */
+ mova LOCAL(div_table_ix),r0
+ bra LOCAL(div_le128_2)
+ mov.b @(r0,r5),r1  /* delay slot: r1 = div_table_ix[divisor] */
+LOCAL(udiv_le128):  /* unsigned entry: push r4 and r5 around the index lookup */
+ mov.l r4,@-r15
+ mova LOCAL(div_table_ix),r0
+ mov.b @(r0,r5),r1  /* r1 = div_table_ix[divisor] */
+ mov.l r5,@-r15
+LOCAL(div_le128_2):
+ mova LOCAL(div_table_inv),r0
+ mov.l @(r0,r1),r1  /* r1 = normalized inverse selected by the index */
+ mov r5,r0
+ tst #0xfe,r0  /* T = (divisor & 0xfe) == 0, i.e. divisor is 0 or 1 */
+ mova LOCAL(div_table_clz),r0
+ dmulu.l r1,r4  /* MACH:MACL = inverse * dividend (unsigned 32x32 -> 64) */
+ mov.b @(r0,r5),r1  /* r1 = shift count from div_table_clz[divisor] */
+ bt/s LOCAL(div_by_1)
+ mov r4,r0  /* delay slot: r0 = dividend, the result for the div_by_1 path */
+ mov.l @r15+,r5  /* restore r5 */
+ sts mach,r0  /* r0 = high word of the product */
+ /* clrt */
+ addc r4,r0  /* add the dividend; presumably accounts for the implicit leading 1 of the inverse */
+ mov.l @r15+,r4  /* restore r4 */
+ rotcr r0  /* shift right by one, bringing the addc carry into bit 31 */
+ rts
+ shld r1,r0  /* delay slot: final shift by the table-supplied count */
+
+LOCAL(div_by_1_neg):  /* reached from div_le128_neg: negate, then share the restore/return below */
+ neg r4,r0
+LOCAL(div_by_1):  /* r0 already holds the result; pop saved registers and return */
+ mov.l @r15+,r5
+ rts
+ mov.l @r15+,r4  /* delay slot */
+
+LOCAL(div_ge64k):  /* signed entry; T = divisor > (dividend >> 8), set by the caller's delay slot */
+ bt/s LOCAL(div_r8)
+ div0u  /* delay slot */
+ shll8 r5
+ bra LOCAL(div_ge64k_2)
+ div1 r5,r0  /* delay slot */
+LOCAL(udiv_ge64k):  /* unsigned entry for divisor >= 64K */
+ cmp/hi r0,r5  /* T = divisor > (dividend >> 8) */
+ mov r5,r1  /* keep the unshifted divisor for the save below */
+ bt LOCAL(udiv_r8)
+ shll8 r5
+ mov.l r4,@-r15  /* save dividend */
+ div1 r5,r0
+ mov.l r1,@-r15  /* save original divisor */
+LOCAL(div_ge64k_2):
+ div1 r5,r0
+ mov.l LOCAL(zero_l),r1  /* r1 = 0 (zero_l is a zero word) */
+ .rept 4
+ div1 r5,r0
+ .endr
+ mov.l r1,@-r15  /* push a zeroed scratch word; popped at div_ge64k_end */
+ div1 r5,r0
+ mov.w LOCAL(m256_w),r1  /* r1 = sign-extended 0xff00 */
+ div1 r5,r0
+ mov.b r0,@(L_LSWMSB,r15)  /* store low byte of r0 into the scratch word */
+ xor r4,r0
+ and r1,r0
+ bra LOCAL(div_ge64k_end)
+ xor r4,r0  /* delay slot; the xor/and/xor keeps r0 where the mask is set and takes r4 elsewhere */
+
+LOCAL(div_r8):  /* signed entry; r4 and r5 were already pushed by sdivsi3_i4i */
+ shll16 r4
+ bra LOCAL(div_r8_2)
+ shll8 r4  /* delay slot: r4 has now been shifted left by 24 in total */
+LOCAL(udiv_r8):  /* unsigned entry: push r4 and r5 here */
+ mov.l r4,@-r15
+ shll16 r4
+ clrt
+ shll8 r4
+ mov.l r5,@-r15
+LOCAL(div_r8_2):
+ rotcl r4
+ mov r0,r1
+ div1 r5,r1
+ mov r4,r0
+ rotcl r0
+ mov r5,r4  /* r4 = copy of divisor, so r5 can be restored before the last div1 */
+ div1 r5,r1
+ .rept 5
+ rotcl r0; div1 r5,r1
+ .endr
+ rotcl r0
+ mov.l @r15+,r5  /* restore r5 */
+ div1 r4,r1  /* last step uses the divisor copy in r4 */
+ mov.l @r15+,r4  /* restore r4 */
+ rts
+ rotcl r0  /* delay slot: rotate the final T bit into r0 */
+
+ ENDFUNC(GLOBAL(udivsi3_i4i))
+
+ .global GLOBAL(sdivsi3_i4i)
+ FUNC(GLOBAL(sdivsi3_i4i))
+ /* This is link-compatible with a GLOBAL(sdivsi3) call,
+ but we effectively clobber only r1. */
+GLOBAL(sdivsi3_i4i):  /* Signed divide: r4 = dividend, r5 = divisor, quotient left in r0. */
+ mov.l r4,@-r15  /* save dividend; restored before every rts */
+ cmp/pz r5  /* T = divisor >= 0 */
+ mov.w LOCAL(c128_w), r1  /* r1 = 128 */
+ bt/s LOCAL(pos_divisor)
+ cmp/pz r4  /* delay slot: T = dividend >= 0 */
+ mov.l r5,@-r15  /* negative divisor: save it, then work with its magnitude */
+ neg r5,r5
+ bt/s LOCAL(neg_result)  /* dividend >= 0, divisor < 0: quotient is negative */
+ cmp/hi r1,r5  /* delay slot: T = |divisor| > 128 */
+ neg r4,r4  /* both operands negative: use |dividend|, quotient is positive */
+LOCAL(pos_result):  /* operands are magnitudes here; T = divisor > 128 */
+ extu.w r5,r0  /* r0 = low 16 bits of divisor */
+ bf LOCAL(div_le128)  /* divisor <= 128: table path */
+ cmp/eq r5,r0  /* T = divisor fits in 16 bits */
+ mov r4,r0
+ shlr8 r0  /* r0 = dividend >> 8 */
+ bf/s LOCAL(div_ge64k)  /* divisor >= 64K */
+ cmp/hi r0,r5  /* delay slot: T = divisor > (dividend >> 8) */
+ div0u
+ shll16 r5
+ div1 r5,r0
+ div1 r5,r0
+ div1 r5,r0
+LOCAL(udiv_25):  /* also entered from udivsi3_i4i */
+ mov.l LOCAL(zero_l),r1  /* r1 = 0 */
+ div1 r5,r0
+ div1 r5,r0
+ mov.l r1,@-r15  /* push a zeroed scratch word; popped at div_ge64k_end */
+ .rept 3
+ div1 r5,r0
+ .endr
+ mov.b r0,@(L_MSWLSB,r15)  /* store low byte of r0 into the scratch word */
+ xtrct r4,r0
+ swap.w r0,r0
+ .rept 8
+ div1 r5,r0
+ .endr
+ mov.b r0,@(L_LSWMSB,r15)  /* store the next byte into the scratch word */
+LOCAL(div_ge64k_end):  /* common tail: last 8 div1 steps, then merge the stored bytes */
+ .rept 8
+ div1 r5,r0
+ .endr
+ mov.l @r15+,r4 ! zero-extension and swap using LS unit.
+ extu.b r0,r0
+ mov.l @r15+,r5  /* restore r5 */
+ or r4,r0  /* combine the bytes held in the scratch word with the low byte */
+ mov.l @r15+,r4  /* restore r4 */
+ rts
+ rotcl r0  /* delay slot: rotate the final T bit into r0 */
+
+LOCAL(div_le128_neg):  /* table path, negative quotient; r0 = divisor on entry */
+ tst #0xfe,r0  /* T = divisor is 0 or 1 */
+ mova LOCAL(div_table_ix),r0
+ mov.b @(r0,r5),r1  /* r1 = div_table_ix[divisor] */
+ mova LOCAL(div_table_inv),r0
+ bt/s LOCAL(div_by_1_neg)
+ mov.l @(r0,r1),r1  /* delay slot: r1 = normalized inverse */
+ mova LOCAL(div_table_clz),r0
+ dmulu.l r1,r4  /* MACH:MACL = inverse * dividend */
+ mov.b @(r0,r5),r1  /* r1 = shift count from div_table_clz[divisor] */
+ mov.l @r15+,r5  /* restore r5 */
+ sts mach,r0
+ /* clrt */
+ addc r4,r0
+ mov.l @r15+,r4  /* restore r4 */
+ rotcr r0
+ shld r1,r0
+ rts
+ neg r0,r0  /* delay slot: negate the quotient */
+
+LOCAL(pos_divisor):  /* divisor >= 0; T = dividend >= 0 */
+ mov.l r5,@-r15  /* save divisor */
+ bt/s LOCAL(pos_result)
+ cmp/hi r1,r5  /* delay slot: T = divisor > 128 */
+ neg r4,r4  /* negative dividend: use its magnitude, quotient is negative */
+LOCAL(neg_result):  /* operands are magnitudes here; the quotient must be negated */
+ extu.w r5,r0
+ bf LOCAL(div_le128_neg)  /* divisor <= 128 */
+ cmp/eq r5,r0  /* T = divisor fits in 16 bits */
+ mov r4,r0
+ shlr8 r0  /* r0 = dividend >> 8 */
+ bf/s LOCAL(div_ge64k_neg)  /* divisor >= 64K */
+ cmp/hi r0,r5  /* delay slot: T = divisor > (dividend >> 8) */
+ div0u
+ mov.l LOCAL(zero_l),r1  /* r1 = 0 */
+ shll16 r5
+ div1 r5,r0
+ mov.l r1,@-r15  /* push a zeroed scratch word */
+ .rept 7
+ div1 r5,r0
+ .endr
+ mov.b r0,@(L_MSWLSB,r15)
+ xtrct r4,r0
+ swap.w r0,r0
+ .rept 8
+ div1 r5,r0
+ .endr
+ mov.b r0,@(L_LSWMSB,r15)
+LOCAL(div_ge64k_neg_end):  /* like div_ge64k_end, but builds the result in r1 and negates it */
+ .rept 8
+ div1 r5,r0
+ .endr
+ mov.l @r15+,r4 ! zero-extension and swap using LS unit.
+ extu.b r0,r1
+ mov.l @r15+,r5  /* restore r5 */
+ or r4,r1
+LOCAL(div_r8_neg_end):
+ mov.l @r15+,r4  /* restore r4 */
+ rotcl r1
+ rts
+ neg r1,r0  /* delay slot: r0 = negated quotient */
+
+LOCAL(div_ge64k_neg):  /* T = divisor > (dividend >> 8) on entry */
+ bt/s LOCAL(div_r8_neg)
+ div0u  /* delay slot */
+ shll8 r5
+ mov.l LOCAL(zero_l),r1  /* r1 = 0 */
+ .rept 6
+ div1 r5,r0
+ .endr
+ mov.l r1,@-r15  /* push a zeroed scratch word */
+ div1 r5,r0
+ mov.w LOCAL(m256_w),r1  /* r1 = sign-extended 0xff00 */
+ div1 r5,r0
+ mov.b r0,@(L_LSWMSB,r15)
+ xor r4,r0
+ and r1,r0
+ bra LOCAL(div_ge64k_neg_end)
+ xor r4,r0  /* delay slot */
+
+LOCAL(c128_w):  /* constant loaded with mov.w by both entry points */
+ .word 128
+
+LOCAL(div_r8_neg):
+ clrt
+ shll16 r4
+ mov r4,r1
+ shll8 r1
+ mov r5,r4  /* r4 = copy of divisor, so r5 can be restored before the last div1 */
+ .rept 7
+ rotcl r1; div1 r5,r0
+ .endr
+ mov.l @r15+,r5  /* restore r5 */
+ rotcl r1
+ bra LOCAL(div_r8_neg_end)
+ div1 r4,r0  /* delay slot: last step uses the divisor copy in r4 */
+
+LOCAL(m256_w):  /* byte mask loaded with mov.w in the >= 64K paths */
+ .word 0xff00
+/* This table has been generated by divtab-sh4.c. */
+ .balign 4
+LOCAL(div_table_clz):
+ .byte 0
+ .byte 1
+ .byte 0
+ .byte -1
+ .byte -1
+ .byte -2
+ .byte -2
+ .byte -2
+ .byte -2
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+/* Lookup table translating positive divisor to index into table of
+ normalized inverse. N.B. the '0' entry is also the last entry of the
+ previous table, and causes an unaligned access for division by zero. */
+LOCAL(div_table_ix):
+ .byte -6
+ .byte -128
+ .byte -128
+ .byte 0
+ .byte -128
+ .byte -64
+ .byte 0
+ .byte 64
+ .byte -128
+ .byte -96
+ .byte -64
+ .byte -32
+ .byte 0
+ .byte 32
+ .byte 64
+ .byte 96
+ .byte -128
+ .byte -112
+ .byte -96
+ .byte -80
+ .byte -64
+ .byte -48
+ .byte -32
+ .byte -16
+ .byte 0
+ .byte 16
+ .byte 32
+ .byte 48
+ .byte 64
+ .byte 80
+ .byte 96
+ .byte 112
+ .byte -128
+ .byte -120
+ .byte -112
+ .byte -104
+ .byte -96
+ .byte -88
+ .byte -80
+ .byte -72
+ .byte -64
+ .byte -56
+ .byte -48
+ .byte -40
+ .byte -32
+ .byte -24
+ .byte -16
+ .byte -8
+ .byte 0
+ .byte 8
+ .byte 16
+ .byte 24
+ .byte 32
+ .byte 40
+ .byte 48
+ .byte 56
+ .byte 64
+ .byte 72
+ .byte 80
+ .byte 88
+ .byte 96
+ .byte 104
+ .byte 112
+ .byte 120
+ .byte -128
+ .byte -124
+ .byte -120
+ .byte -116
+ .byte -112
+ .byte -108
+ .byte -104
+ .byte -100
+ .byte -96
+ .byte -92
+ .byte -88
+ .byte -84
+ .byte -80
+ .byte -76
+ .byte -72
+ .byte -68
+ .byte -64
+ .byte -60
+ .byte -56
+ .byte -52
+ .byte -48
+ .byte -44
+ .byte -40
+ .byte -36
+ .byte -32
+ .byte -28
+ .byte -24
+ .byte -20
+ .byte -16
+ .byte -12
+ .byte -8
+ .byte -4
+ .byte 0
+ .byte 4
+ .byte 8
+ .byte 12
+ .byte 16
+ .byte 20
+ .byte 24
+ .byte 28
+ .byte 32
+ .byte 36
+ .byte 40
+ .byte 44
+ .byte 48
+ .byte 52
+ .byte 56
+ .byte 60
+ .byte 64
+ .byte 68
+ .byte 72
+ .byte 76
+ .byte 80
+ .byte 84
+ .byte 88
+ .byte 92
+ .byte 96
+ .byte 100
+ .byte 104
+ .byte 108
+ .byte 112
+ .byte 116
+ .byte 120
+ .byte 124
+ .byte -128
+/* 1/64 .. 1/127, normalized. There is an implicit leading 1 in bit 32. */
+ .balign 4
+LOCAL(zero_l):
+ .long 0x0
+ .long 0xF81F81F9
+ .long 0xF07C1F08
+ .long 0xE9131AC0
+ .long 0xE1E1E1E2
+ .long 0xDAE6076C
+ .long 0xD41D41D5
+ .long 0xCD856891
+ .long 0xC71C71C8
+ .long 0xC0E07039
+ .long 0xBACF914D
+ .long 0xB4E81B4F
+ .long 0xAF286BCB
+ .long 0xA98EF607
+ .long 0xA41A41A5
+ .long 0x9EC8E952
+ .long 0x9999999A
+ .long 0x948B0FCE
+ .long 0x8F9C18FA
+ .long 0x8ACB90F7
+ .long 0x86186187
+ .long 0x81818182
+ .long 0x7D05F418
+ .long 0x78A4C818
+ .long 0x745D1746
+ .long 0x702E05C1
+ .long 0x6C16C16D
+ .long 0x68168169
+ .long 0x642C8591
+ .long 0x60581606
+ .long 0x5C9882BA
+ .long 0x58ED2309
+LOCAL(div_table_inv):
+ .long 0x55555556
+ .long 0x51D07EAF
+ .long 0x4E5E0A73
+ .long 0x4AFD6A06
+ .long 0x47AE147B
+ .long 0x446F8657
+ .long 0x41414142
+ .long 0x3E22CBCF
+ .long 0x3B13B13C
+ .long 0x38138139
+ .long 0x3521CFB3
+ .long 0x323E34A3
+ .long 0x2F684BDB
+ .long 0x2C9FB4D9
+ .long 0x29E4129F
+ .long 0x27350B89
+ .long 0x24924925
+ .long 0x21FB7813
+ .long 0x1F7047DD
+ .long 0x1CF06ADB
+ .long 0x1A7B9612
+ .long 0x18118119
+ .long 0x15B1E5F8
+ .long 0x135C8114
+ .long 0x11111112
+ .long 0xECF56BF
+ .long 0xC9714FC
+ .long 0xA6810A7
+ .long 0x8421085
+ .long 0x624DD30
+ .long 0x4104105
+ .long 0x2040811
+ /* maximum error: 0.987342 scaled: 0.921875*/
+
+ ENDFUNC(GLOBAL(sdivsi3_i4i))
+#endif /* SH3 / SH4 */
+
#endif /* L_div_table */
-#endif /* __SH5__ */
diff --git a/gcc/config/sh/sh.h b/gcc/config/sh/sh.h
index 099c938..1b7ff5d 100644
--- a/gcc/config/sh/sh.h
+++ b/gcc/config/sh/sh.h
@@ -234,6 +234,9 @@ do { \
#define TARGET_DIVIDE_INV20L (sh_div_strategy == SH_DIV_INV20L)
#define TARGET_DIVIDE_INV_CALL (sh_div_strategy == SH_DIV_INV_CALL)
#define TARGET_DIVIDE_INV_CALL2 (sh_div_strategy == SH_DIV_INV_CALL2)
+#define TARGET_DIVIDE_CALL_DIV1 (sh_div_strategy == SH_DIV_CALL_DIV1)
+#define TARGET_DIVIDE_CALL_FP (sh_div_strategy == SH_DIV_CALL_FP)
+#define TARGET_DIVIDE_CALL_TABLE (sh_div_strategy == SH_DIV_CALL_TABLE)
#define SELECT_SH1 (MASK_SH1)
#define SELECT_SH2 (MASK_SH2 | SELECT_SH1)
@@ -467,7 +470,7 @@ do { \
sh_div_str = SH_DIV_STR_FOR_SIZE ; \
} \
/* We can't meaningfully test TARGET_SHMEDIA here, because -m options \
- haven't been parsed yet, hence we';d read only the default. \
+ haven't been parsed yet, hence we'd read only the default. \
sh_target_reg_class will return NO_REGS if this is not SHMEDIA, so \
it's OK to always set flag_branch_target_load_optimize. */ \
if (LEVEL > 1) \
@@ -492,16 +495,24 @@ do { \
extern int assembler_dialect;
enum sh_divide_strategy_e {
+ /* SH5 strategies. */
SH_DIV_CALL,
SH_DIV_CALL2,
- SH_DIV_FP,
+ SH_DIV_FP, /* We could do this also for SH4. */
SH_DIV_INV,
SH_DIV_INV_MINLAT,
SH_DIV_INV20U,
SH_DIV_INV20L,
SH_DIV_INV_CALL,
SH_DIV_INV_CALL2,
- SH_DIV_INV_FP
+ SH_DIV_INV_FP,
+ /* SH1 .. SH4 strategies. Because of the small number of registers
+ available, the compiler uses knowledge of the actual set of registers
+ being clobbered by the different functions called. */
+ SH_DIV_CALL_DIV1, /* No FPU, medium size, highest latency. */
+ SH_DIV_CALL_FP, /* FPU needed, small size, high latency. */
+ SH_DIV_CALL_TABLE, /* No FPU, large size, medium latency. */
+ SH_DIV_INTRINSIC
};
extern enum sh_divide_strategy_e sh_div_strategy;
@@ -611,17 +622,46 @@ do { \
targetm.asm_out.aligned_op.di = NULL; \
targetm.asm_out.unaligned_op.di = NULL; \
} \
+ if (TARGET_SH1) \
+ { \
+ if (! strcmp (sh_div_str, "call-div1")) \
+ sh_div_strategy = SH_DIV_CALL_DIV1; \
+ else if (! strcmp (sh_div_str, "call-fp") \
+ && (TARGET_FPU_DOUBLE \
+ || (TARGET_HARD_SH4 && TARGET_SH2E) \
+ || (TARGET_SHCOMPACT && TARGET_FPU_ANY))) \
+ sh_div_strategy = SH_DIV_CALL_FP; \
+ else if (! strcmp (sh_div_str, "call-table") && TARGET_SH3) \
+ sh_div_strategy = SH_DIV_CALL_TABLE; \
+ else \
+ /* Pick one that makes most sense for the target in general. \
+ It is not much good to use different functions depending \
+ on -Os, since then we'll end up with two different functions \
+ when some of the code is compiled for size, and some for \
+ speed. */ \
+ \
+ /* SH4 tends to emphasize speed. */ \
+ if (TARGET_HARD_SH4) \
+ sh_div_strategy = SH_DIV_CALL_TABLE; \
+ /* These have their own way of doing things. */ \
+ else if (TARGET_SH2A) \
+ sh_div_strategy = SH_DIV_INTRINSIC; \
+ /* ??? Should we use the integer SHmedia function instead? */ \
+ else if (TARGET_SHCOMPACT && TARGET_FPU_ANY) \
+ sh_div_strategy = SH_DIV_CALL_FP; \
+ /* SH1 .. SH3 cores often go into small-footprint systems, so \
+ default to the smallest implementation available. */ \
+ else \
+ sh_div_strategy = SH_DIV_CALL_DIV1; \
+ } \
if (sh_divsi3_libfunc[0]) \
; /* User supplied - leave it alone. */ \
- else if (TARGET_HARD_SH4 && TARGET_SH2E) \
+ else if (TARGET_DIVIDE_CALL_FP) \
sh_divsi3_libfunc = "__sdivsi3_i4"; \
+ else if (TARGET_DIVIDE_CALL_TABLE) \
+ sh_divsi3_libfunc = "__sdivsi3_i4i"; \
else if (TARGET_SH5) \
- { \
- if (TARGET_FPU_ANY && TARGET_SH1) \
- sh_divsi3_libfunc = "__sdivsi3_i4"; \
- else \
- sh_divsi3_libfunc = "__sdivsi3_1"; \
- } \
+ sh_divsi3_libfunc = "__sdivsi3_1"; \
else \
sh_divsi3_libfunc = "__sdivsi3"; \
if (TARGET_FMOVD) \
diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md
index e2e477f..1c1357e 100644
--- a/gcc/config/sh/sh.md
+++ b/gcc/config/sh/sh.md
@@ -1739,6 +1739,19 @@
[(set_attr "type" "sfunc")
(set_attr "needs_delay_slot" "yes")])
+(define_insn "udivsi3_i4_int" ; unsigned SImode divide done by calling a library routine
+ [(set (match_operand:SI 0 "register_operand" "=z") ; quotient; "z" is presumably r0, where __udivsi3_i4i leaves it
+ (udiv:SI (reg:SI R4_REG) (reg:SI R5_REG))) ; dividend in r4, divisor in r5
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI R1_REG)) ; besides T and PR, only r1 is declared clobbered
+ (clobber (reg:SI PR_REG)) ; PR is listed because the insn is a jsr
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))] ; register holding the routine's address
+ "TARGET_SH1"
+ "jsr @%1%#" ; call through operand 1
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
+
(define_expand "udivsi3"
[(set (match_dup 3) (symbol_ref:SI "__udivsi3"))
(set (reg:SI R4_REG) (match_operand:SI 1 "general_operand" ""))
@@ -1757,7 +1770,12 @@
operands[3] = gen_reg_rtx (Pmode);
/* Emit the move of the address to a pseudo outside of the libcall. */
- if (TARGET_HARD_SH4 && TARGET_SH2E)
+ if (TARGET_DIVIDE_CALL_TABLE)
+ {
+ function_symbol (operands[3], \"__udivsi3_i4i\", SFUNC_GOT);
+ last = gen_udivsi3_i4_int (operands[0], operands[3]);
+ }
+ else if (TARGET_DIVIDE_CALL_FP)
{
function_symbol (operands[3], \"__udivsi3_i4\", SFUNC_STATIC);
if (TARGET_FPU_SINGLE)
@@ -1975,6 +1993,18 @@
[(set_attr "type" "sfunc")
(set_attr "needs_delay_slot" "yes")])
+(define_insn "divsi3_i4_int" ; signed SImode divide done by calling a library routine
+ [(set (match_operand:SI 0 "register_operand" "=z") ; quotient; "z" is presumably r0, where __sdivsi3_i4i leaves it
+ (div:SI (reg:SI R4_REG) (reg:SI R5_REG))) ; dividend in r4, divisor in r5
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG)) ; PR is listed because the insn is a jsr
+ (clobber (reg:SI R1_REG)) ; besides T and PR, only r1 is declared clobbered
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))] ; register holding the routine's address
+ "TARGET_SH1"
+ "jsr @%1%#" ; call through operand 1
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
(define_expand "divsi3"
[(set (match_dup 3) (symbol_ref:SI "__sdivsi3"))
(set (reg:SI R4_REG) (match_operand:SI 1 "general_operand" ""))
@@ -1995,7 +2025,12 @@
operands[3] = gen_reg_rtx (Pmode);
/* Emit the move of the address to a pseudo outside of the libcall. */
- if (TARGET_HARD_SH4 && TARGET_SH2E)
+ if (TARGET_DIVIDE_CALL_TABLE)
+ {
+ function_symbol (operands[3], sh_divsi3_libfunc, SFUNC_GOT);
+ last = gen_divsi3_i4_int (operands[0], operands[3]);
+ }
+ else if (TARGET_DIVIDE_CALL_FP)
{
function_symbol (operands[3], sh_divsi3_libfunc, SFUNC_STATIC);
if (TARGET_FPU_SINGLE)
diff --git a/gcc/config/sh/sh.opt b/gcc/config/sh/sh.opt
index db332f3..9b072fe 100644
--- a/gcc/config/sh/sh.opt
+++ b/gcc/config/sh/sh.opt
@@ -1,6 +1,6 @@
; Options for the SH port of the compiler.
-; Copyright (C) 2005 Free Software Foundation, Inc.
+; Copyright (C) 2005, 2006 Free Software Foundation, Inc.
;
; This file is part of GCC.
;
@@ -158,7 +158,7 @@ Align doubles at 64-bit boundaries
mdiv=
Target RejectNegative Joined Var(sh_div_str) Init("")
-Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp
+Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp, call-div1, call-fp, call-table
mdivsi3_libfunc=
Target RejectNegative Joined Var(sh_divsi3_libfunc) Init("")
diff --git a/gcc/config/sh/t-sh b/gcc/config/sh/t-sh
index db86ad1..65cc1ec 100644
--- a/gcc/config/sh/t-sh
+++ b/gcc/config/sh/t-sh
@@ -5,6 +5,7 @@ sh-c.o: $(srcdir)/config/sh/sh-c.c \
LIB1ASMSRC = sh/lib1funcs.asm
LIB1ASMFUNCS = _ashiftrt _ashiftrt_n _ashiftlt _lshiftrt _movmem \
_movmem_i4 _mulsi3 _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \
+ _div_table \
$(LIB1ASMFUNCS_CACHE)
# We want fine grained libraries, so use the new code to build the