aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author Richard Sandiford <richard.sandiford@arm.com> 2020-02-19 18:28:48 +0000
committer Richard Sandiford <richard.sandiford@arm.com> 2020-02-21 10:17:30 +0000
commit 04f307cbb9e0610b2e3c70029ee6bfdbd7b8b9de (patch)
tree dedb8e3ec29d23564e4cd873b789139102f439a0 /gcc
parent d87778ed097f24a0bf394c0255019547008c3479 (diff)
download gcc-04f307cbb9e0610b2e3c70029ee6bfdbd7b8b9de.zip
gcc-04f307cbb9e0610b2e3c70029ee6bfdbd7b8b9de.tar.gz
gcc-04f307cbb9e0610b2e3c70029ee6bfdbd7b8b9de.tar.bz2
aarch64: Add SVE support for -mlow-precision-div

SVE was missing support for -mlow-precision-div, which meant that
-march=armv8.2-a+sve -mlow-precision-div could cause a performance
regression compared to -march=armv8.2-a -mlow-precision-div.

I ended up doing this much later than originally intended, sorry...

2020-02-21 Richard Sandiford <richard.sandiford@arm.com>

gcc/
        * config/aarch64/aarch64.c (aarch64_emit_mult): New function.
        (aarch64_emit_approx_div): Add SVE support. Use aarch64_emit_mult
        instead of emitting multiplication instructions directly.
        * config/aarch64/iterators.md (SVE_COND_FP_BINARY_OPTAB): New iterator.
        * config/aarch64/aarch64-sve.md (div<mode>3, @aarch64_frecpe<mode>)
        (@aarch64_frecps<mode>): New expanders.

gcc/testsuite/
        * gcc.target/aarch64/sve/recip_1.c: New test.
        * gcc.target/aarch64/sve/recip_1_run.c: Likewise.
        * gcc.target/aarch64/sve/recip_2.c: Likewise.
        * gcc.target/aarch64/sve/recip_2_run.c: Likewise.

Diffstat (limited to 'gcc')
-rw-r--r-- gcc/ChangeLog 9
-rw-r--r-- gcc/config/aarch64/aarch64-sve.md 44
-rw-r--r-- gcc/config/aarch64/aarch64.c 29
-rw-r--r-- gcc/config/aarch64/iterators.md 11
-rw-r--r-- gcc/testsuite/ChangeLog 7
-rw-r--r-- gcc/testsuite/gcc.target/aarch64/sve/recip_1.c 27
-rw-r--r-- gcc/testsuite/gcc.target/aarch64/sve/recip_1_run.c 27
-rw-r--r-- gcc/testsuite/gcc.target/aarch64/sve/recip_2.c 27
-rw-r--r-- gcc/testsuite/gcc.target/aarch64/sve/recip_2_run.c 30
9 files changed, 207 insertions, 4 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b5eb87a..4d161ca 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,14 @@
2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
+ * config/aarch64/aarch64.c (aarch64_emit_mult): New function.
+ (aarch64_emit_approx_div): Add SVE support. Use aarch64_emit_mult
+ instead of emitting multiplication instructions directly.
+ * config/aarch64/iterators.md (SVE_COND_FP_BINARY_OPTAB): New iterator.
+ * config/aarch64/aarch64-sve.md (div<mode>3, @aarch64_frecpe<mode>)
+ (@aarch64_frecps<mode>): New expanders.
+
+2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
+
* config/aarch64/aarch64-protos.h (AARCH64_APPROX_MODE): Operate
on and produce uint64_ts rather than ints.
(AARCH64_APPROX_NONE, AARCH64_APPROX_ALL): Change to uint64_ts.
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index fa38529..e3b1da8 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -99,6 +99,7 @@
;; ---- [FP] Subtraction
;; ---- [FP] Absolute difference
;; ---- [FP] Multiplication
+;; ---- [FP] Division
;; ---- [FP] Binary logical operations
;; ---- [FP] Sign copying
;; ---- [FP] Maximum and minimum
@@ -4719,7 +4720,7 @@
(const_int SVE_RELAXED_GP)
(match_operand:SVE_FULL_F 1 "<sve_pred_fp_rhs1_operand>")
(match_operand:SVE_FULL_F 2 "<sve_pred_fp_rhs2_operand>")]
- SVE_COND_FP_BINARY))]
+ SVE_COND_FP_BINARY_OPTAB))]
"TARGET_SVE"
{
operands[3] = aarch64_ptrue_reg (<VPRED>mode);
@@ -5456,6 +5457,47 @@
)
;; -------------------------------------------------------------------------
+;; ---- [FP] Division
+;; -------------------------------------------------------------------------
+;; The patterns in this section are synthetic.
+;; -------------------------------------------------------------------------
+
+(define_expand "div<mode>3"
+ [(set (match_operand:SVE_FULL_F 0 "register_operand")
+ (unspec:SVE_FULL_F
+ [(match_dup 3)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_FULL_F 1 "nonmemory_operand")
+ (match_operand:SVE_FULL_F 2 "register_operand")]
+ UNSPEC_COND_FDIV))]
+ "TARGET_SVE"
+ {
+ if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+ DONE;
+
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+ operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+ }
+)
+
+(define_expand "@aarch64_frecpe<mode>"
+ [(set (match_operand:SVE_FULL_F 0 "register_operand")
+ (unspec:SVE_FULL_F
+ [(match_operand:SVE_FULL_F 1 "register_operand")]
+ UNSPEC_FRECPE))]
+ "TARGET_SVE"
+)
+
+(define_expand "@aarch64_frecps<mode>"
+ [(set (match_operand:SVE_FULL_F 0 "register_operand")
+ (unspec:SVE_FULL_F
+ [(match_operand:SVE_FULL_F 1 "register_operand")
+ (match_operand:SVE_FULL_F 2 "register_operand")]
+ UNSPEC_FRECPS))]
+ "TARGET_SVE"
+)
+
+;; -------------------------------------------------------------------------
;; ---- [FP] Binary logical operations
;; -------------------------------------------------------------------------
;; Includes
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0acaa06..c1bbc49 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12739,6 +12739,25 @@ aarch64_builtin_reciprocal (tree fndecl)
gcc_unreachable ();
}
+/* Emit code to perform the floating-point operation:
+
+ DST = SRC1 * SRC2
+
+ where all three operands are already known to be registers.
+ If the operation is an SVE one, PTRUE is a suitable all-true
+ predicate. */
+
+static void
+aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
+{
+ if (ptrue)
+ emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
+ dst, ptrue, src1, src2,
+ gen_int_mode (SVE_RELAXED_GP, SImode)));
+ else
+ emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
+}
+
/* Emit instruction sequence to compute either the approximate square root
or its approximate reciprocal, depending on the flag RECP, and return
whether the sequence was emitted or not. */
@@ -12857,6 +12876,10 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
if (!TARGET_SIMD && VECTOR_MODE_P (mode))
return false;
+ rtx pg = NULL_RTX;
+ if (aarch64_sve_mode_p (mode))
+ pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
+
/* Estimate the approximate reciprocal. */
rtx xrcp = gen_reg_rtx (mode);
emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
@@ -12876,7 +12899,7 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
if (iterations > 0)
- emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
+ aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
}
if (num != CONST1_RTX (mode))
@@ -12884,11 +12907,11 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
/* As the approximate reciprocal of DEN is already calculated, only
calculate the approximate division when NUM is not 1.0. */
rtx xnum = force_reg (mode, num);
- emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
+ aarch64_emit_mult (xrcp, pg, xrcp, xnum);
}
/* Finalize the approximation. */
- emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
+ aarch64_emit_mult (quo, pg, xrcp, xtmp);
return true;
}
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index d17d79a..548ee0f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2291,6 +2291,17 @@
UNSPEC_COND_FMULX
UNSPEC_COND_FSUB])
+;; Same as SVE_COND_FP_BINARY, but without codes that have a dedicated
+;; <optab><mode>3 expander.
+(define_int_iterator SVE_COND_FP_BINARY_OPTAB [UNSPEC_COND_FADD
+ UNSPEC_COND_FMAX
+ UNSPEC_COND_FMAXNM
+ UNSPEC_COND_FMIN
+ UNSPEC_COND_FMINNM
+ UNSPEC_COND_FMUL
+ UNSPEC_COND_FMULX
+ UNSPEC_COND_FSUB])
+
(define_int_iterator SVE_COND_FP_BINARY_INT [UNSPEC_COND_FSCALE])
(define_int_iterator SVE_COND_FP_ADD [UNSPEC_COND_FADD])
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 216bac9..936260e 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@
+2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
+
+ * gcc.target/aarch64/sve/recip_1.c: New test.
+ * gcc.target/aarch64/sve/recip_1_run.c: Likewise.
+ * gcc.target/aarch64/sve/recip_2.c: Likewise.
+ * gcc.target/aarch64/sve/recip_2_run.c: Likewise.
+
2020-02-20 Martin Sebor <msebor@redhat.com>
PR c++/93801
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/recip_1.c b/gcc/testsuite/gcc.target/aarch64/sve/recip_1.c
new file mode 100644
index 0000000..c9d470f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/recip_1.c
@@ -0,0 +1,27 @@
+/* { dg-options "-Ofast -mlow-precision-div" } */
+
+#define DEF_LOOP(TYPE) \
+ void \
+ test_##TYPE (TYPE *x, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ x[i] = (TYPE) 1 / x[i]; \
+ }
+
+#define TEST_ALL(T) \
+ T (_Float16) \
+ T (float) \
+ T (double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-not {\tfrecpe\tz[0-9]+\.h} } } */
+/* { dg-final { scan-assembler-not {\tfrecps\tz[0-9]+\.h} } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s} 1 } } */
+/* { dg-final { scan-assembler-times {\tfrecpe\tz[0-9]+\.s} 1 } } */
+/* { dg-final { scan-assembler-times {\tfrecps\tz[0-9]+\.s} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d} 2 } } */
+/* { dg-final { scan-assembler-times {\tfrecpe\tz[0-9]+\.d} 1 } } */
+/* { dg-final { scan-assembler-times {\tfrecps\tz[0-9]+\.d} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/recip_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/recip_1_run.c
new file mode 100644
index 0000000..b232b88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/recip_1_run.c
@@ -0,0 +1,27 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -mlow-precision-div" } */
+
+#include "recip_1.c"
+
+#define N 77
+
+#define TEST_LOOP(TYPE) \
+ { \
+ TYPE a[N]; \
+ for (int i = 0; i < N; ++i) \
+ a[i] = i + 1; \
+ test_##TYPE (a, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ double diff = a[i] - 1.0 / (i + 1); \
+ if (__builtin_fabs (diff) > 0x1.0p-8) \
+ __builtin_abort (); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP);
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/recip_2.c b/gcc/testsuite/gcc.target/aarch64/sve/recip_2.c
new file mode 100644
index 0000000..f308a6b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/recip_2.c
@@ -0,0 +1,27 @@
+/* { dg-options "-Ofast -mlow-precision-div" } */
+
+#define DEF_LOOP(TYPE) \
+ void \
+ test_##TYPE (TYPE *restrict x, TYPE *restrict y, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ x[i] /= y[i]; \
+ }
+
+#define TEST_ALL(T) \
+ T (_Float16) \
+ T (float) \
+ T (double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-not {\tfrecpe\tz[0-9]+\.h} } } */
+/* { dg-final { scan-assembler-not {\tfrecps\tz[0-9]+\.h} } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tfrecpe\tz[0-9]+\.s} 1 } } */
+/* { dg-final { scan-assembler-times {\tfrecps\tz[0-9]+\.s} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d} 3 } } */
+/* { dg-final { scan-assembler-times {\tfrecpe\tz[0-9]+\.d} 1 } } */
+/* { dg-final { scan-assembler-times {\tfrecps\tz[0-9]+\.d} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/recip_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/recip_2_run.c
new file mode 100644
index 0000000..25a31e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/recip_2_run.c
@@ -0,0 +1,30 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -mlow-precision-div" } */
+
+#include "recip_2.c"
+
+#define N 77
+
+#define TEST_LOOP(TYPE) \
+ { \
+ TYPE a[N], b[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = i + 11; \
+ b[i] = i + 1; \
+ } \
+ test_##TYPE (a, b, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ double diff = a[i] - (i + 11.0) / (i + 1); \
+ if (__builtin_fabs (diff) > 0x1.0p-8) \
+ __builtin_abort (); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP);
+ return 0;
+}