aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Richard Sandiford <richard.sandiford@arm.com> 2020-02-19 17:22:14 +0000
committer Richard Sandiford <richard.sandiford@arm.com> 2020-02-21 10:17:31 +0000
commit a0ee8352df6f4cd98830c8dbaa969e1cda39cc40 (patch)
tree 4ca1ffbd94cbe5d750ce2f40609cdc8476d2ab82
parent 04f307cbb9e0610b2e3c70029ee6bfdbd7b8b9de (diff)
download gcc-a0ee8352df6f4cd98830c8dbaa969e1cda39cc40.zip
gcc-a0ee8352df6f4cd98830c8dbaa969e1cda39cc40.tar.gz
gcc-a0ee8352df6f4cd98830c8dbaa969e1cda39cc40.tar.bz2
aarch64: Add SVE support for -mlow-precision-sqrt
SVE was missing support for -mlow-precision-sqrt, which meant that
-march=armv8.2-a+sve -mlow-precision-sqrt could cause a performance
regression compared to -march=armv8.2-a -mlow-precision-sqrt.

2020-02-21 Richard Sandiford <richard.sandiford@arm.com>

gcc/
 * config/aarch64/aarch64.c (aarch64_emit_approx_sqrt): Add SVE
 support. Use aarch64_emit_mult instead of emitting multiplication
 instructions directly.
 * config/aarch64/aarch64-sve.md (sqrt<mode>2, rsqrt<mode>2)
 (@aarch64_rsqrte<mode>, @aarch64_rsqrts<mode>): New expanders.

gcc/testsuite/
 * gcc.target/aarch64/sve/rsqrt_1.c: New test.
 * gcc.target/aarch64/sve/rsqrt_1_run.c: Likewise.
 * gcc.target/aarch64/sve/sqrt_1.c: Likewise.
 * gcc.target/aarch64/sve/sqrt_1_run.c: Likewise.
-rw-r--r-- gcc/ChangeLog 8
-rw-r--r-- gcc/config/aarch64/aarch64-sve.md 56
-rw-r--r-- gcc/config/aarch64/aarch64.c 58
-rw-r--r-- gcc/config/aarch64/iterators.md 13
-rw-r--r-- gcc/testsuite/ChangeLog 7
-rw-r--r-- gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1.c 27
-rw-r--r-- gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1_run.c 27
-rw-r--r-- gcc/testsuite/gcc.target/aarch64/sve/sqrt_1.c 30
-rw-r--r-- gcc/testsuite/gcc.target/aarch64/sve/sqrt_1_run.c 27
9 files changed, 234 insertions, 19 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 4d161ca..6e7de5fa 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,13 @@
2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
+ * config/aarch64/aarch64.c (aarch64_emit_approx_sqrt): Add SVE
+ support. Use aarch64_emit_mult instead of emitting multiplication
+ instructions directly.
+ * config/aarch64/aarch64-sve.md (sqrt<mode>2, rsqrt<mode>2)
+ (@aarch64_rsqrte<mode>, @aarch64_rsqrts<mode>): New expanders.
+
+2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
+
* config/aarch64/aarch64.c (aarch64_emit_mult): New function.
(aarch64_emit_approx_div): Add SVE support. Use aarch64_emit_mult
instead of emitting multiplication instructions directly.
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index e3b1da8..a661b25 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -76,6 +76,8 @@
;; ---- [INT] Logical inverse
;; ---- [FP<-INT] General unary arithmetic that maps to unspecs
;; ---- [FP] General unary arithmetic corresponding to unspecs
+;; ---- [FP] Square root
+;; ---- [FP] Reciprocal square root
;; ---- [PRED] Inverse
;; == Binary arithmetic
@@ -3246,7 +3248,7 @@
;; - FRINTP
;; - FRINTX
;; - FRINTZ
-;; - FRSQRT
+;; - FRSQRTE
;; - FSQRT
;; -------------------------------------------------------------------------
@@ -3267,7 +3269,7 @@
[(match_dup 2)
(const_int SVE_RELAXED_GP)
(match_operand:SVE_FULL_F 1 "register_operand")]
- SVE_COND_FP_UNARY))]
+ SVE_COND_FP_UNARY_OPTAB))]
"TARGET_SVE"
{
operands[2] = aarch64_ptrue_reg (<VPRED>mode);
@@ -3358,6 +3360,56 @@
)
;; -------------------------------------------------------------------------
+;; ---- [FP] Square root
+;; -------------------------------------------------------------------------
+
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:SVE_FULL_F 0 "register_operand")
+ (unspec:SVE_FULL_F
+ [(match_dup 2)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_FULL_F 1 "register_operand")]
+ UNSPEC_COND_FSQRT))]
+ "TARGET_SVE"
+{
+ if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+ DONE;
+ operands[2] = aarch64_ptrue_reg (<VPRED>mode);
+})
+
+;; -------------------------------------------------------------------------
+;; ---- [FP] Reciprocal square root
+;; -------------------------------------------------------------------------
+
+(define_expand "rsqrt<mode>2"
+ [(set (match_operand:SVE_FULL_SDF 0 "register_operand")
+ (unspec:SVE_FULL_SDF
+ [(match_operand:SVE_FULL_SDF 1 "register_operand")]
+ UNSPEC_RSQRT))]
+ "TARGET_SVE"
+{
+ aarch64_emit_approx_sqrt (operands[0], operands[1], true);
+ DONE;
+})
+
+(define_expand "@aarch64_rsqrte<mode>"
+ [(set (match_operand:SVE_FULL_SDF 0 "register_operand")
+ (unspec:SVE_FULL_SDF
+ [(match_operand:SVE_FULL_SDF 1 "register_operand")]
+ UNSPEC_RSQRTE))]
+ "TARGET_SVE"
+)
+
+(define_expand "@aarch64_rsqrts<mode>"
+ [(set (match_operand:SVE_FULL_SDF 0 "register_operand")
+ (unspec:SVE_FULL_SDF
+ [(match_operand:SVE_FULL_SDF 1 "register_operand")
+ (match_operand:SVE_FULL_SDF 2 "register_operand")]
+ UNSPEC_RSQRTS))]
+ "TARGET_SVE"
+)
+
+;; -------------------------------------------------------------------------
;; ---- [PRED] Inverse
;; -------------------------------------------------------------------------
;; Includes:
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index c1bbc49..703f69a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12790,6 +12790,9 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
/* Caller assumes we cannot fail. */
gcc_assert (use_rsqrt_p (mode));
+ rtx pg = NULL_RTX;
+ if (aarch64_sve_mode_p (mode))
+ pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
machine_mode mmsk = (VECTOR_MODE_P (mode)
? related_int_vector_mode (mode).require ()
: int_mode_for_mode (mode).require ());
@@ -12798,11 +12801,21 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
/* When calculating the approximate square root, compare the
argument with 0.0 and create a mask. */
- xmsk = gen_reg_rtx (mmsk);
- emit_insn (gen_rtx_SET (xmsk,
- gen_rtx_NEG (mmsk,
- gen_rtx_EQ (mmsk, src,
- CONST0_RTX (mode)))));
+ rtx zero = CONST0_RTX (mode);
+ if (pg)
+ {
+ xmsk = gen_reg_rtx (GET_MODE (pg));
+ rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
+ emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
+ xmsk, pg, hint, src, zero));
+ }
+ else
+ {
+ xmsk = gen_reg_rtx (mmsk);
+ emit_insn (gen_rtx_SET (xmsk,
+ gen_rtx_NEG (mmsk,
+ gen_rtx_EQ (mmsk, src, zero))));
+ }
}
/* Estimate the approximate reciprocal square root. */
@@ -12824,29 +12837,40 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
while (iterations--)
{
rtx x2 = gen_reg_rtx (mode);
- emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
+ aarch64_emit_mult (x2, pg, xdst, xdst);
emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
if (iterations > 0)
- emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
+ aarch64_emit_mult (xdst, pg, xdst, x1);
}
if (!recp)
{
- /* Qualify the approximate reciprocal square root when the argument is
- 0.0 by squashing the intermediary result to 0.0. */
- rtx xtmp = gen_reg_rtx (mmsk);
- emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
- gen_rtx_SUBREG (mmsk, xdst, 0)));
- emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
-
- /* Calculate the approximate square root. */
- emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
+ if (pg)
+ /* Multiply nonzero source values by the corresponding intermediate
+ result elements, so that the final calculation is the approximate
+ square root rather than its reciprocal. Select a zero result for
+ zero source values, to avoid the Inf * 0 -> NaN that we'd get
+ otherwise. */
+ emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
+ xdst, xmsk, xdst, src, CONST0_RTX (mode)));
+ else
+ {
+ /* Qualify the approximate reciprocal square root when the
+ argument is 0.0 by squashing the intermediary result to 0.0. */
+ rtx xtmp = gen_reg_rtx (mmsk);
+ emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+ gen_rtx_SUBREG (mmsk, xdst, 0)));
+ emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
+
+ /* Calculate the approximate square root. */
+ aarch64_emit_mult (xdst, pg, xdst, src);
+ }
}
/* Finalize the approximation. */
- emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
+ aarch64_emit_mult (dst, pg, xdst, x1);
return true;
}
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 548ee0f..b106957 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2277,6 +2277,19 @@
UNSPEC_COND_FRINTZ
UNSPEC_COND_FSQRT])
+;; Same as SVE_COND_FP_UNARY, but without codes that have a dedicated
+;; <optab><mode>2 expander.
+(define_int_iterator SVE_COND_FP_UNARY_OPTAB [UNSPEC_COND_FABS
+ UNSPEC_COND_FNEG
+ UNSPEC_COND_FRECPX
+ UNSPEC_COND_FRINTA
+ UNSPEC_COND_FRINTI
+ UNSPEC_COND_FRINTM
+ UNSPEC_COND_FRINTN
+ UNSPEC_COND_FRINTP
+ UNSPEC_COND_FRINTX
+ UNSPEC_COND_FRINTZ])
+
(define_int_iterator SVE_COND_FCVT [UNSPEC_COND_FCVT])
(define_int_iterator SVE_COND_FCVTI [UNSPEC_COND_FCVTZS UNSPEC_COND_FCVTZU])
(define_int_iterator SVE_COND_ICVTF [UNSPEC_COND_SCVTF UNSPEC_COND_UCVTF])
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 936260e..8518061 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,12 @@
2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
+ * gcc.target/aarch64/sve/rsqrt_1.c: New test.
+ * gcc.target/aarch64/sve/rsqrt_1_run.c: Likewise.
+ * gcc.target/aarch64/sve/sqrt_1.c: Likewise.
+ * gcc.target/aarch64/sve/sqrt_1_run.c: Likewise.
+
+2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
+
* gcc.target/aarch64/sve/recip_1.c: New test.
* gcc.target/aarch64/sve/recip_1_run.c: Likewise.
* gcc.target/aarch64/sve/recip_2.c: Likewise.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1.c b/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1.c
new file mode 100644
index 0000000..2dabfd3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1.c
@@ -0,0 +1,27 @@
+/* { dg-options "-Ofast -mlow-precision-sqrt" } */
+
+#define DEF_LOOP(TYPE, FN) \
+ void \
+ test_##TYPE (TYPE *x, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ x[i] = (TYPE) 1 / FN (x[i]); \
+ }
+
+#define TEST_ALL(T) \
+ T (_Float16, __builtin_sqrtf16) \
+ T (float, __builtin_sqrtf) \
+ T (double, __builtin_sqrt)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-not {\tfrsqrte\tz[0-9]+\.h} } } */
+/* { dg-final { scan-assembler-not {\tfrsqrts\tz[0-9]+\.h} } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.s} 1 } } */
+/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.s} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.d} 1 } } */
+/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.d} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1_run.c
new file mode 100644
index 0000000..73d309a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1_run.c
@@ -0,0 +1,27 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -mlow-precision-sqrt" } */
+
+#include "rsqrt_1.c"
+
+#define N 77
+
+#define TEST_LOOP(TYPE, FN) \
+ { \
+ TYPE a[N]; \
+ for (int i = 0; i < N; ++i) \
+ a[i] = i + 1; \
+ test_##TYPE (a, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ double diff = a[i] - 1.0 / __builtin_sqrt (i + 1); \
+ if (__builtin_fabs (diff) > 0x1.0p-8) \
+ __builtin_abort (); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP);
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1.c
new file mode 100644
index 0000000..aba2bf6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1.c
@@ -0,0 +1,30 @@
+/* { dg-options "-Ofast -mlow-precision-sqrt" } */
+
+#define DEF_LOOP(TYPE, FN) \
+ void \
+ test_##TYPE (TYPE *x, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ x[i] = FN (x[i]); \
+ }
+
+#define TEST_ALL(T) \
+ T (_Float16, __builtin_sqrtf16) \
+ T (float, __builtin_sqrtf) \
+ T (double, __builtin_sqrt)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler {\tfsqrt\tz[0-9]+\.h} } } */
+/* { dg-final { scan-assembler-not {\tfrsqrte\tz[0-9]+\.h} } } */
+/* { dg-final { scan-assembler-not {\tfrsqrts\tz[0-9]+\.h} } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s} 3 } } */
+/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.s} 1 } } */
+/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.s} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d} 5 } } */
+/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.d} 1 } } */
+/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.d} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1_run.c
new file mode 100644
index 0000000..30906ceb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1_run.c
@@ -0,0 +1,27 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -mlow-precision-sqrt" } */
+
+#include "sqrt_1.c"
+
+#define N 77
+
+#define TEST_LOOP(TYPE, FN) \
+ { \
+ TYPE a[N]; \
+ for (int i = 0; i < N; ++i) \
+ a[i] = i; \
+ test_##TYPE (a, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ double diff = a[i] - __builtin_sqrt (i); \
+ if (__builtin_fabs (diff) > 0x1.0p-8) \
+ __builtin_abort (); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP);
+ return 0;
+}