author    Jennifer Schmitz <jschmitz@nvidia.com>  2024-10-17 02:31:47 -0700
committer Jennifer Schmitz <jschmitz@nvidia.com>  2024-10-24 11:54:27 +0200
commit    f6fbc0d2422ce9bea6a23226f4a13a76ffd1784b
tree      024aec1f3e54777a5ece46017dcf9f8bd6d2e30c
parent    3e7549ece7c6b90b9e961778361ee2b65bf104a9
SVE intrinsics: Fold svsra with op1 all zeros to svlsr/svasr.
A common idiom in intrinsics loops is to have accumulator intrinsics
in an unrolled loop with an accumulator initialized to zero at the
beginning. Propagating the initial zero accumulator into the first
iteration of the loop and simplifying the first accumulate instruction
is a desirable transformation that we should teach GCC. Therefore,
this patch folds svsra to svlsr/svasr if op1 is all zeros, producing
the lower-latency instructions LSR/ASR instead of USRA/SSRA. We
implemented this optimization in svsra_impl::fold.

Tests were added to check the produced assembly for use of LSR/ASR.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>

gcc/
	* config/aarch64/aarch64-sve-builtins-sve2.cc (svsra_impl::fold):
	Fold svsra to svlsr/svasr if op1 is all zeros.

gcc/testsuite/
	* gcc.target/aarch64/sve2/acle/asm/sra_s32.c: New test.
	* gcc.target/aarch64/sve2/acle/asm/sra_s64.c: Likewise.
	* gcc.target/aarch64/sve2/acle/asm/sra_u32.c: Likewise.
	* gcc.target/aarch64/sve2/acle/asm/sra_u64.c: Likewise.
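To make the idiom concrete, here is an illustrative loop (not part of the
patch; names and bounds are hypothetical) with a zero-initialized SVE
accumulator. Once the initial svdup_u32 (0) is propagated into the first
iteration, e.g. after unrolling or peeling, the first svsra call has an
all-zeros op1 and can now fold to a plain shift:

#include <arm_sve.h>

/* Sum of (x[i] >> 2) over n elements; illustrative example only.
   acc starts as all zeros, so the first svsra the optimizers see
   with the zero propagated in folds to LSR instead of USRA.  */
uint64_t
sum_shifted (const uint32_t *x, int64_t n)
{
  svuint32_t acc = svdup_u32 (0);
  for (int64_t i = 0; i < n; i += svcntw ())
    {
      svbool_t pg = svwhilelt_b32 (i, n);
      svuint32_t v = svld1 (pg, x + i);   /* inactive lanes load as zero */
      acc = svsra (acc, v, 2);            /* acc += v >> 2 */
    }
  return svaddv (svptrue_b32 (), acc);
}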
-rw-r--r--  gcc/config/aarch64/aarch64-sve-builtins-sve2.cc           28
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c   9
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c   9
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c   9
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c   9
5 files changed, 64 insertions, 0 deletions
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
index 6a20a61..ddd6e46 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
@@ -418,6 +418,34 @@ public:
 class svsra_impl : public function_base
 {
 public:
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+    /* Fold to svlsr/svasr if op1 is all zeros.  */
+    tree op1 = gimple_call_arg (f.call, 0);
+    if (!integer_zerop (op1))
+      return NULL;
+    function_instance instance ("svlsr", functions::svlsr,
+				shapes::binary_uint_opt_n, MODE_n,
+				f.type_suffix_ids, GROUP_none, PRED_x);
+    if (!f.type_suffix (0).unsigned_p)
+      {
+	instance.base_name = "svasr";
+	instance.base = functions::svasr;
+      }
+    gcall *call = f.redirect_call (instance);
+    /* Add a ptrue as predicate, because unlike svsra, svlsr/svasr are
+       predicated intrinsics.  */
+    gimple_call_set_arg (call, 0, build_all_ones_cst (f.gp_type ()));
+    /* For svsra, the shift amount (imm3) is uint64_t for all function types,
+       but for svlsr/svasr, imm3 has the same width as the function type.  */
+    tree imm3 = gimple_call_arg (f.call, 2);
+    tree imm3_prec = wide_int_to_tree (f.scalar_type (0),
+				       wi::to_widest (imm3));
+    gimple_call_set_arg (call, 2, imm3_prec);
+    return call;
+  }
+public:
   rtx
   expand (function_expander &e) const override
   {
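Expressed as ACLE source rather than GIMPLE, the rewrite performed by
svsra_impl::fold is roughly the following sketch (function names are
illustrative, not from the patch):

#include <arm_sve.h>

/* Before: svsra with an all-zeros op1, which previously compiled to
   a USRA accumulating into a zeroed register.  */
svuint32_t
before (svuint32_t z1)
{
  return svsra (svdup_u32 (0), z1, 2);
}

/* After the fold, the call behaves like a ptrue-predicated _x shift,
   emitting a single LSR (or ASR via svasr_x for signed suffixes).
   Note the shift amount: svsra takes a uint64_t for every type
   suffix, while svlsr/svasr take a scalar of the element width,
   which is why the fold rebuilds imm3 with wide_int_to_tree.  */
svuint32_t
after (svuint32_t z1)
{
  return svlsr_x (svptrue_b32 (), z1, 2);
}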
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
index ac992dc..86cf4bd 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_32_s32_tied2, svint32_t,
 TEST_UNIFORM_Z (sra_32_s32_untied, svint32_t,
		 z0 = svsra_n_s32 (z1, z2, 32),
		 z0 = svsra (z1, z2, 32))
+
+/*
+** sra_2_s32_zeroop1:
+**	asr	z0\.s, z1\.s, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_s32_zeroop1, svint32_t,
+		z0 = svsra_n_s32 (svdup_s32 (0), z1, 2),
+		z0 = svsra (svdup_s32 (0), z1, 2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
index 9ea5657..7b39798 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_64_s64_tied2, svint64_t,
 TEST_UNIFORM_Z (sra_64_s64_untied, svint64_t,
		 z0 = svsra_n_s64 (z1, z2, 64),
		 z0 = svsra (z1, z2, 64))
+
+/*
+** sra_2_s64_zeroop1:
+**	asr	z0\.d, z1\.d, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_s64_zeroop1, svint64_t,
+		z0 = svsra_n_s64 (svdup_s64 (0), z1, 2),
+		z0 = svsra (svdup_s64 (0), z1, 2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
index 0902451..001e09c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_32_u32_tied2, svuint32_t,
 TEST_UNIFORM_Z (sra_32_u32_untied, svuint32_t,
		 z0 = svsra_n_u32 (z1, z2, 32),
		 z0 = svsra (z1, z2, 32))
+
+/*
+** sra_2_u32_zeroop1:
+**	lsr	z0\.s, z1\.s, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_u32_zeroop1, svuint32_t,
+		z0 = svsra_n_u32 (svdup_u32 (0), z1, 2),
+		z0 = svsra (svdup_u32 (0), z1, 2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
index ff21c36..780cf7a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_64_u64_tied2, svuint64_t,
 TEST_UNIFORM_Z (sra_64_u64_untied, svuint64_t,
		 z0 = svsra_n_u64 (z1, z2, 64),
		 z0 = svsra (z1, z2, 64))
+
+/*
+** sra_2_u64_zeroop1:
+**	lsr	z0\.d, z1\.d, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_u64_zeroop1, svuint64_t,
+		z0 = svsra_n_u64 (svdup_u64 (0), z1, 2),
+		z0 = svsra (svdup_u64 (0), z1, 2))