author     Kyrylo Tkachov <ktkachov@nvidia.com>    2024-10-09 09:40:33 -0700
committer  Kyrylo Tkachov <ktkachov@nvidia.com>    2024-10-11 17:23:19 +0200
commit     1dcc6a1a67165a469d4cd9b6b39514c46cc656ad (patch)
tree       53c9c71e537edcd2d78d7dac2d336f5da26dfdd3 /gcc
parent     70566e719f0710323251e8e9190b322f4de8faeb (diff)
PR target/117048 aarch64: Use more canonical and optimization-friendly representation for XAR instruction
The pattern for the Advanced SIMD XAR instruction isn't very
optimization-friendly at the moment.
In the testcase from the PR, once simplify-rtx has done its work, it
generates the RTL:
(set (reg:V2DI 119 [ _14 ])
(rotate:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
(reg:V2DI 116 [ *m1_01_8(D) ]))
(const_vector:V2DI [
(const_int 32 [0x20]) repeated x2
])))
which fails to match our XAR pattern because the pattern expects:
1) A ROTATERT code instead of ROTATE. However, according to the RTL ops
documentation the preferred form of rotate-by-immediate is ROTATE, which
I take to mean it's the canonical form.
Since ROTATE (x, C) <-> ROTATERT (x, MODE_WIDTH - C), it's better to match
just the one canonical representation (see the short sketch after point 2
below).
2) A CONST_INT shift amount whereas the midend asks for a repeated vector
constant.
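As an aside, not part of the patch: the identity behind 1) can be
sanity-checked with a small scalar sketch (the rotl64/rotr64 helpers below
are made up purely for illustration):

#include <cassert>
#include <cstdint>

/* Illustration only: rotate-left by C equals rotate-right by (64 - C) on a
   64-bit value, so matching a single canonical ROTATE form loses nothing.  */
static uint64_t rotl64 (uint64_t x, unsigned c) { return (x << c) | (x >> (64 - c)); }
static uint64_t rotr64 (uint64_t x, unsigned c) { return (x >> c) | (x << (64 - c)); }

int main ()
{
  uint64_t x = 0x0123456789abcdefULL;
  for (unsigned c = 1; c < 64; ++c)
    assert (rotl64 (x, c) == rotr64 (x, 64 - c));
  return 0;
}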
These issues are fixed by introducing a dedicated expander for the
aarch64_xarqv2di name, needed by the arm_neon.h intrinsic, that translates
the intrinsic-level CONST_INT immediate (the right-rotate amount) into
a repeated vector constant holding the corresponding left-rotate amount
(64 minus the immediate), which is then fed to the new representation of
the XAR define_insn that uses the ROTATE RTL code. This is a similar
approach to how we handle the discrepancy between intrinsic-level and
RTL-level vector lane numbers for big-endian.
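For illustration only (this snippet is not part of the patch and the
function name is made up), a user-level sketch of the path through the new
expander: vxarq_u64 takes a right-rotate immediate (here 8), the expander
turns it into a repeated left-rotate amount of 64 - 8 = 56 for the RTL, and
the define_insn's output logic converts back, so the emitted instruction
still shows 8:

#include <arm_neon.h>

#pragma GCC target "+sha3"

/* Sketch: with +sha3 enabled this is expected to compile to a single
   instruction of the form xar vN.2d, vN.2d, vM.2d, 8.  */
uint64x2_t
xor_rotate_right_8 (uint64x2_t a, uint64x2_t b)
{
  return vxarq_u64 (a, b, 8);
}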
With this patch and [1/2] the arithmetic parts of the testcase now simplify
to just one XAR instruction.
Bootstrapped and tested on aarch64-none-linux-gnu.
Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>
gcc/
PR target/117048
* config/aarch64/aarch64-simd.md (aarch64_xarqv2di): Redefine into a
define_expand.
(*aarch64_xarqv2di_insn): Define.
gcc/testsuite/
PR target/117048
* g++.target/aarch64/pr117048.C: New test.
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/aarch64/aarch64-simd.md           33
-rw-r--r--  gcc/testsuite/g++.target/aarch64/pr117048.C  34
2 files changed, 63 insertions, 4 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 11d405e..bf272bc 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -9046,18 +9046,43 @@
   [(set_attr "type" "crypto_sha3")]
 )
 
-(define_insn "aarch64_xarqv2di"
+(define_insn "*aarch64_xarqv2di_insn"
   [(set (match_operand:V2DI 0 "register_operand" "=w")
-	(rotatert:V2DI
+	(rotate:V2DI
 	 (xor:V2DI
 	  (match_operand:V2DI 1 "register_operand" "%w")
 	  (match_operand:V2DI 2 "register_operand" "w"))
-	 (match_operand:SI 3 "aarch64_simd_shift_imm_di" "Usd")))]
+	 (match_operand:V2DI 3 "aarch64_simd_lshift_imm" "Dl")))]
   "TARGET_SHA3"
-  "xar\\t%0.2d, %1.2d, %2.2d, %3"
+  {
+    operands[3]
+      = GEN_INT (64 - INTVAL (unwrap_const_vec_duplicate (operands[3])));
+    return "xar\\t%0.2d, %1.2d, %2.2d, %3";
+  }
   [(set_attr "type" "crypto_sha3")]
 )
 
+;; The semantics of the vxarq_u64 intrinsics treat the immediate argument as a
+;; right-rotate amount but the recommended representation of rotates by a
+;; constant in RTL is with the left ROTATE code.  Translate between the
+;; intrinsic-provided amount and the RTL operands in the expander here.
+;; The define_insn for XAR will translate back to instruction semantics in its
+;; output logic.
+(define_expand "aarch64_xarqv2di"
+  [(set (match_operand:V2DI 0 "register_operand")
+	(rotate:V2DI
+	 (xor:V2DI
+	  (match_operand:V2DI 1 "register_operand")
+	  (match_operand:V2DI 2 "register_operand"))
+	 (match_operand:SI 3 "aarch64_simd_shift_imm_di")))]
+  "TARGET_SHA3"
+  {
+    operands[3]
+      = aarch64_simd_gen_const_vector_dup (V2DImode,
+					   64 - INTVAL (operands[3]));
+  }
+)
+
 (define_insn "bcaxq<mode>4"
   [(set (match_operand:VQ_I 0 "register_operand" "=w")
 	(xor:VQ_I
diff --git a/gcc/testsuite/g++.target/aarch64/pr117048.C b/gcc/testsuite/g++.target/aarch64/pr117048.C
new file mode 100644
index 0000000..ae46e58
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/pr117048.C
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+#pragma GCC target "+sha3"
+
+static inline uint64x2_t
+rotr64_vec(uint64x2_t x, const int b)
+{
+  int64x2_t neg_b = vdupq_n_s64(-b);
+  int64x2_t left_shift = vsubq_s64(vdupq_n_s64(64), vdupq_n_s64(b));
+
+  uint64x2_t right_shifted = vshlq_u64(x, neg_b);
+  uint64x2_t left_shifted = vshlq_u64(x, left_shift);
+
+  return vorrq_u64(right_shifted, left_shifted);
+}
+
+void G(
+  int64_t* v,
+  int64x2_t& m1_01,
+  int64x2_t& m1_23,
+  int64x2_t& m2_01,
+  int64x2_t& m2_23
+) {
+  int64x2_t vd01 = {v[12],v[13]};
+  vd01 = veorq_s64(vd01, m1_01);
+  vd01 = vreinterpretq_s64_u64(rotr64_vec( vreinterpretq_u64_s64 (vd01), 32));
+  v[12] = vgetq_lane_s64(vd01, 0);
+}
+
+/* { dg-final { scan-assembler {\txar\tv[0-9]+\.2d, v[0-9]+\.2d, v[0-9]+\.2d, 32\n} } } */