author     Akram Ahmad <Akram.Ahmad@arm.com>              2025-01-06 20:09:30 +0000
committer  Richard Sandiford <richard.sandiford@arm.com>  2025-01-06 20:09:30 +0000
commit     6069f02a486054484ad638b083cb3b9486bb4321
tree       8d0a7d2503c0dd37b39ddf14a80bdd92d56e4750 /gcc
parent     7a7903dec533e02c58e634b6ddd680f36c4cd933

aarch64: remove extra XTN in vector concatenation
GIMPLE code which performs a narrowing truncation on the result of a
vector concatenation currently results in an unnecessary XTN being
emitted after the UZP1 which concatenates the operands. In such cases,
the UZP1 should instead use a smaller arrangement specifier, making the
XTN redundant. This can be seen in the following GIMPLE example:

    int32x2_t foo (svint64_t a, svint64_t b)
    {
      vector(2) int vect__2.8;
      long int _1;
      long int _3;
      vector(2) long int _12;

      <bb 2> [local count: 1073741824]:
      _1 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, a_6(D));
      _3 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, b_7(D));
      _12 = {_1, _3};
      vect__2.8_13 = (vector(2) int) _12;
      return vect__2.8_13;
    }
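
For reference, C source along the following lines lowers to GIMPLE of
this shape. This is only a sketch using ACLE intrinsics (the function
and variable names are illustrative, and it mirrors the f3 case in the
new test rather than reproducing its exact intrinsics):

    #include <arm_neon.h>
    #include <arm_sve.h>

    int32x2_t foo (svint64_t a, svint64_t b)
    {
      /* Two scalar SVE reductions, concatenated into a 2x64-bit
         vector and then narrowed to 2x32 bits.  */
      int64x1_t lo = vdup_n_s64 (svaddv_s64 (svptrue_b64 (), a));
      int64x1_t hi = vdup_n_s64 (svaddv_s64 (svptrue_b64 (), b));
      return vmovn_s64 (vcombine_s64 (lo, hi));
    }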
Original assembly generated:

    foo:
            ptrue   p3.b, all
            uaddv   d0, p3, z0.d
            uaddv   d1, p3, z1.d
            uzp1    v0.2d, v0.2d, v1.2d
            xtn     v0.2s, v0.2d
            ret
This patch therefore defines the *aarch64_trunc_concat<mode> insn,
which truncates the result of the concatenation rather than
concatenating the truncated operands (as *aarch64_narrow_trunc<mode>
does), so that the following optimised assembly is emitted:
    foo:
            ptrue   p3.b, all
            uaddv   d0, p3, z0.d
            uaddv   d1, p3, z1.d
            uzp1    v0.2s, v0.2s, v1.2s
            ret
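
The two sequences are equivalent on little-endian because UZP1 keeps
the even-indexed lanes of its concatenated inputs, and the even-indexed
32-bit lanes of a 64-bit-element vector are exactly the low halves that
XTN would extract. A lane-level model of that reasoning (illustrative C,
not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Before: uzp1 v0.2d, v0.2d, v1.2d ; xtn v0.2s, v0.2d
       Narrow each 64-bit lane of {a, b} to its low 32 bits.  */
    void uzp1_then_xtn (uint32_t d[2], uint64_t a, uint64_t b)
    {
      d[0] = (uint32_t) a;
      d[1] = (uint32_t) b;
    }

    /* After: uzp1 v0.2s, v0.2s, v1.2s
       Keep the even-indexed 32-bit lanes of the concatenation, which
       on little-endian are the low halves of the 64-bit lanes.  */
    void uzp1_narrow (uint32_t d[2], uint64_t a, uint64_t b)
    {
      uint32_t a32[2], b32[2];
      memcpy (a32, &a, sizeof a);   /* a viewed as two 32-bit lanes  */
      memcpy (b32, &b, sizeof b);
      d[0] = a32[0];                /* even lane of operand 1  */
      d[1] = b32[0];                /* even lane of operand 2  */
    }

This sketch models the little-endian case; the operand swap in the new
insn accounts for GCC's reversed lane numbering on big-endian.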
This patch passes all regression tests on aarch64 with no new failures.
A supporting test for this optimisation is also included and passes.

OK for master? I do not have commit rights, so I cannot push the patch
myself.
gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_trunc_concat<mode>):
	New insn definition.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/truncated_concatenation_1.c: New test
	for the above example and the other modes covered by the insn
	definition.
Diffstat (limited to 'gcc')

 gcc/config/aarch64/aarch64-simd.md                               | 16
 gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c | 32
 2 files changed, 48 insertions(+), 0 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f43f0f6..eeb626f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1874,6 +1874,22 @@
   [(set_attr "type" "neon_permute<q>")]
 )
 
+(define_insn "*aarch64_trunc_concat<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+	(truncate:<VNARROWQ>
+	  (vec_concat:VQN
+	    (match_operand:<VHALF> 1 "register_operand" "w")
+	    (match_operand:<VHALF> 2 "register_operand" "w"))))]
+  "TARGET_SIMD"
+{
+  if (!BYTES_BIG_ENDIAN)
+    return "uzp1\\t%0.<Vntype>, %1.<Vntype>, %2.<Vntype>";
+  else
+    return "uzp1\\t%0.<Vntype>, %2.<Vntype>, %1.<Vntype>";
+}
+  [(set_attr "type" "neon_permute<q>")]
+)
+
 ;; Packing doubles.
 
 (define_expand "vec_pack_trunc_<mode>"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
new file mode 100644
index 0000000..95577a1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -Wall -march=armv8.2-a+sve" } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+int8x8_t f1 (int16x4_t a, int16x4_t b) {
+    int8x8_t ab = vdup_n_s8 (0);
+    int16x8_t ab_concat = vcombine_s16 (a, b);
+    ab = vmovn_s16 (ab_concat);
+    return ab;
+}
+
+int16x4_t f2 (int32x2_t a, int32x2_t b) {
+    int16x4_t ab = vdup_n_s16 (0);
+    int32x4_t ab_concat = vcombine_s32 (a, b);
+    ab = vmovn_s32 (ab_concat);
+    return ab;
+}
+
+int32x2_t f3 (svint64_t a, svint64_t b) {
+    int32x2_t ab = vdup_n_s32 (0);
+    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), a), ab, 0);
+    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), b), ab, 1);
+    return ab;
+}
+
+/* { dg-final { scan-assembler-not {\txtn\t} } }*/
+/* { dg-final { scan-assembler-not {\tfcvtn\t} } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.8b, v[0-9]+\.8b, v[0-9]+\.8b} 1 } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.4h, v[0-9]+\.4h, v[0-9]+\.4h} 1 } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.2s, v[0-9]+\.2s, v[0-9]+\.2s} 1 } }*/
\ No newline at end of file
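
With a compiler containing this patch, the effect can also be checked
by hand on the new test file. A hypothetical invocation is shown below
(the cross-compiler name depends on your setup, and register numbers in
the output may differ); per the scan-assembler directives above, each
of the three UZP1 arrangements should appear once and no XTN at all:

    $ aarch64-none-linux-gnu-gcc -O3 -march=armv8.2-a+sve -S -o - \
          truncated_concatenation_1.c | grep -E 'uzp1|xtn'
            uzp1    v0.8b, v0.8b, v1.8b
            uzp1    v0.4h, v0.4h, v1.4h
            uzp1    v0.2s, v0.2s, v1.2s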