diff options
author | Christophe Lyon <christophe.lyon@arm.com> | 2022-03-18 08:30:00 +0000 |
---|---|---|
committer | Christophe Lyon <christophe.lyon@arm.com> | 2022-03-25 17:26:33 +0000 |
commit | 3ab5c8cd03d92bf4ec41e351820349d92fbc40c4 (patch) | |
tree | d7fb4667893cc9150e2bea0d960af8e83a732069 /gcc | |
parent | 25725506b85f478076770942d76799c54310c696 (diff) | |
download | gcc-3ab5c8cd03d92bf4ec41e351820349d92fbc40c4.zip gcc-3ab5c8cd03d92bf4ec41e351820349d92fbc40c4.tar.gz gcc-3ab5c8cd03d92bf4ec41e351820349d92fbc40c4.tar.bz2 |
arm: Revert Auto-vectorization for MVE: add pack/unpack patterns PR target/104882
This reverts commit r12-1434-g046a3beb1673bf to fix PR target/104882.
As discussed in the PR, it turns out that the MVE ISA has no natural
mapping to GCC's vec_pack_trunc / vec_unpack standard patterns, unlike
Neon or SVE for instance.
This patch also adds the executable testcase provided in the PR.
This test passes at -O3 because the generated code does not need
to use the pack/unpack patterns; hence the test uses -O2, which has
triggered vectorization for a few months now.
2022-03-18 Christophe Lyon <christophe.lyon@arm.com>
PR target/104882
Revert
2021-06-11 Christophe Lyon <christophe.lyon@linaro.org>
gcc/
* config/arm/mve.md (mve_vec_unpack<US>_lo_<mode>): Delete.
(mve_vec_unpack<US>_hi_<mode>): Delete.
(@mve_vec_pack_trunc_lo_<mode>): Delete.
(mve_vmovntq_<supf><mode>): Remove '@' prefix.
* config/arm/neon.md (vec_unpack<US>_hi_<mode>): Move back
from vec-common.md.
(vec_unpack<US>_lo_<mode>): Likewise.
(vec_pack_trunc_<mode>): Rename from
neon_quad_vec_pack_trunc_<mode>.
* config/arm/vec-common.md (vec_unpack<US>_hi_<mode>): Delete.
(vec_unpack<US>_lo_<mode>): Delete.
(vec_pack_trunc_<mode>): Delete.
PR target/104882
gcc/testsuite/
* gcc.target/arm/simd/mve-vclz.c: Update expected results.
* gcc.target/arm/simd/mve-vshl.c: Likewise.
* gcc.target/arm/simd/mve-vec-pack.c: Delete.
* gcc.target/arm/simd/mve-vec-unpack.c: Delete.
* gcc.target/arm/simd/pr104882.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/arm/mve.md | 35 | ||||
-rw-r--r-- | gcc/config/arm/neon.md | 39 | ||||
-rw-r--r-- | gcc/config/arm/vec-common.md | 71 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/arm/simd/mve-vclz.c | 7 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c | 26 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c | 29 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 5 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/arm/simd/pr104882.c | 16 |
8 files changed, 59 insertions, 169 deletions
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 908bedc..369d7a7 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -535,26 +535,6 @@ [(set_attr "type" "mve_move") ]) -(define_insn "mve_vec_unpack<US>_lo_<mode>" - [(set (match_operand:<V_unpack> 0 "register_operand" "=w") - (SE:<V_unpack> (vec_select:<V_HALF> - (match_operand:MVE_3 1 "register_operand" "w") - (match_operand:MVE_3 2 "vect_par_constant_low" ""))))] - "TARGET_HAVE_MVE" - "vmovlb.<US>%#<V_sz_elem> %q0, %q1" - [(set_attr "type" "mve_move")] -) - -(define_insn "mve_vec_unpack<US>_hi_<mode>" - [(set (match_operand:<V_unpack> 0 "register_operand" "=w") - (SE:<V_unpack> (vec_select:<V_HALF> - (match_operand:MVE_3 1 "register_operand" "w") - (match_operand:MVE_3 2 "vect_par_constant_high" ""))))] - "TARGET_HAVE_MVE" - "vmovlt.<US>%#<V_sz_elem> %q0, %q1" - [(set_attr "type" "mve_move")] -) - ;; ;; [vcvtpq_s, vcvtpq_u]) ;; @@ -2219,23 +2199,10 @@ [(set_attr "type" "mve_move") ]) -;; vmovnb pattern used by the vec_pack_trunc expander to avoid the -;; need for an uninitialized input operand. 
-(define_insn "@mve_vec_pack_trunc_lo_<mode>" - [ - (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w") - (unspec:<V_narrow_pack> [(match_operand:MVE_5 1 "s_register_operand" "w")] - VMOVNBQ_S)) - ] - "TARGET_HAVE_MVE" - "vmovnb.i%#<V_sz_elem> %q0, %q1" - [(set_attr "type" "mve_move") -]) - ;; ;; [vmovntq_s, vmovntq_u]) ;; -(define_insn "@mve_vmovntq_<supf><mode>" +(define_insn "mve_vmovntq_<supf><mode>" [ (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w") (unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 "s_register_operand" "0") diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index f270ded..275bcc1 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -6005,6 +6005,43 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_shift_imm_long")] ) +(define_expand "vec_unpack<US>_hi_<mode>" + [(match_operand:<V_unpack> 0 "register_operand") + (SE:<V_unpack> (match_operand:VU 1 "register_operand"))] + "TARGET_NEON && !BYTES_BIG_ENDIAN" + { + rtvec v = rtvec_alloc (<V_mode_nunits>/2) ; + rtx t1; + int i; + for (i = 0; i < (<V_mode_nunits>/2); i++) + RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i); + + t1 = gen_rtx_PARALLEL (<MODE>mode, v); + emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0], + operands[1], + t1)); + DONE; + } +) + +(define_expand "vec_unpack<US>_lo_<mode>" + [(match_operand:<V_unpack> 0 "register_operand") + (SE:<V_unpack> (match_operand:VU 1 "register_operand"))] + "TARGET_NEON && !BYTES_BIG_ENDIAN" + { + rtvec v = rtvec_alloc (<V_mode_nunits>/2) ; + rtx t1; + int i; + for (i = 0; i < (<V_mode_nunits>/2) ; i++) + RTVEC_ELT (v, i) = GEN_INT (i); + t1 = gen_rtx_PARALLEL (<MODE>mode, v); + emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0], + operands[1], + t1)); + DONE; + } +) + (define_insn "neon_vec_<US>mult_lo_<mode>" [(set (match_operand:<V_unpack> 0 "register_operand" "=w") (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF> @@ -6220,7 +6257,7 @@ if (BYTES_BIG_ENDIAN) ; 
because the ordering of vector elements in Q registers is different from what ; the semantics of the instructions require. -(define_insn "neon_quad_vec_pack_trunc_<mode>" +(define_insn "vec_pack_trunc_<mode>" [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w") (vec_concat:<V_narrow_pack> (truncate:<V_narrow> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index f130090..fd878cba 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -580,77 +580,6 @@ "ARM_HAVE_<MODE>_ARITH && !TARGET_REALLY_IWMMXT" ) - -;; vmovl[tb] are not available for V4SI on MVE -(define_expand "vec_unpack<US>_hi_<mode>" - [(set (match_operand:<V_unpack> 0 "register_operand") - (SE:<V_unpack> (vec_select:<V_HALF> - (match_operand:VU 1 "register_operand") - (match_dup 2))))] - "ARM_HAVE_<MODE>_ARITH - && !TARGET_REALLY_IWMMXT - && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE) - && !BYTES_BIG_ENDIAN" - { - rtvec v = rtvec_alloc (<V_mode_nunits>/2); - int i; - for (i = 0; i < (<V_mode_nunits>/2); i++) - RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i); - - operands[2] = gen_rtx_PARALLEL (<MODE>mode, v); - } -) - -;; vmovl[tb] are not available for V4SI on MVE -(define_expand "vec_unpack<US>_lo_<mode>" - [(set (match_operand:<V_unpack> 0 "register_operand") - (SE:<V_unpack> (vec_select:<V_HALF> - (match_operand:VU 1 "register_operand") - (match_dup 2))))] - "ARM_HAVE_<MODE>_ARITH - && !TARGET_REALLY_IWMMXT - && ! 
(<MODE>mode == V4SImode && TARGET_HAVE_MVE) - && !BYTES_BIG_ENDIAN" - { - rtvec v = rtvec_alloc (<V_mode_nunits>/2); - int i; - for (i = 0; i < (<V_mode_nunits>/2) ; i++) - RTVEC_ELT (v, i) = GEN_INT (i); - - operands[2] = gen_rtx_PARALLEL (<MODE>mode, v); - - } -) - -;; vmovn[tb] are not available for V2DI on MVE -(define_expand "vec_pack_trunc_<mode>" - [(set (match_operand:<V_narrow_pack> 0 "register_operand") - (vec_concat:<V_narrow_pack> - (truncate:<V_narrow> - (match_operand:VN 1 "register_operand")) - (truncate:<V_narrow> - (match_operand:VN 2 "register_operand"))))] - "ARM_HAVE_<MODE>_ARITH - && !TARGET_REALLY_IWMMXT - && ! (<MODE>mode == V2DImode && TARGET_HAVE_MVE) - && !BYTES_BIG_ENDIAN" - { - if (TARGET_NEON) - { - emit_insn (gen_neon_quad_vec_pack_trunc_<mode> (operands[0], operands[1], - operands[2])); - } - else - { - rtx tmpreg = gen_reg_rtx (<V_narrow_pack>mode); - emit_insn (gen_mve_vec_pack_trunc_lo (<MODE>mode, tmpreg, operands[1])); - emit_insn (gen_mve_vmovntq (VMOVNTQ_S, <MODE>mode, - operands[0], tmpreg, operands[2])); - } - DONE; - } -) - (define_expand "vec_init<mode><V_elem_l>" [(match_operand:VDQX 0 "s_register_operand") (match_operand 1 "" "")] diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c index 5d6e991c..7068736 100644 --- a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c @@ -21,9 +21,8 @@ FUNC(u, uint, 16, clz) FUNC(s, int, 8, clz) FUNC(u, uint, 8, clz) -/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for - instance instead of using vclz.i8, we need 4 vclz.i32, leading to a total of - 14 vclz.i32 expected in this testcase. */ -/* { dg-final { scan-assembler-times {vclz\.i32 q[0-9]+, q[0-9]+} 14 } } */ +/* 16 and 8-bit versions are not vectorized because they need pack/unpack + patterns since __builtin_clz uses 32-bit parameter and return value. 
*/ +/* { dg-final { scan-assembler-times {vclz\.i32 q[0-9]+, q[0-9]+} 2 } } */ /* { dg-final { scan-assembler-times {vclz\.i16 q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {vclz\.i8 q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c b/gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c deleted file mode 100644 index 43642b2..0000000 --- a/gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c +++ /dev/null @@ -1,26 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_v8_1m_mve_ok } */ -/* { dg-add-options arm_v8_1m_mve } */ -/* { dg-additional-options "-O3" } */ - -#include <stdint.h> - -#define FUNC(SIGN, TYPE, DSTBITS, BITS, NAME) \ - void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##DSTBITS##_t * __restrict__ dest, \ - TYPE##BITS##_t *a) { \ - int i; \ - for (i=0; i < (256 / BITS); i++) { \ - dest[i] = a[i]; \ - } \ - } - -FUNC(s, int, 16, 32, pack) -FUNC(u, uint, 16, 32, pack) -FUNC(s, int, 8, 16, pack) -FUNC(u, uint, 8, 16, pack) - -/* { dg-final { scan-assembler-times {vmovnt\.i32\tq[0-9]+, q[0-9]+} 2 } } */ -/* { dg-final { scan-assembler-times {vmovnb\.i32\tq[0-9]+, q[0-9]+} 2 } } */ -/* { dg-final { scan-assembler-times {vmovnt\.i16\tq[0-9]+, q[0-9]+} 2 } } */ -/* { dg-final { scan-assembler-times {vmovnb\.i16\tq[0-9]+, q[0-9]+} 2 } } */ -/* { dg-final { scan-assembler-not {vldr\.64\td[0-9]+, \.L} } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c b/gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c deleted file mode 100644 index cdc62f8..0000000 --- a/gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c +++ /dev/null @@ -1,29 +0,0 @@ -/* { dg-do compile } */ -/* { dg-require-effective-target arm_v8_1m_mve_ok } */ -/* { dg-add-options arm_v8_1m_mve } */ -/* { dg-additional-options "-O3" } */ - -#include <stdint.h> - -#define FUNC(SIGN, TYPE, DSTBITS, BITS, NAME) \ - void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##DSTBITS##_t * __restrict__ dest, \ 
- TYPE##BITS##_t *a) { \ - int i; \ - for (i=0; i < (128 / BITS); i++) { \ - dest[i] = a[i]; \ - } \ - } - -FUNC(s, int, 32, 16, unpack) -FUNC(u, uint, 32, 16, unpack) -FUNC(s, int, 16, 8, unpack) -FUNC(u, uint, 16, 8, unpack) - -/* { dg-final { scan-assembler-times {vmovlt\.s16 q[0-9]+, q[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {vmovlb\.s16 q[0-9]+, q[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {vmovlt\.u16 q[0-9]+, q[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {vmovlb\.u16 q[0-9]+, q[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {vmovlt\.s8 q[0-9]+, q[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {vmovlb\.s8 q[0-9]+, q[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {vmovlt\.u8 q[0-9]+, q[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {vmovlb\.u8 q[0-9]+, q[0-9]+} 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c index 91dd942..7a06449 100644 --- a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c @@ -56,10 +56,7 @@ FUNC_IMM(u, uint, 8, 16, <<, vshlimm) /* MVE has only 128-bit vectors, so we can vectorize only half of the functions above. */ /* We only emit vshl.u, which is equivalent to vshl.s anyway. */ -/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for - instance instead of using vshl.u8, we need 4 vshl.i32, leading to a total of - 14 vshl.i32 expected in this testcase. */ -/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 14 } } */ +/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */ /* We emit vshl.i when the shift amount is an immediate. 
*/ /* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/pr104882.c b/gcc/testsuite/gcc.target/arm/simd/pr104882.c new file mode 100644 index 0000000..ae9709a --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/pr104882.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-add-options arm_v8_1m_mve } */ +/* { dg-additional-options "-O2" } */ + +int i; +char src[1072]; +char dst[72]; +int main() { + for (i = 0; i < 128; i++) + src[i] = i; + __builtin_memcpy(dst, src, 7); + for (i = 0; i < 7; i++) + if (dst[i] != i) + __builtin_abort(); +} |