From 0f41b5e02fa47db2080b77e4e1f7cd3305457c05 Mon Sep 17 00:00:00 2001 From: Dennis Zhang Date: Thu, 22 Oct 2020 01:09:33 +0100 Subject: arm: Auto-vectorization for MVE: vmul This patch enables MVE vmul instructions for auto-vectorization. It includes MVE in expander mul<mode>3 to enable vectorization for MVE. Related MVE vmul insns are modified to support the expander by using expression 'mult' instead of unspec. The mul<mode>3 for vectorization in vec-common.md uses mode iterator VDQWH instead of VALLW to cover all supported modes. The macros ARM_HAVE_NEON_<MODE>_ARITH are used to select supported modes for different targets. The redundant mul<mode>3 in neon.md is removed. gcc/ChangeLog: 2020-10-22 Dennis Zhang * config/arm/mve.md (mve_vmulq<mode>): New entry for vmul instruction using expression 'mult'. (mve_vmulq_f<mode>): Use mult instead of VMULQ_F. * config/arm/neon.md (mul<mode>3): Removed. * config/arm/vec-common.md (mul<mode>3): Use the new mode macros ARM_HAVE_<MODE>_ARITH. Use mode iterator VDQWH instead of VALLW. gcc/testsuite/ChangeLog: * gcc.target/arm/simd/mve-vmul_1.c: New test. --- gcc/ChangeLog | 9 ++++ gcc/config/arm/mve.md | 16 +++++-- gcc/config/arm/neon.md | 11 ----- gcc/config/arm/vec-common.md | 13 ++---- gcc/testsuite/ChangeLog | 4 ++ gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c | 64 ++++++++++++++++++++++++++ 6 files changed, 95 insertions(+), 22 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c (limited to 'gcc') diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a44bc09..d6a326c 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2020-10-22 Dennis Zhang + + * config/arm/mve.md (mve_vmulq<mode>): New entry for vmul instruction + using expression 'mult'. + (mve_vmulq_f<mode>): Use mult instead of VMULQ_F. + * config/arm/neon.md (mul<mode>3): Removed. + * config/arm/vec-common.md (mul<mode>3): Use the new mode macros + ARM_HAVE_<MODE>_ARITH. Use mode iterator VDQWH instead of VALLW. 
+ 2020-10-20 Andrew MacLeod PR tree-optimization/97505 diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 5dad388..764e201 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -1551,6 +1551,17 @@ [(set_attr "type" "mve_move") ]) +(define_insn "mve_vmulq<mode>" + [ + (set (match_operand:MVE_2 0 "s_register_operand" "=w") + (mult:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w") + (match_operand:MVE_2 2 "s_register_operand" "w"))) + ] + "TARGET_HAVE_MVE" + "vmul.i%#<V_sz_elem>\t%q0, %q1, %q2" + [(set_attr "type" "mve_move") +]) + ;; ;; [vornq_u, vornq_s]) ;; @@ -2562,9 +2573,8 @@ (define_insn "mve_vmulq_f<mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VMULQ_F)) + (mult:MVE_0 (match_operand:MVE_0 1 "s_register_operand" "w") + (match_operand:MVE_0 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" "vmul.f%#<V_sz_elem> %q0, %q1, %q2" diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 85e424e..e459b9a 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -1732,17 +1732,6 @@ (const_string "neon_mul_<V_elem_ch><q>")))] ) -(define_insn "mul<mode>3" - [(set - (match_operand:VH 0 "s_register_operand" "=w") - (mult:VH - (match_operand:VH 1 "s_register_operand" "w") - (match_operand:VH 2 "s_register_operand" "w")))] - "ARM_HAVE_NEON_<MODE>_ARITH" - "vmul.f16\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set_attr "type" "neon_mul_<VH_elem_ch><q>")] -) - (define_insn "neon_vmulf<mode>" [(set (match_operand:VH 0 "s_register_operand" "=w") diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index c3c86c4..45db60e 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -101,14 +101,11 @@ }) (define_expand "mul<mode>3" - [(set (match_operand:VALLW 0 "s_register_operand") - (mult:VALLW (match_operand:VALLW 1 "s_register_operand") - (match_operand:VALLW 2 "s_register_operand")))] - "(TARGET_NEON && ((<MODE>mode != V2SFmode && <MODE>mode != 
V4SFmode) - || flag_unsafe_math_optimizations)) - || (<MODE>mode == V4HImode && TARGET_REALLY_IWMMXT)" -{ -}) + [(set (match_operand:VDQWH 0 "s_register_operand") + (mult:VDQWH (match_operand:VDQWH 1 "s_register_operand") + (match_operand:VDQWH 2 "s_register_operand")))] + "ARM_HAVE_<MODE>_ARITH" +) (define_expand "smin<mode>3" [(set (match_operand:VALLW 0 "s_register_operand") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 649f2bd..eb55a64 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2020-10-22 Dennis Zhang + + * gcc.target/arm/simd/mve-vmul_1.c: New test. + 2020-10-20 Jeff Law * gcc.dg/Wbuiltin-declaration-mismatch-9.c: Improve pruning of diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c b/gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c new file mode 100644 index 0000000..514f292 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c @@ -0,0 +1,64 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ +/* { dg-add-options arm_v8_1m_mve_fp } */ +/* { dg-additional-options "-O3" } */ + +#include <stdint.h> + +void test_vmul_i32 (int32_t * dest, int32_t * a, int32_t * b) { + int i; + for (i=0; i<4; i++) { + dest[i] = a[i] * b[i]; + } +} + +void test_vmul_i32_u (uint32_t * dest, uint32_t * a, uint32_t * b) { + int i; + for (i=0; i<4; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.i32\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ + +void test_vmul_i16 (int16_t * dest, int16_t * a, int16_t * b) { + int i; + for (i=0; i<8; i++) { + dest[i] = a[i] * b[i]; + } +} + +void test_vmul_i16_u (uint16_t * dest, uint16_t * a, uint16_t * b) { + int i; + for (i=0; i<8; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.i16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ + +void test_vmul_i8 (int8_t * dest, int8_t * a, int8_t * b) { + int i; + for (i=0; i<16; i++) { + dest[i] = a[i] * b[i]; + } +} + +void test_vmul_i8_u (uint8_t * dest, uint8_t * 
a, uint8_t * b) { + int i; + for (i=0; i<16; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.i8\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ + +void test_vmul_f32 (float * dest, float * a, float * b) { + int i; + for (i=0; i<4; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.f32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + -- cgit v1.1