diff options
author | Tamar Christina <tamar.christina@arm.com> | 2021-07-15 13:16:00 +0100 |
---|---|---|
committer | Tamar Christina <tamar.christina@arm.com> | 2021-07-15 13:16:00 +0100 |
commit | 5402023f05e8fc28c2f1cfd7107264403b118a17 (patch) | |
tree | b9399f9a44003c0f02c1390c284186990b1114fa /gcc | |
parent | f6dde32b9d487dd6e343d0a1e1d1f60783f5e735 (diff) | |
download | gcc-5402023f05e8fc28c2f1cfd7107264403b118a17.zip gcc-5402023f05e8fc28c2f1cfd7107264403b118a17.tar.gz gcc-5402023f05e8fc28c2f1cfd7107264403b118a17.tar.bz2 |
Revert "AArch64: Correct dot-product auto-vect optab RTL"
This reverts commit 6d1cdb27828d2ef1ae1ab0209836646a269b9610.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/aarch64/aarch64-simd-builtins.def | 4 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-simd.md | 62 | ||||
-rw-r--r-- | gcc/config/aarch64/arm_neon.h | 8 |
3 files changed, 45 insertions, 29 deletions
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 99e7348..063f503 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -375,8 +375,8 @@ BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) /* Implemented by <sur><dotprod>_prod<dot_mode>. */ - BUILTIN_VB (TERNOP, sdot_prod, 10, NONE) - BUILTIN_VB (TERNOPU, udot_prod, 10, NONE) + BUILTIN_VB (TERNOP, sdot, 0, NONE) + BUILTIN_VB (TERNOPU, udot, 0, NONE) BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE) /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 88fa5ba..7489098 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -587,28 +587,8 @@ DONE; }) -;; These expands map to the Dot Product optab the vectorizer checks for -;; and to the intrinsics patttern. -;; The auto-vectorizer expects a dot product builtin that also does an -;; accumulation into the provided register. -;; Given the following pattern -;; -;; for (i=0; i<len; i++) { -;; c = a[i] * b[i]; -;; r += c; -;; } -;; return result; -;; -;; This can be auto-vectorized to -;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; -;; -;; given enough iterations. However the vectorizer can keep unrolling the loop -;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; -;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; -;; ... -;; -;; and so the vectorizer provides r, in which the result has to be accumulated. -(define_insn "<sur>dot_prod<vsi2qi>" +;; These instructions map to the __builtins for the Dot Product operations. +(define_insn "aarch64_<sur>dot<vsi2qi>" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (match_operand:VS 1 "register_operand" "0") (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") @@ -633,6 +613,41 @@ [(set_attr "type" "neon_dot<q>")] ) +;; These expands map to the Dot Product optab the vectorizer checks for. +;; The auto-vectorizer expects a dot product builtin that also does an +;; accumulation into the provided register. +;; Given the following pattern +;; +;; for (i=0; i<len; i++) { +;; c = a[i] * b[i]; +;; r += c; +;; } +;; return result; +;; +;; This can be auto-vectorized to +;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; +;; +;; given enough iterations. However the vectorizer can keep unrolling the loop +;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; +;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; +;; ... +;; +;; and so the vectorizer provides r, in which the result has to be accumulated. +(define_expand "<sur>dot_prod<vsi2qi>" + [(set (match_operand:VS 0 "register_operand") + (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand") + (match_operand:<VSI2QI> 2 "register_operand")] + DOTPROD) + (match_operand:VS 3 "register_operand")))] + "TARGET_DOTPROD" +{ + emit_insn ( + gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1], + operands[2])); + emit_insn (gen_rtx_SET (operands[0], operands[3])); + DONE; +}) + ;; These instructions map to the __builtins for the Dot Product ;; indexed operations. (define_insn "aarch64_<sur>dot_lane<vsi2qi>" @@ -929,7 +944,8 @@ rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); rtx abd = gen_reg_rtx (V16QImode); emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2])); - emit_insn (gen_udot_prodv16qi (operands[0], operands[3], abd, ones)); + emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], + abd, ones)); DONE; } rtx reduc = gen_reg_rtx (V8HImode); diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 597f44c..00d76ea 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -31767,28 +31767,28 @@ __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b); + return __builtin_aarch64_sdotv8qi (__r, __a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b); + return __builtin_aarch64_sdotv16qi (__r, __a, __b); } __extension__ extern __inline uint32x2_t |