aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorTamar Christina <tamar.christina@arm.com>2021-07-14 15:23:23 +0100
committerTamar Christina <tamar.christina@arm.com>2021-07-14 15:41:31 +0100
commit6d1cdb27828d2ef1ae1ab0209836646a269b9610 (patch)
tree75555790d0fa7ad5c88cd7f87ca5ed40fa39570a /gcc
parentc9165e2d58bb037793c1c93e1b5633a61f88db30 (diff)
downloadgcc-6d1cdb27828d2ef1ae1ab0209836646a269b9610.zip
gcc-6d1cdb27828d2ef1ae1ab0209836646a269b9610.tar.gz
gcc-6d1cdb27828d2ef1ae1ab0209836646a269b9610.tar.bz2
AArch64: Correct dot-product auto-vect optab RTL
The current RTL for the vectorizer patterns for dot-product are incorrect. Operand3 isn't an output parameter so we can't write to it. This fixes this issue and reduces the number of RTL. gcc/ChangeLog: * config/aarch64/aarch64-simd-builtins.def (udot, sdot): Rename to... (sdot_prod, udot_prod): ...These. * config/aarch64/aarch64-simd.md (<sur>dot_prod<vsi2qi>): Remove. (aarch64_<sur>dot<vsi2qi>): Rename to... (<sur>dot_prod<vsi2qi>): ...This. * config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32, vdotq_s32): Update builtins.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/aarch64/aarch64-simd-builtins.def4
-rw-r--r--gcc/config/aarch64/aarch64-simd.md62
-rw-r--r--gcc/config/aarch64/arm_neon.h8
3 files changed, 29 insertions, 45 deletions
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 063f503..99e7348 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -375,8 +375,8 @@
BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE)
/* Implemented by <sur><dotprod>_prod<dot_mode>. */
- BUILTIN_VB (TERNOP, sdot, 0, NONE)
- BUILTIN_VB (TERNOPU, udot, 0, NONE)
+ BUILTIN_VB (TERNOP, sdot_prod, 10, NONE)
+ BUILTIN_VB (TERNOPU, udot_prod, 10, NONE)
BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE)
/* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */
BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7489098..88fa5ba 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -587,8 +587,28 @@
DONE;
})
-;; These instructions map to the __builtins for the Dot Product operations.
-(define_insn "aarch64_<sur>dot<vsi2qi>"
+;; These expands map to the Dot Product optab the vectorizer checks for
+;; and to the intrinsics patttern.
+;; The auto-vectorizer expects a dot product builtin that also does an
+;; accumulation into the provided register.
+;; Given the following pattern
+;;
+;; for (i=0; i<len; i++) {
+;; c = a[i] * b[i];
+;; r += c;
+;; }
+;; return result;
+;;
+;; This can be auto-vectorized to
+;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
+;;
+;; given enough iterations. However the vectorizer can keep unrolling the loop
+;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
+;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
+;; ...
+;;
+;; and so the vectorizer provides r, in which the result has to be accumulated.
+(define_insn "<sur>dot_prod<vsi2qi>"
[(set (match_operand:VS 0 "register_operand" "=w")
(plus:VS (match_operand:VS 1 "register_operand" "0")
(unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
@@ -613,41 +633,6 @@
[(set_attr "type" "neon_dot<q>")]
)
-;; These expands map to the Dot Product optab the vectorizer checks for.
-;; The auto-vectorizer expects a dot product builtin that also does an
-;; accumulation into the provided register.
-;; Given the following pattern
-;;
-;; for (i=0; i<len; i++) {
-;; c = a[i] * b[i];
-;; r += c;
-;; }
-;; return result;
-;;
-;; This can be auto-vectorized to
-;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
-;;
-;; given enough iterations. However the vectorizer can keep unrolling the loop
-;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
-;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
-;; ...
-;;
-;; and so the vectorizer provides r, in which the result has to be accumulated.
-(define_expand "<sur>dot_prod<vsi2qi>"
- [(set (match_operand:VS 0 "register_operand")
- (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
- (match_operand:<VSI2QI> 2 "register_operand")]
- DOTPROD)
- (match_operand:VS 3 "register_operand")))]
- "TARGET_DOTPROD"
-{
- emit_insn (
- gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1],
- operands[2]));
- emit_insn (gen_rtx_SET (operands[0], operands[3]));
- DONE;
-})
-
;; These instructions map to the __builtins for the Dot Product
;; indexed operations.
(define_insn "aarch64_<sur>dot_lane<vsi2qi>"
@@ -944,8 +929,7 @@
rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
rtx abd = gen_reg_rtx (V16QImode);
emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2]));
- emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
- abd, ones));
+ emit_insn (gen_udot_prodv16qi (operands[0], operands[3], abd, ones));
DONE;
}
rtx reduc = gen_reg_rtx (V8HImode);
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 00d76ea..597f44c 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -31767,28 +31767,28 @@ __extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b)
{
- return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b);
+ return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
{
- return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b);
+ return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b)
{
- return __builtin_aarch64_sdotv8qi (__r, __a, __b);
+ return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
{
- return __builtin_aarch64_sdotv16qi (__r, __a, __b);
+ return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b);
}
__extension__ extern __inline uint32x2_t