aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorTamar Christina <tamar.christina@arm.com>2021-07-26 10:23:21 +0100
committerTamar Christina <tamar.christina@arm.com>2021-07-26 10:23:21 +0100
commit1ab2270036dc0f2a13442ce682267bc7433ffb34 (patch)
tree4ad12bccc165ec641db1d0c97d99193450fec9ad /gcc
parent2050ac1a547eebe7de4af98b57429a934e75fff4 (diff)
downloadgcc-1ab2270036dc0f2a13442ce682267bc7433ffb34.zip
gcc-1ab2270036dc0f2a13442ce682267bc7433ffb34.tar.gz
gcc-1ab2270036dc0f2a13442ce682267bc7433ffb34.tar.bz2
AArch64: correct dot-product RTL patterns for aarch64.
The previous fix for this problem was wrong due to a subtle difference between where NEON expects the RMW values and where intrinsics expects them. The insn pattern is modeled after the intrinsics and so needs an expand for the vectorizer optab to switch the RTL. However operand[3] is not expected to be written to so the current pattern is bogus. Instead I rewrite the RTL to be in canonical ordering and merge them. gcc/ChangeLog: * config/aarch64/aarch64-simd-builtins.def (sdot, udot): Rename to.. (sdot_prod, udot_prod): ... This. * config/aarch64/aarch64-simd.md (aarch64_<sur>dot<vsi2qi>): Merged into... (<sur>dot_prod<vsi2qi>): ... this. (aarch64_<sur>dot_lane<vsi2qi>, aarch64_<sur>dot_laneq<vsi2qi>): Change operands order. (<sur>sadv16qi): Use new operands order. * config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32, vdotq_s32): Use new RTL ordering.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/aarch64/aarch64-simd-builtins.def4
-rw-r--r--gcc/config/aarch64/aarch64-simd.md63
-rw-r--r--gcc/config/aarch64/arm_neon.h8
3 files changed, 31 insertions, 44 deletions
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 3bb45a8..402453a 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -375,8 +375,8 @@
BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE)
/* Implemented by <sur><dotprod>_prod<dot_mode>. */
- BUILTIN_VB (TERNOP, sdot, 0, NONE)
- BUILTIN_VB (TERNOPU, udot, 0, NONE)
+ BUILTIN_VB (TERNOP, sdot_prod, 10, NONE)
+ BUILTIN_VB (TERNOPU, udot_prod, 10, NONE)
BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE)
/* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */
BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bf667b9..13c8698 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -587,19 +587,8 @@
DONE;
})
-;; These instructions map to the __builtins for the Dot Product operations.
-(define_insn "aarch64_<sur>dot<vsi2qi>"
- [(set (match_operand:VS 0 "register_operand" "=w")
- (plus:VS (match_operand:VS 1 "register_operand" "0")
- (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
- (match_operand:<VSI2QI> 3 "register_operand" "w")]
- DOTPROD)))]
- "TARGET_DOTPROD"
- "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
- [(set_attr "type" "neon_dot<q>")]
-)
-
-;; These expands map to the Dot Product optab the vectorizer checks for.
+;; These expands map to the Dot Product optab the vectorizer checks for
+;; and to the intrinsics patttern.
;; The auto-vectorizer expects a dot product builtin that also does an
;; accumulation into the provided register.
;; Given the following pattern
@@ -619,20 +608,17 @@
;; ...
;;
;; and so the vectorizer provides r, in which the result has to be accumulated.
-(define_expand "<sur>dot_prod<vsi2qi>"
- [(set (match_operand:VS 0 "register_operand")
- (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
- (match_operand:<VSI2QI> 2 "register_operand")]
- DOTPROD)
- (match_operand:VS 3 "register_operand")))]
+(define_insn "<sur>dot_prod<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS
+ (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
+ (match_operand:<VSI2QI> 2 "register_operand" "w")]
+ DOTPROD)
+ (match_operand:VS 3 "register_operand" "0")))]
"TARGET_DOTPROD"
-{
- emit_insn (
- gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1],
- operands[2]));
- emit_insn (gen_rtx_SET (operands[0], operands[3]));
- DONE;
-})
+ "<sur>dot\\t%0.<Vtype>, %1.<Vdottype>, %2.<Vdottype>"
+ [(set_attr "type" "neon_dot<q>")]
+)
;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
;; (vector) Dot Product operation and the vectorized optab.
@@ -652,11 +638,12 @@
;; indexed operations.
(define_insn "aarch64_<sur>dot_lane<vsi2qi>"
[(set (match_operand:VS 0 "register_operand" "=w")
- (plus:VS (match_operand:VS 1 "register_operand" "0")
- (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
- (match_operand:V8QI 3 "register_operand" "<h_con>")
- (match_operand:SI 4 "immediate_operand" "i")]
- DOTPROD)))]
+ (plus:VS
+ (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:V8QI 3 "register_operand" "<h_con>")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD)
+ (match_operand:VS 1 "register_operand" "0")))]
"TARGET_DOTPROD"
{
operands[4] = aarch64_endian_lane_rtx (V8QImode, INTVAL (operands[4]));
@@ -667,11 +654,12 @@
(define_insn "aarch64_<sur>dot_laneq<vsi2qi>"
[(set (match_operand:VS 0 "register_operand" "=w")
- (plus:VS (match_operand:VS 1 "register_operand" "0")
- (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
- (match_operand:V16QI 3 "register_operand" "<h_con>")
- (match_operand:SI 4 "immediate_operand" "i")]
- DOTPROD)))]
+ (plus:VS
+ (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:V16QI 3 "register_operand" "<h_con>")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD)
+ (match_operand:VS 1 "register_operand" "0")))]
"TARGET_DOTPROD"
{
operands[4] = aarch64_endian_lane_rtx (V16QImode, INTVAL (operands[4]));
@@ -944,8 +932,7 @@
rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
rtx abd = gen_reg_rtx (V16QImode);
emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2]));
- emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
- abd, ones));
+ emit_insn (gen_udot_prodv16qi (operands[0], abd, ones, operands[3]));
DONE;
}
rtx reduc = gen_reg_rtx (V8HImode);
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 0f43994..313b35f 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -31472,28 +31472,28 @@ __extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b)
{
- return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b);
+ return __builtin_aarch64_udot_prodv8qi_uuuu (__a, __b, __r);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
{
- return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b);
+ return __builtin_aarch64_udot_prodv16qi_uuuu (__a, __b, __r);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b)
{
- return __builtin_aarch64_sdotv8qi (__r, __a, __b);
+ return __builtin_aarch64_sdot_prodv8qi (__a, __b, __r);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
{
- return __builtin_aarch64_sdotv16qi (__r, __a, __b);
+ return __builtin_aarch64_sdot_prodv16qi (__a, __b, __r);
}
__extension__ extern __inline uint32x2_t