diff options
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/aarch64/aarch64-simd-builtins.def | 4 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-simd.md | 63 | ||||
-rw-r--r-- | gcc/config/aarch64/arm_neon.h | 8 |
3 files changed, 31 insertions, 44 deletions
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 3bb45a8..402453a 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -375,8 +375,8 @@ BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) /* Implemented by <sur><dotprod>_prod<dot_mode>. */ - BUILTIN_VB (TERNOP, sdot, 0, NONE) - BUILTIN_VB (TERNOPU, udot, 0, NONE) + BUILTIN_VB (TERNOP, sdot_prod, 10, NONE) + BUILTIN_VB (TERNOPU, udot_prod, 10, NONE) BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE) /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index bf667b9..13c8698 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -587,19 +587,8 @@ DONE; }) -;; These instructions map to the __builtins for the Dot Product operations. -(define_insn "aarch64_<sur>dot<vsi2qi>" - [(set (match_operand:VS 0 "register_operand" "=w") - (plus:VS (match_operand:VS 1 "register_operand" "0") - (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") - (match_operand:<VSI2QI> 3 "register_operand" "w")] - DOTPROD)))] - "TARGET_DOTPROD" - "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>" - [(set_attr "type" "neon_dot<q>")] -) - -;; These expands map to the Dot Product optab the vectorizer checks for. +;; These expands map to the Dot Product optab the vectorizer checks for +;; and to the intrinsics patttern. ;; The auto-vectorizer expects a dot product builtin that also does an ;; accumulation into the provided register. ;; Given the following pattern @@ -619,20 +608,17 @@ ;; ... ;; ;; and so the vectorizer provides r, in which the result has to be accumulated. -(define_expand "<sur>dot_prod<vsi2qi>" - [(set (match_operand:VS 0 "register_operand") - (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand") - (match_operand:<VSI2QI> 2 "register_operand")] - DOTPROD) - (match_operand:VS 3 "register_operand")))] +(define_insn "<sur>dot_prod<vsi2qi>" + [(set (match_operand:VS 0 "register_operand" "=w") + (plus:VS + (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w") + (match_operand:<VSI2QI> 2 "register_operand" "w")] + DOTPROD) + (match_operand:VS 3 "register_operand" "0")))] "TARGET_DOTPROD" -{ - emit_insn ( - gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1], - operands[2])); - emit_insn (gen_rtx_SET (operands[0], operands[3])); - DONE; -}) + "<sur>dot\\t%0.<Vtype>, %1.<Vdottype>, %2.<Vdottype>" + [(set_attr "type" "neon_dot<q>")] +) ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot ;; (vector) Dot Product operation and the vectorized optab. @@ -652,11 +638,12 @@ ;; indexed operations. (define_insn "aarch64_<sur>dot_lane<vsi2qi>" [(set (match_operand:VS 0 "register_operand" "=w") - (plus:VS (match_operand:VS 1 "register_operand" "0") - (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") - (match_operand:V8QI 3 "register_operand" "<h_con>") - (match_operand:SI 4 "immediate_operand" "i")] - DOTPROD)))] + (plus:VS + (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") + (match_operand:V8QI 3 "register_operand" "<h_con>") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VS 1 "register_operand" "0")))] "TARGET_DOTPROD" { operands[4] = aarch64_endian_lane_rtx (V8QImode, INTVAL (operands[4])); @@ -667,11 +654,12 @@ (define_insn "aarch64_<sur>dot_laneq<vsi2qi>" [(set (match_operand:VS 0 "register_operand" "=w") - (plus:VS (match_operand:VS 1 "register_operand" "0") - (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") - (match_operand:V16QI 3 "register_operand" "<h_con>") - (match_operand:SI 4 "immediate_operand" "i")] - DOTPROD)))] + (plus:VS + (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "<h_con>") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VS 1 "register_operand" "0")))] "TARGET_DOTPROD" { operands[4] = aarch64_endian_lane_rtx (V16QImode, INTVAL (operands[4])); @@ -944,8 +932,7 @@ rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); rtx abd = gen_reg_rtx (V16QImode); emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2])); - emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], - abd, ones)); + emit_insn (gen_udot_prodv16qi (operands[0], abd, ones, operands[3])); DONE; } rtx reduc = gen_reg_rtx (V8HImode); diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 0f43994..313b35f 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -31472,28 +31472,28 @@ __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv8qi_uuuu (__a, __b, __r); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv16qi_uuuu (__a, __b, __r); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sdotv8qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv8qi (__a, __b, __r); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_sdotv16qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv16qi (__a, __b, __r); } __extension__ extern __inline uint32x2_t |