diff options
author | Tamar Christina <tamar.christina@arm.com> | 2021-07-14 15:19:32 +0100 |
---|---|---|
committer | Tamar Christina <tamar.christina@arm.com> | 2021-07-14 15:19:32 +0100 |
commit | 752045ed1eea0eddc48923df78999dab7f2827ba (patch) | |
tree | cb56cdafb94e5164aab1d70372c8ef7b9efd488a /gcc | |
parent | ab0a6b213abf6843b59cdea6399030e828109551 (diff) | |
download | gcc-752045ed1eea0eddc48923df78999dab7f2827ba.zip gcc-752045ed1eea0eddc48923df78999dab7f2827ba.tar.gz gcc-752045ed1eea0eddc48923df78999dab7f2827ba.tar.bz2 |
AArch64: Add support for sign differing dot-product usdot for NEON and SVE.
Hi All,
This adds optabs implementing usdot_prod.
The following testcase:
#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned
SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
SIGNEDNESS_4 char *restrict b)
{
for (__INTPTR_TYPE__ i = 0; i < N; ++i)
{
int av = a[i];
int bv = b[i];
SIGNEDNESS_2 short mult = av * bv;
res += mult;
}
return res;
}
Generates for NEON
f:
movi v0.4s, 0
mov x3, 0
.p2align 3,,7
.L2:
ldr q1, [x2, x3]
ldr q2, [x1, x3]
usdot v0.4s, v1.16b, v2.16b
add x3, x3, 16
cmp x3, 480
bne .L2
addv s0, v0.4s
fmov w1, s0
add w0, w0, w1
ret
and for SVE
f:
mov x3, 0
cntb x5
mov w4, 480
mov z1.b, #0
whilelo p0.b, wzr, w4
mov z3.b, #0
ptrue p1.b, all
.p2align 3,,7
.L2:
ld1b z2.b, p0/z, [x1, x3]
ld1b z0.b, p0/z, [x2, x3]
add x3, x3, x5
sel z0.b, p0, z0.b, z3.b
whilelo p0.b, w3, w4
usdot z1.s, z0.b, z2.b
b.any .L2
uaddv d0, p1, z1.s
fmov x1, d0
add w0, w0, w1
ret
instead of
f:
movi v0.4s, 0
mov x3, 0
.p2align 3,,7
.L2:
ldr q2, [x1, x3]
ldr q1, [x2, x3]
add x3, x3, 16
sxtl v4.8h, v2.8b
sxtl2 v3.8h, v2.16b
uxtl v2.8h, v1.8b
uxtl2 v1.8h, v1.16b
mul v2.8h, v2.8h, v4.8h
mul v1.8h, v1.8h, v3.8h
saddw v0.4s, v0.4s, v2.4h
saddw2 v0.4s, v0.4s, v2.8h
saddw v0.4s, v0.4s, v1.4h
saddw2 v0.4s, v0.4s, v1.8h
cmp x3, 480
bne .L2
addv s0, v0.4s
fmov w1, s0
add w0, w0, w1
ret
and
f:
mov x3, 0
cnth x5
mov w4, 480
mov z1.b, #0
whilelo p0.h, wzr, w4
ptrue p2.b, all
.p2align 3,,7
.L2:
ld1sb z2.h, p0/z, [x1, x3]
punpklo p1.h, p0.b
ld1b z0.h, p0/z, [x2, x3]
add x3, x3, x5
mul z0.h, p2/m, z0.h, z2.h
sunpklo z2.s, z0.h
sunpkhi z0.s, z0.h
add z1.s, p1/m, z1.s, z2.s
punpkhi p1.h, p0.b
whilelo p0.h, w3, w4
add z1.s, p1/m, z1.s, z0.s
b.any .L2
uaddv d0, p2, z1.s
fmov x1, d0
add w0, w0, w1
ret
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md (aarch64_usdot<vsi2qi>): Rename to...
(usdot_prod<vsi2qi>): ... This.
* config/aarch64/aarch64-simd-builtins.def (usdot): Rename to...
(usdot_prod): ...This.
* config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Likewise.
* config/aarch64/aarch64-sve.md (@aarch64_<sur>dot_prod<vsi2qi>):
Rename to...
(@<sur>dot_prod<vsi2qi>): ...This.
* config/aarch64/aarch64-sve-builtins-base.cc
(svusdot_impl::expand): Use it.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/vusdot-autovec.c: New test.
* gcc.target/aarch64/sve/vusdot-autovec.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/aarch64/aarch64-simd-builtins.def | 5 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-simd.md | 2 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-sve-builtins-base.cc | 2 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-sve.md | 2 | ||||
-rw-r--r-- | gcc/config/aarch64/arm_neon.h | 4 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c | 38 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c | 38 |
7 files changed, 84 insertions, 7 deletions
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index ac5d4fc..063f503 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -374,10 +374,11 @@ BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, NONE) BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) - /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>. */ + /* Implemented by <sur><dotprod>_prod<dot_mode>. */ BUILTIN_VB (TERNOP, sdot, 0, NONE) BUILTIN_VB (TERNOPU, udot, 0, NONE) - BUILTIN_VB (TERNOP_SSUS, usdot, 0, NONE) + BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE) + /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE) BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 540244c..7489098 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -601,7 +601,7 @@ ;; These instructions map to the __builtins for the armv8.6a I8MM usdot ;; (vector) Dot Product operation. -(define_insn "aarch64_usdot<vsi2qi>" +(define_insn "usdot_prod<vsi2qi>" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 8fd6d3f..02e42a7 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -2366,7 +2366,7 @@ public: Hence we do the same rotation on arguments as svdot_impl does. */ e.rotate_inputs_left (0, 3); machine_mode mode = e.vector_mode (0); - insn_code icode = code_for_aarch64_dot_prod (UNSPEC_USDOT, mode); + insn_code icode = code_for_dot_prod (UNSPEC_USDOT, mode); return e.use_exact_insn (icode); } diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 9e48c0e..359fe0e 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -6870,7 +6870,7 @@ [(set_attr "movprfx" "*,yes")] ) -(define_insn "@aarch64_<sur>dot_prod<vsi2qi>" +(define_insn "@<sur>dot_prod<vsi2qi>" [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") (plus:VNx4SI_ONLY (unspec:VNx4SI_ONLY diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 17e059e..00d76ea 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -34039,14 +34039,14 @@ __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b); + return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b); + return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b); } __extension__ extern __inline int32x2_t diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c new file mode 100644 index 0000000..b99a945 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */ + +#define N 480 +#define SIGNEDNESS_1 unsigned +#define SIGNEDNESS_2 signed +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 unsigned + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, + SIGNEDNESS_4 char *restrict b) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +SIGNEDNESS_1 int __attribute__ ((noipa)) +g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b, + SIGNEDNESS_4 char *restrict a) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c new file mode 100644 index 0000000..094dd51 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=armv8.2-a+i8mm+sve" } */ + +#define N 480 +#define SIGNEDNESS_1 unsigned +#define SIGNEDNESS_2 signed +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 unsigned + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, + SIGNEDNESS_4 char *restrict b) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +SIGNEDNESS_1 int __attribute__ ((noipa)) +g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b, + SIGNEDNESS_4 char *restrict a) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */ |