diff options
author | Saurabh Jha <saurabh.jha@arm.com> | 2024-08-07 12:34:20 +0100 |
---|---|---|
committer | Saurabh Jha <saurabh.jha@arm.com> | 2024-09-23 17:26:49 +0100 |
commit | c1fb78fb03caede01b02a1ebb3275ac98343d468 (patch) | |
tree | 92befae6a21d398909ede405d3e1da700af47443 | |
parent | bfefed6c5bb62648cf0303d377c06cb45ab1f24a (diff) | |
download | gcc-c1fb78fb03caede01b02a1ebb3275ac98343d468.zip gcc-c1fb78fb03caede01b02a1ebb3275ac98343d468.tar.gz gcc-c1fb78fb03caede01b02a1ebb3275ac98343d468.tar.bz2 |
aarch64: Add codegen support for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and
mandatory from Armv9.5-a. It introduces instructions for computing the
floating point absolute maximum and minimum of the two vectors
element-wise.
This patch adds code generation support for famax and famin in terms of
existing RTL operators.
famax/famin is equivalent to first taking abs of the operands and then
taking smax/smin on the results of abs.
famax/famin (a, b) = smax/smin (abs (a), abs (b))
This fusion of operators is only possible when -march=armv9-a+faminmax
flags are passed. We also need to pass -ffast-math flag; if we don't,
then a statement like
c[i] = __builtin_fmaxf16 (a[i], b[i]);
is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin).
This code generation is only available on -O2 or -O3 as that is when
auto-vectorization is enabled.
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md
(*aarch64_faminmax_fused): Instruction pattern for faminmax
codegen.
* config/aarch64/iterators.md: Attribute for faminmax codegen.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test.
* gcc.target/aarch64/simd/faminmax-codegen.c: New test.
* gcc.target/aarch64/simd/faminmax-no-codegen.c: New test.
-rw-r--r-- | gcc/config/aarch64/aarch64-simd.md | 9 | ||||
-rw-r--r-- | gcc/config/aarch64/iterators.md | 3 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c | 217 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c | 197 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/simd/faminmax-no-codegen.c | 267 |
5 files changed, 693 insertions, 0 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 67f0fe2..2a44aa3 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -9920,3 +9920,12 @@ "TARGET_FAMINMAX" "<faminmax_uns_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" ) + +(define_insn "*aarch64_faminmax_fused" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"))))] + "TARGET_FAMINMAX" + "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 17ac5e0..c2fcd18 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -4472,3 +4472,6 @@ (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c new file mode 100644 index 0000000..6688a78 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c @@ -0,0 +1,217 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmaxnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmaxnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** fabs v1.2s, v1.2s +** fabs v0.2s, v0.2s +** fmaxnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmaxnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmaxnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vamaxq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf64 (a[i]); + b[i] = __builtin_fabsf64 (b[i]); + c[i] = __builtin_fmaxf64 (a[i], b[i]); + } + return c; +} + +/* +** test_vamin_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fminnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamin_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fminf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fminnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vaminq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fminf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamin_f32: +** fabs v1.2s, v1.2s +** fabs v0.2s, v0.2s +** fminnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_vamin_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fminf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fminnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vaminq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fminf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fminnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vaminq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf64 (a[i]); + b[i] = __builtin_fabsf64 (b[i]); + c[i] = __builtin_fminf64 (a[i], b[i]); + } + return c; +} diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c new file mode 100644 index 0000000..d77bd90 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c @@ -0,0 +1,197 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O2 -ffast-math -march=armv9-a+faminmax" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** famax v0.4h, v1.4h, v0.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** famax v0.8h, v1.8h, v0.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** famax v0.2s, v1.2s, v0.2s +** ret +*/ +float32x2_t +test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** famax v0.4s, v1.4s, v0.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** famax v0.2d, v1.2d, v0.2d +** ret +*/ +float64x2_t +test_vamaxq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf64 (a[i]); + b[i] = __builtin_fabsf64 (b[i]); + c[i] = __builtin_fmaxf64 (a[i], b[i]); + } + return c; +} + +/* +** test_vamin_f16: +** famin v0.4h, v1.4h, v0.4h +** ret +*/ +float16x4_t +test_vamin_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fminf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f16: +** famin v0.8h, v1.8h, v0.8h +** ret +*/ +float16x8_t +test_vaminq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fminf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamin_f32: +** famin v0.2s, v1.2s, v0.2s +** ret +*/ +float32x2_t +test_vamin_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fminf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f32: +** famin v0.4s, v1.4s, v0.4s +** ret +*/ +float32x4_t +test_vaminq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fminf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f64: +** famin v0.2d, v1.2d, v0.2d +** ret +*/ +float64x2_t +test_vaminq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf64 (a[i]); + b[i] = __builtin_fabsf64 (b[i]); + c[i] = __builtin_fminf64 (a[i], b[i]); + } + return c; +} diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-no-codegen.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-no-codegen.c new file mode 100644 index 0000000..3ec0a52 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-no-codegen.c @@ -0,0 +1,267 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O2 -ffast-math -march=armv9-a+faminmax" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_abs_max_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmax v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_abs_max_f16 (float16x4_t a, float16x4_t b) +{ + return vmax_f16 (vabs_f16 (a), vabs_f16 (b)); +} + +/* +** test_abs_maxnm_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmaxnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_abs_maxnm_f16 (float16x4_t a, float16x4_t b) +{ + return vmaxnm_f16 (vabs_f16 (a), vabs_f16 (b)); +} + +/* +** test_abs_maxq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmax v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_abs_maxq_f16 (float16x8_t a, float16x8_t b) +{ + return vmaxq_f16 (vabsq_f16 (a), vabsq_f16 (b)); +} + +/* +** test_abs_maxnmq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmaxnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_abs_maxnmq_f16 (float16x8_t a, float16x8_t b) +{ + return vmaxnmq_f16 (vabsq_f16 (a), vabsq_f16 (b)); +} + +/* +** test_abs_max_f32: +** fabs v1.2s, v1.2s +** fabs v0.2s, v0.2s +** fmax v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_abs_max_f32 (float32x2_t a, float32x2_t b) +{ + return vmax_f32 (vabs_f32 (a), vabs_f32 (b)); +} + +/* +** test_abs_maxnm_f32: +** fabs v1.2s, v1.2s +** fabs v0.2s, v0.2s +** fmaxnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_abs_maxnm_f32 (float32x2_t a, float32x2_t b) +{ + return vmaxnm_f32 (vabs_f32 (a), vabs_f32 (b)); +} + +/* +** test_abs_maxq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmax v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_abs_maxq_f32 (float32x4_t a, float32x4_t b) +{ + return vmaxq_f32 (vabsq_f32 (a), vabsq_f32 (b)); +} + +/* +** test_abs_maxnmq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmaxnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_abs_maxnmq_f32 (float32x4_t a, float32x4_t b) +{ + return vmaxnmq_f32 (vabsq_f32 (a), vabsq_f32 (b)); +} + +/* +** test_abs_maxq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmax v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_abs_maxq_f64 (float64x2_t a, float64x2_t b) +{ + return vmaxq_f64 (vabsq_f64 (a), vabsq_f64 (b)); +} + +/* +** test_abs_maxnmq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmaxnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_abs_maxnmq_f64 (float64x2_t a, float64x2_t b) +{ + return vmaxnmq_f64 (vabsq_f64 (a), vabsq_f64 (b)); +} + +/* +** test_abs_min_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmin v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_abs_min_f16 (float16x4_t a, float16x4_t b) +{ + return vmin_f16 (vabs_f16 (a), vabs_f16 (b)); +} + +/* +** test_abs_minnm_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fminnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_abs_minnm_f16 (float16x4_t a, float16x4_t b) +{ + return vminnm_f16 (vabs_f16 (a), vabs_f16 (b)); +} + +/* +** test_abs_minq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmin v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_abs_minq_f16 (float16x8_t a, float16x8_t b) +{ + return vminq_f16 (vabsq_f16 (a), vabsq_f16 (b)); +} + +/* +** test_abs_minnmq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fminnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_abs_minnmq_f16 (float16x8_t a, float16x8_t b) +{ + return vminnmq_f16 (vabsq_f16 (a), vabsq_f16 (b)); +} + +/* +** test_abs_min_f32: +** fabs v1.2s, v1.2s +** fabs v0.2s, v0.2s +** fmin v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_abs_min_f32 (float32x2_t a, float32x2_t b) +{ + return vmin_f32 (vabs_f32 (a), vabs_f32 (b)); +} + +/* +** test_abs_minnm_f32: +** fabs v1.2s, v1.2s +** fabs v0.2s, v0.2s +** fminnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_abs_minnm_f32 (float32x2_t a, float32x2_t b) +{ + return vminnm_f32 (vabs_f32 (a), vabs_f32 (b)); +} + +/* +** test_abs_minq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmin v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_abs_minq_f32 (float32x4_t a, float32x4_t b) +{ + return vminq_f32 (vabsq_f32 (a), vabsq_f32 (b)); +} + +/* +** test_abs_minnmq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fminnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_abs_minnmq_f32 (float32x4_t a, float32x4_t b) +{ + return vminnmq_f32 (vabsq_f32 (a), vabsq_f32 (b)); +} + +/* +** test_abs_minq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmin v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_abs_minq_f64 (float64x2_t a, float64x2_t b) +{ + return vminq_f64 (vabsq_f64 (a), vabsq_f64 (b)); +} + +/* +** test_abs_minnmq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fminnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_abs_minnmq_f64 (float64x2_t a, float64x2_t b) +{ + return vminnmq_f64 (vabsq_f64 (a), vabsq_f64 (b)); +} |