diff options
author | Richard Sandiford <richard.sandiford@arm.com> | 2022-07-18 12:57:10 +0100 |
---|---|---|
committer | Richard Sandiford <richard.sandiford@arm.com> | 2022-07-18 12:57:10 +0100 |
commit | 7313381d2ce44b72b4c9f70bd5670e5d78d1f631 (patch) | |
tree | 9df6d1d1217e63a819687b13d479466f92fde366 | |
parent | 9c8349ee1a35dac61b84bbae115ee6a1eeb6ddbd (diff) | |
download | gcc-7313381d2ce44b72b4c9f70bd5670e5d78d1f631.zip gcc-7313381d2ce44b72b4c9f70bd5670e5d78d1f631.tar.gz gcc-7313381d2ce44b72b4c9f70bd5670e5d78d1f631.tar.bz2 |
arm: Replace arm_builtin_vectorized_function [PR106253]
This patch extends the fix for PR106253 to AArch32. As with AArch64,
we were using ACLE intrinsics to vectorise scalar built-ins, even
though the two sometimes have different ECF_* flags. (That in turn
is because the ACLE intrinsics should follow the instruction semantics
as closely as possible, whereas the scalar built-ins follow language
specs.)
The patch also removes the copysignf built-in, which only existed
for this purpose and wasn't a “real” arm_neon.h built-in.
Doing this also has the side-effect of enabling vectorisation of
rint and roundeven. Logically that should be a separate patch,
but making it one would have meant adding a new int iterator
for the original set of instructions and then removing it again
when including new functions.
I've restricted the bswap tests to little-endian because we end
up with excessive spilling on big-endian. E.g.:
sub sp, sp, #8
vstr d1, [sp]
vldr d16, [sp]
vrev16.8 d16, d16
vstr d16, [sp]
vldr d0, [sp]
add sp, sp, #8
@ sp needed
bx lr
Similarly, the copysign tests require little-endian because on
big-endian we unnecessarily load the constant from the constant pool:
vldr.32 s15, .L3
vdup.32 d0, d7[1]
vbsl d0, d2, d1
bx lr
.L3:
.word -2147483648
gcc/
PR target/106253
* config/arm/arm-builtins.cc (arm_builtin_vectorized_function):
Delete.
* config/arm/arm-protos.h (arm_builtin_vectorized_function): Delete.
* config/arm/arm.cc (TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION):
Delete.
* config/arm/arm_neon_builtins.def (copysignf): Delete.
* config/arm/iterators.md (nvrint_pattern): New attribute.
* config/arm/neon.md (<NEON_VRINT:nvrint_pattern><VCVTF:mode>2):
New pattern.
(l<NEON_VCVT:nvrint_pattern><su_optab><VCVTF:mode><v_cmp_result>2):
Likewise.
(neon_copysignf<mode>): Rename to...
(copysign<mode>3): ...this.
gcc/testsuite/
PR target/106253
* gcc.target/arm/vect_unary_1.c: New test.
* gcc.target/arm/vect_binary_1.c: Likewise.
-rw-r--r-- | gcc/config/arm/arm-builtins.cc | 123 | ||||
-rw-r--r-- | gcc/config/arm/arm-protos.h | 1 | ||||
-rw-r--r-- | gcc/config/arm/arm.cc | 4 | ||||
-rw-r--r-- | gcc/config/arm/arm_neon_builtins.def | 1 | ||||
-rw-r--r-- | gcc/config/arm/iterators.md | 7 | ||||
-rw-r--r-- | gcc/config/arm/neon.md | 17 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/arm/vect_binary_1.c | 50 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/arm/vect_unary_1.c | 224 |
8 files changed, 297 insertions, 130 deletions
diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index d917137..8f8155c 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -4026,129 +4026,6 @@ arm_expand_builtin (tree exp, return NULL_RTX; } -tree -arm_builtin_vectorized_function (unsigned int fn, tree type_out, tree type_in) -{ - machine_mode in_mode, out_mode; - int in_n, out_n; - bool out_unsigned_p = TYPE_UNSIGNED (type_out); - - /* Can't provide any vectorized builtins when we can't use NEON. */ - if (!TARGET_NEON) - return NULL_TREE; - - if (TREE_CODE (type_out) != VECTOR_TYPE - || TREE_CODE (type_in) != VECTOR_TYPE) - return NULL_TREE; - - out_mode = TYPE_MODE (TREE_TYPE (type_out)); - out_n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - -/* ARM_CHECK_BUILTIN_MODE and ARM_FIND_VRINT_VARIANT are used to find the - decl of the vectorized builtin for the appropriate vector mode. - NULL_TREE is returned if no such builtin is available. */ -#undef ARM_CHECK_BUILTIN_MODE -#define ARM_CHECK_BUILTIN_MODE(C) \ - (TARGET_VFP5 \ - && flag_unsafe_math_optimizations \ - && ARM_CHECK_BUILTIN_MODE_1 (C)) - -#undef ARM_CHECK_BUILTIN_MODE_1 -#define ARM_CHECK_BUILTIN_MODE_1(C) \ - (out_mode == SFmode && out_n == C \ - && in_mode == SFmode && in_n == C) - -#undef ARM_FIND_VRINT_VARIANT -#define ARM_FIND_VRINT_VARIANT(N) \ - (ARM_CHECK_BUILTIN_MODE (2) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v2sf, false) \ - : (ARM_CHECK_BUILTIN_MODE (4) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v4sf, false) \ - : NULL_TREE)) - - switch (fn) - { - CASE_CFN_FLOOR: - return ARM_FIND_VRINT_VARIANT (vrintm); - CASE_CFN_CEIL: - return ARM_FIND_VRINT_VARIANT (vrintp); - CASE_CFN_TRUNC: - return ARM_FIND_VRINT_VARIANT (vrintz); - CASE_CFN_ROUND: - return ARM_FIND_VRINT_VARIANT (vrinta); -#undef ARM_CHECK_BUILTIN_MODE_1 -#define ARM_CHECK_BUILTIN_MODE_1(C) \ - (out_mode == SImode && out_n == C \ - && in_mode == SFmode && in_n == C) - -#define ARM_FIND_VCVT_VARIANT(N) \ - (ARM_CHECK_BUILTIN_MODE (2) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v2sfv2si, false) \ - : (ARM_CHECK_BUILTIN_MODE (4) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v4sfv4si, false) \ - : NULL_TREE)) - -#define ARM_FIND_VCVTU_VARIANT(N) \ - (ARM_CHECK_BUILTIN_MODE (2) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##uv2sfv2si, false) \ - : (ARM_CHECK_BUILTIN_MODE (4) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##uv4sfv4si, false) \ - : NULL_TREE)) - CASE_CFN_LROUND: - return (out_unsigned_p - ? ARM_FIND_VCVTU_VARIANT (vcvta) - : ARM_FIND_VCVT_VARIANT (vcvta)); - CASE_CFN_LCEIL: - return (out_unsigned_p - ? ARM_FIND_VCVTU_VARIANT (vcvtp) - : ARM_FIND_VCVT_VARIANT (vcvtp)); - CASE_CFN_LFLOOR: - return (out_unsigned_p - ? ARM_FIND_VCVTU_VARIANT (vcvtm) - : ARM_FIND_VCVT_VARIANT (vcvtm)); -#undef ARM_CHECK_BUILTIN_MODE -#define ARM_CHECK_BUILTIN_MODE(C, N) \ - (out_mode == N##mode && out_n == C \ - && in_mode == N##mode && in_n == C) - case CFN_BUILT_IN_BSWAP16: - if (ARM_CHECK_BUILTIN_MODE (4, HI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4hi, false); - else if (ARM_CHECK_BUILTIN_MODE (8, HI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv8hi, false); - else - return NULL_TREE; - case CFN_BUILT_IN_BSWAP32: - if (ARM_CHECK_BUILTIN_MODE (2, SI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2si, false); - else if (ARM_CHECK_BUILTIN_MODE (4, SI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4si, false); - else - return NULL_TREE; - case CFN_BUILT_IN_BSWAP64: - if (ARM_CHECK_BUILTIN_MODE (2, DI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2di, false); - else - return NULL_TREE; - CASE_CFN_COPYSIGN: - if (ARM_CHECK_BUILTIN_MODE (2, SF)) - return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv2sf, false); - else if (ARM_CHECK_BUILTIN_MODE (4, SF)) - return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv4sf, false); - else - return NULL_TREE; - - default: - return NULL_TREE; - } - return NULL_TREE; -} -#undef ARM_FIND_VCVT_VARIANT -#undef ARM_FIND_VCVTU_VARIANT -#undef ARM_CHECK_BUILTIN_MODE -#undef ARM_FIND_VRINT_VARIANT - void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) { diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 9d14209..f8aabbd 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -103,7 +103,6 @@ extern void neon_pairwise_reduce (rtx, rtx, machine_mode, rtx (*) (rtx, rtx, rtx)); extern rtx mve_bool_vec_to_const (rtx const_vec); extern rtx neon_make_constant (rtx, bool generate = true); -extern tree arm_builtin_vectorized_function (unsigned int, tree, tree); extern void neon_expand_vector_init (rtx, rtx); extern void neon_lane_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT, const_tree); extern void arm_const_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 33fb98d..eca99c9 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -739,10 +739,6 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_VECTORIZE_BUILTINS #define TARGET_VECTORIZE_BUILTINS -#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION -#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ - arm_builtin_vectorized_function - #undef TARGET_VECTOR_ALIGNMENT #define TARGET_VECTOR_ALIGNMENT arm_vector_alignment diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 445b2bf..2e642cc 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -264,7 +264,6 @@ VAR1 (UNOP, vcvtv4hf, v4sf) VAR10 (TERNOP, vbsl, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) VAR2 (TERNOP, vbsl, v8hf, v4hf) -VAR2 (UNOP, copysignf, v2sf, v4sf) VAR2 (UNOP, vrintn, v2sf, v4sf) VAR2 (UNOP, vrinta, v2sf, v4sf) VAR2 (UNOP, vrintp, v2sf, v4sf) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 37cf797..29062cd 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -1150,6 +1150,13 @@ (UNSPEC_VRINTA "unconditional") (UNSPEC_VRINTM "unconditional") (UNSPEC_VRINTR "nocond") (UNSPEC_VRINTX "nocond")]) +(define_int_attr nvrint_pattern [(UNSPEC_NVRINTZ "btrunc") + (UNSPEC_NVRINTP "ceil") + (UNSPEC_NVRINTA "round") + (UNSPEC_NVRINTM "floor") + (UNSPEC_NVRINTX "rint") + (UNSPEC_NVRINTN "roundeven")]) + (define_int_attr nvrint_variant [(UNSPEC_NVRINTZ "z") (UNSPEC_NVRINTP "p") (UNSPEC_NVRINTA "a") (UNSPEC_NVRINTM "m") (UNSPEC_NVRINTX "x") (UNSPEC_NVRINTN "n")]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 275bcc1..e1dae28 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -635,6 +635,13 @@ [(set_attr "type" "neon_fp_mla_s<q>")] ) +(define_expand "<NEON_VRINT:nvrint_pattern><VCVTF:mode>2" + [(set (match_operand:VCVTF 0 "s_register_operand") + (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand")] + NEON_VRINT))] + "TARGET_NEON && TARGET_VFP5 && flag_unsafe_math_optimizations" +) + (define_insn "neon_vrint<NEON_VRINT:nvrint_variant><VCVTF:mode>" [(set (match_operand:VCVTF 0 "s_register_operand" "=w") (unspec:VCVTF [(match_operand:VCVTF 1 @@ -645,6 +652,14 @@ [(set_attr "type" "neon_fp_round_<V_elem_ch><q>")] ) +(define_expand "l<NEON_VCVT:nvrint_pattern><su_optab><VCVTF:mode><v_cmp_result>2" + [(set (match_operand:<V_cmp_result> 0 "register_operand") + (FIXUORS:<V_cmp_result> + (unspec:VCVTF [(match_operand:VCVTF 1 "register_operand")] + NEON_VCVT)))] + "TARGET_NEON && TARGET_VFP5 && flag_unsafe_math_optimizations" +) + (define_insn "neon_vcvt<NEON_VCVT:nvrint_variant><su_optab><VCVTF:mode><v_cmp_result>" [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w") (FIXUORS:<V_cmp_result> (unspec:VCVTF @@ -3059,7 +3074,7 @@ "TARGET_I8MM" ) -(define_expand "neon_copysignf<mode>" +(define_expand "copysign<mode>3" [(match_operand:VCVTF 0 "register_operand") (match_operand:VCVTF 1 "register_operand") (match_operand:VCVTF 2 "register_operand")] diff --git a/gcc/testsuite/gcc.target/arm/vect_binary_1.c b/gcc/testsuite/gcc.target/arm/vect_binary_1.c new file mode 100644 index 0000000..c1fc905 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/vect_binary_1.c @@ -0,0 +1,50 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_hard_ok } */ +/* { dg-require-effective-target arm_v8_neon_ok } */ +/* { dg-add-options arm_v8_neon } */ +/* { dg-additional-options "-O3 -mfloat-abi=hard" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include <stdint.h> + +#define TEST2(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 2))) \ +test2_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(IN) * 2))) y, \ + IN __attribute__((vector_size(sizeof(IN) * 2))) z) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 2))) x; \ + x[0] = __builtin_##NAME (y[0], z[0]); \ + x[1] = __builtin_##NAME (y[1], z[1]); \ + return x; \ +} + +#define TEST4(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 4))) \ +test4_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(OUT) * 4))) y, \ + IN __attribute__((vector_size(sizeof(OUT) * 4))) z) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 4))) x; \ + x[0] = __builtin_##NAME (y[0], z[0]); \ + x[1] = __builtin_##NAME (y[1], z[1]); \ + x[2] = __builtin_##NAME (y[2], z[2]); \ + x[3] = __builtin_##NAME (y[3], z[3]); \ + return x; \ +} + +/* +** test2_float_copysignf_float: { target arm_little_endian } +** vmov.i32 d0, #(0x80000000|2147483648)(\s+.*) +** vbsl d0, d2, d1 +** bx lr +*/ +TEST2 (float, copysignf, float) + +/* +** test4_float_copysignf_float: { target arm_little_endian } +** vmov.i32 q0, #(0x80000000|2147483648)(\s+.*) +** vbsl q0, q2, q1 +** bx lr +*/ +TEST4 (float, copysignf, float) diff --git a/gcc/testsuite/gcc.target/arm/vect_unary_1.c b/gcc/testsuite/gcc.target/arm/vect_unary_1.c new file mode 100644 index 0000000..4677180 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/vect_unary_1.c @@ -0,0 +1,224 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_hard_ok } */ +/* { dg-require-effective-target arm_v8_neon_ok } */ +/* { dg-add-options arm_v8_neon } */ +/* { dg-additional-options "-Ofast -mfloat-abi=hard" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include <stdint.h> + +#define TEST2(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 2))) \ +test2_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(IN) * 2))) y) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 2))) x; \ + x[0] = __builtin_##NAME (y[0]); \ + x[1] = __builtin_##NAME (y[1]); \ + return x; \ +} + +#define TEST4(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 4))) \ +test4_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(OUT) * 4))) y) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 4))) x; \ + x[0] = __builtin_##NAME (y[0]); \ + x[1] = __builtin_##NAME (y[1]); \ + x[2] = __builtin_##NAME (y[2]); \ + x[3] = __builtin_##NAME (y[3]); \ + return x; \ +} + +#define TEST8(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 8))) \ +test8_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(OUT) * 8))) y) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 8))) x; \ + x[0] = __builtin_##NAME (y[0]); \ + x[1] = __builtin_##NAME (y[1]); \ + x[2] = __builtin_##NAME (y[2]); \ + x[3] = __builtin_##NAME (y[3]); \ + x[4] = __builtin_##NAME (y[4]); \ + x[5] = __builtin_##NAME (y[5]); \ + x[6] = __builtin_##NAME (y[6]); \ + x[7] = __builtin_##NAME (y[7]); \ + return x; \ +} + +/* +** test2_float_truncf_float: +** vrintz.f32 d0, d1 +** bx lr +*/ +TEST2 (float, truncf, float) + +/* +** test4_float_truncf_float: +** vrintz.f32 q0, q1 +** bx lr +*/ +TEST4 (float, truncf, float) + +/* +** test2_float_roundf_float: +** vrinta.f32 d0, d1 +** bx lr +*/ +TEST2 (float, roundf, float) + +/* +** test4_float_roundf_float: +** vrinta.f32 q0, q1 +** bx lr +*/ +TEST4 (float, roundf, float) + +/* +** test2_float_floorf_float: +** vrintm.f32 d0, d1 +** bx lr +*/ +TEST2 (float, floorf, float) + +/* +** test4_float_floorf_float: +** vrintm.f32 q0, q1 +** bx lr +*/ +TEST4 (float, floorf, float) + +/* +** test2_float_ceilf_float: +** vrintp.f32 d0, d1 +** bx lr +*/ +TEST2 (float, ceilf, float) + +/* +** test4_float_ceilf_float: +** vrintp.f32 q0, q1 +** bx lr +*/ +TEST4 (float, ceilf, float) + +/* +** test2_float_rintf_float: +** vrintx.f32 d0, d1 +** bx lr +*/ +TEST2 (float, rintf, float) + +/* +** test4_float_rintf_float: +** vrintx.f32 q0, q1 +** bx lr +*/ +TEST4 (float, rintf, float) + +/* +** test2_float_roundevenf_float: +** vrintn.f32 d0, d1 +** bx lr +*/ +TEST2 (float, roundevenf, float) + +/* +** test4_float_roundevenf_float: +** vrintn.f32 q0, q1 +** bx lr +*/ +TEST4 (float, roundevenf, float) + +/* +** test2_int_roundf_float: +** vcvta.s32.f32 d0, d1 +** bx lr +*/ +TEST2 (int, roundf, float) + +/* +** test4_int_roundf_float: +** vcvta.s32.f32 q0, q1 +** bx lr +*/ +TEST4 (int, roundf, float) + +/* +** test2_int_floorf_float: +** vcvtm.s32.f32 d0, d1 +** bx lr +*/ +TEST2 (int, floorf, float) + +/* +** test4_int_floorf_float: +** vcvtm.s32.f32 q0, q1 +** bx lr +*/ +TEST4 (int, floorf, float) + +/* +** test2_int_ceilf_float: +** vcvtp.s32.f32 d0, d1 +** bx lr +*/ +TEST2 (int, ceilf, float) + +/* +** test4_int_ceilf_float: +** vcvtp.s32.f32 q0, q1 +** bx lr +*/ +TEST4 (int, ceilf, float) + +/* +** test2_int_clz_int: +** vclz.i32 d0, d1 +** bx lr +*/ +TEST2 (int, clz, int) + +/* +** test4_int_clz_int: +** vclz.i32 q0, q1 +** bx lr +*/ +TEST4 (int, clz, int) + +/* +** test4_int16_t_bswap16_int16_t: { target arm_little_endian } +** vrev16.8 d0, d1 +** bx lr +*/ +TEST4 (int16_t, bswap16, int16_t) + +/* +** test8_int16_t_bswap16_int16_t: { target arm_little_endian } +** vrev16.8 q0, q1 +** bx lr +*/ +TEST8 (int16_t, bswap16, int16_t) + +/* +** test2_int_bswap32_int: { target arm_little_endian } +** vrev32.8 d0, d1 +** bx lr +*/ +TEST2 (int, bswap32, int) + +/* +** test4_int_bswap32_int: { target arm_little_endian } +** vrev32.8 q0, q1 +** bx lr +*/ +TEST4 (int, bswap32, int) + +/* +** test2_int64_t_bswap64_int64_t: { target arm_little_endian } +** vrev64.8 q0, q1 +** bx lr +*/ +TEST2 (int64_t, bswap64, int64_t) |