author     Pengxuan Zheng <quic_pzheng@quicinc.com>   2024-05-13 10:47:10 -0700
committer  Andrew Pinski <quic_apinski@quicinc.com>   2024-05-20 13:04:40 -0700
commit     a2e4fe5a53cf75cd055f64e745ebd51253e42254 (patch)
tree       fef737645f54dcc42f131a15e2bbf156dff25f83 /gcc/config
parent     e14c673ea9ab2eca5de4db91b478f0b5297ef321 (diff)
aarch64: Fold vget_low_* intrinsics to BIT_FIELD_REF [PR102171]
This patch folds vget_low_* intrinsics to BIT_FIELD_REF to open up more
optimization opportunities for GIMPLE optimizers.
While we are here, we also remove the vget_low_* definitions from arm_neon.h
and define these intrinsics through the new intrinsics framework instead.
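To illustrate the effect of the fold (a minimal sketch; the function name is
made up for illustration and does not come from the patch or its testsuite):

    #include <arm_neon.h>

    /* With the fold in place, the GIMPLE dump for this function contains a
       BIT_FIELD_REF <__a, 64, 0> (bit position 64 on big-endian) instead of
       an opaque __builtin_aarch64_get_lowv8hi call, so target-independent
       GIMPLE passes can reason about the half extraction.  */
    int16x4_t
    low_half (int16x8_t __a)
    {
      return vget_low_s16 (__a);
    }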
PR target/102171
gcc/ChangeLog:
* config/aarch64/aarch64-builtins.cc (AARCH64_SIMD_VGET_LOW_BUILTINS):
New macro to create definitions for all vget_low intrinsics.
(VGET_LOW_BUILTIN): Likewise.
(enum aarch64_builtins): Add vget_low function codes.
(aarch64_general_fold_builtin): Fold vget_low calls.
* config/aarch64/aarch64-simd-builtins.def: Delete vget_low builtins.
* config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Delete.
(aarch64_vget_lo_halfv8bf): Likewise.
* config/aarch64/arm_neon.h (__attribute__): Delete.
(vget_low_f16): Likewise.
(vget_low_f32): Likewise.
(vget_low_f64): Likewise.
(vget_low_p8): Likewise.
(vget_low_p16): Likewise.
(vget_low_p64): Likewise.
(vget_low_s8): Likewise.
(vget_low_s16): Likewise.
(vget_low_s32): Likewise.
(vget_low_s64): Likewise.
(vget_low_u8): Likewise.
(vget_low_u16): Likewise.
(vget_low_u32): Likewise.
(vget_low_u64): Likewise.
(vget_low_bf16): Likewise.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/pr113573.c: Replace __builtin_aarch64_get_lowv8hi
with vget_low_s16.
* gcc.target/aarch64/vget_low_2.c: New test.
* gcc.target/aarch64/vget_low_2_be.c: New test.
Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
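For context, a test along these lines would exercise the fold (a hypothetical
sketch; the actual contents of vget_low_2.c and vget_low_2_be.c are not
reproduced here):

    /* { dg-do compile } */
    /* { dg-options "-O2" } */

    #include <arm_neon.h>

    /* The low 64 bits of a Q register are its D register, so once the
       extraction is visible to GIMPLE, a plain low-half read should need
       no data-movement instruction at all.  */
    int16x4_t
    foo (int16x8_t a)
    {
      return vget_low_s16 (a);
    }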
Diffstat (limited to 'gcc/config')
 gcc/config/aarch64/aarch64-builtins.cc       |  60
 gcc/config/aarch64/aarch64-simd-builtins.def |   5
 gcc/config/aarch64/aarch64-simd.md           |  23
 gcc/config/aarch64/arm_neon.h                | 105
4 files changed, 62 insertions, 131 deletions
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 75d21de..11b8880 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -658,6 +658,23 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
   VREINTERPRET_BUILTINS \
   VREINTERPRETQ_BUILTINS
 
+#define AARCH64_SIMD_VGET_LOW_BUILTINS \
+  VGET_LOW_BUILTIN(f16) \
+  VGET_LOW_BUILTIN(f32) \
+  VGET_LOW_BUILTIN(f64) \
+  VGET_LOW_BUILTIN(p8) \
+  VGET_LOW_BUILTIN(p16) \
+  VGET_LOW_BUILTIN(p64) \
+  VGET_LOW_BUILTIN(s8) \
+  VGET_LOW_BUILTIN(s16) \
+  VGET_LOW_BUILTIN(s32) \
+  VGET_LOW_BUILTIN(s64) \
+  VGET_LOW_BUILTIN(u8) \
+  VGET_LOW_BUILTIN(u16) \
+  VGET_LOW_BUILTIN(u32) \
+  VGET_LOW_BUILTIN(u64) \
+  VGET_LOW_BUILTIN(bf16)
+
 typedef struct
 {
   const char *name;
@@ -697,6 +714,9 @@ typedef struct
 #define VREINTERPRET_BUILTIN(A, B, L) \
   AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B,
 
+#define VGET_LOW_BUILTIN(A) \
+  AARCH64_SIMD_BUILTIN_VGET_LOW_##A,
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   AARCH64_SIMD_BUILTIN_##T##_##N##A,
@@ -732,6 +752,7 @@ enum aarch64_builtins
   AARCH64_CRC32_BUILTIN_MAX,
   /* SIMD intrinsic builtins.  */
   AARCH64_SIMD_VREINTERPRET_BUILTINS
+  AARCH64_SIMD_VGET_LOW_BUILTINS
   /* ARMv8.3-A Pointer Authentication Builtins.  */
   AARCH64_PAUTH_BUILTIN_AUTIA1716,
   AARCH64_PAUTH_BUILTIN_PACIA1716,
@@ -823,8 +844,37 @@ static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] = {
    && SIMD_INTR_QUAL(A) == SIMD_INTR_QUAL(B) \
   },
 
+#undef VGET_LOW_BUILTIN
+#define VGET_LOW_BUILTIN(A) \
+  {"vget_low_" #A, \
+   AARCH64_SIMD_BUILTIN_VGET_LOW_##A, \
+   2, \
+   { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \
+   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \
+   FLAG_AUTO_FP, \
+   false \
+  },
+
+#define AARCH64_SIMD_VGET_LOW_BUILTINS \
+  VGET_LOW_BUILTIN(f16) \
+  VGET_LOW_BUILTIN(f32) \
+  VGET_LOW_BUILTIN(f64) \
+  VGET_LOW_BUILTIN(p8) \
+  VGET_LOW_BUILTIN(p16) \
+  VGET_LOW_BUILTIN(p64) \
+  VGET_LOW_BUILTIN(s8) \
+  VGET_LOW_BUILTIN(s16) \
+  VGET_LOW_BUILTIN(s32) \
+  VGET_LOW_BUILTIN(s64) \
+  VGET_LOW_BUILTIN(u8) \
+  VGET_LOW_BUILTIN(u16) \
+  VGET_LOW_BUILTIN(u32) \
+  VGET_LOW_BUILTIN(u64) \
+  VGET_LOW_BUILTIN(bf16)
+
 static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = {
   AARCH64_SIMD_VREINTERPRET_BUILTINS
+  AARCH64_SIMD_VGET_LOW_BUILTINS
 };
@@ -3216,6 +3266,9 @@ aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2)
 #define VREINTERPRET_BUILTIN(A, B, L) \
   case AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B:
 
+#undef VGET_LOW_BUILTIN
+#define VGET_LOW_BUILTIN(A) \
+  case AARCH64_SIMD_BUILTIN_VGET_LOW_##A:
 
 /* Try to fold a call to the built-in function with subcode FCODE.  The
    function is passed the N_ARGS arguments in ARGS and it returns a value
@@ -3235,6 +3288,13 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
       return fold_build1 (FLOAT_EXPR, type, args[0]);
     AARCH64_SIMD_VREINTERPRET_BUILTINS
       return fold_build1 (VIEW_CONVERT_EXPR, type, args[0]);
+    AARCH64_SIMD_VGET_LOW_BUILTINS
+      {
+        auto pos = BYTES_BIG_ENDIAN ? 64 : 0;
+
+        return fold_build3 (BIT_FIELD_REF, type, args[0], bitsize_int (64),
+                            bitsize_int (pos));
+      }
     case AARCH64_SIMD_BUILTIN_LANE_CHECK:
       gcc_assert (n_args == 3);
       if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index da16f60..a9f0558 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -65,8 +65,6 @@
   BUILTIN_VS (UNOP, ctz, 2, NONE)
   BUILTIN_VB (UNOP, popcount, 2, NONE)
 
-  /* Implemented by aarch64_get_low<mode>.  */
-  BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
   /* Implemented by aarch64_get_high<mode>.  */
   BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP)
 
@@ -960,8 +958,7 @@
   VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, FP, v4sf)
   VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, FP, v4sf)
 
-  /* Implemented by aarch64_vget_lo/hi_halfv8bf.  */
-  VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf)
+  /* Implemented by aarch64_vget_hi_halfv8bf.  */
   VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf)
 
   /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 16b7445d..875ea52 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -288,17 +288,6 @@
   }
 )
 
-(define_expand "aarch64_get_low<mode>"
-  [(match_operand:<VHALF> 0 "register_operand")
-   (match_operand:VQMOV 1 "register_operand")]
-  "TARGET_FLOAT"
-  {
-    rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
-    emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
-    DONE;
-  }
-)
-
 (define_expand "aarch64_get_high<mode>"
   [(match_operand:<VHALF> 0 "register_operand")
    (match_operand:VQMOV 1 "register_operand")]
@@ -9774,17 +9763,7 @@
   [(set_attr "type" "neon_dot<VDQSF:q>")]
 )
 
-;; vget_low/high_bf16
-(define_expand "aarch64_vget_lo_halfv8bf"
-  [(match_operand:V4BF 0 "register_operand")
-   (match_operand:V8BF 1 "register_operand")]
-  "TARGET_BF16_SIMD"
-{
-  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
-  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
-  DONE;
-})
-
+;; vget_high_bf16
 (define_expand "aarch64_vget_hi_halfv8bf"
   [(match_operand:V4BF 0 "register_operand")
    (match_operand:V8BF 1 "register_operand")]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 0ee325d..92c2c53 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -3029,104 +3029,6 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
 
 __extension__ extern __inline float16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_f16 (float16x8_t __a)
-{
-  return __builtin_aarch64_get_lowv8hf (__a);
-}
-
-__extension__ extern __inline float32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_f32 (float32x4_t __a)
-{
-  return __builtin_aarch64_get_lowv4sf (__a);
-}
-
-__extension__ extern __inline float64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_f64 (float64x2_t __a)
-{
-  return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)};
-}
-
-__extension__ extern __inline poly8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_p8 (poly8x16_t __a)
-{
-  return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
-}
-
-__extension__ extern __inline poly16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_p16 (poly16x8_t __a)
-{
-  return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
-}
-
-__extension__ extern __inline poly64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_p64 (poly64x2_t __a)
-{
-  return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a);
-}
-
-__extension__ extern __inline int8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s8 (int8x16_t __a)
-{
-  return __builtin_aarch64_get_lowv16qi (__a);
-}
-
-__extension__ extern __inline int16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s16 (int16x8_t __a)
-{
-  return __builtin_aarch64_get_lowv8hi (__a);
-}
-
-__extension__ extern __inline int32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s32 (int32x4_t __a)
-{
-  return __builtin_aarch64_get_lowv4si (__a);
-}
-
-__extension__ extern __inline int64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s64 (int64x2_t __a)
-{
-  return (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)};
-}
-
-__extension__ extern __inline uint8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u8 (uint8x16_t __a)
-{
-  return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
-}
-
-__extension__ extern __inline uint16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u16 (uint16x8_t __a)
-{
-  return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
-}
-
-__extension__ extern __inline uint32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u32 (uint32x4_t __a)
-{
-  return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a);
-}
-
-__extension__ extern __inline uint64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u64 (uint64x2_t __a)
-{
-  return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)};
-}
-
-__extension__ extern __inline float16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_f16 (float16x8_t __a)
 {
   return __builtin_aarch64_get_highv8hf (__a);
@@ -28481,13 +28383,6 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
 
 __extension__ extern __inline bfloat16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_bf16 (bfloat16x8_t __a)
-{
-  return __builtin_aarch64_vget_lo_halfv8bf (__a);
-}
-
-__extension__ extern __inline bfloat16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_bf16 (bfloat16x8_t __a)
 {
   return __builtin_aarch64_vget_hi_halfv8bf (__a);
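A note on the endianness handling in aarch64_general_fold_builtin above: the
position operand of BIT_FIELD_REF counts bits in memory order, so the 64-bit
low half of a 128-bit vector starts at bit 0 on little-endian targets but at
bit 64 on big-endian targets, which is what the BYTES_BIG_ENDIAN ? 64 : 0
expression selects. Illustrative trees only (not copied from a dump):

    /* int16x4_t r = vget_low_s16 (x) folds to:
         little-endian:  r = BIT_FIELD_REF <x, 64, 0>;
         big-endian:     r = BIT_FIELD_REF <x, 64, 64>;
       The width operand is always 64 bits (one D register); only the start
       position depends on the target's byte order.  */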