author	Alan Lawrence <alan.lawrence@arm.com>	2014-10-27 15:45:16 +0000
committer	Alan Lawrence <alalaw01@gcc.gnu.org>	2014-10-27 15:45:16 +0000
commit	64b0f9284445df67cc0b64004cc87793076198ae (patch)
tree	7a3e7812d3a4763929b96defa43468ce041157a3
parent	ec528bd1e12c07156c2507107a0fba9338536f8d (diff)
[AArch64] Use new reduc_[us](min|max)_scal optabs, inc. for builtins
* config/aarch64/aarch64-simd-builtins.def (reduc_smax_, reduc_smin_,
reduc_umax_, reduc_umin_, reduc_smax_nan_, reduc_smin_nan_): Remove.
(reduc_smax_scal_, reduc_smin_scal_, reduc_umax_scal_,
reduc_umin_scal_, reduc_smax_nan_scal_, reduc_smin_nan_scal_): New.
* config/aarch64/aarch64-simd.md
(reduc_<maxmin_uns>_<mode>): Rename VDQV_S variant to...
(reduc_<maxmin_uns>_internal<mode>): ...this.
(reduc_<maxmin_uns>_<mode>): New (VDQ_BHSI).
(reduc_<maxmin_uns>_scal_<mode>): New (*2).
(reduc_<maxmin_uns>_v2si): Combine with below, renaming...
(reduc_<maxmin_uns>_<mode>): Combine V2F with above, renaming...
(reduc_<maxmin_uns>_internal_<mode>): ...to this (VDQF).
* config/aarch64/arm_neon.h (vmaxv_f32, vmaxv_s8, vmaxv_s16,
vmaxv_s32, vmaxv_u8, vmaxv_u16, vmaxv_u32, vmaxvq_f32, vmaxvq_f64,
vmaxvq_s8, vmaxvq_s16, vmaxvq_s32, vmaxvq_u8, vmaxvq_u16, vmaxvq_u32,
vmaxnmv_f32, vmaxnmvq_f32, vmaxnmvq_f64, vminv_f32, vminv_s8,
vminv_s16, vminv_s32, vminv_u8, vminv_u16, vminv_u32, vminvq_f32,
vminvq_f64, vminvq_s8, vminvq_s16, vminvq_s32, vminvq_u8, vminvq_u16,
vminvq_u32, vminnmv_f32, vminnmvq_f32, vminnmvq_f64): Update to use
__builtin_aarch64_reduc_..._scal; remove vget_lane wrapper.
From-SVN: r216741
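
For illustration (not part of the patch): the user-visible intrinsics keep their
signatures; what changes is the expansion underneath them. A caller such as the
sketch below (the function name max_across_lanes is made up for the example)
previously went through a vector-producing builtin plus a vgetq_lane, and now
goes through a single scalar-producing builtin that the compiler can
gimple-fold to REDUC_MAX_EXPR; either way it should assemble to one smaxv.

    #include <arm_neon.h>
    #include <stdint.h>

    int32_t
    max_across_lanes (int32x4_t v)
    {
      /* After this patch the header body is a single call to
         __builtin_aarch64_reduc_smax_scal_v4si; before it, the result
         came back as a vector and lane 0 was extracted with
         vgetq_lane_s32.  */
      return vmaxvq_s32 (v);
    }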
 gcc/ChangeLog                                |  26 ++++++
 gcc/config/aarch64/aarch64-simd-builtins.def |  14 +-
 gcc/config/aarch64/aarch64-simd.md           |  54 +++++---
 gcc/config/aarch64/arm_neon.h                | 104 ++++-------
 4 files changed, 108 insertions(+), 90 deletions(-)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 0656a19..a938d9d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,31 @@
 2014-10-27  Alan Lawrence  <alan.lawrence@arm.com>
 
+	* config/aarch64/aarch64-simd-builtins.def (reduc_smax_, reduc_smin_,
+	reduc_umax_, reduc_umin_, reduc_smax_nan_, reduc_smin_nan_): Remove.
+	(reduc_smax_scal_, reduc_smin_scal_, reduc_umax_scal_,
+	reduc_umin_scal_, reduc_smax_nan_scal_, reduc_smin_nan_scal_): New.
+
+	* config/aarch64/aarch64-simd.md
+	(reduc_<maxmin_uns>_<mode>): Rename VDQV_S variant to...
+	(reduc_<maxmin_uns>_internal<mode>): ...this.
+	(reduc_<maxmin_uns>_<mode>): New (VDQ_BHSI).
+	(reduc_<maxmin_uns>_scal_<mode>): New (*2).
+
+	(reduc_<maxmin_uns>_v2si): Combine with below, renaming...
+	(reduc_<maxmin_uns>_<mode>): Combine V2F with above, renaming...
+	(reduc_<maxmin_uns>_internal_<mode>): ...to this (VDQF).
+
+	* config/aarch64/arm_neon.h (vmaxv_f32, vmaxv_s8, vmaxv_s16,
+	vmaxv_s32, vmaxv_u8, vmaxv_u16, vmaxv_u32, vmaxvq_f32, vmaxvq_f64,
+	vmaxvq_s8, vmaxvq_s16, vmaxvq_s32, vmaxvq_u8, vmaxvq_u16, vmaxvq_u32,
+	vmaxnmv_f32, vmaxnmvq_f32, vmaxnmvq_f64, vminv_f32, vminv_s8,
+	vminv_s16, vminv_s32, vminv_u8, vminv_u16, vminv_u32, vminvq_f32,
+	vminvq_f64, vminvq_s8, vminvq_s16, vminvq_s32, vminvq_u8, vminvq_u16,
+	vminvq_u32, vminnmv_f32, vminnmvq_f32, vminnmvq_f64): Update to use
+	__builtin_aarch64_reduc_..._scal; remove vget_lane wrapper.
+
+2014-10-27  Alan Lawrence  <alan.lawrence@arm.com>
+
 	* config/aarch64/aarch64-simd-builtins.def (reduc_splus_<mode>/VDQF,
 	reduc_uplus_<mode>/VDQF, reduc_splus_v4sf): Remove.
 
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 7fe7c62..62b7f33 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -225,13 +225,13 @@
   /* Implemented by aarch64_reduc_plus_<mode>.  */
   BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
 
-  /* Implemented by reduc_<maxmin_uns>_<mode>.  */
-  BUILTIN_VDQIF (UNOP, reduc_smax_, 10)
-  BUILTIN_VDQIF (UNOP, reduc_smin_, 10)
-  BUILTIN_VDQ_BHSI (UNOP, reduc_umax_, 10)
-  BUILTIN_VDQ_BHSI (UNOP, reduc_umin_, 10)
-  BUILTIN_VDQF (UNOP, reduc_smax_nan_, 10)
-  BUILTIN_VDQF (UNOP, reduc_smin_nan_, 10)
+  /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar).  */
+  BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10)
+  BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10)
+  BUILTIN_VDQF (UNOP, reduc_smax_nan_scal_, 10)
+  BUILTIN_VDQF (UNOP, reduc_smin_nan_scal_, 10)
 
   /* Implemented by <maxmin><mode>3.
      smax variants map to fmaxnm,
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 76a9366..578760a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1847,7 +1847,40 @@
 
 ;; 'across lanes' max and min ops.
 
-(define_insn "reduc_<maxmin_uns>_<mode>"
+;; Template for outputting a scalar, so we can create __builtins which can be
+;; gimple_fold'd to the REDUC_(MAX|MIN)_EXPR tree code.  (This is FP smax/smin).
+(define_expand "reduc_<maxmin_uns>_scal_<mode>"
+  [(match_operand:<VEL> 0 "register_operand")
+   (unspec:VDQF [(match_operand:VDQF 1 "register_operand")]
+		FMAXMINV)]
+  "TARGET_SIMD"
+  {
+    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx scratch = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_aarch64_reduc_<maxmin_uns>_internal<mode> (scratch,
+							      operands[1]));
+    emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
+    DONE;
+  }
+)
+
+;; Likewise for integer cases, signed and unsigned.
+(define_expand "reduc_<maxmin_uns>_scal_<mode>"
+  [(match_operand:<VEL> 0 "register_operand")
+   (unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand")]
+		    MAXMINV)]
+  "TARGET_SIMD"
+  {
+    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx scratch = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_aarch64_reduc_<maxmin_uns>_internal<mode> (scratch,
+							      operands[1]));
+    emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
+    DONE;
+  }
+)
+
+(define_insn "aarch64_reduc_<maxmin_uns>_internal<mode>"
   [(set (match_operand:VDQV_S 0 "register_operand" "=w")
        (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
		    MAXMINV))]
@@ -1856,7 +1889,7 @@
   [(set_attr "type" "neon_reduc_minmax<q>")]
 )
 
-(define_insn "reduc_<maxmin_uns>_v2si"
+(define_insn "aarch64_reduc_<maxmin_uns>_internalv2si"
   [(set (match_operand:V2SI 0 "register_operand" "=w")
        (unspec:V2SI [(match_operand:V2SI 1 "register_operand" "w")]
		    MAXMINV))]
@@ -1865,24 +1898,15 @@
   [(set_attr "type" "neon_reduc_minmax")]
 )
 
-(define_insn "reduc_<maxmin_uns>_<mode>"
-  [(set (match_operand:V2F 0 "register_operand" "=w")
-       (unspec:V2F [(match_operand:V2F 1 "register_operand" "w")]
+(define_insn "aarch64_reduc_<maxmin_uns>_internal<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+       (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")]
		    FMAXMINV))]
   "TARGET_SIMD"
-  "<maxmin_uns_op>p\\t%<Vetype>0, %1.<Vtype>"
+  "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
   [(set_attr "type" "neon_fp_reduc_minmax_<Vetype><q>")]
 )
 
-(define_insn "reduc_<maxmin_uns>_v4sf"
-  [(set (match_operand:V4SF 0 "register_operand" "=w")
-       (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "w")]
-		    FMAXMINV))]
-  "TARGET_SIMD"
-  "<maxmin_uns_op>v\\t%s0, %1.4s"
-  [(set_attr "type" "neon_fp_reduc_minmax_s_q")]
-)
-
 ;; aarch64_simd_bsl may compile to any of bsl/bif/bit depending on register
 ;; allocation.
 ;; Operand 1 is the mask, operands 2 and 3 are the bitfields from which
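
For illustration (not part of the patch): the two define_expand patterns above
are deliberately identical in shape, differing only in mode iterator (VDQF vs.
VDQ_BHSI) and unspec set. Each emits the renamed internal reduction insn into a
scratch vector register, then extracts the element holding the result;
ENDIAN_LANE_N is needed because GCC's numbering of architectural element 0
differs between little- and big-endian. As a hedged C model of the value the
new optab is contracted to produce (the function name is illustrative, not a
GCC internal):

    #include <stdint.h>

    /* Model of reduc_smax_scal_v4si: the optab now yields a scalar
       directly, rather than a vector whose lane 0 holds the result.  */
    static int32_t
    reduc_smax_scal_v4si_model (const int32_t v[4])
    {
      int32_t m = v[0];
      for (int i = 1; i < 4; i++)
        m = (v[i] > m) ? v[i] : m;
      return m;
    }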
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 3d3772f..0ec1a24 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -18590,106 +18590,91 @@ vmaxnmq_f64 (float64x2_t __a, float64x2_t __b)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smax_nan_v2sf (__a),
-			0);
+  return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v8qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v4hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v2si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
 vmaxv_u8 (uint8x8_t __a)
 {
-  return vget_lane_u8 ((uint8x8_t)
-		       __builtin_aarch64_reduc_umax_v8qi ((int8x8_t) __a),
-		       0);
+  return __builtin_aarch64_reduc_umax_scal_v8qi_uu (__a);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
 vmaxv_u16 (uint16x4_t __a)
 {
-  return vget_lane_u16 ((uint16x4_t)
-			__builtin_aarch64_reduc_umax_v4hi ((int16x4_t) __a),
-			0);
+  return __builtin_aarch64_reduc_umax_scal_v4hi_uu (__a);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
 vmaxv_u32 (uint32x2_t __a)
 {
-  return vget_lane_u32 ((uint32x2_t)
-			__builtin_aarch64_reduc_umax_v2si ((int32x2_t) __a),
-			0);
+  return __builtin_aarch64_reduc_umax_scal_v2si_uu (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_nan_v4sf (__a),
-			 0);
+  return __builtin_aarch64_reduc_smax_nan_scal_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vmaxvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_nan_v2df (__a),
-			 0);
+  return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v16qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v8hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v4si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
 vmaxvq_u8 (uint8x16_t __a)
 {
-  return vgetq_lane_u8 ((uint8x16_t)
-			__builtin_aarch64_reduc_umax_v16qi ((int8x16_t) __a),
-			0);
+  return __builtin_aarch64_reduc_umax_scal_v16qi_uu (__a);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
 vmaxvq_u16 (uint16x8_t __a)
 {
-  return vgetq_lane_u16 ((uint16x8_t)
-			 __builtin_aarch64_reduc_umax_v8hi ((int16x8_t) __a),
-			 0);
+  return __builtin_aarch64_reduc_umax_scal_v8hi_uu (__a);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
 vmaxvq_u32 (uint32x4_t __a)
 {
-  return vgetq_lane_u32 ((uint32x4_t)
-			 __builtin_aarch64_reduc_umax_v4si ((int32x4_t) __a),
-			 0);
+  return __builtin_aarch64_reduc_umax_scal_v4si_uu (__a);
 }
 
 /* vmaxnmv  */
@@ -18697,20 +18682,19 @@ vmaxvq_u32 (uint32x4_t __a)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smax_v2sf (__a),
-			0);
+  return __builtin_aarch64_reduc_smax_scal_v2sf (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vmaxnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v2df (__a);
 }
 
 /* vmin  */
@@ -18836,107 +18820,91 @@ vminnmq_f64 (float64x2_t __a, float64x2_t __b)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smin_nan_v2sf (__a),
-			0);
+  return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_smin_v8qi (__a),
-		       0);
+  return __builtin_aarch64_reduc_smin_scal_v8qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v4hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v2si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
 vminv_u8 (uint8x8_t __a)
 {
-  return vget_lane_u8 ((uint8x8_t)
-		       __builtin_aarch64_reduc_umin_v8qi ((int8x8_t) __a),
-		       0);
+  return __builtin_aarch64_reduc_umin_scal_v8qi_uu (__a);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
 vminv_u16 (uint16x4_t __a)
 {
-  return vget_lane_u16 ((uint16x4_t)
-			__builtin_aarch64_reduc_umin_v4hi ((int16x4_t) __a),
-			0);
+  return __builtin_aarch64_reduc_umin_scal_v4hi_uu (__a);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
 vminv_u32 (uint32x2_t __a)
 {
-  return vget_lane_u32 ((uint32x2_t)
-			__builtin_aarch64_reduc_umin_v2si ((int32x2_t) __a),
-			0);
+  return __builtin_aarch64_reduc_umin_scal_v2si_uu (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_nan_v4sf (__a),
-			 0);
+  return __builtin_aarch64_reduc_smin_nan_scal_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vminvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_nan_v2df (__a),
-			 0);
+  return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v16qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v8hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v4si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
 vminvq_u8 (uint8x16_t __a)
 {
-  return vgetq_lane_u8 ((uint8x16_t)
-			__builtin_aarch64_reduc_umin_v16qi ((int8x16_t) __a),
-			0);
+  return __builtin_aarch64_reduc_umin_scal_v16qi_uu (__a);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
 vminvq_u16 (uint16x8_t __a)
 {
-  return vgetq_lane_u16 ((uint16x8_t)
-			 __builtin_aarch64_reduc_umin_v8hi ((int16x8_t) __a),
-			 0);
+  return __builtin_aarch64_reduc_umin_scal_v8hi_uu (__a);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
 vminvq_u32 (uint32x4_t __a)
 {
-  return vgetq_lane_u32 ((uint32x4_t)
-			 __builtin_aarch64_reduc_umin_v4si ((int32x4_t) __a),
-			 0);
+  return __builtin_aarch64_reduc_umin_scal_v4si_uu (__a);
 }
 
 /* vminnmv  */
@@ -18944,19 +18912,19 @@ vminvq_u32 (uint32x4_t __a)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v2sf (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vminnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v2df (__a);
 }
 
 /* vmla  */
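
For illustration (not part of the patch): the unsigned intrinsics benefit most,
since the UNOPU qualifier gives the new builtins a genuinely unsigned
prototype, so the casts through signed vector types disappear along with the
lane extract. A usage sketch (the function name umax_across_lanes is made up
for the example):

    #include <arm_neon.h>
    #include <stdint.h>

    uint8_t
    umax_across_lanes (uint8x8_t v)
    {
      /* The header body is now one call to
         __builtin_aarch64_reduc_umax_scal_v8qi_uu; expected to assemble
         to a single "umaxv b0, v0.8b".  */
      return vmaxv_u8 (v);
    }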