diff options
author | Alan Lawrence <alan.lawrence@arm.com> | 2015-09-08 19:03:53 +0000 |
---|---|---|
committer | Alan Lawrence <alalaw01@gcc.gnu.org> | 2015-09-08 19:03:53 +0000 |
commit | 7c3694857570a7ed472cbacbb027521715a26da6 (patch) | |
tree | 9e5b275c8a6b11c641640fb4aef119d7305bafbf /gcc/config | |
parent | 71a11456ef9b85dc6cc098151d90b6651db7f0df (diff) | |
download | gcc-7c3694857570a7ed472cbacbb027521715a26da6.zip gcc-7c3694857570a7ed472cbacbb027521715a26da6.tar.gz gcc-7c3694857570a7ed472cbacbb027521715a26da6.tar.bz2 |
[AArch64] vld{2,3,4}{,_lane,_dup}, vcombine, vcreate
gcc/:
* config/aarch64/aarch64.c (aarch64_split_simd_combine): Add V4HFmode.
* config/aarch64/aarch64-builtins.c (VAR13, VAR14): New.
(aarch64_scalar_builtin_types, aarch64_init_simd_builtin_scalar_types):
Add __builtin_aarch64_simd_hf.
* config/aarch64/arm_neon.h (float16x4x2_t, float16x8x2_t,
float16x4x3_t, float16x8x3_t, float16x4x4_t, float16x8x4_t,
vcombine_f16, vst2_lane_f16, vst2q_lane_f16, vst3_lane_f16,
vst3q_lane_f16, vst4_lane_f16, vst4q_lane_f16, vld2_f16, vld2q_f16,
vld3_f16, vld3q_f16, vld4_f16, vld4q_f16, vld2_dup_f16, vld2q_dup_f16,
vld3_dup_f16, vld3q_dup_f16, vld4_dup_f16, vld4q_dup_f16,
vld2_lane_f16, vld2q_lane_f16, vld3_lane_f16, vld3q_lane_f16,
vld4_lane_f16, vld4q_lane_f16, vst2_f16, vst2q_f16, vst3_f16,
vst3q_f16, vst4_f16, vst4q_f16, vcreate_f16): New.
* config/aarch64/iterators.md (VALLDIF, Vtype, Vetype, Vbtype,
V_cmp_result, v_cmp_result): Add cases for V4HF and V8HF.
(VDC, Vdbl): Add V4HF.
gcc/testsuite/:
* gcc.target/aarch64/vldN_1.c: Add float16x4_t and float16x8_t cases.
* gcc.target/aarch64/vldN_dup_1.c: Likewise.
* gcc.target/aarch64/vldN_lane_1.c: Likewise.
(main): update orig_data to avoid float16 NaN on bigendian.
From-SVN: r227543
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/aarch64/aarch64-builtins.c | 9 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64.c | 3 | ||||
-rw-r--r-- | gcc/config/aarch64/arm_neon.h | 284 | ||||
-rw-r--r-- | gcc/config/aarch64/iterators.md | 18 |
4 files changed, 305 insertions, 9 deletions
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index 243fa9f..5a04263 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -297,6 +297,12 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define VAR12(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \ VAR11 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K) \ VAR1 (T, N, MAP, L) +#define VAR13(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \ + VAR12 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \ + VAR1 (T, N, MAP, M) +#define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ + VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \ + VAR1 (T, X, MAP, N) #include "aarch64-builtin-iterators.h" @@ -374,6 +380,7 @@ const char *aarch64_scalar_builtin_types[] = { "__builtin_aarch64_simd_qi", "__builtin_aarch64_simd_hi", "__builtin_aarch64_simd_si", + "__builtin_aarch64_simd_hf", "__builtin_aarch64_simd_sf", "__builtin_aarch64_simd_di", "__builtin_aarch64_simd_df", @@ -661,6 +668,8 @@ aarch64_init_simd_builtin_scalar_types (void) "__builtin_aarch64_simd_qi"); (*lang_hooks.types.register_builtin_type) (intHI_type_node, "__builtin_aarch64_simd_hi"); + (*lang_hooks.types.register_builtin_type) (aarch64_fp16_type_node, + "__builtin_aarch64_simd_hf"); (*lang_hooks.types.register_builtin_type) (intSI_type_node, "__builtin_aarch64_simd_si"); (*lang_hooks.types.register_builtin_type) (float_type_node, diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 5951045..7967170 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1335,6 +1335,9 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2) case V2SImode: gen = gen_aarch64_simd_combinev2si; break; + case V4HFmode: + gen = gen_aarch64_simd_combinev4hf; + break; case V2SFmode: gen = gen_aarch64_simd_combinev2sf; break; diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 5f61b5b..2bb75bb0e 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -152,6 +152,16 @@ typedef struct uint64x2x2_t uint64x2_t val[2]; } uint64x2x2_t; +typedef struct float16x4x2_t +{ + float16x4_t val[2]; +} float16x4x2_t; + +typedef struct float16x8x2_t +{ + float16x8_t val[2]; +} float16x8x2_t; + typedef struct float32x2x2_t { float32x2_t val[2]; @@ -272,6 +282,16 @@ typedef struct uint64x2x3_t uint64x2_t val[3]; } uint64x2x3_t; +typedef struct float16x4x3_t +{ + float16x4_t val[3]; +} float16x4x3_t; + +typedef struct float16x8x3_t +{ + float16x8_t val[3]; +} float16x8x3_t; + typedef struct float32x2x3_t { float32x2_t val[3]; @@ -392,6 +412,16 @@ typedef struct uint64x2x4_t uint64x2_t val[4]; } uint64x2x4_t; +typedef struct float16x4x4_t +{ + float16x4_t val[4]; +} float16x4x4_t; + +typedef struct float16x8x4_t +{ + float16x8_t val[4]; +} float16x8x4_t; + typedef struct float32x2x4_t { float32x2_t val[4]; @@ -2643,6 +2673,12 @@ vcreate_s64 (uint64_t __a) return (int64x1_t) {__a}; } +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vcreate_f16 (uint64_t __a) +{ + return (float16x4_t) __a; +} + __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vcreate_f32 (uint64_t __a) { @@ -4779,6 +4815,12 @@ vcombine_s64 (int64x1_t __a, int64x1_t __b) return __builtin_aarch64_combinedi (__a[0], __b[0]); } +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vcombine_f16 (float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_combinev4hf (__a, __b); +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vcombine_f32 (float32x2_t __a, float32x2_t __b) { @@ -9907,7 +9949,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b) +------+----+----+----+----+ |uint | Y | Y | N | N | +------+----+----+----+----+ - |float | - | - | N | N | + |float | - | Y | N | N | +------+----+----+----+----+ |poly | Y | Y | - | - | +------+----+----+----+----+ @@ -9921,7 +9963,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b) +------+----+----+----+----+ |uint | Y | Y | Y | Y | +------+----+----+----+----+ - |float | - | - | Y | Y | + |float | - | Y | Y | Y | +------+----+----+----+----+ |poly | Y | Y | - | - | +------+----+----+----+----+ @@ -9935,7 +9977,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b) +------+----+----+----+----+ |uint | Y | N | N | Y | +------+----+----+----+----+ - |float | - | - | N | Y | + |float | - | N | N | Y | +------+----+----+----+----+ |poly | Y | N | - | - | +------+----+----+----+----+ @@ -9951,6 +9993,7 @@ __STRUCTN (int, 8, 2) __STRUCTN (int, 16, 2) __STRUCTN (uint, 8, 2) __STRUCTN (uint, 16, 2) +__STRUCTN (float, 16, 2) __STRUCTN (poly, 8, 2) __STRUCTN (poly, 16, 2) /* 3-element structs. */ @@ -9962,6 +10005,7 @@ __STRUCTN (uint, 8, 3) __STRUCTN (uint, 16, 3) __STRUCTN (uint, 32, 3) __STRUCTN (uint, 64, 3) +__STRUCTN (float, 16, 3) __STRUCTN (float, 32, 3) __STRUCTN (float, 64, 3) __STRUCTN (poly, 8, 3) @@ -9999,6 +10043,8 @@ vst2_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __o, __c); \ } +__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16, + float16x8_t) __ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32, float32x4_t) __ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64, @@ -10037,6 +10083,7 @@ vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __temp.__o, __c); \ } +__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16) __ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) __ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) __ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) @@ -10078,6 +10125,8 @@ vst3_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __o, __c); \ } +__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16, + float16x8_t) __ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32, float32x4_t) __ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64, @@ -10116,6 +10165,7 @@ vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __temp.__o, __c); \ } +__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16) __ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) __ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) __ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) @@ -10162,6 +10212,8 @@ vst4_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __o, __c); \ } +__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16, + float16x8_t) __ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32, float32x4_t) __ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64, @@ -10200,6 +10252,7 @@ vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __temp.__o, __c); \ } +__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) __ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) __ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) __ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) @@ -15255,6 +15308,17 @@ vld2_u32 (const uint32_t * __a) return ret; } +__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__)) +vld2_f16 (const float16_t * __a) +{ + float16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v4hf (__a); + ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1); + return ret; +} + __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) vld2_f32 (const float32_t * __a) { @@ -15376,6 +15440,17 @@ vld2q_u64 (const uint64_t * __a) return ret; } +__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__)) +vld2q_f16 (const float16_t * __a) +{ + float16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v8hf (__a); + ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1); + return ret; +} + __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) vld2q_f32 (const float32_t * __a) { @@ -15530,6 +15605,18 @@ vld3_u32 (const uint32_t * __a) return ret; } +__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__)) +vld3_f16 (const float16_t * __a) +{ + float16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4hf (__a); + ret.val[0] = __builtin_aarch64_get_dregciv4hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2); + return ret; +} + __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) vld3_f32 (const float32_t * __a) { @@ -15662,6 +15749,18 @@ vld3q_u64 (const uint64_t * __a) return ret; } +__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__)) +vld3q_f16 (const float16_t * __a) +{ + float16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8hf (__a); + ret.val[0] = __builtin_aarch64_get_qregciv8hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2); + return ret; +} + __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__)) vld3q_f32 (const float32_t * __a) { @@ -15829,6 +15928,19 @@ vld4_u32 (const uint32_t * __a) return ret; } +__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__)) +vld4_f16 (const float16_t * __a) +{ + float16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4hf (__a); + ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2); + ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3); + return ret; +} + __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) vld4_f32 (const float32_t * __a) { @@ -15972,6 +16084,19 @@ vld4q_u64 (const uint64_t * __a) return ret; } +__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__)) +vld4q_f16 (const float16_t * __a) +{ + float16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8hf (__a); + ret.val[0] = __builtin_aarch64_get_qregxiv8hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_qregxiv8hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2); + ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3); + return ret; +} + __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__)) vld4q_f32 (const float32_t * __a) { @@ -16033,6 +16158,17 @@ vld2_dup_s32 (const int32_t * __a) return ret; } +__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__)) +vld2_dup_f16 (const float16_t * __a) +{ + float16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0); + ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1); + return ret; +} + __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) vld2_dup_f32 (const float32_t * __a) { @@ -16242,6 +16378,17 @@ vld2q_dup_u64 (const uint64_t * __a) return ret; } +__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__)) +vld2q_dup_f16 (const float16_t * __a) +{ + float16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1); + return ret; +} + __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) vld2q_dup_f32 (const float32_t * __a) { @@ -16396,6 +16543,18 @@ vld3_dup_u32 (const uint32_t * __a) return ret; } +__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__)) +vld3_dup_f16 (const float16_t * __a) +{ + float16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0); + ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1); + ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2); + return ret; +} + __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) vld3_dup_f32 (const float32_t * __a) { @@ -16528,6 +16687,18 @@ vld3q_dup_u64 (const uint64_t * __a) return ret; } +__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__)) +vld3q_dup_f16 (const float16_t * __a) +{ + float16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0); + ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1); + ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2); + return ret; +} + __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__)) vld3q_dup_f32 (const float32_t * __a) { @@ -16695,6 +16866,19 @@ vld4_dup_u32 (const uint32_t * __a) return ret; } +__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__)) +vld4_dup_f16 (const float16_t * __a) +{ + float16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 0); + ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 1); + ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2); + ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3); + return ret; +} + __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) vld4_dup_f32 (const float32_t * __a) { @@ -16838,6 +17022,19 @@ vld4q_dup_u64 (const uint64_t * __a) return ret; } +__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__)) +vld4q_dup_f16 (const float16_t * __a) +{ + float16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 0); + ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 1); + ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 2); + ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 3); + return ret; +} + __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__)) vld4q_dup_f32 (const float32_t * __a) { @@ -16890,6 +17087,8 @@ vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ return __b; \ } +__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf, + v8hf, hf, f16, float16x8_t) __LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32, float32x4_t) __LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df, @@ -16934,6 +17133,7 @@ vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ return ret; \ } +__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16) __LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32) __LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64) __LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8) @@ -16981,6 +17181,8 @@ vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ return __b; \ } +__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf, + v8hf, hf, f16, float16x8_t) __LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32, float32x4_t) __LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df, @@ -17027,6 +17229,7 @@ vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ return ret; \ } +__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16) __LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32) __LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64) __LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8) @@ -17082,6 +17285,8 @@ vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ /* vld4q_lane */ +__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf, + v8hf, hf, f16, float16x8_t) __LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32, float32x4_t) __LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df, @@ -17130,6 +17335,7 @@ vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ return ret; \ } +__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16) __LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32) __LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64) __LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8) @@ -22491,6 +22697,18 @@ vst2_u32 (uint32_t * __a, uint32x2x2_t val) } __extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_f16 (float16_t * __a, float16x4x2_t val) +{ + __builtin_aarch64_simd_oi __o; + float16x8x2_t temp; + temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1); + __builtin_aarch64_st2v4hf (__a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) vst2_f32 (float32_t * __a, float32x2x2_t val) { __builtin_aarch64_simd_oi __o; @@ -22593,6 +22811,15 @@ vst2q_u64 (uint64_t * __a, uint64x2x2_t val) } __extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_f16 (float16_t * __a, float16x8x2_t val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1); + __builtin_aarch64_st2v8hf (__a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) vst2q_f32 (float32_t * __a, float32x4x2_t val) { __builtin_aarch64_simd_oi __o; @@ -22765,6 +22992,20 @@ vst3_u32 (uint32_t * __a, uint32x2x3_t val) } __extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_f16 (float16_t * __a, float16x4x3_t val) +{ + __builtin_aarch64_simd_ci __o; + float16x8x3_t temp; + temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2); + __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) vst3_f32 (float32_t * __a, float32x2x3_t val) { __builtin_aarch64_simd_ci __o; @@ -22879,6 +23120,16 @@ vst3q_u64 (uint64_t * __a, uint64x2x3_t val) } __extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_f16 (float16_t * __a, float16x8x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2); + __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) vst3q_f32 (float32_t * __a, float32x4x3_t val) { __builtin_aarch64_simd_ci __o; @@ -23075,6 +23326,22 @@ vst4_u32 (uint32_t * __a, uint32x2x4_t val) } __extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_f16 (float16_t * __a, float16x4x4_t val) +{ + __builtin_aarch64_simd_xi __o; + float16x8x4_t temp; + temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3); + __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) vst4_f32 (float32_t * __a, float32x2x4_t val) { __builtin_aarch64_simd_xi __o; @@ -23201,6 +23468,17 @@ vst4q_u64 (uint64_t * __a, uint64x2x4_t val) } __extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_f16 (float16_t * __a, float16x8x4_t val) +{ + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3); + __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) vst4q_f32 (float32_t * __a, float32x4x4_t val) { __builtin_aarch64_simd_xi __o; diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 3e4f07d..58cc000 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -111,9 +111,9 @@ (define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V4HF V8HF V2SF V4SF V2DF DI]) -;; All vector modes barring HF modes, plus DI and DF. +;; All vector modes, plus DI and DF. (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI - V2DI V2SF V4SF V2DF DI DF]) + V2DI V4HF V8HF V2SF V4SF V2DF DI DF]) ;; Vector modes for Integer reduction across lanes. (define_mode_iterator VDQV [V8QI V16QI V4HI V8HI V4SI V2DI]) @@ -134,7 +134,7 @@ (define_mode_iterator VQW [V16QI V8HI V4SI]) ;; Double vector modes for combines. -(define_mode_iterator VDC [V8QI V4HI V2SI V2SF DI DF]) +(define_mode_iterator VDC [V8QI V4HI V4HF V2SI V2SF DI DF]) ;; Vector modes except double int. (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF]) @@ -361,7 +361,8 @@ (V2SI "2s") (V4SI "4s") (DI "1d") (DF "1d") (V2DI "2d") (V2SF "2s") - (V4SF "4s") (V2DF "2d")]) + (V4SF "4s") (V2DF "2d") + (V4HF "4h") (V8HF "8h")]) (define_mode_attr Vrevsuff [(V4HI "16") (V8HI "16") (V2SI "32") (V4SI "32") (V2DI "64")]) @@ -387,7 +388,8 @@ (define_mode_attr Vetype [(V8QI "b") (V16QI "b") (V4HI "h") (V8HI "h") (V2SI "s") (V4SI "s") - (V2DI "d") (V2SF "s") + (V2DI "d") (V4HF "h") + (V8HF "h") (V2SF "s") (V4SF "s") (V2DF "d") (SF "s") (DF "d") (QI "b") (HI "h") @@ -397,7 +399,8 @@ (define_mode_attr Vbtype [(V8QI "8b") (V16QI "16b") (V4HI "8b") (V8HI "16b") (V2SI "8b") (V4SI "16b") - (V2DI "16b") (V2SF "8b") + (V2DI "16b") (V4HF "8b") + (V8HF "16b") (V2SF "8b") (V4SF "16b") (V2DF "16b") (DI "8b") (DF "8b") (SI "8b")]) @@ -448,6 +451,7 @@ ;; Double modes of vector modes (lower case). (define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi") + (V4HF "v8hf") (V2SI "v4si") (V2SF "v4sf") (SI "v2si") (DI "v2di") (DF "v2df")]) @@ -522,6 +526,7 @@ (V4HI "V4HI") (V8HI "V8HI") (V2SI "V2SI") (V4SI "V4SI") (DI "DI") (V2DI "V2DI") + (V4HF "V4HI") (V8HF "V8HI") (V2SF "V2SI") (V4SF "V4SI") (V2DF "V2DI") (DF "DI") (SF "SI")]) @@ -531,6 +536,7 @@ (V4HI "v4hi") (V8HI "v8hi") (V2SI "v2si") (V4SI "v4si") (DI "di") (V2DI "v2di") + (V4HF "v4hi") (V8HF "v8hi") (V2SF "v2si") (V4SF "v4si") (V2DF "v2di") (DF "di") (SF "si")]) |