author     Mihail Ionescu <mihail.ionescu@arm.com>        2020-02-18 14:29:47 +0000
committer  Richard Sandiford <richard.sandiford@arm.com>  2020-02-25 18:36:53 +0000
commit     e603cd43b145c426468c95cf85b3c12c94daedaa
tree       1a81b53ca570e287978ff1f06e2aec601047d734 /gcc
parent     8ea6c1b89a20ef7c675535ba1994355361dac977
aarch64: Add bfloat16 vldn/vstn intrinsics
This patch adds the load/store bfloat16 intrinsics to the AArch64
back-end.

ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest

2020-02-25  Mihail Ionescu  <mihail.ionescu@arm.com>

gcc/
	* config/aarch64/aarch64-builtins.c (aarch64_scalar_builtin_types):
	Add simd_bf.
	(aarch64_init_simd_builtin_scalar_types): Register simd_bf.
	(VAR15, VAR16): New.
	* config/aarch64/iterators.md (VALLDIF): Enable for V4BF and V8BF.
	(VD): Enable for V4BF.
	(VDC): Likewise.
	(VQ): Enable for V8BF.
	(VQ2): Likewise.
	(VQ_NO2E): Likewise.
	(VDBL, Vdbl): Add V4BF.
	(V_INT_EQUIV, v_int_equiv): Add V4BF and V8BF.
	* config/aarch64/arm_neon.h (bfloat16x4x2_t): New typedef.
	(bfloat16x8x2_t): Likewise.
	(bfloat16x4x3_t): Likewise.
	(bfloat16x8x3_t): Likewise.
	(bfloat16x4x4_t): Likewise.
	(bfloat16x8x4_t): Likewise.
	(vcombine_bf16): New.
	(vld1_bf16, vld1_bf16_x2): New.
	(vld1_bf16_x3, vld1_bf16_x4): New.
	(vld1q_bf16, vld1q_bf16_x2): New.
	(vld1q_bf16_x3, vld1q_bf16_x4): New.
	(vld1_lane_bf16): New.
	(vld1q_lane_bf16): New.
	(vld1_dup_bf16): New.
	(vld1q_dup_bf16): New.
	(vld2_bf16): New.
	(vld2q_bf16): New.
	(vld2_dup_bf16): New.
	(vld2q_dup_bf16): New.
	(vld3_bf16): New.
	(vld3q_bf16): New.
	(vld3_dup_bf16): New.
	(vld3q_dup_bf16): New.
	(vld4_bf16): New.
	(vld4q_bf16): New.
	(vld4_dup_bf16): New.
	(vld4q_dup_bf16): New.
	(vst1_bf16, vst1_bf16_x2): New.
	(vst1_bf16_x3, vst1_bf16_x4): New.
	(vst1q_bf16, vst1q_bf16_x2): New.
	(vst1q_bf16_x3, vst1q_bf16_x4): New.
	(vst1_lane_bf16): New.
	(vst1q_lane_bf16): New.
	(vst2_bf16): New.
	(vst2q_bf16): New.
	(vst3_bf16): New.
	(vst3q_bf16): New.
	(vst4_bf16): New.
	(vst4q_bf16): New.

gcc/testsuite/
	* gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c: New test.
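[Editor's note] For orientation, here is a minimal usage sketch (not part of
the patch) of the kind of code the new intrinsics enable.  It assumes a
compiler with this patch applied and BF16 support enabled, e.g.
-march=armv8.2-a+bf16; the function name is hypothetical:

#include <arm_neon.h>

/* Illustration only: deinterleave two bf16 channels from memory and
   re-interleave them on the way back out.  There are no bf16 vector
   arithmetic intrinsics beyond the dot-product family, so the actual
   processing of ch.val[0]/ch.val[1] is left as a comment.  */
void
touch_channels (bfloat16_t *buf)
{
  bfloat16x4x2_t ch = vld2_bf16 (buf);   /* loads 8 interleaved elements */
  /* ... operate on ch.val[0] and ch.val[1] here ... */
  vst2_bf16 (buf, ch);                   /* stores them back interleaved */
}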
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/ChangeLog                                                     54
-rw-r--r--  gcc/config/aarch64/aarch64-builtins.c                              9
-rw-r--r--  gcc/config/aarch64/arm_neon.h                                    479
-rw-r--r--  gcc/config/aarch64/iterators.md                                   18
-rw-r--r--  gcc/testsuite/ChangeLog                                            5
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c  150
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c  107
7 files changed, 814 insertions, 8 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 334a16e..653c276 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,59 @@
2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com>
+ * config/aarch64/aarch64-builtins.c (aarch64_scalar_builtin_types):
+ Add simd_bf.
+ (aarch64_init_simd_builtin_scalar_types): Register simd_bf.
+ (VAR15, VAR16): New.
+ * config/aarch64/iterators.md (VALLDIF): Enable for V4BF and V8BF.
+ (VD): Enable for V4BF.
+ (VDC): Likewise.
+ (VQ): Enable for V8BF.
+ (VQ2): Likewise.
+ (VQ_NO2E): Likewise.
+ (VDBL, Vdbl): Add V4BF.
+ (V_INT_EQUIV, v_int_equiv): Add V4BF and V8BF.
+ * config/aarch64/arm_neon.h (bfloat16x4x2_t): New typedef.
+ (bfloat16x8x2_t): Likewise.
+ (bfloat16x4x3_t): Likewise.
+ (bfloat16x8x3_t): Likewise.
+ (bfloat16x4x4_t): Likewise.
+ (bfloat16x8x4_t): Likewise.
+ (vcombine_bf16): New.
+ (vld1_bf16, vld1_bf16_x2): New.
+ (vld1_bf16_x3, vld1_bf16_x4): New.
+ (vld1q_bf16, vld1q_bf16_x2): New.
+ (vld1q_bf16_x3, vld1q_bf16_x4): New.
+ (vld1_lane_bf16): New.
+ (vld1q_lane_bf16): New.
+ (vld1_dup_bf16): New.
+ (vld1q_dup_bf16): New.
+ (vld2_bf16): New.
+ (vld2q_bf16): New.
+ (vld2_dup_bf16): New.
+ (vld2q_dup_bf16): New.
+ (vld3_bf16): New.
+ (vld3q_bf16): New.
+ (vld3_dup_bf16): New.
+ (vld3q_dup_bf16): New.
+ (vld4_bf16): New.
+ (vld4q_bf16): New.
+ (vld4_dup_bf16): New.
+ (vld4q_dup_bf16): New.
+ (vst1_bf16, vst1_bf16_x2): New.
+ (vst1_bf16_x3, vst1_bf16_x4): New.
+ (vst1q_bf16, vst1q_bf16_x2): New.
+ (vst1q_bf16_x3, vst1q_bf16_x4): New.
+ (vst1_lane_bf16): New.
+ (vst1q_lane_bf16): New.
+ (vst2_bf16): New.
+ (vst2q_bf16): New.
+ (vst3_bf16): New.
+ (vst3q_bf16): New.
+ (vst4_bf16): New.
+ (vst4q_bf16): New.
+
+2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com>
+
* config/aarch64/iterators.md (VDQF_F16) Add V4BF and V8BF.
(VALL_F16): Likewise.
(VALLDI_F16): Likewise.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index f50c485..9c9c6d8 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -370,6 +370,12 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
#define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \
VAR1 (T, X, MAP, N)
+#define VAR15(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
+ VAR14 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
+ VAR1 (T, X, MAP, O)
+#define VAR16(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+ VAR15 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
+ VAR1 (T, X, MAP, P)
#include "aarch64-builtin-iterators.h"
@@ -534,6 +540,7 @@ const char *aarch64_scalar_builtin_types[] = {
"__builtin_aarch64_simd_oi",
"__builtin_aarch64_simd_ci",
"__builtin_aarch64_simd_xi",
+ "__builtin_aarch64_simd_bf",
NULL
};
@@ -847,6 +854,8 @@ aarch64_init_simd_builtin_scalar_types (void)
"__builtin_aarch64_simd_poly128");
(*lang_hooks.types.register_builtin_type) (intTI_type_node,
"__builtin_aarch64_simd_ti");
+ (*lang_hooks.types.register_builtin_type) (aarch64_bf16_type_node,
+ "__builtin_aarch64_simd_bf");
/* Unsigned integer types for various mode sizes. */
(*lang_hooks.types.register_builtin_type) (unsigned_intQI_type_node,
"__builtin_aarch64_simd_uqi");
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index a4f2dd2..b6f42ac 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -76,6 +76,36 @@ typedef double float64_t;
typedef __Bfloat16x4_t bfloat16x4_t;
typedef __Bfloat16x8_t bfloat16x8_t;
+typedef struct bfloat16x4x2_t
+{
+ bfloat16x4_t val[2];
+} bfloat16x4x2_t;
+
+typedef struct bfloat16x8x2_t
+{
+ bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+
+typedef struct bfloat16x4x3_t
+{
+ bfloat16x4_t val[3];
+} bfloat16x4x3_t;
+
+typedef struct bfloat16x8x3_t
+{
+ bfloat16x8_t val[3];
+} bfloat16x8x3_t;
+
+typedef struct bfloat16x4x4_t
+{
+ bfloat16x4_t val[4];
+} bfloat16x4x4_t;
+
+typedef struct bfloat16x8x4_t
+{
+ bfloat16x8_t val[4];
+} bfloat16x8x4_t;
+
typedef struct int8x8x2_t
{
int8x8_t val[2];
@@ -34589,6 +34619,13 @@ vcreate_bf16 (uint64_t __a)
return (bfloat16x4_t) __a;
}
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b)
+{
+ return (bfloat16x8_t)__builtin_aarch64_combinev4bf (__a, __b);
+}
+
/* vdup */
__extension__ extern __inline bfloat16x4_t
@@ -34647,6 +34684,448 @@ vduph_laneq_bf16 (bfloat16x8_t __a, const int __b)
return __aarch64_vget_lane_any (__a, __b);
}
+/* vld */
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_bf16 (const bfloat16_t *__a)
+{
+ return (bfloat16x4_t) __builtin_aarch64_ld1v4bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_bf16 (const bfloat16_t *__a)
+{
+ return __builtin_aarch64_ld1v8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_bf16_x2 (const bfloat16_t *__a)
+{
+ bfloat16x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld1x2v4bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0);
+ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_bf16_x2 (const bfloat16_t *__a)
+{
+ bfloat16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld1x2v8bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0);
+ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_bf16_x3 (const bfloat16_t *__a)
+{
+ bfloat16x4x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v4bf ((const __builtin_aarch64_simd_bf *) __a);
+ __i.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0);
+ __i.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1);
+ __i.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline bfloat16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_bf16_x3 (const bfloat16_t *__a)
+{
+ bfloat16x8x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v8bf ((const __builtin_aarch64_simd_bf *) __a);
+ __i.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0);
+ __i.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1);
+ __i.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline bfloat16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_bf16_x4 (const bfloat16_t *__a)
+{
+ union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au;
+ __au.__o
+ = __builtin_aarch64_ld1x4v4bf ((const __builtin_aarch64_simd_bf *) __a);
+ return __au.__i;
+}
+
+__extension__ extern __inline bfloat16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_bf16_x4 (const bfloat16_t *__a)
+{
+ union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au;
+ __au.__o
+ = __builtin_aarch64_ld1x4v8bf ((const __builtin_aarch64_simd_bf *) __a);
+ return __au.__i;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_bf16 (const bfloat16_t *__src, bfloat16x4_t __vec, const int __lane)
+{
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_bf16 (const bfloat16_t *__src, bfloat16x8_t __vec, const int __lane)
+{
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_bf16 (const bfloat16_t* __a)
+{
+ return vdup_n_bf16 (*__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_bf16 (const bfloat16_t* __a)
+{
+ return vdupq_n_bf16 (*__a);
+}
+
+__extension__ extern __inline bfloat16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v4bf (__a);
+ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0);
+ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v8bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0);
+ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv4bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0);
+ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv8bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0);
+ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v4bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0);
+ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1);
+ ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v8bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0);
+ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1);
+ ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv4bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0);
+ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1);
+ ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv8bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0);
+ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1);
+ ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v4bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0);
+ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1);
+ ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2);
+ ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v8bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0);
+ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1);
+ ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2);
+ ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv4bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0);
+ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1);
+ ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2);
+ ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3);
+ return ret;
+}
+
+__extension__ extern __inline bfloat16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv8bf ((const __builtin_aarch64_simd_bf *) __a);
+ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0);
+ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1);
+ ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2);
+ ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3);
+ return ret;
+}
+
+/* vst */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_bf16 (bfloat16_t *__a, bfloat16x4_t __b)
+{
+ __builtin_aarch64_st1v4bf (__a, __b);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val)
+{
+ __builtin_aarch64_simd_oi __o;
+ bfloat16x8x2_t __temp;
+ __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1);
+ __builtin_aarch64_st1x2v4bf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1);
+ __builtin_aarch64_st1x2v8bf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val)
+{
+ __builtin_aarch64_simd_ci __o;
+ bfloat16x8x3_t __temp;
+ __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2);
+ __builtin_aarch64_st1x3v4bf ((__builtin_aarch64_simd_bf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2);
+ __builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val)
+{
+ union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val };
+ __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val)
+{
+ union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val };
+ __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_bf16 (bfloat16_t *__a, bfloat16x8_t __b)
+{
+ __builtin_aarch64_st1v8bf (__a, __b);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_bf16 (bfloat16_t *__a, bfloat16x4_t __b, const int __lane)
+{
+ *__a = __aarch64_vget_lane_any (__b, __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_bf16 (bfloat16_t *__a, bfloat16x8_t __b, const int __lane)
+{
+ *__a = __aarch64_vget_lane_any (__b, __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val)
+{
+ __builtin_aarch64_simd_oi __o;
+ bfloat16x8x2_t __temp;
+ __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1);
+ __builtin_aarch64_st2v4bf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1);
+ __builtin_aarch64_st2v8bf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val)
+{
+ __builtin_aarch64_simd_ci __o;
+ bfloat16x8x3_t __temp;
+ __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2);
+ __builtin_aarch64_st3v4bf ((__builtin_aarch64_simd_bf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2);
+ __builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val)
+{
+ __builtin_aarch64_simd_xi __o;
+ bfloat16x8x4_t __temp;
+ __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2);
+ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3);
+ __builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val)
+{
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[1], 1);
+ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[2], 2);
+ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[3], 3);
+ __builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __o);
+}
+
/* vreinterpret */
__extension__ extern __inline bfloat16x4_t
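[Editor's note] Two implementation idioms recur in the arm_neon.h hunk
above.  The 64-bit stN wrappers widen each d-register input to a q-register
by zero-padding the upper half, because the set_qreg* builtins traffic only
in q-register tuples; the store instruction then reads just the low halves.
The x4 variants instead type-pun the tuple through a union with the
builtin's opaque register type.  A sketch of the padding idiom (illustration
only; helper name is hypothetical):

#include <arm_neon.h>

/* Widen a 64-bit bf16 vector with a zero upper half, as the
   vst2_bf16/vst3_bf16/vst4_bf16 wrappers above do before handing
   the data to the q-register builtin interface.  */
static inline bfloat16x8_t
widen_with_zero (bfloat16x4_t lo)
{
  return vcombine_bf16 (lo, vcreate_bf16 (__AARCH64_UINT64_C (0)));
}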
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 571a5fa..ec1b92c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -87,7 +87,7 @@
(define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI])
;; Double vector modes.
-(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
+(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF V4BF])
;; Double vector modes suitable for moving. Includes BFmode.
(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
@@ -105,10 +105,10 @@
(define_mode_iterator VDQ_BHSI [V8QI V16QI V4HI V8HI V2SI V4SI])
;; Quad vector modes.
-(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
+(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF])
;; Copy of the above.
-(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
+(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
;; Quad vector modes suitable for moving. Includes BFmode.
(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
@@ -120,7 +120,7 @@
(define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
;; VQ without 2 element modes.
-(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF])
+(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
;; Quad vector with only 2 element modes.
(define_mode_iterator VQ_2E [V2DI V2DF])
@@ -200,7 +200,7 @@
V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI])
;; All Advanced SIMD modes, plus DI and DF.
-(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI
+(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF
V2DI V4HF V8HF V2SF V4SF V2DF DI DF])
;; Advanced SIMD modes for Integer reduction across lanes.
@@ -226,7 +226,7 @@
(define_mode_iterator VQW [V16QI V8HI V4SI])
;; Double vector modes for combines.
-(define_mode_iterator VDC [V8QI V4HI V4HF V2SI V2SF DI DF])
+(define_mode_iterator VDC [V8QI V4HI V4BF V4HF V2SI V2SF DI DF])
;; Advanced SIMD modes except double int.
(define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
@@ -1171,7 +1171,7 @@
;; Double modes of vector modes.
(define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
- (V4HF "V8HF")
+ (V4HF "V8HF") (V4BF "V8BF")
(V2SI "V4SI") (V2SF "V4SF")
(SI "V2SI") (DI "V2DI")
(DF "V2DF")])
@@ -1181,7 +1181,7 @@
;; Double modes of vector modes (lower case).
(define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi")
- (V4HF "v8hf")
+ (V4HF "v8hf") (V4BF "v8bf")
(V2SI "v4si") (V2SF "v4sf")
(SI "v2si") (DI "v2di")
(DF "v2df")])
@@ -1314,6 +1314,7 @@
(V2SI "V2SI") (V4SI "V4SI")
(DI "DI") (V2DI "V2DI")
(V4HF "V4HI") (V8HF "V8HI")
+ (V4BF "V4HI") (V8BF "V8HI")
(V2SF "V2SI") (V4SF "V4SI")
(DF "DI") (V2DF "V2DI")
(SF "SI") (SI "SI")
@@ -1331,6 +1332,7 @@
(V2SI "v2si") (V4SI "v4si")
(DI "di") (V2DI "v2di")
(V4HF "v4hi") (V8HF "v8hi")
+ (V4BF "v4hi") (V8BF "v8hi")
(V2SF "v2si") (V4SF "v4si")
(DF "di") (V2DF "v2di")
(SF "si")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index c942486..f344376 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,10 @@
2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com>
+ * gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c: New test.
+ * gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c: New test.
+
+2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com>
+
* gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c: New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c
new file mode 100644
index 0000000..cf24509
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c
@@ -0,0 +1,150 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
+
+bfloat16x4_t
+test_vld1_dup_bf16 (bfloat16_t * ptr)
+{
+ return vld1_dup_bf16 (ptr);
+}
+
+bfloat16x8_t
+test_vld1q_dup_bf16 (bfloat16_t * ptr)
+{
+ return vld1q_dup_bf16 (ptr);
+}
+
+bfloat16x4_t
+test_vld1_lane_bf16 (bfloat16_t * ptr, bfloat16x4_t src)
+{
+ return vld1_lane_bf16 (ptr, src, 3);
+}
+
+bfloat16x8_t
+test_vld1q_lane_bf16 (bfloat16_t * ptr, bfloat16x8_t src)
+{
+ return vld1q_lane_bf16 (ptr, src, 7);
+}
+
+bfloat16x4_t
+test_vld1_bf16 (bfloat16_t * ptr)
+{
+ return vld1_bf16 (ptr);
+}
+
+bfloat16x8_t
+test_vld1q_bf16 (bfloat16_t * ptr)
+{
+ return vld1q_bf16 (ptr);
+}
+
+bfloat16x4x2_t
+test_vld1_bf16_x2 (bfloat16_t * ptr)
+{
+ return vld1_bf16_x2 (ptr);
+}
+
+bfloat16x8x2_t
+test_vld1q_bf16_x2 (bfloat16_t * ptr)
+{
+ return vld1q_bf16_x2 (ptr);
+}
+
+bfloat16x4x3_t
+test_vld1_bf16_x3 (bfloat16_t * ptr)
+{
+ return vld1_bf16_x3 (ptr);
+}
+
+bfloat16x8x3_t
+test_vld1q_bf16_x3 (bfloat16_t * ptr)
+{
+ return vld1q_bf16_x3 (ptr);
+}
+
+bfloat16x4x4_t
+test_vld1_bf16_x4 (bfloat16_t * ptr)
+{
+ return vld1_bf16_x4 (ptr);
+}
+
+bfloat16x8x4_t
+test_vld1q_bf16_x4 (bfloat16_t * ptr)
+{
+ return vld1q_bf16_x4 (ptr);
+}
+
+bfloat16x4x2_t
+test_vld2_bf16 (bfloat16_t * ptr)
+{
+ return vld2_bf16 (ptr);
+}
+
+bfloat16x8x2_t
+test_vld2q_bf16 (bfloat16_t * ptr)
+{
+ return vld2q_bf16 (ptr);
+}
+
+bfloat16x4x2_t
+test_vld2_dup_bf16 (bfloat16_t * ptr)
+{
+ return vld2_dup_bf16 (ptr);
+}
+
+bfloat16x8x2_t
+test_vld2q_dup_bf16 (bfloat16_t * ptr)
+{
+ return vld2q_dup_bf16 (ptr);
+}
+
+bfloat16x4x3_t
+test_vld3_bf16 (bfloat16_t * ptr)
+{
+ return vld3_bf16 (ptr);
+}
+
+bfloat16x8x3_t
+test_vld3q_bf16 (bfloat16_t * ptr)
+{
+ return vld3q_bf16 (ptr);
+}
+
+bfloat16x4x3_t
+test_vld3_dup_bf16 (bfloat16_t * ptr)
+{
+ return vld3_dup_bf16 (ptr);
+}
+
+bfloat16x8x3_t
+test_vld3q_dup_bf16 (bfloat16_t * ptr)
+{
+ return vld3q_dup_bf16 (ptr);
+}
+
+bfloat16x4x4_t
+test_vld4_bf16 (bfloat16_t * ptr)
+{
+ return vld4_bf16 (ptr);
+}
+
+bfloat16x8x4_t
+test_vld4q_bf16 (bfloat16_t * ptr)
+{
+ return vld4q_bf16 (ptr);
+}
+
+bfloat16x4x4_t
+test_vld4_dup_bf16 (bfloat16_t * ptr)
+{
+ return vld4_dup_bf16 (ptr);
+}
+
+bfloat16x8x4_t
+test_vld4q_dup_bf16 (bfloat16_t * ptr)
+{
+ return vld4q_dup_bf16 (ptr);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c
new file mode 100644
index 0000000..162b3ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c
@@ -0,0 +1,107 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
+
+void
+test_vst1_bf16_x2 (bfloat16_t *ptr, bfloat16x4x2_t val)
+{
+ vst1_bf16_x2 (ptr, val);
+}
+
+void
+test_vst1q_bf16_x2 (bfloat16_t *ptr, bfloat16x8x2_t val)
+{
+ vst1q_bf16_x2 (ptr, val);
+}
+
+void
+test_vst1_bf16_x3 (bfloat16_t *ptr, bfloat16x4x3_t val)
+{
+ vst1_bf16_x3 (ptr, val);
+}
+
+void
+test_vst1q_bf16_x3 (bfloat16_t *ptr, bfloat16x8x3_t val)
+{
+ vst1q_bf16_x3 (ptr, val);
+}
+
+void
+test_vst1_bf16_x4 (bfloat16_t *ptr, bfloat16x4x4_t val)
+{
+ vst1_bf16_x4 (ptr, val);
+}
+
+void
+test_vst1q_bf16_x4 (bfloat16_t *ptr, bfloat16x8x4_t val)
+{
+ vst1q_bf16_x4 (ptr, val);
+}
+
+void
+test_vst1_lane_bf16 (bfloat16_t *ptr, bfloat16x4_t val)
+{
+ vst1_lane_bf16 (ptr, val, 3);
+}
+
+void
+test_vst1q_lane_bf16 (bfloat16_t *ptr, bfloat16x8_t val)
+{
+ vst1q_lane_bf16 (ptr, val, 7);
+}
+
+void
+test_vst1_bf16 (bfloat16_t *ptr, bfloat16x4_t val)
+{
+ vst1_bf16 (ptr, val);
+}
+
+void
+test_vst1q_bf16 (bfloat16_t *ptr, bfloat16x8_t val)
+{
+ vst1q_bf16 (ptr, val);
+}
+
+void
+test_vst2_bf16 (bfloat16_t *ptr, bfloat16x4x2_t val)
+{
+ vst2_bf16 (ptr, val);
+}
+
+void
+test_vst2q_bf16 (bfloat16_t *ptr, bfloat16x8x2_t val)
+{
+ vst2q_bf16 (ptr, val);
+}
+
+void
+test_vst3_bf16 (bfloat16_t *ptr, bfloat16x4x3_t val)
+{
+ vst3_bf16 (ptr, val);
+}
+
+void
+test_vst3q_bf16 (bfloat16_t *ptr, bfloat16x8x3_t val)
+{
+ vst3q_bf16 (ptr, val);
+}
+
+void
+test_vst4_bf16 (bfloat16_t *ptr, bfloat16x4x4_t val)
+{
+ vst4_bf16 (ptr, val);
+}
+
+void
+test_vst4q_bf16 (bfloat16_t *ptr, bfloat16x8x4_t val)
+{
+ vst4q_bf16 (ptr, val);
+}
+
+int main()
+{
+ return 0;
+}