author     James Greenhalgh <james.greenhalgh@arm.com>   2013-09-16 09:53:11 +0000
committer  James Greenhalgh <jgreenhalgh@gcc.gnu.org>    2013-09-16 09:53:11 +0000
commit     828e70c1d7bb5c849a2df44aa832793c71833058 (patch)
tree       8fa3db5b4d674a6afa031fe0d47a15cbe1e15109 /gcc/config
parent     779aea46cc3b6b1a62f989ee0c04e68803733ba0 (diff)
[AArch64] Improve arm_neon.h vml<as>_lane handling.
gcc/
	* config/aarch64/aarch64-simd-builtins.def (fma): New.
	* config/aarch64/aarch64-simd.md (aarch64_mla_elt<mode>): New.
	(aarch64_mla_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_mls_elt<mode>): Likewise.
	(aarch64_mls_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_fma4_elt<mode>): Likewise.
	(aarch64_fma4_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_fma4_elt_to_128v2df): Likewise.
	(aarch64_fma4_elt_to_64df): Likewise.
	(fnma<mode>4): Likewise.
	(aarch64_fnma4_elt<mode>): Likewise.
	(aarch64_fnma4_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_fnma4_elt_to_128v2df): Likewise.
	(aarch64_fnma4_elt_to_64df): Likewise.
	* config/aarch64/iterators.md (VDQSF): New.
	* config/aarch64/arm_neon.h (vfm<as><sdq>_lane<q>_f<32, 64>): Convert
	to C implementation.
	(vml<sa><q>_lane<q>_<fsu><16, 32, 64>): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/fmla-intrinsic.c: New.
	* gcc.target/aarch64/mla-intrinsic.c: Likewise.
	* gcc.target/aarch64/fmls-intrinsic.c: Likewise.
	* gcc.target/aarch64/mls-intrinsic.c: Likewise.

From-SVN: r202625
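
As an illustration (not part of the patch), a minimal hypothetical caller of one of the converted intrinsics: with the C implementation the lane selection is visible to the compiler, so the new *aarch64_fma4_elt<mode> pattern can match it and emit a single lane-indexed fmla.

    #include <arm_neon.h>

    /* Hypothetical example: multiply x by lane 1 of coeffs and accumulate
       into acc.  At -O2 this is expected to compile to a single
       "fmla v0.2s, v1.2s, v2.s[1]" instruction.  */
    float32x2_t
    scale_accumulate (float32x2_t acc, float32x2_t x, float32x2_t coeffs)
    {
      return vfma_lane_f32 (acc, x, coeffs, 1);
    }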
Diffstat (limited to 'gcc/config')
-rw-r--r--  gcc/config/aarch64/aarch64-simd-builtins.def  |    3
-rw-r--r--  gcc/config/aarch64/aarch64-simd.md            |  195
-rw-r--r--  gcc/config/aarch64/arm_neon.h                 | 1062
-rw-r--r--  gcc/config/aarch64/iterators.md               |    3
4 files changed, 702 insertions, 561 deletions
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 4046d7a..35897f3 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -359,3 +359,6 @@
/* Implemented by aarch64_st1<VALL:mode>. */
BUILTIN_VALL (STORE1, st1, 0)
+ /* Implemented by fma<mode>4. */
+ BUILTIN_VDQF (TERNOP, fma, 4)
+
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 04d5794..f13cd5b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1070,6 +1070,38 @@
(set_attr "simd_mode" "<MODE>")]
)
+(define_insn "*aarch64_mla_elt<mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+ (plus:VDQHS
+ (mult:VDQHS
+ (vec_duplicate:VDQHS
+ (vec_select:<VEL>
+ (match_operand:VDQHS 1 "register_operand" "<h_con>")
+ (parallel [(match_operand:SI 2 "immediate_operand")])))
+ (match_operand:VDQHS 3 "register_operand" "w"))
+ (match_operand:VDQHS 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+ [(set_attr "simd_type" "simd_mla")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_mla_elt_<vswap_width_name><mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+ (plus:VDQHS
+ (mult:VDQHS
+ (vec_duplicate:VDQHS
+ (vec_select:<VEL>
+ (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+ (parallel [(match_operand:SI 2 "immediate_operand")])))
+ (match_operand:VDQHS 3 "register_operand" "w"))
+ (match_operand:VDQHS 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+ [(set_attr "simd_type" "simd_mla")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
(define_insn "aarch64_mls<mode>"
[(set (match_operand:VQ_S 0 "register_operand" "=w")
(minus:VQ_S (match_operand:VQ_S 1 "register_operand" "0")
@@ -1081,6 +1113,38 @@
(set_attr "simd_mode" "<MODE>")]
)
+(define_insn "*aarch64_mls_elt<mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+ (minus:VDQHS
+ (match_operand:VDQHS 4 "register_operand" "0")
+ (mult:VDQHS
+ (vec_duplicate:VDQHS
+ (vec_select:<VEL>
+ (match_operand:VDQHS 1 "register_operand" "<h_con>")
+ (parallel [(match_operand:SI 2 "immediate_operand")])))
+ (match_operand:VDQHS 3 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+ [(set_attr "simd_type" "simd_mla")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_mls_elt_<vswap_width_name><mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+ (minus:VDQHS
+ (match_operand:VDQHS 4 "register_operand" "0")
+ (mult:VDQHS
+ (vec_duplicate:VDQHS
+ (vec_select:<VEL>
+ (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+ (parallel [(match_operand:SI 2 "immediate_operand")])))
+ (match_operand:VDQHS 3 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+ [(set_attr "simd_type" "simd_mla")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
;; Max/Min operations.
(define_insn "<su><maxmin><mode>3"
[(set (match_operand:VQ_S 0 "register_operand" "=w")
@@ -1483,6 +1547,137 @@
(set_attr "simd_mode" "<MODE>")]
)
+(define_insn "*aarch64_fma4_elt<mode>"
+ [(set (match_operand:VDQF 0 "register_operand" "=w")
+ (fma:VDQF
+ (vec_duplicate:VDQF
+ (vec_select:<VEL>
+ (match_operand:VDQF 1 "register_operand" "<h_con>")
+ (parallel [(match_operand:SI 2 "immediate_operand")])))
+ (match_operand:VDQF 3 "register_operand" "w")
+ (match_operand:VDQF 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+ [(set_attr "simd_type" "simd_fmla_elt")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_fma4_elt_<vswap_width_name><mode>"
+ [(set (match_operand:VDQSF 0 "register_operand" "=w")
+ (fma:VDQSF
+ (vec_duplicate:VDQSF
+ (vec_select:<VEL>
+ (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+ (parallel [(match_operand:SI 2 "immediate_operand")])))
+ (match_operand:VDQSF 3 "register_operand" "w")
+ (match_operand:VDQSF 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+ [(set_attr "simd_type" "simd_fmla_elt")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_fma4_elt_to_128df"
+ [(set (match_operand:V2DF 0 "register_operand" "=w")
+ (fma:V2DF
+ (vec_duplicate:V2DF
+ (match_operand:DF 1 "register_operand" "w"))
+ (match_operand:V2DF 2 "register_operand" "w")
+ (match_operand:V2DF 3 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "fmla\\t%0.2d, %2.2d, %1.2d[0]"
+ [(set_attr "simd_type" "simd_fmla_elt")
+ (set_attr "simd_mode" "V2DF")]
+)
+
+(define_insn "*aarch64_fma4_elt_to_64v2df"
+ [(set (match_operand:DF 0 "register_operand" "=w")
+ (fma:DF
+ (vec_select:DF
+ (match_operand:V2DF 1 "register_operand" "w")
+ (parallel [(match_operand:SI 2 "immediate_operand")]))
+ (match_operand:DF 3 "register_operand" "w")
+ (match_operand:DF 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "fmla\\t%0.2d, %3.2d, %1.2d[%2]"
+ [(set_attr "simd_type" "simd_fmla_elt")
+ (set_attr "simd_mode" "V2DF")]
+)
+
+(define_insn "fnma<mode>4"
+ [(set (match_operand:VDQF 0 "register_operand" "=w")
+ (fma:VDQF
+ (match_operand:VDQF 1 "register_operand" "w")
+ (neg:VDQF
+ (match_operand:VDQF 2 "register_operand" "w"))
+ (match_operand:VDQF 3 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "fmls\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+ [(set_attr "simd_type" "simd_fmla")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_fnma4_elt<mode>"
+ [(set (match_operand:VDQF 0 "register_operand" "=w")
+ (fma:VDQF
+ (neg:VDQF
+ (match_operand:VDQF 3 "register_operand" "w"))
+ (vec_duplicate:VDQF
+ (vec_select:<VEL>
+ (match_operand:VDQF 1 "register_operand" "<h_con>")
+ (parallel [(match_operand:SI 2 "immediate_operand")])))
+ (match_operand:VDQF 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+ [(set_attr "simd_type" "simd_fmla_elt")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_fnma4_elt_<vswap_width_name><mode>"
+ [(set (match_operand:VDQSF 0 "register_operand" "=w")
+ (fma:VDQSF
+ (neg:VDQSF
+ (match_operand:VDQSF 3 "register_operand" "w"))
+ (vec_duplicate:VDQSF
+ (vec_select:<VEL>
+ (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+ (parallel [(match_operand:SI 2 "immediate_operand")])))
+ (match_operand:VDQSF 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+ [(set_attr "simd_type" "simd_fmla_elt")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_fnma4_elt_to_128df"
+ [(set (match_operand:V2DF 0 "register_operand" "=w")
+ (fma:V2DF
+ (neg:V2DF
+ (match_operand:V2DF 2 "register_operand" "w"))
+ (vec_duplicate:V2DF
+ (match_operand:DF 1 "register_operand" "w"))
+ (match_operand:V2DF 3 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "fmls\\t%0.2d, %2.2d, %1.2d[0]"
+ [(set_attr "simd_type" "simd_fmla_elt")
+ (set_attr "simd_mode" "V2DF")]
+)
+
+(define_insn "*aarch64_fnma4_elt_to_64v2df"
+ [(set (match_operand:DF 0 "register_operand" "=w")
+ (fma:DF
+ (vec_select:DF
+ (match_operand:V2DF 1 "register_operand" "w")
+ (parallel [(match_operand:SI 2 "immediate_operand")]))
+ (neg:DF
+ (match_operand:DF 3 "register_operand" "w"))
+ (match_operand:DF 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "fmls\\t%0.2d, %3.2d, %1.2d[%2]"
+ [(set_attr "simd_type" "simd_fmla_elt")
+ (set_attr "simd_mode" "V2DF")]
+)
+
;; Vector versions of the floating-point frint patterns.
;; Expands to btrunc, ceil, floor, nearbyint, rint, round.
(define_insn "<frint_pattern><mode>2"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6c9dd79..cb58602 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -6100,33 +6100,6 @@ vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
return result;
}
-#define vfma_lane_f32(a, b, c, d) \
- __extension__ \
- ({ \
- float32x2_t c_ = (c); \
- float32x2_t b_ = (b); \
- float32x2_t a_ = (a); \
- float32x2_t result; \
- __asm__ ("fmla %0.2s,%2.2s,%3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vfmad_lane_f64(a, b, c) \
- __extension__ \
- ({ \
- float64x2_t b_ = (b); \
- float64_t a_ = (a); \
- float64_t result; \
- __asm__ ("fmla %d0,%d1,%2.d[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
{
@@ -6149,47 +6122,6 @@ vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
return result;
}
-#define vfmaq_lane_f32(a, b, c, d) \
- __extension__ \
- ({ \
- float32x4_t c_ = (c); \
- float32x4_t b_ = (b); \
- float32x4_t a_ = (a); \
- float32x4_t result; \
- __asm__ ("fmla %0.4s,%2.4s,%3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vfmaq_lane_f64(a, b, c, d) \
- __extension__ \
- ({ \
- float64x2_t c_ = (c); \
- float64x2_t b_ = (b); \
- float64x2_t a_ = (a); \
- float64x2_t result; \
- __asm__ ("fmla %0.2d,%2.2d,%3.d[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vfmas_lane_f32(a, b, c) \
- __extension__ \
- ({ \
- float32x4_t b_ = (b); \
- float32_t a_ = (a); \
- float32_t result; \
- __asm__ ("fmla %s0,%s1,%2.s[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
{
@@ -6234,19 +6166,6 @@ vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
return result;
}
-#define vfmsd_lane_f64(a, b, c) \
- __extension__ \
- ({ \
- float64x2_t b_ = (b); \
- float64_t a_ = (a); \
- float64_t result; \
- __asm__ ("fmls %d0,%d1,%2.d[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vfmsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
{
@@ -6269,19 +6188,6 @@ vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
return result;
}
-#define vfmss_lane_f32(a, b, c) \
- __extension__ \
- ({ \
- float32x4_t b_ = (b); \
- float32_t a_ = (a); \
- float32_t result; \
- __asm__ ("fmls %s0,%s1,%2.s[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vget_high_f32 (float32x4_t a)
{
@@ -7122,133 +7028,6 @@ vld1q_dup_u64 (const uint64_t * a)
result; \
})
-#define vmla_lane_f32(a, b, c, d) \
- __extension__ \
- ({ \
- float32x2_t c_ = (c); \
- float32x2_t b_ = (b); \
- float32x2_t a_ = (a); \
- float32x2_t result; \
- float32x2_t t1; \
- __asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fadd %0.2s, %0.2s, %1.2s" \
- : "=w"(result), "=w"(t1) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmla_lane_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x4_t c_ = (c); \
- int16x4_t b_ = (b); \
- int16x4_t a_ = (a); \
- int16x4_t result; \
- __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmla_lane_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x2_t c_ = (c); \
- int32x2_t b_ = (b); \
- int32x2_t a_ = (a); \
- int32x2_t result; \
- __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmla_lane_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x4_t c_ = (c); \
- uint16x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x4_t result; \
- __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmla_lane_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x2_t c_ = (c); \
- uint32x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x2_t result; \
- __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmla_laneq_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x8_t c_ = (c); \
- int16x4_t b_ = (b); \
- int16x4_t a_ = (a); \
- int16x4_t result; \
- __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmla_laneq_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x4_t c_ = (c); \
- int32x2_t b_ = (b); \
- int32x2_t a_ = (a); \
- int32x2_t result; \
- __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmla_laneq_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x8_t c_ = (c); \
- uint16x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x4_t result; \
- __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmla_laneq_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x4_t c_ = (c); \
- uint32x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x2_t result; \
- __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
{
@@ -7815,133 +7594,6 @@ vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
return result;
}
-#define vmlaq_lane_f32(a, b, c, d) \
- __extension__ \
- ({ \
- float32x4_t c_ = (c); \
- float32x4_t b_ = (b); \
- float32x4_t a_ = (a); \
- float32x4_t result; \
- float32x4_t t1; \
- __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fadd %0.4s, %0.4s, %1.4s" \
- : "=w"(result), "=w"(t1) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlaq_lane_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x8_t c_ = (c); \
- int16x8_t b_ = (b); \
- int16x8_t a_ = (a); \
- int16x8_t result; \
- __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlaq_lane_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x4_t c_ = (c); \
- int32x4_t b_ = (b); \
- int32x4_t a_ = (a); \
- int32x4_t result; \
- __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlaq_lane_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x8_t c_ = (c); \
- uint16x8_t b_ = (b); \
- uint16x8_t a_ = (a); \
- uint16x8_t result; \
- __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlaq_lane_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x4_t c_ = (c); \
- uint32x4_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint32x4_t result; \
- __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlaq_laneq_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x8_t c_ = (c); \
- int16x8_t b_ = (b); \
- int16x8_t a_ = (a); \
- int16x8_t result; \
- __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlaq_laneq_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x4_t c_ = (c); \
- int32x4_t b_ = (b); \
- int32x4_t a_ = (a); \
- int32x4_t result; \
- __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlaq_laneq_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x8_t c_ = (c); \
- uint16x8_t b_ = (b); \
- uint16x8_t a_ = (a); \
- uint16x8_t result; \
- __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlaq_laneq_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x4_t c_ = (c); \
- uint32x4_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint32x4_t result; \
- __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
{
@@ -8076,77 +7728,6 @@ vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
return result;
}
-#define vmls_lane_f32(a, b, c, d) \
- __extension__ \
- ({ \
- float32x2_t c_ = (c); \
- float32x2_t b_ = (b); \
- float32x2_t a_ = (a); \
- float32x2_t result; \
- float32x2_t t1; \
- __asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fsub %0.2s, %0.2s, %1.2s" \
- : "=w"(result), "=w"(t1) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmls_lane_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x4_t c_ = (c); \
- int16x4_t b_ = (b); \
- int16x4_t a_ = (a); \
- int16x4_t result; \
- __asm__ ("mls %0.4h,%2.4h,%3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmls_lane_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x2_t c_ = (c); \
- int32x2_t b_ = (b); \
- int32x2_t a_ = (a); \
- int32x2_t result; \
- __asm__ ("mls %0.2s,%2.2s,%3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmls_lane_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x4_t c_ = (c); \
- uint16x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x4_t result; \
- __asm__ ("mls %0.4h,%2.4h,%3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmls_lane_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x2_t c_ = (c); \
- uint32x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x2_t result; \
- __asm__ ("mls %0.2s,%2.2s,%3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
{
@@ -8713,148 +8294,6 @@ vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
return result;
}
-#define vmlsq_lane_f32(a, b, c, d) \
- __extension__ \
- ({ \
- float32x4_t c_ = (c); \
- float32x4_t b_ = (b); \
- float32x4_t a_ = (a); \
- float32x4_t result; \
- float32x4_t t1; \
- __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \
- : "=w"(result), "=w"(t1) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlsq_lane_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x8_t c_ = (c); \
- int16x8_t b_ = (b); \
- int16x8_t a_ = (a); \
- int16x8_t result; \
- __asm__ ("mls %0.8h,%2.8h,%3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlsq_lane_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x4_t c_ = (c); \
- int32x4_t b_ = (b); \
- int32x4_t a_ = (a); \
- int32x4_t result; \
- __asm__ ("mls %0.4s,%2.4s,%3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlsq_lane_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x8_t c_ = (c); \
- uint16x8_t b_ = (b); \
- uint16x8_t a_ = (a); \
- uint16x8_t result; \
- __asm__ ("mls %0.8h,%2.8h,%3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlsq_lane_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x4_t c_ = (c); \
- uint32x4_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint32x4_t result; \
- __asm__ ("mls %0.4s,%2.4s,%3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vmlsq_laneq_f32(__a, __b, __c, __d) \
- __extension__ \
- ({ \
- float32x4_t __c_ = (__c); \
- float32x4_t __b_ = (__b); \
- float32x4_t __a_ = (__a); \
- float32x4_t __result; \
- float32x4_t __t1; \
- __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \
- : "=w"(__result), "=w"(__t1) \
- : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \
- : /* No clobbers */); \
- __result; \
- })
-
-#define vmlsq_laneq_s16(__a, __b, __c, __d) \
- __extension__ \
- ({ \
- int16x8_t __c_ = (__c); \
- int16x8_t __b_ = (__b); \
- int16x8_t __a_ = (__a); \
- int16x8_t __result; \
- __asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \
- : "=w"(__result) \
- : "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d) \
- : /* No clobbers */); \
- __result; \
- })
-
-#define vmlsq_laneq_s32(__a, __b, __c, __d) \
- __extension__ \
- ({ \
- int32x4_t __c_ = (__c); \
- int32x4_t __b_ = (__b); \
- int32x4_t __a_ = (__a); \
- int32x4_t __result; \
- __asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \
- : "=w"(__result) \
- : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \
- : /* No clobbers */); \
- __result; \
- })
-
-#define vmlsq_laneq_u16(__a, __b, __c, __d) \
- __extension__ \
- ({ \
- uint16x8_t __c_ = (__c); \
- uint16x8_t __b_ = (__b); \
- uint16x8_t __a_ = (__a); \
- uint16x8_t __result; \
- __asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \
- : "=w"(__result) \
- : "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d) \
- : /* No clobbers */); \
- __result; \
- })
-
-#define vmlsq_laneq_u32(__a, __b, __c, __d) \
- __extension__ \
- ({ \
- uint32x4_t __c_ = (__c); \
- uint32x4_t __b_ = (__b); \
- uint32x4_t __a_ = (__a); \
- uint32x4_t __result; \
- __asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \
- : "=w"(__result) \
- : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \
- : /* No clobbers */); \
- __result; \
- })
-
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
{
@@ -19614,6 +19053,210 @@ vdupd_laneq_u64 (uint64x2_t __a, const int __b)
return __aarch64_vgetq_lane_u64 (__a, __b);
}
+/* vfma_lane */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_lane_f32 (float32x2_t __a, float32x2_t __b,
+ float32x2_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav2sf (__b,
+ __aarch64_vdup_lane_f32 (__c, __lane),
+ __a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfma_lane_f64 (float64_t __a, float64_t __b,
+ float64_t __c, const int __lane)
+{
+ return __builtin_fma (__b, __c, __a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfmad_lane_f64 (float64_t __a, float64_t __b,
+ float64_t __c, const int __lane)
+{
+ return __builtin_fma (__b, __c, __a);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vfmas_lane_f32 (float32_t __a, float32_t __b,
+ float32x2_t __c, const int __lane)
+{
+ return __builtin_fmaf (__b, __aarch64_vget_lane_f32 (__c, __lane), __a);
+}
+
+/* vfma_laneq */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_laneq_f32 (float32x2_t __a, float32x2_t __b,
+ float32x4_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav2sf (__b,
+ __aarch64_vdup_laneq_f32 (__c, __lane),
+ __a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfma_laneq_f64 (float64_t __a, float64_t __b,
+ float64x2_t __c, const int __lane)
+{
+ return __builtin_fma (__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfmad_laneq_f64 (float64_t __a, float64_t __b,
+ float64x2_t __c, const int __lane)
+{
+ return __builtin_fma (__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vfmas_laneq_f32 (float32_t __a, float32_t __b,
+ float32x4_t __c, const int __lane)
+{
+ return __builtin_fmaf (__b, __aarch64_vgetq_lane_f32 (__c, __lane), __a);
+}
+
+/* vfmaq_lane */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_lane_f32 (float32x4_t __a, float32x4_t __b,
+ float32x2_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav4sf (__b,
+ __aarch64_vdupq_lane_f32 (__c, __lane),
+ __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmaq_lane_f64 (float64x2_t __a, float64x2_t __b,
+ float64_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a);
+}
+
+/* vfmaq_laneq */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+ float32x4_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav4sf (__b,
+ __aarch64_vdupq_laneq_f32 (__c, __lane),
+ __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmaq_laneq_f64 (float64x2_t __a, float64x2_t __b,
+ float64x2_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav2df (__b,
+ __aarch64_vdupq_laneq_f64 (__c, __lane),
+ __a);
+}
+
+/* vfms_lane */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_lane_f32 (float32x2_t __a, float32x2_t __b,
+ float32x2_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav2sf (-__b,
+ __aarch64_vdup_lane_f32 (__c, __lane),
+ __a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfms_lane_f64 (float64_t __a, float64_t __b,
+ float64_t __c, const int __lane)
+{
+ return __builtin_fma (-__b, __c, __a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfmsd_lane_f64 (float64_t __a, float64_t __b,
+ float64_t __c, const int __lane)
+{
+ return __builtin_fma (-__b, __c, __a);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vfmss_lane_f32 (float32_t __a, float32_t __b,
+ float32x2_t __c, const int __lane)
+{
+ return __builtin_fmaf (-__b, __aarch64_vget_lane_f32 (__c, __lane), __a);
+}
+
+/* vfms_laneq */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_laneq_f32 (float32x2_t __a, float32x2_t __b,
+ float32x4_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav2sf (-__b,
+ __aarch64_vdup_laneq_f32 (__c, __lane),
+ __a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfms_laneq_f64 (float64_t __a, float64_t __b,
+ float64x2_t __c, const int __lane)
+{
+ return __builtin_fma (-__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfmsd_laneq_f64 (float64_t __a, float64_t __b,
+ float64x2_t __c, const int __lane)
+{
+ return __builtin_fma (-__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vfmss_laneq_f32 (float32_t __a, float32_t __b,
+ float32x4_t __c, const int __lane)
+{
+ return __builtin_fmaf (-__b, __aarch64_vgetq_lane_f32 (__c, __lane), __a);
+}
+
+/* vfmsq_lane */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b,
+ float32x2_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav4sf (-__b,
+ __aarch64_vdupq_lane_f32 (__c, __lane),
+ __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b,
+ float64_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a);
+}
+
+/* vfmsq_laneq */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+ float32x4_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav4sf (-__b,
+ __aarch64_vdupq_laneq_f32 (__c, __lane),
+ __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
+ float64x2_t __c, const int __lane)
+{
+ return __builtin_aarch64_fmav2df (-__b,
+ __aarch64_vdupq_laneq_f64 (__c, __lane),
+ __a);
+}
+
/* vld1 */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
@@ -21131,6 +20774,156 @@ vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
return a + b * c;
}
+/* vmla_lane */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmla_lane_f32 (float32x2_t __a, float32x2_t __b,
+ float32x2_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vget_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmla_lane_s16 (int16x4_t __a, int16x4_t __b,
+ int16x4_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vget_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmla_lane_s32 (int32x2_t __a, int32x2_t __b,
+ int32x2_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vget_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b,
+ uint16x4_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vget_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b,
+ uint32x2_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vget_lane_u32 (__c, __lane)));
+}
+
+/* vmla_laneq */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmla_laneq_f32 (float32x2_t __a, float32x2_t __b,
+ float32x4_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmla_laneq_s16 (int16x4_t __a, int16x4_t __b,
+ int16x8_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmla_laneq_s32 (int32x2_t __a, int32x2_t __b,
+ int32x4_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
+ uint16x8_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
+ uint32x4_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
+}
+
+/* vmlaq_lane */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
+ float32x2_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vget_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b,
+ int16x4_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vget_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b,
+ int32x2_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vget_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
+ uint16x4_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vget_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
+ uint32x2_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vget_lane_u32 (__c, __lane)));
+}
+
+ /* vmlaq_laneq */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+ float32x4_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b,
+ int16x8_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b,
+ int32x4_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
+ uint16x8_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
+ uint32x4_t __c, const int __lane)
+{
+ return (__a + (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
+}
+
+/* vmls */
+
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
{
@@ -21149,6 +20942,153 @@ vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
return a - b * c;
}
+/* vmls_lane */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmls_lane_f32 (float32x2_t __a, float32x2_t __b,
+ float32x2_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmls_lane_s16 (int16x4_t __a, int16x4_t __b,
+ int16x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmls_lane_s32 (int32x2_t __a, int32x2_t __b,
+ int32x2_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b,
+ uint16x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b,
+ uint32x2_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_u32 (__c, __lane)));
+}
+
+/* vmls_laneq */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmls_laneq_f32 (float32x2_t __a, float32x2_t __b,
+ float32x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmls_laneq_s16 (int16x4_t __a, int16x4_t __b,
+ int16x8_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmls_laneq_s32 (int32x2_t __a, int32x2_t __b,
+ int32x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
+ uint16x8_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
+ uint32x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
+}
+
+/* vmlsq_lane */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b,
+ float32x2_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b,
+ int16x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b,
+ int32x2_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
+ uint16x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
+ uint32x2_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_u32 (__c, __lane)));
+}
+
+ /* vmlsq_laneq */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+ float32x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b,
+ int16x8_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b,
+ int32x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
+}
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
+ uint16x8_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
+ uint32x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
+}
+
/* vmul_lane */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index a6b3117..ec8d813 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -89,6 +89,9 @@
;; Vector Float modes.
(define_mode_iterator VDQF [V2SF V4SF V2DF])
+;; Vector single Float modes.
+(define_mode_iterator VDQSF [V2SF V4SF])
+
;; Modes suitable to use as the return type of a vcond expression.
(define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])